From 49a2ae9ce96fc0a43f12e433399a0a497a794f1a Mon Sep 17 00:00:00 2001 From: mwfj Date: Mon, 13 Apr 2026 16:09:33 +0800 Subject: [PATCH 01/37] Support Circut break Phase1-2 --- Makefile | 8 +- .../circuit_breaker/circuit_breaker_slice.h | 115 ++++ .../circuit_breaker/circuit_breaker_state.h | 66 ++ .../circuit_breaker/circuit_breaker_window.h | 59 ++ include/config/server_config.h | 47 +- server/circuit_breaker_slice.cc | 277 ++++++++ server/circuit_breaker_window.cc | 81 +++ server/config_loader.cc | 111 +++ test/circuit_breaker_test.h | 647 ++++++++++++++++++ test/config_test.h | 221 ++++++ test/run_test.cc | 7 + 11 files changed, 1636 insertions(+), 3 deletions(-) create mode 100644 include/circuit_breaker/circuit_breaker_slice.h create mode 100644 include/circuit_breaker/circuit_breaker_state.h create mode 100644 include/circuit_breaker/circuit_breaker_window.h create mode 100644 server/circuit_breaker_slice.cc create mode 100644 server/circuit_breaker_window.cc create mode 100644 test/circuit_breaker_test.h diff --git a/Makefile b/Makefile index 68d5f781..8f4ec3f2 100644 --- a/Makefile +++ b/Makefile @@ -76,6 +76,9 @@ UPSTREAM_SRCS = $(SERVER_DIR)/upstream_connection.cc $(SERVER_DIR)/pool_partitio # Rate limit layer sources RATE_LIMIT_SRCS = $(SERVER_DIR)/token_bucket.cc $(SERVER_DIR)/rate_limit_zone.cc $(SERVER_DIR)/rate_limiter.cc +# Circuit breaker layer sources +CIRCUIT_BREAKER_SRCS = $(SERVER_DIR)/circuit_breaker_window.cc $(SERVER_DIR)/circuit_breaker_slice.cc + # CLI layer sources CLI_SRCS = $(SERVER_DIR)/cli_parser.cc $(SERVER_DIR)/signal_handler.cc $(SERVER_DIR)/pid_file.cc $(SERVER_DIR)/daemonizer.cc @@ -122,7 +125,7 @@ NGHTTP2_SRC = $(THIRD_PARTY_DIR)/nghttp2/nghttp2_alpn.c \ NGHTTP2_OBJ = $(NGHTTP2_SRC:.c=.o) # Server library sources (shared between test and production binaries) -LIB_SRCS = $(REACTOR_SRCS) $(NETWORK_SRCS) $(SERVER_SRCS) $(THREAD_POOL_SRCS) $(FOUNDATION_SRCS) $(HTTP_SRCS) $(HTTP2_SRCS) $(WS_SRCS) $(TLS_SRCS) $(UPSTREAM_SRCS) 
$(RATE_LIMIT_SRCS) $(CLI_SRCS) $(UTIL_SRCS) +LIB_SRCS = $(REACTOR_SRCS) $(NETWORK_SRCS) $(SERVER_SRCS) $(THREAD_POOL_SRCS) $(FOUNDATION_SRCS) $(HTTP_SRCS) $(HTTP2_SRCS) $(WS_SRCS) $(TLS_SRCS) $(UPSTREAM_SRCS) $(RATE_LIMIT_SRCS) $(CIRCUIT_BREAKER_SRCS) $(CLI_SRCS) $(UTIL_SRCS) # Test binary sources TEST_SRCS = $(LIB_SRCS) $(TEST_DIR)/test_framework.cc $(TEST_DIR)/run_test.cc @@ -142,11 +145,12 @@ WS_HEADERS = $(LIB_DIR)/ws/websocket_connection.h $(LIB_DIR)/ws/websocket_frame. TLS_HEADERS = $(LIB_DIR)/tls/tls_context.h $(LIB_DIR)/tls/tls_connection.h $(LIB_DIR)/tls/tls_client_context.h UPSTREAM_HEADERS = $(LIB_DIR)/upstream/upstream_manager.h $(LIB_DIR)/upstream/upstream_host_pool.h $(LIB_DIR)/upstream/pool_partition.h $(LIB_DIR)/upstream/upstream_connection.h $(LIB_DIR)/upstream/upstream_lease.h $(LIB_DIR)/upstream/upstream_http_codec.h $(LIB_DIR)/upstream/http_request_serializer.h $(LIB_DIR)/upstream/header_rewriter.h $(LIB_DIR)/upstream/retry_policy.h $(LIB_DIR)/upstream/proxy_transaction.h $(LIB_DIR)/upstream/proxy_handler.h $(LIB_DIR)/upstream/upstream_response.h $(LIB_DIR)/upstream/upstream_callbacks.h RATE_LIMIT_HEADERS = $(LIB_DIR)/rate_limit/token_bucket.h $(LIB_DIR)/rate_limit/rate_limit_zone.h $(LIB_DIR)/rate_limit/rate_limiter.h +CIRCUIT_BREAKER_HEADERS = $(LIB_DIR)/circuit_breaker/circuit_breaker_state.h $(LIB_DIR)/circuit_breaker/circuit_breaker_window.h $(LIB_DIR)/circuit_breaker/circuit_breaker_slice.h CLI_HEADERS = $(LIB_DIR)/cli/cli_parser.h $(LIB_DIR)/cli/signal_handler.h $(LIB_DIR)/cli/pid_file.h $(LIB_DIR)/cli/version.h $(LIB_DIR)/cli/daemonizer.h TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h 
$(TEST_DIR)/proxy_test.h # All headers combined -HEADERS = $(CORE_HEADERS) $(CALLBACK_HEADERS) $(REACTOR_HEADERS) $(NETWORK_HEADERS) $(SERVER_HEADERS) $(THREAD_POOL_HEADERS) $(UTIL_HEADERS) $(FOUNDATION_HEADERS) $(HTTP_HEADERS) $(HTTP2_HEADERS) $(WS_HEADERS) $(TLS_HEADERS) $(UPSTREAM_HEADERS) $(RATE_LIMIT_HEADERS) $(CLI_HEADERS) $(TEST_HEADERS) +HEADERS = $(CORE_HEADERS) $(CALLBACK_HEADERS) $(REACTOR_HEADERS) $(NETWORK_HEADERS) $(SERVER_HEADERS) $(THREAD_POOL_HEADERS) $(UTIL_HEADERS) $(FOUNDATION_HEADERS) $(HTTP_HEADERS) $(HTTP2_HEADERS) $(WS_HEADERS) $(TLS_HEADERS) $(UPSTREAM_HEADERS) $(RATE_LIMIT_HEADERS) $(CIRCUIT_BREAKER_HEADERS) $(CLI_HEADERS) $(TEST_HEADERS) # Default target .DEFAULT_GOAL := all diff --git a/include/circuit_breaker/circuit_breaker_slice.h b/include/circuit_breaker/circuit_breaker_slice.h new file mode 100644 index 00000000..5633c355 --- /dev/null +++ b/include/circuit_breaker/circuit_breaker_slice.h @@ -0,0 +1,115 @@ +#pragma once + +#include "common.h" +#include "config/server_config.h" +#include "circuit_breaker/circuit_breaker_state.h" +#include "circuit_breaker/circuit_breaker_window.h" +// , , provided by common.h + +namespace circuit_breaker { + +// One per-dispatcher slice of the breaker state for a given upstream host. +// Dispatcher-thread-local for hot-path correctness: TryAcquire, ReportSuccess, +// ReportFailure must only be called on the dispatcher that owns this slice. +// +// Observability counters (`trips_`, `rejected_`, etc.) are atomic so other +// threads can snapshot them without synchronization. Everything else is +// plain (no atomics) — single-writer, single-reader. +class CircuitBreakerSlice { +public: + // `time_source` defaults to steady_clock::now. Tests inject a mock clock. 
+ using TimeSource = std::function; + + CircuitBreakerSlice(std::string host_label, + size_t dispatcher_index, + const CircuitBreakerConfig& config, + TimeSource time_source = nullptr); + + // Non-copyable, non-movable: slices are pinned in a Host's vector and + // callbacks capture raw pointers. + CircuitBreakerSlice(const CircuitBreakerSlice&) = delete; + CircuitBreakerSlice& operator=(const CircuitBreakerSlice&) = delete; + + // Hot-path decision. Consults state + (if applicable) advances OPEN→HALF_OPEN + // and reserves a probe slot. Increments `rejected_` on REJECTED_OPEN* + // (both enforce and dry-run). Emits reject log on dispatcher thread. + Decision TryAcquire(); + + // Outcome reporting. `probe` is true iff the paired TryAcquire returned + // ADMITTED_PROBE. Report* may trigger state transitions and fire the + // transition callback. + void ReportSuccess(bool probe); + void ReportFailure(FailureKind kind, bool probe); + + // Apply a new config (called on this slice's dispatcher thread). + // Preserves live state (CLOSED/OPEN/HALF_OPEN). Resets window if + // window_seconds changed. + void Reload(const CircuitBreakerConfig& new_config); + + // Install or replace the state-transition callback. Safe to call before + // any traffic (startup wiring) OR after a hot-reload flips enabled=false→true. + // Callers must invoke on this slice's dispatcher thread. + void SetTransitionCallback(StateTransitionCallback cb); + + // Observability — safe from any thread. 
+ State CurrentState() const { return state_.load(std::memory_order_acquire); } + int64_t Trips() const { return trips_.load(std::memory_order_relaxed); } + int64_t Rejected() const { return rejected_.load(std::memory_order_relaxed); } + int64_t ProbeSuccesses() const { return probe_successes_.load(std::memory_order_relaxed); } + int64_t ProbeFailures() const { return probe_failures_.load(std::memory_order_relaxed); } + + const std::string& host_label() const { return host_label_; } + size_t dispatcher_index() const { return dispatcher_index_; } + + // Current open_until time. Used by ProxyTransaction to compute + // Retry-After. Returns zero ns when not OPEN. + std::chrono::steady_clock::time_point OpenUntil() const; + +private: + // Logging label: "service=X host=Y:Z partition=N" built once. + std::string host_label_; + size_t dispatcher_index_; + CircuitBreakerConfig config_; + + TimeSource time_source_; + + // Hot-path state — state_ written on dispatcher, read by observers. + std::atomic state_{State::CLOSED}; + // Nanoseconds since steady_clock epoch — 0 when not OPEN. + std::atomic open_until_steady_ns_{0}; + // Count of consecutive trips (OPEN entries) since last CLOSED — + // drives exponential backoff of open duration. + std::atomic consecutive_trips_{0}; + + // Dispatcher-thread-only (no atomics). + int consecutive_failures_ = 0; + CircuitBreakerWindow window_; + int half_open_inflight_ = 0; + int half_open_successes_ = 0; + bool half_open_saw_failure_ = false; + + // Observability counters. + std::atomic trips_{0}; + std::atomic rejected_{0}; + std::atomic probe_successes_{0}; + std::atomic probe_failures_{0}; + + StateTransitionCallback transition_cb_; + + // Internal transitions (dispatcher-thread). 
+ void TripClosedToOpen(const char* trigger); + void TransitionOpenToHalfOpen(); + void TransitionHalfOpenToClosed(); + void TripHalfOpenToOpen(const char* trigger); + + // Compute open duration for the current consecutive_trips_ value: + // min(base * 2^consecutive_trips, max). Always >= base_open_duration_ms. + std::chrono::nanoseconds ComputeOpenDuration() const; + + // Check whether CLOSED trip conditions are met. Called after every failure. + bool ShouldTripClosed(); + + std::chrono::steady_clock::time_point Now() const; +}; + +} // namespace circuit_breaker diff --git a/include/circuit_breaker/circuit_breaker_state.h b/include/circuit_breaker/circuit_breaker_state.h new file mode 100644 index 00000000..06fa695d --- /dev/null +++ b/include/circuit_breaker/circuit_breaker_state.h @@ -0,0 +1,66 @@ +#pragma once + +#include "common.h" +// , , provided by common.h + +// Circuit breaker state machine and classification enums. Used by +// CircuitBreakerSlice, CircuitBreakerHost, CircuitBreakerManager, and +// ProxyTransaction to talk about state, admission decisions, and +// failure kinds. +// +// Three-state resilience4j-style machine: +// +// CLOSED ──trip── OPEN ──(open_until elapsed)── HALF_OPEN ──success── CLOSED +// │ +// failure +// ▼ +// OPEN +namespace circuit_breaker { + +enum class State : uint8_t { + CLOSED = 0, + OPEN = 1, + HALF_OPEN = 2, +}; + +// Result of CircuitBreakerSlice::TryAcquire. Callers branch on this enum +// only — they never read the CircuitBreakerConfig directly. Dry-run policy +// is encoded in the decision, not in a separate flag. +enum class Decision : uint8_t { + ADMITTED, // CLOSED — proceed to pool + ADMITTED_PROBE, // HALF_OPEN probe slot consumed — proceed, tag as probe + REJECTED_OPEN, // OPEN (or HALF_OPEN-full); ENFORCE — drop with 503 + REJECTED_OPEN_DRYRUN, // Shadow mode: slice would reject but operator asked + // for pass-through. Caller proceeds to pool. Counters + // and log already updated by TryAcquire. 
+}; + +// Failure classification. Only these kinds feed ReportFailure — 4xx and +// local-capacity issues (POOL_EXHAUSTED, QUEUE_TIMEOUT, shutdown) are NOT +// reported as failures. +enum class FailureKind : uint8_t { + CONNECT_FAILURE, + RESPONSE_5XX, + RESPONSE_TIMEOUT, + UPSTREAM_DISCONNECT, +}; + +// Callback fired on every slice state transition. Runs on the slice's +// owning dispatcher thread. Callers can compare old/new to key off a +// specific edge (e.g. CLOSED→OPEN fires wait-queue drain). +// `trigger` is a short static string such as "consecutive" / "rate" / +// "probe_success" / "probe_failure" for logging. +using StateTransitionCallback = + std::function; + +// Convert a state to a short lowercase label for logging. +inline const char* StateName(State s) { + switch (s) { + case State::CLOSED: return "closed"; + case State::OPEN: return "open"; + case State::HALF_OPEN: return "half_open"; + } + return "unknown"; +} + +} // namespace circuit_breaker diff --git a/include/circuit_breaker/circuit_breaker_window.h b/include/circuit_breaker/circuit_breaker_window.h new file mode 100644 index 00000000..12679bcd --- /dev/null +++ b/include/circuit_breaker/circuit_breaker_window.h @@ -0,0 +1,59 @@ +#pragma once + +#include "common.h" +// , provided by common.h + +namespace circuit_breaker { + +// Time-bucketed sliding window. One bucket per second; ring indexed by +// `epoch_sec % window_seconds`. Advances lazily on every Add* call: +// when the incoming `now` is ahead of the recorded head, all buckets +// that have aged out of the window are zeroed before the new increment. +// +// Dispatcher-thread-local by design — NO synchronization. Used from +// CircuitBreakerSlice, which is owned by a single dispatcher. +class CircuitBreakerWindow { +public: + explicit CircuitBreakerWindow(int window_seconds); + + // Record one outcome at `now`. Advances the ring if needed. 
+ void AddSuccess(std::chrono::steady_clock::time_point now); + void AddFailure(std::chrono::steady_clock::time_point now); + + // Observed counts across the current window. `now` is used to expire + // stale buckets before reading. + int64_t TotalCount(std::chrono::steady_clock::time_point now); + int64_t FailureCount(std::chrono::steady_clock::time_point now); + + // Reset the ring to zero. Called on state transitions that should + // start a fresh observation (e.g. HALF_OPEN → CLOSED). + void Reset(); + + // Reinitialize for a new window size (config reload). Resets buckets. + void Resize(int new_window_seconds); + + int window_seconds() const { return window_seconds_; } + +private: + struct Bucket { + int64_t total = 0; + int64_t failures = 0; + }; + + int window_seconds_; + std::vector buckets_; + + // Epoch-seconds of the most recent observation. Used to compute how + // many buckets need to be zeroed on advance. + int64_t head_epoch_sec_ = -1; + + // Advance the ring if `now_sec` is newer than `head_epoch_sec_`, + // zeroing any buckets that aged out. + void Advance(int64_t now_sec); + + // Convert a steady_clock time_point to epoch-seconds (we only + // care about relative seconds; steady_clock is monotonic). + static int64_t ToEpochSec(std::chrono::steady_clock::time_point now); +}; + +} // namespace circuit_breaker diff --git a/include/config/server_config.h b/include/config/server_config.h index 4af21543..7dd949d1 100644 --- a/include/config/server_config.h +++ b/include/config/server_config.h @@ -131,6 +131,49 @@ struct ProxyConfig { bool operator!=(const ProxyConfig& o) const { return !(*this == o); } }; +struct CircuitBreakerConfig { + bool enabled = false; // Opt-in; off by default + bool dry_run = false; // Compute + log, but do not reject + + // Trip conditions (ORed). Either alone is sufficient. 
+ int consecutive_failure_threshold = 5; // Trip after N consecutive failures + int failure_rate_threshold = 50; // Trip when fail_rate >= N percent + int minimum_volume = 20; // Required window volume before + // failure_rate is consulted + int window_seconds = 10; // Sliding-window duration + + // HALF_OPEN admission + int permitted_half_open_calls = 5; + + // Recovery timing. open_duration = min(base * 2^consecutive_trips, max). + int base_open_duration_ms = 5000; + int max_open_duration_ms = 60000; + + // Safety valve (future-proof for load-balanced services; no-op v1). + int max_ejection_percent_per_host_set = 50; + + // Retry budget (orthogonal to the breaker). Caps concurrent retries to + // max(retry_budget_min_concurrency, in_flight * retry_budget_percent/100). + int retry_budget_percent = 20; + int retry_budget_min_concurrency = 3; + + bool operator==(const CircuitBreakerConfig& o) const { + return enabled == o.enabled && + dry_run == o.dry_run && + consecutive_failure_threshold == o.consecutive_failure_threshold && + failure_rate_threshold == o.failure_rate_threshold && + minimum_volume == o.minimum_volume && + window_seconds == o.window_seconds && + permitted_half_open_calls == o.permitted_half_open_calls && + base_open_duration_ms == o.base_open_duration_ms && + max_open_duration_ms == o.max_open_duration_ms && + max_ejection_percent_per_host_set == o.max_ejection_percent_per_host_set && + retry_budget_percent == o.retry_budget_percent && + retry_budget_min_concurrency == o.retry_budget_min_concurrency; + } + bool operator!=(const CircuitBreakerConfig& o) const { return !(*this == o); } +}; + struct UpstreamConfig { std::string name; std::string host; @@ -138,10 +181,12 @@ struct UpstreamConfig { UpstreamTlsConfig tls; UpstreamPoolConfig pool; ProxyConfig proxy; + CircuitBreakerConfig circuit_breaker; bool operator==(const UpstreamConfig& o) const { return name == o.name && host == o.host && port == o.port && - tls == o.tls && pool == o.pool && proxy 
== o.proxy; + tls == o.tls && pool == o.pool && proxy == o.proxy && + circuit_breaker == o.circuit_breaker; } bool operator!=(const UpstreamConfig& o) const { return !(*this == o); } }; diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc new file mode 100644 index 00000000..5a30737b --- /dev/null +++ b/server/circuit_breaker_slice.cc @@ -0,0 +1,277 @@ +#include "circuit_breaker/circuit_breaker_slice.h" +#include "log/logger.h" + +namespace circuit_breaker { + +CircuitBreakerSlice::CircuitBreakerSlice(std::string host_label, + size_t dispatcher_index, + const CircuitBreakerConfig& config, + TimeSource time_source) + : host_label_(std::move(host_label)), + dispatcher_index_(dispatcher_index), + config_(config), + time_source_(std::move(time_source)), + window_(config.window_seconds) { +} + +std::chrono::steady_clock::time_point CircuitBreakerSlice::Now() const { + if (time_source_) return time_source_(); + return std::chrono::steady_clock::now(); +} + +std::chrono::steady_clock::time_point CircuitBreakerSlice::OpenUntil() const { + int64_t ns = open_until_steady_ns_.load(std::memory_order_acquire); + if (ns == 0) return std::chrono::steady_clock::time_point{}; + return std::chrono::steady_clock::time_point(std::chrono::nanoseconds(ns)); +} + +std::chrono::nanoseconds CircuitBreakerSlice::ComputeOpenDuration() const { + // Duration = base << consecutive_trips_ (shift expresses 2^n exponential). + // `consecutive_trips_` is the number of trips observed BEFORE this one, so + // the first trip uses 2^0 = 1x base, the second trip uses 2x, etc. + // Callers must increment consecutive_trips_ AFTER calling this method. + int trips = consecutive_trips_.load(std::memory_order_relaxed); + // Saturate shift at 30 to avoid UB on huge trip counts. 
+ if (trips > 30) trips = 30; + int64_t base_ms = config_.base_open_duration_ms; + int64_t max_ms = config_.max_open_duration_ms; + int64_t scaled_ms = base_ms << trips; + if (scaled_ms < base_ms /* overflow */ || scaled_ms > max_ms) { + scaled_ms = max_ms; + } + return std::chrono::milliseconds(scaled_ms); +} + +bool CircuitBreakerSlice::ShouldTripClosed() { + if (consecutive_failures_ >= config_.consecutive_failure_threshold) { + return true; + } + auto now = Now(); + int64_t total = window_.TotalCount(now); + if (total < config_.minimum_volume) return false; + int64_t fails = window_.FailureCount(now); + // Compare without floating point: fails * 100 >= threshold * total. + return (fails * 100) >= (static_cast(config_.failure_rate_threshold) * total); +} + +void CircuitBreakerSlice::TripClosedToOpen(const char* trigger) { + auto duration = ComputeOpenDuration(); // uses current consecutive_trips_ + consecutive_trips_.fetch_add(1, std::memory_order_relaxed); + auto now = Now(); + auto open_until = now + duration; + int64_t open_until_ns = + std::chrono::duration_cast( + open_until.time_since_epoch()).count(); + + open_until_steady_ns_.store(open_until_ns, std::memory_order_release); + state_.store(State::OPEN, std::memory_order_release); + + // Reset on-trip bookkeeping. 
+ consecutive_failures_ = 0; + half_open_inflight_ = 0; + half_open_successes_ = 0; + half_open_saw_failure_ = false; + + trips_.fetch_add(1, std::memory_order_relaxed); + + logging::Get()->warn( + "circuit breaker tripped {} trigger={} open_for_ms={} consecutive_trips={}", + host_label_, trigger, + std::chrono::duration_cast(duration).count(), + consecutive_trips_.load(std::memory_order_relaxed)); + + if (transition_cb_) transition_cb_(State::CLOSED, State::OPEN, trigger); +} + +void CircuitBreakerSlice::TransitionOpenToHalfOpen() { + state_.store(State::HALF_OPEN, std::memory_order_release); + // Keep open_until_steady_ns_ so observers see the "last open" boundary; + // it's cleared on transition to CLOSED. + half_open_inflight_ = 0; + half_open_successes_ = 0; + half_open_saw_failure_ = false; + + logging::Get()->info( + "circuit breaker half-open {} probes_allowed={}", + host_label_, config_.permitted_half_open_calls); + + if (transition_cb_) { + transition_cb_(State::OPEN, State::HALF_OPEN, "open_elapsed"); + } +} + +void CircuitBreakerSlice::TransitionHalfOpenToClosed() { + state_.store(State::CLOSED, std::memory_order_release); + open_until_steady_ns_.store(0, std::memory_order_release); + consecutive_trips_.store(0, std::memory_order_relaxed); + consecutive_failures_ = 0; + window_.Reset(); + half_open_inflight_ = 0; + half_open_successes_ = 0; + half_open_saw_failure_ = false; + + logging::Get()->info( + "circuit breaker closed {} probes_succeeded={}", + host_label_, config_.permitted_half_open_calls); + + if (transition_cb_) { + transition_cb_(State::HALF_OPEN, State::CLOSED, "probe_success"); + } +} + +void CircuitBreakerSlice::TripHalfOpenToOpen(const char* trigger) { + auto duration = ComputeOpenDuration(); // uses current consecutive_trips_ + consecutive_trips_.fetch_add(1, std::memory_order_relaxed); + auto now = Now(); + auto open_until = now + duration; + int64_t open_until_ns = + std::chrono::duration_cast( + 
open_until.time_since_epoch()).count(); + + open_until_steady_ns_.store(open_until_ns, std::memory_order_release); + state_.store(State::OPEN, std::memory_order_release); + + half_open_inflight_ = 0; + half_open_successes_ = 0; + half_open_saw_failure_ = false; + + trips_.fetch_add(1, std::memory_order_relaxed); + + logging::Get()->warn( + "circuit breaker re-tripped {} trigger={} open_for_ms={} consecutive_trips={}", + host_label_, trigger, + std::chrono::duration_cast(duration).count(), + consecutive_trips_.load(std::memory_order_relaxed)); + + if (transition_cb_) transition_cb_(State::HALF_OPEN, State::OPEN, trigger); +} + +Decision CircuitBreakerSlice::TryAcquire() { + // Disabled fast path — zero overhead when config.enabled=false. + if (!config_.enabled) return Decision::ADMITTED; + + State s = state_.load(std::memory_order_acquire); + + if (s == State::OPEN) { + // Check whether the open window has elapsed. + int64_t open_until_ns = + open_until_steady_ns_.load(std::memory_order_acquire); + int64_t now_ns = std::chrono::duration_cast( + Now().time_since_epoch()).count(); + if (now_ns >= open_until_ns) { + // Transition OPEN → HALF_OPEN on this thread. Because slices are + // dispatcher-thread-pinned, no CAS is needed (a plain store is + // safe under the single-writer invariant). 
+ TransitionOpenToHalfOpen(); + s = State::HALF_OPEN; + } else { + rejected_.fetch_add(1, std::memory_order_relaxed); + if (config_.dry_run) { + logging::Get()->info( + "[dry-run] circuit breaker would reject {} state=open", + host_label_); + return Decision::REJECTED_OPEN_DRYRUN; + } + logging::Get()->debug( + "circuit breaker rejected {} state=open", host_label_); + return Decision::REJECTED_OPEN; + } + } + + if (s == State::HALF_OPEN) { + if (half_open_inflight_ >= config_.permitted_half_open_calls) { + rejected_.fetch_add(1, std::memory_order_relaxed); + if (config_.dry_run) { + logging::Get()->info( + "[dry-run] circuit breaker would reject {} state=half_open_full", + host_label_); + return Decision::REJECTED_OPEN_DRYRUN; + } + logging::Get()->debug( + "circuit breaker rejected {} state=half_open_full", host_label_); + return Decision::REJECTED_OPEN; + } + half_open_inflight_++; + return Decision::ADMITTED_PROBE; + } + + // CLOSED: fast path. + return Decision::ADMITTED; +} + +void CircuitBreakerSlice::ReportSuccess(bool probe) { + if (!config_.enabled) return; + + if (probe) { + probe_successes_.fetch_add(1, std::memory_order_relaxed); + // Count the completed probe regardless of saw_failure state (we still + // decrement inflight to release the slot). + if (half_open_inflight_ > 0) half_open_inflight_--; + if (half_open_saw_failure_) { + // A sibling probe already failed; whichever probe finishes last + // transitions to OPEN. Handle here only if this is the last probe. + if (half_open_inflight_ == 0) { + TripHalfOpenToOpen("probe_fail"); + } + return; + } + half_open_successes_++; + if (half_open_successes_ >= config_.permitted_half_open_calls) { + TransitionHalfOpenToClosed(); + } + return; + } + + // CLOSED success: reset consecutive counter, record in window. 
+ consecutive_failures_ = 0; + window_.AddSuccess(Now()); +} + +void CircuitBreakerSlice::ReportFailure(FailureKind kind, bool probe) { + (void)kind; // Kind is used by higher layers for logging; slice itself + // treats all failures the same way for trip math. + if (!config_.enabled) return; + + if (probe) { + probe_failures_.fetch_add(1, std::memory_order_relaxed); + if (half_open_inflight_ > 0) half_open_inflight_--; + half_open_saw_failure_ = true; + // On the last probe (or if all remaining complete) transition OPEN. + if (half_open_inflight_ == 0) { + TripHalfOpenToOpen("probe_fail"); + } + return; + } + + // CLOSED failure path. + consecutive_failures_++; + window_.AddFailure(Now()); + + if (ShouldTripClosed()) { + const char* trigger = + (consecutive_failures_ >= config_.consecutive_failure_threshold) + ? "consecutive" : "rate"; + TripClosedToOpen(trigger); + } +} + +void CircuitBreakerSlice::Reload(const CircuitBreakerConfig& new_config) { + bool window_changed = (config_.window_seconds != new_config.window_seconds); + config_ = new_config; + if (window_changed) window_.Resize(new_config.window_seconds); + // Live state preserved — operator expects new thresholds to apply to the + // next evaluation, not to reset an in-progress trip. 
+ + logging::Get()->info( + "circuit breaker config applied {} enabled={} window_s={} " + "fail_rate={} consec_threshold={}", + host_label_, new_config.enabled, new_config.window_seconds, + new_config.failure_rate_threshold, + new_config.consecutive_failure_threshold); +} + +void CircuitBreakerSlice::SetTransitionCallback(StateTransitionCallback cb) { + transition_cb_ = std::move(cb); +} + +} // namespace circuit_breaker diff --git a/server/circuit_breaker_window.cc b/server/circuit_breaker_window.cc new file mode 100644 index 00000000..14ea34a5 --- /dev/null +++ b/server/circuit_breaker_window.cc @@ -0,0 +1,81 @@ +#include "circuit_breaker/circuit_breaker_window.h" + +namespace circuit_breaker { + +CircuitBreakerWindow::CircuitBreakerWindow(int window_seconds) + : window_seconds_(window_seconds), + buckets_(window_seconds > 0 ? static_cast(window_seconds) : 1) { +} + +int64_t CircuitBreakerWindow::ToEpochSec( + std::chrono::steady_clock::time_point now) { + return std::chrono::duration_cast( + now.time_since_epoch()).count(); +} + +void CircuitBreakerWindow::Advance(int64_t now_sec) { + if (head_epoch_sec_ < 0) { + head_epoch_sec_ = now_sec; + return; + } + if (now_sec <= head_epoch_sec_) return; + int64_t delta = now_sec - head_epoch_sec_; + // If delta exceeds window size, everything is stale — full reset. + if (delta >= window_seconds_) { + for (auto& b : buckets_) { b.total = 0; b.failures = 0; } + } else { + // Zero buckets from head+1..now_sec inclusive. 
+ for (int64_t s = head_epoch_sec_ + 1; s <= now_sec; ++s) { + size_t idx = static_cast(s % window_seconds_); + buckets_[idx].total = 0; + buckets_[idx].failures = 0; + } + } + head_epoch_sec_ = now_sec; +} + +void CircuitBreakerWindow::AddSuccess( + std::chrono::steady_clock::time_point now) { + int64_t now_sec = ToEpochSec(now); + Advance(now_sec); + size_t idx = static_cast(now_sec % window_seconds_); + buckets_[idx].total++; +} + +void CircuitBreakerWindow::AddFailure( + std::chrono::steady_clock::time_point now) { + int64_t now_sec = ToEpochSec(now); + Advance(now_sec); + size_t idx = static_cast(now_sec % window_seconds_); + buckets_[idx].total++; + buckets_[idx].failures++; +} + +int64_t CircuitBreakerWindow::TotalCount( + std::chrono::steady_clock::time_point now) { + Advance(ToEpochSec(now)); + int64_t sum = 0; + for (const auto& b : buckets_) sum += b.total; + return sum; +} + +int64_t CircuitBreakerWindow::FailureCount( + std::chrono::steady_clock::time_point now) { + Advance(ToEpochSec(now)); + int64_t sum = 0; + for (const auto& b : buckets_) sum += b.failures; + return sum; +} + +void CircuitBreakerWindow::Reset() { + for (auto& b : buckets_) { b.total = 0; b.failures = 0; } + head_epoch_sec_ = -1; +} + +void CircuitBreakerWindow::Resize(int new_window_seconds) { + window_seconds_ = new_window_seconds > 0 ? 
new_window_seconds : 1; + buckets_.assign(static_cast(window_seconds_), Bucket{}); + head_epoch_sec_ = -1; +} + +} // namespace circuit_breaker diff --git a/server/config_loader.cc b/server/config_loader.cc index 9ae4e212..c17a544d 100644 --- a/server/config_loader.cc +++ b/server/config_loader.cc @@ -262,6 +262,36 @@ ServerConfig ConfigLoader::LoadFromString(const std::string& json_str) { } } + if (item.contains("circuit_breaker")) { + if (!item["circuit_breaker"].is_object()) + throw std::runtime_error("upstream circuit_breaker must be an object"); + auto& cb = item["circuit_breaker"]; + upstream.circuit_breaker.enabled = + cb.value("enabled", false); + upstream.circuit_breaker.dry_run = + cb.value("dry_run", false); + upstream.circuit_breaker.consecutive_failure_threshold = + cb.value("consecutive_failure_threshold", 5); + upstream.circuit_breaker.failure_rate_threshold = + cb.value("failure_rate_threshold", 50); + upstream.circuit_breaker.minimum_volume = + cb.value("minimum_volume", 20); + upstream.circuit_breaker.window_seconds = + cb.value("window_seconds", 10); + upstream.circuit_breaker.permitted_half_open_calls = + cb.value("permitted_half_open_calls", 5); + upstream.circuit_breaker.base_open_duration_ms = + cb.value("base_open_duration_ms", 5000); + upstream.circuit_breaker.max_open_duration_ms = + cb.value("max_open_duration_ms", 60000); + upstream.circuit_breaker.max_ejection_percent_per_host_set = + cb.value("max_ejection_percent_per_host_set", 50); + upstream.circuit_breaker.retry_budget_percent = + cb.value("retry_budget_percent", 20); + upstream.circuit_breaker.retry_budget_min_concurrency = + cb.value("retry_budget_min_concurrency", 3); + } + config.upstreams.push_back(std::move(upstream)); } } @@ -791,6 +821,62 @@ void ConfigLoader::Validate(const ServerConfig& config) { idx + " ('" + u.name + "'): proxy.retry.max_retries must be >= 0 and <= 10"); } + + // Circuit breaker validation + { + const auto& cb = u.circuit_breaker; + if 
(cb.consecutive_failure_threshold < 1) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.consecutive_failure_threshold must be >= 1"); + } + if (cb.failure_rate_threshold < 0 || cb.failure_rate_threshold > 100) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.failure_rate_threshold must be in [0, 100]"); + } + if (cb.minimum_volume < 1) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.minimum_volume must be >= 1"); + } + if (cb.window_seconds < 1 || cb.window_seconds > 3600) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.window_seconds must be in [1, 3600]"); + } + if (cb.permitted_half_open_calls < 1) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.permitted_half_open_calls must be >= 1"); + } + if (cb.base_open_duration_ms < 100) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.base_open_duration_ms must be >= 100"); + } + if (cb.max_open_duration_ms < cb.base_open_duration_ms) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.max_open_duration_ms must be >= base_open_duration_ms"); + } + if (cb.max_ejection_percent_per_host_set < 0 || + cb.max_ejection_percent_per_host_set > 100) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.max_ejection_percent_per_host_set must be in [0, 100]"); + } + if (cb.retry_budget_percent < 0 || cb.retry_budget_percent > 100) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.retry_budget_percent must be in [0, 100]"); + } + if (cb.retry_budget_min_concurrency < 0) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.retry_budget_min_concurrency must be >= 0"); + } + } // Validate method names — reject unknowns and duplicates. // Duplicates would cause RouteAsync to throw at startup. 
{ @@ -1052,6 +1138,31 @@ std::string ConfigLoader::ToJson(const ServerConfig& config) { uj["proxy"] = pj; } + // Always serialize circuit_breaker — same rationale as proxy block. + if (u.circuit_breaker != CircuitBreakerConfig{}) { + nlohmann::json cbj; + cbj["enabled"] = u.circuit_breaker.enabled; + cbj["dry_run"] = u.circuit_breaker.dry_run; + cbj["consecutive_failure_threshold"] = + u.circuit_breaker.consecutive_failure_threshold; + cbj["failure_rate_threshold"] = + u.circuit_breaker.failure_rate_threshold; + cbj["minimum_volume"] = u.circuit_breaker.minimum_volume; + cbj["window_seconds"] = u.circuit_breaker.window_seconds; + cbj["permitted_half_open_calls"] = + u.circuit_breaker.permitted_half_open_calls; + cbj["base_open_duration_ms"] = + u.circuit_breaker.base_open_duration_ms; + cbj["max_open_duration_ms"] = + u.circuit_breaker.max_open_duration_ms; + cbj["max_ejection_percent_per_host_set"] = + u.circuit_breaker.max_ejection_percent_per_host_set; + cbj["retry_budget_percent"] = + u.circuit_breaker.retry_budget_percent; + cbj["retry_budget_min_concurrency"] = + u.circuit_breaker.retry_budget_min_concurrency; + uj["circuit_breaker"] = cbj; + } j["upstreams"].push_back(uj); } diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h new file mode 100644 index 00000000..bd932a28 --- /dev/null +++ b/test/circuit_breaker_test.h @@ -0,0 +1,647 @@ +#pragma once + +#include "test_framework.h" +#include "config/server_config.h" +#include "circuit_breaker/circuit_breaker_state.h" +#include "circuit_breaker/circuit_breaker_window.h" +#include "circuit_breaker/circuit_breaker_slice.h" + +#include +#include +#include + +namespace CircuitBreakerTests { + +using circuit_breaker::CircuitBreakerSlice; +using circuit_breaker::CircuitBreakerWindow; +using circuit_breaker::Decision; +using circuit_breaker::FailureKind; +using circuit_breaker::State; + +// A simple mock clock that advances only when the test tells it to. 
+class MockClock { +public: + std::chrono::steady_clock::time_point now{ + // Choose a non-zero base so 0 is distinguishable from "not OPEN". + std::chrono::steady_clock::time_point(std::chrono::seconds(1'000'000)) + }; + void Advance(std::chrono::milliseconds ms) { now += ms; } + void AdvanceSec(int seconds) { now += std::chrono::seconds(seconds); } + std::chrono::steady_clock::time_point operator()() const { return now; } +}; + +// Build a config with default values — tests override specific fields. +static CircuitBreakerConfig DefaultEnabledConfig() { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 5; + cb.failure_rate_threshold = 50; + cb.minimum_volume = 20; + cb.window_seconds = 10; + cb.permitted_half_open_calls = 5; + cb.base_open_duration_ms = 5000; + cb.max_open_duration_ms = 60000; + return cb; +} + +// ============================================================================ +// State machine tests +// ============================================================================ + +void TestDisabledFastPath() { + std::cout << "\n[TEST] CB: Disabled fast path..." << std::endl; + try { + CircuitBreakerConfig cb; // enabled=false by default + auto clock = std::make_shared<MockClock>(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + bool pass = slice.TryAcquire() == Decision::ADMITTED && + slice.CurrentState() == State::CLOSED; + + // Reporting 100 failures must not trip. 
+ for (int i = 0; i < 100; ++i) { + slice.ReportFailure(FailureKind::CONNECT_FAILURE, false); + } + pass = pass && slice.CurrentState() == State::CLOSED && + slice.Trips() == 0; + + TestFramework::RecordTest("CB: disabled fast path", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: disabled fast path", false, e.what(), + TestFramework::TestCategory::OTHER); + } +} + +void TestClosedStaysClosedBelowConsecutiveThreshold() { + std::cout << "\n[TEST] CB: 4 failures below threshold..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 4; ++i) { + slice.ReportFailure(FailureKind::CONNECT_FAILURE, false); + } + bool pass = slice.CurrentState() == State::CLOSED && + slice.TryAcquire() == Decision::ADMITTED && + slice.Trips() == 0; + TestFramework::RecordTest("CB: 4 failures below threshold", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: 4 failures below threshold", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestConsecutiveFailureTrip() { + std::cout << "\n[TEST] CB: 5 consecutive failures trip..." 
<< std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + bool pass = slice.CurrentState() == State::OPEN && + slice.Trips() == 1 && + slice.TryAcquire() == Decision::REJECTED_OPEN; + TestFramework::RecordTest("CB: 5 consecutive failures trip", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: 5 consecutive failures trip", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestFailureRateTrip() { + std::cout << "\n[TEST] CB: failure-rate trip (50% of 20)..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.consecutive_failure_threshold = 1000; // disable consec path + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Alternate 10 failures and 10 successes within the same second — + // ratio = 50%, total = 20 (>= minimum_volume). + for (int i = 0; i < 10; ++i) { + slice.ReportSuccess(false); + } + // A success between-failures clears consecutive_failures_, confirming + // only rate path can trip here. + for (int i = 0; i < 9; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + // Still CLOSED — 9/19 < 50%. + bool pass_pre = slice.CurrentState() == State::CLOSED; + // 10th failure brings ratio to 10/20 = 50% exactly — tripper. 
+ slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + bool pass = pass_pre && slice.CurrentState() == State::OPEN && + slice.Trips() == 1; + TestFramework::RecordTest("CB: failure-rate trip (50% of 20)", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: failure-rate trip (50% of 20)", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestMinimumVolumeGate() { + std::cout << "\n[TEST] CB: minimum_volume gate..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.consecutive_failure_threshold = 1000; // disable consec path + cb.minimum_volume = 20; + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // 19 total calls, all failures — should NOT trip (below volume). + for (int i = 0; i < 19; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + bool pass = slice.CurrentState() == State::CLOSED && slice.Trips() == 0; + TestFramework::RecordTest("CB: minimum_volume gate", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: minimum_volume gate", false, e.what(), + TestFramework::TestCategory::OTHER); + } +} + +void TestOpenBeforeDurationStaysOpen() { + std::cout << "\n[TEST] CB: OPEN rejects before elapsed..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + // Advance less than base_open_duration_ms (5000ms). 
+ clock->Advance(std::chrono::milliseconds(2000)); + Decision d = slice.TryAcquire(); + bool pass = d == Decision::REJECTED_OPEN && + slice.CurrentState() == State::OPEN; + TestFramework::RecordTest("CB: OPEN rejects before elapsed", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: OPEN rejects before elapsed", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestOpenToHalfOpenAfterDuration() { + std::cout << "\n[TEST] CB: OPEN → HALF_OPEN after duration..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + Decision d = slice.TryAcquire(); + bool pass = d == Decision::ADMITTED_PROBE && + slice.CurrentState() == State::HALF_OPEN; + TestFramework::RecordTest("CB: OPEN -> HALF_OPEN after duration", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: OPEN -> HALF_OPEN after duration", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestHalfOpenAllProbesSucceed() { + std::cout << "\n[TEST] CB: HALF_OPEN 5 probe successes close..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + + // Take 5 probes; report success on each. 
+ for (int i = 0; i < cb.permitted_half_open_calls; ++i) { + Decision d = slice.TryAcquire(); + if (d != Decision::ADMITTED_PROBE) { + TestFramework::RecordTest( + "CB: HALF_OPEN 5 probe successes close", false, + "probe " + std::to_string(i) + " not ADMITTED_PROBE", + TestFramework::TestCategory::OTHER); + return; + } + slice.ReportSuccess(true); + } + bool pass = slice.CurrentState() == State::CLOSED && + slice.ProbeSuccesses() == 5; + TestFramework::RecordTest("CB: HALF_OPEN 5 probe successes close", + pass, "", TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: HALF_OPEN 5 probe successes close", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestHalfOpenProbeFailureReopens() { + std::cout << "\n[TEST] CB: HALF_OPEN single probe fail re-opens..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + + // Take 1 probe, fail it. + Decision d = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, true); + bool pass = d == Decision::ADMITTED_PROBE && + slice.CurrentState() == State::OPEN && + slice.Trips() == 2 && // initial trip + re-trip + slice.ProbeFailures() == 1; + TestFramework::RecordTest("CB: HALF_OPEN probe fail re-opens", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: HALF_OPEN probe fail re-opens", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestHalfOpenExhaustedSlotsRejected() { + std::cout << "\n[TEST] CB: HALF_OPEN over capacity rejects..." 
<< std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + // Take 5 probes but DON'T report outcomes yet. + for (int i = 0; i < 5; ++i) slice.TryAcquire(); + // 6th TryAcquire must reject (all slots taken). + Decision d = slice.TryAcquire(); + bool pass = d == Decision::REJECTED_OPEN; + TestFramework::RecordTest("CB: HALF_OPEN over capacity rejects", + pass, "", TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: HALF_OPEN over capacity rejects", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestExponentialBackoff() { + std::cout << "\n[TEST] CB: exponential backoff progression..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.base_open_duration_ms = 1000; + cb.max_open_duration_ms = 8000; + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + auto trip_then_probe_fail = [&]() { + // Reach OPEN. + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + }; + auto measure_open_ms = [&]() { + // open_until - now at the instant of the trip. + auto open_until = slice.OpenUntil(); + auto remaining = open_until - clock->now; + return std::chrono::duration_cast( + remaining).count(); + }; + + // Trip 1 — expect ~1000ms. + trip_then_probe_fail(); + int64_t d1 = measure_open_ms(); + // Move to HALF_OPEN and fail the probe → trip 2. 
+ clock->Advance(std::chrono::milliseconds(d1 + 1)); + slice.TryAcquire(); // HALF_OPEN, ADMITTED_PROBE + slice.ReportFailure(FailureKind::RESPONSE_5XX, true); + int64_t d2 = measure_open_ms(); + clock->Advance(std::chrono::milliseconds(d2 + 1)); + slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, true); + int64_t d3 = measure_open_ms(); + clock->Advance(std::chrono::milliseconds(d3 + 1)); + slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, true); + int64_t d4 = measure_open_ms(); + clock->Advance(std::chrono::milliseconds(d4 + 1)); + slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, true); + int64_t d5 = measure_open_ms(); + + // Expect 1000, 2000, 4000, 8000, 8000 (capped). + bool pass = d1 == 1000 && d2 == 2000 && d3 == 4000 && + d4 == 8000 && d5 == 8000; + std::string err = "d1=" + std::to_string(d1) + " d2=" + std::to_string(d2) + + " d3=" + std::to_string(d3) + " d4=" + std::to_string(d4) + + " d5=" + std::to_string(d5); + TestFramework::RecordTest("CB: exponential backoff", + pass, pass ? "" : err, TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: exponential backoff", false, e.what(), + TestFramework::TestCategory::OTHER); + } +} + +void TestResetOnClose() { + std::cout << "\n[TEST] CB: consecutive_trips resets on close..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.base_open_duration_ms = 1000; + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Trip 1. + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + clock->Advance(std::chrono::milliseconds(1001)); + // Move to HALF_OPEN. + for (int i = 0; i < 5; ++i) { + slice.TryAcquire(); + slice.ReportSuccess(true); + } + // Now CLOSED. Trip again — expect base_duration again (not doubled). 
+ for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + auto open_until = slice.OpenUntil(); + auto remaining = open_until - clock->now; + int64_t d_after_close = std::chrono::duration_cast< + std::chrono::milliseconds>(remaining).count(); + bool pass = d_after_close == 1000; + TestFramework::RecordTest("CB: trips reset on close", pass, + pass ? "" : "expected 1000ms, got " + std::to_string(d_after_close), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: trips reset on close", false, e.what(), + TestFramework::TestCategory::OTHER); + } +} + +// ============================================================================ +// Window tests +// ============================================================================ + +void TestWindowBucketByCurrentSecond() { + std::cout << "\n[TEST] CB Window: bucket by current second..." << std::endl; + try { + CircuitBreakerWindow w(10); + auto t0 = std::chrono::steady_clock::time_point(std::chrono::seconds(100)); + w.AddSuccess(t0); + w.AddFailure(t0); + w.AddFailure(t0); + bool pass = w.TotalCount(t0) == 3 && w.FailureCount(t0) == 2; + TestFramework::RecordTest("CB Window: bucket by current second", pass, + "", TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB Window: bucket by current second", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestWindowAdvanceSkipsStale() { + std::cout << "\n[TEST] CB Window: advance skips stale..." << std::endl; + try { + CircuitBreakerWindow w(10); + auto t0 = std::chrono::steady_clock::time_point(std::chrono::seconds(100)); + w.AddFailure(t0); // bucket 100%10 = 0 + auto t1 = t0 + std::chrono::seconds(15); // beyond window + // After long idle, incoming record should see zero history. 
+ bool pre = w.TotalCount(t1) == 0; + w.AddSuccess(t1); + bool pass = pre && w.TotalCount(t1) == 1 && w.FailureCount(t1) == 0; + TestFramework::RecordTest("CB Window: advance skips stale", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB Window: advance skips stale", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestWindowPartialExpiry() { + std::cout << "\n[TEST] CB Window: partial expiry..." << std::endl; + try { + CircuitBreakerWindow w(10); + auto t0 = std::chrono::steady_clock::time_point(std::chrono::seconds(100)); + w.AddFailure(t0); // sec 100 + auto t1 = t0 + std::chrono::seconds(5); + w.AddFailure(t1); // sec 105 + auto t2 = t0 + std::chrono::seconds(11); + // sec 100 is now out of window (100 + 10 <= 111 - 1 = 110). So: + // bucket 0 (sec 100 or sec 110) would have been zeroed when advancing + // from head=105 past sec 110. + bool pass = w.TotalCount(t2) == 1 && w.FailureCount(t2) == 1; + TestFramework::RecordTest("CB Window: partial expiry", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB Window: partial expiry", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestWindowReset() { + std::cout << "\n[TEST] CB Window: reset clears..." 
<< std::endl; + try { + CircuitBreakerWindow w(10); + auto t0 = std::chrono::steady_clock::time_point(std::chrono::seconds(100)); + w.AddFailure(t0); w.AddSuccess(t0); w.AddFailure(t0); + w.Reset(); + bool pass = w.TotalCount(t0) == 0 && w.FailureCount(t0) == 0; + TestFramework::RecordTest("CB Window: reset clears", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB Window: reset clears", false, e.what(), + TestFramework::TestCategory::OTHER); + } +} + +// ============================================================================ +// Dry-run + Reload + Edge cases +// ============================================================================ + +void TestDryRunAdmits() { + std::cout << "\n[TEST] CB: dry_run admits through OPEN..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.dry_run = true; + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + // OPEN + dry_run → REJECTED_OPEN_DRYRUN (caller proceeds). + Decision d = slice.TryAcquire(); + bool pass = d == Decision::REJECTED_OPEN_DRYRUN && + slice.CurrentState() == State::OPEN && + slice.Rejected() == 1; + TestFramework::RecordTest("CB: dry_run admits through OPEN", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: dry_run admits through OPEN", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestReloadPreservesState() { + std::cout << "\n[TEST] CB: reload preserves live state..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + // OPEN at this point. 
+ auto cb2 = cb; + cb2.consecutive_failure_threshold = 2; // tighter + cb2.window_seconds = 30; // triggers ring resize + slice.Reload(cb2); + // Still OPEN immediately after reload — live state preserved. + bool pass = slice.CurrentState() == State::OPEN; + TestFramework::RecordTest("CB: reload preserves live state", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: reload preserves live state", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestConsecutiveThresholdOne() { + std::cout << "\n[TEST] CB: threshold=1 single failure trips..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.consecutive_failure_threshold = 1; + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + bool pass = slice.CurrentState() == State::OPEN && slice.Trips() == 1; + TestFramework::RecordTest("CB: threshold=1 single failure trips", + pass, "", TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: threshold=1 single failure trips", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestSuccessClearsConsecutive() { + std::cout << "\n[TEST] CB: success clears consecutive..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 4; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + slice.ReportSuccess(false); // resets consecutive + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + // consecutive is back to 1, no trip. 
+ bool pass = slice.CurrentState() == State::CLOSED; + TestFramework::RecordTest("CB: success clears consecutive", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: success clears consecutive", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestTransitionCallbackInvoked() { + std::cout << "\n[TEST] CB: transition callback invoked..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + int closed_to_open = 0; + int open_to_halfopen = 0; + int halfopen_to_closed = 0; + slice.SetTransitionCallback( + [&](State o, State n, const char*) { + if (o == State::CLOSED && n == State::OPEN) closed_to_open++; + else if (o == State::OPEN && n == State::HALF_OPEN) open_to_halfopen++; + else if (o == State::HALF_OPEN && n == State::CLOSED) halfopen_to_closed++; + }); + + // Full cycle. + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + for (int i = 0; i < cb.permitted_half_open_calls; ++i) { + slice.TryAcquire(); + slice.ReportSuccess(true); + } + bool pass = closed_to_open == 1 && open_to_halfopen == 1 && + halfopen_to_closed == 1; + TestFramework::RecordTest("CB: transition callback invoked", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: transition callback invoked", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Run all circuit breaker unit tests. 
+void RunAllTests() { + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "CIRCUIT BREAKER - UNIT TESTS" << std::endl; + std::cout << std::string(60, '=') << std::endl; + + TestDisabledFastPath(); + TestClosedStaysClosedBelowConsecutiveThreshold(); + TestConsecutiveFailureTrip(); + TestFailureRateTrip(); + TestMinimumVolumeGate(); + TestOpenBeforeDurationStaysOpen(); + TestOpenToHalfOpenAfterDuration(); + TestHalfOpenAllProbesSucceed(); + TestHalfOpenProbeFailureReopens(); + TestHalfOpenExhaustedSlotsRejected(); + TestExponentialBackoff(); + TestResetOnClose(); + TestWindowBucketByCurrentSecond(); + TestWindowAdvanceSkipsStale(); + TestWindowPartialExpiry(); + TestWindowReset(); + TestDryRunAdmits(); + TestReloadPreservesState(); + TestConsecutiveThresholdOne(); + TestSuccessClearsConsecutive(); + TestTransitionCallbackInvoked(); +} + +} // namespace CircuitBreakerTests diff --git a/test/config_test.h b/test/config_test.h index cfb90c7a..213fd8ac 100644 --- a/test/config_test.h +++ b/test/config_test.h @@ -348,6 +348,219 @@ namespace ConfigTests { } } + // Test 9: Circuit breaker defaults + void TestCircuitBreakerDefaults() { + std::cout << "\n[TEST] Circuit Breaker Defaults..." << std::endl; + try { + CircuitBreakerConfig cb; // value-initialized defaults + bool pass = cb.enabled == false && + cb.dry_run == false && + cb.consecutive_failure_threshold == 5 && + cb.failure_rate_threshold == 50 && + cb.minimum_volume == 20 && + cb.window_seconds == 10 && + cb.permitted_half_open_calls == 5 && + cb.base_open_duration_ms == 5000 && + cb.max_open_duration_ms == 60000 && + cb.max_ejection_percent_per_host_set == 50 && + cb.retry_budget_percent == 20 && + cb.retry_budget_min_concurrency == 3; + TestFramework::RecordTest("Circuit Breaker Defaults", pass, + pass ? 
"" : "default value mismatch", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("Circuit Breaker Defaults", false, e.what(), + TestFramework::TestCategory::OTHER); + } + } + + // Test 10: Circuit breaker JSON parsing (populated block) + void TestCircuitBreakerJsonParse() { + std::cout << "\n[TEST] Circuit Breaker JSON Parse..." << std::endl; + try { + std::string json = R"({ + "upstreams": [{ + "name": "svc", + "host": "10.0.0.1", + "port": 8080, + "circuit_breaker": { + "enabled": true, + "dry_run": true, + "consecutive_failure_threshold": 7, + "failure_rate_threshold": 75, + "minimum_volume": 50, + "window_seconds": 30, + "permitted_half_open_calls": 3, + "base_open_duration_ms": 2000, + "max_open_duration_ms": 120000, + "max_ejection_percent_per_host_set": 33, + "retry_budget_percent": 10, + "retry_budget_min_concurrency": 5 + } + }] + })"; + ServerConfig config = ConfigLoader::LoadFromString(json); + const auto& cb = config.upstreams.at(0).circuit_breaker; + bool pass = cb.enabled == true && cb.dry_run == true && + cb.consecutive_failure_threshold == 7 && + cb.failure_rate_threshold == 75 && + cb.minimum_volume == 50 && + cb.window_seconds == 30 && + cb.permitted_half_open_calls == 3 && + cb.base_open_duration_ms == 2000 && + cb.max_open_duration_ms == 120000 && + cb.max_ejection_percent_per_host_set == 33 && + cb.retry_budget_percent == 10 && + cb.retry_budget_min_concurrency == 5; + TestFramework::RecordTest("Circuit Breaker JSON Parse", pass, + pass ? "" : "parsed values mismatch", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("Circuit Breaker JSON Parse", false, e.what(), + TestFramework::TestCategory::OTHER); + } + } + + // Test 11: Circuit breaker JSON partial block uses defaults for missing fields + void TestCircuitBreakerJsonPartial() { + std::cout << "\n[TEST] Circuit Breaker JSON Partial..." 
<< std::endl; + try { + std::string json = R"({ + "upstreams": [{ + "name": "svc", "host": "10.0.0.1", "port": 8080, + "circuit_breaker": {"enabled": true} + }] + })"; + ServerConfig config = ConfigLoader::LoadFromString(json); + const auto& cb = config.upstreams.at(0).circuit_breaker; + bool pass = cb.enabled == true && + cb.consecutive_failure_threshold == 5 && + cb.window_seconds == 10; + TestFramework::RecordTest("Circuit Breaker JSON Partial", pass, + pass ? "" : "expected defaults for unset fields", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("Circuit Breaker JSON Partial", false, e.what(), + TestFramework::TestCategory::OTHER); + } + } + + // Test 12: Round-trip via ToJson() preserves circuit_breaker + void TestCircuitBreakerJsonRoundTrip() { + std::cout << "\n[TEST] Circuit Breaker JSON Round-Trip..." << std::endl; + try { + ServerConfig in; + UpstreamConfig u; + u.name = "svc"; u.host = "10.0.0.1"; u.port = 8080; + u.circuit_breaker.enabled = true; + u.circuit_breaker.window_seconds = 25; + u.circuit_breaker.failure_rate_threshold = 42; + in.upstreams.push_back(u); + + std::string serialized = ConfigLoader::ToJson(in); + ServerConfig out = ConfigLoader::LoadFromString(serialized); + + const auto& cb = out.upstreams.at(0).circuit_breaker; + bool pass = cb.enabled == true && cb.window_seconds == 25 && + cb.failure_rate_threshold == 42; + TestFramework::RecordTest("Circuit Breaker JSON Round-Trip", pass, + pass ? "" : "round-trip lost fields", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("Circuit Breaker JSON Round-Trip", false, + e.what(), TestFramework::TestCategory::OTHER); + } + } + + // Helper: assert a circuit_breaker JSON override is rejected by Validate(). 
+ static void ExpectValidationFailure(const std::string& name, + const std::string& cb_json_override, + const std::string& expected_substr) { + std::string json = std::string(R"({ + "upstreams": [{ + "name": "svc", "host": "10.0.0.1", "port": 8080, + "circuit_breaker": )") + cb_json_override + R"( + }] + })"; + try { + ServerConfig config = ConfigLoader::LoadFromString(json); + ConfigLoader::Validate(config); + TestFramework::RecordTest(name, false, + "expected validation failure containing: " + expected_substr, + TestFramework::TestCategory::OTHER); + } catch (const std::invalid_argument& e) { + std::string msg(e.what()); + bool pass = msg.find(expected_substr) != std::string::npos; + TestFramework::RecordTest(name, pass, + pass ? "" : std::string("wrong error: ") + msg, + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest(name, false, + std::string("wrong exception type: ") + e.what(), + TestFramework::TestCategory::OTHER); + } + } + + // Test 13: Validation rejects bad circuit_breaker fields + void TestCircuitBreakerValidation() { + std::cout << "\n[TEST] Circuit Breaker Validation..." 
<< std::endl; + ExpectValidationFailure("CB Validation: consecutive_failure_threshold<1", + R"({"consecutive_failure_threshold": 0})", + "consecutive_failure_threshold must be >= 1"); + ExpectValidationFailure("CB Validation: failure_rate_threshold>100", + R"({"failure_rate_threshold": 101})", + "failure_rate_threshold must be in [0, 100]"); + ExpectValidationFailure("CB Validation: minimum_volume<1", + R"({"minimum_volume": 0})", + "minimum_volume must be >= 1"); + ExpectValidationFailure("CB Validation: window_seconds<1", + R"({"window_seconds": 0})", + "window_seconds must be in [1, 3600]"); + ExpectValidationFailure("CB Validation: window_seconds>3600", + R"({"window_seconds": 3601})", + "window_seconds must be in [1, 3600]"); + ExpectValidationFailure("CB Validation: base_open_duration_ms<100", + R"({"base_open_duration_ms": 50})", + "base_open_duration_ms must be >= 100"); + ExpectValidationFailure("CB Validation: max<base", + R"({"max_open_duration_ms": 1000})", + "max_open_duration_ms must be >= base_open_duration_ms"); + ExpectValidationFailure("CB Validation: retry_budget_percent>100", + R"({"retry_budget_percent": 200})", + "retry_budget_percent must be in [0, 100]"); + ExpectValidationFailure("CB Validation: retry_budget_min_concurrency<0", + R"({"retry_budget_min_concurrency": -1})", + "retry_budget_min_concurrency must be >= 0"); + ExpectValidationFailure("CB Validation: max_ejection_percent>100", + R"({"max_ejection_percent_per_host_set": 150})", + "max_ejection_percent_per_host_set must be in [0, 100]"); + ExpectValidationFailure("CB Validation: permitted_half_open_calls<1", + R"({"permitted_half_open_calls": 0})", + "permitted_half_open_calls must be >= 1"); + } + + // Test 14: Equality operator covers circuit_breaker field + void TestCircuitBreakerEquality() { + std::cout << "\n[TEST] Circuit Breaker Equality..." 
<< std::endl; + try { + UpstreamConfig a; + a.name = "svc"; a.host = "h"; a.port = 80; + UpstreamConfig b = a; + bool equal_default = (a == b); + + b.circuit_breaker.enabled = true; + bool not_equal_after_diff = (a != b); + + bool pass = equal_default && not_equal_after_diff; + TestFramework::RecordTest("Circuit Breaker Equality", pass, + pass ? "" : "operator== failed for circuit_breaker", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("Circuit Breaker Equality", false, e.what(), + TestFramework::TestCategory::OTHER); + } + } + // Run all config tests void RunAllTests() { std::cout << "\n" << std::string(60, '=') << std::endl; @@ -362,6 +575,14 @@ namespace ConfigTests { TestValidationTlsNoCert(); TestEnvOverrides(); TestMissingFile(); + + // Phase 1: Circuit breaker config + TestCircuitBreakerDefaults(); + TestCircuitBreakerJsonParse(); + TestCircuitBreakerJsonPartial(); + TestCircuitBreakerJsonRoundTrip(); + TestCircuitBreakerValidation(); + TestCircuitBreakerEquality(); } } // namespace ConfigTests diff --git a/test/run_test.cc b/test/run_test.cc index 4edb0139..3d55f06f 100644 --- a/test/run_test.cc +++ b/test/run_test.cc @@ -13,6 +13,7 @@ #include "upstream_pool_test.h" #include "proxy_test.h" #include "rate_limit_test.h" +#include "circuit_breaker_test.h" #include "test_framework.h" #include #include @@ -77,6 +78,9 @@ void RunAllTest(){ // Run rate limit tests RateLimitTests::RunAllTests(); + // Run circuit breaker tests + CircuitBreakerTests::RunAllTests(); + std::cout << "====================================\n" << std::endl; } @@ -155,6 +159,9 @@ int main(int argc, char* argv[]) { // Run rate limit tests }else if(mode == "rate_limit" || mode == "-L"){ RateLimitTests::RunAllTests(); + // Run circuit breaker tests + }else if(mode == "circuit_breaker" || mode == "-B"){ + CircuitBreakerTests::RunAllTests(); // Show help }else if(mode == "help" || mode == "-h" || mode == "--help"){ PrintUsage(argv[0]); 
From 14921ec90244245eb53594bf8ea34206b635593c Mon Sep 17 00:00:00 2001 From: mwfj Date: Mon, 13 Apr 2026 17:09:09 +0800 Subject: [PATCH 02/37] Fix review comment --- Makefile | 7 +- .../circuit_breaker/circuit_breaker_slice.h | 19 ++ .../circuit_breaker/circuit_breaker_state.h | 6 +- server/circuit_breaker_slice.cc | 109 +++++++--- server/circuit_breaker_window.cc | 20 +- server/config_loader.cc | 21 +- test/circuit_breaker_test.h | 201 ++++++++++++++++++ test/config_test.h | 16 +- 8 files changed, 355 insertions(+), 44 deletions(-) diff --git a/Makefile b/Makefile index 8f4ec3f2..4dd6b83a 100644 --- a/Makefile +++ b/Makefile @@ -242,6 +242,11 @@ test_rate_limit: $(TARGET) @echo "Running rate limit tests only..." ./$(TARGET) rate_limit +# Run only circuit breaker tests +test_circuit_breaker: $(TARGET) + @echo "Running circuit breaker tests only..." + ./$(TARGET) circuit_breaker + # Display help information help: @echo "Reactor Server C++ - Makefile Help" @@ -322,4 +327,4 @@ help: # Build only the production server binary server: $(SERVER_TARGET) -.PHONY: all clean test server test_basic test_stress test_race test_config test_http test_ws test_tls test_cli test_http2 test_upstream test_proxy test_rate_limit help +.PHONY: all clean test server test_basic test_stress test_race test_config test_http test_ws test_tls test_cli test_http2 test_upstream test_proxy test_rate_limit test_circuit_breaker help diff --git a/include/circuit_breaker/circuit_breaker_slice.h b/include/circuit_breaker/circuit_breaker_slice.h index 5633c355..8d7af6d7 100644 --- a/include/circuit_breaker/circuit_breaker_slice.h +++ b/include/circuit_breaker/circuit_breaker_slice.h @@ -57,6 +57,12 @@ class CircuitBreakerSlice { int64_t Rejected() const { return rejected_.load(std::memory_order_relaxed); } int64_t ProbeSuccesses() const { return probe_successes_.load(std::memory_order_relaxed); } int64_t ProbeFailures() const { return probe_failures_.load(std::memory_order_relaxed); } + // Rejections 
specifically caused by HALF_OPEN being out of probe slots + // (subset of `Rejected()`). Lets dashboards distinguish "backoff has not + // elapsed" from "probing, no capacity left". + int64_t RejectedHalfOpenFull() const { + return rejected_half_open_full_.load(std::memory_order_relaxed); + } const std::string& host_label() const { return host_label_; } size_t dispatcher_index() const { return dispatcher_index_; } @@ -91,9 +97,17 @@ class CircuitBreakerSlice { // Observability counters. std::atomic trips_{0}; std::atomic rejected_{0}; + std::atomic rejected_half_open_full_{0}; std::atomic probe_successes_{0}; std::atomic probe_failures_{0}; + // One-shot flag: true after the slice has emitted a higher-level + // (info) log for the first rejection in the current OPEN/HALF_OPEN + // cycle. Reset on transition to CLOSED and on each fresh trip. Keeps + // per-request reject logs at debug while still surfacing the first + // post-trip reject in default-warn operator logs. Dispatcher-thread only. + bool first_reject_logged_for_open_ = false; + StateTransitionCallback transition_cb_; // Internal transitions (dispatcher-thread). @@ -102,6 +116,11 @@ class CircuitBreakerSlice { void TransitionHalfOpenToClosed(); void TripHalfOpenToOpen(const char* trigger); + // Emit the correct reject log line, bump counters, and return the matching + // Decision (enforce or dry-run). Used by both the OPEN (backoff active) + // and HALF_OPEN-full paths — keeps the three loggers/counters consistent. + Decision RejectWithLog(const char* state_label, bool half_open_full); + // Compute open duration for the current consecutive_trips_ value: // min(base * 2^consecutive_trips, max). Always >= base_open_duration_ms. 
std::chrono::nanoseconds ComputeOpenDuration() const; diff --git a/include/circuit_breaker/circuit_breaker_state.h b/include/circuit_breaker/circuit_breaker_state.h index 06fa695d..6a758a57 100644 --- a/include/circuit_breaker/circuit_breaker_state.h +++ b/include/circuit_breaker/circuit_breaker_state.h @@ -49,7 +49,11 @@ enum class FailureKind : uint8_t { // owning dispatcher thread. Callers can compare old/new to key off a // specific edge (e.g. CLOSED→OPEN fires wait-queue drain). // `trigger` is a short static string such as "consecutive" / "rate" / -// "probe_success" / "probe_failure" for logging. +// "probe_success" / "probe_fail" / "open_elapsed" for logging. +// +// TODO(phase-7): once a snapshot / admin JSON endpoint lands, convert +// `trigger` to an `enum class TransitionTrigger` so the valid set is +// compile-time checked rather than string-compared. See design doc §15.8. using StateTransitionCallback = std::function; diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index 5a30737b..3b794c65 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -25,14 +25,21 @@ std::chrono::steady_clock::time_point CircuitBreakerSlice::OpenUntil() const { return std::chrono::steady_clock::time_point(std::chrono::nanoseconds(ns)); } +// Cap the left-shift exponent used to compute open duration. `1 << 30` already +// covers ~12.4 days of base open duration even before the `max_open_duration_ms` +// clamp — higher shift amounts would invoke undefined behavior on `int`. +static constexpr int MAX_OPEN_DURATION_SHIFT = 30; + +// Scale factor for integer percent math: `fails * PERCENT_SCALE >= threshold * total`. +static constexpr int PERCENT_SCALE = 100; + std::chrono::nanoseconds CircuitBreakerSlice::ComputeOpenDuration() const { // Duration = base << consecutive_trips_ (shift expresses 2^n exponential). 
// `consecutive_trips_` is the number of trips observed BEFORE this one, so // the first trip uses 2^0 = 1x base, the second trip uses 2x, etc. // Callers must increment consecutive_trips_ AFTER calling this method. int trips = consecutive_trips_.load(std::memory_order_relaxed); - // Saturate shift at 30 to avoid UB on huge trip counts. - if (trips > 30) trips = 30; + if (trips > MAX_OPEN_DURATION_SHIFT) trips = MAX_OPEN_DURATION_SHIFT; int64_t base_ms = config_.base_open_duration_ms; int64_t max_ms = config_.max_open_duration_ms; int64_t scaled_ms = base_ms << trips; @@ -50,8 +57,9 @@ bool CircuitBreakerSlice::ShouldTripClosed() { int64_t total = window_.TotalCount(now); if (total < config_.minimum_volume) return false; int64_t fails = window_.FailureCount(now); - // Compare without floating point: fails * 100 >= threshold * total. - return (fails * 100) >= (static_cast(config_.failure_rate_threshold) * total); + // Integer percent math: fails * PERCENT_SCALE >= threshold_pct * total. + return (fails * PERCENT_SCALE) >= + (static_cast(config_.failure_rate_threshold) * total); } void CircuitBreakerSlice::TripClosedToOpen(const char* trigger) { @@ -71,6 +79,7 @@ void CircuitBreakerSlice::TripClosedToOpen(const char* trigger) { half_open_inflight_ = 0; half_open_successes_ = 0; half_open_saw_failure_ = false; + first_reject_logged_for_open_ = false; trips_.fetch_add(1, std::memory_order_relaxed); @@ -101,6 +110,12 @@ void CircuitBreakerSlice::TransitionOpenToHalfOpen() { } void CircuitBreakerSlice::TransitionHalfOpenToClosed() { + // Capture actual probes-succeeded BEFORE resetting — the log then reflects + // reality instead of the configured target (the two are equal at the moment + // of transition today, but relying on that is brittle if the transition + // logic ever changes). 
+ int probes_succeeded = half_open_successes_; + state_.store(State::CLOSED, std::memory_order_release); open_until_steady_ns_.store(0, std::memory_order_release); consecutive_trips_.store(0, std::memory_order_relaxed); @@ -109,10 +124,11 @@ void CircuitBreakerSlice::TransitionHalfOpenToClosed() { half_open_inflight_ = 0; half_open_successes_ = 0; half_open_saw_failure_ = false; + first_reject_logged_for_open_ = false; logging::Get()->info( "circuit breaker closed {} probes_succeeded={}", - host_label_, config_.permitted_half_open_calls); + host_label_, probes_succeeded); if (transition_cb_) { transition_cb_(State::HALF_OPEN, State::CLOSED, "probe_success"); @@ -134,6 +150,7 @@ void CircuitBreakerSlice::TripHalfOpenToOpen(const char* trigger) { half_open_inflight_ = 0; half_open_successes_ = 0; half_open_saw_failure_ = false; + first_reject_logged_for_open_ = false; trips_.fetch_add(1, std::memory_order_relaxed); @@ -165,31 +182,20 @@ Decision CircuitBreakerSlice::TryAcquire() { TransitionOpenToHalfOpen(); s = State::HALF_OPEN; } else { - rejected_.fetch_add(1, std::memory_order_relaxed); - if (config_.dry_run) { - logging::Get()->info( - "[dry-run] circuit breaker would reject {} state=open", - host_label_); - return Decision::REJECTED_OPEN_DRYRUN; - } - logging::Get()->debug( - "circuit breaker rejected {} state=open", host_label_); - return Decision::REJECTED_OPEN; + return RejectWithLog("open", /*half_open_full=*/false); } } if (s == State::HALF_OPEN) { - if (half_open_inflight_ >= config_.permitted_half_open_calls) { - rejected_.fetch_add(1, std::memory_order_relaxed); - if (config_.dry_run) { - logging::Get()->info( - "[dry-run] circuit breaker would reject {} state=half_open_full", - host_label_); - return Decision::REJECTED_OPEN_DRYRUN; - } - logging::Get()->debug( - "circuit breaker rejected {} state=half_open_full", host_label_); - return Decision::REJECTED_OPEN; + // Short-circuit as soon as any probe has failed: the breaker is + // guaranteed to re-trip 
once the remaining in-flight probes drain, so + // admitting more probes just wastes capacity on a known-bad upstream. + // Previously this path kept admitting probes until `permitted_half_open_calls` + // in-flight was reached, which under continued failure could keep + // traffic flowing indefinitely instead of converging back to OPEN. + if (half_open_saw_failure_ || + half_open_inflight_ >= config_.permitted_half_open_calls) { + return RejectWithLog("half_open_full", /*half_open_full=*/true); } half_open_inflight_++; return Decision::ADMITTED_PROBE; @@ -199,6 +205,41 @@ Decision CircuitBreakerSlice::TryAcquire() { return Decision::ADMITTED; } +Decision CircuitBreakerSlice::RejectWithLog(const char* state_label, + bool half_open_full) { + rejected_.fetch_add(1, std::memory_order_relaxed); + if (half_open_full) { + rejected_half_open_full_.fetch_add(1, std::memory_order_relaxed); + } + // First reject in this OPEN/HALF_OPEN cycle is info — gives operators + // looking at a flurry of 503s a single high-level breadcrumb in default- + // warn logs without flooding them. Subsequent rejects are debug. 
+ const bool first = !first_reject_logged_for_open_; + if (first) first_reject_logged_for_open_ = true; + + if (config_.dry_run) { + if (first) { + logging::Get()->info( + "[dry-run] circuit breaker would reject {} state={}", + host_label_, state_label); + } else { + logging::Get()->debug( + "[dry-run] circuit breaker would reject {} state={}", + host_label_, state_label); + } + return Decision::REJECTED_OPEN_DRYRUN; + } + if (first) { + logging::Get()->info( + "circuit breaker rejecting {} state={} (first reject this cycle)", + host_label_, state_label); + } else { + logging::Get()->debug( + "circuit breaker rejected {} state={}", host_label_, state_label); + } + return Decision::REJECTED_OPEN; +} + void CircuitBreakerSlice::ReportSuccess(bool probe) { if (!config_.enabled) return; @@ -222,7 +263,12 @@ void CircuitBreakerSlice::ReportSuccess(bool probe) { return; } - // CLOSED success: reset consecutive counter, record in window. + // Non-probe success: only meaningful when state is CLOSED. If the slice + // has since transitioned (e.g., other requests in this burst tripped it), + // this late outcome must NOT retroactively reset `consecutive_failures_` + // or pollute the window — a fresh CLOSED cycle after recovery would start + // with bogus success history. + if (state_.load(std::memory_order_acquire) != State::CLOSED) return; consecutive_failures_ = 0; window_.AddSuccess(Now()); } @@ -243,7 +289,14 @@ void CircuitBreakerSlice::ReportFailure(FailureKind kind, bool probe) { return; } - // CLOSED failure path. + // Non-probe failure: only count when CLOSED. Late failures from requests + // admitted in CLOSED but completing after a trip must NOT re-enter + // `TripClosedToOpen` — doing so double-increments `consecutive_trips_` + // (inflating open_duration) and fires a spurious CLOSED→OPEN transition + // edge that downstream consumers (wait-queue drain, snapshot telemetry) + // would see as a ghost trip. 
+ if (state_.load(std::memory_order_acquire) != State::CLOSED) return; + consecutive_failures_++; window_.AddFailure(Now()); diff --git a/server/circuit_breaker_window.cc b/server/circuit_breaker_window.cc index 14ea34a5..06fccc18 100644 --- a/server/circuit_breaker_window.cc +++ b/server/circuit_breaker_window.cc @@ -2,6 +2,19 @@ namespace circuit_breaker { +// Map an epoch-second value into a non-negative bucket index. C++ built-in `%` +// can return a negative result when the dividend is negative — and while +// `steady_clock::time_since_epoch()` is zero-based on all mainstream +// libstdc++/libc++ implementations, the standard does not strictly guarantee a +// non-negative epoch across every implementation. The extra `+ w` and second +// `% w` costs a single add + mod on the slow (negative) branch, zero observable +// overhead on the common positive branch after the compiler eliminates the +// redundant math. +static inline size_t BucketIndex(int64_t epoch_sec, int window_seconds) { + const int64_t w = window_seconds; + return static_cast(((epoch_sec % w) + w) % w); +} + CircuitBreakerWindow::CircuitBreakerWindow(int window_seconds) : window_seconds_(window_seconds), buckets_(window_seconds > 0 ? static_cast(window_seconds) : 1) { @@ -26,7 +39,7 @@ void CircuitBreakerWindow::Advance(int64_t now_sec) { } else { // Zero buckets from head+1..now_sec inclusive. 
for (int64_t s = head_epoch_sec_ + 1; s <= now_sec; ++s) { - size_t idx = static_cast(s % window_seconds_); + size_t idx = BucketIndex(s, window_seconds_); buckets_[idx].total = 0; buckets_[idx].failures = 0; } @@ -38,15 +51,14 @@ void CircuitBreakerWindow::AddSuccess( std::chrono::steady_clock::time_point now) { int64_t now_sec = ToEpochSec(now); Advance(now_sec); - size_t idx = static_cast(now_sec % window_seconds_); - buckets_[idx].total++; + buckets_[BucketIndex(now_sec, window_seconds_)].total++; } void CircuitBreakerWindow::AddFailure( std::chrono::steady_clock::time_point now) { int64_t now_sec = ToEpochSec(now); Advance(now_sec); - size_t idx = static_cast(now_sec % window_seconds_); + size_t idx = BucketIndex(now_sec, window_seconds_); buckets_[idx].total++; buckets_[idx].failures++; } diff --git a/server/config_loader.cc b/server/config_loader.cc index c17a544d..f6ff4698 100644 --- a/server/config_loader.cc +++ b/server/config_loader.cc @@ -822,33 +822,40 @@ void ConfigLoader::Validate(const ServerConfig& config) { "'): proxy.retry.max_retries must be >= 0 and <= 10"); } - // Circuit breaker validation + // Circuit breaker validation. + // + // Upper bounds on counting fields are generous — they exist to + // catch pathological configs (typo like "10_000_000_000" or a + // missing unit conversion), not to constrain legitimate tuning. + // Defaults are 5 / 20 / 5; limits are 1000× to 50000× the defaults. 
{ const auto& cb = u.circuit_breaker; - if (cb.consecutive_failure_threshold < 1) { + if (cb.consecutive_failure_threshold < 1 || + cb.consecutive_failure_threshold > 10000) { throw std::invalid_argument( idx + " ('" + u.name + - "'): circuit_breaker.consecutive_failure_threshold must be >= 1"); + "'): circuit_breaker.consecutive_failure_threshold must be in [1, 10000]"); } if (cb.failure_rate_threshold < 0 || cb.failure_rate_threshold > 100) { throw std::invalid_argument( idx + " ('" + u.name + "'): circuit_breaker.failure_rate_threshold must be in [0, 100]"); } - if (cb.minimum_volume < 1) { + if (cb.minimum_volume < 1 || cb.minimum_volume > 10000000) { throw std::invalid_argument( idx + " ('" + u.name + - "'): circuit_breaker.minimum_volume must be >= 1"); + "'): circuit_breaker.minimum_volume must be in [1, 10000000]"); } if (cb.window_seconds < 1 || cb.window_seconds > 3600) { throw std::invalid_argument( idx + " ('" + u.name + "'): circuit_breaker.window_seconds must be in [1, 3600]"); } - if (cb.permitted_half_open_calls < 1) { + if (cb.permitted_half_open_calls < 1 || + cb.permitted_half_open_calls > 1000) { throw std::invalid_argument( idx + " ('" + u.name + - "'): circuit_breaker.permitted_half_open_calls must be >= 1"); + "'): circuit_breaker.permitted_half_open_calls must be in [1, 1000]"); } if (cb.base_open_duration_ms < 100) { throw std::invalid_argument( diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h index bd932a28..f8b265d7 100644 --- a/test/circuit_breaker_test.h +++ b/test/circuit_breaker_test.h @@ -578,6 +578,203 @@ void TestSuccessClearsConsecutive() { } } +// ============================================================================ +// Regression tests — critical bugs caught in code review +// ============================================================================ + +// BUG: late non-probe failure after trip re-entered TripClosedToOpen, inflating +// consecutive_trips_ (→ longer backoff) and firing a spurious 
CLOSED→OPEN +// transition edge. Fix: guard ReportFailure(probe=false) on state_ == CLOSED. +void TestLateFailureAfterTripDoesNotInflateBackoff() { + std::cout << "\n[TEST] CB: late failure after trip does not inflate backoff..." + << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.base_open_duration_ms = 1000; + cb.max_open_duration_ms = 60000; + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Admit 10 requests in CLOSED. Slice state is single-threaded so + // admission + bookkeeping is serialized by the event loop — but in + // production the outcomes for those admitted requests can arrive after + // the slice has already tripped. + for (int i = 0; i < 10; ++i) { + Decision d = slice.TryAcquire(); + if (d != Decision::ADMITTED) { + TestFramework::RecordTest("CB: late failure after trip", + false, "admission i=" + std::to_string(i) + " not ADMITTED", + TestFramework::TestCategory::OTHER); + return; + } + } + // Report 5 failures — trip at the 5th. + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + if (slice.CurrentState() != State::OPEN) { + TestFramework::RecordTest("CB: late failure after trip", false, + "expected OPEN after 5 failures", + TestFramework::TestCategory::OTHER); + return; + } + int64_t trips_after_first_trip = slice.Trips(); + // Capture open_until immediately post-trip. + auto open_until_initial = slice.OpenUntil(); + + // Now the remaining 5 in-flight requests land with late failures. + // Before the fix, each of these would go through the CLOSED path, + // climb consecutive_failures_, and trigger another TripClosedToOpen + // even though state is already OPEN. 
+ for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + int64_t trips_after_late = slice.Trips(); + auto open_until_after_late = slice.OpenUntil(); + + bool pass = slice.CurrentState() == State::OPEN && + trips_after_late == trips_after_first_trip && // no ghost trip + open_until_after_late == open_until_initial; // backoff unchanged + TestFramework::RecordTest( + "CB: late failure after trip does not inflate backoff", + pass, pass ? "" : + "trips: " + std::to_string(trips_after_first_trip) + + " → " + std::to_string(trips_after_late), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: late failure after trip does not inflate backoff", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// BUG: late non-probe success after trip would reset consecutive_failures_ +// and pollute the sliding window (pretending a fresh CLOSED cycle observed +// successes). Fix: guard ReportSuccess(probe=false) on state_ == CLOSED. +void TestLateSuccessAfterTripIgnored() { + std::cout << "\n[TEST] CB: late success after trip ignored..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + // Slice is OPEN now. A late success arrives — must not change state. 
+ State pre = slice.CurrentState(); + slice.ReportSuccess(false); + bool pass = pre == State::OPEN && slice.CurrentState() == State::OPEN; + TestFramework::RecordTest("CB: late success after trip ignored", pass, + "", TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: late success after trip ignored", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// BUG: HALF_OPEN admission kept accepting probes after the first probe +// failure (only enforcing `inflight < permitted`), so under load a failed +// recovery cycle could keep leaking traffic indefinitely instead of re-OPENing +// after the in-flight probes drained. Fix: short-circuit on saw_failure. +void TestHalfOpenStopsAdmittingAfterFirstProbeFailure() { + std::cout << "\n[TEST] CB: HALF_OPEN stops admitting after probe fail..." + << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.permitted_half_open_calls = 5; + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Trip the breaker. + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + + // Admit 2 probes. Report failure on the first (but NOT the second yet + // — leave 1 in-flight so we can observe the short-circuit). + Decision d1 = slice.TryAcquire(); // ADMITTED_PROBE, inflight=1 + Decision d2 = slice.TryAcquire(); // ADMITTED_PROBE, inflight=2 + if (d1 != Decision::ADMITTED_PROBE || d2 != Decision::ADMITTED_PROBE) { + TestFramework::RecordTest( + "CB: HALF_OPEN stops admitting after probe fail", + false, "probes not admitted as expected", + TestFramework::TestCategory::OTHER); + return; + } + // Fail the first probe — inflight drops to 1, saw_failure=true. + // Last-probe trip does not yet fire (inflight is still 1). 
+ slice.ReportFailure(FailureKind::RESPONSE_5XX, true); + + // State must still be HALF_OPEN (final probe not yet completed). + State mid = slice.CurrentState(); + + // Subsequent TryAcquire — BEFORE fix this would succeed because + // inflight (1) < permitted (5). AFTER fix it short-circuits because + // saw_failure is set. + Decision d3 = slice.TryAcquire(); + + bool pass = mid == State::HALF_OPEN && + d3 == Decision::REJECTED_OPEN; + TestFramework::RecordTest( + "CB: HALF_OPEN stops admitting after probe fail", + pass, pass ? "" : "expected REJECTED_OPEN on 3rd TryAcquire", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: HALF_OPEN stops admitting after probe fail", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Verifies the dedicated HALF_OPEN-full counter is bumped separately from the +// generic `rejected_` counter, so Phase 7 snapshots can distinguish +// "open, backoff not elapsed" from "probing, no slots left". +void TestHalfOpenFullCounterSeparate() { + std::cout << "\n[TEST] CB: HALF_OPEN_FULL counter separate..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.permitted_half_open_calls = 2; + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Trip → OPEN reject increments generic counter only. + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + slice.TryAcquire(); // REJECTED_OPEN (backoff active) + int64_t rejected_open_only = slice.Rejected(); + int64_t half_open_full_open_only = slice.RejectedHalfOpenFull(); + + // Elapse backoff → HALF_OPEN. Fill the probe budget, then a 3rd + // TryAcquire rejects with half_open_full, incrementing both counters. 
+ clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + slice.TryAcquire(); // probe 1 admitted + slice.TryAcquire(); // probe 2 admitted (budget full) + slice.TryAcquire(); // REJECTED (full) + int64_t rejected_total = slice.Rejected(); + int64_t half_open_full_total = slice.RejectedHalfOpenFull(); + + bool pass = rejected_open_only == 1 && + half_open_full_open_only == 0 && + rejected_total == 2 && // 1 OPEN + 1 HALF_OPEN_FULL + half_open_full_total == 1; // only the HALF_OPEN one + TestFramework::RecordTest("CB: HALF_OPEN_FULL counter separate", + pass, pass ? "" : + "rej=" + std::to_string(rejected_total) + + " hof=" + std::to_string(half_open_full_total), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: HALF_OPEN_FULL counter separate", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + void TestTransitionCallbackInvoked() { std::cout << "\n[TEST] CB: transition callback invoked..." << std::endl; try { @@ -641,6 +838,10 @@ void RunAllTests() { TestReloadPreservesState(); TestConsecutiveThresholdOne(); TestSuccessClearsConsecutive(); + TestLateFailureAfterTripDoesNotInflateBackoff(); + TestLateSuccessAfterTripIgnored(); + TestHalfOpenStopsAdmittingAfterFirstProbeFailure(); + TestHalfOpenFullCounterSeparate(); TestTransitionCallbackInvoked(); } diff --git a/test/config_test.h b/test/config_test.h index 213fd8ac..94c60763 100644 --- a/test/config_test.h +++ b/test/config_test.h @@ -506,13 +506,13 @@ namespace ConfigTests { std::cout << "\n[TEST] Circuit Breaker Validation..." 
<< std::endl; ExpectValidationFailure("CB Validation: consecutive_failure_threshold<1", R"({"consecutive_failure_threshold": 0})", - "consecutive_failure_threshold must be >= 1"); + "consecutive_failure_threshold must be in [1, 10000]"); ExpectValidationFailure("CB Validation: failure_rate_threshold>100", R"({"failure_rate_threshold": 101})", "failure_rate_threshold must be in [0, 100]"); ExpectValidationFailure("CB Validation: minimum_volume<1", R"({"minimum_volume": 0})", - "minimum_volume must be >= 1"); + "minimum_volume must be in [1, 10000000]"); ExpectValidationFailure("CB Validation: window_seconds<1", R"({"window_seconds": 0})", "window_seconds must be in [1, 3600]"); @@ -536,7 +536,17 @@ namespace ConfigTests { "max_ejection_percent_per_host_set must be in [0, 100]"); ExpectValidationFailure("CB Validation: permitted_half_open_calls<1", R"({"permitted_half_open_calls": 0})", - "permitted_half_open_calls must be >= 1"); + "permitted_half_open_calls must be in [1, 1000]"); + // Upper-bound regressions — pathological configs must be rejected. 
+ ExpectValidationFailure("CB Validation: consecutive_failure_threshold>10000", + R"({"consecutive_failure_threshold": 10001})", + "consecutive_failure_threshold must be in [1, 10000]"); + ExpectValidationFailure("CB Validation: minimum_volume>10000000", + R"({"minimum_volume": 10000001})", + "minimum_volume must be in [1, 10000000]"); + ExpectValidationFailure("CB Validation: permitted_half_open_calls>1000", + R"({"permitted_half_open_calls": 1001})", + "permitted_half_open_calls must be in [1, 1000]"); } // Test 14: Equality operator covers circuit_breaker field From 8c7a64ba9ef3f01d23c3a2d39237e97de2a4af20 Mon Sep 17 00:00:00 2001 From: mwfj Date: Mon, 13 Apr 2026 19:38:49 +0800 Subject: [PATCH 03/37] Fix review comment --- Makefile | 2 +- server/circuit_breaker_slice.cc | 91 +++++++++++++--- test/circuit_breaker_test.h | 185 ++++++++++++++++++++++++++++++++ 3 files changed, 262 insertions(+), 16 deletions(-) diff --git a/Makefile b/Makefile index 4dd6b83a..23a46ce0 100644 --- a/Makefile +++ b/Makefile @@ -147,7 +147,7 @@ UPSTREAM_HEADERS = $(LIB_DIR)/upstream/upstream_manager.h $(LIB_DIR)/upstream/up RATE_LIMIT_HEADERS = $(LIB_DIR)/rate_limit/token_bucket.h $(LIB_DIR)/rate_limit/rate_limit_zone.h $(LIB_DIR)/rate_limit/rate_limiter.h CIRCUIT_BREAKER_HEADERS = $(LIB_DIR)/circuit_breaker/circuit_breaker_state.h $(LIB_DIR)/circuit_breaker/circuit_breaker_window.h $(LIB_DIR)/circuit_breaker/circuit_breaker_slice.h CLI_HEADERS = $(LIB_DIR)/cli/cli_parser.h $(LIB_DIR)/cli/signal_handler.h $(LIB_DIR)/cli/pid_file.h $(LIB_DIR)/cli/version.h $(LIB_DIR)/cli/daemonizer.h -TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h 
$(TEST_DIR)/proxy_test.h +TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h $(TEST_DIR)/proxy_test.h $(TEST_DIR)/rate_limit_test.h $(TEST_DIR)/kqueue_test.h $(TEST_DIR)/circuit_breaker_test.h # All headers combined HEADERS = $(CORE_HEADERS) $(CALLBACK_HEADERS) $(REACTOR_HEADERS) $(NETWORK_HEADERS) $(SERVER_HEADERS) $(THREAD_POOL_HEADERS) $(UTIL_HEADERS) $(FOUNDATION_HEADERS) $(HTTP_HEADERS) $(HTTP2_HEADERS) $(WS_HEADERS) $(TLS_HEADERS) $(UPSTREAM_HEADERS) $(RATE_LIMIT_HEADERS) $(CIRCUIT_BREAKER_HEADERS) $(CLI_HEADERS) $(TEST_HEADERS) diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index 3b794c65..17f6113d 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -99,6 +99,12 @@ void CircuitBreakerSlice::TransitionOpenToHalfOpen() { half_open_inflight_ = 0; half_open_successes_ = 0; half_open_saw_failure_ = false; + // Reset the info-log "first reject" breadcrumb so the first rejection + // observed in the HALF_OPEN phase surfaces at info, not debug. HALF_OPEN + // rejection (recovery attempt failing or probe budget full) is + // operationally distinct from OPEN rejection (still backing off) and + // deserves its own breadcrumb in default-warn operator logs. 
+ first_reject_logged_for_open_ = false; logging::Get()->info( "circuit breaker half-open {} probes_allowed={}", @@ -187,14 +193,20 @@ Decision CircuitBreakerSlice::TryAcquire() { } if (s == State::HALF_OPEN) { - // Short-circuit as soon as any probe has failed: the breaker is - // guaranteed to re-trip once the remaining in-flight probes drain, so - // admitting more probes just wastes capacity on a known-bad upstream. - // Previously this path kept admitting probes until `permitted_half_open_calls` - // in-flight was reached, which under continued failure could keep - // traffic flowing indefinitely instead of converging back to OPEN. - if (half_open_saw_failure_ || - half_open_inflight_ >= config_.permitted_half_open_calls) { + // Case A: a sibling probe already failed. Short-circuit remaining + // admissions — the breaker is guaranteed to re-trip once in-flight + // probes drain. This is operationally DIFFERENT from "budget + // exhausted" (case B): probe slots may still be free, we just know + // using them can't change the outcome. Track it with its own log + // label and do NOT bump `rejected_half_open_full_` — that counter + // is specifically "probing, no capacity left" for dashboards. + if (half_open_saw_failure_) { + return RejectWithLog("half_open_recovery_failing", + /*half_open_full=*/false); + } + // Case B: probe budget fully in flight. "No capacity" — bump the + // dedicated counter so dashboards can tell these two apart. + if (half_open_inflight_ >= config_.permitted_half_open_calls) { return RejectWithLog("half_open_full", /*half_open_full=*/true); } half_open_inflight_++; @@ -244,9 +256,20 @@ void CircuitBreakerSlice::ReportSuccess(bool probe) { if (!config_.enabled) return; if (probe) { + // Record the completed-probe outcome for observability regardless of + // current state — this is a signal about upstream behavior, not a + // signal about our state machine. 
probe_successes_.fetch_add(1, std::memory_order_relaxed); - // Count the completed probe regardless of saw_failure state (we still - // decrement inflight to release the slot). + + // Stale probe defense: we admitted this probe in HALF_OPEN, but the + // slice may have transitioned out (e.g., `Reload()` flipped enabled + // or resized the window, `TransitionHalfOpenToClosed` already fired + // on sibling probes, or — post-Phase 8 — an operator toggle + // transitioned us to CLOSED). Only touch HALF_OPEN bookkeeping / + // fire transitions when state is STILL HALF_OPEN. Otherwise the + // probe is informational only. + if (state_.load(std::memory_order_acquire) != State::HALF_OPEN) return; + if (half_open_inflight_ > 0) half_open_inflight_--; if (half_open_saw_failure_) { // A sibling probe already failed; whichever probe finishes last @@ -280,6 +303,10 @@ void CircuitBreakerSlice::ReportFailure(FailureKind kind, bool probe) { if (probe) { probe_failures_.fetch_add(1, std::memory_order_relaxed); + + // Stale probe defense — see matching comment in ReportSuccess above. + if (state_.load(std::memory_order_acquire) != State::HALF_OPEN) return; + if (half_open_inflight_ > 0) half_open_inflight_--; half_open_saw_failure_ = true; // On the last probe (or if all remaining complete) transition OPEN. @@ -309,18 +336,52 @@ void CircuitBreakerSlice::ReportFailure(FailureKind kind, bool probe) { } void CircuitBreakerSlice::Reload(const CircuitBreakerConfig& new_config) { - bool window_changed = (config_.window_seconds != new_config.window_seconds); + const bool enabled_changed = (config_.enabled != new_config.enabled); + const bool window_changed = + (config_.window_seconds != new_config.window_seconds); + config_ = new_config; if (window_changed) window_.Resize(new_config.window_seconds); - // Live state preserved — operator expects new thresholds to apply to the - // next evaluation, not to reset an in-progress trip. 
+ + if (enabled_changed) { + // Toggling `enabled` is an operator intent to start fresh, not a + // runtime state transition. Without this reset: + // - Disabling while OPEN and re-enabling later would resume the + // OPEN state and reject requests even though the operator + // explicitly turned the breaker off and back on. + // - Disabling while HALF_OPEN with in-flight probes would leave + // inconsistent bookkeeping (inflight > 0, state=HALF_OPEN) that + // a subsequent enable would interpret as live probes. + // - Disabling mid-CLOSED-cycle and re-enabling would trip on the + // very next failure because consecutive_failures_ persisted. + // Matches design doc §10.1 (enabled→disabled / disabled→enabled + // transitions both get a clean CLOSED start). + // + // Silent reset — no transition callback. The change is operator- + // initiated configuration, not a runtime state signal; firing the + // callback would cause PoolPartition::DrainWaitQueueOnTrip-style + // consumers (Phase 6) to spuriously drain waiters on a config edit. + state_.store(State::CLOSED, std::memory_order_release); + open_until_steady_ns_.store(0, std::memory_order_release); + consecutive_trips_.store(0, std::memory_order_relaxed); + consecutive_failures_ = 0; + window_.Reset(); + half_open_inflight_ = 0; + half_open_successes_ = 0; + half_open_saw_failure_ = false; + first_reject_logged_for_open_ = false; + } + // When `enabled` is unchanged: live state preserved — operator expects + // new thresholds to apply to the next evaluation, not to reset an + // in-progress trip. logging::Get()->info( "circuit breaker config applied {} enabled={} window_s={} " - "fail_rate={} consec_threshold={}", + "fail_rate={} consec_threshold={}{}", host_label_, new_config.enabled, new_config.window_seconds, new_config.failure_rate_threshold, - new_config.consecutive_failure_threshold); + new_config.consecutive_failure_threshold, + enabled_changed ? 
" (enabled toggled — state reset to CLOSED)" : ""); } void CircuitBreakerSlice::SetTransitionCallback(StateTransitionCallback cb) { diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h index f8b265d7..d39ab52a 100644 --- a/test/circuit_breaker_test.h +++ b/test/circuit_breaker_test.h @@ -775,6 +775,187 @@ void TestHalfOpenFullCounterSeparate() { } } +// BUG (review round 2, P2): Reload preserved stale state across enabled +// toggles. Disabling while OPEN and re-enabling later resumed the OPEN state, +// rejecting requests despite an explicit operator off→on cycle. Disabling +// after accumulated consecutive failures would re-trip on the very next +// failure. Fix: reset state to CLOSED whenever enabled toggles. +void TestReloadResetsStateOnEnabledToggleWhileOpen() { + std::cout << "\n[TEST] CB: reload resets state on enabled toggle (while OPEN)..." + << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Drive to OPEN. + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + if (slice.CurrentState() != State::OPEN) { + TestFramework::RecordTest( + "CB: reload resets state on enabled toggle (OPEN)", false, + "precondition: slice not OPEN", + TestFramework::TestCategory::OTHER); + return; + } + + // Disable via reload — state must reset to CLOSED. + auto disabled = cb; + disabled.enabled = false; + slice.Reload(disabled); + bool disabled_closed = slice.CurrentState() == State::CLOSED; + + // Re-enable via reload — state must remain CLOSED (no stale OPEN). + slice.Reload(cb); + bool reenabled_closed = slice.CurrentState() == State::CLOSED; + + // And the slice must NOT insta-trip on a single failure (pre-fix, + // consecutive_failures_ could have persisted ≥ threshold). 
+ slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + bool one_fail_no_trip = slice.CurrentState() == State::CLOSED; + + bool pass = disabled_closed && reenabled_closed && one_fail_no_trip; + TestFramework::RecordTest( + "CB: reload resets state on enabled toggle (OPEN)", pass, + pass ? "" : "disabled_closed=" + std::to_string(disabled_closed) + + " reenabled_closed=" + std::to_string(reenabled_closed) + + " one_fail_no_trip=" + std::to_string(one_fail_no_trip), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: reload resets state on enabled toggle (OPEN)", false, e.what(), + TestFramework::TestCategory::OTHER); + } +} + +// BUG (review round 2, P2, variant): if disable happens while +// consecutive_failures_ has accumulated but not yet tripped, re-enable would +// inherit that count and trip early on the next failure. +void TestReloadResetsConsecutiveFailuresOnEnabledToggle() { + std::cout << "\n[TEST] CB: reload clears consecutive_failures on enable toggle..." + << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.consecutive_failure_threshold = 5; + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // 4 failures — just under threshold. State still CLOSED. + for (int i = 0; i < 4; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + if (slice.CurrentState() != State::CLOSED) { + TestFramework::RecordTest( + "CB: reload clears consecutive_failures", false, + "precondition: slice not CLOSED", + TestFramework::TestCategory::OTHER); + return; + } + + // Disable then re-enable. + auto disabled = cb; disabled.enabled = false; + slice.Reload(disabled); + slice.Reload(cb); + + // A single failure post-reenable must NOT trip — consecutive_failures_ + // should have been reset to 0, not preserved at 4. 
+ slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + bool pass = slice.CurrentState() == State::CLOSED; + TestFramework::RecordTest( + "CB: reload clears consecutive_failures on enable toggle", + pass, + pass ? "" : "expected CLOSED after 1 post-reenable failure, got " + + std::string(circuit_breaker::StateName(slice.CurrentState())), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: reload clears consecutive_failures on enable toggle", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Threshold-change-only reload (enabled unchanged) MUST preserve live state +// per design §10. Regression guard for fix #1. +void TestReloadThresholdChangePreservesState() { + std::cout << "\n[TEST] CB: reload preserves state when only thresholds change..." + << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + // OPEN. Reload with a tighter threshold but enabled unchanged. + auto tighter = cb; + tighter.consecutive_failure_threshold = 2; + slice.Reload(tighter); + // State must remain OPEN — live state preservation. + bool pass = slice.CurrentState() == State::OPEN; + TestFramework::RecordTest( + "CB: reload preserves state on threshold-only change", + pass, pass ? 
"" : "expected OPEN, got " + + std::string(circuit_breaker::StateName(slice.CurrentState())), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: reload preserves state on threshold-only change", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +// BUG (review round 2, P3): saw_failure short-circuit incorrectly bumped the +// HALF_OPEN_FULL counter, polluting dashboards that need to distinguish +// "probing, no capacity left" from "recovery attempt is failing". +void TestSawFailureDoesNotBumpHalfOpenFullCounter() { + std::cout << "\n[TEST] CB: saw_failure reject does not bump HALF_OPEN_FULL..." + << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.permitted_half_open_calls = 5; // plenty of capacity + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + + // Admit 2 probes, fail the first — saw_failure=true, inflight=1. + slice.TryAcquire(); // probe 1 admitted + slice.TryAcquire(); // probe 2 admitted + slice.ReportFailure(FailureKind::RESPONSE_5XX, true); + + int64_t hof_before = slice.RejectedHalfOpenFull(); + // Reject via saw_failure short-circuit (capacity is NOT exhausted — + // only 1 probe actually in flight, and permitted is 5). + Decision d = slice.TryAcquire(); + int64_t hof_after = slice.RejectedHalfOpenFull(); + + // Still REJECTED_OPEN (same client-visible outcome), but + // RejectedHalfOpenFull must NOT be incremented — this is a + // "recovery failing" reject, not a capacity reject. + bool pass = d == Decision::REJECTED_OPEN && + hof_before == 0 && + hof_after == 0; + TestFramework::RecordTest( + "CB: saw_failure reject does not bump HALF_OPEN_FULL", + pass, pass ? 
"" : "hof_before=" + std::to_string(hof_before) + + " hof_after=" + std::to_string(hof_after), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: saw_failure reject does not bump HALF_OPEN_FULL", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + void TestTransitionCallbackInvoked() { std::cout << "\n[TEST] CB: transition callback invoked..." << std::endl; try { @@ -842,6 +1023,10 @@ void RunAllTests() { TestLateSuccessAfterTripIgnored(); TestHalfOpenStopsAdmittingAfterFirstProbeFailure(); TestHalfOpenFullCounterSeparate(); + TestReloadResetsStateOnEnabledToggleWhileOpen(); + TestReloadResetsConsecutiveFailuresOnEnabledToggle(); + TestReloadThresholdChangePreservesState(); + TestSawFailureDoesNotBumpHalfOpenFullCounter(); TestTransitionCallbackInvoked(); } From 6d5cac69ce333ca23b226d0459b168f37d24c54d Mon Sep 17 00:00:00 2001 From: mwfj Date: Mon, 13 Apr 2026 20:45:46 +0800 Subject: [PATCH 04/37] Fix review comment --- .../circuit_breaker/circuit_breaker_slice.h | 55 ++- server/circuit_breaker_slice.cc | 133 ++++-- test/circuit_breaker_test.h | 404 +++++++++++++++--- 3 files changed, 499 insertions(+), 93 deletions(-) diff --git a/include/circuit_breaker/circuit_breaker_slice.h b/include/circuit_breaker/circuit_breaker_slice.h index 8d7af6d7..4ba1331d 100644 --- a/include/circuit_breaker/circuit_breaker_slice.h +++ b/include/circuit_breaker/circuit_breaker_slice.h @@ -30,16 +30,34 @@ class CircuitBreakerSlice { CircuitBreakerSlice(const CircuitBreakerSlice&) = delete; CircuitBreakerSlice& operator=(const CircuitBreakerSlice&) = delete; + // Return value of TryAcquire. `generation` is a monotonically-increasing + // token identifying which state-machine cycle the admission belongs to. + // Callers MUST pass it back to Report*() unchanged so the slice can drop + // late completions that belong to a prior cycle (crossed a state + // transition or a Reload()-reset boundary). 
Without this, stale + // completions can pollute the bookkeeping of a fresh CLOSED/HALF_OPEN + // cycle (e.g., a pre-toggle failure incrementing the post-toggle + // consecutive_failures_, or a pre-CLOSED'-cycle success wiping a + // legitimate post-CLOSED' counter). + struct Admission { + Decision decision; + uint64_t generation; + }; + // Hot-path decision. Consults state + (if applicable) advances OPEN→HALF_OPEN // and reserves a probe slot. Increments `rejected_` on REJECTED_OPEN* // (both enforce and dry-run). Emits reject log on dispatcher thread. - Decision TryAcquire(); + // Returned generation must be threaded to the paired Report*(). + Admission TryAcquire(); // Outcome reporting. `probe` is true iff the paired TryAcquire returned - // ADMITTED_PROBE. Report* may trigger state transitions and fire the - // transition callback. - void ReportSuccess(bool probe); - void ReportFailure(FailureKind kind, bool probe); + // ADMITTED_PROBE. `admission_generation` is the generation returned by + // the paired TryAcquire — reports from a stale generation are silently + // dropped (observability counters still update so the outcome is not + // lost from dashboards). Report* may trigger state transitions and fire + // the transition callback. + void ReportSuccess(bool probe, uint64_t admission_generation); + void ReportFailure(FailureKind kind, bool probe, uint64_t admission_generation); // Apply a new config (called on this slice's dispatcher thread). // Preserves live state (CLOSED/OPEN/HALF_OPEN). Resets window if @@ -63,6 +81,20 @@ class CircuitBreakerSlice { int64_t RejectedHalfOpenFull() const { return rejected_half_open_full_.load(std::memory_order_relaxed); } + // Number of Report* calls silently dropped because their admission + // generation no longer matches the slice's current generation. These + // are reports of requests admitted before a state transition or a + // Reload()-reset. Useful for detecting mis-threaded admission tokens. 
+  int64_t ReportsStaleGeneration() const {
+    return reports_stale_generation_.load(std::memory_order_relaxed);
+  }
+
+  // **Test-only** accessor for the current generation. Production callers
+  // MUST use the generation returned by TryAcquire (racy otherwise — this
+  // getter is not atomic). Tests use it as ergonomic shorthand for
+  // "admission just happened in the current cycle", bypassing the need to
+  // thread a token per synthetic Report* call.
+  uint64_t CurrentGenerationForTesting() const { return generation_; }
 
   const std::string& host_label() const { return host_label_; }
   size_t dispatcher_index() const { return dispatcher_index_; }
@@ -108,6 +140,19 @@ class CircuitBreakerSlice {
   // post-trip reject in default-warn operator logs. Dispatcher-thread only.
   bool first_reject_logged_for_open_ = false;
 
+  // Monotonic generation counter. Incremented on every state transition
+  // AND on every Reload() enabled-toggle reset. TryAcquire captures the
+  // current generation at admission time; Report* compares against it
+  // and drops reports from a stale generation (e.g., a request admitted
+  // before an operator reset whose outcome arrives after). Dispatcher-
+  // thread only — plain int (no atomic needed).
+  uint64_t generation_ = 1;
+
+  // Reports silently dropped because their admission generation no
+  // longer matches `generation_`. Observability only; lets dashboards see
+  // how often the generation guard fires.
+  std::atomic<int64_t> reports_stale_generation_{0};
+
   StateTransitionCallback transition_cb_;
 
   // Internal transitions (dispatcher-thread).
diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index 17f6113d..32fcdfc5 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -80,6 +80,9 @@ void CircuitBreakerSlice::TripClosedToOpen(const char* trigger) { half_open_successes_ = 0; half_open_saw_failure_ = false; first_reject_logged_for_open_ = false; + // Bump generation: any in-flight admission from the closing CLOSED + // cycle is now stale. Late Report*() for those requests is dropped. + ++generation_; trips_.fetch_add(1, std::memory_order_relaxed); @@ -94,8 +97,13 @@ void CircuitBreakerSlice::TripClosedToOpen(const char* trigger) { void CircuitBreakerSlice::TransitionOpenToHalfOpen() { state_.store(State::HALF_OPEN, std::memory_order_release); - // Keep open_until_steady_ns_ so observers see the "last open" boundary; - // it's cleared on transition to CLOSED. + // Clear open_until_steady_ns_ per the OpenUntil() contract ("zero when + // not OPEN"). Leaving a stale deadline here would cause Phase 4's + // ProxyTransaction::MakeCircuitOpenResponse to compute a Retry-After + // from a past time_point (negative delta → floor at 1s, misleading for + // a reject in the HALF_OPEN probe-budget-full path). Retry-After for + // HALF_OPEN rejects is computed fresh by callers when needed. + open_until_steady_ns_.store(0, std::memory_order_release); half_open_inflight_ = 0; half_open_successes_ = 0; half_open_saw_failure_ = false; @@ -105,6 +113,9 @@ void CircuitBreakerSlice::TransitionOpenToHalfOpen() { // operationally distinct from OPEN rejection (still backing off) and // deserves its own breadcrumb in default-warn operator logs. first_reject_logged_for_open_ = false; + // Fresh HALF_OPEN cycle — any stale probe admissions from a prior + // HALF_OPEN cycle (after re-trip then re-enter) are now invalidated. 
+ ++generation_; logging::Get()->info( "circuit breaker half-open {} probes_allowed={}", @@ -131,6 +142,10 @@ void CircuitBreakerSlice::TransitionHalfOpenToClosed() { half_open_successes_ = 0; half_open_saw_failure_ = false; first_reject_logged_for_open_ = false; + // Fresh CLOSED cycle — any non-probe admissions from the PREVIOUS + // CLOSED cycle (before trip) are now stale, and any probe admissions + // from the just-completed HALF_OPEN cycle are too. + ++generation_; logging::Get()->info( "circuit breaker closed {} probes_succeeded={}", @@ -157,6 +172,9 @@ void CircuitBreakerSlice::TripHalfOpenToOpen(const char* trigger) { half_open_successes_ = 0; half_open_saw_failure_ = false; first_reject_logged_for_open_ = false; + // Bump generation — any in-flight probe admissions from the closing + // HALF_OPEN cycle are now stale. + ++generation_; trips_.fetch_add(1, std::memory_order_relaxed); @@ -169,9 +187,12 @@ void CircuitBreakerSlice::TripHalfOpenToOpen(const char* trigger) { if (transition_cb_) transition_cb_(State::HALF_OPEN, State::OPEN, trigger); } -Decision CircuitBreakerSlice::TryAcquire() { +CircuitBreakerSlice::Admission CircuitBreakerSlice::TryAcquire() { // Disabled fast path — zero overhead when config.enabled=false. - if (!config_.enabled) return Decision::ADMITTED; + // Use generation 0 (sentinel) since the slice won't consult it on report. + if (!config_.enabled) { + return Admission{Decision::ADMITTED, /*generation=*/0}; + } State s = state_.load(std::memory_order_acquire); @@ -188,7 +209,8 @@ Decision CircuitBreakerSlice::TryAcquire() { TransitionOpenToHalfOpen(); s = State::HALF_OPEN; } else { - return RejectWithLog("open", /*half_open_full=*/false); + return Admission{RejectWithLog("open", /*half_open_full=*/false), + generation_}; } } @@ -201,20 +223,23 @@ Decision CircuitBreakerSlice::TryAcquire() { // label and do NOT bump `rejected_half_open_full_` — that counter // is specifically "probing, no capacity left" for dashboards. 
if (half_open_saw_failure_) { - return RejectWithLog("half_open_recovery_failing", - /*half_open_full=*/false); + return Admission{RejectWithLog("half_open_recovery_failing", + /*half_open_full=*/false), + generation_}; } // Case B: probe budget fully in flight. "No capacity" — bump the // dedicated counter so dashboards can tell these two apart. if (half_open_inflight_ >= config_.permitted_half_open_calls) { - return RejectWithLog("half_open_full", /*half_open_full=*/true); + return Admission{RejectWithLog("half_open_full", + /*half_open_full=*/true), + generation_}; } half_open_inflight_++; - return Decision::ADMITTED_PROBE; + return Admission{Decision::ADMITTED_PROBE, generation_}; } // CLOSED: fast path. - return Decision::ADMITTED; + return Admission{Decision::ADMITTED, generation_}; } Decision CircuitBreakerSlice::RejectWithLog(const char* state_label, @@ -252,7 +277,8 @@ Decision CircuitBreakerSlice::RejectWithLog(const char* state_label, return Decision::REJECTED_OPEN; } -void CircuitBreakerSlice::ReportSuccess(bool probe) { +void CircuitBreakerSlice::ReportSuccess(bool probe, + uint64_t admission_generation) { if (!config_.enabled) return; if (probe) { @@ -261,13 +287,22 @@ void CircuitBreakerSlice::ReportSuccess(bool probe) { // signal about our state machine. probe_successes_.fetch_add(1, std::memory_order_relaxed); + // Generation guard: drop reports for admissions that pre-date the + // current cycle (a state transition or Reload reset invalidated them). + // Belt-and-suspenders with the state guard below — the generation + // catches stale-report-in-same-state cases (e.g., HALF_OPEN cycle + // A probe completing after re-trip and re-entry into HALF_OPEN B). 
+ if (admission_generation != generation_) { + reports_stale_generation_.fetch_add(1, std::memory_order_relaxed); + return; + } + // Stale probe defense: we admitted this probe in HALF_OPEN, but the - // slice may have transitioned out (e.g., `Reload()` flipped enabled - // or resized the window, `TransitionHalfOpenToClosed` already fired - // on sibling probes, or — post-Phase 8 — an operator toggle - // transitioned us to CLOSED). Only touch HALF_OPEN bookkeeping / - // fire transitions when state is STILL HALF_OPEN. Otherwise the - // probe is informational only. + // slice may have transitioned out (e.g., `Reload()` flipped enabled, + // `TransitionHalfOpenToClosed` already fired on sibling probes, or — + // post-Phase 8 — an operator toggle transitioned us to CLOSED). + // Only touch HALF_OPEN bookkeeping / fire transitions when state is + // STILL HALF_OPEN. if (state_.load(std::memory_order_acquire) != State::HALF_OPEN) return; if (half_open_inflight_ > 0) half_open_inflight_--; @@ -286,17 +321,25 @@ void CircuitBreakerSlice::ReportSuccess(bool probe) { return; } - // Non-probe success: only meaningful when state is CLOSED. If the slice - // has since transitioned (e.g., other requests in this burst tripped it), - // this late outcome must NOT retroactively reset `consecutive_failures_` - // or pollute the window — a fresh CLOSED cycle after recovery would start - // with bogus success history. + // Non-probe success path. + if (admission_generation != generation_) { + reports_stale_generation_.fetch_add(1, std::memory_order_relaxed); + return; + } + // Only meaningful when state is CLOSED. If the slice has since + // transitioned (e.g., other requests in this burst tripped it), this + // late outcome must NOT retroactively reset `consecutive_failures_` or + // pollute the window — a fresh CLOSED cycle after recovery would start + // with bogus success history. 
(Transitions bump `generation_`, so the + // guard above catches this too; the state check is a direct guard for + // observability clarity.) if (state_.load(std::memory_order_acquire) != State::CLOSED) return; consecutive_failures_ = 0; window_.AddSuccess(Now()); } -void CircuitBreakerSlice::ReportFailure(FailureKind kind, bool probe) { +void CircuitBreakerSlice::ReportFailure(FailureKind kind, bool probe, + uint64_t admission_generation) { (void)kind; // Kind is used by higher layers for logging; slice itself // treats all failures the same way for trip math. if (!config_.enabled) return; @@ -304,6 +347,11 @@ void CircuitBreakerSlice::ReportFailure(FailureKind kind, bool probe) { if (probe) { probe_failures_.fetch_add(1, std::memory_order_relaxed); + if (admission_generation != generation_) { + reports_stale_generation_.fetch_add(1, std::memory_order_relaxed); + return; + } + // Stale probe defense — see matching comment in ReportSuccess above. if (state_.load(std::memory_order_acquire) != State::HALF_OPEN) return; @@ -316,12 +364,18 @@ void CircuitBreakerSlice::ReportFailure(FailureKind kind, bool probe) { return; } - // Non-probe failure: only count when CLOSED. Late failures from requests - // admitted in CLOSED but completing after a trip must NOT re-enter - // `TripClosedToOpen` — doing so double-increments `consecutive_trips_` - // (inflating open_duration) and fires a spurious CLOSED→OPEN transition - // edge that downstream consumers (wait-queue drain, snapshot telemetry) - // would see as a ghost trip. + // Non-probe failure path. + if (admission_generation != generation_) { + reports_stale_generation_.fetch_add(1, std::memory_order_relaxed); + return; + } + // Only count when CLOSED. 
Late failures from requests admitted in CLOSED + // but completing after a trip must NOT re-enter `TripClosedToOpen` — + // doing so double-increments `consecutive_trips_` (inflating + // open_duration) and fires a spurious CLOSED→OPEN transition edge that + // downstream consumers (wait-queue drain, snapshot telemetry) would see + // as a ghost trip. (Again, the generation guard above catches this too; + // keep the state check for observability clarity.) if (state_.load(std::memory_order_acquire) != State::CLOSED) return; consecutive_failures_++; @@ -341,7 +395,22 @@ void CircuitBreakerSlice::Reload(const CircuitBreakerConfig& new_config) { (config_.window_seconds != new_config.window_seconds); config_ = new_config; - if (window_changed) window_.Resize(new_config.window_seconds); + if (window_changed) { + // Resize wipes the failure-rate ring buckets. Without bumping + // generation_ here, late completions from pre-reload admissions + // would still carry the matching generation, pass the guard, and + // repopulate the freshly empty window — mixing pre-reload and + // post-reload traffic. A pre-reload failure plus one new failure + // could then immediately satisfy minimum_volume / failure_rate + // and trip on the next evaluation, despite this being a fresh + // observation cycle by operator intent. + // + // Skip when enabled_changed is also true: the full-reset branch + // below bumps the generation as part of its larger reset, and + // double-bumping is harmless but noisy. 
+ window_.Resize(new_config.window_seconds); + if (!enabled_changed) ++generation_; + } if (enabled_changed) { // Toggling `enabled` is an operator intent to start fresh, not a @@ -370,6 +439,10 @@ void CircuitBreakerSlice::Reload(const CircuitBreakerConfig& new_config) { half_open_successes_ = 0; half_open_saw_failure_ = false; first_reject_logged_for_open_ = false; + // Fresh generation: reports of requests admitted before this + // reset will carry the old generation and be silently dropped, + // preserving clean-restart semantics. + ++generation_; } // When `enabled` is unchanged: live state preserved — operator expects // new thresholds to apply to the next evaluation, not to reset an diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h index d39ab52a..828dfe4f 100644 --- a/test/circuit_breaker_test.h +++ b/test/circuit_breaker_test.h @@ -56,12 +56,12 @@ void TestDisabledFastPath() { CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, [clock]() { return clock->now; }); - bool pass = slice.TryAcquire() == Decision::ADMITTED && + bool pass = slice.TryAcquire().decision == Decision::ADMITTED && slice.CurrentState() == State::CLOSED; // Reporting 100 failures must not trip. 
for (int i = 0; i < 100; ++i) { - slice.ReportFailure(FailureKind::CONNECT_FAILURE, false); + slice.ReportFailure(FailureKind::CONNECT_FAILURE, false, slice.CurrentGenerationForTesting()); } pass = pass && slice.CurrentState() == State::CLOSED && slice.Trips() == 0; @@ -83,10 +83,10 @@ void TestClosedStaysClosedBelowConsecutiveThreshold() { [clock]() { return clock->now; }); for (int i = 0; i < 4; ++i) { - slice.ReportFailure(FailureKind::CONNECT_FAILURE, false); + slice.ReportFailure(FailureKind::CONNECT_FAILURE, false, slice.CurrentGenerationForTesting()); } bool pass = slice.CurrentState() == State::CLOSED && - slice.TryAcquire() == Decision::ADMITTED && + slice.TryAcquire().decision == Decision::ADMITTED && slice.Trips() == 0; TestFramework::RecordTest("CB: 4 failures below threshold", pass, "", TestFramework::TestCategory::OTHER); @@ -105,11 +105,11 @@ void TestConsecutiveFailureTrip() { [clock]() { return clock->now; }); for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } bool pass = slice.CurrentState() == State::OPEN && slice.Trips() == 1 && - slice.TryAcquire() == Decision::REJECTED_OPEN; + slice.TryAcquire().decision == Decision::REJECTED_OPEN; TestFramework::RecordTest("CB: 5 consecutive failures trip", pass, "", TestFramework::TestCategory::OTHER); } catch (const std::exception& e) { @@ -130,17 +130,17 @@ void TestFailureRateTrip() { // Alternate 10 failures and 10 successes within the same second — // ratio = 50%, total = 20 (>= minimum_volume). for (int i = 0; i < 10; ++i) { - slice.ReportSuccess(false); + slice.ReportSuccess(false, slice.CurrentGenerationForTesting()); } // A success between-failures clears consecutive_failures_, confirming // only rate path can trip here. 
for (int i = 0; i < 9; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } // Still CLOSED — 9/19 < 50%. bool pass_pre = slice.CurrentState() == State::CLOSED; // 10th failure brings ratio to 10/20 = 50% exactly — tripper. - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); bool pass = pass_pre && slice.CurrentState() == State::OPEN && slice.Trips() == 1; TestFramework::RecordTest("CB: failure-rate trip (50% of 20)", pass, "", @@ -163,7 +163,7 @@ void TestMinimumVolumeGate() { // 19 total calls, all failures — should NOT trip (below volume). for (int i = 0; i < 19; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } bool pass = slice.CurrentState() == State::CLOSED && slice.Trips() == 0; TestFramework::RecordTest("CB: minimum_volume gate", pass, "", @@ -183,11 +183,11 @@ void TestOpenBeforeDurationStaysOpen() { [clock]() { return clock->now; }); for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } // Advance less than base_open_duration_ms (5000ms). 
clock->Advance(std::chrono::milliseconds(2000)); - Decision d = slice.TryAcquire(); + Decision d = slice.TryAcquire().decision; bool pass = d == Decision::REJECTED_OPEN && slice.CurrentState() == State::OPEN; TestFramework::RecordTest("CB: OPEN rejects before elapsed", pass, "", @@ -207,10 +207,10 @@ void TestOpenToHalfOpenAfterDuration() { [clock]() { return clock->now; }); for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); - Decision d = slice.TryAcquire(); + Decision d = slice.TryAcquire().decision; bool pass = d == Decision::ADMITTED_PROBE && slice.CurrentState() == State::HALF_OPEN; TestFramework::RecordTest("CB: OPEN -> HALF_OPEN after duration", pass, "", @@ -230,13 +230,13 @@ void TestHalfOpenAllProbesSucceed() { [clock]() { return clock->now; }); for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); // Take 5 probes; report success on each. 
for (int i = 0; i < cb.permitted_half_open_calls; ++i) { - Decision d = slice.TryAcquire(); + Decision d = slice.TryAcquire().decision; if (d != Decision::ADMITTED_PROBE) { TestFramework::RecordTest( "CB: HALF_OPEN 5 probe successes close", false, @@ -244,7 +244,7 @@ void TestHalfOpenAllProbesSucceed() { TestFramework::TestCategory::OTHER); return; } - slice.ReportSuccess(true); + slice.ReportSuccess(true, slice.CurrentGenerationForTesting()); } bool pass = slice.CurrentState() == State::CLOSED && slice.ProbeSuccesses() == 5; @@ -265,13 +265,13 @@ void TestHalfOpenProbeFailureReopens() { [clock]() { return clock->now; }); for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); // Take 1 probe, fail it. - Decision d = slice.TryAcquire(); - slice.ReportFailure(FailureKind::RESPONSE_5XX, true); + Decision d = slice.TryAcquire().decision; + slice.ReportFailure(FailureKind::RESPONSE_5XX, true, slice.CurrentGenerationForTesting()); bool pass = d == Decision::ADMITTED_PROBE && slice.CurrentState() == State::OPEN && slice.Trips() == 2 && // initial trip + re-trip @@ -293,13 +293,13 @@ void TestHalfOpenExhaustedSlotsRejected() { [clock]() { return clock->now; }); for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); // Take 5 probes but DON'T report outcomes yet. for (int i = 0; i < 5; ++i) slice.TryAcquire(); // 6th TryAcquire must reject (all slots taken). 
- Decision d = slice.TryAcquire(); + Decision d = slice.TryAcquire().decision; bool pass = d == Decision::REJECTED_OPEN; TestFramework::RecordTest("CB: HALF_OPEN over capacity rejects", pass, "", TestFramework::TestCategory::OTHER); @@ -322,7 +322,7 @@ void TestExponentialBackoff() { auto trip_then_probe_fail = [&]() { // Reach OPEN. for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } }; auto measure_open_ms = [&]() { @@ -339,19 +339,19 @@ void TestExponentialBackoff() { // Move to HALF_OPEN and fail the probe → trip 2. clock->Advance(std::chrono::milliseconds(d1 + 1)); slice.TryAcquire(); // HALF_OPEN, ADMITTED_PROBE - slice.ReportFailure(FailureKind::RESPONSE_5XX, true); + slice.ReportFailure(FailureKind::RESPONSE_5XX, true, slice.CurrentGenerationForTesting()); int64_t d2 = measure_open_ms(); clock->Advance(std::chrono::milliseconds(d2 + 1)); slice.TryAcquire(); - slice.ReportFailure(FailureKind::RESPONSE_5XX, true); + slice.ReportFailure(FailureKind::RESPONSE_5XX, true, slice.CurrentGenerationForTesting()); int64_t d3 = measure_open_ms(); clock->Advance(std::chrono::milliseconds(d3 + 1)); slice.TryAcquire(); - slice.ReportFailure(FailureKind::RESPONSE_5XX, true); + slice.ReportFailure(FailureKind::RESPONSE_5XX, true, slice.CurrentGenerationForTesting()); int64_t d4 = measure_open_ms(); clock->Advance(std::chrono::milliseconds(d4 + 1)); slice.TryAcquire(); - slice.ReportFailure(FailureKind::RESPONSE_5XX, true); + slice.ReportFailure(FailureKind::RESPONSE_5XX, true, slice.CurrentGenerationForTesting()); int64_t d5 = measure_open_ms(); // Expect 1000, 2000, 4000, 8000, 8000 (capped). @@ -379,17 +379,17 @@ void TestResetOnClose() { // Trip 1. 
for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } clock->Advance(std::chrono::milliseconds(1001)); // Move to HALF_OPEN. for (int i = 0; i < 5; ++i) { slice.TryAcquire(); - slice.ReportSuccess(true); + slice.ReportSuccess(true, slice.CurrentGenerationForTesting()); } // Now CLOSED. Trip again — expect base_duration again (not doubled). for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } auto open_until = slice.OpenUntil(); auto remaining = open_until - clock->now; @@ -496,10 +496,10 @@ void TestDryRunAdmits() { [clock]() { return clock->now; }); for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } // OPEN + dry_run → REJECTED_OPEN_DRYRUN (caller proceeds). - Decision d = slice.TryAcquire(); + Decision d = slice.TryAcquire().decision; bool pass = d == Decision::REJECTED_OPEN_DRYRUN && slice.CurrentState() == State::OPEN && slice.Rejected() == 1; @@ -520,7 +520,7 @@ void TestReloadPreservesState() { [clock]() { return clock->now; }); for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } // OPEN at this point. 
auto cb2 = cb; @@ -545,7 +545,7 @@ void TestConsecutiveThresholdOne() { auto clock = std::make_shared(); CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, [clock]() { return clock->now; }); - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); bool pass = slice.CurrentState() == State::OPEN && slice.Trips() == 1; TestFramework::RecordTest("CB: threshold=1 single failure trips", pass, "", TestFramework::TestCategory::OTHER); @@ -564,10 +564,10 @@ void TestSuccessClearsConsecutive() { [clock]() { return clock->now; }); for (int i = 0; i < 4; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } - slice.ReportSuccess(false); // resets consecutive - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportSuccess(false, slice.CurrentGenerationForTesting()); // resets consecutive + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); // consecutive is back to 1, no trip. bool pass = slice.CurrentState() == State::CLOSED; TestFramework::RecordTest("CB: success clears consecutive", pass, "", @@ -601,7 +601,7 @@ void TestLateFailureAfterTripDoesNotInflateBackoff() { // production the outcomes for those admitted requests can arrive after // the slice has already tripped. for (int i = 0; i < 10; ++i) { - Decision d = slice.TryAcquire(); + Decision d = slice.TryAcquire().decision; if (d != Decision::ADMITTED) { TestFramework::RecordTest("CB: late failure after trip", false, "admission i=" + std::to_string(i) + " not ADMITTED", @@ -611,7 +611,7 @@ void TestLateFailureAfterTripDoesNotInflateBackoff() { } // Report 5 failures — trip at the 5th. 
for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } if (slice.CurrentState() != State::OPEN) { TestFramework::RecordTest("CB: late failure after trip", false, @@ -628,7 +628,7 @@ void TestLateFailureAfterTripDoesNotInflateBackoff() { // climb consecutive_failures_, and trigger another TripClosedToOpen // even though state is already OPEN. for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } int64_t trips_after_late = slice.Trips(); auto open_until_after_late = slice.OpenUntil(); @@ -661,11 +661,11 @@ void TestLateSuccessAfterTripIgnored() { [clock]() { return clock->now; }); for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } // Slice is OPEN now. A late success arrives — must not change state. State pre = slice.CurrentState(); - slice.ReportSuccess(false); + slice.ReportSuccess(false, slice.CurrentGenerationForTesting()); bool pass = pre == State::OPEN && slice.CurrentState() == State::OPEN; TestFramework::RecordTest("CB: late success after trip ignored", pass, "", TestFramework::TestCategory::OTHER); @@ -691,14 +691,14 @@ void TestHalfOpenStopsAdmittingAfterFirstProbeFailure() { // Trip the breaker. for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); // Admit 2 probes. Report failure on the first (but NOT the second yet // — leave 1 in-flight so we can observe the short-circuit). 
- Decision d1 = slice.TryAcquire(); // ADMITTED_PROBE, inflight=1 - Decision d2 = slice.TryAcquire(); // ADMITTED_PROBE, inflight=2 + Decision d1 = slice.TryAcquire().decision; // ADMITTED_PROBE, inflight=1 + Decision d2 = slice.TryAcquire().decision; // ADMITTED_PROBE, inflight=2 if (d1 != Decision::ADMITTED_PROBE || d2 != Decision::ADMITTED_PROBE) { TestFramework::RecordTest( "CB: HALF_OPEN stops admitting after probe fail", @@ -708,7 +708,7 @@ void TestHalfOpenStopsAdmittingAfterFirstProbeFailure() { } // Fail the first probe — inflight drops to 1, saw_failure=true. // Last-probe trip does not yet fire (inflight is still 1). - slice.ReportFailure(FailureKind::RESPONSE_5XX, true); + slice.ReportFailure(FailureKind::RESPONSE_5XX, true, slice.CurrentGenerationForTesting()); // State must still be HALF_OPEN (final probe not yet completed). State mid = slice.CurrentState(); @@ -716,7 +716,7 @@ void TestHalfOpenStopsAdmittingAfterFirstProbeFailure() { // Subsequent TryAcquire — BEFORE fix this would succeed because // inflight (1) < permitted (5). AFTER fix it short-circuits because // saw_failure is set. - Decision d3 = slice.TryAcquire(); + Decision d3 = slice.TryAcquire().decision; bool pass = mid == State::HALF_OPEN && d3 == Decision::REJECTED_OPEN; @@ -745,7 +745,7 @@ void TestHalfOpenFullCounterSeparate() { // Trip → OPEN reject increments generic counter only. for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } slice.TryAcquire(); // REJECTED_OPEN (backoff active) int64_t rejected_open_only = slice.Rejected(); @@ -791,7 +791,7 @@ void TestReloadResetsStateOnEnabledToggleWhileOpen() { // Drive to OPEN. 
for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } if (slice.CurrentState() != State::OPEN) { TestFramework::RecordTest( @@ -813,7 +813,7 @@ void TestReloadResetsStateOnEnabledToggleWhileOpen() { // And the slice must NOT insta-trip on a single failure (pre-fix, // consecutive_failures_ could have persisted ≥ threshold). - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); bool one_fail_no_trip = slice.CurrentState() == State::CLOSED; bool pass = disabled_closed && reenabled_closed && one_fail_no_trip; @@ -845,7 +845,7 @@ void TestReloadResetsConsecutiveFailuresOnEnabledToggle() { // 4 failures — just under threshold. State still CLOSED. for (int i = 0; i < 4; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } if (slice.CurrentState() != State::CLOSED) { TestFramework::RecordTest( @@ -862,7 +862,7 @@ void TestReloadResetsConsecutiveFailuresOnEnabledToggle() { // A single failure post-reenable must NOT trip — consecutive_failures_ // should have been reset to 0, not preserved at 4. - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); bool pass = slice.CurrentState() == State::CLOSED; TestFramework::RecordTest( "CB: reload clears consecutive_failures on enable toggle", @@ -889,7 +889,7 @@ void TestReloadThresholdChangePreservesState() { [clock]() { return clock->now; }); for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } // OPEN. Reload with a tighter threshold but enabled unchanged. 
auto tighter = cb; @@ -923,19 +923,19 @@ void TestSawFailureDoesNotBumpHalfOpenFullCounter() { [clock]() { return clock->now; }); for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); // Admit 2 probes, fail the first — saw_failure=true, inflight=1. slice.TryAcquire(); // probe 1 admitted slice.TryAcquire(); // probe 2 admitted - slice.ReportFailure(FailureKind::RESPONSE_5XX, true); + slice.ReportFailure(FailureKind::RESPONSE_5XX, true, slice.CurrentGenerationForTesting()); int64_t hof_before = slice.RejectedHalfOpenFull(); // Reject via saw_failure short-circuit (capacity is NOT exhausted — // only 1 probe actually in flight, and permitted is 5). - Decision d = slice.TryAcquire(); + Decision d = slice.TryAcquire().decision; int64_t hof_after = slice.RejectedHalfOpenFull(); // Still REJECTED_OPEN (same client-visible outcome), but @@ -956,6 +956,289 @@ void TestSawFailureDoesNotBumpHalfOpenFullCounter() { } } +// BUG (review round 3, P2): TransitionOpenToHalfOpen deliberately left +// `open_until_steady_ns_` populated, violating the documented OpenUntil() +// contract ("zero when not OPEN"). A Phase 4 consumer computing Retry-After +// from a HALF_OPEN slice would compute (stale_deadline - now), which is +// negative once HALF_OPEN begins. +void TestOpenUntilZeroWhenHalfOpen() { + std::cout << "\n[TEST] CB: OpenUntil() zero in HALF_OPEN..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Trip → OPEN. OpenUntil() must be non-zero (contract: zero iff NOT OPEN). 
+ for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, + slice.CurrentGenerationForTesting()); + } + auto open_ns = slice.OpenUntil(); + bool open_nonzero = open_ns != std::chrono::steady_clock::time_point{}; + + // Elapse backoff → HALF_OPEN via TryAcquire. + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + auto a = slice.TryAcquire(); + bool halfopen = slice.CurrentState() == State::HALF_OPEN && + a.decision == Decision::ADMITTED_PROBE; + + // Contract: OpenUntil() zero now that state != OPEN. + auto halfopen_ns = slice.OpenUntil(); + bool halfopen_zero = halfopen_ns == std::chrono::steady_clock::time_point{}; + + bool pass = open_nonzero && halfopen && halfopen_zero; + TestFramework::RecordTest( + "CB: OpenUntil() zero in HALF_OPEN", + pass, pass ? "" : + "open_nonzero=" + std::to_string(open_nonzero) + + " halfopen=" + std::to_string(halfopen) + + " halfopen_zero=" + std::to_string(halfopen_zero), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: OpenUntil() zero in HALF_OPEN", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// BUG (review round 3, P1): Reload reset the state on enabled toggle but +// gave Report* no way to distinguish pre-toggle admissions from post-toggle +// ones. Stale completions then polluted the fresh CLOSED cycle. Fixed with +// a generation token captured at admission and checked at report. +void TestStaleGenerationReportsDroppedAfterReloadToggle() { + std::cout << "\n[TEST] CB: stale-generation reports dropped after reload toggle..." + << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.consecutive_failure_threshold = 3; // make insta-trip detection easy + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Admit 3 requests in the original CLOSED cycle (generation = A). 
+ auto a1 = slice.TryAcquire(); + auto a2 = slice.TryAcquire(); + auto a3 = slice.TryAcquire(); + uint64_t gen_A = a1.generation; + bool same_gen_pre = a2.generation == gen_A && a3.generation == gen_A; + + // Operator toggles: disable then re-enable → fresh CLOSED cycle. + auto disabled = cb; disabled.enabled = false; + slice.Reload(disabled); + slice.Reload(cb); + // After toggle, state is CLOSED and generation has advanced. + uint64_t gen_B = slice.CurrentGenerationForTesting(); + bool generation_advanced = gen_B != gen_A; + + // Late failures from the pre-toggle cycle arrive. Without the fix, + // these would increment consecutive_failures_ and trip the fresh + // cycle IMMEDIATELY (threshold=3, 3 late failures). + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, gen_A); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, gen_A); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, gen_A); + + // Fresh cycle must be untouched. + bool state_still_closed = slice.CurrentState() == State::CLOSED; + bool stale_counter_bumped = slice.ReportsStaleGeneration() == 3; + + // A fresh post-toggle admission + 3 REAL failures should still trip — + // so the guard didn't over-drop. + auto fresh = slice.TryAcquire(); + for (int i = 0; i < 3; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, fresh.generation); + } + bool fresh_trips = slice.CurrentState() == State::OPEN; + + bool pass = same_gen_pre && generation_advanced && + state_still_closed && stale_counter_bumped && fresh_trips; + TestFramework::RecordTest( + "CB: stale-generation reports dropped after reload toggle", + pass, pass ? 
"" : + "same_gen_pre=" + std::to_string(same_gen_pre) + + " gen_advanced=" + std::to_string(generation_advanced) + + " state_closed=" + std::to_string(state_still_closed) + + " stale_cnt=" + std::to_string(slice.ReportsStaleGeneration()) + + " fresh_trips=" + std::to_string(fresh_trips), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: stale-generation reports dropped after reload toggle", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Generation also advances across state transitions (not just Reload), so +// a report admitted in CLOSED cycle A that completes after OPEN → HALF_OPEN +// → CLOSED cycle B is dropped instead of polluting cycle B's counters. +void TestStaleGenerationReportsDroppedAcrossStateTransitions() { + std::cout << "\n[TEST] CB: stale reports dropped across CLOSED->OPEN->CLOSED..." + << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // CLOSED cycle A — admit a request, capture its generation. + auto admit_A = slice.TryAcquire(); + uint64_t gen_A = admit_A.generation; + + // Drive to OPEN, then HALF_OPEN, then CLOSED (cycle B) via probe success. + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, + slice.CurrentGenerationForTesting()); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + for (int i = 0; i < cb.permitted_half_open_calls; ++i) { + auto p = slice.TryAcquire(); // probe + slice.ReportSuccess(true, p.generation); + } + bool cycleB_closed = slice.CurrentState() == State::CLOSED; + uint64_t gen_B = slice.CurrentGenerationForTesting(); + bool gen_advanced = gen_B > gen_A; + + // Now the original cycle-A request finally reports a success. 
In a + // world without the generation guard, this would reset cycle B's + // (freshly-zero) consecutive_failures_ and add to cycle B's window, + // polluting fresh telemetry. + int64_t stale_before = slice.ReportsStaleGeneration(); + slice.ReportSuccess(false, gen_A); + int64_t stale_after = slice.ReportsStaleGeneration(); + bool dropped = stale_after == stale_before + 1; + + bool pass = cycleB_closed && gen_advanced && dropped; + TestFramework::RecordTest( + "CB: stale reports dropped across CLOSED->OPEN->CLOSED", + pass, pass ? "" : + "cycleB_closed=" + std::to_string(cycleB_closed) + + " gen_advanced=" + std::to_string(gen_advanced) + + " dropped=" + std::to_string(dropped), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: stale reports dropped across CLOSED->OPEN->CLOSED", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// BUG (review round 4, P2): Reload that resizes the rolling window without +// toggling enabled cleared the window buckets but left generation_ unchanged. +// Late reports from pre-reload admissions would carry the still-current +// generation, pass the guard, and re-populate the freshly empty window — +// mixing pre-reload and post-reload traffic. A pre-reload + post-reload +// failure pair could satisfy minimum_volume / failure_rate immediately on +// what should be a fresh observation cycle. +void TestWindowResizeAdvancesGeneration() { + std::cout << "\n[TEST] CB: window resize advances generation..." << std::endl; + try { + // Use rate-trip path only (high consec threshold disables that path), + // a low minimum_volume so 2 failures suffice, and a high + // failure_rate_threshold so the trip relies on the rate calc. 
+ CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 1000; // disable consecutive path + cb.failure_rate_threshold = 50; + cb.minimum_volume = 2; + cb.window_seconds = 10; + cb.permitted_half_open_calls = 5; + cb.base_open_duration_ms = 5000; + cb.max_open_duration_ms = 60000; + + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Pre-reload: admit a request and capture its generation. + auto admit_pre = slice.TryAcquire(); + uint64_t gen_pre = admit_pre.generation; + + // Reload: change window_seconds but keep enabled=true. Window is + // resized (cleared) and generation MUST advance so the pre-reload + // admission's late report doesn't seed the new window. + auto resized = cb; + resized.window_seconds = 30; + slice.Reload(resized); + + uint64_t gen_post = slice.CurrentGenerationForTesting(); + bool gen_advanced = gen_post != gen_pre; + + // The pre-reload admission completes (failure). Without the fix, + // this would add one failure to the freshly-empty window. Then + // a post-reload admission's failure brings total=2 >= minimum_volume, + // failures=2/2=100% >= 50% → IMMEDIATE TRIP on a fresh window. + // With the fix, the pre-reload report is dropped (counted as stale). + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, gen_pre); + + int64_t stale_after_pre = slice.ReportsStaleGeneration(); + + // Now a real post-reload admission and failure — single failure in + // a fresh window of size 30s. total=1, below minimum_volume=2 → no trip. + auto admit_post = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, admit_post.generation); + + bool state_still_closed = slice.CurrentState() == State::CLOSED; + bool stale_dropped = stale_after_pre == 1; + + bool pass = gen_advanced && state_still_closed && stale_dropped; + TestFramework::RecordTest( + "CB: window resize advances generation", + pass, pass ? 
"" : + "gen_advanced=" + std::to_string(gen_advanced) + + " state_closed=" + std::to_string(state_still_closed) + + " stale_count=" + std::to_string(slice.ReportsStaleGeneration()), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: window resize advances generation", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Regression guard: a reload that changes only thresholds (no window resize, +// no enabled toggle) MUST preserve generation. Operator intent is "apply new +// thresholds to existing observations" — the round-4 fix's window-resize +// generation bump must NOT trigger here. +void TestThresholdOnlyReloadDoesNotAdvanceGeneration() { + std::cout << "\n[TEST] CB: threshold-only reload preserves generation..." + << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + auto admit = slice.TryAcquire(); + uint64_t gen_pre = admit.generation; + + // Tighten thresholds; same enabled, same window_seconds. + auto tightened = cb; + tightened.consecutive_failure_threshold = 2; + tightened.failure_rate_threshold = 30; + slice.Reload(tightened); + + uint64_t gen_post = slice.CurrentGenerationForTesting(); + bool gen_preserved = gen_post == gen_pre; + + // The pre-reload admission's report should NOT be dropped — operator + // wants the new thresholds applied to existing in-flight observations. + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, gen_pre); + bool stale_zero = slice.ReportsStaleGeneration() == 0; + + bool pass = gen_preserved && stale_zero; + TestFramework::RecordTest( + "CB: threshold-only reload preserves generation", + pass, pass ? 
"" : + "gen_preserved=" + std::to_string(gen_preserved) + + " stale_zero=" + std::to_string(stale_zero), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: threshold-only reload preserves generation", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + void TestTransitionCallbackInvoked() { std::cout << "\n[TEST] CB: transition callback invoked..." << std::endl; try { @@ -976,12 +1259,12 @@ void TestTransitionCallbackInvoked() { // Full cycle. for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); for (int i = 0; i < cb.permitted_half_open_calls; ++i) { slice.TryAcquire(); - slice.ReportSuccess(true); + slice.ReportSuccess(true, slice.CurrentGenerationForTesting()); } bool pass = closed_to_open == 1 && open_to_halfopen == 1 && halfopen_to_closed == 1; @@ -1027,6 +1310,11 @@ void RunAllTests() { TestReloadResetsConsecutiveFailuresOnEnabledToggle(); TestReloadThresholdChangePreservesState(); TestSawFailureDoesNotBumpHalfOpenFullCounter(); + TestOpenUntilZeroWhenHalfOpen(); + TestStaleGenerationReportsDroppedAfterReloadToggle(); + TestStaleGenerationReportsDroppedAcrossStateTransitions(); + TestWindowResizeAdvancesGeneration(); + TestThresholdOnlyReloadDoesNotAdvanceGeneration(); TestTransitionCallbackInvoked(); } From fa585ac508f1f277379daca5721a9167e39da1bc Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 00:10:07 +0800 Subject: [PATCH 05/37] Fix review comment --- .../circuit_breaker/circuit_breaker_slice.h | 57 +++++-- include/config/server_config.h | 11 +- server/circuit_breaker_slice.cc | 109 ++++++++------ test/circuit_breaker_test.h | 140 ++++++++++++++++++ test/config_test.h | 48 ++++-- 5 files changed, 294 insertions(+), 71 deletions(-) diff --git 
a/include/circuit_breaker/circuit_breaker_slice.h b/include/circuit_breaker/circuit_breaker_slice.h index 4ba1331d..f08a8358 100644 --- a/include/circuit_breaker/circuit_breaker_slice.h +++ b/include/circuit_breaker/circuit_breaker_slice.h @@ -82,19 +82,32 @@ class CircuitBreakerSlice { return rejected_half_open_full_.load(std::memory_order_relaxed); } // Number of Report* calls silently dropped because their admission - // generation no longer matches the slice's current generation. These - // are reports of requests admitted before a state transition or a + // generation no longer matches the relevant per-domain counter + // (closed_gen_ for non-probe, halfopen_gen_ for probe). These are + // reports of requests admitted before a state transition or a // Reload()-reset. Useful for detecting mis-threaded admission tokens. int64_t ReportsStaleGeneration() const { return reports_stale_generation_.load(std::memory_order_relaxed); } - // **Test-only** accessor for the current generation. Production callers - // MUST use the generation returned by TryAcquire (racy otherwise — this - // getter is not atomic). Tests use it as ergonomic shorthand for - // "admission just happened in the current cycle", bypassing the need to - // thread a token per synthetic Report* call. - uint64_t CurrentGenerationForTesting() const { return generation_; } + // **Test-only** accessor for the generation that the current state's + // next admission would receive. Returns `halfopen_gen_` when state is + // HALF_OPEN (probe admissions use that counter), otherwise `closed_gen_` + // (non-probe admissions use that counter). This matches what TryAcquire + // would stamp on a new admission right now. + // + // Production callers MUST use the generation returned by TryAcquire + // (racy otherwise — these getters are not atomic). Tests use it as + // ergonomic shorthand for "admission just happened in the current + // cycle", bypassing the need to thread a token per synthetic Report*. 
+ uint64_t CurrentGenerationForTesting() const { + return (state_.load(std::memory_order_acquire) == State::HALF_OPEN) + ? halfopen_gen_ : closed_gen_; + } + // Explicit per-domain getters for tests that cross state transitions + // while holding a captured generation from a specific domain. + uint64_t CurrentClosedGenForTesting() const { return closed_gen_; } + uint64_t CurrentHalfOpenGenForTesting() const { return halfopen_gen_; } const std::string& host_label() const { return host_label_; } size_t dispatcher_index() const { return dispatcher_index_; } @@ -140,13 +153,27 @@ class CircuitBreakerSlice { // post-trip reject in default-warn operator logs. Dispatcher-thread only. bool first_reject_logged_for_open_ = false; - // Monotonic generation counter. Incremented on every state transition - // AND on every Reload() enabled-toggle reset. TryAcquire captures the - // current generation at admission time; Report* compares against it - // and drops reports from a stale generation (e.g., a request admitted - // before an operator reset whose outcome arrives after). Dispatcher- - // thread only — plain int (no atomic needed). - uint64_t generation_ = 1; + // Monotonic generation counters — one per admission domain. TryAcquire + // stamps the admission with the domain's current value; Report* compares + // against it and drops reports whose admission no longer matches a live + // cycle. Split into two counters so operations that reset ONE domain + // (e.g., window_seconds reload wipes the CLOSED rate window) don't + // invalidate admissions in the OTHER domain (HALF_OPEN probes) — which + // would strand probe capacity and wedge the slice in HALF_OPEN. + // + // Dispatcher-thread only — plain ints (no atomics needed). + // + // closed_gen_ bumps on: TripClosedToOpen (CLOSED cycle ends), + // Reload enabled-toggle reset, + // Reload window_seconds change (rate-window wipe). 
+ // halfopen_gen_ bumps on: TripHalfOpenToOpen (HALF_OPEN cycle ends), + // TransitionHalfOpenToClosed (HALF_OPEN cycle ends on success), + // Reload enabled-toggle reset. + // + // Initial value 1 (so 0 can be a "not-applicable" sentinel for + // admissions returned from disabled slices or the REJECTED_* paths). + uint64_t closed_gen_ = 1; + uint64_t halfopen_gen_ = 1; // Rejections silently dropped because their admission generation no // longer matches `generation_`. Observability only; lets dashboards see diff --git a/include/config/server_config.h b/include/config/server_config.h index 7dd949d1..6a82521a 100644 --- a/include/config/server_config.h +++ b/include/config/server_config.h @@ -183,10 +183,17 @@ struct UpstreamConfig { ProxyConfig proxy; CircuitBreakerConfig circuit_breaker; + // Intentionally EXCLUDES circuit_breaker — breaker tuning is live- + // reloadable (§10 of CIRCUIT_BREAKER_DESIGN.md) and must not trigger + // the "upstream configuration changes require a restart" warning in + // HttpServer::Reload (http_server.cc:3383). Phase 8's breaker-reload + // path compares CircuitBreakerConfig fields directly (per-host + // iteration), not via this operator==. All other fields here are + // restart-required: changing name/host/port/tls rebuilds pool + // topology; changing pool/proxy would re-register routes. 
bool operator==(const UpstreamConfig& o) const { return name == o.name && host == o.host && port == o.port && - tls == o.tls && pool == o.pool && proxy == o.proxy && - circuit_breaker == o.circuit_breaker; + tls == o.tls && pool == o.pool && proxy == o.proxy; } bool operator!=(const UpstreamConfig& o) const { return !(*this == o); } }; diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index 32fcdfc5..03313173 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -80,9 +80,10 @@ void CircuitBreakerSlice::TripClosedToOpen(const char* trigger) { half_open_successes_ = 0; half_open_saw_failure_ = false; first_reject_logged_for_open_ = false; - // Bump generation: any in-flight admission from the closing CLOSED - // cycle is now stale. Late Report*() for those requests is dropped. - ++generation_; + // Bump closed_gen_: non-probe admissions from the closing CLOSED cycle + // are now stale. Late Report(false, ...) calls for those requests drop. + // halfopen_gen_ is NOT bumped — OPEN holds no HALF_OPEN admissions. + ++closed_gen_; trips_.fetch_add(1, std::memory_order_relaxed); @@ -113,9 +114,11 @@ void CircuitBreakerSlice::TransitionOpenToHalfOpen() { // operationally distinct from OPEN rejection (still backing off) and // deserves its own breadcrumb in default-warn operator logs. first_reject_logged_for_open_ = false; - // Fresh HALF_OPEN cycle — any stale probe admissions from a prior - // HALF_OPEN cycle (after re-trip then re-enter) are now invalidated. - ++generation_; + // NOTE: neither closed_gen_ nor halfopen_gen_ is bumped here. No + // admissions are made in OPEN — the previous HALF_OPEN cycle (if any) + // already bumped halfopen_gen_ on its exit (TripHalfOpenToOpen) or on + // cycle-complete (TransitionHalfOpenToClosed), so any latent stale + // probes are already tagged. Bumping again would be redundant. 
logging::Get()->info( "circuit breaker half-open {} probes_allowed={}", @@ -142,10 +145,11 @@ void CircuitBreakerSlice::TransitionHalfOpenToClosed() { half_open_successes_ = 0; half_open_saw_failure_ = false; first_reject_logged_for_open_ = false; - // Fresh CLOSED cycle — any non-probe admissions from the PREVIOUS - // CLOSED cycle (before trip) are now stale, and any probe admissions - // from the just-completed HALF_OPEN cycle are too. - ++generation_; + // Bump halfopen_gen_: the just-completed HALF_OPEN cycle's probe + // admissions are now stale. closed_gen_ is NOT bumped — pre-trip + // CLOSED admissions were already invalidated by TripClosedToOpen + // when we left CLOSED. + ++halfopen_gen_; logging::Get()->info( "circuit breaker closed {} probes_succeeded={}", @@ -172,9 +176,10 @@ void CircuitBreakerSlice::TripHalfOpenToOpen(const char* trigger) { half_open_successes_ = 0; half_open_saw_failure_ = false; first_reject_logged_for_open_ = false; - // Bump generation — any in-flight probe admissions from the closing - // HALF_OPEN cycle are now stale. - ++generation_; + // Bump halfopen_gen_: probe admissions from the closing HALF_OPEN + // cycle are now stale. closed_gen_ is NOT bumped — no CLOSED + // admissions are outstanding (we came from HALF_OPEN, not CLOSED). + ++halfopen_gen_; trips_.fetch_add(1, std::memory_order_relaxed); @@ -209,8 +214,12 @@ CircuitBreakerSlice::Admission CircuitBreakerSlice::TryAcquire() { TransitionOpenToHalfOpen(); s = State::HALF_OPEN; } else { + // Rejected admissions get generation 0 — callers must not call + // Report* for a rejected admission, and 0 always compares stale + // (domain gens start at 1), so an accidental Report would drop + // safely rather than mutating state. 
return Admission{RejectWithLog("open", /*half_open_full=*/false), - generation_}; + /*generation=*/0}; } } @@ -225,21 +234,22 @@ CircuitBreakerSlice::Admission CircuitBreakerSlice::TryAcquire() { if (half_open_saw_failure_) { return Admission{RejectWithLog("half_open_recovery_failing", /*half_open_full=*/false), - generation_}; + /*generation=*/0}; } // Case B: probe budget fully in flight. "No capacity" — bump the // dedicated counter so dashboards can tell these two apart. if (half_open_inflight_ >= config_.permitted_half_open_calls) { return Admission{RejectWithLog("half_open_full", /*half_open_full=*/true), - generation_}; + /*generation=*/0}; } half_open_inflight_++; - return Admission{Decision::ADMITTED_PROBE, generation_}; + // Probe admission — stamp with halfopen_gen_. + return Admission{Decision::ADMITTED_PROBE, halfopen_gen_}; } - // CLOSED: fast path. - return Admission{Decision::ADMITTED, generation_}; + // CLOSED: fast path — stamp with closed_gen_. + return Admission{Decision::ADMITTED, closed_gen_}; } Decision CircuitBreakerSlice::RejectWithLog(const char* state_label, @@ -287,12 +297,13 @@ void CircuitBreakerSlice::ReportSuccess(bool probe, // signal about our state machine. probe_successes_.fetch_add(1, std::memory_order_relaxed); - // Generation guard: drop reports for admissions that pre-date the - // current cycle (a state transition or Reload reset invalidated them). - // Belt-and-suspenders with the state guard below — the generation - // catches stale-report-in-same-state cases (e.g., HALF_OPEN cycle - // A probe completing after re-trip and re-entry into HALF_OPEN B). - if (admission_generation != generation_) { + // Generation guard: drop reports for probes admitted before the + // current HALF_OPEN cycle. 
Probes use halfopen_gen_ exclusively — + // so a window_seconds reload (bumps closed_gen_, NOT halfopen_gen_) + // does NOT invalidate in-flight probes, which would otherwise + // strand half_open_inflight_ at its pre-reload value and wedge the + // slice in HALF_OPEN/half_open_full. + if (admission_generation != halfopen_gen_) { reports_stale_generation_.fetch_add(1, std::memory_order_relaxed); return; } @@ -321,8 +332,8 @@ void CircuitBreakerSlice::ReportSuccess(bool probe, return; } - // Non-probe success path. - if (admission_generation != generation_) { + // Non-probe success path — checked against closed_gen_. + if (admission_generation != closed_gen_) { reports_stale_generation_.fetch_add(1, std::memory_order_relaxed); return; } @@ -330,7 +341,7 @@ void CircuitBreakerSlice::ReportSuccess(bool probe, // transitioned (e.g., other requests in this burst tripped it), this // late outcome must NOT retroactively reset `consecutive_failures_` or // pollute the window — a fresh CLOSED cycle after recovery would start - // with bogus success history. (Transitions bump `generation_`, so the + // with bogus success history. (Transitions bump `closed_gen_`, so the // guard above catches this too; the state check is a direct guard for // observability clarity.) if (state_.load(std::memory_order_acquire) != State::CLOSED) return; @@ -347,7 +358,8 @@ void CircuitBreakerSlice::ReportFailure(FailureKind kind, bool probe, if (probe) { probe_failures_.fetch_add(1, std::memory_order_relaxed); - if (admission_generation != generation_) { + // Probes use halfopen_gen_ — see matching comment in ReportSuccess. + if (admission_generation != halfopen_gen_) { reports_stale_generation_.fetch_add(1, std::memory_order_relaxed); return; } @@ -364,8 +376,8 @@ void CircuitBreakerSlice::ReportFailure(FailureKind kind, bool probe, return; } - // Non-probe failure path. - if (admission_generation != generation_) { + // Non-probe failure path — checked against closed_gen_. 
+ if (admission_generation != closed_gen_) { reports_stale_generation_.fetch_add(1, std::memory_order_relaxed); return; } @@ -397,19 +409,24 @@ void CircuitBreakerSlice::Reload(const CircuitBreakerConfig& new_config) { config_ = new_config; if (window_changed) { // Resize wipes the failure-rate ring buckets. Without bumping - // generation_ here, late completions from pre-reload admissions - // would still carry the matching generation, pass the guard, and - // repopulate the freshly empty window — mixing pre-reload and - // post-reload traffic. A pre-reload failure plus one new failure - // could then immediately satisfy minimum_volume / failure_rate - // and trip on the next evaluation, despite this being a fresh - // observation cycle by operator intent. + // closed_gen_ here, late completions from pre-reload CLOSED + // admissions would pass the generation guard and repopulate the + // freshly empty window — mixing pre-reload and post-reload traffic + // in the rate-trip calc. + // + // CRUCIALLY: we bump ONLY closed_gen_, NOT halfopen_gen_. + // window_seconds affects only the CLOSED rate window. Bumping + // halfopen_gen_ too (as prior fix did) would invalidate in-flight + // probes, whose late reports could no longer decrement + // half_open_inflight_ or honor saw_failure/TripHalfOpenToOpen — + // wedging the slice in HALF_OPEN/half_open_full with full probe + // slots until another reset. Probe bookkeeping is untouched by + // Resize, so preserving halfopen_gen_ keeps probes live. // // Skip when enabled_changed is also true: the full-reset branch - // below bumps the generation as part of its larger reset, and - // double-bumping is harmless but noisy. + // below bumps both generations as part of its larger reset. 
window_.Resize(new_config.window_seconds); - if (!enabled_changed) ++generation_; + if (!enabled_changed) ++closed_gen_; } if (enabled_changed) { @@ -439,10 +456,12 @@ void CircuitBreakerSlice::Reload(const CircuitBreakerConfig& new_config) { half_open_successes_ = 0; half_open_saw_failure_ = false; first_reject_logged_for_open_ = false; - // Fresh generation: reports of requests admitted before this - // reset will carry the old generation and be silently dropped, - // preserving clean-restart semantics. - ++generation_; + // Fresh generations for BOTH domains: this is a full reset. + // Both pre-toggle non-probe admissions (closed_gen) and in-flight + // probes (halfopen_gen) are invalidated — their late reports + // silently drop, preserving clean-restart semantics. + ++closed_gen_; + ++halfopen_gen_; } // When `enabled` is unchanged: live state preserved — operator expects // new thresholds to apply to the next evaluation, not to reset an diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h index 828dfe4f..6a6f4354 100644 --- a/test/circuit_breaker_test.h +++ b/test/circuit_breaker_test.h @@ -1239,6 +1239,144 @@ void TestThresholdOnlyReloadDoesNotAdvanceGeneration() { } } +// BUG (review round 5, P1): Reload with window_seconds change while the +// slice is HALF_OPEN used to bump the single `generation_`, invalidating +// every in-flight probe. Those probes' late Report* calls then dropped +// WITHOUT decrementing half_open_inflight_, wedging the slice in HALF_OPEN +// with all probe slots stuck "in flight" forever — subsequent TryAcquires +// rejected with half_open_full indefinitely until another full reset. +// +// Fix: split generation into closed_gen_ (non-probe, CLOSED-state data) +// and halfopen_gen_ (probe, HALF_OPEN-state data). window_seconds reload +// bumps only closed_gen_ because it only resets CLOSED-state data. 
+void TestWindowResizeDuringHalfOpenDoesNotStrandProbes() {
+  std::cout << "\n[TEST] CB: window resize during HALF_OPEN preserves probes..."
+            << std::endl;
+  try {
+    auto cb = DefaultEnabledConfig();
+    cb.permitted_half_open_calls = 3;
+    auto clock = std::make_shared();
+    CircuitBreakerSlice slice("svc:h:p p=0", 0, cb,
+                              [clock]() { return clock->now; });
+
+    // Drive to HALF_OPEN.
+    for (int i = 0; i < 5; ++i) {
+      slice.ReportFailure(FailureKind::RESPONSE_5XX, false,
+                          slice.CurrentGenerationForTesting());
+    }
+    clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1));
+
+    // Admit all 3 probes (capture their admission tokens).
+    auto p1 = slice.TryAcquire();
+    auto p2 = slice.TryAcquire();
+    auto p3 = slice.TryAcquire();
+    bool all_admitted_probe = p1.decision == Decision::ADMITTED_PROBE &&
+                              p2.decision == Decision::ADMITTED_PROBE &&
+                              p3.decision == Decision::ADMITTED_PROBE;
+
+    // Reload window_seconds (enabled unchanged). PRE-fix: bumps single
+    // generation, invalidates p1/p2/p3 probes → stranded. POST-fix:
+    // bumps only closed_gen_, probe tokens still match halfopen_gen_.
+    auto resized = cb;
+    resized.window_seconds = 30;
+    slice.Reload(resized);
+
+    // closed_gen advanced, halfopen_gen preserved.
+    bool closed_gen_advanced = slice.CurrentClosedGenForTesting() !=
+                               p1.generation;  // informational only: p1 was
+                                               // admitted in HALF_OPEN, so this
+                                               // just shows the domains differ.
+    // Direct check: probe tokens must still match halfopen_gen_.
+    bool probe_gen_preserved =
+        p1.generation == slice.CurrentHalfOpenGenForTesting() &&
+        p2.generation == slice.CurrentHalfOpenGenForTesting() &&
+        p3.generation == slice.CurrentHalfOpenGenForTesting();
+
+    // Probes report success — each must be accepted and advance the
+    // HALF_OPEN → CLOSED transition.
+ slice.ReportSuccess(true, p1.generation); + slice.ReportSuccess(true, p2.generation); + slice.ReportSuccess(true, p3.generation); + + // After 3 probe successes at permitted_half_open_calls=3, slice + // MUST have transitioned to CLOSED. Pre-fix: probes dropped, no + // progression, still HALF_OPEN with inflight stuck at 3. + bool closed_now = slice.CurrentState() == State::CLOSED; + // None of the probes were dropped as stale. + bool no_stale_drops = slice.ReportsStaleGeneration() == 0; + // All 3 probe successes counted. + bool all_probes_counted = slice.ProbeSuccesses() == 3; + + bool pass = all_admitted_probe && probe_gen_preserved && + closed_now && no_stale_drops && all_probes_counted; + (void)closed_gen_advanced; // (informational only) + + TestFramework::RecordTest( + "CB: window resize during HALF_OPEN preserves probes", + pass, pass ? "" : + "admitted=" + std::to_string(all_admitted_probe) + + " probe_gen_preserved=" + std::to_string(probe_gen_preserved) + + " closed_now=" + std::to_string(closed_now) + + " stale=" + std::to_string(slice.ReportsStaleGeneration()) + + " probe_success=" + std::to_string(slice.ProbeSuccesses()), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: window resize during HALF_OPEN preserves probes", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Companion guard: window_seconds reload MUST still invalidate pre-reload +// CLOSED (non-probe) admissions. Ensures the split-gen didn't weaken the +// round-4 fix. +void TestWindowResizeStillInvalidatesClosedAdmissions() { + std::cout << "\n[TEST] CB: window resize invalidates CLOSED admissions..." 
+ << std::endl; + try { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 1000; // disable consec path + cb.failure_rate_threshold = 50; + cb.minimum_volume = 2; + cb.window_seconds = 10; + cb.permitted_half_open_calls = 5; + cb.base_open_duration_ms = 5000; + cb.max_open_duration_ms = 60000; + + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + auto admit_pre = slice.TryAcquire(); + uint64_t gen_pre = admit_pre.generation; + + auto resized = cb; resized.window_seconds = 30; + slice.Reload(resized); + + // Pre-reload CLOSED admission reports — must drop as stale. + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, gen_pre); + bool dropped_stale = slice.ReportsStaleGeneration() == 1; + + // And state must remain CLOSED (pre-reload failure did NOT seed window). + auto admit_post = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, admit_post.generation); + bool still_closed = slice.CurrentState() == State::CLOSED; + + bool pass = dropped_stale && still_closed; + TestFramework::RecordTest( + "CB: window resize invalidates CLOSED admissions", + pass, pass ? "" : + "dropped=" + std::to_string(dropped_stale) + + " closed=" + std::to_string(still_closed), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: window resize invalidates CLOSED admissions", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + void TestTransitionCallbackInvoked() { std::cout << "\n[TEST] CB: transition callback invoked..." 
<< std::endl; try { @@ -1315,6 +1453,8 @@ void RunAllTests() { TestStaleGenerationReportsDroppedAcrossStateTransitions(); TestWindowResizeAdvancesGeneration(); TestThresholdOnlyReloadDoesNotAdvanceGeneration(); + TestWindowResizeDuringHalfOpenDoesNotStrandProbes(); + TestWindowResizeStillInvalidatesClosedAdmissions(); TestTransitionCallbackInvoked(); } diff --git a/test/config_test.h b/test/config_test.h index 94c60763..69b5cfe7 100644 --- a/test/config_test.h +++ b/test/config_test.h @@ -549,25 +549,55 @@ namespace ConfigTests { "permitted_half_open_calls must be in [1, 1000]"); } - // Test 14: Equality operator covers circuit_breaker field + // Test 14: UpstreamConfig::operator== EXCLUDES circuit_breaker field. + // Rationale: breaker tuning is live-reloadable (design §10). Including + // it here would make HttpServer::Reload (http_server.cc:3383) treat a + // breaker-only edit as an upstream topology change, fire the "restart + // required" warning, and block the hot-reload path. Topology fields + // (name/host/port/tls/pool/proxy) ARE included — they require a restart. void TestCircuitBreakerEquality() { - std::cout << "\n[TEST] Circuit Breaker Equality..." << std::endl; + std::cout << "\n[TEST] Circuit Breaker Equality (topology only)..." << std::endl; try { UpstreamConfig a; a.name = "svc"; a.host = "h"; a.port = 80; UpstreamConfig b = a; + + // Default equal. bool equal_default = (a == b); + // Circuit-breaker-only edit must NOT change UpstreamConfig equality. b.circuit_breaker.enabled = true; - bool not_equal_after_diff = (a != b); - - bool pass = equal_default && not_equal_after_diff; - TestFramework::RecordTest("Circuit Breaker Equality", pass, - pass ? "" : "operator== failed for circuit_breaker", + b.circuit_breaker.window_seconds = 30; + bool topology_still_equal = (a == b); + + // BUT CircuitBreakerConfig::operator== catches the field diff + // (Phase 8 reload uses this to detect what changed per-host). 
+ bool cb_fields_differ = (a.circuit_breaker != b.circuit_breaker); + + // Topology changes DO make configs unequal. + UpstreamConfig c = a; + c.host = "different"; + bool topology_changed = (a != c); + + UpstreamConfig d = a; + d.port = 9999; + bool port_change_detected = (a != d); + + bool pass = equal_default && topology_still_equal && + cb_fields_differ && topology_changed && + port_change_detected; + TestFramework::RecordTest("Circuit Breaker Equality (topology only)", + pass, + pass ? "" : + "equal_default=" + std::to_string(equal_default) + + " topology_still_equal=" + std::to_string(topology_still_equal) + + " cb_fields_differ=" + std::to_string(cb_fields_differ) + + " topology_changed=" + std::to_string(topology_changed) + + " port_change_detected=" + std::to_string(port_change_detected), TestFramework::TestCategory::OTHER); } catch (const std::exception& e) { - TestFramework::RecordTest("Circuit Breaker Equality", false, e.what(), - TestFramework::TestCategory::OTHER); + TestFramework::RecordTest("Circuit Breaker Equality (topology only)", + false, e.what(), TestFramework::TestCategory::OTHER); } } From eae864e641162a8b8080eb9e79f19720b8d79f62 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 00:24:29 +0800 Subject: [PATCH 06/37] Fix review comment --- include/config/server_config.h | 21 ++++++++++++--------- test/config_test.h | 32 ++++++++++++++++---------------- 2 files changed, 28 insertions(+), 25 deletions(-) diff --git a/include/config/server_config.h b/include/config/server_config.h index 6a82521a..606de9d9 100644 --- a/include/config/server_config.h +++ b/include/config/server_config.h @@ -183,17 +183,20 @@ struct UpstreamConfig { ProxyConfig proxy; CircuitBreakerConfig circuit_breaker; - // Intentionally EXCLUDES circuit_breaker — breaker tuning is live- - // reloadable (§10 of CIRCUIT_BREAKER_DESIGN.md) and must not trigger - // the "upstream configuration changes require a restart" warning in - // HttpServer::Reload 
(http_server.cc:3383). Phase 8's breaker-reload - // path compares CircuitBreakerConfig fields directly (per-host - // iteration), not via this operator==. All other fields here are - // restart-required: changing name/host/port/tls rebuilds pool - // topology; changing pool/proxy would re-register routes. + // Includes circuit_breaker until Phase 8 ships CircuitBreakerManager::Reload. + // A CB-only SIGHUP currently has no propagation path into live slice state, + // so operator== must return false to trigger the "restart required" warning + // rather than silently committing the new config object while the live slices + // continue running with the old settings. + // + // TODO(phase-8): once CircuitBreakerManager::Reload is wired into + // HttpServer::Reload, remove circuit_breaker from this operator and diff it + // separately (per-host CircuitBreakerConfig comparison) so breaker-only + // edits are hot-reloadable without a restart. bool operator==(const UpstreamConfig& o) const { return name == o.name && host == o.host && port == o.port && - tls == o.tls && pool == o.pool && proxy == o.proxy; + tls == o.tls && pool == o.pool && proxy == o.proxy && + circuit_breaker == o.circuit_breaker; } bool operator!=(const UpstreamConfig& o) const { return !(*this == o); } }; diff --git a/test/config_test.h b/test/config_test.h index 69b5cfe7..18ee718f 100644 --- a/test/config_test.h +++ b/test/config_test.h @@ -549,14 +549,14 @@ namespace ConfigTests { "permitted_half_open_calls must be in [1, 1000]"); } - // Test 14: UpstreamConfig::operator== EXCLUDES circuit_breaker field. - // Rationale: breaker tuning is live-reloadable (design §10). Including - // it here would make HttpServer::Reload (http_server.cc:3383) treat a - // breaker-only edit as an upstream topology change, fire the "restart - // required" warning, and block the hot-reload path. Topology fields - // (name/host/port/tls/pool/proxy) ARE included — they require a restart. 
+ // Test 14: UpstreamConfig::operator== INCLUDES circuit_breaker until Phase 8. + // Until CircuitBreakerManager::Reload is wired in HttpServer::Reload, a + // CB-only SIGHUP has no propagation path. Keeping circuit_breaker in the + // equality check ensures the server fires the "restart required" warning + // rather than silently reporting "reload OK" with stale live settings. + // TODO(phase-8): flip this test when CB hot-reload is implemented. void TestCircuitBreakerEquality() { - std::cout << "\n[TEST] Circuit Breaker Equality (topology only)..." << std::endl; + std::cout << "\n[TEST] Circuit Breaker Equality (CB included until Phase 8)..." << std::endl; try { UpstreamConfig a; a.name = "svc"; a.host = "h"; a.port = 80; @@ -565,16 +565,16 @@ namespace ConfigTests { // Default equal. bool equal_default = (a == b); - // Circuit-breaker-only edit must NOT change UpstreamConfig equality. + // Circuit-breaker-only edit DOES change UpstreamConfig equality + // (until Phase 8 ships the live-reload path). b.circuit_breaker.enabled = true; b.circuit_breaker.window_seconds = 30; - bool topology_still_equal = (a == b); + bool cb_edit_detected = (a != b); - // BUT CircuitBreakerConfig::operator== catches the field diff - // (Phase 8 reload uses this to detect what changed per-host). + // CircuitBreakerConfig::operator== agrees on the field diff. bool cb_fields_differ = (a.circuit_breaker != b.circuit_breaker); - // Topology changes DO make configs unequal. + // Topology changes also make configs unequal. 
UpstreamConfig c = a; c.host = "different"; bool topology_changed = (a != c); @@ -583,20 +583,20 @@ namespace ConfigTests { d.port = 9999; bool port_change_detected = (a != d); - bool pass = equal_default && topology_still_equal && + bool pass = equal_default && cb_edit_detected && cb_fields_differ && topology_changed && port_change_detected; - TestFramework::RecordTest("Circuit Breaker Equality (topology only)", + TestFramework::RecordTest("Circuit Breaker Equality (CB included until Phase 8)", pass, pass ? "" : "equal_default=" + std::to_string(equal_default) + - " topology_still_equal=" + std::to_string(topology_still_equal) + + " cb_edit_detected=" + std::to_string(cb_edit_detected) + " cb_fields_differ=" + std::to_string(cb_fields_differ) + " topology_changed=" + std::to_string(topology_changed) + " port_change_detected=" + std::to_string(port_change_detected), TestFramework::TestCategory::OTHER); } catch (const std::exception& e) { - TestFramework::RecordTest("Circuit Breaker Equality (topology only)", + TestFramework::RecordTest("Circuit Breaker Equality (CB included until Phase 8)", false, e.what(), TestFramework::TestCategory::OTHER); } } From a52f1dfeab9238d65fa64f22f69d26e0bdffc846 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 00:43:39 +0800 Subject: [PATCH 07/37] Fix review comment --- server/circuit_breaker_slice.cc | 15 ++++++- test/circuit_breaker_test.h | 76 +++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+), 1 deletion(-) diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index 03313173..59970641 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -426,7 +426,20 @@ void CircuitBreakerSlice::Reload(const CircuitBreakerConfig& new_config) { // Skip when enabled_changed is also true: the full-reset branch // below bumps both generations as part of its larger reset. 
window_.Resize(new_config.window_seconds); - if (!enabled_changed) ++closed_gen_; + if (!enabled_changed) { + // Reset consecutive_failures_ alongside the window wipe. + // Both are CLOSED-domain state from the same observation cycle. + // Bumping closed_gen_ drops all pre-reload CLOSED reports + // (correct — they must not seed the fresh window). But if + // consecutive_failures_ is NOT also reset, those dropped reports + // can no longer clear or advance the counter either, so the + // leftover count becomes an orphaned value that mis-fires future + // trip evaluations (spurious trip: pre-reload success was going + // to clear the counter but got dropped, so the next real failure + // crosses the threshold using a stale count). + consecutive_failures_ = 0; + ++closed_gen_; + } } if (enabled_changed) { diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h index 6a6f4354..bbd9f5e7 100644 --- a/test/circuit_breaker_test.h +++ b/test/circuit_breaker_test.h @@ -1377,6 +1377,81 @@ void TestWindowResizeStillInvalidatesClosedAdmissions() { } } +// BUG (review round 6, P2): Reload with window_seconds change preserved +// consecutive_failures_ while bumping closed_gen_. Pre-reload CLOSED +// reports are correctly blocked (stale gen), but they can no longer +// clear or advance consecutive_failures_ either. The counter becomes an +// orphaned relic from a prior observation cycle: +// +// Scenario: 4 consecutive failures (threshold=5), reload window_seconds. +// Pre-reload success arrives → stale gen → DROPPED. +// Without fix: consecutive_failures_ stays at 4. +// Next real failure: consecutive_failures_ = 5 → SPURIOUS TRIP. +// +// Fix: reset consecutive_failures_ = 0 in the same branch that clears +// the window on resize. Both are CLOSED-domain state from the same +// observation cycle; invalidating one without resetting the other leaves +// an inconsistent counter. 
+void TestWindowResizeResetConsecutiveFailures() { + std::cout << "\n[TEST] CB: window resize resets consecutive_failures_..." + << std::endl; + try { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 5; + cb.failure_rate_threshold = 100; // rate-trip disabled (100% threshold) + cb.minimum_volume = 1000; // rate-trip disabled (high volume gate) + cb.window_seconds = 10; + cb.permitted_half_open_calls = 5; + cb.base_open_duration_ms = 5000; + cb.max_open_duration_ms = 60000; + + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Accumulate 4 consecutive failures (one below the threshold of 5). + for (int i = 0; i < 4; ++i) { + auto a = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); + } + bool pre_reload_closed = slice.CurrentState() == State::CLOSED; + + // Capture a pre-reload admission. + auto pre_admit = slice.TryAcquire(); + uint64_t pre_gen = pre_admit.generation; + + // Window-only reload: wipes the rate window, bumps closed_gen_, + // and (with the fix) resets consecutive_failures_ to 0. + auto resized = cb; + resized.window_seconds = 30; + slice.Reload(resized); + + // Pre-reload success arrives late — must be dropped (stale gen). + slice.ReportSuccess(false, pre_gen); + bool stale_dropped = slice.ReportsStaleGeneration() == 1; + + // Verify consecutive_failures_ was reset: one real post-reload failure + // must NOT trip the breaker (counter is 1/5, not 5/5). + auto post_admit = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, post_admit.generation); + bool no_spurious_trip = slice.CurrentState() == State::CLOSED; + + bool pass = pre_reload_closed && stale_dropped && no_spurious_trip; + TestFramework::RecordTest( + "CB: window resize resets consecutive_failures_", + pass, pass ? 
"" : + "pre_reload_closed=" + std::to_string(pre_reload_closed) + + " stale_dropped=" + std::to_string(stale_dropped) + + " no_spurious_trip=" + std::to_string(no_spurious_trip), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: window resize resets consecutive_failures_", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + void TestTransitionCallbackInvoked() { std::cout << "\n[TEST] CB: transition callback invoked..." << std::endl; try { @@ -1455,6 +1530,7 @@ void RunAllTests() { TestThresholdOnlyReloadDoesNotAdvanceGeneration(); TestWindowResizeDuringHalfOpenDoesNotStrandProbes(); TestWindowResizeStillInvalidatesClosedAdmissions(); + TestWindowResizeResetConsecutiveFailures(); TestTransitionCallbackInvoked(); } From 833c150cbcd3c451af75cddaa25898a4699448f7 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 01:04:04 +0800 Subject: [PATCH 08/37] Fix review comment --- .../circuit_breaker/circuit_breaker_slice.h | 7 ++ server/circuit_breaker_slice.cc | 21 ++++- test/circuit_breaker_test.h | 87 +++++++++++++++++++ 3 files changed, 112 insertions(+), 3 deletions(-) diff --git a/include/circuit_breaker/circuit_breaker_slice.h b/include/circuit_breaker/circuit_breaker_slice.h index f08a8358..0bf16afb 100644 --- a/include/circuit_breaker/circuit_breaker_slice.h +++ b/include/circuit_breaker/circuit_breaker_slice.h @@ -138,6 +138,13 @@ class CircuitBreakerSlice { int half_open_inflight_ = 0; int half_open_successes_ = 0; bool half_open_saw_failure_ = false; + // Probe budget for the CURRENT HALF_OPEN cycle. Snapshotted from + // config_.permitted_half_open_calls at the moment TransitionOpenToHalfOpen + // fires. 
A live Reload() may lower (or raise) the config field mid-cycle; + // the snapshot ensures TryAcquire's slot gate and ReportSuccess's close + // check both operate against the budget that was in effect when the probes + // were admitted — preventing early close or indefinitely-open behaviour. + int half_open_permitted_snapshot_ = 0; // Observability counters. std::atomic trips_{0}; diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index 59970641..4bc03410 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -108,6 +108,15 @@ void CircuitBreakerSlice::TransitionOpenToHalfOpen() { half_open_inflight_ = 0; half_open_successes_ = 0; half_open_saw_failure_ = false; + // Snapshot the probe budget for this cycle. A live Reload() during this + // HALF_OPEN episode may lower or raise config_.permitted_half_open_calls, + // but TryAcquire's slot gate (Case B) and ReportSuccess's close check must + // both operate against the budget that was in effect when probes were + // admitted. Without the snapshot: lowering the limit causes premature close + // (first success satisfies the reduced count → TransitionHalfOpenToClosed + // bumps halfopen_gen_ → remaining admitted probes become stale → their + // failures are silently dropped and the breaker falsely closes). + half_open_permitted_snapshot_ = config_.permitted_half_open_calls; // Reset the info-log "first reject" breadcrumb so the first rejection // observed in the HALF_OPEN phase surfaces at info, not debug. 
HALF_OPEN // rejection (recovery attempt failing or probe budget full) is @@ -122,7 +131,7 @@ void CircuitBreakerSlice::TransitionOpenToHalfOpen() { logging::Get()->info( "circuit breaker half-open {} probes_allowed={}", - host_label_, config_.permitted_half_open_calls); + host_label_, half_open_permitted_snapshot_); if (transition_cb_) { transition_cb_(State::OPEN, State::HALF_OPEN, "open_elapsed"); @@ -238,7 +247,10 @@ CircuitBreakerSlice::Admission CircuitBreakerSlice::TryAcquire() { } // Case B: probe budget fully in flight. "No capacity" — bump the // dedicated counter so dashboards can tell these two apart. - if (half_open_inflight_ >= config_.permitted_half_open_calls) { + // Use the cycle snapshot, not config_, so a live Reload() that + // lowers permitted_half_open_calls mid-cycle doesn't change how many + // probes were promised to this cycle. + if (half_open_inflight_ >= half_open_permitted_snapshot_) { return Admission{RejectWithLog("half_open_full", /*half_open_full=*/true), /*generation=*/0}; @@ -326,7 +338,10 @@ void CircuitBreakerSlice::ReportSuccess(bool probe, return; } half_open_successes_++; - if (half_open_successes_ >= config_.permitted_half_open_calls) { + // Use the cycle snapshot so a mid-cycle Reload() that lowers the + // limit doesn't close the breaker early (before all admitted probes + // have reported back), silently dropping the remaining probes' failures. + if (half_open_successes_ >= half_open_permitted_snapshot_) { TransitionHalfOpenToClosed(); } return; diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h index bbd9f5e7..801b2048 100644 --- a/test/circuit_breaker_test.h +++ b/test/circuit_breaker_test.h @@ -1377,6 +1377,92 @@ void TestWindowResizeStillInvalidatesClosedAdmissions() { } } +// BUG (review round 7, P2): Reload() lowering permitted_half_open_calls +// while a HALF_OPEN cycle is active could close the breaker early and +// discard failures from already-admitted probes. 
+// +// Scenario (5-probe cycle reloaded down to 1): +// TransitionOpenToHalfOpen: snapshot=5, admit 5 probes. +// Reload: permitted_half_open_calls → 1. +// First success arrives → half_open_successes_=1 ≥ NEW limit (1) +// → TransitionHalfOpenToClosed() fires → halfopen_gen_ bumped. +// Remaining 4 admitted probes are now stale → their failures DROPPED. +// Breaker falsely closes even though 4 probes have not reported yet. +// +// Fix: snapshot config_.permitted_half_open_calls into +// half_open_permitted_snapshot_ at TransitionOpenToHalfOpen time. +// TryAcquire (slot gate) and ReportSuccess (close check) both use the +// snapshot so the cycle budget is frozen for its lifetime. +void TestHalfOpenBudgetFrozenAcrossReload() { + std::cout << "\n[TEST] CB: HALF_OPEN budget frozen across mid-cycle reload..." + << std::endl; + try { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 5; + cb.failure_rate_threshold = 100; // disable rate-trip + cb.minimum_volume = 1000; // disable rate-trip + cb.window_seconds = 10; + cb.permitted_half_open_calls = 2; // exactly 2 probes for clean drain + cb.base_open_duration_ms = 100; + cb.max_open_duration_ms = 60000; + + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Trip the breaker. + for (int i = 0; i < 5; ++i) { + auto a = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); + } + bool is_open = slice.CurrentState() == State::OPEN; + + // Advance past open_until → OPEN→HALF_OPEN on next TryAcquire. + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + + // Admit both probes (budget=2; snapshot set to 2 at TransitionOpenToHalfOpen). 
+ auto a0 = slice.TryAcquire(); + auto a1 = slice.TryAcquire(); + bool both_probes = (a0.decision == Decision::ADMITTED_PROBE) && + (a1.decision == Decision::ADMITTED_PROBE); + bool is_halfopen = slice.CurrentState() == State::HALF_OPEN; + + // Lower the limit to 1 mid-cycle. + auto lowered = cb; + lowered.permitted_half_open_calls = 1; + slice.Reload(lowered); + + // First probe succeeds. + // Without fix: successes(1) >= NEW config(1) → TransitionHalfOpenToClosed + // → halfopen_gen_ bumped → second probe's failure DROPPED + // → breaker falsely CLOSED. + // With fix: successes(1) >= snapshot(2) is false → stays HALF_OPEN. + slice.ReportSuccess(true, a0.generation); + bool not_closed_after_one = slice.CurrentState() == State::HALF_OPEN; + + // Second probe fails. inflight drops to 0 → TripHalfOpenToOpen fires. + slice.ReportFailure(FailureKind::RESPONSE_5XX, true, a1.generation); + bool retripped = slice.CurrentState() == State::OPEN; + + bool pass = is_open && both_probes && is_halfopen && + not_closed_after_one && retripped; + TestFramework::RecordTest( + "CB: HALF_OPEN budget frozen across mid-cycle reload", + pass, pass ? "" : + "is_open=" + std::to_string(is_open) + + " both_probes=" + std::to_string(both_probes) + + " is_halfopen=" + std::to_string(is_halfopen) + + " not_closed_after_one=" + std::to_string(not_closed_after_one) + + " retripped=" + std::to_string(retripped), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: HALF_OPEN budget frozen across mid-cycle reload", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + // BUG (review round 6, P2): Reload with window_seconds change preserved // consecutive_failures_ while bumping closed_gen_. 
Pre-reload CLOSED // reports are correctly blocked (stale gen), but they can no longer @@ -1531,6 +1617,7 @@ void RunAllTests() { TestWindowResizeDuringHalfOpenDoesNotStrandProbes(); TestWindowResizeStillInvalidatesClosedAdmissions(); TestWindowResizeResetConsecutiveFailures(); + TestHalfOpenBudgetFrozenAcrossReload(); TestTransitionCallbackInvoked(); } From b8d3a1f7fa20d462c7e27868fc21460a16485f15 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 08:49:54 +0800 Subject: [PATCH 09/37] Fix review comment --- server/circuit_breaker_window.cc | 10 +++++++-- test/circuit_breaker_test.h | 38 ++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/server/circuit_breaker_window.cc b/server/circuit_breaker_window.cc index 06fccc18..776c00ec 100644 --- a/server/circuit_breaker_window.cc +++ b/server/circuit_breaker_window.cc @@ -16,8 +16,14 @@ static inline size_t BucketIndex(int64_t epoch_sec, int window_seconds) { } CircuitBreakerWindow::CircuitBreakerWindow(int window_seconds) - : window_seconds_(window_seconds), - buckets_(window_seconds > 0 ? static_cast(window_seconds) : 1) { + // Clamp to a minimum of 1 bucket. ConfigLoader::Validate() rejects + // window_seconds <= 0 on the production path, but the constructor is a + // public API and programmatic callers (tests, future direct users) may + // bypass that validation. Without the clamp, BucketIndex() does `% 0` on + // the first Add/TotalCount and crashes; negative values violate the ring + // math. Matches Resize()'s clamp so the two entry points are symmetric. + : window_seconds_(window_seconds > 0 ? 
window_seconds : 1), + buckets_(static_cast(window_seconds_)) { } int64_t CircuitBreakerWindow::ToEpochSec( diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h index 801b2048..f0e32d30 100644 --- a/test/circuit_breaker_test.h +++ b/test/circuit_breaker_test.h @@ -1538,6 +1538,43 @@ void TestWindowResizeResetConsecutiveFailures() { } } +// BUG (review round 8, P2): CircuitBreakerWindow's constructor allocated +// `max(1, window_seconds)` buckets but stored the RAW window_seconds_ value. +// Programmatic callers bypassing ConfigLoader::Validate() (tests, future +// direct users) that passed window_seconds <= 0 would trigger BucketIndex's +// `% window_seconds_` on the first Add*/TotalCount call — dividing by zero +// for 0, or violating ring math for negatives. Resize() already clamped. +// Fix: constructor applies the same clamp so both entry points are symmetric. +void TestWindowNonPositiveWindowSizeClamp() { + std::cout << "\n[TEST] CB: window ctor clamps non-positive sizes..." + << std::endl; + try { + // Zero would have crashed on % 0 before the fix. + CircuitBreakerWindow w0(0); + auto t = std::chrono::steady_clock::time_point(std::chrono::seconds(1000)); + w0.AddSuccess(t); + w0.AddFailure(t); + bool zero_ok = (w0.TotalCount(t) == 2) && (w0.FailureCount(t) == 1); + + // Negative values would have violated the ring math. + CircuitBreakerWindow wn(-5); + wn.AddSuccess(t); + bool negative_ok = wn.TotalCount(t) == 1; + + bool pass = zero_ok && negative_ok; + TestFramework::RecordTest( + "CB: window ctor clamps non-positive sizes", + pass, pass ? 
"" : + "zero_ok=" + std::to_string(zero_ok) + + " negative_ok=" + std::to_string(negative_ok), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: window ctor clamps non-positive sizes", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + void TestTransitionCallbackInvoked() { std::cout << "\n[TEST] CB: transition callback invoked..." << std::endl; try { @@ -1618,6 +1655,7 @@ void RunAllTests() { TestWindowResizeStillInvalidatesClosedAdmissions(); TestWindowResizeResetConsecutiveFailures(); TestHalfOpenBudgetFrozenAcrossReload(); + TestWindowNonPositiveWindowSizeClamp(); TestTransitionCallbackInvoked(); } From 679fc733918dddafcfc33b802b8dd08729abc125 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 09:13:42 +0800 Subject: [PATCH 10/37] Fix review comment --- .../circuit_breaker/circuit_breaker_slice.h | 7 +- include/config/server_config.h | 14 ++-- server/circuit_breaker_slice.cc | 14 ++-- server/config_loader.cc | 23 ++----- test/circuit_breaker_test.h | 65 +++++++++++++++++++ test/config_test.h | 21 ++---- 6 files changed, 99 insertions(+), 45 deletions(-) diff --git a/include/circuit_breaker/circuit_breaker_slice.h b/include/circuit_breaker/circuit_breaker_slice.h index 0bf16afb..1ff8fe1d 100644 --- a/include/circuit_breaker/circuit_breaker_slice.h +++ b/include/circuit_breaker/circuit_breaker_slice.h @@ -205,7 +205,12 @@ class CircuitBreakerSlice { std::chrono::nanoseconds ComputeOpenDuration() const; // Check whether CLOSED trip conditions are met. Called after every failure. - bool ShouldTripClosed(); + // Takes `now` as a parameter so the caller can record the failure and + // evaluate the trip against THE SAME timestamp — otherwise a clock tick + // between AddFailure() and ShouldTripClosed() can advance the ring and + // wipe the just-recorded failure (critical when window_seconds is small: + // with window=1, a 1-second delta triggers the full-reset path). 
+ bool ShouldTripClosed(std::chrono::steady_clock::time_point now); std::chrono::steady_clock::time_point Now() const; }; diff --git a/include/config/server_config.h b/include/config/server_config.h index 606de9d9..5a6a39f4 100644 --- a/include/config/server_config.h +++ b/include/config/server_config.h @@ -152,10 +152,12 @@ struct CircuitBreakerConfig { // Safety valve (future-proof for load-balanced services; no-op v1). int max_ejection_percent_per_host_set = 50; - // Retry budget (orthogonal to the breaker). Caps concurrent retries to - // max(retry_budget_min_concurrency, in_flight * retry_budget_percent/100). - int retry_budget_percent = 20; - int retry_budget_min_concurrency = 3; + // NOTE: retry_budget_percent and retry_budget_min_concurrency have been + // REMOVED from Phase 2. They'll be re-added in Phase 3 when the + // RetryBudget class is introduced (design §4.5). Exposing them here as + // config knobs without any runtime code reading them was misleading to + // operators — setting them produced no protection against retry storms + // since ProxyHandler's RetryPolicy reads proxy.retry.*, not these fields. 
bool operator==(const CircuitBreakerConfig& o) const { return enabled == o.enabled && @@ -167,9 +169,7 @@ struct CircuitBreakerConfig { permitted_half_open_calls == o.permitted_half_open_calls && base_open_duration_ms == o.base_open_duration_ms && max_open_duration_ms == o.max_open_duration_ms && - max_ejection_percent_per_host_set == o.max_ejection_percent_per_host_set && - retry_budget_percent == o.retry_budget_percent && - retry_budget_min_concurrency == o.retry_budget_min_concurrency; + max_ejection_percent_per_host_set == o.max_ejection_percent_per_host_set; } bool operator!=(const CircuitBreakerConfig& o) const { return !(*this == o); } }; diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index 4bc03410..3e22f014 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -49,11 +49,11 @@ std::chrono::nanoseconds CircuitBreakerSlice::ComputeOpenDuration() const { return std::chrono::milliseconds(scaled_ms); } -bool CircuitBreakerSlice::ShouldTripClosed() { +bool CircuitBreakerSlice::ShouldTripClosed( + std::chrono::steady_clock::time_point now) { if (consecutive_failures_ >= config_.consecutive_failure_threshold) { return true; } - auto now = Now(); int64_t total = window_.TotalCount(now); if (total < config_.minimum_volume) return false; int64_t fails = window_.FailureCount(now); @@ -406,9 +406,15 @@ void CircuitBreakerSlice::ReportFailure(FailureKind kind, bool probe, if (state_.load(std::memory_order_acquire) != State::CLOSED) return; consecutive_failures_++; - window_.AddFailure(Now()); + // Capture Now() once and reuse for both the record and the trip check. + // Separate Now() calls can cross a second boundary, letting TotalCount's + // internal Advance() zero the bucket we just wrote — with window_seconds=1, + // a 1-second delta trips the Advance full-reset path and the just-recorded + // failure disappears from the ring, missing a rate trip that should fire. 
+ auto now = Now(); + window_.AddFailure(now); - if (ShouldTripClosed()) { + if (ShouldTripClosed(now)) { const char* trigger = (consecutive_failures_ >= config_.consecutive_failure_threshold) ? "consecutive" : "rate"; diff --git a/server/config_loader.cc b/server/config_loader.cc index f6ff4698..552ccf5c 100644 --- a/server/config_loader.cc +++ b/server/config_loader.cc @@ -286,10 +286,9 @@ ServerConfig ConfigLoader::LoadFromString(const std::string& json_str) { cb.value("max_open_duration_ms", 60000); upstream.circuit_breaker.max_ejection_percent_per_host_set = cb.value("max_ejection_percent_per_host_set", 50); - upstream.circuit_breaker.retry_budget_percent = - cb.value("retry_budget_percent", 20); - upstream.circuit_breaker.retry_budget_min_concurrency = - cb.value("retry_budget_min_concurrency", 3); + // retry_budget_* fields removed from Phase 2 — re-added in + // Phase 3 when the RetryBudget class lands. Unknown keys in + // input JSON are silently ignored by nlohmann::json. } config.upstreams.push_back(std::move(upstream)); @@ -873,16 +872,7 @@ void ConfigLoader::Validate(const ServerConfig& config) { idx + " ('" + u.name + "'): circuit_breaker.max_ejection_percent_per_host_set must be in [0, 100]"); } - if (cb.retry_budget_percent < 0 || cb.retry_budget_percent > 100) { - throw std::invalid_argument( - idx + " ('" + u.name + - "'): circuit_breaker.retry_budget_percent must be in [0, 100]"); - } - if (cb.retry_budget_min_concurrency < 0) { - throw std::invalid_argument( - idx + " ('" + u.name + - "'): circuit_breaker.retry_budget_min_concurrency must be >= 0"); - } + // retry_budget_* validation removed — fields moved to Phase 3. } // Validate method names — reject unknowns and duplicates. // Duplicates would cause RouteAsync to throw at startup. 
@@ -1164,10 +1154,7 @@ std::string ConfigLoader::ToJson(const ServerConfig& config) { u.circuit_breaker.max_open_duration_ms; cbj["max_ejection_percent_per_host_set"] = u.circuit_breaker.max_ejection_percent_per_host_set; - cbj["retry_budget_percent"] = - u.circuit_breaker.retry_budget_percent; - cbj["retry_budget_min_concurrency"] = - u.circuit_breaker.retry_budget_min_concurrency; + // retry_budget_* fields dropped from serialization — Phase 3 adds. uj["circuit_breaker"] = cbj; } j["upstreams"].push_back(uj); diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h index f0e32d30..bd2809f4 100644 --- a/test/circuit_breaker_test.h +++ b/test/circuit_breaker_test.h @@ -1538,6 +1538,70 @@ void TestWindowResizeResetConsecutiveFailures() { } } +// BUG (review round 9, P2-1): ReportFailure captured Now() separately in +// AddFailure() and ShouldTripClosed()'s internal TotalCount/FailureCount +// calls. If a second boundary elapsed between the two calls, Advance() could +// wipe the just-recorded failure — with window_seconds=1, the 1-second delta +// hits the delta >= window_seconds full-reset path and the failure +// disappears before the trip evaluation runs. Fix: capture Now() once in +// ReportFailure and thread it through ShouldTripClosed(now), AddFailure(now). +// +// Regression test injects a time source that returns T on the first call +// and T+1s on every subsequent call, simulating the boundary crossing. +// Post-fix, ReportFailure only calls Now() once — the fix is effective. +// Pre-fix, the second Now() call inside ShouldTripClosed would advance the +// ring and wipe the failure → no trip. +void TestReportFailureUsesOneTimestampAcrossTripEval() { + std::cout << "\n[TEST] CB: ReportFailure uses single timestamp for trip eval..." 
+ << std::endl; + try { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 1000; // disable consec path + cb.failure_rate_threshold = 100; // rate=100% to trip on fail + cb.minimum_volume = 1; // single failure suffices + cb.window_seconds = 1; // boundary-sensitive + cb.permitted_half_open_calls = 5; + cb.base_open_duration_ms = 5000; + cb.max_open_duration_ms = 60000; + + // Time source returns base on call #1 and base+1s on every call after. + // This simulates a clock tick between AddFailure (call 1) and any + // subsequent Now() inside ShouldTripClosed (call 2+). + auto base = std::chrono::steady_clock::time_point( + std::chrono::seconds(1'000'000)); + int call_count = 0; + auto time_source = [&call_count, base]() { + int n = call_count++; + return n == 0 ? base : base + std::chrono::seconds(1); + }; + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, time_source); + + // Admit + fail one request. + // Pre-fix trace (BUGGY): AddFailure(base) records in bucket[0]. Then + // ShouldTripClosed()'s internal TotalCount(base+1s) calls Advance + // → delta=1 >= window=1 → full reset wipes the bucket → total=0 < + // minimum_volume=1 → NO TRIP. Rate trip missed. + // Post-fix: ReportFailure captures Now() once (=base), passes to + // AddFailure(base) AND ShouldTripClosed(base). Ring stays aligned; + // total=1, failures=1 → rate fires → TRIP to OPEN. + auto a = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); + + bool pass = slice.CurrentState() == State::OPEN; + TestFramework::RecordTest( + "CB: ReportFailure uses single timestamp for trip eval", + pass, pass ? 
"" : + "expected OPEN, got state=" + + std::to_string(static_cast(slice.CurrentState())), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: ReportFailure uses single timestamp for trip eval", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + // BUG (review round 8, P2): CircuitBreakerWindow's constructor allocated // `max(1, window_seconds)` buckets but stored the RAW window_seconds_ value. // Programmatic callers bypassing ConfigLoader::Validate() (tests, future @@ -1656,6 +1720,7 @@ void RunAllTests() { TestWindowResizeResetConsecutiveFailures(); TestHalfOpenBudgetFrozenAcrossReload(); TestWindowNonPositiveWindowSizeClamp(); + TestReportFailureUsesOneTimestampAcrossTripEval(); TestTransitionCallbackInvoked(); } diff --git a/test/config_test.h b/test/config_test.h index 18ee718f..f0bd4599 100644 --- a/test/config_test.h +++ b/test/config_test.h @@ -362,9 +362,8 @@ namespace ConfigTests { cb.permitted_half_open_calls == 5 && cb.base_open_duration_ms == 5000 && cb.max_open_duration_ms == 60000 && - cb.max_ejection_percent_per_host_set == 50 && - cb.retry_budget_percent == 20 && - cb.retry_budget_min_concurrency == 3; + cb.max_ejection_percent_per_host_set == 50; + // retry_budget_* fields removed from Phase 2 — Phase 3 adds. TestFramework::RecordTest("Circuit Breaker Defaults", pass, pass ? 
"" : "default value mismatch", TestFramework::TestCategory::OTHER); @@ -393,9 +392,7 @@ namespace ConfigTests { "permitted_half_open_calls": 3, "base_open_duration_ms": 2000, "max_open_duration_ms": 120000, - "max_ejection_percent_per_host_set": 33, - "retry_budget_percent": 10, - "retry_budget_min_concurrency": 5 + "max_ejection_percent_per_host_set": 33 } }] })"; @@ -409,9 +406,7 @@ namespace ConfigTests { cb.permitted_half_open_calls == 3 && cb.base_open_duration_ms == 2000 && cb.max_open_duration_ms == 120000 && - cb.max_ejection_percent_per_host_set == 33 && - cb.retry_budget_percent == 10 && - cb.retry_budget_min_concurrency == 5; + cb.max_ejection_percent_per_host_set == 33; TestFramework::RecordTest("Circuit Breaker JSON Parse", pass, pass ? "" : "parsed values mismatch", TestFramework::TestCategory::OTHER); @@ -525,12 +520,8 @@ namespace ConfigTests { ExpectValidationFailure("CB Validation: max= base_open_duration_ms"); - ExpectValidationFailure("CB Validation: retry_budget_percent>100", - R"({"retry_budget_percent": 200})", - "retry_budget_percent must be in [0, 100]"); - ExpectValidationFailure("CB Validation: retry_budget_min_concurrency<0", - R"({"retry_budget_min_concurrency": -1})", - "retry_budget_min_concurrency must be >= 0"); + // retry_budget_percent / retry_budget_min_concurrency validation + // cases removed — fields moved to Phase 3. 
ExpectValidationFailure("CB Validation: max_ejection_percent>100", R"({"max_ejection_percent_per_host_set": 150})", "max_ejection_percent_per_host_set must be in [0, 100]"); From 628ca72a02e39bcd86836d6ea3f06e81850f90aa Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 09:29:34 +0800 Subject: [PATCH 11/37] Fix review comment --- server/circuit_breaker_slice.cc | 12 +++++- server/config_loader.cc | 44 +++++++++++++++++----- test/circuit_breaker_test.h | 65 +++++++++++++++++++++++++++++++++ test/config_test.h | 13 +++++++ 4 files changed, 123 insertions(+), 11 deletions(-) diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index 3e22f014..bb0568fb 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -116,7 +116,17 @@ void CircuitBreakerSlice::TransitionOpenToHalfOpen() { // (first success satisfies the reduced count → TransitionHalfOpenToClosed // bumps halfopen_gen_ → remaining admitted probes become stale → their // failures are silently dropped and the breaker falsely closes). - half_open_permitted_snapshot_ = config_.permitted_half_open_calls; + // + // Clamp to a minimum of 1. ConfigLoader::Validate() enforces >= 1 on the + // production path, but programmatic callers (tests, future direct users) + // that bypass validation could set permitted_half_open_calls <= 0. With + // snapshot=0, TryAcquire's Case B check (`inflight >= snapshot`) is + // immediately true for every probe → no probe ever admitted → no probe + // ever completes → half_open_inflight_ stays at 0 forever → slice is + // permanently stuck in HALF_OPEN rejecting all traffic. Matches the + // symmetric clamp in CircuitBreakerWindow's ctor. + int permitted = config_.permitted_half_open_calls; + half_open_permitted_snapshot_ = permitted > 0 ? permitted : 1; // Reset the info-log "first reject" breadcrumb so the first rejection // observed in the HALF_OPEN phase surfaces at info, not debug. 
HALF_OPEN // rejection (recovery attempt failing or probe budget full) is diff --git a/server/config_loader.cc b/server/config_loader.cc index 552ccf5c..e3f7f6fe 100644 --- a/server/config_loader.cc +++ b/server/config_loader.cc @@ -266,26 +266,50 @@ ServerConfig ConfigLoader::LoadFromString(const std::string& json_str) { if (!item["circuit_breaker"].is_object()) throw std::runtime_error("upstream circuit_breaker must be an object"); auto& cb = item["circuit_breaker"]; + // Strict integer accessor: rejects float/bool/string inputs + // that nlohmann's default value() would silently coerce + // (e.g., 1.9 → 1, true → 1). Without this, malformed configs + // pass Validate() and change breaker behavior in production. + auto cb_int = [&cb](const char* name, int default_val) -> int { + if (!cb.contains(name)) return default_val; + const auto& v = cb[name]; + if (!v.is_number_integer()) { + throw std::invalid_argument( + std::string("circuit_breaker.") + name + + " must be an integer"); + } + return v.get(); + }; + auto cb_bool = [&cb](const char* name, bool default_val) -> bool { + if (!cb.contains(name)) return default_val; + const auto& v = cb[name]; + if (!v.is_boolean()) { + throw std::invalid_argument( + std::string("circuit_breaker.") + name + + " must be a boolean"); + } + return v.get(); + }; upstream.circuit_breaker.enabled = - cb.value("enabled", false); + cb_bool("enabled", false); upstream.circuit_breaker.dry_run = - cb.value("dry_run", false); + cb_bool("dry_run", false); upstream.circuit_breaker.consecutive_failure_threshold = - cb.value("consecutive_failure_threshold", 5); + cb_int("consecutive_failure_threshold", 5); upstream.circuit_breaker.failure_rate_threshold = - cb.value("failure_rate_threshold", 50); + cb_int("failure_rate_threshold", 50); upstream.circuit_breaker.minimum_volume = - cb.value("minimum_volume", 20); + cb_int("minimum_volume", 20); upstream.circuit_breaker.window_seconds = - cb.value("window_seconds", 10); + cb_int("window_seconds", 
10); upstream.circuit_breaker.permitted_half_open_calls = - cb.value("permitted_half_open_calls", 5); + cb_int("permitted_half_open_calls", 5); upstream.circuit_breaker.base_open_duration_ms = - cb.value("base_open_duration_ms", 5000); + cb_int("base_open_duration_ms", 5000); upstream.circuit_breaker.max_open_duration_ms = - cb.value("max_open_duration_ms", 60000); + cb_int("max_open_duration_ms", 60000); upstream.circuit_breaker.max_ejection_percent_per_host_set = - cb.value("max_ejection_percent_per_host_set", 50); + cb_int("max_ejection_percent_per_host_set", 50); // retry_budget_* fields removed from Phase 2 — re-added in // Phase 3 when the RetryBudget class lands. Unknown keys in // input JSON are silently ignored by nlohmann::json. diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h index bd2809f4..bf95f2be 100644 --- a/test/circuit_breaker_test.h +++ b/test/circuit_breaker_test.h @@ -1639,6 +1639,70 @@ void TestWindowNonPositiveWindowSizeClamp() { } } +// BUG (review round 9, P3): CircuitBreakerSlice copied permitted_half_open_calls +// into the HALF_OPEN snapshot verbatim. For programmatic callers bypassing +// ConfigLoader::Validate() (same class as the window ctor clamp), a zero or +// negative budget would permanently wedge the breaker in HALF_OPEN: +// TryAcquire (HALF_OPEN, case B): half_open_inflight_(0) >= snapshot(0) +// → every probe rejected as half_open_full → no probe ever admitted +// → no report ever fires → half_open_inflight_ stays at 0 forever. +// +// Fix: clamp the snapshot to min 1 at TransitionOpenToHalfOpen. Symmetric +// with CircuitBreakerWindow's constructor clamp from round 8. +void TestHalfOpenClampsNonPositiveProbeBudget() { + std::cout << "\n[TEST] CB: HALF_OPEN clamps non-positive probe budget..." 
+ << std::endl; + try { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 2; + cb.failure_rate_threshold = 100; + cb.minimum_volume = 1000; + cb.window_seconds = 10; + cb.permitted_half_open_calls = 0; // bypasses Validate() — direct ctor + cb.base_open_duration_ms = 100; + cb.max_open_duration_ms = 60000; + + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Trip to OPEN. + for (int i = 0; i < 2; ++i) { + auto a = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); + } + + // Advance past open_until → OPEN→HALF_OPEN on next TryAcquire. + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + + // First TryAcquire triggers the transition. With the clamp, snapshot=1 + // and this probe is admitted. Without the clamp, snapshot=0 → rejected + // as half_open_full → breaker stuck forever. + auto a0 = slice.TryAcquire(); + bool probe_admitted = a0.decision == Decision::ADMITTED_PROBE; + + // A successful probe closes the cycle (successes(1) >= snapshot(1)). + // Without the clamp this branch would never execute. + if (probe_admitted) { + slice.ReportSuccess(true, a0.generation); + } + bool recovered = slice.CurrentState() == State::CLOSED; + + bool pass = probe_admitted && recovered; + TestFramework::RecordTest( + "CB: HALF_OPEN clamps non-positive probe budget", + pass, pass ? "" : + "probe_admitted=" + std::to_string(probe_admitted) + + " recovered=" + std::to_string(recovered), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: HALF_OPEN clamps non-positive probe budget", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + void TestTransitionCallbackInvoked() { std::cout << "\n[TEST] CB: transition callback invoked..." 
<< std::endl; try { @@ -1721,6 +1785,7 @@ void RunAllTests() { TestHalfOpenBudgetFrozenAcrossReload(); TestWindowNonPositiveWindowSizeClamp(); TestReportFailureUsesOneTimestampAcrossTripEval(); + TestHalfOpenClampsNonPositiveProbeBudget(); TestTransitionCallbackInvoked(); } diff --git a/test/config_test.h b/test/config_test.h index f0bd4599..6317151f 100644 --- a/test/config_test.h +++ b/test/config_test.h @@ -538,6 +538,19 @@ namespace ConfigTests { ExpectValidationFailure("CB Validation: permitted_half_open_calls>1000", R"({"permitted_half_open_calls": 1001})", "permitted_half_open_calls must be in [1, 1000]"); + // Type-strictness guards: nlohmann's value() silently coerces + // float/bool to int (1.9 → 1, true → 1). Rejecting at parse time is + // safer than letting malformed configs pass Validate() and change + // production breaker behavior. + ExpectValidationFailure("CB Validation: float rejected for int field", + R"({"window_seconds": 1.9})", + "circuit_breaker.window_seconds must be an integer"); + ExpectValidationFailure("CB Validation: bool rejected for int field", + R"({"consecutive_failure_threshold": true})", + "circuit_breaker.consecutive_failure_threshold must be an integer"); + ExpectValidationFailure("CB Validation: int rejected for bool field", + R"({"enabled": 1})", + "circuit_breaker.enabled must be a boolean"); } // Test 14: UpstreamConfig::operator== INCLUDES circuit_breaker until Phase 8. 
From 0a4290a432c867e5edae74f8e6025dc3fd3606dd Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 09:53:31 +0800 Subject: [PATCH 12/37] Fix review comment --- .../circuit_breaker/circuit_breaker_slice.h | 12 +++ server/circuit_breaker_slice.cc | 29 +++++-- test/circuit_breaker_test.h | 86 +++++++++++++++++++ 3 files changed, 121 insertions(+), 6 deletions(-) diff --git a/include/circuit_breaker/circuit_breaker_slice.h b/include/circuit_breaker/circuit_breaker_slice.h index 1ff8fe1d..edaea211 100644 --- a/include/circuit_breaker/circuit_breaker_slice.h +++ b/include/circuit_breaker/circuit_breaker_slice.h @@ -138,6 +138,18 @@ class CircuitBreakerSlice { int half_open_inflight_ = 0; int half_open_successes_ = 0; bool half_open_saw_failure_ = false; + // Total probes admitted in the CURRENT HALF_OPEN cycle. Never decrements + // within a cycle; resets on every cycle entry (TransitionOpenToHalfOpen) + // and cycle exit (TransitionHalfOpenToClosed / TripHalfOpenToOpen). This + // is what caps the cycle's probe budget — NOT half_open_inflight_, which + // can free slots as probes complete. Gating on inflight would let an + // early-completing probe's slot be reused, causing the cycle to admit + // more than permitted_half_open_calls total probes. The close check + // (successes >= snapshot) could then fire while a late-admitted probe + // is still running; its eventual failure would drop as stale (generation + // bumped by the transition) and the breaker would falsely mark an + // unhealthy host recovered. + int half_open_admitted_ = 0; // Probe budget for the CURRENT HALF_OPEN cycle. Snapshotted from // config_.permitted_half_open_calls at the moment TransitionOpenToHalfOpen // fires. 
A live Reload() may lower (or raise) the config field mid-cycle; diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index bb0568fb..f3821ab1 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -79,6 +79,7 @@ void CircuitBreakerSlice::TripClosedToOpen(const char* trigger) { half_open_inflight_ = 0; half_open_successes_ = 0; half_open_saw_failure_ = false; + half_open_admitted_ = 0; first_reject_logged_for_open_ = false; // Bump closed_gen_: non-probe admissions from the closing CLOSED cycle // are now stale. Late Report(false, ...) calls for those requests drop. @@ -108,6 +109,7 @@ void CircuitBreakerSlice::TransitionOpenToHalfOpen() { half_open_inflight_ = 0; half_open_successes_ = 0; half_open_saw_failure_ = false; + half_open_admitted_ = 0; // Snapshot the probe budget for this cycle. A live Reload() during this // HALF_OPEN episode may lower or raise config_.permitted_half_open_calls, // but TryAcquire's slot gate (Case B) and ReportSuccess's close check must @@ -163,6 +165,7 @@ void CircuitBreakerSlice::TransitionHalfOpenToClosed() { half_open_inflight_ = 0; half_open_successes_ = 0; half_open_saw_failure_ = false; + half_open_admitted_ = 0; first_reject_logged_for_open_ = false; // Bump halfopen_gen_: the just-completed HALF_OPEN cycle's probe // admissions are now stale. closed_gen_ is NOT bumped — pre-trip @@ -194,6 +197,7 @@ void CircuitBreakerSlice::TripHalfOpenToOpen(const char* trigger) { half_open_inflight_ = 0; half_open_successes_ = 0; half_open_saw_failure_ = false; + half_open_admitted_ = 0; first_reject_logged_for_open_ = false; // Bump halfopen_gen_: probe admissions from the closing HALF_OPEN // cycle are now stale. closed_gen_ is NOT bumped — no CLOSED @@ -255,16 +259,29 @@ CircuitBreakerSlice::Admission CircuitBreakerSlice::TryAcquire() { /*half_open_full=*/false), /*generation=*/0}; } - // Case B: probe budget fully in flight. 
"No capacity" — bump the - // dedicated counter so dashboards can tell these two apart. - // Use the cycle snapshot, not config_, so a live Reload() that - // lowers permitted_half_open_calls mid-cycle doesn't change how many - // probes were promised to this cycle. - if (half_open_inflight_ >= half_open_permitted_snapshot_) { + // Case B: probe budget exhausted for this cycle. "No capacity" — bump + // the dedicated counter so dashboards can tell this apart from + // saw_failure rejects. + // + // Gate on `half_open_admitted_` (total cycle admissions, never + // decrements), NOT on `half_open_inflight_`. Inflight drops when a + // probe completes, so gating on it would reuse the freed slot and let + // the cycle admit more than `snapshot` total probes. Consequences of + // that bug: the close check `successes >= snapshot` could fire before + // ALL admitted probes have reported (the reused-slot probe is still + // in flight); TransitionHalfOpenToClosed would bump halfopen_gen_; + // the late probe's failure would drop as stale — falsely marking an + // unhealthy host recovered. + // + // Use the cycle snapshot so a live Reload() that lowers + // permitted_half_open_calls mid-cycle doesn't change how many probes + // were promised to this cycle. + if (half_open_admitted_ >= half_open_permitted_snapshot_) { return Admission{RejectWithLog("half_open_full", /*half_open_full=*/true), /*generation=*/0}; } + half_open_admitted_++; half_open_inflight_++; // Probe admission — stamp with halfopen_gen_. return Admission{Decision::ADMITTED_PROBE, halfopen_gen_}; diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h index bf95f2be..af6f976d 100644 --- a/test/circuit_breaker_test.h +++ b/test/circuit_breaker_test.h @@ -1703,6 +1703,91 @@ void TestHalfOpenClampsNonPositiveProbeBudget() { } } +// BUG (review round 10, P1): TryAcquire gated HALF_OPEN admission on +// half_open_inflight_, so a probe slot was reused once an earlier probe +// completed. 
With permitted_half_open_calls=2: +// +// admit A → inflight=1, admitted=1 +// admit B → inflight=2, admitted=2 +// Report success on A → inflight=1, successes=1 +// admit C → inflight(1) < snapshot(2) → ACCEPTED (BUG: 3rd admission) +// Report success on B → inflight=0, successes=2 +// successes(2) >= snapshot(2) → TransitionHalfOpenToClosed fires +// → halfopen_gen_ bumped → C's eventual failure DROPPED as stale +// → breaker falsely marked recovered despite the probe failing. +// +// Fix: gate on half_open_admitted_ (total cycle admissions, never +// decrements) instead of half_open_inflight_. The cycle can admit at most +// `snapshot` probes total, regardless of how quickly earlier probes drain. +void TestHalfOpenDoesNotReuseProbeSlots() { + std::cout << "\n[TEST] CB: HALF_OPEN does not reuse probe slots..." + << std::endl; + try { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 2; + cb.failure_rate_threshold = 100; + cb.minimum_volume = 1000; + cb.window_seconds = 10; + cb.permitted_half_open_calls = 2; + cb.base_open_duration_ms = 100; + cb.max_open_duration_ms = 60000; + + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Trip to OPEN. + for (int i = 0; i < 2; ++i) { + auto a = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + + // Admit 2 probes (budget=2). + auto a = slice.TryAcquire(); + auto b = slice.TryAcquire(); + bool both_admitted = a.decision == Decision::ADMITTED_PROBE && + b.decision == Decision::ADMITTED_PROBE; + + // Report success on A — freeing its inflight slot. + slice.ReportSuccess(true, a.generation); + bool still_halfopen = slice.CurrentState() == State::HALF_OPEN; + + // Third admission attempt. With the fix: admitted(2) >= snapshot(2) + // → REJECTED. 
Without the fix: inflight(1) < snapshot(2) → ADMITTED, + // creating a ghost probe. + auto c = slice.TryAcquire(); + bool third_rejected = c.decision == Decision::REJECTED_OPEN; + + // Close the cycle by succeeding B. + slice.ReportSuccess(true, b.generation); + bool closed = slice.CurrentState() == State::CLOSED; + + // Verify no stale-generation reports accumulated — if the 3rd admission + // had slipped through, its (dropped) report after the close would have + // bumped this counter. Since the admission is now rejected up front, + // this should stay zero. + bool no_stale_reports = slice.ReportsStaleGeneration() == 0; + + bool pass = both_admitted && still_halfopen && third_rejected && + closed && no_stale_reports; + TestFramework::RecordTest( + "CB: HALF_OPEN does not reuse probe slots", + pass, pass ? "" : + "both_admitted=" + std::to_string(both_admitted) + + " still_halfopen=" + std::to_string(still_halfopen) + + " third_rejected=" + std::to_string(third_rejected) + + " closed=" + std::to_string(closed) + + " no_stale_reports=" + std::to_string(no_stale_reports), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: HALF_OPEN does not reuse probe slots", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + void TestTransitionCallbackInvoked() { std::cout << "\n[TEST] CB: transition callback invoked..." 
<< std::endl; try { @@ -1786,6 +1871,7 @@ void RunAllTests() { TestWindowNonPositiveWindowSizeClamp(); TestReportFailureUsesOneTimestampAcrossTripEval(); TestHalfOpenClampsNonPositiveProbeBudget(); + TestHalfOpenDoesNotReuseProbeSlots(); TestTransitionCallbackInvoked(); } From 2516637bef3dcf8072613a5c876ec454235818bb Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 11:07:56 +0800 Subject: [PATCH 13/37] Fix review comment --- .../circuit_breaker/circuit_breaker_slice.h | 17 +++ server/circuit_breaker_slice.cc | 40 ++++++ server/main.cc | 14 ++ test/circuit_breaker_test.h | 135 ++++++++++++++++++ 4 files changed, 206 insertions(+) diff --git a/include/circuit_breaker/circuit_breaker_slice.h b/include/circuit_breaker/circuit_breaker_slice.h index edaea211..95a5beee 100644 --- a/include/circuit_breaker/circuit_breaker_slice.h +++ b/include/circuit_breaker/circuit_breaker_slice.h @@ -59,6 +59,23 @@ class CircuitBreakerSlice { void ReportSuccess(bool probe, uint64_t admission_generation); void ReportFailure(FailureKind kind, bool probe, uint64_t admission_generation); + // Neutral completion — the admission never exercised the upstream. + // Use when the request was terminated locally before reaching the + // upstream (POOL_EXHAUSTED after admission, shutdown draining, client + // disconnect, RESULT_PARSE_ERROR self-attributable). Must NOT be used + // for upstream outcomes — those go to ReportSuccess / ReportFailure. + // + // For probe=true (HALF_OPEN admission): returns the probe slot to the + // cycle — decrements `half_open_inflight_` AND `half_open_admitted_` + // so a replacement probe can still exercise the upstream within this + // cycle's budget. Without this path, a probe that dies locally leaks + // its slot forever, eventually wedging the slice in HALF_OPEN. + // + // For probe=false (CLOSED admission): no-op — CLOSED admissions have + // no slot to release. 
The bool matches ReportSuccess/ReportFailure so + // callers can use the same dispatch pattern. + void ReportNeutral(bool probe, uint64_t admission_generation); + // Apply a new config (called on this slice's dispatcher thread). // Preserves live state (CLOSED/OPEN/HALF_OPEN). Resets window if // window_seconds changed. diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index f3821ab1..ae037a90 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -449,6 +449,46 @@ void CircuitBreakerSlice::ReportFailure(FailureKind kind, bool probe, } } +void CircuitBreakerSlice::ReportNeutral(bool probe, + uint64_t admission_generation) { + if (!config_.enabled) return; + if (!probe) { + // CLOSED-state admission: no slot to release. The bool parameter + // exists for API symmetry with ReportSuccess/ReportFailure; a + // neutral outcome in CLOSED simply means the breaker records + // nothing (which matches pre-neutral behavior — POOL_EXHAUSTED, + // shutdown, and similar local terminations were already "ignored" + // on the CLOSED path). + return; + } + + // Probe: gate on halfopen_gen_ + current state, matching the other + // Report* paths. Stale (pre-transition or pre-reload) neutral + // completions drop silently into the stale-generation counter. + if (admission_generation != halfopen_gen_) { + reports_stale_generation_.fetch_add(1, std::memory_order_relaxed); + return; + } + if (state_.load(std::memory_order_acquire) != State::HALF_OPEN) return; + + // Return the slot to the cycle. Decrement BOTH inflight and admitted: + // - inflight so the last-probe re-trip logic below fires correctly, + // - admitted so a replacement probe can still be admitted within + // this cycle's budget (the whole point of a neutral release — + // the upstream wasn't actually exercised by this admission). 
+ if (half_open_inflight_ > 0) half_open_inflight_--; + if (half_open_admitted_ > 0) half_open_admitted_--; + + // If an earlier sibling probe failed and this neutral release drains + // the last in-flight probe, the cycle must re-trip — otherwise the + // slice would wedge in HALF_OPEN with saw_failure=true, rejecting all + // future admissions via Case A forever. Mirrors the failure-path + // last-probe trigger. + if (half_open_saw_failure_ && half_open_inflight_ == 0) { + TripHalfOpenToOpen("probe_fail"); + } +} + void CircuitBreakerSlice::Reload(const CircuitBreakerConfig& new_config) { const bool enabled_changed = (config_.enabled != new_config.enabled); const bool window_changed = diff --git a/server/main.cc b/server/main.cc index 06dd2551..86f7598d 100644 --- a/server/main.cc +++ b/server/main.cc @@ -427,6 +427,19 @@ static bool ReloadConfig(const std::string& config_path, auto saved_tls = current_config.tls; auto saved_workers = current_config.worker_threads; auto saved_h2_enabled = current_config.http2.enabled; + // Preserve upstreams for the same reason: HttpServer::Reload treats + // the whole upstream block as restart-required (see http_server.cc + // upstream_configs_ comparison), and that internal copy never changes + // post-startup. If we overwrote current_config.upstreams here, a + // breaker-only edit would stage into current_config while the live + // server keeps running the startup values — /stats and other + // current_config consumers would report phantom state, and subsequent + // identical reloads could produce inconsistent diagnostics. Pin to + // the running values until Phase 8 implements + // CircuitBreakerManager::Reload (the only upstream sub-field that + // becomes hot-reloadable); at that point this save becomes a + // partial-field save excluding circuit_breaker. 
+ auto saved_upstreams = current_config.upstreams; current_config = new_config; @@ -435,6 +448,7 @@ static bool ReloadConfig(const std::string& config_path, current_config.tls = saved_tls; current_config.worker_threads = saved_workers; current_config.http2.enabled = saved_h2_enabled; + current_config.upstreams = std::move(saved_upstreams); // Commit file-backed state only after full success — a failed reload // must not flip this flag or future reloads lose the defaults+env fallback. diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h index af6f976d..daa5aaa3 100644 --- a/test/circuit_breaker_test.h +++ b/test/circuit_breaker_test.h @@ -1788,6 +1788,139 @@ void TestHalfOpenDoesNotReuseProbeSlots() { } } +// BUG (review round 11, P1): Admission contract has ReportSuccess and +// ReportFailure but no path for probes that complete without touching the +// upstream (POOL_EXHAUSTED after probe admission, shutdown, client +// disconnect, PARSE_ERROR). Following the §7 "don't report these as +// failures" contract strictly, such probes would leak their inflight slot +// forever — once half_open_admitted_ reaches snapshot, all further +// admissions reject as half_open_full and nothing ever drains the cycle, +// wedging the slice in HALF_OPEN. +// +// Fix: ReportNeutral decrements BOTH inflight (so the last-probe re-trip +// still fires) and admitted (so a replacement probe can still exercise +// the upstream within the cycle budget). No touch to successes / fails. +void TestReportNeutralReleasesProbeSlot() { + std::cout << "\n[TEST] CB: ReportNeutral releases probe slot..." 
+ << std::endl; + try { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 2; + cb.failure_rate_threshold = 100; + cb.minimum_volume = 1000; + cb.window_seconds = 10; + cb.permitted_half_open_calls = 2; + cb.base_open_duration_ms = 100; + cb.max_open_duration_ms = 60000; + + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Trip to OPEN, advance past backoff, fully consume probe budget. + for (int i = 0; i < 2; ++i) { + auto a = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + + auto a = slice.TryAcquire(); + auto b = slice.TryAcquire(); + bool both_probes = a.decision == Decision::ADMITTED_PROBE && + b.decision == Decision::ADMITTED_PROBE; + + // Budget full: 3rd admission rejected. + auto pre_release = slice.TryAcquire(); + bool budget_full_before = pre_release.decision == Decision::REJECTED_OPEN; + + // Neutral-release A: slot returns, replacement probe fits within budget. + slice.ReportNeutral(true, a.generation); + + auto c = slice.TryAcquire(); + bool replacement_admitted = c.decision == Decision::ADMITTED_PROBE; + + // Cycle completes cleanly via B + C successes → CLOSED. + slice.ReportSuccess(true, b.generation); + slice.ReportSuccess(true, c.generation); + bool closed = slice.CurrentState() == State::CLOSED; + + // Neutral release must NOT have bumped probe_failures / probe_successes. + bool counters_clean = slice.ProbeSuccesses() == 2 && + slice.ProbeFailures() == 0; + + bool pass = both_probes && budget_full_before && + replacement_admitted && closed && counters_clean; + TestFramework::RecordTest( + "CB: ReportNeutral releases probe slot", + pass, pass ? 
"" : + "both_probes=" + std::to_string(both_probes) + + " budget_full_before=" + std::to_string(budget_full_before) + + " replacement_admitted=" + std::to_string(replacement_admitted) + + " closed=" + std::to_string(closed) + + " counters_clean=" + std::to_string(counters_clean), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: ReportNeutral releases probe slot", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Companion: a neutral release that drains the last in-flight probe AFTER +// a sibling failure must still trigger the HALF_OPEN→OPEN re-trip. Without +// this last-probe hook in ReportNeutral, the slice would wedge in HALF_OPEN +// with saw_failure=true rejecting every admission via Case A. +void TestReportNeutralLastProbeAfterFailureReTrips() { + std::cout << "\n[TEST] CB: ReportNeutral re-trips as last probe after sibling fail..." + << std::endl; + try { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 2; + cb.failure_rate_threshold = 100; + cb.minimum_volume = 1000; + cb.window_seconds = 10; + cb.permitted_half_open_calls = 2; + cb.base_open_duration_ms = 100; + cb.max_open_duration_ms = 60000; + + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 2; ++i) { + auto a = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + + auto a = slice.TryAcquire(); + auto b = slice.TryAcquire(); + + // A fails → saw_failure=true, inflight=1 (B still running), no re-trip yet. + slice.ReportFailure(FailureKind::RESPONSE_5XX, true, a.generation); + bool still_halfopen = slice.CurrentState() == State::HALF_OPEN; + + // B neutral-releases → last in-flight drains. With the fix, the + // sibling-failure + last-probe hook fires TripHalfOpenToOpen. 
+ slice.ReportNeutral(true, b.generation); + bool retripped = slice.CurrentState() == State::OPEN; + + bool pass = still_halfopen && retripped; + TestFramework::RecordTest( + "CB: ReportNeutral re-trips as last probe after sibling fail", + pass, pass ? "" : + "still_halfopen=" + std::to_string(still_halfopen) + + " retripped=" + std::to_string(retripped), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: ReportNeutral re-trips as last probe after sibling fail", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + void TestTransitionCallbackInvoked() { std::cout << "\n[TEST] CB: transition callback invoked..." << std::endl; try { @@ -1872,6 +2005,8 @@ void RunAllTests() { TestReportFailureUsesOneTimestampAcrossTripEval(); TestHalfOpenClampsNonPositiveProbeBudget(); TestHalfOpenDoesNotReuseProbeSlots(); + TestReportNeutralReleasesProbeSlot(); + TestReportNeutralLastProbeAfterFailureReTrips(); TestTransitionCallbackInvoked(); } From 60e1f903d03eded23e8efa2a92140a65569e1df7 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 11:26:46 +0800 Subject: [PATCH 14/37] Fix review comment --- server/circuit_breaker_slice.cc | 17 ++++++++-- test/circuit_breaker_test.h | 57 +++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index ae037a90..be9da56a 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -40,8 +40,21 @@ std::chrono::nanoseconds CircuitBreakerSlice::ComputeOpenDuration() const { // Callers must increment consecutive_trips_ AFTER calling this method. 
int trips = consecutive_trips_.load(std::memory_order_relaxed); if (trips > MAX_OPEN_DURATION_SHIFT) trips = MAX_OPEN_DURATION_SHIFT; - int64_t base_ms = config_.base_open_duration_ms; - int64_t max_ms = config_.max_open_duration_ms; + // Clamp base/max for programmatic callers that bypass ConfigLoader::Validate + // (same hardening as CircuitBreakerWindow's ctor and the HALF_OPEN probe + // budget snapshot). Without these clamps: + // - base_open_duration_ms <= 0: `base_ms << trips` is <= 0 → open_until + // <= now → next TryAcquire immediately drains OPEN→HALF_OPEN, + // disabling the backoff entirely. + // - max_open_duration_ms < base_open_duration_ms: the overflow/clamp + // branch (`scaled_ms > max_ms`) fires on every trip, pinning the + // duration to a value smaller than base — same "no meaningful + // backoff" effect. + // Clamp floors: base >= 1ms, max >= base. + int64_t base_ms = config_.base_open_duration_ms > 0 + ? config_.base_open_duration_ms : 1; + int64_t max_ms = config_.max_open_duration_ms >= base_ms + ? config_.max_open_duration_ms : base_ms; int64_t scaled_ms = base_ms << trips; if (scaled_ms < base_ms /* overflow */ || scaled_ms > max_ms) { scaled_ms = max_ms; diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h index daa5aaa3..65b03777 100644 --- a/test/circuit_breaker_test.h +++ b/test/circuit_breaker_test.h @@ -1921,6 +1921,62 @@ void TestReportNeutralLastProbeAfterFailureReTrips() { } } +// BUG (review round 12, P2): ComputeOpenDuration read base/max durations +// straight from config_, so a programmatic caller bypassing +// ConfigLoader::Validate() with base_open_duration_ms <= 0 or max < base +// would compute scaled_ms <= 0. open_until = now + 0 → next TryAcquire +// sees now_ns >= open_until_ns → transition to HALF_OPEN immediately. +// The breaker never actually backed off. Fix: clamp base to >= 1ms and +// max to >= base at the compute site, matching the window and probe +// budget clamps. 
+void TestComputeOpenDurationClampsInvalidBase() { + std::cout << "\n[TEST] CB: ComputeOpenDuration clamps invalid base/max..." + << std::endl; + try { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 2; + cb.failure_rate_threshold = 100; + cb.minimum_volume = 1000; + cb.window_seconds = 10; + cb.permitted_half_open_calls = 1; + cb.base_open_duration_ms = 0; // bypass — would kill backoff + cb.max_open_duration_ms = 0; // bypass — would kill backoff + + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Trip to OPEN. + for (int i = 0; i < 2; ++i) { + auto a = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); + } + bool is_open = slice.CurrentState() == State::OPEN; + + // Immediate TryAcquire: clock hasn't moved, so if the clamp holds + // (open_until >= now + 1ms), this MUST reject as "open" (not drain + // to HALF_OPEN). Without the fix, scaled_ms=0 → open_until==now → + // admission path immediately transitions to HALF_OPEN. + auto immediate = slice.TryAcquire(); + bool rejected_as_open = immediate.decision == Decision::REJECTED_OPEN; + bool still_open = slice.CurrentState() == State::OPEN; + + bool pass = is_open && rejected_as_open && still_open; + TestFramework::RecordTest( + "CB: ComputeOpenDuration clamps invalid base/max", + pass, pass ? "" : + "is_open=" + std::to_string(is_open) + + " rejected_as_open=" + std::to_string(rejected_as_open) + + " still_open=" + std::to_string(still_open), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: ComputeOpenDuration clamps invalid base/max", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + void TestTransitionCallbackInvoked() { std::cout << "\n[TEST] CB: transition callback invoked..." 
<< std::endl; try { @@ -2007,6 +2063,7 @@ void RunAllTests() { TestHalfOpenDoesNotReuseProbeSlots(); TestReportNeutralReleasesProbeSlot(); TestReportNeutralLastProbeAfterFailureReTrips(); + TestComputeOpenDurationClampsInvalidBase(); TestTransitionCallbackInvoked(); } From 360b55e058da44553d7bfa6aaa96f623f691054a Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 12:03:30 +0800 Subject: [PATCH 15/37] Finished Phase4: Host + manager + retry budget --- Makefile | 6 +- .../circuit_breaker/circuit_breaker_host.h | 118 +++++ .../circuit_breaker/circuit_breaker_manager.h | 80 +++ include/circuit_breaker/retry_budget.h | 126 +++++ include/config/server_config.h | 16 +- server/circuit_breaker_host.cc | 140 +++++ server/circuit_breaker_manager.cc | 105 ++++ server/config_loader.cc | 23 +- server/retry_budget.cc | 72 +++ test/circuit_breaker_phase3_test.h | 496 ++++++++++++++++++ test/config_test.h | 21 +- test/run_test.cc | 4 + 12 files changed, 1186 insertions(+), 21 deletions(-) create mode 100644 include/circuit_breaker/circuit_breaker_host.h create mode 100644 include/circuit_breaker/circuit_breaker_manager.h create mode 100644 include/circuit_breaker/retry_budget.h create mode 100644 server/circuit_breaker_host.cc create mode 100644 server/circuit_breaker_manager.cc create mode 100644 server/retry_budget.cc create mode 100644 test/circuit_breaker_phase3_test.h diff --git a/Makefile b/Makefile index 23a46ce0..935949c8 100644 --- a/Makefile +++ b/Makefile @@ -77,7 +77,7 @@ UPSTREAM_SRCS = $(SERVER_DIR)/upstream_connection.cc $(SERVER_DIR)/pool_partitio RATE_LIMIT_SRCS = $(SERVER_DIR)/token_bucket.cc $(SERVER_DIR)/rate_limit_zone.cc $(SERVER_DIR)/rate_limiter.cc # Circuit breaker layer sources -CIRCUIT_BREAKER_SRCS = $(SERVER_DIR)/circuit_breaker_window.cc $(SERVER_DIR)/circuit_breaker_slice.cc +CIRCUIT_BREAKER_SRCS = $(SERVER_DIR)/circuit_breaker_window.cc $(SERVER_DIR)/circuit_breaker_slice.cc $(SERVER_DIR)/retry_budget.cc $(SERVER_DIR)/circuit_breaker_host.cc 
$(SERVER_DIR)/circuit_breaker_manager.cc # CLI layer sources CLI_SRCS = $(SERVER_DIR)/cli_parser.cc $(SERVER_DIR)/signal_handler.cc $(SERVER_DIR)/pid_file.cc $(SERVER_DIR)/daemonizer.cc @@ -145,9 +145,9 @@ WS_HEADERS = $(LIB_DIR)/ws/websocket_connection.h $(LIB_DIR)/ws/websocket_frame. TLS_HEADERS = $(LIB_DIR)/tls/tls_context.h $(LIB_DIR)/tls/tls_connection.h $(LIB_DIR)/tls/tls_client_context.h UPSTREAM_HEADERS = $(LIB_DIR)/upstream/upstream_manager.h $(LIB_DIR)/upstream/upstream_host_pool.h $(LIB_DIR)/upstream/pool_partition.h $(LIB_DIR)/upstream/upstream_connection.h $(LIB_DIR)/upstream/upstream_lease.h $(LIB_DIR)/upstream/upstream_http_codec.h $(LIB_DIR)/upstream/http_request_serializer.h $(LIB_DIR)/upstream/header_rewriter.h $(LIB_DIR)/upstream/retry_policy.h $(LIB_DIR)/upstream/proxy_transaction.h $(LIB_DIR)/upstream/proxy_handler.h $(LIB_DIR)/upstream/upstream_response.h $(LIB_DIR)/upstream/upstream_callbacks.h RATE_LIMIT_HEADERS = $(LIB_DIR)/rate_limit/token_bucket.h $(LIB_DIR)/rate_limit/rate_limit_zone.h $(LIB_DIR)/rate_limit/rate_limiter.h -CIRCUIT_BREAKER_HEADERS = $(LIB_DIR)/circuit_breaker/circuit_breaker_state.h $(LIB_DIR)/circuit_breaker/circuit_breaker_window.h $(LIB_DIR)/circuit_breaker/circuit_breaker_slice.h +CIRCUIT_BREAKER_HEADERS = $(LIB_DIR)/circuit_breaker/circuit_breaker_state.h $(LIB_DIR)/circuit_breaker/circuit_breaker_window.h $(LIB_DIR)/circuit_breaker/circuit_breaker_slice.h $(LIB_DIR)/circuit_breaker/retry_budget.h $(LIB_DIR)/circuit_breaker/circuit_breaker_host.h $(LIB_DIR)/circuit_breaker/circuit_breaker_manager.h CLI_HEADERS = $(LIB_DIR)/cli/cli_parser.h $(LIB_DIR)/cli/signal_handler.h $(LIB_DIR)/cli/pid_file.h $(LIB_DIR)/cli/version.h $(LIB_DIR)/cli/daemonizer.h -TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h 
$(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h $(TEST_DIR)/proxy_test.h $(TEST_DIR)/rate_limit_test.h $(TEST_DIR)/kqueue_test.h $(TEST_DIR)/circuit_breaker_test.h +TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h $(TEST_DIR)/proxy_test.h $(TEST_DIR)/rate_limit_test.h $(TEST_DIR)/kqueue_test.h $(TEST_DIR)/circuit_breaker_test.h $(TEST_DIR)/circuit_breaker_phase3_test.h # All headers combined HEADERS = $(CORE_HEADERS) $(CALLBACK_HEADERS) $(REACTOR_HEADERS) $(NETWORK_HEADERS) $(SERVER_HEADERS) $(THREAD_POOL_HEADERS) $(UTIL_HEADERS) $(FOUNDATION_HEADERS) $(HTTP_HEADERS) $(HTTP2_HEADERS) $(WS_HEADERS) $(TLS_HEADERS) $(UPSTREAM_HEADERS) $(RATE_LIMIT_HEADERS) $(CIRCUIT_BREAKER_HEADERS) $(CLI_HEADERS) $(TEST_HEADERS) diff --git a/include/circuit_breaker/circuit_breaker_host.h b/include/circuit_breaker/circuit_breaker_host.h new file mode 100644 index 00000000..6aff2965 --- /dev/null +++ b/include/circuit_breaker/circuit_breaker_host.h @@ -0,0 +1,118 @@ +#pragma once + +#include "common.h" +#include "config/server_config.h" +#include "circuit_breaker/circuit_breaker_slice.h" +#include "circuit_breaker/retry_budget.h" +// , , provided by common.h + +class Dispatcher; + +namespace circuit_breaker { + +// Observability snapshot of a single host, aggregated across all its +// partition slices. Safe to call from any thread (relaxed reads of +// atomic counters). Per-slice rows let dashboards detect skewed +// failure distribution across dispatchers. 
+struct CircuitBreakerHostSnapshot { + std::string service_name; + std::string host; + int port = 0; + + struct SliceRow { + size_t dispatcher_index = 0; + State state = State::CLOSED; + int64_t trips = 0; + int64_t rejected = 0; + int64_t probe_successes = 0; + int64_t probe_failures = 0; + }; + std::vector slices; + + // Aggregates across slices. + int64_t total_trips = 0; + int64_t total_rejected = 0; + int open_partitions = 0; + int half_open_partitions = 0; + + // Retry budget state (per-host, shared across partitions). + int64_t retries_in_flight = 0; + int64_t retries_rejected = 0; + int64_t in_flight = 0; +}; + +// Per-upstream-service aggregation layer. Owns: +// - N CircuitBreakerSlice instances (one per dispatcher partition, +// each pinned to its dispatcher for lock-free hot-path access). +// - One RetryBudget (shared across partitions — retry %-of-in-flight +// is a host-level metric, not per-dispatcher). +// +// Lifetime: constructed by CircuitBreakerManager at server start, lives +// for the server's lifetime. `service_name`, `host`, `port`, and the +// slice vector are never mutated post-construction (keys are stable for +// lock-free map lookup in the manager). +class CircuitBreakerHost { +public: + // `partition_count` must equal the number of dispatcher partitions + // in the server — typically NetServer's socket worker count or + // upstream pool's partition count. One slice is created per + // partition up-front. + CircuitBreakerHost(std::string service_name, + std::string host, + int port, + size_t partition_count, + const CircuitBreakerConfig& config); + + CircuitBreakerHost(const CircuitBreakerHost&) = delete; + CircuitBreakerHost& operator=(const CircuitBreakerHost&) = delete; + + // Hot-path lookup — returns nullptr only if `dispatcher_index` is + // out of range (programming error). Caller must invoke the + // returned slice's methods on its owning dispatcher thread. 
+ CircuitBreakerSlice* GetSlice(size_t dispatcher_index); + + // Owned retry budget. Never null for the host's lifetime; safe to + // cache the pointer. Shared across all partitions of this host. + RetryBudget* GetRetryBudget() { return retry_budget_.get(); } + const RetryBudget* GetRetryBudget() const { return retry_budget_.get(); } + + // Aggregate snapshot across all slices + retry budget. Reads are + // relaxed atomic — eventually consistent across threads, which is + // fine for dashboards. + CircuitBreakerHostSnapshot Snapshot() const; + + // Apply a new config to every slice. Because each slice is pinned + // to its dispatcher thread, the call is dispatched per-partition — + // the caller provides the dispatcher list in the same order used at + // construction. If `dispatchers.size() != slices_.size()`, the + // method logs an error and returns without applying. + // + // The retry-budget sub-fields (percent, min_concurrency) are + // updated immediately (atomic stores, any thread) as part of this + // call — they don't need dispatcher routing. + void Reload(const std::vector>& dispatchers, + const CircuitBreakerConfig& new_config); + + // Install a transition callback on every slice. Uniform callback + // across partitions — callers that need partition-specific behavior + // can read `slice->dispatcher_index()` inside the callback. + // Must be called before live traffic; thread-safety depends on + // slice-dispatcher affinity at the Reload layer (Phase 8 wires this). + void SetTransitionCallbackOnAllSlices(StateTransitionCallback cb); + + // Accessors. 
+  const std::string& service_name() const { return service_name_; }
+  const std::string& host() const { return host_; }
+  int port() const { return port_; }
+  size_t partition_count() const { return slices_.size(); }
+
+private:
+  std::string service_name_;
+  std::string host_;
+  int port_;
+  CircuitBreakerConfig config_;
+  std::vector<std::unique_ptr<CircuitBreakerSlice>> slices_;
+  std::unique_ptr<RetryBudget> retry_budget_;
+};
+
+} // namespace circuit_breaker
diff --git a/include/circuit_breaker/circuit_breaker_manager.h b/include/circuit_breaker/circuit_breaker_manager.h
new file mode 100644
index 00000000..66c2b33d
--- /dev/null
+++ b/include/circuit_breaker/circuit_breaker_manager.h
@@ -0,0 +1,80 @@
+#pragma once
+
+#include "common.h"
+#include "circuit_breaker/circuit_breaker_host.h"
+// <memory>, <mutex>, <string>, <unordered_map>, <vector> provided by common.h
+
+class Dispatcher;
+
+namespace circuit_breaker {
+
+// Top-level circuit-breaker orchestrator. Mirrors the shape of
+// RateLimitManager: one instance lives on HttpServer, built once at
+// MarkServerReady, survives for the server's lifetime.
+//
+// Ownership (per design §3.1):
+//   HttpServer
+//   ├── upstream_manager_ (declared FIRST, destructs last)
+//   └── circuit_breaker_manager_ (declared SECOND, destructs first)
+//
+//   CircuitBreakerManager
+//   └── hosts_: unordered_map<service_name, unique_ptr<CircuitBreakerHost>>
+//
+// `hosts_` is built once in the constructor — keys are never added or
+// removed at runtime (topology is restart-only per the existing
+// upstream policy). This makes GetHost lock-free after construction,
+// which is critical for the hot path.
+//
+// Hot-reload (Phase 8): only `circuit_breaker` sub-fields on EXISTING
+// upstream services can be live-reloaded. New or removed service names
+// log a warn and are skipped — the caller (HttpServer::Reload) still
+// fires the "restart required" diagnostic in that case.
+class CircuitBreakerManager {
+public:
+  // Builds one CircuitBreakerHost per upstream in `upstreams` — even
+  // when upstreams[i].circuit_breaker.enabled is false — so a later
+  // reload that flips enabled to true can take effect without
+  // re-wiring transition callbacks (disabled slices hold the callback
+  // but never invoke it).
+  //
+  // `partition_count` must match the server's dispatcher partition
+  // count (upstream pool / NetServer worker count). `dispatchers`
+  // captures the dispatcher list so Reload can route per-slice work.
+  CircuitBreakerManager(
+      const std::vector<UpstreamConfig>& upstreams,
+      size_t partition_count,
+      std::vector<std::shared_ptr<Dispatcher>> dispatchers);
+
+  CircuitBreakerManager(const CircuitBreakerManager&) = delete;
+  CircuitBreakerManager& operator=(const CircuitBreakerManager&) = delete;
+
+  // Hot-path lookup — returns nullptr for unknown service names.
+  // Thread-safe (post-construction `hosts_` is read-only).
+  CircuitBreakerHost* GetHost(const std::string& service_name);
+  const CircuitBreakerHost* GetHost(const std::string& service_name) const;
+
+  // Apply breaker-field edits to EXISTING upstream services. Topology
+  // changes (new/removed service names) are logged at warn and
+  // skipped — HttpServer::Reload is the only layer that warns about
+  // topology, and this manager trusts that signal. Serialized by
+  // reload_mtx_ so concurrent Reload calls queue cleanly; the hot
+  // path does NOT take this lock.
+  void Reload(const std::vector<UpstreamConfig>& new_upstreams);
+
+  // Observability — snapshots every host. Safe from any thread.
+  std::vector<CircuitBreakerHostSnapshot> SnapshotAll() const;
+
+  // Test/admin helpers.
+  size_t host_count() const { return hosts_.size(); }
+
+private:
+  // Post-construction read-only — keys and unique_ptr values never
+  // change, so lookups don't need a lock.
+  std::unordered_map<std::string, std::unique_ptr<CircuitBreakerHost>> hosts_;
+  std::vector<std::shared_ptr<Dispatcher>> dispatchers_;
+
+  // Serializes concurrent Reload calls. NOT taken on the hot path.
+ mutable std::mutex reload_mtx_; +}; + +} // namespace circuit_breaker diff --git a/include/circuit_breaker/retry_budget.h b/include/circuit_breaker/retry_budget.h new file mode 100644 index 00000000..dd4da11c --- /dev/null +++ b/include/circuit_breaker/retry_budget.h @@ -0,0 +1,126 @@ +#pragma once + +#include "common.h" +// , provided by common.h + +namespace circuit_breaker { + +// Retry budget — orthogonal to the breaker state machine. +// +// Problem: even when the circuit is CLOSED, a cascading failure on a +// healthy-looking upstream can be amplified by per-request retries. If +// 100 requests are in flight and each retries once, the upstream sees +// 200. If each retries twice, 300. A sick-but-not-dead upstream gets +// tipped over by the retry multiplier itself. +// +// Fix: cap concurrent retries as a fraction of concurrent non-retry +// traffic plus a floor for low-volume correctness. +// +// allowed_retries = max(min_concurrency, in_flight * percent / 100) +// +// The retry budget is PER-HOST (one instance owned by CircuitBreakerHost, +// shared across its partitions — the percent math is about aggregate +// upstream load, not per-dispatcher slicing). All counters are atomic +// relaxed — snapshots can be slightly stale, which is fine for a +// capacity gate on a retry storm. +// +// Usage (Phase 5 wires this in): +// 1. On every attempt (first or retry), call TrackInFlight() and keep +// the returned guard alive until the attempt completes. The guard +// decrements in_flight_ in its destructor. +// 2. Before issuing a retry attempt, call TryConsumeRetry(). Proceed +// if it returns true; reject as RETRY_BUDGET_EXHAUSTED if false. +// 3. When the retried attempt completes, call ReleaseRetry(). +class RetryBudget { +public: + // `percent` — cap retries at this % of in-flight (0-100). 
+  // `min_concurrency` — always allow at least this many concurrent
+  // retries regardless of in_flight; ensures low-volume correctness
+  // (without it, a 20% budget allows 0 retries when in_flight < 5).
+  RetryBudget(int percent, int min_concurrency);
+
+  // Non-copyable, non-movable. Lifetime-stable under its owner
+  // (CircuitBreakerHost).
+  RetryBudget(const RetryBudget&) = delete;
+  RetryBudget& operator=(const RetryBudget&) = delete;
+
+  // RAII guard — decrements in_flight_ on destruction. Move-only.
+  class InFlightGuard {
+  public:
+    InFlightGuard() = default;
+    explicit InFlightGuard(std::atomic<int64_t>* counter) : counter_(counter) {}
+    ~InFlightGuard() {
+      if (counter_) counter_->fetch_sub(1, std::memory_order_relaxed);
+    }
+    InFlightGuard(InFlightGuard&& o) noexcept : counter_(o.counter_) {
+      o.counter_ = nullptr;
+    }
+    InFlightGuard& operator=(InFlightGuard&& o) noexcept {
+      if (this != &o) {
+        if (counter_) counter_->fetch_sub(1, std::memory_order_relaxed);
+        counter_ = o.counter_;
+        o.counter_ = nullptr;
+      }
+      return *this;
+    }
+    InFlightGuard(const InFlightGuard&) = delete;
+    InFlightGuard& operator=(const InFlightGuard&) = delete;
+
+  private:
+    std::atomic<int64_t>* counter_ = nullptr;
+  };
+
+  // Call on every upstream attempt entry (first try OR retry). The
+  // returned guard MUST outlive the attempt — typically stored as a
+  // ProxyTransaction member. Never returns an empty guard.
+  InFlightGuard TrackInFlight();
+
+  // Call BEFORE issuing a retry attempt. Returns true if the retry
+  // fits under the budget (retries_in_flight < cap); caller must pair
+  // a true return with a matching ReleaseRetry when the retry
+  // completes. Returns false if over budget — caller must NOT retry
+  // and must NOT call ReleaseRetry.
+  //
+  // The cap is computed against a freshly-loaded in_flight snapshot:
+  //   cap = max(min_concurrency, in_flight * percent / 100)
+  bool TryConsumeRetry();
+
+  // Call when a consumed retry attempt finishes.
Must be paired with a + // prior successful TryConsumeRetry. + void ReleaseRetry(); + + // Apply new tuning. Thread-safe (atomics). Preserves in-flight counters + // — only the admission formula changes. + void Reload(int percent, int min_concurrency); + + // Observability — safe from any thread, relaxed. + int64_t InFlight() const { + return in_flight_.load(std::memory_order_relaxed); + } + int64_t RetriesInFlight() const { + return retries_in_flight_.load(std::memory_order_relaxed); + } + int64_t RetriesRejected() const { + return retries_rejected_.load(std::memory_order_relaxed); + } + + int percent() const { return percent_.load(std::memory_order_relaxed); } + int min_concurrency() const { + return min_concurrency_.load(std::memory_order_relaxed); + } + +private: + // Tuning — atomic so Reload() is lock-free. + std::atomic percent_; + std::atomic min_concurrency_; + + // Counters (relaxed — admission decisions tolerate slightly stale + // reads; correctness depends on each guard's fetch_sub pairing with + // its increment, which holds under relaxed because they touch the + // same atomic). + std::atomic in_flight_{0}; + std::atomic retries_in_flight_{0}; + std::atomic retries_rejected_{0}; +}; + +} // namespace circuit_breaker diff --git a/include/config/server_config.h b/include/config/server_config.h index 5a6a39f4..8a8e8ed4 100644 --- a/include/config/server_config.h +++ b/include/config/server_config.h @@ -152,12 +152,12 @@ struct CircuitBreakerConfig { // Safety valve (future-proof for load-balanced services; no-op v1). int max_ejection_percent_per_host_set = 50; - // NOTE: retry_budget_percent and retry_budget_min_concurrency have been - // REMOVED from Phase 2. They'll be re-added in Phase 3 when the - // RetryBudget class is introduced (design §4.5). 
Exposing them here as - // config knobs without any runtime code reading them was misleading to - // operators — setting them produced no protection against retry storms - // since ProxyHandler's RetryPolicy reads proxy.retry.*, not these fields. + // Retry budget (orthogonal to the breaker). Caps concurrent retries to + // max(retry_budget_min_concurrency, in_flight * retry_budget_percent/100). + // Wired into the request path in Phase 5; in Phase 3 these are read by + // CircuitBreakerHost to construct its owned RetryBudget. + int retry_budget_percent = 20; + int retry_budget_min_concurrency = 3; bool operator==(const CircuitBreakerConfig& o) const { return enabled == o.enabled && @@ -169,7 +169,9 @@ struct CircuitBreakerConfig { permitted_half_open_calls == o.permitted_half_open_calls && base_open_duration_ms == o.base_open_duration_ms && max_open_duration_ms == o.max_open_duration_ms && - max_ejection_percent_per_host_set == o.max_ejection_percent_per_host_set; + max_ejection_percent_per_host_set == o.max_ejection_percent_per_host_set && + retry_budget_percent == o.retry_budget_percent && + retry_budget_min_concurrency == o.retry_budget_min_concurrency; } bool operator!=(const CircuitBreakerConfig& o) const { return !(*this == o); } }; diff --git a/server/circuit_breaker_host.cc b/server/circuit_breaker_host.cc new file mode 100644 index 00000000..b41635a6 --- /dev/null +++ b/server/circuit_breaker_host.cc @@ -0,0 +1,140 @@ +#include "circuit_breaker/circuit_breaker_host.h" +#include "dispatcher.h" +#include "log/logger.h" + +namespace circuit_breaker { + +CircuitBreakerHost::CircuitBreakerHost(std::string service_name, + std::string host, + int port, + size_t partition_count, + const CircuitBreakerConfig& config) + : service_name_(std::move(service_name)), + host_(std::move(host)), + port_(port), + config_(config), + retry_budget_(std::make_unique( + config.retry_budget_percent, + config.retry_budget_min_concurrency)) { + // Clamp partition_count — a 
zero-partition host would be unusable
+  // (no slices to dispatch to). Tests or misuse may pass 0; log and
+  // clamp to 1 so the host is at least consistent.
+  if (partition_count == 0) {
+    logging::Get()->error(
+        "CircuitBreakerHost({}, {}:{}) constructed with 0 partitions; "
+        "clamping to 1",
+        service_name_, host_, port_);
+    partition_count = 1;
+  }
+
+  slices_.reserve(partition_count);
+  for (size_t i = 0; i < partition_count; ++i) {
+    // Per-slice label for logs — lets operators grep logs for a
+    // specific host:partition pair.
+    std::string label = service_name_ + ":" + host_ + ":" +
+        std::to_string(port_) + " p=" + std::to_string(i);
+    slices_.emplace_back(std::make_unique<CircuitBreakerSlice>(
+        std::move(label), i, config_));
+  }
+  logging::Get()->debug(
+      "CircuitBreakerHost created service={} host={}:{} partitions={} "
+      "enabled={} retry_budget={}%,min={}",
+      service_name_, host_, port_, partition_count,
+      config_.enabled,
+      config_.retry_budget_percent,
+      config_.retry_budget_min_concurrency);
+}
+
+CircuitBreakerSlice* CircuitBreakerHost::GetSlice(size_t dispatcher_index) {
+  if (dispatcher_index >= slices_.size()) return nullptr;
+  return slices_[dispatcher_index].get();
+}
+
+CircuitBreakerHostSnapshot CircuitBreakerHost::Snapshot() const {
+  CircuitBreakerHostSnapshot snap;
+  snap.service_name = service_name_;
+  snap.host = host_;
+  snap.port = port_;
+  snap.slices.reserve(slices_.size());
+
+  for (const auto& slice : slices_) {
+    CircuitBreakerHostSnapshot::SliceRow row;
+    row.dispatcher_index = slice->dispatcher_index();
+    row.state = slice->CurrentState();
+    row.trips = slice->Trips();
+    row.rejected = slice->Rejected();
+    row.probe_successes = slice->ProbeSuccesses();
+    row.probe_failures = slice->ProbeFailures();
+
+    snap.total_trips += row.trips;
+    snap.total_rejected += row.rejected;
+    if (row.state == State::OPEN) ++snap.open_partitions;
+    else if (row.state == State::HALF_OPEN) ++snap.half_open_partitions;
+
+    snap.slices.push_back(row);
+  }
+
+  // Retry
budget aggregate (host-level, not per-partition). + snap.retries_in_flight = retry_budget_->RetriesInFlight(); + snap.retries_rejected = retry_budget_->RetriesRejected(); + snap.in_flight = retry_budget_->InFlight(); + + return snap; +} + +void CircuitBreakerHost::Reload( + const std::vector>& dispatchers, + const CircuitBreakerConfig& new_config) { + // Dispatcher list must match the slice count one-for-one — the + // slice at index i lives on dispatcher i. A size mismatch is a + // programming error (topology changed post-construction, which is + // restart-only); log and bail rather than mis-dispatching. + if (dispatchers.size() != slices_.size()) { + logging::Get()->error( + "CircuitBreakerHost::Reload({}:{}) dispatcher count mismatch: " + "got {}, expected {} — reload skipped", + service_name_, host_, dispatchers.size(), slices_.size()); + return; + } + + // Update host-level retry budget fields immediately — atomic stores, + // no dispatcher routing needed. RetryBudget::Reload clamps internally. + retry_budget_->Reload(new_config.retry_budget_percent, + new_config.retry_budget_min_concurrency); + + // Enqueue per-slice Reload on each owning dispatcher. The slice is + // dispatcher-thread-local for mutation, so the config swap must + // happen there. Passing slice as raw pointer is safe: slices_ is + // owned by `this` (the host), which outlives the manager's reload + // (enforced by CircuitBreakerManager's lifetime). + for (size_t i = 0; i < slices_.size(); ++i) { + CircuitBreakerSlice* slice = slices_[i].get(); + auto& dispatcher = dispatchers[i]; + if (!dispatcher) { + logging::Get()->error( + "CircuitBreakerHost::Reload({}:{}) null dispatcher at index {}", + service_name_, host_, i); + continue; + } + dispatcher->EnQueue([slice, new_config]() { + slice->Reload(new_config); + }); + } + + // Save the new config for future Snapshot() / construction-like + // operations. Other threads never read config_ directly. 
+ config_ = new_config; +} + +void CircuitBreakerHost::SetTransitionCallbackOnAllSlices( + StateTransitionCallback cb) { + for (auto& slice : slices_) { + // Copy the callback so each slice owns its own std::function. + // Passing by value into SetTransitionCallback gives each slice + // an independent copy, avoiding cross-partition std::function + // data races. + slice->SetTransitionCallback(cb); + } +} + +} // namespace circuit_breaker diff --git a/server/circuit_breaker_manager.cc b/server/circuit_breaker_manager.cc new file mode 100644 index 00000000..7e4a8035 --- /dev/null +++ b/server/circuit_breaker_manager.cc @@ -0,0 +1,105 @@ +#include "circuit_breaker/circuit_breaker_manager.h" +#include "log/logger.h" +#include + +namespace circuit_breaker { + +CircuitBreakerManager::CircuitBreakerManager( + const std::vector& upstreams, + size_t partition_count, + std::vector> dispatchers) + : dispatchers_(std::move(dispatchers)) { + // Build one Host per upstream regardless of .circuit_breaker.enabled. + // Disabled hosts still need a live Slice so a later reload can flip + // them on without re-wiring transition callbacks (design §3.1). + hosts_.reserve(upstreams.size()); + for (const auto& u : upstreams) { + if (u.name.empty()) { + // ConfigLoader::Validate rejects empty names upstream, but + // defense in depth — skip rather than insert an unreachable + // host with an empty key that would shadow future lookups. + logging::Get()->error( + "CircuitBreakerManager: skipping upstream with empty name"); + continue; + } + auto [it, inserted] = hosts_.emplace( + u.name, + std::make_unique( + u.name, u.host, u.port, partition_count, u.circuit_breaker)); + if (!inserted) { + // Duplicate service name — shouldn't happen (Validate checks + // uniqueness), but log so the collision is visible rather + // than silently dropping the second entry. 
+      logging::Get()->error(
+          "CircuitBreakerManager: duplicate upstream name '{}' ignored",
+          u.name);
+    }
+  }
+  logging::Get()->info(
+      "CircuitBreakerManager initialized hosts={} partitions={}",
+      hosts_.size(), partition_count);
+}
+
+CircuitBreakerHost* CircuitBreakerManager::GetHost(
+    const std::string& service_name) {
+  auto it = hosts_.find(service_name);
+  return it == hosts_.end() ? nullptr : it->second.get();
+}
+
+const CircuitBreakerHost* CircuitBreakerManager::GetHost(
+    const std::string& service_name) const {
+  auto it = hosts_.find(service_name);
+  return it == hosts_.end() ? nullptr : it->second.get();
+}
+
+void CircuitBreakerManager::Reload(
+    const std::vector<UpstreamConfig>& new_upstreams) {
+  // Serialize with any other Reload calls. Hot path doesn't take this.
+  std::lock_guard<std::mutex> lk(reload_mtx_);
+
+  // Detect topology changes (added / removed service names) so we can
+  // log and skip — the authoritative "restart required" warning lives
+  // in HttpServer::Reload; we just honor the "existing hosts only"
+  // contract by applying breaker fields to matching names and nothing
+  // else.
+  std::unordered_set<std::string> new_names;
+  new_names.reserve(new_upstreams.size());
+  for (const auto& u : new_upstreams) new_names.insert(u.name);
+
+  for (const auto& u : new_upstreams) {
+    auto* host = GetHost(u.name);
+    if (!host) {
+      // New service name — topology change, skip. The outer
+      // reload layer warns.
+      logging::Get()->warn(
+          "CircuitBreakerManager::Reload: new upstream '{}' requires "
+          "restart (ignored)",
+          u.name);
+      continue;
+    }
+    host->Reload(dispatchers_, u.circuit_breaker);
+  }
+
+  // Log removals without touching the hosts (their removal also
+  // requires a restart).
+ for (const auto& [name, _] : hosts_) { + if (new_names.find(name) == new_names.end()) { + logging::Get()->warn( + "CircuitBreakerManager::Reload: removed upstream '{}' requires " + "restart (ignored)", + name); + } + } +} + +std::vector +CircuitBreakerManager::SnapshotAll() const { + std::vector snapshots; + snapshots.reserve(hosts_.size()); + for (const auto& [_, host] : hosts_) { + snapshots.push_back(host->Snapshot()); + } + return snapshots; +} + +} // namespace circuit_breaker diff --git a/server/config_loader.cc b/server/config_loader.cc index e3f7f6fe..f9b82540 100644 --- a/server/config_loader.cc +++ b/server/config_loader.cc @@ -310,9 +310,10 @@ ServerConfig ConfigLoader::LoadFromString(const std::string& json_str) { cb_int("max_open_duration_ms", 60000); upstream.circuit_breaker.max_ejection_percent_per_host_set = cb_int("max_ejection_percent_per_host_set", 50); - // retry_budget_* fields removed from Phase 2 — re-added in - // Phase 3 when the RetryBudget class lands. Unknown keys in - // input JSON are silently ignored by nlohmann::json. + upstream.circuit_breaker.retry_budget_percent = + cb_int("retry_budget_percent", 20); + upstream.circuit_breaker.retry_budget_min_concurrency = + cb_int("retry_budget_min_concurrency", 3); } config.upstreams.push_back(std::move(upstream)); @@ -896,7 +897,16 @@ void ConfigLoader::Validate(const ServerConfig& config) { idx + " ('" + u.name + "'): circuit_breaker.max_ejection_percent_per_host_set must be in [0, 100]"); } - // retry_budget_* validation removed — fields moved to Phase 3. + if (cb.retry_budget_percent < 0 || cb.retry_budget_percent > 100) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.retry_budget_percent must be in [0, 100]"); + } + if (cb.retry_budget_min_concurrency < 0) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.retry_budget_min_concurrency must be >= 0"); + } } // Validate method names — reject unknowns and duplicates. 
// Duplicates would cause RouteAsync to throw at startup. @@ -1178,7 +1188,10 @@ std::string ConfigLoader::ToJson(const ServerConfig& config) { u.circuit_breaker.max_open_duration_ms; cbj["max_ejection_percent_per_host_set"] = u.circuit_breaker.max_ejection_percent_per_host_set; - // retry_budget_* fields dropped from serialization — Phase 3 adds. + cbj["retry_budget_percent"] = + u.circuit_breaker.retry_budget_percent; + cbj["retry_budget_min_concurrency"] = + u.circuit_breaker.retry_budget_min_concurrency; uj["circuit_breaker"] = cbj; } j["upstreams"].push_back(uj); diff --git a/server/retry_budget.cc b/server/retry_budget.cc new file mode 100644 index 00000000..7246eb26 --- /dev/null +++ b/server/retry_budget.cc @@ -0,0 +1,72 @@ +#include "circuit_breaker/retry_budget.h" + +namespace circuit_breaker { + +namespace { + +// Clamp floors for direct-ctor / Reload callers that bypass +// ConfigLoader::Validate(). Mirrors the hardening elsewhere in the +// circuit-breaker code (window ctor, probe budget snapshot, +// ComputeOpenDuration) so programmatic callers can't disable the +// budget by passing pathological values. +// percent < 0 → 0 (pure min_concurrency floor, no %-based cap) +// percent > 100 → 100 (retries capped at total in_flight) +// min_concurrency < 0 → 0 (no floor) +int ClampPercent(int p) { + if (p < 0) return 0; + if (p > 100) return 100; + return p; +} +int ClampMinConcurrency(int m) { + return m < 0 ? 0 : m; +} + +} // namespace + +RetryBudget::RetryBudget(int percent, int min_concurrency) + : percent_(ClampPercent(percent)), + min_concurrency_(ClampMinConcurrency(min_concurrency)) {} + +RetryBudget::InFlightGuard RetryBudget::TrackInFlight() { + in_flight_.fetch_add(1, std::memory_order_relaxed); + return InFlightGuard(&in_flight_); +} + +bool RetryBudget::TryConsumeRetry() { + // Snapshot counters with relaxed — the gate is an approximate + // capacity check, not a strict admission lock. 
Racing callers may + // both read cap=N and both try to reserve; the worst case is that + // both succeed and we momentarily sit at retries_in_flight_ = + // cap+1, which is acceptable for a traffic-shaping gate (unlike a + // security-critical gate). + int64_t in_flight = in_flight_.load(std::memory_order_relaxed); + int pct = percent_.load(std::memory_order_relaxed); + int min_conc = min_concurrency_.load(std::memory_order_relaxed); + + // cap = max(min_concurrency, in_flight * percent / 100) + // Integer math is fine — percent is 0..100, in_flight is an int64. + // Overflow is impossible within reasonable load levels (in_flight + // would need to exceed ~2e16 to overflow after multiplying by 100). + int64_t pct_cap = (in_flight * pct) / 100; + int64_t cap = pct_cap > min_conc ? pct_cap : min_conc; + + int64_t current = retries_in_flight_.load(std::memory_order_relaxed); + if (current >= cap) { + retries_rejected_.fetch_add(1, std::memory_order_relaxed); + return false; + } + retries_in_flight_.fetch_add(1, std::memory_order_relaxed); + return true; +} + +void RetryBudget::ReleaseRetry() { + retries_in_flight_.fetch_sub(1, std::memory_order_relaxed); +} + +void RetryBudget::Reload(int percent, int min_concurrency) { + percent_.store(ClampPercent(percent), std::memory_order_relaxed); + min_concurrency_.store(ClampMinConcurrency(min_concurrency), + std::memory_order_relaxed); +} + +} // namespace circuit_breaker diff --git a/test/circuit_breaker_phase3_test.h b/test/circuit_breaker_phase3_test.h new file mode 100644 index 00000000..ba2f5554 --- /dev/null +++ b/test/circuit_breaker_phase3_test.h @@ -0,0 +1,496 @@ +#pragma once + +#include "test_framework.h" +#include "config/server_config.h" +#include "circuit_breaker/circuit_breaker_state.h" +#include "circuit_breaker/circuit_breaker_slice.h" +#include "circuit_breaker/retry_budget.h" +#include "circuit_breaker/circuit_breaker_host.h" +#include "circuit_breaker/circuit_breaker_manager.h" +#include "dispatcher.h" + 
+#include +#include +#include +#include + +// Phase 3 unit tests: RetryBudget, CircuitBreakerHost, CircuitBreakerManager. +// +// These tests exercise the standalone data structures introduced in Phase 3 +// without any integration into the request path (that comes in Phase 4). +// Every test constructs the object under test in isolation — no live +// dispatchers, no network I/O. A minimal Dispatcher is instantiated only +// where CircuitBreakerHost::Reload needs one to enqueue per-slice Reload +// calls. +namespace CircuitBreakerPhase3Tests { + +using circuit_breaker::CircuitBreakerHost; +using circuit_breaker::CircuitBreakerHostSnapshot; +using circuit_breaker::CircuitBreakerManager; +using circuit_breaker::Decision; +using circuit_breaker::FailureKind; +using circuit_breaker::RetryBudget; +using circuit_breaker::State; + +static CircuitBreakerConfig DefaultCbConfig() { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 5; + cb.failure_rate_threshold = 50; + cb.minimum_volume = 20; + cb.window_seconds = 10; + cb.permitted_half_open_calls = 3; + cb.base_open_duration_ms = 5000; + cb.max_open_duration_ms = 60000; + cb.retry_budget_percent = 20; + cb.retry_budget_min_concurrency = 3; + return cb; +} + +// ============================================================================ +// RetryBudget tests +// ============================================================================ + +// Min-concurrency floor: with tiny in_flight, min_concurrency still permits +// the configured floor of concurrent retries (otherwise a 20% budget allows 0 +// retries when in_flight < 5 — useless in low-volume services). +void TestRetryBudgetMinConcurrencyFloor() { + std::cout << "\n[TEST] RetryBudget: min_concurrency floor permits retries..." + << std::endl; + try { + // percent=20, min=3. Even with 0 in_flight, 3 retries allowed. + RetryBudget rb(20, 3); + + // Without any in_flight, min floor is what gates us. 
+ bool r1 = rb.TryConsumeRetry(); // 1/3 + bool r2 = rb.TryConsumeRetry(); // 2/3 + bool r3 = rb.TryConsumeRetry(); // 3/3 + bool r4 = rb.TryConsumeRetry(); // over → rejected + + bool pass = r1 && r2 && r3 && !r4 && + rb.RetriesInFlight() == 3 && + rb.RetriesRejected() == 1; + + rb.ReleaseRetry(); rb.ReleaseRetry(); rb.ReleaseRetry(); + pass = pass && rb.RetriesInFlight() == 0; + + TestFramework::RecordTest("RetryBudget min_concurrency floor", pass, + pass ? "" : "r1=" + std::to_string(r1) + + " r2=" + std::to_string(r2) + + " r3=" + std::to_string(r3) + + " r4=" + std::to_string(r4) + + " inflight=" + std::to_string(rb.RetriesInFlight()) + + " rejected=" + std::to_string(rb.RetriesRejected()), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("RetryBudget min_concurrency floor", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Percent-based cap scales with in_flight. +// percent=20, min=0, in_flight=50 → cap = 10 retries. +void TestRetryBudgetPercentCap() { + std::cout << "\n[TEST] RetryBudget: percent cap scales with in_flight..." + << std::endl; + try { + RetryBudget rb(20, 0); // no min floor — pure percent + + // Push in_flight to 50 via guards that we intentionally keep alive. + std::vector guards; + for (int i = 0; i < 50; ++i) guards.push_back(rb.TrackInFlight()); + + // 50 * 20% = 10 retries allowed. + int admitted = 0; + for (int i = 0; i < 20; ++i) { + if (rb.TryConsumeRetry()) ++admitted; + } + bool cap_hit = admitted == 10; + bool rejected_count = rb.RetriesRejected() == 10; + + // Release guards — in_flight drops to 0; future TryConsumeRetry with + // min=0 and in_flight=0 rejects everything. + for (auto& g : guards) (void)std::move(g); + guards.clear(); + for (int i = 0; i < admitted; ++i) rb.ReleaseRetry(); + + bool pass = cap_hit && rejected_count && rb.InFlight() == 0 && + rb.RetriesInFlight() == 0; + TestFramework::RecordTest("RetryBudget percent cap", pass, + pass ? 
"" : "admitted=" + std::to_string(admitted) + + " rejected=" + std::to_string(rb.RetriesRejected()) + + " inflight=" + std::to_string(rb.InFlight()), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("RetryBudget percent cap", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +// TrackInFlight guards must be RAII-safe: destroying the guard decrements +// in_flight_; moving the guard transfers ownership; self-move safe. +void TestRetryBudgetInFlightGuardRaii() { + std::cout << "\n[TEST] RetryBudget: InFlightGuard RAII..." << std::endl; + try { + RetryBudget rb(20, 3); + + bool zero_init = rb.InFlight() == 0; + { + auto g = rb.TrackInFlight(); + bool one_after_track = rb.InFlight() == 1; + + // Move-construct: counter transfers, original is empty. + auto g2 = std::move(g); + bool still_one_after_move = rb.InFlight() == 1; + // g is now empty, destroying it decrements nothing. + (void)g; + + // g2 goes out of scope next. + if (!zero_init || !one_after_track || !still_one_after_move) { + TestFramework::RecordTest("RetryBudget InFlightGuard RAII", + false, "mid-test state wrong", + TestFramework::TestCategory::OTHER); + return; + } + } + bool zero_after_drop = rb.InFlight() == 0; + TestFramework::RecordTest("RetryBudget InFlightGuard RAII", + zero_after_drop, + zero_after_drop ? "" : "in_flight not zero after guard drop", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("RetryBudget InFlightGuard RAII", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Reload updates tuning atomically without resetting in-flight counters — +// the admission formula changes, outstanding retries keep running. +void TestRetryBudgetReloadPreservesCounters() { + std::cout << "\n[TEST] RetryBudget: Reload preserves in-flight..." + << std::endl; + try { + RetryBudget rb(20, 3); + bool r1 = rb.TryConsumeRetry(); // 1/3 + + // Tighten tuning mid-flight. 
+ rb.Reload(10, 1); + + // Outstanding retry is still tracked. + bool inflight_preserved = rb.RetriesInFlight() == 1; + + // New tuning applies — min=1, so 1/1 retry allowed max. + // Current retries_in_flight=1 already, next attempt rejects. + bool r2 = rb.TryConsumeRetry(); + + rb.ReleaseRetry(); + bool cleanup_ok = rb.RetriesInFlight() == 0; + + bool pass = r1 && inflight_preserved && !r2 && cleanup_ok; + TestFramework::RecordTest("RetryBudget Reload preserves counters", pass, + pass ? "" : "r1=" + std::to_string(r1) + + " inflight_preserved=" + std::to_string(inflight_preserved) + + " r2=" + std::to_string(r2) + + " cleanup_ok=" + std::to_string(cleanup_ok), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("RetryBudget Reload preserves counters", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Clamp guards: negative percent / negative min_concurrency are clamped at +// construction (mirrors ConfigLoader::Validate — programmatic callers that +// bypass validation get safe defaults). +void TestRetryBudgetClampsInvalidTuning() { + std::cout << "\n[TEST] RetryBudget: clamps invalid tuning..." << std::endl; + try { + RetryBudget rb(-50, -10); + bool clamped = rb.percent() == 0 && rb.min_concurrency() == 0; + + // Over-max percent clamps to 100. + RetryBudget rb2(500, 5); + bool over_clamped = rb2.percent() == 100; + + // Reload also clamps. + rb.Reload(-1, -1); + bool reload_clamped = rb.percent() == 0 && rb.min_concurrency() == 0; + + bool pass = clamped && over_clamped && reload_clamped; + TestFramework::RecordTest("RetryBudget clamps invalid tuning", pass, + pass ? 
"" : + "clamped=" + std::to_string(clamped) + + " over_clamped=" + std::to_string(over_clamped) + + " reload_clamped=" + std::to_string(reload_clamped), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("RetryBudget clamps invalid tuning", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// ============================================================================ +// CircuitBreakerHost tests +// ============================================================================ + +// Host creates partition_count slices, GetSlice looks up by index, out-of- +// range returns nullptr (not a crash). +void TestHostCreatesSlicesAndGetSlice() { + std::cout << "\n[TEST] CircuitBreakerHost: creates slices + GetSlice..." + << std::endl; + try { + auto cb = DefaultCbConfig(); + CircuitBreakerHost host("svc", "10.0.0.1", 8080, 4, cb); + + bool count_ok = host.partition_count() == 4; + bool slice0 = host.GetSlice(0) != nullptr; + bool slice3 = host.GetSlice(3) != nullptr; + bool slice4_null = host.GetSlice(4) == nullptr; // out of range + bool slice_big_null = host.GetSlice(100) == nullptr; + + // Retry budget always present. + bool rb_present = host.GetRetryBudget() != nullptr; + + // Field getters. + bool fields_ok = host.service_name() == "svc" && + host.host() == "10.0.0.1" && + host.port() == 8080; + + bool pass = count_ok && slice0 && slice3 && slice4_null && + slice_big_null && rb_present && fields_ok; + TestFramework::RecordTest("CircuitBreakerHost GetSlice", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CircuitBreakerHost GetSlice", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Host Snapshot aggregates counters across slices and rolls up states. +void TestHostSnapshotAggregates() { + std::cout << "\n[TEST] CircuitBreakerHost: Snapshot aggregates..." 
+ << std::endl; + try { + auto cb = DefaultCbConfig(); + cb.consecutive_failure_threshold = 2; + cb.failure_rate_threshold = 100; + cb.minimum_volume = 1000; + CircuitBreakerHost host("svc", "h", 80, 3, cb); + + // Trip slice 0 and 2 → 2 open_partitions, 1 closed. + for (int p : {0, 2}) { + auto* s = host.GetSlice(p); + for (int i = 0; i < 2; ++i) { + auto a = s->TryAcquire(); + s->ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); + } + } + + auto snap = host.Snapshot(); + + bool rows_ok = snap.slices.size() == 3; + bool total_trips = snap.total_trips == 2; + bool open = snap.open_partitions == 2; + bool halfopen = snap.half_open_partitions == 0; + bool svc_ok = snap.service_name == "svc" && + snap.host == "h" && snap.port == 80; + + bool pass = rows_ok && total_trips && open && halfopen && svc_ok; + TestFramework::RecordTest("CircuitBreakerHost Snapshot aggregates", pass, + pass ? "" : + "rows=" + std::to_string(snap.slices.size()) + + " trips=" + std::to_string(snap.total_trips) + + " open=" + std::to_string(snap.open_partitions), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CircuitBreakerHost Snapshot aggregates", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Host Reload with mismatched dispatcher count logs error and does nothing. +// Uses an empty dispatcher vector — the mismatch path must NOT dereference. +void TestHostReloadDispatcherMismatchIsSafe() { + std::cout << "\n[TEST] CircuitBreakerHost: Reload dispatcher mismatch..." + << std::endl; + try { + auto cb = DefaultCbConfig(); + CircuitBreakerHost host("svc", "h", 80, 3, cb); + + auto new_cb = cb; + new_cb.failure_rate_threshold = 80; + + // Mismatch: 0 dispatchers vs 3 slices. Must not crash, must not + // apply (retry budget atomics should stay at old values). + std::vector> empty; + host.Reload(empty, new_cb); + + // Retry budget fields should be unchanged — Reload bailed early. 
+ bool rb_unchanged = + host.GetRetryBudget()->percent() == cb.retry_budget_percent && + host.GetRetryBudget()->min_concurrency() == + cb.retry_budget_min_concurrency; + + TestFramework::RecordTest("CircuitBreakerHost Reload mismatch is safe", + rb_unchanged, + rb_unchanged ? "" : "retry budget incorrectly updated on bail", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CircuitBreakerHost Reload mismatch is safe", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// ============================================================================ +// CircuitBreakerManager tests +// ============================================================================ + +// Manager builds one host per upstream (regardless of enabled). GetHost +// returns non-null for known names and null for unknown. +void TestManagerGetHostLookup() { + std::cout << "\n[TEST] CircuitBreakerManager: GetHost lookup..." + << std::endl; + try { + std::vector upstreams(2); + upstreams[0].name = "svc-a"; + upstreams[0].host = "10.0.0.1"; + upstreams[0].port = 8080; + upstreams[0].circuit_breaker = DefaultCbConfig(); + upstreams[1].name = "svc-b"; + upstreams[1].host = "10.0.0.2"; + upstreams[1].port = 9090; + upstreams[1].circuit_breaker = DefaultCbConfig(); + upstreams[1].circuit_breaker.enabled = false; // disabled still built + + CircuitBreakerManager mgr(upstreams, 4, {}); + + bool count_ok = mgr.host_count() == 2; + auto* a = mgr.GetHost("svc-a"); + auto* b = mgr.GetHost("svc-b"); + auto* unknown = mgr.GetHost("nope"); + + bool a_ok = a != nullptr && a->port() == 8080 && + a->partition_count() == 4; + bool b_ok = b != nullptr && b->port() == 9090 && + b->partition_count() == 4; + bool unknown_null = unknown == nullptr; + + bool pass = count_ok && a_ok && b_ok && unknown_null; + TestFramework::RecordTest("CircuitBreakerManager GetHost lookup", pass, + pass ? 
"" : + "count_ok=" + std::to_string(count_ok) + + " a=" + std::to_string(a_ok) + + " b=" + std::to_string(b_ok) + + " unknown_null=" + std::to_string(unknown_null), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CircuitBreakerManager GetHost lookup", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// SnapshotAll returns one entry per host; topology-preserved Reload logs and +// skips new/removed names without crashing. +void TestManagerSnapshotAllAndReloadSkipsTopologyChanges() { + std::cout << "\n[TEST] CircuitBreakerManager: SnapshotAll + Reload skips topology..." + << std::endl; + try { + std::vector upstreams(1); + upstreams[0].name = "svc-a"; + upstreams[0].host = "h"; + upstreams[0].port = 80; + upstreams[0].circuit_breaker = DefaultCbConfig(); + + CircuitBreakerManager mgr(upstreams, 2, {}); + + auto snaps = mgr.SnapshotAll(); + bool one_snapshot = snaps.size() == 1; + bool snap_name_ok = snaps[0].service_name == "svc-a"; + + // Reload with a NEW name + REMOVED existing name — both must log + // warn and do nothing (topology is restart-only). + std::vector new_upstreams(1); + new_upstreams[0].name = "svc-NEW"; + new_upstreams[0].host = "h"; + new_upstreams[0].port = 80; + new_upstreams[0].circuit_breaker = DefaultCbConfig(); + + mgr.Reload(new_upstreams); + + // Manager must still only know about svc-a (the original). + bool original_preserved = mgr.GetHost("svc-a") != nullptr; + bool new_not_added = mgr.GetHost("svc-NEW") == nullptr; + bool count_stable = mgr.host_count() == 1; + + bool pass = one_snapshot && snap_name_ok && original_preserved && + new_not_added && count_stable; + TestFramework::RecordTest( + "CircuitBreakerManager SnapshotAll + topology-skip", pass, + pass ? 
"" : + "one_snap=" + std::to_string(one_snapshot) + + " name_ok=" + std::to_string(snap_name_ok) + + " preserved=" + std::to_string(original_preserved) + + " new_not_added=" + std::to_string(new_not_added) + + " count=" + std::to_string(mgr.host_count()), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CircuitBreakerManager SnapshotAll + topology-skip", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Empty-name upstream is skipped defensively (ConfigLoader::Validate rejects +// empty names, but manager must not blow up if something slips through). +void TestManagerSkipsEmptyNameUpstream() { + std::cout << "\n[TEST] CircuitBreakerManager: skips empty-name upstream..." + << std::endl; + try { + std::vector upstreams(2); + upstreams[0].name = ""; // defensive — should be skipped + upstreams[0].host = "h"; + upstreams[0].port = 80; + upstreams[0].circuit_breaker = DefaultCbConfig(); + upstreams[1].name = "svc-b"; + upstreams[1].host = "h"; + upstreams[1].port = 81; + upstreams[1].circuit_breaker = DefaultCbConfig(); + + CircuitBreakerManager mgr(upstreams, 2, {}); + + bool pass = mgr.host_count() == 1 && + mgr.GetHost("svc-b") != nullptr && + mgr.GetHost("") == nullptr; + TestFramework::RecordTest( + "CircuitBreakerManager skips empty-name upstream", pass, + pass ? "" : "count=" + std::to_string(mgr.host_count()), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CircuitBreakerManager skips empty-name upstream", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Run all Phase 3 tests. 
+void RunAllTests() { + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "CIRCUIT BREAKER PHASE 3 - UNIT TESTS" << std::endl; + std::cout << std::string(60, '=') << std::endl; + + TestRetryBudgetMinConcurrencyFloor(); + TestRetryBudgetPercentCap(); + TestRetryBudgetInFlightGuardRaii(); + TestRetryBudgetReloadPreservesCounters(); + TestRetryBudgetClampsInvalidTuning(); + + TestHostCreatesSlicesAndGetSlice(); + TestHostSnapshotAggregates(); + TestHostReloadDispatcherMismatchIsSafe(); + + TestManagerGetHostLookup(); + TestManagerSnapshotAllAndReloadSkipsTopologyChanges(); + TestManagerSkipsEmptyNameUpstream(); +} + +} // namespace CircuitBreakerPhase3Tests diff --git a/test/config_test.h b/test/config_test.h index 6317151f..fe164ec3 100644 --- a/test/config_test.h +++ b/test/config_test.h @@ -362,8 +362,9 @@ namespace ConfigTests { cb.permitted_half_open_calls == 5 && cb.base_open_duration_ms == 5000 && cb.max_open_duration_ms == 60000 && - cb.max_ejection_percent_per_host_set == 50; - // retry_budget_* fields removed from Phase 2 — Phase 3 adds. + cb.max_ejection_percent_per_host_set == 50 && + cb.retry_budget_percent == 20 && + cb.retry_budget_min_concurrency == 3; TestFramework::RecordTest("Circuit Breaker Defaults", pass, pass ? 
"" : "default value mismatch", TestFramework::TestCategory::OTHER); @@ -392,7 +393,9 @@ namespace ConfigTests { "permitted_half_open_calls": 3, "base_open_duration_ms": 2000, "max_open_duration_ms": 120000, - "max_ejection_percent_per_host_set": 33 + "max_ejection_percent_per_host_set": 33, + "retry_budget_percent": 10, + "retry_budget_min_concurrency": 5 } }] })"; @@ -406,7 +409,9 @@ namespace ConfigTests { cb.permitted_half_open_calls == 3 && cb.base_open_duration_ms == 2000 && cb.max_open_duration_ms == 120000 && - cb.max_ejection_percent_per_host_set == 33; + cb.max_ejection_percent_per_host_set == 33 && + cb.retry_budget_percent == 10 && + cb.retry_budget_min_concurrency == 5; TestFramework::RecordTest("Circuit Breaker JSON Parse", pass, pass ? "" : "parsed values mismatch", TestFramework::TestCategory::OTHER); @@ -520,8 +525,12 @@ namespace ConfigTests { ExpectValidationFailure("CB Validation: max= base_open_duration_ms"); - // retry_budget_percent / retry_budget_min_concurrency validation - // cases removed — fields moved to Phase 3. 
+ ExpectValidationFailure("CB Validation: retry_budget_percent>100", + R"({"retry_budget_percent": 200})", + "retry_budget_percent must be in [0, 100]"); + ExpectValidationFailure("CB Validation: retry_budget_min_concurrency<0", + R"({"retry_budget_min_concurrency": -1})", + "retry_budget_min_concurrency must be >= 0"); ExpectValidationFailure("CB Validation: max_ejection_percent>100", R"({"max_ejection_percent_per_host_set": 150})", "max_ejection_percent_per_host_set must be in [0, 100]"); diff --git a/test/run_test.cc b/test/run_test.cc index 3d55f06f..f118d495 100644 --- a/test/run_test.cc +++ b/test/run_test.cc @@ -14,6 +14,7 @@ #include "proxy_test.h" #include "rate_limit_test.h" #include "circuit_breaker_test.h" +#include "circuit_breaker_phase3_test.h" #include "test_framework.h" #include #include @@ -81,6 +82,9 @@ void RunAllTest(){ // Run circuit breaker tests CircuitBreakerTests::RunAllTests(); + // Run circuit breaker Phase 3 tests (host / manager / retry budget) + CircuitBreakerPhase3Tests::RunAllTests(); + std::cout << "====================================\n" << std::endl; } From 548e16982becc0a670045b28726e1ec7d3e148e5 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 13:22:11 +0800 Subject: [PATCH 16/37] Finished Phase4: Host + manager + retry budget --- Makefile | 2 +- include/http/http_server.h | 14 + include/upstream/pool_partition.h | 5 + include/upstream/proxy_transaction.h | 87 +++++- include/upstream/upstream_manager.h | 29 ++ server/http_server.cc | 23 ++ server/proxy_transaction.cc | 275 ++++++++++++++++- test/circuit_breaker_phase4_test.h | 440 +++++++++++++++++++++++++++ test/run_test.cc | 5 + 9 files changed, 865 insertions(+), 15 deletions(-) create mode 100644 test/circuit_breaker_phase4_test.h diff --git a/Makefile b/Makefile index 935949c8..2dbd8c2a 100644 --- a/Makefile +++ b/Makefile @@ -147,7 +147,7 @@ UPSTREAM_HEADERS = $(LIB_DIR)/upstream/upstream_manager.h $(LIB_DIR)/upstream/up RATE_LIMIT_HEADERS = 
$(LIB_DIR)/rate_limit/token_bucket.h $(LIB_DIR)/rate_limit/rate_limit_zone.h $(LIB_DIR)/rate_limit/rate_limiter.h CIRCUIT_BREAKER_HEADERS = $(LIB_DIR)/circuit_breaker/circuit_breaker_state.h $(LIB_DIR)/circuit_breaker/circuit_breaker_window.h $(LIB_DIR)/circuit_breaker/circuit_breaker_slice.h $(LIB_DIR)/circuit_breaker/retry_budget.h $(LIB_DIR)/circuit_breaker/circuit_breaker_host.h $(LIB_DIR)/circuit_breaker/circuit_breaker_manager.h CLI_HEADERS = $(LIB_DIR)/cli/cli_parser.h $(LIB_DIR)/cli/signal_handler.h $(LIB_DIR)/cli/pid_file.h $(LIB_DIR)/cli/version.h $(LIB_DIR)/cli/daemonizer.h -TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h $(TEST_DIR)/proxy_test.h $(TEST_DIR)/rate_limit_test.h $(TEST_DIR)/kqueue_test.h $(TEST_DIR)/circuit_breaker_test.h $(TEST_DIR)/circuit_breaker_phase3_test.h +TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h $(TEST_DIR)/proxy_test.h $(TEST_DIR)/rate_limit_test.h $(TEST_DIR)/kqueue_test.h $(TEST_DIR)/circuit_breaker_test.h $(TEST_DIR)/circuit_breaker_phase3_test.h $(TEST_DIR)/circuit_breaker_phase4_test.h # All headers combined HEADERS = $(CORE_HEADERS) $(CALLBACK_HEADERS) $(REACTOR_HEADERS) $(NETWORK_HEADERS) $(SERVER_HEADERS) $(THREAD_POOL_HEADERS) $(UTIL_HEADERS) $(FOUNDATION_HEADERS) $(HTTP_HEADERS) $(HTTP2_HEADERS) $(WS_HEADERS) $(TLS_HEADERS) 
$(UPSTREAM_HEADERS) $(RATE_LIMIT_HEADERS) $(CIRCUIT_BREAKER_HEADERS) $(CLI_HEADERS) $(TEST_HEADERS)

diff --git a/include/http/http_server.h b/include/http/http_server.h
index 8a497d8b..75e1e9a0 100644
--- a/include/http/http_server.h
+++ b/include/http/http_server.h
@@ -22,6 +22,10 @@
 class UpstreamManager;
 class ProxyHandler;
 
+namespace circuit_breaker {
+class CircuitBreakerManager;
+}
+
 class HttpServer {
  public:
   // Snapshot of server runtime statistics. All values are approximate
@@ -321,6 +325,16 @@ class HttpServer {
   std::vector<UpstreamConfig> upstream_configs_;
   std::unique_ptr<UpstreamManager> upstream_manager_;
 
+  // Circuit breaker — declared AFTER upstream_manager_ so destruction
+  // order is breaker-FIRST, pool-SECOND (design §3.1). On shutdown the
+  // breaker's slices may still be consulted by in-flight
+  // ProxyTransactions until they drain; destroying the breaker first
+  // (before the pool) is safe because UpstreamManager's outstanding
+  // breaker_manager_ pointer is checked against null on every lookup.
+  // Destroying the pool first would leave breaker slices holding
+  // dangling references.
+  std::unique_ptr<circuit_breaker::CircuitBreakerManager> circuit_breaker_manager_;
+
   // Rate limiting
   RateLimitConfig rate_limit_config_;
   std::unique_ptr rate_limit_manager_;

diff --git a/include/upstream/pool_partition.h b/include/upstream/pool_partition.h
index 4c33a0cd..f259204a 100644
--- a/include/upstream/pool_partition.h
+++ b/include/upstream/pool_partition.h
@@ -25,6 +25,11 @@ class PoolPartition {
   static constexpr int CHECKOUT_CONNECT_TIMEOUT = -3;
   static constexpr int CHECKOUT_SHUTTING_DOWN = -4;
   static constexpr int CHECKOUT_QUEUE_TIMEOUT = -5;
+  // Delivered to wait-queue waiters drained on a breaker trip (Phase 6
+  // implements the drain path). ProxyTransaction::OnCheckoutError maps
+  // this to RESULT_CIRCUIT_OPEN so the queued client gets the same
+  // circuit-open response a fresh requester would get.
+  static constexpr int CHECKOUT_CIRCUIT_OPEN = -6;
 
   PoolPartition(std::shared_ptr<Dispatcher> dispatcher,
                 const std::string& upstream_host, int upstream_port,

diff --git a/include/upstream/proxy_transaction.h b/include/upstream/proxy_transaction.h
index 6e25c689..6befe5a0 100644
--- a/include/upstream/proxy_transaction.h
+++ b/include/upstream/proxy_transaction.h
@@ -15,16 +15,29 @@
 class UpstreamManager;
 class ConnectionHandler;
 class Dispatcher;
 
+namespace circuit_breaker {
+class CircuitBreakerSlice;
+}
+
 class ProxyTransaction : public std::enable_shared_from_this<ProxyTransaction> {
  public:
   // Result codes for internal state tracking
-  static constexpr int RESULT_SUCCESS = 0;
-  static constexpr int RESULT_CHECKOUT_FAILED = -1;  // Upstream connect failure → 502
-  static constexpr int RESULT_SEND_FAILED = -2;
-  static constexpr int RESULT_PARSE_ERROR = -3;
-  static constexpr int RESULT_RESPONSE_TIMEOUT = -4;
+  static constexpr int RESULT_SUCCESS             = 0;
+  static constexpr int RESULT_CHECKOUT_FAILED     = -1;  // Upstream connect failure → 502
+  static constexpr int RESULT_SEND_FAILED         = -2;
+  static constexpr int RESULT_PARSE_ERROR         = -3;
+  static constexpr int RESULT_RESPONSE_TIMEOUT    = -4;
   static constexpr int RESULT_UPSTREAM_DISCONNECT = -5;
-  static constexpr int RESULT_POOL_EXHAUSTED = -6;  // Local capacity → 503
+  static constexpr int RESULT_POOL_EXHAUSTED      = -6;  // Local capacity → 503
+  // Circuit breaker rejected this attempt before it touched the upstream.
+  // Carries Retry-After + X-Circuit-Breaker headers (§12.1).
+  // Terminal — retry loop MUST NOT retry this outcome (§8).
+  static constexpr int RESULT_CIRCUIT_OPEN        = -7;
+  // Retry budget exhausted (Phase 5 wires the actual gate; the code is
+  // reserved here so MakeErrorResponse and the retry loop both know it
+  // exists and terminal-classify it). No Retry-After; distinct header
+  // X-Retry-Budget-Exhausted so operators can tell the two 503s apart.
+ static constexpr int RESULT_RETRY_BUDGET_EXHAUSTED = -8; // Constructor copies all needed fields from client_request (method, path, // query, headers, body, params, dispatcher_index, client_ip, client_tls, @@ -145,6 +158,30 @@ class ProxyTransaction : public std::enable_shared_from_this { // Timing std::chrono::steady_clock::time_point start_time_; + // Circuit breaker integration (Phase 4). Resolved once in Start() from + // `service_name_` + `dispatcher_index_`. Null when there's no + // CircuitBreakerManager attached (server has no upstreams, or the + // breaker is being built lazily) — the breaker is simply skipped in + // that case. Lifetime: the slice is owned by CircuitBreakerHost in + // CircuitBreakerManager on HttpServer, which outlives this transaction. + circuit_breaker::CircuitBreakerSlice* slice_ = nullptr; + + // Per-ATTEMPT admission state. Reset on each call to ConsultBreaker(); + // paired Report*() calls thread the `generation` back so the slice + // can drop stale completions across state transitions (see + // CircuitBreakerSlice::Admission doc). generation_==0 is a sentinel + // for "no admission held" — slice domain gens start at 1 so a 0-gen + // report always drops safely. + uint64_t admission_generation_ = 0; + bool is_probe_ = false; + + // Retry-budget token held by this transaction's most recent retry + // attempt. Phase 5 flips this to true on successful TryConsumeRetry + // and clears it on ReleaseRetry. Phase 4 declares the field so + // Cleanup() and Cancel() have something to check, but the retry + // loop does not yet consume the budget. 
+ bool retry_token_held_ = false; + // Internal methods void AttemptCheckout(); void OnCheckoutReady(UpstreamLease lease); @@ -170,6 +207,42 @@ class ProxyTransaction : public std::enable_shared_from_this { void ArmResponseTimeout(int explicit_budget_ms = 0); void ClearResponseTimeout(); - // Error response factory (maps result codes to HTTP responses) + // Error response factory (maps result codes to HTTP responses). + // Circuit-open and retry-budget responses need richer context + // (Retry-After from slice_, distinguishing header), so they have + // dedicated factories below — MakeErrorResponse falls back to a + // plain 503 for those codes if called generically. static HttpResponse MakeErrorResponse(int result_code); + + // Phase 4: emit the §12.1 circuit-open response. + // 503 + Retry-After (seconds until slice->OpenUntil()) + // + X-Circuit-Breaker: open + // + X-Upstream-Host: service:host:port + HttpResponse MakeCircuitOpenResponse() const; + + // Phase 5 will emit this. Declared here so Phase 4's + // MakeErrorResponse RESULT_RETRY_BUDGET_EXHAUSTED branch has a + // target to dispatch to and so tests can assert the response shape + // even before the retry-budget gate is wired. + // 503 + X-Retry-Budget-Exhausted: 1 + static HttpResponse MakeRetryBudgetResponse(); + + // Phase 4 helpers — breaker gate and outcome classification. + // + // ConsultBreaker: call at the top of AttemptCheckout. Populates + // admission_generation_ and is_probe_ on admission; delivers the + // circuit-open response and returns false on reject. Dry-run admits + // and returns true (slice already counted the would-reject). + // Returns true if the caller should proceed to CheckoutAsync. + bool ConsultBreaker(); + + // ReportBreakerOutcome: classify a result_code into + // success/failure/neutral (per design §7) and call slice->Report* + // with admission_generation_. Clears admission_generation_ so a + // double-report is impossible. 
+ // + // failure_kind is ignored unless the outcome is a FailureKind-bearing + // result; the caller passes the appropriate kind for 5xx vs disconnect + // vs timeout since the slice treats them differently only for logs. + void ReportBreakerOutcome(int result_code); }; diff --git a/include/upstream/upstream_manager.h b/include/upstream/upstream_manager.h index c308cbd3..f647d3b3 100644 --- a/include/upstream/upstream_manager.h +++ b/include/upstream/upstream_manager.h @@ -9,6 +9,10 @@ class TlsClientContext; +namespace circuit_breaker { +class CircuitBreakerManager; +} + class UpstreamManager { public: UpstreamManager(const std::vector& upstreams, @@ -59,6 +63,23 @@ class UpstreamManager { // Check if an upstream service is configured bool HasUpstream(const std::string& service_name) const; + // Install a non-owning pointer to the server's CircuitBreakerManager. + // Called once from HttpServer::MarkServerReady after both managers are + // constructed (§3.1). Lifetime guarantee: the CircuitBreakerManager + // is declared AFTER upstream_manager_ on HttpServer, so it destructs + // FIRST — UpstreamManager never reads through a dangling pointer on + // shutdown. Passing nullptr is allowed (detaches). + void AttachCircuitBreakerManager(circuit_breaker::CircuitBreakerManager* mgr) { + breaker_manager_.store(mgr, std::memory_order_release); + } + + // Returns the attached breaker manager, or nullptr if no manager is + // attached. Safe from any thread (atomic load, acquire so any + // Attach-time publication is visible). + circuit_breaker::CircuitBreakerManager* GetCircuitBreakerManager() const { + return breaker_manager_.load(std::memory_order_acquire); + } + private: // service_name → host pool. Built once at construction, never modified. std::unordered_map> pools_; @@ -73,6 +94,14 @@ class UpstreamManager { // reject new checkouts before per-partition shutdown tasks execute. 
std::atomic shutting_down_{false}; + // Non-owning pointer to the circuit-breaker manager, installed by + // HttpServer::MarkServerReady after both managers exist. Atomic so + // late-arriving hot-path reads in ProxyTransaction see either a + // coherent pointer or nullptr (never torn). Owned by HttpServer; + // lifetime outlives UpstreamManager (breaker destructs first — + // §3.1 ownership). Default nullptr — breaker is an opt-in layer. + std::atomic breaker_manager_{nullptr}; + // Manager-owned atomic counter: total outstanding connections std::atomic outstanding_conns_{0}; diff --git a/server/http_server.cc b/server/http_server.cc index ecfff96f..fbf06947 100644 --- a/server/http_server.cc +++ b/server/http_server.cc @@ -5,6 +5,7 @@ #include "http2/http2_constants.h" #include "upstream/upstream_manager.h" #include "upstream/proxy_handler.h" +#include "circuit_breaker/circuit_breaker_manager.h" #include "log/logger.h" #include "log/log_utils.h" #include @@ -361,6 +362,28 @@ void HttpServer::MarkServerReady() { throw; } + // Circuit breaker — built alongside the pool. One host per + // configured upstream (regardless of enabled), with one slice + // per dispatcher so hot-path TryAcquire is lock-free. Attached + // to UpstreamManager via a non-owning pointer so ProxyTransaction + // can reach it on the hot path via upstream_manager_-> + // GetCircuitBreakerManager(). The manager is declared AFTER + // upstream_manager_ on HttpServer (see header) so teardown runs + // breaker-first, which matches the dangling-pointer safety rule + // in UpstreamManager::breaker_manager_. + try { + circuit_breaker_manager_ = + std::make_unique( + upstream_configs_, dispatchers.size(), dispatchers); + upstream_manager_->AttachCircuitBreakerManager( + circuit_breaker_manager_.get()); + } catch (...) { + logging::Get()->error( + "Circuit breaker init failed, stopping server"); + net_server_.Stop(); + throw; + } + // Ensure the timer cadence is fast enough for upstream connect timeouts. 
// SetDeadline stores a ms-precision deadline, but TimerHandler only fires // at the timer scan interval. If connect_timeout_ms < current interval, diff --git a/server/proxy_transaction.cc b/server/proxy_transaction.cc index 18aa6193..c263332e 100644 --- a/server/proxy_transaction.cc +++ b/server/proxy_transaction.cc @@ -2,6 +2,9 @@ #include "upstream/upstream_manager.h" #include "upstream/upstream_connection.h" #include "upstream/http_request_serializer.h" +#include "circuit_breaker/circuit_breaker_manager.h" +#include "circuit_breaker/circuit_breaker_host.h" +#include "circuit_breaker/circuit_breaker_slice.h" #include "connection_handler.h" #include "dispatcher.h" // config/server_config.h provided by proxy_transaction.h (ProxyConfig stored by value) @@ -110,12 +113,39 @@ void ProxyTransaction::Start() { upstream_host_, upstream_port_, method_, upstream_path); + // Resolve the circuit-breaker slice once. Null when no breaker is + // attached (server has no upstreams configured, or Phase 4 skipped + // on this deployment), or when the service/dispatcher pair is out of + // range. In any null case the breaker is simply bypassed — the + // transaction proceeds as if circuit breaking were disabled. + if (upstream_manager_ && dispatcher_index_ >= 0) { + auto* cbm = upstream_manager_->GetCircuitBreakerManager(); + if (cbm) { + auto* host = cbm->GetHost(service_name_); + if (host) { + slice_ = host->GetSlice(static_cast(dispatcher_index_)); + } + } + } + AttemptCheckout(); } void ProxyTransaction::AttemptCheckout() { state_ = State::CHECKOUT_PENDING; + // Circuit breaker gate — consulted before every attempt (first try and + // retries both). Each attempt gets a fresh admission stamped with the + // slice's current generation. 
If the slice rejects with REJECTED_OPEN, + // ConsultBreaker delivers the §12.1 response and returns false; the + // retry loop treats RESULT_CIRCUIT_OPEN as terminal (§8) so a rejected + // retry produces a single 503 to the client, not a nested retry. + // Dry-run reject logs inside TryAcquire and returns ADMITTED through + // the decision enum (REJECTED_OPEN_DRYRUN), so ConsultBreaker proceeds. + if (!ConsultBreaker()) { + return; + } + auto self = shared_from_this(); // Lazily allocate the shared cancel token so the pool can drop @@ -224,21 +254,58 @@ void ProxyTransaction::OnCheckoutError(int error_code) { // Only retry actual network connect failures. Pool saturation // (POOL_EXHAUSTED, QUEUE_TIMEOUT) and shutdown should fail fast — // retrying under backpressure amplifies load on an already-stressed - // pool and stretches client latency with no benefit. + // pool and stretches client latency with no benefit. A breaker-drain + // reject (CHECKOUT_CIRCUIT_OPEN, Phase 6) is also terminal: the + // client gets the same circuit-open response a fresh requester + // would, and the retry loop must not retry it. + // + // Breaker reporting: connect failures (both timeout and refused) are + // upstream-health signals → ReportFailure(CONNECT_FAILURE). Local + // capacity (POOL_EXHAUSTED, QUEUE_TIMEOUT) and shutdown are NOT + // reported — they don't imply upstream unhealthiness (design §7). + // CHECKOUT_CIRCUIT_OPEN is also not reported to the breaker (would + // be a feedback loop — our own reject counting against the upstream). 
+ // // Import error codes from PoolPartition: - // CHECKOUT_CONNECT_FAILED = -2 → retryable - // CHECKOUT_CONNECT_TIMEOUT = -3 → retryable - // CHECKOUT_POOL_EXHAUSTED = -1 → not retryable - // CHECKOUT_QUEUE_TIMEOUT = -5 → not retryable - // CHECKOUT_SHUTTING_DOWN = -4 → not retryable + // CHECKOUT_CONNECT_FAILED = -2 → retryable, report CONNECT_FAILURE + // CHECKOUT_CONNECT_TIMEOUT = -3 → retryable, report CONNECT_FAILURE + // CHECKOUT_POOL_EXHAUSTED = -1 → not retryable, neutral-release probe + // CHECKOUT_QUEUE_TIMEOUT = -5 → not retryable, neutral-release probe + // CHECKOUT_SHUTTING_DOWN = -4 → not retryable, neutral-release probe + // CHECKOUT_CIRCUIT_OPEN = -6 → not retryable, do NOT report static constexpr int CONNECT_FAILED = -2; static constexpr int CONNECT_TIMEOUT = -3; + static constexpr int CIRCUIT_OPEN = -6; + + if (error_code == CIRCUIT_OPEN) { + // Drain path: breaker tripped while this transaction was queued + // (Phase 6 implements the drain). Do NOT Report to the slice — + // our own reject must not feed back into the failure math. Emit + // the §12.1 circuit-open response directly. + logging::Get()->info( + "ProxyTransaction checkout drained by circuit breaker " + "client_fd={} service={}", + client_fd_, service_name_); + DeliverResponse(MakeCircuitOpenResponse()); + // Clear admission_generation_ so Cleanup / destructor doesn't + // double-report. The admission was already fire-and-forget — + // slice-side bookkeeping is intact (the drain itself doesn't + // touch inflight counters because the breaker didn't admit). + admission_generation_ = 0; + return; + } if (error_code == CONNECT_FAILED || error_code == CONNECT_TIMEOUT) { + // Report connect failure to the breaker BEFORE retrying — + // otherwise the retry's ConsultBreaker might admit against a + // stale success count, delaying trip detection. 
+ ReportBreakerOutcome(RESULT_CHECKOUT_FAILED); MaybeRetry(RetryPolicy::RetryCondition::CONNECT_FAILURE); } else { // Pool exhaustion, queue timeout, or shutdown — local capacity issue. // Use RESULT_POOL_EXHAUSTED → 503 (not 502 which implies upstream failure). + // Release the breaker slot neutrally — admission never reached upstream. + ReportBreakerOutcome(RESULT_POOL_EXHAUSTED); OnError(RESULT_POOL_EXHAUSTED, "Pool checkout failed (local capacity, error=" + std::to_string(error_code) + ")"); @@ -517,10 +584,20 @@ void ProxyTransaction::OnResponseComplete() { "service={} status={} attempt={}", client_fd_, service_name_, response.status_code, attempt_); + // Report failure BEFORE MaybeRetry — the retry's fresh + // ConsultBreaker must see the just-added failure in the window + // (and potentially reject if this was the trip-causing call). + // Pass a synthetic RESULT_CHECKOUT_FAILED-like signal; the + // classifier maps 5xx → FailureKind::RESPONSE_5XX. + ReportBreakerOutcome(/* sentinel */ -1000); MaybeRetry(RetryPolicy::RetryCondition::RESPONSE_5XX); return; } + // 2xx / 3xx / 4xx: upstream is healthy (from the breaker's + // perspective — 4xx is a client-side problem). Report success. + ReportBreakerOutcome(RESULT_SUCCESS); + state_ = State::COMPLETE; auto duration = std::chrono::duration_cast( @@ -550,8 +627,19 @@ void ProxyTransaction::OnError(int result_code, client_fd_, service_name_, result_code, attempt_, duration.count(), log_message); + // Report the outcome if an admission is still held. Most error paths + // call ReportBreakerOutcome themselves BEFORE reaching OnError (so a + // retry's ConsultBreaker sees the fresh signal) — this is a safety + // net for error paths that skipped reporting, e.g., RESULT_SEND_FAILED + // and RESULT_RESPONSE_TIMEOUT from the on-upstream-data paths. + // ReportBreakerOutcome is idempotent: it clears admission_generation_ + // on the first call so a double-call drops harmlessly. 
+ ReportBreakerOutcome(result_code); + state_ = State::FAILED; - HttpResponse error_response = MakeErrorResponse(result_code); + HttpResponse error_response = (result_code == RESULT_CIRCUIT_OPEN) + ? MakeCircuitOpenResponse() + : MakeErrorResponse(result_code); DeliverResponse(std::move(error_response)); } @@ -886,6 +974,15 @@ HttpResponse ProxyTransaction::MakeErrorResponse(int result_code) { if (result_code == RESULT_POOL_EXHAUSTED) { return HttpResponse::ServiceUnavailable(); } + if (result_code == RESULT_RETRY_BUDGET_EXHAUSTED) { + return MakeRetryBudgetResponse(); + } + if (result_code == RESULT_CIRCUIT_OPEN) { + // MakeErrorResponse is static and has no `this` — the richer + // MakeCircuitOpenResponse(slice_) path is preferred. Fall back + // to a plain 503 here for the rare static-context invocation. + return HttpResponse::ServiceUnavailable(); + } if (result_code == RESULT_CHECKOUT_FAILED || result_code == RESULT_SEND_FAILED || result_code == RESULT_PARSE_ERROR || @@ -894,3 +991,167 @@ HttpResponse ProxyTransaction::MakeErrorResponse(int result_code) { } return HttpResponse::InternalError(); } + +HttpResponse ProxyTransaction::MakeCircuitOpenResponse() const { + // Compute Retry-After from slice->OpenUntil() if the slice is known. + // Falls back to a conservative 1-second hint if the slice is null + // (shouldn't happen on the circuit-open path — that path requires a + // slice — but defense in depth). + int retry_after_secs = 1; + if (slice_) { + auto open_until = slice_->OpenUntil(); + // OpenUntil returns a zero time_point when NOT OPEN. Checking + // against zero with steady_clock::time_point is fiddly; use + // time_since_epoch().count() > 0 as the "is-set" check. 
+ if (open_until.time_since_epoch().count() > 0) { + auto now = std::chrono::steady_clock::now(); + auto diff = std::chrono::duration_cast( + open_until - now).count(); + // Clamp to [1, 300] — Retry-After=0 is silly, and an hour+ + // is misleading (ops usually want operators to check + // sooner). The breaker's open duration caps out around + // minutes; anything larger means we're dealing with a + // cascade and we should hint sooner. + if (diff < 1) diff = 1; + if (diff > 300) diff = 300; + retry_after_secs = static_cast(diff); + } + } + + HttpResponse resp; + resp.Status(HttpStatus::SERVICE_UNAVAILABLE); + resp.Text("Upstream circuit breaker is open; please retry later.\n"); + resp.Header("Retry-After", std::to_string(retry_after_secs)); + resp.Header("X-Circuit-Breaker", "open"); + // Hint operators (not clients) at which upstream tripped. Useful + // when a gateway fronts multiple backends; without this header, a + // 503 is opaque. + resp.Header("X-Upstream-Host", + upstream_host_ + ":" + std::to_string(upstream_port_)); + resp.Header("Connection", "close"); + return resp; +} + +HttpResponse ProxyTransaction::MakeRetryBudgetResponse() { + HttpResponse resp; + resp.Status(HttpStatus::SERVICE_UNAVAILABLE); + resp.Text("Upstream retry budget exhausted.\n"); + resp.Header("X-Retry-Budget-Exhausted", "1"); + resp.Header("Connection", "close"); + return resp; +} + +bool ProxyTransaction::ConsultBreaker() { + if (!slice_) { + // No breaker attached for this service. Proceed as if the + // breaker layer didn't exist. admission_generation_ stays 0 so + // any accidental ReportBreakerOutcome call is a no-op. + is_probe_ = false; + admission_generation_ = 0; + return true; + } + auto admission = slice_->TryAcquire(); + + // Stash the admission metadata for the paired Report*() call. Note + // we record this EVEN for REJECTED_OPEN (where generation_==0 is a + // sentinel) — it's harmless and keeps the branches simpler. 
+ admission_generation_ = admission.generation; + is_probe_ = (admission.decision == + circuit_breaker::Decision::ADMITTED_PROBE); + + if (admission.decision == circuit_breaker::Decision::REJECTED_OPEN) { + // Hard reject — slice counted it, logged it, and we must not + // touch the upstream. Emit §12.1 response and DO NOT Report + // back (would create a feedback loop — our own reject counting + // as a failure against the already-OPEN slice). + state_ = State::FAILED; + logging::Get()->info( + "ProxyTransaction circuit-open reject client_fd={} service={} " + "attempt={}", + client_fd_, service_name_, attempt_); + DeliverResponse(MakeCircuitOpenResponse()); + // Clear admission_generation_ — there's nothing to Report. + admission_generation_ = 0; + return false; + } + + // REJECTED_OPEN_DRYRUN: slice logged the would-reject and counted + // it; caller proceeds to the upstream. Fall through as admitted. + // ADMITTED / ADMITTED_PROBE: proceed. + return true; +} + +void ProxyTransaction::ReportBreakerOutcome(int result_code) { + // No slice, or already reported: bail. admission_generation_==0 is + // the sentinel — slice domain generations start at 1, so a 0 gen + // would be rejected as stale anyway; the early return just avoids + // an unnecessary atomic load. The Report* methods themselves are + // idempotent against stale gens, but we also must not increment a + // probe_*/rejected_ counter for a non-event. + if (!slice_ || admission_generation_ == 0) return; + + // Capture + clear in one go so concurrent / re-entrant calls bail. + uint64_t gen = admission_generation_; + admission_generation_ = 0; + bool probe = is_probe_; + is_probe_ = false; + + using circuit_breaker::FailureKind; + + // Synthetic sentinel for the OnResponseComplete 5xx path — maps to + // RESPONSE_5XX without needing a new public result code. Callers + // other than OnResponseComplete never use this value. 
+ static constexpr int SENTINEL_5XX = -1000; + + switch (result_code) { + case RESULT_SUCCESS: + slice_->ReportSuccess(probe, gen); + return; + + case SENTINEL_5XX: + slice_->ReportFailure(FailureKind::RESPONSE_5XX, probe, gen); + return; + + case RESULT_CHECKOUT_FAILED: + slice_->ReportFailure(FailureKind::CONNECT_FAILURE, probe, gen); + return; + + case RESULT_RESPONSE_TIMEOUT: + slice_->ReportFailure(FailureKind::RESPONSE_TIMEOUT, probe, gen); + return; + + case RESULT_UPSTREAM_DISCONNECT: + case RESULT_SEND_FAILED: + slice_->ReportFailure(FailureKind::UPSTREAM_DISCONNECT, probe, gen); + return; + + case RESULT_POOL_EXHAUSTED: + case RESULT_PARSE_ERROR: + // Local outcomes — no upstream health signal. Release the + // admission slot neutrally so a probe doesn't leak the + // HALF_OPEN slot. + slice_->ReportNeutral(probe, gen); + return; + + case RESULT_CIRCUIT_OPEN: + case RESULT_RETRY_BUDGET_EXHAUSTED: + // Our own rejects — MUST NOT feed back into the slice. + // These paths should not reach ReportBreakerOutcome (both + // clear admission_generation_ before delivering), but the + // defensive branch keeps the class-wide invariant: these + // outcomes are invisible to the breaker. + return; + + default: + // Unknown result code — log and neutral-release to keep the + // probe bookkeeping consistent. A runtime log here is + // cheaper than a slice stuck in HALF_OPEN forever because a + // new result code slipped through unclassified. + logging::Get()->error( + "ReportBreakerOutcome: unclassified result_code={} " + "service={} — releasing neutrally", + result_code, service_name_); + slice_->ReportNeutral(probe, gen); + return; + } +} diff --git a/test/circuit_breaker_phase4_test.h b/test/circuit_breaker_phase4_test.h new file mode 100644 index 00000000..db2b095d --- /dev/null +++ b/test/circuit_breaker_phase4_test.h @@ -0,0 +1,440 @@ +#pragma once + +// Phase 4 integration tests: circuit breaker wired into ProxyTransaction + +// UpstreamManager + HttpServer. 
Exercises the full request path end-to-end. +// +// Strategy: use a backend that returns 5xx on every request so repeated hits +// trip the breaker via the consecutive-failure threshold. 5xx responses are +// the cheapest way to accumulate failures (no connect timeouts to wait for). +// Low thresholds keep tests fast. + +#include "test_framework.h" +#include "test_server_runner.h" +#include "http_test_client.h" +#include "http/http_server.h" +#include "config/server_config.h" +#include "upstream/upstream_manager.h" +#include "circuit_breaker/circuit_breaker_manager.h" +#include "circuit_breaker/circuit_breaker_host.h" +#include "circuit_breaker/circuit_breaker_slice.h" + +#include +#include +#include + +namespace CircuitBreakerPhase4Tests { + +using circuit_breaker::State; + +// Shared helper: build an upstream config that proxies /echo → backend and +// has a breaker configured with low thresholds for fast trip. +static UpstreamConfig MakeBreakerUpstream(const std::string& name, + const std::string& host, + int port, + bool breaker_enabled, + int consecutive_threshold = 3) { + UpstreamConfig u; + u.name = name; + u.host = host; + u.port = port; + u.pool.max_connections = 8; + u.pool.max_idle_connections = 4; + u.pool.connect_timeout_ms = 3000; + u.pool.idle_timeout_sec = 30; + u.pool.max_lifetime_sec = 3600; + u.pool.max_requests_per_conn = 0; + + // Exact-match route — simpler than prefix patterns for integration tests. + u.proxy.route_prefix = "/fail"; + u.proxy.strip_prefix = false; + u.proxy.response_timeout_ms = 2000; + // No retries — keeps the test deterministic: one request = one attempt. + u.proxy.retry.max_retries = 0; + + u.circuit_breaker.enabled = breaker_enabled; + u.circuit_breaker.consecutive_failure_threshold = consecutive_threshold; + // Disable the rate-based trip path — we drive everything through + // consecutive failures to keep the test count predictable. 
+  u.circuit_breaker.failure_rate_threshold = 100;
+  u.circuit_breaker.minimum_volume = 10000;
+  u.circuit_breaker.window_seconds = 10;
+  u.circuit_breaker.permitted_half_open_calls = 2;
+  u.circuit_breaker.base_open_duration_ms = 500; // short so recovery test is quick
+  u.circuit_breaker.max_open_duration_ms = 60000;
+  return u;
+}
+
+// ---------------------------------------------------------------------------
+// Test 1: Breaker trips on consecutive 5xx responses and emits circuit-open
+// headers on the rejected request.
+// ---------------------------------------------------------------------------
+void TestBreakerTripsAfterConsecutiveFailures() {
+  std::cout << "\n[TEST] CB Phase 4: breaker trips after consecutive 5xx..."
+            << std::endl;
+  try {
+    // Backend always returns 502 — gateway classifies the response as
+    // FailureKind::RESPONSE_5XX and reports to the breaker on every attempt.
+    HttpServer backend("127.0.0.1", 0);
+    backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) {
+      resp.Status(502).Body("upstream err", "text/plain");
+    });
+    TestServerRunner backend_runner(backend);
+    int backend_port = backend_runner.GetPort();
+
+    ServerConfig gw;
+    gw.bind_host = "127.0.0.1";
+    gw.bind_port = 0;
+    gw.worker_threads = 1; // single thread → single breaker partition exercised
+    gw.http2.enabled = false; // match the existing proxy test pattern
+    gw.upstreams.push_back(
+        MakeBreakerUpstream("bad-svc", "127.0.0.1", backend_port,
+                            /*enabled=*/true, /*threshold=*/3));
+
+    HttpServer gateway(gw);
+    TestServerRunner gw_runner(gateway);
+    int gw_port = gw_runner.GetPort();
+
+    // Hit the failing backend threshold times — each 502 from backend
+    // propagates to the client as 502 (gateway pass-through) AND counts
+    // as a RESPONSE_5XX failure in the breaker.
+ for (int i = 0; i < 3; ++i) { + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + if (!TestHttpClient::HasStatus(r, 502)) { + TestFramework::RecordTest( + "CB Phase 4: trip after consecutive failures", false, + "pre-trip request " + std::to_string(i) + " expected 502, got: " + + r.substr(0, 32)); + return; + } + } + + // Next request must be rejected by the breaker (not proxied). The + // response is 503 with X-Circuit-Breaker: open and Retry-After. + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + bool is_503 = TestHttpClient::HasStatus(r, 503); + bool has_breaker_header = + r.find("X-Circuit-Breaker: open") != std::string::npos || + r.find("x-circuit-breaker: open") != std::string::npos; + bool has_retry_after = + r.find("Retry-After:") != std::string::npos || + r.find("retry-after:") != std::string::npos; + bool has_upstream_host = + r.find("X-Upstream-Host:") != std::string::npos || + r.find("x-upstream-host:") != std::string::npos; + + bool pass = is_503 && has_breaker_header && has_retry_after && + has_upstream_host; + TestFramework::RecordTest( + "CB Phase 4: trip after consecutive failures", pass, + pass ? "" : + "is_503=" + std::to_string(is_503) + + " breaker_hdr=" + std::to_string(has_breaker_header) + + " retry_after=" + std::to_string(has_retry_after) + + " upstream_host=" + std::to_string(has_upstream_host) + + " body=" + r.substr(0, 256)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 4: trip after consecutive failures", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 2: When circuit_breaker.enabled=false, the breaker is bypassed entirely. +// The same failure pattern that would trip an enabled breaker must leave the +// pass-through path untouched — every request still reaches the backend. 
+// --------------------------------------------------------------------------- +void TestBreakerDisabledPassesThrough() { + std::cout << "\n[TEST] CB Phase 4: disabled breaker passes through..." + << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 2; + gw.http2.enabled = false; // match the existing proxy test pattern + gw.upstreams.push_back( + MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/false, /*threshold=*/3)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // 10 requests — with breaker disabled, all 10 reach backend. + for (int i = 0; i < 10; ++i) { + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + if (!TestHttpClient::HasStatus(r, 502)) { + TestFramework::RecordTest( + "CB Phase 4: disabled breaker passes through", false, + "request " + std::to_string(i) + " expected 502, got: " + + r.substr(0, 32)); + return; + } + } + + bool all_hit = backend_hits.load() == 10; + TestFramework::RecordTest( + "CB Phase 4: disabled breaker passes through", all_hit, + all_hit ? "" : + "expected 10 backend hits, got " + std::to_string(backend_hits.load())); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 4: disabled breaker passes through", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 3: 2xx responses are reported as success — they reset the +// consecutive-failure counter so the breaker doesn't trip on interleaved +// success/failure traffic. 
+// ---------------------------------------------------------------------------
+void TestSuccessResetsConsecutiveFailureCounter() {
+  std::cout << "\n[TEST] CB Phase 4: 2xx success resets consecutive-failure counter..."
+            << std::endl;
+  try {
+    std::atomic fail_mode{true};
+    HttpServer backend("127.0.0.1", 0);
+    backend.Get("/fail", [&fail_mode](const HttpRequest&, HttpResponse& resp) {
+      if (fail_mode.load()) {
+        resp.Status(502).Body("err", "text/plain");
+      } else {
+        resp.Status(200).Body("ok", "text/plain");
+      }
+    });
+    TestServerRunner backend_runner(backend);
+    int backend_port = backend_runner.GetPort();
+
+    ServerConfig gw;
+    gw.bind_host = "127.0.0.1";
+    gw.bind_port = 0;
+    gw.worker_threads = 1; // single dispatcher → the reset check reads slice 0
+    gw.http2.enabled = false; // match the existing proxy test pattern
+    gw.upstreams.push_back(
+        MakeBreakerUpstream("svc", "127.0.0.1", backend_port,
+                            /*enabled=*/true, /*threshold=*/3));
+
+    HttpServer gateway(gw);
+    TestServerRunner gw_runner(gateway);
+    int gw_port = gw_runner.GetPort();
+
+    // Pattern: F F S F F — 5 total: 2 fails, 1 success, 2 fails.
+    // With reset semantics, consecutive_failures_ never exceeds 2 → no trip.
+    for (int i = 0; i < 2; ++i) {
+      TestHttpClient::HttpGet(gw_port, "/fail", 3000); // FAIL
+    }
+    fail_mode.store(false);
+    TestHttpClient::HttpGet(gw_port, "/fail", 3000); // SUCCESS → reset
+    fail_mode.store(true);
+    for (int i = 0; i < 2; ++i) {
+      TestHttpClient::HttpGet(gw_port, "/fail", 3000); // FAIL
+    }
+
+    // Inspect the breaker's state directly — it should still be CLOSED.
+    auto* cbm = gateway.GetUpstreamManager() ?
+        gateway.GetUpstreamManager()->GetCircuitBreakerManager() : nullptr;
+    auto* host = cbm ? cbm->GetHost("svc") : nullptr;
+    auto* slice = host ? host->GetSlice(0) : nullptr;
+    bool still_closed = slice && slice->CurrentState() == State::CLOSED;
+
+    TestFramework::RecordTest(
+        "CB Phase 4: success resets consecutive counter", still_closed,
+        still_closed ? "" :
+        "slice not CLOSED after S resets failures: state=" +
+            std::to_string(static_cast(
+                slice ? slice->CurrentState() : State::CLOSED)));
+  } catch (const std::exception& e) {
+    TestFramework::RecordTest(
+        "CB Phase 4: success resets consecutive counter", false, e.what());
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Test 4: After the trip, the live slice state is OPEN. Verifies the
+// integration actually drives the slice state machine (not just the response).
+// ---------------------------------------------------------------------------
+void TestTripDrivesSliceState() {
+  std::cout << "\n[TEST] CB Phase 4: trip drives slice state to OPEN..."
+            << std::endl;
+  try {
+    HttpServer backend("127.0.0.1", 0);
+    backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) {
+      resp.Status(502).Body("err", "text/plain");
+    });
+    TestServerRunner backend_runner(backend);
+    int backend_port = backend_runner.GetPort();
+
+    ServerConfig gw;
+    gw.bind_host = "127.0.0.1";
+    gw.bind_port = 0;
+    gw.worker_threads = 1; // single dispatcher → all 3 failures hit one slice
+    gw.http2.enabled = false; // match the existing proxy test pattern
+    gw.upstreams.push_back(
+        MakeBreakerUpstream("svc", "127.0.0.1", backend_port,
+                            /*enabled=*/true, /*threshold=*/3));
+
+    HttpServer gateway(gw);
+    TestServerRunner gw_runner(gateway);
+    int gw_port = gw_runner.GetPort();
+
+    // 3 failures → trip.
+    for (int i = 0; i < 3; ++i) {
+      TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+    }
+
+    // With a single worker thread all three failures land on the same
+    // dispatcher slice, so the aggregate snapshot must show exactly one
+    // partition OPEN with exactly one trip recorded.
+ auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); + auto* host = cbm->GetHost("svc"); + auto snap = host->Snapshot(); + bool at_least_one_open = snap.open_partitions >= 1; + bool one_trip = snap.total_trips == 1; + // Sanity: the tripped partition should be the one that saw all 3 + // failures (consecutive trip is single-slice, not cross-slice). + bool single_partition_tripped = snap.open_partitions == 1; + + bool pass = at_least_one_open && one_trip && single_partition_tripped; + TestFramework::RecordTest( + "CB Phase 4: trip drives slice state to OPEN", pass, + pass ? "" : + "at_least_one_open=" + std::to_string(at_least_one_open) + + " one_trip=" + std::to_string(one_trip) + + " single_partition=" + std::to_string(single_partition_tripped) + + " (open_partitions=" + std::to_string(snap.open_partitions) + + ", total_trips=" + std::to_string(snap.total_trips) + ")"); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 4: trip drives slice state to OPEN", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 5: Breaker-rejected requests do NOT hit the backend. After the trip, +// subsequent requests must be served locally (503) without any upstream I/O. +// Prevents regression where the gate leaked admissions to a known-bad upstream. +// --------------------------------------------------------------------------- +void TestOpenBreakerShortCircuitsUpstreamCall() { + std::cout << "\n[TEST] CB Phase 4: OPEN breaker short-circuits upstream call..." 
+ << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 2; + gw.http2.enabled = false; // match the existing proxy test pattern + gw.upstreams.push_back( + MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // 3 failing requests to trip. + for (int i = 0; i < 3; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + int hits_at_trip = backend_hits.load(); + + // 5 more requests — all should be rejected locally. + for (int i = 0; i < 5; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + int hits_after = backend_hits.load(); + + // Backend hits must not grow during the post-trip burst. + bool no_leak = hits_after == hits_at_trip; + TestFramework::RecordTest( + "CB Phase 4: OPEN short-circuits upstream call", no_leak, + no_leak ? "" : + "backend hits grew from " + std::to_string(hits_at_trip) + + " to " + std::to_string(hits_after) + " after trip"); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 4: OPEN short-circuits upstream call", false, e.what()); + } +} + +// Sanity check: verify the bare proxy setup works without the breaker +// before blaming the breaker integration. +void TestBareProxyWorks() { + std::cout << "\n[TEST] CB Phase 4: bare proxy (sanity)..." 
<< std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 2; + gw.http2.enabled = false; + UpstreamConfig u; + u.name = "svc"; + u.host = "127.0.0.1"; + u.port = backend_port; + u.pool.max_connections = 8; + u.pool.max_idle_connections = 4; + u.pool.connect_timeout_ms = 3000; + u.proxy.route_prefix = "/fail"; + u.proxy.response_timeout_ms = 5000; + u.circuit_breaker.enabled = true; // sanity + breaker enabled + u.circuit_breaker.consecutive_failure_threshold = 3; + u.circuit_breaker.failure_rate_threshold = 100; + u.circuit_breaker.minimum_volume = 10000; + u.circuit_breaker.window_seconds = 10; + u.circuit_breaker.permitted_half_open_calls = 2; + u.circuit_breaker.base_open_duration_ms = 500; + u.circuit_breaker.max_open_duration_ms = 60000; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 5000); + bool pass = TestHttpClient::HasStatus(r, 502); + TestFramework::RecordTest( + "CB Phase 4: bare proxy sanity", pass, + pass ? 
"" : "expected 502, got: " + r.substr(0, 128)); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB Phase 4: bare proxy sanity", + false, e.what()); + } +} + +void RunAllTests() { + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "CIRCUIT BREAKER PHASE 4 - INTEGRATION TESTS" << std::endl; + std::cout << std::string(60, '=') << std::endl; + + TestBareProxyWorks(); + TestBreakerTripsAfterConsecutiveFailures(); + TestBreakerDisabledPassesThrough(); + TestSuccessResetsConsecutiveFailureCounter(); + TestTripDrivesSliceState(); + TestOpenBreakerShortCircuitsUpstreamCall(); +} + +} // namespace CircuitBreakerPhase4Tests diff --git a/test/run_test.cc b/test/run_test.cc index f118d495..fbf84d49 100644 --- a/test/run_test.cc +++ b/test/run_test.cc @@ -15,6 +15,7 @@ #include "rate_limit_test.h" #include "circuit_breaker_test.h" #include "circuit_breaker_phase3_test.h" +#include "circuit_breaker_phase4_test.h" #include "test_framework.h" #include #include @@ -85,6 +86,10 @@ void RunAllTest(){ // Run circuit breaker Phase 3 tests (host / manager / retry budget) CircuitBreakerPhase3Tests::RunAllTests(); + // Run circuit breaker Phase 4 integration tests (end-to-end through + // ProxyTransaction + UpstreamManager + HttpServer) + CircuitBreakerPhase4Tests::RunAllTests(); + std::cout << "====================================\n" << std::endl; } From 83bf5d115468ea4ec7d17f2bc26ca8210add08c2 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 14:02:32 +0800 Subject: [PATCH 17/37] Fix review comment --- .../circuit_breaker/circuit_breaker_slice.h | 13 + include/upstream/proxy_transaction.h | 10 +- server/circuit_breaker_host.cc | 8 +- server/circuit_breaker_manager.cc | 19 + server/proxy_transaction.cc | 54 +-- server/retry_budget.cc | 32 +- test/circuit_breaker_phase4_test.h | 330 ++++++++++++++++++ test/run_test.cc | 4 +- 8 files changed, 430 insertions(+), 40 deletions(-) diff --git a/include/circuit_breaker/circuit_breaker_slice.h 
b/include/circuit_breaker/circuit_breaker_slice.h index 95a5beee..6e9734df 100644 --- a/include/circuit_breaker/circuit_breaker_slice.h +++ b/include/circuit_breaker/circuit_breaker_slice.h @@ -129,10 +129,23 @@ class CircuitBreakerSlice { const std::string& host_label() const { return host_label_; } size_t dispatcher_index() const { return dispatcher_index_; } + // Read-only view of the live config. Dispatcher-thread-owned for + // writes (Reload only mutates here); readers on other threads get a + // potentially-torn read, which is acceptable for observability hints + // like Retry-After clamping. + const CircuitBreakerConfig& config() const { return config_; } + // Current open_until time. Used by ProxyTransaction to compute // Retry-After. Returns zero ns when not OPEN. std::chrono::steady_clock::time_point OpenUntil() const; + // Convenience predicate: whether OpenUntil() currently holds a + // non-zero deadline. Avoids callers hand-rolling the zero-epoch + // check against `time_since_epoch().count() > 0`. + bool IsOpenDeadlineSet() const { + return open_until_steady_ns_.load(std::memory_order_relaxed) > 0; + } + private: // Logging label: "service=X host=Y:Z partition=N" built once. std::string host_label_; diff --git a/include/upstream/proxy_transaction.h b/include/upstream/proxy_transaction.h index 6befe5a0..eba34973 100644 --- a/include/upstream/proxy_transaction.h +++ b/include/upstream/proxy_transaction.h @@ -175,11 +175,11 @@ class ProxyTransaction : public std::enable_shared_from_this { uint64_t admission_generation_ = 0; bool is_probe_ = false; - // Retry-budget token held by this transaction's most recent retry - // attempt. Phase 5 flips this to true on successful TryConsumeRetry - // and clears it on ReleaseRetry. Phase 4 declares the field so - // Cleanup() and Cancel() have something to check, but the retry - // loop does not yet consume the budget. + // TODO(phase-5): retry-budget token held by this transaction's most + // recent retry attempt. 
Phase 5 flips this to true on successful + // TryConsumeRetry and clears it on ReleaseRetry. Phase 4 declares + // the field so Cleanup() and Cancel() have something to check, but + // the retry loop does not yet consume the budget. bool retry_token_held_ = false; // Internal methods diff --git a/server/circuit_breaker_host.cc b/server/circuit_breaker_host.cc index b41635a6..4523d3be 100644 --- a/server/circuit_breaker_host.cc +++ b/server/circuit_breaker_host.cc @@ -30,9 +30,11 @@ CircuitBreakerHost::CircuitBreakerHost(std::string service_name, slices_.reserve(partition_count); for (size_t i = 0; i < partition_count; ++i) { // Per-slice label for logs — lets operators grep logs for a - // specific host:partition pair. - std::string label = service_name_ + ":" + host_ + ":" + - std::to_string(port_) + " p=" + std::to_string(i); + // specific host:partition pair. Key=value form matches the + // format documented in circuit_breaker_slice.h:host_label_. + std::string label = "service=" + service_name_ + + " host=" + host_ + ":" + std::to_string(port_) + + " partition=" + std::to_string(i); slices_.emplace_back(std::make_unique( std::move(label), i, config_)); } diff --git a/server/circuit_breaker_manager.cc b/server/circuit_breaker_manager.cc index 7e4a8035..9e4934a3 100644 --- a/server/circuit_breaker_manager.cc +++ b/server/circuit_breaker_manager.cc @@ -9,6 +9,25 @@ CircuitBreakerManager::CircuitBreakerManager( size_t partition_count, std::vector> dispatchers) : dispatchers_(std::move(dispatchers)) { + // Invariant (production path): slices are indexed by dispatcher, + // so partition_count must match dispatcher count. Any divergence + // would cause every subsequent host->Reload() to silently skip + // (size-mismatch guard in CircuitBreakerHost::Reload) — fail + // loudly at startup instead of on reload. 
+ // + // Exception: pure unit tests that don't exercise Reload pass an + // empty dispatcher list; skip the check in that case so those + // tests can continue to allocate slices without wiring up live + // dispatchers. + if (!dispatchers_.empty() && partition_count != dispatchers_.size()) { + logging::Get()->critical( + "CircuitBreakerManager: partition_count ({}) != dispatcher count " + "({}) — topology mismatch", + partition_count, dispatchers_.size()); + throw std::invalid_argument( + "CircuitBreakerManager: partition_count must equal dispatcher count"); + } + // Build one Host per upstream regardless of .circuit_breaker.enabled. // Disabled hosts still need a live Slice so a later reload can flip // them on without re-wiring transition callbacks (design §3.1). diff --git a/server/proxy_transaction.cc b/server/proxy_transaction.cc index c263332e..ac5713c8 100644 --- a/server/proxy_transaction.cc +++ b/server/proxy_transaction.cc @@ -978,9 +978,18 @@ HttpResponse ProxyTransaction::MakeErrorResponse(int result_code) { return MakeRetryBudgetResponse(); } if (result_code == RESULT_CIRCUIT_OPEN) { - // MakeErrorResponse is static and has no `this` — the richer - // MakeCircuitOpenResponse(slice_) path is preferred. Fall back - // to a plain 503 here for the rare static-context invocation. + // The static factory has no `this`, so it cannot build the + // §12.1-compliant response (Retry-After / X-Circuit-Breaker / + // X-Upstream-Host). All in-class paths for CIRCUIT_OPEN use + // the non-static MakeCircuitOpenResponse() — reaching this + // branch means a future caller forgot that rule, and would + // silently serve a non-compliant 503. Log loudly so the + // mistake shows up in logs instead of producing a stealth + // regression against the public contract. 
+ logging::Get()->error( + "ProxyTransaction::MakeErrorResponse(RESULT_CIRCUIT_OPEN) " + "invoked from static context — use MakeCircuitOpenResponse() " + "to emit §12.1-compliant headers"); return HttpResponse::ServiceUnavailable(); } if (result_code == RESULT_CHECKOUT_FAILED || @@ -998,24 +1007,29 @@ HttpResponse ProxyTransaction::MakeCircuitOpenResponse() const { // (shouldn't happen on the circuit-open path — that path requires a // slice — but defense in depth). int retry_after_secs = 1; - if (slice_) { + if (slice_ && slice_->IsOpenDeadlineSet()) { auto open_until = slice_->OpenUntil(); - // OpenUntil returns a zero time_point when NOT OPEN. Checking - // against zero with steady_clock::time_point is fiddly; use - // time_since_epoch().count() > 0 as the "is-set" check. - if (open_until.time_since_epoch().count() > 0) { - auto now = std::chrono::steady_clock::now(); - auto diff = std::chrono::duration_cast( - open_until - now).count(); - // Clamp to [1, 300] — Retry-After=0 is silly, and an hour+ - // is misleading (ops usually want operators to check - // sooner). The breaker's open duration caps out around - // minutes; anything larger means we're dealing with a - // cascade and we should hint sooner. - if (diff < 1) diff = 1; - if (diff > 300) diff = 300; - retry_after_secs = static_cast(diff); - } + auto now = std::chrono::steady_clock::now(); + auto ms_remaining = std::chrono::duration_cast( + open_until - now).count(); + // Ceiling-round to seconds so we never advertise a window + // shorter than the actual remaining backoff (e.g. 5.9s → 6, + // not 5). Truncating by one second is enough to cause a + // well-behaved client to retry while the breaker is still OPEN + // and get another avoidable 503. 
+ int64_t diff = (ms_remaining + 999) / 1000; + // Clamp: Retry-After=0 is silly; upper bound tracks the + // configured max_open_duration_ms (clamped to 1s min), so we + // don't under-report backoff windows on operators who tune the + // breaker longer than 5 minutes. Absolute safety ceiling of + // 3600s (1 hour) — anything longer likely means the breaker + // is mis-configured and the hint is noise. + int cfg_cap_secs = static_cast( + std::max(1, slice_->config().max_open_duration_ms / 1000)); + int upper = std::min(cfg_cap_secs, 3600); + if (diff < 1) diff = 1; + if (diff > upper) diff = upper; + retry_after_secs = static_cast(diff); } HttpResponse resp; diff --git a/server/retry_budget.cc b/server/retry_budget.cc index 7246eb26..cc984e6d 100644 --- a/server/retry_budget.cc +++ b/server/retry_budget.cc @@ -33,12 +33,10 @@ RetryBudget::InFlightGuard RetryBudget::TrackInFlight() { } bool RetryBudget::TryConsumeRetry() { - // Snapshot counters with relaxed — the gate is an approximate - // capacity check, not a strict admission lock. Racing callers may - // both read cap=N and both try to reserve; the worst case is that - // both succeed and we momentarily sit at retries_in_flight_ = - // cap+1, which is acceptable for a traffic-shaping gate (unlike a - // security-critical gate). + // Snapshot tuning + in_flight once — cap is computed against a + // consistent slice. Retrying the cap math inside the CAS loop would + // just churn without improving accuracy (in_flight is inherently a + // moving target). int64_t in_flight = in_flight_.load(std::memory_order_relaxed); int pct = percent_.load(std::memory_order_relaxed); int min_conc = min_concurrency_.load(std::memory_order_relaxed); @@ -50,13 +48,25 @@ bool RetryBudget::TryConsumeRetry() { int64_t pct_cap = (in_flight * pct) / 100; int64_t cap = pct_cap > min_conc ? pct_cap : min_conc; + // Atomically reserve a slot: load current, verify under cap, CAS up + // by 1. 
Separate load + fetch_add would let N concurrent callers + // all observe current < cap and all increment past the cap — under + // the cross-dispatcher load the retry budget is meant to protect + // against, the gate would stop bounding anything. int64_t current = retries_in_flight_.load(std::memory_order_relaxed); - if (current >= cap) { - retries_rejected_.fetch_add(1, std::memory_order_relaxed); - return false; + while (current < cap) { + if (retries_in_flight_.compare_exchange_weak( + current, current + 1, + std::memory_order_acq_rel, + std::memory_order_relaxed)) { + return true; + } + // CAS failure — `current` was updated with the latest value; + // loop re-evaluates against cap. Spurious failures of the weak + // CAS are also handled by the retry. } - retries_in_flight_.fetch_add(1, std::memory_order_relaxed); - return true; + retries_rejected_.fetch_add(1, std::memory_order_relaxed); + return false; } void RetryBudget::ReleaseRetry() { diff --git a/test/circuit_breaker_phase4_test.h b/test/circuit_breaker_phase4_test.h index db2b095d..6a6cc2f5 100644 --- a/test/circuit_breaker_phase4_test.h +++ b/test/circuit_breaker_phase4_test.h @@ -424,6 +424,332 @@ void TestBareProxyWorks() { } +// --------------------------------------------------------------------------- +// Test 7: Retry-After header carries a sensible value — within [1, configured +// max_open_duration_ms / 1000], and in the right ballpark of OpenUntil()-now. +// --------------------------------------------------------------------------- +void TestRetryAfterHeaderValue() { + std::cout << "\n[TEST] CB Phase 4: Retry-After value correctness..." 
+ << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 2; + gw.http2.enabled = false; + // base_open_duration 2000ms, max 60_000ms — Retry-After should + // ceiling-round and fall inside [1, 60]. + auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3); + u.circuit_breaker.base_open_duration_ms = 2000; + u.circuit_breaker.max_open_duration_ms = 60000; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Trip the breaker. + for (int i = 0; i < 3; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + + // Capture the open-rejection response. + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + bool is_503 = TestHttpClient::HasStatus(r, 503); + + // Extract Retry-After integer value (case-insensitive header). + int retry_after = -1; + const char* markers[] = {"Retry-After:", "retry-after:"}; + for (const char* m : markers) { + auto pos = r.find(m); + if (pos == std::string::npos) continue; + pos += std::string(m).size(); + while (pos < r.size() && (r[pos] == ' ' || r[pos] == '\t')) ++pos; + int val = 0; + bool any = false; + while (pos < r.size() && r[pos] >= '0' && r[pos] <= '9') { + val = val * 10 + (r[pos] - '0'); + any = true; + ++pos; + } + if (any) { retry_after = val; break; } + } + + // Contract: value ≥ 1 and ≤ max_open_duration_ms / 1000 (60). + // For base_open_duration 2000ms the remaining-seconds at this + // moment is ≤ 2 (probably 1 or 2 after ceiling), so the upper + // sanity bound is generous but still rules out 300/3600-class + // buggy fallbacks. 
+ bool in_range = (retry_after >= 1 && retry_after <= 60); + bool reasonable = (retry_after >= 1 && retry_after <= 3); + + bool pass = is_503 && in_range && reasonable; + TestFramework::RecordTest( + "CB Phase 4: Retry-After value in range", pass, + pass ? "" : + "is_503=" + std::to_string(is_503) + + " retry_after=" + std::to_string(retry_after) + + " body=" + r.substr(0, 256)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 4: Retry-After value in range", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 8: Retry loop is terminal on CIRCUIT_OPEN — even with max_retries=3, +// a request that hits an OPEN breaker gets exactly ONE 503 (no retry-flavored +// second 503). Ensures ReportBreakerOutcome doesn't feed the reject back into +// the breaker and MaybeRetry stays out. +// --------------------------------------------------------------------------- +void TestCircuitOpenTerminalForRetry() { + std::cout << "\n[TEST] CB Phase 4: CIRCUIT_OPEN terminal for retry loop..." + << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 2; + gw.http2.enabled = false; + // Retries enabled on 5xx — if the breaker reject leaked into + // MaybeRetry, the test would see extra backend hits after the + // trip. Long open window so the breaker stays OPEN for the + // duration of the post-trip assertion (no HALF_OPEN probe + // admission racing the test). 
+ auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3); + u.proxy.retry.max_retries = 3; + u.proxy.retry.retry_on_5xx = true; + u.circuit_breaker.base_open_duration_ms = 30000; + u.circuit_breaker.max_open_duration_ms = 60000; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Trip the breaker. Each pre-trip request may retry up to 3 + // times (all failing 5xx), so backend sees up to 3*threshold=12 + // hits. That's acceptable — we just care about post-trip behavior. + for (int i = 0; i < 3; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 5000); + } + int pre_trip_hits = backend_hits.load(); + + // Post-trip request: expect a single 503 and NO new backend hits. + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + bool is_503 = TestHttpClient::HasStatus(r, 503); + int post_trip_hits = backend_hits.load(); + bool no_new_hits = (post_trip_hits == pre_trip_hits); + + bool pass = is_503 && no_new_hits; + TestFramework::RecordTest( + "CB Phase 4: CIRCUIT_OPEN terminal for retry", pass, + pass ? "" : + "is_503=" + std::to_string(is_503) + + " pre=" + std::to_string(pre_trip_hits) + + " post=" + std::to_string(post_trip_hits)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 4: CIRCUIT_OPEN terminal for retry", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 9: Dry-run mode — dry_run=true forwards rejected requests to the +// upstream (pass-through) but still increments the rejected_ counter so +// operators can observe the would-reject rate without production impact. +// --------------------------------------------------------------------------- +void TestDryRunPassthrough() { + std::cout << "\n[TEST] CB Phase 4: dry-run passthrough..." 
<< std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 2; + gw.http2.enabled = false; + auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3); + u.circuit_breaker.dry_run = true; // would-reject, but still forward + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Trip thresholds with 5 requests. All should reach backend (502), + // not a 503 — dry-run never short-circuits. + for (int i = 0; i < 5; ++i) { + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + if (!TestHttpClient::HasStatus(r, 502)) { + TestFramework::RecordTest( + "CB Phase 4: dry-run passthrough", false, + "request " + std::to_string(i) + + " expected 502, got: " + r.substr(0, 64)); + return; + } + } + + bool all_hit = (backend_hits.load() == 5); + + // Verify the slice observed trips/rejected even though traffic passed. + auto* mgr = gateway.GetUpstreamManager() ? + gateway.GetUpstreamManager()->GetCircuitBreakerManager() : + nullptr; + int64_t trips = 0, rejected = 0; + if (mgr) { + auto* host = mgr->GetHost("svc"); + if (host) { + auto snap = host->Snapshot(); + trips = snap.total_trips; + rejected = snap.total_rejected; + } + } + // At least one trip fired (consecutive_threshold=3 → slice + // transitioned at least once during the run), and the post-trip + // requests were counted as would-reject (rejected > 0). 
+ bool observed = (trips >= 1) && (rejected >= 1); + + bool pass = all_hit && observed; + TestFramework::RecordTest( + "CB Phase 4: dry-run passthrough", pass, + pass ? "" : + "hits=" + std::to_string(backend_hits.load()) + + " trips=" + std::to_string(trips) + + " rejected=" + std::to_string(rejected)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 4: dry-run passthrough", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 10: HALF_OPEN → CLOSED recovery round-trip through the proxy. Trip the +// breaker, wait for the open window to elapse, then serve success responses +// and assert the slice transitions back to CLOSED (consecutive_successes +// crosses the threshold — default 2 from DefaultCbConfig / phase-4 config). +// --------------------------------------------------------------------------- +void TestHalfOpenRecoveryRoundTrip() { + std::cout << "\n[TEST] CB Phase 4: HALF_OPEN → CLOSED recovery..." + << std::endl; + try { + std::atomic fail_mode{true}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&fail_mode](const HttpRequest&, HttpResponse& resp) { + if (fail_mode.load()) { + resp.Status(502).Body("err", "text/plain"); + } else { + resp.Status(200).Body("ok", "text/plain"); + } + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 2; + gw.http2.enabled = false; + auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3); + // Short open duration so recovery path finishes quickly. + u.circuit_breaker.base_open_duration_ms = 300; + u.circuit_breaker.max_open_duration_ms = 1000; + // Two probes needed to close (default permitted_half_open_calls=2). 
+ u.circuit_breaker.permitted_half_open_calls = 2; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Trip by hitting the failing backend. + for (int i = 0; i < 3; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + + // Flip backend to success and wait for the open window to elapse. + fail_mode.store(false); + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + + // Probe the proxy — each successful 200 advances HALF_OPEN toward + // CLOSED. Do more than permitted_half_open_calls; some will be + // rejected as half_open_full but the ones that are admitted will + // close the breaker. + bool saw_success = false; + for (int i = 0; i < 8; ++i) { + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + if (TestHttpClient::HasStatus(r, 200)) saw_success = true; + // Small gap between probes — HALF_OPEN only admits permitted + // probes per cycle; spacing lets subsequent probes observe a + // possibly-closed breaker. + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + } + + // Verify slice aggregate: at least one CLOSED transition observed + // (probe_successes >= 1 and total_trips == 1 — we only tripped once). + auto* mgr = gateway.GetUpstreamManager() ? + gateway.GetUpstreamManager()->GetCircuitBreakerManager() : + nullptr; + int64_t probe_succ = 0; + int open_parts = 0, half_open_parts = 0; + if (mgr) { + auto* host = mgr->GetHost("svc"); + if (host) { + auto snap = host->Snapshot(); + probe_succ = 0; + for (const auto& row : snap.slices) { + probe_succ += row.probe_successes; + } + open_parts = snap.open_partitions; + half_open_parts = snap.half_open_partitions; + } + } + + // Recovery complete: saw at least one 200 through the breaker, + // at least one probe success counted, and no partition still + // stuck in OPEN (HALF_OPEN may still linger on the unused slice, + // which is fine for a 2-partition setup). 
+ bool pass = saw_success && (probe_succ >= 1) && (open_parts == 0); + TestFramework::RecordTest( + "CB Phase 4: HALF_OPEN → CLOSED recovery", pass, + pass ? "" : + "saw_success=" + std::to_string(saw_success) + + " probe_succ=" + std::to_string(probe_succ) + + " open_parts=" + std::to_string(open_parts) + + " half_open_parts=" + std::to_string(half_open_parts)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 4: HALF_OPEN → CLOSED recovery", false, e.what()); + } +} + void RunAllTests() { std::cout << "\n" << std::string(60, '=') << std::endl; std::cout << "CIRCUIT BREAKER PHASE 4 - INTEGRATION TESTS" << std::endl; @@ -435,6 +761,10 @@ void RunAllTests() { TestSuccessResetsConsecutiveFailureCounter(); TestTripDrivesSliceState(); TestOpenBreakerShortCircuitsUpstreamCall(); + TestRetryAfterHeaderValue(); + TestCircuitOpenTerminalForRetry(); + TestDryRunPassthrough(); + TestHalfOpenRecoveryRoundTrip(); } } // namespace CircuitBreakerPhase4Tests diff --git a/test/run_test.cc b/test/run_test.cc index fbf84d49..ab7bdb9b 100644 --- a/test/run_test.cc +++ b/test/run_test.cc @@ -168,9 +168,11 @@ int main(int argc, char* argv[]) { // Run rate limit tests }else if(mode == "rate_limit" || mode == "-L"){ RateLimitTests::RunAllTests(); - // Run circuit breaker tests + // Run circuit breaker tests (phases 1-4: unit + phase3 + phase4) }else if(mode == "circuit_breaker" || mode == "-B"){ CircuitBreakerTests::RunAllTests(); + CircuitBreakerPhase3Tests::RunAllTests(); + CircuitBreakerPhase4Tests::RunAllTests(); // Show help }else if(mode == "help" || mode == "-h" || mode == "--help"){ PrintUsage(argv[0]); From a1ddde5818fcc5eecf23e40f43de5f6bac944f47 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 14:51:08 +0800 Subject: [PATCH 18/37] Fix review comment --- include/upstream/proxy_transaction.h | 12 +++ server/proxy_transaction.cc | 53 +++++++++- test/circuit_breaker_phase4_test.h | 144 +++++++++++++++++++++++++++ 3 files changed, 208 
insertions(+), 1 deletion(-) diff --git a/include/upstream/proxy_transaction.h b/include/upstream/proxy_transaction.h index eba34973..40886be4 100644 --- a/include/upstream/proxy_transaction.h +++ b/include/upstream/proxy_transaction.h @@ -245,4 +245,16 @@ class ProxyTransaction : public std::enable_shared_from_this { // result; the caller passes the appropriate kind for 5xx vs disconnect // vs timeout since the slice treats them differently only for logs. void ReportBreakerOutcome(int result_code); + + // ReleaseBreakerAdmissionNeutral: release the admission slot without + // counting a success or failure. Used when the transaction is aborted + // locally (Cancel() on client disconnect, cancelled_ early-return + // after checkout, etc.) before an upstream health signal was observed. + // + // Without this, a HALF_OPEN probe slot is stranded if the client + // disconnects mid-probe — the slice stays in half_open_full until an + // external reset. No-op if admission_generation_ == 0. Clears + // admission_generation_ so a following ReportBreakerOutcome is a + // no-op. + void ReleaseBreakerAdmissionNeutral(); }; diff --git a/server/proxy_transaction.cc b/server/proxy_transaction.cc index ac5713c8..31e5dfe3 100644 --- a/server/proxy_transaction.cc +++ b/server/proxy_transaction.cc @@ -179,6 +179,11 @@ void ProxyTransaction::OnCheckoutReady(UpstreamLease lease) { // returns to the pool for another request to use, instead of // sitting idle attached to a torn-down transaction. lease.Release(); + // Release the breaker admission neutrally — the upstream was + // never exercised, and stranding the slot would wedge a + // HALF_OPEN probe cycle. Cancel() may already have released; + // the helper is no-op in that case. 
+ ReleaseBreakerAdmissionNeutral(); return; } if (state_ != State::CHECKOUT_PENDING) { @@ -330,6 +335,13 @@ void ProxyTransaction::SendUpstreamRequest() { logging::Get()->warn("ProxyTransaction stale connection before send " "client_fd={} service={} attempt={}", client_fd_, service_name_, attempt_); + // Report to the breaker BEFORE retrying — MaybeRetry's + // AttemptCheckout will overwrite admission_generation_ on the + // next ConsultBreaker. Without this call, a probe in HALF_OPEN + // would leak its slot and the slice could stall in + // half_open_full; in CLOSED, the failure would be under-counted + // until the last retry ran through OnError. + ReportBreakerOutcome(RESULT_UPSTREAM_DISCONNECT); MaybeRetry(RetryPolicy::RetryCondition::UPSTREAM_DISCONNECT); return; } @@ -407,6 +419,8 @@ void ProxyTransaction::OnUpstreamData( "state={} attempt={}", client_fd_, service_name_, upstream_fd, static_cast(state_), attempt_); + // Report BEFORE retry — see stale-connection path above for why. + ReportBreakerOutcome(RESULT_UPSTREAM_DISCONNECT); MaybeRetry(RetryPolicy::RetryCondition::UPSTREAM_DISCONNECT); return; } @@ -822,6 +836,13 @@ void ProxyTransaction::Cancel() { if (state_ != State::INIT && state_ != State::CHECKOUT_PENDING) { poison_connection_ = true; } + // Release any held breaker admission neutrally before tearing down. + // A client disconnect during CHECKOUT_PENDING, mid-send, or mid- + // response leaves admission_generation_ set; without this neutral + // release a probe slot stays occupied and HALF_OPEN can stall in + // half_open_full until an external reset. No-op when no admission + // is held (INIT, or an outcome already reported). + ReleaseBreakerAdmissionNeutral(); // Release the upstream lease back to the pool (or destroy it if // poisoned) and clear transport callbacks so any in-flight upstream // bytes land harmlessly. 
@@ -939,6 +960,13 @@ void ProxyTransaction::ArmResponseTimeout(int explicit_budget_ms) { if (self->state_ == State::SENDING_REQUEST || self->state_ == State::AWAITING_RESPONSE || self->state_ == State::RECEIVING_BODY) { + // Report BEFORE retry — MaybeRetry's AttemptCheckout will + // overwrite admission_generation_ on the next + // ConsultBreaker, stranding the current attempt's + // admission (probe slot leaks in HALF_OPEN; CLOSED + // under-counts the failure until the last retry hits + // OnError). + self->ReportBreakerOutcome(RESULT_RESPONSE_TIMEOUT); self->MaybeRetry(RetryPolicy::RetryCondition::RESPONSE_TIMEOUT); } else { self->OnError(RESULT_RESPONSE_TIMEOUT, "Response timeout"); @@ -1024,8 +1052,16 @@ HttpResponse ProxyTransaction::MakeCircuitOpenResponse() const { // breaker longer than 5 minutes. Absolute safety ceiling of // 3600s (1 hour) — anything longer likely means the breaker // is mis-configured and the hint is noise. + // + // Ceil the cap: floor-rounding max_open_duration_ms would + // under-report non-second-aligned configs. E.g. a 1500ms or + // 6500ms (exponential-backoff saturation) max floor-rounds to + // 1s/6s, advertising a shorter window than the breaker will + // actually honor. Clients retrying on the hint would hit + // another avoidable 503. + long long cfg_ms = slice_->config().max_open_duration_ms; int cfg_cap_secs = static_cast( - std::max(1, slice_->config().max_open_duration_ms / 1000)); + std::max(1, (cfg_ms + 999) / 1000)); int upper = std::min(cfg_cap_secs, 3600); if (diff < 1) diff = 1; if (diff > upper) diff = upper; @@ -1095,6 +1131,21 @@ bool ProxyTransaction::ConsultBreaker() { return true; } +void ProxyTransaction::ReleaseBreakerAdmissionNeutral() { + if (!slice_ || admission_generation_ == 0) return; + + uint64_t gen = admission_generation_; + admission_generation_ = 0; + bool probe = is_probe_; + is_probe_ = false; + + // Neutral release — no upstream health signal. 
Decrements the + // per-partition inflight (CLOSED) or the HALF_OPEN probe admitted + // counter, so a cancelled probe doesn't wedge the slice in + // half_open_full. + slice_->ReportNeutral(probe, gen); +} + void ProxyTransaction::ReportBreakerOutcome(int result_code) { // No slice, or already reported: bail. admission_generation_==0 is // the sentinel — slice domain generations start at 1, so a 0 gen diff --git a/test/circuit_breaker_phase4_test.h b/test/circuit_breaker_phase4_test.h index 6a6cc2f5..f6b4fa16 100644 --- a/test/circuit_breaker_phase4_test.h +++ b/test/circuit_breaker_phase4_test.h @@ -750,6 +750,148 @@ void TestHalfOpenRecoveryRoundTrip() { } } +// --------------------------------------------------------------------------- +// Test 11: Retry-After ceils the config cap from a non-second-aligned +// max_open_duration_ms (e.g. 1500ms → 2s, not 1s). Floor-rounding the cap +// would clamp the advertised retry window below what the breaker honors, +// causing well-behaved clients to re-hit the 503. +// --------------------------------------------------------------------------- +void TestRetryAfterCapCeilsNonAlignedMax() { + std::cout << "\n[TEST] CB Phase 4: Retry-After cap ceils non-aligned max..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 2; + gw.http2.enabled = false; + // Configure a non-second-aligned max backoff. base = 1500ms so + // the actual OpenUntil-now at trip time is ~1.5s, which ceil- + // rounds to 2s. If cfg_cap_secs floor-rounded max_open_duration + // (1500ms → 1s), the clamp would drop Retry-After to 1s even + // though the breaker would keep rejecting through the second + // half of that window. 
+ auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3); + u.circuit_breaker.base_open_duration_ms = 1500; + u.circuit_breaker.max_open_duration_ms = 1500; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + for (int i = 0; i < 3; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + + int retry_after = -1; + const char* markers[] = {"Retry-After:", "retry-after:"}; + for (const char* m : markers) { + auto pos = r.find(m); + if (pos == std::string::npos) continue; + pos += std::string(m).size(); + while (pos < r.size() && (r[pos] == ' ' || r[pos] == '\t')) ++pos; + int val = 0; + bool any = false; + while (pos < r.size() && r[pos] >= '0' && r[pos] <= '9') { + val = val * 10 + (r[pos] - '0'); + any = true; + ++pos; + } + if (any) { retry_after = val; break; } + } + + // Expectation: Retry-After is in [1, 2] — cfg_cap_secs ceil- + // rounds 1500ms to 2s, and the remaining-time ceil-rounds to + // 2 at the moment of trip (may be 1 if enough wall-clock has + // elapsed between trip and response). Critically it must NEVER + // be zero or exceed 2 (clamped to the 2s cap). + bool in_range = (retry_after >= 1 && retry_after <= 2); + TestFramework::RecordTest( + "CB Phase 4: Retry-After ceils non-aligned cap", in_range, + in_range ? "" : + "retry_after=" + std::to_string(retry_after)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 4: Retry-After ceils non-aligned cap", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 12: Retried failures are reported BEFORE the retry fires. 
With retries +// enabled on 5xx, each attempt's outcome must be counted against the breaker; +// otherwise the slice trips only after the final retry exhausts, under- +// counting failures and potentially never tripping if retries mask enough of +// them. Verifies the trip still happens within the expected number of client +// requests once reporting is attached to the retry path. +// --------------------------------------------------------------------------- +void TestRetriedFailuresCountTowardTrip() { + std::cout << "\n[TEST] CB Phase 4: retried failures count toward trip..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 2; + gw.http2.enabled = false; + // Retries on 5xx enabled. threshold=3 — with retry_on_5xx, each + // client request produces 1 + max_retries=3 = 4 upstream + // attempts, each reporting RESPONSE_5XX via the ReportBreakerOutcome + // path that this fix patches in. The breaker must trip after + // at most 3 upstream failure reports (which the first client + // request alone produces). + auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3); + u.proxy.retry.max_retries = 3; + u.proxy.retry.retry_on_5xx = true; + u.circuit_breaker.base_open_duration_ms = 30000; + u.circuit_breaker.max_open_duration_ms = 60000; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // One client request → 4 upstream attempts → 4 RESPONSE_5XX + // reports. Threshold=3 should trip during this single request. + TestHttpClient::HttpGet(gw_port, "/fail", 5000); + + // Second client request must hit the OPEN breaker → 503. 
+ std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + bool is_503 = TestHttpClient::HasStatus(r, 503); + bool has_breaker_header = + r.find("X-Circuit-Breaker: open") != std::string::npos || + r.find("x-circuit-breaker: open") != std::string::npos; + + bool pass = is_503 && has_breaker_header; + TestFramework::RecordTest( + "CB Phase 4: retried failures count toward trip", pass, + pass ? "" : + "is_503=" + std::to_string(is_503) + + " breaker_hdr=" + std::to_string(has_breaker_header) + + " body=" + r.substr(0, 256)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 4: retried failures count toward trip", false, e.what()); + } +} + void RunAllTests() { std::cout << "\n" << std::string(60, '=') << std::endl; std::cout << "CIRCUIT BREAKER PHASE 4 - INTEGRATION TESTS" << std::endl; @@ -765,6 +907,8 @@ void RunAllTests() { TestCircuitOpenTerminalForRetry(); TestDryRunPassthrough(); TestHalfOpenRecoveryRoundTrip(); + TestRetryAfterCapCeilsNonAlignedMax(); + TestRetriedFailuresCountTowardTrip(); } } // namespace CircuitBreakerPhase4Tests From 30fb10ea50e4a7a17a089f0b45d10a99531ccfa8 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 15:23:02 +0800 Subject: [PATCH 19/37] Fix review comment --- include/circuit_breaker/retry_budget.h | 10 ++- server/proxy_transaction.cc | 120 ++++++++++++++++--------- server/retry_budget.cc | 35 +++++--- test/circuit_breaker_phase3_test.h | 18 +++- test/circuit_breaker_phase4_test.h | 103 +++++++++++++++++++++ 5 files changed, 227 insertions(+), 59 deletions(-) diff --git a/include/circuit_breaker/retry_budget.h b/include/circuit_breaker/retry_budget.h index dd4da11c..001bfccb 100644 --- a/include/circuit_breaker/retry_budget.h +++ b/include/circuit_breaker/retry_budget.h @@ -16,7 +16,15 @@ namespace circuit_breaker { // Fix: cap concurrent retries as a fraction of concurrent non-retry // traffic plus a floor for low-volume correctness. 
// -// allowed_retries = max(min_concurrency, in_flight * percent / 100) +// allowed_retries = max(min_concurrency, +// (in_flight - retries_in_flight) * percent / 100) +// +// The subtraction is load-bearing: callers hold TrackInFlight() for +// BOTH first attempts and retries (so the guard's RAII paired with +// ReleaseRetry doesn't need a second counter on the hot path). +// Without subtracting retries, admitting a retry increases in_flight +// which increases the cap, and in steady state the effective ratio +// converges above the configured percent of original traffic. // // The retry budget is PER-HOST (one instance owned by CircuitBreakerHost, // shared across its partitions — the percent math is about aggregate diff --git a/server/proxy_transaction.cc b/server/proxy_transaction.cc index 31e5dfe3..192f4328 100644 --- a/server/proxy_transaction.cc +++ b/server/proxy_transaction.cc @@ -833,16 +833,30 @@ void ProxyTransaction::Cancel() { // In INIT and CHECKOUT_PENDING no bytes have left the client side // toward the upstream yet, so the connection (if any) is still // clean and safe to return to the pool. - if (state_ != State::INIT && state_ != State::CHECKOUT_PENDING) { + const bool upstream_exercised = + (state_ != State::INIT && state_ != State::CHECKOUT_PENDING); + if (upstream_exercised) { poison_connection_ = true; } - // Release any held breaker admission neutrally before tearing down. - // A client disconnect during CHECKOUT_PENDING, mid-send, or mid- - // response leaves admission_generation_ set; without this neutral - // release a probe slot stays occupied and HALF_OPEN can stall in - // half_open_full until an external reset. No-op when no admission - // is held (INIT, or an outcome already reported). - ReleaseBreakerAdmissionNeutral(); + // Release any held breaker admission before tearing down. 
Two paths: + // * Pre-upstream (INIT / CHECKOUT_PENDING): upstream was never + // touched — neutral release so a HALF_OPEN probe slot stays + // eligible for replacement (matches ReportNeutral's design + // contract: "the upstream wasn't actually exercised"). + // * Post-send (SENDING_REQUEST / AWAITING_RESPONSE / RECEIVING_BODY): + // we poisoned the pooled connection, which from the upstream's + // point of view is indistinguishable from a mid-flight disconnect. + // Report as UPSTREAM_DISCONNECT so the probe counts against the + // HALF_OPEN cycle (no replacement, re-trip on saw_failure drain) + // and CLOSED-state accounting sees the disruption instead of + // silently dropping a real signal. + // Both branches clear admission_generation_ internally, so late + // transport callbacks (if any) become no-ops. + if (upstream_exercised) { + ReportBreakerOutcome(RESULT_UPSTREAM_DISCONNECT); + } else { + ReleaseBreakerAdmissionNeutral(); + } // Release the upstream lease back to the pool (or destroy it if // poisoned) and clear transport callbacks so any in-flight upstream // bytes land harmlessly. @@ -1030,49 +1044,67 @@ HttpResponse ProxyTransaction::MakeErrorResponse(int result_code) { } HttpResponse ProxyTransaction::MakeCircuitOpenResponse() const { - // Compute Retry-After from slice->OpenUntil() if the slice is known. - // Falls back to a conservative 1-second hint if the slice is null - // (shouldn't happen on the circuit-open path — that path requires a - // slice — but defense in depth). + // TryAcquire() returns REJECTED_OPEN for three distinct situations: + // * True OPEN: slice is in OPEN state, IsOpenDeadlineSet() is true, + // Retry-After reflects remaining backoff from OpenUntil(). + // * HALF_OPEN reject (half_open_full or half_open_recovery_failing): + // slice transitioned HALF_OPEN via TransitionOpenToHalfOpen, which + // clears open_until. IsOpenDeadlineSet() is false. 
These rejects + // wait on the in-flight probe cycle completing (success → CLOSED, + // failure → re-trip with fresh backoff). Retry-After = 1 in this + // branch would under-report the likely wait on a re-trip; ceil to + // base_open_duration_ms as a conservative hint (the worst case is + // re-trip + fresh backoff window). + // Emit a distinct X-Circuit-Breaker label for observability so + // operators can separate "true OPEN" from "HALF_OPEN recovery back- + // pressure" on dashboards. int retry_after_secs = 1; - if (slice_ && slice_->IsOpenDeadlineSet()) { - auto open_until = slice_->OpenUntil(); - auto now = std::chrono::steady_clock::now(); - auto ms_remaining = std::chrono::duration_cast( - open_until - now).count(); - // Ceiling-round to seconds so we never advertise a window - // shorter than the actual remaining backoff (e.g. 5.9s → 6, - // not 5). Truncating by one second is enough to cause a - // well-behaved client to retry while the breaker is still OPEN - // and get another avoidable 503. - int64_t diff = (ms_remaining + 999) / 1000; - // Clamp: Retry-After=0 is silly; upper bound tracks the - // configured max_open_duration_ms (clamped to 1s min), so we - // don't under-report backoff windows on operators who tune the - // breaker longer than 5 minutes. Absolute safety ceiling of - // 3600s (1 hour) — anything longer likely means the breaker - // is mis-configured and the hint is noise. - // - // Ceil the cap: floor-rounding max_open_duration_ms would - // under-report non-second-aligned configs. E.g. a 1500ms or - // 6500ms (exponential-backoff saturation) max floor-rounds to - // 1s/6s, advertising a shorter window than the breaker will - // actually honor. Clients retrying on the hint would hit - // another avoidable 503. 
- long long cfg_ms = slice_->config().max_open_duration_ms; - int cfg_cap_secs = static_cast( - std::max(1, (cfg_ms + 999) / 1000)); - int upper = std::min(cfg_cap_secs, 3600); - if (diff < 1) diff = 1; - if (diff > upper) diff = upper; - retry_after_secs = static_cast(diff); + const char* breaker_label = "open"; + if (slice_) { + if (slice_->IsOpenDeadlineSet()) { + // True OPEN — Retry-After from actual deadline. + auto open_until = slice_->OpenUntil(); + auto now = std::chrono::steady_clock::now(); + auto ms_remaining = std::chrono::duration_cast( + open_until - now).count(); + // Ceiling-round to seconds so we never advertise a window + // shorter than the actual remaining backoff. + int64_t diff = (ms_remaining + 999) / 1000; + // Upper bound tracks the configured max_open_duration_ms + // (ceiling-rounded to avoid under-reporting non-second- + // aligned configs), with an absolute safety ceiling at + // 3600s. + long long cfg_ms = slice_->config().max_open_duration_ms; + int cfg_cap_secs = static_cast( + std::max(1, (cfg_ms + 999) / 1000)); + int upper = std::min(cfg_cap_secs, 3600); + if (diff < 1) diff = 1; + if (diff > upper) diff = upper; + retry_after_secs = static_cast(diff); + breaker_label = "open"; + } else if (slice_->CurrentState() == + circuit_breaker::State::HALF_OPEN) { + // HALF_OPEN reject — no deadline to read; hint the operator + // with a ceiled base_open_duration so retrying clients wait + // for at least the worst-case re-trip window instead of + // bouncing immediately on Retry-After=1. + long long base_ms = slice_->config().base_open_duration_ms; + int hint = static_cast( + std::max(1, (base_ms + 999) / 1000)); + retry_after_secs = std::min(hint, 3600); + breaker_label = "half_open"; + } + // Any other state (CLOSED): shouldn't reach here — ConsultBreaker + // only calls this on REJECTED_OPEN. Fall through with the + // conservative defaults (Retry-After=1, label="open") so a + // regression can't silently emit Retry-After=0. 
} HttpResponse resp; resp.Status(HttpStatus::SERVICE_UNAVAILABLE); resp.Text("Upstream circuit breaker is open; please retry later.\n"); resp.Header("Retry-After", std::to_string(retry_after_secs)); - resp.Header("X-Circuit-Breaker", "open"); + resp.Header("X-Circuit-Breaker", breaker_label); // Hint operators (not clients) at which upstream tripped. Useful // when a gateway fronts multiple backends; without this header, a // 503 is opaque. diff --git a/server/retry_budget.cc b/server/retry_budget.cc index cc984e6d..9723d949 100644 --- a/server/retry_budget.cc +++ b/server/retry_budget.cc @@ -33,19 +33,34 @@ RetryBudget::InFlightGuard RetryBudget::TrackInFlight() { } bool RetryBudget::TryConsumeRetry() { - // Snapshot tuning + in_flight once — cap is computed against a - // consistent slice. Retrying the cap math inside the CAS loop would - // just churn without improving accuracy (in_flight is inherently a - // moving target). + // Snapshot tuning + both in-flight counters once so the cap is + // computed against a consistent slice. Retrying the cap math inside + // the CAS loop would just churn without improving accuracy + // (in_flight is inherently a moving target). int64_t in_flight = in_flight_.load(std::memory_order_relaxed); + int64_t retries_in_flight = retries_in_flight_.load(std::memory_order_relaxed); int pct = percent_.load(std::memory_order_relaxed); int min_conc = min_concurrency_.load(std::memory_order_relaxed); - // cap = max(min_concurrency, in_flight * percent / 100) - // Integer math is fine — percent is 0..100, in_flight is an int64. - // Overflow is impossible within reasonable load levels (in_flight - // would need to exceed ~2e16 to overflow after multiplying by 100). 
- int64_t pct_cap = (in_flight * pct) / 100; + // cap = max(min_concurrency, (in_flight - retries_in_flight) * percent / 100) + // + // Subtracting retries from the in_flight base prevents the budget + // from self-inflating: callers hold TrackInFlight() for BOTH first- + // attempts and retries (per the documented API), so admitting a + // retry increases in_flight_. Using the raw in_flight as the base + // would then increase the cap, which in steady state converges + // above the configured percentage of ORIGINAL traffic (e.g. a 20% + // budget with retries counted in would allow ~25% of originals to + // retry simultaneously; at higher percents the amplification grows + // faster). + // + // Floor the subtraction at 0: `retries_in_flight > in_flight` is + // transiently possible under racing increments (retry admitted and + // in_flight guard observed before first-attempt guard's pair) — + // clamp rather than letting the multiply go negative. + int64_t non_retry_in_flight = in_flight - retries_in_flight; + if (non_retry_in_flight < 0) non_retry_in_flight = 0; + int64_t pct_cap = (non_retry_in_flight * pct) / 100; int64_t cap = pct_cap > min_conc ? pct_cap : min_conc; // Atomically reserve a slot: load current, verify under cap, CAS up @@ -53,7 +68,7 @@ bool RetryBudget::TryConsumeRetry() { // all observe current < cap and all increment past the cap — under // the cross-dispatcher load the retry budget is meant to protect // against, the gate would stop bounding anything. 
- int64_t current = retries_in_flight_.load(std::memory_order_relaxed); + int64_t current = retries_in_flight; while (current < cap) { if (retries_in_flight_.compare_exchange_weak( current, current + 1, diff --git a/test/circuit_breaker_phase3_test.h b/test/circuit_breaker_phase3_test.h index ba2f5554..87ed28e7 100644 --- a/test/circuit_breaker_phase3_test.h +++ b/test/circuit_breaker_phase3_test.h @@ -96,17 +96,27 @@ void TestRetryBudgetPercentCap() { try { RetryBudget rb(20, 0); // no min floor — pure percent - // Push in_flight to 50 via guards that we intentionally keep alive. + // Push in_flight to 50 via guards that we intentionally keep + // alive. Per the documented API, callers hold TrackInFlight() + // for BOTH first attempts and retries — but TryConsumeRetry + // subtracts retries_in_flight from the base so the budget + // doesn't self-inflate as retries are admitted. std::vector guards; for (int i = 0; i < 50; ++i) guards.push_back(rb.TrackInFlight()); - // 50 * 20% = 10 retries allowed. + // With 50 non-retry in-flight and 20% budget the first + // admission is against cap=10, but each admission shrinks the + // non-retry base by 1. The admission count converges at r + // where r >= floor((50-r) * 20 / 100). Solving: r = 8. The + // pre-fix formula (cap computed from raw in_flight) would + // admit 10, drifting the effective ratio above 20% of + // originals. int admitted = 0; for (int i = 0; i < 20; ++i) { if (rb.TryConsumeRetry()) ++admitted; } - bool cap_hit = admitted == 10; - bool rejected_count = rb.RetriesRejected() == 10; + bool cap_hit = admitted == 8; + bool rejected_count = rb.RetriesRejected() == 12; // Release guards — in_flight drops to 0; future TryConsumeRetry with // min=0 and in_flight=0 rejects everything. 
diff --git a/test/circuit_breaker_phase4_test.h b/test/circuit_breaker_phase4_test.h index f6b4fa16..64b68571 100644 --- a/test/circuit_breaker_phase4_test.h +++ b/test/circuit_breaker_phase4_test.h @@ -892,6 +892,108 @@ void TestRetriedFailuresCountTowardTrip() { } } +// --------------------------------------------------------------------------- +// Test 13: HALF_OPEN rejects emit a distinct X-Circuit-Breaker label. +// TryAcquire returns REJECTED_OPEN for three situations (true OPEN, +// half_open_full, half_open_recovery_failing). When the slice is in +// HALF_OPEN, OpenUntil is cleared and a generic MakeCircuitOpenResponse +// would fall back to Retry-After=1 + X-Circuit-Breaker:open — misleading +// clients. The fix emits X-Circuit-Breaker:half_open for HALF_OPEN rejects +// with a more conservative Retry-After hint. +// +// Strategy: trip the breaker, wait for the open window to elapse so the +// slice transitions HALF_OPEN on the next admission attempt, then flood +// concurrent requests so some hit half_open_full. +// --------------------------------------------------------------------------- +void TestHalfOpenRejectLabel() { + std::cout << "\n[TEST] CB Phase 4: HALF_OPEN reject label..." + << std::endl; + try { + // Backend hangs to keep probes in-flight so later concurrent + // requests hit half_open_full. 
+ std::atomic hang{false}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&hang](const HttpRequest&, HttpResponse& resp) { + if (hang.load()) { + std::this_thread::sleep_for(std::chrono::milliseconds(600)); + } + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 2; + gw.http2.enabled = false; + auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3); + u.circuit_breaker.base_open_duration_ms = 200; + u.circuit_breaker.max_open_duration_ms = 500; + u.circuit_breaker.permitted_half_open_calls = 1; // tiny budget + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Trip the breaker. + for (int i = 0; i < 3; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + // Wait for the open window to elapse so the next admission + // flips the slice to HALF_OPEN. + std::this_thread::sleep_for(std::chrono::milliseconds(300)); + + // Flip backend to hang so the probe occupies the single probe + // slot while we fire sibling requests that must hit half_open_full. + hang.store(true); + + std::atomic saw_half_open{false}; + std::atomic saw_open{false}; + auto probe = [&](int id) { + (void)id; + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 1500); + if (!TestHttpClient::HasStatus(r, 503)) return; + if (r.find("X-Circuit-Breaker: half_open") != std::string::npos || + r.find("x-circuit-breaker: half_open") != std::string::npos) { + saw_half_open.store(true); + } + if (r.find("X-Circuit-Breaker: open") != std::string::npos || + r.find("x-circuit-breaker: open") != std::string::npos) { + // We want to distinguish the labels; the "open" substring + // also matches "half_open". 
Only count true "open" if + // "half_open" didn't appear in THIS response. + if (r.find("half_open") == std::string::npos) { + saw_open.store(true); + } + } + }; + + std::vector threads; + for (int i = 0; i < 6; ++i) { + threads.emplace_back(probe, i); + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + } + for (auto& t : threads) t.join(); + + // Pass if at least one HALF_OPEN-labelled reject was observed. + // saw_open may or may not be observed (some rejects could have + // hit between cycles) — the key contract is that HALF_OPEN + // rejects no longer get the plain "open" label. + bool pass = saw_half_open.load(); + TestFramework::RecordTest( + "CB Phase 4: HALF_OPEN reject label", pass, + pass ? "" : + "saw_half_open=" + std::to_string(saw_half_open.load()) + + " saw_open=" + std::to_string(saw_open.load())); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 4: HALF_OPEN reject label", false, e.what()); + } +} + void RunAllTests() { std::cout << "\n" << std::string(60, '=') << std::endl; std::cout << "CIRCUIT BREAKER PHASE 4 - INTEGRATION TESTS" << std::endl; @@ -909,6 +1011,7 @@ void RunAllTests() { TestHalfOpenRecoveryRoundTrip(); TestRetryAfterCapCeilsNonAlignedMax(); TestRetriedFailuresCountTowardTrip(); + TestHalfOpenRejectLabel(); } } // namespace CircuitBreakerPhase4Tests From fccd6f5a7013637c1f9b14346c486ec4cb9233a4 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 15:55:55 +0800 Subject: [PATCH 20/37] Fix review comment --- .../circuit_breaker/circuit_breaker_slice.h | 14 ++ server/circuit_breaker_slice.cc | 5 + server/proxy_transaction.cc | 81 +++++----- test/circuit_breaker_phase4_test.h | 147 ++++++++++++++++++ 4 files changed, 208 insertions(+), 39 deletions(-) diff --git a/include/circuit_breaker/circuit_breaker_slice.h b/include/circuit_breaker/circuit_breaker_slice.h index 6e9734df..1c96dcd0 100644 --- a/include/circuit_breaker/circuit_breaker_slice.h +++ 
b/include/circuit_breaker/circuit_breaker_slice.h @@ -146,6 +146,20 @@ class CircuitBreakerSlice { return open_until_steady_ns_.load(std::memory_order_relaxed) > 0; } + // Expected next open-duration in milliseconds if the slice re-trips + // from its current state. Computed from base_open_duration_ms + // shifted by the current `consecutive_trips_` count and clamped by + // max_open_duration_ms. Used by the Retry-After hint path for + // HALF_OPEN rejections, where there's no stored deadline but the + // next OPEN window (if the probe cycle fails) will follow the + // exponential-backoff curve — base alone would under-report after + // multiple trips. + // + // Safe from any thread (atomic load of consecutive_trips_ + plain + // reads of config_ fields). Config fields are dispatcher-owned but + // a slightly-torn read is fine for an observability hint. + int64_t NextOpenDurationMs() const; + private: // Logging label: "service=X host=Y:Z partition=N" built once. std::string host_label_; diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index be9da56a..c34c25ae 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -62,6 +62,11 @@ std::chrono::nanoseconds CircuitBreakerSlice::ComputeOpenDuration() const { return std::chrono::milliseconds(scaled_ms); } +int64_t CircuitBreakerSlice::NextOpenDurationMs() const { + return std::chrono::duration_cast( + ComputeOpenDuration()).count(); +} + bool CircuitBreakerSlice::ShouldTripClosed( std::chrono::steady_clock::time_point now) { if (consecutive_failures_ >= config_.consecutive_failure_threshold) { diff --git a/server/proxy_transaction.cc b/server/proxy_transaction.cc index 192f4328..bc847368 100644 --- a/server/proxy_transaction.cc +++ b/server/proxy_transaction.cc @@ -833,30 +833,28 @@ void ProxyTransaction::Cancel() { // In INIT and CHECKOUT_PENDING no bytes have left the client side // toward the upstream yet, so the connection (if any) is still // clean and 
safe to return to the pool. - const bool upstream_exercised = - (state_ != State::INIT && state_ != State::CHECKOUT_PENDING); - if (upstream_exercised) { + if (state_ != State::INIT && state_ != State::CHECKOUT_PENDING) { poison_connection_ = true; } - // Release any held breaker admission before tearing down. Two paths: - // * Pre-upstream (INIT / CHECKOUT_PENDING): upstream was never - // touched — neutral release so a HALF_OPEN probe slot stays - // eligible for replacement (matches ReportNeutral's design - // contract: "the upstream wasn't actually exercised"). - // * Post-send (SENDING_REQUEST / AWAITING_RESPONSE / RECEIVING_BODY): - // we poisoned the pooled connection, which from the upstream's - // point of view is indistinguishable from a mid-flight disconnect. - // Report as UPSTREAM_DISCONNECT so the probe counts against the - // HALF_OPEN cycle (no replacement, re-trip on saw_failure drain) - // and CLOSED-state accounting sees the disruption instead of - // silently dropping a real signal. - // Both branches clear admission_generation_ internally, so late - // transport callbacks (if any) become no-ops. - if (upstream_exercised) { - ReportBreakerOutcome(RESULT_UPSTREAM_DISCONNECT); - } else { - ReleaseBreakerAdmissionNeutral(); - } + // Release any held breaker admission neutrally. Cancel() is always + // a LOCAL termination — client disconnect, framework-level abort, + // H2 stream reset, etc. Even when we poisoned a pooled connection + // mid-request, counting that as an upstream-health failure would + // trip the breaker against a backend that may be perfectly healthy + // (browser cancels, user-initiated timeouts, etc. are all common + // causes). The reviewer guidance is explicit: client-initiated + // aborts must be neutral from the breaker's perspective. 
+ // + // Trade-off: in HALF_OPEN, ReportNeutral on a probe decrements + // both inflight and admitted, so a cancelled probe makes the slot + // eligible for a replacement admission in the same cycle. That is + // the documented design contract of ReportNeutral ("the upstream + // wasn't actually exercised by this admission" from the breaker's + // decision-math point of view — we didn't observe a success or + // failure), and it is acceptable: probes that genuinely succeed + // or fail still close / re-trip the cycle normally, and a broken + // upstream under cancel-spam will still fail those real probes. + ReleaseBreakerAdmissionNeutral(); // Release the upstream lease back to the pool (or destroy it if // poisoned) and clear transport callbacks so any in-flight upstream // bytes land harmlessly. @@ -1060,9 +1058,18 @@ HttpResponse ProxyTransaction::MakeCircuitOpenResponse() const { // pressure" on dashboards. int retry_after_secs = 1; const char* breaker_label = "open"; + // Absolute sanity ceiling — independent of config. Protects against + // ridiculous programmatic values that might slip past validation. + static constexpr int RETRY_AFTER_ABS_MAX_SECS = 3600; // 1 hour if (slice_) { if (slice_->IsOpenDeadlineSet()) { - // True OPEN — Retry-After from actual deadline. + // True OPEN — Retry-After from the actual stored deadline. + // The deadline is authoritative: it's what the slice will + // actually honor, regardless of any subsequent config + // reload that might lower max_open_duration_ms. Clamping + // below the stored deadline would tell well-behaved clients + // to retry early and bounce on more 503s until the original + // deadline elapses. 
auto open_until = slice_->OpenUntil(); auto now = std::chrono::steady_clock::now(); auto ms_remaining = std::chrono::duration_cast( @@ -1070,28 +1077,24 @@ HttpResponse ProxyTransaction::MakeCircuitOpenResponse() const { // Ceiling-round to seconds so we never advertise a window // shorter than the actual remaining backoff. int64_t diff = (ms_remaining + 999) / 1000; - // Upper bound tracks the configured max_open_duration_ms - // (ceiling-rounded to avoid under-reporting non-second- - // aligned configs), with an absolute safety ceiling at - // 3600s. - long long cfg_ms = slice_->config().max_open_duration_ms; - int cfg_cap_secs = static_cast( - std::max(1, (cfg_ms + 999) / 1000)); - int upper = std::min(cfg_cap_secs, 3600); if (diff < 1) diff = 1; - if (diff > upper) diff = upper; + if (diff > RETRY_AFTER_ABS_MAX_SECS) diff = RETRY_AFTER_ABS_MAX_SECS; retry_after_secs = static_cast(diff); breaker_label = "open"; } else if (slice_->CurrentState() == circuit_breaker::State::HALF_OPEN) { - // HALF_OPEN reject — no deadline to read; hint the operator - // with a ceiled base_open_duration so retrying clients wait - // for at least the worst-case re-trip window instead of - // bouncing immediately on Retry-After=1. - long long base_ms = slice_->config().base_open_duration_ms; + // HALF_OPEN reject — no deadline to read. Hint with the + // NEXT expected open duration (base << consecutive_trips_, + // clamped by max_open_duration_ms) rather than base alone: + // after multiple trips, exponential backoff has already + // grown the OPEN window, and advertising bare base would + // tell clients to retry far earlier than the breaker will + // admit even in the worst case (probe cycle fails, slice + // re-trips into the larger backoff). 
+ int64_t next_ms = slice_->NextOpenDurationMs(); int hint = static_cast( - std::max(1, (base_ms + 999) / 1000)); - retry_after_secs = std::min(hint, 3600); + std::max(1, (next_ms + 999) / 1000)); + retry_after_secs = std::min(hint, RETRY_AFTER_ABS_MAX_SECS); breaker_label = "half_open"; } // Any other state (CLOSED): shouldn't reach here — ConsultBreaker diff --git a/test/circuit_breaker_phase4_test.h b/test/circuit_breaker_phase4_test.h index 64b68571..5b044952 100644 --- a/test/circuit_breaker_phase4_test.h +++ b/test/circuit_breaker_phase4_test.h @@ -994,6 +994,152 @@ void TestHalfOpenRejectLabel() { } } +// --------------------------------------------------------------------------- +// Test 14: HALF_OPEN Retry-After reflects the current exponential backoff, +// not just base_open_duration_ms. After multiple trips, the next OPEN window +// (if the probe cycle fails) is base << consecutive_trips, clamped by +// max_open_duration_ms. Advertising bare base would under-report the worst- +// case wait by a factor of 2^n. +// +// Strategy: trip → recover → trip → recover → trip to drive consecutive_trips +// up. Then hit HALF_OPEN during the next OPEN window elapse and assert +// Retry-After > base seconds. +// --------------------------------------------------------------------------- +void TestHalfOpenRetryAfterScalesWithBackoff() { + std::cout << "\n[TEST] CB Phase 4: HALF_OPEN Retry-After exponential..." + << std::endl; + try { + // Backend hangs on demand so we can pin the probe slot and + // observe HALF_OPEN rejections. 
+ std::atomic hang{false}; + std::atomic fail_mode{true}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&hang, &fail_mode](const HttpRequest&, + HttpResponse& resp) { + if (hang.load()) { + std::this_thread::sleep_for(std::chrono::milliseconds(800)); + } + if (fail_mode.load()) { + resp.Status(502).Body("err", "text/plain"); + } else { + resp.Status(200).Body("ok", "text/plain"); + } + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 2; + gw.http2.enabled = false; + // base=100ms, max=5000ms. After 3 trips the next duration is + // 100 << 3 = 800ms (< max), so HALF_OPEN's hint should be + // ceil(800/1000)=1s. But we only need to validate that the + // hint is >= 1s (which base alone would also produce from + // ceil(100/1000)=1). To get an observable difference, use a + // smaller base (50ms) and enough trips that 50 << N > 1000ms. + auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/2); + u.circuit_breaker.base_open_duration_ms = 100; // config minimum + u.circuit_breaker.max_open_duration_ms = 8000; // cap at 8s + u.circuit_breaker.permitted_half_open_calls = 1; // single probe + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Trip 1: two consecutive failures. + for (int i = 0; i < 2; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + // Wait past base (50ms → open window) so slice goes HALF_OPEN. + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + // Recovery: one probe success (flip fail_mode briefly). + fail_mode.store(false); + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + fail_mode.store(true); + + // Trip 2: two more failures. 
+ for (int i = 0; i < 2; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + std::this_thread::sleep_for(std::chrono::milliseconds(250)); + + // Recovery again. + fail_mode.store(false); + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + fail_mode.store(true); + + // Trip 3: two more failures. consecutive_trips should now be + // high enough that base << trips > 1000ms — HALF_OPEN hint + // should be >= 1 but potentially larger. + for (int i = 0; i < 2; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + // Wait for the open window to elapse and next admission + // transitions HALF_OPEN. + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + + // Pin the probe slot with a hanging request so subsequent + // requests get HALF_OPEN rejects. + hang.store(true); + std::thread probe([&]() { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + }); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 1500); + hang.store(false); + probe.join(); + + bool is_half_open = + r.find("X-Circuit-Breaker: half_open") != std::string::npos || + r.find("x-circuit-breaker: half_open") != std::string::npos; + + // Extract Retry-After. + int retry_after = -1; + const char* markers[] = {"Retry-After:", "retry-after:"}; + for (const char* m : markers) { + auto pos = r.find(m); + if (pos == std::string::npos) continue; + pos += std::string(m).size(); + while (pos < r.size() && (r[pos] == ' ' || r[pos] == '\t')) ++pos; + int val = 0; + bool any = false; + while (pos < r.size() && r[pos] >= '0' && r[pos] <= '9') { + val = val * 10 + (r[pos] - '0'); + any = true; + ++pos; + } + if (any) { retry_after = val; break; } + } + + // Pre-fix the HALF_OPEN hint was hard-coded to ceil(base/1000)=1s. + // Post-fix, with base=50ms and consecutive_trips ~= 3, the next + // open duration is 50 << 3 = 400ms → ceil = 1s (still 1). With + // trips ~= 5, 50 << 5 = 1600ms → ceil = 2s. 
So we need enough + // trips to cross the second boundary. The exact count depends + // on which partition the requests hit (aggregated sharding). + // Assert at least that we saw a HALF_OPEN response and + // Retry-After is at least 1 and at most max/1000=8 — both + // conservative lower/upper bounds of the exponential formula. + bool retry_after_ok = (retry_after >= 1 && retry_after <= 8); + bool pass = is_half_open && retry_after_ok; + TestFramework::RecordTest( + "CB Phase 4: HALF_OPEN Retry-After exponential-aware", pass, + pass ? "" : + "is_half_open=" + std::to_string(is_half_open) + + " retry_after=" + std::to_string(retry_after)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 4: HALF_OPEN Retry-After exponential-aware", + false, e.what()); + } +} + void RunAllTests() { std::cout << "\n" << std::string(60, '=') << std::endl; std::cout << "CIRCUIT BREAKER PHASE 4 - INTEGRATION TESTS" << std::endl; @@ -1012,6 +1158,7 @@ void RunAllTests() { TestRetryAfterCapCeilsNonAlignedMax(); TestRetriedFailuresCountTowardTrip(); TestHalfOpenRejectLabel(); + TestHalfOpenRetryAfterScalesWithBackoff(); } } // namespace CircuitBreakerPhase4Tests From 5ee26fd6ed951330ab521fa0b6d2036829eb5d35 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 18:45:37 +0800 Subject: [PATCH 21/37] Fix review comment --- test/circuit_breaker_phase4_test.h | 235 +++++++++++++++++------------ 1 file changed, 142 insertions(+), 93 deletions(-) diff --git a/test/circuit_breaker_phase4_test.h b/test/circuit_breaker_phase4_test.h index 5b044952..5626b77a 100644 --- a/test/circuit_breaker_phase4_test.h +++ b/test/circuit_breaker_phase4_test.h @@ -84,8 +84,12 @@ void TestBreakerTripsAfterConsecutiveFailures() { ServerConfig gw; gw.bind_host = "127.0.0.1"; gw.bind_port = 0; - gw.worker_threads = 2; - gw.http2.enabled = false; // match the existing proxy test pattern // single thread → single breaker partition exercised + gw.worker_threads = 1; + gw.http2.enabled = 
false; + // worker_threads=1 → all TCP connections land on dispatcher 0 + // (NetServer shards new connections by fd%worker_threads), so + // per-request failures accumulate deterministically on slice[0] + // instead of splitting across multiple slices. // single thread → single breaker partition exercised gw.upstreams.push_back( MakeBreakerUpstream("bad-svc", "127.0.0.1", backend_port, /*enabled=*/true, /*threshold=*/3)); @@ -159,8 +163,12 @@ void TestBreakerDisabledPassesThrough() { ServerConfig gw; gw.bind_host = "127.0.0.1"; gw.bind_port = 0; - gw.worker_threads = 2; - gw.http2.enabled = false; // match the existing proxy test pattern + gw.worker_threads = 1; + gw.http2.enabled = false; + // worker_threads=1 → all TCP connections land on dispatcher 0 + // (NetServer shards new connections by fd%worker_threads), so + // per-request failures accumulate deterministically on slice[0] + // instead of splitting across multiple slices. gw.upstreams.push_back( MakeBreakerUpstream("svc", "127.0.0.1", backend_port, /*enabled=*/false, /*threshold=*/3)); @@ -203,7 +211,13 @@ void TestSuccessResetsConsecutiveFailureCounter() { try { std::atomic fail_mode{true}; HttpServer backend("127.0.0.1", 0); - backend.Get("/toggle", [&fail_mode](const HttpRequest&, HttpResponse& resp) { + // Backend must serve /fail — that's the exact-match route the + // proxy forwards (MakeBreakerUpstream sets route_prefix="/fail", + // strip_prefix=false). A different backend path would leave + // the gateway 404-ing every request without ever exercising + // the proxy, and the CLOSED-state assertion below would pass + // for the wrong reason. 
+ backend.Get("/fail", [&fail_mode](const HttpRequest&, HttpResponse& resp) { if (fail_mode.load()) { resp.Status(502).Body("err", "text/plain"); } else { @@ -216,8 +230,12 @@ void TestSuccessResetsConsecutiveFailureCounter() { ServerConfig gw; gw.bind_host = "127.0.0.1"; gw.bind_port = 0; - gw.worker_threads = 2; - gw.http2.enabled = false; // match the existing proxy test pattern + gw.worker_threads = 1; + gw.http2.enabled = false; + // worker_threads=1 → all TCP connections land on dispatcher 0 + // (NetServer shards new connections by fd%worker_threads), so + // per-request failures accumulate deterministically on slice[0] + // instead of splitting across multiple slices. gw.upstreams.push_back( MakeBreakerUpstream("svc", "127.0.0.1", backend_port, /*enabled=*/true, /*threshold=*/3)); @@ -229,28 +247,35 @@ void TestSuccessResetsConsecutiveFailureCounter() { // Pattern: F F S F F — 5 total: 2 fails, 1 success, 2 fails. // With reset semantics, consecutive_failures_ never exceeds 2 → no trip. for (int i = 0; i < 2; ++i) { - TestHttpClient::HttpGet(gw_port, "/echo/toggle", 3000); // FAIL + TestHttpClient::HttpGet(gw_port, "/fail", 3000); // FAIL } fail_mode.store(false); - TestHttpClient::HttpGet(gw_port, "/echo/toggle", 3000); // SUCCESS → reset + TestHttpClient::HttpGet(gw_port, "/fail", 3000); // SUCCESS → reset fail_mode.store(true); for (int i = 0; i < 2; ++i) { - TestHttpClient::HttpGet(gw_port, "/echo/toggle", 3000); // FAIL + TestHttpClient::HttpGet(gw_port, "/fail", 3000); // FAIL } - // Inspect the breaker's state directly — it should still be CLOSED. + // Inspect the breaker's state directly. The slice must be CLOSED + // AND must have observed activity — without the second check, a + // gateway that 404's every request (e.g. because the proxy route + // doesn't match) would also pass trivially. auto* cbm = gateway.GetUpstreamManager() ? gateway.GetUpstreamManager()->GetCircuitBreakerManager() : nullptr; auto* host = cbm ? 
cbm->GetHost("svc") : nullptr; auto* slice = host ? host->GetSlice(0) : nullptr; bool still_closed = slice && slice->CurrentState() == State::CLOSED; + // No trip fired: total_trips should be zero for this slice. + int64_t trips = slice ? slice->Trips() : -1; + bool no_trips = (trips == 0); + bool pass = still_closed && no_trips; TestFramework::RecordTest( - "CB Phase 4: success resets consecutive counter", still_closed, - still_closed ? "" : - "slice not CLOSED after S resets failures: state=" + - std::to_string(static_cast( - slice ? slice->CurrentState() : State::CLOSED))); + "CB Phase 4: success resets consecutive counter", pass, + pass ? "" : + "state=" + std::to_string(static_cast( + slice ? slice->CurrentState() : State::CLOSED)) + + " trips=" + std::to_string(trips)); } catch (const std::exception& e) { TestFramework::RecordTest( "CB Phase 4: success resets consecutive counter", false, e.what()); @@ -275,8 +300,12 @@ void TestTripDrivesSliceState() { ServerConfig gw; gw.bind_host = "127.0.0.1"; gw.bind_port = 0; - gw.worker_threads = 2; - gw.http2.enabled = false; // match the existing proxy test pattern + gw.worker_threads = 1; + gw.http2.enabled = false; + // worker_threads=1 → all TCP connections land on dispatcher 0 + // (NetServer shards new connections by fd%worker_threads), so + // per-request failures accumulate deterministically on slice[0] + // instead of splitting across multiple slices. 
gw.upstreams.push_back( MakeBreakerUpstream("svc", "127.0.0.1", backend_port, /*enabled=*/true, /*threshold=*/3)); @@ -338,8 +367,12 @@ void TestOpenBreakerShortCircuitsUpstreamCall() { ServerConfig gw; gw.bind_host = "127.0.0.1"; gw.bind_port = 0; - gw.worker_threads = 2; - gw.http2.enabled = false; // match the existing proxy test pattern + gw.worker_threads = 1; + gw.http2.enabled = false; + // worker_threads=1 → all TCP connections land on dispatcher 0 + // (NetServer shards new connections by fd%worker_threads), so + // per-request failures accumulate deterministically on slice[0] + // instead of splitting across multiple slices. gw.upstreams.push_back( MakeBreakerUpstream("svc", "127.0.0.1", backend_port, /*enabled=*/true, /*threshold=*/3)); @@ -388,7 +421,7 @@ void TestBareProxyWorks() { ServerConfig gw; gw.bind_host = "127.0.0.1"; gw.bind_port = 0; - gw.worker_threads = 2; + gw.worker_threads = 1; gw.http2.enabled = false; UpstreamConfig u; u.name = "svc"; @@ -442,7 +475,7 @@ void TestRetryAfterHeaderValue() { ServerConfig gw; gw.bind_host = "127.0.0.1"; gw.bind_port = 0; - gw.worker_threads = 2; + gw.worker_threads = 1; gw.http2.enabled = false; // base_open_duration 2000ms, max 60_000ms — Retry-After should // ceiling-round and fall inside [1, 60]. 
@@ -526,7 +559,7 @@ void TestCircuitOpenTerminalForRetry() { ServerConfig gw; gw.bind_host = "127.0.0.1"; gw.bind_port = 0; - gw.worker_threads = 2; + gw.worker_threads = 1; gw.http2.enabled = false; // Retries enabled on 5xx — if the breaker reject leaked into // MaybeRetry, the test would see extra backend hits after the @@ -592,7 +625,7 @@ void TestDryRunPassthrough() { ServerConfig gw; gw.bind_host = "127.0.0.1"; gw.bind_port = 0; - gw.worker_threads = 2; + gw.worker_threads = 1; gw.http2.enabled = false; auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, /*enabled=*/true, /*threshold=*/3); @@ -674,7 +707,7 @@ void TestHalfOpenRecoveryRoundTrip() { ServerConfig gw; gw.bind_host = "127.0.0.1"; gw.bind_port = 0; - gw.worker_threads = 2; + gw.worker_threads = 1; gw.http2.enabled = false; auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, /*enabled=*/true, /*threshold=*/3); @@ -770,7 +803,7 @@ void TestRetryAfterCapCeilsNonAlignedMax() { ServerConfig gw; gw.bind_host = "127.0.0.1"; gw.bind_port = 0; - gw.worker_threads = 2; + gw.worker_threads = 1; gw.http2.enabled = false; // Configure a non-second-aligned max backoff. base = 1500ms so // the actual OpenUntil-now at trip time is ~1.5s, which ceil- @@ -848,7 +881,7 @@ void TestRetriedFailuresCountTowardTrip() { ServerConfig gw; gw.bind_host = "127.0.0.1"; gw.bind_port = 0; - gw.worker_threads = 2; + gw.worker_threads = 1; gw.http2.enabled = false; // Retries on 5xx enabled. 
threshold=3 — with retry_on_5xx, each // client request produces 1 + max_retries=3 = 4 upstream @@ -925,7 +958,7 @@ void TestHalfOpenRejectLabel() { ServerConfig gw; gw.bind_host = "127.0.0.1"; gw.bind_port = 0; - gw.worker_threads = 2; + gw.worker_threads = 1; gw.http2.enabled = false; auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, /*enabled=*/true, /*threshold=*/3); @@ -996,34 +1029,30 @@ void TestHalfOpenRejectLabel() { // --------------------------------------------------------------------------- // Test 14: HALF_OPEN Retry-After reflects the current exponential backoff, -// not just base_open_duration_ms. After multiple trips, the next OPEN window -// (if the probe cycle fails) is base << consecutive_trips, clamped by -// max_open_duration_ms. Advertising bare base would under-report the worst- -// case wait by a factor of 2^n. +// not just base_open_duration_ms. After multiple trips the next OPEN window +// (base << consecutive_trips_, clamped by max) can exceed 1 second; the old +// base-only hint (ceil(base/1000) = 1s for base=100ms) would under-report +// the worst-case wait, which this test must fail for. // -// Strategy: trip → recover → trip → recover → trip to drive consecutive_trips -// up. Then hit HALF_OPEN during the next OPEN window elapse and assert -// Retry-After > base seconds. +// Strategy: keep the backend failing and drive MULTIPLE re-trips by letting +// the OPEN window elapse and single probe fail each cycle. Successful +// recoveries must be avoided — TransitionHalfOpenToClosed resets +// consecutive_trips_ to 0, which hides the exponential hint. // --------------------------------------------------------------------------- void TestHalfOpenRetryAfterScalesWithBackoff() { std::cout << "\n[TEST] CB Phase 4: HALF_OPEN Retry-After exponential..." << std::endl; try { - // Backend hangs on demand so we can pin the probe slot and - // observe HALF_OPEN rejections. + // Backend fails fast by default. 
When `hang` is set, the + // handler blocks — used at the end to pin the probe slot so + // a concurrent request observes HALF_OPEN rejection. std::atomic hang{false}; - std::atomic fail_mode{true}; HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [&hang, &fail_mode](const HttpRequest&, - HttpResponse& resp) { + backend.Get("/fail", [&hang](const HttpRequest&, HttpResponse& resp) { if (hang.load()) { - std::this_thread::sleep_for(std::chrono::milliseconds(800)); - } - if (fail_mode.load()) { - resp.Status(502).Body("err", "text/plain"); - } else { - resp.Status(200).Body("ok", "text/plain"); + std::this_thread::sleep_for(std::chrono::milliseconds(1500)); } + resp.Status(502).Body("err", "text/plain"); }); TestServerRunner backend_runner(backend); int backend_port = backend_runner.GetPort(); @@ -1031,14 +1060,8 @@ void TestHalfOpenRetryAfterScalesWithBackoff() { ServerConfig gw; gw.bind_host = "127.0.0.1"; gw.bind_port = 0; - gw.worker_threads = 2; + gw.worker_threads = 1; // pin all traffic to slice[0] gw.http2.enabled = false; - // base=100ms, max=5000ms. After 3 trips the next duration is - // 100 << 3 = 800ms (< max), so HALF_OPEN's hint should be - // ceil(800/1000)=1s. But we only need to validate that the - // hint is >= 1s (which base alone would also produce from - // ceil(100/1000)=1). To get an observable difference, use a - // smaller base (50ms) and enough trips that 50 << N > 1000ms. auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, /*enabled=*/true, /*threshold=*/2); u.circuit_breaker.base_open_duration_ms = 100; // config minimum @@ -1050,46 +1073,78 @@ void TestHalfOpenRetryAfterScalesWithBackoff() { TestServerRunner gw_runner(gateway); int gw_port = gw_runner.GetPort(); - // Trip 1: two consecutive failures. - for (int i = 0; i < 2; ++i) { - TestHttpClient::HttpGet(gw_port, "/fail", 3000); + auto* cbm = gateway.GetUpstreamManager() ? + gateway.GetUpstreamManager()->GetCircuitBreakerManager() : nullptr; + auto* host = cbm ? 
cbm->GetHost("svc") : nullptr; + auto* slice = host ? host->GetSlice(0) : nullptr; + if (!slice) { + TestFramework::RecordTest( + "CB Phase 4: HALF_OPEN Retry-After exponential-aware", + false, "slice lookup failed"); + return; } - // Wait past base (50ms → open window) so slice goes HALF_OPEN. - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - // Recovery: one probe success (flip fail_mode briefly). - fail_mode.store(false); - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - fail_mode.store(true); - - // Trip 2: two more failures. + // Initial trip: 2 consecutive failures with threshold=2. for (int i = 0; i < 2; ++i) { TestHttpClient::HttpGet(gw_port, "/fail", 3000); } - std::this_thread::sleep_for(std::chrono::milliseconds(250)); - - // Recovery again. - fail_mode.store(false); - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - fail_mode.store(true); - // Trip 3: two more failures. consecutive_trips should now be - // high enough that base << trips > 1000ms — HALF_OPEN hint - // should be >= 1 but potentially larger. - for (int i = 0; i < 2; ++i) { + // Drive consecutive_trips_ up by letting successive OPEN windows + // elapse and probes fail (no recovery → no reset). Stop when + // NextOpenDurationMs crosses 1000ms, which is the threshold + // where the HALF_OPEN Retry-After hint starts exceeding the + // base-only value (ceil(100ms)=1s). + // + // The slice re-trips on each failed probe; each trip doubles + // the open duration. We run ~8 cycles with safety margin which + // is comfortably past the trip count needed for Retry-After>=2. + for (int cycle = 0; cycle < 8; ++cycle) { + // Wait past the current open window. Upper bound: max=8s, + // so 1200ms is plenty for the first few short cycles, and + // we re-check after each request anyway. 
+ int64_t next_ms = slice->NextOpenDurationMs(); + // Current OPEN window is the one stored BEFORE the upcoming + // re-trip — we don't have that directly, so sleep past the + // NEXT duration as an over-approximation (next is always >= + // current). This ensures OPEN has elapsed. + auto sleep_ms = std::max(next_ms + 50, 200); + if (sleep_ms > 2000) sleep_ms = 2000; // cap per cycle + std::this_thread::sleep_for(std::chrono::milliseconds(sleep_ms)); + + // One request — it should admit as a probe (HALF_OPEN), + // the backend fails fast (502), probe fails → re-trip with + // consecutive_trips_++ and fresh OPEN. TestHttpClient::HttpGet(gw_port, "/fail", 3000); + + // Bail early once the exponential hint crosses 1s → the + // subsequent HALF_OPEN reject will carry Retry-After >= 2. + if (slice->NextOpenDurationMs() >= 2000) break; } - // Wait for the open window to elapse and next admission - // transitions HALF_OPEN. - std::this_thread::sleep_for(std::chrono::milliseconds(500)); - // Pin the probe slot with a hanging request so subsequent - // requests get HALF_OPEN rejects. + int64_t next_open_ms = slice->NextOpenDurationMs(); + if (next_open_ms < 2000) { + TestFramework::RecordTest( + "CB Phase 4: HALF_OPEN Retry-After exponential-aware", + false, + "setup failed: next_open_ms=" + std::to_string(next_open_ms) + + " (need >= 2000 to distinguish from base-only hint)"); + return; + } + + // Now trigger a HALF_OPEN reject: wait for current OPEN to + // elapse, start a hanging probe (pins the slot), then fire a + // sibling request — it must see half_open_full with the + // exponential Retry-After. 
+ int64_t post_wait_ms = next_open_ms + 100; + if (post_wait_ms > 4000) post_wait_ms = 4000; + std::this_thread::sleep_for(std::chrono::milliseconds(post_wait_ms)); + hang.store(true); std::thread probe([&]() { - TestHttpClient::HttpGet(gw_port, "/fail", 3000); + TestHttpClient::HttpGet(gw_port, "/fail", 3500); }); - std::this_thread::sleep_for(std::chrono::milliseconds(100)); + // Let the probe get admitted and start hanging. + std::this_thread::sleep_for(std::chrono::milliseconds(200)); std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 1500); hang.store(false); @@ -1099,7 +1154,6 @@ void TestHalfOpenRetryAfterScalesWithBackoff() { r.find("X-Circuit-Breaker: half_open") != std::string::npos || r.find("x-circuit-breaker: half_open") != std::string::npos; - // Extract Retry-After. int retry_after = -1; const char* markers[] = {"Retry-After:", "retry-after:"}; for (const char* m : markers) { @@ -1117,22 +1171,17 @@ void TestHalfOpenRetryAfterScalesWithBackoff() { if (any) { retry_after = val; break; } } - // Pre-fix the HALF_OPEN hint was hard-coded to ceil(base/1000)=1s. - // Post-fix, with base=50ms and consecutive_trips ~= 3, the next - // open duration is 50 << 3 = 400ms → ceil = 1s (still 1). With - // trips ~= 5, 50 << 5 = 1600ms → ceil = 2s. So we need enough - // trips to cross the second boundary. The exact count depends - // on which partition the requests hit (aggregated sharding). - // Assert at least that we saw a HALF_OPEN response and - // Retry-After is at least 1 and at most max/1000=8 — both - // conservative lower/upper bounds of the exponential formula. - bool retry_after_ok = (retry_after >= 1 && retry_after <= 8); + // Post-fix: Retry-After = ceil(next_open_ms / 1000) >= 2. + // Pre-fix (base-only): Retry-After = ceil(base/1000) = 1. + // Asserting >= 2 fails the pre-fix implementation. 
+ bool retry_after_ok = (retry_after >= 2 && retry_after <= 8); bool pass = is_half_open && retry_after_ok; TestFramework::RecordTest( "CB Phase 4: HALF_OPEN Retry-After exponential-aware", pass, pass ? "" : "is_half_open=" + std::to_string(is_half_open) + - " retry_after=" + std::to_string(retry_after)); + " retry_after=" + std::to_string(retry_after) + + " next_open_ms=" + std::to_string(next_open_ms)); } catch (const std::exception& e) { TestFramework::RecordTest( "CB Phase 4: HALF_OPEN Retry-After exponential-aware", From f08fbe358c6e3bba32bb610aed0ee8317bf7f660 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 19:39:27 +0800 Subject: [PATCH 22/37] Finished Phase 5: Retry budget integration --- Makefile | 2 +- include/upstream/proxy_transaction.h | 39 ++- server/proxy_transaction.cc | 69 ++++- test/circuit_breaker_phase5_test.h | 366 +++++++++++++++++++++++++++ test/run_test.cc | 7 +- 5 files changed, 474 insertions(+), 9 deletions(-) create mode 100644 test/circuit_breaker_phase5_test.h diff --git a/Makefile b/Makefile index 2dbd8c2a..80f5f9a1 100644 --- a/Makefile +++ b/Makefile @@ -147,7 +147,7 @@ UPSTREAM_HEADERS = $(LIB_DIR)/upstream/upstream_manager.h $(LIB_DIR)/upstream/up RATE_LIMIT_HEADERS = $(LIB_DIR)/rate_limit/token_bucket.h $(LIB_DIR)/rate_limit/rate_limit_zone.h $(LIB_DIR)/rate_limit/rate_limiter.h CIRCUIT_BREAKER_HEADERS = $(LIB_DIR)/circuit_breaker/circuit_breaker_state.h $(LIB_DIR)/circuit_breaker/circuit_breaker_window.h $(LIB_DIR)/circuit_breaker/circuit_breaker_slice.h $(LIB_DIR)/circuit_breaker/retry_budget.h $(LIB_DIR)/circuit_breaker/circuit_breaker_host.h $(LIB_DIR)/circuit_breaker/circuit_breaker_manager.h CLI_HEADERS = $(LIB_DIR)/cli/cli_parser.h $(LIB_DIR)/cli/signal_handler.h $(LIB_DIR)/cli/pid_file.h $(LIB_DIR)/cli/version.h $(LIB_DIR)/cli/daemonizer.h -TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h 
$(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h $(TEST_DIR)/proxy_test.h $(TEST_DIR)/rate_limit_test.h $(TEST_DIR)/kqueue_test.h $(TEST_DIR)/circuit_breaker_test.h $(TEST_DIR)/circuit_breaker_phase3_test.h $(TEST_DIR)/circuit_breaker_phase4_test.h +TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h $(TEST_DIR)/proxy_test.h $(TEST_DIR)/rate_limit_test.h $(TEST_DIR)/kqueue_test.h $(TEST_DIR)/circuit_breaker_test.h $(TEST_DIR)/circuit_breaker_phase3_test.h $(TEST_DIR)/circuit_breaker_phase4_test.h $(TEST_DIR)/circuit_breaker_phase5_test.h # All headers combined HEADERS = $(CORE_HEADERS) $(CALLBACK_HEADERS) $(REACTOR_HEADERS) $(NETWORK_HEADERS) $(SERVER_HEADERS) $(THREAD_POOL_HEADERS) $(UTIL_HEADERS) $(FOUNDATION_HEADERS) $(HTTP_HEADERS) $(HTTP2_HEADERS) $(WS_HEADERS) $(TLS_HEADERS) $(UPSTREAM_HEADERS) $(RATE_LIMIT_HEADERS) $(CIRCUIT_BREAKER_HEADERS) $(CLI_HEADERS) $(TEST_HEADERS) diff --git a/include/upstream/proxy_transaction.h b/include/upstream/proxy_transaction.h index 40886be4..cded9b71 100644 --- a/include/upstream/proxy_transaction.h +++ b/include/upstream/proxy_transaction.h @@ -6,6 +6,7 @@ #include "upstream/header_rewriter.h" #include "upstream/retry_policy.h" #include "config/server_config.h" // ProxyConfig (stored by value) +#include "circuit_breaker/retry_budget.h" // RetryBudget::InFlightGuard (member-by-value) #include "http/http_callbacks.h" #include "http/http_response.h" // , , , , , provided by common.h @@ -17,7 +18,7 @@ class 
Dispatcher; namespace circuit_breaker { class CircuitBreakerSlice; -} +} // RetryBudget already defined via retry_budget.h class ProxyTransaction : public std::enable_shared_from_this { public: @@ -166,6 +167,23 @@ class ProxyTransaction : public std::enable_shared_from_this { // CircuitBreakerManager on HttpServer, which outlives this transaction. circuit_breaker::CircuitBreakerSlice* slice_ = nullptr; + // Per-host retry budget, resolved alongside `slice_` in Start() from + // the same CircuitBreakerHost. Null when there's no breaker attached + // for this service — in that case the transaction skips budget + // tracking entirely. Lifetime: the budget is owned by the host, + // which outlives this transaction (destruction order guaranteed by + // HttpServer member declaration). + circuit_breaker::RetryBudget* retry_budget_ = nullptr; + + // Per-attempt in-flight tracker. Held for the duration of each + // attempt (first try and retries alike). Replaced on every + // AttemptCheckout — move-assignment decrements the counter for the + // prior attempt and increments for the new one, so a retrying + // transaction stays at a single in_flight unit. Default-constructed + // guard is empty (counter_ = nullptr): used when retry_budget_ is + // null or before the first ConsultBreaker admission. + circuit_breaker::RetryBudget::InFlightGuard inflight_guard_; + // Per-ATTEMPT admission state. Reset on each call to ConsultBreaker(); // paired Report*() calls thread the `generation` back so the slice // can drop stale completions across state transitions (see @@ -175,11 +193,11 @@ class ProxyTransaction : public std::enable_shared_from_this { uint64_t admission_generation_ = 0; bool is_probe_ = false; - // TODO(phase-5): retry-budget token held by this transaction's most - // recent retry attempt. Phase 5 flips this to true on successful - // TryConsumeRetry and clears it on ReleaseRetry. 
Phase 4 declares - // the field so Cleanup() and Cancel() have something to check, but - // the retry loop does not yet consume the budget. + // Retry-budget token held by this transaction's current retry + // attempt (attempt_ > 0). Set true after a successful + // TryConsumeRetry in MaybeRetry; cleared by ReleaseRetryToken in + // Cleanup. Dry-run rejects proceed but the flag stays false — no + // token was consumed, so no ReleaseRetry is required. bool retry_token_held_ = false; // Internal methods @@ -257,4 +275,13 @@ class ProxyTransaction : public std::enable_shared_from_this { // admission_generation_ so a following ReportBreakerOutcome is a // no-op. void ReleaseBreakerAdmissionNeutral(); + + // Release the retry-budget token held by this attempt, if any. + // Idempotent via the retry_token_held_ flag — called from Cleanup + // between attempts (so the next retry's TryConsumeRetry sees a + // freshly-released counter) AND from the destructor / Cancel as + // safety nets. No-op when no budget was attached or no token was + // consumed (e.g. first attempt, or dry-run reject that didn't + // consume). + void ReleaseRetryToken(); }; diff --git a/server/proxy_transaction.cc b/server/proxy_transaction.cc index bc847368..020d898f 100644 --- a/server/proxy_transaction.cc +++ b/server/proxy_transaction.cc @@ -124,6 +124,13 @@ void ProxyTransaction::Start() { auto* host = cbm->GetHost(service_name_); if (host) { slice_ = host->GetSlice(static_cast(dispatcher_index_)); + // Retry budget is host-level (shared across partitions). + // Resolve from the same host so retry admission math stays + // consistent with the slice's dispatcher routing. Always + // non-null when the host exists (budget is unconditionally + // constructed by the host ctor). Null only when `host` + // itself is null. 
+ retry_budget_ = host->GetRetryBudget(); } } } @@ -146,6 +153,16 @@ void ProxyTransaction::AttemptCheckout() { return; } + // Track this attempt against the host-level retry budget's + // in_flight counter. Replaces any prior guard (from the previous + // attempt of the same transaction) — move-assignment decrements + // the old counter and takes ownership of the new, so a retrying + // transaction stays at exactly one in_flight unit throughout. No-op + // when retry_budget_ is null (no breaker attached for this service). + if (retry_budget_) { + inflight_guard_ = retry_budget_->TrackInFlight(); + } + auto self = shared_from_this(); // Lazily allocate the shared cancel token so the pool can drop @@ -671,7 +688,10 @@ void ProxyTransaction::MaybeRetry(RetryPolicy::RetryCondition condition) { client_fd_, service_name_, attempt_, static_cast(condition)); - // Release old lease, clear callbacks, poison if tainted + // Release old lease, clear callbacks, poison if tainted. + // Cleanup also releases any retry token held by the previous + // retry attempt (attempt_ > 1) so the next TryConsumeRetry sees + // a fresh counter. Cleanup(); codec_.Reset(); // Re-apply request method after reset — llhttp_init() zeroes @@ -680,6 +700,40 @@ void ProxyTransaction::MaybeRetry(RetryPolicy::RetryCondition condition) { codec_.SetRequestMethod(method_); poison_connection_ = false; + // Retry-budget gate. `attempt_ > 0` here is guaranteed — we + // just incremented. The budget bounds how many retries can be + // concurrently in flight against this upstream HOST (aggregated + // across all transactions for the service), preventing a retry + // storm from amplifying traffic to a struggling backend. + // + // Dry-run: log the would-reject but still proceed (consistent + // with REJECTED_OPEN_DRYRUN on the slice path). No token is + // consumed, so no ReleaseRetry is needed on the dry-run path. 
+ // + // Full mode: deliver the §12.2 retry-budget response (503 + + // X-Retry-Budget-Exhausted) and terminate. Does NOT call + // ReportBreakerOutcome — our own reject must not feed back + // into the slice's failure math. + if (retry_budget_) { + bool is_dry_run = slice_ && slice_->config().dry_run; + if (retry_budget_->TryConsumeRetry()) { + retry_token_held_ = true; + } else if (is_dry_run) { + logging::Get()->info( + "ProxyTransaction retry budget would-reject (dry-run) " + "client_fd={} service={} attempt={}", + client_fd_, service_name_, attempt_); + } else { + logging::Get()->warn( + "ProxyTransaction retry budget exhausted " + "client_fd={} service={} attempt={}", + client_fd_, service_name_, attempt_); + state_ = State::FAILED; + DeliverResponse(MakeRetryBudgetResponse()); + return; + } + } + // Condition-dependent first-retry policy: // Connection-level failures (stale keep-alive, connect refused) // are transient — a different pooled connection will succeed. @@ -862,6 +916,12 @@ void ProxyTransaction::Cancel() { } void ProxyTransaction::Cleanup() { + // Release any retry-budget token held by the attempt that just + // ended. Must happen BEFORE the next TryConsumeRetry in MaybeRetry + // so the new attempt sees accurate retries_in_flight. Idempotent + // via the retry_token_held_ flag. 
+ ReleaseRetryToken(); + if (lease_) { auto* conn = lease_.Get(); if (conn) { @@ -1166,6 +1226,13 @@ bool ProxyTransaction::ConsultBreaker() { return true; } +void ProxyTransaction::ReleaseRetryToken() { + if (retry_token_held_ && retry_budget_) { + retry_budget_->ReleaseRetry(); + } + retry_token_held_ = false; +} + void ProxyTransaction::ReleaseBreakerAdmissionNeutral() { if (!slice_ || admission_generation_ == 0) return; diff --git a/test/circuit_breaker_phase5_test.h b/test/circuit_breaker_phase5_test.h new file mode 100644 index 00000000..9b0c3f11 --- /dev/null +++ b/test/circuit_breaker_phase5_test.h @@ -0,0 +1,366 @@ +#pragma once + +// Phase 5 integration tests: retry budget wired into ProxyTransaction. +// +// Phase 3 covered the RetryBudget math (CAS, non-retry denominator, +// min-concurrency floor) as unit tests against the RetryBudget class in +// isolation. Phase 5 tests the INTEGRATION: ProxyTransaction resolves +// `retry_budget_` from the same CircuitBreakerHost as `slice_`, tracks +// every attempt's in_flight via the RAII guard, and consults +// `TryConsumeRetry` before each retry. Exhaustion emits the §12.2 +// response (503 + `X-Retry-Budget-Exhausted: 1`) and does NOT feed +// back into the slice's failure math. +// +// Strategy: backends that always 502 with `retry_on_5xx=true` drive the +// retry path. A near-zero retry-budget (`percent=0, min_concurrency=0`) +// rejects every retry deterministically without needing concurrent +// client load. The circuit-breaker consecutive-failure threshold is +// raised well above the retry count so the breaker stays CLOSED — the +// budget gate is tested in isolation from the state machine. 
+ +#include "test_framework.h" +#include "test_server_runner.h" +#include "http_test_client.h" +#include "http/http_server.h" +#include "config/server_config.h" + +#include +#include +#include +#include + +namespace CircuitBreakerPhase5Tests { + +// Upstream config that always proxies /fail, with the circuit breaker +// enabled so `retry_budget_` is resolved on `slice_`'s host. Breaker +// thresholds intentionally unreachable for these tests — we want the +// retry-budget gate fired in isolation, not co-tripping the state +// machine. +static UpstreamConfig MakeRetryBudgetUpstream(const std::string& name, + const std::string& host, + int port, + int retry_budget_percent, + int retry_budget_min_concurrency, + bool dry_run = false) { + UpstreamConfig u; + u.name = name; + u.host = host; + u.port = port; + u.pool.max_connections = 16; + u.pool.max_idle_connections = 8; + u.pool.connect_timeout_ms = 3000; + u.pool.idle_timeout_sec = 30; + u.pool.max_lifetime_sec = 3600; + u.pool.max_requests_per_conn = 0; + + u.proxy.route_prefix = "/fail"; + u.proxy.strip_prefix = false; + u.proxy.response_timeout_ms = 2000; + + u.circuit_breaker.enabled = true; + u.circuit_breaker.dry_run = dry_run; + // Breaker thresholds unreachable — we don't want the state machine + // tripping during a retry-budget test. 
+ u.circuit_breaker.consecutive_failure_threshold = 10000; + u.circuit_breaker.failure_rate_threshold = 100; + u.circuit_breaker.minimum_volume = 10000; + u.circuit_breaker.window_seconds = 10; + u.circuit_breaker.permitted_half_open_calls = 2; + u.circuit_breaker.base_open_duration_ms = 30000; + u.circuit_breaker.max_open_duration_ms = 60000; + + u.circuit_breaker.retry_budget_percent = retry_budget_percent; + u.circuit_breaker.retry_budget_min_concurrency = retry_budget_min_concurrency; + return u; +} + +static bool HasRetryBudgetHeader(const std::string& response) { + return response.find("X-Retry-Budget-Exhausted: 1") != std::string::npos || + response.find("x-retry-budget-exhausted: 1") != std::string::npos; +} + +// --------------------------------------------------------------------------- +// Test 1: A retry attempt rejected by the retry-budget gate delivers 503 + +// X-Retry-Budget-Exhausted instead of the upstream's 5xx. Verifies that +// `TryConsumeRetry` runs BEFORE the retry executes and that +// `MakeRetryBudgetResponse` is emitted through the standard DeliverResponse +// path. +// +// retry_budget_percent=0 + retry_budget_min_concurrency=0 → cap = 0. Every +// retry attempt's TryConsumeRetry returns false. First attempt is +// unaffected (budget only gates retries), so the backend is hit exactly +// once per client request; the retry is short-circuited locally. +// --------------------------------------------------------------------------- +void TestRetryBudgetRejectsRetry() { + std::cout << "\n[TEST] CB Phase 5: retry budget rejects retry..." 
+ << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + auto u = MakeRetryBudgetUpstream("svc", "127.0.0.1", backend_port, + /*percent=*/0, + /*min_concurrency=*/0); + u.proxy.retry.max_retries = 3; + u.proxy.retry.retry_on_5xx = true; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 5000); + + bool is_503 = TestHttpClient::HasStatus(r, 503); + bool has_budget_hdr = HasRetryBudgetHeader(r); + // Backend should have been hit exactly once (the first attempt); + // every retry was short-circuited by the budget gate. + int hits = backend_hits.load(std::memory_order_relaxed); + bool single_backend_hit = (hits == 1); + + bool pass = is_503 && has_budget_hdr && single_backend_hit; + TestFramework::RecordTest( + "CB Phase 5: retry budget rejects retry", pass, + pass ? "" : + "is_503=" + std::to_string(is_503) + + " budget_hdr=" + std::to_string(has_budget_hdr) + + " backend_hits=" + std::to_string(hits) + + " body=" + r.substr(0, 256)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 5: retry budget rejects retry", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 2: The min-concurrency floor admits retries even when the %-based +// cap would be zero. 
With percent=0 + min_concurrency=5, a single sequential +// client request's retry chain (1 first + 3 retries = 4 backend hits) all +// fit under the floor and proceed normally to the upstream — no 503, no +// X-Retry-Budget-Exhausted, and the client sees the final 5xx response. +// +// This is the symmetric test to Test 1: same near-zero %-cap, but a floor +// large enough that retries aren't budget-gated. Proves the floor is +// consulted (retries admitted) instead of the %-cap (retries rejected). +// --------------------------------------------------------------------------- +void TestRetryBudgetMinConcurrencyFloor() { + std::cout << "\n[TEST] CB Phase 5: retry budget min-concurrency floor..." + << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + // percent=0 → no %-based capacity. min_concurrency=5 → floor + // admits up to 5 concurrent retries, easily covering the 3 + // sequential retries from a single client request. + auto u = MakeRetryBudgetUpstream("svc", "127.0.0.1", backend_port, + /*percent=*/0, + /*min_concurrency=*/5); + u.proxy.retry.max_retries = 3; + u.proxy.retry.retry_on_5xx = true; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 10000); + + // Client sees the upstream's final 502 — no local 503, no + // X-Retry-Budget-Exhausted. 
+ bool is_502 = TestHttpClient::HasStatus(r, 502); + bool no_budget_hdr = !HasRetryBudgetHeader(r); + // 1 first attempt + 3 retries admitted by the floor = 4 backend hits. + int hits = backend_hits.load(std::memory_order_relaxed); + bool all_retries_proceeded = (hits == 4); + + bool pass = is_502 && no_budget_hdr && all_retries_proceeded; + TestFramework::RecordTest( + "CB Phase 5: retry budget min-concurrency floor", pass, + pass ? "" : + "is_502=" + std::to_string(is_502) + + " no_budget_hdr=" + std::to_string(no_budget_hdr) + + " backend_hits=" + std::to_string(hits) + + " body=" + r.substr(0, 256)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 5: retry budget min-concurrency floor", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 3: Dry-run bypasses the retry-budget gate. +// +// With percent=0 + min_concurrency=0 (same as Test 1), TryConsumeRetry +// returns false for every retry. But `circuit_breaker.dry_run=true` +// switches the rejection path to a log-and-proceed: no token is +// consumed, retry_token_held_ stays false, and AttemptCheckout runs as +// though the budget was unlimited. +// +// Result: the client sees the upstream's 502 response (because the +// retries actually fire), NOT a 503 + X-Retry-Budget-Exhausted. +// --------------------------------------------------------------------------- +void TestRetryBudgetDryRunPassthrough() { + std::cout << "\n[TEST] CB Phase 5: retry budget dry-run passthrough..." 
+ << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + auto u = MakeRetryBudgetUpstream("svc", "127.0.0.1", backend_port, + /*percent=*/0, + /*min_concurrency=*/0, + /*dry_run=*/true); + u.proxy.retry.max_retries = 2; + u.proxy.retry.retry_on_5xx = true; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 10000); + + // Retries proceeded despite would-reject decisions — the client + // sees the upstream's final 502, not our local 503. + bool is_502 = TestHttpClient::HasStatus(r, 502); + bool no_budget_hdr = !HasRetryBudgetHeader(r); + int hits = backend_hits.load(std::memory_order_relaxed); + bool all_attempts_ran = (hits == 3); // 1 first + 2 retries + + bool pass = is_502 && no_budget_hdr && all_attempts_ran; + TestFramework::RecordTest( + "CB Phase 5: retry budget dry-run passthrough", pass, + pass ? "" : + "is_502=" + std::to_string(is_502) + + " no_budget_hdr=" + std::to_string(no_budget_hdr) + + " backend_hits=" + std::to_string(hits) + + " body=" + r.substr(0, 256)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 5: retry budget dry-run passthrough", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 4: First attempts are NOT budget-gated. +// +// The retry-budget cap applies only to retries (attempt_ > 0). 
First +// attempts call TrackInFlight (which only ever increments) but skip +// TryConsumeRetry entirely. With percent=0 + min_concurrency=0 and a +// backend that always 200s, every client request must succeed — if the +// gate accidentally ran on first attempts, we'd see 503s here. +// +// Guards against a regression where TryConsumeRetry is called before +// the `attempt_ > 0` gate, or where the gate is placed in +// AttemptCheckout instead of MaybeRetry. +// --------------------------------------------------------------------------- +void TestFirstAttemptsNotGated() { + std::cout << "\n[TEST] CB Phase 5: first attempts not gated..." + << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + resp.Status(200).Body("ok", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + auto u = MakeRetryBudgetUpstream("svc", "127.0.0.1", backend_port, + /*percent=*/0, + /*min_concurrency=*/0); + // No retries — every request is a first attempt. + u.proxy.retry.max_retries = 0; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + int client_count = 5; + int successes = 0; + for (int i = 0; i < client_count; ++i) { + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + if (TestHttpClient::HasStatus(r, 200)) ++successes; + if (HasRetryBudgetHeader(r)) { + // Any X-Retry-Budget-Exhausted on a first-attempt-only + // path is a bug. Record and bail. 
+ TestFramework::RecordTest( + "CB Phase 5: first attempts not gated", false, + "unexpected X-Retry-Budget-Exhausted on first-attempt path " + "i=" + std::to_string(i)); + return; + } + } + + int hits = backend_hits.load(std::memory_order_relaxed); + bool pass = (successes == client_count) && (hits == client_count); + TestFramework::RecordTest( + "CB Phase 5: first attempts not gated", pass, + pass ? "" : + "successes=" + std::to_string(successes) + + "/" + std::to_string(client_count) + + " backend_hits=" + std::to_string(hits)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 5: first attempts not gated", false, e.what()); + } +} + +void RunAllTests() { + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "CIRCUIT BREAKER PHASE 5 - RETRY BUDGET INTEGRATION TESTS" + << std::endl; + std::cout << std::string(60, '=') << std::endl; + + TestRetryBudgetRejectsRetry(); + TestRetryBudgetMinConcurrencyFloor(); + TestRetryBudgetDryRunPassthrough(); + TestFirstAttemptsNotGated(); +} + +} // namespace CircuitBreakerPhase5Tests diff --git a/test/run_test.cc b/test/run_test.cc index ab7bdb9b..34d54367 100644 --- a/test/run_test.cc +++ b/test/run_test.cc @@ -16,6 +16,7 @@ #include "circuit_breaker_test.h" #include "circuit_breaker_phase3_test.h" #include "circuit_breaker_phase4_test.h" +#include "circuit_breaker_phase5_test.h" #include "test_framework.h" #include #include @@ -90,6 +91,9 @@ void RunAllTest(){ // ProxyTransaction + UpstreamManager + HttpServer) CircuitBreakerPhase4Tests::RunAllTests(); + // Run circuit breaker Phase 5 retry-budget integration tests + CircuitBreakerPhase5Tests::RunAllTests(); + std::cout << "====================================\n" << std::endl; } @@ -168,11 +172,12 @@ int main(int argc, char* argv[]) { // Run rate limit tests }else if(mode == "rate_limit" || mode == "-L"){ RateLimitTests::RunAllTests(); - // Run circuit breaker tests (phases 1-4: unit + phase3 + phase4) + // Run circuit breaker 
tests (phases 1-5: unit + phase3 + phase4 + phase5) }else if(mode == "circuit_breaker" || mode == "-B"){ CircuitBreakerTests::RunAllTests(); CircuitBreakerPhase3Tests::RunAllTests(); CircuitBreakerPhase4Tests::RunAllTests(); + CircuitBreakerPhase5Tests::RunAllTests(); // Show help }else if(mode == "help" || mode == "-h" || mode == "--help"){ PrintUsage(argv[0]); From 277b039c66a97791131e67c5569ecc864eb36234 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 20:04:01 +0800 Subject: [PATCH 23/37] Finished Phase 6: Wait-queue drain on trip --- Makefile | 2 +- include/upstream/pool_partition.h | 22 +++ include/upstream/upstream_manager.h | 10 ++ server/http_server.cc | 53 ++++++ server/pool_partition.cc | 35 ++++ server/upstream_manager.cc | 10 ++ test/circuit_breaker_phase6_test.h | 261 ++++++++++++++++++++++++++++ test/run_test.cc | 7 +- 8 files changed, 398 insertions(+), 2 deletions(-) create mode 100644 test/circuit_breaker_phase6_test.h diff --git a/Makefile b/Makefile index 80f5f9a1..45993b3b 100644 --- a/Makefile +++ b/Makefile @@ -147,7 +147,7 @@ UPSTREAM_HEADERS = $(LIB_DIR)/upstream/upstream_manager.h $(LIB_DIR)/upstream/up RATE_LIMIT_HEADERS = $(LIB_DIR)/rate_limit/token_bucket.h $(LIB_DIR)/rate_limit/rate_limit_zone.h $(LIB_DIR)/rate_limit/rate_limiter.h CIRCUIT_BREAKER_HEADERS = $(LIB_DIR)/circuit_breaker/circuit_breaker_state.h $(LIB_DIR)/circuit_breaker/circuit_breaker_window.h $(LIB_DIR)/circuit_breaker/circuit_breaker_slice.h $(LIB_DIR)/circuit_breaker/retry_budget.h $(LIB_DIR)/circuit_breaker/circuit_breaker_host.h $(LIB_DIR)/circuit_breaker/circuit_breaker_manager.h CLI_HEADERS = $(LIB_DIR)/cli/cli_parser.h $(LIB_DIR)/cli/signal_handler.h $(LIB_DIR)/cli/pid_file.h $(LIB_DIR)/cli/version.h $(LIB_DIR)/cli/daemonizer.h -TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h 
$(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h $(TEST_DIR)/proxy_test.h $(TEST_DIR)/rate_limit_test.h $(TEST_DIR)/kqueue_test.h $(TEST_DIR)/circuit_breaker_test.h $(TEST_DIR)/circuit_breaker_phase3_test.h $(TEST_DIR)/circuit_breaker_phase4_test.h $(TEST_DIR)/circuit_breaker_phase5_test.h +TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h $(TEST_DIR)/proxy_test.h $(TEST_DIR)/rate_limit_test.h $(TEST_DIR)/kqueue_test.h $(TEST_DIR)/circuit_breaker_test.h $(TEST_DIR)/circuit_breaker_phase3_test.h $(TEST_DIR)/circuit_breaker_phase4_test.h $(TEST_DIR)/circuit_breaker_phase5_test.h $(TEST_DIR)/circuit_breaker_phase6_test.h # All headers combined HEADERS = $(CORE_HEADERS) $(CALLBACK_HEADERS) $(REACTOR_HEADERS) $(NETWORK_HEADERS) $(SERVER_HEADERS) $(THREAD_POOL_HEADERS) $(UTIL_HEADERS) $(FOUNDATION_HEADERS) $(HTTP_HEADERS) $(HTTP2_HEADERS) $(WS_HEADERS) $(TLS_HEADERS) $(UPSTREAM_HEADERS) $(RATE_LIMIT_HEADERS) $(CIRCUIT_BREAKER_HEADERS) $(CLI_HEADERS) $(TEST_HEADERS) diff --git a/include/upstream/pool_partition.h b/include/upstream/pool_partition.h index f259204a..d23904ab 100644 --- a/include/upstream/pool_partition.h +++ b/include/upstream/pool_partition.h @@ -90,6 +90,28 @@ class PoolPartition { // completion. Same pattern as ScheduleInitiateShutdown. void ScheduleForceCloseActive(); + // Drain the wait queue on a CLOSED → OPEN breaker trip. 
+ // + // Every live waiter receives CHECKOUT_CIRCUIT_OPEN (mapped by + // ProxyTransaction::OnCheckoutError to RESULT_CIRCUIT_OPEN, emitting + // the §12.1 circuit-open response). Cancelled waiters are dropped + // silently — the transaction already tore its side down via the + // framework abort hook. Does NOT set shutting_down_ (this is a + // transient drain, not a shutdown); the partition keeps its + // connections for HALF_OPEN probing when the open window elapses. + // + // Dispatcher-thread-only. The breaker's transition callback fires + // on the slice's owning dispatcher thread — the SAME dispatcher + // that owns this partition (one slice ↔ one partition by + // dispatcher_index). No enqueue needed. + // + // Rationale: without this drain, a queued waiter admitted by + // ConsultBreaker just before the trip would wait out the full + // `open_duration_ms` (up to 60s by default) before the pool's + // queue timeout rejects it. That's a visible latency spike for + // clients who are about to be served 503 anyway. + void DrainWaitQueueOnTrip(); + bool IsShuttingDown() const { return shutting_down_; } // Stats (dispatcher-thread-only reads) diff --git a/include/upstream/upstream_manager.h b/include/upstream/upstream_manager.h index f647d3b3..346bc4d5 100644 --- a/include/upstream/upstream_manager.h +++ b/include/upstream/upstream_manager.h @@ -63,6 +63,16 @@ class UpstreamManager { // Check if an upstream service is configured bool HasUpstream(const std::string& service_name) const; + // Look up the PoolPartition for (service_name, dispatcher_index). + // Returns nullptr if service is unknown or dispatcher_index is out + // of range. Used by the circuit-breaker transition callback (wired + // in HttpServer::MarkServerReady) to drain the wait queue on a + // CLOSED → OPEN trip. Must be called on the dispatcher thread + // identified by `dispatcher_index` — the returned partition's + // DrainWaitQueueOnTrip is dispatcher-thread-only. 
+ PoolPartition* GetPoolPartition(const std::string& service_name, + size_t dispatcher_index); + // Install a non-owning pointer to the server's CircuitBreakerManager. // Called once from HttpServer::MarkServerReady after both managers are // constructed (§3.1). Lifetime guarantee: the CircuitBreakerManager diff --git a/server/http_server.cc b/server/http_server.cc index fbf06947..ccd8b80f 100644 --- a/server/http_server.cc +++ b/server/http_server.cc @@ -6,6 +6,9 @@ #include "upstream/upstream_manager.h" #include "upstream/proxy_handler.h" #include "circuit_breaker/circuit_breaker_manager.h" +#include "circuit_breaker/circuit_breaker_host.h" +#include "circuit_breaker/circuit_breaker_slice.h" +#include "upstream/pool_partition.h" #include "log/logger.h" #include "log/log_utils.h" #include @@ -377,6 +380,56 @@ void HttpServer::MarkServerReady() { upstream_configs_, dispatchers.size(), dispatchers); upstream_manager_->AttachCircuitBreakerManager( circuit_breaker_manager_.get()); + + // Wire CLOSED→OPEN transition callbacks for every slice of every + // host — regardless of `enabled=false`, per design §3.1 R3-1. A + // disabled slice never fires transitions (TryAcquire short- + // circuits to ADMITTED); wiring the callback costs nothing but + // lets a live reload flip enable=false→true without re-wiring. + // + // The callback routes trip events to the corresponding + // PoolPartition's DrainWaitQueueOnTrip so queued waiters fail + // fast with CHECKOUT_CIRCUIT_OPEN instead of waiting out the + // open window. Each slice gets a distinct callback that + // captures its (service, dispatcher_index) pair — we can't use + // SetTransitionCallbackOnAllSlices because that would install a + // single callback across slices that need different partition + // lookups. 
+ // + // Safe to capture raw `UpstreamManager*`: CircuitBreakerManager + // destructs BEFORE UpstreamManager (§3.1 ownership), and slice + // callbacks only fire on dispatcher threads which are stopped + // before either manager is destroyed. So any live callback + // invocation sees a valid UpstreamManager. + UpstreamManager* um = upstream_manager_.get(); + for (const auto& u : upstream_configs_) { + auto* host = circuit_breaker_manager_->GetHost(u.name); + if (!host) continue; + std::string service = u.name; + for (size_t i = 0; i < host->partition_count(); ++i) { + auto* slice = host->GetSlice(i); + if (!slice) continue; + slice->SetTransitionCallback( + [um, service, i](circuit_breaker::State old_s, + circuit_breaker::State new_s, + const char* /*trigger*/) { + // Drain only on CLOSED→OPEN. HALF_OPEN→OPEN + // doesn't need draining — in HALF_OPEN, non- + // probe admissions are already REJECTED_OPEN + // before reaching the pool queue, so the + // queue stays empty (or holds only probes, + // which are in-flight by the time HALF_OPEN + // trips back). + if (old_s == circuit_breaker::State::CLOSED && + new_s == circuit_breaker::State::OPEN) { + if (auto* part = um->GetPoolPartition( + service, i)) { + part->DrainWaitQueueOnTrip(); + } + } + }); + } + } } catch (...) { logging::Get()->error( "Circuit breaker init failed, stopping server"); diff --git a/server/pool_partition.cc b/server/pool_partition.cc index 819c941d..a0ba866c 100644 --- a/server/pool_partition.cc +++ b/server/pool_partition.cc @@ -549,6 +549,41 @@ void PoolPartition::InitiateShutdown() { MaybeSignalDrain(); } +void PoolPartition::DrainWaitQueueOnTrip() { + // Hoist alive_ — a waiter's error_callback may synchronously trigger + // a request completion path that tears down the partition (e.g. the + // test harness). Same pattern used by InitiateShutdown. 
+ auto alive = alive_; + + if (shutting_down_) { + // Already draining via InitiateShutdown — that path will send + // CHECKOUT_SHUTTING_DOWN to every waiter. Don't double-fire. + return; + } + + if (wait_queue_.empty()) return; + + logging::Get()->info( + "PoolPartition draining wait queue on breaker trip: {}:{} " + "queue_size={}", + upstream_host_, upstream_port_, wait_queue_.size()); + + while (!wait_queue_.empty()) { + auto entry = std::move(wait_queue_.front()); + wait_queue_.pop_front(); + // Cancelled waiters have no callback to fire — the transaction + // already tore its side down via the framework abort hook. + if (IsEntryCancelled(entry)) { + continue; + } + // CHECKOUT_CIRCUIT_OPEN — ProxyTransaction::OnCheckoutError maps + // to RESULT_CIRCUIT_OPEN and delivers MakeCircuitOpenResponse() + // without touching the breaker (our own reject, don't feed back). + entry.error_callback(CHECKOUT_CIRCUIT_OPEN); + if (!alive->load(std::memory_order_acquire)) return; + } +} + void PoolPartition::ForceCloseActive() { // Collect transports + borrower callbacks, then move to zombie, then // close transports, then notify borrowers. 
This ordering ensures: diff --git a/server/upstream_manager.cc b/server/upstream_manager.cc index 9cd5a284..c4a4314f 100644 --- a/server/upstream_manager.cc +++ b/server/upstream_manager.cc @@ -296,3 +296,13 @@ Dispatcher* UpstreamManager::GetDispatcherForIndex(size_t index) const { bool UpstreamManager::HasUpstream(const std::string& service_name) const { return pools_.find(service_name) != pools_.end(); } + +PoolPartition* UpstreamManager::GetPoolPartition( + const std::string& service_name, + size_t dispatcher_index) { + auto it = pools_.find(service_name); + if (it == pools_.end()) { + return nullptr; + } + return it->second->GetPartition(dispatcher_index); +} diff --git a/test/circuit_breaker_phase6_test.h b/test/circuit_breaker_phase6_test.h new file mode 100644 index 00000000..77eea2c1 --- /dev/null +++ b/test/circuit_breaker_phase6_test.h @@ -0,0 +1,261 @@ +#pragma once + +// Phase 6 integration tests: wait-queue drain on CLOSED → OPEN trip. +// +// Phase 4 already covered "new requests after a trip hit REJECTED_OPEN". +// Phase 6 covers the orthogonal case: a request that passed ConsultBreaker +// pre-trip and is waiting in the pool's bounded wait queue when the trip +// fires. Without the drain, that waiter would sit until either the pool +// frees a slot (and then re-hit the upstream — pointless traffic) or the +// queue-timeout / open-duration elapses (up to 60s latency spike). +// +// Mechanism tested: `HttpServer::MarkServerReady` installs a transition +// callback on every slice that routes CLOSED → OPEN to the corresponding +// `PoolPartition::DrainWaitQueueOnTrip()`. Each waiter receives +// `CHECKOUT_CIRCUIT_OPEN`, which `ProxyTransaction::OnCheckoutError` maps +// to the standard circuit-open response (503 + `X-Circuit-Breaker: open`). +// +// Strategy: gate concurrency via a 1-connection pool. The first request +// hangs at the backend long enough to let a second request queue behind +// it. 
When the first's response lands (502), the breaker trips and the +// drain fires, causing the queued request to receive 503 + circuit-open +// headers instead of the backend's 502 (which would happen if the drain +// were missing and the queued request proceeded). + +#include "test_framework.h" +#include "test_server_runner.h" +#include "http_test_client.h" +#include "http/http_server.h" +#include "config/server_config.h" + +#include +#include +#include +#include +#include + +namespace CircuitBreakerPhase6Tests { + +static UpstreamConfig MakeDrainTripUpstream(const std::string& name, + const std::string& host, + int port, + bool breaker_enabled) { + UpstreamConfig u; + u.name = name; + u.host = host; + u.port = port; + // Single connection per partition — forces the second concurrent + // request to queue behind the first. Since tests run with + // worker_threads=1, one partition exists and it has exactly one + // connection slot. + u.pool.max_connections = 1; + u.pool.max_idle_connections = 1; + u.pool.connect_timeout_ms = 3000; + u.pool.idle_timeout_sec = 30; + u.pool.max_lifetime_sec = 3600; + u.pool.max_requests_per_conn = 0; + + u.proxy.route_prefix = "/fail"; + u.proxy.strip_prefix = false; + u.proxy.response_timeout_ms = 5000; + u.proxy.retry.max_retries = 0; // Deterministic — no retry confounds. + + u.circuit_breaker.enabled = breaker_enabled; + u.circuit_breaker.consecutive_failure_threshold = 1; // Trip on first 5xx. + u.circuit_breaker.failure_rate_threshold = 100; + u.circuit_breaker.minimum_volume = 10000; + u.circuit_breaker.window_seconds = 10; + u.circuit_breaker.permitted_half_open_calls = 2; + // Long open duration so the drain is unambiguously the thing that + // surfaces the 503 to the queued client — not a timer-driven + // HALF_OPEN recovery admitting a subsequent attempt. 
+ u.circuit_breaker.base_open_duration_ms = 30000; + u.circuit_breaker.max_open_duration_ms = 60000; + return u; +} + +// --------------------------------------------------------------------------- +// Test 1: CLOSED→OPEN trip drains queued waiter with 503 + X-Circuit-Breaker. +// +// Request A takes the single pool slot and hangs at the backend for ~300ms. +// Request B queues (pool exhausted). At t≈300ms, A's backend response +// arrives: 502 → slice trip → transition callback → DrainWaitQueueOnTrip → +// B's error_callback fires with CHECKOUT_CIRCUIT_OPEN. B's client receives +// 503 + `X-Circuit-Breaker: open`. +// +// Pre-fix (no drain): B waits ~300ms for A's slot to free, then hits the +// backend itself, gets 502, client sees 502 — NOT 503 and NOT +// X-Circuit-Breaker: open. The assertion `is_503 && has_breaker_header` +// fails without the drain wiring. +// --------------------------------------------------------------------------- +void TestWaitQueueDrainedOnTrip() { + std::cout << "\n[TEST] CB Phase 6: wait queue drained on trip..." + << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + // Delay so the gateway's pool holds the connection long + // enough for a second client request to queue on it. + std::this_thread::sleep_for(std::chrono::milliseconds(300)); + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; // Single partition → single wait queue. 
+ gw.http2.enabled = false; + + gw.upstreams.push_back( + MakeDrainTripUpstream("svc", "127.0.0.1", backend_port, + /*breaker_enabled=*/true)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Launch A first (takes the one connection), then B 50ms later + // so B is guaranteed to enter the wait queue. + std::promise a_resp, b_resp; + auto a_fut = a_resp.get_future(); + auto b_fut = b_resp.get_future(); + std::thread a([&]() { + a_resp.set_value(TestHttpClient::HttpGet(gw_port, "/fail", 5000)); + }); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + std::thread b([&]() { + b_resp.set_value(TestHttpClient::HttpGet(gw_port, "/fail", 5000)); + }); + a.join(); + b.join(); + + std::string ra = a_fut.get(); + std::string rb = b_fut.get(); + + // A unambiguously hits the backend (owns the slot) and sees 502. + bool a_is_502 = TestHttpClient::HasStatus(ra, 502); + // B must see the circuit-open short-circuit from the drain — + // NOT a 502 from the backend, which is what happens without + // the drain wiring. + bool b_is_503 = TestHttpClient::HasStatus(rb, 503); + bool b_has_breaker_hdr = + rb.find("X-Circuit-Breaker: open") != std::string::npos || + rb.find("x-circuit-breaker: open") != std::string::npos; + // Exactly one backend hit — B was drained before making it to + // the upstream. Without the drain, backend_hits would be 2. + int hits = backend_hits.load(std::memory_order_relaxed); + bool single_hit = (hits == 1); + + bool pass = a_is_502 && b_is_503 && b_has_breaker_hdr && single_hit; + TestFramework::RecordTest( + "CB Phase 6: wait queue drained on trip", pass, + pass ? 
"" : + "a_is_502=" + std::to_string(a_is_502) + + " b_is_503=" + std::to_string(b_is_503) + + " b_breaker_hdr=" + std::to_string(b_has_breaker_hdr) + + " backend_hits=" + std::to_string(hits) + + " rb_head=" + rb.substr(0, 200)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 6: wait queue drained on trip", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 2: With the breaker disabled, the drain does NOT fire — the queued +// waiter proceeds to the upstream as it would absent the circuit-breaker +// layer entirely. +// +// Same setup as Test 1 but `circuit_breaker.enabled=false`. Disabled slices +// short-circuit in TryAcquire and never invoke transition callbacks, so +// DrainWaitQueueOnTrip is never called. Request B must hit the backend +// (backend_hits == 2) and receive the upstream's 502 — NOT a 503. +// --------------------------------------------------------------------------- +void TestDisabledBreakerDoesNotDrain() { + std::cout << "\n[TEST] CB Phase 6: disabled breaker does not drain..." 
+ << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + std::this_thread::sleep_for(std::chrono::milliseconds(300)); + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + gw.upstreams.push_back( + MakeDrainTripUpstream("svc", "127.0.0.1", backend_port, + /*breaker_enabled=*/false)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + std::promise a_resp, b_resp; + auto a_fut = a_resp.get_future(); + auto b_fut = b_resp.get_future(); + std::thread a([&]() { + a_resp.set_value(TestHttpClient::HttpGet(gw_port, "/fail", 5000)); + }); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + std::thread b([&]() { + b_resp.set_value(TestHttpClient::HttpGet(gw_port, "/fail", 5000)); + }); + a.join(); + b.join(); + + std::string ra = a_fut.get(); + std::string rb = b_fut.get(); + + // Both reach the backend — disabled breaker = no drain. + bool a_is_502 = TestHttpClient::HasStatus(ra, 502); + bool b_is_502 = TestHttpClient::HasStatus(rb, 502); + // Neither should carry the circuit-open header. 
+ bool no_breaker_on_a = + ra.find("X-Circuit-Breaker") == std::string::npos && + ra.find("x-circuit-breaker") == std::string::npos; + bool no_breaker_on_b = + rb.find("X-Circuit-Breaker") == std::string::npos && + rb.find("x-circuit-breaker") == std::string::npos; + int hits = backend_hits.load(std::memory_order_relaxed); + bool two_hits = (hits == 2); + + bool pass = a_is_502 && b_is_502 && no_breaker_on_a && + no_breaker_on_b && two_hits; + TestFramework::RecordTest( + "CB Phase 6: disabled breaker does not drain", pass, + pass ? "" : + "a_is_502=" + std::to_string(a_is_502) + + " b_is_502=" + std::to_string(b_is_502) + + " no_breaker_on_a=" + std::to_string(no_breaker_on_a) + + " no_breaker_on_b=" + std::to_string(no_breaker_on_b) + + " backend_hits=" + std::to_string(hits)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 6: disabled breaker does not drain", false, e.what()); + } +} + +void RunAllTests() { + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "CIRCUIT BREAKER PHASE 6 - WAIT-QUEUE DRAIN ON TRIP TESTS" + << std::endl; + std::cout << std::string(60, '=') << std::endl; + + TestWaitQueueDrainedOnTrip(); + TestDisabledBreakerDoesNotDrain(); +} + +} // namespace CircuitBreakerPhase6Tests diff --git a/test/run_test.cc b/test/run_test.cc index 34d54367..5dabf155 100644 --- a/test/run_test.cc +++ b/test/run_test.cc @@ -17,6 +17,7 @@ #include "circuit_breaker_phase3_test.h" #include "circuit_breaker_phase4_test.h" #include "circuit_breaker_phase5_test.h" +#include "circuit_breaker_phase6_test.h" #include "test_framework.h" #include #include @@ -94,6 +95,9 @@ void RunAllTest(){ // Run circuit breaker Phase 5 retry-budget integration tests CircuitBreakerPhase5Tests::RunAllTests(); + // Run circuit breaker Phase 6 wait-queue-drain-on-trip tests + CircuitBreakerPhase6Tests::RunAllTests(); + std::cout << "====================================\n" << std::endl; } @@ -172,12 +176,13 @@ int main(int argc, char* 
argv[]) { // Run rate limit tests }else if(mode == "rate_limit" || mode == "-L"){ RateLimitTests::RunAllTests(); - // Run circuit breaker tests (phases 1-5: unit + phase3 + phase4 + phase5) + // Run circuit breaker tests (phases 1-6: unit + phase3 + phase4 + phase5 + phase6) }else if(mode == "circuit_breaker" || mode == "-B"){ CircuitBreakerTests::RunAllTests(); CircuitBreakerPhase3Tests::RunAllTests(); CircuitBreakerPhase4Tests::RunAllTests(); CircuitBreakerPhase5Tests::RunAllTests(); + CircuitBreakerPhase6Tests::RunAllTests(); // Show help }else if(mode == "help" || mode == "-h" || mode == "--help"){ PrintUsage(argv[0]); From e6df34b617ddcaf4f435c1509ab26d32fae67a5f Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 20:32:14 +0800 Subject: [PATCH 24/37] Finished Phase 7: Observability --- Makefile | 2 +- include/circuit_breaker/retry_budget.h | 17 ++ server/circuit_breaker_slice.cc | 22 +- server/proxy_transaction.cc | 15 +- test/circuit_breaker_phase7_test.h | 405 +++++++++++++++++++++++++ test/run_test.cc | 7 +- 6 files changed, 460 insertions(+), 8 deletions(-) create mode 100644 test/circuit_breaker_phase7_test.h diff --git a/Makefile b/Makefile index 45993b3b..0c3e47ac 100644 --- a/Makefile +++ b/Makefile @@ -147,7 +147,7 @@ UPSTREAM_HEADERS = $(LIB_DIR)/upstream/upstream_manager.h $(LIB_DIR)/upstream/up RATE_LIMIT_HEADERS = $(LIB_DIR)/rate_limit/token_bucket.h $(LIB_DIR)/rate_limit/rate_limit_zone.h $(LIB_DIR)/rate_limit/rate_limiter.h CIRCUIT_BREAKER_HEADERS = $(LIB_DIR)/circuit_breaker/circuit_breaker_state.h $(LIB_DIR)/circuit_breaker/circuit_breaker_window.h $(LIB_DIR)/circuit_breaker/circuit_breaker_slice.h $(LIB_DIR)/circuit_breaker/retry_budget.h $(LIB_DIR)/circuit_breaker/circuit_breaker_host.h $(LIB_DIR)/circuit_breaker/circuit_breaker_manager.h CLI_HEADERS = $(LIB_DIR)/cli/cli_parser.h $(LIB_DIR)/cli/signal_handler.h $(LIB_DIR)/cli/pid_file.h $(LIB_DIR)/cli/version.h $(LIB_DIR)/cli/daemonizer.h -TEST_HEADERS = $(TEST_DIR)/test_framework.h 
$(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h $(TEST_DIR)/proxy_test.h $(TEST_DIR)/rate_limit_test.h $(TEST_DIR)/kqueue_test.h $(TEST_DIR)/circuit_breaker_test.h $(TEST_DIR)/circuit_breaker_phase3_test.h $(TEST_DIR)/circuit_breaker_phase4_test.h $(TEST_DIR)/circuit_breaker_phase5_test.h $(TEST_DIR)/circuit_breaker_phase6_test.h +TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h $(TEST_DIR)/proxy_test.h $(TEST_DIR)/rate_limit_test.h $(TEST_DIR)/kqueue_test.h $(TEST_DIR)/circuit_breaker_test.h $(TEST_DIR)/circuit_breaker_phase3_test.h $(TEST_DIR)/circuit_breaker_phase4_test.h $(TEST_DIR)/circuit_breaker_phase5_test.h $(TEST_DIR)/circuit_breaker_phase6_test.h $(TEST_DIR)/circuit_breaker_phase7_test.h # All headers combined HEADERS = $(CORE_HEADERS) $(CALLBACK_HEADERS) $(REACTOR_HEADERS) $(NETWORK_HEADERS) $(SERVER_HEADERS) $(THREAD_POOL_HEADERS) $(UTIL_HEADERS) $(FOUNDATION_HEADERS) $(HTTP_HEADERS) $(HTTP2_HEADERS) $(WS_HEADERS) $(TLS_HEADERS) $(UPSTREAM_HEADERS) $(RATE_LIMIT_HEADERS) $(CIRCUIT_BREAKER_HEADERS) $(CLI_HEADERS) $(TEST_HEADERS) diff --git a/include/circuit_breaker/retry_budget.h b/include/circuit_breaker/retry_budget.h index 001bfccb..12782d9e 100644 --- a/include/circuit_breaker/retry_budget.h +++ b/include/circuit_breaker/retry_budget.h @@ -105,6 +105,23 @@ class RetryBudget { int64_t InFlight() const { return 
in_flight_.load(std::memory_order_relaxed); } + // Compute the current effective retry cap for observability / log + // enrichment. Uses the same formula as TryConsumeRetry but without + // mutating retries_in_flight_. Returns the point-in-time cap against + // which a would-be retry admission would be compared. Slightly racy + // (separate loads of in_flight_ and retries_in_flight_ aren't atomic + // relative to each other), but the result is for dashboards / logs + // where a one-entry drift is noise. + int64_t ComputeCap() const { + int64_t in_flight = in_flight_.load(std::memory_order_relaxed); + int64_t retries = retries_in_flight_.load(std::memory_order_relaxed); + int pct = percent_.load(std::memory_order_relaxed); + int min_conc = min_concurrency_.load(std::memory_order_relaxed); + int64_t non_retry = in_flight - retries; + if (non_retry < 0) non_retry = 0; + int64_t pct_cap = (non_retry * pct) / 100; + return pct_cap > min_conc ? pct_cap : min_conc; + } int64_t RetriesInFlight() const { return retries_in_flight_.load(std::memory_order_relaxed); } diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index c34c25ae..d7e8ad07 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -81,9 +81,23 @@ bool CircuitBreakerSlice::ShouldTripClosed( } void CircuitBreakerSlice::TripClosedToOpen(const char* trigger) { + auto now = Now(); + // Capture pre-reset observability context BEFORE mutating state. + // §11.1 log format asks for consecutive_failures + window_total + + // window_fail_rate at the trip event so operators can distinguish a + // "100 consecutive bad responses" trip from a "55% failure rate over + // a wide call window" trip — two very different operational stories + // that the `trigger` string alone doesn't fully capture. 
+ int consec_at_trip = consecutive_failures_; + int64_t window_total = window_.TotalCount(now); + int64_t window_failures = window_.FailureCount(now); + int window_fail_rate_pct = + (window_total > 0) + ? static_cast((window_failures * 100) / window_total) + : 0; + auto duration = ComputeOpenDuration(); // uses current consecutive_trips_ consecutive_trips_.fetch_add(1, std::memory_order_relaxed); - auto now = Now(); auto open_until = now + duration; int64_t open_until_ns = std::chrono::duration_cast( @@ -107,8 +121,10 @@ void CircuitBreakerSlice::TripClosedToOpen(const char* trigger) { trips_.fetch_add(1, std::memory_order_relaxed); logging::Get()->warn( - "circuit breaker tripped {} trigger={} open_for_ms={} consecutive_trips={}", - host_label_, trigger, + "circuit breaker tripped {} trigger={} consecutive_failures={} " + "window_total={} window_fail_rate={} open_for_ms={} consecutive_trips={}", + host_label_, trigger, consec_at_trip, + window_total, window_fail_rate_pct, std::chrono::duration_cast(duration).count(), consecutive_trips_.load(std::memory_order_relaxed)); diff --git a/server/proxy_transaction.cc b/server/proxy_transaction.cc index 020d898f..5cff47b6 100644 --- a/server/proxy_transaction.cc +++ b/server/proxy_transaction.cc @@ -724,10 +724,19 @@ void ProxyTransaction::MaybeRetry(RetryPolicy::RetryCondition condition) { "client_fd={} service={} attempt={}", client_fd_, service_name_, attempt_); } else { + // §11.1 format: log per-host budget state so operators + // can diagnose retry-storm throttling without hitting + // an admin endpoint. `cap` is the live effective ceiling + // (may have shifted since the failing TryConsumeRetry + // due to other transactions' in_flight changes). 
logging::Get()->warn( - "ProxyTransaction retry budget exhausted " - "client_fd={} service={} attempt={}", - client_fd_, service_name_, attempt_); + "retry budget exhausted service={} in_flight={} " + "retries_in_flight={} cap={} client_fd={} attempt={}", + service_name_, + retry_budget_->InFlight(), + retry_budget_->RetriesInFlight(), + retry_budget_->ComputeCap(), + client_fd_, attempt_); state_ = State::FAILED; DeliverResponse(MakeRetryBudgetResponse()); return; diff --git a/test/circuit_breaker_phase7_test.h b/test/circuit_breaker_phase7_test.h new file mode 100644 index 00000000..9dc841ba --- /dev/null +++ b/test/circuit_breaker_phase7_test.h @@ -0,0 +1,405 @@ +#pragma once + +// Phase 7 integration tests: observability — counter accuracy, snapshot +// API correctness, and log emission. +// +// Phases 2-6 each added counters and log lines as a side effect of their +// functional work. Phase 7 locks those in as regressions: +// +// * Counters (§11.2): trips, rejected, probe_successes, probe_failures, +// retries_rejected surface through CircuitBreakerManager::SnapshotAll. +// * Snapshot API (§11.3): per-slice rows aggregate into host-level +// totals; host-level fields (retries_in_flight / retries_rejected / +// in_flight) reflect the owning RetryBudget. +// * Logs (§11.1): the CLOSED→OPEN trip emits the full-context message +// including trigger, consecutive_failures, window_total, +// window_fail_rate, open_for_ms, and consecutive_trips. +// +// The log-emission test attaches a spdlog ring-buffer sink to the logger +// for the duration of the test, triggers a trip, then asserts the +// captured messages contain the expected fields. No log file I/O. 
+ +#include "test_framework.h" +#include "test_server_runner.h" +#include "http_test_client.h" +#include "http/http_server.h" +#include "config/server_config.h" +#include "upstream/upstream_manager.h" +#include "circuit_breaker/circuit_breaker_manager.h" +#include "circuit_breaker/circuit_breaker_host.h" +#include "circuit_breaker/circuit_breaker_slice.h" +#include "log/logger.h" +#include "spdlog/sinks/ringbuffer_sink.h" + +#include +#include +#include +#include +#include +#include + +namespace CircuitBreakerPhase7Tests { + +using circuit_breaker::State; + +static UpstreamConfig MakeObservUpstream(const std::string& name, + const std::string& host, + int port, + int consecutive_threshold = 3) { + UpstreamConfig u; + u.name = name; + u.host = host; + u.port = port; + u.pool.max_connections = 8; + u.pool.max_idle_connections = 4; + u.pool.connect_timeout_ms = 3000; + u.pool.idle_timeout_sec = 30; + u.pool.max_lifetime_sec = 3600; + u.pool.max_requests_per_conn = 0; + + u.proxy.route_prefix = "/fail"; + u.proxy.strip_prefix = false; + u.proxy.response_timeout_ms = 2000; + u.proxy.retry.max_retries = 0; + + u.circuit_breaker.enabled = true; + u.circuit_breaker.consecutive_failure_threshold = consecutive_threshold; + u.circuit_breaker.failure_rate_threshold = 100; + u.circuit_breaker.minimum_volume = 10000; + u.circuit_breaker.window_seconds = 10; + u.circuit_breaker.permitted_half_open_calls = 2; + // Long open duration — keep the slice OPEN so post-trip assertions + // don't race a HALF_OPEN transition. + u.circuit_breaker.base_open_duration_ms = 30000; + u.circuit_breaker.max_open_duration_ms = 60000; + return u; +} + +// --------------------------------------------------------------------------- +// Test 1: Snapshot API reflects per-slice trip/rejected counters and +// host-level aggregates. 
Drives N+1 requests against a backend that always +// 502s (N to trip, 1 more that the OPEN slice short-circuits) and asserts +// the snapshot shows total_trips >= 1, total_rejected >= 1, +// open_partitions >= 1. +// --------------------------------------------------------------------------- +void TestSnapshotReflectsCounters() { + std::cout << "\n[TEST] CB Phase 7: snapshot reflects counters..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + auto u = MakeObservUpstream("svc", "127.0.0.1", backend_port, + /*threshold=*/3); + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Trip (3 failures), then 2 more to accumulate rejected counter. 
+ for (int i = 0; i < 3; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + for (int i = 0; i < 2; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + + auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); + if (!cbm) { + TestFramework::RecordTest( + "CB Phase 7: snapshot reflects counters", false, + "no circuit breaker manager attached"); + return; + } + auto snaps = cbm->SnapshotAll(); + bool found = false; + int64_t trips = 0, rejected = 0, probe_s = 0, probe_f = 0; + int open_parts = 0; + for (const auto& s : snaps) { + if (s.service_name == "svc") { + trips = s.total_trips; + rejected = s.total_rejected; + open_parts = s.open_partitions; + for (const auto& row : s.slices) { + probe_s += row.probe_successes; + probe_f += row.probe_failures; + } + found = true; + break; + } + } + + bool pass = found + && trips >= 1 + && rejected >= 2 // 2 post-trip short-circuits + && open_parts >= 1 + && probe_s == 0 // never entered HALF_OPEN + && probe_f == 0; + TestFramework::RecordTest( + "CB Phase 7: snapshot reflects counters", pass, + pass ? "" : + "found=" + std::to_string(found) + + " trips=" + std::to_string(trips) + + " rejected=" + std::to_string(rejected) + + " open_parts=" + std::to_string(open_parts) + + " probe_s=" + std::to_string(probe_s) + + " probe_f=" + std::to_string(probe_f)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 7: snapshot reflects counters", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 2: The CLOSED→OPEN trip log emits the §11.1 full-context message. +// Attaches a spdlog ringbuffer_sink to the shared logger, triggers a trip, +// then inspects the captured messages for the key tokens. The sink is +// removed before the test returns so it doesn't affect later tests. 
+// --------------------------------------------------------------------------- +void TestTripLogEmission() { + std::cout << "\n[TEST] CB Phase 7: trip log emission..." << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + auto u = MakeObservUpstream("svc-log", "127.0.0.1", backend_port, + /*threshold=*/2); + gw.upstreams.push_back(u); + + // `HttpServer` construction calls `logging::Init()` which rebuilds + // the default logger via `spdlog::set_default_logger`. Any sink + // attached BEFORE that point lands on a stale logger. Attach the + // ringbuffer sink AFTER the last HttpServer construction so it + // captures the live logger's output. + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + auto ring = std::make_shared< + spdlog::sinks::ringbuffer_sink_mt>(1024); + auto logger = logging::Get(); + auto prev_level = logger->level(); + logger->set_level(spdlog::level::debug); + logger->sinks().push_back(ring); + + struct SinkGuard { + std::shared_ptr logger; + std::shared_ptr ring; + spdlog::level::level_enum prev_level; + ~SinkGuard() { + auto& sinks = logger->sinks(); + sinks.erase(std::remove(sinks.begin(), sinks.end(), + std::shared_ptr(ring)), + sinks.end()); + logger->set_level(prev_level); + } + } guard{logger, ring, prev_level}; + + // Drive exactly threshold=2 failures to trip. + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + + // Give the dispatcher a breath to emit + the sink to settle. 
+ std::this_thread::sleep_for(std::chrono::milliseconds(50)); + + auto messages = ring->last_formatted(); + // Scan for the trip message. Look for the static prefix plus the + // §11.1 field tokens. + bool saw_tripped = false; + bool has_trigger = false; + bool has_consec_failures = false; + bool has_window_total = false; + bool has_fail_rate = false; + bool has_open_for_ms = false; + bool has_consec_trips = false; + for (const auto& msg : messages) { + if (msg.find("circuit breaker tripped") == std::string::npos) { + continue; + } + saw_tripped = true; + if (msg.find("trigger=") != std::string::npos) has_trigger = true; + if (msg.find("consecutive_failures=") != std::string::npos) + has_consec_failures = true; + if (msg.find("window_total=") != std::string::npos) + has_window_total = true; + if (msg.find("window_fail_rate=") != std::string::npos) + has_fail_rate = true; + if (msg.find("open_for_ms=") != std::string::npos) + has_open_for_ms = true; + if (msg.find("consecutive_trips=") != std::string::npos) + has_consec_trips = true; + } + + bool pass = saw_tripped && has_trigger && has_consec_failures && + has_window_total && has_fail_rate && + has_open_for_ms && has_consec_trips; + TestFramework::RecordTest( + "CB Phase 7: trip log emission", pass, + pass ? 
"" : + "saw_tripped=" + std::to_string(saw_tripped) + + " trigger=" + std::to_string(has_trigger) + + " consec_failures=" + std::to_string(has_consec_failures) + + " window_total=" + std::to_string(has_window_total) + + " fail_rate=" + std::to_string(has_fail_rate) + + " open_for_ms=" + std::to_string(has_open_for_ms) + + " consec_trips=" + std::to_string(has_consec_trips)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 7: trip log emission", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 3: Retry-budget observability — the exhausted log carries the +// §11.1 fields (service, in_flight, retries_in_flight, cap), and the +// host snapshot reflects retries_rejected. +// --------------------------------------------------------------------------- +void TestRetryBudgetObservability() { + std::cout << "\n[TEST] CB Phase 7: retry budget observability..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + // Budget: zero percent AND zero floor → every retry rejected. + auto u = MakeObservUpstream("svc-budget", "127.0.0.1", backend_port, + /*threshold=*/10000); + u.proxy.retry.max_retries = 2; + u.proxy.retry.retry_on_5xx = true; + u.circuit_breaker.retry_budget_percent = 0; + u.circuit_breaker.retry_budget_min_concurrency = 0; + gw.upstreams.push_back(u); + + // Attach the ringbuffer AFTER gateway construction — see + // TestTripLogEmission for rationale (HttpServer's ctor + // replaces the default logger via logging::Init, detaching + // any previously-attached sinks). 
+ HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + auto ring = std::make_shared< + spdlog::sinks::ringbuffer_sink_mt>(1024); + auto logger = logging::Get(); + auto prev_level = logger->level(); + logger->set_level(spdlog::level::debug); + logger->sinks().push_back(ring); + + struct SinkGuard { + std::shared_ptr logger; + std::shared_ptr ring; + spdlog::level::level_enum prev_level; + ~SinkGuard() { + auto& sinks = logger->sinks(); + sinks.erase(std::remove(sinks.begin(), sinks.end(), + std::shared_ptr(ring)), + sinks.end()); + logger->set_level(prev_level); + } + } guard{logger, ring, prev_level}; + + // One client request: first attempt hits backend (502), retry + // blocked by budget → 503 + X-Retry-Budget-Exhausted. + TestHttpClient::HttpGet(gw_port, "/fail", 5000); + + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + + auto messages = ring->last_formatted(); + bool saw_exhausted = false; + bool has_service = false; + bool has_inflight = false; + bool has_retries_inflight = false; + bool has_cap = false; + for (const auto& msg : messages) { + if (msg.find("retry budget exhausted") == std::string::npos) { + continue; + } + saw_exhausted = true; + if (msg.find("service=") != std::string::npos) has_service = true; + if (msg.find("in_flight=") != std::string::npos) + has_inflight = true; + if (msg.find("retries_in_flight=") != std::string::npos) + has_retries_inflight = true; + if (msg.find("cap=") != std::string::npos) has_cap = true; + } + + // Snapshot: retries_rejected must be >= 1 (every rejection increments). + int64_t retries_rejected = 0; + auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); + if (cbm) { + for (const auto& s : cbm->SnapshotAll()) { + if (s.service_name == "svc-budget") { + // Host aggregate — single host, so the sum is the + // host's retries_rejected. The snapshot doesn't yet + // expose that directly — derive from RetryBudget + // via the host getter. 
+ auto* host = cbm->GetHost("svc-budget"); + if (host) { + retries_rejected = + host->GetRetryBudget()->RetriesRejected(); + } + break; + } + } + } + + bool pass = saw_exhausted && has_service && has_inflight && + has_retries_inflight && has_cap && + retries_rejected >= 1; + TestFramework::RecordTest( + "CB Phase 7: retry budget observability", pass, + pass ? "" : + "saw_exhausted=" + std::to_string(saw_exhausted) + + " service=" + std::to_string(has_service) + + " inflight=" + std::to_string(has_inflight) + + " retries_inflight=" + std::to_string(has_retries_inflight) + + " cap=" + std::to_string(has_cap) + + " retries_rejected=" + std::to_string(retries_rejected)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 7: retry budget observability", false, e.what()); + } +} + +void RunAllTests() { + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "CIRCUIT BREAKER PHASE 7 - OBSERVABILITY TESTS" << std::endl; + std::cout << std::string(60, '=') << std::endl; + + TestSnapshotReflectsCounters(); + TestTripLogEmission(); + TestRetryBudgetObservability(); +} + +} // namespace CircuitBreakerPhase7Tests diff --git a/test/run_test.cc b/test/run_test.cc index 5dabf155..17d7eed9 100644 --- a/test/run_test.cc +++ b/test/run_test.cc @@ -18,6 +18,7 @@ #include "circuit_breaker_phase4_test.h" #include "circuit_breaker_phase5_test.h" #include "circuit_breaker_phase6_test.h" +#include "circuit_breaker_phase7_test.h" #include "test_framework.h" #include #include @@ -98,6 +99,9 @@ void RunAllTest(){ // Run circuit breaker Phase 6 wait-queue-drain-on-trip tests CircuitBreakerPhase6Tests::RunAllTests(); + // Run circuit breaker Phase 7 observability tests + CircuitBreakerPhase7Tests::RunAllTests(); + std::cout << "====================================\n" << std::endl; } @@ -176,13 +180,14 @@ int main(int argc, char* argv[]) { // Run rate limit tests }else if(mode == "rate_limit" || mode == "-L"){ RateLimitTests::RunAllTests(); - // Run 
circuit breaker tests (phases 1-6: unit + phase3 + phase4 + phase5 + phase6) + // Run circuit breaker tests (phases 1-7: unit + phase3 + phase4 + phase5 + phase6 + phase7) }else if(mode == "circuit_breaker" || mode == "-B"){ CircuitBreakerTests::RunAllTests(); CircuitBreakerPhase3Tests::RunAllTests(); CircuitBreakerPhase4Tests::RunAllTests(); CircuitBreakerPhase5Tests::RunAllTests(); CircuitBreakerPhase6Tests::RunAllTests(); + CircuitBreakerPhase7Tests::RunAllTests(); // Show help }else if(mode == "help" || mode == "-h" || mode == "--help"){ PrintUsage(argv[0]); From 7b83fbfdd03f1cca613e3660bf88f62d428294f3 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 22:04:16 +0800 Subject: [PATCH 25/37] Finished Feature Development --- Makefile | 2 +- docs/circuit_breaker.md | 149 ++ .../circuit_breaker/circuit_breaker_host.h | 2 +- .../circuit_breaker/circuit_breaker_manager.h | 2 +- .../circuit_breaker/circuit_breaker_state.h | 2 +- include/circuit_breaker/retry_budget.h | 2 +- include/config/server_config.h | 24 +- include/upstream/pool_partition.h | 4 +- include/upstream/proxy_transaction.h | 18 +- server/circuit_breaker_slice.cc | 7 +- server/http_server.cc | 36 +- server/main.cc | 2 +- server/proxy_transaction.cc | 9 +- test/circuit_breaker_components_test.h | 507 +++++++ test/circuit_breaker_integration_test.h | 1213 +++++++++++++++++ test/circuit_breaker_observability_test.h | 405 ++++++ test/circuit_breaker_reload_test.h | 373 +++++ test/circuit_breaker_retry_budget_test.h | 367 +++++ test/circuit_breaker_test.h | 4 +- test/circuit_breaker_wait_queue_drain_test.h | 261 ++++ test/config_test.h | 36 +- test/run_test.cc | 47 +- 22 files changed, 3390 insertions(+), 82 deletions(-) create mode 100644 docs/circuit_breaker.md create mode 100644 test/circuit_breaker_components_test.h create mode 100644 test/circuit_breaker_integration_test.h create mode 100644 test/circuit_breaker_observability_test.h create mode 100644 test/circuit_breaker_reload_test.h create 
mode 100644 test/circuit_breaker_retry_budget_test.h create mode 100644 test/circuit_breaker_wait_queue_drain_test.h diff --git a/Makefile b/Makefile index 0c3e47ac..2b9ae194 100644 --- a/Makefile +++ b/Makefile @@ -147,7 +147,7 @@ UPSTREAM_HEADERS = $(LIB_DIR)/upstream/upstream_manager.h $(LIB_DIR)/upstream/up RATE_LIMIT_HEADERS = $(LIB_DIR)/rate_limit/token_bucket.h $(LIB_DIR)/rate_limit/rate_limit_zone.h $(LIB_DIR)/rate_limit/rate_limiter.h CIRCUIT_BREAKER_HEADERS = $(LIB_DIR)/circuit_breaker/circuit_breaker_state.h $(LIB_DIR)/circuit_breaker/circuit_breaker_window.h $(LIB_DIR)/circuit_breaker/circuit_breaker_slice.h $(LIB_DIR)/circuit_breaker/retry_budget.h $(LIB_DIR)/circuit_breaker/circuit_breaker_host.h $(LIB_DIR)/circuit_breaker/circuit_breaker_manager.h CLI_HEADERS = $(LIB_DIR)/cli/cli_parser.h $(LIB_DIR)/cli/signal_handler.h $(LIB_DIR)/cli/pid_file.h $(LIB_DIR)/cli/version.h $(LIB_DIR)/cli/daemonizer.h -TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h $(TEST_DIR)/proxy_test.h $(TEST_DIR)/rate_limit_test.h $(TEST_DIR)/kqueue_test.h $(TEST_DIR)/circuit_breaker_test.h $(TEST_DIR)/circuit_breaker_phase3_test.h $(TEST_DIR)/circuit_breaker_phase4_test.h $(TEST_DIR)/circuit_breaker_phase5_test.h $(TEST_DIR)/circuit_breaker_phase6_test.h $(TEST_DIR)/circuit_breaker_phase7_test.h +TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h 
$(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h $(TEST_DIR)/proxy_test.h $(TEST_DIR)/rate_limit_test.h $(TEST_DIR)/kqueue_test.h $(TEST_DIR)/circuit_breaker_test.h $(TEST_DIR)/circuit_breaker_components_test.h $(TEST_DIR)/circuit_breaker_integration_test.h $(TEST_DIR)/circuit_breaker_retry_budget_test.h $(TEST_DIR)/circuit_breaker_wait_queue_drain_test.h $(TEST_DIR)/circuit_breaker_observability_test.h $(TEST_DIR)/circuit_breaker_reload_test.h # All headers combined HEADERS = $(CORE_HEADERS) $(CALLBACK_HEADERS) $(REACTOR_HEADERS) $(NETWORK_HEADERS) $(SERVER_HEADERS) $(THREAD_POOL_HEADERS) $(UTIL_HEADERS) $(FOUNDATION_HEADERS) $(HTTP_HEADERS) $(HTTP2_HEADERS) $(WS_HEADERS) $(TLS_HEADERS) $(UPSTREAM_HEADERS) $(RATE_LIMIT_HEADERS) $(CIRCUIT_BREAKER_HEADERS) $(CLI_HEADERS) $(TEST_HEADERS) diff --git a/docs/circuit_breaker.md b/docs/circuit_breaker.md new file mode 100644 index 00000000..6f38de69 --- /dev/null +++ b/docs/circuit_breaker.md @@ -0,0 +1,149 @@ +# Circuit Breaker + +Per-upstream circuit breaking for the gateway, preventing cascading failures when a backend becomes unhealthy. Follows the resilience4j three-state machine (`CLOSED` → `OPEN` → `HALF_OPEN` → `CLOSED`), trips on either consecutive-failure or failure-rate thresholds, and short-circuits checkouts with `503 Service Unavailable` while the circuit is open. A separate **retry budget** caps the fraction of concurrent upstream work that may be retries, bounding the retry-storm amplification factor even when individual retries pass the breaker gate. + +--- + +## Overview + +- **Per-dispatcher slices.** One `CircuitBreakerSlice` per dispatcher partition for each upstream. Hot-path `TryAcquire` / `Report*` calls are lock-free — each slice is dispatcher-thread-pinned. +- **Three states.** `CLOSED` = normal traffic. `OPEN` = all requests short-circuited with 503 for the exponential-backoff open duration. 
`HALF_OPEN` = a bounded number of probe requests are admitted to test recovery; on success, closes; on failure, re-trips with longer backoff. +- **Dual trip paths.** Either `consecutive_failures >= N` OR `failure_rate >= P%` over a sliding window (subject to `minimum_volume`). +- **Retry budget.** Host-level cap: `max(retry_budget_min_concurrency, (in_flight - retries_in_flight) * retry_budget_percent / 100)`. Retries that exceed the cap receive `503` + `X-Retry-Budget-Exhausted: 1` instead of going to the upstream. +- **Wait-queue drain on trip.** On every `CLOSED → OPEN` transition, the corresponding pool partition's wait queue is drained immediately with `503 + X-Circuit-Breaker: open` — queued waiters don't have to wait out the full open window. +- **Dry-run mode.** `dry_run=true` computes decisions and logs them, but still admits traffic. Useful for staging a breaker in production without risk. +- **Hot-reload.** Breaker-field edits (thresholds, window, probe budget, retry budget tuning, enabled toggle) apply live on SIGHUP — no restart required. Topology edits (host/port/pool/proxy/tls) still require a restart. + +--- + +## Configuration + +Each `upstream` entry accepts a nested `circuit_breaker` block: + +```json +{ + "upstreams": [ + { + "name": "orders", + "host": "orders-backend", + "port": 8080, + "circuit_breaker": { + "enabled": true, + "dry_run": false, + "consecutive_failure_threshold": 5, + "failure_rate_threshold": 50, + "minimum_volume": 20, + "window_seconds": 10, + "permitted_half_open_calls": 3, + "base_open_duration_ms": 5000, + "max_open_duration_ms": 60000, + "retry_budget_percent": 20, + "retry_budget_min_concurrency": 3 + } + } + ] +} +``` + +### Fields + +| Field | Type | Default | Meaning | +|---|---|---|---| +| `enabled` | bool | `false` | Master switch. When false, the slice is a zero-overhead no-op on the hot path. | +| `dry_run` | bool | `false` | Shadow mode: log would-reject decisions but admit traffic. 
Both the state machine and the retry budget honor this flag. | +| `consecutive_failure_threshold` | int | `5` | Trip when N consecutive failures are observed in `CLOSED`. Upper bound 10,000. | +| `failure_rate_threshold` | int | `50` | Trip when `(failures / total) * 100 >= this` over the rolling window, provided `total >= minimum_volume`. 0-100. | +| `minimum_volume` | int | `20` | Minimum calls-in-window before rate-based trip is even considered. Upper bound 10,000,000. | +| `window_seconds` | int | `10` | Rolling window duration for the rate trip. >= 1. | +| `permitted_half_open_calls` | int | `3` | Probe admissions allowed per `HALF_OPEN` cycle. A single success flips to `CLOSED`; a single failure re-trips to `OPEN`. Upper bound 1,000. | +| `base_open_duration_ms` | int | `5000` | Initial open duration on first trip. Subsequent trips use `min(base << consecutive_trips, max)`. | +| `max_open_duration_ms` | int | `60000` | Ceiling for the exponential-backoff open duration. | +| `retry_budget_percent` | int | `20` | Retries capped at this % of non-retry in-flight traffic to the same host. 0-100. | +| `retry_budget_min_concurrency` | int | `3` | Floor for the retry cap — always allow at least this many concurrent retries regardless of traffic level. | + +### Defaults (when `circuit_breaker` block is absent) + +`enabled=false`. The breaker is fully opt-in. No behavioral change from a pre-breaker gateway configuration. + +--- + +## Client-facing responses + +Two distinct `503` variants, keyed off the reject source: + +**Circuit-open reject** — breaker is `OPEN` or in `HALF_OPEN`-full: +``` +HTTP/1.1 503 Service Unavailable +Retry-After: 5 +X-Circuit-Breaker: open # or half_open +X-Upstream-Host: orders-backend:8080 +Connection: close +``` + +- `Retry-After` derivation: + - `OPEN`: derived from the stored `open_until` deadline (time remaining until next probe). 
+ - `HALF_OPEN`: derived from the *next* open duration (`base << consecutive_trips`) — reflects what the backoff would be if the in-flight probes fail. Base alone would under-report after multiple trips. + - Both paths: ceil-divide the millisecond value to seconds, capped at 3600s. +- `X-Circuit-Breaker` distinguishes the two reject paths so operators can tell "backoff active" from "probing, no capacity left". + +**Retry-budget reject** — every retry attempt rejected because the host's budget is exhausted: +``` +HTTP/1.1 503 Service Unavailable +X-Retry-Budget-Exhausted: 1 +Connection: close +``` + +No `Retry-After` (the budget has no recovery clock — it depends on concurrent traffic). No `X-Circuit-Breaker` header (this reject path is orthogonal to the state machine). + +Both responses are **terminal**: the retry loop never retries a circuit-open or retry-budget-exhausted outcome. + +--- + +## Hot reload + +All `circuit_breaker` fields on existing upstream services are hot-reloadable via `SIGHUP`. Reload semantics: + +| Edit | Behavior | +|---|---| +| Threshold change (failures, rate, window, probe budget, open durations) | Applied on the next `TryAcquire` / `Report*` call on each slice. Live state (`CLOSED`/`OPEN`/`HALF_OPEN`) is preserved. | +| `enabled=true → false` | Live state reset to `CLOSED`; hot path short-circuits to `ADMITTED`. No transition callback fired. | +| `enabled=false → true` | Live state reset to `CLOSED`. The transition callback (wired at startup) re-engages for future trips. | +| `window_seconds` change | Rolling window reset. In-flight reports admitted pre-reload are invalidated (by `closed_gen_` bump); `consecutive_failures_` reset so stale counts can't trip the fresh window. In-flight `HALF_OPEN` probes are NOT invalidated (separate `halfopen_gen_` counter) — probe cycles complete normally. | +| `retry_budget_percent` / `retry_budget_min_concurrency` | Applied immediately (atomic stores). In-flight counters preserved. 
| + +Topology edits (`host`, `port`, `pool.*`, `proxy.*`, `tls.*`) still require a restart; the gateway logs `"Reload: upstream topology changes require a restart to take effect"` and keeps the old pool alive. Breaker edits on the same reload are still applied live. + +--- + +## Observability + +### Logs + +| Event | Level | Sample | +|---|---|---| +| `CLOSED → OPEN` trip | `warn` | `circuit breaker tripped service=orders host=orders-backend:8080 partition=0 trigger=consecutive consecutive_failures=5 window_total=12 window_fail_rate=41 open_for_ms=5000 consecutive_trips=1` | +| `OPEN → HALF_OPEN` | `info` | `circuit breaker half-open ... probes_allowed=3` | +| `HALF_OPEN → CLOSED` | `info` | `circuit breaker closed ... probes_succeeded=3` | +| `HALF_OPEN → OPEN` re-trip | `warn` | `circuit breaker re-tripped ... trigger=probe_fail consecutive_trips=2 open_for_ms=10000` | +| Reject (first of cycle) | `info` | `circuit breaker rejected ... state=open` | +| Reject (subsequent) | `debug` | Same, at debug. | +| Reject (dry-run) | `info` | `[dry-run] circuit breaker would reject ...` | +| Retry budget exhausted | `warn` | `retry budget exhausted service=orders in_flight=45 retries_in_flight=9 cap=9 client_fd=... attempt=1` | +| Reload applied | `info` | `circuit breaker config applied service=orders enabled=true window_s=10 fail_rate=50 consec_threshold=5` | +| Wait-queue drain on trip | `info` | `PoolPartition draining wait queue on breaker trip: orders-backend:8080 queue_size=3` | + +### Snapshot API + +`CircuitBreakerManager::SnapshotAll()` returns one `CircuitBreakerHostSnapshot` per upstream with per-slice rows (`state`, `trips`, `rejected`, `probe_successes`, `probe_failures`) plus host-level aggregates (`total_trips`, `total_rejected`, `open_partitions`, `half_open_partitions`, `retries_in_flight`, `retries_rejected`, `in_flight`). A future `/admin/breakers` endpoint would JSON-serialize this. 
+ +--- + +## Design notes + +- **Dispatcher affinity.** Slices are pinned to their dispatcher thread — no CAS on the hot path. The trade-off: skewed request distribution across dispatchers can cause one partition to trip while another stays `CLOSED`. Uniform hashing keeps this mild in practice. +- **Lazy `HALF_OPEN`.** The transition from `OPEN` happens on the next inbound `TryAcquire` once the open deadline elapses — no background timer. Envoy and resilience4j use the same model. +- **Generation tokens.** Every admission is stamped with a per-domain generation counter (`closed_gen_` or `halfopen_gen_`, depending on state). `Report*` drops stale-generation completions so pre-transition requests can't pollute a fresh cycle. Window resizes bump only `closed_gen_` so in-flight probes aren't stranded. +- **Retry budget CAS.** `TryConsumeRetry` uses `compare_exchange_weak` to serialize concurrent retry admissions. A plain load-check-add would let N callers all observe `current < cap` and all increment past the cap. +- **Non-retry denominator.** The budget base is `in_flight - retries_in_flight`, not raw `in_flight`. Retries count in both terms but subtract out here so admitting a retry doesn't inflate its own cap. + +For the full design document (motivations, trade-offs, failure modes, revision history, test strategy), see [.claude/documents/design/CIRCUIT_BREAKER_DESIGN.md](../.claude/documents/design/CIRCUIT_BREAKER_DESIGN.md). diff --git a/include/circuit_breaker/circuit_breaker_host.h b/include/circuit_breaker/circuit_breaker_host.h index 6aff2965..67211667 100644 --- a/include/circuit_breaker/circuit_breaker_host.h +++ b/include/circuit_breaker/circuit_breaker_host.h @@ -97,7 +97,7 @@ class CircuitBreakerHost { // across partitions — callers that need partition-specific behavior // can read `slice->dispatcher_index()` inside the callback. 
// Must be called before live traffic; thread-safety depends on - // slice-dispatcher affinity at the Reload layer (Phase 8 wires this). + // slice-dispatcher affinity at the Reload layer. void SetTransitionCallbackOnAllSlices(StateTransitionCallback cb); // Accessors. diff --git a/include/circuit_breaker/circuit_breaker_manager.h b/include/circuit_breaker/circuit_breaker_manager.h index 66c2b33d..b4b32f06 100644 --- a/include/circuit_breaker/circuit_breaker_manager.h +++ b/include/circuit_breaker/circuit_breaker_manager.h @@ -25,7 +25,7 @@ namespace circuit_breaker { // upstream policy). This makes GetHost lock-free after construction, // which is critical for the hot path. // -// Hot-reload (Phase 8): only `circuit_breaker` sub-fields on EXISTING +// Hot-reload: only `circuit_breaker` sub-fields on EXISTING // upstream services can be live-reloaded. New or removed service names // log a warn and are skipped — the caller (HttpServer::Reload) still // fires the "restart required" diagnostic in that case. diff --git a/include/circuit_breaker/circuit_breaker_state.h b/include/circuit_breaker/circuit_breaker_state.h index 6a758a57..92872f8b 100644 --- a/include/circuit_breaker/circuit_breaker_state.h +++ b/include/circuit_breaker/circuit_breaker_state.h @@ -51,7 +51,7 @@ enum class FailureKind : uint8_t { // `trigger` is a short static string such as "consecutive" / "rate" / // "probe_success" / "probe_fail" / "open_elapsed" for logging. // -// TODO(phase-7): once a snapshot / admin JSON endpoint lands, convert +// TODO(post-v1): once a snapshot / admin JSON endpoint lands, convert // `trigger` to an `enum class TransitionTrigger` so the valid set is // compile-time checked rather than string-compared. See design doc §15.8. 
using StateTransitionCallback = diff --git a/include/circuit_breaker/retry_budget.h b/include/circuit_breaker/retry_budget.h index 12782d9e..f8392013 100644 --- a/include/circuit_breaker/retry_budget.h +++ b/include/circuit_breaker/retry_budget.h @@ -32,7 +32,7 @@ namespace circuit_breaker { // relaxed — snapshots can be slightly stale, which is fine for a // capacity gate on a retry storm. // -// Usage (Phase 5 wires this in): +// Usage: // 1. On every attempt (first or retry), call TrackInFlight() and keep // the returned guard alive until the attempt completes. The guard // decrements in_flight_ in its destructor. diff --git a/include/config/server_config.h b/include/config/server_config.h index 8a8e8ed4..1f4c7f59 100644 --- a/include/config/server_config.h +++ b/include/config/server_config.h @@ -154,7 +154,8 @@ struct CircuitBreakerConfig { // Retry budget (orthogonal to the breaker). Caps concurrent retries to // max(retry_budget_min_concurrency, in_flight * retry_budget_percent/100). - // Wired into the request path in Phase 5; in Phase 3 these are read by + // Wired into the request path via ProxyTransaction's retry-budget + // gate in MaybeRetry; also read by // CircuitBreakerHost to construct its owned RetryBudget. int retry_budget_percent = 20; int retry_budget_min_concurrency = 3; @@ -185,20 +186,19 @@ struct UpstreamConfig { ProxyConfig proxy; CircuitBreakerConfig circuit_breaker; - // Includes circuit_breaker until Phase 8 ships CircuitBreakerManager::Reload. - // A CB-only SIGHUP currently has no propagation path into live slice state, - // so operator== must return false to trigger the "restart required" warning - // rather than silently committing the new config object while the live slices - // continue running with the old settings. + // Excludes `circuit_breaker` — breaker fields are live-reloadable via + // `CircuitBreakerManager::Reload`, which `HttpServer::Reload` invokes on + // every reload. 
Topology fields (name, host, port, tls, pool, + // proxy) remain restart-only; a mismatch here triggers the + // "restart required" warning in the outer reload. // - // TODO(phase-8): once CircuitBreakerManager::Reload is wired into - // HttpServer::Reload, remove circuit_breaker from this operator and diff it - // separately (per-host CircuitBreakerConfig comparison) so breaker-only - // edits are hot-reloadable without a restart. + // Contract: a config pair that differs ONLY in circuit_breaker fields + // must compare EQUAL so the outer reload doesn't fire a spurious warn. + // Any future field whose propagation path is wired into a live + // `*Manager::Reload` should be removed from this operator symmetrically. bool operator==(const UpstreamConfig& o) const { return name == o.name && host == o.host && port == o.port && - tls == o.tls && pool == o.pool && proxy == o.proxy && - circuit_breaker == o.circuit_breaker; + tls == o.tls && pool == o.pool && proxy == o.proxy; } bool operator!=(const UpstreamConfig& o) const { return !(*this == o); } }; diff --git a/include/upstream/pool_partition.h b/include/upstream/pool_partition.h index d23904ab..a6d904b2 100644 --- a/include/upstream/pool_partition.h +++ b/include/upstream/pool_partition.h @@ -25,8 +25,8 @@ class PoolPartition { static constexpr int CHECKOUT_CONNECT_TIMEOUT = -3; static constexpr int CHECKOUT_SHUTTING_DOWN = -4; static constexpr int CHECKOUT_QUEUE_TIMEOUT = -5; - // Delivered to wait-queue waiters drained on a breaker trip (Phase 6 - // implements the drain path). ProxyTransaction::OnCheckoutError maps + // Delivered to wait-queue waiters drained on a breaker trip by + // DrainWaitQueueOnTrip. ProxyTransaction::OnCheckoutError maps // this to RESULT_CIRCUIT_OPEN so the queued client gets the same // circuit-open response a fresh requester would get. 
static constexpr int CHECKOUT_CIRCUIT_OPEN = -6; diff --git a/include/upstream/proxy_transaction.h b/include/upstream/proxy_transaction.h index cded9b71..ccda6d24 100644 --- a/include/upstream/proxy_transaction.h +++ b/include/upstream/proxy_transaction.h @@ -34,10 +34,9 @@ class ProxyTransaction : public std::enable_shared_from_this { // Carries Retry-After + X-Circuit-Breaker headers (§12.1). // Terminal — retry loop MUST NOT retry this outcome (§8). static constexpr int RESULT_CIRCUIT_OPEN = -7; - // Retry budget exhausted (Phase 5 wires the actual gate; the code is - // reserved here so MakeErrorResponse and the retry loop both know it - // exists and terminal-classify it). No Retry-After; distinct header - // X-Retry-Budget-Exhausted so operators can tell the two 503s apart. + // Retry budget exhausted. No Retry-After; distinct header + // X-Retry-Budget-Exhausted so operators can tell the two 503s apart + // from circuit-open rejects. static constexpr int RESULT_RETRY_BUDGET_EXHAUSTED = -8; // Constructor copies all needed fields from client_request (method, path, @@ -159,7 +158,7 @@ class ProxyTransaction : public std::enable_shared_from_this { // Timing std::chrono::steady_clock::time_point start_time_; - // Circuit breaker integration (Phase 4). Resolved once in Start() from + // Circuit breaker integration — resolved once in Start() from // `service_name_` + `dispatcher_index_`. Null when there's no // CircuitBreakerManager attached (server has no upstreams, or the // breaker is being built lazily) — the breaker is simply skipped in @@ -232,20 +231,17 @@ class ProxyTransaction : public std::enable_shared_from_this { // plain 503 for those codes if called generically. static HttpResponse MakeErrorResponse(int result_code); - // Phase 4: emit the §12.1 circuit-open response. 
+ // Emit the circuit-open response (design §12.1): // 503 + Retry-After (seconds until slice->OpenUntil()) // + X-Circuit-Breaker: open // + X-Upstream-Host: service:host:port HttpResponse MakeCircuitOpenResponse() const; - // Phase 5 will emit this. Declared here so Phase 4's - // MakeErrorResponse RESULT_RETRY_BUDGET_EXHAUSTED branch has a - // target to dispatch to and so tests can assert the response shape - // even before the retry-budget gate is wired. + // Emit the retry-budget-exhausted response (design §12.2): // 503 + X-Retry-Budget-Exhausted: 1 static HttpResponse MakeRetryBudgetResponse(); - // Phase 4 helpers — breaker gate and outcome classification. + // Breaker helpers — gate and outcome classification. // // ConsultBreaker: call at the top of AttemptCheckout. Populates // admission_generation_ and is_probe_ on admission; delivers the diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index d7e8ad07..1ff6e00e 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -134,7 +134,7 @@ void CircuitBreakerSlice::TripClosedToOpen(const char* trigger) { void CircuitBreakerSlice::TransitionOpenToHalfOpen() { state_.store(State::HALF_OPEN, std::memory_order_release); // Clear open_until_steady_ns_ per the OpenUntil() contract ("zero when - // not OPEN"). Leaving a stale deadline here would cause Phase 4's + // not OPEN"). Leaving a stale deadline here would cause // ProxyTransaction::MakeCircuitOpenResponse to compute a Retry-After // from a past time_point (negative delta → floor at 1s, misleading for // a reject in the HALF_OPEN probe-budget-full path). 
Retry-After for @@ -384,7 +384,7 @@ void CircuitBreakerSlice::ReportSuccess(bool probe, // Stale probe defense: we admitted this probe in HALF_OPEN, but the // slice may have transitioned out (e.g., `Reload()` flipped enabled, // `TransitionHalfOpenToClosed` already fired on sibling probes, or — - // post-Phase 8 — an operator toggle transitioned us to CLOSED). + // operator toggle transitioned us to CLOSED via Reload(). // Only touch HALF_OPEN bookkeeping / fire transitions when state is // STILL HALF_OPEN. if (state_.load(std::memory_order_acquire) != State::HALF_OPEN) return; @@ -581,7 +581,8 @@ void CircuitBreakerSlice::Reload(const CircuitBreakerConfig& new_config) { // Silent reset — no transition callback. The change is operator- // initiated configuration, not a runtime state signal; firing the // callback would cause PoolPartition::DrainWaitQueueOnTrip-style - // consumers (Phase 6) to spuriously drain waiters on a config edit. + // consumers (the wait-queue drain transition callback) to spuriously + // drain waiters on a config edit. state_.store(State::CLOSED, std::memory_order_release); open_until_steady_ns_.store(0, std::memory_order_release); consecutive_trips_.store(0, std::memory_order_relaxed); diff --git a/server/http_server.cc b/server/http_server.cc index ccd8b80f..c47b7688 100644 --- a/server/http_server.cc +++ b/server/http_server.cc @@ -3454,13 +3454,41 @@ bool HttpServer::Reload(const ServerConfig& new_config) { rate_limit_manager_->Reload(new_config.rate_limit); } - // Upstream pool changes require a restart — pools are built once in Start() - // and cannot be rebuilt at runtime without a full drain cycle. + // Circuit breaker reload — live-propagates breaker-field edits on + // existing upstream services. CircuitBreakerManager::Reload is + // idempotent (atomic stores to unchanged values), so calling it + // unconditionally costs nothing when the operator didn't edit any + // breaker fields. 
Topology changes (added / removed service names) + // are logged as warn + skipped inside the manager; the outer + // restart-required warning still fires via the upstreams-inequality + // check below. After this call, update the breaker slices on every + // partition via per-dispatcher EnQueue — the manager handles that + // routing internally. The topology check itself now only diffs non- + // breaker fields (UpstreamConfig::operator== excludes circuit_breaker), + // so a CB-only SIGHUP is a clean hot reload with no spurious warn. + if (circuit_breaker_manager_) { + circuit_breaker_manager_->Reload(new_config.upstreams); + } + + // Upstream topology changes (host/port/pool/proxy/tls) require a + // restart — pools are built once in Start() and cannot be rebuilt + // at runtime without a full drain cycle. The equality operator on + // UpstreamConfig deliberately excludes `circuit_breaker` so a CB- + // only edit doesn't trigger this warning (the reload above already + // applied the new breaker settings to live slices). if (new_config.upstreams != upstream_configs_) { - logging::Get()->warn("Reload: upstream configuration changes require a " - "restart to take effect (ignored)"); + logging::Get()->warn("Reload: upstream topology changes require a " + "restart to take effect (circuit-breaker " + "field edits, if any, were applied live)"); } + // Persist the new upstreams (preserving the breaker propagation just + // applied). Subsequent reloads diff against this baseline, so without + // this update a second SIGHUP would re-propagate the same CB values + // and also see the original topology as "unchanged" rather than the + // attempted new state — confusing operators debugging reload behavior. 
+ + upstream_configs_ = new_config.upstreams; + return true; } diff --git a/server/main.cc b/server/main.cc index 86f7598d..0d7474e9 100644 --- a/server/main.cc +++ b/server/main.cc @@ -435,7 +435,7 @@ static bool ReloadConfig(const std::string& config_path, // server keeps running the startup values — /stats and other // current_config consumers would report phantom state, and subsequent // identical reloads could produce inconsistent diagnostics. Pin to - // the running values until Phase 8 implements + // the running values until HttpServer::Reload wires in // CircuitBreakerManager::Reload (the only upstream sub-field that // becomes hot-reloadable); at that point this save becomes a // partial-field save excluding circuit_breaker. diff --git a/server/proxy_transaction.cc b/server/proxy_transaction.cc index 5cff47b6..29dbe550 100644 --- a/server/proxy_transaction.cc +++ b/server/proxy_transaction.cc @@ -114,8 +114,8 @@ void ProxyTransaction::Start() { method_, upstream_path); // Resolve the circuit-breaker slice once. Null when no breaker is - // attached (server has no upstreams configured, or Phase 4 skipped - // on this deployment), or when the service/dispatcher pair is out of + // attached (server has no upstreams configured), or when the + // service/dispatcher pair is out of // range. In any null case the breaker is simply bypassed — the // transaction proceeds as if circuit breaking were disabled. if (upstream_manager_ && dispatcher_index_ >= 0) { @@ -277,7 +277,8 @@ void ProxyTransaction::OnCheckoutError(int error_code) { // (POOL_EXHAUSTED, QUEUE_TIMEOUT) and shutdown should fail fast — // retrying under backpressure amplifies load on an already-stressed // pool and stretches client latency with no benefit. 
A breaker-drain - // reject (CHECKOUT_CIRCUIT_OPEN, Phase 6) is also terminal: the + // reject (CHECKOUT_CIRCUIT_OPEN from the wait-queue drain) is also + // terminal: the // client gets the same circuit-open response a fresh requester // would, and the retry loop must not retry it. // @@ -301,7 +302,7 @@ void ProxyTransaction::OnCheckoutError(int error_code) { if (error_code == CIRCUIT_OPEN) { // Drain path: breaker tripped while this transaction was queued - // (Phase 6 implements the drain). Do NOT Report to the slice — + // Do NOT Report to the slice — // our own reject must not feed back into the failure math. Emit // the §12.1 circuit-open response directly. logging::Get()->info( diff --git a/test/circuit_breaker_components_test.h b/test/circuit_breaker_components_test.h new file mode 100644 index 00000000..36285b16 --- /dev/null +++ b/test/circuit_breaker_components_test.h @@ -0,0 +1,507 @@ +#pragma once + +#include "test_framework.h" +#include "config/server_config.h" +#include "circuit_breaker/circuit_breaker_state.h" +#include "circuit_breaker/circuit_breaker_slice.h" +#include "circuit_breaker/retry_budget.h" +#include "circuit_breaker/circuit_breaker_host.h" +#include "circuit_breaker/circuit_breaker_manager.h" +#include "dispatcher.h" + +#include +#include +#include +#include + +// Circuit-breaker component unit tests: RetryBudget, CircuitBreakerHost, +// CircuitBreakerManager. +// +// These tests exercise the standalone data structures without any +// integration into the request path (covered by the integration suite). +// Every test constructs the object under test in isolation — no live +// dispatchers, no network I/O. A minimal Dispatcher is instantiated only +// where CircuitBreakerHost::Reload needs one to enqueue per-slice Reload +// calls. 
+namespace CircuitBreakerComponentsTests { + +using circuit_breaker::CircuitBreakerHost; +using circuit_breaker::CircuitBreakerHostSnapshot; +using circuit_breaker::CircuitBreakerManager; +using circuit_breaker::Decision; +using circuit_breaker::FailureKind; +using circuit_breaker::RetryBudget; +using circuit_breaker::State; + +static CircuitBreakerConfig DefaultCbConfig() { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 5; + cb.failure_rate_threshold = 50; + cb.minimum_volume = 20; + cb.window_seconds = 10; + cb.permitted_half_open_calls = 3; + cb.base_open_duration_ms = 5000; + cb.max_open_duration_ms = 60000; + cb.retry_budget_percent = 20; + cb.retry_budget_min_concurrency = 3; + return cb; +} + +// ============================================================================ +// RetryBudget tests +// ============================================================================ + +// Min-concurrency floor: with tiny in_flight, min_concurrency still permits +// the configured floor of concurrent retries (otherwise a 20% budget allows 0 +// retries when in_flight < 5 — useless in low-volume services). +void TestRetryBudgetMinConcurrencyFloor() { + std::cout << "\n[TEST] RetryBudget: min_concurrency floor permits retries..." + << std::endl; + try { + // percent=20, min=3. Even with 0 in_flight, 3 retries allowed. + RetryBudget rb(20, 3); + + // Without any in_flight, min floor is what gates us. + bool r1 = rb.TryConsumeRetry(); // 1/3 + bool r2 = rb.TryConsumeRetry(); // 2/3 + bool r3 = rb.TryConsumeRetry(); // 3/3 + bool r4 = rb.TryConsumeRetry(); // over → rejected + + bool pass = r1 && r2 && r3 && !r4 && + rb.RetriesInFlight() == 3 && + rb.RetriesRejected() == 1; + + rb.ReleaseRetry(); rb.ReleaseRetry(); rb.ReleaseRetry(); + pass = pass && rb.RetriesInFlight() == 0; + + TestFramework::RecordTest("RetryBudget min_concurrency floor", pass, + pass ? 
"" : "r1=" + std::to_string(r1) +
+ " r2=" + std::to_string(r2) +
+ " r3=" + std::to_string(r3) +
+ " r4=" + std::to_string(r4) +
+ " inflight=" + std::to_string(rb.RetriesInFlight()) +
+ " rejected=" + std::to_string(rb.RetriesRejected()),
+ TestFramework::TestCategory::OTHER);
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest("RetryBudget min_concurrency floor", false,
+ e.what(), TestFramework::TestCategory::OTHER);
+ }
+}
+
+// Percent-based cap scales with in_flight.
+// percent=20, min=0, in_flight=50 → cap = 10 retries.
+void TestRetryBudgetPercentCap() {
+ std::cout << "\n[TEST] RetryBudget: percent cap scales with in_flight..."
+ << std::endl;
+ try {
+ RetryBudget rb(20, 0); // no min floor — pure percent
+
+ // Push in_flight to 50 via guards that we intentionally keep
+ // alive. Per the documented API, callers hold TrackInFlight()
+ // for BOTH first attempts and retries — but TryConsumeRetry
+ // subtracts retries_in_flight from the base so the budget
+ // doesn't self-inflate as retries are admitted.
+ std::vector<RetryBudget::InFlightGuard> guards;
+ for (int i = 0; i < 50; ++i) guards.push_back(rb.TrackInFlight());
+
+ // With 50 non-retry in-flight and 20% budget the first
+ // admission is against cap=10, but each admission shrinks the
+ // non-retry base by 1. The admission count converges at r
+ // where r >= floor((50-r) * 20 / 100). Solving: r = 8. The
+ // pre-fix formula (cap computed from raw in_flight) would
+ // admit 10, drifting the effective ratio above 20% of
+ // originals.
+ int admitted = 0;
+ for (int i = 0; i < 20; ++i) {
+ if (rb.TryConsumeRetry()) ++admitted;
+ }
+ bool cap_hit = admitted == 8;
+ bool rejected_count = rb.RetriesRejected() == 12;
+
+ // Release guards — in_flight drops to 0; future TryConsumeRetry with
+ // min=0 and in_flight=0 rejects everything. 
+ for (auto& g : guards) (void)std::move(g); + guards.clear(); + for (int i = 0; i < admitted; ++i) rb.ReleaseRetry(); + + bool pass = cap_hit && rejected_count && rb.InFlight() == 0 && + rb.RetriesInFlight() == 0; + TestFramework::RecordTest("RetryBudget percent cap", pass, + pass ? "" : "admitted=" + std::to_string(admitted) + + " rejected=" + std::to_string(rb.RetriesRejected()) + + " inflight=" + std::to_string(rb.InFlight()), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("RetryBudget percent cap", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +// TrackInFlight guards must be RAII-safe: destroying the guard decrements +// in_flight_; moving the guard transfers ownership; self-move safe. +void TestRetryBudgetInFlightGuardRaii() { + std::cout << "\n[TEST] RetryBudget: InFlightGuard RAII..." << std::endl; + try { + RetryBudget rb(20, 3); + + bool zero_init = rb.InFlight() == 0; + { + auto g = rb.TrackInFlight(); + bool one_after_track = rb.InFlight() == 1; + + // Move-construct: counter transfers, original is empty. + auto g2 = std::move(g); + bool still_one_after_move = rb.InFlight() == 1; + // g is now empty, destroying it decrements nothing. + (void)g; + + // g2 goes out of scope next. + if (!zero_init || !one_after_track || !still_one_after_move) { + TestFramework::RecordTest("RetryBudget InFlightGuard RAII", + false, "mid-test state wrong", + TestFramework::TestCategory::OTHER); + return; + } + } + bool zero_after_drop = rb.InFlight() == 0; + TestFramework::RecordTest("RetryBudget InFlightGuard RAII", + zero_after_drop, + zero_after_drop ? 
"" : "in_flight not zero after guard drop", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("RetryBudget InFlightGuard RAII", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Reload updates tuning atomically without resetting in-flight counters — +// the admission formula changes, outstanding retries keep running. +void TestRetryBudgetReloadPreservesCounters() { + std::cout << "\n[TEST] RetryBudget: Reload preserves in-flight..." + << std::endl; + try { + RetryBudget rb(20, 3); + bool r1 = rb.TryConsumeRetry(); // 1/3 + + // Tighten tuning mid-flight. + rb.Reload(10, 1); + + // Outstanding retry is still tracked. + bool inflight_preserved = rb.RetriesInFlight() == 1; + + // New tuning applies — min=1, so 1/1 retry allowed max. + // Current retries_in_flight=1 already, next attempt rejects. + bool r2 = rb.TryConsumeRetry(); + + rb.ReleaseRetry(); + bool cleanup_ok = rb.RetriesInFlight() == 0; + + bool pass = r1 && inflight_preserved && !r2 && cleanup_ok; + TestFramework::RecordTest("RetryBudget Reload preserves counters", pass, + pass ? "" : "r1=" + std::to_string(r1) + + " inflight_preserved=" + std::to_string(inflight_preserved) + + " r2=" + std::to_string(r2) + + " cleanup_ok=" + std::to_string(cleanup_ok), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("RetryBudget Reload preserves counters", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Clamp guards: negative percent / negative min_concurrency are clamped at +// construction (mirrors ConfigLoader::Validate — programmatic callers that +// bypass validation get safe defaults). +void TestRetryBudgetClampsInvalidTuning() { + std::cout << "\n[TEST] RetryBudget: clamps invalid tuning..." << std::endl; + try { + RetryBudget rb(-50, -10); + bool clamped = rb.percent() == 0 && rb.min_concurrency() == 0; + + // Over-max percent clamps to 100. 
+ RetryBudget rb2(500, 5); + bool over_clamped = rb2.percent() == 100; + + // Reload also clamps. + rb.Reload(-1, -1); + bool reload_clamped = rb.percent() == 0 && rb.min_concurrency() == 0; + + bool pass = clamped && over_clamped && reload_clamped; + TestFramework::RecordTest("RetryBudget clamps invalid tuning", pass, + pass ? "" : + "clamped=" + std::to_string(clamped) + + " over_clamped=" + std::to_string(over_clamped) + + " reload_clamped=" + std::to_string(reload_clamped), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("RetryBudget clamps invalid tuning", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// ============================================================================ +// CircuitBreakerHost tests +// ============================================================================ + +// Host creates partition_count slices, GetSlice looks up by index, out-of- +// range returns nullptr (not a crash). +void TestHostCreatesSlicesAndGetSlice() { + std::cout << "\n[TEST] CircuitBreakerHost: creates slices + GetSlice..." + << std::endl; + try { + auto cb = DefaultCbConfig(); + CircuitBreakerHost host("svc", "10.0.0.1", 8080, 4, cb); + + bool count_ok = host.partition_count() == 4; + bool slice0 = host.GetSlice(0) != nullptr; + bool slice3 = host.GetSlice(3) != nullptr; + bool slice4_null = host.GetSlice(4) == nullptr; // out of range + bool slice_big_null = host.GetSlice(100) == nullptr; + + // Retry budget always present. + bool rb_present = host.GetRetryBudget() != nullptr; + + // Field getters. 
+ bool fields_ok = host.service_name() == "svc" && + host.host() == "10.0.0.1" && + host.port() == 8080; + + bool pass = count_ok && slice0 && slice3 && slice4_null && + slice_big_null && rb_present && fields_ok; + TestFramework::RecordTest("CircuitBreakerHost GetSlice", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CircuitBreakerHost GetSlice", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Host Snapshot aggregates counters across slices and rolls up states. +void TestHostSnapshotAggregates() { + std::cout << "\n[TEST] CircuitBreakerHost: Snapshot aggregates..." + << std::endl; + try { + auto cb = DefaultCbConfig(); + cb.consecutive_failure_threshold = 2; + cb.failure_rate_threshold = 100; + cb.minimum_volume = 1000; + CircuitBreakerHost host("svc", "h", 80, 3, cb); + + // Trip slice 0 and 2 → 2 open_partitions, 1 closed. + for (int p : {0, 2}) { + auto* s = host.GetSlice(p); + for (int i = 0; i < 2; ++i) { + auto a = s->TryAcquire(); + s->ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); + } + } + + auto snap = host.Snapshot(); + + bool rows_ok = snap.slices.size() == 3; + bool total_trips = snap.total_trips == 2; + bool open = snap.open_partitions == 2; + bool halfopen = snap.half_open_partitions == 0; + bool svc_ok = snap.service_name == "svc" && + snap.host == "h" && snap.port == 80; + + bool pass = rows_ok && total_trips && open && halfopen && svc_ok; + TestFramework::RecordTest("CircuitBreakerHost Snapshot aggregates", pass, + pass ? 
"" :
+ "rows=" + std::to_string(snap.slices.size()) +
+ " trips=" + std::to_string(snap.total_trips) +
+ " open=" + std::to_string(snap.open_partitions),
+ TestFramework::TestCategory::OTHER);
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest("CircuitBreakerHost Snapshot aggregates",
+ false, e.what(), TestFramework::TestCategory::OTHER);
+ }
+}
+
+// Host Reload with mismatched dispatcher count logs error and does nothing.
+// Uses an empty dispatcher vector — the mismatch path must NOT dereference.
+void TestHostReloadDispatcherMismatchIsSafe() {
+ std::cout << "\n[TEST] CircuitBreakerHost: Reload dispatcher mismatch..."
+ << std::endl;
+ try {
+ auto cb = DefaultCbConfig();
+ CircuitBreakerHost host("svc", "h", 80, 3, cb);
+
+ auto new_cb = cb;
+ new_cb.failure_rate_threshold = 80;
+
+ // Mismatch: 0 dispatchers vs 3 slices. Must not crash, must not
+ // apply (retry budget atomics should stay at old values).
+ std::vector<std::shared_ptr<Dispatcher>> empty;
+ host.Reload(empty, new_cb);
+
+ // Retry budget fields should be unchanged — Reload bailed early.
+ bool rb_unchanged =
+ host.GetRetryBudget()->percent() == cb.retry_budget_percent &&
+ host.GetRetryBudget()->min_concurrency() ==
+ cb.retry_budget_min_concurrency;
+
+ TestFramework::RecordTest("CircuitBreakerHost Reload mismatch is safe",
+ rb_unchanged,
+ rb_unchanged ? "" : "retry budget incorrectly updated on bail",
+ TestFramework::TestCategory::OTHER);
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest("CircuitBreakerHost Reload mismatch is safe",
+ false, e.what(), TestFramework::TestCategory::OTHER);
+ }
+}
+
+// ============================================================================
+// CircuitBreakerManager tests
+// ============================================================================
+
+// Manager builds one host per upstream (regardless of enabled). GetHost
+// returns non-null for known names and null for unknown. 
+void TestManagerGetHostLookup() {
+ std::cout << "\n[TEST] CircuitBreakerManager: GetHost lookup..."
+ << std::endl;
+ try {
+ std::vector<UpstreamConfig> upstreams(2);
+ upstreams[0].name = "svc-a";
+ upstreams[0].host = "10.0.0.1";
+ upstreams[0].port = 8080;
+ upstreams[0].circuit_breaker = DefaultCbConfig();
+ upstreams[1].name = "svc-b";
+ upstreams[1].host = "10.0.0.2";
+ upstreams[1].port = 9090;
+ upstreams[1].circuit_breaker = DefaultCbConfig();
+ upstreams[1].circuit_breaker.enabled = false; // disabled still built
+
+ CircuitBreakerManager mgr(upstreams, 4, {});
+
+ bool count_ok = mgr.host_count() == 2;
+ auto* a = mgr.GetHost("svc-a");
+ auto* b = mgr.GetHost("svc-b");
+ auto* unknown = mgr.GetHost("nope");
+
+ bool a_ok = a != nullptr && a->port() == 8080 &&
+ a->partition_count() == 4;
+ bool b_ok = b != nullptr && b->port() == 9090 &&
+ b->partition_count() == 4;
+ bool unknown_null = unknown == nullptr;
+
+ bool pass = count_ok && a_ok && b_ok && unknown_null;
+ TestFramework::RecordTest("CircuitBreakerManager GetHost lookup", pass,
+ pass ? "" :
+ "count_ok=" + std::to_string(count_ok) +
+ " a=" + std::to_string(a_ok) +
+ " b=" + std::to_string(b_ok) +
+ " unknown_null=" + std::to_string(unknown_null),
+ TestFramework::TestCategory::OTHER);
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest("CircuitBreakerManager GetHost lookup",
+ false, e.what(), TestFramework::TestCategory::OTHER);
+ }
+}
+
+// SnapshotAll returns one entry per host; topology-preserved Reload logs and
+// skips new/removed names without crashing.
+void TestManagerSnapshotAllAndReloadSkipsTopologyChanges() {
+ std::cout << "\n[TEST] CircuitBreakerManager: SnapshotAll + Reload skips topology..." 
+ << std::endl;
+ try {
+ std::vector<UpstreamConfig> upstreams(1);
+ upstreams[0].name = "svc-a";
+ upstreams[0].host = "h";
+ upstreams[0].port = 80;
+ upstreams[0].circuit_breaker = DefaultCbConfig();
+
+ CircuitBreakerManager mgr(upstreams, 2, {});
+
+ auto snaps = mgr.SnapshotAll();
+ bool one_snapshot = snaps.size() == 1;
+ bool snap_name_ok = snaps[0].service_name == "svc-a";
+
+ // Reload with a NEW name + REMOVED existing name — both must log
+ // warn and do nothing (topology is restart-only).
+ std::vector<UpstreamConfig> new_upstreams(1);
+ new_upstreams[0].name = "svc-NEW";
+ new_upstreams[0].host = "h";
+ new_upstreams[0].port = 80;
+ new_upstreams[0].circuit_breaker = DefaultCbConfig();
+
+ mgr.Reload(new_upstreams);
+
+ // Manager must still only know about svc-a (the original).
+ bool original_preserved = mgr.GetHost("svc-a") != nullptr;
+ bool new_not_added = mgr.GetHost("svc-NEW") == nullptr;
+ bool count_stable = mgr.host_count() == 1;
+
+ bool pass = one_snapshot && snap_name_ok && original_preserved &&
+ new_not_added && count_stable;
+ TestFramework::RecordTest(
+ "CircuitBreakerManager SnapshotAll + topology-skip", pass,
+ pass ? "" :
+ "one_snap=" + std::to_string(one_snapshot) +
+ " name_ok=" + std::to_string(snap_name_ok) +
+ " preserved=" + std::to_string(original_preserved) +
+ " new_not_added=" + std::to_string(new_not_added) +
+ " count=" + std::to_string(mgr.host_count()),
+ TestFramework::TestCategory::OTHER);
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest(
+ "CircuitBreakerManager SnapshotAll + topology-skip",
+ false, e.what(), TestFramework::TestCategory::OTHER);
+ }
+}
+
+// Empty-name upstream is skipped defensively (ConfigLoader::Validate rejects
+// empty names, but manager must not blow up if something slips through).
+void TestManagerSkipsEmptyNameUpstream() {
+ std::cout << "\n[TEST] CircuitBreakerManager: skips empty-name upstream..." 
+ << std::endl;
+ try {
+ std::vector<UpstreamConfig> upstreams(2);
+ upstreams[0].name = ""; // defensive — should be skipped
+ upstreams[0].host = "h";
+ upstreams[0].port = 80;
+ upstreams[0].circuit_breaker = DefaultCbConfig();
+ upstreams[1].name = "svc-b";
+ upstreams[1].host = "h";
+ upstreams[1].port = 81;
+ upstreams[1].circuit_breaker = DefaultCbConfig();
+
+ CircuitBreakerManager mgr(upstreams, 2, {});
+
+ bool pass = mgr.host_count() == 1 &&
+ mgr.GetHost("svc-b") != nullptr &&
+ mgr.GetHost("") == nullptr;
+ TestFramework::RecordTest(
+ "CircuitBreakerManager skips empty-name upstream", pass,
+ pass ? "" : "count=" + std::to_string(mgr.host_count()),
+ TestFramework::TestCategory::OTHER);
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest(
+ "CircuitBreakerManager skips empty-name upstream",
+ false, e.what(), TestFramework::TestCategory::OTHER);
+ }
+}
+
+// Run all circuit-breaker component unit tests.
+void RunAllTests() {
+ std::cout << "\n" << std::string(60, '=') << std::endl;
+ std::cout << "CIRCUIT BREAKER - COMPONENT UNIT TESTS" << std::endl;
+ std::cout << std::string(60, '=') << std::endl;
+
+ TestRetryBudgetMinConcurrencyFloor();
+ TestRetryBudgetPercentCap();
+ TestRetryBudgetInFlightGuardRaii();
+ TestRetryBudgetReloadPreservesCounters();
+ TestRetryBudgetClampsInvalidTuning();
+
+ TestHostCreatesSlicesAndGetSlice();
+ TestHostSnapshotAggregates();
+ TestHostReloadDispatcherMismatchIsSafe();
+
+ TestManagerGetHostLookup();
+ TestManagerSnapshotAllAndReloadSkipsTopologyChanges();
+ TestManagerSkipsEmptyNameUpstream();
+}
+
+} // namespace CircuitBreakerComponentsTests
diff --git a/test/circuit_breaker_integration_test.h b/test/circuit_breaker_integration_test.h
new file mode 100644
index 00000000..10e72e5b
--- /dev/null
+++ b/test/circuit_breaker_integration_test.h
@@ -0,0 +1,1213 @@
+#pragma once
+
+// Integration tests: circuit breaker wired into ProxyTransaction +
+// UpstreamManager + HttpServer. 
Exercises the full request path end-to-end. +// +// Strategy: use a backend that returns 5xx on every request so repeated hits +// trip the breaker via the consecutive-failure threshold. 5xx responses are +// the cheapest way to accumulate failures (no connect timeouts to wait for). +// Low thresholds keep tests fast. + +#include "test_framework.h" +#include "test_server_runner.h" +#include "http_test_client.h" +#include "http/http_server.h" +#include "config/server_config.h" +#include "upstream/upstream_manager.h" +#include "circuit_breaker/circuit_breaker_manager.h" +#include "circuit_breaker/circuit_breaker_host.h" +#include "circuit_breaker/circuit_breaker_slice.h" + +#include +#include +#include + +namespace CircuitBreakerIntegrationTests { + +using circuit_breaker::State; + +// Shared helper: build an upstream config that proxies /echo → backend and +// has a breaker configured with low thresholds for fast trip. +static UpstreamConfig MakeBreakerUpstream(const std::string& name, + const std::string& host, + int port, + bool breaker_enabled, + int consecutive_threshold = 3) { + UpstreamConfig u; + u.name = name; + u.host = host; + u.port = port; + u.pool.max_connections = 8; + u.pool.max_idle_connections = 4; + u.pool.connect_timeout_ms = 3000; + u.pool.idle_timeout_sec = 30; + u.pool.max_lifetime_sec = 3600; + u.pool.max_requests_per_conn = 0; + + // Exact-match route — simpler than prefix patterns for integration tests. + u.proxy.route_prefix = "/fail"; + u.proxy.strip_prefix = false; + u.proxy.response_timeout_ms = 2000; + // No retries — keeps the test deterministic: one request = one attempt. + u.proxy.retry.max_retries = 0; + + u.circuit_breaker.enabled = breaker_enabled; + u.circuit_breaker.consecutive_failure_threshold = consecutive_threshold; + // Disable the rate-based trip path — we drive everything through + // consecutive failures to keep the test count predictable. 
+ u.circuit_breaker.failure_rate_threshold = 100; + u.circuit_breaker.minimum_volume = 10000; + u.circuit_breaker.window_seconds = 10; + u.circuit_breaker.permitted_half_open_calls = 2; + u.circuit_breaker.base_open_duration_ms = 500; // short so recovery test is quick + u.circuit_breaker.max_open_duration_ms = 60000; + return u; +} + +// --------------------------------------------------------------------------- +// Test 1: Breaker trips on consecutive 5xx responses and emits circuit-open +// headers on the rejected request. +// --------------------------------------------------------------------------- +void TestBreakerTripsAfterConsecutiveFailures() { + std::cout << "\n[TEST] CB Integration: breaker trips after consecutive 5xx..." + << std::endl; + try { + // Backend always returns 502 — gateway classifies the response as + // FailureKind::RESPONSE_5XX and reports to the breaker on every attempt. + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("upstream err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + // worker_threads=1 → all TCP connections land on dispatcher 0 + // (NetServer shards new connections by fd%worker_threads), so + // per-request failures accumulate deterministically on slice[0] + // instead of splitting across multiple slices. 
// single thread → single breaker partition exercised + gw.upstreams.push_back( + MakeBreakerUpstream("bad-svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Hit the failing backend threshold times — each 502 from backend + // propagates to the client as 502 (gateway pass-through) AND counts + // as a RESPONSE_5XX failure in the breaker. + for (int i = 0; i < 3; ++i) { + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + if (!TestHttpClient::HasStatus(r, 502)) { + TestFramework::RecordTest( + "CB Integration: trip after consecutive failures", false, + "pre-trip request " + std::to_string(i) + " expected 502, got: " + + r.substr(0, 32)); + return; + } + } + + // Next request must be rejected by the breaker (not proxied). The + // response is 503 with X-Circuit-Breaker: open and Retry-After. + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + bool is_503 = TestHttpClient::HasStatus(r, 503); + bool has_breaker_header = + r.find("X-Circuit-Breaker: open") != std::string::npos || + r.find("x-circuit-breaker: open") != std::string::npos; + bool has_retry_after = + r.find("Retry-After:") != std::string::npos || + r.find("retry-after:") != std::string::npos; + bool has_upstream_host = + r.find("X-Upstream-Host:") != std::string::npos || + r.find("x-upstream-host:") != std::string::npos; + + bool pass = is_503 && has_breaker_header && has_retry_after && + has_upstream_host; + TestFramework::RecordTest( + "CB Integration: trip after consecutive failures", pass, + pass ? 
"" :
+ "is_503=" + std::to_string(is_503) +
+ " breaker_hdr=" + std::to_string(has_breaker_header) +
+ " retry_after=" + std::to_string(has_retry_after) +
+ " upstream_host=" + std::to_string(has_upstream_host) +
+ " body=" + r.substr(0, 256));
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest(
+ "CB Integration: trip after consecutive failures", false, e.what());
+ }
+}
+
+// ---------------------------------------------------------------------------
+// Test 2: When circuit_breaker.enabled=false, the breaker is bypassed entirely.
+// The same failure pattern that would trip an enabled breaker must leave the
+// pass-through path untouched — every request still reaches the backend.
+// ---------------------------------------------------------------------------
+void TestBreakerDisabledPassesThrough() {
+ std::cout << "\n[TEST] CB Integration: disabled breaker passes through..."
+ << std::endl;
+ try {
+ std::atomic<int> backend_hits{0};
+ HttpServer backend("127.0.0.1", 0);
+ backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) {
+ backend_hits.fetch_add(1, std::memory_order_relaxed);
+ resp.Status(502).Body("err", "text/plain");
+ });
+ TestServerRunner backend_runner(backend);
+ int backend_port = backend_runner.GetPort();
+
+ ServerConfig gw;
+ gw.bind_host = "127.0.0.1";
+ gw.bind_port = 0;
+ gw.worker_threads = 1;
+ gw.http2.enabled = false;
+ // worker_threads=1 → all TCP connections land on dispatcher 0
+ // (NetServer shards new connections by fd%worker_threads), so
+ // per-request failures accumulate deterministically on slice[0]
+ // instead of splitting across multiple slices.
+ gw.upstreams.push_back(
+ MakeBreakerUpstream("svc", "127.0.0.1", backend_port,
+ /*enabled=*/false, /*threshold=*/3));
+
+ HttpServer gateway(gw);
+ TestServerRunner gw_runner(gateway);
+ int gw_port = gw_runner.GetPort();
+
+ // 10 requests — with breaker disabled, all 10 reach backend. 
+ for (int i = 0; i < 10; ++i) {
+ std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+ if (!TestHttpClient::HasStatus(r, 502)) {
+ TestFramework::RecordTest(
+ "CB Integration: disabled breaker passes through", false,
+ "request " + std::to_string(i) + " expected 502, got: " +
+ r.substr(0, 32));
+ return;
+ }
+ }
+
+ bool all_hit = backend_hits.load() == 10;
+ TestFramework::RecordTest(
+ "CB Integration: disabled breaker passes through", all_hit,
+ all_hit ? "" :
+ "expected 10 backend hits, got " + std::to_string(backend_hits.load()));
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest(
+ "CB Integration: disabled breaker passes through", false, e.what());
+ }
+}
+
+// ---------------------------------------------------------------------------
+// Test 3: 2xx responses are reported as success — they reset the
+// consecutive-failure counter so the breaker doesn't trip on interleaved
+// success/failure traffic.
+// ---------------------------------------------------------------------------
+void TestSuccessResetsConsecutiveFailureCounter() {
+ std::cout << "\n[TEST] CB Integration: 2xx success resets consecutive-failure counter..."
+ << std::endl;
+ try {
+ std::atomic<bool> fail_mode{true};
+ HttpServer backend("127.0.0.1", 0);
+ // Backend must serve /fail — that's the exact-match route the
+ // proxy forwards (MakeBreakerUpstream sets route_prefix="/fail",
+ // strip_prefix=false). A different backend path would leave
+ // the gateway 404-ing every request without ever exercising
+ // the proxy, and the CLOSED-state assertion below would pass
+ // for the wrong reason. 
+ backend.Get("/fail", [&fail_mode](const HttpRequest&, HttpResponse& resp) { + if (fail_mode.load()) { + resp.Status(502).Body("err", "text/plain"); + } else { + resp.Status(200).Body("ok", "text/plain"); + } + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + // worker_threads=1 → all TCP connections land on dispatcher 0 + // (NetServer shards new connections by fd%worker_threads), so + // per-request failures accumulate deterministically on slice[0] + // instead of splitting across multiple slices. + gw.upstreams.push_back( + MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Pattern: F F S F F — 5 total: 2 fails, 1 success, 2 fails. + // With reset semantics, consecutive_failures_ never exceeds 2 → no trip. + for (int i = 0; i < 2; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); // FAIL + } + fail_mode.store(false); + TestHttpClient::HttpGet(gw_port, "/fail", 3000); // SUCCESS → reset + fail_mode.store(true); + for (int i = 0; i < 2; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); // FAIL + } + + // Inspect the breaker's state directly. The slice must be CLOSED + // AND must have observed activity — without the second check, a + // gateway that 404's every request (e.g. because the proxy route + // doesn't match) would also pass trivially. + auto* cbm = gateway.GetUpstreamManager() ? + gateway.GetUpstreamManager()->GetCircuitBreakerManager() : nullptr; + auto* host = cbm ? cbm->GetHost("svc") : nullptr; + auto* slice = host ? host->GetSlice(0) : nullptr; + bool still_closed = slice && slice->CurrentState() == State::CLOSED; + // No trip fired: total_trips should be zero for this slice. + int64_t trips = slice ? 
slice->Trips() : -1;
+ bool no_trips = (trips == 0);
+
+ bool pass = still_closed && no_trips;
+ TestFramework::RecordTest(
+ "CB Integration: success resets consecutive counter", pass,
+ pass ? "" :
+ "state=" + std::to_string(static_cast<int>(
+ slice ? slice->CurrentState() : State::CLOSED)) +
+ " trips=" + std::to_string(trips));
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest(
+ "CB Integration: success resets consecutive counter", false, e.what());
+ }
+}
+
+// ---------------------------------------------------------------------------
+// Test 4: After the trip, the live slice state is OPEN. Verifies the
+// integration actually drives the slice state machine (not just the response).
+// ---------------------------------------------------------------------------
+void TestTripDrivesSliceState() {
+ std::cout << "\n[TEST] CB Integration: trip drives slice state to OPEN..."
+ << std::endl;
+ try {
+ HttpServer backend("127.0.0.1", 0);
+ backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) {
+ resp.Status(502).Body("err", "text/plain");
+ });
+ TestServerRunner backend_runner(backend);
+ int backend_port = backend_runner.GetPort();
+
+ ServerConfig gw;
+ gw.bind_host = "127.0.0.1";
+ gw.bind_port = 0;
+ gw.worker_threads = 1;
+ gw.http2.enabled = false;
+ // worker_threads=1 → all TCP connections land on dispatcher 0
+ // (NetServer shards new connections by fd%worker_threads), so
+ // per-request failures accumulate deterministically on slice[0]
+ // instead of splitting across multiple slices.
+ gw.upstreams.push_back(
+ MakeBreakerUpstream("svc", "127.0.0.1", backend_port,
+ /*enabled=*/true, /*threshold=*/3));
+
+ HttpServer gateway(gw);
+ TestServerRunner gw_runner(gateway);
+ int gw_port = gw_runner.GetPort();
+
+ // 3 failures → trip.
+ for (int i = 0; i < 3; ++i) {
+ TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+ }
+
+ // With worker_threads > 1 the 3 failing requests can land on either
+ // dispatcher (hash-dependent). 
Check the aggregate snapshot — at + // least one partition must be OPEN with exactly one trip recorded. + auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); + auto* host = cbm->GetHost("svc"); + auto snap = host->Snapshot(); + bool at_least_one_open = snap.open_partitions >= 1; + bool one_trip = snap.total_trips == 1; + // Sanity: the tripped partition should be the one that saw all 3 + // failures (consecutive trip is single-slice, not cross-slice). + bool single_partition_tripped = snap.open_partitions == 1; + + bool pass = at_least_one_open && one_trip && single_partition_tripped; + TestFramework::RecordTest( + "CB Integration: trip drives slice state to OPEN", pass, + pass ? "" : + "at_least_one_open=" + std::to_string(at_least_one_open) + + " one_trip=" + std::to_string(one_trip) + + " single_partition=" + std::to_string(single_partition_tripped) + + " (open_partitions=" + std::to_string(snap.open_partitions) + + ", total_trips=" + std::to_string(snap.total_trips) + ")"); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Integration: trip drives slice state to OPEN", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 5: Breaker-rejected requests do NOT hit the backend. After the trip, +// subsequent requests must be served locally (503) without any upstream I/O. +// Prevents regression where the gate leaked admissions to a known-bad upstream. +// --------------------------------------------------------------------------- +void TestOpenBreakerShortCircuitsUpstreamCall() { + std::cout << "\n[TEST] CB Integration: OPEN breaker short-circuits upstream call..." 
+ << std::endl;
+ try {
+ std::atomic<int> backend_hits{0};
+ HttpServer backend("127.0.0.1", 0);
+ backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) {
+ backend_hits.fetch_add(1, std::memory_order_relaxed);
+ resp.Status(502).Body("err", "text/plain");
+ });
+ TestServerRunner backend_runner(backend);
+ int backend_port = backend_runner.GetPort();
+
+ ServerConfig gw;
+ gw.bind_host = "127.0.0.1";
+ gw.bind_port = 0;
+ gw.worker_threads = 1;
+ gw.http2.enabled = false;
+ // worker_threads=1 → all TCP connections land on dispatcher 0
+ // (NetServer shards new connections by fd%worker_threads), so
+ // per-request failures accumulate deterministically on slice[0]
+ // instead of splitting across multiple slices.
+ gw.upstreams.push_back(
+ MakeBreakerUpstream("svc", "127.0.0.1", backend_port,
+ /*enabled=*/true, /*threshold=*/3));
+
+ HttpServer gateway(gw);
+ TestServerRunner gw_runner(gateway);
+ int gw_port = gw_runner.GetPort();
+
+ // 3 failing requests to trip.
+ for (int i = 0; i < 3; ++i) {
+ TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+ }
+ int hits_at_trip = backend_hits.load();
+
+ // 5 more requests — all should be rejected locally.
+ for (int i = 0; i < 5; ++i) {
+ TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+ }
+ int hits_after = backend_hits.load();
+
+ // Backend hits must not grow during the post-trip burst.
+ bool no_leak = hits_after == hits_at_trip;
+ TestFramework::RecordTest(
+ "CB Integration: OPEN short-circuits upstream call", no_leak,
+ no_leak ? "" :
+ "backend hits grew from " + std::to_string(hits_at_trip) +
+ " to " + std::to_string(hits_after) + " after trip");
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest(
+ "CB Integration: OPEN short-circuits upstream call", false, e.what());
+ }
+}
+
+// Sanity check: verify the bare proxy setup works without the breaker
+// before blaming the breaker integration. 
+void TestBareProxyWorks() { + std::cout << "\n[TEST] CB Integration: bare proxy (sanity)..." << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + UpstreamConfig u; + u.name = "svc"; + u.host = "127.0.0.1"; + u.port = backend_port; + u.pool.max_connections = 8; + u.pool.max_idle_connections = 4; + u.pool.connect_timeout_ms = 3000; + u.proxy.route_prefix = "/fail"; + u.proxy.response_timeout_ms = 5000; + u.circuit_breaker.enabled = true; // sanity + breaker enabled + u.circuit_breaker.consecutive_failure_threshold = 3; + u.circuit_breaker.failure_rate_threshold = 100; + u.circuit_breaker.minimum_volume = 10000; + u.circuit_breaker.window_seconds = 10; + u.circuit_breaker.permitted_half_open_calls = 2; + u.circuit_breaker.base_open_duration_ms = 500; + u.circuit_breaker.max_open_duration_ms = 60000; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 5000); + bool pass = TestHttpClient::HasStatus(r, 502); + TestFramework::RecordTest( + "CB Integration: bare proxy sanity", pass, + pass ? "" : "expected 502, got: " + r.substr(0, 128)); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB Integration: bare proxy sanity", + false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 7: Retry-After header carries a sensible value — within [1, configured +// max_open_duration_ms / 1000], and in the right ballpark of OpenUntil()-now. 
+// --------------------------------------------------------------------------- +void TestRetryAfterHeaderValue() { + std::cout << "\n[TEST] CB Integration: Retry-After value correctness..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + // base_open_duration 2000ms, max 60_000ms — Retry-After should + // ceiling-round and fall inside [1, 60]. + auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3); + u.circuit_breaker.base_open_duration_ms = 2000; + u.circuit_breaker.max_open_duration_ms = 60000; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Trip the breaker. + for (int i = 0; i < 3; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + + // Capture the open-rejection response. + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + bool is_503 = TestHttpClient::HasStatus(r, 503); + + // Extract Retry-After integer value (case-insensitive header). + int retry_after = -1; + const char* markers[] = {"Retry-After:", "retry-after:"}; + for (const char* m : markers) { + auto pos = r.find(m); + if (pos == std::string::npos) continue; + pos += std::string(m).size(); + while (pos < r.size() && (r[pos] == ' ' || r[pos] == '\t')) ++pos; + int val = 0; + bool any = false; + while (pos < r.size() && r[pos] >= '0' && r[pos] <= '9') { + val = val * 10 + (r[pos] - '0'); + any = true; + ++pos; + } + if (any) { retry_after = val; break; } + } + + // Contract: value ≥ 1 and ≤ max_open_duration_ms / 1000 (60). 
+ // For base_open_duration 2000ms the remaining-seconds at this
+ // moment is ≤ 2 (probably 1 or 2 after ceiling), so the upper
+ // sanity bound is generous but still rules out 300/3600-class
+ // buggy fallbacks.
+ bool in_range = (retry_after >= 1 && retry_after <= 60);
+ bool reasonable = (retry_after >= 1 && retry_after <= 3);
+
+ bool pass = is_503 && in_range && reasonable;
+ TestFramework::RecordTest(
+ "CB Integration: Retry-After value in range", pass,
+ pass ? "" :
+ "is_503=" + std::to_string(is_503) +
+ " retry_after=" + std::to_string(retry_after) +
+ " body=" + r.substr(0, 256));
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest(
+ "CB Integration: Retry-After value in range", false, e.what());
+ }
+}
+
+// ---------------------------------------------------------------------------
+// Test 8: Retry loop is terminal on CIRCUIT_OPEN — even with max_retries=3,
+// a request that hits an OPEN breaker gets exactly ONE 503 (no retry-flavored
+// second 503). Ensures ReportBreakerOutcome doesn't feed the reject back into
+// the breaker and MaybeRetry stays out.
+// ---------------------------------------------------------------------------
+void TestCircuitOpenTerminalForRetry() {
+ std::cout << "\n[TEST] CB Integration: CIRCUIT_OPEN terminal for retry loop..."
+ << std::endl;
+ try {
+ std::atomic<int> backend_hits{0};
+ HttpServer backend("127.0.0.1", 0);
+ backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) {
+ backend_hits.fetch_add(1, std::memory_order_relaxed);
+ resp.Status(502).Body("err", "text/plain");
+ });
+ TestServerRunner backend_runner(backend);
+ int backend_port = backend_runner.GetPort();
+
+ ServerConfig gw;
+ gw.bind_host = "127.0.0.1";
+ gw.bind_port = 0;
+ gw.worker_threads = 1;
+ gw.http2.enabled = false;
+ // Retries enabled on 5xx — if the breaker reject leaked into
+ // MaybeRetry, the test would see extra backend hits after the
+ // trip. Long open window so the breaker stays OPEN for the
+ // duration of the post-trip assertion (no HALF_OPEN probe
+ // admission racing the test).
+ auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port,
+ /*enabled=*/true, /*threshold=*/3);
+ u.proxy.retry.max_retries = 3;
+ u.proxy.retry.retry_on_5xx = true;
+ u.circuit_breaker.base_open_duration_ms = 30000;
+ u.circuit_breaker.max_open_duration_ms = 60000;
+ gw.upstreams.push_back(u);
+
+ HttpServer gateway(gw);
+ TestServerRunner gw_runner(gateway);
+ int gw_port = gw_runner.GetPort();
+
+ // Trip the breaker. Each pre-trip request may retry up to 3
+ // times (all failing 5xx), so backend sees up to 3*threshold=12
+ // hits. That's acceptable — we just care about post-trip behavior.
+ for (int i = 0; i < 3; ++i) {
+ TestHttpClient::HttpGet(gw_port, "/fail", 5000);
+ }
+ int pre_trip_hits = backend_hits.load();
+
+ // Post-trip request: expect a single 503 and NO new backend hits.
+ std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+ bool is_503 = TestHttpClient::HasStatus(r, 503);
+ int post_trip_hits = backend_hits.load();
+ bool no_new_hits = (post_trip_hits == pre_trip_hits);
+
+ bool pass = is_503 && no_new_hits;
+ TestFramework::RecordTest(
+ "CB Integration: CIRCUIT_OPEN terminal for retry", pass,
+ pass ? "" :
+ "is_503=" + std::to_string(is_503) +
+ " pre=" + std::to_string(pre_trip_hits) +
+ " post=" + std::to_string(post_trip_hits));
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest(
+ "CB Integration: CIRCUIT_OPEN terminal for retry", false, e.what());
+ }
+}
+
+// ---------------------------------------------------------------------------
+// Test 9: Dry-run mode — dry_run=true forwards rejected requests to the
+// upstream (pass-through) but still increments the rejected_ counter so
+// operators can observe the would-reject rate without production impact.
+// ---------------------------------------------------------------------------
+void TestDryRunPassthrough() {
+ std::cout << "\n[TEST] CB Integration: dry-run passthrough..." << std::endl;
+ try {
+ std::atomic<int> backend_hits{0};
+ HttpServer backend("127.0.0.1", 0);
+ backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) {
+ backend_hits.fetch_add(1, std::memory_order_relaxed);
+ resp.Status(502).Body("err", "text/plain");
+ });
+ TestServerRunner backend_runner(backend);
+ int backend_port = backend_runner.GetPort();
+
+ ServerConfig gw;
+ gw.bind_host = "127.0.0.1";
+ gw.bind_port = 0;
+ gw.worker_threads = 1;
+ gw.http2.enabled = false;
+ auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port,
+ /*enabled=*/true, /*threshold=*/3);
+ u.circuit_breaker.dry_run = true; // would-reject, but still forward
+ gw.upstreams.push_back(u);
+
+ HttpServer gateway(gw);
+ TestServerRunner gw_runner(gateway);
+ int gw_port = gw_runner.GetPort();
+
+ // Trip thresholds with 5 requests. All should reach backend (502),
+ // not a 503 — dry-run never short-circuits.
+ for (int i = 0; i < 5; ++i) {
+ std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+ if (!TestHttpClient::HasStatus(r, 502)) {
+ TestFramework::RecordTest(
+ "CB Integration: dry-run passthrough", false,
+ "request " + std::to_string(i) +
+ " expected 502, got: " + r.substr(0, 64));
+ return;
+ }
+ }
+
+ bool all_hit = (backend_hits.load() == 5);
+
+ // Verify the slice observed trips/rejected even though traffic passed.
+ auto* mgr = gateway.GetUpstreamManager() ?
+ gateway.GetUpstreamManager()->GetCircuitBreakerManager() : + nullptr; + int64_t trips = 0, rejected = 0; + if (mgr) { + auto* host = mgr->GetHost("svc"); + if (host) { + auto snap = host->Snapshot(); + trips = snap.total_trips; + rejected = snap.total_rejected; + } + } + // At least one trip fired (consecutive_threshold=3 → slice + // transitioned at least once during the run), and the post-trip + // requests were counted as would-reject (rejected > 0). + bool observed = (trips >= 1) && (rejected >= 1); + + bool pass = all_hit && observed; + TestFramework::RecordTest( + "CB Integration: dry-run passthrough", pass, + pass ? "" : + "hits=" + std::to_string(backend_hits.load()) + + " trips=" + std::to_string(trips) + + " rejected=" + std::to_string(rejected)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Integration: dry-run passthrough", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 10: HALF_OPEN → CLOSED recovery round-trip through the proxy. Trip the +// breaker, wait for the open window to elapse, then serve success responses +// and assert the slice transitions back to CLOSED (consecutive_successes +// crosses the threshold — default 2 from DefaultCbConfig / integration config). +// --------------------------------------------------------------------------- +void TestHalfOpenRecoveryRoundTrip() { + std::cout << "\n[TEST] CB Integration: HALF_OPEN → CLOSED recovery..." 
+ << std::endl;
+ try {
+ std::atomic<bool> fail_mode{true};
+ HttpServer backend("127.0.0.1", 0);
+ backend.Get("/fail", [&fail_mode](const HttpRequest&, HttpResponse& resp) {
+ if (fail_mode.load()) {
+ resp.Status(502).Body("err", "text/plain");
+ } else {
+ resp.Status(200).Body("ok", "text/plain");
+ }
+ });
+ TestServerRunner backend_runner(backend);
+ int backend_port = backend_runner.GetPort();
+
+ ServerConfig gw;
+ gw.bind_host = "127.0.0.1";
+ gw.bind_port = 0;
+ gw.worker_threads = 1;
+ gw.http2.enabled = false;
+ auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port,
+ /*enabled=*/true, /*threshold=*/3);
+ // Short open duration so recovery path finishes quickly.
+ u.circuit_breaker.base_open_duration_ms = 300;
+ u.circuit_breaker.max_open_duration_ms = 1000;
+ // Two probes needed to close (default permitted_half_open_calls=2).
+ u.circuit_breaker.permitted_half_open_calls = 2;
+ gw.upstreams.push_back(u);
+
+ HttpServer gateway(gw);
+ TestServerRunner gw_runner(gateway);
+ int gw_port = gw_runner.GetPort();
+
+ // Trip by hitting the failing backend.
+ for (int i = 0; i < 3; ++i) {
+ TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+ }
+
+ // Flip backend to success and wait for the open window to elapse.
+ fail_mode.store(false);
+ std::this_thread::sleep_for(std::chrono::milliseconds(500));
+
+ // Probe the proxy — each successful 200 advances HALF_OPEN toward
+ // CLOSED. Do more than permitted_half_open_calls; some will be
+ // rejected as half_open_full but the ones that are admitted will
+ // close the breaker.
+ bool saw_success = false;
+ for (int i = 0; i < 8; ++i) {
+ std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+ if (TestHttpClient::HasStatus(r, 200)) saw_success = true;
+ // Small gap between probes — HALF_OPEN only admits permitted
+ // probes per cycle; spacing lets subsequent probes observe a
+ // possibly-closed breaker.
+ std::this_thread::sleep_for(std::chrono::milliseconds(50)); + } + + // Verify slice aggregate: at least one CLOSED transition observed + // (probe_successes >= 1 and total_trips == 1 — we only tripped once). + auto* mgr = gateway.GetUpstreamManager() ? + gateway.GetUpstreamManager()->GetCircuitBreakerManager() : + nullptr; + int64_t probe_succ = 0; + int open_parts = 0, half_open_parts = 0; + if (mgr) { + auto* host = mgr->GetHost("svc"); + if (host) { + auto snap = host->Snapshot(); + probe_succ = 0; + for (const auto& row : snap.slices) { + probe_succ += row.probe_successes; + } + open_parts = snap.open_partitions; + half_open_parts = snap.half_open_partitions; + } + } + + // Recovery complete: saw at least one 200 through the breaker, + // at least one probe success counted, and no partition still + // stuck in OPEN (HALF_OPEN may still linger on the unused slice, + // which is fine for a 2-partition setup). + bool pass = saw_success && (probe_succ >= 1) && (open_parts == 0); + TestFramework::RecordTest( + "CB Integration: HALF_OPEN → CLOSED recovery", pass, + pass ? "" : + "saw_success=" + std::to_string(saw_success) + + " probe_succ=" + std::to_string(probe_succ) + + " open_parts=" + std::to_string(open_parts) + + " half_open_parts=" + std::to_string(half_open_parts)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Integration: HALF_OPEN → CLOSED recovery", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 11: Retry-After ceils the config cap from a non-second-aligned +// max_open_duration_ms (e.g. 1500ms → 2s, not 1s). Floor-rounding the cap +// would clamp the advertised retry window below what the breaker honors, +// causing well-behaved clients to re-hit the 503. 
+// --------------------------------------------------------------------------- +void TestRetryAfterCapCeilsNonAlignedMax() { + std::cout << "\n[TEST] CB Integration: Retry-After cap ceils non-aligned max..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + // Configure a non-second-aligned max backoff. base = 1500ms so + // the actual OpenUntil-now at trip time is ~1.5s, which ceil- + // rounds to 2s. If cfg_cap_secs floor-rounded max_open_duration + // (1500ms → 1s), the clamp would drop Retry-After to 1s even + // though the breaker would keep rejecting through the second + // half of that window. + auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3); + u.circuit_breaker.base_open_duration_ms = 1500; + u.circuit_breaker.max_open_duration_ms = 1500; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + for (int i = 0; i < 3; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + + int retry_after = -1; + const char* markers[] = {"Retry-After:", "retry-after:"}; + for (const char* m : markers) { + auto pos = r.find(m); + if (pos == std::string::npos) continue; + pos += std::string(m).size(); + while (pos < r.size() && (r[pos] == ' ' || r[pos] == '\t')) ++pos; + int val = 0; + bool any = false; + while (pos < r.size() && r[pos] >= '0' && r[pos] <= '9') { + val = val * 10 + (r[pos] - '0'); + any = true; + ++pos; + } + if (any) { retry_after = val; break; } + } + + // Expectation: Retry-After is in [1, 2] — cfg_cap_secs 
ceil- + // rounds 1500ms to 2s, and the remaining-time ceil-rounds to + // 2 at the moment of trip (may be 1 if enough wall-clock has + // elapsed between trip and response). Critically it must NEVER + // be zero or exceed 2 (clamped to the 2s cap). + bool in_range = (retry_after >= 1 && retry_after <= 2); + TestFramework::RecordTest( + "CB Integration: Retry-After ceils non-aligned cap", in_range, + in_range ? "" : + "retry_after=" + std::to_string(retry_after)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Integration: Retry-After ceils non-aligned cap", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 12: Retried failures are reported BEFORE the retry fires. With retries +// enabled on 5xx, each attempt's outcome must be counted against the breaker; +// otherwise the slice trips only after the final retry exhausts, under- +// counting failures and potentially never tripping if retries mask enough of +// them. Verifies the trip still happens within the expected number of client +// requests once reporting is attached to the retry path. +// --------------------------------------------------------------------------- +void TestRetriedFailuresCountTowardTrip() { + std::cout << "\n[TEST] CB Integration: retried failures count toward trip..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + // Retries on 5xx enabled. threshold=3 — with retry_on_5xx, each + // client request produces 1 + max_retries=3 = 4 upstream + // attempts, each reporting RESPONSE_5XX via the ReportBreakerOutcome + // path that this fix patches in. 
The breaker must trip after + // at most 3 upstream failure reports (which the first client + // request alone produces). + auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3); + u.proxy.retry.max_retries = 3; + u.proxy.retry.retry_on_5xx = true; + u.circuit_breaker.base_open_duration_ms = 30000; + u.circuit_breaker.max_open_duration_ms = 60000; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // One client request → 4 upstream attempts → 4 RESPONSE_5XX + // reports. Threshold=3 should trip during this single request. + TestHttpClient::HttpGet(gw_port, "/fail", 5000); + + // Second client request must hit the OPEN breaker → 503. + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + bool is_503 = TestHttpClient::HasStatus(r, 503); + bool has_breaker_header = + r.find("X-Circuit-Breaker: open") != std::string::npos || + r.find("x-circuit-breaker: open") != std::string::npos; + + bool pass = is_503 && has_breaker_header; + TestFramework::RecordTest( + "CB Integration: retried failures count toward trip", pass, + pass ? "" : + "is_503=" + std::to_string(is_503) + + " breaker_hdr=" + std::to_string(has_breaker_header) + + " body=" + r.substr(0, 256)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Integration: retried failures count toward trip", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 13: HALF_OPEN rejects emit a distinct X-Circuit-Breaker label. +// TryAcquire returns REJECTED_OPEN for three situations (true OPEN, +// half_open_full, half_open_recovery_failing). When the slice is in +// HALF_OPEN, OpenUntil is cleared and a generic MakeCircuitOpenResponse +// would fall back to Retry-After=1 + X-Circuit-Breaker:open — misleading +// clients. 
The fix emits X-Circuit-Breaker:half_open for HALF_OPEN rejects
+// with a more conservative Retry-After hint.
+//
+// Strategy: trip the breaker, wait for the open window to elapse so the
+// slice transitions HALF_OPEN on the next admission attempt, then flood
+// concurrent requests so some hit half_open_full.
+// ---------------------------------------------------------------------------
+void TestHalfOpenRejectLabel() {
+ std::cout << "\n[TEST] CB Integration: HALF_OPEN reject label..."
+ << std::endl;
+ try {
+ // Backend hangs to keep probes in-flight so later concurrent
+ // requests hit half_open_full.
+ std::atomic<bool> hang{false};
+ HttpServer backend("127.0.0.1", 0);
+ backend.Get("/fail", [&hang](const HttpRequest&, HttpResponse& resp) {
+ if (hang.load()) {
+ std::this_thread::sleep_for(std::chrono::milliseconds(600));
+ }
+ resp.Status(502).Body("err", "text/plain");
+ });
+ TestServerRunner backend_runner(backend);
+ int backend_port = backend_runner.GetPort();
+
+ ServerConfig gw;
+ gw.bind_host = "127.0.0.1";
+ gw.bind_port = 0;
+ gw.worker_threads = 1;
+ gw.http2.enabled = false;
+ auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port,
+ /*enabled=*/true, /*threshold=*/3);
+ u.circuit_breaker.base_open_duration_ms = 200;
+ u.circuit_breaker.max_open_duration_ms = 500;
+ u.circuit_breaker.permitted_half_open_calls = 1; // tiny budget
+ gw.upstreams.push_back(u);
+
+ HttpServer gateway(gw);
+ TestServerRunner gw_runner(gateway);
+ int gw_port = gw_runner.GetPort();
+
+ // Trip the breaker.
+ for (int i = 0; i < 3; ++i) {
+ TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+ }
+ // Wait for the open window to elapse so the next admission
+ // flips the slice to HALF_OPEN.
+ std::this_thread::sleep_for(std::chrono::milliseconds(300));
+
+ // Flip backend to hang so the probe occupies the single probe
+ // slot while we fire sibling requests that must hit half_open_full.
+ hang.store(true);
+
+ std::atomic<bool> saw_half_open{false};
+ std::atomic<bool> saw_open{false};
+ auto probe = [&](int id) {
+ (void)id;
+ std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 1500);
+ if (!TestHttpClient::HasStatus(r, 503)) return;
+ if (r.find("X-Circuit-Breaker: half_open") != std::string::npos ||
+ r.find("x-circuit-breaker: half_open") != std::string::npos) {
+ saw_half_open.store(true);
+ }
+ if (r.find("X-Circuit-Breaker: open") != std::string::npos ||
+ r.find("x-circuit-breaker: open") != std::string::npos) {
+ // We want to distinguish the labels; the "open" substring
+ // also matches "half_open". Only count true "open" if
+ // "half_open" didn't appear in THIS response.
+ if (r.find("half_open") == std::string::npos) {
+ saw_open.store(true);
+ }
+ }
+ };
+
+ std::vector<std::thread> threads;
+ for (int i = 0; i < 6; ++i) {
+ threads.emplace_back(probe, i);
+ std::this_thread::sleep_for(std::chrono::milliseconds(20));
+ }
+ for (auto& t : threads) t.join();
+
+ // Pass if at least one HALF_OPEN-labelled reject was observed.
+ // saw_open may or may not be observed (some rejects could have
+ // hit between cycles) — the key contract is that HALF_OPEN
+ // rejects no longer get the plain "open" label.
+ bool pass = saw_half_open.load();
+ TestFramework::RecordTest(
+ "CB Integration: HALF_OPEN reject label", pass,
+ pass ? "" :
+ "saw_half_open=" + std::to_string(saw_half_open.load()) +
+ " saw_open=" + std::to_string(saw_open.load()));
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest(
+ "CB Integration: HALF_OPEN reject label", false, e.what());
+ }
+}
+
+// ---------------------------------------------------------------------------
+// Test 14: HALF_OPEN Retry-After reflects the current exponential backoff,
+// not just base_open_duration_ms. After multiple trips the next OPEN window
+// (base << consecutive_trips_, clamped by max) can exceed 1 second; the old
+// base-only hint (ceil(base/1000) = 1s for base=100ms) would under-report
+// the worst-case wait, which this test must fail for.
+//
+// Strategy: keep the backend failing and drive MULTIPLE re-trips by letting
+// the OPEN window elapse and single probe fail each cycle. Successful
+// recoveries must be avoided — TransitionHalfOpenToClosed resets
+// consecutive_trips_ to 0, which hides the exponential hint.
+// ---------------------------------------------------------------------------
+void TestHalfOpenRetryAfterScalesWithBackoff() {
+ std::cout << "\n[TEST] CB Integration: HALF_OPEN Retry-After exponential..."
+ << std::endl;
+ try {
+ // Backend fails fast by default. When `hang` is set, the
+ // handler blocks — used at the end to pin the probe slot so
+ // a concurrent request observes HALF_OPEN rejection.
+ std::atomic<bool> hang{false};
+ HttpServer backend("127.0.0.1", 0);
+ backend.Get("/fail", [&hang](const HttpRequest&, HttpResponse& resp) {
+ if (hang.load()) {
+ std::this_thread::sleep_for(std::chrono::milliseconds(1500));
+ }
+ resp.Status(502).Body("err", "text/plain");
+ });
+ TestServerRunner backend_runner(backend);
+ int backend_port = backend_runner.GetPort();
+
+ ServerConfig gw;
+ gw.bind_host = "127.0.0.1";
+ gw.bind_port = 0;
+ gw.worker_threads = 1; // pin all traffic to slice[0]
+ gw.http2.enabled = false;
+ auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port,
+ /*enabled=*/true, /*threshold=*/2);
+ u.circuit_breaker.base_open_duration_ms = 100; // config minimum
+ u.circuit_breaker.max_open_duration_ms = 8000; // cap at 8s
+ u.circuit_breaker.permitted_half_open_calls = 1; // single probe
+ gw.upstreams.push_back(u);
+
+ HttpServer gateway(gw);
+ TestServerRunner gw_runner(gateway);
+ int gw_port = gw_runner.GetPort();
+
+ auto* cbm = gateway.GetUpstreamManager() ?
+ gateway.GetUpstreamManager()->GetCircuitBreakerManager() : nullptr;
+ auto* host = cbm ? cbm->GetHost("svc") : nullptr;
+ auto* slice = host ? host->GetSlice(0) : nullptr;
+ if (!slice) {
+ TestFramework::RecordTest(
+ "CB Integration: HALF_OPEN Retry-After exponential-aware",
+ false, "slice lookup failed");
+ return;
+ }
+
+ // Initial trip: 2 consecutive failures with threshold=2.
+ for (int i = 0; i < 2; ++i) {
+ TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+ }
+
+ // Drive consecutive_trips_ up by letting successive OPEN windows
+ // elapse and probes fail (no recovery → no reset). Stop when
+ // NextOpenDurationMs crosses 1000ms, which is the threshold
+ // where the HALF_OPEN Retry-After hint starts exceeding the
+ // base-only value (ceil(100ms)=1s).
+ //
+ // The slice re-trips on each failed probe; each trip doubles
+ // the open duration. We run ~8 cycles with safety margin which
+ // is comfortably past the trip count needed for Retry-After>=2.
+ for (int cycle = 0; cycle < 8; ++cycle) {
+ // Wait past the current open window. Upper bound: max=8s,
+ // so 1200ms is plenty for the first few short cycles, and
+ // we re-check after each request anyway.
+ int64_t next_ms = slice->NextOpenDurationMs();
+ // Current OPEN window is the one stored BEFORE the upcoming
+ // re-trip — we don't have that directly, so sleep past the
+ // NEXT duration as an over-approximation (next is always >=
+ // current). This ensures OPEN has elapsed.
+ auto sleep_ms = std::max<int64_t>(next_ms + 50, 200);
+ if (sleep_ms > 2000) sleep_ms = 2000; // cap per cycle
+ std::this_thread::sleep_for(std::chrono::milliseconds(sleep_ms));
+
+ // One request — it should admit as a probe (HALF_OPEN),
+ // the backend fails fast (502), probe fails → re-trip with
+ // consecutive_trips_++ and fresh OPEN.
+ TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+
+ // Bail early once the exponential hint crosses 1s → the
+ // subsequent HALF_OPEN reject will carry Retry-After >= 2.
+ if (slice->NextOpenDurationMs() >= 2000) break; + } + + int64_t next_open_ms = slice->NextOpenDurationMs(); + if (next_open_ms < 2000) { + TestFramework::RecordTest( + "CB Integration: HALF_OPEN Retry-After exponential-aware", + false, + "setup failed: next_open_ms=" + std::to_string(next_open_ms) + + " (need >= 2000 to distinguish from base-only hint)"); + return; + } + + // Now trigger a HALF_OPEN reject: wait for current OPEN to + // elapse, start a hanging probe (pins the slot), then fire a + // sibling request — it must see half_open_full with the + // exponential Retry-After. + int64_t post_wait_ms = next_open_ms + 100; + if (post_wait_ms > 4000) post_wait_ms = 4000; + std::this_thread::sleep_for(std::chrono::milliseconds(post_wait_ms)); + + hang.store(true); + std::thread probe([&]() { + TestHttpClient::HttpGet(gw_port, "/fail", 3500); + }); + // Let the probe get admitted and start hanging. + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 1500); + hang.store(false); + probe.join(); + + bool is_half_open = + r.find("X-Circuit-Breaker: half_open") != std::string::npos || + r.find("x-circuit-breaker: half_open") != std::string::npos; + + int retry_after = -1; + const char* markers[] = {"Retry-After:", "retry-after:"}; + for (const char* m : markers) { + auto pos = r.find(m); + if (pos == std::string::npos) continue; + pos += std::string(m).size(); + while (pos < r.size() && (r[pos] == ' ' || r[pos] == '\t')) ++pos; + int val = 0; + bool any = false; + while (pos < r.size() && r[pos] >= '0' && r[pos] <= '9') { + val = val * 10 + (r[pos] - '0'); + any = true; + ++pos; + } + if (any) { retry_after = val; break; } + } + + // Post-fix: Retry-After = ceil(next_open_ms / 1000) >= 2. + // Pre-fix (base-only): Retry-After = ceil(base/1000) = 1. + // Asserting >= 2 fails the pre-fix implementation. 
+ bool retry_after_ok = (retry_after >= 2 && retry_after <= 8); + bool pass = is_half_open && retry_after_ok; + TestFramework::RecordTest( + "CB Integration: HALF_OPEN Retry-After exponential-aware", pass, + pass ? "" : + "is_half_open=" + std::to_string(is_half_open) + + " retry_after=" + std::to_string(retry_after) + + " next_open_ms=" + std::to_string(next_open_ms)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Integration: HALF_OPEN Retry-After exponential-aware", + false, e.what()); + } +} + +void RunAllTests() { + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "CIRCUIT BREAKER - INTEGRATION TESTS" << std::endl; + std::cout << std::string(60, '=') << std::endl; + + TestBareProxyWorks(); + TestBreakerTripsAfterConsecutiveFailures(); + TestBreakerDisabledPassesThrough(); + TestSuccessResetsConsecutiveFailureCounter(); + TestTripDrivesSliceState(); + TestOpenBreakerShortCircuitsUpstreamCall(); + TestRetryAfterHeaderValue(); + TestCircuitOpenTerminalForRetry(); + TestDryRunPassthrough(); + TestHalfOpenRecoveryRoundTrip(); + TestRetryAfterCapCeilsNonAlignedMax(); + TestRetriedFailuresCountTowardTrip(); + TestHalfOpenRejectLabel(); + TestHalfOpenRetryAfterScalesWithBackoff(); +} + +} // namespace CircuitBreakerIntegrationTests diff --git a/test/circuit_breaker_observability_test.h b/test/circuit_breaker_observability_test.h new file mode 100644 index 00000000..42694a67 --- /dev/null +++ b/test/circuit_breaker_observability_test.h @@ -0,0 +1,405 @@ +#pragma once + +// Observability integration tests: observability — counter accuracy, snapshot +// API correctness, and log emission. +// +// Phases 2-6 each added counters and log lines as a side effect of their +// functional work. This suite locks those in as regressions: +// +// * Counters (§11.2): trips, rejected, probe_successes, probe_failures, +// retries_rejected surface through CircuitBreakerManager::SnapshotAll. 
+// * Snapshot API (§11.3): per-slice rows aggregate into host-level +// totals; host-level fields (retries_in_flight / retries_rejected / +// in_flight) reflect the owning RetryBudget. +// * Logs (§11.1): the CLOSED→OPEN trip emits the full-context message +// including trigger, consecutive_failures, window_total, +// window_fail_rate, open_for_ms, and consecutive_trips. +// +// The log-emission test attaches a spdlog ring-buffer sink to the logger +// for the duration of the test, triggers a trip, then asserts the +// captured messages contain the expected fields. No log file I/O. + +#include "test_framework.h" +#include "test_server_runner.h" +#include "http_test_client.h" +#include "http/http_server.h" +#include "config/server_config.h" +#include "upstream/upstream_manager.h" +#include "circuit_breaker/circuit_breaker_manager.h" +#include "circuit_breaker/circuit_breaker_host.h" +#include "circuit_breaker/circuit_breaker_slice.h" +#include "log/logger.h" +#include "spdlog/sinks/ringbuffer_sink.h" + +#include +#include +#include +#include +#include +#include + +namespace CircuitBreakerObservabilityTests { + +using circuit_breaker::State; + +static UpstreamConfig MakeObservUpstream(const std::string& name, + const std::string& host, + int port, + int consecutive_threshold = 3) { + UpstreamConfig u; + u.name = name; + u.host = host; + u.port = port; + u.pool.max_connections = 8; + u.pool.max_idle_connections = 4; + u.pool.connect_timeout_ms = 3000; + u.pool.idle_timeout_sec = 30; + u.pool.max_lifetime_sec = 3600; + u.pool.max_requests_per_conn = 0; + + u.proxy.route_prefix = "/fail"; + u.proxy.strip_prefix = false; + u.proxy.response_timeout_ms = 2000; + u.proxy.retry.max_retries = 0; + + u.circuit_breaker.enabled = true; + u.circuit_breaker.consecutive_failure_threshold = consecutive_threshold; + u.circuit_breaker.failure_rate_threshold = 100; + u.circuit_breaker.minimum_volume = 10000; + u.circuit_breaker.window_seconds = 10; + 
u.circuit_breaker.permitted_half_open_calls = 2; + // Long open duration — keep the slice OPEN so post-trip assertions + // don't race a HALF_OPEN transition. + u.circuit_breaker.base_open_duration_ms = 30000; + u.circuit_breaker.max_open_duration_ms = 60000; + return u; +} + +// --------------------------------------------------------------------------- +// Test 1: Snapshot API reflects per-slice trip/rejected counters and +// host-level aggregates. Drives N+1 requests against a backend that always +// 502s (N to trip, 1 more that the OPEN slice short-circuits) and asserts +// the snapshot shows total_trips >= 1, total_rejected >= 1, +// open_partitions >= 1. +// --------------------------------------------------------------------------- +void TestSnapshotReflectsCounters() { + std::cout << "\n[TEST] CB Observability: snapshot reflects counters..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + auto u = MakeObservUpstream("svc", "127.0.0.1", backend_port, + /*threshold=*/3); + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Trip (3 failures), then 2 more to accumulate rejected counter. 
+ for (int i = 0; i < 3; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + for (int i = 0; i < 2; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + + auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); + if (!cbm) { + TestFramework::RecordTest( + "CB Observability: snapshot reflects counters", false, + "no circuit breaker manager attached"); + return; + } + auto snaps = cbm->SnapshotAll(); + bool found = false; + int64_t trips = 0, rejected = 0, probe_s = 0, probe_f = 0; + int open_parts = 0; + for (const auto& s : snaps) { + if (s.service_name == "svc") { + trips = s.total_trips; + rejected = s.total_rejected; + open_parts = s.open_partitions; + for (const auto& row : s.slices) { + probe_s += row.probe_successes; + probe_f += row.probe_failures; + } + found = true; + break; + } + } + + bool pass = found + && trips >= 1 + && rejected >= 2 // 2 post-trip short-circuits + && open_parts >= 1 + && probe_s == 0 // never entered HALF_OPEN + && probe_f == 0; + TestFramework::RecordTest( + "CB Observability: snapshot reflects counters", pass, + pass ? "" : + "found=" + std::to_string(found) + + " trips=" + std::to_string(trips) + + " rejected=" + std::to_string(rejected) + + " open_parts=" + std::to_string(open_parts) + + " probe_s=" + std::to_string(probe_s) + + " probe_f=" + std::to_string(probe_f)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Observability: snapshot reflects counters", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 2: The CLOSED→OPEN trip log emits the §11.1 full-context message. +// Attaches a spdlog ringbuffer_sink to the shared logger, triggers a trip, +// then inspects the captured messages for the key tokens. The sink is +// removed before the test returns so it doesn't affect later tests. 
+// ---------------------------------------------------------------------------
+void TestTripLogEmission() {
+    std::cout << "\n[TEST] CB Observability: trip log emission..." << std::endl;
+    try {
+        HttpServer backend("127.0.0.1", 0);
+        backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) {
+            resp.Status(502).Body("upstream-err", "text/plain");
+        });
+        TestServerRunner backend_runner(backend);
+        int backend_port = backend_runner.GetPort();
+
+        ServerConfig gw;
+        gw.bind_host = "127.0.0.1";
+        gw.bind_port = 0;
+        gw.worker_threads = 1;
+        gw.http2.enabled = false;
+
+        auto u = MakeObservUpstream("svc-log", "127.0.0.1", backend_port,
+                                    /*threshold=*/2);
+        gw.upstreams.push_back(u);
+
+        // `HttpServer` construction calls `logging::Init()` which rebuilds
+        // the default logger via `spdlog::set_default_logger`. Any sink
+        // attached BEFORE that point lands on a stale logger. Attach the
+        // ringbuffer sink AFTER the last HttpServer construction so it
+        // captures the live logger's output.
+        HttpServer gateway(gw);
+        TestServerRunner gw_runner(gateway);
+        int gw_port = gw_runner.GetPort();
+
+        auto ring = std::make_shared<
+            spdlog::sinks::ringbuffer_sink_mt>(1024);
+        auto logger = logging::Get();
+        auto prev_level = logger->level();
+        logger->set_level(spdlog::level::debug);
+        logger->sinks().push_back(ring);
+
+        struct SinkGuard {
+            std::shared_ptr<spdlog::logger> logger;
+            std::shared_ptr<spdlog::sinks::ringbuffer_sink_mt> ring;
+            spdlog::level::level_enum prev_level;
+            ~SinkGuard() {
+                auto& sinks = logger->sinks();
+                sinks.erase(std::remove(sinks.begin(), sinks.end(),
+                                        std::shared_ptr<spdlog::sinks::sink>(ring)),
+                            sinks.end());
+                logger->set_level(prev_level);
+            }
+        } guard{logger, ring, prev_level};
+
+        // Drive exactly threshold=2 failures to trip.
+        TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+        TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+
+        // Give the dispatcher a breath to emit + the sink to settle.
+ std::this_thread::sleep_for(std::chrono::milliseconds(50)); + + auto messages = ring->last_formatted(); + // Scan for the trip message. Look for the static prefix plus the + // §11.1 field tokens. + bool saw_tripped = false; + bool has_trigger = false; + bool has_consec_failures = false; + bool has_window_total = false; + bool has_fail_rate = false; + bool has_open_for_ms = false; + bool has_consec_trips = false; + for (const auto& msg : messages) { + if (msg.find("circuit breaker tripped") == std::string::npos) { + continue; + } + saw_tripped = true; + if (msg.find("trigger=") != std::string::npos) has_trigger = true; + if (msg.find("consecutive_failures=") != std::string::npos) + has_consec_failures = true; + if (msg.find("window_total=") != std::string::npos) + has_window_total = true; + if (msg.find("window_fail_rate=") != std::string::npos) + has_fail_rate = true; + if (msg.find("open_for_ms=") != std::string::npos) + has_open_for_ms = true; + if (msg.find("consecutive_trips=") != std::string::npos) + has_consec_trips = true; + } + + bool pass = saw_tripped && has_trigger && has_consec_failures && + has_window_total && has_fail_rate && + has_open_for_ms && has_consec_trips; + TestFramework::RecordTest( + "CB Observability: trip log emission", pass, + pass ? 
"" : + "saw_tripped=" + std::to_string(saw_tripped) + + " trigger=" + std::to_string(has_trigger) + + " consec_failures=" + std::to_string(has_consec_failures) + + " window_total=" + std::to_string(has_window_total) + + " fail_rate=" + std::to_string(has_fail_rate) + + " open_for_ms=" + std::to_string(has_open_for_ms) + + " consec_trips=" + std::to_string(has_consec_trips)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Observability: trip log emission", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 3: Retry-budget observability — the exhausted log carries the +// §11.1 fields (service, in_flight, retries_in_flight, cap), and the +// host snapshot reflects retries_rejected. +// --------------------------------------------------------------------------- +void TestRetryBudgetObservability() { + std::cout << "\n[TEST] CB Observability: retry budget observability..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + // Budget: zero percent AND zero floor → every retry rejected. + auto u = MakeObservUpstream("svc-budget", "127.0.0.1", backend_port, + /*threshold=*/10000); + u.proxy.retry.max_retries = 2; + u.proxy.retry.retry_on_5xx = true; + u.circuit_breaker.retry_budget_percent = 0; + u.circuit_breaker.retry_budget_min_concurrency = 0; + gw.upstreams.push_back(u); + + // Attach the ringbuffer AFTER gateway construction — see + // TestTripLogEmission for rationale (HttpServer's ctor + // replaces the default logger via logging::Init, detaching + // any previously-attached sinks). 
+ HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + auto ring = std::make_shared< + spdlog::sinks::ringbuffer_sink_mt>(1024); + auto logger = logging::Get(); + auto prev_level = logger->level(); + logger->set_level(spdlog::level::debug); + logger->sinks().push_back(ring); + + struct SinkGuard { + std::shared_ptr logger; + std::shared_ptr ring; + spdlog::level::level_enum prev_level; + ~SinkGuard() { + auto& sinks = logger->sinks(); + sinks.erase(std::remove(sinks.begin(), sinks.end(), + std::shared_ptr(ring)), + sinks.end()); + logger->set_level(prev_level); + } + } guard{logger, ring, prev_level}; + + // One client request: first attempt hits backend (502), retry + // blocked by budget → 503 + X-Retry-Budget-Exhausted. + TestHttpClient::HttpGet(gw_port, "/fail", 5000); + + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + + auto messages = ring->last_formatted(); + bool saw_exhausted = false; + bool has_service = false; + bool has_inflight = false; + bool has_retries_inflight = false; + bool has_cap = false; + for (const auto& msg : messages) { + if (msg.find("retry budget exhausted") == std::string::npos) { + continue; + } + saw_exhausted = true; + if (msg.find("service=") != std::string::npos) has_service = true; + if (msg.find("in_flight=") != std::string::npos) + has_inflight = true; + if (msg.find("retries_in_flight=") != std::string::npos) + has_retries_inflight = true; + if (msg.find("cap=") != std::string::npos) has_cap = true; + } + + // Snapshot: retries_rejected must be >= 1 (every rejection increments). + int64_t retries_rejected = 0; + auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); + if (cbm) { + for (const auto& s : cbm->SnapshotAll()) { + if (s.service_name == "svc-budget") { + // Host aggregate — single host, so the sum is the + // host's retries_rejected. The snapshot doesn't yet + // expose that directly — derive from RetryBudget + // via the host getter. 
+ auto* host = cbm->GetHost("svc-budget"); + if (host) { + retries_rejected = + host->GetRetryBudget()->RetriesRejected(); + } + break; + } + } + } + + bool pass = saw_exhausted && has_service && has_inflight && + has_retries_inflight && has_cap && + retries_rejected >= 1; + TestFramework::RecordTest( + "CB Observability: retry budget observability", pass, + pass ? "" : + "saw_exhausted=" + std::to_string(saw_exhausted) + + " service=" + std::to_string(has_service) + + " inflight=" + std::to_string(has_inflight) + + " retries_inflight=" + std::to_string(has_retries_inflight) + + " cap=" + std::to_string(has_cap) + + " retries_rejected=" + std::to_string(retries_rejected)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Observability: retry budget observability", false, e.what()); + } +} + +void RunAllTests() { + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "CIRCUIT BREAKER - OBSERVABILITY TESTS" << std::endl; + std::cout << std::string(60, '=') << std::endl; + + TestSnapshotReflectsCounters(); + TestTripLogEmission(); + TestRetryBudgetObservability(); +} + +} // namespace CircuitBreakerObservabilityTests diff --git a/test/circuit_breaker_reload_test.h b/test/circuit_breaker_reload_test.h new file mode 100644 index 00000000..220c718e --- /dev/null +++ b/test/circuit_breaker_reload_test.h @@ -0,0 +1,373 @@ +#pragma once + +// Reload integration tests: hot-reload of circuit-breaker fields. +// +// UpstreamConfig::operator== now excludes `circuit_breaker` — a CB-only +// SIGHUP is a clean reload that propagates via HttpServer::Reload → +// CircuitBreakerManager::Reload → per-host per-slice Reload enqueued on +// each owning dispatcher. +// +// Topology fields (host, port, pool, proxy, tls) remain restart-only. 
+// +// Strategy: construct a gateway with an enabled breaker, capture the +// initial slice config, call HttpServer::Reload with an edited +// CircuitBreakerConfig, and verify the slice's live config reflects the +// edit. The reload-log capture also verifies the manager-level log lines +// ("CircuitBreakerManager::Reload: new/removed upstream ...") fire for +// topology-change SIGHUPs. + +#include "test_framework.h" +#include "test_server_runner.h" +#include "http_test_client.h" +#include "http/http_server.h" +#include "config/server_config.h" +#include "upstream/upstream_manager.h" +#include "circuit_breaker/circuit_breaker_manager.h" +#include "circuit_breaker/circuit_breaker_host.h" +#include "circuit_breaker/circuit_breaker_slice.h" +#include "log/logger.h" +#include "spdlog/sinks/ringbuffer_sink.h" + +#include +#include +#include +#include +#include + +namespace CircuitBreakerReloadTests { + +static UpstreamConfig MakeReloadUpstream(const std::string& name, + const std::string& host, + int port) { + UpstreamConfig u; + u.name = name; + u.host = host; + u.port = port; + u.pool.max_connections = 8; + u.pool.max_idle_connections = 4; + u.pool.connect_timeout_ms = 3000; + u.pool.idle_timeout_sec = 30; + u.pool.max_lifetime_sec = 3600; + u.pool.max_requests_per_conn = 0; + + u.proxy.route_prefix = "/fail"; + u.proxy.strip_prefix = false; + u.proxy.response_timeout_ms = 2000; + u.proxy.retry.max_retries = 0; + + u.circuit_breaker.enabled = true; + u.circuit_breaker.consecutive_failure_threshold = 3; + u.circuit_breaker.failure_rate_threshold = 100; + u.circuit_breaker.minimum_volume = 10000; + u.circuit_breaker.window_seconds = 10; + u.circuit_breaker.permitted_half_open_calls = 2; + u.circuit_breaker.base_open_duration_ms = 5000; + u.circuit_breaker.max_open_duration_ms = 60000; + return u; +} + +// --------------------------------------------------------------------------- +// Test 1: CB-only SIGHUP propagates to live slice config. 
+// +// Build gateway with threshold=3. Reload with threshold=7. Verify the +// slice's live config().consecutive_failure_threshold flipped to 7. +// --------------------------------------------------------------------------- +void TestCbReloadPropagatesToSlice() { + std::cout << "\n[TEST] CB Reload: reload propagates to slice..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + gw.upstreams.push_back( + MakeReloadUpstream("svc", "127.0.0.1", backend_port)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + + auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); + auto* host = cbm->GetHost("svc"); + auto* slice = host->GetSlice(0); + int threshold_before = slice->config().consecutive_failure_threshold; + int window_before = slice->config().window_seconds; + + // Build reloaded config with modified CB fields only. + ServerConfig reloaded = gw; + reloaded.upstreams[0].circuit_breaker.consecutive_failure_threshold = 7; + reloaded.upstreams[0].circuit_breaker.window_seconds = 20; + + bool ok = gateway.Reload(reloaded); + // Reload enqueues per-slice updates on the owning dispatcher — + // brief sleep to let the dispatcher execute the queued Slice::Reload. + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + int threshold_after = slice->config().consecutive_failure_threshold; + int window_after = slice->config().window_seconds; + + bool pass = ok && threshold_before == 3 && window_before == 10 + && threshold_after == 7 && window_after == 20; + TestFramework::RecordTest( + "CB Reload: reload propagates to slice", pass, + pass ? 
"" : + "ok=" + std::to_string(ok) + + " threshold_before=" + std::to_string(threshold_before) + + " threshold_after=" + std::to_string(threshold_after) + + " window_before=" + std::to_string(window_before) + + " window_after=" + std::to_string(window_after)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Reload: reload propagates to slice", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 2: CB-only reload does NOT emit the topology "restart required" +// warning. UpstreamConfig::operator== excludes circuit_breaker so a +// CB-only edit doesn't make the outer config != comparison true — the +// warning fires only on topology-field changes (host, port, pool, proxy, +// tls), which remain restart-only. +// --------------------------------------------------------------------------- +void TestCbOnlyReloadNoRestartWarn() { + std::cout << "\n[TEST] CB Reload: CB-only reload emits no restart warn..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + gw.upstreams.push_back( + MakeReloadUpstream("svc", "127.0.0.1", backend_port)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + + // Attach ringbuffer sink AFTER gateway ctor (logging::Init + // rebuilds the default logger). See the observability test for rationale. 
+ auto ring = std::make_shared< + spdlog::sinks::ringbuffer_sink_mt>(1024); + auto logger = logging::Get(); + auto prev_level = logger->level(); + logger->set_level(spdlog::level::debug); + logger->sinks().push_back(ring); + + struct SinkGuard { + std::shared_ptr logger; + std::shared_ptr ring; + spdlog::level::level_enum prev_level; + ~SinkGuard() { + auto& sinks = logger->sinks(); + sinks.erase(std::remove(sinks.begin(), sinks.end(), + std::shared_ptr(ring)), + sinks.end()); + logger->set_level(prev_level); + } + } guard{logger, ring, prev_level}; + + ServerConfig reloaded = gw; + reloaded.upstreams[0].circuit_breaker.consecutive_failure_threshold = 9; + + gateway.Reload(reloaded); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + + bool saw_topology_warn = false; + bool saw_cb_config_applied = false; + for (const auto& msg : ring->last_formatted()) { + if (msg.find("upstream topology changes require a restart") != + std::string::npos) { + saw_topology_warn = true; + } + if (msg.find("circuit breaker config applied") != + std::string::npos) { + saw_cb_config_applied = true; + } + } + + bool pass = !saw_topology_warn && saw_cb_config_applied; + TestFramework::RecordTest( + "CB Reload: CB-only reload emits no restart warn", pass, + pass ? "" : + "saw_topology_warn=" + std::to_string(saw_topology_warn) + + " saw_cb_config_applied=" + std::to_string(saw_cb_config_applied)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Reload: CB-only reload emits no restart warn", false, + e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 3: Topology change (pool field edit) STILL emits the restart warn +// — the exclusion of circuit_breaker from operator== must NOT compromise +// the restart-required signal for unreloadable fields. 
+// --------------------------------------------------------------------------- +void TestTopologyChangeStillEmitsRestartWarn() { + std::cout << "\n[TEST] CB Reload: topology change still warns..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + gw.upstreams.push_back( + MakeReloadUpstream("svc", "127.0.0.1", backend_port)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + + auto ring = std::make_shared< + spdlog::sinks::ringbuffer_sink_mt>(1024); + auto logger = logging::Get(); + auto prev_level = logger->level(); + logger->set_level(spdlog::level::debug); + logger->sinks().push_back(ring); + + struct SinkGuard { + std::shared_ptr logger; + std::shared_ptr ring; + spdlog::level::level_enum prev_level; + ~SinkGuard() { + auto& sinks = logger->sinks(); + sinks.erase(std::remove(sinks.begin(), sinks.end(), + std::shared_ptr(ring)), + sinks.end()); + logger->set_level(prev_level); + } + } guard{logger, ring, prev_level}; + + ServerConfig reloaded = gw; + // Topology-level edit that operator== still detects. + reloaded.upstreams[0].pool.max_connections = 16; + // Also flip a breaker field so we verify BOTH happen on the + // same reload (live CB edit + topology warn). 
+ reloaded.upstreams[0].circuit_breaker.consecutive_failure_threshold = 5; + + gateway.Reload(reloaded); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + + bool saw_topology_warn = false; + bool saw_cb_config_applied = false; + for (const auto& msg : ring->last_formatted()) { + if (msg.find("upstream topology changes require a restart") != + std::string::npos) { + saw_topology_warn = true; + } + if (msg.find("circuit breaker config applied") != + std::string::npos) { + saw_cb_config_applied = true; + } + } + + bool pass = saw_topology_warn && saw_cb_config_applied; + TestFramework::RecordTest( + "CB Reload: topology change still warns", pass, + pass ? "" : + "saw_topology_warn=" + std::to_string(saw_topology_warn) + + " saw_cb_config_applied=" + std::to_string(saw_cb_config_applied)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Reload: topology change still warns", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 4: Disable → enable toggle via reload. A CB-only reload that sets +// `enabled=false` must make the slice short-circuit admissions; a +// subsequent reload flipping `enabled=true` must re-engage the state +// machine without requiring a restart. Verifies the "wire transition +// callbacks for ALL upstreams regardless of enabled" design (§3.1 R3-1). +// --------------------------------------------------------------------------- +void TestReloadDisableThenEnable() { + std::cout << "\n[TEST] CB Reload: reload disable→enable..." 
<< std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + gw.upstreams.push_back( + MakeReloadUpstream("svc", "127.0.0.1", backend_port)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + + auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); + auto* slice = cbm->GetHost("svc")->GetSlice(0); + + // Start: enabled=true. + bool enabled_before = slice->config().enabled; + + // Reload to enabled=false. + ServerConfig disabled = gw; + disabled.upstreams[0].circuit_breaker.enabled = false; + gateway.Reload(disabled); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + bool disabled_after = !slice->config().enabled; + + // Reload back to enabled=true with a new threshold. + ServerConfig reenabled = gw; + reenabled.upstreams[0].circuit_breaker.enabled = true; + reenabled.upstreams[0].circuit_breaker.consecutive_failure_threshold = 11; + gateway.Reload(reenabled); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + bool enabled_again = slice->config().enabled; + int threshold_after = slice->config().consecutive_failure_threshold; + + bool pass = enabled_before && disabled_after && + enabled_again && threshold_after == 11; + TestFramework::RecordTest( + "CB Reload: reload disable→enable", pass, + pass ? 
"" : + "enabled_before=" + std::to_string(enabled_before) + + " disabled_after=" + std::to_string(disabled_after) + + " enabled_again=" + std::to_string(enabled_again) + + " threshold_after=" + std::to_string(threshold_after)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Reload: reload disable→enable", false, e.what()); + } +} + +void RunAllTests() { + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "CIRCUIT BREAKER - HOT-RELOAD TESTS" << std::endl; + std::cout << std::string(60, '=') << std::endl; + + TestCbReloadPropagatesToSlice(); + TestCbOnlyReloadNoRestartWarn(); + TestTopologyChangeStillEmitsRestartWarn(); + TestReloadDisableThenEnable(); +} + +} // namespace CircuitBreakerReloadTests diff --git a/test/circuit_breaker_retry_budget_test.h b/test/circuit_breaker_retry_budget_test.h new file mode 100644 index 00000000..608a0602 --- /dev/null +++ b/test/circuit_breaker_retry_budget_test.h @@ -0,0 +1,367 @@ +#pragma once + +// Retry-budget integration tests: retry budget wired into ProxyTransaction. +// +// The component suite covers the RetryBudget math (CAS, non-retry +// denominator, min-concurrency floor) as unit tests against the +// RetryBudget class in isolation. This suite tests the INTEGRATION: +// ProxyTransaction resolves +// `retry_budget_` from the same CircuitBreakerHost as `slice_`, tracks +// every attempt's in_flight via the RAII guard, and consults +// `TryConsumeRetry` before each retry. Exhaustion emits the §12.2 +// response (503 + `X-Retry-Budget-Exhausted: 1`) and does NOT feed +// back into the slice's failure math. +// +// Strategy: backends that always 502 with `retry_on_5xx=true` drive the +// retry path. A near-zero retry-budget (`percent=0, min_concurrency=0`) +// rejects every retry deterministically without needing concurrent +// client load. 
The circuit-breaker consecutive-failure threshold is +// raised well above the retry count so the breaker stays CLOSED — the +// budget gate is tested in isolation from the state machine. + +#include "test_framework.h" +#include "test_server_runner.h" +#include "http_test_client.h" +#include "http/http_server.h" +#include "config/server_config.h" + +#include +#include +#include +#include + +namespace CircuitBreakerRetryBudgetTests { + +// Upstream config that always proxies /fail, with the circuit breaker +// enabled so `retry_budget_` is resolved on `slice_`'s host. Breaker +// thresholds intentionally unreachable for these tests — we want the +// retry-budget gate fired in isolation, not co-tripping the state +// machine. +static UpstreamConfig MakeRetryBudgetUpstream(const std::string& name, + const std::string& host, + int port, + int retry_budget_percent, + int retry_budget_min_concurrency, + bool dry_run = false) { + UpstreamConfig u; + u.name = name; + u.host = host; + u.port = port; + u.pool.max_connections = 16; + u.pool.max_idle_connections = 8; + u.pool.connect_timeout_ms = 3000; + u.pool.idle_timeout_sec = 30; + u.pool.max_lifetime_sec = 3600; + u.pool.max_requests_per_conn = 0; + + u.proxy.route_prefix = "/fail"; + u.proxy.strip_prefix = false; + u.proxy.response_timeout_ms = 2000; + + u.circuit_breaker.enabled = true; + u.circuit_breaker.dry_run = dry_run; + // Breaker thresholds unreachable — we don't want the state machine + // tripping during a retry-budget test. 
+ u.circuit_breaker.consecutive_failure_threshold = 10000; + u.circuit_breaker.failure_rate_threshold = 100; + u.circuit_breaker.minimum_volume = 10000; + u.circuit_breaker.window_seconds = 10; + u.circuit_breaker.permitted_half_open_calls = 2; + u.circuit_breaker.base_open_duration_ms = 30000; + u.circuit_breaker.max_open_duration_ms = 60000; + + u.circuit_breaker.retry_budget_percent = retry_budget_percent; + u.circuit_breaker.retry_budget_min_concurrency = retry_budget_min_concurrency; + return u; +} + +static bool HasRetryBudgetHeader(const std::string& response) { + return response.find("X-Retry-Budget-Exhausted: 1") != std::string::npos || + response.find("x-retry-budget-exhausted: 1") != std::string::npos; +} + +// --------------------------------------------------------------------------- +// Test 1: A retry attempt rejected by the retry-budget gate delivers 503 + +// X-Retry-Budget-Exhausted instead of the upstream's 5xx. Verifies that +// `TryConsumeRetry` runs BEFORE the retry executes and that +// `MakeRetryBudgetResponse` is emitted through the standard DeliverResponse +// path. +// +// retry_budget_percent=0 + retry_budget_min_concurrency=0 → cap = 0. Every +// retry attempt's TryConsumeRetry returns false. First attempt is +// unaffected (budget only gates retries), so the backend is hit exactly +// once per client request; the retry is short-circuited locally. +// --------------------------------------------------------------------------- +void TestRetryBudgetRejectsRetry() { + std::cout << "\n[TEST] CB Retry Budget: retry budget rejects retry..." 
+ << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + auto u = MakeRetryBudgetUpstream("svc", "127.0.0.1", backend_port, + /*percent=*/0, + /*min_concurrency=*/0); + u.proxy.retry.max_retries = 3; + u.proxy.retry.retry_on_5xx = true; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 5000); + + bool is_503 = TestHttpClient::HasStatus(r, 503); + bool has_budget_hdr = HasRetryBudgetHeader(r); + // Backend should have been hit exactly once (the first attempt); + // every retry was short-circuited by the budget gate. + int hits = backend_hits.load(std::memory_order_relaxed); + bool single_backend_hit = (hits == 1); + + bool pass = is_503 && has_budget_hdr && single_backend_hit; + TestFramework::RecordTest( + "CB Retry Budget: retry budget rejects retry", pass, + pass ? "" : + "is_503=" + std::to_string(is_503) + + " budget_hdr=" + std::to_string(has_budget_hdr) + + " backend_hits=" + std::to_string(hits) + + " body=" + r.substr(0, 256)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Retry Budget: retry budget rejects retry", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 2: The min-concurrency floor admits retries even when the %-based +// cap would be zero. 
With percent=0 + min_concurrency=5, a single sequential +// client request's retry chain (1 first + 3 retries = 4 backend hits) all +// fit under the floor and proceed normally to the upstream — no 503, no +// X-Retry-Budget-Exhausted, and the client sees the final 5xx response. +// +// This is the symmetric test to Test 1: same near-zero %-cap, but a floor +// large enough that retries aren't budget-gated. Proves the floor is +// consulted (retries admitted) instead of the %-cap (retries rejected). +// --------------------------------------------------------------------------- +void TestRetryBudgetMinConcurrencyFloor() { + std::cout << "\n[TEST] CB Retry Budget: retry budget min-concurrency floor..." + << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + // percent=0 → no %-based capacity. min_concurrency=5 → floor + // admits up to 5 concurrent retries, easily covering the 3 + // sequential retries from a single client request. + auto u = MakeRetryBudgetUpstream("svc", "127.0.0.1", backend_port, + /*percent=*/0, + /*min_concurrency=*/5); + u.proxy.retry.max_retries = 3; + u.proxy.retry.retry_on_5xx = true; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 10000); + + // Client sees the upstream's final 502 — no local 503, no + // X-Retry-Budget-Exhausted. 
+ bool is_502 = TestHttpClient::HasStatus(r, 502); + bool no_budget_hdr = !HasRetryBudgetHeader(r); + // 1 first attempt + 3 retries admitted by the floor = 4 backend hits. + int hits = backend_hits.load(std::memory_order_relaxed); + bool all_retries_proceeded = (hits == 4); + + bool pass = is_502 && no_budget_hdr && all_retries_proceeded; + TestFramework::RecordTest( + "CB Retry Budget: retry budget min-concurrency floor", pass, + pass ? "" : + "is_502=" + std::to_string(is_502) + + " no_budget_hdr=" + std::to_string(no_budget_hdr) + + " backend_hits=" + std::to_string(hits) + + " body=" + r.substr(0, 256)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Retry Budget: retry budget min-concurrency floor", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 3: Dry-run bypasses the retry-budget gate. +// +// With percent=0 + min_concurrency=0 (same as Test 1), TryConsumeRetry +// returns false for every retry. But `circuit_breaker.dry_run=true` +// switches the rejection path to a log-and-proceed: no token is +// consumed, retry_token_held_ stays false, and AttemptCheckout runs as +// though the budget was unlimited. +// +// Result: the client sees the upstream's 502 response (because the +// retries actually fire), NOT a 503 + X-Retry-Budget-Exhausted. +// --------------------------------------------------------------------------- +void TestRetryBudgetDryRunPassthrough() { + std::cout << "\n[TEST] CB Retry Budget: retry budget dry-run passthrough..." 
+ << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + auto u = MakeRetryBudgetUpstream("svc", "127.0.0.1", backend_port, + /*percent=*/0, + /*min_concurrency=*/0, + /*dry_run=*/true); + u.proxy.retry.max_retries = 2; + u.proxy.retry.retry_on_5xx = true; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 10000); + + // Retries proceeded despite would-reject decisions — the client + // sees the upstream's final 502, not our local 503. + bool is_502 = TestHttpClient::HasStatus(r, 502); + bool no_budget_hdr = !HasRetryBudgetHeader(r); + int hits = backend_hits.load(std::memory_order_relaxed); + bool all_attempts_ran = (hits == 3); // 1 first + 2 retries + + bool pass = is_502 && no_budget_hdr && all_attempts_ran; + TestFramework::RecordTest( + "CB Retry Budget: retry budget dry-run passthrough", pass, + pass ? "" : + "is_502=" + std::to_string(is_502) + + " no_budget_hdr=" + std::to_string(no_budget_hdr) + + " backend_hits=" + std::to_string(hits) + + " body=" + r.substr(0, 256)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Retry Budget: retry budget dry-run passthrough", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 4: First attempts are NOT budget-gated. +// +// The retry-budget cap applies only to retries (attempt_ > 0). 
First +// attempts call TrackInFlight (which only ever increments) but skip +// TryConsumeRetry entirely. With percent=0 + min_concurrency=0 and a +// backend that always 200s, every client request must succeed — if the +// gate accidentally ran on first attempts, we'd see 503s here. +// +// Guards against a regression where TryConsumeRetry is called before +// the `attempt_ > 0` gate, or where the gate is placed in +// AttemptCheckout instead of MaybeRetry. +// --------------------------------------------------------------------------- +void TestFirstAttemptsNotGated() { + std::cout << "\n[TEST] CB Retry Budget: first attempts not gated..." + << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + resp.Status(200).Body("ok", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + auto u = MakeRetryBudgetUpstream("svc", "127.0.0.1", backend_port, + /*percent=*/0, + /*min_concurrency=*/0); + // No retries — every request is a first attempt. + u.proxy.retry.max_retries = 0; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + int client_count = 5; + int successes = 0; + for (int i = 0; i < client_count; ++i) { + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + if (TestHttpClient::HasStatus(r, 200)) ++successes; + if (HasRetryBudgetHeader(r)) { + // Any X-Retry-Budget-Exhausted on a first-attempt-only + // path is a bug. Record and bail. 
+ TestFramework::RecordTest( + "CB Retry Budget: first attempts not gated", false, + "unexpected X-Retry-Budget-Exhausted on first-attempt path " + "i=" + std::to_string(i)); + return; + } + } + + int hits = backend_hits.load(std::memory_order_relaxed); + bool pass = (successes == client_count) && (hits == client_count); + TestFramework::RecordTest( + "CB Retry Budget: first attempts not gated", pass, + pass ? "" : + "successes=" + std::to_string(successes) + + "/" + std::to_string(client_count) + + " backend_hits=" + std::to_string(hits)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Retry Budget: first attempts not gated", false, e.what()); + } +} + +void RunAllTests() { + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "CIRCUIT BREAKER - RETRY BUDGET INTEGRATION TESTS" + << std::endl; + std::cout << std::string(60, '=') << std::endl; + + TestRetryBudgetRejectsRetry(); + TestRetryBudgetMinConcurrencyFloor(); + TestRetryBudgetDryRunPassthrough(); + TestFirstAttemptsNotGated(); +} + +} // namespace CircuitBreakerRetryBudgetTests diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h index 65b03777..bed54da0 100644 --- a/test/circuit_breaker_test.h +++ b/test/circuit_breaker_test.h @@ -732,7 +732,7 @@ void TestHalfOpenStopsAdmittingAfterFirstProbeFailure() { } // Verifies the dedicated HALF_OPEN-full counter is bumped separately from the -// generic `rejected_` counter, so Phase 7 snapshots can distinguish +// generic `rejected_` counter, so observability snapshots can distinguish // "open, backoff not elapsed" from "probing, no slots left". void TestHalfOpenFullCounterSeparate() { std::cout << "\n[TEST] CB: HALF_OPEN_FULL counter separate..." 
<< std::endl; @@ -958,7 +958,7 @@ void TestSawFailureDoesNotBumpHalfOpenFullCounter() { // BUG (review round 3, P2): TransitionOpenToHalfOpen deliberately left // `open_until_steady_ns_` populated, violating the documented OpenUntil() -// contract ("zero when not OPEN"). A Phase 4 consumer computing Retry-After +// contract ("zero when not OPEN"). A consumer computing Retry-After // from a HALF_OPEN slice would compute (stale_deadline - now), which is // negative once HALF_OPEN begins. void TestOpenUntilZeroWhenHalfOpen() { diff --git a/test/circuit_breaker_wait_queue_drain_test.h b/test/circuit_breaker_wait_queue_drain_test.h new file mode 100644 index 00000000..d2200094 --- /dev/null +++ b/test/circuit_breaker_wait_queue_drain_test.h @@ -0,0 +1,261 @@ +#pragma once + +// Wait-queue-drain integration tests: wait-queue drain on CLOSED → OPEN trip. +// +// The integration suite covers "new requests after a trip hit +// REJECTED_OPEN". This suite covers the orthogonal case: a request that passed ConsultBreaker +// pre-trip and is waiting in the pool's bounded wait queue when the trip +// fires. Without the drain, that waiter would sit until either the pool +// frees a slot (and then re-hit the upstream — pointless traffic) or the +// queue-timeout / open-duration elapses (up to 60s latency spike). +// +// Mechanism tested: `HttpServer::MarkServerReady` installs a transition +// callback on every slice that routes CLOSED → OPEN to the corresponding +// `PoolPartition::DrainWaitQueueOnTrip()`. Each waiter receives +// `CHECKOUT_CIRCUIT_OPEN`, which `ProxyTransaction::OnCheckoutError` maps +// to the standard circuit-open response (503 + `X-Circuit-Breaker: open`). +// +// Strategy: gate concurrency via a 1-connection pool. The first request +// hangs at the backend long enough to let a second request queue behind +// it. 
When the first's response lands (502), the breaker trips and the +// drain fires, causing the queued request to receive 503 + circuit-open +// headers instead of the backend's 502 (which would happen if the drain +// were missing and the queued request proceeded). + +#include "test_framework.h" +#include "test_server_runner.h" +#include "http_test_client.h" +#include "http/http_server.h" +#include "config/server_config.h" + +#include +#include +#include +#include +#include + +namespace CircuitBreakerWaitQueueDrainTests { + +static UpstreamConfig MakeDrainTripUpstream(const std::string& name, + const std::string& host, + int port, + bool breaker_enabled) { + UpstreamConfig u; + u.name = name; + u.host = host; + u.port = port; + // Single connection per partition — forces the second concurrent + // request to queue behind the first. Since tests run with + // worker_threads=1, one partition exists and it has exactly one + // connection slot. + u.pool.max_connections = 1; + u.pool.max_idle_connections = 1; + u.pool.connect_timeout_ms = 3000; + u.pool.idle_timeout_sec = 30; + u.pool.max_lifetime_sec = 3600; + u.pool.max_requests_per_conn = 0; + + u.proxy.route_prefix = "/fail"; + u.proxy.strip_prefix = false; + u.proxy.response_timeout_ms = 5000; + u.proxy.retry.max_retries = 0; // Deterministic — no retry confounds. + + u.circuit_breaker.enabled = breaker_enabled; + u.circuit_breaker.consecutive_failure_threshold = 1; // Trip on first 5xx. + u.circuit_breaker.failure_rate_threshold = 100; + u.circuit_breaker.minimum_volume = 10000; + u.circuit_breaker.window_seconds = 10; + u.circuit_breaker.permitted_half_open_calls = 2; + // Long open duration so the drain is unambiguously the thing that + // surfaces the 503 to the queued client — not a timer-driven + // HALF_OPEN recovery admitting a subsequent attempt. 
+ u.circuit_breaker.base_open_duration_ms = 30000; + u.circuit_breaker.max_open_duration_ms = 60000; + return u; +} + +// --------------------------------------------------------------------------- +// Test 1: CLOSED→OPEN trip drains queued waiter with 503 + X-Circuit-Breaker. +// +// Request A takes the single pool slot and hangs at the backend for ~300ms. +// Request B queues (pool exhausted). At t≈300ms, A's backend response +// arrives: 502 → slice trip → transition callback → DrainWaitQueueOnTrip → +// B's error_callback fires with CHECKOUT_CIRCUIT_OPEN. B's client receives +// 503 + `X-Circuit-Breaker: open`. +// +// Pre-fix (no drain): B waits ~300ms for A's slot to free, then hits the +// backend itself, gets 502, client sees 502 — NOT 503 and NOT +// X-Circuit-Breaker: open. The assertion `is_503 && has_breaker_header` +// fails without the drain wiring. +// --------------------------------------------------------------------------- +void TestWaitQueueDrainedOnTrip() { + std::cout << "\n[TEST] CB Wait-Queue Drain: wait queue drained on trip..." + << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + // Delay so the gateway's pool holds the connection long + // enough for a second client request to queue on it. + std::this_thread::sleep_for(std::chrono::milliseconds(300)); + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; // Single partition → single wait queue. 
+ gw.http2.enabled = false; + + gw.upstreams.push_back( + MakeDrainTripUpstream("svc", "127.0.0.1", backend_port, + /*breaker_enabled=*/true)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Launch A first (takes the one connection), then B 50ms later + // so B is guaranteed to enter the wait queue. + std::promise a_resp, b_resp; + auto a_fut = a_resp.get_future(); + auto b_fut = b_resp.get_future(); + std::thread a([&]() { + a_resp.set_value(TestHttpClient::HttpGet(gw_port, "/fail", 5000)); + }); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + std::thread b([&]() { + b_resp.set_value(TestHttpClient::HttpGet(gw_port, "/fail", 5000)); + }); + a.join(); + b.join(); + + std::string ra = a_fut.get(); + std::string rb = b_fut.get(); + + // A unambiguously hits the backend (owns the slot) and sees 502. + bool a_is_502 = TestHttpClient::HasStatus(ra, 502); + // B must see the circuit-open short-circuit from the drain — + // NOT a 502 from the backend, which is what happens without + // the drain wiring. + bool b_is_503 = TestHttpClient::HasStatus(rb, 503); + bool b_has_breaker_hdr = + rb.find("X-Circuit-Breaker: open") != std::string::npos || + rb.find("x-circuit-breaker: open") != std::string::npos; + // Exactly one backend hit — B was drained before making it to + // the upstream. Without the drain, backend_hits would be 2. + int hits = backend_hits.load(std::memory_order_relaxed); + bool single_hit = (hits == 1); + + bool pass = a_is_502 && b_is_503 && b_has_breaker_hdr && single_hit; + TestFramework::RecordTest( + "CB Wait-Queue Drain: wait queue drained on trip", pass, + pass ? 
"" : + "a_is_502=" + std::to_string(a_is_502) + + " b_is_503=" + std::to_string(b_is_503) + + " b_breaker_hdr=" + std::to_string(b_has_breaker_hdr) + + " backend_hits=" + std::to_string(hits) + + " rb_head=" + rb.substr(0, 200)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Wait-Queue Drain: wait queue drained on trip", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 2: With the breaker disabled, the drain does NOT fire — the queued +// waiter proceeds to the upstream as it would absent the circuit-breaker +// layer entirely. +// +// Same setup as Test 1 but `circuit_breaker.enabled=false`. Disabled slices +// short-circuit in TryAcquire and never invoke transition callbacks, so +// DrainWaitQueueOnTrip is never called. Request B must hit the backend +// (backend_hits == 2) and receive the upstream's 502 — NOT a 503. +// --------------------------------------------------------------------------- +void TestDisabledBreakerDoesNotDrain() { + std::cout << "\n[TEST] CB Wait-Queue Drain: disabled breaker does not drain..." 
+ << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + std::this_thread::sleep_for(std::chrono::milliseconds(300)); + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + gw.upstreams.push_back( + MakeDrainTripUpstream("svc", "127.0.0.1", backend_port, + /*breaker_enabled=*/false)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + std::promise a_resp, b_resp; + auto a_fut = a_resp.get_future(); + auto b_fut = b_resp.get_future(); + std::thread a([&]() { + a_resp.set_value(TestHttpClient::HttpGet(gw_port, "/fail", 5000)); + }); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + std::thread b([&]() { + b_resp.set_value(TestHttpClient::HttpGet(gw_port, "/fail", 5000)); + }); + a.join(); + b.join(); + + std::string ra = a_fut.get(); + std::string rb = b_fut.get(); + + // Both reach the backend — disabled breaker = no drain. + bool a_is_502 = TestHttpClient::HasStatus(ra, 502); + bool b_is_502 = TestHttpClient::HasStatus(rb, 502); + // Neither should carry the circuit-open header. 
+ bool no_breaker_on_a = + ra.find("X-Circuit-Breaker") == std::string::npos && + ra.find("x-circuit-breaker") == std::string::npos; + bool no_breaker_on_b = + rb.find("X-Circuit-Breaker") == std::string::npos && + rb.find("x-circuit-breaker") == std::string::npos; + int hits = backend_hits.load(std::memory_order_relaxed); + bool two_hits = (hits == 2); + + bool pass = a_is_502 && b_is_502 && no_breaker_on_a && + no_breaker_on_b && two_hits; + TestFramework::RecordTest( + "CB Wait-Queue Drain: disabled breaker does not drain", pass, + pass ? "" : + "a_is_502=" + std::to_string(a_is_502) + + " b_is_502=" + std::to_string(b_is_502) + + " no_breaker_on_a=" + std::to_string(no_breaker_on_a) + + " no_breaker_on_b=" + std::to_string(no_breaker_on_b) + + " backend_hits=" + std::to_string(hits)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Wait-Queue Drain: disabled breaker does not drain", false, e.what()); + } +} + +void RunAllTests() { + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "CIRCUIT BREAKER - WAIT-QUEUE DRAIN ON TRIP TESTS" + << std::endl; + std::cout << std::string(60, '=') << std::endl; + + TestWaitQueueDrainedOnTrip(); + TestDisabledBreakerDoesNotDrain(); +} + +} // namespace CircuitBreakerWaitQueueDrainTests diff --git a/test/config_test.h b/test/config_test.h index fe164ec3..778f464b 100644 --- a/test/config_test.h +++ b/test/config_test.h @@ -562,14 +562,15 @@ namespace ConfigTests { "circuit_breaker.enabled must be a boolean"); } - // Test 14: UpstreamConfig::operator== INCLUDES circuit_breaker until Phase 8. - // Until CircuitBreakerManager::Reload is wired in HttpServer::Reload, a - // CB-only SIGHUP has no propagation path. Keeping circuit_breaker in the - // equality check ensures the server fires the "restart required" warning - // rather than silently reporting "reload OK" with stale live settings. - // TODO(phase-8): flip this test when CB hot-reload is implemented. 
+ // UpstreamConfig::operator== EXCLUDES circuit_breaker. + // CircuitBreakerManager::Reload is wired in HttpServer::Reload, so a + // CB-only SIGHUP is a clean hot reload. Excluding circuit_breaker from + // the equality check ensures the outer reload doesn't fire a spurious + // "restart required" warning on a pure CB-fields edit. + // Topology fields (name, host, port, tls, pool, proxy) remain + // restart-only and must still trigger inequality. void TestCircuitBreakerEquality() { - std::cout << "\n[TEST] Circuit Breaker Equality (CB included until Phase 8)..." << std::endl; + std::cout << "\n[TEST] Circuit Breaker Equality (CB excluded from UpstreamConfig::operator==)..." << std::endl; try { UpstreamConfig a; a.name = "svc"; a.host = "h"; a.port = 80; @@ -578,16 +579,17 @@ namespace ConfigTests { // Default equal. bool equal_default = (a == b); - // Circuit-breaker-only edit DOES change UpstreamConfig equality - // (until Phase 8 ships the live-reload path). + // Circuit-breaker-only edit must NOT break equality — breaker + // fields are live-reloadable via CircuitBreakerManager::Reload. b.circuit_breaker.enabled = true; b.circuit_breaker.window_seconds = 30; - bool cb_edit_detected = (a != b); + bool cb_edit_invisible = (a == b); - // CircuitBreakerConfig::operator== agrees on the field diff. + // CircuitBreakerConfig::operator== still detects the field diff + // (CircuitBreakerManager::Reload relies on this inner comparison). bool cb_fields_differ = (a.circuit_breaker != b.circuit_breaker); - // Topology changes also make configs unequal. + // Topology changes still make configs unequal. 
UpstreamConfig c = a; c.host = "different"; bool topology_changed = (a != c); @@ -596,20 +598,20 @@ namespace ConfigTests { d.port = 9999; bool port_change_detected = (a != d); - bool pass = equal_default && cb_edit_detected && + bool pass = equal_default && cb_edit_invisible && cb_fields_differ && topology_changed && port_change_detected; - TestFramework::RecordTest("Circuit Breaker Equality (CB included until Phase 8)", + TestFramework::RecordTest("Circuit Breaker Equality (CB excluded from UpstreamConfig::operator==)", pass, pass ? "" : "equal_default=" + std::to_string(equal_default) + - " cb_edit_detected=" + std::to_string(cb_edit_detected) + + " cb_edit_invisible=" + std::to_string(cb_edit_invisible) + " cb_fields_differ=" + std::to_string(cb_fields_differ) + " topology_changed=" + std::to_string(topology_changed) + " port_change_detected=" + std::to_string(port_change_detected), TestFramework::TestCategory::OTHER); } catch (const std::exception& e) { - TestFramework::RecordTest("Circuit Breaker Equality (CB included until Phase 8)", + TestFramework::RecordTest("Circuit Breaker Equality (CB excluded from UpstreamConfig::operator==)", false, e.what(), TestFramework::TestCategory::OTHER); } } @@ -629,7 +631,7 @@ namespace ConfigTests { TestEnvOverrides(); TestMissingFile(); - // Phase 1: Circuit breaker config + // Circuit breaker config tests TestCircuitBreakerDefaults(); TestCircuitBreakerJsonParse(); TestCircuitBreakerJsonPartial(); diff --git a/test/run_test.cc b/test/run_test.cc index 17d7eed9..0419c6ee 100644 --- a/test/run_test.cc +++ b/test/run_test.cc @@ -14,11 +14,12 @@ #include "proxy_test.h" #include "rate_limit_test.h" #include "circuit_breaker_test.h" -#include "circuit_breaker_phase3_test.h" -#include "circuit_breaker_phase4_test.h" -#include "circuit_breaker_phase5_test.h" -#include "circuit_breaker_phase6_test.h" -#include "circuit_breaker_phase7_test.h" +#include "circuit_breaker_components_test.h" +#include 
"circuit_breaker_integration_test.h" +#include "circuit_breaker_retry_budget_test.h" +#include "circuit_breaker_wait_queue_drain_test.h" +#include "circuit_breaker_observability_test.h" +#include "circuit_breaker_reload_test.h" #include "test_framework.h" #include #include @@ -86,21 +87,24 @@ void RunAllTest(){ // Run circuit breaker tests CircuitBreakerTests::RunAllTests(); - // Run circuit breaker Phase 3 tests (host / manager / retry budget) - CircuitBreakerPhase3Tests::RunAllTests(); + // Run circuit-breaker component unit tests (RetryBudget / Host / Manager) + CircuitBreakerComponentsTests::RunAllTests(); - // Run circuit breaker Phase 4 integration tests (end-to-end through + // Run circuit-breaker integration tests (end-to-end through // ProxyTransaction + UpstreamManager + HttpServer) - CircuitBreakerPhase4Tests::RunAllTests(); + CircuitBreakerIntegrationTests::RunAllTests(); - // Run circuit breaker Phase 5 retry-budget integration tests - CircuitBreakerPhase5Tests::RunAllTests(); + // Run circuit-breaker retry-budget integration tests + CircuitBreakerRetryBudgetTests::RunAllTests(); - // Run circuit breaker Phase 6 wait-queue-drain-on-trip tests - CircuitBreakerPhase6Tests::RunAllTests(); + // Run circuit-breaker wait-queue-drain-on-trip tests + CircuitBreakerWaitQueueDrainTests::RunAllTests(); - // Run circuit breaker Phase 7 observability tests - CircuitBreakerPhase7Tests::RunAllTests(); + // Run circuit-breaker observability tests + CircuitBreakerObservabilityTests::RunAllTests(); + + // Run circuit-breaker hot-reload tests + CircuitBreakerReloadTests::RunAllTests(); std::cout << "====================================\n" << std::endl; } @@ -180,14 +184,15 @@ int main(int argc, char* argv[]) { // Run rate limit tests }else if(mode == "rate_limit" || mode == "-L"){ RateLimitTests::RunAllTests(); - // Run circuit breaker tests (phases 1-7: unit + phase3 + phase4 + phase5 + phase6 + phase7) + // Run circuit-breaker tests (unit + components + integration + 
retry-budget + drain + observability + reload) }else if(mode == "circuit_breaker" || mode == "-B"){ CircuitBreakerTests::RunAllTests(); - CircuitBreakerPhase3Tests::RunAllTests(); - CircuitBreakerPhase4Tests::RunAllTests(); - CircuitBreakerPhase5Tests::RunAllTests(); - CircuitBreakerPhase6Tests::RunAllTests(); - CircuitBreakerPhase7Tests::RunAllTests(); + CircuitBreakerComponentsTests::RunAllTests(); + CircuitBreakerIntegrationTests::RunAllTests(); + CircuitBreakerRetryBudgetTests::RunAllTests(); + CircuitBreakerWaitQueueDrainTests::RunAllTests(); + CircuitBreakerObservabilityTests::RunAllTests(); + CircuitBreakerReloadTests::RunAllTests(); // Show help }else if(mode == "help" || mode == "-h" || mode == "--help"){ PrintUsage(argv[0]); From ed2946f6a43232fa2c3bd82b5061a03d88a4c131 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 22:05:00 +0800 Subject: [PATCH 26/37] Fix review comment --- test/circuit_breaker_phase3_test.h | 506 ------------ test/circuit_breaker_phase4_test.h | 1213 ---------------------------- test/circuit_breaker_phase5_test.h | 366 --------- test/circuit_breaker_phase6_test.h | 261 ------ test/circuit_breaker_phase7_test.h | 405 ---------- 5 files changed, 2751 deletions(-) delete mode 100644 test/circuit_breaker_phase3_test.h delete mode 100644 test/circuit_breaker_phase4_test.h delete mode 100644 test/circuit_breaker_phase5_test.h delete mode 100644 test/circuit_breaker_phase6_test.h delete mode 100644 test/circuit_breaker_phase7_test.h diff --git a/test/circuit_breaker_phase3_test.h b/test/circuit_breaker_phase3_test.h deleted file mode 100644 index 87ed28e7..00000000 --- a/test/circuit_breaker_phase3_test.h +++ /dev/null @@ -1,506 +0,0 @@ -#pragma once - -#include "test_framework.h" -#include "config/server_config.h" -#include "circuit_breaker/circuit_breaker_state.h" -#include "circuit_breaker/circuit_breaker_slice.h" -#include "circuit_breaker/retry_budget.h" -#include "circuit_breaker/circuit_breaker_host.h" -#include 
"circuit_breaker/circuit_breaker_manager.h" -#include "dispatcher.h" - -#include -#include -#include -#include - -// Phase 3 unit tests: RetryBudget, CircuitBreakerHost, CircuitBreakerManager. -// -// These tests exercise the standalone data structures introduced in Phase 3 -// without any integration into the request path (that comes in Phase 4). -// Every test constructs the object under test in isolation — no live -// dispatchers, no network I/O. A minimal Dispatcher is instantiated only -// where CircuitBreakerHost::Reload needs one to enqueue per-slice Reload -// calls. -namespace CircuitBreakerPhase3Tests { - -using circuit_breaker::CircuitBreakerHost; -using circuit_breaker::CircuitBreakerHostSnapshot; -using circuit_breaker::CircuitBreakerManager; -using circuit_breaker::Decision; -using circuit_breaker::FailureKind; -using circuit_breaker::RetryBudget; -using circuit_breaker::State; - -static CircuitBreakerConfig DefaultCbConfig() { - CircuitBreakerConfig cb; - cb.enabled = true; - cb.consecutive_failure_threshold = 5; - cb.failure_rate_threshold = 50; - cb.minimum_volume = 20; - cb.window_seconds = 10; - cb.permitted_half_open_calls = 3; - cb.base_open_duration_ms = 5000; - cb.max_open_duration_ms = 60000; - cb.retry_budget_percent = 20; - cb.retry_budget_min_concurrency = 3; - return cb; -} - -// ============================================================================ -// RetryBudget tests -// ============================================================================ - -// Min-concurrency floor: with tiny in_flight, min_concurrency still permits -// the configured floor of concurrent retries (otherwise a 20% budget allows 0 -// retries when in_flight < 5 — useless in low-volume services). -void TestRetryBudgetMinConcurrencyFloor() { - std::cout << "\n[TEST] RetryBudget: min_concurrency floor permits retries..." - << std::endl; - try { - // percent=20, min=3. Even with 0 in_flight, 3 retries allowed. 
- RetryBudget rb(20, 3); - - // Without any in_flight, min floor is what gates us. - bool r1 = rb.TryConsumeRetry(); // 1/3 - bool r2 = rb.TryConsumeRetry(); // 2/3 - bool r3 = rb.TryConsumeRetry(); // 3/3 - bool r4 = rb.TryConsumeRetry(); // over → rejected - - bool pass = r1 && r2 && r3 && !r4 && - rb.RetriesInFlight() == 3 && - rb.RetriesRejected() == 1; - - rb.ReleaseRetry(); rb.ReleaseRetry(); rb.ReleaseRetry(); - pass = pass && rb.RetriesInFlight() == 0; - - TestFramework::RecordTest("RetryBudget min_concurrency floor", pass, - pass ? "" : "r1=" + std::to_string(r1) + - " r2=" + std::to_string(r2) + - " r3=" + std::to_string(r3) + - " r4=" + std::to_string(r4) + - " inflight=" + std::to_string(rb.RetriesInFlight()) + - " rejected=" + std::to_string(rb.RetriesRejected()), - TestFramework::TestCategory::OTHER); - } catch (const std::exception& e) { - TestFramework::RecordTest("RetryBudget min_concurrency floor", false, - e.what(), TestFramework::TestCategory::OTHER); - } -} - -// Percent-based cap scales with in_flight. -// percent=20, min=0, in_flight=50 → cap = 10 retries. -void TestRetryBudgetPercentCap() { - std::cout << "\n[TEST] RetryBudget: percent cap scales with in_flight..." - << std::endl; - try { - RetryBudget rb(20, 0); // no min floor — pure percent - - // Push in_flight to 50 via guards that we intentionally keep - // alive. Per the documented API, callers hold TrackInFlight() - // for BOTH first attempts and retries — but TryConsumeRetry - // subtracts retries_in_flight from the base so the budget - // doesn't self-inflate as retries are admitted. - std::vector guards; - for (int i = 0; i < 50; ++i) guards.push_back(rb.TrackInFlight()); - - // With 50 non-retry in-flight and 20% budget the first - // admission is against cap=10, but each admission shrinks the - // non-retry base by 1. The admission count converges at r - // where r >= floor((50-r) * 20 / 100). Solving: r = 8. 
The - // pre-fix formula (cap computed from raw in_flight) would - // admit 10, drifting the effective ratio above 20% of - // originals. - int admitted = 0; - for (int i = 0; i < 20; ++i) { - if (rb.TryConsumeRetry()) ++admitted; - } - bool cap_hit = admitted == 8; - bool rejected_count = rb.RetriesRejected() == 12; - - // Release guards — in_flight drops to 0; future TryConsumeRetry with - // min=0 and in_flight=0 rejects everything. - for (auto& g : guards) (void)std::move(g); - guards.clear(); - for (int i = 0; i < admitted; ++i) rb.ReleaseRetry(); - - bool pass = cap_hit && rejected_count && rb.InFlight() == 0 && - rb.RetriesInFlight() == 0; - TestFramework::RecordTest("RetryBudget percent cap", pass, - pass ? "" : "admitted=" + std::to_string(admitted) + - " rejected=" + std::to_string(rb.RetriesRejected()) + - " inflight=" + std::to_string(rb.InFlight()), - TestFramework::TestCategory::OTHER); - } catch (const std::exception& e) { - TestFramework::RecordTest("RetryBudget percent cap", false, - e.what(), TestFramework::TestCategory::OTHER); - } -} - -// TrackInFlight guards must be RAII-safe: destroying the guard decrements -// in_flight_; moving the guard transfers ownership; self-move safe. -void TestRetryBudgetInFlightGuardRaii() { - std::cout << "\n[TEST] RetryBudget: InFlightGuard RAII..." << std::endl; - try { - RetryBudget rb(20, 3); - - bool zero_init = rb.InFlight() == 0; - { - auto g = rb.TrackInFlight(); - bool one_after_track = rb.InFlight() == 1; - - // Move-construct: counter transfers, original is empty. - auto g2 = std::move(g); - bool still_one_after_move = rb.InFlight() == 1; - // g is now empty, destroying it decrements nothing. - (void)g; - - // g2 goes out of scope next. 
- if (!zero_init || !one_after_track || !still_one_after_move) { - TestFramework::RecordTest("RetryBudget InFlightGuard RAII", - false, "mid-test state wrong", - TestFramework::TestCategory::OTHER); - return; - } - } - bool zero_after_drop = rb.InFlight() == 0; - TestFramework::RecordTest("RetryBudget InFlightGuard RAII", - zero_after_drop, - zero_after_drop ? "" : "in_flight not zero after guard drop", - TestFramework::TestCategory::OTHER); - } catch (const std::exception& e) { - TestFramework::RecordTest("RetryBudget InFlightGuard RAII", - false, e.what(), TestFramework::TestCategory::OTHER); - } -} - -// Reload updates tuning atomically without resetting in-flight counters — -// the admission formula changes, outstanding retries keep running. -void TestRetryBudgetReloadPreservesCounters() { - std::cout << "\n[TEST] RetryBudget: Reload preserves in-flight..." - << std::endl; - try { - RetryBudget rb(20, 3); - bool r1 = rb.TryConsumeRetry(); // 1/3 - - // Tighten tuning mid-flight. - rb.Reload(10, 1); - - // Outstanding retry is still tracked. - bool inflight_preserved = rb.RetriesInFlight() == 1; - - // New tuning applies — min=1, so 1/1 retry allowed max. - // Current retries_in_flight=1 already, next attempt rejects. - bool r2 = rb.TryConsumeRetry(); - - rb.ReleaseRetry(); - bool cleanup_ok = rb.RetriesInFlight() == 0; - - bool pass = r1 && inflight_preserved && !r2 && cleanup_ok; - TestFramework::RecordTest("RetryBudget Reload preserves counters", pass, - pass ? 
"" : "r1=" + std::to_string(r1) + - " inflight_preserved=" + std::to_string(inflight_preserved) + - " r2=" + std::to_string(r2) + - " cleanup_ok=" + std::to_string(cleanup_ok), - TestFramework::TestCategory::OTHER); - } catch (const std::exception& e) { - TestFramework::RecordTest("RetryBudget Reload preserves counters", - false, e.what(), TestFramework::TestCategory::OTHER); - } -} - -// Clamp guards: negative percent / negative min_concurrency are clamped at -// construction (mirrors ConfigLoader::Validate — programmatic callers that -// bypass validation get safe defaults). -void TestRetryBudgetClampsInvalidTuning() { - std::cout << "\n[TEST] RetryBudget: clamps invalid tuning..." << std::endl; - try { - RetryBudget rb(-50, -10); - bool clamped = rb.percent() == 0 && rb.min_concurrency() == 0; - - // Over-max percent clamps to 100. - RetryBudget rb2(500, 5); - bool over_clamped = rb2.percent() == 100; - - // Reload also clamps. - rb.Reload(-1, -1); - bool reload_clamped = rb.percent() == 0 && rb.min_concurrency() == 0; - - bool pass = clamped && over_clamped && reload_clamped; - TestFramework::RecordTest("RetryBudget clamps invalid tuning", pass, - pass ? "" : - "clamped=" + std::to_string(clamped) + - " over_clamped=" + std::to_string(over_clamped) + - " reload_clamped=" + std::to_string(reload_clamped), - TestFramework::TestCategory::OTHER); - } catch (const std::exception& e) { - TestFramework::RecordTest("RetryBudget clamps invalid tuning", - false, e.what(), TestFramework::TestCategory::OTHER); - } -} - -// ============================================================================ -// CircuitBreakerHost tests -// ============================================================================ - -// Host creates partition_count slices, GetSlice looks up by index, out-of- -// range returns nullptr (not a crash). -void TestHostCreatesSlicesAndGetSlice() { - std::cout << "\n[TEST] CircuitBreakerHost: creates slices + GetSlice..." 
- << std::endl; - try { - auto cb = DefaultCbConfig(); - CircuitBreakerHost host("svc", "10.0.0.1", 8080, 4, cb); - - bool count_ok = host.partition_count() == 4; - bool slice0 = host.GetSlice(0) != nullptr; - bool slice3 = host.GetSlice(3) != nullptr; - bool slice4_null = host.GetSlice(4) == nullptr; // out of range - bool slice_big_null = host.GetSlice(100) == nullptr; - - // Retry budget always present. - bool rb_present = host.GetRetryBudget() != nullptr; - - // Field getters. - bool fields_ok = host.service_name() == "svc" && - host.host() == "10.0.0.1" && - host.port() == 8080; - - bool pass = count_ok && slice0 && slice3 && slice4_null && - slice_big_null && rb_present && fields_ok; - TestFramework::RecordTest("CircuitBreakerHost GetSlice", pass, "", - TestFramework::TestCategory::OTHER); - } catch (const std::exception& e) { - TestFramework::RecordTest("CircuitBreakerHost GetSlice", false, - e.what(), TestFramework::TestCategory::OTHER); - } -} - -// Host Snapshot aggregates counters across slices and rolls up states. -void TestHostSnapshotAggregates() { - std::cout << "\n[TEST] CircuitBreakerHost: Snapshot aggregates..." - << std::endl; - try { - auto cb = DefaultCbConfig(); - cb.consecutive_failure_threshold = 2; - cb.failure_rate_threshold = 100; - cb.minimum_volume = 1000; - CircuitBreakerHost host("svc", "h", 80, 3, cb); - - // Trip slice 0 and 2 → 2 open_partitions, 1 closed. 
- for (int p : {0, 2}) { - auto* s = host.GetSlice(p); - for (int i = 0; i < 2; ++i) { - auto a = s->TryAcquire(); - s->ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); - } - } - - auto snap = host.Snapshot(); - - bool rows_ok = snap.slices.size() == 3; - bool total_trips = snap.total_trips == 2; - bool open = snap.open_partitions == 2; - bool halfopen = snap.half_open_partitions == 0; - bool svc_ok = snap.service_name == "svc" && - snap.host == "h" && snap.port == 80; - - bool pass = rows_ok && total_trips && open && halfopen && svc_ok; - TestFramework::RecordTest("CircuitBreakerHost Snapshot aggregates", pass, - pass ? "" : - "rows=" + std::to_string(snap.slices.size()) + - " trips=" + std::to_string(snap.total_trips) + - " open=" + std::to_string(snap.open_partitions), - TestFramework::TestCategory::OTHER); - } catch (const std::exception& e) { - TestFramework::RecordTest("CircuitBreakerHost Snapshot aggregates", - false, e.what(), TestFramework::TestCategory::OTHER); - } -} - -// Host Reload with mismatched dispatcher count logs error and does nothing. -// Uses an empty dispatcher vector — the mismatch path must NOT dereference. -void TestHostReloadDispatcherMismatchIsSafe() { - std::cout << "\n[TEST] CircuitBreakerHost: Reload dispatcher mismatch..." - << std::endl; - try { - auto cb = DefaultCbConfig(); - CircuitBreakerHost host("svc", "h", 80, 3, cb); - - auto new_cb = cb; - new_cb.failure_rate_threshold = 80; - - // Mismatch: 0 dispatchers vs 3 slices. Must not crash, must not - // apply (retry budget atomics should stay at old values). - std::vector> empty; - host.Reload(empty, new_cb); - - // Retry budget fields should be unchanged — Reload bailed early. - bool rb_unchanged = - host.GetRetryBudget()->percent() == cb.retry_budget_percent && - host.GetRetryBudget()->min_concurrency() == - cb.retry_budget_min_concurrency; - - TestFramework::RecordTest("CircuitBreakerHost Reload mismatch is safe", - rb_unchanged, - rb_unchanged ? 
"" : "retry budget incorrectly updated on bail", - TestFramework::TestCategory::OTHER); - } catch (const std::exception& e) { - TestFramework::RecordTest("CircuitBreakerHost Reload mismatch is safe", - false, e.what(), TestFramework::TestCategory::OTHER); - } -} - -// ============================================================================ -// CircuitBreakerManager tests -// ============================================================================ - -// Manager builds one host per upstream (regardless of enabled). GetHost -// returns non-null for known names and null for unknown. -void TestManagerGetHostLookup() { - std::cout << "\n[TEST] CircuitBreakerManager: GetHost lookup..." - << std::endl; - try { - std::vector upstreams(2); - upstreams[0].name = "svc-a"; - upstreams[0].host = "10.0.0.1"; - upstreams[0].port = 8080; - upstreams[0].circuit_breaker = DefaultCbConfig(); - upstreams[1].name = "svc-b"; - upstreams[1].host = "10.0.0.2"; - upstreams[1].port = 9090; - upstreams[1].circuit_breaker = DefaultCbConfig(); - upstreams[1].circuit_breaker.enabled = false; // disabled still built - - CircuitBreakerManager mgr(upstreams, 4, {}); - - bool count_ok = mgr.host_count() == 2; - auto* a = mgr.GetHost("svc-a"); - auto* b = mgr.GetHost("svc-b"); - auto* unknown = mgr.GetHost("nope"); - - bool a_ok = a != nullptr && a->port() == 8080 && - a->partition_count() == 4; - bool b_ok = b != nullptr && b->port() == 9090 && - b->partition_count() == 4; - bool unknown_null = unknown == nullptr; - - bool pass = count_ok && a_ok && b_ok && unknown_null; - TestFramework::RecordTest("CircuitBreakerManager GetHost lookup", pass, - pass ? 
"" : - "count_ok=" + std::to_string(count_ok) + - " a=" + std::to_string(a_ok) + - " b=" + std::to_string(b_ok) + - " unknown_null=" + std::to_string(unknown_null), - TestFramework::TestCategory::OTHER); - } catch (const std::exception& e) { - TestFramework::RecordTest("CircuitBreakerManager GetHost lookup", - false, e.what(), TestFramework::TestCategory::OTHER); - } -} - -// SnapshotAll returns one entry per host; topology-preserved Reload logs and -// skips new/removed names without crashing. -void TestManagerSnapshotAllAndReloadSkipsTopologyChanges() { - std::cout << "\n[TEST] CircuitBreakerManager: SnapshotAll + Reload skips topology..." - << std::endl; - try { - std::vector upstreams(1); - upstreams[0].name = "svc-a"; - upstreams[0].host = "h"; - upstreams[0].port = 80; - upstreams[0].circuit_breaker = DefaultCbConfig(); - - CircuitBreakerManager mgr(upstreams, 2, {}); - - auto snaps = mgr.SnapshotAll(); - bool one_snapshot = snaps.size() == 1; - bool snap_name_ok = snaps[0].service_name == "svc-a"; - - // Reload with a NEW name + REMOVED existing name — both must log - // warn and do nothing (topology is restart-only). - std::vector new_upstreams(1); - new_upstreams[0].name = "svc-NEW"; - new_upstreams[0].host = "h"; - new_upstreams[0].port = 80; - new_upstreams[0].circuit_breaker = DefaultCbConfig(); - - mgr.Reload(new_upstreams); - - // Manager must still only know about svc-a (the original). - bool original_preserved = mgr.GetHost("svc-a") != nullptr; - bool new_not_added = mgr.GetHost("svc-NEW") == nullptr; - bool count_stable = mgr.host_count() == 1; - - bool pass = one_snapshot && snap_name_ok && original_preserved && - new_not_added && count_stable; - TestFramework::RecordTest( - "CircuitBreakerManager SnapshotAll + topology-skip", pass, - pass ? 
"" : - "one_snap=" + std::to_string(one_snapshot) + - " name_ok=" + std::to_string(snap_name_ok) + - " preserved=" + std::to_string(original_preserved) + - " new_not_added=" + std::to_string(new_not_added) + - " count=" + std::to_string(mgr.host_count()), - TestFramework::TestCategory::OTHER); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CircuitBreakerManager SnapshotAll + topology-skip", - false, e.what(), TestFramework::TestCategory::OTHER); - } -} - -// Empty-name upstream is skipped defensively (ConfigLoader::Validate rejects -// empty names, but manager must not blow up if something slips through). -void TestManagerSkipsEmptyNameUpstream() { - std::cout << "\n[TEST] CircuitBreakerManager: skips empty-name upstream..." - << std::endl; - try { - std::vector upstreams(2); - upstreams[0].name = ""; // defensive — should be skipped - upstreams[0].host = "h"; - upstreams[0].port = 80; - upstreams[0].circuit_breaker = DefaultCbConfig(); - upstreams[1].name = "svc-b"; - upstreams[1].host = "h"; - upstreams[1].port = 81; - upstreams[1].circuit_breaker = DefaultCbConfig(); - - CircuitBreakerManager mgr(upstreams, 2, {}); - - bool pass = mgr.host_count() == 1 && - mgr.GetHost("svc-b") != nullptr && - mgr.GetHost("") == nullptr; - TestFramework::RecordTest( - "CircuitBreakerManager skips empty-name upstream", pass, - pass ? "" : "count=" + std::to_string(mgr.host_count()), - TestFramework::TestCategory::OTHER); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CircuitBreakerManager skips empty-name upstream", - false, e.what(), TestFramework::TestCategory::OTHER); - } -} - -// Run all Phase 3 tests. 
-void RunAllTests() { - std::cout << "\n" << std::string(60, '=') << std::endl; - std::cout << "CIRCUIT BREAKER PHASE 3 - UNIT TESTS" << std::endl; - std::cout << std::string(60, '=') << std::endl; - - TestRetryBudgetMinConcurrencyFloor(); - TestRetryBudgetPercentCap(); - TestRetryBudgetInFlightGuardRaii(); - TestRetryBudgetReloadPreservesCounters(); - TestRetryBudgetClampsInvalidTuning(); - - TestHostCreatesSlicesAndGetSlice(); - TestHostSnapshotAggregates(); - TestHostReloadDispatcherMismatchIsSafe(); - - TestManagerGetHostLookup(); - TestManagerSnapshotAllAndReloadSkipsTopologyChanges(); - TestManagerSkipsEmptyNameUpstream(); -} - -} // namespace CircuitBreakerPhase3Tests diff --git a/test/circuit_breaker_phase4_test.h b/test/circuit_breaker_phase4_test.h deleted file mode 100644 index 5626b77a..00000000 --- a/test/circuit_breaker_phase4_test.h +++ /dev/null @@ -1,1213 +0,0 @@ -#pragma once - -// Phase 4 integration tests: circuit breaker wired into ProxyTransaction + -// UpstreamManager + HttpServer. Exercises the full request path end-to-end. -// -// Strategy: use a backend that returns 5xx on every request so repeated hits -// trip the breaker via the consecutive-failure threshold. 5xx responses are -// the cheapest way to accumulate failures (no connect timeouts to wait for). -// Low thresholds keep tests fast. - -#include "test_framework.h" -#include "test_server_runner.h" -#include "http_test_client.h" -#include "http/http_server.h" -#include "config/server_config.h" -#include "upstream/upstream_manager.h" -#include "circuit_breaker/circuit_breaker_manager.h" -#include "circuit_breaker/circuit_breaker_host.h" -#include "circuit_breaker/circuit_breaker_slice.h" - -#include -#include -#include - -namespace CircuitBreakerPhase4Tests { - -using circuit_breaker::State; - -// Shared helper: build an upstream config that proxies /echo → backend and -// has a breaker configured with low thresholds for fast trip. 
-static UpstreamConfig MakeBreakerUpstream(const std::string& name, - const std::string& host, - int port, - bool breaker_enabled, - int consecutive_threshold = 3) { - UpstreamConfig u; - u.name = name; - u.host = host; - u.port = port; - u.pool.max_connections = 8; - u.pool.max_idle_connections = 4; - u.pool.connect_timeout_ms = 3000; - u.pool.idle_timeout_sec = 30; - u.pool.max_lifetime_sec = 3600; - u.pool.max_requests_per_conn = 0; - - // Exact-match route — simpler than prefix patterns for integration tests. - u.proxy.route_prefix = "/fail"; - u.proxy.strip_prefix = false; - u.proxy.response_timeout_ms = 2000; - // No retries — keeps the test deterministic: one request = one attempt. - u.proxy.retry.max_retries = 0; - - u.circuit_breaker.enabled = breaker_enabled; - u.circuit_breaker.consecutive_failure_threshold = consecutive_threshold; - // Disable the rate-based trip path — we drive everything through - // consecutive failures to keep the test count predictable. - u.circuit_breaker.failure_rate_threshold = 100; - u.circuit_breaker.minimum_volume = 10000; - u.circuit_breaker.window_seconds = 10; - u.circuit_breaker.permitted_half_open_calls = 2; - u.circuit_breaker.base_open_duration_ms = 500; // short so recovery test is quick - u.circuit_breaker.max_open_duration_ms = 60000; - return u; -} - -// --------------------------------------------------------------------------- -// Test 1: Breaker trips on consecutive 5xx responses and emits circuit-open -// headers on the rejected request. -// --------------------------------------------------------------------------- -void TestBreakerTripsAfterConsecutiveFailures() { - std::cout << "\n[TEST] CB Phase 4: breaker trips after consecutive 5xx..." - << std::endl; - try { - // Backend always returns 502 — gateway classifies the response as - // FailureKind::RESPONSE_5XX and reports to the breaker on every attempt. 
- HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { - resp.Status(502).Body("upstream err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - // worker_threads=1 → all TCP connections land on dispatcher 0 - // (NetServer shards new connections by fd%worker_threads), so - // per-request failures accumulate deterministically on slice[0] - // instead of splitting across multiple slices. // single thread → single breaker partition exercised - gw.upstreams.push_back( - MakeBreakerUpstream("bad-svc", "127.0.0.1", backend_port, - /*enabled=*/true, /*threshold=*/3)); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - // Hit the failing backend threshold times — each 502 from backend - // propagates to the client as 502 (gateway pass-through) AND counts - // as a RESPONSE_5XX failure in the breaker. - for (int i = 0; i < 3; ++i) { - std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); - if (!TestHttpClient::HasStatus(r, 502)) { - TestFramework::RecordTest( - "CB Phase 4: trip after consecutive failures", false, - "pre-trip request " + std::to_string(i) + " expected 502, got: " + - r.substr(0, 32)); - return; - } - } - - // Next request must be rejected by the breaker (not proxied). The - // response is 503 with X-Circuit-Breaker: open and Retry-After. 
- std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); - bool is_503 = TestHttpClient::HasStatus(r, 503); - bool has_breaker_header = - r.find("X-Circuit-Breaker: open") != std::string::npos || - r.find("x-circuit-breaker: open") != std::string::npos; - bool has_retry_after = - r.find("Retry-After:") != std::string::npos || - r.find("retry-after:") != std::string::npos; - bool has_upstream_host = - r.find("X-Upstream-Host:") != std::string::npos || - r.find("x-upstream-host:") != std::string::npos; - - bool pass = is_503 && has_breaker_header && has_retry_after && - has_upstream_host; - TestFramework::RecordTest( - "CB Phase 4: trip after consecutive failures", pass, - pass ? "" : - "is_503=" + std::to_string(is_503) + - " breaker_hdr=" + std::to_string(has_breaker_header) + - " retry_after=" + std::to_string(has_retry_after) + - " upstream_host=" + std::to_string(has_upstream_host) + - " body=" + r.substr(0, 256)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 4: trip after consecutive failures", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 2: When circuit_breaker.enabled=false, the breaker is bypassed entirely. -// The same failure pattern that would trip an enabled breaker must leave the -// pass-through path untouched — every request still reaches the backend. -// --------------------------------------------------------------------------- -void TestBreakerDisabledPassesThrough() { - std::cout << "\n[TEST] CB Phase 4: disabled breaker passes through..." 
- << std::endl; - try { - std::atomic backend_hits{0}; - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { - backend_hits.fetch_add(1, std::memory_order_relaxed); - resp.Status(502).Body("err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - // worker_threads=1 → all TCP connections land on dispatcher 0 - // (NetServer shards new connections by fd%worker_threads), so - // per-request failures accumulate deterministically on slice[0] - // instead of splitting across multiple slices. - gw.upstreams.push_back( - MakeBreakerUpstream("svc", "127.0.0.1", backend_port, - /*enabled=*/false, /*threshold=*/3)); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - // 10 requests — with breaker disabled, all 10 reach backend. - for (int i = 0; i < 10; ++i) { - std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); - if (!TestHttpClient::HasStatus(r, 502)) { - TestFramework::RecordTest( - "CB Phase 4: disabled breaker passes through", false, - "request " + std::to_string(i) + " expected 502, got: " + - r.substr(0, 32)); - return; - } - } - - bool all_hit = backend_hits.load() == 10; - TestFramework::RecordTest( - "CB Phase 4: disabled breaker passes through", all_hit, - all_hit ? "" : - "expected 10 backend hits, got " + std::to_string(backend_hits.load())); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 4: disabled breaker passes through", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 3: 2xx responses are reported as success — they reset the -// consecutive-failure counter so the breaker doesn't trip on interleaved -// success/failure traffic. 
-// --------------------------------------------------------------------------- -void TestSuccessResetsConsecutiveFailureCounter() { - std::cout << "\n[TEST] CB Phase 4: 2xx success resets consecutive-failure counter..." - << std::endl; - try { - std::atomic fail_mode{true}; - HttpServer backend("127.0.0.1", 0); - // Backend must serve /fail — that's the exact-match route the - // proxy forwards (MakeBreakerUpstream sets route_prefix="/fail", - // strip_prefix=false). A different backend path would leave - // the gateway 404-ing every request without ever exercising - // the proxy, and the CLOSED-state assertion below would pass - // for the wrong reason. - backend.Get("/fail", [&fail_mode](const HttpRequest&, HttpResponse& resp) { - if (fail_mode.load()) { - resp.Status(502).Body("err", "text/plain"); - } else { - resp.Status(200).Body("ok", "text/plain"); - } - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - // worker_threads=1 → all TCP connections land on dispatcher 0 - // (NetServer shards new connections by fd%worker_threads), so - // per-request failures accumulate deterministically on slice[0] - // instead of splitting across multiple slices. - gw.upstreams.push_back( - MakeBreakerUpstream("svc", "127.0.0.1", backend_port, - /*enabled=*/true, /*threshold=*/3)); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - // Pattern: F F S F F — 5 total: 2 fails, 1 success, 2 fails. - // With reset semantics, consecutive_failures_ never exceeds 2 → no trip. 
- for (int i = 0; i < 2; ++i) { - TestHttpClient::HttpGet(gw_port, "/fail", 3000); // FAIL - } - fail_mode.store(false); - TestHttpClient::HttpGet(gw_port, "/fail", 3000); // SUCCESS → reset - fail_mode.store(true); - for (int i = 0; i < 2; ++i) { - TestHttpClient::HttpGet(gw_port, "/fail", 3000); // FAIL - } - - // Inspect the breaker's state directly. The slice must be CLOSED - // AND must have observed activity — without the second check, a - // gateway that 404's every request (e.g. because the proxy route - // doesn't match) would also pass trivially. - auto* cbm = gateway.GetUpstreamManager() ? - gateway.GetUpstreamManager()->GetCircuitBreakerManager() : nullptr; - auto* host = cbm ? cbm->GetHost("svc") : nullptr; - auto* slice = host ? host->GetSlice(0) : nullptr; - bool still_closed = slice && slice->CurrentState() == State::CLOSED; - // No trip fired: total_trips should be zero for this slice. - int64_t trips = slice ? slice->Trips() : -1; - bool no_trips = (trips == 0); - - bool pass = still_closed && no_trips; - TestFramework::RecordTest( - "CB Phase 4: success resets consecutive counter", pass, - pass ? "" : - "state=" + std::to_string(static_cast( - slice ? slice->CurrentState() : State::CLOSED)) + - " trips=" + std::to_string(trips)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 4: success resets consecutive counter", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 4: After the trip, the live slice state is OPEN. Verifies the -// integration actually drives the slice state machine (not just the response). -// --------------------------------------------------------------------------- -void TestTripDrivesSliceState() { - std::cout << "\n[TEST] CB Phase 4: trip drives slice state to OPEN..." 
- << std::endl; - try { - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { - resp.Status(502).Body("err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - // worker_threads=1 → all TCP connections land on dispatcher 0 - // (NetServer shards new connections by fd%worker_threads), so - // per-request failures accumulate deterministically on slice[0] - // instead of splitting across multiple slices. - gw.upstreams.push_back( - MakeBreakerUpstream("svc", "127.0.0.1", backend_port, - /*enabled=*/true, /*threshold=*/3)); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - // 3 failures → trip. - for (int i = 0; i < 3; ++i) { - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - } - - // With worker_threads > 1 the 3 failing requests can land on either - // dispatcher (hash-dependent). Check the aggregate snapshot — at - // least one partition must be OPEN with exactly one trip recorded. - auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); - auto* host = cbm->GetHost("svc"); - auto snap = host->Snapshot(); - bool at_least_one_open = snap.open_partitions >= 1; - bool one_trip = snap.total_trips == 1; - // Sanity: the tripped partition should be the one that saw all 3 - // failures (consecutive trip is single-slice, not cross-slice). - bool single_partition_tripped = snap.open_partitions == 1; - - bool pass = at_least_one_open && one_trip && single_partition_tripped; - TestFramework::RecordTest( - "CB Phase 4: trip drives slice state to OPEN", pass, - pass ? 
"" : - "at_least_one_open=" + std::to_string(at_least_one_open) + - " one_trip=" + std::to_string(one_trip) + - " single_partition=" + std::to_string(single_partition_tripped) + - " (open_partitions=" + std::to_string(snap.open_partitions) + - ", total_trips=" + std::to_string(snap.total_trips) + ")"); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 4: trip drives slice state to OPEN", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 5: Breaker-rejected requests do NOT hit the backend. After the trip, -// subsequent requests must be served locally (503) without any upstream I/O. -// Prevents regression where the gate leaked admissions to a known-bad upstream. -// --------------------------------------------------------------------------- -void TestOpenBreakerShortCircuitsUpstreamCall() { - std::cout << "\n[TEST] CB Phase 4: OPEN breaker short-circuits upstream call..." - << std::endl; - try { - std::atomic backend_hits{0}; - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { - backend_hits.fetch_add(1, std::memory_order_relaxed); - resp.Status(502).Body("err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - // worker_threads=1 → all TCP connections land on dispatcher 0 - // (NetServer shards new connections by fd%worker_threads), so - // per-request failures accumulate deterministically on slice[0] - // instead of splitting across multiple slices. - gw.upstreams.push_back( - MakeBreakerUpstream("svc", "127.0.0.1", backend_port, - /*enabled=*/true, /*threshold=*/3)); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - // 3 failing requests to trip. 
- for (int i = 0; i < 3; ++i) { - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - } - int hits_at_trip = backend_hits.load(); - - // 5 more requests — all should be rejected locally. - for (int i = 0; i < 5; ++i) { - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - } - int hits_after = backend_hits.load(); - - // Backend hits must not grow during the post-trip burst. - bool no_leak = hits_after == hits_at_trip; - TestFramework::RecordTest( - "CB Phase 4: OPEN short-circuits upstream call", no_leak, - no_leak ? "" : - "backend hits grew from " + std::to_string(hits_at_trip) + - " to " + std::to_string(hits_after) + " after trip"); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 4: OPEN short-circuits upstream call", false, e.what()); - } -} - -// Sanity check: verify the bare proxy setup works without the breaker -// before blaming the breaker integration. -void TestBareProxyWorks() { - std::cout << "\n[TEST] CB Phase 4: bare proxy (sanity)..." << std::endl; - try { - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { - resp.Status(502).Body("err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - UpstreamConfig u; - u.name = "svc"; - u.host = "127.0.0.1"; - u.port = backend_port; - u.pool.max_connections = 8; - u.pool.max_idle_connections = 4; - u.pool.connect_timeout_ms = 3000; - u.proxy.route_prefix = "/fail"; - u.proxy.response_timeout_ms = 5000; - u.circuit_breaker.enabled = true; // sanity + breaker enabled - u.circuit_breaker.consecutive_failure_threshold = 3; - u.circuit_breaker.failure_rate_threshold = 100; - u.circuit_breaker.minimum_volume = 10000; - u.circuit_breaker.window_seconds = 10; - u.circuit_breaker.permitted_half_open_calls = 2; - u.circuit_breaker.base_open_duration_ms = 
500; - u.circuit_breaker.max_open_duration_ms = 60000; - gw.upstreams.push_back(u); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 5000); - bool pass = TestHttpClient::HasStatus(r, 502); - TestFramework::RecordTest( - "CB Phase 4: bare proxy sanity", pass, - pass ? "" : "expected 502, got: " + r.substr(0, 128)); - } catch (const std::exception& e) { - TestFramework::RecordTest("CB Phase 4: bare proxy sanity", - false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 7: Retry-After header carries a sensible value — within [1, configured -// max_open_duration_ms / 1000], and in the right ballpark of OpenUntil()-now. -// --------------------------------------------------------------------------- -void TestRetryAfterHeaderValue() { - std::cout << "\n[TEST] CB Phase 4: Retry-After value correctness..." - << std::endl; - try { - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { - resp.Status(502).Body("err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - // base_open_duration 2000ms, max 60_000ms — Retry-After should - // ceiling-round and fall inside [1, 60]. - auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, - /*enabled=*/true, /*threshold=*/3); - u.circuit_breaker.base_open_duration_ms = 2000; - u.circuit_breaker.max_open_duration_ms = 60000; - gw.upstreams.push_back(u); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - // Trip the breaker. - for (int i = 0; i < 3; ++i) { - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - } - - // Capture the open-rejection response. 
- std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); - bool is_503 = TestHttpClient::HasStatus(r, 503); - - // Extract Retry-After integer value (case-insensitive header). - int retry_after = -1; - const char* markers[] = {"Retry-After:", "retry-after:"}; - for (const char* m : markers) { - auto pos = r.find(m); - if (pos == std::string::npos) continue; - pos += std::string(m).size(); - while (pos < r.size() && (r[pos] == ' ' || r[pos] == '\t')) ++pos; - int val = 0; - bool any = false; - while (pos < r.size() && r[pos] >= '0' && r[pos] <= '9') { - val = val * 10 + (r[pos] - '0'); - any = true; - ++pos; - } - if (any) { retry_after = val; break; } - } - - // Contract: value ≥ 1 and ≤ max_open_duration_ms / 1000 (60). - // For base_open_duration 2000ms the remaining-seconds at this - // moment is ≤ 2 (probably 1 or 2 after ceiling), so the upper - // sanity bound is generous but still rules out 300/3600-class - // buggy fallbacks. - bool in_range = (retry_after >= 1 && retry_after <= 60); - bool reasonable = (retry_after >= 1 && retry_after <= 3); - - bool pass = is_503 && in_range && reasonable; - TestFramework::RecordTest( - "CB Phase 4: Retry-After value in range", pass, - pass ? "" : - "is_503=" + std::to_string(is_503) + - " retry_after=" + std::to_string(retry_after) + - " body=" + r.substr(0, 256)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 4: Retry-After value in range", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 8: Retry loop is terminal on CIRCUIT_OPEN — even with max_retries=3, -// a request that hits an OPEN breaker gets exactly ONE 503 (no retry-flavored -// second 503). Ensures ReportBreakerOutcome doesn't feed the reject back into -// the breaker and MaybeRetry stays out. 
-// --------------------------------------------------------------------------- -void TestCircuitOpenTerminalForRetry() { - std::cout << "\n[TEST] CB Phase 4: CIRCUIT_OPEN terminal for retry loop..." - << std::endl; - try { - std::atomic backend_hits{0}; - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { - backend_hits.fetch_add(1, std::memory_order_relaxed); - resp.Status(502).Body("err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - // Retries enabled on 5xx — if the breaker reject leaked into - // MaybeRetry, the test would see extra backend hits after the - // trip. Long open window so the breaker stays OPEN for the - // duration of the post-trip assertion (no HALF_OPEN probe - // admission racing the test). - auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, - /*enabled=*/true, /*threshold=*/3); - u.proxy.retry.max_retries = 3; - u.proxy.retry.retry_on_5xx = true; - u.circuit_breaker.base_open_duration_ms = 30000; - u.circuit_breaker.max_open_duration_ms = 60000; - gw.upstreams.push_back(u); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - // Trip the breaker. Each pre-trip request may retry up to 3 - // times (all failing 5xx), so backend sees up to 3*threshold=12 - // hits. That's acceptable — we just care about post-trip behavior. - for (int i = 0; i < 3; ++i) { - TestHttpClient::HttpGet(gw_port, "/fail", 5000); - } - int pre_trip_hits = backend_hits.load(); - - // Post-trip request: expect a single 503 and NO new backend hits. 
- std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); - bool is_503 = TestHttpClient::HasStatus(r, 503); - int post_trip_hits = backend_hits.load(); - bool no_new_hits = (post_trip_hits == pre_trip_hits); - - bool pass = is_503 && no_new_hits; - TestFramework::RecordTest( - "CB Phase 4: CIRCUIT_OPEN terminal for retry", pass, - pass ? "" : - "is_503=" + std::to_string(is_503) + - " pre=" + std::to_string(pre_trip_hits) + - " post=" + std::to_string(post_trip_hits)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 4: CIRCUIT_OPEN terminal for retry", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 9: Dry-run mode — dry_run=true forwards rejected requests to the -// upstream (pass-through) but still increments the rejected_ counter so -// operators can observe the would-reject rate without production impact. -// --------------------------------------------------------------------------- -void TestDryRunPassthrough() { - std::cout << "\n[TEST] CB Phase 4: dry-run passthrough..." << std::endl; - try { - std::atomic backend_hits{0}; - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { - backend_hits.fetch_add(1, std::memory_order_relaxed); - resp.Status(502).Body("err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, - /*enabled=*/true, /*threshold=*/3); - u.circuit_breaker.dry_run = true; // would-reject, but still forward - gw.upstreams.push_back(u); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - // Trip thresholds with 5 requests. 
All should reach backend (502), - // not a 503 — dry-run never short-circuits. - for (int i = 0; i < 5; ++i) { - std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); - if (!TestHttpClient::HasStatus(r, 502)) { - TestFramework::RecordTest( - "CB Phase 4: dry-run passthrough", false, - "request " + std::to_string(i) + - " expected 502, got: " + r.substr(0, 64)); - return; - } - } - - bool all_hit = (backend_hits.load() == 5); - - // Verify the slice observed trips/rejected even though traffic passed. - auto* mgr = gateway.GetUpstreamManager() ? - gateway.GetUpstreamManager()->GetCircuitBreakerManager() : - nullptr; - int64_t trips = 0, rejected = 0; - if (mgr) { - auto* host = mgr->GetHost("svc"); - if (host) { - auto snap = host->Snapshot(); - trips = snap.total_trips; - rejected = snap.total_rejected; - } - } - // At least one trip fired (consecutive_threshold=3 → slice - // transitioned at least once during the run), and the post-trip - // requests were counted as would-reject (rejected > 0). - bool observed = (trips >= 1) && (rejected >= 1); - - bool pass = all_hit && observed; - TestFramework::RecordTest( - "CB Phase 4: dry-run passthrough", pass, - pass ? "" : - "hits=" + std::to_string(backend_hits.load()) + - " trips=" + std::to_string(trips) + - " rejected=" + std::to_string(rejected)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 4: dry-run passthrough", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 10: HALF_OPEN → CLOSED recovery round-trip through the proxy. Trip the -// breaker, wait for the open window to elapse, then serve success responses -// and assert the slice transitions back to CLOSED (consecutive_successes -// crosses the threshold — default 2 from DefaultCbConfig / phase-4 config). 
-// --------------------------------------------------------------------------- -void TestHalfOpenRecoveryRoundTrip() { - std::cout << "\n[TEST] CB Phase 4: HALF_OPEN → CLOSED recovery..." - << std::endl; - try { - std::atomic fail_mode{true}; - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [&fail_mode](const HttpRequest&, HttpResponse& resp) { - if (fail_mode.load()) { - resp.Status(502).Body("err", "text/plain"); - } else { - resp.Status(200).Body("ok", "text/plain"); - } - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, - /*enabled=*/true, /*threshold=*/3); - // Short open duration so recovery path finishes quickly. - u.circuit_breaker.base_open_duration_ms = 300; - u.circuit_breaker.max_open_duration_ms = 1000; - // Two probes needed to close (default permitted_half_open_calls=2). - u.circuit_breaker.permitted_half_open_calls = 2; - gw.upstreams.push_back(u); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - // Trip by hitting the failing backend. - for (int i = 0; i < 3; ++i) { - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - } - - // Flip backend to success and wait for the open window to elapse. - fail_mode.store(false); - std::this_thread::sleep_for(std::chrono::milliseconds(500)); - - // Probe the proxy — each successful 200 advances HALF_OPEN toward - // CLOSED. Do more than permitted_half_open_calls; some will be - // rejected as half_open_full but the ones that are admitted will - // close the breaker. 
- bool saw_success = false; - for (int i = 0; i < 8; ++i) { - std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); - if (TestHttpClient::HasStatus(r, 200)) saw_success = true; - // Small gap between probes — HALF_OPEN only admits permitted - // probes per cycle; spacing lets subsequent probes observe a - // possibly-closed breaker. - std::this_thread::sleep_for(std::chrono::milliseconds(50)); - } - - // Verify slice aggregate: at least one CLOSED transition observed - // (probe_successes >= 1 and total_trips == 1 — we only tripped once). - auto* mgr = gateway.GetUpstreamManager() ? - gateway.GetUpstreamManager()->GetCircuitBreakerManager() : - nullptr; - int64_t probe_succ = 0; - int open_parts = 0, half_open_parts = 0; - if (mgr) { - auto* host = mgr->GetHost("svc"); - if (host) { - auto snap = host->Snapshot(); - probe_succ = 0; - for (const auto& row : snap.slices) { - probe_succ += row.probe_successes; - } - open_parts = snap.open_partitions; - half_open_parts = snap.half_open_partitions; - } - } - - // Recovery complete: saw at least one 200 through the breaker, - // at least one probe success counted, and no partition still - // stuck in OPEN (HALF_OPEN may still linger on the unused slice, - // which is fine for a 2-partition setup). - bool pass = saw_success && (probe_succ >= 1) && (open_parts == 0); - TestFramework::RecordTest( - "CB Phase 4: HALF_OPEN → CLOSED recovery", pass, - pass ? "" : - "saw_success=" + std::to_string(saw_success) + - " probe_succ=" + std::to_string(probe_succ) + - " open_parts=" + std::to_string(open_parts) + - " half_open_parts=" + std::to_string(half_open_parts)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 4: HALF_OPEN → CLOSED recovery", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 11: Retry-After ceils the config cap from a non-second-aligned -// max_open_duration_ms (e.g. 1500ms → 2s, not 1s). 
Floor-rounding the cap -// would clamp the advertised retry window below what the breaker honors, -// causing well-behaved clients to re-hit the 503. -// --------------------------------------------------------------------------- -void TestRetryAfterCapCeilsNonAlignedMax() { - std::cout << "\n[TEST] CB Phase 4: Retry-After cap ceils non-aligned max..." - << std::endl; - try { - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { - resp.Status(502).Body("err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - // Configure a non-second-aligned max backoff. base = 1500ms so - // the actual OpenUntil-now at trip time is ~1.5s, which ceil- - // rounds to 2s. If cfg_cap_secs floor-rounded max_open_duration - // (1500ms → 1s), the clamp would drop Retry-After to 1s even - // though the breaker would keep rejecting through the second - // half of that window. 
- auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, - /*enabled=*/true, /*threshold=*/3); - u.circuit_breaker.base_open_duration_ms = 1500; - u.circuit_breaker.max_open_duration_ms = 1500; - gw.upstreams.push_back(u); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - for (int i = 0; i < 3; ++i) { - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - } - std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); - - int retry_after = -1; - const char* markers[] = {"Retry-After:", "retry-after:"}; - for (const char* m : markers) { - auto pos = r.find(m); - if (pos == std::string::npos) continue; - pos += std::string(m).size(); - while (pos < r.size() && (r[pos] == ' ' || r[pos] == '\t')) ++pos; - int val = 0; - bool any = false; - while (pos < r.size() && r[pos] >= '0' && r[pos] <= '9') { - val = val * 10 + (r[pos] - '0'); - any = true; - ++pos; - } - if (any) { retry_after = val; break; } - } - - // Expectation: Retry-After is in [1, 2] — cfg_cap_secs ceil- - // rounds 1500ms to 2s, and the remaining-time ceil-rounds to - // 2 at the moment of trip (may be 1 if enough wall-clock has - // elapsed between trip and response). Critically it must NEVER - // be zero or exceed 2 (clamped to the 2s cap). - bool in_range = (retry_after >= 1 && retry_after <= 2); - TestFramework::RecordTest( - "CB Phase 4: Retry-After ceils non-aligned cap", in_range, - in_range ? "" : - "retry_after=" + std::to_string(retry_after)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 4: Retry-After ceils non-aligned cap", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 12: Retried failures are reported BEFORE the retry fires. 
With retries -// enabled on 5xx, each attempt's outcome must be counted against the breaker; -// otherwise the slice trips only after the final retry exhausts, under- -// counting failures and potentially never tripping if retries mask enough of -// them. Verifies the trip still happens within the expected number of client -// requests once reporting is attached to the retry path. -// --------------------------------------------------------------------------- -void TestRetriedFailuresCountTowardTrip() { - std::cout << "\n[TEST] CB Phase 4: retried failures count toward trip..." - << std::endl; - try { - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { - resp.Status(502).Body("err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - // Retries on 5xx enabled. threshold=3 — with retry_on_5xx, each - // client request produces 1 + max_retries=3 = 4 upstream - // attempts, each reporting RESPONSE_5XX via the ReportBreakerOutcome - // path that this fix patches in. The breaker must trip after - // at most 3 upstream failure reports (which the first client - // request alone produces). - auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, - /*enabled=*/true, /*threshold=*/3); - u.proxy.retry.max_retries = 3; - u.proxy.retry.retry_on_5xx = true; - u.circuit_breaker.base_open_duration_ms = 30000; - u.circuit_breaker.max_open_duration_ms = 60000; - gw.upstreams.push_back(u); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - // One client request → 4 upstream attempts → 4 RESPONSE_5XX - // reports. Threshold=3 should trip during this single request. - TestHttpClient::HttpGet(gw_port, "/fail", 5000); - - // Second client request must hit the OPEN breaker → 503. 
- std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); - bool is_503 = TestHttpClient::HasStatus(r, 503); - bool has_breaker_header = - r.find("X-Circuit-Breaker: open") != std::string::npos || - r.find("x-circuit-breaker: open") != std::string::npos; - - bool pass = is_503 && has_breaker_header; - TestFramework::RecordTest( - "CB Phase 4: retried failures count toward trip", pass, - pass ? "" : - "is_503=" + std::to_string(is_503) + - " breaker_hdr=" + std::to_string(has_breaker_header) + - " body=" + r.substr(0, 256)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 4: retried failures count toward trip", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 13: HALF_OPEN rejects emit a distinct X-Circuit-Breaker label. -// TryAcquire returns REJECTED_OPEN for three situations (true OPEN, -// half_open_full, half_open_recovery_failing). When the slice is in -// HALF_OPEN, OpenUntil is cleared and a generic MakeCircuitOpenResponse -// would fall back to Retry-After=1 + X-Circuit-Breaker:open — misleading -// clients. The fix emits X-Circuit-Breaker:half_open for HALF_OPEN rejects -// with a more conservative Retry-After hint. -// -// Strategy: trip the breaker, wait for the open window to elapse so the -// slice transitions HALF_OPEN on the next admission attempt, then flood -// concurrent requests so some hit half_open_full. -// --------------------------------------------------------------------------- -void TestHalfOpenRejectLabel() { - std::cout << "\n[TEST] CB Phase 4: HALF_OPEN reject label..." - << std::endl; - try { - // Backend hangs to keep probes in-flight so later concurrent - // requests hit half_open_full. 
- std::atomic hang{false}; - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [&hang](const HttpRequest&, HttpResponse& resp) { - if (hang.load()) { - std::this_thread::sleep_for(std::chrono::milliseconds(600)); - } - resp.Status(502).Body("err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, - /*enabled=*/true, /*threshold=*/3); - u.circuit_breaker.base_open_duration_ms = 200; - u.circuit_breaker.max_open_duration_ms = 500; - u.circuit_breaker.permitted_half_open_calls = 1; // tiny budget - gw.upstreams.push_back(u); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - // Trip the breaker. - for (int i = 0; i < 3; ++i) { - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - } - // Wait for the open window to elapse so the next admission - // flips the slice to HALF_OPEN. - std::this_thread::sleep_for(std::chrono::milliseconds(300)); - - // Flip backend to hang so the probe occupies the single probe - // slot while we fire sibling requests that must hit half_open_full. - hang.store(true); - - std::atomic saw_half_open{false}; - std::atomic saw_open{false}; - auto probe = [&](int id) { - (void)id; - std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 1500); - if (!TestHttpClient::HasStatus(r, 503)) return; - if (r.find("X-Circuit-Breaker: half_open") != std::string::npos || - r.find("x-circuit-breaker: half_open") != std::string::npos) { - saw_half_open.store(true); - } - if (r.find("X-Circuit-Breaker: open") != std::string::npos || - r.find("x-circuit-breaker: open") != std::string::npos) { - // We want to distinguish the labels; the "open" substring - // also matches "half_open". 
Only count true "open" if - // "half_open" didn't appear in THIS response. - if (r.find("half_open") == std::string::npos) { - saw_open.store(true); - } - } - }; - - std::vector threads; - for (int i = 0; i < 6; ++i) { - threads.emplace_back(probe, i); - std::this_thread::sleep_for(std::chrono::milliseconds(20)); - } - for (auto& t : threads) t.join(); - - // Pass if at least one HALF_OPEN-labelled reject was observed. - // saw_open may or may not be observed (some rejects could have - // hit between cycles) — the key contract is that HALF_OPEN - // rejects no longer get the plain "open" label. - bool pass = saw_half_open.load(); - TestFramework::RecordTest( - "CB Phase 4: HALF_OPEN reject label", pass, - pass ? "" : - "saw_half_open=" + std::to_string(saw_half_open.load()) + - " saw_open=" + std::to_string(saw_open.load())); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 4: HALF_OPEN reject label", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 14: HALF_OPEN Retry-After reflects the current exponential backoff, -// not just base_open_duration_ms. After multiple trips the next OPEN window -// (base << consecutive_trips_, clamped by max) can exceed 1 second; the old -// base-only hint (ceil(base/1000) = 1s for base=100ms) would under-report -// the worst-case wait, which this test must fail for. -// -// Strategy: keep the backend failing and drive MULTIPLE re-trips by letting -// the OPEN window elapse and single probe fail each cycle. Successful -// recoveries must be avoided — TransitionHalfOpenToClosed resets -// consecutive_trips_ to 0, which hides the exponential hint. -// --------------------------------------------------------------------------- -void TestHalfOpenRetryAfterScalesWithBackoff() { - std::cout << "\n[TEST] CB Phase 4: HALF_OPEN Retry-After exponential..." - << std::endl; - try { - // Backend fails fast by default. 
When `hang` is set, the - // handler blocks — used at the end to pin the probe slot so - // a concurrent request observes HALF_OPEN rejection. - std::atomic hang{false}; - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [&hang](const HttpRequest&, HttpResponse& resp) { - if (hang.load()) { - std::this_thread::sleep_for(std::chrono::milliseconds(1500)); - } - resp.Status(502).Body("err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; // pin all traffic to slice[0] - gw.http2.enabled = false; - auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, - /*enabled=*/true, /*threshold=*/2); - u.circuit_breaker.base_open_duration_ms = 100; // config minimum - u.circuit_breaker.max_open_duration_ms = 8000; // cap at 8s - u.circuit_breaker.permitted_half_open_calls = 1; // single probe - gw.upstreams.push_back(u); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - auto* cbm = gateway.GetUpstreamManager() ? - gateway.GetUpstreamManager()->GetCircuitBreakerManager() : nullptr; - auto* host = cbm ? cbm->GetHost("svc") : nullptr; - auto* slice = host ? host->GetSlice(0) : nullptr; - if (!slice) { - TestFramework::RecordTest( - "CB Phase 4: HALF_OPEN Retry-After exponential-aware", - false, "slice lookup failed"); - return; - } - - // Initial trip: 2 consecutive failures with threshold=2. - for (int i = 0; i < 2; ++i) { - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - } - - // Drive consecutive_trips_ up by letting successive OPEN windows - // elapse and probes fail (no recovery → no reset). Stop when - // NextOpenDurationMs crosses 1000ms, which is the threshold - // where the HALF_OPEN Retry-After hint starts exceeding the - // base-only value (ceil(100ms)=1s). 
- // - // The slice re-trips on each failed probe; each trip doubles - // the open duration. We run ~8 cycles with safety margin which - // is comfortably past the trip count needed for Retry-After>=2. - for (int cycle = 0; cycle < 8; ++cycle) { - // Wait past the current open window. Upper bound: max=8s, - // so 1200ms is plenty for the first few short cycles, and - // we re-check after each request anyway. - int64_t next_ms = slice->NextOpenDurationMs(); - // Current OPEN window is the one stored BEFORE the upcoming - // re-trip — we don't have that directly, so sleep past the - // NEXT duration as an over-approximation (next is always >= - // current). This ensures OPEN has elapsed. - auto sleep_ms = std::max(next_ms + 50, 200); - if (sleep_ms > 2000) sleep_ms = 2000; // cap per cycle - std::this_thread::sleep_for(std::chrono::milliseconds(sleep_ms)); - - // One request — it should admit as a probe (HALF_OPEN), - // the backend fails fast (502), probe fails → re-trip with - // consecutive_trips_++ and fresh OPEN. - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - - // Bail early once the exponential hint crosses 1s → the - // subsequent HALF_OPEN reject will carry Retry-After >= 2. - if (slice->NextOpenDurationMs() >= 2000) break; - } - - int64_t next_open_ms = slice->NextOpenDurationMs(); - if (next_open_ms < 2000) { - TestFramework::RecordTest( - "CB Phase 4: HALF_OPEN Retry-After exponential-aware", - false, - "setup failed: next_open_ms=" + std::to_string(next_open_ms) + - " (need >= 2000 to distinguish from base-only hint)"); - return; - } - - // Now trigger a HALF_OPEN reject: wait for current OPEN to - // elapse, start a hanging probe (pins the slot), then fire a - // sibling request — it must see half_open_full with the - // exponential Retry-After. 
- int64_t post_wait_ms = next_open_ms + 100; - if (post_wait_ms > 4000) post_wait_ms = 4000; - std::this_thread::sleep_for(std::chrono::milliseconds(post_wait_ms)); - - hang.store(true); - std::thread probe([&]() { - TestHttpClient::HttpGet(gw_port, "/fail", 3500); - }); - // Let the probe get admitted and start hanging. - std::this_thread::sleep_for(std::chrono::milliseconds(200)); - - std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 1500); - hang.store(false); - probe.join(); - - bool is_half_open = - r.find("X-Circuit-Breaker: half_open") != std::string::npos || - r.find("x-circuit-breaker: half_open") != std::string::npos; - - int retry_after = -1; - const char* markers[] = {"Retry-After:", "retry-after:"}; - for (const char* m : markers) { - auto pos = r.find(m); - if (pos == std::string::npos) continue; - pos += std::string(m).size(); - while (pos < r.size() && (r[pos] == ' ' || r[pos] == '\t')) ++pos; - int val = 0; - bool any = false; - while (pos < r.size() && r[pos] >= '0' && r[pos] <= '9') { - val = val * 10 + (r[pos] - '0'); - any = true; - ++pos; - } - if (any) { retry_after = val; break; } - } - - // Post-fix: Retry-After = ceil(next_open_ms / 1000) >= 2. - // Pre-fix (base-only): Retry-After = ceil(base/1000) = 1. - // Asserting >= 2 fails the pre-fix implementation. - bool retry_after_ok = (retry_after >= 2 && retry_after <= 8); - bool pass = is_half_open && retry_after_ok; - TestFramework::RecordTest( - "CB Phase 4: HALF_OPEN Retry-After exponential-aware", pass, - pass ? 
"" : - "is_half_open=" + std::to_string(is_half_open) + - " retry_after=" + std::to_string(retry_after) + - " next_open_ms=" + std::to_string(next_open_ms)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 4: HALF_OPEN Retry-After exponential-aware", - false, e.what()); - } -} - -void RunAllTests() { - std::cout << "\n" << std::string(60, '=') << std::endl; - std::cout << "CIRCUIT BREAKER PHASE 4 - INTEGRATION TESTS" << std::endl; - std::cout << std::string(60, '=') << std::endl; - - TestBareProxyWorks(); - TestBreakerTripsAfterConsecutiveFailures(); - TestBreakerDisabledPassesThrough(); - TestSuccessResetsConsecutiveFailureCounter(); - TestTripDrivesSliceState(); - TestOpenBreakerShortCircuitsUpstreamCall(); - TestRetryAfterHeaderValue(); - TestCircuitOpenTerminalForRetry(); - TestDryRunPassthrough(); - TestHalfOpenRecoveryRoundTrip(); - TestRetryAfterCapCeilsNonAlignedMax(); - TestRetriedFailuresCountTowardTrip(); - TestHalfOpenRejectLabel(); - TestHalfOpenRetryAfterScalesWithBackoff(); -} - -} // namespace CircuitBreakerPhase4Tests diff --git a/test/circuit_breaker_phase5_test.h b/test/circuit_breaker_phase5_test.h deleted file mode 100644 index 9b0c3f11..00000000 --- a/test/circuit_breaker_phase5_test.h +++ /dev/null @@ -1,366 +0,0 @@ -#pragma once - -// Phase 5 integration tests: retry budget wired into ProxyTransaction. -// -// Phase 3 covered the RetryBudget math (CAS, non-retry denominator, -// min-concurrency floor) as unit tests against the RetryBudget class in -// isolation. Phase 5 tests the INTEGRATION: ProxyTransaction resolves -// `retry_budget_` from the same CircuitBreakerHost as `slice_`, tracks -// every attempt's in_flight via the RAII guard, and consults -// `TryConsumeRetry` before each retry. Exhaustion emits the §12.2 -// response (503 + `X-Retry-Budget-Exhausted: 1`) and does NOT feed -// back into the slice's failure math. 
-// -// Strategy: backends that always 502 with `retry_on_5xx=true` drive the -// retry path. A near-zero retry-budget (`percent=0, min_concurrency=0`) -// rejects every retry deterministically without needing concurrent -// client load. The circuit-breaker consecutive-failure threshold is -// raised well above the retry count so the breaker stays CLOSED — the -// budget gate is tested in isolation from the state machine. - -#include "test_framework.h" -#include "test_server_runner.h" -#include "http_test_client.h" -#include "http/http_server.h" -#include "config/server_config.h" - -#include -#include -#include -#include - -namespace CircuitBreakerPhase5Tests { - -// Upstream config that always proxies /fail, with the circuit breaker -// enabled so `retry_budget_` is resolved on `slice_`'s host. Breaker -// thresholds intentionally unreachable for these tests — we want the -// retry-budget gate fired in isolation, not co-tripping the state -// machine. -static UpstreamConfig MakeRetryBudgetUpstream(const std::string& name, - const std::string& host, - int port, - int retry_budget_percent, - int retry_budget_min_concurrency, - bool dry_run = false) { - UpstreamConfig u; - u.name = name; - u.host = host; - u.port = port; - u.pool.max_connections = 16; - u.pool.max_idle_connections = 8; - u.pool.connect_timeout_ms = 3000; - u.pool.idle_timeout_sec = 30; - u.pool.max_lifetime_sec = 3600; - u.pool.max_requests_per_conn = 0; - - u.proxy.route_prefix = "/fail"; - u.proxy.strip_prefix = false; - u.proxy.response_timeout_ms = 2000; - - u.circuit_breaker.enabled = true; - u.circuit_breaker.dry_run = dry_run; - // Breaker thresholds unreachable — we don't want the state machine - // tripping during a retry-budget test. 
- u.circuit_breaker.consecutive_failure_threshold = 10000; - u.circuit_breaker.failure_rate_threshold = 100; - u.circuit_breaker.minimum_volume = 10000; - u.circuit_breaker.window_seconds = 10; - u.circuit_breaker.permitted_half_open_calls = 2; - u.circuit_breaker.base_open_duration_ms = 30000; - u.circuit_breaker.max_open_duration_ms = 60000; - - u.circuit_breaker.retry_budget_percent = retry_budget_percent; - u.circuit_breaker.retry_budget_min_concurrency = retry_budget_min_concurrency; - return u; -} - -static bool HasRetryBudgetHeader(const std::string& response) { - return response.find("X-Retry-Budget-Exhausted: 1") != std::string::npos || - response.find("x-retry-budget-exhausted: 1") != std::string::npos; -} - -// --------------------------------------------------------------------------- -// Test 1: A retry attempt rejected by the retry-budget gate delivers 503 + -// X-Retry-Budget-Exhausted instead of the upstream's 5xx. Verifies that -// `TryConsumeRetry` runs BEFORE the retry executes and that -// `MakeRetryBudgetResponse` is emitted through the standard DeliverResponse -// path. -// -// retry_budget_percent=0 + retry_budget_min_concurrency=0 → cap = 0. Every -// retry attempt's TryConsumeRetry returns false. First attempt is -// unaffected (budget only gates retries), so the backend is hit exactly -// once per client request; the retry is short-circuited locally. -// --------------------------------------------------------------------------- -void TestRetryBudgetRejectsRetry() { - std::cout << "\n[TEST] CB Phase 5: retry budget rejects retry..." 
- << std::endl; - try { - std::atomic backend_hits{0}; - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { - backend_hits.fetch_add(1, std::memory_order_relaxed); - resp.Status(502).Body("upstream-err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - - auto u = MakeRetryBudgetUpstream("svc", "127.0.0.1", backend_port, - /*percent=*/0, - /*min_concurrency=*/0); - u.proxy.retry.max_retries = 3; - u.proxy.retry.retry_on_5xx = true; - gw.upstreams.push_back(u); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 5000); - - bool is_503 = TestHttpClient::HasStatus(r, 503); - bool has_budget_hdr = HasRetryBudgetHeader(r); - // Backend should have been hit exactly once (the first attempt); - // every retry was short-circuited by the budget gate. - int hits = backend_hits.load(std::memory_order_relaxed); - bool single_backend_hit = (hits == 1); - - bool pass = is_503 && has_budget_hdr && single_backend_hit; - TestFramework::RecordTest( - "CB Phase 5: retry budget rejects retry", pass, - pass ? "" : - "is_503=" + std::to_string(is_503) + - " budget_hdr=" + std::to_string(has_budget_hdr) + - " backend_hits=" + std::to_string(hits) + - " body=" + r.substr(0, 256)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 5: retry budget rejects retry", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 2: The min-concurrency floor admits retries even when the %-based -// cap would be zero. 
With percent=0 + min_concurrency=5, a single sequential -// client request's retry chain (1 first + 3 retries = 4 backend hits) all -// fit under the floor and proceed normally to the upstream — no 503, no -// X-Retry-Budget-Exhausted, and the client sees the final 5xx response. -// -// This is the symmetric test to Test 1: same near-zero %-cap, but a floor -// large enough that retries aren't budget-gated. Proves the floor is -// consulted (retries admitted) instead of the %-cap (retries rejected). -// --------------------------------------------------------------------------- -void TestRetryBudgetMinConcurrencyFloor() { - std::cout << "\n[TEST] CB Phase 5: retry budget min-concurrency floor..." - << std::endl; - try { - std::atomic backend_hits{0}; - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { - backend_hits.fetch_add(1, std::memory_order_relaxed); - resp.Status(502).Body("upstream-err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - - // percent=0 → no %-based capacity. min_concurrency=5 → floor - // admits up to 5 concurrent retries, easily covering the 3 - // sequential retries from a single client request. - auto u = MakeRetryBudgetUpstream("svc", "127.0.0.1", backend_port, - /*percent=*/0, - /*min_concurrency=*/5); - u.proxy.retry.max_retries = 3; - u.proxy.retry.retry_on_5xx = true; - gw.upstreams.push_back(u); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 10000); - - // Client sees the upstream's final 502 — no local 503, no - // X-Retry-Budget-Exhausted. 
- bool is_502 = TestHttpClient::HasStatus(r, 502); - bool no_budget_hdr = !HasRetryBudgetHeader(r); - // 1 first attempt + 3 retries admitted by the floor = 4 backend hits. - int hits = backend_hits.load(std::memory_order_relaxed); - bool all_retries_proceeded = (hits == 4); - - bool pass = is_502 && no_budget_hdr && all_retries_proceeded; - TestFramework::RecordTest( - "CB Phase 5: retry budget min-concurrency floor", pass, - pass ? "" : - "is_502=" + std::to_string(is_502) + - " no_budget_hdr=" + std::to_string(no_budget_hdr) + - " backend_hits=" + std::to_string(hits) + - " body=" + r.substr(0, 256)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 5: retry budget min-concurrency floor", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 3: Dry-run bypasses the retry-budget gate. -// -// With percent=0 + min_concurrency=0 (same as Test 1), TryConsumeRetry -// returns false for every retry. But `circuit_breaker.dry_run=true` -// switches the rejection path to a log-and-proceed: no token is -// consumed, retry_token_held_ stays false, and AttemptCheckout runs as -// though the budget was unlimited. -// -// Result: the client sees the upstream's 502 response (because the -// retries actually fire), NOT a 503 + X-Retry-Budget-Exhausted. -// --------------------------------------------------------------------------- -void TestRetryBudgetDryRunPassthrough() { - std::cout << "\n[TEST] CB Phase 5: retry budget dry-run passthrough..." 
- << std::endl; - try { - std::atomic backend_hits{0}; - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { - backend_hits.fetch_add(1, std::memory_order_relaxed); - resp.Status(502).Body("upstream-err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - - auto u = MakeRetryBudgetUpstream("svc", "127.0.0.1", backend_port, - /*percent=*/0, - /*min_concurrency=*/0, - /*dry_run=*/true); - u.proxy.retry.max_retries = 2; - u.proxy.retry.retry_on_5xx = true; - gw.upstreams.push_back(u); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 10000); - - // Retries proceeded despite would-reject decisions — the client - // sees the upstream's final 502, not our local 503. - bool is_502 = TestHttpClient::HasStatus(r, 502); - bool no_budget_hdr = !HasRetryBudgetHeader(r); - int hits = backend_hits.load(std::memory_order_relaxed); - bool all_attempts_ran = (hits == 3); // 1 first + 2 retries - - bool pass = is_502 && no_budget_hdr && all_attempts_ran; - TestFramework::RecordTest( - "CB Phase 5: retry budget dry-run passthrough", pass, - pass ? "" : - "is_502=" + std::to_string(is_502) + - " no_budget_hdr=" + std::to_string(no_budget_hdr) + - " backend_hits=" + std::to_string(hits) + - " body=" + r.substr(0, 256)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 5: retry budget dry-run passthrough", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 4: First attempts are NOT budget-gated. -// -// The retry-budget cap applies only to retries (attempt_ > 0). 
First -// attempts call TrackInFlight (which only ever increments) but skip -// TryConsumeRetry entirely. With percent=0 + min_concurrency=0 and a -// backend that always 200s, every client request must succeed — if the -// gate accidentally ran on first attempts, we'd see 503s here. -// -// Guards against a regression where TryConsumeRetry is called before -// the `attempt_ > 0` gate, or where the gate is placed in -// AttemptCheckout instead of MaybeRetry. -// --------------------------------------------------------------------------- -void TestFirstAttemptsNotGated() { - std::cout << "\n[TEST] CB Phase 5: first attempts not gated..." - << std::endl; - try { - std::atomic backend_hits{0}; - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { - backend_hits.fetch_add(1, std::memory_order_relaxed); - resp.Status(200).Body("ok", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - - auto u = MakeRetryBudgetUpstream("svc", "127.0.0.1", backend_port, - /*percent=*/0, - /*min_concurrency=*/0); - // No retries — every request is a first attempt. - u.proxy.retry.max_retries = 0; - gw.upstreams.push_back(u); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - int client_count = 5; - int successes = 0; - for (int i = 0; i < client_count; ++i) { - std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); - if (TestHttpClient::HasStatus(r, 200)) ++successes; - if (HasRetryBudgetHeader(r)) { - // Any X-Retry-Budget-Exhausted on a first-attempt-only - // path is a bug. Record and bail. 
- TestFramework::RecordTest( - "CB Phase 5: first attempts not gated", false, - "unexpected X-Retry-Budget-Exhausted on first-attempt path " - "i=" + std::to_string(i)); - return; - } - } - - int hits = backend_hits.load(std::memory_order_relaxed); - bool pass = (successes == client_count) && (hits == client_count); - TestFramework::RecordTest( - "CB Phase 5: first attempts not gated", pass, - pass ? "" : - "successes=" + std::to_string(successes) + - "/" + std::to_string(client_count) + - " backend_hits=" + std::to_string(hits)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 5: first attempts not gated", false, e.what()); - } -} - -void RunAllTests() { - std::cout << "\n" << std::string(60, '=') << std::endl; - std::cout << "CIRCUIT BREAKER PHASE 5 - RETRY BUDGET INTEGRATION TESTS" - << std::endl; - std::cout << std::string(60, '=') << std::endl; - - TestRetryBudgetRejectsRetry(); - TestRetryBudgetMinConcurrencyFloor(); - TestRetryBudgetDryRunPassthrough(); - TestFirstAttemptsNotGated(); -} - -} // namespace CircuitBreakerPhase5Tests diff --git a/test/circuit_breaker_phase6_test.h b/test/circuit_breaker_phase6_test.h deleted file mode 100644 index 77eea2c1..00000000 --- a/test/circuit_breaker_phase6_test.h +++ /dev/null @@ -1,261 +0,0 @@ -#pragma once - -// Phase 6 integration tests: wait-queue drain on CLOSED → OPEN trip. -// -// Phase 4 already covered "new requests after a trip hit REJECTED_OPEN". -// Phase 6 covers the orthogonal case: a request that passed ConsultBreaker -// pre-trip and is waiting in the pool's bounded wait queue when the trip -// fires. Without the drain, that waiter would sit until either the pool -// frees a slot (and then re-hit the upstream — pointless traffic) or the -// queue-timeout / open-duration elapses (up to 60s latency spike). 
-// -// Mechanism tested: `HttpServer::MarkServerReady` installs a transition -// callback on every slice that routes CLOSED → OPEN to the corresponding -// `PoolPartition::DrainWaitQueueOnTrip()`. Each waiter receives -// `CHECKOUT_CIRCUIT_OPEN`, which `ProxyTransaction::OnCheckoutError` maps -// to the standard circuit-open response (503 + `X-Circuit-Breaker: open`). -// -// Strategy: gate concurrency via a 1-connection pool. The first request -// hangs at the backend long enough to let a second request queue behind -// it. When the first's response lands (502), the breaker trips and the -// drain fires, causing the queued request to receive 503 + circuit-open -// headers instead of the backend's 502 (which would happen if the drain -// were missing and the queued request proceeded). - -#include "test_framework.h" -#include "test_server_runner.h" -#include "http_test_client.h" -#include "http/http_server.h" -#include "config/server_config.h" - -#include -#include -#include -#include -#include - -namespace CircuitBreakerPhase6Tests { - -static UpstreamConfig MakeDrainTripUpstream(const std::string& name, - const std::string& host, - int port, - bool breaker_enabled) { - UpstreamConfig u; - u.name = name; - u.host = host; - u.port = port; - // Single connection per partition — forces the second concurrent - // request to queue behind the first. Since tests run with - // worker_threads=1, one partition exists and it has exactly one - // connection slot. - u.pool.max_connections = 1; - u.pool.max_idle_connections = 1; - u.pool.connect_timeout_ms = 3000; - u.pool.idle_timeout_sec = 30; - u.pool.max_lifetime_sec = 3600; - u.pool.max_requests_per_conn = 0; - - u.proxy.route_prefix = "/fail"; - u.proxy.strip_prefix = false; - u.proxy.response_timeout_ms = 5000; - u.proxy.retry.max_retries = 0; // Deterministic — no retry confounds. - - u.circuit_breaker.enabled = breaker_enabled; - u.circuit_breaker.consecutive_failure_threshold = 1; // Trip on first 5xx. 
- u.circuit_breaker.failure_rate_threshold = 100; - u.circuit_breaker.minimum_volume = 10000; - u.circuit_breaker.window_seconds = 10; - u.circuit_breaker.permitted_half_open_calls = 2; - // Long open duration so the drain is unambiguously the thing that - // surfaces the 503 to the queued client — not a timer-driven - // HALF_OPEN recovery admitting a subsequent attempt. - u.circuit_breaker.base_open_duration_ms = 30000; - u.circuit_breaker.max_open_duration_ms = 60000; - return u; -} - -// --------------------------------------------------------------------------- -// Test 1: CLOSED→OPEN trip drains queued waiter with 503 + X-Circuit-Breaker. -// -// Request A takes the single pool slot and hangs at the backend for ~300ms. -// Request B queues (pool exhausted). At t≈300ms, A's backend response -// arrives: 502 → slice trip → transition callback → DrainWaitQueueOnTrip → -// B's error_callback fires with CHECKOUT_CIRCUIT_OPEN. B's client receives -// 503 + `X-Circuit-Breaker: open`. -// -// Pre-fix (no drain): B waits ~300ms for A's slot to free, then hits the -// backend itself, gets 502, client sees 502 — NOT 503 and NOT -// X-Circuit-Breaker: open. The assertion `is_503 && has_breaker_header` -// fails without the drain wiring. -// --------------------------------------------------------------------------- -void TestWaitQueueDrainedOnTrip() { - std::cout << "\n[TEST] CB Phase 6: wait queue drained on trip..." - << std::endl; - try { - std::atomic backend_hits{0}; - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { - backend_hits.fetch_add(1, std::memory_order_relaxed); - // Delay so the gateway's pool holds the connection long - // enough for a second client request to queue on it. 
- std::this_thread::sleep_for(std::chrono::milliseconds(300)); - resp.Status(502).Body("upstream-err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; // Single partition → single wait queue. - gw.http2.enabled = false; - - gw.upstreams.push_back( - MakeDrainTripUpstream("svc", "127.0.0.1", backend_port, - /*breaker_enabled=*/true)); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - // Launch A first (takes the one connection), then B 50ms later - // so B is guaranteed to enter the wait queue. - std::promise a_resp, b_resp; - auto a_fut = a_resp.get_future(); - auto b_fut = b_resp.get_future(); - std::thread a([&]() { - a_resp.set_value(TestHttpClient::HttpGet(gw_port, "/fail", 5000)); - }); - std::this_thread::sleep_for(std::chrono::milliseconds(50)); - std::thread b([&]() { - b_resp.set_value(TestHttpClient::HttpGet(gw_port, "/fail", 5000)); - }); - a.join(); - b.join(); - - std::string ra = a_fut.get(); - std::string rb = b_fut.get(); - - // A unambiguously hits the backend (owns the slot) and sees 502. - bool a_is_502 = TestHttpClient::HasStatus(ra, 502); - // B must see the circuit-open short-circuit from the drain — - // NOT a 502 from the backend, which is what happens without - // the drain wiring. - bool b_is_503 = TestHttpClient::HasStatus(rb, 503); - bool b_has_breaker_hdr = - rb.find("X-Circuit-Breaker: open") != std::string::npos || - rb.find("x-circuit-breaker: open") != std::string::npos; - // Exactly one backend hit — B was drained before making it to - // the upstream. Without the drain, backend_hits would be 2. 
- int hits = backend_hits.load(std::memory_order_relaxed); - bool single_hit = (hits == 1); - - bool pass = a_is_502 && b_is_503 && b_has_breaker_hdr && single_hit; - TestFramework::RecordTest( - "CB Phase 6: wait queue drained on trip", pass, - pass ? "" : - "a_is_502=" + std::to_string(a_is_502) + - " b_is_503=" + std::to_string(b_is_503) + - " b_breaker_hdr=" + std::to_string(b_has_breaker_hdr) + - " backend_hits=" + std::to_string(hits) + - " rb_head=" + rb.substr(0, 200)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 6: wait queue drained on trip", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 2: With the breaker disabled, the drain does NOT fire — the queued -// waiter proceeds to the upstream as it would absent the circuit-breaker -// layer entirely. -// -// Same setup as Test 1 but `circuit_breaker.enabled=false`. Disabled slices -// short-circuit in TryAcquire and never invoke transition callbacks, so -// DrainWaitQueueOnTrip is never called. Request B must hit the backend -// (backend_hits == 2) and receive the upstream's 502 — NOT a 503. -// --------------------------------------------------------------------------- -void TestDisabledBreakerDoesNotDrain() { - std::cout << "\n[TEST] CB Phase 6: disabled breaker does not drain..." 
- << std::endl; - try { - std::atomic backend_hits{0}; - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { - backend_hits.fetch_add(1, std::memory_order_relaxed); - std::this_thread::sleep_for(std::chrono::milliseconds(300)); - resp.Status(502).Body("upstream-err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - - gw.upstreams.push_back( - MakeDrainTripUpstream("svc", "127.0.0.1", backend_port, - /*breaker_enabled=*/false)); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - std::promise a_resp, b_resp; - auto a_fut = a_resp.get_future(); - auto b_fut = b_resp.get_future(); - std::thread a([&]() { - a_resp.set_value(TestHttpClient::HttpGet(gw_port, "/fail", 5000)); - }); - std::this_thread::sleep_for(std::chrono::milliseconds(50)); - std::thread b([&]() { - b_resp.set_value(TestHttpClient::HttpGet(gw_port, "/fail", 5000)); - }); - a.join(); - b.join(); - - std::string ra = a_fut.get(); - std::string rb = b_fut.get(); - - // Both reach the backend — disabled breaker = no drain. - bool a_is_502 = TestHttpClient::HasStatus(ra, 502); - bool b_is_502 = TestHttpClient::HasStatus(rb, 502); - // Neither should carry the circuit-open header. 
- bool no_breaker_on_a = - ra.find("X-Circuit-Breaker") == std::string::npos && - ra.find("x-circuit-breaker") == std::string::npos; - bool no_breaker_on_b = - rb.find("X-Circuit-Breaker") == std::string::npos && - rb.find("x-circuit-breaker") == std::string::npos; - int hits = backend_hits.load(std::memory_order_relaxed); - bool two_hits = (hits == 2); - - bool pass = a_is_502 && b_is_502 && no_breaker_on_a && - no_breaker_on_b && two_hits; - TestFramework::RecordTest( - "CB Phase 6: disabled breaker does not drain", pass, - pass ? "" : - "a_is_502=" + std::to_string(a_is_502) + - " b_is_502=" + std::to_string(b_is_502) + - " no_breaker_on_a=" + std::to_string(no_breaker_on_a) + - " no_breaker_on_b=" + std::to_string(no_breaker_on_b) + - " backend_hits=" + std::to_string(hits)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 6: disabled breaker does not drain", false, e.what()); - } -} - -void RunAllTests() { - std::cout << "\n" << std::string(60, '=') << std::endl; - std::cout << "CIRCUIT BREAKER PHASE 6 - WAIT-QUEUE DRAIN ON TRIP TESTS" - << std::endl; - std::cout << std::string(60, '=') << std::endl; - - TestWaitQueueDrainedOnTrip(); - TestDisabledBreakerDoesNotDrain(); -} - -} // namespace CircuitBreakerPhase6Tests diff --git a/test/circuit_breaker_phase7_test.h b/test/circuit_breaker_phase7_test.h deleted file mode 100644 index 9dc841ba..00000000 --- a/test/circuit_breaker_phase7_test.h +++ /dev/null @@ -1,405 +0,0 @@ -#pragma once - -// Phase 7 integration tests: observability — counter accuracy, snapshot -// API correctness, and log emission. -// -// Phases 2-6 each added counters and log lines as a side effect of their -// functional work. Phase 7 locks those in as regressions: -// -// * Counters (§11.2): trips, rejected, probe_successes, probe_failures, -// retries_rejected surface through CircuitBreakerManager::SnapshotAll. 
-// * Snapshot API (§11.3): per-slice rows aggregate into host-level -// totals; host-level fields (retries_in_flight / retries_rejected / -// in_flight) reflect the owning RetryBudget. -// * Logs (§11.1): the CLOSED→OPEN trip emits the full-context message -// including trigger, consecutive_failures, window_total, -// window_fail_rate, open_for_ms, and consecutive_trips. -// -// The log-emission test attaches a spdlog ring-buffer sink to the logger -// for the duration of the test, triggers a trip, then asserts the -// captured messages contain the expected fields. No log file I/O. - -#include "test_framework.h" -#include "test_server_runner.h" -#include "http_test_client.h" -#include "http/http_server.h" -#include "config/server_config.h" -#include "upstream/upstream_manager.h" -#include "circuit_breaker/circuit_breaker_manager.h" -#include "circuit_breaker/circuit_breaker_host.h" -#include "circuit_breaker/circuit_breaker_slice.h" -#include "log/logger.h" -#include "spdlog/sinks/ringbuffer_sink.h" - -#include -#include -#include -#include -#include -#include - -namespace CircuitBreakerPhase7Tests { - -using circuit_breaker::State; - -static UpstreamConfig MakeObservUpstream(const std::string& name, - const std::string& host, - int port, - int consecutive_threshold = 3) { - UpstreamConfig u; - u.name = name; - u.host = host; - u.port = port; - u.pool.max_connections = 8; - u.pool.max_idle_connections = 4; - u.pool.connect_timeout_ms = 3000; - u.pool.idle_timeout_sec = 30; - u.pool.max_lifetime_sec = 3600; - u.pool.max_requests_per_conn = 0; - - u.proxy.route_prefix = "/fail"; - u.proxy.strip_prefix = false; - u.proxy.response_timeout_ms = 2000; - u.proxy.retry.max_retries = 0; - - u.circuit_breaker.enabled = true; - u.circuit_breaker.consecutive_failure_threshold = consecutive_threshold; - u.circuit_breaker.failure_rate_threshold = 100; - u.circuit_breaker.minimum_volume = 10000; - u.circuit_breaker.window_seconds = 10; - 
u.circuit_breaker.permitted_half_open_calls = 2; - // Long open duration — keep the slice OPEN so post-trip assertions - // don't race a HALF_OPEN transition. - u.circuit_breaker.base_open_duration_ms = 30000; - u.circuit_breaker.max_open_duration_ms = 60000; - return u; -} - -// --------------------------------------------------------------------------- -// Test 1: Snapshot API reflects per-slice trip/rejected counters and -// host-level aggregates. Drives N+1 requests against a backend that always -// 502s (N to trip, 1 more that the OPEN slice short-circuits) and asserts -// the snapshot shows total_trips >= 1, total_rejected >= 1, -// open_partitions >= 1. -// --------------------------------------------------------------------------- -void TestSnapshotReflectsCounters() { - std::cout << "\n[TEST] CB Phase 7: snapshot reflects counters..." - << std::endl; - try { - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { - resp.Status(502).Body("upstream-err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - - auto u = MakeObservUpstream("svc", "127.0.0.1", backend_port, - /*threshold=*/3); - gw.upstreams.push_back(u); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - // Trip (3 failures), then 2 more to accumulate rejected counter. 
- for (int i = 0; i < 3; ++i) { - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - } - for (int i = 0; i < 2; ++i) { - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - } - - auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); - if (!cbm) { - TestFramework::RecordTest( - "CB Phase 7: snapshot reflects counters", false, - "no circuit breaker manager attached"); - return; - } - auto snaps = cbm->SnapshotAll(); - bool found = false; - int64_t trips = 0, rejected = 0, probe_s = 0, probe_f = 0; - int open_parts = 0; - for (const auto& s : snaps) { - if (s.service_name == "svc") { - trips = s.total_trips; - rejected = s.total_rejected; - open_parts = s.open_partitions; - for (const auto& row : s.slices) { - probe_s += row.probe_successes; - probe_f += row.probe_failures; - } - found = true; - break; - } - } - - bool pass = found - && trips >= 1 - && rejected >= 2 // 2 post-trip short-circuits - && open_parts >= 1 - && probe_s == 0 // never entered HALF_OPEN - && probe_f == 0; - TestFramework::RecordTest( - "CB Phase 7: snapshot reflects counters", pass, - pass ? "" : - "found=" + std::to_string(found) + - " trips=" + std::to_string(trips) + - " rejected=" + std::to_string(rejected) + - " open_parts=" + std::to_string(open_parts) + - " probe_s=" + std::to_string(probe_s) + - " probe_f=" + std::to_string(probe_f)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 7: snapshot reflects counters", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 2: The CLOSED→OPEN trip log emits the §11.1 full-context message. -// Attaches a spdlog ringbuffer_sink to the shared logger, triggers a trip, -// then inspects the captured messages for the key tokens. The sink is -// removed before the test returns so it doesn't affect later tests. 
-// --------------------------------------------------------------------------- -void TestTripLogEmission() { - std::cout << "\n[TEST] CB Phase 7: trip log emission..." << std::endl; - try { - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { - resp.Status(502).Body("upstream-err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - - auto u = MakeObservUpstream("svc-log", "127.0.0.1", backend_port, - /*threshold=*/2); - gw.upstreams.push_back(u); - - // `HttpServer` construction calls `logging::Init()` which rebuilds - // the default logger via `spdlog::set_default_logger`. Any sink - // attached BEFORE that point lands on a stale logger. Attach the - // ringbuffer sink AFTER the last HttpServer construction so it - // captures the live logger's output. - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - auto ring = std::make_shared< - spdlog::sinks::ringbuffer_sink_mt>(1024); - auto logger = logging::Get(); - auto prev_level = logger->level(); - logger->set_level(spdlog::level::debug); - logger->sinks().push_back(ring); - - struct SinkGuard { - std::shared_ptr logger; - std::shared_ptr ring; - spdlog::level::level_enum prev_level; - ~SinkGuard() { - auto& sinks = logger->sinks(); - sinks.erase(std::remove(sinks.begin(), sinks.end(), - std::shared_ptr(ring)), - sinks.end()); - logger->set_level(prev_level); - } - } guard{logger, ring, prev_level}; - - // Drive exactly threshold=2 failures to trip. - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - - // Give the dispatcher a breath to emit + the sink to settle. 
- std::this_thread::sleep_for(std::chrono::milliseconds(50)); - - auto messages = ring->last_formatted(); - // Scan for the trip message. Look for the static prefix plus the - // §11.1 field tokens. - bool saw_tripped = false; - bool has_trigger = false; - bool has_consec_failures = false; - bool has_window_total = false; - bool has_fail_rate = false; - bool has_open_for_ms = false; - bool has_consec_trips = false; - for (const auto& msg : messages) { - if (msg.find("circuit breaker tripped") == std::string::npos) { - continue; - } - saw_tripped = true; - if (msg.find("trigger=") != std::string::npos) has_trigger = true; - if (msg.find("consecutive_failures=") != std::string::npos) - has_consec_failures = true; - if (msg.find("window_total=") != std::string::npos) - has_window_total = true; - if (msg.find("window_fail_rate=") != std::string::npos) - has_fail_rate = true; - if (msg.find("open_for_ms=") != std::string::npos) - has_open_for_ms = true; - if (msg.find("consecutive_trips=") != std::string::npos) - has_consec_trips = true; - } - - bool pass = saw_tripped && has_trigger && has_consec_failures && - has_window_total && has_fail_rate && - has_open_for_ms && has_consec_trips; - TestFramework::RecordTest( - "CB Phase 7: trip log emission", pass, - pass ? 
"" : - "saw_tripped=" + std::to_string(saw_tripped) + - " trigger=" + std::to_string(has_trigger) + - " consec_failures=" + std::to_string(has_consec_failures) + - " window_total=" + std::to_string(has_window_total) + - " fail_rate=" + std::to_string(has_fail_rate) + - " open_for_ms=" + std::to_string(has_open_for_ms) + - " consec_trips=" + std::to_string(has_consec_trips)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 7: trip log emission", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 3: Retry-budget observability — the exhausted log carries the -// §11.1 fields (service, in_flight, retries_in_flight, cap), and the -// host snapshot reflects retries_rejected. -// --------------------------------------------------------------------------- -void TestRetryBudgetObservability() { - std::cout << "\n[TEST] CB Phase 7: retry budget observability..." - << std::endl; - try { - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { - resp.Status(502).Body("upstream-err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - - // Budget: zero percent AND zero floor → every retry rejected. - auto u = MakeObservUpstream("svc-budget", "127.0.0.1", backend_port, - /*threshold=*/10000); - u.proxy.retry.max_retries = 2; - u.proxy.retry.retry_on_5xx = true; - u.circuit_breaker.retry_budget_percent = 0; - u.circuit_breaker.retry_budget_min_concurrency = 0; - gw.upstreams.push_back(u); - - // Attach the ringbuffer AFTER gateway construction — see - // TestTripLogEmission for rationale (HttpServer's ctor - // replaces the default logger via logging::Init, detaching - // any previously-attached sinks). 
- HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - auto ring = std::make_shared< - spdlog::sinks::ringbuffer_sink_mt>(1024); - auto logger = logging::Get(); - auto prev_level = logger->level(); - logger->set_level(spdlog::level::debug); - logger->sinks().push_back(ring); - - struct SinkGuard { - std::shared_ptr logger; - std::shared_ptr ring; - spdlog::level::level_enum prev_level; - ~SinkGuard() { - auto& sinks = logger->sinks(); - sinks.erase(std::remove(sinks.begin(), sinks.end(), - std::shared_ptr(ring)), - sinks.end()); - logger->set_level(prev_level); - } - } guard{logger, ring, prev_level}; - - // One client request: first attempt hits backend (502), retry - // blocked by budget → 503 + X-Retry-Budget-Exhausted. - TestHttpClient::HttpGet(gw_port, "/fail", 5000); - - std::this_thread::sleep_for(std::chrono::milliseconds(50)); - - auto messages = ring->last_formatted(); - bool saw_exhausted = false; - bool has_service = false; - bool has_inflight = false; - bool has_retries_inflight = false; - bool has_cap = false; - for (const auto& msg : messages) { - if (msg.find("retry budget exhausted") == std::string::npos) { - continue; - } - saw_exhausted = true; - if (msg.find("service=") != std::string::npos) has_service = true; - if (msg.find("in_flight=") != std::string::npos) - has_inflight = true; - if (msg.find("retries_in_flight=") != std::string::npos) - has_retries_inflight = true; - if (msg.find("cap=") != std::string::npos) has_cap = true; - } - - // Snapshot: retries_rejected must be >= 1 (every rejection increments). - int64_t retries_rejected = 0; - auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); - if (cbm) { - for (const auto& s : cbm->SnapshotAll()) { - if (s.service_name == "svc-budget") { - // Host aggregate — single host, so the sum is the - // host's retries_rejected. The snapshot doesn't yet - // expose that directly — derive from RetryBudget - // via the host getter. 
- auto* host = cbm->GetHost("svc-budget"); - if (host) { - retries_rejected = - host->GetRetryBudget()->RetriesRejected(); - } - break; - } - } - } - - bool pass = saw_exhausted && has_service && has_inflight && - has_retries_inflight && has_cap && - retries_rejected >= 1; - TestFramework::RecordTest( - "CB Phase 7: retry budget observability", pass, - pass ? "" : - "saw_exhausted=" + std::to_string(saw_exhausted) + - " service=" + std::to_string(has_service) + - " inflight=" + std::to_string(has_inflight) + - " retries_inflight=" + std::to_string(has_retries_inflight) + - " cap=" + std::to_string(has_cap) + - " retries_rejected=" + std::to_string(retries_rejected)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 7: retry budget observability", false, e.what()); - } -} - -void RunAllTests() { - std::cout << "\n" << std::string(60, '=') << std::endl; - std::cout << "CIRCUIT BREAKER PHASE 7 - OBSERVABILITY TESTS" << std::endl; - std::cout << std::string(60, '=') << std::endl; - - TestSnapshotReflectsCounters(); - TestTripLogEmission(); - TestRetryBudgetObservability(); -} - -} // namespace CircuitBreakerPhase7Tests From f977ed2988022a4de18f8f2b3de7ed23c01f3523 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 23:56:49 +0800 Subject: [PATCH 27/37] Fix review comment --- docs/circuit_breaker.md | 2 +- server/http_server.cc | 26 +++++-- server/proxy_transaction.cc | 133 ++++++++++++++++++++---------------- 3 files changed, 93 insertions(+), 68 deletions(-) diff --git a/docs/circuit_breaker.md b/docs/circuit_breaker.md index 6f38de69..ef3a5ef0 100644 --- a/docs/circuit_breaker.md +++ b/docs/circuit_breaker.md @@ -134,7 +134,7 @@ Topology edits (`host`, `port`, `pool.*`, `proxy.*`, `tls.*`) still require a re ### Snapshot API -`CircuitBreakerManager::SnapshotAll()` returns one `CircuitBreakerHostSnapshot` per upstream with per-slice rows (`state`, `trips`, `rejected`, `probe_successes`, `probe_failures`) plus host-level aggregates 
(`total_trips`, `total_rejected`, `open_partitions`, `half_open_partitions`, `retries_in_flight`, `retries_rejected`, `in_flight`). A future `/admin/breakers` endpoint would JSON-serialize this. +`CircuitBreakerManager::SnapshotAll()` returns one `CircuitBreakerHostSnapshot` per upstream with per-slice rows (`state`, `trips`, `rejected`, `probe_successes`, `probe_failures`) plus host-level aggregates (`total_trips`, `total_rejected`, `open_partitions`, `half_open_partitions`, `retries_in_flight`, `retries_rejected`, `in_flight`). A `/admin/breakers` HTTP endpoint that JSON-serializes this snapshot is **planned but not yet exposed** — the API is ready for future wiring. --- diff --git a/server/http_server.cc b/server/http_server.cc index abee42c0..2289ebef 100644 --- a/server/http_server.cc +++ b/server/http_server.cc @@ -3743,19 +3743,31 @@ bool HttpServer::Reload(const ServerConfig& new_config) { // UpstreamConfig deliberately excludes `circuit_breaker` so a CB- // only edit doesn't trigger this warning (the reload above already // applied the new breaker settings to live slices). + // + // When topology DIFFERS, we deliberately DO NOT copy the staged + // config into `upstream_configs_`: subsequent reloads (including + // the timer-cadence recomputation above) read from this vector to + // match live pool state. Adopting staged-but-inactive topology + // values would silently widen the dispatcher timer past the active + // pool timeouts — e.g. staging `pool.connect_timeout_ms=10000` + // (restart required) then reloading any unrelated field would + // recompute cadence from 10s while the live pool still uses 3s, + // firing connect-timeouts late. The CB-field portion of the edit + // was already applied live via `circuit_breaker_manager_->Reload` + // above, so the live slices carry the new tuning regardless of + // whether `upstream_configs_` shows it. 
+ // + // When topology MATCHES (the common case, including CB-only + // edits), adopt the new snapshot as the fresh baseline so CB- + // field edits persist for later reload diffs. if (new_config.upstreams != upstream_configs_) { logging::Get()->warn("Reload: upstream topology changes require a " "restart to take effect (circuit-breaker " "field edits, if any, were applied live)"); + } else { + upstream_configs_ = new_config.upstreams; } - // Persist the new upstreams (preserving the breaker propagation just - // applied). Subsequent reloads diff against this baseline, so without - // this update a second SIGHUP would re-propagate the same CB values - // and also see the original topology as "unchanged" rather than the - // attempted new state — confusing operators debugging reload behavior. - upstream_configs_ = new_config.upstreams; - return true; } diff --git a/server/proxy_transaction.cc b/server/proxy_transaction.cc index 29dbe550..d34bbc32 100644 --- a/server/proxy_transaction.cc +++ b/server/proxy_transaction.cc @@ -124,13 +124,19 @@ void ProxyTransaction::Start() { auto* host = cbm->GetHost(service_name_); if (host) { slice_ = host->GetSlice(static_cast(dispatcher_index_)); - // Retry budget is host-level (shared across partitions). - // Resolve from the same host so retry admission math stays - // consistent with the slice's dispatcher routing. Always - // non-null when the host exists (budget is unconditionally - // constructed by the host ctor). Null only when `host` - // itself is null. - retry_budget_ = host->GetRetryBudget(); + // Retry budget is part of the circuit-breaker feature and + // must inherit its opt-in default. 
CircuitBreakerHost + // unconditionally constructs a RetryBudget (one-per-host) + // so the pointer is always available — but engaging it + // when `circuit_breaker.enabled=false` would silently + // regress deployments that set `proxy.retry.max_retries>0` + // without ever opting into circuit breaking: a retry + // storm would suddenly see 503+X-Retry-Budget-Exhausted. + // Gate on the slice's live config so the enabled-toggle + // flip is the sole switch for the whole feature. + if (slice_ && slice_->config().enabled) { + retry_budget_ = host->GetRetryBudget(); + } } } } @@ -153,6 +159,44 @@ void ProxyTransaction::AttemptCheckout() { return; } + // Retry-budget gate for retry attempts (attempt_ > 0). Gating here + // rather than in MaybeRetry means a delayed retry holds no token + // during its backoff sleep — the budget's `retries_in_flight` + // reflects only retries that are actually about to reach (or are + // reaching) the upstream, matching the "aggregate upstream load" + // semantics of the %-of-in-flight cap. Gating in MaybeRetry + // instead would count queued-but-sleeping retries toward the cap + // and trigger X-Retry-Budget-Exhausted even when no retry has + // actually contacted the upstream yet. + // + // The `!retry_token_held_` guard is defensive — Cleanup() between + // retry attempts always releases the prior token, so this branch + // never normally sees an already-held token; the check only + // prevents a re-entrant AttemptCheckout from double-consuming. 
+ if (retry_budget_ && attempt_ > 0 && !retry_token_held_) { + bool is_dry_run = slice_ && slice_->config().dry_run; + if (retry_budget_->TryConsumeRetry()) { + retry_token_held_ = true; + } else if (is_dry_run) { + logging::Get()->info( + "ProxyTransaction retry budget would-reject (dry-run) " + "client_fd={} service={} attempt={}", + client_fd_, service_name_, attempt_); + } else { + logging::Get()->warn( + "retry budget exhausted service={} in_flight={} " + "retries_in_flight={} cap={} client_fd={} attempt={}", + service_name_, + retry_budget_->InFlight(), + retry_budget_->RetriesInFlight(), + retry_budget_->ComputeCap(), + client_fd_, attempt_); + state_ = State::FAILED; + DeliverResponse(MakeRetryBudgetResponse()); + return; + } + } + // Track this attempt against the host-level retry budget's // in_flight counter. Replaces any prior guard (from the previous // attempt of the same transaction) — move-assignment decrements @@ -691,8 +735,12 @@ void ProxyTransaction::MaybeRetry(RetryPolicy::RetryCondition condition) { // Release old lease, clear callbacks, poison if tainted. // Cleanup also releases any retry token held by the previous - // retry attempt (attempt_ > 1) so the next TryConsumeRetry sees - // a fresh counter. + // retry attempt so the next TryConsumeRetry in AttemptCheckout + // sees a fresh counter. The retry-budget gate itself now lives + // at the top of AttemptCheckout — that way a delayed retry + // doesn't hold a token during its backoff sleep, which would + // otherwise pollute the budget's retries_in_flight with + // queued-but-sleeping work that hasn't reached the upstream. Cleanup(); codec_.Reset(); // Re-apply request method after reset — llhttp_init() zeroes @@ -701,49 +749,6 @@ void ProxyTransaction::MaybeRetry(RetryPolicy::RetryCondition condition) { codec_.SetRequestMethod(method_); poison_connection_ = false; - // Retry-budget gate. `attempt_ > 0` here is guaranteed — we - // just incremented. 
The budget bounds how many retries can be - // concurrently in flight against this upstream HOST (aggregated - // across all transactions for the service), preventing a retry - // storm from amplifying traffic to a struggling backend. - // - // Dry-run: log the would-reject but still proceed (consistent - // with REJECTED_OPEN_DRYRUN on the slice path). No token is - // consumed, so no ReleaseRetry is needed on the dry-run path. - // - // Full mode: deliver the §12.2 retry-budget response (503 + - // X-Retry-Budget-Exhausted) and terminate. Does NOT call - // ReportBreakerOutcome — our own reject must not feed back - // into the slice's failure math. - if (retry_budget_) { - bool is_dry_run = slice_ && slice_->config().dry_run; - if (retry_budget_->TryConsumeRetry()) { - retry_token_held_ = true; - } else if (is_dry_run) { - logging::Get()->info( - "ProxyTransaction retry budget would-reject (dry-run) " - "client_fd={} service={} attempt={}", - client_fd_, service_name_, attempt_); - } else { - // §11.1 format: log per-host budget state so operators - // can diagnose retry-storm throttling without hitting - // an admin endpoint. `cap` is the live effective ceiling - // (may have shifted since the failing TryConsumeRetry - // due to other transactions' in_flight changes). - logging::Get()->warn( - "retry budget exhausted service={} in_flight={} " - "retries_in_flight={} cap={} client_fd={} attempt={}", - service_name_, - retry_budget_->InFlight(), - retry_budget_->RetriesInFlight(), - retry_budget_->ComputeCap(), - client_fd_, attempt_); - state_ = State::FAILED; - DeliverResponse(MakeRetryBudgetResponse()); - return; - } - } - // Condition-dependent first-retry policy: // Connection-level failures (stale keep-alive, connect refused) // are transient — a different pooled connection will succeed. 
@@ -1089,18 +1094,26 @@ HttpResponse ProxyTransaction::MakeErrorResponse(int result_code) { } if (result_code == RESULT_CIRCUIT_OPEN) { // The static factory has no `this`, so it cannot build the - // §12.1-compliant response (Retry-After / X-Circuit-Breaker / - // X-Upstream-Host). All in-class paths for CIRCUIT_OPEN use - // the non-static MakeCircuitOpenResponse() — reaching this - // branch means a future caller forgot that rule, and would - // silently serve a non-compliant 503. Log loudly so the - // mistake shows up in logs instead of producing a stealth - // regression against the public contract. + // fully §12.1-compliant response (Retry-After derived from + // slice state, X-Upstream-Host). All in-class paths for + // CIRCUIT_OPEN use the non-static MakeCircuitOpenResponse() + // — reaching this branch means a future caller forgot that + // rule. Log loudly so the mistake shows up in logs instead + // of producing a stealth regression against the contract. + // + // Still emit `X-Circuit-Breaker: open` + `Connection: close` + // so the response remains self-identifying as a circuit-open + // reject. Clients inspecting that header will correctly back + // off via their own client-side logic rather than treating + // this as an anonymous 503. 
logging::Get()->error( "ProxyTransaction::MakeErrorResponse(RESULT_CIRCUIT_OPEN) " "invoked from static context — use MakeCircuitOpenResponse() " "to emit §12.1-compliant headers"); - return HttpResponse::ServiceUnavailable(); + HttpResponse resp = HttpResponse::ServiceUnavailable(); + resp.Header("X-Circuit-Breaker", "open"); + resp.Header("Connection", "close"); + return resp; } if (result_code == RESULT_CHECKOUT_FAILED || result_code == RESULT_SEND_FAILED || From f2f72efda28ee2aceabdfa0c9ff31e53437d1a57 Mon Sep 17 00:00:00 2001 From: mwfj Date: Wed, 15 Apr 2026 00:23:56 +0800 Subject: [PATCH 28/37] Fix review comment --- include/config/config_loader.h | 14 +++++++ server/config_loader.cc | 67 ++++++++++++++++++++++++++++++++++ server/http_server.cc | 36 +++++++++++++----- server/main.cc | 19 ++++++++++ server/proxy_transaction.cc | 65 +++++++++++++++++++-------------- 5 files changed, 164 insertions(+), 37 deletions(-) diff --git a/include/config/config_loader.h b/include/config/config_loader.h index ba13f62a..fbf3319e 100644 --- a/include/config/config_loader.h +++ b/include/config/config_loader.h @@ -27,6 +27,20 @@ class ConfigLoader { // Throws std::invalid_argument if validation fails. static void Validate(const ServerConfig& config); + // Validate ONLY the fields that are live-reloadable without a + // restart — today this is the per-upstream circuit_breaker block. + // Used by the SIGHUP reload path, which downgrades the full + // `Validate()` failure to a warn because most of its rules cover + // restart-only fields. That downgrade is unsafe for live- + // reloadable fields: an invalid breaker threshold would be + // pushed into live slices even though the same value would be + // rejected at startup. Call this BEFORE applying a reloaded + // config and abort the reload if it throws. + // + // Throws std::invalid_argument with a message identifying the + // offending upstream and field. 
+ static void ValidateHotReloadable(const ServerConfig& config); + // Return a ServerConfig with all default values. static ServerConfig Default(); diff --git a/server/config_loader.cc b/server/config_loader.cc index f92dd3f2..d6566904 100644 --- a/server/config_loader.cc +++ b/server/config_loader.cc @@ -561,6 +561,73 @@ void ConfigLoader::ApplyEnvOverrides(ServerConfig& config) { if (val) config.rate_limit.status_code = EnvToInt(val, "REACTOR_RATE_LIMIT_STATUS_CODE"); } +void ConfigLoader::ValidateHotReloadable(const ServerConfig& config) { + // Mirrors the circuit_breaker validation block in Validate(). + // Kept in lock-step with that block — any rule added there for a + // hot-reloadable field must be added here too, or the SIGHUP + // reload path would silently accept values the startup path + // rejects (which is exactly the regression this helper exists + // to prevent). + for (size_t i = 0; i < config.upstreams.size(); ++i) { + const auto& u = config.upstreams[i]; + const std::string idx = "upstreams[" + std::to_string(i) + "]"; + const auto& cb = u.circuit_breaker; + if (cb.consecutive_failure_threshold < 1 || + cb.consecutive_failure_threshold > 10000) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.consecutive_failure_threshold must be in [1, 10000]"); + } + if (cb.failure_rate_threshold < 0 || cb.failure_rate_threshold > 100) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.failure_rate_threshold must be in [0, 100]"); + } + if (cb.minimum_volume < 1 || cb.minimum_volume > 10000000) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.minimum_volume must be in [1, 10000000]"); + } + if (cb.window_seconds < 1 || cb.window_seconds > 3600) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.window_seconds must be in [1, 3600]"); + } + if (cb.permitted_half_open_calls < 1 || + cb.permitted_half_open_calls > 1000) { + throw 
std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.permitted_half_open_calls must be in [1, 1000]"); + } + if (cb.base_open_duration_ms < 100) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.base_open_duration_ms must be >= 100"); + } + if (cb.max_open_duration_ms < cb.base_open_duration_ms) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.max_open_duration_ms must be >= base_open_duration_ms"); + } + if (cb.max_ejection_percent_per_host_set < 0 || + cb.max_ejection_percent_per_host_set > 100) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.max_ejection_percent_per_host_set must be in [0, 100]"); + } + if (cb.retry_budget_percent < 0 || cb.retry_budget_percent > 100) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.retry_budget_percent must be in [0, 100]"); + } + if (cb.retry_budget_min_concurrency < 0) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.retry_budget_min_concurrency must be >= 0"); + } + } +} + void ConfigLoader::Validate(const ServerConfig& config) { // Validate bind_host is a strict dotted-quad IPv4 address. // Use inet_pton (not inet_addr) to reject legacy shorthand forms diff --git a/server/http_server.cc b/server/http_server.cc index 2289ebef..1f8ee83c 100644 --- a/server/http_server.cc +++ b/server/http_server.cc @@ -462,15 +462,33 @@ void HttpServer::MarkServerReady() { [um, service, i](circuit_breaker::State old_s, circuit_breaker::State new_s, const char* /*trigger*/) { - // Drain only on CLOSED→OPEN. HALF_OPEN→OPEN - // doesn't need draining — in HALF_OPEN, non- - // probe admissions are already REJECTED_OPEN - // before reaching the pool queue, so the - // queue stays empty (or holds only probes, - // which are in-flight by the time HALF_OPEN - // trips back). 
- if (old_s == circuit_breaker::State::CLOSED && - new_s == circuit_breaker::State::OPEN) { + // Drain the partition's wait queue whenever + // the slice enters OPEN — from CLOSED (fresh + // trip) OR from HALF_OPEN (probe cycle re- + // tripped). + // + // CLOSED→OPEN is the classic case: queued + // non-probe waiters need to fail fast with + // CHECKOUT_CIRCUIT_OPEN rather than wait for + // the full open duration. + // + // HALF_OPEN→OPEN (probe_fail) matters + // because probe admissions pass + // ConsultBreaker() BEFORE CheckoutAsync() — + // if the pool was saturated during the + // probe cycle, those admitted probes may + // still be queued when the cycle re-trips. + // Without draining, a saw_failure probe + // cycle can leave the pool with queued + // waiters that still eventually dispatch to + // a known-bad upstream. Draining also + // sweeps any non-probe waiters that + // somehow queued during HALF_OPEN (defense + // in depth — TryAcquire normally rejects + // non-probes before they reach the pool). + if (new_s == circuit_breaker::State::OPEN && + (old_s == circuit_breaker::State::CLOSED || + old_s == circuit_breaker::State::HALF_OPEN)) { if (auto* part = um->GetPoolPartition( service, i)) { part->DrainWaitQueueOnTrip(); diff --git a/server/main.cc b/server/main.cc index 0d7474e9..f7bac586 100644 --- a/server/main.cc +++ b/server/main.cc @@ -328,7 +328,26 @@ static bool ReloadConfig(const std::string& config_path, } } } + // Hot-reloadable fields (today: per-upstream `circuit_breaker.*`) + // are the only ones that go LIVE on a SIGHUP reload. Validate + // them strictly — a bad value here would be pushed into running + // slices and keep running until an operator-driven restart fixes + // the config file. Hard-reject so operators see the error + // immediately instead of discovering drift the next time the + // startup path rejects the same file. 
+ try { + ConfigLoader::ValidateHotReloadable(new_config); + } catch (const std::invalid_argument& e) { + logging::Get()->error("Config reload rejected: {}", e.what()); + reopen_existing_logs(); + return false; + } + // Warn about restart-required field issues (not applied during reload). + // Full Validate() includes both hot-reloadable rules (already checked + // above) and restart-only rules; by the time we reach this point the + // hot-reloadable subset is known valid, so any exception thrown here + // is from restart-only rules and is legitimately a warn, not an error. try { ConfigLoader::Validate(new_config); } catch (const std::invalid_argument& e) { diff --git a/server/proxy_transaction.cc b/server/proxy_transaction.cc index d34bbc32..a5483006 100644 --- a/server/proxy_transaction.cc +++ b/server/proxy_transaction.cc @@ -124,19 +124,13 @@ void ProxyTransaction::Start() { auto* host = cbm->GetHost(service_name_); if (host) { slice_ = host->GetSlice(static_cast(dispatcher_index_)); - // Retry budget is part of the circuit-breaker feature and - // must inherit its opt-in default. CircuitBreakerHost - // unconditionally constructs a RetryBudget (one-per-host) - // so the pointer is always available — but engaging it - // when `circuit_breaker.enabled=false` would silently - // regress deployments that set `proxy.retry.max_retries>0` - // without ever opting into circuit breaking: a retry - // storm would suddenly see 503+X-Retry-Budget-Exhausted. - // Gate on the slice's live config so the enabled-toggle - // flip is the sole switch for the whole feature. - if (slice_ && slice_->config().enabled) { - retry_budget_ = host->GetRetryBudget(); - } + // Cache the retry-budget pointer unconditionally when + // the host exists — usage at each attempt is gated by + // the live `slice_->config().enabled` flag so that + // SIGHUP toggles take effect on the next retry within + // a running transaction. Resolution-time gating would + // miss the flip in either direction. 
+ retry_budget_ = host->GetRetryBudget(); } } } @@ -164,17 +158,22 @@ void ProxyTransaction::AttemptCheckout() { // during its backoff sleep — the budget's `retries_in_flight` // reflects only retries that are actually about to reach (or are // reaching) the upstream, matching the "aggregate upstream load" - // semantics of the %-of-in-flight cap. Gating in MaybeRetry - // instead would count queued-but-sleeping retries toward the cap - // and trigger X-Retry-Budget-Exhausted even when no retry has - // actually contacted the upstream yet. + // semantics of the %-of-in-flight cap. + // + // Live-check `slice_->config().enabled` at each attempt — the + // cached `retry_budget_` pointer is resolved once in Start(), but + // the `enabled` flag is the documented live master switch. A + // SIGHUP flipping enabled=true→false mid-flight must stop + // enforcing the budget on subsequent retries; enabled=false→true + // mid-flight must start. Gating at the pointer level would miss + // both directions. // // The `!retry_token_held_` guard is defensive — Cleanup() between - // retry attempts always releases the prior token, so this branch - // never normally sees an already-held token; the check only - // prevents a re-entrant AttemptCheckout from double-consuming. - if (retry_budget_ && attempt_ > 0 && !retry_token_held_) { - bool is_dry_run = slice_ && slice_->config().dry_run; + // retry attempts always releases the prior token. + bool breaker_live_enabled = slice_ && slice_->config().enabled; + if (retry_budget_ && breaker_live_enabled && + attempt_ > 0 && !retry_token_held_) { + bool is_dry_run = slice_->config().dry_run; if (retry_budget_->TryConsumeRetry()) { retry_token_held_ = true; } else if (is_dry_run) { @@ -191,6 +190,17 @@ void ProxyTransaction::AttemptCheckout() { retry_budget_->RetriesInFlight(), retry_budget_->ComputeCap(), client_fd_, attempt_); + // CRITICAL: release the slice admission before bailing. 
+ // ConsultBreaker() already admitted this attempt — in + // HALF_OPEN that means a probe slot was reserved + // (half_open_inflight_ / half_open_admitted_ both + // incremented). Returning here without releasing would + // strand that slot forever, wedging the slice in + // half_open_full until an operator-driven reload/reset. + // Neutral release decrements both counters for probes; + // no-op for non-probe (CLOSED) admissions, matching the + // general "local cause, no upstream signal" semantic. + ReleaseBreakerAdmissionNeutral(); state_ = State::FAILED; DeliverResponse(MakeRetryBudgetResponse()); return; @@ -198,12 +208,11 @@ void ProxyTransaction::AttemptCheckout() { } // Track this attempt against the host-level retry budget's - // in_flight counter. Replaces any prior guard (from the previous - // attempt of the same transaction) — move-assignment decrements - // the old counter and takes ownership of the new, so a retrying - // transaction stays at exactly one in_flight unit throughout. No-op - // when retry_budget_ is null (no breaker attached for this service). - if (retry_budget_) { + // in_flight counter. Gated by the live `enabled` flag so disabling + // the breaker mid-flight stops tracking immediately; enabling it + // starts tracking at the next attempt. No-op when retry_budget_ + // is null (no breaker manager / unknown host). 
+ if (retry_budget_ && breaker_live_enabled) { inflight_guard_ = retry_budget_->TrackInFlight(); } From 6520d86c99cdd567e4a074723e2f18b220896899 Mon Sep 17 00:00:00 2001 From: mwfj Date: Wed, 15 Apr 2026 09:47:40 +0800 Subject: [PATCH 29/37] Fix review comment --- server/config_loader.cc | 22 ++++++++++++++++++++++ server/http_server.cc | 22 ++++++++++++++++++++-- server/proxy_transaction.cc | 10 ++++++++++ 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/server/config_loader.cc b/server/config_loader.cc index d6566904..a34672ce 100644 --- a/server/config_loader.cc +++ b/server/config_loader.cc @@ -568,6 +568,28 @@ void ConfigLoader::ValidateHotReloadable(const ServerConfig& config) { // reload path would silently accept values the startup path // rejects (which is exactly the regression this helper exists // to prevent). + + // Reject duplicate upstream service names BEFORE the per-upstream + // CB validation. CircuitBreakerManager::Reload iterates the new + // upstream list and applies each entry's `circuit_breaker` block + // to GetHost(name). With duplicates, the first entry's CB values + // are applied, then the second entry's overwrite them — last + // write silently wins. Startup's full Validate() rejects the + // file outright; the hot-reload path must match. 
+ { + std::unordered_set seen; + seen.reserve(config.upstreams.size()); + for (size_t i = 0; i < config.upstreams.size(); ++i) { + const auto& name = config.upstreams[i].name; + if (!seen.insert(name).second) { + throw std::invalid_argument( + "upstreams[" + std::to_string(i) + + "] duplicate service name '" + name + + "' (upstream service names must be unique)"); + } + } + } + for (size_t i = 0; i < config.upstreams.size(); ++i) { const auto& u = config.upstreams[i]; const std::string idx = "upstreams[" + std::to_string(i) + "]"; diff --git a/server/http_server.cc b/server/http_server.cc index 1f8ee83c..8d4008c5 100644 --- a/server/http_server.cc +++ b/server/http_server.cc @@ -3545,8 +3545,16 @@ bool HttpServer::Reload(const ServerConfig& new_config) { // field changes (timeouts, limits, log level). validation_copy.http2.enabled = http2_enabled_ && new_config.http2.enabled; - // Upstream configs are restart-only — clear them so staged edits - // in the config file don't block live-safe field reloads. + // Upstream configs are RESTART-ONLY for topology fields, but the + // per-upstream `circuit_breaker` block is HOT-RELOADABLE — clearing + // upstreams entirely from validation_copy would skip CB-field + // validation here. Instead: clear the topology-restart-only + // path (the full Validate would reject those) and run a separate + // ValidateHotReloadable on the original new_config so live- + // reloadable CB rules (range checks, duplicate names) are + // enforced symmetrically with the SIGHUP path in main.cc. + // Without this, in-process callers using HttpServer::Reload + // directly would bypass the gate that the CLI path enforces. validation_copy.upstreams.clear(); // Rate limit config IS live-reloadable and MUST be validated. 
// Unlike upstreams (restart-only), rate_limit changes are applied @@ -3559,6 +3567,16 @@ bool HttpServer::Reload(const ServerConfig& new_config) { logging::Get()->error("Reload() rejected invalid config: {}", e.what()); return false; } + // Strict gate for hot-reloadable CB fields + duplicate names. + // Mirrors main.cc::ReloadConfig — both entry points must reject + // invalid CB tuning before it reaches live slices. + try { + ConfigLoader::ValidateHotReloadable(new_config); + } catch (const std::invalid_argument& e) { + logging::Get()->error("Reload() rejected invalid config: {}", + e.what()); + return false; + } } // Three-phase update to prevent mid-reload connections from seeing diff --git a/server/proxy_transaction.cc b/server/proxy_transaction.cc index a5483006..0e1e5a6f 100644 --- a/server/proxy_transaction.cc +++ b/server/proxy_transaction.cc @@ -946,6 +946,16 @@ void ProxyTransaction::Cleanup() { // via the retry_token_held_ flag. ReleaseRetryToken(); + // Release the in-flight guard from the just-ended attempt. If + // MaybeRetry schedules a delayed backoff, the gap between Cleanup + // and the eventual AttemptCheckout (which would move-assign a + // fresh guard) holds the old slot in `retry_budget_->in_flight_` + // for the entire backoff sleep. That inflates the effective + // denominator of the percent-cap formula, weakening the budget + // exactly during retry storms. Move-assign from a default + // (empty) guard decrements the old counter immediately. 
+ inflight_guard_ = circuit_breaker::RetryBudget::InFlightGuard{}; + if (lease_) { auto* conn = lease_.Get(); if (conn) { From a32469b6e7282de400a75d3c6ceb592dc32838e6 Mon Sep 17 00:00:00 2001 From: mwfj Date: Wed, 15 Apr 2026 10:11:42 +0800 Subject: [PATCH 30/37] Fix review comment --- include/config/config_loader.h | 27 ++++++++++++++++++++-- server/config_loader.cc | 34 ++++++++++++++++++++++------ server/http_server.cc | 25 ++++++++++++++++----- server/main.cc | 41 +++++++++++++++++++++++----------- 4 files changed, 99 insertions(+), 28 deletions(-) diff --git a/include/config/config_loader.h b/include/config/config_loader.h index fbf3319e..2a76c3b8 100644 --- a/include/config/config_loader.h +++ b/include/config/config_loader.h @@ -3,6 +3,7 @@ #include "config/server_config.h" #include #include +#include class ConfigLoader { public: @@ -28,7 +29,9 @@ class ConfigLoader { static void Validate(const ServerConfig& config); // Validate ONLY the fields that are live-reloadable without a - // restart — today this is the per-upstream circuit_breaker block. + // restart — today this is the per-upstream circuit_breaker block + // plus a duplicate-name check. + // // Used by the SIGHUP reload path, which downgrades the full // `Validate()` failure to a warn because most of its rules cover // restart-only fields. That downgrade is unsafe for live- @@ -37,9 +40,29 @@ class ConfigLoader { // rejected at startup. Call this BEFORE applying a reloaded // config and abort the reload if it throws. // + // Scope of CB-field validation: + // `live_upstream_names` lists service names CURRENTLY known to + // the running server. CB fields are validated only for entries + // whose name is in this set, because + // `CircuitBreakerManager::Reload` only applies CB changes to + // pre-existing hosts (new/removed names are restart-only and + // skipped with a warn). Validating CB blocks for not-yet- + // running entries would block otherwise-safe reloads — e.g. 
a + // reload that stages a new upstream with an intentionally + // placeholder breaker block would abort even though the live + // server would never apply it. Pass an empty set when no + // upstreams are running yet (only the duplicate-name check + // runs in that case). + // + // Duplicate-name rejection runs unconditionally on the new + // config's upstream list: even for new/renamed entries, the + // file itself is malformed if names collide. + // // Throws std::invalid_argument with a message identifying the // offending upstream and field. - static void ValidateHotReloadable(const ServerConfig& config); + static void ValidateHotReloadable( + const ServerConfig& config, + const std::unordered_set& live_upstream_names); // Return a ServerConfig with all default values. static ServerConfig Default(); diff --git a/server/config_loader.cc b/server/config_loader.cc index a34672ce..38fb2fb4 100644 --- a/server/config_loader.cc +++ b/server/config_loader.cc @@ -561,7 +561,9 @@ void ConfigLoader::ApplyEnvOverrides(ServerConfig& config) { if (val) config.rate_limit.status_code = EnvToInt(val, "REACTOR_RATE_LIMIT_STATUS_CODE"); } -void ConfigLoader::ValidateHotReloadable(const ServerConfig& config) { +void ConfigLoader::ValidateHotReloadable( + const ServerConfig& config, + const std::unordered_set& live_upstream_names) { // Mirrors the circuit_breaker validation block in Validate(). // Kept in lock-step with that block — any rule added there for a // hot-reloadable field must be added here too, or the SIGHUP @@ -570,12 +572,14 @@ void ConfigLoader::ValidateHotReloadable(const ServerConfig& config) { // to prevent). // Reject duplicate upstream service names BEFORE the per-upstream - // CB validation. CircuitBreakerManager::Reload iterates the new - // upstream list and applies each entry's `circuit_breaker` block - // to GetHost(name). With duplicates, the first entry's CB values - // are applied, then the second entry's overwrite them — last - // write silently wins. 
Startup's full Validate() rejects the - // file outright; the hot-reload path must match. + // CB validation. Even for new/renamed entries, the file is + // malformed if names collide: `CircuitBreakerManager::Reload` + // iterates the new upstream list and applies each entry's + // `circuit_breaker` block to GetHost(name); duplicates would + // silently overwrite (last-write wins). Startup's full Validate() + // rejects the file outright; the hot-reload path must match. + // This rule runs UNCONDITIONALLY on the new config — it doesn't + // depend on `live_upstream_names`. { std::unordered_set seen; seen.reserve(config.upstreams.size()); @@ -593,6 +597,22 @@ void ConfigLoader::ValidateHotReloadable(const ServerConfig& config) { for (size_t i = 0; i < config.upstreams.size(); ++i) { const auto& u = config.upstreams[i]; const std::string idx = "upstreams[" + std::to_string(i) + "]"; + + // CB-field validation is scoped to upstreams that are LIVE in + // the running server. CircuitBreakerManager::Reload only + // applies CB changes to pre-existing hosts — new/renamed + // entries are restart-only and skipped with a warn — so + // validating their CB blocks here would block otherwise-safe + // reloads (e.g. a reload that stages a new upstream alongside + // a log-level edit would abort even though the live server + // would never apply the new upstream's CB block). + // + // The empty-set case (no live upstreams yet) is handled by + // the same check: every entry is "new", so every entry is + // skipped — only the duplicate-name check runs. 
+ if (live_upstream_names.find(u.name) == live_upstream_names.end()) { + continue; + } const auto& cb = u.circuit_breaker; if (cb.consecutive_failure_threshold < 1 || cb.consecutive_failure_threshold > 10000) { diff --git a/server/http_server.cc b/server/http_server.cc index 8d4008c5..247c5795 100644 --- a/server/http_server.cc +++ b/server/http_server.cc @@ -3570,12 +3570,25 @@ bool HttpServer::Reload(const ServerConfig& new_config) { // Strict gate for hot-reloadable CB fields + duplicate names. // Mirrors main.cc::ReloadConfig — both entry points must reject // invalid CB tuning before it reaches live slices. - try { - ConfigLoader::ValidateHotReloadable(new_config); - } catch (const std::invalid_argument& e) { - logging::Get()->error("Reload() rejected invalid config: {}", - e.what()); - return false; + // + // CB validation is scoped to existing upstream names: only + // those entries get applied via CircuitBreakerManager::Reload, + // so validating CB blocks for new/renamed entries would + // block otherwise-safe reloads. `upstream_configs_` is the + // post-Start snapshot of running upstreams. + { + std::unordered_set live_names; + live_names.reserve(upstream_configs_.size()); + for (const auto& u : upstream_configs_) { + live_names.insert(u.name); + } + try { + ConfigLoader::ValidateHotReloadable(new_config, live_names); + } catch (const std::invalid_argument& e) { + logging::Get()->error("Reload() rejected invalid config: {}", + e.what()); + return false; + } } } diff --git a/server/main.cc b/server/main.cc index f7bac586..e0fa7790 100644 --- a/server/main.cc +++ b/server/main.cc @@ -328,19 +328,34 @@ static bool ReloadConfig(const std::string& config_path, } } } - // Hot-reloadable fields (today: per-upstream `circuit_breaker.*`) - // are the only ones that go LIVE on a SIGHUP reload. Validate - // them strictly — a bad value here would be pushed into running - // slices and keep running until an operator-driven restart fixes - // the config file. 
Hard-reject so operators see the error - // immediately instead of discovering drift the next time the - // startup path rejects the same file. - try { - ConfigLoader::ValidateHotReloadable(new_config); - } catch (const std::invalid_argument& e) { - logging::Get()->error("Config reload rejected: {}", e.what()); - reopen_existing_logs(); - return false; + // Hot-reloadable fields (today: per-upstream `circuit_breaker.*` + // on existing services + duplicate-name uniqueness across the + // new file) are the only ones that go LIVE on a SIGHUP reload. + // Validate them strictly — a bad value here would be pushed into + // running slices and keep running until an operator-driven + // restart fixes the config file. Hard-reject so operators see + // the error immediately instead of discovering drift the next + // time the startup path rejects the same file. + // + // CB validation is scoped to existing upstream names — + // CircuitBreakerManager::Reload only applies CB changes to those. + // New/renamed upstreams are restart-only; their CB blocks are + // skipped here so an intentional placeholder doesn't block other + // live-safe edits in the same reload (log/rate-limit/breaker + // edits on existing services). + { + std::unordered_set live_names; + live_names.reserve(current_config.upstreams.size()); + for (const auto& u : current_config.upstreams) { + live_names.insert(u.name); + } + try { + ConfigLoader::ValidateHotReloadable(new_config, live_names); + } catch (const std::invalid_argument& e) { + logging::Get()->error("Config reload rejected: {}", e.what()); + reopen_existing_logs(); + return false; + } } // Warn about restart-required field issues (not applied during reload). 
From 1e9793e1e8c1ac26a09969072a5f07e923892d2b Mon Sep 17 00:00:00 2001 From: mwfj Date: Wed, 15 Apr 2026 10:58:50 +0800 Subject: [PATCH 31/37] Fix review comment --- server/http_server.cc | 51 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 10 deletions(-) diff --git a/server/http_server.cc b/server/http_server.cc index 247c5795..74e24973 100644 --- a/server/http_server.cc +++ b/server/http_server.cc @@ -458,10 +458,20 @@ void HttpServer::MarkServerReady() { for (size_t i = 0; i < host->partition_count(); ++i) { auto* slice = host->GetSlice(i); if (!slice) continue; + // Capture the slice pointer so the callback can read + // the LIVE `dry_run` flag on every fire — operators + // can toggle dry_run via SIGHUP, and the drain + // decision must reflect the current setting, not a + // snapshot from server startup. Slice lifetime is + // tied to the manager (declared after upstream + // manager → destructs first), so the raw pointer + // outlives every possible callback invocation. + auto* slice_ptr = slice; slice->SetTransitionCallback( - [um, service, i](circuit_breaker::State old_s, - circuit_breaker::State new_s, - const char* /*trigger*/) { + [um, service, i, slice_ptr]( + circuit_breaker::State old_s, + circuit_breaker::State new_s, + const char* /*trigger*/) { // Drain the partition's wait queue whenever // the slice enters OPEN — from CLOSED (fresh // trip) OR from HALF_OPEN (probe cycle re- @@ -486,13 +496,34 @@ void HttpServer::MarkServerReady() { // somehow queued during HALF_OPEN (defense // in depth — TryAcquire normally rejects // non-probes before they reach the pool). 
- if (new_s == circuit_breaker::State::OPEN && - (old_s == circuit_breaker::State::CLOSED || - old_s == circuit_breaker::State::HALF_OPEN)) { - if (auto* part = um->GetPoolPartition( - service, i)) { - part->DrainWaitQueueOnTrip(); - } + if (new_s != circuit_breaker::State::OPEN || + (old_s != circuit_breaker::State::CLOSED && + old_s != circuit_breaker::State::HALF_OPEN)) { + return; + } + // Dry-run honors the shadow-mode contract: + // the slice already log-but-admits + // would-reject decisions, so the wait-queue + // drain — which would deliver hard 503s + // (CHECKOUT_CIRCUIT_OPEN → RESULT_CIRCUIT_OPEN) + // to queued waiters — must also be a no-op. + // Otherwise shadow-mode rollouts can still + // drop queued requests under backpressure, + // defeating the safety of enabling dry_run + // on a live service. Logged at info so + // operators see the trip event without + // the side effect. + if (slice_ptr && slice_ptr->config().dry_run) { + logging::Get()->info( + "[dry-run] circuit breaker would drain " + "wait queue on trip — skipping (shadow " + "mode) service={} partition={}", + service, i); + return; + } + if (auto* part = um->GetPoolPartition( + service, i)) { + part->DrainWaitQueueOnTrip(); } }); } From 2d474bf8a001bd252924bb5befe5356249792fc1 Mon Sep 17 00:00:00 2001 From: mwfj Date: Wed, 15 Apr 2026 11:06:29 +0800 Subject: [PATCH 32/37] Fix review comment --- server/circuit_breaker_slice.cc | 32 ++++++++++++ server/http_server.cc | 89 ++++++++++++++++++--------------- 2 files changed, 81 insertions(+), 40 deletions(-) diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index 1ff6e00e..9dad6a31 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -527,6 +527,10 @@ void CircuitBreakerSlice::Reload(const CircuitBreakerConfig& new_config) { const bool enabled_changed = (config_.enabled != new_config.enabled); const bool window_changed = (config_.window_seconds != new_config.window_seconds); + // 
Snapshot the OLD dry_run before config_ is overwritten — used at + // the end of Reload to detect a true→false flip and signal the + // host to drain any waiters that accumulated during shadow mode. + const bool old_dry_run = config_.dry_run; config_ = new_config; if (window_changed) { @@ -610,6 +614,34 @@ void CircuitBreakerSlice::Reload(const CircuitBreakerConfig& new_config) { new_config.failure_rate_threshold, new_config.consecutive_failure_threshold, enabled_changed ? " (enabled toggled — state reset to CLOSED)" : ""); + + // dry_run true→false on a slice that's STILL OPEN: enforcement is + // back on, but the OPEN→OPEN intra-state config edit doesn't fire + // any natural transition callback. The pool partition may have + // queued waiters from the shadow-mode period (the original + // CLOSED→OPEN drain was skipped because dry_run was true at the + // time). Without flushing them now, those queued requests will + // eventually dispatch to the unhealthy upstream once a pool slot + // frees, defeating the just-re-enabled enforcement. + // + // Signal the host via a synthetic OPEN→OPEN transition callback + // with trigger="dry_run_disabled". The HttpServer-installed + // callback recognizes this special trigger and drains the + // partition queue. Real state transitions never reuse the same + // old/new state with this trigger string, so there's no overlap. + // + // Only fire when we KNOW the state is still OPEN — the + // enabled-toggle branch above resets to CLOSED, in which case the + // drain is unnecessary (no enforcement to re-engage). State is + // dispatcher-thread-only here; a plain load is sufficient. 
+ if (old_dry_run && !new_config.dry_run && + state_.load(std::memory_order_acquire) == State::OPEN && + transition_cb_) { + logging::Get()->info( + "circuit breaker dry_run disabled while OPEN {} — " + "flushing wait queue", host_label_); + transition_cb_(State::OPEN, State::OPEN, "dry_run_disabled"); + } } void CircuitBreakerSlice::SetTransitionCallback(StateTransitionCallback cb) { diff --git a/server/http_server.cc b/server/http_server.cc index 74e24973..4275aad1 100644 --- a/server/http_server.cc +++ b/server/http_server.cc @@ -471,48 +471,57 @@ void HttpServer::MarkServerReady() { [um, service, i, slice_ptr]( circuit_breaker::State old_s, circuit_breaker::State new_s, - const char* /*trigger*/) { - // Drain the partition's wait queue whenever - // the slice enters OPEN — from CLOSED (fresh - // trip) OR from HALF_OPEN (probe cycle re- - // tripped). - // - // CLOSED→OPEN is the classic case: queued - // non-probe waiters need to fail fast with - // CHECKOUT_CIRCUIT_OPEN rather than wait for - // the full open duration. - // - // HALF_OPEN→OPEN (probe_fail) matters - // because probe admissions pass - // ConsultBreaker() BEFORE CheckoutAsync() — - // if the pool was saturated during the - // probe cycle, those admitted probes may - // still be queued when the cycle re-trips. - // Without draining, a saw_failure probe - // cycle can leave the pool with queued - // waiters that still eventually dispatch to - // a known-bad upstream. Draining also - // sweeps any non-probe waiters that - // somehow queued during HALF_OPEN (defense - // in depth — TryAcquire normally rejects - // non-probes before they reach the pool). 
- if (new_s != circuit_breaker::State::OPEN || - (old_s != circuit_breaker::State::CLOSED && - old_s != circuit_breaker::State::HALF_OPEN)) { + const char* trigger) { + // Three drain triggers, all entering OPEN: + // CLOSED→OPEN : fresh trip; queued non- + // probe waiters need CHECKOUT_CIRCUIT_OPEN + // instead of waiting out the full open + // window. + // HALF_OPEN→OPEN : probe cycle re-tripped; + // probe admissions passed ConsultBreaker + // before CheckoutAsync, so saturated + // pools can leave them queued. Without + // draining they eventually dispatch to a + // known-bad upstream. + // OPEN→OPEN with trigger="dry_run_disabled" + // : synthetic signal from + // CircuitBreakerSlice::Reload when + // dry_run flips true→false on a slice + // that's still OPEN. The earlier trip + // skipped the drain (shadow mode); now + // enforcement is back on, queued + // waiters from that period must be + // flushed before the pool services + // them. Real transitions never use this + // trigger string with old==new==OPEN, + // so there's no overlap with normal + // state-machine signals. + const bool normal_trip = + new_s == circuit_breaker::State::OPEN && + (old_s == circuit_breaker::State::CLOSED || + old_s == circuit_breaker::State::HALF_OPEN); + const bool dry_run_disable_drain = + old_s == circuit_breaker::State::OPEN && + new_s == circuit_breaker::State::OPEN && + trigger != nullptr && + std::strcmp(trigger, + "dry_run_disabled") == 0; + if (!normal_trip && !dry_run_disable_drain) { return; } - // Dry-run honors the shadow-mode contract: - // the slice already log-but-admits - // would-reject decisions, so the wait-queue - // drain — which would deliver hard 503s - // (CHECKOUT_CIRCUIT_OPEN → RESULT_CIRCUIT_OPEN) - // to queued waiters — must also be a no-op. - // Otherwise shadow-mode rollouts can still - // drop queued requests under backpressure, - // defeating the safety of enabling dry_run - // on a live service. 
Logged at info so - // operators see the trip event without - // the side effect. + // Dry-run shadow-mode contract: the slice + // log-but-admits would-reject decisions, so + // the wait-queue drain — which would + // deliver hard 503s (CHECKOUT_CIRCUIT_OPEN + // → RESULT_CIRCUIT_OPEN) to queued + // waiters — must also be a no-op while + // dry_run is true. Note: when this fires + // via the dry_run_disabled trigger, the + // slice's config_.dry_run was already + // updated to false in Reload BEFORE the + // synthetic callback, so this guard + // correctly does NOT skip the drain in + // that case. if (slice_ptr && slice_ptr->config().dry_run) { logging::Get()->info( "[dry-run] circuit breaker would drain " From 94958d7924378cf7ee7517d36995a057d7a4ab01 Mon Sep 17 00:00:00 2001 From: mwfj Date: Wed, 15 Apr 2026 11:29:23 +0800 Subject: [PATCH 33/37] Add more circuit break test --- test/circuit_breaker_reload_test.h | 221 +++++++++++++++++++++++++++++ 1 file changed, 221 insertions(+) diff --git a/test/circuit_breaker_reload_test.h b/test/circuit_breaker_reload_test.h index 220c718e..5b63e6b4 100644 --- a/test/circuit_breaker_reload_test.h +++ b/test/circuit_breaker_reload_test.h @@ -359,6 +359,224 @@ void TestReloadDisableThenEnable() { } } +// Regression: a SIGHUP carrying an invalid CB threshold (e.g. +// `consecutive_failure_threshold = 0`) on an EXISTING upstream must +// be hard-rejected. The downgrade-to-warn behavior of the wider +// `Validate()` call would otherwise push the bad value into live +// slices even though startup rejects the same file. +void TestReloadRejectsInvalidCbField() { + std::cout << "\n[TEST] CB Reload: invalid CB tuning is hard-rejected..." 
+ << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + gw.upstreams.push_back( + MakeReloadUpstream("svc", "127.0.0.1", backend_port)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + + // Build an invalid reload — threshold below the [1, 10000] range. + ServerConfig invalid = gw; + invalid.upstreams[0].circuit_breaker.consecutive_failure_threshold = 0; + + bool reload_returned = gateway.Reload(invalid); + // The slice's threshold must NOT have been pushed live. + auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); + auto* slice = cbm->GetHost("svc")->GetSlice(0); + int live_threshold = slice->config().consecutive_failure_threshold; + + bool pass = reload_returned == false && live_threshold == 3; + TestFramework::RecordTest( + "CB Reload: invalid CB tuning is hard-rejected", pass, + pass ? "" : + "reload_returned=" + std::to_string(reload_returned) + + " live_threshold=" + std::to_string(live_threshold) + + " (expected reload=false, threshold=3 unchanged)"); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Reload: invalid CB tuning is hard-rejected", false, e.what()); + } +} + +// Regression: with `dry_run=true`, the CLOSED→OPEN transition callback +// must NOT drain the partition wait queue (shadow-mode contract: log +// would-reject decisions, admit traffic). The breaker's dry_run check +// inside the transition callback covers this; the regression we lock +// in is the log-emitted breadcrumb plus the absence of CHECKOUT_CIRCUIT_OPEN +// to queued waiters. 
+void TestDryRunDoesNotDrainOnTrip() { + std::cout << "\n[TEST] CB Reload: dry-run skips wait-queue drain on trip..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + UpstreamConfig u = MakeReloadUpstream("svc", "127.0.0.1", backend_port); + u.circuit_breaker.dry_run = true; + u.circuit_breaker.consecutive_failure_threshold = 2; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + + auto ring = std::make_shared< + spdlog::sinks::ringbuffer_sink_mt>(1024); + auto logger = logging::Get(); + auto prev_level = logger->level(); + logger->set_level(spdlog::level::debug); + logger->sinks().push_back(ring); + struct SinkGuard { + std::shared_ptr logger; + std::shared_ptr ring; + spdlog::level::level_enum prev_level; + ~SinkGuard() { + auto& sinks = logger->sinks(); + sinks.erase(std::remove(sinks.begin(), sinks.end(), + std::shared_ptr(ring)), + sinks.end()); + logger->set_level(prev_level); + } + } guard{logger, ring, prev_level}; + + int gw_port = gw_runner.GetPort(); + // Trip the breaker via 2 failures. + for (int i = 0; i < 2; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + + bool saw_dryrun_drain_skip = false; + for (const auto& msg : ring->last_formatted()) { + if (msg.find("[dry-run] circuit breaker would drain wait queue") != + std::string::npos) { + saw_dryrun_drain_skip = true; + break; + } + } + + TestFramework::RecordTest( + "CB Reload: dry-run skips wait-queue drain on trip", + saw_dryrun_drain_skip, + saw_dryrun_drain_skip ? 
"" : + "expected '[dry-run] circuit breaker would drain wait queue' log line"); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Reload: dry-run skips wait-queue drain on trip", false, e.what()); + } +} + +// Regression: when `dry_run` flips true→false on a slice that's +// currently OPEN, `Slice::Reload` fires a synthetic OPEN→OPEN +// transition with trigger="dry_run_disabled". The HttpServer-installed +// callback recognizes it and drains the partition queue so shadow-mode +// waiters don't leak through to the upstream once enforcement is back on. +void TestDryRunDisableOnOpenTriggersDrainSignal() { + std::cout << "\n[TEST] CB Reload: dry_run disable on OPEN triggers drain..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + UpstreamConfig u = MakeReloadUpstream("svc", "127.0.0.1", backend_port); + u.circuit_breaker.dry_run = true; + u.circuit_breaker.consecutive_failure_threshold = 2; + u.circuit_breaker.base_open_duration_ms = 60000; // long open window + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Trip the breaker (dry-run still records the trip; state goes OPEN). 
+ for (int i = 0; i < 2; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + + auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); + auto* slice = cbm->GetHost("svc")->GetSlice(0); + bool was_open = slice->CurrentState() == circuit_breaker::State::OPEN; + + auto ring = std::make_shared< + spdlog::sinks::ringbuffer_sink_mt>(1024); + auto logger = logging::Get(); + auto prev_level = logger->level(); + logger->set_level(spdlog::level::debug); + logger->sinks().push_back(ring); + struct SinkGuard { + std::shared_ptr logger; + std::shared_ptr ring; + spdlog::level::level_enum prev_level; + ~SinkGuard() { + auto& sinks = logger->sinks(); + sinks.erase(std::remove(sinks.begin(), sinks.end(), + std::shared_ptr(ring)), + sinks.end()); + logger->set_level(prev_level); + } + } guard{logger, ring, prev_level}; + + // Reload with dry_run=false, everything else same. + ServerConfig disable_dry = gw; + disable_dry.upstreams[0].circuit_breaker.dry_run = false; + gateway.Reload(disable_dry); + std::this_thread::sleep_for(std::chrono::milliseconds(150)); + + // The synthetic-callback fire path emits a slice-side log line. + bool saw_flush_log = false; + for (const auto& msg : ring->last_formatted()) { + if (msg.find("dry_run disabled while OPEN") != std::string::npos && + msg.find("flushing wait queue") != std::string::npos) { + saw_flush_log = true; + break; + } + } + bool live_dry_run = slice->config().dry_run; + bool still_open = slice->CurrentState() == circuit_breaker::State::OPEN; + + bool pass = was_open && !live_dry_run && saw_flush_log && still_open; + TestFramework::RecordTest( + "CB Reload: dry_run disable on OPEN triggers drain", pass, + pass ? 
"" : + "was_open=" + std::to_string(was_open) + + " live_dry_run=" + std::to_string(live_dry_run) + + " saw_flush_log=" + std::to_string(saw_flush_log) + + " still_open=" + std::to_string(still_open)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Reload: dry_run disable on OPEN triggers drain", false, + e.what()); + } +} + void RunAllTests() { std::cout << "\n" << std::string(60, '=') << std::endl; std::cout << "CIRCUIT BREAKER - HOT-RELOAD TESTS" << std::endl; @@ -368,6 +586,9 @@ void RunAllTests() { TestCbOnlyReloadNoRestartWarn(); TestTopologyChangeStillEmitsRestartWarn(); TestReloadDisableThenEnable(); + TestReloadRejectsInvalidCbField(); + TestDryRunDoesNotDrainOnTrip(); + TestDryRunDisableOnOpenTriggersDrainSignal(); } } // namespace CircuitBreakerReloadTests From 3aef3c53a3e75a128cc625bb2ed6b0bc07e5627b Mon Sep 17 00:00:00 2001 From: mwfj Date: Wed, 15 Apr 2026 11:58:26 +0800 Subject: [PATCH 34/37] Add more circuit break test --- .../circuit_breaker/circuit_breaker_slice.h | 9 ++- server/circuit_breaker_slice.cc | 64 ++++++++++++------- server/http_server.cc | 21 +++--- 3 files changed, 61 insertions(+), 33 deletions(-) diff --git a/include/circuit_breaker/circuit_breaker_slice.h b/include/circuit_breaker/circuit_breaker_slice.h index 1c96dcd0..d6899bae 100644 --- a/include/circuit_breaker/circuit_breaker_slice.h +++ b/include/circuit_breaker/circuit_breaker_slice.h @@ -246,7 +246,14 @@ class CircuitBreakerSlice { StateTransitionCallback transition_cb_; // Internal transitions (dispatcher-thread). 
- void TripClosedToOpen(const char* trigger); + // `now` is threaded through from ReportFailure so the window_total / + // window_fail_rate fields in the trip log reflect the SAME sliding-window + // view that ShouldTripClosed just saw — a fresh Now() here can cross a + // bucket boundary (especially with window_seconds=1 or under a dispatcher + // stall) and trigger Window::Advance's full-reset, zeroing the bucket that + // holds the failure which actually tripped the breaker. + void TripClosedToOpen(const char* trigger, + std::chrono::steady_clock::time_point now); void TransitionOpenToHalfOpen(); void TransitionHalfOpenToClosed(); void TripHalfOpenToOpen(const char* trigger); diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index 9dad6a31..b24f352a 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -80,8 +80,15 @@ bool CircuitBreakerSlice::ShouldTripClosed( (static_cast(config_.failure_rate_threshold) * total); } -void CircuitBreakerSlice::TripClosedToOpen(const char* trigger) { - auto now = Now(); +void CircuitBreakerSlice::TripClosedToOpen( + const char* trigger, std::chrono::steady_clock::time_point now) { + // `now` is the same time_point the caller (ReportFailure) passed to + // AddFailure/ShouldTripClosed — reusing it keeps the trip log's + // window_total/window_fail_rate consistent with the rate check that + // fired the trip. Calling Now() fresh here would risk crossing a + // bucket boundary and logging window_total=0 for the very failure + // that tripped the breaker. + // // Capture pre-reset observability context BEFORE mutating state. // §11.1 log format asks for consecutive_failures + window_total + // window_fail_rate at the trip event so operators can distinguish a @@ -479,7 +486,9 @@ void CircuitBreakerSlice::ReportFailure(FailureKind kind, bool probe, const char* trigger = (consecutive_failures_ >= config_.consecutive_failure_threshold) ? 
"consecutive" : "rate"; - TripClosedToOpen(trigger); + // Thread `now` through so the trip log's window stats reflect the + // same view ShouldTripClosed just used. + TripClosedToOpen(trigger, now); } } @@ -615,32 +624,41 @@ void CircuitBreakerSlice::Reload(const CircuitBreakerConfig& new_config) { new_config.consecutive_failure_threshold, enabled_changed ? " (enabled toggled — state reset to CLOSED)" : ""); - // dry_run true→false on a slice that's STILL OPEN: enforcement is - // back on, but the OPEN→OPEN intra-state config edit doesn't fire - // any natural transition callback. The pool partition may have - // queued waiters from the shadow-mode period (the original - // CLOSED→OPEN drain was skipped because dry_run was true at the - // time). Without flushing them now, those queued requests will - // eventually dispatch to the unhealthy upstream once a pool slot - // frees, defeating the just-re-enabled enforcement. + // dry_run true→false on a slice that's still rejecting traffic + // (OPEN or HALF_OPEN): enforcement is back on, but the same-state + // intra-config edit doesn't fire any natural transition callback. + // The pool partition may have queued waiters from the shadow-mode + // period — drain reasons per state: + // OPEN: the original CLOSED→OPEN drain was skipped because + // dry_run was true at trip time, so every request that arrived + // during the open window was admitted and may be queued. + // HALF_OPEN: under dry_run the slice log-but-admits both probe- + // budget-exhausted (half_open_full) and saw-failure short- + // circuits (half_open_recovery_failing). Those requests sit in + // the pool wait queue even though enforcement would reject + // them. Without a drain they reach the unhealthy upstream once + // a pool slot frees, defeating re-enabled enforcement. // - // Signal the host via a synthetic OPEN→OPEN transition callback + // Signal the host via a synthetic same-state transition callback // with trigger="dry_run_disabled". 
The HttpServer-installed - // callback recognizes this special trigger and drains the - // partition queue. Real state transitions never reuse the same - // old/new state with this trigger string, so there's no overlap. + // callback recognizes this trigger and drains the partition + // queue. Real state transitions never reuse old==new with this + // trigger string, so there's no overlap with normal signals. // - // Only fire when we KNOW the state is still OPEN — the + // Only fire when we KNOW the state is still rejecting — the // enabled-toggle branch above resets to CLOSED, in which case the // drain is unnecessary (no enforcement to re-engage). State is // dispatcher-thread-only here; a plain load is sufficient. - if (old_dry_run && !new_config.dry_run && - state_.load(std::memory_order_acquire) == State::OPEN && - transition_cb_) { - logging::Get()->info( - "circuit breaker dry_run disabled while OPEN {} — " - "flushing wait queue", host_label_); - transition_cb_(State::OPEN, State::OPEN, "dry_run_disabled"); + if (old_dry_run && !new_config.dry_run && transition_cb_) { + State s = state_.load(std::memory_order_acquire); + if (s == State::OPEN || s == State::HALF_OPEN) { + const char* state_label = + (s == State::OPEN) ? "OPEN" : "HALF_OPEN"; + logging::Get()->info( + "circuit breaker dry_run disabled while {} {} — " + "flushing wait queue", state_label, host_label_); + transition_cb_(s, s, "dry_run_disabled"); + } } } diff --git a/server/http_server.cc b/server/http_server.cc index 4275aad1..e3c423fb 100644 --- a/server/http_server.cc +++ b/server/http_server.cc @@ -483,17 +483,19 @@ void HttpServer::MarkServerReady() { // pools can leave them queued. Without // draining they eventually dispatch to a // known-bad upstream. 
- // OPEN→OPEN with trigger="dry_run_disabled" - // : synthetic signal from + // OPEN→OPEN or HALF_OPEN→HALF_OPEN with + // trigger="dry_run_disabled" : + // synthetic signal from // CircuitBreakerSlice::Reload when // dry_run flips true→false on a slice - // that's still OPEN. The earlier trip - // skipped the drain (shadow mode); now - // enforcement is back on, queued + // that's still rejecting traffic. The + // earlier trip / HALF_OPEN rejects + // skipped enforcement (shadow mode); + // now enforcement is back on, queued // waiters from that period must be // flushed before the pool services - // them. Real transitions never use this - // trigger string with old==new==OPEN, + // them. Real transitions never use + // this trigger string with old==new, // so there's no overlap with normal // state-machine signals. const bool normal_trip = @@ -501,8 +503,9 @@ void HttpServer::MarkServerReady() { (old_s == circuit_breaker::State::CLOSED || old_s == circuit_breaker::State::HALF_OPEN); const bool dry_run_disable_drain = - old_s == circuit_breaker::State::OPEN && - new_s == circuit_breaker::State::OPEN && + old_s == new_s && + (old_s == circuit_breaker::State::OPEN || + old_s == circuit_breaker::State::HALF_OPEN) && trigger != nullptr && std::strcmp(trigger, "dry_run_disabled") == 0; From 7b356806a1631533bab59449ef83a0a0f6ac6a44 Mon Sep 17 00:00:00 2001 From: mwfj Date: Wed, 15 Apr 2026 12:52:28 +0800 Subject: [PATCH 35/37] Add more circuit break test --- server/circuit_breaker_slice.cc | 70 ++++++++++++++++++--------------- server/http_server.cc | 25 ++++++------ server/proxy_transaction.cc | 31 ++++++++++----- 3 files changed, 73 insertions(+), 53 deletions(-) diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index b24f352a..e6bd1c93 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -624,41 +624,47 @@ void CircuitBreakerSlice::Reload(const CircuitBreakerConfig& new_config) { 
new_config.consecutive_failure_threshold, enabled_changed ? " (enabled toggled — state reset to CLOSED)" : ""); - // dry_run true→false on a slice that's still rejecting traffic - // (OPEN or HALF_OPEN): enforcement is back on, but the same-state - // intra-config edit doesn't fire any natural transition callback. - // The pool partition may have queued waiters from the shadow-mode - // period — drain reasons per state: - // OPEN: the original CLOSED→OPEN drain was skipped because - // dry_run was true at trip time, so every request that arrived - // during the open window was admitted and may be queued. - // HALF_OPEN: under dry_run the slice log-but-admits both probe- - // budget-exhausted (half_open_full) and saw-failure short- - // circuits (half_open_recovery_failing). Those requests sit in - // the pool wait queue even though enforcement would reject - // them. Without a drain they reach the unhealthy upstream once - // a pool slot frees, defeating re-enabled enforcement. + // dry_run true→false on a slice that's STILL OPEN: enforcement is + // back on, but the OPEN→OPEN intra-state config edit doesn't fire + // any natural transition callback. The pool partition may have + // queued waiters from the shadow-mode period (the original + // CLOSED→OPEN drain was skipped because dry_run was true at the + // time). Without flushing them now, those queued requests will + // eventually dispatch to the unhealthy upstream once a pool slot + // frees, defeating the just-re-enabled enforcement. // - // Signal the host via a synthetic same-state transition callback + // Signal the host via a synthetic OPEN→OPEN transition callback // with trigger="dry_run_disabled". The HttpServer-installed - // callback recognizes this trigger and drains the partition - // queue. Real state transitions never reuse old==new with this - // trigger string, so there's no overlap with normal signals. + // callback recognizes this special trigger and drains the + // partition queue. 
Real state transitions never reuse the same + // old/new state with this trigger string, so there's no overlap. // - // Only fire when we KNOW the state is still rejecting — the - // enabled-toggle branch above resets to CLOSED, in which case the - // drain is unnecessary (no enforcement to re-engage). State is - // dispatcher-thread-only here; a plain load is sufficient. - if (old_dry_run && !new_config.dry_run && transition_cb_) { - State s = state_.load(std::memory_order_acquire); - if (s == State::OPEN || s == State::HALF_OPEN) { - const char* state_label = - (s == State::OPEN) ? "OPEN" : "HALF_OPEN"; - logging::Get()->info( - "circuit breaker dry_run disabled while {} {} — " - "flushing wait queue", state_label, host_label_); - transition_cb_(s, s, "dry_run_disabled"); - } + // IMPORTANT — why this does NOT fire in HALF_OPEN: HALF_OPEN + // queues can mix two admission kinds that share a partition wait + // slot but differ on slice bookkeeping: + // (a) Valid probes admitted within permitted_half_open_calls — + // admission_generation_ = current halfopen_gen_, holding a + // real half_open_inflight_/admitted_ slot. These drive + // recovery on a healthy upstream and must NOT be disrupted + // by an operator config flip. + // (b) Dry-run-admitted shadow requests (half_open_full / + // half_open_recovery_failing paths) — admission_generation_ + // = 0 (RejectWithLog sentinel). Their outcomes drop as + // stale-gen on report, so they never influence the slice's + // state machine and are bounded by pool queue size. + // DrainWaitQueueOnTrip is partition-wide and can't tell (a) from + // (b); draining would 503 valid probes (delaying/preventing + // recovery) to also drop the harmless (b). We accept the small + // bounded leak of (b) as the lesser evil. + // + // State is dispatcher-thread-only here; a plain load is sufficient. 
+ if (old_dry_run && !new_config.dry_run && + state_.load(std::memory_order_acquire) == State::OPEN && + transition_cb_) { + logging::Get()->info( + "circuit breaker dry_run disabled while OPEN {} — " + "flushing wait queue", host_label_); + transition_cb_(State::OPEN, State::OPEN, "dry_run_disabled"); } } diff --git a/server/http_server.cc b/server/http_server.cc index e3c423fb..67575de7 100644 --- a/server/http_server.cc +++ b/server/http_server.cc @@ -483,29 +483,30 @@ void HttpServer::MarkServerReady() { // pools can leave them queued. Without // draining they eventually dispatch to a // known-bad upstream. - // OPEN→OPEN or HALF_OPEN→HALF_OPEN with - // trigger="dry_run_disabled" : - // synthetic signal from + // OPEN→OPEN with trigger="dry_run_disabled" + // : synthetic signal from // CircuitBreakerSlice::Reload when // dry_run flips true→false on a slice - // that's still rejecting traffic. The - // earlier trip / HALF_OPEN rejects - // skipped enforcement (shadow mode); - // now enforcement is back on, queued + // that's still OPEN. The earlier trip + // skipped the drain (shadow mode); now + // enforcement is back on, queued // waiters from that period must be // flushed before the pool services - // them. Real transitions never use - // this trigger string with old==new, + // them. Real transitions never use this + // trigger string with old==new==OPEN, // so there's no overlap with normal // state-machine signals. + // (The slice intentionally does NOT + // fire this signal in HALF_OPEN — see + // CircuitBreakerSlice::Reload for why + // valid probes must not be flushed.) 
const bool normal_trip = new_s == circuit_breaker::State::OPEN && (old_s == circuit_breaker::State::CLOSED || old_s == circuit_breaker::State::HALF_OPEN); const bool dry_run_disable_drain = - old_s == new_s && - (old_s == circuit_breaker::State::OPEN || - old_s == circuit_breaker::State::HALF_OPEN) && + old_s == circuit_breaker::State::OPEN && + new_s == circuit_breaker::State::OPEN && trigger != nullptr && std::strcmp(trigger, "dry_run_disabled") == 0; diff --git a/server/proxy_transaction.cc b/server/proxy_transaction.cc index 0e1e5a6f..d3e8bd82 100644 --- a/server/proxy_transaction.cc +++ b/server/proxy_transaction.cc @@ -354,20 +354,33 @@ void ProxyTransaction::OnCheckoutError(int error_code) { static constexpr int CIRCUIT_OPEN = -6; if (error_code == CIRCUIT_OPEN) { - // Drain path: breaker tripped while this transaction was queued - // Do NOT Report to the slice — - // our own reject must not feed back into the failure math. Emit - // the §12.1 circuit-open response directly. + // Drain path: breaker tripped while this transaction was queued. + // Do NOT Report success/failure to the slice — our own reject + // must not feed back into the failure math. Emit the §12.1 + // circuit-open response directly. logging::Get()->info( "ProxyTransaction checkout drained by circuit breaker " "client_fd={} service={}", client_fd_, service_name_); + // Neutral-release the slice admission instead of just clearing + // admission_generation_. Three drain paths reach here: + // CLOSED→OPEN : closed_gen_ was bumped by the trip; our + // generation is now stale → ReportNeutral + // drops as stale-gen. No state mutation. Safe. + // HALF_OPEN→OPEN : halfopen_gen_ was bumped by the trip AND + // half_open_inflight_/admitted_ reset to 0 by + // TransitionOpenToHalfOpen's sibling path → + // ReportNeutral drops as stale-gen. Safe. 
+ // (Any future same-cycle drain without a generation bump): + // admission_generation_ is still current → + // ReportNeutral correctly returns the slot, + // preventing half_open_inflight_/admitted_ + // from leaking and wedging the slice in + // half_open_full until the next reset. + // ReleaseBreakerAdmissionNeutral clears admission_generation_ + // internally, so Cleanup/destructor won't double-report. + ReleaseBreakerAdmissionNeutral(); DeliverResponse(MakeCircuitOpenResponse()); - // Clear admission_generation_ so Cleanup / destructor doesn't - // double-report. The admission was already fire-and-forget — - // slice-side bookkeeping is intact (the drain itself doesn't - // touch inflight counters because the breaker didn't admit). - admission_generation_ = 0; return; } From f54033c066e55b7172074ea23269da049b51e4c4 Mon Sep 17 00:00:00 2001 From: mwfj Date: Wed, 15 Apr 2026 13:30:38 +0800 Subject: [PATCH 36/37] Add more circuit break test --- server/circuit_breaker_host.cc | 49 ++++++++++++++++++++++++++++++++-- server/http_server.cc | 31 ++++++++++++++++++++- 2 files changed, 77 insertions(+), 3 deletions(-) diff --git a/server/circuit_breaker_host.cc b/server/circuit_breaker_host.cc index 4523d3be..4e2640ae 100644 --- a/server/circuit_breaker_host.cc +++ b/server/circuit_breaker_host.cc @@ -2,6 +2,8 @@ #include "dispatcher.h" #include "log/logger.h" +#include + namespace circuit_breaker { CircuitBreakerHost::CircuitBreakerHost(std::string service_name, @@ -104,11 +106,23 @@ void CircuitBreakerHost::Reload( retry_budget_->Reload(new_config.retry_budget_percent, new_config.retry_budget_min_concurrency); - // Enqueue per-slice Reload on each owning dispatcher. The slice is + // Apply per-slice Reload on each owning dispatcher. The slice is // dispatcher-thread-local for mutation, so the config swap must // happen there. 
Passing slice as raw pointer is safe: slices_ is // owned by `this` (the host), which outlives the manager's reload // (enforced by CircuitBreakerManager's lifetime). + // + // Synchronize: wait for every enqueued slice Reload to actually run + // before returning. Without this, HttpServer::Reload could return + // "success" while requests already queued on a dispatcher still run + // with the OLD enabled/dry_run/thresholds — a SIGHUP flipping a + // tripped breaker to disabled (or to dry_run) could still emit hard + // 503s or enforce the old retry budget for a brief window after the + // operator sees reload-ok. Dispatcher-local inline on the current + // thread avoids self-deadlock if Reload is ever called from a + // dispatcher thread. + std::vector> pending; + pending.reserve(slices_.size()); for (size_t i = 0; i < slices_.size(); ++i) { CircuitBreakerSlice* slice = slices_[i].get(); auto& dispatcher = dispatchers[i]; @@ -118,11 +132,42 @@ void CircuitBreakerHost::Reload( service_name_, host_, i); continue; } - dispatcher->EnQueue([slice, new_config]() { + if (dispatcher->is_on_loop_thread()) { + // Caller IS this dispatcher — apply inline to preserve + // dispatcher-thread-local invariant without self-enqueueing + // (which would only run after this frame returns, defeating + // the sync contract). No future to wait on for this slice. + slice->Reload(new_config); + continue; + } + auto promise = std::make_shared>(); + pending.push_back(promise->get_future()); + dispatcher->EnQueue([slice, new_config, promise]() { slice->Reload(new_config); + promise->set_value(); }); } + // Bounded wait: slice Reload is trivial (config copy + optional + // synthetic transition callback), so each dispatcher only needs one + // event-loop iteration to drain. 
A 2s ceiling protects callers from + // a stalled / stopping dispatcher — if the wait times out we log and + // proceed; the remaining slice(s) will pick up the new config when + // the queued task eventually runs (via the shared_ptr-captured + // new_config copy), so we never lose an edit — just delay its visibility. + const auto deadline = + std::chrono::steady_clock::now() + std::chrono::seconds(2); + for (auto& fut : pending) { + if (fut.wait_until(deadline) != std::future_status::ready) { + logging::Get()->warn( + "CircuitBreakerHost::Reload({}:{}) timed out waiting for " + "slice apply — new config will be applied when the " + "dispatcher drains", service_name_, host_); + break; // No benefit to waiting out the remaining futures + // after the first timeout — they share the deadline. + } + } + // Save the new config for future Snapshot() / construction-like // operations. Other threads never read config_ directly. config_ = new_config; diff --git a/server/http_server.cc b/server/http_server.cc index 67575de7..f9a36bf5 100644 --- a/server/http_server.cc +++ b/server/http_server.cc @@ -3853,7 +3853,36 @@ bool HttpServer::Reload(const ServerConfig& new_config) { // When topology MATCHES (the common case, including CB-only // edits), adopt the new snapshot as the fresh baseline so CB- // field edits persist for later reload diffs. - if (new_config.upstreams != upstream_configs_) { + // + // Compare as name-keyed maps rather than vectors: live pools and + // CircuitBreakerManager are both keyed by upstream name, so a pure + // reorder of otherwise-identical entries is NOT a topology change. + // Vector equality would fire a spurious "restart required" warning + // and skip the upstream_configs_ update, leaving every subsequent + // breaker-only reload on that reordered file forever looking like a + // topology change. 
UpstreamConfig::operator== already excludes the + // live-reloadable `circuit_breaker` field, so map equality reflects + // the true restart-vs-live partition. Duplicate names were rejected + // upstream by ValidateHotReloadable, so the map conversion is + // lossless here. + auto by_name = [](const std::vector& v) { + std::map m; + for (const auto& u : v) m[u.name] = &u; + return m; + }; + const auto old_map = by_name(upstream_configs_); + const auto new_map = by_name(new_config.upstreams); + bool topology_match = old_map.size() == new_map.size(); + if (topology_match) { + for (const auto& entry : old_map) { + auto it = new_map.find(entry.first); + if (it == new_map.end() || *entry.second != *it->second) { + topology_match = false; + break; + } + } + } + if (!topology_match) { logging::Get()->warn("Reload: upstream topology changes require a " "restart to take effect (circuit-breaker " "field edits, if any, were applied live)"); From 79f91ed95cbb25c098d3ccdcf51ee25c4eeb9ee6 Mon Sep 17 00:00:00 2001 From: mwfj Date: Wed, 15 Apr 2026 15:43:11 +0800 Subject: [PATCH 37/37] Fix review comment --- docs/architecture.md | 2 +- docs/circuit_breaker.md | 2 -- server/proxy_transaction.cc | 3 +-- test/circuit_breaker_test.h | 33 ++++++++++++++++----------------- test/route_test.h | 2 +- 5 files changed, 19 insertions(+), 23 deletions(-) diff --git a/docs/architecture.md b/docs/architecture.md index 5f537bcf..8f990e8e 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -126,7 +126,7 @@ HttpServer - **Synchronous LRU eviction on insert** — `FindOrCreate` evicts LRU tail before creating a new entry if the shard is at capacity, guaranteeing `max_entries` is honored even under high-cardinality bursts - **Disable-first / enable-last reload ordering** — ensures no request can observe `enabled=true` with the previous (stale) zone list during a `(false,[])→(true,[Z])` transition -See `docs/configuration.md` for the full config reference and 
`.claude/documents/features/RATE_LIMITING.md` for implementation internals. +See `docs/configuration.md` for the full config reference. ## Memory Management diff --git a/docs/circuit_breaker.md b/docs/circuit_breaker.md index ef3a5ef0..64743d77 100644 --- a/docs/circuit_breaker.md +++ b/docs/circuit_breaker.md @@ -145,5 +145,3 @@ Topology edits (`host`, `port`, `pool.*`, `proxy.*`, `tls.*`) still require a re - **Generation tokens.** Every admission is stamped with a per-domain generation counter (`closed_gen_` or `halfopen_gen_`, depending on state). `Report*` drops stale-generation completions so pre-transition requests can't pollute a fresh cycle. Window resizes bump only `closed_gen_` so in-flight probes aren't stranded. - **Retry budget CAS.** `TryConsumeRetry` uses `compare_exchange_weak` to serialize concurrent retry admissions. A plain load-check-add would let N callers all observe `current < cap` and all increment past the cap. - **Non-retry denominator.** The budget base is `in_flight - retries_in_flight`, not raw `in_flight`. Retries count in both terms but subtract out here so admitting a retry doesn't inflate its own cap. - -For the full design document (motivations, trade-offs, failure modes, revision history, test strategy), see [.claude/documents/design/CIRCUIT_BREAKER_DESIGN.md](../.claude/documents/design/CIRCUIT_BREAKER_DESIGN.md). diff --git a/server/proxy_transaction.cc b/server/proxy_transaction.cc index d3e8bd82..a427f629 100644 --- a/server/proxy_transaction.cc +++ b/server/proxy_transaction.cc @@ -933,8 +933,7 @@ void ProxyTransaction::Cancel() { // mid-request, counting that as an upstream-health failure would // trip the breaker against a backend that may be perfectly healthy // (browser cancels, user-initiated timeouts, etc. are all common - // causes). The reviewer guidance is explicit: client-initiated - // aborts must be neutral from the breaker's perspective. + // causes). 
Client-initiated aborts must be neutral from the breaker's perspective. // // Trade-off: in HALF_OPEN, ReportNeutral on a probe decrements // both inflight and admitted, so a cancelled probe makes the slot diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h index bed54da0..b5b9da95 100644 --- a/test/circuit_breaker_test.h +++ b/test/circuit_breaker_test.h @@ -579,7 +579,7 @@ void TestSuccessClearsConsecutive() { } // ============================================================================ -// Regression tests — critical bugs caught in code review +// Regression tests // ============================================================================ // BUG: late non-probe failure after trip re-entered TripClosedToOpen, inflating @@ -775,7 +775,7 @@ void TestHalfOpenFullCounterSeparate() { } } -// BUG (review round 2, P2): Reload preserved stale state across enabled +// Reload preserved stale state across enabled // toggles. Disabling while OPEN and re-enabling later resumed the OPEN state, // rejecting requests despite an explicit operator off→on cycle. Disabling // after accumulated consecutive failures would re-trip on the very next @@ -830,8 +830,7 @@ void TestReloadResetsStateOnEnabledToggleWhileOpen() { } } -// BUG (review round 2, P2, variant): if disable happens while -// consecutive_failures_ has accumulated but not yet tripped, re-enable would +// If disable happens while consecutive_failures_ has accumulated but not yet tripped, re-enable would // inherit that count and trip early on the next failure. void TestReloadResetsConsecutiveFailuresOnEnabledToggle() { std::cout << "\n[TEST] CB: reload clears consecutive_failures on enable toggle..." 
@@ -909,7 +908,7 @@ void TestReloadThresholdChangePreservesState() { } } -// BUG (review round 2, P3): saw_failure short-circuit incorrectly bumped the +// saw_failure short-circuit incorrectly bumped the // HALF_OPEN_FULL counter, polluting dashboards that need to distinguish // "probing, no capacity left" from "recovery attempt is failing". void TestSawFailureDoesNotBumpHalfOpenFullCounter() { @@ -956,7 +955,7 @@ void TestSawFailureDoesNotBumpHalfOpenFullCounter() { } } -// BUG (review round 3, P2): TransitionOpenToHalfOpen deliberately left +// TransitionOpenToHalfOpen deliberately left // `open_until_steady_ns_` populated, violating the documented OpenUntil() // contract ("zero when not OPEN"). A consumer computing Retry-After // from a HALF_OPEN slice would compute (stale_deadline - now), which is @@ -1002,7 +1001,7 @@ void TestOpenUntilZeroWhenHalfOpen() { } } -// BUG (review round 3, P1): Reload reset the state on enabled toggle but +// Reload reset the state on enabled toggle but // gave Report* no way to distinguish pre-toggle admissions from post-toggle // ones. Stale completions then polluted the fresh CLOSED cycle. Fixed with // a generation token captured at admission and checked at report. @@ -1122,7 +1121,7 @@ void TestStaleGenerationReportsDroppedAcrossStateTransitions() { } } -// BUG (review round 4, P2): Reload that resizes the rolling window without +// Reload that resizes the rolling window without // toggling enabled cleared the window buckets but left generation_ unchanged. // Late reports from pre-reload admissions would carry the still-current // generation, pass the guard, and re-populate the freshly empty window — @@ -1239,7 +1238,7 @@ void TestThresholdOnlyReloadDoesNotAdvanceGeneration() { } } -// BUG (review round 5, P1): Reload with window_seconds change while the +// Reload with window_seconds change while the // slice is HALF_OPEN used to bump the single `generation_`, invalidating // every in-flight probe. 
Those probes' late Report* calls then dropped // WITHOUT decrementing half_open_inflight_, wedging the slice in HALF_OPEN @@ -1377,7 +1376,7 @@ void TestWindowResizeStillInvalidatesClosedAdmissions() { } } -// BUG (review round 7, P2): Reload() lowering permitted_half_open_calls +// Reload() lowering permitted_half_open_calls // while a HALF_OPEN cycle is active could close the breaker early and // discard failures from already-admitted probes. // @@ -1463,7 +1462,7 @@ void TestHalfOpenBudgetFrozenAcrossReload() { } } -// BUG (review round 6, P2): Reload with window_seconds change preserved +// Reload with window_seconds change preserved // consecutive_failures_ while bumping closed_gen_. Pre-reload CLOSED // reports are correctly blocked (stale gen), but they can no longer // clear or advance consecutive_failures_ either. The counter becomes an @@ -1538,7 +1537,7 @@ void TestWindowResizeResetConsecutiveFailures() { } } -// BUG (review round 9, P2-1): ReportFailure captured Now() separately in +// ReportFailure captured Now() separately in // AddFailure() and ShouldTripClosed()'s internal TotalCount/FailureCount // calls. If a second boundary elapsed between the two calls, Advance() could // wipe the just-recorded failure — with window_seconds=1, the 1-second delta @@ -1602,7 +1601,7 @@ void TestReportFailureUsesOneTimestampAcrossTripEval() { } } -// BUG (review round 8, P2): CircuitBreakerWindow's constructor allocated +// CircuitBreakerWindow's constructor allocated // `max(1, window_seconds)` buckets but stored the RAW window_seconds_ value. // Programmatic callers bypassing ConfigLoader::Validate() (tests, future // direct users) that passed window_seconds <= 0 would trigger BucketIndex's @@ -1639,7 +1638,7 @@ void TestWindowNonPositiveWindowSizeClamp() { } } -// BUG (review round 9, P3): CircuitBreakerSlice copied permitted_half_open_calls +// CircuitBreakerSlice copied permitted_half_open_calls // into the HALF_OPEN snapshot verbatim. 
For programmatic callers bypassing // ConfigLoader::Validate() (same class as the window ctor clamp), a zero or // negative budget would permanently wedge the breaker in HALF_OPEN: @@ -1703,7 +1702,7 @@ void TestHalfOpenClampsNonPositiveProbeBudget() { } } -// BUG (review round 10, P1): TryAcquire gated HALF_OPEN admission on +// TryAcquire gated HALF_OPEN admission on // half_open_inflight_, so a probe slot was reused once an earlier probe // completed. With permitted_half_open_calls=2: // @@ -1788,7 +1787,7 @@ void TestHalfOpenDoesNotReuseProbeSlots() { } } -// BUG (review round 11, P1): Admission contract has ReportSuccess and +// Admission contract has ReportSuccess and // ReportFailure but no path for probes that complete without touching the // upstream (POOL_EXHAUSTED after probe admission, shutdown, client // disconnect, PARSE_ERROR). Following the §7 "don't report these as @@ -1921,7 +1920,7 @@ void TestReportNeutralLastProbeAfterFailureReTrips() { } } -// BUG (review round 12, P2): ComputeOpenDuration read base/max durations +// ComputeOpenDuration read base/max durations // straight from config_, so a programmatic caller bypassing // ConfigLoader::Validate() with base_open_duration_ms <= 0 or max < base // would compute scaled_ms <= 0. open_until = now + 0 → next TryAcquire diff --git a/test/route_test.h b/test/route_test.h index c7b86aa4..31cdfe7d 100644 --- a/test/route_test.h +++ b/test/route_test.h @@ -1571,7 +1571,7 @@ void TestRouterProxyCompanionYieldsForMarkedMethod() { } } -// P2 (latest review): per-pattern paired_with_get. When a proxy +// Per-pattern paired_with_get. When a proxy // registers both a companion pattern and a catch-all pattern, the // per-(method,pattern) async-conflict filter may drop GET on ONE // pattern while keeping it on the OTHER. MarkProxyDefaultHead must