From 49a2ae9ce96fc0a43f12e433399a0a497a794f1a Mon Sep 17 00:00:00 2001 From: mwfj Date: Mon, 13 Apr 2026 16:09:33 +0800 Subject: [PATCH 01/37] Support Circut break Phase1-2 --- Makefile | 8 +- .../circuit_breaker/circuit_breaker_slice.h | 115 ++++ .../circuit_breaker/circuit_breaker_state.h | 66 ++ .../circuit_breaker/circuit_breaker_window.h | 59 ++ include/config/server_config.h | 47 +- server/circuit_breaker_slice.cc | 277 ++++++++ server/circuit_breaker_window.cc | 81 +++ server/config_loader.cc | 111 +++ test/circuit_breaker_test.h | 647 ++++++++++++++++++ test/config_test.h | 221 ++++++ test/run_test.cc | 7 + 11 files changed, 1636 insertions(+), 3 deletions(-) create mode 100644 include/circuit_breaker/circuit_breaker_slice.h create mode 100644 include/circuit_breaker/circuit_breaker_state.h create mode 100644 include/circuit_breaker/circuit_breaker_window.h create mode 100644 server/circuit_breaker_slice.cc create mode 100644 server/circuit_breaker_window.cc create mode 100644 test/circuit_breaker_test.h diff --git a/Makefile b/Makefile index 68d5f781..8f4ec3f2 100644 --- a/Makefile +++ b/Makefile @@ -76,6 +76,9 @@ UPSTREAM_SRCS = $(SERVER_DIR)/upstream_connection.cc $(SERVER_DIR)/pool_partitio # Rate limit layer sources RATE_LIMIT_SRCS = $(SERVER_DIR)/token_bucket.cc $(SERVER_DIR)/rate_limit_zone.cc $(SERVER_DIR)/rate_limiter.cc +# Circuit breaker layer sources +CIRCUIT_BREAKER_SRCS = $(SERVER_DIR)/circuit_breaker_window.cc $(SERVER_DIR)/circuit_breaker_slice.cc + # CLI layer sources CLI_SRCS = $(SERVER_DIR)/cli_parser.cc $(SERVER_DIR)/signal_handler.cc $(SERVER_DIR)/pid_file.cc $(SERVER_DIR)/daemonizer.cc @@ -122,7 +125,7 @@ NGHTTP2_SRC = $(THIRD_PARTY_DIR)/nghttp2/nghttp2_alpn.c \ NGHTTP2_OBJ = $(NGHTTP2_SRC:.c=.o) # Server library sources (shared between test and production binaries) -LIB_SRCS = $(REACTOR_SRCS) $(NETWORK_SRCS) $(SERVER_SRCS) $(THREAD_POOL_SRCS) $(FOUNDATION_SRCS) $(HTTP_SRCS) $(HTTP2_SRCS) $(WS_SRCS) $(TLS_SRCS) $(UPSTREAM_SRCS) 
$(RATE_LIMIT_SRCS) $(CLI_SRCS) $(UTIL_SRCS) +LIB_SRCS = $(REACTOR_SRCS) $(NETWORK_SRCS) $(SERVER_SRCS) $(THREAD_POOL_SRCS) $(FOUNDATION_SRCS) $(HTTP_SRCS) $(HTTP2_SRCS) $(WS_SRCS) $(TLS_SRCS) $(UPSTREAM_SRCS) $(RATE_LIMIT_SRCS) $(CIRCUIT_BREAKER_SRCS) $(CLI_SRCS) $(UTIL_SRCS) # Test binary sources TEST_SRCS = $(LIB_SRCS) $(TEST_DIR)/test_framework.cc $(TEST_DIR)/run_test.cc @@ -142,11 +145,12 @@ WS_HEADERS = $(LIB_DIR)/ws/websocket_connection.h $(LIB_DIR)/ws/websocket_frame. TLS_HEADERS = $(LIB_DIR)/tls/tls_context.h $(LIB_DIR)/tls/tls_connection.h $(LIB_DIR)/tls/tls_client_context.h UPSTREAM_HEADERS = $(LIB_DIR)/upstream/upstream_manager.h $(LIB_DIR)/upstream/upstream_host_pool.h $(LIB_DIR)/upstream/pool_partition.h $(LIB_DIR)/upstream/upstream_connection.h $(LIB_DIR)/upstream/upstream_lease.h $(LIB_DIR)/upstream/upstream_http_codec.h $(LIB_DIR)/upstream/http_request_serializer.h $(LIB_DIR)/upstream/header_rewriter.h $(LIB_DIR)/upstream/retry_policy.h $(LIB_DIR)/upstream/proxy_transaction.h $(LIB_DIR)/upstream/proxy_handler.h $(LIB_DIR)/upstream/upstream_response.h $(LIB_DIR)/upstream/upstream_callbacks.h RATE_LIMIT_HEADERS = $(LIB_DIR)/rate_limit/token_bucket.h $(LIB_DIR)/rate_limit/rate_limit_zone.h $(LIB_DIR)/rate_limit/rate_limiter.h +CIRCUIT_BREAKER_HEADERS = $(LIB_DIR)/circuit_breaker/circuit_breaker_state.h $(LIB_DIR)/circuit_breaker/circuit_breaker_window.h $(LIB_DIR)/circuit_breaker/circuit_breaker_slice.h CLI_HEADERS = $(LIB_DIR)/cli/cli_parser.h $(LIB_DIR)/cli/signal_handler.h $(LIB_DIR)/cli/pid_file.h $(LIB_DIR)/cli/version.h $(LIB_DIR)/cli/daemonizer.h TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h 
$(TEST_DIR)/proxy_test.h # All headers combined -HEADERS = $(CORE_HEADERS) $(CALLBACK_HEADERS) $(REACTOR_HEADERS) $(NETWORK_HEADERS) $(SERVER_HEADERS) $(THREAD_POOL_HEADERS) $(UTIL_HEADERS) $(FOUNDATION_HEADERS) $(HTTP_HEADERS) $(HTTP2_HEADERS) $(WS_HEADERS) $(TLS_HEADERS) $(UPSTREAM_HEADERS) $(RATE_LIMIT_HEADERS) $(CLI_HEADERS) $(TEST_HEADERS) +HEADERS = $(CORE_HEADERS) $(CALLBACK_HEADERS) $(REACTOR_HEADERS) $(NETWORK_HEADERS) $(SERVER_HEADERS) $(THREAD_POOL_HEADERS) $(UTIL_HEADERS) $(FOUNDATION_HEADERS) $(HTTP_HEADERS) $(HTTP2_HEADERS) $(WS_HEADERS) $(TLS_HEADERS) $(UPSTREAM_HEADERS) $(RATE_LIMIT_HEADERS) $(CIRCUIT_BREAKER_HEADERS) $(CLI_HEADERS) $(TEST_HEADERS) # Default target .DEFAULT_GOAL := all diff --git a/include/circuit_breaker/circuit_breaker_slice.h b/include/circuit_breaker/circuit_breaker_slice.h new file mode 100644 index 00000000..5633c355 --- /dev/null +++ b/include/circuit_breaker/circuit_breaker_slice.h @@ -0,0 +1,115 @@ +#pragma once + +#include "common.h" +#include "config/server_config.h" +#include "circuit_breaker/circuit_breaker_state.h" +#include "circuit_breaker/circuit_breaker_window.h" +// , , provided by common.h + +namespace circuit_breaker { + +// One per-dispatcher slice of the breaker state for a given upstream host. +// Dispatcher-thread-local for hot-path correctness: TryAcquire, ReportSuccess, +// ReportFailure must only be called on the dispatcher that owns this slice. +// +// Observability counters (`trips_`, `rejected_`, etc.) are atomic so other +// threads can snapshot them without synchronization. Everything else is +// plain (no atomics) — single-writer, single-reader. +class CircuitBreakerSlice { +public: + // `time_source` defaults to steady_clock::now. Tests inject a mock clock. 
+ using TimeSource = std::function; + + CircuitBreakerSlice(std::string host_label, + size_t dispatcher_index, + const CircuitBreakerConfig& config, + TimeSource time_source = nullptr); + + // Non-copyable, non-movable: slices are pinned in a Host's vector and + // callbacks capture raw pointers. + CircuitBreakerSlice(const CircuitBreakerSlice&) = delete; + CircuitBreakerSlice& operator=(const CircuitBreakerSlice&) = delete; + + // Hot-path decision. Consults state + (if applicable) advances OPEN→HALF_OPEN + // and reserves a probe slot. Increments `rejected_` on REJECTED_OPEN* + // (both enforce and dry-run). Emits reject log on dispatcher thread. + Decision TryAcquire(); + + // Outcome reporting. `probe` is true iff the paired TryAcquire returned + // ADMITTED_PROBE. Report* may trigger state transitions and fire the + // transition callback. + void ReportSuccess(bool probe); + void ReportFailure(FailureKind kind, bool probe); + + // Apply a new config (called on this slice's dispatcher thread). + // Preserves live state (CLOSED/OPEN/HALF_OPEN). Resets window if + // window_seconds changed. + void Reload(const CircuitBreakerConfig& new_config); + + // Install or replace the state-transition callback. Safe to call before + // any traffic (startup wiring) OR after a hot-reload flips enabled=false→true. + // Callers must invoke on this slice's dispatcher thread. + void SetTransitionCallback(StateTransitionCallback cb); + + // Observability — safe from any thread. 
+ State CurrentState() const { return state_.load(std::memory_order_acquire); } + int64_t Trips() const { return trips_.load(std::memory_order_relaxed); } + int64_t Rejected() const { return rejected_.load(std::memory_order_relaxed); } + int64_t ProbeSuccesses() const { return probe_successes_.load(std::memory_order_relaxed); } + int64_t ProbeFailures() const { return probe_failures_.load(std::memory_order_relaxed); } + + const std::string& host_label() const { return host_label_; } + size_t dispatcher_index() const { return dispatcher_index_; } + + // Current open_until time. Used by ProxyTransaction to compute + // Retry-After. Returns zero ns when not OPEN. + std::chrono::steady_clock::time_point OpenUntil() const; + +private: + // Logging label: "service=X host=Y:Z partition=N" built once. + std::string host_label_; + size_t dispatcher_index_; + CircuitBreakerConfig config_; + + TimeSource time_source_; + + // Hot-path state — state_ written on dispatcher, read by observers. + std::atomic state_{State::CLOSED}; + // Nanoseconds since steady_clock epoch — 0 when not OPEN. + std::atomic open_until_steady_ns_{0}; + // Count of consecutive trips (OPEN entries) since last CLOSED — + // drives exponential backoff of open duration. + std::atomic consecutive_trips_{0}; + + // Dispatcher-thread-only (no atomics). + int consecutive_failures_ = 0; + CircuitBreakerWindow window_; + int half_open_inflight_ = 0; + int half_open_successes_ = 0; + bool half_open_saw_failure_ = false; + + // Observability counters. + std::atomic trips_{0}; + std::atomic rejected_{0}; + std::atomic probe_successes_{0}; + std::atomic probe_failures_{0}; + + StateTransitionCallback transition_cb_; + + // Internal transitions (dispatcher-thread). 
+ void TripClosedToOpen(const char* trigger); + void TransitionOpenToHalfOpen(); + void TransitionHalfOpenToClosed(); + void TripHalfOpenToOpen(const char* trigger); + + // Compute open duration for the current consecutive_trips_ value: + // min(base * 2^consecutive_trips, max). Always >= base_open_duration_ms. + std::chrono::nanoseconds ComputeOpenDuration() const; + + // Check whether CLOSED trip conditions are met. Called after every failure. + bool ShouldTripClosed(); + + std::chrono::steady_clock::time_point Now() const; +}; + +} // namespace circuit_breaker diff --git a/include/circuit_breaker/circuit_breaker_state.h b/include/circuit_breaker/circuit_breaker_state.h new file mode 100644 index 00000000..06fa695d --- /dev/null +++ b/include/circuit_breaker/circuit_breaker_state.h @@ -0,0 +1,66 @@ +#pragma once + +#include "common.h" +// , , provided by common.h + +// Circuit breaker state machine and classification enums. Used by +// CircuitBreakerSlice, CircuitBreakerHost, CircuitBreakerManager, and +// ProxyTransaction to talk about state, admission decisions, and +// failure kinds. +// +// Three-state resilience4j-style machine: +// +// CLOSED ──trip── OPEN ──(open_until elapsed)── HALF_OPEN ──success── CLOSED +// │ +// failure +// ▼ +// OPEN +namespace circuit_breaker { + +enum class State : uint8_t { + CLOSED = 0, + OPEN = 1, + HALF_OPEN = 2, +}; + +// Result of CircuitBreakerSlice::TryAcquire. Callers branch on this enum +// only — they never read the CircuitBreakerConfig directly. Dry-run policy +// is encoded in the decision, not in a separate flag. +enum class Decision : uint8_t { + ADMITTED, // CLOSED — proceed to pool + ADMITTED_PROBE, // HALF_OPEN probe slot consumed — proceed, tag as probe + REJECTED_OPEN, // OPEN (or HALF_OPEN-full); ENFORCE — drop with 503 + REJECTED_OPEN_DRYRUN, // Shadow mode: slice would reject but operator asked + // for pass-through. Caller proceeds to pool. Counters + // and log already updated by TryAcquire. 
+}; + +// Failure classification. Only these kinds feed ReportFailure — 4xx and +// local-capacity issues (POOL_EXHAUSTED, QUEUE_TIMEOUT, shutdown) are NOT +// reported as failures. +enum class FailureKind : uint8_t { + CONNECT_FAILURE, + RESPONSE_5XX, + RESPONSE_TIMEOUT, + UPSTREAM_DISCONNECT, +}; + +// Callback fired on every slice state transition. Runs on the slice's +// owning dispatcher thread. Callers can compare old/new to key off a +// specific edge (e.g. CLOSED→OPEN fires wait-queue drain). +// `trigger` is a short static string such as "consecutive" / "rate" / +// "probe_success" / "probe_failure" for logging. +using StateTransitionCallback = + std::function; + +// Convert a state to a short lowercase label for logging. +inline const char* StateName(State s) { + switch (s) { + case State::CLOSED: return "closed"; + case State::OPEN: return "open"; + case State::HALF_OPEN: return "half_open"; + } + return "unknown"; +} + +} // namespace circuit_breaker diff --git a/include/circuit_breaker/circuit_breaker_window.h b/include/circuit_breaker/circuit_breaker_window.h new file mode 100644 index 00000000..12679bcd --- /dev/null +++ b/include/circuit_breaker/circuit_breaker_window.h @@ -0,0 +1,59 @@ +#pragma once + +#include "common.h" +// , provided by common.h + +namespace circuit_breaker { + +// Time-bucketed sliding window. One bucket per second; ring indexed by +// `epoch_sec % window_seconds`. Advances lazily on every Add* call: +// when the incoming `now` is ahead of the recorded head, all buckets +// that have aged out of the window are zeroed before the new increment. +// +// Dispatcher-thread-local by design — NO synchronization. Used from +// CircuitBreakerSlice, which is owned by a single dispatcher. +class CircuitBreakerWindow { +public: + explicit CircuitBreakerWindow(int window_seconds); + + // Record one outcome at `now`. Advances the ring if needed. 
+ void AddSuccess(std::chrono::steady_clock::time_point now); + void AddFailure(std::chrono::steady_clock::time_point now); + + // Observed counts across the current window. `now` is used to expire + // stale buckets before reading. + int64_t TotalCount(std::chrono::steady_clock::time_point now); + int64_t FailureCount(std::chrono::steady_clock::time_point now); + + // Reset the ring to zero. Called on state transitions that should + // start a fresh observation (e.g. HALF_OPEN → CLOSED). + void Reset(); + + // Reinitialize for a new window size (config reload). Resets buckets. + void Resize(int new_window_seconds); + + int window_seconds() const { return window_seconds_; } + +private: + struct Bucket { + int64_t total = 0; + int64_t failures = 0; + }; + + int window_seconds_; + std::vector buckets_; + + // Epoch-seconds of the most recent observation. Used to compute how + // many buckets need to be zeroed on advance. + int64_t head_epoch_sec_ = -1; + + // Advance the ring if `now_sec` is newer than `head_epoch_sec_`, + // zeroing any buckets that aged out. + void Advance(int64_t now_sec); + + // Convert a steady_clock time_point to epoch-seconds (we only + // care about relative seconds; steady_clock is monotonic). + static int64_t ToEpochSec(std::chrono::steady_clock::time_point now); +}; + +} // namespace circuit_breaker diff --git a/include/config/server_config.h b/include/config/server_config.h index 4af21543..7dd949d1 100644 --- a/include/config/server_config.h +++ b/include/config/server_config.h @@ -131,6 +131,49 @@ struct ProxyConfig { bool operator!=(const ProxyConfig& o) const { return !(*this == o); } }; +struct CircuitBreakerConfig { + bool enabled = false; // Opt-in; off by default + bool dry_run = false; // Compute + log, but do not reject + + // Trip conditions (ORed). Either alone is sufficient. 
+ int consecutive_failure_threshold = 5; // Trip after N consecutive failures + int failure_rate_threshold = 50; // Trip when fail_rate >= N percent + int minimum_volume = 20; // Required window volume before + // failure_rate is consulted + int window_seconds = 10; // Sliding-window duration + + // HALF_OPEN admission + int permitted_half_open_calls = 5; + + // Recovery timing. open_duration = min(base * 2^consecutive_trips, max). + int base_open_duration_ms = 5000; + int max_open_duration_ms = 60000; + + // Safety valve (future-proof for load-balanced services; no-op v1). + int max_ejection_percent_per_host_set = 50; + + // Retry budget (orthogonal to the breaker). Caps concurrent retries to + // max(retry_budget_min_concurrency, in_flight * retry_budget_percent/100). + int retry_budget_percent = 20; + int retry_budget_min_concurrency = 3; + + bool operator==(const CircuitBreakerConfig& o) const { + return enabled == o.enabled && + dry_run == o.dry_run && + consecutive_failure_threshold == o.consecutive_failure_threshold && + failure_rate_threshold == o.failure_rate_threshold && + minimum_volume == o.minimum_volume && + window_seconds == o.window_seconds && + permitted_half_open_calls == o.permitted_half_open_calls && + base_open_duration_ms == o.base_open_duration_ms && + max_open_duration_ms == o.max_open_duration_ms && + max_ejection_percent_per_host_set == o.max_ejection_percent_per_host_set && + retry_budget_percent == o.retry_budget_percent && + retry_budget_min_concurrency == o.retry_budget_min_concurrency; + } + bool operator!=(const CircuitBreakerConfig& o) const { return !(*this == o); } +}; + struct UpstreamConfig { std::string name; std::string host; @@ -138,10 +181,12 @@ struct UpstreamConfig { UpstreamTlsConfig tls; UpstreamPoolConfig pool; ProxyConfig proxy; + CircuitBreakerConfig circuit_breaker; bool operator==(const UpstreamConfig& o) const { return name == o.name && host == o.host && port == o.port && - tls == o.tls && pool == o.pool && proxy 
== o.proxy; + tls == o.tls && pool == o.pool && proxy == o.proxy && + circuit_breaker == o.circuit_breaker; } bool operator!=(const UpstreamConfig& o) const { return !(*this == o); } }; diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc new file mode 100644 index 00000000..5a30737b --- /dev/null +++ b/server/circuit_breaker_slice.cc @@ -0,0 +1,277 @@ +#include "circuit_breaker/circuit_breaker_slice.h" +#include "log/logger.h" + +namespace circuit_breaker { + +CircuitBreakerSlice::CircuitBreakerSlice(std::string host_label, + size_t dispatcher_index, + const CircuitBreakerConfig& config, + TimeSource time_source) + : host_label_(std::move(host_label)), + dispatcher_index_(dispatcher_index), + config_(config), + time_source_(std::move(time_source)), + window_(config.window_seconds) { +} + +std::chrono::steady_clock::time_point CircuitBreakerSlice::Now() const { + if (time_source_) return time_source_(); + return std::chrono::steady_clock::now(); +} + +std::chrono::steady_clock::time_point CircuitBreakerSlice::OpenUntil() const { + int64_t ns = open_until_steady_ns_.load(std::memory_order_acquire); + if (ns == 0) return std::chrono::steady_clock::time_point{}; + return std::chrono::steady_clock::time_point(std::chrono::nanoseconds(ns)); +} + +std::chrono::nanoseconds CircuitBreakerSlice::ComputeOpenDuration() const { + // Duration = base << consecutive_trips_ (shift expresses 2^n exponential). + // `consecutive_trips_` is the number of trips observed BEFORE this one, so + // the first trip uses 2^0 = 1x base, the second trip uses 2x, etc. + // Callers must increment consecutive_trips_ AFTER calling this method. + int trips = consecutive_trips_.load(std::memory_order_relaxed); + // Saturate shift at 30 to avoid UB on huge trip counts. 
+ if (trips > 30) trips = 30; + int64_t base_ms = config_.base_open_duration_ms; + int64_t max_ms = config_.max_open_duration_ms; + int64_t scaled_ms = base_ms << trips; + if (scaled_ms < base_ms /* overflow */ || scaled_ms > max_ms) { + scaled_ms = max_ms; + } + return std::chrono::milliseconds(scaled_ms); +} + +bool CircuitBreakerSlice::ShouldTripClosed() { + if (consecutive_failures_ >= config_.consecutive_failure_threshold) { + return true; + } + auto now = Now(); + int64_t total = window_.TotalCount(now); + if (total < config_.minimum_volume) return false; + int64_t fails = window_.FailureCount(now); + // Compare without floating point: fails * 100 >= threshold * total. + return (fails * 100) >= (static_cast(config_.failure_rate_threshold) * total); +} + +void CircuitBreakerSlice::TripClosedToOpen(const char* trigger) { + auto duration = ComputeOpenDuration(); // uses current consecutive_trips_ + consecutive_trips_.fetch_add(1, std::memory_order_relaxed); + auto now = Now(); + auto open_until = now + duration; + int64_t open_until_ns = + std::chrono::duration_cast( + open_until.time_since_epoch()).count(); + + open_until_steady_ns_.store(open_until_ns, std::memory_order_release); + state_.store(State::OPEN, std::memory_order_release); + + // Reset on-trip bookkeeping. 
+ consecutive_failures_ = 0; + half_open_inflight_ = 0; + half_open_successes_ = 0; + half_open_saw_failure_ = false; + + trips_.fetch_add(1, std::memory_order_relaxed); + + logging::Get()->warn( + "circuit breaker tripped {} trigger={} open_for_ms={} consecutive_trips={}", + host_label_, trigger, + std::chrono::duration_cast(duration).count(), + consecutive_trips_.load(std::memory_order_relaxed)); + + if (transition_cb_) transition_cb_(State::CLOSED, State::OPEN, trigger); +} + +void CircuitBreakerSlice::TransitionOpenToHalfOpen() { + state_.store(State::HALF_OPEN, std::memory_order_release); + // Keep open_until_steady_ns_ so observers see the "last open" boundary; + // it's cleared on transition to CLOSED. + half_open_inflight_ = 0; + half_open_successes_ = 0; + half_open_saw_failure_ = false; + + logging::Get()->info( + "circuit breaker half-open {} probes_allowed={}", + host_label_, config_.permitted_half_open_calls); + + if (transition_cb_) { + transition_cb_(State::OPEN, State::HALF_OPEN, "open_elapsed"); + } +} + +void CircuitBreakerSlice::TransitionHalfOpenToClosed() { + state_.store(State::CLOSED, std::memory_order_release); + open_until_steady_ns_.store(0, std::memory_order_release); + consecutive_trips_.store(0, std::memory_order_relaxed); + consecutive_failures_ = 0; + window_.Reset(); + half_open_inflight_ = 0; + half_open_successes_ = 0; + half_open_saw_failure_ = false; + + logging::Get()->info( + "circuit breaker closed {} probes_succeeded={}", + host_label_, config_.permitted_half_open_calls); + + if (transition_cb_) { + transition_cb_(State::HALF_OPEN, State::CLOSED, "probe_success"); + } +} + +void CircuitBreakerSlice::TripHalfOpenToOpen(const char* trigger) { + auto duration = ComputeOpenDuration(); // uses current consecutive_trips_ + consecutive_trips_.fetch_add(1, std::memory_order_relaxed); + auto now = Now(); + auto open_until = now + duration; + int64_t open_until_ns = + std::chrono::duration_cast( + 
open_until.time_since_epoch()).count(); + + open_until_steady_ns_.store(open_until_ns, std::memory_order_release); + state_.store(State::OPEN, std::memory_order_release); + + half_open_inflight_ = 0; + half_open_successes_ = 0; + half_open_saw_failure_ = false; + + trips_.fetch_add(1, std::memory_order_relaxed); + + logging::Get()->warn( + "circuit breaker re-tripped {} trigger={} open_for_ms={} consecutive_trips={}", + host_label_, trigger, + std::chrono::duration_cast(duration).count(), + consecutive_trips_.load(std::memory_order_relaxed)); + + if (transition_cb_) transition_cb_(State::HALF_OPEN, State::OPEN, trigger); +} + +Decision CircuitBreakerSlice::TryAcquire() { + // Disabled fast path — zero overhead when config.enabled=false. + if (!config_.enabled) return Decision::ADMITTED; + + State s = state_.load(std::memory_order_acquire); + + if (s == State::OPEN) { + // Check whether the open window has elapsed. + int64_t open_until_ns = + open_until_steady_ns_.load(std::memory_order_acquire); + int64_t now_ns = std::chrono::duration_cast( + Now().time_since_epoch()).count(); + if (now_ns >= open_until_ns) { + // Transition OPEN → HALF_OPEN on this thread. Because slices are + // dispatcher-thread-pinned, no CAS is needed (a plain store is + // safe under the single-writer invariant). 
+ TransitionOpenToHalfOpen(); + s = State::HALF_OPEN; + } else { + rejected_.fetch_add(1, std::memory_order_relaxed); + if (config_.dry_run) { + logging::Get()->info( + "[dry-run] circuit breaker would reject {} state=open", + host_label_); + return Decision::REJECTED_OPEN_DRYRUN; + } + logging::Get()->debug( + "circuit breaker rejected {} state=open", host_label_); + return Decision::REJECTED_OPEN; + } + } + + if (s == State::HALF_OPEN) { + if (half_open_inflight_ >= config_.permitted_half_open_calls) { + rejected_.fetch_add(1, std::memory_order_relaxed); + if (config_.dry_run) { + logging::Get()->info( + "[dry-run] circuit breaker would reject {} state=half_open_full", + host_label_); + return Decision::REJECTED_OPEN_DRYRUN; + } + logging::Get()->debug( + "circuit breaker rejected {} state=half_open_full", host_label_); + return Decision::REJECTED_OPEN; + } + half_open_inflight_++; + return Decision::ADMITTED_PROBE; + } + + // CLOSED: fast path. + return Decision::ADMITTED; +} + +void CircuitBreakerSlice::ReportSuccess(bool probe) { + if (!config_.enabled) return; + + if (probe) { + probe_successes_.fetch_add(1, std::memory_order_relaxed); + // Count the completed probe regardless of saw_failure state (we still + // decrement inflight to release the slot). + if (half_open_inflight_ > 0) half_open_inflight_--; + if (half_open_saw_failure_) { + // A sibling probe already failed; whichever probe finishes last + // transitions to OPEN. Handle here only if this is the last probe. + if (half_open_inflight_ == 0) { + TripHalfOpenToOpen("probe_fail"); + } + return; + } + half_open_successes_++; + if (half_open_successes_ >= config_.permitted_half_open_calls) { + TransitionHalfOpenToClosed(); + } + return; + } + + // CLOSED success: reset consecutive counter, record in window. 
+ consecutive_failures_ = 0; + window_.AddSuccess(Now()); +} + +void CircuitBreakerSlice::ReportFailure(FailureKind kind, bool probe) { + (void)kind; // Kind is used by higher layers for logging; slice itself + // treats all failures the same way for trip math. + if (!config_.enabled) return; + + if (probe) { + probe_failures_.fetch_add(1, std::memory_order_relaxed); + if (half_open_inflight_ > 0) half_open_inflight_--; + half_open_saw_failure_ = true; + // On the last probe (or if all remaining complete) transition OPEN. + if (half_open_inflight_ == 0) { + TripHalfOpenToOpen("probe_fail"); + } + return; + } + + // CLOSED failure path. + consecutive_failures_++; + window_.AddFailure(Now()); + + if (ShouldTripClosed()) { + const char* trigger = + (consecutive_failures_ >= config_.consecutive_failure_threshold) + ? "consecutive" : "rate"; + TripClosedToOpen(trigger); + } +} + +void CircuitBreakerSlice::Reload(const CircuitBreakerConfig& new_config) { + bool window_changed = (config_.window_seconds != new_config.window_seconds); + config_ = new_config; + if (window_changed) window_.Resize(new_config.window_seconds); + // Live state preserved — operator expects new thresholds to apply to the + // next evaluation, not to reset an in-progress trip. 
+ + logging::Get()->info( + "circuit breaker config applied {} enabled={} window_s={} " + "fail_rate={} consec_threshold={}", + host_label_, new_config.enabled, new_config.window_seconds, + new_config.failure_rate_threshold, + new_config.consecutive_failure_threshold); +} + +void CircuitBreakerSlice::SetTransitionCallback(StateTransitionCallback cb) { + transition_cb_ = std::move(cb); +} + +} // namespace circuit_breaker diff --git a/server/circuit_breaker_window.cc b/server/circuit_breaker_window.cc new file mode 100644 index 00000000..14ea34a5 --- /dev/null +++ b/server/circuit_breaker_window.cc @@ -0,0 +1,81 @@ +#include "circuit_breaker/circuit_breaker_window.h" + +namespace circuit_breaker { + +CircuitBreakerWindow::CircuitBreakerWindow(int window_seconds) + : window_seconds_(window_seconds), + buckets_(window_seconds > 0 ? static_cast(window_seconds) : 1) { +} + +int64_t CircuitBreakerWindow::ToEpochSec( + std::chrono::steady_clock::time_point now) { + return std::chrono::duration_cast( + now.time_since_epoch()).count(); +} + +void CircuitBreakerWindow::Advance(int64_t now_sec) { + if (head_epoch_sec_ < 0) { + head_epoch_sec_ = now_sec; + return; + } + if (now_sec <= head_epoch_sec_) return; + int64_t delta = now_sec - head_epoch_sec_; + // If delta exceeds window size, everything is stale — full reset. + if (delta >= window_seconds_) { + for (auto& b : buckets_) { b.total = 0; b.failures = 0; } + } else { + // Zero buckets from head+1..now_sec inclusive. 
+ for (int64_t s = head_epoch_sec_ + 1; s <= now_sec; ++s) { + size_t idx = static_cast(s % window_seconds_); + buckets_[idx].total = 0; + buckets_[idx].failures = 0; + } + } + head_epoch_sec_ = now_sec; +} + +void CircuitBreakerWindow::AddSuccess( + std::chrono::steady_clock::time_point now) { + int64_t now_sec = ToEpochSec(now); + Advance(now_sec); + size_t idx = static_cast(now_sec % window_seconds_); + buckets_[idx].total++; +} + +void CircuitBreakerWindow::AddFailure( + std::chrono::steady_clock::time_point now) { + int64_t now_sec = ToEpochSec(now); + Advance(now_sec); + size_t idx = static_cast(now_sec % window_seconds_); + buckets_[idx].total++; + buckets_[idx].failures++; +} + +int64_t CircuitBreakerWindow::TotalCount( + std::chrono::steady_clock::time_point now) { + Advance(ToEpochSec(now)); + int64_t sum = 0; + for (const auto& b : buckets_) sum += b.total; + return sum; +} + +int64_t CircuitBreakerWindow::FailureCount( + std::chrono::steady_clock::time_point now) { + Advance(ToEpochSec(now)); + int64_t sum = 0; + for (const auto& b : buckets_) sum += b.failures; + return sum; +} + +void CircuitBreakerWindow::Reset() { + for (auto& b : buckets_) { b.total = 0; b.failures = 0; } + head_epoch_sec_ = -1; +} + +void CircuitBreakerWindow::Resize(int new_window_seconds) { + window_seconds_ = new_window_seconds > 0 ? 
new_window_seconds : 1; + buckets_.assign(static_cast(window_seconds_), Bucket{}); + head_epoch_sec_ = -1; +} + +} // namespace circuit_breaker diff --git a/server/config_loader.cc b/server/config_loader.cc index 9ae4e212..c17a544d 100644 --- a/server/config_loader.cc +++ b/server/config_loader.cc @@ -262,6 +262,36 @@ ServerConfig ConfigLoader::LoadFromString(const std::string& json_str) { } } + if (item.contains("circuit_breaker")) { + if (!item["circuit_breaker"].is_object()) + throw std::runtime_error("upstream circuit_breaker must be an object"); + auto& cb = item["circuit_breaker"]; + upstream.circuit_breaker.enabled = + cb.value("enabled", false); + upstream.circuit_breaker.dry_run = + cb.value("dry_run", false); + upstream.circuit_breaker.consecutive_failure_threshold = + cb.value("consecutive_failure_threshold", 5); + upstream.circuit_breaker.failure_rate_threshold = + cb.value("failure_rate_threshold", 50); + upstream.circuit_breaker.minimum_volume = + cb.value("minimum_volume", 20); + upstream.circuit_breaker.window_seconds = + cb.value("window_seconds", 10); + upstream.circuit_breaker.permitted_half_open_calls = + cb.value("permitted_half_open_calls", 5); + upstream.circuit_breaker.base_open_duration_ms = + cb.value("base_open_duration_ms", 5000); + upstream.circuit_breaker.max_open_duration_ms = + cb.value("max_open_duration_ms", 60000); + upstream.circuit_breaker.max_ejection_percent_per_host_set = + cb.value("max_ejection_percent_per_host_set", 50); + upstream.circuit_breaker.retry_budget_percent = + cb.value("retry_budget_percent", 20); + upstream.circuit_breaker.retry_budget_min_concurrency = + cb.value("retry_budget_min_concurrency", 3); + } + config.upstreams.push_back(std::move(upstream)); } } @@ -791,6 +821,62 @@ void ConfigLoader::Validate(const ServerConfig& config) { idx + " ('" + u.name + "'): proxy.retry.max_retries must be >= 0 and <= 10"); } + + // Circuit breaker validation + { + const auto& cb = u.circuit_breaker; + if 
(cb.consecutive_failure_threshold < 1) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.consecutive_failure_threshold must be >= 1"); + } + if (cb.failure_rate_threshold < 0 || cb.failure_rate_threshold > 100) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.failure_rate_threshold must be in [0, 100]"); + } + if (cb.minimum_volume < 1) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.minimum_volume must be >= 1"); + } + if (cb.window_seconds < 1 || cb.window_seconds > 3600) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.window_seconds must be in [1, 3600]"); + } + if (cb.permitted_half_open_calls < 1) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.permitted_half_open_calls must be >= 1"); + } + if (cb.base_open_duration_ms < 100) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.base_open_duration_ms must be >= 100"); + } + if (cb.max_open_duration_ms < cb.base_open_duration_ms) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.max_open_duration_ms must be >= base_open_duration_ms"); + } + if (cb.max_ejection_percent_per_host_set < 0 || + cb.max_ejection_percent_per_host_set > 100) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.max_ejection_percent_per_host_set must be in [0, 100]"); + } + if (cb.retry_budget_percent < 0 || cb.retry_budget_percent > 100) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.retry_budget_percent must be in [0, 100]"); + } + if (cb.retry_budget_min_concurrency < 0) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.retry_budget_min_concurrency must be >= 0"); + } + } // Validate method names — reject unknowns and duplicates. // Duplicates would cause RouteAsync to throw at startup. 
{ @@ -1052,6 +1138,31 @@ std::string ConfigLoader::ToJson(const ServerConfig& config) { uj["proxy"] = pj; } + // Always serialize circuit_breaker — same rationale as proxy block. + if (u.circuit_breaker != CircuitBreakerConfig{}) { + nlohmann::json cbj; + cbj["enabled"] = u.circuit_breaker.enabled; + cbj["dry_run"] = u.circuit_breaker.dry_run; + cbj["consecutive_failure_threshold"] = + u.circuit_breaker.consecutive_failure_threshold; + cbj["failure_rate_threshold"] = + u.circuit_breaker.failure_rate_threshold; + cbj["minimum_volume"] = u.circuit_breaker.minimum_volume; + cbj["window_seconds"] = u.circuit_breaker.window_seconds; + cbj["permitted_half_open_calls"] = + u.circuit_breaker.permitted_half_open_calls; + cbj["base_open_duration_ms"] = + u.circuit_breaker.base_open_duration_ms; + cbj["max_open_duration_ms"] = + u.circuit_breaker.max_open_duration_ms; + cbj["max_ejection_percent_per_host_set"] = + u.circuit_breaker.max_ejection_percent_per_host_set; + cbj["retry_budget_percent"] = + u.circuit_breaker.retry_budget_percent; + cbj["retry_budget_min_concurrency"] = + u.circuit_breaker.retry_budget_min_concurrency; + uj["circuit_breaker"] = cbj; + } j["upstreams"].push_back(uj); } diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h new file mode 100644 index 00000000..bd932a28 --- /dev/null +++ b/test/circuit_breaker_test.h @@ -0,0 +1,647 @@ +#pragma once + +#include "test_framework.h" +#include "config/server_config.h" +#include "circuit_breaker/circuit_breaker_state.h" +#include "circuit_breaker/circuit_breaker_window.h" +#include "circuit_breaker/circuit_breaker_slice.h" + +#include +#include +#include + +namespace CircuitBreakerTests { + +using circuit_breaker::CircuitBreakerSlice; +using circuit_breaker::CircuitBreakerWindow; +using circuit_breaker::Decision; +using circuit_breaker::FailureKind; +using circuit_breaker::State; + +// A simple mock clock that advances only when the test tells it to. 
+class MockClock { +public: + std::chrono::steady_clock::time_point now{ + // Choose a non-zero base so 0 is distinguishable from "not OPEN". + std::chrono::steady_clock::time_point(std::chrono::seconds(1'000'000)) + }; + void Advance(std::chrono::milliseconds ms) { now += ms; } + void AdvanceSec(int seconds) { now += std::chrono::seconds(seconds); } + std::chrono::steady_clock::time_point operator()() const { return now; } +}; + +// Build a config with default values — tests override specific fields. +static CircuitBreakerConfig DefaultEnabledConfig() { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 5; + cb.failure_rate_threshold = 50; + cb.minimum_volume = 20; + cb.window_seconds = 10; + cb.permitted_half_open_calls = 5; + cb.base_open_duration_ms = 5000; + cb.max_open_duration_ms = 60000; + return cb; +} + +// ============================================================================ +// State machine tests +// ============================================================================ + +void TestDisabledFastPath() { + std::cout << "\n[TEST] CB: Disabled fast path..." << std::endl; + try { + CircuitBreakerConfig cb; // enabled=false by default + auto clock = std::make_shared<MockClock>(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + bool pass = slice.TryAcquire() == Decision::ADMITTED && + slice.CurrentState() == State::CLOSED; + + // Reporting 100 failures must not trip. 
+ for (int i = 0; i < 100; ++i) { + slice.ReportFailure(FailureKind::CONNECT_FAILURE, false); + } + pass = pass && slice.CurrentState() == State::CLOSED && + slice.Trips() == 0; + + TestFramework::RecordTest("CB: disabled fast path", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: disabled fast path", false, e.what(), + TestFramework::TestCategory::OTHER); + } +} + +void TestClosedStaysClosedBelowConsecutiveThreshold() { + std::cout << "\n[TEST] CB: 4 failures below threshold..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 4; ++i) { + slice.ReportFailure(FailureKind::CONNECT_FAILURE, false); + } + bool pass = slice.CurrentState() == State::CLOSED && + slice.TryAcquire() == Decision::ADMITTED && + slice.Trips() == 0; + TestFramework::RecordTest("CB: 4 failures below threshold", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: 4 failures below threshold", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestConsecutiveFailureTrip() { + std::cout << "\n[TEST] CB: 5 consecutive failures trip..." 
<< std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + bool pass = slice.CurrentState() == State::OPEN && + slice.Trips() == 1 && + slice.TryAcquire() == Decision::REJECTED_OPEN; + TestFramework::RecordTest("CB: 5 consecutive failures trip", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: 5 consecutive failures trip", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestFailureRateTrip() { + std::cout << "\n[TEST] CB: failure-rate trip (50% of 20)..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.consecutive_failure_threshold = 1000; // disable consec path + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Alternate 10 failures and 10 successes within the same second — + // ratio = 50%, total = 20 (>= minimum_volume). + for (int i = 0; i < 10; ++i) { + slice.ReportSuccess(false); + } + // A success between-failures clears consecutive_failures_, confirming + // only rate path can trip here. + for (int i = 0; i < 9; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + // Still CLOSED — 9/19 < 50%. + bool pass_pre = slice.CurrentState() == State::CLOSED; + // 10th failure brings ratio to 10/20 = 50% exactly — tripper. 
+ slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + bool pass = pass_pre && slice.CurrentState() == State::OPEN && + slice.Trips() == 1; + TestFramework::RecordTest("CB: failure-rate trip (50% of 20)", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: failure-rate trip (50% of 20)", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestMinimumVolumeGate() { + std::cout << "\n[TEST] CB: minimum_volume gate..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.consecutive_failure_threshold = 1000; // disable consec path + cb.minimum_volume = 20; + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // 19 total calls, all failures — should NOT trip (below volume). + for (int i = 0; i < 19; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + bool pass = slice.CurrentState() == State::CLOSED && slice.Trips() == 0; + TestFramework::RecordTest("CB: minimum_volume gate", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: minimum_volume gate", false, e.what(), + TestFramework::TestCategory::OTHER); + } +} + +void TestOpenBeforeDurationStaysOpen() { + std::cout << "\n[TEST] CB: OPEN rejects before elapsed..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + // Advance less than base_open_duration_ms (5000ms). 
+ clock->Advance(std::chrono::milliseconds(2000)); + Decision d = slice.TryAcquire(); + bool pass = d == Decision::REJECTED_OPEN && + slice.CurrentState() == State::OPEN; + TestFramework::RecordTest("CB: OPEN rejects before elapsed", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: OPEN rejects before elapsed", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestOpenToHalfOpenAfterDuration() { + std::cout << "\n[TEST] CB: OPEN → HALF_OPEN after duration..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + Decision d = slice.TryAcquire(); + bool pass = d == Decision::ADMITTED_PROBE && + slice.CurrentState() == State::HALF_OPEN; + TestFramework::RecordTest("CB: OPEN -> HALF_OPEN after duration", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: OPEN -> HALF_OPEN after duration", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestHalfOpenAllProbesSucceed() { + std::cout << "\n[TEST] CB: HALF_OPEN 5 probe successes close..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + + // Take 5 probes; report success on each. 
+ for (int i = 0; i < cb.permitted_half_open_calls; ++i) { + Decision d = slice.TryAcquire(); + if (d != Decision::ADMITTED_PROBE) { + TestFramework::RecordTest( + "CB: HALF_OPEN 5 probe successes close", false, + "probe " + std::to_string(i) + " not ADMITTED_PROBE", + TestFramework::TestCategory::OTHER); + return; + } + slice.ReportSuccess(true); + } + bool pass = slice.CurrentState() == State::CLOSED && + slice.ProbeSuccesses() == 5; + TestFramework::RecordTest("CB: HALF_OPEN 5 probe successes close", + pass, "", TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: HALF_OPEN 5 probe successes close", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestHalfOpenProbeFailureReopens() { + std::cout << "\n[TEST] CB: HALF_OPEN single probe fail re-opens..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + + // Take 1 probe, fail it. + Decision d = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, true); + bool pass = d == Decision::ADMITTED_PROBE && + slice.CurrentState() == State::OPEN && + slice.Trips() == 2 && // initial trip + re-trip + slice.ProbeFailures() == 1; + TestFramework::RecordTest("CB: HALF_OPEN probe fail re-opens", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: HALF_OPEN probe fail re-opens", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestHalfOpenExhaustedSlotsRejected() { + std::cout << "\n[TEST] CB: HALF_OPEN over capacity rejects..." 
<< std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + // Take 5 probes but DON'T report outcomes yet. + for (int i = 0; i < 5; ++i) slice.TryAcquire(); + // 6th TryAcquire must reject (all slots taken). + Decision d = slice.TryAcquire(); + bool pass = d == Decision::REJECTED_OPEN; + TestFramework::RecordTest("CB: HALF_OPEN over capacity rejects", + pass, "", TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: HALF_OPEN over capacity rejects", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestExponentialBackoff() { + std::cout << "\n[TEST] CB: exponential backoff progression..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.base_open_duration_ms = 1000; + cb.max_open_duration_ms = 8000; + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + auto trip_then_probe_fail = [&]() { + // Reach OPEN. + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + }; + auto measure_open_ms = [&]() { + // open_until - now at the instant of the trip. + auto open_until = slice.OpenUntil(); + auto remaining = open_until - clock->now; + return std::chrono::duration_cast( + remaining).count(); + }; + + // Trip 1 — expect ~1000ms. + trip_then_probe_fail(); + int64_t d1 = measure_open_ms(); + // Move to HALF_OPEN and fail the probe → trip 2. 
+ clock->Advance(std::chrono::milliseconds(d1 + 1)); + slice.TryAcquire(); // HALF_OPEN, ADMITTED_PROBE + slice.ReportFailure(FailureKind::RESPONSE_5XX, true); + int64_t d2 = measure_open_ms(); + clock->Advance(std::chrono::milliseconds(d2 + 1)); + slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, true); + int64_t d3 = measure_open_ms(); + clock->Advance(std::chrono::milliseconds(d3 + 1)); + slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, true); + int64_t d4 = measure_open_ms(); + clock->Advance(std::chrono::milliseconds(d4 + 1)); + slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, true); + int64_t d5 = measure_open_ms(); + + // Expect 1000, 2000, 4000, 8000, 8000 (capped). + bool pass = d1 == 1000 && d2 == 2000 && d3 == 4000 && + d4 == 8000 && d5 == 8000; + std::string err = "d1=" + std::to_string(d1) + " d2=" + std::to_string(d2) + + " d3=" + std::to_string(d3) + " d4=" + std::to_string(d4) + + " d5=" + std::to_string(d5); + TestFramework::RecordTest("CB: exponential backoff", + pass, pass ? "" : err, TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: exponential backoff", false, e.what(), + TestFramework::TestCategory::OTHER); + } +} + +void TestResetOnClose() { + std::cout << "\n[TEST] CB: consecutive_trips resets on close..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.base_open_duration_ms = 1000; + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Trip 1. + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + clock->Advance(std::chrono::milliseconds(1001)); + // Move to HALF_OPEN. + for (int i = 0; i < 5; ++i) { + slice.TryAcquire(); + slice.ReportSuccess(true); + } + // Now CLOSED. Trip again — expect base_duration again (not doubled). 
+ for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + auto open_until = slice.OpenUntil(); + auto remaining = open_until - clock->now; + int64_t d_after_close = std::chrono::duration_cast< + std::chrono::milliseconds>(remaining).count(); + bool pass = d_after_close == 1000; + TestFramework::RecordTest("CB: trips reset on close", pass, + pass ? "" : "expected 1000ms, got " + std::to_string(d_after_close), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: trips reset on close", false, e.what(), + TestFramework::TestCategory::OTHER); + } +} + +// ============================================================================ +// Window tests +// ============================================================================ + +void TestWindowBucketByCurrentSecond() { + std::cout << "\n[TEST] CB Window: bucket by current second..." << std::endl; + try { + CircuitBreakerWindow w(10); + auto t0 = std::chrono::steady_clock::time_point(std::chrono::seconds(100)); + w.AddSuccess(t0); + w.AddFailure(t0); + w.AddFailure(t0); + bool pass = w.TotalCount(t0) == 3 && w.FailureCount(t0) == 2; + TestFramework::RecordTest("CB Window: bucket by current second", pass, + "", TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB Window: bucket by current second", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestWindowAdvanceSkipsStale() { + std::cout << "\n[TEST] CB Window: advance skips stale..." << std::endl; + try { + CircuitBreakerWindow w(10); + auto t0 = std::chrono::steady_clock::time_point(std::chrono::seconds(100)); + w.AddFailure(t0); // bucket 100%10 = 0 + auto t1 = t0 + std::chrono::seconds(15); // beyond window + // After long idle, incoming record should see zero history. 
+ bool pre = w.TotalCount(t1) == 0; + w.AddSuccess(t1); + bool pass = pre && w.TotalCount(t1) == 1 && w.FailureCount(t1) == 0; + TestFramework::RecordTest("CB Window: advance skips stale", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB Window: advance skips stale", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestWindowPartialExpiry() { + std::cout << "\n[TEST] CB Window: partial expiry..." << std::endl; + try { + CircuitBreakerWindow w(10); + auto t0 = std::chrono::steady_clock::time_point(std::chrono::seconds(100)); + w.AddFailure(t0); // sec 100 + auto t1 = t0 + std::chrono::seconds(5); + w.AddFailure(t1); // sec 105 + auto t2 = t0 + std::chrono::seconds(11); + // sec 100 is now out of window (100 + 10 <= 111 - 1 = 110). So: + // bucket 0 (sec 100 or sec 110) would have been zeroed when advancing + // from head=105 past sec 110. + bool pass = w.TotalCount(t2) == 1 && w.FailureCount(t2) == 1; + TestFramework::RecordTest("CB Window: partial expiry", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB Window: partial expiry", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestWindowReset() { + std::cout << "\n[TEST] CB Window: reset clears..." 
<< std::endl; + try { + CircuitBreakerWindow w(10); + auto t0 = std::chrono::steady_clock::time_point(std::chrono::seconds(100)); + w.AddFailure(t0); w.AddSuccess(t0); w.AddFailure(t0); + w.Reset(); + bool pass = w.TotalCount(t0) == 0 && w.FailureCount(t0) == 0; + TestFramework::RecordTest("CB Window: reset clears", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB Window: reset clears", false, e.what(), + TestFramework::TestCategory::OTHER); + } +} + +// ============================================================================ +// Dry-run + Reload + Edge cases +// ============================================================================ + +void TestDryRunAdmits() { + std::cout << "\n[TEST] CB: dry_run admits through OPEN..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.dry_run = true; + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + // OPEN + dry_run → REJECTED_OPEN_DRYRUN (caller proceeds). + Decision d = slice.TryAcquire(); + bool pass = d == Decision::REJECTED_OPEN_DRYRUN && + slice.CurrentState() == State::OPEN && + slice.Rejected() == 1; + TestFramework::RecordTest("CB: dry_run admits through OPEN", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: dry_run admits through OPEN", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestReloadPreservesState() { + std::cout << "\n[TEST] CB: reload preserves live state..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + // OPEN at this point. 
+ auto cb2 = cb; + cb2.consecutive_failure_threshold = 2; // tighter + cb2.window_seconds = 30; // triggers ring resize + slice.Reload(cb2); + // Still OPEN immediately after reload — live state preserved. + bool pass = slice.CurrentState() == State::OPEN; + TestFramework::RecordTest("CB: reload preserves live state", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: reload preserves live state", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestConsecutiveThresholdOne() { + std::cout << "\n[TEST] CB: threshold=1 single failure trips..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.consecutive_failure_threshold = 1; + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + bool pass = slice.CurrentState() == State::OPEN && slice.Trips() == 1; + TestFramework::RecordTest("CB: threshold=1 single failure trips", + pass, "", TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: threshold=1 single failure trips", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestSuccessClearsConsecutive() { + std::cout << "\n[TEST] CB: success clears consecutive..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 4; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + slice.ReportSuccess(false); // resets consecutive + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + // consecutive is back to 1, no trip. 
+ bool pass = slice.CurrentState() == State::CLOSED; + TestFramework::RecordTest("CB: success clears consecutive", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: success clears consecutive", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +void TestTransitionCallbackInvoked() { + std::cout << "\n[TEST] CB: transition callback invoked..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + int closed_to_open = 0; + int open_to_halfopen = 0; + int halfopen_to_closed = 0; + slice.SetTransitionCallback( + [&](State o, State n, const char*) { + if (o == State::CLOSED && n == State::OPEN) closed_to_open++; + else if (o == State::OPEN && n == State::HALF_OPEN) open_to_halfopen++; + else if (o == State::HALF_OPEN && n == State::CLOSED) halfopen_to_closed++; + }); + + // Full cycle. + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + for (int i = 0; i < cb.permitted_half_open_calls; ++i) { + slice.TryAcquire(); + slice.ReportSuccess(true); + } + bool pass = closed_to_open == 1 && open_to_halfopen == 1 && + halfopen_to_closed == 1; + TestFramework::RecordTest("CB: transition callback invoked", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: transition callback invoked", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Run all circuit breaker unit tests. 
+void RunAllTests() { + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "CIRCUIT BREAKER - UNIT TESTS" << std::endl; + std::cout << std::string(60, '=') << std::endl; + + TestDisabledFastPath(); + TestClosedStaysClosedBelowConsecutiveThreshold(); + TestConsecutiveFailureTrip(); + TestFailureRateTrip(); + TestMinimumVolumeGate(); + TestOpenBeforeDurationStaysOpen(); + TestOpenToHalfOpenAfterDuration(); + TestHalfOpenAllProbesSucceed(); + TestHalfOpenProbeFailureReopens(); + TestHalfOpenExhaustedSlotsRejected(); + TestExponentialBackoff(); + TestResetOnClose(); + TestWindowBucketByCurrentSecond(); + TestWindowAdvanceSkipsStale(); + TestWindowPartialExpiry(); + TestWindowReset(); + TestDryRunAdmits(); + TestReloadPreservesState(); + TestConsecutiveThresholdOne(); + TestSuccessClearsConsecutive(); + TestTransitionCallbackInvoked(); +} + +} // namespace CircuitBreakerTests diff --git a/test/config_test.h b/test/config_test.h index cfb90c7a..213fd8ac 100644 --- a/test/config_test.h +++ b/test/config_test.h @@ -348,6 +348,219 @@ namespace ConfigTests { } } + // Test 9: Circuit breaker defaults + void TestCircuitBreakerDefaults() { + std::cout << "\n[TEST] Circuit Breaker Defaults..." << std::endl; + try { + CircuitBreakerConfig cb; // value-initialized defaults + bool pass = cb.enabled == false && + cb.dry_run == false && + cb.consecutive_failure_threshold == 5 && + cb.failure_rate_threshold == 50 && + cb.minimum_volume == 20 && + cb.window_seconds == 10 && + cb.permitted_half_open_calls == 5 && + cb.base_open_duration_ms == 5000 && + cb.max_open_duration_ms == 60000 && + cb.max_ejection_percent_per_host_set == 50 && + cb.retry_budget_percent == 20 && + cb.retry_budget_min_concurrency == 3; + TestFramework::RecordTest("Circuit Breaker Defaults", pass, + pass ? 
"" : "default value mismatch", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("Circuit Breaker Defaults", false, e.what(), + TestFramework::TestCategory::OTHER); + } + } + + // Test 10: Circuit breaker JSON parsing (populated block) + void TestCircuitBreakerJsonParse() { + std::cout << "\n[TEST] Circuit Breaker JSON Parse..." << std::endl; + try { + std::string json = R"({ + "upstreams": [{ + "name": "svc", + "host": "10.0.0.1", + "port": 8080, + "circuit_breaker": { + "enabled": true, + "dry_run": true, + "consecutive_failure_threshold": 7, + "failure_rate_threshold": 75, + "minimum_volume": 50, + "window_seconds": 30, + "permitted_half_open_calls": 3, + "base_open_duration_ms": 2000, + "max_open_duration_ms": 120000, + "max_ejection_percent_per_host_set": 33, + "retry_budget_percent": 10, + "retry_budget_min_concurrency": 5 + } + }] + })"; + ServerConfig config = ConfigLoader::LoadFromString(json); + const auto& cb = config.upstreams.at(0).circuit_breaker; + bool pass = cb.enabled == true && cb.dry_run == true && + cb.consecutive_failure_threshold == 7 && + cb.failure_rate_threshold == 75 && + cb.minimum_volume == 50 && + cb.window_seconds == 30 && + cb.permitted_half_open_calls == 3 && + cb.base_open_duration_ms == 2000 && + cb.max_open_duration_ms == 120000 && + cb.max_ejection_percent_per_host_set == 33 && + cb.retry_budget_percent == 10 && + cb.retry_budget_min_concurrency == 5; + TestFramework::RecordTest("Circuit Breaker JSON Parse", pass, + pass ? "" : "parsed values mismatch", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("Circuit Breaker JSON Parse", false, e.what(), + TestFramework::TestCategory::OTHER); + } + } + + // Test 11: Circuit breaker JSON partial block uses defaults for missing fields + void TestCircuitBreakerJsonPartial() { + std::cout << "\n[TEST] Circuit Breaker JSON Partial..." 
<< std::endl; + try { + std::string json = R"({ + "upstreams": [{ + "name": "svc", "host": "10.0.0.1", "port": 8080, + "circuit_breaker": {"enabled": true} + }] + })"; + ServerConfig config = ConfigLoader::LoadFromString(json); + const auto& cb = config.upstreams.at(0).circuit_breaker; + bool pass = cb.enabled == true && + cb.consecutive_failure_threshold == 5 && + cb.window_seconds == 10; + TestFramework::RecordTest("Circuit Breaker JSON Partial", pass, + pass ? "" : "expected defaults for unset fields", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("Circuit Breaker JSON Partial", false, e.what(), + TestFramework::TestCategory::OTHER); + } + } + + // Test 12: Round-trip via ToJson() preserves circuit_breaker + void TestCircuitBreakerJsonRoundTrip() { + std::cout << "\n[TEST] Circuit Breaker JSON Round-Trip..." << std::endl; + try { + ServerConfig in; + UpstreamConfig u; + u.name = "svc"; u.host = "10.0.0.1"; u.port = 8080; + u.circuit_breaker.enabled = true; + u.circuit_breaker.window_seconds = 25; + u.circuit_breaker.failure_rate_threshold = 42; + in.upstreams.push_back(u); + + std::string serialized = ConfigLoader::ToJson(in); + ServerConfig out = ConfigLoader::LoadFromString(serialized); + + const auto& cb = out.upstreams.at(0).circuit_breaker; + bool pass = cb.enabled == true && cb.window_seconds == 25 && + cb.failure_rate_threshold == 42; + TestFramework::RecordTest("Circuit Breaker JSON Round-Trip", pass, + pass ? "" : "round-trip lost fields", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("Circuit Breaker JSON Round-Trip", false, + e.what(), TestFramework::TestCategory::OTHER); + } + } + + // Helper: assert a circuit_breaker JSON override is rejected by Validate(). 
+ static void ExpectValidationFailure(const std::string& name, + const std::string& cb_json_override, + const std::string& expected_substr) { + std::string json = std::string(R"({ + "upstreams": [{ + "name": "svc", "host": "10.0.0.1", "port": 8080, + "circuit_breaker": )") + cb_json_override + R"( + }] + })"; + try { + ServerConfig config = ConfigLoader::LoadFromString(json); + ConfigLoader::Validate(config); + TestFramework::RecordTest(name, false, + "expected validation failure containing: " + expected_substr, + TestFramework::TestCategory::OTHER); + } catch (const std::invalid_argument& e) { + std::string msg(e.what()); + bool pass = msg.find(expected_substr) != std::string::npos; + TestFramework::RecordTest(name, pass, + pass ? "" : std::string("wrong error: ") + msg, + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest(name, false, + std::string("wrong exception type: ") + e.what(), + TestFramework::TestCategory::OTHER); + } + } + + // Test 13: Validation rejects bad circuit_breaker fields + void TestCircuitBreakerValidation() { + std::cout << "\n[TEST] Circuit Breaker Validation..." 
<< std::endl; + ExpectValidationFailure("CB Validation: consecutive_failure_threshold<1", + R"({"consecutive_failure_threshold": 0})", + "consecutive_failure_threshold must be >= 1"); + ExpectValidationFailure("CB Validation: failure_rate_threshold>100", + R"({"failure_rate_threshold": 101})", + "failure_rate_threshold must be in [0, 100]"); + ExpectValidationFailure("CB Validation: minimum_volume<1", + R"({"minimum_volume": 0})", + "minimum_volume must be >= 1"); + ExpectValidationFailure("CB Validation: window_seconds<1", + R"({"window_seconds": 0})", + "window_seconds must be in [1, 3600]"); + ExpectValidationFailure("CB Validation: window_seconds>3600", + R"({"window_seconds": 3601})", + "window_seconds must be in [1, 3600]"); + ExpectValidationFailure("CB Validation: base_open_duration_ms<100", + R"({"base_open_duration_ms": 50})", + "base_open_duration_ms must be >= 100"); + ExpectValidationFailure("CB Validation: max<base", + R"({"max_open_duration_ms": 1000})", + "max_open_duration_ms must be >= base_open_duration_ms"); + ExpectValidationFailure("CB Validation: retry_budget_percent>100", + R"({"retry_budget_percent": 200})", + "retry_budget_percent must be in [0, 100]"); + ExpectValidationFailure("CB Validation: retry_budget_min_concurrency<0", + R"({"retry_budget_min_concurrency": -1})", + "retry_budget_min_concurrency must be >= 0"); + ExpectValidationFailure("CB Validation: max_ejection_percent>100", + R"({"max_ejection_percent_per_host_set": 150})", + "max_ejection_percent_per_host_set must be in [0, 100]"); + ExpectValidationFailure("CB Validation: permitted_half_open_calls<1", + R"({"permitted_half_open_calls": 0})", + "permitted_half_open_calls must be >= 1"); + } + + // Test 14: Equality operator covers circuit_breaker field + void TestCircuitBreakerEquality() { + std::cout << "\n[TEST] Circuit Breaker Equality..." 
<< std::endl; + try { + UpstreamConfig a; + a.name = "svc"; a.host = "h"; a.port = 80; + UpstreamConfig b = a; + bool equal_default = (a == b); + + b.circuit_breaker.enabled = true; + bool not_equal_after_diff = (a != b); + + bool pass = equal_default && not_equal_after_diff; + TestFramework::RecordTest("Circuit Breaker Equality", pass, + pass ? "" : "operator== failed for circuit_breaker", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("Circuit Breaker Equality", false, e.what(), + TestFramework::TestCategory::OTHER); + } + } + // Run all config tests void RunAllTests() { std::cout << "\n" << std::string(60, '=') << std::endl; @@ -362,6 +575,14 @@ namespace ConfigTests { TestValidationTlsNoCert(); TestEnvOverrides(); TestMissingFile(); + + // Phase 1: Circuit breaker config + TestCircuitBreakerDefaults(); + TestCircuitBreakerJsonParse(); + TestCircuitBreakerJsonPartial(); + TestCircuitBreakerJsonRoundTrip(); + TestCircuitBreakerValidation(); + TestCircuitBreakerEquality(); } } // namespace ConfigTests diff --git a/test/run_test.cc b/test/run_test.cc index 4edb0139..3d55f06f 100644 --- a/test/run_test.cc +++ b/test/run_test.cc @@ -13,6 +13,7 @@ #include "upstream_pool_test.h" #include "proxy_test.h" #include "rate_limit_test.h" +#include "circuit_breaker_test.h" #include "test_framework.h" #include #include @@ -77,6 +78,9 @@ void RunAllTest(){ // Run rate limit tests RateLimitTests::RunAllTests(); + // Run circuit breaker tests + CircuitBreakerTests::RunAllTests(); + std::cout << "====================================\n" << std::endl; } @@ -155,6 +159,9 @@ int main(int argc, char* argv[]) { // Run rate limit tests }else if(mode == "rate_limit" || mode == "-L"){ RateLimitTests::RunAllTests(); + // Run circuit breaker tests + }else if(mode == "circuit_breaker" || mode == "-B"){ + CircuitBreakerTests::RunAllTests(); // Show help }else if(mode == "help" || mode == "-h" || mode == "--help"){ PrintUsage(argv[0]); 
From 14921ec90244245eb53594bf8ea34206b635593c Mon Sep 17 00:00:00 2001 From: mwfj Date: Mon, 13 Apr 2026 17:09:09 +0800 Subject: [PATCH 02/37] Fix review comment --- Makefile | 7 +- .../circuit_breaker/circuit_breaker_slice.h | 19 ++ .../circuit_breaker/circuit_breaker_state.h | 6 +- server/circuit_breaker_slice.cc | 109 +++++++--- server/circuit_breaker_window.cc | 20 +- server/config_loader.cc | 21 +- test/circuit_breaker_test.h | 201 ++++++++++++++++++ test/config_test.h | 16 +- 8 files changed, 355 insertions(+), 44 deletions(-) diff --git a/Makefile b/Makefile index 8f4ec3f2..4dd6b83a 100644 --- a/Makefile +++ b/Makefile @@ -242,6 +242,11 @@ test_rate_limit: $(TARGET) @echo "Running rate limit tests only..." ./$(TARGET) rate_limit +# Run only circuit breaker tests +test_circuit_breaker: $(TARGET) + @echo "Running circuit breaker tests only..." + ./$(TARGET) circuit_breaker + # Display help information help: @echo "Reactor Server C++ - Makefile Help" @@ -322,4 +327,4 @@ help: # Build only the production server binary server: $(SERVER_TARGET) -.PHONY: all clean test server test_basic test_stress test_race test_config test_http test_ws test_tls test_cli test_http2 test_upstream test_proxy test_rate_limit help +.PHONY: all clean test server test_basic test_stress test_race test_config test_http test_ws test_tls test_cli test_http2 test_upstream test_proxy test_rate_limit test_circuit_breaker help diff --git a/include/circuit_breaker/circuit_breaker_slice.h b/include/circuit_breaker/circuit_breaker_slice.h index 5633c355..8d7af6d7 100644 --- a/include/circuit_breaker/circuit_breaker_slice.h +++ b/include/circuit_breaker/circuit_breaker_slice.h @@ -57,6 +57,12 @@ class CircuitBreakerSlice { int64_t Rejected() const { return rejected_.load(std::memory_order_relaxed); } int64_t ProbeSuccesses() const { return probe_successes_.load(std::memory_order_relaxed); } int64_t ProbeFailures() const { return probe_failures_.load(std::memory_order_relaxed); } + // Rejections 
specifically caused by HALF_OPEN being out of probe slots + // (subset of `Rejected()`). Lets dashboards distinguish "backoff has not + // elapsed" from "probing, no capacity left". + int64_t RejectedHalfOpenFull() const { + return rejected_half_open_full_.load(std::memory_order_relaxed); + } const std::string& host_label() const { return host_label_; } size_t dispatcher_index() const { return dispatcher_index_; } @@ -91,9 +97,17 @@ class CircuitBreakerSlice { // Observability counters. std::atomic trips_{0}; std::atomic rejected_{0}; + std::atomic rejected_half_open_full_{0}; std::atomic probe_successes_{0}; std::atomic probe_failures_{0}; + // One-shot flag: true after the slice has emitted a higher-level + // (info) log for the first rejection in the current OPEN/HALF_OPEN + // cycle. Reset on transition to CLOSED and on each fresh trip. Keeps + // per-request reject logs at debug while still surfacing the first + // post-trip reject in default-warn operator logs. Dispatcher-thread only. + bool first_reject_logged_for_open_ = false; + StateTransitionCallback transition_cb_; // Internal transitions (dispatcher-thread). @@ -102,6 +116,11 @@ class CircuitBreakerSlice { void TransitionHalfOpenToClosed(); void TripHalfOpenToOpen(const char* trigger); + // Emit the correct reject log line, bump counters, and return the matching + // Decision (enforce or dry-run). Used by both the OPEN (backoff active) + // and HALF_OPEN-full paths — keeps the three loggers/counters consistent. + Decision RejectWithLog(const char* state_label, bool half_open_full); + // Compute open duration for the current consecutive_trips_ value: // min(base * 2^consecutive_trips, max). Always >= base_open_duration_ms. 
std::chrono::nanoseconds ComputeOpenDuration() const; diff --git a/include/circuit_breaker/circuit_breaker_state.h b/include/circuit_breaker/circuit_breaker_state.h index 06fa695d..6a758a57 100644 --- a/include/circuit_breaker/circuit_breaker_state.h +++ b/include/circuit_breaker/circuit_breaker_state.h @@ -49,7 +49,11 @@ enum class FailureKind : uint8_t { // owning dispatcher thread. Callers can compare old/new to key off a // specific edge (e.g. CLOSED→OPEN fires wait-queue drain). // `trigger` is a short static string such as "consecutive" / "rate" / -// "probe_success" / "probe_failure" for logging. +// "probe_success" / "probe_fail" / "open_elapsed" for logging. +// +// TODO(phase-7): once a snapshot / admin JSON endpoint lands, convert +// `trigger` to an `enum class TransitionTrigger` so the valid set is +// compile-time checked rather than string-compared. See design doc §15.8. using StateTransitionCallback = std::function; diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index 5a30737b..3b794c65 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -25,14 +25,21 @@ std::chrono::steady_clock::time_point CircuitBreakerSlice::OpenUntil() const { return std::chrono::steady_clock::time_point(std::chrono::nanoseconds(ns)); } +// Cap the left-shift exponent used to compute open duration. `1 << 30` already +// covers ~12.4 days of base open duration even before the `max_open_duration_ms` +// clamp — higher shift amounts would invoke undefined behavior on `int`. +static constexpr int MAX_OPEN_DURATION_SHIFT = 30; + +// Scale factor for integer percent math: `fails * PERCENT_SCALE >= threshold * total`. +static constexpr int PERCENT_SCALE = 100; + std::chrono::nanoseconds CircuitBreakerSlice::ComputeOpenDuration() const { // Duration = base << consecutive_trips_ (shift expresses 2^n exponential). 
// `consecutive_trips_` is the number of trips observed BEFORE this one, so // the first trip uses 2^0 = 1x base, the second trip uses 2x, etc. // Callers must increment consecutive_trips_ AFTER calling this method. int trips = consecutive_trips_.load(std::memory_order_relaxed); - // Saturate shift at 30 to avoid UB on huge trip counts. - if (trips > 30) trips = 30; + if (trips > MAX_OPEN_DURATION_SHIFT) trips = MAX_OPEN_DURATION_SHIFT; int64_t base_ms = config_.base_open_duration_ms; int64_t max_ms = config_.max_open_duration_ms; int64_t scaled_ms = base_ms << trips; @@ -50,8 +57,9 @@ bool CircuitBreakerSlice::ShouldTripClosed() { int64_t total = window_.TotalCount(now); if (total < config_.minimum_volume) return false; int64_t fails = window_.FailureCount(now); - // Compare without floating point: fails * 100 >= threshold * total. - return (fails * 100) >= (static_cast(config_.failure_rate_threshold) * total); + // Integer percent math: fails * PERCENT_SCALE >= threshold_pct * total. + return (fails * PERCENT_SCALE) >= + (static_cast(config_.failure_rate_threshold) * total); } void CircuitBreakerSlice::TripClosedToOpen(const char* trigger) { @@ -71,6 +79,7 @@ void CircuitBreakerSlice::TripClosedToOpen(const char* trigger) { half_open_inflight_ = 0; half_open_successes_ = 0; half_open_saw_failure_ = false; + first_reject_logged_for_open_ = false; trips_.fetch_add(1, std::memory_order_relaxed); @@ -101,6 +110,12 @@ void CircuitBreakerSlice::TransitionOpenToHalfOpen() { } void CircuitBreakerSlice::TransitionHalfOpenToClosed() { + // Capture actual probes-succeeded BEFORE resetting — the log then reflects + // reality instead of the configured target (the two are equal at the moment + // of transition today, but relying on that is brittle if the transition + // logic ever changes). 
+ int probes_succeeded = half_open_successes_; + state_.store(State::CLOSED, std::memory_order_release); open_until_steady_ns_.store(0, std::memory_order_release); consecutive_trips_.store(0, std::memory_order_relaxed); @@ -109,10 +124,11 @@ void CircuitBreakerSlice::TransitionHalfOpenToClosed() { half_open_inflight_ = 0; half_open_successes_ = 0; half_open_saw_failure_ = false; + first_reject_logged_for_open_ = false; logging::Get()->info( "circuit breaker closed {} probes_succeeded={}", - host_label_, config_.permitted_half_open_calls); + host_label_, probes_succeeded); if (transition_cb_) { transition_cb_(State::HALF_OPEN, State::CLOSED, "probe_success"); @@ -134,6 +150,7 @@ void CircuitBreakerSlice::TripHalfOpenToOpen(const char* trigger) { half_open_inflight_ = 0; half_open_successes_ = 0; half_open_saw_failure_ = false; + first_reject_logged_for_open_ = false; trips_.fetch_add(1, std::memory_order_relaxed); @@ -165,31 +182,20 @@ Decision CircuitBreakerSlice::TryAcquire() { TransitionOpenToHalfOpen(); s = State::HALF_OPEN; } else { - rejected_.fetch_add(1, std::memory_order_relaxed); - if (config_.dry_run) { - logging::Get()->info( - "[dry-run] circuit breaker would reject {} state=open", - host_label_); - return Decision::REJECTED_OPEN_DRYRUN; - } - logging::Get()->debug( - "circuit breaker rejected {} state=open", host_label_); - return Decision::REJECTED_OPEN; + return RejectWithLog("open", /*half_open_full=*/false); } } if (s == State::HALF_OPEN) { - if (half_open_inflight_ >= config_.permitted_half_open_calls) { - rejected_.fetch_add(1, std::memory_order_relaxed); - if (config_.dry_run) { - logging::Get()->info( - "[dry-run] circuit breaker would reject {} state=half_open_full", - host_label_); - return Decision::REJECTED_OPEN_DRYRUN; - } - logging::Get()->debug( - "circuit breaker rejected {} state=half_open_full", host_label_); - return Decision::REJECTED_OPEN; + // Short-circuit as soon as any probe has failed: the breaker is + // guaranteed to re-trip 
once the remaining in-flight probes drain, so + // admitting more probes just wastes capacity on a known-bad upstream. + // Previously this path kept admitting probes until `permitted_half_open_calls` + // in-flight was reached, which under continued failure could keep + // traffic flowing indefinitely instead of converging back to OPEN. + if (half_open_saw_failure_ || + half_open_inflight_ >= config_.permitted_half_open_calls) { + return RejectWithLog("half_open_full", /*half_open_full=*/true); } half_open_inflight_++; return Decision::ADMITTED_PROBE; @@ -199,6 +205,41 @@ Decision CircuitBreakerSlice::TryAcquire() { return Decision::ADMITTED; } +Decision CircuitBreakerSlice::RejectWithLog(const char* state_label, + bool half_open_full) { + rejected_.fetch_add(1, std::memory_order_relaxed); + if (half_open_full) { + rejected_half_open_full_.fetch_add(1, std::memory_order_relaxed); + } + // First reject in this OPEN/HALF_OPEN cycle is info — gives operators + // looking at a flurry of 503s a single high-level breadcrumb in default- + // warn logs without flooding them. Subsequent rejects are debug. 
+ const bool first = !first_reject_logged_for_open_; + if (first) first_reject_logged_for_open_ = true; + + if (config_.dry_run) { + if (first) { + logging::Get()->info( + "[dry-run] circuit breaker would reject {} state={}", + host_label_, state_label); + } else { + logging::Get()->debug( + "[dry-run] circuit breaker would reject {} state={}", + host_label_, state_label); + } + return Decision::REJECTED_OPEN_DRYRUN; + } + if (first) { + logging::Get()->info( + "circuit breaker rejecting {} state={} (first reject this cycle)", + host_label_, state_label); + } else { + logging::Get()->debug( + "circuit breaker rejected {} state={}", host_label_, state_label); + } + return Decision::REJECTED_OPEN; +} + void CircuitBreakerSlice::ReportSuccess(bool probe) { if (!config_.enabled) return; @@ -222,7 +263,12 @@ void CircuitBreakerSlice::ReportSuccess(bool probe) { return; } - // CLOSED success: reset consecutive counter, record in window. + // Non-probe success: only meaningful when state is CLOSED. If the slice + // has since transitioned (e.g., other requests in this burst tripped it), + // this late outcome must NOT retroactively reset `consecutive_failures_` + // or pollute the window — a fresh CLOSED cycle after recovery would start + // with bogus success history. + if (state_.load(std::memory_order_acquire) != State::CLOSED) return; consecutive_failures_ = 0; window_.AddSuccess(Now()); } @@ -243,7 +289,14 @@ void CircuitBreakerSlice::ReportFailure(FailureKind kind, bool probe) { return; } - // CLOSED failure path. + // Non-probe failure: only count when CLOSED. Late failures from requests + // admitted in CLOSED but completing after a trip must NOT re-enter + // `TripClosedToOpen` — doing so double-increments `consecutive_trips_` + // (inflating open_duration) and fires a spurious CLOSED→OPEN transition + // edge that downstream consumers (wait-queue drain, snapshot telemetry) + // would see as a ghost trip. 
+ if (state_.load(std::memory_order_acquire) != State::CLOSED) return; + consecutive_failures_++; window_.AddFailure(Now()); diff --git a/server/circuit_breaker_window.cc b/server/circuit_breaker_window.cc index 14ea34a5..06fccc18 100644 --- a/server/circuit_breaker_window.cc +++ b/server/circuit_breaker_window.cc @@ -2,6 +2,19 @@ namespace circuit_breaker { +// Map an epoch-second value into a non-negative bucket index. C++ built-in `%` +// can return a negative result when the dividend is negative — and while +// `steady_clock::time_since_epoch()` is zero-based on all mainstream +// libstdc++/libc++ implementations, the standard does not strictly guarantee a +// non-negative epoch across every implementation. The extra `+ w` and second +// `% w` costs a single add + mod on the slow (negative) branch, zero observable +// overhead on the common positive branch after the compiler eliminates the +// redundant math. +static inline size_t BucketIndex(int64_t epoch_sec, int window_seconds) { + const int64_t w = window_seconds; + return static_cast(((epoch_sec % w) + w) % w); +} + CircuitBreakerWindow::CircuitBreakerWindow(int window_seconds) : window_seconds_(window_seconds), buckets_(window_seconds > 0 ? static_cast(window_seconds) : 1) { @@ -26,7 +39,7 @@ void CircuitBreakerWindow::Advance(int64_t now_sec) { } else { // Zero buckets from head+1..now_sec inclusive. 
for (int64_t s = head_epoch_sec_ + 1; s <= now_sec; ++s) { - size_t idx = static_cast(s % window_seconds_); + size_t idx = BucketIndex(s, window_seconds_); buckets_[idx].total = 0; buckets_[idx].failures = 0; } @@ -38,15 +51,14 @@ void CircuitBreakerWindow::AddSuccess( std::chrono::steady_clock::time_point now) { int64_t now_sec = ToEpochSec(now); Advance(now_sec); - size_t idx = static_cast(now_sec % window_seconds_); - buckets_[idx].total++; + buckets_[BucketIndex(now_sec, window_seconds_)].total++; } void CircuitBreakerWindow::AddFailure( std::chrono::steady_clock::time_point now) { int64_t now_sec = ToEpochSec(now); Advance(now_sec); - size_t idx = static_cast(now_sec % window_seconds_); + size_t idx = BucketIndex(now_sec, window_seconds_); buckets_[idx].total++; buckets_[idx].failures++; } diff --git a/server/config_loader.cc b/server/config_loader.cc index c17a544d..f6ff4698 100644 --- a/server/config_loader.cc +++ b/server/config_loader.cc @@ -822,33 +822,40 @@ void ConfigLoader::Validate(const ServerConfig& config) { "'): proxy.retry.max_retries must be >= 0 and <= 10"); } - // Circuit breaker validation + // Circuit breaker validation. + // + // Upper bounds on counting fields are generous — they exist to + // catch pathological configs (typo like "10_000_000_000" or a + // missing unit conversion), not to constrain legitimate tuning. + // Defaults are 5 / 20 / 5; limits are 1000× to 50000× the defaults. 
{ const auto& cb = u.circuit_breaker; - if (cb.consecutive_failure_threshold < 1) { + if (cb.consecutive_failure_threshold < 1 || + cb.consecutive_failure_threshold > 10000) { throw std::invalid_argument( idx + " ('" + u.name + - "'): circuit_breaker.consecutive_failure_threshold must be >= 1"); + "'): circuit_breaker.consecutive_failure_threshold must be in [1, 10000]"); } if (cb.failure_rate_threshold < 0 || cb.failure_rate_threshold > 100) { throw std::invalid_argument( idx + " ('" + u.name + "'): circuit_breaker.failure_rate_threshold must be in [0, 100]"); } - if (cb.minimum_volume < 1) { + if (cb.minimum_volume < 1 || cb.minimum_volume > 10000000) { throw std::invalid_argument( idx + " ('" + u.name + - "'): circuit_breaker.minimum_volume must be >= 1"); + "'): circuit_breaker.minimum_volume must be in [1, 10000000]"); } if (cb.window_seconds < 1 || cb.window_seconds > 3600) { throw std::invalid_argument( idx + " ('" + u.name + "'): circuit_breaker.window_seconds must be in [1, 3600]"); } - if (cb.permitted_half_open_calls < 1) { + if (cb.permitted_half_open_calls < 1 || + cb.permitted_half_open_calls > 1000) { throw std::invalid_argument( idx + " ('" + u.name + - "'): circuit_breaker.permitted_half_open_calls must be >= 1"); + "'): circuit_breaker.permitted_half_open_calls must be in [1, 1000]"); } if (cb.base_open_duration_ms < 100) { throw std::invalid_argument( diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h index bd932a28..f8b265d7 100644 --- a/test/circuit_breaker_test.h +++ b/test/circuit_breaker_test.h @@ -578,6 +578,203 @@ void TestSuccessClearsConsecutive() { } } +// ============================================================================ +// Regression tests — critical bugs caught in code review +// ============================================================================ + +// BUG: late non-probe failure after trip re-entered TripClosedToOpen, inflating +// consecutive_trips_ (→ longer backoff) and firing a spurious 
CLOSED→OPEN +// transition edge. Fix: guard ReportFailure(probe=false) on state_ == CLOSED. +void TestLateFailureAfterTripDoesNotInflateBackoff() { + std::cout << "\n[TEST] CB: late failure after trip does not inflate backoff..." + << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.base_open_duration_ms = 1000; + cb.max_open_duration_ms = 60000; + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Admit 10 requests in CLOSED. Slice state is single-threaded so + // admission + bookkeeping is serialized by the event loop — but in + // production the outcomes for those admitted requests can arrive after + // the slice has already tripped. + for (int i = 0; i < 10; ++i) { + Decision d = slice.TryAcquire(); + if (d != Decision::ADMITTED) { + TestFramework::RecordTest("CB: late failure after trip", + false, "admission i=" + std::to_string(i) + " not ADMITTED", + TestFramework::TestCategory::OTHER); + return; + } + } + // Report 5 failures — trip at the 5th. + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + if (slice.CurrentState() != State::OPEN) { + TestFramework::RecordTest("CB: late failure after trip", false, + "expected OPEN after 5 failures", + TestFramework::TestCategory::OTHER); + return; + } + int64_t trips_after_first_trip = slice.Trips(); + // Capture open_until immediately post-trip. + auto open_until_initial = slice.OpenUntil(); + + // Now the remaining 5 in-flight requests land with late failures. + // Before the fix, each of these would go through the CLOSED path, + // climb consecutive_failures_, and trigger another TripClosedToOpen + // even though state is already OPEN. 
+ for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + int64_t trips_after_late = slice.Trips(); + auto open_until_after_late = slice.OpenUntil(); + + bool pass = slice.CurrentState() == State::OPEN && + trips_after_late == trips_after_first_trip && // no ghost trip + open_until_after_late == open_until_initial; // backoff unchanged + TestFramework::RecordTest( + "CB: late failure after trip does not inflate backoff", + pass, pass ? "" : + "trips: " + std::to_string(trips_after_first_trip) + + " → " + std::to_string(trips_after_late), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: late failure after trip does not inflate backoff", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// BUG: late non-probe success after trip would reset consecutive_failures_ +// and pollute the sliding window (pretending a fresh CLOSED cycle observed +// successes). Fix: guard ReportSuccess(probe=false) on state_ == CLOSED. +void TestLateSuccessAfterTripIgnored() { + std::cout << "\n[TEST] CB: late success after trip ignored..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + // Slice is OPEN now. A late success arrives — must not change state. 
+ State pre = slice.CurrentState(); + slice.ReportSuccess(false); + bool pass = pre == State::OPEN && slice.CurrentState() == State::OPEN; + TestFramework::RecordTest("CB: late success after trip ignored", pass, + "", TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: late success after trip ignored", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// BUG: HALF_OPEN admission kept accepting probes after the first probe +// failure (only enforcing `inflight < permitted`), so under load a failed +// recovery cycle could keep leaking traffic indefinitely instead of re-OPENing +// after the in-flight probes drained. Fix: short-circuit on saw_failure. +void TestHalfOpenStopsAdmittingAfterFirstProbeFailure() { + std::cout << "\n[TEST] CB: HALF_OPEN stops admitting after probe fail..." + << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.permitted_half_open_calls = 5; + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Trip the breaker. + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + + // Admit 2 probes. Report failure on the first (but NOT the second yet + // — leave 1 in-flight so we can observe the short-circuit). + Decision d1 = slice.TryAcquire(); // ADMITTED_PROBE, inflight=1 + Decision d2 = slice.TryAcquire(); // ADMITTED_PROBE, inflight=2 + if (d1 != Decision::ADMITTED_PROBE || d2 != Decision::ADMITTED_PROBE) { + TestFramework::RecordTest( + "CB: HALF_OPEN stops admitting after probe fail", + false, "probes not admitted as expected", + TestFramework::TestCategory::OTHER); + return; + } + // Fail the first probe — inflight drops to 1, saw_failure=true. + // Last-probe trip does not yet fire (inflight is still 1). 
+ slice.ReportFailure(FailureKind::RESPONSE_5XX, true); + + // State must still be HALF_OPEN (final probe not yet completed). + State mid = slice.CurrentState(); + + // Subsequent TryAcquire — BEFORE fix this would succeed because + // inflight (1) < permitted (5). AFTER fix it short-circuits because + // saw_failure is set. + Decision d3 = slice.TryAcquire(); + + bool pass = mid == State::HALF_OPEN && + d3 == Decision::REJECTED_OPEN; + TestFramework::RecordTest( + "CB: HALF_OPEN stops admitting after probe fail", + pass, pass ? "" : "expected REJECTED_OPEN on 3rd TryAcquire", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: HALF_OPEN stops admitting after probe fail", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Verifies the dedicated HALF_OPEN-full counter is bumped separately from the +// generic `rejected_` counter, so Phase 7 snapshots can distinguish +// "open, backoff not elapsed" from "probing, no slots left". +void TestHalfOpenFullCounterSeparate() { + std::cout << "\n[TEST] CB: HALF_OPEN_FULL counter separate..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.permitted_half_open_calls = 2; + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Trip → OPEN reject increments generic counter only. + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + slice.TryAcquire(); // REJECTED_OPEN (backoff active) + int64_t rejected_open_only = slice.Rejected(); + int64_t half_open_full_open_only = slice.RejectedHalfOpenFull(); + + // Elapse backoff → HALF_OPEN. Fill the probe budget, then a 3rd + // TryAcquire rejects with half_open_full, incrementing both counters. 
+ clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + slice.TryAcquire(); // probe 1 admitted + slice.TryAcquire(); // probe 2 admitted (budget full) + slice.TryAcquire(); // REJECTED (full) + int64_t rejected_total = slice.Rejected(); + int64_t half_open_full_total = slice.RejectedHalfOpenFull(); + + bool pass = rejected_open_only == 1 && + half_open_full_open_only == 0 && + rejected_total == 2 && // 1 OPEN + 1 HALF_OPEN_FULL + half_open_full_total == 1; // only the HALF_OPEN one + TestFramework::RecordTest("CB: HALF_OPEN_FULL counter separate", + pass, pass ? "" : + "rej=" + std::to_string(rejected_total) + + " hof=" + std::to_string(half_open_full_total), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB: HALF_OPEN_FULL counter separate", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + void TestTransitionCallbackInvoked() { std::cout << "\n[TEST] CB: transition callback invoked..." << std::endl; try { @@ -641,6 +838,10 @@ void RunAllTests() { TestReloadPreservesState(); TestConsecutiveThresholdOne(); TestSuccessClearsConsecutive(); + TestLateFailureAfterTripDoesNotInflateBackoff(); + TestLateSuccessAfterTripIgnored(); + TestHalfOpenStopsAdmittingAfterFirstProbeFailure(); + TestHalfOpenFullCounterSeparate(); TestTransitionCallbackInvoked(); } diff --git a/test/config_test.h b/test/config_test.h index 213fd8ac..94c60763 100644 --- a/test/config_test.h +++ b/test/config_test.h @@ -506,13 +506,13 @@ namespace ConfigTests { std::cout << "\n[TEST] Circuit Breaker Validation..." 
<< std::endl; ExpectValidationFailure("CB Validation: consecutive_failure_threshold<1", R"({"consecutive_failure_threshold": 0})", - "consecutive_failure_threshold must be >= 1"); + "consecutive_failure_threshold must be in [1, 10000]"); ExpectValidationFailure("CB Validation: failure_rate_threshold>100", R"({"failure_rate_threshold": 101})", "failure_rate_threshold must be in [0, 100]"); ExpectValidationFailure("CB Validation: minimum_volume<1", R"({"minimum_volume": 0})", - "minimum_volume must be >= 1"); + "minimum_volume must be in [1, 10000000]"); ExpectValidationFailure("CB Validation: window_seconds<1", R"({"window_seconds": 0})", "window_seconds must be in [1, 3600]"); @@ -536,7 +536,17 @@ namespace ConfigTests { "max_ejection_percent_per_host_set must be in [0, 100]"); ExpectValidationFailure("CB Validation: permitted_half_open_calls<1", R"({"permitted_half_open_calls": 0})", - "permitted_half_open_calls must be >= 1"); + "permitted_half_open_calls must be in [1, 1000]"); + // Upper-bound regressions — pathological configs must be rejected. 
+ ExpectValidationFailure("CB Validation: consecutive_failure_threshold>10000", + R"({"consecutive_failure_threshold": 10001})", + "consecutive_failure_threshold must be in [1, 10000]"); + ExpectValidationFailure("CB Validation: minimum_volume>10000000", + R"({"minimum_volume": 10000001})", + "minimum_volume must be in [1, 10000000]"); + ExpectValidationFailure("CB Validation: permitted_half_open_calls>1000", + R"({"permitted_half_open_calls": 1001})", + "permitted_half_open_calls must be in [1, 1000]"); } // Test 14: Equality operator covers circuit_breaker field From 8c7a64ba9ef3f01d23c3a2d39237e97de2a4af20 Mon Sep 17 00:00:00 2001 From: mwfj Date: Mon, 13 Apr 2026 19:38:49 +0800 Subject: [PATCH 03/37] Fix review comment --- Makefile | 2 +- server/circuit_breaker_slice.cc | 91 +++++++++++++--- test/circuit_breaker_test.h | 185 ++++++++++++++++++++++++++++++++ 3 files changed, 262 insertions(+), 16 deletions(-) diff --git a/Makefile b/Makefile index 4dd6b83a..23a46ce0 100644 --- a/Makefile +++ b/Makefile @@ -147,7 +147,7 @@ UPSTREAM_HEADERS = $(LIB_DIR)/upstream/upstream_manager.h $(LIB_DIR)/upstream/up RATE_LIMIT_HEADERS = $(LIB_DIR)/rate_limit/token_bucket.h $(LIB_DIR)/rate_limit/rate_limit_zone.h $(LIB_DIR)/rate_limit/rate_limiter.h CIRCUIT_BREAKER_HEADERS = $(LIB_DIR)/circuit_breaker/circuit_breaker_state.h $(LIB_DIR)/circuit_breaker/circuit_breaker_window.h $(LIB_DIR)/circuit_breaker/circuit_breaker_slice.h CLI_HEADERS = $(LIB_DIR)/cli/cli_parser.h $(LIB_DIR)/cli/signal_handler.h $(LIB_DIR)/cli/pid_file.h $(LIB_DIR)/cli/version.h $(LIB_DIR)/cli/daemonizer.h -TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h 
$(TEST_DIR)/proxy_test.h +TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h $(TEST_DIR)/proxy_test.h $(TEST_DIR)/rate_limit_test.h $(TEST_DIR)/kqueue_test.h $(TEST_DIR)/circuit_breaker_test.h # All headers combined HEADERS = $(CORE_HEADERS) $(CALLBACK_HEADERS) $(REACTOR_HEADERS) $(NETWORK_HEADERS) $(SERVER_HEADERS) $(THREAD_POOL_HEADERS) $(UTIL_HEADERS) $(FOUNDATION_HEADERS) $(HTTP_HEADERS) $(HTTP2_HEADERS) $(WS_HEADERS) $(TLS_HEADERS) $(UPSTREAM_HEADERS) $(RATE_LIMIT_HEADERS) $(CIRCUIT_BREAKER_HEADERS) $(CLI_HEADERS) $(TEST_HEADERS) diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index 3b794c65..17f6113d 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -99,6 +99,12 @@ void CircuitBreakerSlice::TransitionOpenToHalfOpen() { half_open_inflight_ = 0; half_open_successes_ = 0; half_open_saw_failure_ = false; + // Reset the info-log "first reject" breadcrumb so the first rejection + // observed in the HALF_OPEN phase surfaces at info, not debug. HALF_OPEN + // rejection (recovery attempt failing or probe budget full) is + // operationally distinct from OPEN rejection (still backing off) and + // deserves its own breadcrumb in default-warn operator logs. 
+ first_reject_logged_for_open_ = false; logging::Get()->info( "circuit breaker half-open {} probes_allowed={}", @@ -187,14 +193,20 @@ Decision CircuitBreakerSlice::TryAcquire() { } if (s == State::HALF_OPEN) { - // Short-circuit as soon as any probe has failed: the breaker is - // guaranteed to re-trip once the remaining in-flight probes drain, so - // admitting more probes just wastes capacity on a known-bad upstream. - // Previously this path kept admitting probes until `permitted_half_open_calls` - // in-flight was reached, which under continued failure could keep - // traffic flowing indefinitely instead of converging back to OPEN. - if (half_open_saw_failure_ || - half_open_inflight_ >= config_.permitted_half_open_calls) { + // Case A: a sibling probe already failed. Short-circuit remaining + // admissions — the breaker is guaranteed to re-trip once in-flight + // probes drain. This is operationally DIFFERENT from "budget + // exhausted" (case B): probe slots may still be free, we just know + // using them can't change the outcome. Track it with its own log + // label and do NOT bump `rejected_half_open_full_` — that counter + // is specifically "probing, no capacity left" for dashboards. + if (half_open_saw_failure_) { + return RejectWithLog("half_open_recovery_failing", + /*half_open_full=*/false); + } + // Case B: probe budget fully in flight. "No capacity" — bump the + // dedicated counter so dashboards can tell these two apart. + if (half_open_inflight_ >= config_.permitted_half_open_calls) { return RejectWithLog("half_open_full", /*half_open_full=*/true); } half_open_inflight_++; @@ -244,9 +256,20 @@ void CircuitBreakerSlice::ReportSuccess(bool probe) { if (!config_.enabled) return; if (probe) { + // Record the completed-probe outcome for observability regardless of + // current state — this is a signal about upstream behavior, not a + // signal about our state machine. 
probe_successes_.fetch_add(1, std::memory_order_relaxed); - // Count the completed probe regardless of saw_failure state (we still - // decrement inflight to release the slot). + + // Stale probe defense: we admitted this probe in HALF_OPEN, but the + // slice may have transitioned out (e.g., `Reload()` flipped enabled + // or resized the window, `TransitionHalfOpenToClosed` already fired + // on sibling probes, or — post-Phase 8 — an operator toggle + // transitioned us to CLOSED). Only touch HALF_OPEN bookkeeping / + // fire transitions when state is STILL HALF_OPEN. Otherwise the + // probe is informational only. + if (state_.load(std::memory_order_acquire) != State::HALF_OPEN) return; + if (half_open_inflight_ > 0) half_open_inflight_--; if (half_open_saw_failure_) { // A sibling probe already failed; whichever probe finishes last @@ -280,6 +303,10 @@ void CircuitBreakerSlice::ReportFailure(FailureKind kind, bool probe) { if (probe) { probe_failures_.fetch_add(1, std::memory_order_relaxed); + + // Stale probe defense — see matching comment in ReportSuccess above. + if (state_.load(std::memory_order_acquire) != State::HALF_OPEN) return; + if (half_open_inflight_ > 0) half_open_inflight_--; half_open_saw_failure_ = true; // On the last probe (or if all remaining complete) transition OPEN. @@ -309,18 +336,52 @@ void CircuitBreakerSlice::ReportFailure(FailureKind kind, bool probe) { } void CircuitBreakerSlice::Reload(const CircuitBreakerConfig& new_config) { - bool window_changed = (config_.window_seconds != new_config.window_seconds); + const bool enabled_changed = (config_.enabled != new_config.enabled); + const bool window_changed = + (config_.window_seconds != new_config.window_seconds); + config_ = new_config; if (window_changed) window_.Resize(new_config.window_seconds); - // Live state preserved — operator expects new thresholds to apply to the - // next evaluation, not to reset an in-progress trip. 
+ + if (enabled_changed) { + // Toggling `enabled` is an operator intent to start fresh, not a + // runtime state transition. Without this reset: + // - Disabling while OPEN and re-enabling later would resume the + // OPEN state and reject requests even though the operator + // explicitly turned the breaker off and back on. + // - Disabling while HALF_OPEN with in-flight probes would leave + // inconsistent bookkeeping (inflight > 0, state=HALF_OPEN) that + // a subsequent enable would interpret as live probes. + // - Disabling mid-CLOSED-cycle and re-enabling would trip on the + // very next failure because consecutive_failures_ persisted. + // Matches design doc §10.1 (enabled→disabled / disabled→enabled + // transitions both get a clean CLOSED start). + // + // Silent reset — no transition callback. The change is operator- + // initiated configuration, not a runtime state signal; firing the + // callback would cause PoolPartition::DrainWaitQueueOnTrip-style + // consumers (Phase 6) to spuriously drain waiters on a config edit. + state_.store(State::CLOSED, std::memory_order_release); + open_until_steady_ns_.store(0, std::memory_order_release); + consecutive_trips_.store(0, std::memory_order_relaxed); + consecutive_failures_ = 0; + window_.Reset(); + half_open_inflight_ = 0; + half_open_successes_ = 0; + half_open_saw_failure_ = false; + first_reject_logged_for_open_ = false; + } + // When `enabled` is unchanged: live state preserved — operator expects + // new thresholds to apply to the next evaluation, not to reset an + // in-progress trip. logging::Get()->info( "circuit breaker config applied {} enabled={} window_s={} " - "fail_rate={} consec_threshold={}", + "fail_rate={} consec_threshold={}{}", host_label_, new_config.enabled, new_config.window_seconds, new_config.failure_rate_threshold, - new_config.consecutive_failure_threshold); + new_config.consecutive_failure_threshold, + enabled_changed ? 
" (enabled toggled — state reset to CLOSED)" : ""); } void CircuitBreakerSlice::SetTransitionCallback(StateTransitionCallback cb) { diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h index f8b265d7..d39ab52a 100644 --- a/test/circuit_breaker_test.h +++ b/test/circuit_breaker_test.h @@ -775,6 +775,187 @@ void TestHalfOpenFullCounterSeparate() { } } +// BUG (review round 2, P2): Reload preserved stale state across enabled +// toggles. Disabling while OPEN and re-enabling later resumed the OPEN state, +// rejecting requests despite an explicit operator off→on cycle. Disabling +// after accumulated consecutive failures would re-trip on the very next +// failure. Fix: reset state to CLOSED whenever enabled toggles. +void TestReloadResetsStateOnEnabledToggleWhileOpen() { + std::cout << "\n[TEST] CB: reload resets state on enabled toggle (while OPEN)..." + << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Drive to OPEN. + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + if (slice.CurrentState() != State::OPEN) { + TestFramework::RecordTest( + "CB: reload resets state on enabled toggle (OPEN)", false, + "precondition: slice not OPEN", + TestFramework::TestCategory::OTHER); + return; + } + + // Disable via reload — state must reset to CLOSED. + auto disabled = cb; + disabled.enabled = false; + slice.Reload(disabled); + bool disabled_closed = slice.CurrentState() == State::CLOSED; + + // Re-enable via reload — state must remain CLOSED (no stale OPEN). + slice.Reload(cb); + bool reenabled_closed = slice.CurrentState() == State::CLOSED; + + // And the slice must NOT insta-trip on a single failure (pre-fix, + // consecutive_failures_ could have persisted ≥ threshold). 
+ slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + bool one_fail_no_trip = slice.CurrentState() == State::CLOSED; + + bool pass = disabled_closed && reenabled_closed && one_fail_no_trip; + TestFramework::RecordTest( + "CB: reload resets state on enabled toggle (OPEN)", pass, + pass ? "" : "disabled_closed=" + std::to_string(disabled_closed) + + " reenabled_closed=" + std::to_string(reenabled_closed) + + " one_fail_no_trip=" + std::to_string(one_fail_no_trip), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: reload resets state on enabled toggle (OPEN)", false, e.what(), + TestFramework::TestCategory::OTHER); + } +} + +// BUG (review round 2, P2, variant): if disable happens while +// consecutive_failures_ has accumulated but not yet tripped, re-enable would +// inherit that count and trip early on the next failure. +void TestReloadResetsConsecutiveFailuresOnEnabledToggle() { + std::cout << "\n[TEST] CB: reload clears consecutive_failures on enable toggle..." + << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.consecutive_failure_threshold = 5; + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // 4 failures — just under threshold. State still CLOSED. + for (int i = 0; i < 4; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + if (slice.CurrentState() != State::CLOSED) { + TestFramework::RecordTest( + "CB: reload clears consecutive_failures", false, + "precondition: slice not CLOSED", + TestFramework::TestCategory::OTHER); + return; + } + + // Disable then re-enable. + auto disabled = cb; disabled.enabled = false; + slice.Reload(disabled); + slice.Reload(cb); + + // A single failure post-reenable must NOT trip — consecutive_failures_ + // should have been reset to 0, not preserved at 4. 
+ slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + bool pass = slice.CurrentState() == State::CLOSED; + TestFramework::RecordTest( + "CB: reload clears consecutive_failures on enable toggle", + pass, + pass ? "" : "expected CLOSED after 1 post-reenable failure, got " + + std::string(circuit_breaker::StateName(slice.CurrentState())), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: reload clears consecutive_failures on enable toggle", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Threshold-change-only reload (enabled unchanged) MUST preserve live state +// per design §10. Regression guard for fix #1. +void TestReloadThresholdChangePreservesState() { + std::cout << "\n[TEST] CB: reload preserves state when only thresholds change..." + << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + // OPEN. Reload with a tighter threshold but enabled unchanged. + auto tighter = cb; + tighter.consecutive_failure_threshold = 2; + slice.Reload(tighter); + // State must remain OPEN — live state preservation. + bool pass = slice.CurrentState() == State::OPEN; + TestFramework::RecordTest( + "CB: reload preserves state on threshold-only change", + pass, pass ? 
"" : "expected OPEN, got " + + std::string(circuit_breaker::StateName(slice.CurrentState())), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: reload preserves state on threshold-only change", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +// BUG (review round 2, P3): saw_failure short-circuit incorrectly bumped the +// HALF_OPEN_FULL counter, polluting dashboards that need to distinguish +// "probing, no capacity left" from "recovery attempt is failing". +void TestSawFailureDoesNotBumpHalfOpenFullCounter() { + std::cout << "\n[TEST] CB: saw_failure reject does not bump HALF_OPEN_FULL..." + << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.permitted_half_open_calls = 5; // plenty of capacity + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + + // Admit 2 probes, fail the first — saw_failure=true, inflight=1. + slice.TryAcquire(); // probe 1 admitted + slice.TryAcquire(); // probe 2 admitted + slice.ReportFailure(FailureKind::RESPONSE_5XX, true); + + int64_t hof_before = slice.RejectedHalfOpenFull(); + // Reject via saw_failure short-circuit (capacity is NOT exhausted — + // only 1 probe actually in flight, and permitted is 5). + Decision d = slice.TryAcquire(); + int64_t hof_after = slice.RejectedHalfOpenFull(); + + // Still REJECTED_OPEN (same client-visible outcome), but + // RejectedHalfOpenFull must NOT be incremented — this is a + // "recovery failing" reject, not a capacity reject. + bool pass = d == Decision::REJECTED_OPEN && + hof_before == 0 && + hof_after == 0; + TestFramework::RecordTest( + "CB: saw_failure reject does not bump HALF_OPEN_FULL", + pass, pass ? 
"" : "hof_before=" + std::to_string(hof_before) + + " hof_after=" + std::to_string(hof_after), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: saw_failure reject does not bump HALF_OPEN_FULL", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + void TestTransitionCallbackInvoked() { std::cout << "\n[TEST] CB: transition callback invoked..." << std::endl; try { @@ -842,6 +1023,10 @@ void RunAllTests() { TestLateSuccessAfterTripIgnored(); TestHalfOpenStopsAdmittingAfterFirstProbeFailure(); TestHalfOpenFullCounterSeparate(); + TestReloadResetsStateOnEnabledToggleWhileOpen(); + TestReloadResetsConsecutiveFailuresOnEnabledToggle(); + TestReloadThresholdChangePreservesState(); + TestSawFailureDoesNotBumpHalfOpenFullCounter(); TestTransitionCallbackInvoked(); } From 6d5cac69ce333ca23b226d0459b168f37d24c54d Mon Sep 17 00:00:00 2001 From: mwfj Date: Mon, 13 Apr 2026 20:45:46 +0800 Subject: [PATCH 04/37] Fix review comment --- .../circuit_breaker/circuit_breaker_slice.h | 55 ++- server/circuit_breaker_slice.cc | 133 ++++-- test/circuit_breaker_test.h | 404 +++++++++++++++--- 3 files changed, 499 insertions(+), 93 deletions(-) diff --git a/include/circuit_breaker/circuit_breaker_slice.h b/include/circuit_breaker/circuit_breaker_slice.h index 8d7af6d7..4ba1331d 100644 --- a/include/circuit_breaker/circuit_breaker_slice.h +++ b/include/circuit_breaker/circuit_breaker_slice.h @@ -30,16 +30,34 @@ class CircuitBreakerSlice { CircuitBreakerSlice(const CircuitBreakerSlice&) = delete; CircuitBreakerSlice& operator=(const CircuitBreakerSlice&) = delete; + // Return value of TryAcquire. `generation` is a monotonically-increasing + // token identifying which state-machine cycle the admission belongs to. + // Callers MUST pass it back to Report*() unchanged so the slice can drop + // late completions that belong to a prior cycle (crossed a state + // transition or a Reload()-reset boundary). 
Without this, stale + // completions can pollute the bookkeeping of a fresh CLOSED/HALF_OPEN + // cycle (e.g., a pre-toggle failure incrementing the post-toggle + // consecutive_failures_, or a pre-CLOSED'-cycle success wiping a + // legitimate post-CLOSED' counter). + struct Admission { + Decision decision; + uint64_t generation; + }; + // Hot-path decision. Consults state + (if applicable) advances OPEN→HALF_OPEN // and reserves a probe slot. Increments `rejected_` on REJECTED_OPEN* // (both enforce and dry-run). Emits reject log on dispatcher thread. - Decision TryAcquire(); + // Returned generation must be threaded to the paired Report*(). + Admission TryAcquire(); // Outcome reporting. `probe` is true iff the paired TryAcquire returned - // ADMITTED_PROBE. Report* may trigger state transitions and fire the - // transition callback. - void ReportSuccess(bool probe); - void ReportFailure(FailureKind kind, bool probe); + // ADMITTED_PROBE. `admission_generation` is the generation returned by + // the paired TryAcquire — reports from a stale generation are silently + // dropped (observability counters still update so the outcome is not + // lost from dashboards). Report* may trigger state transitions and fire + // the transition callback. + void ReportSuccess(bool probe, uint64_t admission_generation); + void ReportFailure(FailureKind kind, bool probe, uint64_t admission_generation); // Apply a new config (called on this slice's dispatcher thread). // Preserves live state (CLOSED/OPEN/HALF_OPEN). Resets window if @@ -63,6 +81,20 @@ class CircuitBreakerSlice { int64_t RejectedHalfOpenFull() const { return rejected_half_open_full_.load(std::memory_order_relaxed); } + // Number of Report* calls silently dropped because their admission + // generation no longer matches the slice's current generation. These + // are reports of requests admitted before a state transition or a + // Reload()-reset. Useful for detecting mis-threaded admission tokens. 
+  int64_t ReportsStaleGeneration() const {
+    return reports_stale_generation_.load(std::memory_order_relaxed);
+  }
+
+  // **Test-only** accessor for the current generation. Production callers
+  // MUST use the generation returned by TryAcquire (racy otherwise — this
+  // getter is not atomic). Tests use it as ergonomic shorthand for
+  // "admission just happened in the current cycle", bypassing the need to
+  // thread a token per synthetic Report* call.
+  uint64_t CurrentGenerationForTesting() const { return generation_; }
 
   const std::string& host_label() const { return host_label_; }
   size_t dispatcher_index() const { return dispatcher_index_; }
@@ -108,6 +140,19 @@ class CircuitBreakerSlice {
   // post-trip reject in default-warn operator logs. Dispatcher-thread only.
   bool first_reject_logged_for_open_ = false;
 
+  // Monotonic generation counter. Incremented on every state transition
+  // AND on every Reload() enabled-toggle reset. TryAcquire captures the
+  // current generation at admission time; Report* compares against it
+  // and drops reports from a stale generation (e.g., a request admitted
+  // before an operator reset whose outcome arrives after). Dispatcher-
+  // thread only — plain int (no atomic needed).
+  uint64_t generation_ = 1;
+
+  // Reports silently dropped because their admission generation no
+  // longer matches `generation_`. Observability only; lets dashboards see
+  // how often the generation guard fires.
+  std::atomic<int64_t> reports_stale_generation_{0};
+
   StateTransitionCallback transition_cb_;
 
   // Internal transitions (dispatcher-thread).
diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index 17f6113d..32fcdfc5 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -80,6 +80,9 @@ void CircuitBreakerSlice::TripClosedToOpen(const char* trigger) { half_open_successes_ = 0; half_open_saw_failure_ = false; first_reject_logged_for_open_ = false; + // Bump generation: any in-flight admission from the closing CLOSED + // cycle is now stale. Late Report*() for those requests is dropped. + ++generation_; trips_.fetch_add(1, std::memory_order_relaxed); @@ -94,8 +97,13 @@ void CircuitBreakerSlice::TripClosedToOpen(const char* trigger) { void CircuitBreakerSlice::TransitionOpenToHalfOpen() { state_.store(State::HALF_OPEN, std::memory_order_release); - // Keep open_until_steady_ns_ so observers see the "last open" boundary; - // it's cleared on transition to CLOSED. + // Clear open_until_steady_ns_ per the OpenUntil() contract ("zero when + // not OPEN"). Leaving a stale deadline here would cause Phase 4's + // ProxyTransaction::MakeCircuitOpenResponse to compute a Retry-After + // from a past time_point (negative delta → floor at 1s, misleading for + // a reject in the HALF_OPEN probe-budget-full path). Retry-After for + // HALF_OPEN rejects is computed fresh by callers when needed. + open_until_steady_ns_.store(0, std::memory_order_release); half_open_inflight_ = 0; half_open_successes_ = 0; half_open_saw_failure_ = false; @@ -105,6 +113,9 @@ void CircuitBreakerSlice::TransitionOpenToHalfOpen() { // operationally distinct from OPEN rejection (still backing off) and // deserves its own breadcrumb in default-warn operator logs. first_reject_logged_for_open_ = false; + // Fresh HALF_OPEN cycle — any stale probe admissions from a prior + // HALF_OPEN cycle (after re-trip then re-enter) are now invalidated. 
+ ++generation_; logging::Get()->info( "circuit breaker half-open {} probes_allowed={}", @@ -131,6 +142,10 @@ void CircuitBreakerSlice::TransitionHalfOpenToClosed() { half_open_successes_ = 0; half_open_saw_failure_ = false; first_reject_logged_for_open_ = false; + // Fresh CLOSED cycle — any non-probe admissions from the PREVIOUS + // CLOSED cycle (before trip) are now stale, and any probe admissions + // from the just-completed HALF_OPEN cycle are too. + ++generation_; logging::Get()->info( "circuit breaker closed {} probes_succeeded={}", @@ -157,6 +172,9 @@ void CircuitBreakerSlice::TripHalfOpenToOpen(const char* trigger) { half_open_successes_ = 0; half_open_saw_failure_ = false; first_reject_logged_for_open_ = false; + // Bump generation — any in-flight probe admissions from the closing + // HALF_OPEN cycle are now stale. + ++generation_; trips_.fetch_add(1, std::memory_order_relaxed); @@ -169,9 +187,12 @@ void CircuitBreakerSlice::TripHalfOpenToOpen(const char* trigger) { if (transition_cb_) transition_cb_(State::HALF_OPEN, State::OPEN, trigger); } -Decision CircuitBreakerSlice::TryAcquire() { +CircuitBreakerSlice::Admission CircuitBreakerSlice::TryAcquire() { // Disabled fast path — zero overhead when config.enabled=false. - if (!config_.enabled) return Decision::ADMITTED; + // Use generation 0 (sentinel) since the slice won't consult it on report. + if (!config_.enabled) { + return Admission{Decision::ADMITTED, /*generation=*/0}; + } State s = state_.load(std::memory_order_acquire); @@ -188,7 +209,8 @@ Decision CircuitBreakerSlice::TryAcquire() { TransitionOpenToHalfOpen(); s = State::HALF_OPEN; } else { - return RejectWithLog("open", /*half_open_full=*/false); + return Admission{RejectWithLog("open", /*half_open_full=*/false), + generation_}; } } @@ -201,20 +223,23 @@ Decision CircuitBreakerSlice::TryAcquire() { // label and do NOT bump `rejected_half_open_full_` — that counter // is specifically "probing, no capacity left" for dashboards. 
if (half_open_saw_failure_) { - return RejectWithLog("half_open_recovery_failing", - /*half_open_full=*/false); + return Admission{RejectWithLog("half_open_recovery_failing", + /*half_open_full=*/false), + generation_}; } // Case B: probe budget fully in flight. "No capacity" — bump the // dedicated counter so dashboards can tell these two apart. if (half_open_inflight_ >= config_.permitted_half_open_calls) { - return RejectWithLog("half_open_full", /*half_open_full=*/true); + return Admission{RejectWithLog("half_open_full", + /*half_open_full=*/true), + generation_}; } half_open_inflight_++; - return Decision::ADMITTED_PROBE; + return Admission{Decision::ADMITTED_PROBE, generation_}; } // CLOSED: fast path. - return Decision::ADMITTED; + return Admission{Decision::ADMITTED, generation_}; } Decision CircuitBreakerSlice::RejectWithLog(const char* state_label, @@ -252,7 +277,8 @@ Decision CircuitBreakerSlice::RejectWithLog(const char* state_label, return Decision::REJECTED_OPEN; } -void CircuitBreakerSlice::ReportSuccess(bool probe) { +void CircuitBreakerSlice::ReportSuccess(bool probe, + uint64_t admission_generation) { if (!config_.enabled) return; if (probe) { @@ -261,13 +287,22 @@ void CircuitBreakerSlice::ReportSuccess(bool probe) { // signal about our state machine. probe_successes_.fetch_add(1, std::memory_order_relaxed); + // Generation guard: drop reports for admissions that pre-date the + // current cycle (a state transition or Reload reset invalidated them). + // Belt-and-suspenders with the state guard below — the generation + // catches stale-report-in-same-state cases (e.g., HALF_OPEN cycle + // A probe completing after re-trip and re-entry into HALF_OPEN B). 
+ if (admission_generation != generation_) { + reports_stale_generation_.fetch_add(1, std::memory_order_relaxed); + return; + } + // Stale probe defense: we admitted this probe in HALF_OPEN, but the - // slice may have transitioned out (e.g., `Reload()` flipped enabled - // or resized the window, `TransitionHalfOpenToClosed` already fired - // on sibling probes, or — post-Phase 8 — an operator toggle - // transitioned us to CLOSED). Only touch HALF_OPEN bookkeeping / - // fire transitions when state is STILL HALF_OPEN. Otherwise the - // probe is informational only. + // slice may have transitioned out (e.g., `Reload()` flipped enabled, + // `TransitionHalfOpenToClosed` already fired on sibling probes, or — + // post-Phase 8 — an operator toggle transitioned us to CLOSED). + // Only touch HALF_OPEN bookkeeping / fire transitions when state is + // STILL HALF_OPEN. if (state_.load(std::memory_order_acquire) != State::HALF_OPEN) return; if (half_open_inflight_ > 0) half_open_inflight_--; @@ -286,17 +321,25 @@ void CircuitBreakerSlice::ReportSuccess(bool probe) { return; } - // Non-probe success: only meaningful when state is CLOSED. If the slice - // has since transitioned (e.g., other requests in this burst tripped it), - // this late outcome must NOT retroactively reset `consecutive_failures_` - // or pollute the window — a fresh CLOSED cycle after recovery would start - // with bogus success history. + // Non-probe success path. + if (admission_generation != generation_) { + reports_stale_generation_.fetch_add(1, std::memory_order_relaxed); + return; + } + // Only meaningful when state is CLOSED. If the slice has since + // transitioned (e.g., other requests in this burst tripped it), this + // late outcome must NOT retroactively reset `consecutive_failures_` or + // pollute the window — a fresh CLOSED cycle after recovery would start + // with bogus success history. 
(Transitions bump `generation_`, so the + // guard above catches this too; the state check is a direct guard for + // observability clarity.) if (state_.load(std::memory_order_acquire) != State::CLOSED) return; consecutive_failures_ = 0; window_.AddSuccess(Now()); } -void CircuitBreakerSlice::ReportFailure(FailureKind kind, bool probe) { +void CircuitBreakerSlice::ReportFailure(FailureKind kind, bool probe, + uint64_t admission_generation) { (void)kind; // Kind is used by higher layers for logging; slice itself // treats all failures the same way for trip math. if (!config_.enabled) return; @@ -304,6 +347,11 @@ void CircuitBreakerSlice::ReportFailure(FailureKind kind, bool probe) { if (probe) { probe_failures_.fetch_add(1, std::memory_order_relaxed); + if (admission_generation != generation_) { + reports_stale_generation_.fetch_add(1, std::memory_order_relaxed); + return; + } + // Stale probe defense — see matching comment in ReportSuccess above. if (state_.load(std::memory_order_acquire) != State::HALF_OPEN) return; @@ -316,12 +364,18 @@ void CircuitBreakerSlice::ReportFailure(FailureKind kind, bool probe) { return; } - // Non-probe failure: only count when CLOSED. Late failures from requests - // admitted in CLOSED but completing after a trip must NOT re-enter - // `TripClosedToOpen` — doing so double-increments `consecutive_trips_` - // (inflating open_duration) and fires a spurious CLOSED→OPEN transition - // edge that downstream consumers (wait-queue drain, snapshot telemetry) - // would see as a ghost trip. + // Non-probe failure path. + if (admission_generation != generation_) { + reports_stale_generation_.fetch_add(1, std::memory_order_relaxed); + return; + } + // Only count when CLOSED. 
Late failures from requests admitted in CLOSED + // but completing after a trip must NOT re-enter `TripClosedToOpen` — + // doing so double-increments `consecutive_trips_` (inflating + // open_duration) and fires a spurious CLOSED→OPEN transition edge that + // downstream consumers (wait-queue drain, snapshot telemetry) would see + // as a ghost trip. (Again, the generation guard above catches this too; + // keep the state check for observability clarity.) if (state_.load(std::memory_order_acquire) != State::CLOSED) return; consecutive_failures_++; @@ -341,7 +395,22 @@ void CircuitBreakerSlice::Reload(const CircuitBreakerConfig& new_config) { (config_.window_seconds != new_config.window_seconds); config_ = new_config; - if (window_changed) window_.Resize(new_config.window_seconds); + if (window_changed) { + // Resize wipes the failure-rate ring buckets. Without bumping + // generation_ here, late completions from pre-reload admissions + // would still carry the matching generation, pass the guard, and + // repopulate the freshly empty window — mixing pre-reload and + // post-reload traffic. A pre-reload failure plus one new failure + // could then immediately satisfy minimum_volume / failure_rate + // and trip on the next evaluation, despite this being a fresh + // observation cycle by operator intent. + // + // Skip when enabled_changed is also true: the full-reset branch + // below bumps the generation as part of its larger reset, and + // double-bumping is harmless but noisy. 
+ window_.Resize(new_config.window_seconds); + if (!enabled_changed) ++generation_; + } if (enabled_changed) { // Toggling `enabled` is an operator intent to start fresh, not a @@ -370,6 +439,10 @@ void CircuitBreakerSlice::Reload(const CircuitBreakerConfig& new_config) { half_open_successes_ = 0; half_open_saw_failure_ = false; first_reject_logged_for_open_ = false; + // Fresh generation: reports of requests admitted before this + // reset will carry the old generation and be silently dropped, + // preserving clean-restart semantics. + ++generation_; } // When `enabled` is unchanged: live state preserved — operator expects // new thresholds to apply to the next evaluation, not to reset an diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h index d39ab52a..828dfe4f 100644 --- a/test/circuit_breaker_test.h +++ b/test/circuit_breaker_test.h @@ -56,12 +56,12 @@ void TestDisabledFastPath() { CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, [clock]() { return clock->now; }); - bool pass = slice.TryAcquire() == Decision::ADMITTED && + bool pass = slice.TryAcquire().decision == Decision::ADMITTED && slice.CurrentState() == State::CLOSED; // Reporting 100 failures must not trip. 
for (int i = 0; i < 100; ++i) { - slice.ReportFailure(FailureKind::CONNECT_FAILURE, false); + slice.ReportFailure(FailureKind::CONNECT_FAILURE, false, slice.CurrentGenerationForTesting()); } pass = pass && slice.CurrentState() == State::CLOSED && slice.Trips() == 0; @@ -83,10 +83,10 @@ void TestClosedStaysClosedBelowConsecutiveThreshold() { [clock]() { return clock->now; }); for (int i = 0; i < 4; ++i) { - slice.ReportFailure(FailureKind::CONNECT_FAILURE, false); + slice.ReportFailure(FailureKind::CONNECT_FAILURE, false, slice.CurrentGenerationForTesting()); } bool pass = slice.CurrentState() == State::CLOSED && - slice.TryAcquire() == Decision::ADMITTED && + slice.TryAcquire().decision == Decision::ADMITTED && slice.Trips() == 0; TestFramework::RecordTest("CB: 4 failures below threshold", pass, "", TestFramework::TestCategory::OTHER); @@ -105,11 +105,11 @@ void TestConsecutiveFailureTrip() { [clock]() { return clock->now; }); for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } bool pass = slice.CurrentState() == State::OPEN && slice.Trips() == 1 && - slice.TryAcquire() == Decision::REJECTED_OPEN; + slice.TryAcquire().decision == Decision::REJECTED_OPEN; TestFramework::RecordTest("CB: 5 consecutive failures trip", pass, "", TestFramework::TestCategory::OTHER); } catch (const std::exception& e) { @@ -130,17 +130,17 @@ void TestFailureRateTrip() { // Alternate 10 failures and 10 successes within the same second — // ratio = 50%, total = 20 (>= minimum_volume). for (int i = 0; i < 10; ++i) { - slice.ReportSuccess(false); + slice.ReportSuccess(false, slice.CurrentGenerationForTesting()); } // A success between-failures clears consecutive_failures_, confirming // only rate path can trip here. 
for (int i = 0; i < 9; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } // Still CLOSED — 9/19 < 50%. bool pass_pre = slice.CurrentState() == State::CLOSED; // 10th failure brings ratio to 10/20 = 50% exactly — tripper. - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); bool pass = pass_pre && slice.CurrentState() == State::OPEN && slice.Trips() == 1; TestFramework::RecordTest("CB: failure-rate trip (50% of 20)", pass, "", @@ -163,7 +163,7 @@ void TestMinimumVolumeGate() { // 19 total calls, all failures — should NOT trip (below volume). for (int i = 0; i < 19; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } bool pass = slice.CurrentState() == State::CLOSED && slice.Trips() == 0; TestFramework::RecordTest("CB: minimum_volume gate", pass, "", @@ -183,11 +183,11 @@ void TestOpenBeforeDurationStaysOpen() { [clock]() { return clock->now; }); for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } // Advance less than base_open_duration_ms (5000ms). 
clock->Advance(std::chrono::milliseconds(2000)); - Decision d = slice.TryAcquire(); + Decision d = slice.TryAcquire().decision; bool pass = d == Decision::REJECTED_OPEN && slice.CurrentState() == State::OPEN; TestFramework::RecordTest("CB: OPEN rejects before elapsed", pass, "", @@ -207,10 +207,10 @@ void TestOpenToHalfOpenAfterDuration() { [clock]() { return clock->now; }); for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); - Decision d = slice.TryAcquire(); + Decision d = slice.TryAcquire().decision; bool pass = d == Decision::ADMITTED_PROBE && slice.CurrentState() == State::HALF_OPEN; TestFramework::RecordTest("CB: OPEN -> HALF_OPEN after duration", pass, "", @@ -230,13 +230,13 @@ void TestHalfOpenAllProbesSucceed() { [clock]() { return clock->now; }); for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); // Take 5 probes; report success on each. 
for (int i = 0; i < cb.permitted_half_open_calls; ++i) { - Decision d = slice.TryAcquire(); + Decision d = slice.TryAcquire().decision; if (d != Decision::ADMITTED_PROBE) { TestFramework::RecordTest( "CB: HALF_OPEN 5 probe successes close", false, @@ -244,7 +244,7 @@ void TestHalfOpenAllProbesSucceed() { TestFramework::TestCategory::OTHER); return; } - slice.ReportSuccess(true); + slice.ReportSuccess(true, slice.CurrentGenerationForTesting()); } bool pass = slice.CurrentState() == State::CLOSED && slice.ProbeSuccesses() == 5; @@ -265,13 +265,13 @@ void TestHalfOpenProbeFailureReopens() { [clock]() { return clock->now; }); for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); // Take 1 probe, fail it. - Decision d = slice.TryAcquire(); - slice.ReportFailure(FailureKind::RESPONSE_5XX, true); + Decision d = slice.TryAcquire().decision; + slice.ReportFailure(FailureKind::RESPONSE_5XX, true, slice.CurrentGenerationForTesting()); bool pass = d == Decision::ADMITTED_PROBE && slice.CurrentState() == State::OPEN && slice.Trips() == 2 && // initial trip + re-trip @@ -293,13 +293,13 @@ void TestHalfOpenExhaustedSlotsRejected() { [clock]() { return clock->now; }); for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); // Take 5 probes but DON'T report outcomes yet. for (int i = 0; i < 5; ++i) slice.TryAcquire(); // 6th TryAcquire must reject (all slots taken). 
- Decision d = slice.TryAcquire(); + Decision d = slice.TryAcquire().decision; bool pass = d == Decision::REJECTED_OPEN; TestFramework::RecordTest("CB: HALF_OPEN over capacity rejects", pass, "", TestFramework::TestCategory::OTHER); @@ -322,7 +322,7 @@ void TestExponentialBackoff() { auto trip_then_probe_fail = [&]() { // Reach OPEN. for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } }; auto measure_open_ms = [&]() { @@ -339,19 +339,19 @@ void TestExponentialBackoff() { // Move to HALF_OPEN and fail the probe → trip 2. clock->Advance(std::chrono::milliseconds(d1 + 1)); slice.TryAcquire(); // HALF_OPEN, ADMITTED_PROBE - slice.ReportFailure(FailureKind::RESPONSE_5XX, true); + slice.ReportFailure(FailureKind::RESPONSE_5XX, true, slice.CurrentGenerationForTesting()); int64_t d2 = measure_open_ms(); clock->Advance(std::chrono::milliseconds(d2 + 1)); slice.TryAcquire(); - slice.ReportFailure(FailureKind::RESPONSE_5XX, true); + slice.ReportFailure(FailureKind::RESPONSE_5XX, true, slice.CurrentGenerationForTesting()); int64_t d3 = measure_open_ms(); clock->Advance(std::chrono::milliseconds(d3 + 1)); slice.TryAcquire(); - slice.ReportFailure(FailureKind::RESPONSE_5XX, true); + slice.ReportFailure(FailureKind::RESPONSE_5XX, true, slice.CurrentGenerationForTesting()); int64_t d4 = measure_open_ms(); clock->Advance(std::chrono::milliseconds(d4 + 1)); slice.TryAcquire(); - slice.ReportFailure(FailureKind::RESPONSE_5XX, true); + slice.ReportFailure(FailureKind::RESPONSE_5XX, true, slice.CurrentGenerationForTesting()); int64_t d5 = measure_open_ms(); // Expect 1000, 2000, 4000, 8000, 8000 (capped). @@ -379,17 +379,17 @@ void TestResetOnClose() { // Trip 1. 
for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } clock->Advance(std::chrono::milliseconds(1001)); // Move to HALF_OPEN. for (int i = 0; i < 5; ++i) { slice.TryAcquire(); - slice.ReportSuccess(true); + slice.ReportSuccess(true, slice.CurrentGenerationForTesting()); } // Now CLOSED. Trip again — expect base_duration again (not doubled). for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } auto open_until = slice.OpenUntil(); auto remaining = open_until - clock->now; @@ -496,10 +496,10 @@ void TestDryRunAdmits() { [clock]() { return clock->now; }); for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } // OPEN + dry_run → REJECTED_OPEN_DRYRUN (caller proceeds). - Decision d = slice.TryAcquire(); + Decision d = slice.TryAcquire().decision; bool pass = d == Decision::REJECTED_OPEN_DRYRUN && slice.CurrentState() == State::OPEN && slice.Rejected() == 1; @@ -520,7 +520,7 @@ void TestReloadPreservesState() { [clock]() { return clock->now; }); for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } // OPEN at this point. 
auto cb2 = cb; @@ -545,7 +545,7 @@ void TestConsecutiveThresholdOne() { auto clock = std::make_shared(); CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, [clock]() { return clock->now; }); - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); bool pass = slice.CurrentState() == State::OPEN && slice.Trips() == 1; TestFramework::RecordTest("CB: threshold=1 single failure trips", pass, "", TestFramework::TestCategory::OTHER); @@ -564,10 +564,10 @@ void TestSuccessClearsConsecutive() { [clock]() { return clock->now; }); for (int i = 0; i < 4; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } - slice.ReportSuccess(false); // resets consecutive - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportSuccess(false, slice.CurrentGenerationForTesting()); // resets consecutive + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); // consecutive is back to 1, no trip. bool pass = slice.CurrentState() == State::CLOSED; TestFramework::RecordTest("CB: success clears consecutive", pass, "", @@ -601,7 +601,7 @@ void TestLateFailureAfterTripDoesNotInflateBackoff() { // production the outcomes for those admitted requests can arrive after // the slice has already tripped. for (int i = 0; i < 10; ++i) { - Decision d = slice.TryAcquire(); + Decision d = slice.TryAcquire().decision; if (d != Decision::ADMITTED) { TestFramework::RecordTest("CB: late failure after trip", false, "admission i=" + std::to_string(i) + " not ADMITTED", @@ -611,7 +611,7 @@ void TestLateFailureAfterTripDoesNotInflateBackoff() { } // Report 5 failures — trip at the 5th. 
for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } if (slice.CurrentState() != State::OPEN) { TestFramework::RecordTest("CB: late failure after trip", false, @@ -628,7 +628,7 @@ void TestLateFailureAfterTripDoesNotInflateBackoff() { // climb consecutive_failures_, and trigger another TripClosedToOpen // even though state is already OPEN. for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } int64_t trips_after_late = slice.Trips(); auto open_until_after_late = slice.OpenUntil(); @@ -661,11 +661,11 @@ void TestLateSuccessAfterTripIgnored() { [clock]() { return clock->now; }); for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } // Slice is OPEN now. A late success arrives — must not change state. State pre = slice.CurrentState(); - slice.ReportSuccess(false); + slice.ReportSuccess(false, slice.CurrentGenerationForTesting()); bool pass = pre == State::OPEN && slice.CurrentState() == State::OPEN; TestFramework::RecordTest("CB: late success after trip ignored", pass, "", TestFramework::TestCategory::OTHER); @@ -691,14 +691,14 @@ void TestHalfOpenStopsAdmittingAfterFirstProbeFailure() { // Trip the breaker. for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); // Admit 2 probes. Report failure on the first (but NOT the second yet // — leave 1 in-flight so we can observe the short-circuit). 
- Decision d1 = slice.TryAcquire(); // ADMITTED_PROBE, inflight=1 - Decision d2 = slice.TryAcquire(); // ADMITTED_PROBE, inflight=2 + Decision d1 = slice.TryAcquire().decision; // ADMITTED_PROBE, inflight=1 + Decision d2 = slice.TryAcquire().decision; // ADMITTED_PROBE, inflight=2 if (d1 != Decision::ADMITTED_PROBE || d2 != Decision::ADMITTED_PROBE) { TestFramework::RecordTest( "CB: HALF_OPEN stops admitting after probe fail", @@ -708,7 +708,7 @@ void TestHalfOpenStopsAdmittingAfterFirstProbeFailure() { } // Fail the first probe — inflight drops to 1, saw_failure=true. // Last-probe trip does not yet fire (inflight is still 1). - slice.ReportFailure(FailureKind::RESPONSE_5XX, true); + slice.ReportFailure(FailureKind::RESPONSE_5XX, true, slice.CurrentGenerationForTesting()); // State must still be HALF_OPEN (final probe not yet completed). State mid = slice.CurrentState(); @@ -716,7 +716,7 @@ void TestHalfOpenStopsAdmittingAfterFirstProbeFailure() { // Subsequent TryAcquire — BEFORE fix this would succeed because // inflight (1) < permitted (5). AFTER fix it short-circuits because // saw_failure is set. - Decision d3 = slice.TryAcquire(); + Decision d3 = slice.TryAcquire().decision; bool pass = mid == State::HALF_OPEN && d3 == Decision::REJECTED_OPEN; @@ -745,7 +745,7 @@ void TestHalfOpenFullCounterSeparate() { // Trip → OPEN reject increments generic counter only. for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } slice.TryAcquire(); // REJECTED_OPEN (backoff active) int64_t rejected_open_only = slice.Rejected(); @@ -791,7 +791,7 @@ void TestReloadResetsStateOnEnabledToggleWhileOpen() { // Drive to OPEN. 
for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } if (slice.CurrentState() != State::OPEN) { TestFramework::RecordTest( @@ -813,7 +813,7 @@ void TestReloadResetsStateOnEnabledToggleWhileOpen() { // And the slice must NOT insta-trip on a single failure (pre-fix, // consecutive_failures_ could have persisted ≥ threshold). - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); bool one_fail_no_trip = slice.CurrentState() == State::CLOSED; bool pass = disabled_closed && reenabled_closed && one_fail_no_trip; @@ -845,7 +845,7 @@ void TestReloadResetsConsecutiveFailuresOnEnabledToggle() { // 4 failures — just under threshold. State still CLOSED. for (int i = 0; i < 4; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } if (slice.CurrentState() != State::CLOSED) { TestFramework::RecordTest( @@ -862,7 +862,7 @@ void TestReloadResetsConsecutiveFailuresOnEnabledToggle() { // A single failure post-reenable must NOT trip — consecutive_failures_ // should have been reset to 0, not preserved at 4. - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); bool pass = slice.CurrentState() == State::CLOSED; TestFramework::RecordTest( "CB: reload clears consecutive_failures on enable toggle", @@ -889,7 +889,7 @@ void TestReloadThresholdChangePreservesState() { [clock]() { return clock->now; }); for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } // OPEN. Reload with a tighter threshold but enabled unchanged. 
auto tighter = cb; @@ -923,19 +923,19 @@ void TestSawFailureDoesNotBumpHalfOpenFullCounter() { [clock]() { return clock->now; }); for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); // Admit 2 probes, fail the first — saw_failure=true, inflight=1. slice.TryAcquire(); // probe 1 admitted slice.TryAcquire(); // probe 2 admitted - slice.ReportFailure(FailureKind::RESPONSE_5XX, true); + slice.ReportFailure(FailureKind::RESPONSE_5XX, true, slice.CurrentGenerationForTesting()); int64_t hof_before = slice.RejectedHalfOpenFull(); // Reject via saw_failure short-circuit (capacity is NOT exhausted — // only 1 probe actually in flight, and permitted is 5). - Decision d = slice.TryAcquire(); + Decision d = slice.TryAcquire().decision; int64_t hof_after = slice.RejectedHalfOpenFull(); // Still REJECTED_OPEN (same client-visible outcome), but @@ -956,6 +956,289 @@ void TestSawFailureDoesNotBumpHalfOpenFullCounter() { } } +// BUG (review round 3, P2): TransitionOpenToHalfOpen deliberately left +// `open_until_steady_ns_` populated, violating the documented OpenUntil() +// contract ("zero when not OPEN"). A Phase 4 consumer computing Retry-After +// from a HALF_OPEN slice would compute (stale_deadline - now), which is +// negative once HALF_OPEN begins. +void TestOpenUntilZeroWhenHalfOpen() { + std::cout << "\n[TEST] CB: OpenUntil() zero in HALF_OPEN..." << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Trip → OPEN. OpenUntil() must be non-zero (contract: zero iff NOT OPEN). 
+ for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, + slice.CurrentGenerationForTesting()); + } + auto open_ns = slice.OpenUntil(); + bool open_nonzero = open_ns != std::chrono::steady_clock::time_point{}; + + // Elapse backoff → HALF_OPEN via TryAcquire. + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + auto a = slice.TryAcquire(); + bool halfopen = slice.CurrentState() == State::HALF_OPEN && + a.decision == Decision::ADMITTED_PROBE; + + // Contract: OpenUntil() zero now that state != OPEN. + auto halfopen_ns = slice.OpenUntil(); + bool halfopen_zero = halfopen_ns == std::chrono::steady_clock::time_point{}; + + bool pass = open_nonzero && halfopen && halfopen_zero; + TestFramework::RecordTest( + "CB: OpenUntil() zero in HALF_OPEN", + pass, pass ? "" : + "open_nonzero=" + std::to_string(open_nonzero) + + " halfopen=" + std::to_string(halfopen) + + " halfopen_zero=" + std::to_string(halfopen_zero), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: OpenUntil() zero in HALF_OPEN", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// BUG (review round 3, P1): Reload reset the state on enabled toggle but +// gave Report* no way to distinguish pre-toggle admissions from post-toggle +// ones. Stale completions then polluted the fresh CLOSED cycle. Fixed with +// a generation token captured at admission and checked at report. +void TestStaleGenerationReportsDroppedAfterReloadToggle() { + std::cout << "\n[TEST] CB: stale-generation reports dropped after reload toggle..." + << std::endl; + try { + auto cb = DefaultEnabledConfig(); + cb.consecutive_failure_threshold = 3; // make insta-trip detection easy + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Admit 3 requests in the original CLOSED cycle (generation = A). 
+ auto a1 = slice.TryAcquire(); + auto a2 = slice.TryAcquire(); + auto a3 = slice.TryAcquire(); + uint64_t gen_A = a1.generation; + bool same_gen_pre = a2.generation == gen_A && a3.generation == gen_A; + + // Operator toggles: disable then re-enable → fresh CLOSED cycle. + auto disabled = cb; disabled.enabled = false; + slice.Reload(disabled); + slice.Reload(cb); + // After toggle, state is CLOSED and generation has advanced. + uint64_t gen_B = slice.CurrentGenerationForTesting(); + bool generation_advanced = gen_B != gen_A; + + // Late failures from the pre-toggle cycle arrive. Without the fix, + // these would increment consecutive_failures_ and trip the fresh + // cycle IMMEDIATELY (threshold=3, 3 late failures). + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, gen_A); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, gen_A); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, gen_A); + + // Fresh cycle must be untouched. + bool state_still_closed = slice.CurrentState() == State::CLOSED; + bool stale_counter_bumped = slice.ReportsStaleGeneration() == 3; + + // A fresh post-toggle admission + 3 REAL failures should still trip — + // so the guard didn't over-drop. + auto fresh = slice.TryAcquire(); + for (int i = 0; i < 3; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, fresh.generation); + } + bool fresh_trips = slice.CurrentState() == State::OPEN; + + bool pass = same_gen_pre && generation_advanced && + state_still_closed && stale_counter_bumped && fresh_trips; + TestFramework::RecordTest( + "CB: stale-generation reports dropped after reload toggle", + pass, pass ? 
"" : + "same_gen_pre=" + std::to_string(same_gen_pre) + + " gen_advanced=" + std::to_string(generation_advanced) + + " state_closed=" + std::to_string(state_still_closed) + + " stale_cnt=" + std::to_string(slice.ReportsStaleGeneration()) + + " fresh_trips=" + std::to_string(fresh_trips), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: stale-generation reports dropped after reload toggle", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Generation also advances across state transitions (not just Reload), so +// a report admitted in CLOSED cycle A that completes after OPEN → HALF_OPEN +// → CLOSED cycle B is dropped instead of polluting cycle B's counters. +void TestStaleGenerationReportsDroppedAcrossStateTransitions() { + std::cout << "\n[TEST] CB: stale reports dropped across CLOSED->OPEN->CLOSED..." + << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // CLOSED cycle A — admit a request, capture its generation. + auto admit_A = slice.TryAcquire(); + uint64_t gen_A = admit_A.generation; + + // Drive to OPEN, then HALF_OPEN, then CLOSED (cycle B) via probe success. + for (int i = 0; i < 5; ++i) { + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, + slice.CurrentGenerationForTesting()); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + for (int i = 0; i < cb.permitted_half_open_calls; ++i) { + auto p = slice.TryAcquire(); // probe + slice.ReportSuccess(true, p.generation); + } + bool cycleB_closed = slice.CurrentState() == State::CLOSED; + uint64_t gen_B = slice.CurrentGenerationForTesting(); + bool gen_advanced = gen_B > gen_A; + + // Now the original cycle-A request finally reports a success. 
In a + // world without the generation guard, this would reset cycle B's + // (freshly-zero) consecutive_failures_ and add to cycle B's window, + // polluting fresh telemetry. + int64_t stale_before = slice.ReportsStaleGeneration(); + slice.ReportSuccess(false, gen_A); + int64_t stale_after = slice.ReportsStaleGeneration(); + bool dropped = stale_after == stale_before + 1; + + bool pass = cycleB_closed && gen_advanced && dropped; + TestFramework::RecordTest( + "CB: stale reports dropped across CLOSED->OPEN->CLOSED", + pass, pass ? "" : + "cycleB_closed=" + std::to_string(cycleB_closed) + + " gen_advanced=" + std::to_string(gen_advanced) + + " dropped=" + std::to_string(dropped), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: stale reports dropped across CLOSED->OPEN->CLOSED", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// BUG (review round 4, P2): Reload that resizes the rolling window without +// toggling enabled cleared the window buckets but left generation_ unchanged. +// Late reports from pre-reload admissions would carry the still-current +// generation, pass the guard, and re-populate the freshly empty window — +// mixing pre-reload and post-reload traffic. A pre-reload + post-reload +// failure pair could satisfy minimum_volume / failure_rate immediately on +// what should be a fresh observation cycle. +void TestWindowResizeAdvancesGeneration() { + std::cout << "\n[TEST] CB: window resize advances generation..." << std::endl; + try { + // Use rate-trip path only (high consec threshold disables that path), + // a low minimum_volume so 2 failures suffice, and a high + // failure_rate_threshold so the trip relies on the rate calc. 
+ CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 1000; // disable consecutive path + cb.failure_rate_threshold = 50; + cb.minimum_volume = 2; + cb.window_seconds = 10; + cb.permitted_half_open_calls = 5; + cb.base_open_duration_ms = 5000; + cb.max_open_duration_ms = 60000; + + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Pre-reload: admit a request and capture its generation. + auto admit_pre = slice.TryAcquire(); + uint64_t gen_pre = admit_pre.generation; + + // Reload: change window_seconds but keep enabled=true. Window is + // resized (cleared) and generation MUST advance so the pre-reload + // admission's late report doesn't seed the new window. + auto resized = cb; + resized.window_seconds = 30; + slice.Reload(resized); + + uint64_t gen_post = slice.CurrentGenerationForTesting(); + bool gen_advanced = gen_post != gen_pre; + + // The pre-reload admission completes (failure). Without the fix, + // this would add one failure to the freshly-empty window. Then + // a post-reload admission's failure brings total=2 >= minimum_volume, + // failures=2/2=100% >= 50% → IMMEDIATE TRIP on a fresh window. + // With the fix, the pre-reload report is dropped (counted as stale). + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, gen_pre); + + int64_t stale_after_pre = slice.ReportsStaleGeneration(); + + // Now a real post-reload admission and failure — single failure in + // a fresh window of size 30s. total=1, below minimum_volume=2 → no trip. + auto admit_post = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, admit_post.generation); + + bool state_still_closed = slice.CurrentState() == State::CLOSED; + bool stale_dropped = stale_after_pre == 1; + + bool pass = gen_advanced && state_still_closed && stale_dropped; + TestFramework::RecordTest( + "CB: window resize advances generation", + pass, pass ? 
"" : + "gen_advanced=" + std::to_string(gen_advanced) + + " state_closed=" + std::to_string(state_still_closed) + + " stale_count=" + std::to_string(slice.ReportsStaleGeneration()), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: window resize advances generation", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Regression guard: a reload that changes only thresholds (no window resize, +// no enabled toggle) MUST preserve generation. Operator intent is "apply new +// thresholds to existing observations" — the round-4 fix's window-resize +// generation bump must NOT trigger here. +void TestThresholdOnlyReloadDoesNotAdvanceGeneration() { + std::cout << "\n[TEST] CB: threshold-only reload preserves generation..." + << std::endl; + try { + auto cb = DefaultEnabledConfig(); + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + auto admit = slice.TryAcquire(); + uint64_t gen_pre = admit.generation; + + // Tighten thresholds; same enabled, same window_seconds. + auto tightened = cb; + tightened.consecutive_failure_threshold = 2; + tightened.failure_rate_threshold = 30; + slice.Reload(tightened); + + uint64_t gen_post = slice.CurrentGenerationForTesting(); + bool gen_preserved = gen_post == gen_pre; + + // The pre-reload admission's report should NOT be dropped — operator + // wants the new thresholds applied to existing in-flight observations. + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, gen_pre); + bool stale_zero = slice.ReportsStaleGeneration() == 0; + + bool pass = gen_preserved && stale_zero; + TestFramework::RecordTest( + "CB: threshold-only reload preserves generation", + pass, pass ? 
"" : + "gen_preserved=" + std::to_string(gen_preserved) + + " stale_zero=" + std::to_string(stale_zero), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: threshold-only reload preserves generation", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + void TestTransitionCallbackInvoked() { std::cout << "\n[TEST] CB: transition callback invoked..." << std::endl; try { @@ -976,12 +1259,12 @@ void TestTransitionCallbackInvoked() { // Full cycle. for (int i = 0; i < 5; ++i) { - slice.ReportFailure(FailureKind::RESPONSE_5XX, false); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, slice.CurrentGenerationForTesting()); } clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); for (int i = 0; i < cb.permitted_half_open_calls; ++i) { slice.TryAcquire(); - slice.ReportSuccess(true); + slice.ReportSuccess(true, slice.CurrentGenerationForTesting()); } bool pass = closed_to_open == 1 && open_to_halfopen == 1 && halfopen_to_closed == 1; @@ -1027,6 +1310,11 @@ void RunAllTests() { TestReloadResetsConsecutiveFailuresOnEnabledToggle(); TestReloadThresholdChangePreservesState(); TestSawFailureDoesNotBumpHalfOpenFullCounter(); + TestOpenUntilZeroWhenHalfOpen(); + TestStaleGenerationReportsDroppedAfterReloadToggle(); + TestStaleGenerationReportsDroppedAcrossStateTransitions(); + TestWindowResizeAdvancesGeneration(); + TestThresholdOnlyReloadDoesNotAdvanceGeneration(); TestTransitionCallbackInvoked(); } From fa585ac508f1f277379daca5721a9167e39da1bc Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 00:10:07 +0800 Subject: [PATCH 05/37] Fix review comment --- .../circuit_breaker/circuit_breaker_slice.h | 57 +++++-- include/config/server_config.h | 11 +- server/circuit_breaker_slice.cc | 109 ++++++++------ test/circuit_breaker_test.h | 140 ++++++++++++++++++ test/config_test.h | 48 ++++-- 5 files changed, 294 insertions(+), 71 deletions(-) diff --git 
a/include/circuit_breaker/circuit_breaker_slice.h b/include/circuit_breaker/circuit_breaker_slice.h index 4ba1331d..f08a8358 100644 --- a/include/circuit_breaker/circuit_breaker_slice.h +++ b/include/circuit_breaker/circuit_breaker_slice.h @@ -82,19 +82,32 @@ class CircuitBreakerSlice { return rejected_half_open_full_.load(std::memory_order_relaxed); } // Number of Report* calls silently dropped because their admission - // generation no longer matches the slice's current generation. These - // are reports of requests admitted before a state transition or a + // generation no longer matches the relevant per-domain counter + // (closed_gen_ for non-probe, halfopen_gen_ for probe). These are + // reports of requests admitted before a state transition or a // Reload()-reset. Useful for detecting mis-threaded admission tokens. int64_t ReportsStaleGeneration() const { return reports_stale_generation_.load(std::memory_order_relaxed); } - // **Test-only** accessor for the current generation. Production callers - // MUST use the generation returned by TryAcquire (racy otherwise — this - // getter is not atomic). Tests use it as ergonomic shorthand for - // "admission just happened in the current cycle", bypassing the need to - // thread a token per synthetic Report* call. - uint64_t CurrentGenerationForTesting() const { return generation_; } + // **Test-only** accessor for the generation that the current state's + // next admission would receive. Returns `halfopen_gen_` when state is + // HALF_OPEN (probe admissions use that counter), otherwise `closed_gen_` + // (non-probe admissions use that counter). This matches what TryAcquire + // would stamp on a new admission right now. + // + // Production callers MUST use the generation returned by TryAcquire + // (racy otherwise — these getters are not atomic). Tests use it as + // ergonomic shorthand for "admission just happened in the current + // cycle", bypassing the need to thread a token per synthetic Report*. 
+ uint64_t CurrentGenerationForTesting() const { + return (state_.load(std::memory_order_acquire) == State::HALF_OPEN) + ? halfopen_gen_ : closed_gen_; + } + // Explicit per-domain getters for tests that cross state transitions + // while holding a captured generation from a specific domain. + uint64_t CurrentClosedGenForTesting() const { return closed_gen_; } + uint64_t CurrentHalfOpenGenForTesting() const { return halfopen_gen_; } const std::string& host_label() const { return host_label_; } size_t dispatcher_index() const { return dispatcher_index_; } @@ -140,13 +153,27 @@ class CircuitBreakerSlice { // post-trip reject in default-warn operator logs. Dispatcher-thread only. bool first_reject_logged_for_open_ = false; - // Monotonic generation counter. Incremented on every state transition - // AND on every Reload() enabled-toggle reset. TryAcquire captures the - // current generation at admission time; Report* compares against it - // and drops reports from a stale generation (e.g., a request admitted - // before an operator reset whose outcome arrives after). Dispatcher- - // thread only — plain int (no atomic needed). - uint64_t generation_ = 1; + // Monotonic generation counters — one per admission domain. TryAcquire + // stamps the admission with the domain's current value; Report* compares + // against it and drops reports whose admission no longer matches a live + // cycle. Split into two counters so operations that reset ONE domain + // (e.g., window_seconds reload wipes the CLOSED rate window) don't + // invalidate admissions in the OTHER domain (HALF_OPEN probes) — which + // would strand probe capacity and wedge the slice in HALF_OPEN. + // + // Dispatcher-thread only — plain ints (no atomics needed). + // + // closed_gen_ bumps on: TripClosedToOpen (CLOSED cycle ends), + // Reload enabled-toggle reset, + // Reload window_seconds change (rate-window wipe). 
+ // halfopen_gen_ bumps on: TripHalfOpenToOpen (HALF_OPEN cycle ends), + // TransitionHalfOpenToClosed (HALF_OPEN cycle ends on success), + // Reload enabled-toggle reset. + // + // Initial value 1 (so 0 can be a "not-applicable" sentinel for + // admissions returned from disabled slices or the REJECTED_* paths). + uint64_t closed_gen_ = 1; + uint64_t halfopen_gen_ = 1; // Rejections silently dropped because their admission generation no // longer matches `generation_`. Observability only; lets dashboards see diff --git a/include/config/server_config.h b/include/config/server_config.h index 7dd949d1..6a82521a 100644 --- a/include/config/server_config.h +++ b/include/config/server_config.h @@ -183,10 +183,17 @@ struct UpstreamConfig { ProxyConfig proxy; CircuitBreakerConfig circuit_breaker; + // Intentionally EXCLUDES circuit_breaker — breaker tuning is live- + // reloadable (§10 of CIRCUIT_BREAKER_DESIGN.md) and must not trigger + // the "upstream configuration changes require a restart" warning in + // HttpServer::Reload (http_server.cc:3383). Phase 8's breaker-reload + // path compares CircuitBreakerConfig fields directly (per-host + // iteration), not via this operator==. All other fields here are + // restart-required: changing name/host/port/tls rebuilds pool + // topology; changing pool/proxy would re-register routes. 
bool operator==(const UpstreamConfig& o) const { return name == o.name && host == o.host && port == o.port && - tls == o.tls && pool == o.pool && proxy == o.proxy && - circuit_breaker == o.circuit_breaker; + tls == o.tls && pool == o.pool && proxy == o.proxy; } bool operator!=(const UpstreamConfig& o) const { return !(*this == o); } }; diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index 32fcdfc5..03313173 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -80,9 +80,10 @@ void CircuitBreakerSlice::TripClosedToOpen(const char* trigger) { half_open_successes_ = 0; half_open_saw_failure_ = false; first_reject_logged_for_open_ = false; - // Bump generation: any in-flight admission from the closing CLOSED - // cycle is now stale. Late Report*() for those requests is dropped. - ++generation_; + // Bump closed_gen_: non-probe admissions from the closing CLOSED cycle + // are now stale. Late Report(false, ...) calls for those requests drop. + // halfopen_gen_ is NOT bumped — OPEN holds no HALF_OPEN admissions. + ++closed_gen_; trips_.fetch_add(1, std::memory_order_relaxed); @@ -113,9 +114,11 @@ void CircuitBreakerSlice::TransitionOpenToHalfOpen() { // operationally distinct from OPEN rejection (still backing off) and // deserves its own breadcrumb in default-warn operator logs. first_reject_logged_for_open_ = false; - // Fresh HALF_OPEN cycle — any stale probe admissions from a prior - // HALF_OPEN cycle (after re-trip then re-enter) are now invalidated. - ++generation_; + // NOTE: neither closed_gen_ nor halfopen_gen_ is bumped here. No + // admissions are made in OPEN — the previous HALF_OPEN cycle (if any) + // already bumped halfopen_gen_ on its exit (TripHalfOpenToOpen) or on + // cycle-complete (TransitionHalfOpenToClosed), so any latent stale + // probes are already tagged. Bumping again would be redundant. 
logging::Get()->info( "circuit breaker half-open {} probes_allowed={}", @@ -142,10 +145,11 @@ void CircuitBreakerSlice::TransitionHalfOpenToClosed() { half_open_successes_ = 0; half_open_saw_failure_ = false; first_reject_logged_for_open_ = false; - // Fresh CLOSED cycle — any non-probe admissions from the PREVIOUS - // CLOSED cycle (before trip) are now stale, and any probe admissions - // from the just-completed HALF_OPEN cycle are too. - ++generation_; + // Bump halfopen_gen_: the just-completed HALF_OPEN cycle's probe + // admissions are now stale. closed_gen_ is NOT bumped — pre-trip + // CLOSED admissions were already invalidated by TripClosedToOpen + // when we left CLOSED. + ++halfopen_gen_; logging::Get()->info( "circuit breaker closed {} probes_succeeded={}", @@ -172,9 +176,10 @@ void CircuitBreakerSlice::TripHalfOpenToOpen(const char* trigger) { half_open_successes_ = 0; half_open_saw_failure_ = false; first_reject_logged_for_open_ = false; - // Bump generation — any in-flight probe admissions from the closing - // HALF_OPEN cycle are now stale. - ++generation_; + // Bump halfopen_gen_: probe admissions from the closing HALF_OPEN + // cycle are now stale. closed_gen_ is NOT bumped — no CLOSED + // admissions are outstanding (we came from HALF_OPEN, not CLOSED). + ++halfopen_gen_; trips_.fetch_add(1, std::memory_order_relaxed); @@ -209,8 +214,12 @@ CircuitBreakerSlice::Admission CircuitBreakerSlice::TryAcquire() { TransitionOpenToHalfOpen(); s = State::HALF_OPEN; } else { + // Rejected admissions get generation 0 — callers must not call + // Report* for a rejected admission, and 0 always compares stale + // (domain gens start at 1), so an accidental Report would drop + // safely rather than mutating state. 
return Admission{RejectWithLog("open", /*half_open_full=*/false), - generation_}; + /*generation=*/0}; } } @@ -225,21 +234,22 @@ CircuitBreakerSlice::Admission CircuitBreakerSlice::TryAcquire() { if (half_open_saw_failure_) { return Admission{RejectWithLog("half_open_recovery_failing", /*half_open_full=*/false), - generation_}; + /*generation=*/0}; } // Case B: probe budget fully in flight. "No capacity" — bump the // dedicated counter so dashboards can tell these two apart. if (half_open_inflight_ >= config_.permitted_half_open_calls) { return Admission{RejectWithLog("half_open_full", /*half_open_full=*/true), - generation_}; + /*generation=*/0}; } half_open_inflight_++; - return Admission{Decision::ADMITTED_PROBE, generation_}; + // Probe admission — stamp with halfopen_gen_. + return Admission{Decision::ADMITTED_PROBE, halfopen_gen_}; } - // CLOSED: fast path. - return Admission{Decision::ADMITTED, generation_}; + // CLOSED: fast path — stamp with closed_gen_. + return Admission{Decision::ADMITTED, closed_gen_}; } Decision CircuitBreakerSlice::RejectWithLog(const char* state_label, @@ -287,12 +297,13 @@ void CircuitBreakerSlice::ReportSuccess(bool probe, // signal about our state machine. probe_successes_.fetch_add(1, std::memory_order_relaxed); - // Generation guard: drop reports for admissions that pre-date the - // current cycle (a state transition or Reload reset invalidated them). - // Belt-and-suspenders with the state guard below — the generation - // catches stale-report-in-same-state cases (e.g., HALF_OPEN cycle - // A probe completing after re-trip and re-entry into HALF_OPEN B). - if (admission_generation != generation_) { + // Generation guard: drop reports for probes admitted before the + // current HALF_OPEN cycle. 
Probes use halfopen_gen_ exclusively — + // so a window_seconds reload (bumps closed_gen_, NOT halfopen_gen_) + // does NOT invalidate in-flight probes, which would otherwise + // strand half_open_inflight_ at its pre-reload value and wedge the + // slice in HALF_OPEN/half_open_full. + if (admission_generation != halfopen_gen_) { reports_stale_generation_.fetch_add(1, std::memory_order_relaxed); return; } @@ -321,8 +332,8 @@ void CircuitBreakerSlice::ReportSuccess(bool probe, return; } - // Non-probe success path. - if (admission_generation != generation_) { + // Non-probe success path — checked against closed_gen_. + if (admission_generation != closed_gen_) { reports_stale_generation_.fetch_add(1, std::memory_order_relaxed); return; } @@ -330,7 +341,7 @@ void CircuitBreakerSlice::ReportSuccess(bool probe, // transitioned (e.g., other requests in this burst tripped it), this // late outcome must NOT retroactively reset `consecutive_failures_` or // pollute the window — a fresh CLOSED cycle after recovery would start - // with bogus success history. (Transitions bump `generation_`, so the + // with bogus success history. (Transitions bump `closed_gen_`, so the // guard above catches this too; the state check is a direct guard for // observability clarity.) if (state_.load(std::memory_order_acquire) != State::CLOSED) return; @@ -347,7 +358,8 @@ void CircuitBreakerSlice::ReportFailure(FailureKind kind, bool probe, if (probe) { probe_failures_.fetch_add(1, std::memory_order_relaxed); - if (admission_generation != generation_) { + // Probes use halfopen_gen_ — see matching comment in ReportSuccess. + if (admission_generation != halfopen_gen_) { reports_stale_generation_.fetch_add(1, std::memory_order_relaxed); return; } @@ -364,8 +376,8 @@ void CircuitBreakerSlice::ReportFailure(FailureKind kind, bool probe, return; } - // Non-probe failure path. - if (admission_generation != generation_) { + // Non-probe failure path — checked against closed_gen_. 
+ if (admission_generation != closed_gen_) { reports_stale_generation_.fetch_add(1, std::memory_order_relaxed); return; } @@ -397,19 +409,24 @@ void CircuitBreakerSlice::Reload(const CircuitBreakerConfig& new_config) { config_ = new_config; if (window_changed) { // Resize wipes the failure-rate ring buckets. Without bumping - // generation_ here, late completions from pre-reload admissions - // would still carry the matching generation, pass the guard, and - // repopulate the freshly empty window — mixing pre-reload and - // post-reload traffic. A pre-reload failure plus one new failure - // could then immediately satisfy minimum_volume / failure_rate - // and trip on the next evaluation, despite this being a fresh - // observation cycle by operator intent. + // closed_gen_ here, late completions from pre-reload CLOSED + // admissions would pass the generation guard and repopulate the + // freshly empty window — mixing pre-reload and post-reload traffic + // in the rate-trip calc. + // + // CRUCIALLY: we bump ONLY closed_gen_, NOT halfopen_gen_. + // window_seconds affects only the CLOSED rate window. Bumping + // halfopen_gen_ too (as prior fix did) would invalidate in-flight + // probes, whose late reports could no longer decrement + // half_open_inflight_ or honor saw_failure/TripHalfOpenToOpen — + // wedging the slice in HALF_OPEN/half_open_full with full probe + // slots until another reset. Probe bookkeeping is untouched by + // Resize, so preserving halfopen_gen_ keeps probes live. // // Skip when enabled_changed is also true: the full-reset branch - // below bumps the generation as part of its larger reset, and - // double-bumping is harmless but noisy. + // below bumps both generations as part of its larger reset. 
window_.Resize(new_config.window_seconds); - if (!enabled_changed) ++generation_; + if (!enabled_changed) ++closed_gen_; } if (enabled_changed) { @@ -439,10 +456,12 @@ void CircuitBreakerSlice::Reload(const CircuitBreakerConfig& new_config) { half_open_successes_ = 0; half_open_saw_failure_ = false; first_reject_logged_for_open_ = false; - // Fresh generation: reports of requests admitted before this - // reset will carry the old generation and be silently dropped, - // preserving clean-restart semantics. - ++generation_; + // Fresh generations for BOTH domains: this is a full reset. + // Both pre-toggle non-probe admissions (closed_gen) and in-flight + // probes (halfopen_gen) are invalidated — their late reports + // silently drop, preserving clean-restart semantics. + ++closed_gen_; + ++halfopen_gen_; } // When `enabled` is unchanged: live state preserved — operator expects // new thresholds to apply to the next evaluation, not to reset an diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h index 828dfe4f..6a6f4354 100644 --- a/test/circuit_breaker_test.h +++ b/test/circuit_breaker_test.h @@ -1239,6 +1239,144 @@ void TestThresholdOnlyReloadDoesNotAdvanceGeneration() { } } +// BUG (review round 5, P1): Reload with window_seconds change while the +// slice is HALF_OPEN used to bump the single `generation_`, invalidating +// every in-flight probe. Those probes' late Report* calls then dropped +// WITHOUT decrementing half_open_inflight_, wedging the slice in HALF_OPEN +// with all probe slots stuck "in flight" forever — subsequent TryAcquires +// rejected with half_open_full indefinitely until another full reset. +// +// Fix: split generation into closed_gen_ (non-probe, CLOSED-state data) +// and halfopen_gen_ (probe, HALF_OPEN-state data). window_seconds reload +// bumps only closed_gen_ because it only resets CLOSED-state data. 
+void TestWindowResizeDuringHalfOpenDoesNotStrandProbes() {
+  std::cout << "\n[TEST] CB: window resize during HALF_OPEN preserves probes..."
+            << std::endl;
+  try {
+    auto cb = DefaultEnabledConfig();
+    cb.permitted_half_open_calls = 3;
+    auto clock = std::make_shared();
+    CircuitBreakerSlice slice("svc:h:p p=0", 0, cb,
+                              [clock]() { return clock->now; });
+
+    // Drive to HALF_OPEN.
+    for (int i = 0; i < 5; ++i) {
+      slice.ReportFailure(FailureKind::RESPONSE_5XX, false,
+                          slice.CurrentGenerationForTesting());
+    }
+    clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1));
+
+    // Admit all 3 probes (capture their admission tokens).
+    auto p1 = slice.TryAcquire();
+    auto p2 = slice.TryAcquire();
+    auto p3 = slice.TryAcquire();
+    bool all_admitted_probe = p1.decision == Decision::ADMITTED_PROBE &&
+                              p2.decision == Decision::ADMITTED_PROBE &&
+                              p3.decision == Decision::ADMITTED_PROBE;
+
+    // Reload window_seconds (enabled unchanged). PRE-fix: bumps single
+    // generation, invalidates p1/p2/p3 probes → stranded. POST-fix:
+    // bumps only closed_gen_, probe tokens still match halfopen_gen_.
+    auto resized = cb;
+    resized.window_seconds = 30;
+    slice.Reload(resized);
+
+    // closed_gen advanced, halfopen_gen preserved.
+    bool closed_gen_advanced = slice.CurrentClosedGenForTesting() !=
+                               p1.generation;  // informational only: p1 was
+                                               // admitted in HALF_OPEN, so this
+                                               // just shows the domains differ.
+    // Direct check: probe tokens must still match halfopen_gen_.
+    bool probe_gen_preserved =
+        p1.generation == slice.CurrentHalfOpenGenForTesting() &&
+        p2.generation == slice.CurrentHalfOpenGenForTesting() &&
+        p3.generation == slice.CurrentHalfOpenGenForTesting();
+
+    // Probes report success — each must be accepted and advance the
+    // HALF_OPEN → CLOSED transition.
+ slice.ReportSuccess(true, p1.generation); + slice.ReportSuccess(true, p2.generation); + slice.ReportSuccess(true, p3.generation); + + // After 3 probe successes at permitted_half_open_calls=3, slice + // MUST have transitioned to CLOSED. Pre-fix: probes dropped, no + // progression, still HALF_OPEN with inflight stuck at 3. + bool closed_now = slice.CurrentState() == State::CLOSED; + // None of the probes were dropped as stale. + bool no_stale_drops = slice.ReportsStaleGeneration() == 0; + // All 3 probe successes counted. + bool all_probes_counted = slice.ProbeSuccesses() == 3; + + bool pass = all_admitted_probe && probe_gen_preserved && + closed_now && no_stale_drops && all_probes_counted; + (void)closed_gen_advanced; // (informational only) + + TestFramework::RecordTest( + "CB: window resize during HALF_OPEN preserves probes", + pass, pass ? "" : + "admitted=" + std::to_string(all_admitted_probe) + + " probe_gen_preserved=" + std::to_string(probe_gen_preserved) + + " closed_now=" + std::to_string(closed_now) + + " stale=" + std::to_string(slice.ReportsStaleGeneration()) + + " probe_success=" + std::to_string(slice.ProbeSuccesses()), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: window resize during HALF_OPEN preserves probes", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Companion guard: window_seconds reload MUST still invalidate pre-reload +// CLOSED (non-probe) admissions. Ensures the split-gen didn't weaken the +// round-4 fix. +void TestWindowResizeStillInvalidatesClosedAdmissions() { + std::cout << "\n[TEST] CB: window resize invalidates CLOSED admissions..." 
+ << std::endl; + try { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 1000; // disable consec path + cb.failure_rate_threshold = 50; + cb.minimum_volume = 2; + cb.window_seconds = 10; + cb.permitted_half_open_calls = 5; + cb.base_open_duration_ms = 5000; + cb.max_open_duration_ms = 60000; + + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + auto admit_pre = slice.TryAcquire(); + uint64_t gen_pre = admit_pre.generation; + + auto resized = cb; resized.window_seconds = 30; + slice.Reload(resized); + + // Pre-reload CLOSED admission reports — must drop as stale. + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, gen_pre); + bool dropped_stale = slice.ReportsStaleGeneration() == 1; + + // And state must remain CLOSED (pre-reload failure did NOT seed window). + auto admit_post = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, admit_post.generation); + bool still_closed = slice.CurrentState() == State::CLOSED; + + bool pass = dropped_stale && still_closed; + TestFramework::RecordTest( + "CB: window resize invalidates CLOSED admissions", + pass, pass ? "" : + "dropped=" + std::to_string(dropped_stale) + + " closed=" + std::to_string(still_closed), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: window resize invalidates CLOSED admissions", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + void TestTransitionCallbackInvoked() { std::cout << "\n[TEST] CB: transition callback invoked..." 
<< std::endl; try { @@ -1315,6 +1453,8 @@ void RunAllTests() { TestStaleGenerationReportsDroppedAcrossStateTransitions(); TestWindowResizeAdvancesGeneration(); TestThresholdOnlyReloadDoesNotAdvanceGeneration(); + TestWindowResizeDuringHalfOpenDoesNotStrandProbes(); + TestWindowResizeStillInvalidatesClosedAdmissions(); TestTransitionCallbackInvoked(); } diff --git a/test/config_test.h b/test/config_test.h index 94c60763..69b5cfe7 100644 --- a/test/config_test.h +++ b/test/config_test.h @@ -549,25 +549,55 @@ namespace ConfigTests { "permitted_half_open_calls must be in [1, 1000]"); } - // Test 14: Equality operator covers circuit_breaker field + // Test 14: UpstreamConfig::operator== EXCLUDES circuit_breaker field. + // Rationale: breaker tuning is live-reloadable (design §10). Including + // it here would make HttpServer::Reload (http_server.cc:3383) treat a + // breaker-only edit as an upstream topology change, fire the "restart + // required" warning, and block the hot-reload path. Topology fields + // (name/host/port/tls/pool/proxy) ARE included — they require a restart. void TestCircuitBreakerEquality() { - std::cout << "\n[TEST] Circuit Breaker Equality..." << std::endl; + std::cout << "\n[TEST] Circuit Breaker Equality (topology only)..." << std::endl; try { UpstreamConfig a; a.name = "svc"; a.host = "h"; a.port = 80; UpstreamConfig b = a; + + // Default equal. bool equal_default = (a == b); + // Circuit-breaker-only edit must NOT change UpstreamConfig equality. b.circuit_breaker.enabled = true; - bool not_equal_after_diff = (a != b); - - bool pass = equal_default && not_equal_after_diff; - TestFramework::RecordTest("Circuit Breaker Equality", pass, - pass ? "" : "operator== failed for circuit_breaker", + b.circuit_breaker.window_seconds = 30; + bool topology_still_equal = (a == b); + + // BUT CircuitBreakerConfig::operator== catches the field diff + // (Phase 8 reload uses this to detect what changed per-host). 
+ bool cb_fields_differ = (a.circuit_breaker != b.circuit_breaker); + + // Topology changes DO make configs unequal. + UpstreamConfig c = a; + c.host = "different"; + bool topology_changed = (a != c); + + UpstreamConfig d = a; + d.port = 9999; + bool port_change_detected = (a != d); + + bool pass = equal_default && topology_still_equal && + cb_fields_differ && topology_changed && + port_change_detected; + TestFramework::RecordTest("Circuit Breaker Equality (topology only)", + pass, + pass ? "" : + "equal_default=" + std::to_string(equal_default) + + " topology_still_equal=" + std::to_string(topology_still_equal) + + " cb_fields_differ=" + std::to_string(cb_fields_differ) + + " topology_changed=" + std::to_string(topology_changed) + + " port_change_detected=" + std::to_string(port_change_detected), TestFramework::TestCategory::OTHER); } catch (const std::exception& e) { - TestFramework::RecordTest("Circuit Breaker Equality", false, e.what(), - TestFramework::TestCategory::OTHER); + TestFramework::RecordTest("Circuit Breaker Equality (topology only)", + false, e.what(), TestFramework::TestCategory::OTHER); } } From eae864e641162a8b8080eb9e79f19720b8d79f62 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 00:24:29 +0800 Subject: [PATCH 06/37] Fix review comment --- include/config/server_config.h | 21 ++++++++++++--------- test/config_test.h | 32 ++++++++++++++++---------------- 2 files changed, 28 insertions(+), 25 deletions(-) diff --git a/include/config/server_config.h b/include/config/server_config.h index 6a82521a..606de9d9 100644 --- a/include/config/server_config.h +++ b/include/config/server_config.h @@ -183,17 +183,20 @@ struct UpstreamConfig { ProxyConfig proxy; CircuitBreakerConfig circuit_breaker; - // Intentionally EXCLUDES circuit_breaker — breaker tuning is live- - // reloadable (§10 of CIRCUIT_BREAKER_DESIGN.md) and must not trigger - // the "upstream configuration changes require a restart" warning in - // HttpServer::Reload 
(http_server.cc:3383). Phase 8's breaker-reload - // path compares CircuitBreakerConfig fields directly (per-host - // iteration), not via this operator==. All other fields here are - // restart-required: changing name/host/port/tls rebuilds pool - // topology; changing pool/proxy would re-register routes. + // Includes circuit_breaker until Phase 8 ships CircuitBreakerManager::Reload. + // A CB-only SIGHUP currently has no propagation path into live slice state, + // so operator== must return false to trigger the "restart required" warning + // rather than silently committing the new config object while the live slices + // continue running with the old settings. + // + // TODO(phase-8): once CircuitBreakerManager::Reload is wired into + // HttpServer::Reload, remove circuit_breaker from this operator and diff it + // separately (per-host CircuitBreakerConfig comparison) so breaker-only + // edits are hot-reloadable without a restart. bool operator==(const UpstreamConfig& o) const { return name == o.name && host == o.host && port == o.port && - tls == o.tls && pool == o.pool && proxy == o.proxy; + tls == o.tls && pool == o.pool && proxy == o.proxy && + circuit_breaker == o.circuit_breaker; } bool operator!=(const UpstreamConfig& o) const { return !(*this == o); } }; diff --git a/test/config_test.h b/test/config_test.h index 69b5cfe7..18ee718f 100644 --- a/test/config_test.h +++ b/test/config_test.h @@ -549,14 +549,14 @@ namespace ConfigTests { "permitted_half_open_calls must be in [1, 1000]"); } - // Test 14: UpstreamConfig::operator== EXCLUDES circuit_breaker field. - // Rationale: breaker tuning is live-reloadable (design §10). Including - // it here would make HttpServer::Reload (http_server.cc:3383) treat a - // breaker-only edit as an upstream topology change, fire the "restart - // required" warning, and block the hot-reload path. Topology fields - // (name/host/port/tls/pool/proxy) ARE included — they require a restart. 
+ // Test 14: UpstreamConfig::operator== INCLUDES circuit_breaker until Phase 8. + // Until CircuitBreakerManager::Reload is wired in HttpServer::Reload, a + // CB-only SIGHUP has no propagation path. Keeping circuit_breaker in the + // equality check ensures the server fires the "restart required" warning + // rather than silently reporting "reload OK" with stale live settings. + // TODO(phase-8): flip this test when CB hot-reload is implemented. void TestCircuitBreakerEquality() { - std::cout << "\n[TEST] Circuit Breaker Equality (topology only)..." << std::endl; + std::cout << "\n[TEST] Circuit Breaker Equality (CB included until Phase 8)..." << std::endl; try { UpstreamConfig a; a.name = "svc"; a.host = "h"; a.port = 80; @@ -565,16 +565,16 @@ namespace ConfigTests { // Default equal. bool equal_default = (a == b); - // Circuit-breaker-only edit must NOT change UpstreamConfig equality. + // Circuit-breaker-only edit DOES change UpstreamConfig equality + // (until Phase 8 ships the live-reload path). b.circuit_breaker.enabled = true; b.circuit_breaker.window_seconds = 30; - bool topology_still_equal = (a == b); + bool cb_edit_detected = (a != b); - // BUT CircuitBreakerConfig::operator== catches the field diff - // (Phase 8 reload uses this to detect what changed per-host). + // CircuitBreakerConfig::operator== agrees on the field diff. bool cb_fields_differ = (a.circuit_breaker != b.circuit_breaker); - // Topology changes DO make configs unequal. + // Topology changes also make configs unequal. 
UpstreamConfig c = a; c.host = "different"; bool topology_changed = (a != c); @@ -583,20 +583,20 @@ namespace ConfigTests { d.port = 9999; bool port_change_detected = (a != d); - bool pass = equal_default && topology_still_equal && + bool pass = equal_default && cb_edit_detected && cb_fields_differ && topology_changed && port_change_detected; - TestFramework::RecordTest("Circuit Breaker Equality (topology only)", + TestFramework::RecordTest("Circuit Breaker Equality (CB included until Phase 8)", pass, pass ? "" : "equal_default=" + std::to_string(equal_default) + - " topology_still_equal=" + std::to_string(topology_still_equal) + + " cb_edit_detected=" + std::to_string(cb_edit_detected) + " cb_fields_differ=" + std::to_string(cb_fields_differ) + " topology_changed=" + std::to_string(topology_changed) + " port_change_detected=" + std::to_string(port_change_detected), TestFramework::TestCategory::OTHER); } catch (const std::exception& e) { - TestFramework::RecordTest("Circuit Breaker Equality (topology only)", + TestFramework::RecordTest("Circuit Breaker Equality (CB included until Phase 8)", false, e.what(), TestFramework::TestCategory::OTHER); } } From a52f1dfeab9238d65fa64f22f69d26e0bdffc846 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 00:43:39 +0800 Subject: [PATCH 07/37] Fix review comment --- server/circuit_breaker_slice.cc | 15 ++++++- test/circuit_breaker_test.h | 76 +++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+), 1 deletion(-) diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index 03313173..59970641 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -426,7 +426,20 @@ void CircuitBreakerSlice::Reload(const CircuitBreakerConfig& new_config) { // Skip when enabled_changed is also true: the full-reset branch // below bumps both generations as part of its larger reset. 
window_.Resize(new_config.window_seconds); - if (!enabled_changed) ++closed_gen_; + if (!enabled_changed) { + // Reset consecutive_failures_ alongside the window wipe. + // Both are CLOSED-domain state from the same observation cycle. + // Bumping closed_gen_ drops all pre-reload CLOSED reports + // (correct — they must not seed the fresh window). But if + // consecutive_failures_ is NOT also reset, those dropped reports + // can no longer clear or advance the counter either, so the + // leftover count becomes an orphaned value that mis-fires future + // trip evaluations (spurious trip: pre-reload success was going + // to clear the counter but got dropped, so the next real failure + // crosses the threshold using a stale count). + consecutive_failures_ = 0; + ++closed_gen_; + } } if (enabled_changed) { diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h index 6a6f4354..bbd9f5e7 100644 --- a/test/circuit_breaker_test.h +++ b/test/circuit_breaker_test.h @@ -1377,6 +1377,81 @@ void TestWindowResizeStillInvalidatesClosedAdmissions() { } } +// BUG (review round 6, P2): Reload with window_seconds change preserved +// consecutive_failures_ while bumping closed_gen_. Pre-reload CLOSED +// reports are correctly blocked (stale gen), but they can no longer +// clear or advance consecutive_failures_ either. The counter becomes an +// orphaned relic from a prior observation cycle: +// +// Scenario: 4 consecutive failures (threshold=5), reload window_seconds. +// Pre-reload success arrives → stale gen → DROPPED. +// Without fix: consecutive_failures_ stays at 4. +// Next real failure: consecutive_failures_ = 5 → SPURIOUS TRIP. +// +// Fix: reset consecutive_failures_ = 0 in the same branch that clears +// the window on resize. Both are CLOSED-domain state from the same +// observation cycle; invalidating one without resetting the other leaves +// an inconsistent counter. 
+void TestWindowResizeResetConsecutiveFailures() { + std::cout << "\n[TEST] CB: window resize resets consecutive_failures_..." + << std::endl; + try { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 5; + cb.failure_rate_threshold = 100; // rate-trip disabled (100% threshold) + cb.minimum_volume = 1000; // rate-trip disabled (high volume gate) + cb.window_seconds = 10; + cb.permitted_half_open_calls = 5; + cb.base_open_duration_ms = 5000; + cb.max_open_duration_ms = 60000; + + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Accumulate 4 consecutive failures (one below the threshold of 5). + for (int i = 0; i < 4; ++i) { + auto a = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); + } + bool pre_reload_closed = slice.CurrentState() == State::CLOSED; + + // Capture a pre-reload admission. + auto pre_admit = slice.TryAcquire(); + uint64_t pre_gen = pre_admit.generation; + + // Window-only reload: wipes the rate window, bumps closed_gen_, + // and (with the fix) resets consecutive_failures_ to 0. + auto resized = cb; + resized.window_seconds = 30; + slice.Reload(resized); + + // Pre-reload success arrives late — must be dropped (stale gen). + slice.ReportSuccess(false, pre_gen); + bool stale_dropped = slice.ReportsStaleGeneration() == 1; + + // Verify consecutive_failures_ was reset: one real post-reload failure + // must NOT trip the breaker (counter is 1/5, not 5/5). + auto post_admit = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, post_admit.generation); + bool no_spurious_trip = slice.CurrentState() == State::CLOSED; + + bool pass = pre_reload_closed && stale_dropped && no_spurious_trip; + TestFramework::RecordTest( + "CB: window resize resets consecutive_failures_", + pass, pass ? 
"" : + "pre_reload_closed=" + std::to_string(pre_reload_closed) + + " stale_dropped=" + std::to_string(stale_dropped) + + " no_spurious_trip=" + std::to_string(no_spurious_trip), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: window resize resets consecutive_failures_", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + void TestTransitionCallbackInvoked() { std::cout << "\n[TEST] CB: transition callback invoked..." << std::endl; try { @@ -1455,6 +1530,7 @@ void RunAllTests() { TestThresholdOnlyReloadDoesNotAdvanceGeneration(); TestWindowResizeDuringHalfOpenDoesNotStrandProbes(); TestWindowResizeStillInvalidatesClosedAdmissions(); + TestWindowResizeResetConsecutiveFailures(); TestTransitionCallbackInvoked(); } From 833c150cbcd3c451af75cddaa25898a4699448f7 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 01:04:04 +0800 Subject: [PATCH 08/37] Fix review comment --- .../circuit_breaker/circuit_breaker_slice.h | 7 ++ server/circuit_breaker_slice.cc | 21 ++++- test/circuit_breaker_test.h | 87 +++++++++++++++++++ 3 files changed, 112 insertions(+), 3 deletions(-) diff --git a/include/circuit_breaker/circuit_breaker_slice.h b/include/circuit_breaker/circuit_breaker_slice.h index f08a8358..0bf16afb 100644 --- a/include/circuit_breaker/circuit_breaker_slice.h +++ b/include/circuit_breaker/circuit_breaker_slice.h @@ -138,6 +138,13 @@ class CircuitBreakerSlice { int half_open_inflight_ = 0; int half_open_successes_ = 0; bool half_open_saw_failure_ = false; + // Probe budget for the CURRENT HALF_OPEN cycle. Snapshotted from + // config_.permitted_half_open_calls at the moment TransitionOpenToHalfOpen + // fires. 
A live Reload() may lower (or raise) the config field mid-cycle; + // the snapshot ensures TryAcquire's slot gate and ReportSuccess's close + // check both operate against the budget that was in effect when the probes + // were admitted — preventing early close or indefinitely-open behaviour. + int half_open_permitted_snapshot_ = 0; // Observability counters. std::atomic trips_{0}; diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index 59970641..4bc03410 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -108,6 +108,15 @@ void CircuitBreakerSlice::TransitionOpenToHalfOpen() { half_open_inflight_ = 0; half_open_successes_ = 0; half_open_saw_failure_ = false; + // Snapshot the probe budget for this cycle. A live Reload() during this + // HALF_OPEN episode may lower or raise config_.permitted_half_open_calls, + // but TryAcquire's slot gate (Case B) and ReportSuccess's close check must + // both operate against the budget that was in effect when probes were + // admitted. Without the snapshot: lowering the limit causes premature close + // (first success satisfies the reduced count → TransitionHalfOpenToClosed + // bumps halfopen_gen_ → remaining admitted probes become stale → their + // failures are silently dropped and the breaker falsely closes). + half_open_permitted_snapshot_ = config_.permitted_half_open_calls; // Reset the info-log "first reject" breadcrumb so the first rejection // observed in the HALF_OPEN phase surfaces at info, not debug. 
HALF_OPEN // rejection (recovery attempt failing or probe budget full) is @@ -122,7 +131,7 @@ void CircuitBreakerSlice::TransitionOpenToHalfOpen() { logging::Get()->info( "circuit breaker half-open {} probes_allowed={}", - host_label_, config_.permitted_half_open_calls); + host_label_, half_open_permitted_snapshot_); if (transition_cb_) { transition_cb_(State::OPEN, State::HALF_OPEN, "open_elapsed"); @@ -238,7 +247,10 @@ CircuitBreakerSlice::Admission CircuitBreakerSlice::TryAcquire() { } // Case B: probe budget fully in flight. "No capacity" — bump the // dedicated counter so dashboards can tell these two apart. - if (half_open_inflight_ >= config_.permitted_half_open_calls) { + // Use the cycle snapshot, not config_, so a live Reload() that + // lowers permitted_half_open_calls mid-cycle doesn't change how many + // probes were promised to this cycle. + if (half_open_inflight_ >= half_open_permitted_snapshot_) { return Admission{RejectWithLog("half_open_full", /*half_open_full=*/true), /*generation=*/0}; @@ -326,7 +338,10 @@ void CircuitBreakerSlice::ReportSuccess(bool probe, return; } half_open_successes_++; - if (half_open_successes_ >= config_.permitted_half_open_calls) { + // Use the cycle snapshot so a mid-cycle Reload() that lowers the + // limit doesn't close the breaker early (before all admitted probes + // have reported back), silently dropping the remaining probes' failures. + if (half_open_successes_ >= half_open_permitted_snapshot_) { TransitionHalfOpenToClosed(); } return; diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h index bbd9f5e7..801b2048 100644 --- a/test/circuit_breaker_test.h +++ b/test/circuit_breaker_test.h @@ -1377,6 +1377,92 @@ void TestWindowResizeStillInvalidatesClosedAdmissions() { } } +// BUG (review round 7, P2): Reload() lowering permitted_half_open_calls +// while a HALF_OPEN cycle is active could close the breaker early and +// discard failures from already-admitted probes. 
+// +// Scenario (5-probe cycle reloaded down to 1): +// TransitionOpenToHalfOpen: snapshot=5, admit 5 probes. +// Reload: permitted_half_open_calls → 1. +// First success arrives → half_open_successes_=1 ≥ NEW limit (1) +// → TransitionHalfOpenToClosed() fires → halfopen_gen_ bumped. +// Remaining 4 admitted probes are now stale → their failures DROPPED. +// Breaker falsely closes even though 4 probes have not reported yet. +// +// Fix: snapshot config_.permitted_half_open_calls into +// half_open_permitted_snapshot_ at TransitionOpenToHalfOpen time. +// TryAcquire (slot gate) and ReportSuccess (close check) both use the +// snapshot so the cycle budget is frozen for its lifetime. +void TestHalfOpenBudgetFrozenAcrossReload() { + std::cout << "\n[TEST] CB: HALF_OPEN budget frozen across mid-cycle reload..." + << std::endl; + try { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 5; + cb.failure_rate_threshold = 100; // disable rate-trip + cb.minimum_volume = 1000; // disable rate-trip + cb.window_seconds = 10; + cb.permitted_half_open_calls = 2; // exactly 2 probes for clean drain + cb.base_open_duration_ms = 100; + cb.max_open_duration_ms = 60000; + + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Trip the breaker. + for (int i = 0; i < 5; ++i) { + auto a = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); + } + bool is_open = slice.CurrentState() == State::OPEN; + + // Advance past open_until → OPEN→HALF_OPEN on next TryAcquire. + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + + // Admit both probes (budget=2; snapshot set to 2 at TransitionOpenToHalfOpen). 
+ auto a0 = slice.TryAcquire(); + auto a1 = slice.TryAcquire(); + bool both_probes = (a0.decision == Decision::ADMITTED_PROBE) && + (a1.decision == Decision::ADMITTED_PROBE); + bool is_halfopen = slice.CurrentState() == State::HALF_OPEN; + + // Lower the limit to 1 mid-cycle. + auto lowered = cb; + lowered.permitted_half_open_calls = 1; + slice.Reload(lowered); + + // First probe succeeds. + // Without fix: successes(1) >= NEW config(1) → TransitionHalfOpenToClosed + // → halfopen_gen_ bumped → second probe's failure DROPPED + // → breaker falsely CLOSED. + // With fix: successes(1) >= snapshot(2) is false → stays HALF_OPEN. + slice.ReportSuccess(true, a0.generation); + bool not_closed_after_one = slice.CurrentState() == State::HALF_OPEN; + + // Second probe fails. inflight drops to 0 → TripHalfOpenToOpen fires. + slice.ReportFailure(FailureKind::RESPONSE_5XX, true, a1.generation); + bool retripped = slice.CurrentState() == State::OPEN; + + bool pass = is_open && both_probes && is_halfopen && + not_closed_after_one && retripped; + TestFramework::RecordTest( + "CB: HALF_OPEN budget frozen across mid-cycle reload", + pass, pass ? "" : + "is_open=" + std::to_string(is_open) + + " both_probes=" + std::to_string(both_probes) + + " is_halfopen=" + std::to_string(is_halfopen) + + " not_closed_after_one=" + std::to_string(not_closed_after_one) + + " retripped=" + std::to_string(retripped), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: HALF_OPEN budget frozen across mid-cycle reload", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + // BUG (review round 6, P2): Reload with window_seconds change preserved // consecutive_failures_ while bumping closed_gen_. 
Pre-reload CLOSED // reports are correctly blocked (stale gen), but they can no longer @@ -1531,6 +1617,7 @@ void RunAllTests() { TestWindowResizeDuringHalfOpenDoesNotStrandProbes(); TestWindowResizeStillInvalidatesClosedAdmissions(); TestWindowResizeResetConsecutiveFailures(); + TestHalfOpenBudgetFrozenAcrossReload(); TestTransitionCallbackInvoked(); } From b8d3a1f7fa20d462c7e27868fc21460a16485f15 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 08:49:54 +0800 Subject: [PATCH 09/37] Fix review comment --- server/circuit_breaker_window.cc | 10 +++++++-- test/circuit_breaker_test.h | 38 ++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 2 deletions(-) diff --git a/server/circuit_breaker_window.cc b/server/circuit_breaker_window.cc index 06fccc18..776c00ec 100644 --- a/server/circuit_breaker_window.cc +++ b/server/circuit_breaker_window.cc @@ -16,8 +16,14 @@ static inline size_t BucketIndex(int64_t epoch_sec, int window_seconds) { } CircuitBreakerWindow::CircuitBreakerWindow(int window_seconds) - : window_seconds_(window_seconds), - buckets_(window_seconds > 0 ? static_cast(window_seconds) : 1) { + // Clamp to a minimum of 1 bucket. ConfigLoader::Validate() rejects + // window_seconds <= 0 on the production path, but the constructor is a + // public API and programmatic callers (tests, future direct users) may + // bypass that validation. Without the clamp, BucketIndex() does `% 0` on + // the first Add/TotalCount and crashes; negative values violate the ring + // math. Matches Resize()'s clamp so the two entry points are symmetric. + : window_seconds_(window_seconds > 0 ? 
window_seconds : 1), + buckets_(static_cast(window_seconds_)) { } int64_t CircuitBreakerWindow::ToEpochSec( diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h index 801b2048..f0e32d30 100644 --- a/test/circuit_breaker_test.h +++ b/test/circuit_breaker_test.h @@ -1538,6 +1538,43 @@ void TestWindowResizeResetConsecutiveFailures() { } } +// BUG (review round 8, P2): CircuitBreakerWindow's constructor allocated +// `max(1, window_seconds)` buckets but stored the RAW window_seconds_ value. +// Programmatic callers bypassing ConfigLoader::Validate() (tests, future +// direct users) that passed window_seconds <= 0 would trigger BucketIndex's +// `% window_seconds_` on the first Add*/TotalCount call — dividing by zero +// for 0, or violating ring math for negatives. Resize() already clamped. +// Fix: constructor applies the same clamp so both entry points are symmetric. +void TestWindowNonPositiveWindowSizeClamp() { + std::cout << "\n[TEST] CB: window ctor clamps non-positive sizes..." + << std::endl; + try { + // Zero would have crashed on % 0 before the fix. + CircuitBreakerWindow w0(0); + auto t = std::chrono::steady_clock::time_point(std::chrono::seconds(1000)); + w0.AddSuccess(t); + w0.AddFailure(t); + bool zero_ok = (w0.TotalCount(t) == 2) && (w0.FailureCount(t) == 1); + + // Negative values would have violated the ring math. + CircuitBreakerWindow wn(-5); + wn.AddSuccess(t); + bool negative_ok = wn.TotalCount(t) == 1; + + bool pass = zero_ok && negative_ok; + TestFramework::RecordTest( + "CB: window ctor clamps non-positive sizes", + pass, pass ? 
"" : + "zero_ok=" + std::to_string(zero_ok) + + " negative_ok=" + std::to_string(negative_ok), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: window ctor clamps non-positive sizes", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + void TestTransitionCallbackInvoked() { std::cout << "\n[TEST] CB: transition callback invoked..." << std::endl; try { @@ -1618,6 +1655,7 @@ void RunAllTests() { TestWindowResizeStillInvalidatesClosedAdmissions(); TestWindowResizeResetConsecutiveFailures(); TestHalfOpenBudgetFrozenAcrossReload(); + TestWindowNonPositiveWindowSizeClamp(); TestTransitionCallbackInvoked(); } From 679fc733918dddafcfc33b802b8dd08729abc125 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 09:13:42 +0800 Subject: [PATCH 10/37] Fix review comment --- .../circuit_breaker/circuit_breaker_slice.h | 7 +- include/config/server_config.h | 14 ++-- server/circuit_breaker_slice.cc | 14 ++-- server/config_loader.cc | 23 ++----- test/circuit_breaker_test.h | 65 +++++++++++++++++++ test/config_test.h | 21 ++---- 6 files changed, 99 insertions(+), 45 deletions(-) diff --git a/include/circuit_breaker/circuit_breaker_slice.h b/include/circuit_breaker/circuit_breaker_slice.h index 0bf16afb..1ff8fe1d 100644 --- a/include/circuit_breaker/circuit_breaker_slice.h +++ b/include/circuit_breaker/circuit_breaker_slice.h @@ -205,7 +205,12 @@ class CircuitBreakerSlice { std::chrono::nanoseconds ComputeOpenDuration() const; // Check whether CLOSED trip conditions are met. Called after every failure. - bool ShouldTripClosed(); + // Takes `now` as a parameter so the caller can record the failure and + // evaluate the trip against THE SAME timestamp — otherwise a clock tick + // between AddFailure() and ShouldTripClosed() can advance the ring and + // wipe the just-recorded failure (critical when window_seconds is small: + // with window=1, a 1-second delta triggers the full-reset path). 
+ bool ShouldTripClosed(std::chrono::steady_clock::time_point now); std::chrono::steady_clock::time_point Now() const; }; diff --git a/include/config/server_config.h b/include/config/server_config.h index 606de9d9..5a6a39f4 100644 --- a/include/config/server_config.h +++ b/include/config/server_config.h @@ -152,10 +152,12 @@ struct CircuitBreakerConfig { // Safety valve (future-proof for load-balanced services; no-op v1). int max_ejection_percent_per_host_set = 50; - // Retry budget (orthogonal to the breaker). Caps concurrent retries to - // max(retry_budget_min_concurrency, in_flight * retry_budget_percent/100). - int retry_budget_percent = 20; - int retry_budget_min_concurrency = 3; + // NOTE: retry_budget_percent and retry_budget_min_concurrency have been + // REMOVED from Phase 2. They'll be re-added in Phase 3 when the + // RetryBudget class is introduced (design §4.5). Exposing them here as + // config knobs without any runtime code reading them was misleading to + // operators — setting them produced no protection against retry storms + // since ProxyHandler's RetryPolicy reads proxy.retry.*, not these fields. 
bool operator==(const CircuitBreakerConfig& o) const { return enabled == o.enabled && @@ -167,9 +169,7 @@ struct CircuitBreakerConfig { permitted_half_open_calls == o.permitted_half_open_calls && base_open_duration_ms == o.base_open_duration_ms && max_open_duration_ms == o.max_open_duration_ms && - max_ejection_percent_per_host_set == o.max_ejection_percent_per_host_set && - retry_budget_percent == o.retry_budget_percent && - retry_budget_min_concurrency == o.retry_budget_min_concurrency; + max_ejection_percent_per_host_set == o.max_ejection_percent_per_host_set; } bool operator!=(const CircuitBreakerConfig& o) const { return !(*this == o); } }; diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index 4bc03410..3e22f014 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -49,11 +49,11 @@ std::chrono::nanoseconds CircuitBreakerSlice::ComputeOpenDuration() const { return std::chrono::milliseconds(scaled_ms); } -bool CircuitBreakerSlice::ShouldTripClosed() { +bool CircuitBreakerSlice::ShouldTripClosed( + std::chrono::steady_clock::time_point now) { if (consecutive_failures_ >= config_.consecutive_failure_threshold) { return true; } - auto now = Now(); int64_t total = window_.TotalCount(now); if (total < config_.minimum_volume) return false; int64_t fails = window_.FailureCount(now); @@ -406,9 +406,15 @@ void CircuitBreakerSlice::ReportFailure(FailureKind kind, bool probe, if (state_.load(std::memory_order_acquire) != State::CLOSED) return; consecutive_failures_++; - window_.AddFailure(Now()); + // Capture Now() once and reuse for both the record and the trip check. + // Separate Now() calls can cross a second boundary, letting TotalCount's + // internal Advance() zero the bucket we just wrote — with window_seconds=1, + // a 1-second delta trips the Advance full-reset path and the just-recorded + // failure disappears from the ring, missing a rate trip that should fire. 
+ auto now = Now(); + window_.AddFailure(now); - if (ShouldTripClosed()) { + if (ShouldTripClosed(now)) { const char* trigger = (consecutive_failures_ >= config_.consecutive_failure_threshold) ? "consecutive" : "rate"; diff --git a/server/config_loader.cc b/server/config_loader.cc index f6ff4698..552ccf5c 100644 --- a/server/config_loader.cc +++ b/server/config_loader.cc @@ -286,10 +286,9 @@ ServerConfig ConfigLoader::LoadFromString(const std::string& json_str) { cb.value("max_open_duration_ms", 60000); upstream.circuit_breaker.max_ejection_percent_per_host_set = cb.value("max_ejection_percent_per_host_set", 50); - upstream.circuit_breaker.retry_budget_percent = - cb.value("retry_budget_percent", 20); - upstream.circuit_breaker.retry_budget_min_concurrency = - cb.value("retry_budget_min_concurrency", 3); + // retry_budget_* fields removed from Phase 2 — re-added in + // Phase 3 when the RetryBudget class lands. Unknown keys in + // input JSON are silently ignored by nlohmann::json. } config.upstreams.push_back(std::move(upstream)); @@ -873,16 +872,7 @@ void ConfigLoader::Validate(const ServerConfig& config) { idx + " ('" + u.name + "'): circuit_breaker.max_ejection_percent_per_host_set must be in [0, 100]"); } - if (cb.retry_budget_percent < 0 || cb.retry_budget_percent > 100) { - throw std::invalid_argument( - idx + " ('" + u.name + - "'): circuit_breaker.retry_budget_percent must be in [0, 100]"); - } - if (cb.retry_budget_min_concurrency < 0) { - throw std::invalid_argument( - idx + " ('" + u.name + - "'): circuit_breaker.retry_budget_min_concurrency must be >= 0"); - } + // retry_budget_* validation removed — fields moved to Phase 3. } // Validate method names — reject unknowns and duplicates. // Duplicates would cause RouteAsync to throw at startup. 
@@ -1164,10 +1154,7 @@ std::string ConfigLoader::ToJson(const ServerConfig& config) { u.circuit_breaker.max_open_duration_ms; cbj["max_ejection_percent_per_host_set"] = u.circuit_breaker.max_ejection_percent_per_host_set; - cbj["retry_budget_percent"] = - u.circuit_breaker.retry_budget_percent; - cbj["retry_budget_min_concurrency"] = - u.circuit_breaker.retry_budget_min_concurrency; + // retry_budget_* fields dropped from serialization — Phase 3 adds. uj["circuit_breaker"] = cbj; } j["upstreams"].push_back(uj); diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h index f0e32d30..bd2809f4 100644 --- a/test/circuit_breaker_test.h +++ b/test/circuit_breaker_test.h @@ -1538,6 +1538,70 @@ void TestWindowResizeResetConsecutiveFailures() { } } +// BUG (review round 9, P2-1): ReportFailure captured Now() separately in +// AddFailure() and ShouldTripClosed()'s internal TotalCount/FailureCount +// calls. If a second boundary elapsed between the two calls, Advance() could +// wipe the just-recorded failure — with window_seconds=1, the 1-second delta +// hits the delta >= window_seconds full-reset path and the failure +// disappears before the trip evaluation runs. Fix: capture Now() once in +// ReportFailure and thread it through ShouldTripClosed(now), AddFailure(now). +// +// Regression test injects a time source that returns T on the first call +// and T+1s on every subsequent call, simulating the boundary crossing. +// Post-fix, ReportFailure only calls Now() once — the fix is effective. +// Pre-fix, the second Now() call inside ShouldTripClosed would advance the +// ring and wipe the failure → no trip. +void TestReportFailureUsesOneTimestampAcrossTripEval() { + std::cout << "\n[TEST] CB: ReportFailure uses single timestamp for trip eval..." 
+ << std::endl; + try { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 1000; // disable consec path + cb.failure_rate_threshold = 100; // rate=100% to trip on fail + cb.minimum_volume = 1; // single failure suffices + cb.window_seconds = 1; // boundary-sensitive + cb.permitted_half_open_calls = 5; + cb.base_open_duration_ms = 5000; + cb.max_open_duration_ms = 60000; + + // Time source returns base on call #1 and base+1s on every call after. + // This simulates a clock tick between AddFailure (call 1) and any + // subsequent Now() inside ShouldTripClosed (call 2+). + auto base = std::chrono::steady_clock::time_point( + std::chrono::seconds(1'000'000)); + int call_count = 0; + auto time_source = [&call_count, base]() { + int n = call_count++; + return n == 0 ? base : base + std::chrono::seconds(1); + }; + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, time_source); + + // Admit + fail one request. + // Pre-fix trace (BUGGY): AddFailure(base) records in bucket[0]. Then + // ShouldTripClosed()'s internal TotalCount(base+1s) calls Advance + // → delta=1 >= window=1 → full reset wipes the bucket → total=0 < + // minimum_volume=1 → NO TRIP. Rate trip missed. + // Post-fix: ReportFailure captures Now() once (=base), passes to + // AddFailure(base) AND ShouldTripClosed(base). Ring stays aligned; + // total=1, failures=1 → rate fires → TRIP to OPEN. + auto a = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); + + bool pass = slice.CurrentState() == State::OPEN; + TestFramework::RecordTest( + "CB: ReportFailure uses single timestamp for trip eval", + pass, pass ? 
"" : + "expected OPEN, got state=" + + std::to_string(static_cast(slice.CurrentState())), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: ReportFailure uses single timestamp for trip eval", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + // BUG (review round 8, P2): CircuitBreakerWindow's constructor allocated // `max(1, window_seconds)` buckets but stored the RAW window_seconds_ value. // Programmatic callers bypassing ConfigLoader::Validate() (tests, future @@ -1656,6 +1720,7 @@ void RunAllTests() { TestWindowResizeResetConsecutiveFailures(); TestHalfOpenBudgetFrozenAcrossReload(); TestWindowNonPositiveWindowSizeClamp(); + TestReportFailureUsesOneTimestampAcrossTripEval(); TestTransitionCallbackInvoked(); } diff --git a/test/config_test.h b/test/config_test.h index 18ee718f..f0bd4599 100644 --- a/test/config_test.h +++ b/test/config_test.h @@ -362,9 +362,8 @@ namespace ConfigTests { cb.permitted_half_open_calls == 5 && cb.base_open_duration_ms == 5000 && cb.max_open_duration_ms == 60000 && - cb.max_ejection_percent_per_host_set == 50 && - cb.retry_budget_percent == 20 && - cb.retry_budget_min_concurrency == 3; + cb.max_ejection_percent_per_host_set == 50; + // retry_budget_* fields removed from Phase 2 — Phase 3 adds. TestFramework::RecordTest("Circuit Breaker Defaults", pass, pass ? 
"" : "default value mismatch", TestFramework::TestCategory::OTHER); @@ -393,9 +392,7 @@ namespace ConfigTests { "permitted_half_open_calls": 3, "base_open_duration_ms": 2000, "max_open_duration_ms": 120000, - "max_ejection_percent_per_host_set": 33, - "retry_budget_percent": 10, - "retry_budget_min_concurrency": 5 + "max_ejection_percent_per_host_set": 33 } }] })"; @@ -409,9 +406,7 @@ namespace ConfigTests { cb.permitted_half_open_calls == 3 && cb.base_open_duration_ms == 2000 && cb.max_open_duration_ms == 120000 && - cb.max_ejection_percent_per_host_set == 33 && - cb.retry_budget_percent == 10 && - cb.retry_budget_min_concurrency == 5; + cb.max_ejection_percent_per_host_set == 33; TestFramework::RecordTest("Circuit Breaker JSON Parse", pass, pass ? "" : "parsed values mismatch", TestFramework::TestCategory::OTHER); @@ -525,12 +520,8 @@ namespace ConfigTests { ExpectValidationFailure("CB Validation: max= base_open_duration_ms"); - ExpectValidationFailure("CB Validation: retry_budget_percent>100", - R"({"retry_budget_percent": 200})", - "retry_budget_percent must be in [0, 100]"); - ExpectValidationFailure("CB Validation: retry_budget_min_concurrency<0", - R"({"retry_budget_min_concurrency": -1})", - "retry_budget_min_concurrency must be >= 0"); + // retry_budget_percent / retry_budget_min_concurrency validation + // cases removed — fields moved to Phase 3. 
ExpectValidationFailure("CB Validation: max_ejection_percent>100", R"({"max_ejection_percent_per_host_set": 150})", "max_ejection_percent_per_host_set must be in [0, 100]"); From 628ca72a02e39bcd86836d6ea3f06e81850f90aa Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 09:29:34 +0800 Subject: [PATCH 11/37] Fix review comment --- server/circuit_breaker_slice.cc | 12 +++++- server/config_loader.cc | 44 +++++++++++++++++----- test/circuit_breaker_test.h | 65 +++++++++++++++++++++++++++++++++ test/config_test.h | 13 +++++++ 4 files changed, 123 insertions(+), 11 deletions(-) diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index 3e22f014..bb0568fb 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -116,7 +116,17 @@ void CircuitBreakerSlice::TransitionOpenToHalfOpen() { // (first success satisfies the reduced count → TransitionHalfOpenToClosed // bumps halfopen_gen_ → remaining admitted probes become stale → their // failures are silently dropped and the breaker falsely closes). - half_open_permitted_snapshot_ = config_.permitted_half_open_calls; + // + // Clamp to a minimum of 1. ConfigLoader::Validate() enforces >= 1 on the + // production path, but programmatic callers (tests, future direct users) + // that bypass validation could set permitted_half_open_calls <= 0. With + // snapshot=0, TryAcquire's Case B check (`inflight >= snapshot`) is + // immediately true for every probe → no probe ever admitted → no probe + // ever completes → half_open_inflight_ stays at 0 forever → slice is + // permanently stuck in HALF_OPEN rejecting all traffic. Matches the + // symmetric clamp in CircuitBreakerWindow's ctor. + int permitted = config_.permitted_half_open_calls; + half_open_permitted_snapshot_ = permitted > 0 ? permitted : 1; // Reset the info-log "first reject" breadcrumb so the first rejection // observed in the HALF_OPEN phase surfaces at info, not debug. 
HALF_OPEN // rejection (recovery attempt failing or probe budget full) is diff --git a/server/config_loader.cc b/server/config_loader.cc index 552ccf5c..e3f7f6fe 100644 --- a/server/config_loader.cc +++ b/server/config_loader.cc @@ -266,26 +266,50 @@ ServerConfig ConfigLoader::LoadFromString(const std::string& json_str) { if (!item["circuit_breaker"].is_object()) throw std::runtime_error("upstream circuit_breaker must be an object"); auto& cb = item["circuit_breaker"]; + // Strict integer accessor: rejects float/bool/string inputs + // that nlohmann's default value() would silently coerce + // (e.g., 1.9 → 1, true → 1). Without this, malformed configs + // pass Validate() and change breaker behavior in production. + auto cb_int = [&cb](const char* name, int default_val) -> int { + if (!cb.contains(name)) return default_val; + const auto& v = cb[name]; + if (!v.is_number_integer()) { + throw std::invalid_argument( + std::string("circuit_breaker.") + name + + " must be an integer"); + } + return v.get(); + }; + auto cb_bool = [&cb](const char* name, bool default_val) -> bool { + if (!cb.contains(name)) return default_val; + const auto& v = cb[name]; + if (!v.is_boolean()) { + throw std::invalid_argument( + std::string("circuit_breaker.") + name + + " must be a boolean"); + } + return v.get(); + }; upstream.circuit_breaker.enabled = - cb.value("enabled", false); + cb_bool("enabled", false); upstream.circuit_breaker.dry_run = - cb.value("dry_run", false); + cb_bool("dry_run", false); upstream.circuit_breaker.consecutive_failure_threshold = - cb.value("consecutive_failure_threshold", 5); + cb_int("consecutive_failure_threshold", 5); upstream.circuit_breaker.failure_rate_threshold = - cb.value("failure_rate_threshold", 50); + cb_int("failure_rate_threshold", 50); upstream.circuit_breaker.minimum_volume = - cb.value("minimum_volume", 20); + cb_int("minimum_volume", 20); upstream.circuit_breaker.window_seconds = - cb.value("window_seconds", 10); + cb_int("window_seconds", 
10); upstream.circuit_breaker.permitted_half_open_calls = - cb.value("permitted_half_open_calls", 5); + cb_int("permitted_half_open_calls", 5); upstream.circuit_breaker.base_open_duration_ms = - cb.value("base_open_duration_ms", 5000); + cb_int("base_open_duration_ms", 5000); upstream.circuit_breaker.max_open_duration_ms = - cb.value("max_open_duration_ms", 60000); + cb_int("max_open_duration_ms", 60000); upstream.circuit_breaker.max_ejection_percent_per_host_set = - cb.value("max_ejection_percent_per_host_set", 50); + cb_int("max_ejection_percent_per_host_set", 50); // retry_budget_* fields removed from Phase 2 — re-added in // Phase 3 when the RetryBudget class lands. Unknown keys in // input JSON are silently ignored by nlohmann::json. diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h index bd2809f4..bf95f2be 100644 --- a/test/circuit_breaker_test.h +++ b/test/circuit_breaker_test.h @@ -1639,6 +1639,70 @@ void TestWindowNonPositiveWindowSizeClamp() { } } +// BUG (review round 9, P3): CircuitBreakerSlice copied permitted_half_open_calls +// into the HALF_OPEN snapshot verbatim. For programmatic callers bypassing +// ConfigLoader::Validate() (same class as the window ctor clamp), a zero or +// negative budget would permanently wedge the breaker in HALF_OPEN: +// TryAcquire (HALF_OPEN, case B): half_open_inflight_(0) >= snapshot(0) +// → every probe rejected as half_open_full → no probe ever admitted +// → no report ever fires → half_open_inflight_ stays at 0 forever. +// +// Fix: clamp the snapshot to min 1 at TransitionOpenToHalfOpen. Symmetric +// with CircuitBreakerWindow's constructor clamp from round 8. +void TestHalfOpenClampsNonPositiveProbeBudget() { + std::cout << "\n[TEST] CB: HALF_OPEN clamps non-positive probe budget..." 
+ << std::endl; + try { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 2; + cb.failure_rate_threshold = 100; + cb.minimum_volume = 1000; + cb.window_seconds = 10; + cb.permitted_half_open_calls = 0; // bypasses Validate() — direct ctor + cb.base_open_duration_ms = 100; + cb.max_open_duration_ms = 60000; + + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Trip to OPEN. + for (int i = 0; i < 2; ++i) { + auto a = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); + } + + // Advance past open_until → OPEN→HALF_OPEN on next TryAcquire. + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + + // First TryAcquire triggers the transition. With the clamp, snapshot=1 + // and this probe is admitted. Without the clamp, snapshot=0 → rejected + // as half_open_full → breaker stuck forever. + auto a0 = slice.TryAcquire(); + bool probe_admitted = a0.decision == Decision::ADMITTED_PROBE; + + // A successful probe closes the cycle (successes(1) >= snapshot(1)). + // Without the clamp this branch would never execute. + if (probe_admitted) { + slice.ReportSuccess(true, a0.generation); + } + bool recovered = slice.CurrentState() == State::CLOSED; + + bool pass = probe_admitted && recovered; + TestFramework::RecordTest( + "CB: HALF_OPEN clamps non-positive probe budget", + pass, pass ? "" : + "probe_admitted=" + std::to_string(probe_admitted) + + " recovered=" + std::to_string(recovered), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: HALF_OPEN clamps non-positive probe budget", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + void TestTransitionCallbackInvoked() { std::cout << "\n[TEST] CB: transition callback invoked..." 
<< std::endl; try { @@ -1721,6 +1785,7 @@ void RunAllTests() { TestHalfOpenBudgetFrozenAcrossReload(); TestWindowNonPositiveWindowSizeClamp(); TestReportFailureUsesOneTimestampAcrossTripEval(); + TestHalfOpenClampsNonPositiveProbeBudget(); TestTransitionCallbackInvoked(); } diff --git a/test/config_test.h b/test/config_test.h index f0bd4599..6317151f 100644 --- a/test/config_test.h +++ b/test/config_test.h @@ -538,6 +538,19 @@ namespace ConfigTests { ExpectValidationFailure("CB Validation: permitted_half_open_calls>1000", R"({"permitted_half_open_calls": 1001})", "permitted_half_open_calls must be in [1, 1000]"); + // Type-strictness guards: nlohmann's value() silently coerces + // float/bool to int (1.9 → 1, true → 1). Rejecting at parse time is + // safer than letting malformed configs pass Validate() and change + // production breaker behavior. + ExpectValidationFailure("CB Validation: float rejected for int field", + R"({"window_seconds": 1.9})", + "circuit_breaker.window_seconds must be an integer"); + ExpectValidationFailure("CB Validation: bool rejected for int field", + R"({"consecutive_failure_threshold": true})", + "circuit_breaker.consecutive_failure_threshold must be an integer"); + ExpectValidationFailure("CB Validation: int rejected for bool field", + R"({"enabled": 1})", + "circuit_breaker.enabled must be a boolean"); } // Test 14: UpstreamConfig::operator== INCLUDES circuit_breaker until Phase 8. 
From 0a4290a432c867e5edae74f8e6025dc3fd3606dd Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 09:53:31 +0800 Subject: [PATCH 12/37] Fix review comment --- .../circuit_breaker/circuit_breaker_slice.h | 12 +++ server/circuit_breaker_slice.cc | 29 +++++-- test/circuit_breaker_test.h | 86 +++++++++++++++++++ 3 files changed, 121 insertions(+), 6 deletions(-) diff --git a/include/circuit_breaker/circuit_breaker_slice.h b/include/circuit_breaker/circuit_breaker_slice.h index 1ff8fe1d..edaea211 100644 --- a/include/circuit_breaker/circuit_breaker_slice.h +++ b/include/circuit_breaker/circuit_breaker_slice.h @@ -138,6 +138,18 @@ class CircuitBreakerSlice { int half_open_inflight_ = 0; int half_open_successes_ = 0; bool half_open_saw_failure_ = false; + // Total probes admitted in the CURRENT HALF_OPEN cycle. Never decrements + // within a cycle; resets on every cycle entry (TransitionOpenToHalfOpen) + // and cycle exit (TransitionHalfOpenToClosed / TripHalfOpenToOpen). This + // is what caps the cycle's probe budget — NOT half_open_inflight_, which + // can free slots as probes complete. Gating on inflight would let an + // early-completing probe's slot be reused, causing the cycle to admit + // more than permitted_half_open_calls total probes. The close check + // (successes >= snapshot) could then fire while a late-admitted probe + // is still running; its eventual failure would drop as stale (generation + // bumped by the transition) and the breaker would falsely mark an + // unhealthy host recovered. + int half_open_admitted_ = 0; // Probe budget for the CURRENT HALF_OPEN cycle. Snapshotted from // config_.permitted_half_open_calls at the moment TransitionOpenToHalfOpen // fires. 
A live Reload() may lower (or raise) the config field mid-cycle; diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index bb0568fb..f3821ab1 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -79,6 +79,7 @@ void CircuitBreakerSlice::TripClosedToOpen(const char* trigger) { half_open_inflight_ = 0; half_open_successes_ = 0; half_open_saw_failure_ = false; + half_open_admitted_ = 0; first_reject_logged_for_open_ = false; // Bump closed_gen_: non-probe admissions from the closing CLOSED cycle // are now stale. Late Report(false, ...) calls for those requests drop. @@ -108,6 +109,7 @@ void CircuitBreakerSlice::TransitionOpenToHalfOpen() { half_open_inflight_ = 0; half_open_successes_ = 0; half_open_saw_failure_ = false; + half_open_admitted_ = 0; // Snapshot the probe budget for this cycle. A live Reload() during this // HALF_OPEN episode may lower or raise config_.permitted_half_open_calls, // but TryAcquire's slot gate (Case B) and ReportSuccess's close check must @@ -163,6 +165,7 @@ void CircuitBreakerSlice::TransitionHalfOpenToClosed() { half_open_inflight_ = 0; half_open_successes_ = 0; half_open_saw_failure_ = false; + half_open_admitted_ = 0; first_reject_logged_for_open_ = false; // Bump halfopen_gen_: the just-completed HALF_OPEN cycle's probe // admissions are now stale. closed_gen_ is NOT bumped — pre-trip @@ -194,6 +197,7 @@ void CircuitBreakerSlice::TripHalfOpenToOpen(const char* trigger) { half_open_inflight_ = 0; half_open_successes_ = 0; half_open_saw_failure_ = false; + half_open_admitted_ = 0; first_reject_logged_for_open_ = false; // Bump halfopen_gen_: probe admissions from the closing HALF_OPEN // cycle are now stale. closed_gen_ is NOT bumped — no CLOSED @@ -255,16 +259,29 @@ CircuitBreakerSlice::Admission CircuitBreakerSlice::TryAcquire() { /*half_open_full=*/false), /*generation=*/0}; } - // Case B: probe budget fully in flight. 
"No capacity" — bump the - // dedicated counter so dashboards can tell these two apart. - // Use the cycle snapshot, not config_, so a live Reload() that - // lowers permitted_half_open_calls mid-cycle doesn't change how many - // probes were promised to this cycle. - if (half_open_inflight_ >= half_open_permitted_snapshot_) { + // Case B: probe budget exhausted for this cycle. "No capacity" — bump + // the dedicated counter so dashboards can tell this apart from + // saw_failure rejects. + // + // Gate on `half_open_admitted_` (total cycle admissions, never + // decrements), NOT on `half_open_inflight_`. Inflight drops when a + // probe completes, so gating on it would reuse the freed slot and let + // the cycle admit more than `snapshot` total probes. Consequences of + // that bug: the close check `successes >= snapshot` could fire before + // ALL admitted probes have reported (the reused-slot probe is still + // in flight); TransitionHalfOpenToClosed would bump halfopen_gen_; + // the late probe's failure would drop as stale — falsely marking an + // unhealthy host recovered. + // + // Use the cycle snapshot so a live Reload() that lowers + // permitted_half_open_calls mid-cycle doesn't change how many probes + // were promised to this cycle. + if (half_open_admitted_ >= half_open_permitted_snapshot_) { return Admission{RejectWithLog("half_open_full", /*half_open_full=*/true), /*generation=*/0}; } + half_open_admitted_++; half_open_inflight_++; // Probe admission — stamp with halfopen_gen_. return Admission{Decision::ADMITTED_PROBE, halfopen_gen_}; diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h index bf95f2be..af6f976d 100644 --- a/test/circuit_breaker_test.h +++ b/test/circuit_breaker_test.h @@ -1703,6 +1703,91 @@ void TestHalfOpenClampsNonPositiveProbeBudget() { } } +// BUG (review round 10, P1): TryAcquire gated HALF_OPEN admission on +// half_open_inflight_, so a probe slot was reused once an earlier probe +// completed. 
With permitted_half_open_calls=2: +// +// admit A → inflight=1, admitted=1 +// admit B → inflight=2, admitted=2 +// Report success on A → inflight=1, successes=1 +// admit C → inflight(1) < snapshot(2) → ACCEPTED (BUG: 3rd admission) +// Report success on B → inflight=0, successes=2 +// successes(2) >= snapshot(2) → TransitionHalfOpenToClosed fires +// → halfopen_gen_ bumped → C's eventual failure DROPPED as stale +// → breaker falsely marked recovered despite the probe failing. +// +// Fix: gate on half_open_admitted_ (total cycle admissions, never +// decrements) instead of half_open_inflight_. The cycle can admit at most +// `snapshot` probes total, regardless of how quickly earlier probes drain. +void TestHalfOpenDoesNotReuseProbeSlots() { + std::cout << "\n[TEST] CB: HALF_OPEN does not reuse probe slots..." + << std::endl; + try { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 2; + cb.failure_rate_threshold = 100; + cb.minimum_volume = 1000; + cb.window_seconds = 10; + cb.permitted_half_open_calls = 2; + cb.base_open_duration_ms = 100; + cb.max_open_duration_ms = 60000; + + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Trip to OPEN. + for (int i = 0; i < 2; ++i) { + auto a = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + + // Admit 2 probes (budget=2). + auto a = slice.TryAcquire(); + auto b = slice.TryAcquire(); + bool both_admitted = a.decision == Decision::ADMITTED_PROBE && + b.decision == Decision::ADMITTED_PROBE; + + // Report success on A — freeing its inflight slot. + slice.ReportSuccess(true, a.generation); + bool still_halfopen = slice.CurrentState() == State::HALF_OPEN; + + // Third admission attempt. With the fix: admitted(2) >= snapshot(2) + // → REJECTED. 
Without the fix: inflight(1) < snapshot(2) → ADMITTED, + // creating a ghost probe. + auto c = slice.TryAcquire(); + bool third_rejected = c.decision == Decision::REJECTED_OPEN; + + // Close the cycle by succeeding B. + slice.ReportSuccess(true, b.generation); + bool closed = slice.CurrentState() == State::CLOSED; + + // Verify no stale-generation reports accumulated — if the 3rd admission + // had slipped through, its (dropped) report after the close would have + // bumped this counter. Since the admission is now rejected up front, + // this should stay zero. + bool no_stale_reports = slice.ReportsStaleGeneration() == 0; + + bool pass = both_admitted && still_halfopen && third_rejected && + closed && no_stale_reports; + TestFramework::RecordTest( + "CB: HALF_OPEN does not reuse probe slots", + pass, pass ? "" : + "both_admitted=" + std::to_string(both_admitted) + + " still_halfopen=" + std::to_string(still_halfopen) + + " third_rejected=" + std::to_string(third_rejected) + + " closed=" + std::to_string(closed) + + " no_stale_reports=" + std::to_string(no_stale_reports), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: HALF_OPEN does not reuse probe slots", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + void TestTransitionCallbackInvoked() { std::cout << "\n[TEST] CB: transition callback invoked..." 
<< std::endl; try { @@ -1786,6 +1871,7 @@ void RunAllTests() { TestWindowNonPositiveWindowSizeClamp(); TestReportFailureUsesOneTimestampAcrossTripEval(); TestHalfOpenClampsNonPositiveProbeBudget(); + TestHalfOpenDoesNotReuseProbeSlots(); TestTransitionCallbackInvoked(); } From 2516637bef3dcf8072613a5c876ec454235818bb Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 11:07:56 +0800 Subject: [PATCH 13/37] Fix review comment --- .../circuit_breaker/circuit_breaker_slice.h | 17 +++ server/circuit_breaker_slice.cc | 40 ++++++ server/main.cc | 14 ++ test/circuit_breaker_test.h | 135 ++++++++++++++++++ 4 files changed, 206 insertions(+) diff --git a/include/circuit_breaker/circuit_breaker_slice.h b/include/circuit_breaker/circuit_breaker_slice.h index edaea211..95a5beee 100644 --- a/include/circuit_breaker/circuit_breaker_slice.h +++ b/include/circuit_breaker/circuit_breaker_slice.h @@ -59,6 +59,23 @@ class CircuitBreakerSlice { void ReportSuccess(bool probe, uint64_t admission_generation); void ReportFailure(FailureKind kind, bool probe, uint64_t admission_generation); + // Neutral completion — the admission never exercised the upstream. + // Use when the request was terminated locally before reaching the + // upstream (POOL_EXHAUSTED after admission, shutdown draining, client + // disconnect, RESULT_PARSE_ERROR self-attributable). Must NOT be used + // for upstream outcomes — those go to ReportSuccess / ReportFailure. + // + // For probe=true (HALF_OPEN admission): returns the probe slot to the + // cycle — decrements `half_open_inflight_` AND `half_open_admitted_` + // so a replacement probe can still exercise the upstream within this + // cycle's budget. Without this path, a probe that dies locally leaks + // its slot forever, eventually wedging the slice in HALF_OPEN. + // + // For probe=false (CLOSED admission): no-op — CLOSED admissions have + // no slot to release. 
The bool matches ReportSuccess/ReportFailure so + // callers can use the same dispatch pattern. + void ReportNeutral(bool probe, uint64_t admission_generation); + // Apply a new config (called on this slice's dispatcher thread). // Preserves live state (CLOSED/OPEN/HALF_OPEN). Resets window if // window_seconds changed. diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index f3821ab1..ae037a90 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -449,6 +449,46 @@ void CircuitBreakerSlice::ReportFailure(FailureKind kind, bool probe, } } +void CircuitBreakerSlice::ReportNeutral(bool probe, + uint64_t admission_generation) { + if (!config_.enabled) return; + if (!probe) { + // CLOSED-state admission: no slot to release. The bool parameter + // exists for API symmetry with ReportSuccess/ReportFailure; a + // neutral outcome in CLOSED simply means the breaker records + // nothing (which matches pre-neutral behavior — POOL_EXHAUSTED, + // shutdown, and similar local terminations were already "ignored" + // on the CLOSED path). + return; + } + + // Probe: gate on halfopen_gen_ + current state, matching the other + // Report* paths. Stale (pre-transition or pre-reload) neutral + // completions drop silently into the stale-generation counter. + if (admission_generation != halfopen_gen_) { + reports_stale_generation_.fetch_add(1, std::memory_order_relaxed); + return; + } + if (state_.load(std::memory_order_acquire) != State::HALF_OPEN) return; + + // Return the slot to the cycle. Decrement BOTH inflight and admitted: + // - inflight so the last-probe re-trip logic below fires correctly, + // - admitted so a replacement probe can still be admitted within + // this cycle's budget (the whole point of a neutral release — + // the upstream wasn't actually exercised by this admission). 
+ if (half_open_inflight_ > 0) half_open_inflight_--; + if (half_open_admitted_ > 0) half_open_admitted_--; + + // If an earlier sibling probe failed and this neutral release drains + // the last in-flight probe, the cycle must re-trip — otherwise the + // slice would wedge in HALF_OPEN with saw_failure=true, rejecting all + // future admissions via Case A forever. Mirrors the failure-path + // last-probe trigger. + if (half_open_saw_failure_ && half_open_inflight_ == 0) { + TripHalfOpenToOpen("probe_fail"); + } +} + void CircuitBreakerSlice::Reload(const CircuitBreakerConfig& new_config) { const bool enabled_changed = (config_.enabled != new_config.enabled); const bool window_changed = diff --git a/server/main.cc b/server/main.cc index 06dd2551..86f7598d 100644 --- a/server/main.cc +++ b/server/main.cc @@ -427,6 +427,19 @@ static bool ReloadConfig(const std::string& config_path, auto saved_tls = current_config.tls; auto saved_workers = current_config.worker_threads; auto saved_h2_enabled = current_config.http2.enabled; + // Preserve upstreams for the same reason: HttpServer::Reload treats + // the whole upstream block as restart-required (see http_server.cc + // upstream_configs_ comparison), and that internal copy never changes + // post-startup. If we overwrote current_config.upstreams here, a + // breaker-only edit would stage into current_config while the live + // server keeps running the startup values — /stats and other + // current_config consumers would report phantom state, and subsequent + // identical reloads could produce inconsistent diagnostics. Pin to + // the running values until Phase 8 implements + // CircuitBreakerManager::Reload (the only upstream sub-field that + // becomes hot-reloadable); at that point this save becomes a + // partial-field save excluding circuit_breaker. 
+ auto saved_upstreams = current_config.upstreams; current_config = new_config; @@ -435,6 +448,7 @@ static bool ReloadConfig(const std::string& config_path, current_config.tls = saved_tls; current_config.worker_threads = saved_workers; current_config.http2.enabled = saved_h2_enabled; + current_config.upstreams = std::move(saved_upstreams); // Commit file-backed state only after full success — a failed reload // must not flip this flag or future reloads lose the defaults+env fallback. diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h index af6f976d..daa5aaa3 100644 --- a/test/circuit_breaker_test.h +++ b/test/circuit_breaker_test.h @@ -1788,6 +1788,139 @@ void TestHalfOpenDoesNotReuseProbeSlots() { } } +// BUG (review round 11, P1): Admission contract has ReportSuccess and +// ReportFailure but no path for probes that complete without touching the +// upstream (POOL_EXHAUSTED after probe admission, shutdown, client +// disconnect, PARSE_ERROR). Following the §7 "don't report these as +// failures" contract strictly, such probes would leak their inflight slot +// forever — once half_open_admitted_ reaches snapshot, all further +// admissions reject as half_open_full and nothing ever drains the cycle, +// wedging the slice in HALF_OPEN. +// +// Fix: ReportNeutral decrements BOTH inflight (so the last-probe re-trip +// still fires) and admitted (so a replacement probe can still exercise +// the upstream within the cycle budget). No touch to successes / fails. +void TestReportNeutralReleasesProbeSlot() { + std::cout << "\n[TEST] CB: ReportNeutral releases probe slot..." 
+ << std::endl; + try { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 2; + cb.failure_rate_threshold = 100; + cb.minimum_volume = 1000; + cb.window_seconds = 10; + cb.permitted_half_open_calls = 2; + cb.base_open_duration_ms = 100; + cb.max_open_duration_ms = 60000; + + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Trip to OPEN, advance past backoff, fully consume probe budget. + for (int i = 0; i < 2; ++i) { + auto a = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + + auto a = slice.TryAcquire(); + auto b = slice.TryAcquire(); + bool both_probes = a.decision == Decision::ADMITTED_PROBE && + b.decision == Decision::ADMITTED_PROBE; + + // Budget full: 3rd admission rejected. + auto pre_release = slice.TryAcquire(); + bool budget_full_before = pre_release.decision == Decision::REJECTED_OPEN; + + // Neutral-release A: slot returns, replacement probe fits within budget. + slice.ReportNeutral(true, a.generation); + + auto c = slice.TryAcquire(); + bool replacement_admitted = c.decision == Decision::ADMITTED_PROBE; + + // Cycle completes cleanly via B + C successes → CLOSED. + slice.ReportSuccess(true, b.generation); + slice.ReportSuccess(true, c.generation); + bool closed = slice.CurrentState() == State::CLOSED; + + // Neutral release must NOT have bumped probe_failures / probe_successes. + bool counters_clean = slice.ProbeSuccesses() == 2 && + slice.ProbeFailures() == 0; + + bool pass = both_probes && budget_full_before && + replacement_admitted && closed && counters_clean; + TestFramework::RecordTest( + "CB: ReportNeutral releases probe slot", + pass, pass ? 
"" : + "both_probes=" + std::to_string(both_probes) + + " budget_full_before=" + std::to_string(budget_full_before) + + " replacement_admitted=" + std::to_string(replacement_admitted) + + " closed=" + std::to_string(closed) + + " counters_clean=" + std::to_string(counters_clean), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: ReportNeutral releases probe slot", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Companion: a neutral release that drains the last in-flight probe AFTER +// a sibling failure must still trigger the HALF_OPEN→OPEN re-trip. Without +// this last-probe hook in ReportNeutral, the slice would wedge in HALF_OPEN +// with saw_failure=true rejecting every admission via Case A. +void TestReportNeutralLastProbeAfterFailureReTrips() { + std::cout << "\n[TEST] CB: ReportNeutral re-trips as last probe after sibling fail..." + << std::endl; + try { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 2; + cb.failure_rate_threshold = 100; + cb.minimum_volume = 1000; + cb.window_seconds = 10; + cb.permitted_half_open_calls = 2; + cb.base_open_duration_ms = 100; + cb.max_open_duration_ms = 60000; + + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + for (int i = 0; i < 2; ++i) { + auto a = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); + } + clock->Advance(std::chrono::milliseconds(cb.base_open_duration_ms + 1)); + + auto a = slice.TryAcquire(); + auto b = slice.TryAcquire(); + + // A fails → saw_failure=true, inflight=1 (B still running), no re-trip yet. + slice.ReportFailure(FailureKind::RESPONSE_5XX, true, a.generation); + bool still_halfopen = slice.CurrentState() == State::HALF_OPEN; + + // B neutral-releases → last in-flight drains. With the fix, the + // sibling-failure + last-probe hook fires TripHalfOpenToOpen. 
+ slice.ReportNeutral(true, b.generation); + bool retripped = slice.CurrentState() == State::OPEN; + + bool pass = still_halfopen && retripped; + TestFramework::RecordTest( + "CB: ReportNeutral re-trips as last probe after sibling fail", + pass, pass ? "" : + "still_halfopen=" + std::to_string(still_halfopen) + + " retripped=" + std::to_string(retripped), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: ReportNeutral re-trips as last probe after sibling fail", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + void TestTransitionCallbackInvoked() { std::cout << "\n[TEST] CB: transition callback invoked..." << std::endl; try { @@ -1872,6 +2005,8 @@ void RunAllTests() { TestReportFailureUsesOneTimestampAcrossTripEval(); TestHalfOpenClampsNonPositiveProbeBudget(); TestHalfOpenDoesNotReuseProbeSlots(); + TestReportNeutralReleasesProbeSlot(); + TestReportNeutralLastProbeAfterFailureReTrips(); TestTransitionCallbackInvoked(); } From 60e1f903d03eded23e8efa2a92140a65569e1df7 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 11:26:46 +0800 Subject: [PATCH 14/37] Fix review comment --- server/circuit_breaker_slice.cc | 17 ++++++++-- test/circuit_breaker_test.h | 57 +++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 2 deletions(-) diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index ae037a90..be9da56a 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -40,8 +40,21 @@ std::chrono::nanoseconds CircuitBreakerSlice::ComputeOpenDuration() const { // Callers must increment consecutive_trips_ AFTER calling this method. 
int trips = consecutive_trips_.load(std::memory_order_relaxed); if (trips > MAX_OPEN_DURATION_SHIFT) trips = MAX_OPEN_DURATION_SHIFT; - int64_t base_ms = config_.base_open_duration_ms; - int64_t max_ms = config_.max_open_duration_ms; + // Clamp base/max for programmatic callers that bypass ConfigLoader::Validate + // (same hardening as CircuitBreakerWindow's ctor and the HALF_OPEN probe + // budget snapshot). Without these clamps: + // - base_open_duration_ms <= 0: `base_ms << trips` is <= 0 → open_until + // <= now → next TryAcquire immediately drains OPEN→HALF_OPEN, + // disabling the backoff entirely. + // - max_open_duration_ms < base_open_duration_ms: the overflow/clamp + // branch (`scaled_ms > max_ms`) fires on every trip, pinning the + // duration to a value smaller than base — same "no meaningful + // backoff" effect. + // Clamp floors: base >= 1ms, max >= base. + int64_t base_ms = config_.base_open_duration_ms > 0 + ? config_.base_open_duration_ms : 1; + int64_t max_ms = config_.max_open_duration_ms >= base_ms + ? config_.max_open_duration_ms : base_ms; int64_t scaled_ms = base_ms << trips; if (scaled_ms < base_ms /* overflow */ || scaled_ms > max_ms) { scaled_ms = max_ms; diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h index daa5aaa3..65b03777 100644 --- a/test/circuit_breaker_test.h +++ b/test/circuit_breaker_test.h @@ -1921,6 +1921,62 @@ void TestReportNeutralLastProbeAfterFailureReTrips() { } } +// BUG (review round 12, P2): ComputeOpenDuration read base/max durations +// straight from config_, so a programmatic caller bypassing +// ConfigLoader::Validate() with base_open_duration_ms <= 0 or max < base +// would compute scaled_ms <= 0. open_until = now + 0 → next TryAcquire +// sees now_ns >= open_until_ns → transition to HALF_OPEN immediately. +// The breaker never actually backed off. Fix: clamp base to >= 1ms and +// max to >= base at the compute site, matching the window and probe +// budget clamps. 
+void TestComputeOpenDurationClampsInvalidBase() { + std::cout << "\n[TEST] CB: ComputeOpenDuration clamps invalid base/max..." + << std::endl; + try { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 2; + cb.failure_rate_threshold = 100; + cb.minimum_volume = 1000; + cb.window_seconds = 10; + cb.permitted_half_open_calls = 1; + cb.base_open_duration_ms = 0; // bypass — would kill backoff + cb.max_open_duration_ms = 0; // bypass — would kill backoff + + auto clock = std::make_shared(); + CircuitBreakerSlice slice("svc:h:p p=0", 0, cb, + [clock]() { return clock->now; }); + + // Trip to OPEN. + for (int i = 0; i < 2; ++i) { + auto a = slice.TryAcquire(); + slice.ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); + } + bool is_open = slice.CurrentState() == State::OPEN; + + // Immediate TryAcquire: clock hasn't moved, so if the clamp holds + // (open_until >= now + 1ms), this MUST reject as "open" (not drain + // to HALF_OPEN). Without the fix, scaled_ms=0 → open_until==now → + // admission path immediately transitions to HALF_OPEN. + auto immediate = slice.TryAcquire(); + bool rejected_as_open = immediate.decision == Decision::REJECTED_OPEN; + bool still_open = slice.CurrentState() == State::OPEN; + + bool pass = is_open && rejected_as_open && still_open; + TestFramework::RecordTest( + "CB: ComputeOpenDuration clamps invalid base/max", + pass, pass ? "" : + "is_open=" + std::to_string(is_open) + + " rejected_as_open=" + std::to_string(rejected_as_open) + + " still_open=" + std::to_string(still_open), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB: ComputeOpenDuration clamps invalid base/max", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + void TestTransitionCallbackInvoked() { std::cout << "\n[TEST] CB: transition callback invoked..." 
<< std::endl; try { @@ -2007,6 +2063,7 @@ void RunAllTests() { TestHalfOpenDoesNotReuseProbeSlots(); TestReportNeutralReleasesProbeSlot(); TestReportNeutralLastProbeAfterFailureReTrips(); + TestComputeOpenDurationClampsInvalidBase(); TestTransitionCallbackInvoked(); } From 360b55e058da44553d7bfa6aaa96f623f691054a Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 12:03:30 +0800 Subject: [PATCH 15/37] Finished Phase4: Host + manager + retry budget --- Makefile | 6 +- .../circuit_breaker/circuit_breaker_host.h | 118 +++++ .../circuit_breaker/circuit_breaker_manager.h | 80 +++ include/circuit_breaker/retry_budget.h | 126 +++++ include/config/server_config.h | 16 +- server/circuit_breaker_host.cc | 140 +++++ server/circuit_breaker_manager.cc | 105 ++++ server/config_loader.cc | 23 +- server/retry_budget.cc | 72 +++ test/circuit_breaker_phase3_test.h | 496 ++++++++++++++++++ test/config_test.h | 21 +- test/run_test.cc | 4 + 12 files changed, 1186 insertions(+), 21 deletions(-) create mode 100644 include/circuit_breaker/circuit_breaker_host.h create mode 100644 include/circuit_breaker/circuit_breaker_manager.h create mode 100644 include/circuit_breaker/retry_budget.h create mode 100644 server/circuit_breaker_host.cc create mode 100644 server/circuit_breaker_manager.cc create mode 100644 server/retry_budget.cc create mode 100644 test/circuit_breaker_phase3_test.h diff --git a/Makefile b/Makefile index 23a46ce0..935949c8 100644 --- a/Makefile +++ b/Makefile @@ -77,7 +77,7 @@ UPSTREAM_SRCS = $(SERVER_DIR)/upstream_connection.cc $(SERVER_DIR)/pool_partitio RATE_LIMIT_SRCS = $(SERVER_DIR)/token_bucket.cc $(SERVER_DIR)/rate_limit_zone.cc $(SERVER_DIR)/rate_limiter.cc # Circuit breaker layer sources -CIRCUIT_BREAKER_SRCS = $(SERVER_DIR)/circuit_breaker_window.cc $(SERVER_DIR)/circuit_breaker_slice.cc +CIRCUIT_BREAKER_SRCS = $(SERVER_DIR)/circuit_breaker_window.cc $(SERVER_DIR)/circuit_breaker_slice.cc $(SERVER_DIR)/retry_budget.cc $(SERVER_DIR)/circuit_breaker_host.cc 
$(SERVER_DIR)/circuit_breaker_manager.cc # CLI layer sources CLI_SRCS = $(SERVER_DIR)/cli_parser.cc $(SERVER_DIR)/signal_handler.cc $(SERVER_DIR)/pid_file.cc $(SERVER_DIR)/daemonizer.cc @@ -145,9 +145,9 @@ WS_HEADERS = $(LIB_DIR)/ws/websocket_connection.h $(LIB_DIR)/ws/websocket_frame. TLS_HEADERS = $(LIB_DIR)/tls/tls_context.h $(LIB_DIR)/tls/tls_connection.h $(LIB_DIR)/tls/tls_client_context.h UPSTREAM_HEADERS = $(LIB_DIR)/upstream/upstream_manager.h $(LIB_DIR)/upstream/upstream_host_pool.h $(LIB_DIR)/upstream/pool_partition.h $(LIB_DIR)/upstream/upstream_connection.h $(LIB_DIR)/upstream/upstream_lease.h $(LIB_DIR)/upstream/upstream_http_codec.h $(LIB_DIR)/upstream/http_request_serializer.h $(LIB_DIR)/upstream/header_rewriter.h $(LIB_DIR)/upstream/retry_policy.h $(LIB_DIR)/upstream/proxy_transaction.h $(LIB_DIR)/upstream/proxy_handler.h $(LIB_DIR)/upstream/upstream_response.h $(LIB_DIR)/upstream/upstream_callbacks.h RATE_LIMIT_HEADERS = $(LIB_DIR)/rate_limit/token_bucket.h $(LIB_DIR)/rate_limit/rate_limit_zone.h $(LIB_DIR)/rate_limit/rate_limiter.h -CIRCUIT_BREAKER_HEADERS = $(LIB_DIR)/circuit_breaker/circuit_breaker_state.h $(LIB_DIR)/circuit_breaker/circuit_breaker_window.h $(LIB_DIR)/circuit_breaker/circuit_breaker_slice.h +CIRCUIT_BREAKER_HEADERS = $(LIB_DIR)/circuit_breaker/circuit_breaker_state.h $(LIB_DIR)/circuit_breaker/circuit_breaker_window.h $(LIB_DIR)/circuit_breaker/circuit_breaker_slice.h $(LIB_DIR)/circuit_breaker/retry_budget.h $(LIB_DIR)/circuit_breaker/circuit_breaker_host.h $(LIB_DIR)/circuit_breaker/circuit_breaker_manager.h CLI_HEADERS = $(LIB_DIR)/cli/cli_parser.h $(LIB_DIR)/cli/signal_handler.h $(LIB_DIR)/cli/pid_file.h $(LIB_DIR)/cli/version.h $(LIB_DIR)/cli/daemonizer.h -TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h 
$(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h $(TEST_DIR)/proxy_test.h $(TEST_DIR)/rate_limit_test.h $(TEST_DIR)/kqueue_test.h $(TEST_DIR)/circuit_breaker_test.h +TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h $(TEST_DIR)/proxy_test.h $(TEST_DIR)/rate_limit_test.h $(TEST_DIR)/kqueue_test.h $(TEST_DIR)/circuit_breaker_test.h $(TEST_DIR)/circuit_breaker_phase3_test.h # All headers combined HEADERS = $(CORE_HEADERS) $(CALLBACK_HEADERS) $(REACTOR_HEADERS) $(NETWORK_HEADERS) $(SERVER_HEADERS) $(THREAD_POOL_HEADERS) $(UTIL_HEADERS) $(FOUNDATION_HEADERS) $(HTTP_HEADERS) $(HTTP2_HEADERS) $(WS_HEADERS) $(TLS_HEADERS) $(UPSTREAM_HEADERS) $(RATE_LIMIT_HEADERS) $(CIRCUIT_BREAKER_HEADERS) $(CLI_HEADERS) $(TEST_HEADERS) diff --git a/include/circuit_breaker/circuit_breaker_host.h b/include/circuit_breaker/circuit_breaker_host.h new file mode 100644 index 00000000..6aff2965 --- /dev/null +++ b/include/circuit_breaker/circuit_breaker_host.h @@ -0,0 +1,118 @@ +#pragma once + +#include "common.h" +#include "config/server_config.h" +#include "circuit_breaker/circuit_breaker_slice.h" +#include "circuit_breaker/retry_budget.h" +// , , provided by common.h + +class Dispatcher; + +namespace circuit_breaker { + +// Observability snapshot of a single host, aggregated across all its +// partition slices. Safe to call from any thread (relaxed reads of +// atomic counters). Per-slice rows let dashboards detect skewed +// failure distribution across dispatchers. 
+struct CircuitBreakerHostSnapshot { + std::string service_name; + std::string host; + int port = 0; + + struct SliceRow { + size_t dispatcher_index = 0; + State state = State::CLOSED; + int64_t trips = 0; + int64_t rejected = 0; + int64_t probe_successes = 0; + int64_t probe_failures = 0; + }; + std::vector slices; + + // Aggregates across slices. + int64_t total_trips = 0; + int64_t total_rejected = 0; + int open_partitions = 0; + int half_open_partitions = 0; + + // Retry budget state (per-host, shared across partitions). + int64_t retries_in_flight = 0; + int64_t retries_rejected = 0; + int64_t in_flight = 0; +}; + +// Per-upstream-service aggregation layer. Owns: +// - N CircuitBreakerSlice instances (one per dispatcher partition, +// each pinned to its dispatcher for lock-free hot-path access). +// - One RetryBudget (shared across partitions — retry %-of-in-flight +// is a host-level metric, not per-dispatcher). +// +// Lifetime: constructed by CircuitBreakerManager at server start, lives +// for the server's lifetime. `service_name`, `host`, `port`, and the +// slice vector are never mutated post-construction (keys are stable for +// lock-free map lookup in the manager). +class CircuitBreakerHost { +public: + // `partition_count` must equal the number of dispatcher partitions + // in the server — typically NetServer's socket worker count or + // upstream pool's partition count. One slice is created per + // partition up-front. + CircuitBreakerHost(std::string service_name, + std::string host, + int port, + size_t partition_count, + const CircuitBreakerConfig& config); + + CircuitBreakerHost(const CircuitBreakerHost&) = delete; + CircuitBreakerHost& operator=(const CircuitBreakerHost&) = delete; + + // Hot-path lookup — returns nullptr only if `dispatcher_index` is + // out of range (programming error). Caller must invoke the + // returned slice's methods on its owning dispatcher thread. 
+ CircuitBreakerSlice* GetSlice(size_t dispatcher_index); + + // Owned retry budget. Never null for the host's lifetime; safe to + // cache the pointer. Shared across all partitions of this host. + RetryBudget* GetRetryBudget() { return retry_budget_.get(); } + const RetryBudget* GetRetryBudget() const { return retry_budget_.get(); } + + // Aggregate snapshot across all slices + retry budget. Reads are + // relaxed atomic — eventually consistent across threads, which is + // fine for dashboards. + CircuitBreakerHostSnapshot Snapshot() const; + + // Apply a new config to every slice. Because each slice is pinned + // to its dispatcher thread, the call is dispatched per-partition — + // the caller provides the dispatcher list in the same order used at + // construction. If `dispatchers.size() != slices_.size()`, the + // method logs an error and returns without applying. + // + // The retry-budget sub-fields (percent, min_concurrency) are + // updated immediately (atomic stores, any thread) as part of this + // call — they don't need dispatcher routing. + void Reload(const std::vector>& dispatchers, + const CircuitBreakerConfig& new_config); + + // Install a transition callback on every slice. Uniform callback + // across partitions — callers that need partition-specific behavior + // can read `slice->dispatcher_index()` inside the callback. + // Must be called before live traffic; thread-safety depends on + // slice-dispatcher affinity at the Reload layer (Phase 8 wires this). + void SetTransitionCallbackOnAllSlices(StateTransitionCallback cb); + + // Accessors. 
+  const std::string& service_name() const { return service_name_; }
+  const std::string& host() const { return host_; }
+  int port() const { return port_; }
+  size_t partition_count() const { return slices_.size(); }
+
+private:
+  std::string service_name_;
+  std::string host_;
+  int port_;
+  CircuitBreakerConfig config_;
+  std::vector<std::unique_ptr<CircuitBreakerSlice>> slices_;
+  std::unique_ptr<RetryBudget> retry_budget_;
+};
+
+} // namespace circuit_breaker
diff --git a/include/circuit_breaker/circuit_breaker_manager.h b/include/circuit_breaker/circuit_breaker_manager.h
new file mode 100644
index 00000000..66c2b33d
--- /dev/null
+++ b/include/circuit_breaker/circuit_breaker_manager.h
@@ -0,0 +1,80 @@
+#pragma once
+
+#include "common.h"
+#include "circuit_breaker/circuit_breaker_host.h"
+// <memory>, <mutex>, <string>, <unordered_map>, <vector> provided by common.h
+
+class Dispatcher;
+
+namespace circuit_breaker {
+
+// Top-level circuit-breaker orchestrator. Mirrors the shape of
+// RateLimitManager: one instance lives on HttpServer, built once at
+// MarkServerReady, survives for the server's lifetime.
+//
+// Ownership (per design §3.1):
+//   HttpServer
+//   ├── upstream_manager_ (declared FIRST, destructs last)
+//   └── circuit_breaker_manager_ (declared SECOND, destructs first)
+//
+//   CircuitBreakerManager
+//   └── hosts_: unordered_map<service_name, unique_ptr<CircuitBreakerHost>>
+//
+// `hosts_` is built once in the constructor — keys are never added or
+// removed at runtime (topology is restart-only per the existing
+// upstream policy). This makes GetHost lock-free after construction,
+// which is critical for the hot path.
+//
+// Hot-reload (Phase 8): only `circuit_breaker` sub-fields on EXISTING
+// upstream services can be live-reloaded. New or removed service names
+// log a warn and are skipped — the caller (HttpServer::Reload) still
+// fires the "restart required" diagnostic in that case.
+class CircuitBreakerManager {
+public:
+  // Builds one CircuitBreakerHost per upstream in `upstreams` — even
+  // when upstreams[i].circuit_breaker.enabled is false — so a later
+  // reload that flips enabled to true can take effect without
+  // re-wiring transition callbacks (disabled slices hold the callback
+  // but never invoke it).
+  //
+  // `partition_count` must match the server's dispatcher partition
+  // count (upstream pool / NetServer worker count). `dispatchers`
+  // captures the dispatcher list so Reload can route per-slice work.
+  CircuitBreakerManager(
+      const std::vector<UpstreamConfig>& upstreams,
+      size_t partition_count,
+      std::vector<std::shared_ptr<Dispatcher>> dispatchers);
+
+  CircuitBreakerManager(const CircuitBreakerManager&) = delete;
+  CircuitBreakerManager& operator=(const CircuitBreakerManager&) = delete;
+
+  // Hot-path lookup — returns nullptr for unknown service names.
+  // Thread-safe (post-construction `hosts_` is read-only).
+  CircuitBreakerHost* GetHost(const std::string& service_name);
+  const CircuitBreakerHost* GetHost(const std::string& service_name) const;
+
+  // Apply breaker-field edits to EXISTING upstream services. Topology
+  // changes (new/removed service names) are logged at warn and
+  // skipped — HttpServer::Reload is the only layer that warns about
+  // topology, and this manager trusts that signal. Serialized by
+  // reload_mtx_ so concurrent Reload calls queue cleanly; the hot
+  // path does NOT take this lock.
+  void Reload(const std::vector<UpstreamConfig>& new_upstreams);
+
+  // Observability — snapshots every host. Safe from any thread.
+  std::vector<CircuitBreakerHostSnapshot> SnapshotAll() const;
+
+  // Test/admin helpers.
+  size_t host_count() const { return hosts_.size(); }
+
+private:
+  // Post-construction read-only — keys and unique_ptr values never
+  // change, so lookups don't need a lock.
+  std::unordered_map<std::string, std::unique_ptr<CircuitBreakerHost>> hosts_;
+  std::vector<std::shared_ptr<Dispatcher>> dispatchers_;
+
+  // Serializes concurrent Reload calls. NOT taken on the hot path.
+ mutable std::mutex reload_mtx_; +}; + +} // namespace circuit_breaker diff --git a/include/circuit_breaker/retry_budget.h b/include/circuit_breaker/retry_budget.h new file mode 100644 index 00000000..dd4da11c --- /dev/null +++ b/include/circuit_breaker/retry_budget.h @@ -0,0 +1,126 @@ +#pragma once + +#include "common.h" +// , provided by common.h + +namespace circuit_breaker { + +// Retry budget — orthogonal to the breaker state machine. +// +// Problem: even when the circuit is CLOSED, a cascading failure on a +// healthy-looking upstream can be amplified by per-request retries. If +// 100 requests are in flight and each retries once, the upstream sees +// 200. If each retries twice, 300. A sick-but-not-dead upstream gets +// tipped over by the retry multiplier itself. +// +// Fix: cap concurrent retries as a fraction of concurrent non-retry +// traffic plus a floor for low-volume correctness. +// +// allowed_retries = max(min_concurrency, in_flight * percent / 100) +// +// The retry budget is PER-HOST (one instance owned by CircuitBreakerHost, +// shared across its partitions — the percent math is about aggregate +// upstream load, not per-dispatcher slicing). All counters are atomic +// relaxed — snapshots can be slightly stale, which is fine for a +// capacity gate on a retry storm. +// +// Usage (Phase 5 wires this in): +// 1. On every attempt (first or retry), call TrackInFlight() and keep +// the returned guard alive until the attempt completes. The guard +// decrements in_flight_ in its destructor. +// 2. Before issuing a retry attempt, call TryConsumeRetry(). Proceed +// if it returns true; reject as RETRY_BUDGET_EXHAUSTED if false. +// 3. When the retried attempt completes, call ReleaseRetry(). +class RetryBudget { +public: + // `percent` — cap retries at this % of in-flight (0-100). 
+  // `min_concurrency` — always allow at least this many concurrent
+  // retries regardless of in_flight; ensures low-volume correctness
+  // (without it, a 20% budget allows 0 retries when in_flight < 5).
+  RetryBudget(int percent, int min_concurrency);
+
+  // Non-copyable, non-movable. Lifetime-stable under its owner
+  // (CircuitBreakerHost).
+  RetryBudget(const RetryBudget&) = delete;
+  RetryBudget& operator=(const RetryBudget&) = delete;
+
+  // RAII guard — decrements in_flight_ on destruction. Move-only.
+  class InFlightGuard {
+  public:
+    InFlightGuard() = default;
+    explicit InFlightGuard(std::atomic<int64_t>* counter) : counter_(counter) {}
+    ~InFlightGuard() {
+      if (counter_) counter_->fetch_sub(1, std::memory_order_relaxed);
+    }
+    InFlightGuard(InFlightGuard&& o) noexcept : counter_(o.counter_) {
+      o.counter_ = nullptr;
+    }
+    InFlightGuard& operator=(InFlightGuard&& o) noexcept {
+      if (this != &o) {
+        if (counter_) counter_->fetch_sub(1, std::memory_order_relaxed);
+        counter_ = o.counter_;
+        o.counter_ = nullptr;
+      }
+      return *this;
+    }
+    InFlightGuard(const InFlightGuard&) = delete;
+    InFlightGuard& operator=(const InFlightGuard&) = delete;
+
+  private:
+    std::atomic<int64_t>* counter_ = nullptr;
+  };
+
+  // Call on every upstream attempt entry (first try OR retry). The
+  // returned guard MUST outlive the attempt — typically stored as a
+  // ProxyTransaction member. Never returns an empty guard.
+  InFlightGuard TrackInFlight();
+
+  // Call BEFORE issuing a retry attempt. Returns true if the retry
+  // fits under the budget (retries_in_flight < cap); caller must pair
+  // a true return with a matching ReleaseRetry when the retry
+  // completes. Returns false if over budget — caller must NOT retry
+  // and must NOT call ReleaseRetry.
+  //
+  // The cap is computed against a freshly-loaded in_flight snapshot:
+  //   cap = max(min_concurrency, in_flight * percent / 100)
+  bool TryConsumeRetry();
+
+  // Call when a consumed retry attempt finishes.
Must be paired with a + // prior successful TryConsumeRetry. + void ReleaseRetry(); + + // Apply new tuning. Thread-safe (atomics). Preserves in-flight counters + // — only the admission formula changes. + void Reload(int percent, int min_concurrency); + + // Observability — safe from any thread, relaxed. + int64_t InFlight() const { + return in_flight_.load(std::memory_order_relaxed); + } + int64_t RetriesInFlight() const { + return retries_in_flight_.load(std::memory_order_relaxed); + } + int64_t RetriesRejected() const { + return retries_rejected_.load(std::memory_order_relaxed); + } + + int percent() const { return percent_.load(std::memory_order_relaxed); } + int min_concurrency() const { + return min_concurrency_.load(std::memory_order_relaxed); + } + +private: + // Tuning — atomic so Reload() is lock-free. + std::atomic percent_; + std::atomic min_concurrency_; + + // Counters (relaxed — admission decisions tolerate slightly stale + // reads; correctness depends on each guard's fetch_sub pairing with + // its increment, which holds under relaxed because they touch the + // same atomic). + std::atomic in_flight_{0}; + std::atomic retries_in_flight_{0}; + std::atomic retries_rejected_{0}; +}; + +} // namespace circuit_breaker diff --git a/include/config/server_config.h b/include/config/server_config.h index 5a6a39f4..8a8e8ed4 100644 --- a/include/config/server_config.h +++ b/include/config/server_config.h @@ -152,12 +152,12 @@ struct CircuitBreakerConfig { // Safety valve (future-proof for load-balanced services; no-op v1). int max_ejection_percent_per_host_set = 50; - // NOTE: retry_budget_percent and retry_budget_min_concurrency have been - // REMOVED from Phase 2. They'll be re-added in Phase 3 when the - // RetryBudget class is introduced (design §4.5). 
Exposing them here as - // config knobs without any runtime code reading them was misleading to - // operators — setting them produced no protection against retry storms - // since ProxyHandler's RetryPolicy reads proxy.retry.*, not these fields. + // Retry budget (orthogonal to the breaker). Caps concurrent retries to + // max(retry_budget_min_concurrency, in_flight * retry_budget_percent/100). + // Wired into the request path in Phase 5; in Phase 3 these are read by + // CircuitBreakerHost to construct its owned RetryBudget. + int retry_budget_percent = 20; + int retry_budget_min_concurrency = 3; bool operator==(const CircuitBreakerConfig& o) const { return enabled == o.enabled && @@ -169,7 +169,9 @@ struct CircuitBreakerConfig { permitted_half_open_calls == o.permitted_half_open_calls && base_open_duration_ms == o.base_open_duration_ms && max_open_duration_ms == o.max_open_duration_ms && - max_ejection_percent_per_host_set == o.max_ejection_percent_per_host_set; + max_ejection_percent_per_host_set == o.max_ejection_percent_per_host_set && + retry_budget_percent == o.retry_budget_percent && + retry_budget_min_concurrency == o.retry_budget_min_concurrency; } bool operator!=(const CircuitBreakerConfig& o) const { return !(*this == o); } }; diff --git a/server/circuit_breaker_host.cc b/server/circuit_breaker_host.cc new file mode 100644 index 00000000..b41635a6 --- /dev/null +++ b/server/circuit_breaker_host.cc @@ -0,0 +1,140 @@ +#include "circuit_breaker/circuit_breaker_host.h" +#include "dispatcher.h" +#include "log/logger.h" + +namespace circuit_breaker { + +CircuitBreakerHost::CircuitBreakerHost(std::string service_name, + std::string host, + int port, + size_t partition_count, + const CircuitBreakerConfig& config) + : service_name_(std::move(service_name)), + host_(std::move(host)), + port_(port), + config_(config), + retry_budget_(std::make_unique( + config.retry_budget_percent, + config.retry_budget_min_concurrency)) { + // Clamp partition_count — a 
zero-partition host would be unusable
+  // (no slices to dispatch to). Tests or misuse may pass 0; log and
+  // clamp to 1 so the host is at least consistent.
+  if (partition_count == 0) {
+    logging::Get()->error(
+        "CircuitBreakerHost({}, {}:{}) constructed with 0 partitions; "
+        "clamping to 1",
+        service_name_, host_, port_);
+    partition_count = 1;
+  }
+
+  slices_.reserve(partition_count);
+  for (size_t i = 0; i < partition_count; ++i) {
+    // Per-slice label for logs — lets operators grep logs for a
+    // specific host:partition pair.
+    std::string label = service_name_ + ":" + host_ + ":" +
+        std::to_string(port_) + " p=" + std::to_string(i);
+    slices_.emplace_back(std::make_unique<CircuitBreakerSlice>(
+        std::move(label), i, config_));
+  }
+  logging::Get()->debug(
+      "CircuitBreakerHost created service={} host={}:{} partitions={} "
+      "enabled={} retry_budget={}%,min={}",
+      service_name_, host_, port_, partition_count,
+      config_.enabled,
+      config_.retry_budget_percent,
+      config_.retry_budget_min_concurrency);
+}
+
+CircuitBreakerSlice* CircuitBreakerHost::GetSlice(size_t dispatcher_index) {
+  if (dispatcher_index >= slices_.size()) return nullptr;
+  return slices_[dispatcher_index].get();
+}
+
+CircuitBreakerHostSnapshot CircuitBreakerHost::Snapshot() const {
+  CircuitBreakerHostSnapshot snap;
+  snap.service_name = service_name_;
+  snap.host = host_;
+  snap.port = port_;
+  snap.slices.reserve(slices_.size());
+
+  for (const auto& slice : slices_) {
+    CircuitBreakerHostSnapshot::SliceRow row;
+    row.dispatcher_index = slice->dispatcher_index();
+    row.state = slice->CurrentState();
+    row.trips = slice->Trips();
+    row.rejected = slice->Rejected();
+    row.probe_successes = slice->ProbeSuccesses();
+    row.probe_failures = slice->ProbeFailures();
+
+    snap.total_trips += row.trips;
+    snap.total_rejected += row.rejected;
+    if (row.state == State::OPEN) ++snap.open_partitions;
+    else if (row.state == State::HALF_OPEN) ++snap.half_open_partitions;
+
+    snap.slices.push_back(row);
+  }
+
+  // Retry
budget aggregate (host-level, not per-partition). + snap.retries_in_flight = retry_budget_->RetriesInFlight(); + snap.retries_rejected = retry_budget_->RetriesRejected(); + snap.in_flight = retry_budget_->InFlight(); + + return snap; +} + +void CircuitBreakerHost::Reload( + const std::vector>& dispatchers, + const CircuitBreakerConfig& new_config) { + // Dispatcher list must match the slice count one-for-one — the + // slice at index i lives on dispatcher i. A size mismatch is a + // programming error (topology changed post-construction, which is + // restart-only); log and bail rather than mis-dispatching. + if (dispatchers.size() != slices_.size()) { + logging::Get()->error( + "CircuitBreakerHost::Reload({}:{}) dispatcher count mismatch: " + "got {}, expected {} — reload skipped", + service_name_, host_, dispatchers.size(), slices_.size()); + return; + } + + // Update host-level retry budget fields immediately — atomic stores, + // no dispatcher routing needed. RetryBudget::Reload clamps internally. + retry_budget_->Reload(new_config.retry_budget_percent, + new_config.retry_budget_min_concurrency); + + // Enqueue per-slice Reload on each owning dispatcher. The slice is + // dispatcher-thread-local for mutation, so the config swap must + // happen there. Passing slice as raw pointer is safe: slices_ is + // owned by `this` (the host), which outlives the manager's reload + // (enforced by CircuitBreakerManager's lifetime). + for (size_t i = 0; i < slices_.size(); ++i) { + CircuitBreakerSlice* slice = slices_[i].get(); + auto& dispatcher = dispatchers[i]; + if (!dispatcher) { + logging::Get()->error( + "CircuitBreakerHost::Reload({}:{}) null dispatcher at index {}", + service_name_, host_, i); + continue; + } + dispatcher->EnQueue([slice, new_config]() { + slice->Reload(new_config); + }); + } + + // Save the new config for future Snapshot() / construction-like + // operations. Other threads never read config_ directly. 
+ config_ = new_config; +} + +void CircuitBreakerHost::SetTransitionCallbackOnAllSlices( + StateTransitionCallback cb) { + for (auto& slice : slices_) { + // Copy the callback so each slice owns its own std::function. + // Passing by value into SetTransitionCallback gives each slice + // an independent copy, avoiding cross-partition std::function + // data races. + slice->SetTransitionCallback(cb); + } +} + +} // namespace circuit_breaker diff --git a/server/circuit_breaker_manager.cc b/server/circuit_breaker_manager.cc new file mode 100644 index 00000000..7e4a8035 --- /dev/null +++ b/server/circuit_breaker_manager.cc @@ -0,0 +1,105 @@ +#include "circuit_breaker/circuit_breaker_manager.h" +#include "log/logger.h" +#include + +namespace circuit_breaker { + +CircuitBreakerManager::CircuitBreakerManager( + const std::vector& upstreams, + size_t partition_count, + std::vector> dispatchers) + : dispatchers_(std::move(dispatchers)) { + // Build one Host per upstream regardless of .circuit_breaker.enabled. + // Disabled hosts still need a live Slice so a later reload can flip + // them on without re-wiring transition callbacks (design §3.1). + hosts_.reserve(upstreams.size()); + for (const auto& u : upstreams) { + if (u.name.empty()) { + // ConfigLoader::Validate rejects empty names upstream, but + // defense in depth — skip rather than insert an unreachable + // host with an empty key that would shadow future lookups. + logging::Get()->error( + "CircuitBreakerManager: skipping upstream with empty name"); + continue; + } + auto [it, inserted] = hosts_.emplace( + u.name, + std::make_unique( + u.name, u.host, u.port, partition_count, u.circuit_breaker)); + if (!inserted) { + // Duplicate service name — shouldn't happen (Validate checks + // uniqueness), but log so the collision is visible rather + // than silently dropping the second entry. 
+      logging::Get()->error(
+          "CircuitBreakerManager: duplicate upstream name '{}' ignored",
+          u.name);
+    }
+  }
+  logging::Get()->info(
+      "CircuitBreakerManager initialized hosts={} partitions={}",
+      hosts_.size(), partition_count);
+}
+
+CircuitBreakerHost* CircuitBreakerManager::GetHost(
+    const std::string& service_name) {
+  auto it = hosts_.find(service_name);
+  return it == hosts_.end() ? nullptr : it->second.get();
+}
+
+const CircuitBreakerHost* CircuitBreakerManager::GetHost(
+    const std::string& service_name) const {
+  auto it = hosts_.find(service_name);
+  return it == hosts_.end() ? nullptr : it->second.get();
+}
+
+void CircuitBreakerManager::Reload(
+    const std::vector<UpstreamConfig>& new_upstreams) {
+  // Serialize with any other Reload calls. Hot path doesn't take this.
+  std::lock_guard<std::mutex> lk(reload_mtx_);
+
+  // Detect topology changes (added / removed service names) so we can
+  // log and skip — the authoritative "restart required" warning lives
+  // in HttpServer::Reload; we just honor the "existing hosts only"
+  // contract by applying breaker fields to matching names and nothing
+  // else.
+  std::unordered_set<std::string> new_names;
+  new_names.reserve(new_upstreams.size());
+  for (const auto& u : new_upstreams) new_names.insert(u.name);
+
+  for (const auto& u : new_upstreams) {
+    auto* host = GetHost(u.name);
+    if (!host) {
+      // New service name — topology change, skip. The outer
+      // reload layer warns.
+      logging::Get()->warn(
+          "CircuitBreakerManager::Reload: new upstream '{}' requires "
+          "restart (ignored)",
+          u.name);
+      continue;
+    }
+    host->Reload(dispatchers_, u.circuit_breaker);
+  }
+
+  // Log removals without touching the hosts (their removal also
+  // requires a restart).
+ for (const auto& [name, _] : hosts_) { + if (new_names.find(name) == new_names.end()) { + logging::Get()->warn( + "CircuitBreakerManager::Reload: removed upstream '{}' requires " + "restart (ignored)", + name); + } + } +} + +std::vector +CircuitBreakerManager::SnapshotAll() const { + std::vector snapshots; + snapshots.reserve(hosts_.size()); + for (const auto& [_, host] : hosts_) { + snapshots.push_back(host->Snapshot()); + } + return snapshots; +} + +} // namespace circuit_breaker diff --git a/server/config_loader.cc b/server/config_loader.cc index e3f7f6fe..f9b82540 100644 --- a/server/config_loader.cc +++ b/server/config_loader.cc @@ -310,9 +310,10 @@ ServerConfig ConfigLoader::LoadFromString(const std::string& json_str) { cb_int("max_open_duration_ms", 60000); upstream.circuit_breaker.max_ejection_percent_per_host_set = cb_int("max_ejection_percent_per_host_set", 50); - // retry_budget_* fields removed from Phase 2 — re-added in - // Phase 3 when the RetryBudget class lands. Unknown keys in - // input JSON are silently ignored by nlohmann::json. + upstream.circuit_breaker.retry_budget_percent = + cb_int("retry_budget_percent", 20); + upstream.circuit_breaker.retry_budget_min_concurrency = + cb_int("retry_budget_min_concurrency", 3); } config.upstreams.push_back(std::move(upstream)); @@ -896,7 +897,16 @@ void ConfigLoader::Validate(const ServerConfig& config) { idx + " ('" + u.name + "'): circuit_breaker.max_ejection_percent_per_host_set must be in [0, 100]"); } - // retry_budget_* validation removed — fields moved to Phase 3. + if (cb.retry_budget_percent < 0 || cb.retry_budget_percent > 100) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.retry_budget_percent must be in [0, 100]"); + } + if (cb.retry_budget_min_concurrency < 0) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.retry_budget_min_concurrency must be >= 0"); + } } // Validate method names — reject unknowns and duplicates. 
// Duplicates would cause RouteAsync to throw at startup. @@ -1178,7 +1188,10 @@ std::string ConfigLoader::ToJson(const ServerConfig& config) { u.circuit_breaker.max_open_duration_ms; cbj["max_ejection_percent_per_host_set"] = u.circuit_breaker.max_ejection_percent_per_host_set; - // retry_budget_* fields dropped from serialization — Phase 3 adds. + cbj["retry_budget_percent"] = + u.circuit_breaker.retry_budget_percent; + cbj["retry_budget_min_concurrency"] = + u.circuit_breaker.retry_budget_min_concurrency; uj["circuit_breaker"] = cbj; } j["upstreams"].push_back(uj); diff --git a/server/retry_budget.cc b/server/retry_budget.cc new file mode 100644 index 00000000..7246eb26 --- /dev/null +++ b/server/retry_budget.cc @@ -0,0 +1,72 @@ +#include "circuit_breaker/retry_budget.h" + +namespace circuit_breaker { + +namespace { + +// Clamp floors for direct-ctor / Reload callers that bypass +// ConfigLoader::Validate(). Mirrors the hardening elsewhere in the +// circuit-breaker code (window ctor, probe budget snapshot, +// ComputeOpenDuration) so programmatic callers can't disable the +// budget by passing pathological values. +// percent < 0 → 0 (pure min_concurrency floor, no %-based cap) +// percent > 100 → 100 (retries capped at total in_flight) +// min_concurrency < 0 → 0 (no floor) +int ClampPercent(int p) { + if (p < 0) return 0; + if (p > 100) return 100; + return p; +} +int ClampMinConcurrency(int m) { + return m < 0 ? 0 : m; +} + +} // namespace + +RetryBudget::RetryBudget(int percent, int min_concurrency) + : percent_(ClampPercent(percent)), + min_concurrency_(ClampMinConcurrency(min_concurrency)) {} + +RetryBudget::InFlightGuard RetryBudget::TrackInFlight() { + in_flight_.fetch_add(1, std::memory_order_relaxed); + return InFlightGuard(&in_flight_); +} + +bool RetryBudget::TryConsumeRetry() { + // Snapshot counters with relaxed — the gate is an approximate + // capacity check, not a strict admission lock. 
Racing callers may + // both read cap=N and both try to reserve; the worst case is that + // both succeed and we momentarily sit at retries_in_flight_ = + // cap+1, which is acceptable for a traffic-shaping gate (unlike a + // security-critical gate). + int64_t in_flight = in_flight_.load(std::memory_order_relaxed); + int pct = percent_.load(std::memory_order_relaxed); + int min_conc = min_concurrency_.load(std::memory_order_relaxed); + + // cap = max(min_concurrency, in_flight * percent / 100) + // Integer math is fine — percent is 0..100, in_flight is an int64. + // Overflow is impossible within reasonable load levels (in_flight + // would need to exceed ~2e16 to overflow after multiplying by 100). + int64_t pct_cap = (in_flight * pct) / 100; + int64_t cap = pct_cap > min_conc ? pct_cap : min_conc; + + int64_t current = retries_in_flight_.load(std::memory_order_relaxed); + if (current >= cap) { + retries_rejected_.fetch_add(1, std::memory_order_relaxed); + return false; + } + retries_in_flight_.fetch_add(1, std::memory_order_relaxed); + return true; +} + +void RetryBudget::ReleaseRetry() { + retries_in_flight_.fetch_sub(1, std::memory_order_relaxed); +} + +void RetryBudget::Reload(int percent, int min_concurrency) { + percent_.store(ClampPercent(percent), std::memory_order_relaxed); + min_concurrency_.store(ClampMinConcurrency(min_concurrency), + std::memory_order_relaxed); +} + +} // namespace circuit_breaker diff --git a/test/circuit_breaker_phase3_test.h b/test/circuit_breaker_phase3_test.h new file mode 100644 index 00000000..ba2f5554 --- /dev/null +++ b/test/circuit_breaker_phase3_test.h @@ -0,0 +1,496 @@ +#pragma once + +#include "test_framework.h" +#include "config/server_config.h" +#include "circuit_breaker/circuit_breaker_state.h" +#include "circuit_breaker/circuit_breaker_slice.h" +#include "circuit_breaker/retry_budget.h" +#include "circuit_breaker/circuit_breaker_host.h" +#include "circuit_breaker/circuit_breaker_manager.h" +#include "dispatcher.h" + 
+#include +#include +#include +#include + +// Phase 3 unit tests: RetryBudget, CircuitBreakerHost, CircuitBreakerManager. +// +// These tests exercise the standalone data structures introduced in Phase 3 +// without any integration into the request path (that comes in Phase 4). +// Every test constructs the object under test in isolation — no live +// dispatchers, no network I/O. A minimal Dispatcher is instantiated only +// where CircuitBreakerHost::Reload needs one to enqueue per-slice Reload +// calls. +namespace CircuitBreakerPhase3Tests { + +using circuit_breaker::CircuitBreakerHost; +using circuit_breaker::CircuitBreakerHostSnapshot; +using circuit_breaker::CircuitBreakerManager; +using circuit_breaker::Decision; +using circuit_breaker::FailureKind; +using circuit_breaker::RetryBudget; +using circuit_breaker::State; + +static CircuitBreakerConfig DefaultCbConfig() { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 5; + cb.failure_rate_threshold = 50; + cb.minimum_volume = 20; + cb.window_seconds = 10; + cb.permitted_half_open_calls = 3; + cb.base_open_duration_ms = 5000; + cb.max_open_duration_ms = 60000; + cb.retry_budget_percent = 20; + cb.retry_budget_min_concurrency = 3; + return cb; +} + +// ============================================================================ +// RetryBudget tests +// ============================================================================ + +// Min-concurrency floor: with tiny in_flight, min_concurrency still permits +// the configured floor of concurrent retries (otherwise a 20% budget allows 0 +// retries when in_flight < 5 — useless in low-volume services). +void TestRetryBudgetMinConcurrencyFloor() { + std::cout << "\n[TEST] RetryBudget: min_concurrency floor permits retries..." + << std::endl; + try { + // percent=20, min=3. Even with 0 in_flight, 3 retries allowed. + RetryBudget rb(20, 3); + + // Without any in_flight, min floor is what gates us. 
+ bool r1 = rb.TryConsumeRetry(); // 1/3 + bool r2 = rb.TryConsumeRetry(); // 2/3 + bool r3 = rb.TryConsumeRetry(); // 3/3 + bool r4 = rb.TryConsumeRetry(); // over → rejected + + bool pass = r1 && r2 && r3 && !r4 && + rb.RetriesInFlight() == 3 && + rb.RetriesRejected() == 1; + + rb.ReleaseRetry(); rb.ReleaseRetry(); rb.ReleaseRetry(); + pass = pass && rb.RetriesInFlight() == 0; + + TestFramework::RecordTest("RetryBudget min_concurrency floor", pass, + pass ? "" : "r1=" + std::to_string(r1) + + " r2=" + std::to_string(r2) + + " r3=" + std::to_string(r3) + + " r4=" + std::to_string(r4) + + " inflight=" + std::to_string(rb.RetriesInFlight()) + + " rejected=" + std::to_string(rb.RetriesRejected()), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("RetryBudget min_concurrency floor", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Percent-based cap scales with in_flight. +// percent=20, min=0, in_flight=50 → cap = 10 retries. +void TestRetryBudgetPercentCap() { + std::cout << "\n[TEST] RetryBudget: percent cap scales with in_flight..." + << std::endl; + try { + RetryBudget rb(20, 0); // no min floor — pure percent + + // Push in_flight to 50 via guards that we intentionally keep alive. + std::vector guards; + for (int i = 0; i < 50; ++i) guards.push_back(rb.TrackInFlight()); + + // 50 * 20% = 10 retries allowed. + int admitted = 0; + for (int i = 0; i < 20; ++i) { + if (rb.TryConsumeRetry()) ++admitted; + } + bool cap_hit = admitted == 10; + bool rejected_count = rb.RetriesRejected() == 10; + + // Release guards — in_flight drops to 0; future TryConsumeRetry with + // min=0 and in_flight=0 rejects everything. + for (auto& g : guards) (void)std::move(g); + guards.clear(); + for (int i = 0; i < admitted; ++i) rb.ReleaseRetry(); + + bool pass = cap_hit && rejected_count && rb.InFlight() == 0 && + rb.RetriesInFlight() == 0; + TestFramework::RecordTest("RetryBudget percent cap", pass, + pass ? 
"" : "admitted=" + std::to_string(admitted) + + " rejected=" + std::to_string(rb.RetriesRejected()) + + " inflight=" + std::to_string(rb.InFlight()), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("RetryBudget percent cap", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +// TrackInFlight guards must be RAII-safe: destroying the guard decrements +// in_flight_; moving the guard transfers ownership; self-move safe. +void TestRetryBudgetInFlightGuardRaii() { + std::cout << "\n[TEST] RetryBudget: InFlightGuard RAII..." << std::endl; + try { + RetryBudget rb(20, 3); + + bool zero_init = rb.InFlight() == 0; + { + auto g = rb.TrackInFlight(); + bool one_after_track = rb.InFlight() == 1; + + // Move-construct: counter transfers, original is empty. + auto g2 = std::move(g); + bool still_one_after_move = rb.InFlight() == 1; + // g is now empty, destroying it decrements nothing. + (void)g; + + // g2 goes out of scope next. + if (!zero_init || !one_after_track || !still_one_after_move) { + TestFramework::RecordTest("RetryBudget InFlightGuard RAII", + false, "mid-test state wrong", + TestFramework::TestCategory::OTHER); + return; + } + } + bool zero_after_drop = rb.InFlight() == 0; + TestFramework::RecordTest("RetryBudget InFlightGuard RAII", + zero_after_drop, + zero_after_drop ? "" : "in_flight not zero after guard drop", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("RetryBudget InFlightGuard RAII", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Reload updates tuning atomically without resetting in-flight counters — +// the admission formula changes, outstanding retries keep running. +void TestRetryBudgetReloadPreservesCounters() { + std::cout << "\n[TEST] RetryBudget: Reload preserves in-flight..." + << std::endl; + try { + RetryBudget rb(20, 3); + bool r1 = rb.TryConsumeRetry(); // 1/3 + + // Tighten tuning mid-flight. 
+ rb.Reload(10, 1); + + // Outstanding retry is still tracked. + bool inflight_preserved = rb.RetriesInFlight() == 1; + + // New tuning applies — min=1, so 1/1 retry allowed max. + // Current retries_in_flight=1 already, next attempt rejects. + bool r2 = rb.TryConsumeRetry(); + + rb.ReleaseRetry(); + bool cleanup_ok = rb.RetriesInFlight() == 0; + + bool pass = r1 && inflight_preserved && !r2 && cleanup_ok; + TestFramework::RecordTest("RetryBudget Reload preserves counters", pass, + pass ? "" : "r1=" + std::to_string(r1) + + " inflight_preserved=" + std::to_string(inflight_preserved) + + " r2=" + std::to_string(r2) + + " cleanup_ok=" + std::to_string(cleanup_ok), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("RetryBudget Reload preserves counters", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Clamp guards: negative percent / negative min_concurrency are clamped at +// construction (mirrors ConfigLoader::Validate — programmatic callers that +// bypass validation get safe defaults). +void TestRetryBudgetClampsInvalidTuning() { + std::cout << "\n[TEST] RetryBudget: clamps invalid tuning..." << std::endl; + try { + RetryBudget rb(-50, -10); + bool clamped = rb.percent() == 0 && rb.min_concurrency() == 0; + + // Over-max percent clamps to 100. + RetryBudget rb2(500, 5); + bool over_clamped = rb2.percent() == 100; + + // Reload also clamps. + rb.Reload(-1, -1); + bool reload_clamped = rb.percent() == 0 && rb.min_concurrency() == 0; + + bool pass = clamped && over_clamped && reload_clamped; + TestFramework::RecordTest("RetryBudget clamps invalid tuning", pass, + pass ? 
"" : + "clamped=" + std::to_string(clamped) + + " over_clamped=" + std::to_string(over_clamped) + + " reload_clamped=" + std::to_string(reload_clamped), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("RetryBudget clamps invalid tuning", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// ============================================================================ +// CircuitBreakerHost tests +// ============================================================================ + +// Host creates partition_count slices, GetSlice looks up by index, out-of- +// range returns nullptr (not a crash). +void TestHostCreatesSlicesAndGetSlice() { + std::cout << "\n[TEST] CircuitBreakerHost: creates slices + GetSlice..." + << std::endl; + try { + auto cb = DefaultCbConfig(); + CircuitBreakerHost host("svc", "10.0.0.1", 8080, 4, cb); + + bool count_ok = host.partition_count() == 4; + bool slice0 = host.GetSlice(0) != nullptr; + bool slice3 = host.GetSlice(3) != nullptr; + bool slice4_null = host.GetSlice(4) == nullptr; // out of range + bool slice_big_null = host.GetSlice(100) == nullptr; + + // Retry budget always present. + bool rb_present = host.GetRetryBudget() != nullptr; + + // Field getters. + bool fields_ok = host.service_name() == "svc" && + host.host() == "10.0.0.1" && + host.port() == 8080; + + bool pass = count_ok && slice0 && slice3 && slice4_null && + slice_big_null && rb_present && fields_ok; + TestFramework::RecordTest("CircuitBreakerHost GetSlice", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CircuitBreakerHost GetSlice", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Host Snapshot aggregates counters across slices and rolls up states. +void TestHostSnapshotAggregates() { + std::cout << "\n[TEST] CircuitBreakerHost: Snapshot aggregates..." 
+ << std::endl; + try { + auto cb = DefaultCbConfig(); + cb.consecutive_failure_threshold = 2; + cb.failure_rate_threshold = 100; + cb.minimum_volume = 1000; + CircuitBreakerHost host("svc", "h", 80, 3, cb); + + // Trip slice 0 and 2 → 2 open_partitions, 1 closed. + for (int p : {0, 2}) { + auto* s = host.GetSlice(p); + for (int i = 0; i < 2; ++i) { + auto a = s->TryAcquire(); + s->ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); + } + } + + auto snap = host.Snapshot(); + + bool rows_ok = snap.slices.size() == 3; + bool total_trips = snap.total_trips == 2; + bool open = snap.open_partitions == 2; + bool halfopen = snap.half_open_partitions == 0; + bool svc_ok = snap.service_name == "svc" && + snap.host == "h" && snap.port == 80; + + bool pass = rows_ok && total_trips && open && halfopen && svc_ok; + TestFramework::RecordTest("CircuitBreakerHost Snapshot aggregates", pass, + pass ? "" : + "rows=" + std::to_string(snap.slices.size()) + + " trips=" + std::to_string(snap.total_trips) + + " open=" + std::to_string(snap.open_partitions), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CircuitBreakerHost Snapshot aggregates", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Host Reload with mismatched dispatcher count logs error and does nothing. +// Uses an empty dispatcher vector — the mismatch path must NOT dereference. +void TestHostReloadDispatcherMismatchIsSafe() { + std::cout << "\n[TEST] CircuitBreakerHost: Reload dispatcher mismatch..." + << std::endl; + try { + auto cb = DefaultCbConfig(); + CircuitBreakerHost host("svc", "h", 80, 3, cb); + + auto new_cb = cb; + new_cb.failure_rate_threshold = 80; + + // Mismatch: 0 dispatchers vs 3 slices. Must not crash, must not + // apply (retry budget atomics should stay at old values). + std::vector> empty; + host.Reload(empty, new_cb); + + // Retry budget fields should be unchanged — Reload bailed early. 
+ bool rb_unchanged = + host.GetRetryBudget()->percent() == cb.retry_budget_percent && + host.GetRetryBudget()->min_concurrency() == + cb.retry_budget_min_concurrency; + + TestFramework::RecordTest("CircuitBreakerHost Reload mismatch is safe", + rb_unchanged, + rb_unchanged ? "" : "retry budget incorrectly updated on bail", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CircuitBreakerHost Reload mismatch is safe", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// ============================================================================ +// CircuitBreakerManager tests +// ============================================================================ + +// Manager builds one host per upstream (regardless of enabled). GetHost +// returns non-null for known names and null for unknown. +void TestManagerGetHostLookup() { + std::cout << "\n[TEST] CircuitBreakerManager: GetHost lookup..." + << std::endl; + try { + std::vector upstreams(2); + upstreams[0].name = "svc-a"; + upstreams[0].host = "10.0.0.1"; + upstreams[0].port = 8080; + upstreams[0].circuit_breaker = DefaultCbConfig(); + upstreams[1].name = "svc-b"; + upstreams[1].host = "10.0.0.2"; + upstreams[1].port = 9090; + upstreams[1].circuit_breaker = DefaultCbConfig(); + upstreams[1].circuit_breaker.enabled = false; // disabled still built + + CircuitBreakerManager mgr(upstreams, 4, {}); + + bool count_ok = mgr.host_count() == 2; + auto* a = mgr.GetHost("svc-a"); + auto* b = mgr.GetHost("svc-b"); + auto* unknown = mgr.GetHost("nope"); + + bool a_ok = a != nullptr && a->port() == 8080 && + a->partition_count() == 4; + bool b_ok = b != nullptr && b->port() == 9090 && + b->partition_count() == 4; + bool unknown_null = unknown == nullptr; + + bool pass = count_ok && a_ok && b_ok && unknown_null; + TestFramework::RecordTest("CircuitBreakerManager GetHost lookup", pass, + pass ? 
"" : + "count_ok=" + std::to_string(count_ok) + + " a=" + std::to_string(a_ok) + + " b=" + std::to_string(b_ok) + + " unknown_null=" + std::to_string(unknown_null), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CircuitBreakerManager GetHost lookup", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// SnapshotAll returns one entry per host; topology-preserved Reload logs and +// skips new/removed names without crashing. +void TestManagerSnapshotAllAndReloadSkipsTopologyChanges() { + std::cout << "\n[TEST] CircuitBreakerManager: SnapshotAll + Reload skips topology..." + << std::endl; + try { + std::vector upstreams(1); + upstreams[0].name = "svc-a"; + upstreams[0].host = "h"; + upstreams[0].port = 80; + upstreams[0].circuit_breaker = DefaultCbConfig(); + + CircuitBreakerManager mgr(upstreams, 2, {}); + + auto snaps = mgr.SnapshotAll(); + bool one_snapshot = snaps.size() == 1; + bool snap_name_ok = snaps[0].service_name == "svc-a"; + + // Reload with a NEW name + REMOVED existing name — both must log + // warn and do nothing (topology is restart-only). + std::vector new_upstreams(1); + new_upstreams[0].name = "svc-NEW"; + new_upstreams[0].host = "h"; + new_upstreams[0].port = 80; + new_upstreams[0].circuit_breaker = DefaultCbConfig(); + + mgr.Reload(new_upstreams); + + // Manager must still only know about svc-a (the original). + bool original_preserved = mgr.GetHost("svc-a") != nullptr; + bool new_not_added = mgr.GetHost("svc-NEW") == nullptr; + bool count_stable = mgr.host_count() == 1; + + bool pass = one_snapshot && snap_name_ok && original_preserved && + new_not_added && count_stable; + TestFramework::RecordTest( + "CircuitBreakerManager SnapshotAll + topology-skip", pass, + pass ? 
"" : + "one_snap=" + std::to_string(one_snapshot) + + " name_ok=" + std::to_string(snap_name_ok) + + " preserved=" + std::to_string(original_preserved) + + " new_not_added=" + std::to_string(new_not_added) + + " count=" + std::to_string(mgr.host_count()), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CircuitBreakerManager SnapshotAll + topology-skip", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Empty-name upstream is skipped defensively (ConfigLoader::Validate rejects +// empty names, but manager must not blow up if something slips through). +void TestManagerSkipsEmptyNameUpstream() { + std::cout << "\n[TEST] CircuitBreakerManager: skips empty-name upstream..." + << std::endl; + try { + std::vector upstreams(2); + upstreams[0].name = ""; // defensive — should be skipped + upstreams[0].host = "h"; + upstreams[0].port = 80; + upstreams[0].circuit_breaker = DefaultCbConfig(); + upstreams[1].name = "svc-b"; + upstreams[1].host = "h"; + upstreams[1].port = 81; + upstreams[1].circuit_breaker = DefaultCbConfig(); + + CircuitBreakerManager mgr(upstreams, 2, {}); + + bool pass = mgr.host_count() == 1 && + mgr.GetHost("svc-b") != nullptr && + mgr.GetHost("") == nullptr; + TestFramework::RecordTest( + "CircuitBreakerManager skips empty-name upstream", pass, + pass ? "" : "count=" + std::to_string(mgr.host_count()), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CircuitBreakerManager skips empty-name upstream", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Run all Phase 3 tests. 
+void RunAllTests() { + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "CIRCUIT BREAKER PHASE 3 - UNIT TESTS" << std::endl; + std::cout << std::string(60, '=') << std::endl; + + TestRetryBudgetMinConcurrencyFloor(); + TestRetryBudgetPercentCap(); + TestRetryBudgetInFlightGuardRaii(); + TestRetryBudgetReloadPreservesCounters(); + TestRetryBudgetClampsInvalidTuning(); + + TestHostCreatesSlicesAndGetSlice(); + TestHostSnapshotAggregates(); + TestHostReloadDispatcherMismatchIsSafe(); + + TestManagerGetHostLookup(); + TestManagerSnapshotAllAndReloadSkipsTopologyChanges(); + TestManagerSkipsEmptyNameUpstream(); +} + +} // namespace CircuitBreakerPhase3Tests diff --git a/test/config_test.h b/test/config_test.h index 6317151f..fe164ec3 100644 --- a/test/config_test.h +++ b/test/config_test.h @@ -362,8 +362,9 @@ namespace ConfigTests { cb.permitted_half_open_calls == 5 && cb.base_open_duration_ms == 5000 && cb.max_open_duration_ms == 60000 && - cb.max_ejection_percent_per_host_set == 50; - // retry_budget_* fields removed from Phase 2 — Phase 3 adds. + cb.max_ejection_percent_per_host_set == 50 && + cb.retry_budget_percent == 20 && + cb.retry_budget_min_concurrency == 3; TestFramework::RecordTest("Circuit Breaker Defaults", pass, pass ? 
"" : "default value mismatch", TestFramework::TestCategory::OTHER); @@ -392,7 +393,9 @@ namespace ConfigTests { "permitted_half_open_calls": 3, "base_open_duration_ms": 2000, "max_open_duration_ms": 120000, - "max_ejection_percent_per_host_set": 33 + "max_ejection_percent_per_host_set": 33, + "retry_budget_percent": 10, + "retry_budget_min_concurrency": 5 } }] })"; @@ -406,7 +409,9 @@ namespace ConfigTests { cb.permitted_half_open_calls == 3 && cb.base_open_duration_ms == 2000 && cb.max_open_duration_ms == 120000 && - cb.max_ejection_percent_per_host_set == 33; + cb.max_ejection_percent_per_host_set == 33 && + cb.retry_budget_percent == 10 && + cb.retry_budget_min_concurrency == 5; TestFramework::RecordTest("Circuit Breaker JSON Parse", pass, pass ? "" : "parsed values mismatch", TestFramework::TestCategory::OTHER); @@ -520,8 +525,12 @@ namespace ConfigTests { ExpectValidationFailure("CB Validation: max= base_open_duration_ms"); - // retry_budget_percent / retry_budget_min_concurrency validation - // cases removed — fields moved to Phase 3. 
+ ExpectValidationFailure("CB Validation: retry_budget_percent>100", + R"({"retry_budget_percent": 200})", + "retry_budget_percent must be in [0, 100]"); + ExpectValidationFailure("CB Validation: retry_budget_min_concurrency<0", + R"({"retry_budget_min_concurrency": -1})", + "retry_budget_min_concurrency must be >= 0"); ExpectValidationFailure("CB Validation: max_ejection_percent>100", R"({"max_ejection_percent_per_host_set": 150})", "max_ejection_percent_per_host_set must be in [0, 100]"); diff --git a/test/run_test.cc b/test/run_test.cc index 3d55f06f..f118d495 100644 --- a/test/run_test.cc +++ b/test/run_test.cc @@ -14,6 +14,7 @@ #include "proxy_test.h" #include "rate_limit_test.h" #include "circuit_breaker_test.h" +#include "circuit_breaker_phase3_test.h" #include "test_framework.h" #include #include @@ -81,6 +82,9 @@ void RunAllTest(){ // Run circuit breaker tests CircuitBreakerTests::RunAllTests(); + // Run circuit breaker Phase 3 tests (host / manager / retry budget) + CircuitBreakerPhase3Tests::RunAllTests(); + std::cout << "====================================\n" << std::endl; } From 548e16982becc0a670045b28726e1ec7d3e148e5 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 13:22:11 +0800 Subject: [PATCH 16/37] Finished Phase4: Host + manager + retry budget --- Makefile | 2 +- include/http/http_server.h | 14 + include/upstream/pool_partition.h | 5 + include/upstream/proxy_transaction.h | 87 +++++- include/upstream/upstream_manager.h | 29 ++ server/http_server.cc | 23 ++ server/proxy_transaction.cc | 275 ++++++++++++++++- test/circuit_breaker_phase4_test.h | 440 +++++++++++++++++++++++++++ test/run_test.cc | 5 + 9 files changed, 865 insertions(+), 15 deletions(-) create mode 100644 test/circuit_breaker_phase4_test.h diff --git a/Makefile b/Makefile index 935949c8..2dbd8c2a 100644 --- a/Makefile +++ b/Makefile @@ -147,7 +147,7 @@ UPSTREAM_HEADERS = $(LIB_DIR)/upstream/upstream_manager.h $(LIB_DIR)/upstream/up RATE_LIMIT_HEADERS = 
$(LIB_DIR)/rate_limit/token_bucket.h $(LIB_DIR)/rate_limit/rate_limit_zone.h $(LIB_DIR)/rate_limit/rate_limiter.h CIRCUIT_BREAKER_HEADERS = $(LIB_DIR)/circuit_breaker/circuit_breaker_state.h $(LIB_DIR)/circuit_breaker/circuit_breaker_window.h $(LIB_DIR)/circuit_breaker/circuit_breaker_slice.h $(LIB_DIR)/circuit_breaker/retry_budget.h $(LIB_DIR)/circuit_breaker/circuit_breaker_host.h $(LIB_DIR)/circuit_breaker/circuit_breaker_manager.h CLI_HEADERS = $(LIB_DIR)/cli/cli_parser.h $(LIB_DIR)/cli/signal_handler.h $(LIB_DIR)/cli/pid_file.h $(LIB_DIR)/cli/version.h $(LIB_DIR)/cli/daemonizer.h -TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h $(TEST_DIR)/proxy_test.h $(TEST_DIR)/rate_limit_test.h $(TEST_DIR)/kqueue_test.h $(TEST_DIR)/circuit_breaker_test.h $(TEST_DIR)/circuit_breaker_phase3_test.h +TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h $(TEST_DIR)/proxy_test.h $(TEST_DIR)/rate_limit_test.h $(TEST_DIR)/kqueue_test.h $(TEST_DIR)/circuit_breaker_test.h $(TEST_DIR)/circuit_breaker_phase3_test.h $(TEST_DIR)/circuit_breaker_phase4_test.h # All headers combined HEADERS = $(CORE_HEADERS) $(CALLBACK_HEADERS) $(REACTOR_HEADERS) $(NETWORK_HEADERS) $(SERVER_HEADERS) $(THREAD_POOL_HEADERS) $(UTIL_HEADERS) $(FOUNDATION_HEADERS) $(HTTP_HEADERS) $(HTTP2_HEADERS) $(WS_HEADERS) $(TLS_HEADERS) 
$(UPSTREAM_HEADERS) $(RATE_LIMIT_HEADERS) $(CIRCUIT_BREAKER_HEADERS) $(CLI_HEADERS) $(TEST_HEADERS)

diff --git a/include/http/http_server.h b/include/http/http_server.h
index 8a497d8b..75e1e9a0 100644
--- a/include/http/http_server.h
+++ b/include/http/http_server.h
@@ -22,6 +22,10 @@
 class UpstreamManager;
 class ProxyHandler;
 
+namespace circuit_breaker {
+class CircuitBreakerManager;
+}
+
 class HttpServer {
  public:
   // Snapshot of server runtime statistics. All values are approximate
@@ -321,6 +325,16 @@ class HttpServer {
   std::vector<UpstreamConfig> upstream_configs_;
   std::unique_ptr<UpstreamManager> upstream_manager_;
 
+  // Circuit breaker — declared AFTER upstream_manager_ so destruction
+  // order is breaker-FIRST, pool-SECOND (design §3.1). On shutdown the
+  // breaker's slices may still be consulted by in-flight
+  // ProxyTransactions until they drain; destroying the breaker first
+  // (before the pool) is safe because UpstreamManager's outstanding
+  // breaker_manager_ pointer is checked against null on every lookup.
+  // Destroying the pool first would leave breaker slices holding
+  // dangling references.
+  std::unique_ptr<circuit_breaker::CircuitBreakerManager> circuit_breaker_manager_;
+
   // Rate limiting
   RateLimitConfig rate_limit_config_;
   std::unique_ptr rate_limit_manager_;

diff --git a/include/upstream/pool_partition.h b/include/upstream/pool_partition.h
index 4c33a0cd..f259204a 100644
--- a/include/upstream/pool_partition.h
+++ b/include/upstream/pool_partition.h
@@ -25,6 +25,11 @@ class PoolPartition {
   static constexpr int CHECKOUT_CONNECT_TIMEOUT = -3;
   static constexpr int CHECKOUT_SHUTTING_DOWN = -4;
   static constexpr int CHECKOUT_QUEUE_TIMEOUT = -5;
+  // Delivered to wait-queue waiters drained on a breaker trip (Phase 6
+  // implements the drain path). ProxyTransaction::OnCheckoutError maps
+  // this to RESULT_CIRCUIT_OPEN so the queued client gets the same
+  // circuit-open response a fresh requester would get.
+  static constexpr int CHECKOUT_CIRCUIT_OPEN = -6;
 
   PoolPartition(std::shared_ptr<Dispatcher> dispatcher,
                 const std::string& upstream_host, int upstream_port,

diff --git a/include/upstream/proxy_transaction.h b/include/upstream/proxy_transaction.h
index 6e25c689..6befe5a0 100644
--- a/include/upstream/proxy_transaction.h
+++ b/include/upstream/proxy_transaction.h
@@ -15,16 +15,29 @@
 class UpstreamManager;
 class ConnectionHandler;
 class Dispatcher;
 
+namespace circuit_breaker {
+class CircuitBreakerSlice;
+}
+
 class ProxyTransaction : public std::enable_shared_from_this<ProxyTransaction> {
  public:
   // Result codes for internal state tracking
-  static constexpr int RESULT_SUCCESS = 0;
-  static constexpr int RESULT_CHECKOUT_FAILED = -1;  // Upstream connect failure → 502
-  static constexpr int RESULT_SEND_FAILED = -2;
-  static constexpr int RESULT_PARSE_ERROR = -3;
-  static constexpr int RESULT_RESPONSE_TIMEOUT = -4;
+  static constexpr int RESULT_SUCCESS             = 0;
+  static constexpr int RESULT_CHECKOUT_FAILED     = -1;  // Upstream connect failure → 502
+  static constexpr int RESULT_SEND_FAILED         = -2;
+  static constexpr int RESULT_PARSE_ERROR         = -3;
+  static constexpr int RESULT_RESPONSE_TIMEOUT    = -4;
   static constexpr int RESULT_UPSTREAM_DISCONNECT = -5;
-  static constexpr int RESULT_POOL_EXHAUSTED = -6;  // Local capacity → 503
+  static constexpr int RESULT_POOL_EXHAUSTED      = -6;  // Local capacity → 503
+  // Circuit breaker rejected this attempt before it touched the upstream.
+  // Carries Retry-After + X-Circuit-Breaker headers (§12.1).
+  // Terminal — retry loop MUST NOT retry this outcome (§8).
+  static constexpr int RESULT_CIRCUIT_OPEN        = -7;
+  // Retry budget exhausted (Phase 5 wires the actual gate; the code is
+  // reserved here so MakeErrorResponse and the retry loop both know it
+  // exists and terminal-classify it). No Retry-After; distinct header
+  // X-Retry-Budget-Exhausted so operators can tell the two 503s apart.
+ static constexpr int RESULT_RETRY_BUDGET_EXHAUSTED = -8; // Constructor copies all needed fields from client_request (method, path, // query, headers, body, params, dispatcher_index, client_ip, client_tls, @@ -145,6 +158,30 @@ class ProxyTransaction : public std::enable_shared_from_this { // Timing std::chrono::steady_clock::time_point start_time_; + // Circuit breaker integration (Phase 4). Resolved once in Start() from + // `service_name_` + `dispatcher_index_`. Null when there's no + // CircuitBreakerManager attached (server has no upstreams, or the + // breaker is being built lazily) — the breaker is simply skipped in + // that case. Lifetime: the slice is owned by CircuitBreakerHost in + // CircuitBreakerManager on HttpServer, which outlives this transaction. + circuit_breaker::CircuitBreakerSlice* slice_ = nullptr; + + // Per-ATTEMPT admission state. Reset on each call to ConsultBreaker(); + // paired Report*() calls thread the `generation` back so the slice + // can drop stale completions across state transitions (see + // CircuitBreakerSlice::Admission doc). generation_==0 is a sentinel + // for "no admission held" — slice domain gens start at 1 so a 0-gen + // report always drops safely. + uint64_t admission_generation_ = 0; + bool is_probe_ = false; + + // Retry-budget token held by this transaction's most recent retry + // attempt. Phase 5 flips this to true on successful TryConsumeRetry + // and clears it on ReleaseRetry. Phase 4 declares the field so + // Cleanup() and Cancel() have something to check, but the retry + // loop does not yet consume the budget. 
+ bool retry_token_held_ = false; + // Internal methods void AttemptCheckout(); void OnCheckoutReady(UpstreamLease lease); @@ -170,6 +207,42 @@ class ProxyTransaction : public std::enable_shared_from_this { void ArmResponseTimeout(int explicit_budget_ms = 0); void ClearResponseTimeout(); - // Error response factory (maps result codes to HTTP responses) + // Error response factory (maps result codes to HTTP responses). + // Circuit-open and retry-budget responses need richer context + // (Retry-After from slice_, distinguishing header), so they have + // dedicated factories below — MakeErrorResponse falls back to a + // plain 503 for those codes if called generically. static HttpResponse MakeErrorResponse(int result_code); + + // Phase 4: emit the §12.1 circuit-open response. + // 503 + Retry-After (seconds until slice->OpenUntil()) + // + X-Circuit-Breaker: open + // + X-Upstream-Host: service:host:port + HttpResponse MakeCircuitOpenResponse() const; + + // Phase 5 will emit this. Declared here so Phase 4's + // MakeErrorResponse RESULT_RETRY_BUDGET_EXHAUSTED branch has a + // target to dispatch to and so tests can assert the response shape + // even before the retry-budget gate is wired. + // 503 + X-Retry-Budget-Exhausted: 1 + static HttpResponse MakeRetryBudgetResponse(); + + // Phase 4 helpers — breaker gate and outcome classification. + // + // ConsultBreaker: call at the top of AttemptCheckout. Populates + // admission_generation_ and is_probe_ on admission; delivers the + // circuit-open response and returns false on reject. Dry-run admits + // and returns true (slice already counted the would-reject). + // Returns true if the caller should proceed to CheckoutAsync. + bool ConsultBreaker(); + + // ReportBreakerOutcome: classify a result_code into + // success/failure/neutral (per design §7) and call slice->Report* + // with admission_generation_. Clears admission_generation_ so a + // double-report is impossible. 
+ // + // failure_kind is ignored unless the outcome is a FailureKind-bearing + // result; the caller passes the appropriate kind for 5xx vs disconnect + // vs timeout since the slice treats them differently only for logs. + void ReportBreakerOutcome(int result_code); }; diff --git a/include/upstream/upstream_manager.h b/include/upstream/upstream_manager.h index c308cbd3..f647d3b3 100644 --- a/include/upstream/upstream_manager.h +++ b/include/upstream/upstream_manager.h @@ -9,6 +9,10 @@ class TlsClientContext; +namespace circuit_breaker { +class CircuitBreakerManager; +} + class UpstreamManager { public: UpstreamManager(const std::vector& upstreams, @@ -59,6 +63,23 @@ class UpstreamManager { // Check if an upstream service is configured bool HasUpstream(const std::string& service_name) const; + // Install a non-owning pointer to the server's CircuitBreakerManager. + // Called once from HttpServer::MarkServerReady after both managers are + // constructed (§3.1). Lifetime guarantee: the CircuitBreakerManager + // is declared AFTER upstream_manager_ on HttpServer, so it destructs + // FIRST — UpstreamManager never reads through a dangling pointer on + // shutdown. Passing nullptr is allowed (detaches). + void AttachCircuitBreakerManager(circuit_breaker::CircuitBreakerManager* mgr) { + breaker_manager_.store(mgr, std::memory_order_release); + } + + // Returns the attached breaker manager, or nullptr if no manager is + // attached. Safe from any thread (atomic load, acquire so any + // Attach-time publication is visible). + circuit_breaker::CircuitBreakerManager* GetCircuitBreakerManager() const { + return breaker_manager_.load(std::memory_order_acquire); + } + private: // service_name → host pool. Built once at construction, never modified. std::unordered_map> pools_; @@ -73,6 +94,14 @@ class UpstreamManager { // reject new checkouts before per-partition shutdown tasks execute. 
std::atomic shutting_down_{false}; + // Non-owning pointer to the circuit-breaker manager, installed by + // HttpServer::MarkServerReady after both managers exist. Atomic so + // late-arriving hot-path reads in ProxyTransaction see either a + // coherent pointer or nullptr (never torn). Owned by HttpServer; + // lifetime outlives UpstreamManager (breaker destructs first — + // §3.1 ownership). Default nullptr — breaker is an opt-in layer. + std::atomic breaker_manager_{nullptr}; + // Manager-owned atomic counter: total outstanding connections std::atomic outstanding_conns_{0}; diff --git a/server/http_server.cc b/server/http_server.cc index ecfff96f..fbf06947 100644 --- a/server/http_server.cc +++ b/server/http_server.cc @@ -5,6 +5,7 @@ #include "http2/http2_constants.h" #include "upstream/upstream_manager.h" #include "upstream/proxy_handler.h" +#include "circuit_breaker/circuit_breaker_manager.h" #include "log/logger.h" #include "log/log_utils.h" #include @@ -361,6 +362,28 @@ void HttpServer::MarkServerReady() { throw; } + // Circuit breaker — built alongside the pool. One host per + // configured upstream (regardless of enabled), with one slice + // per dispatcher so hot-path TryAcquire is lock-free. Attached + // to UpstreamManager via a non-owning pointer so ProxyTransaction + // can reach it on the hot path via upstream_manager_-> + // GetCircuitBreakerManager(). The manager is declared AFTER + // upstream_manager_ on HttpServer (see header) so teardown runs + // breaker-first, which matches the dangling-pointer safety rule + // in UpstreamManager::breaker_manager_. + try { + circuit_breaker_manager_ = + std::make_unique( + upstream_configs_, dispatchers.size(), dispatchers); + upstream_manager_->AttachCircuitBreakerManager( + circuit_breaker_manager_.get()); + } catch (...) { + logging::Get()->error( + "Circuit breaker init failed, stopping server"); + net_server_.Stop(); + throw; + } + // Ensure the timer cadence is fast enough for upstream connect timeouts. 
// SetDeadline stores a ms-precision deadline, but TimerHandler only fires // at the timer scan interval. If connect_timeout_ms < current interval, diff --git a/server/proxy_transaction.cc b/server/proxy_transaction.cc index 18aa6193..c263332e 100644 --- a/server/proxy_transaction.cc +++ b/server/proxy_transaction.cc @@ -2,6 +2,9 @@ #include "upstream/upstream_manager.h" #include "upstream/upstream_connection.h" #include "upstream/http_request_serializer.h" +#include "circuit_breaker/circuit_breaker_manager.h" +#include "circuit_breaker/circuit_breaker_host.h" +#include "circuit_breaker/circuit_breaker_slice.h" #include "connection_handler.h" #include "dispatcher.h" // config/server_config.h provided by proxy_transaction.h (ProxyConfig stored by value) @@ -110,12 +113,39 @@ void ProxyTransaction::Start() { upstream_host_, upstream_port_, method_, upstream_path); + // Resolve the circuit-breaker slice once. Null when no breaker is + // attached (server has no upstreams configured, or Phase 4 skipped + // on this deployment), or when the service/dispatcher pair is out of + // range. In any null case the breaker is simply bypassed — the + // transaction proceeds as if circuit breaking were disabled. + if (upstream_manager_ && dispatcher_index_ >= 0) { + auto* cbm = upstream_manager_->GetCircuitBreakerManager(); + if (cbm) { + auto* host = cbm->GetHost(service_name_); + if (host) { + slice_ = host->GetSlice(static_cast(dispatcher_index_)); + } + } + } + AttemptCheckout(); } void ProxyTransaction::AttemptCheckout() { state_ = State::CHECKOUT_PENDING; + // Circuit breaker gate — consulted before every attempt (first try and + // retries both). Each attempt gets a fresh admission stamped with the + // slice's current generation. 
If the slice rejects with REJECTED_OPEN, + // ConsultBreaker delivers the §12.1 response and returns false; the + // retry loop treats RESULT_CIRCUIT_OPEN as terminal (§8) so a rejected + // retry produces a single 503 to the client, not a nested retry. + // Dry-run reject logs inside TryAcquire and returns ADMITTED through + // the decision enum (REJECTED_OPEN_DRYRUN), so ConsultBreaker proceeds. + if (!ConsultBreaker()) { + return; + } + auto self = shared_from_this(); // Lazily allocate the shared cancel token so the pool can drop @@ -224,21 +254,58 @@ void ProxyTransaction::OnCheckoutError(int error_code) { // Only retry actual network connect failures. Pool saturation // (POOL_EXHAUSTED, QUEUE_TIMEOUT) and shutdown should fail fast — // retrying under backpressure amplifies load on an already-stressed - // pool and stretches client latency with no benefit. + // pool and stretches client latency with no benefit. A breaker-drain + // reject (CHECKOUT_CIRCUIT_OPEN, Phase 6) is also terminal: the + // client gets the same circuit-open response a fresh requester + // would, and the retry loop must not retry it. + // + // Breaker reporting: connect failures (both timeout and refused) are + // upstream-health signals → ReportFailure(CONNECT_FAILURE). Local + // capacity (POOL_EXHAUSTED, QUEUE_TIMEOUT) and shutdown are NOT + // reported — they don't imply upstream unhealthiness (design §7). + // CHECKOUT_CIRCUIT_OPEN is also not reported to the breaker (would + // be a feedback loop — our own reject counting against the upstream). 
+ // // Import error codes from PoolPartition: - // CHECKOUT_CONNECT_FAILED = -2 → retryable - // CHECKOUT_CONNECT_TIMEOUT = -3 → retryable - // CHECKOUT_POOL_EXHAUSTED = -1 → not retryable - // CHECKOUT_QUEUE_TIMEOUT = -5 → not retryable - // CHECKOUT_SHUTTING_DOWN = -4 → not retryable + // CHECKOUT_CONNECT_FAILED = -2 → retryable, report CONNECT_FAILURE + // CHECKOUT_CONNECT_TIMEOUT = -3 → retryable, report CONNECT_FAILURE + // CHECKOUT_POOL_EXHAUSTED = -1 → not retryable, neutral-release probe + // CHECKOUT_QUEUE_TIMEOUT = -5 → not retryable, neutral-release probe + // CHECKOUT_SHUTTING_DOWN = -4 → not retryable, neutral-release probe + // CHECKOUT_CIRCUIT_OPEN = -6 → not retryable, do NOT report static constexpr int CONNECT_FAILED = -2; static constexpr int CONNECT_TIMEOUT = -3; + static constexpr int CIRCUIT_OPEN = -6; + + if (error_code == CIRCUIT_OPEN) { + // Drain path: breaker tripped while this transaction was queued + // (Phase 6 implements the drain). Do NOT Report to the slice — + // our own reject must not feed back into the failure math. Emit + // the §12.1 circuit-open response directly. + logging::Get()->info( + "ProxyTransaction checkout drained by circuit breaker " + "client_fd={} service={}", + client_fd_, service_name_); + DeliverResponse(MakeCircuitOpenResponse()); + // Clear admission_generation_ so Cleanup / destructor doesn't + // double-report. The admission was already fire-and-forget — + // slice-side bookkeeping is intact (the drain itself doesn't + // touch inflight counters because the breaker didn't admit). + admission_generation_ = 0; + return; + } if (error_code == CONNECT_FAILED || error_code == CONNECT_TIMEOUT) { + // Report connect failure to the breaker BEFORE retrying — + // otherwise the retry's ConsultBreaker might admit against a + // stale success count, delaying trip detection. 
+ ReportBreakerOutcome(RESULT_CHECKOUT_FAILED); MaybeRetry(RetryPolicy::RetryCondition::CONNECT_FAILURE); } else { // Pool exhaustion, queue timeout, or shutdown — local capacity issue. // Use RESULT_POOL_EXHAUSTED → 503 (not 502 which implies upstream failure). + // Release the breaker slot neutrally — admission never reached upstream. + ReportBreakerOutcome(RESULT_POOL_EXHAUSTED); OnError(RESULT_POOL_EXHAUSTED, "Pool checkout failed (local capacity, error=" + std::to_string(error_code) + ")"); @@ -517,10 +584,20 @@ void ProxyTransaction::OnResponseComplete() { "service={} status={} attempt={}", client_fd_, service_name_, response.status_code, attempt_); + // Report failure BEFORE MaybeRetry — the retry's fresh + // ConsultBreaker must see the just-added failure in the window + // (and potentially reject if this was the trip-causing call). + // Pass a synthetic RESULT_CHECKOUT_FAILED-like signal; the + // classifier maps 5xx → FailureKind::RESPONSE_5XX. + ReportBreakerOutcome(/* sentinel */ -1000); MaybeRetry(RetryPolicy::RetryCondition::RESPONSE_5XX); return; } + // 2xx / 3xx / 4xx: upstream is healthy (from the breaker's + // perspective — 4xx is a client-side problem). Report success. + ReportBreakerOutcome(RESULT_SUCCESS); + state_ = State::COMPLETE; auto duration = std::chrono::duration_cast( @@ -550,8 +627,19 @@ void ProxyTransaction::OnError(int result_code, client_fd_, service_name_, result_code, attempt_, duration.count(), log_message); + // Report the outcome if an admission is still held. Most error paths + // call ReportBreakerOutcome themselves BEFORE reaching OnError (so a + // retry's ConsultBreaker sees the fresh signal) — this is a safety + // net for error paths that skipped reporting, e.g., RESULT_SEND_FAILED + // and RESULT_RESPONSE_TIMEOUT from the on-upstream-data paths. + // ReportBreakerOutcome is idempotent: it clears admission_generation_ + // on the first call so a double-call drops harmlessly. 
+ ReportBreakerOutcome(result_code); + state_ = State::FAILED; - HttpResponse error_response = MakeErrorResponse(result_code); + HttpResponse error_response = (result_code == RESULT_CIRCUIT_OPEN) + ? MakeCircuitOpenResponse() + : MakeErrorResponse(result_code); DeliverResponse(std::move(error_response)); } @@ -886,6 +974,15 @@ HttpResponse ProxyTransaction::MakeErrorResponse(int result_code) { if (result_code == RESULT_POOL_EXHAUSTED) { return HttpResponse::ServiceUnavailable(); } + if (result_code == RESULT_RETRY_BUDGET_EXHAUSTED) { + return MakeRetryBudgetResponse(); + } + if (result_code == RESULT_CIRCUIT_OPEN) { + // MakeErrorResponse is static and has no `this` — the richer + // MakeCircuitOpenResponse(slice_) path is preferred. Fall back + // to a plain 503 here for the rare static-context invocation. + return HttpResponse::ServiceUnavailable(); + } if (result_code == RESULT_CHECKOUT_FAILED || result_code == RESULT_SEND_FAILED || result_code == RESULT_PARSE_ERROR || @@ -894,3 +991,167 @@ HttpResponse ProxyTransaction::MakeErrorResponse(int result_code) { } return HttpResponse::InternalError(); } + +HttpResponse ProxyTransaction::MakeCircuitOpenResponse() const { + // Compute Retry-After from slice->OpenUntil() if the slice is known. + // Falls back to a conservative 1-second hint if the slice is null + // (shouldn't happen on the circuit-open path — that path requires a + // slice — but defense in depth). + int retry_after_secs = 1; + if (slice_) { + auto open_until = slice_->OpenUntil(); + // OpenUntil returns a zero time_point when NOT OPEN. Checking + // against zero with steady_clock::time_point is fiddly; use + // time_since_epoch().count() > 0 as the "is-set" check. 
+ if (open_until.time_since_epoch().count() > 0) { + auto now = std::chrono::steady_clock::now(); + auto diff = std::chrono::duration_cast( + open_until - now).count(); + // Clamp to [1, 300] — Retry-After=0 is silly, and an hour+ + // is misleading (ops usually want operators to check + // sooner). The breaker's open duration caps out around + // minutes; anything larger means we're dealing with a + // cascade and we should hint sooner. + if (diff < 1) diff = 1; + if (diff > 300) diff = 300; + retry_after_secs = static_cast(diff); + } + } + + HttpResponse resp; + resp.Status(HttpStatus::SERVICE_UNAVAILABLE); + resp.Text("Upstream circuit breaker is open; please retry later.\n"); + resp.Header("Retry-After", std::to_string(retry_after_secs)); + resp.Header("X-Circuit-Breaker", "open"); + // Hint operators (not clients) at which upstream tripped. Useful + // when a gateway fronts multiple backends; without this header, a + // 503 is opaque. + resp.Header("X-Upstream-Host", + upstream_host_ + ":" + std::to_string(upstream_port_)); + resp.Header("Connection", "close"); + return resp; +} + +HttpResponse ProxyTransaction::MakeRetryBudgetResponse() { + HttpResponse resp; + resp.Status(HttpStatus::SERVICE_UNAVAILABLE); + resp.Text("Upstream retry budget exhausted.\n"); + resp.Header("X-Retry-Budget-Exhausted", "1"); + resp.Header("Connection", "close"); + return resp; +} + +bool ProxyTransaction::ConsultBreaker() { + if (!slice_) { + // No breaker attached for this service. Proceed as if the + // breaker layer didn't exist. admission_generation_ stays 0 so + // any accidental ReportBreakerOutcome call is a no-op. + is_probe_ = false; + admission_generation_ = 0; + return true; + } + auto admission = slice_->TryAcquire(); + + // Stash the admission metadata for the paired Report*() call. Note + // we record this EVEN for REJECTED_OPEN (where generation_==0 is a + // sentinel) — it's harmless and keeps the branches simpler. 
+ admission_generation_ = admission.generation; + is_probe_ = (admission.decision == + circuit_breaker::Decision::ADMITTED_PROBE); + + if (admission.decision == circuit_breaker::Decision::REJECTED_OPEN) { + // Hard reject — slice counted it, logged it, and we must not + // touch the upstream. Emit §12.1 response and DO NOT Report + // back (would create a feedback loop — our own reject counting + // as a failure against the already-OPEN slice). + state_ = State::FAILED; + logging::Get()->info( + "ProxyTransaction circuit-open reject client_fd={} service={} " + "attempt={}", + client_fd_, service_name_, attempt_); + DeliverResponse(MakeCircuitOpenResponse()); + // Clear admission_generation_ — there's nothing to Report. + admission_generation_ = 0; + return false; + } + + // REJECTED_OPEN_DRYRUN: slice logged the would-reject and counted + // it; caller proceeds to the upstream. Fall through as admitted. + // ADMITTED / ADMITTED_PROBE: proceed. + return true; +} + +void ProxyTransaction::ReportBreakerOutcome(int result_code) { + // No slice, or already reported: bail. admission_generation_==0 is + // the sentinel — slice domain generations start at 1, so a 0 gen + // would be rejected as stale anyway; the early return just avoids + // an unnecessary atomic load. The Report* methods themselves are + // idempotent against stale gens, but we also must not increment a + // probe_*/rejected_ counter for a non-event. + if (!slice_ || admission_generation_ == 0) return; + + // Capture + clear in one go so concurrent / re-entrant calls bail. + uint64_t gen = admission_generation_; + admission_generation_ = 0; + bool probe = is_probe_; + is_probe_ = false; + + using circuit_breaker::FailureKind; + + // Synthetic sentinel for the OnResponseComplete 5xx path — maps to + // RESPONSE_5XX without needing a new public result code. Callers + // other than OnResponseComplete never use this value. 
+ static constexpr int SENTINEL_5XX = -1000; + + switch (result_code) { + case RESULT_SUCCESS: + slice_->ReportSuccess(probe, gen); + return; + + case SENTINEL_5XX: + slice_->ReportFailure(FailureKind::RESPONSE_5XX, probe, gen); + return; + + case RESULT_CHECKOUT_FAILED: + slice_->ReportFailure(FailureKind::CONNECT_FAILURE, probe, gen); + return; + + case RESULT_RESPONSE_TIMEOUT: + slice_->ReportFailure(FailureKind::RESPONSE_TIMEOUT, probe, gen); + return; + + case RESULT_UPSTREAM_DISCONNECT: + case RESULT_SEND_FAILED: + slice_->ReportFailure(FailureKind::UPSTREAM_DISCONNECT, probe, gen); + return; + + case RESULT_POOL_EXHAUSTED: + case RESULT_PARSE_ERROR: + // Local outcomes — no upstream health signal. Release the + // admission slot neutrally so a probe doesn't leak the + // HALF_OPEN slot. + slice_->ReportNeutral(probe, gen); + return; + + case RESULT_CIRCUIT_OPEN: + case RESULT_RETRY_BUDGET_EXHAUSTED: + // Our own rejects — MUST NOT feed back into the slice. + // These paths should not reach ReportBreakerOutcome (both + // clear admission_generation_ before delivering), but the + // defensive branch keeps the class-wide invariant: these + // outcomes are invisible to the breaker. + return; + + default: + // Unknown result code — log and neutral-release to keep the + // probe bookkeeping consistent. A runtime log here is + // cheaper than a slice stuck in HALF_OPEN forever because a + // new result code slipped through unclassified. + logging::Get()->error( + "ReportBreakerOutcome: unclassified result_code={} " + "service={} — releasing neutrally", + result_code, service_name_); + slice_->ReportNeutral(probe, gen); + return; + } +} diff --git a/test/circuit_breaker_phase4_test.h b/test/circuit_breaker_phase4_test.h new file mode 100644 index 00000000..db2b095d --- /dev/null +++ b/test/circuit_breaker_phase4_test.h @@ -0,0 +1,440 @@ +#pragma once + +// Phase 4 integration tests: circuit breaker wired into ProxyTransaction + +// UpstreamManager + HttpServer. 
Exercises the full request path end-to-end. +// +// Strategy: use a backend that returns 5xx on every request so repeated hits +// trip the breaker via the consecutive-failure threshold. 5xx responses are +// the cheapest way to accumulate failures (no connect timeouts to wait for). +// Low thresholds keep tests fast. + +#include "test_framework.h" +#include "test_server_runner.h" +#include "http_test_client.h" +#include "http/http_server.h" +#include "config/server_config.h" +#include "upstream/upstream_manager.h" +#include "circuit_breaker/circuit_breaker_manager.h" +#include "circuit_breaker/circuit_breaker_host.h" +#include "circuit_breaker/circuit_breaker_slice.h" + +#include +#include +#include + +namespace CircuitBreakerPhase4Tests { + +using circuit_breaker::State; + +// Shared helper: build an upstream config that proxies /echo → backend and +// has a breaker configured with low thresholds for fast trip. +static UpstreamConfig MakeBreakerUpstream(const std::string& name, + const std::string& host, + int port, + bool breaker_enabled, + int consecutive_threshold = 3) { + UpstreamConfig u; + u.name = name; + u.host = host; + u.port = port; + u.pool.max_connections = 8; + u.pool.max_idle_connections = 4; + u.pool.connect_timeout_ms = 3000; + u.pool.idle_timeout_sec = 30; + u.pool.max_lifetime_sec = 3600; + u.pool.max_requests_per_conn = 0; + + // Exact-match route — simpler than prefix patterns for integration tests. + u.proxy.route_prefix = "/fail"; + u.proxy.strip_prefix = false; + u.proxy.response_timeout_ms = 2000; + // No retries — keeps the test deterministic: one request = one attempt. + u.proxy.retry.max_retries = 0; + + u.circuit_breaker.enabled = breaker_enabled; + u.circuit_breaker.consecutive_failure_threshold = consecutive_threshold; + // Disable the rate-based trip path — we drive everything through + // consecutive failures to keep the test count predictable. 
+  u.circuit_breaker.failure_rate_threshold = 100;
+  u.circuit_breaker.minimum_volume = 10000;
+  u.circuit_breaker.window_seconds = 10;
+  u.circuit_breaker.permitted_half_open_calls = 2;
+  u.circuit_breaker.base_open_duration_ms = 500; // short so recovery test is quick
+  u.circuit_breaker.max_open_duration_ms = 60000;
+  return u;
+}
+
+// ---------------------------------------------------------------------------
+// Test 1: Breaker trips on consecutive 5xx responses and emits circuit-open
+// headers on the rejected request.
+// ---------------------------------------------------------------------------
+void TestBreakerTripsAfterConsecutiveFailures() {
+  std::cout << "\n[TEST] CB Phase 4: breaker trips after consecutive 5xx..."
+            << std::endl;
+  try {
+    // Backend always returns 502 — gateway classifies the response as
+    // FailureKind::RESPONSE_5XX and reports to the breaker on every attempt.
+    HttpServer backend("127.0.0.1", 0);
+    backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) {
+      resp.Status(502).Body("upstream err", "text/plain");
+    });
+    TestServerRunner backend_runner(backend);
+    int backend_port = backend_runner.GetPort();
+
+    ServerConfig gw;
+    gw.bind_host = "127.0.0.1";
+    gw.bind_port = 0;
+    gw.worker_threads = 1; // single thread → single breaker partition exercised
+    gw.http2.enabled = false; // match the existing proxy test pattern
+    gw.upstreams.push_back(
+        MakeBreakerUpstream("bad-svc", "127.0.0.1", backend_port,
+                            /*enabled=*/true, /*threshold=*/3));
+
+    HttpServer gateway(gw);
+    TestServerRunner gw_runner(gateway);
+    int gw_port = gw_runner.GetPort();
+
+    // Hit the failing backend threshold times — each 502 from backend
+    // propagates to the client as 502 (gateway pass-through) AND counts
+    // as a RESPONSE_5XX failure in the breaker.
+ for (int i = 0; i < 3; ++i) { + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + if (!TestHttpClient::HasStatus(r, 502)) { + TestFramework::RecordTest( + "CB Phase 4: trip after consecutive failures", false, + "pre-trip request " + std::to_string(i) + " expected 502, got: " + + r.substr(0, 32)); + return; + } + } + + // Next request must be rejected by the breaker (not proxied). The + // response is 503 with X-Circuit-Breaker: open and Retry-After. + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + bool is_503 = TestHttpClient::HasStatus(r, 503); + bool has_breaker_header = + r.find("X-Circuit-Breaker: open") != std::string::npos || + r.find("x-circuit-breaker: open") != std::string::npos; + bool has_retry_after = + r.find("Retry-After:") != std::string::npos || + r.find("retry-after:") != std::string::npos; + bool has_upstream_host = + r.find("X-Upstream-Host:") != std::string::npos || + r.find("x-upstream-host:") != std::string::npos; + + bool pass = is_503 && has_breaker_header && has_retry_after && + has_upstream_host; + TestFramework::RecordTest( + "CB Phase 4: trip after consecutive failures", pass, + pass ? "" : + "is_503=" + std::to_string(is_503) + + " breaker_hdr=" + std::to_string(has_breaker_header) + + " retry_after=" + std::to_string(has_retry_after) + + " upstream_host=" + std::to_string(has_upstream_host) + + " body=" + r.substr(0, 256)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 4: trip after consecutive failures", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 2: When circuit_breaker.enabled=false, the breaker is bypassed entirely. +// The same failure pattern that would trip an enabled breaker must leave the +// pass-through path untouched — every request still reaches the backend. 
+// --------------------------------------------------------------------------- +void TestBreakerDisabledPassesThrough() { + std::cout << "\n[TEST] CB Phase 4: disabled breaker passes through..." + << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 2; + gw.http2.enabled = false; // match the existing proxy test pattern + gw.upstreams.push_back( + MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/false, /*threshold=*/3)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // 10 requests — with breaker disabled, all 10 reach backend. + for (int i = 0; i < 10; ++i) { + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + if (!TestHttpClient::HasStatus(r, 502)) { + TestFramework::RecordTest( + "CB Phase 4: disabled breaker passes through", false, + "request " + std::to_string(i) + " expected 502, got: " + + r.substr(0, 32)); + return; + } + } + + bool all_hit = backend_hits.load() == 10; + TestFramework::RecordTest( + "CB Phase 4: disabled breaker passes through", all_hit, + all_hit ? "" : + "expected 10 backend hits, got " + std::to_string(backend_hits.load())); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 4: disabled breaker passes through", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 3: 2xx responses are reported as success — they reset the +// consecutive-failure counter so the breaker doesn't trip on interleaved +// success/failure traffic. 
+// ---------------------------------------------------------------------------
+void TestSuccessResetsConsecutiveFailureCounter() {
+  std::cout << "\n[TEST] CB Phase 4: 2xx success resets consecutive-failure counter..."
+            << std::endl;
+  try {
+    std::atomic fail_mode{true};
+    HttpServer backend("127.0.0.1", 0);
+    backend.Get("/fail", [&fail_mode](const HttpRequest&, HttpResponse& resp) {
+      if (fail_mode.load()) {
+        resp.Status(502).Body("err", "text/plain");
+      } else {
+        resp.Status(200).Body("ok", "text/plain");
+      }
+    });
+    TestServerRunner backend_runner(backend);
+    int backend_port = backend_runner.GetPort();
+
+    ServerConfig gw;
+    gw.bind_host = "127.0.0.1";
+    gw.bind_port = 0;
+    gw.worker_threads = 1; // single dispatcher → the reset check reads slice 0
+    gw.http2.enabled = false; // match the existing proxy test pattern
+    gw.upstreams.push_back(
+        MakeBreakerUpstream("svc", "127.0.0.1", backend_port,
+                            /*enabled=*/true, /*threshold=*/3));
+
+    HttpServer gateway(gw);
+    TestServerRunner gw_runner(gateway);
+    int gw_port = gw_runner.GetPort();
+
+    // Pattern: F F S F F — 5 total: 2 fails, 1 success, 2 fails.
+    // With reset semantics, consecutive_failures_ never exceeds 2 → no trip.
+    for (int i = 0; i < 2; ++i) {
+      TestHttpClient::HttpGet(gw_port, "/fail", 3000); // FAIL
+    }
+    fail_mode.store(false);
+    TestHttpClient::HttpGet(gw_port, "/fail", 3000); // SUCCESS → reset
+    fail_mode.store(true);
+    for (int i = 0; i < 2; ++i) {
+      TestHttpClient::HttpGet(gw_port, "/fail", 3000); // FAIL
+    }
+
+    // Inspect the breaker's state directly — it should still be CLOSED.
+    auto* cbm = gateway.GetUpstreamManager() ?
+        gateway.GetUpstreamManager()->GetCircuitBreakerManager() : nullptr;
+    auto* host = cbm ? cbm->GetHost("svc") : nullptr;
+    auto* slice = host ? host->GetSlice(0) : nullptr;
+    bool still_closed = slice && slice->CurrentState() == State::CLOSED;
+
+    TestFramework::RecordTest(
+        "CB Phase 4: success resets consecutive counter", still_closed,
+        still_closed ? "" :
+        "slice not CLOSED after S resets failures: state=" +
+            std::to_string(static_cast(
+                slice ? slice->CurrentState() : State::CLOSED)));
+  } catch (const std::exception& e) {
+    TestFramework::RecordTest(
+        "CB Phase 4: success resets consecutive counter", false, e.what());
+  }
+}
+
+// ---------------------------------------------------------------------------
+// Test 4: After the trip, the live slice state is OPEN. Verifies the
+// integration actually drives the slice state machine (not just the response).
+// ---------------------------------------------------------------------------
+void TestTripDrivesSliceState() {
+  std::cout << "\n[TEST] CB Phase 4: trip drives slice state to OPEN..."
+            << std::endl;
+  try {
+    HttpServer backend("127.0.0.1", 0);
+    backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) {
+      resp.Status(502).Body("err", "text/plain");
+    });
+    TestServerRunner backend_runner(backend);
+    int backend_port = backend_runner.GetPort();
+
+    ServerConfig gw;
+    gw.bind_host = "127.0.0.1";
+    gw.bind_port = 0;
+    gw.worker_threads = 1; // single dispatcher → all 3 failures hit one slice
+    gw.http2.enabled = false; // match the existing proxy test pattern
+    gw.upstreams.push_back(
+        MakeBreakerUpstream("svc", "127.0.0.1", backend_port,
+                            /*enabled=*/true, /*threshold=*/3));
+
+    HttpServer gateway(gw);
+    TestServerRunner gw_runner(gateway);
+    int gw_port = gw_runner.GetPort();
+
+    // 3 failures → trip.
+    for (int i = 0; i < 3; ++i) {
+      TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+    }
+
+    // With a single worker thread all three failures land on the same
+    // dispatcher slice, so the aggregate snapshot must show exactly one
+    // partition OPEN with exactly one trip recorded.
+ auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); + auto* host = cbm->GetHost("svc"); + auto snap = host->Snapshot(); + bool at_least_one_open = snap.open_partitions >= 1; + bool one_trip = snap.total_trips == 1; + // Sanity: the tripped partition should be the one that saw all 3 + // failures (consecutive trip is single-slice, not cross-slice). + bool single_partition_tripped = snap.open_partitions == 1; + + bool pass = at_least_one_open && one_trip && single_partition_tripped; + TestFramework::RecordTest( + "CB Phase 4: trip drives slice state to OPEN", pass, + pass ? "" : + "at_least_one_open=" + std::to_string(at_least_one_open) + + " one_trip=" + std::to_string(one_trip) + + " single_partition=" + std::to_string(single_partition_tripped) + + " (open_partitions=" + std::to_string(snap.open_partitions) + + ", total_trips=" + std::to_string(snap.total_trips) + ")"); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 4: trip drives slice state to OPEN", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 5: Breaker-rejected requests do NOT hit the backend. After the trip, +// subsequent requests must be served locally (503) without any upstream I/O. +// Prevents regression where the gate leaked admissions to a known-bad upstream. +// --------------------------------------------------------------------------- +void TestOpenBreakerShortCircuitsUpstreamCall() { + std::cout << "\n[TEST] CB Phase 4: OPEN breaker short-circuits upstream call..." 
+ << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 2; + gw.http2.enabled = false; // match the existing proxy test pattern + gw.upstreams.push_back( + MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // 3 failing requests to trip. + for (int i = 0; i < 3; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + int hits_at_trip = backend_hits.load(); + + // 5 more requests — all should be rejected locally. + for (int i = 0; i < 5; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + int hits_after = backend_hits.load(); + + // Backend hits must not grow during the post-trip burst. + bool no_leak = hits_after == hits_at_trip; + TestFramework::RecordTest( + "CB Phase 4: OPEN short-circuits upstream call", no_leak, + no_leak ? "" : + "backend hits grew from " + std::to_string(hits_at_trip) + + " to " + std::to_string(hits_after) + " after trip"); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 4: OPEN short-circuits upstream call", false, e.what()); + } +} + +// Sanity check: verify the bare proxy setup works without the breaker +// before blaming the breaker integration. +void TestBareProxyWorks() { + std::cout << "\n[TEST] CB Phase 4: bare proxy (sanity)..." 
<< std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 2; + gw.http2.enabled = false; + UpstreamConfig u; + u.name = "svc"; + u.host = "127.0.0.1"; + u.port = backend_port; + u.pool.max_connections = 8; + u.pool.max_idle_connections = 4; + u.pool.connect_timeout_ms = 3000; + u.proxy.route_prefix = "/fail"; + u.proxy.response_timeout_ms = 5000; + u.circuit_breaker.enabled = true; // sanity + breaker enabled + u.circuit_breaker.consecutive_failure_threshold = 3; + u.circuit_breaker.failure_rate_threshold = 100; + u.circuit_breaker.minimum_volume = 10000; + u.circuit_breaker.window_seconds = 10; + u.circuit_breaker.permitted_half_open_calls = 2; + u.circuit_breaker.base_open_duration_ms = 500; + u.circuit_breaker.max_open_duration_ms = 60000; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 5000); + bool pass = TestHttpClient::HasStatus(r, 502); + TestFramework::RecordTest( + "CB Phase 4: bare proxy sanity", pass, + pass ? 
"" : "expected 502, got: " + r.substr(0, 128)); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB Phase 4: bare proxy sanity", + false, e.what()); + } +} + +void RunAllTests() { + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "CIRCUIT BREAKER PHASE 4 - INTEGRATION TESTS" << std::endl; + std::cout << std::string(60, '=') << std::endl; + + TestBareProxyWorks(); + TestBreakerTripsAfterConsecutiveFailures(); + TestBreakerDisabledPassesThrough(); + TestSuccessResetsConsecutiveFailureCounter(); + TestTripDrivesSliceState(); + TestOpenBreakerShortCircuitsUpstreamCall(); +} + +} // namespace CircuitBreakerPhase4Tests diff --git a/test/run_test.cc b/test/run_test.cc index f118d495..fbf84d49 100644 --- a/test/run_test.cc +++ b/test/run_test.cc @@ -15,6 +15,7 @@ #include "rate_limit_test.h" #include "circuit_breaker_test.h" #include "circuit_breaker_phase3_test.h" +#include "circuit_breaker_phase4_test.h" #include "test_framework.h" #include #include @@ -85,6 +86,10 @@ void RunAllTest(){ // Run circuit breaker Phase 3 tests (host / manager / retry budget) CircuitBreakerPhase3Tests::RunAllTests(); + // Run circuit breaker Phase 4 integration tests (end-to-end through + // ProxyTransaction + UpstreamManager + HttpServer) + CircuitBreakerPhase4Tests::RunAllTests(); + std::cout << "====================================\n" << std::endl; } From 83bf5d115468ea4ec7d17f2bc26ca8210add08c2 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 14:02:32 +0800 Subject: [PATCH 17/37] Fix review comment --- .../circuit_breaker/circuit_breaker_slice.h | 13 + include/upstream/proxy_transaction.h | 10 +- server/circuit_breaker_host.cc | 8 +- server/circuit_breaker_manager.cc | 19 + server/proxy_transaction.cc | 54 +-- server/retry_budget.cc | 32 +- test/circuit_breaker_phase4_test.h | 330 ++++++++++++++++++ test/run_test.cc | 4 +- 8 files changed, 430 insertions(+), 40 deletions(-) diff --git a/include/circuit_breaker/circuit_breaker_slice.h 
b/include/circuit_breaker/circuit_breaker_slice.h index 95a5beee..6e9734df 100644 --- a/include/circuit_breaker/circuit_breaker_slice.h +++ b/include/circuit_breaker/circuit_breaker_slice.h @@ -129,10 +129,23 @@ class CircuitBreakerSlice { const std::string& host_label() const { return host_label_; } size_t dispatcher_index() const { return dispatcher_index_; } + // Read-only view of the live config. Dispatcher-thread-owned for + // writes (Reload only mutates here); readers on other threads get a + // potentially-torn read, which is acceptable for observability hints + // like Retry-After clamping. + const CircuitBreakerConfig& config() const { return config_; } + // Current open_until time. Used by ProxyTransaction to compute // Retry-After. Returns zero ns when not OPEN. std::chrono::steady_clock::time_point OpenUntil() const; + // Convenience predicate: whether OpenUntil() currently holds a + // non-zero deadline. Avoids callers hand-rolling the zero-epoch + // check against `time_since_epoch().count() > 0`. + bool IsOpenDeadlineSet() const { + return open_until_steady_ns_.load(std::memory_order_relaxed) > 0; + } + private: // Logging label: "service=X host=Y:Z partition=N" built once. std::string host_label_; diff --git a/include/upstream/proxy_transaction.h b/include/upstream/proxy_transaction.h index 6befe5a0..eba34973 100644 --- a/include/upstream/proxy_transaction.h +++ b/include/upstream/proxy_transaction.h @@ -175,11 +175,11 @@ class ProxyTransaction : public std::enable_shared_from_this { uint64_t admission_generation_ = 0; bool is_probe_ = false; - // Retry-budget token held by this transaction's most recent retry - // attempt. Phase 5 flips this to true on successful TryConsumeRetry - // and clears it on ReleaseRetry. Phase 4 declares the field so - // Cleanup() and Cancel() have something to check, but the retry - // loop does not yet consume the budget. + // TODO(phase-5): retry-budget token held by this transaction's most + // recent retry attempt. 
Phase 5 flips this to true on successful + // TryConsumeRetry and clears it on ReleaseRetry. Phase 4 declares + // the field so Cleanup() and Cancel() have something to check, but + // the retry loop does not yet consume the budget. bool retry_token_held_ = false; // Internal methods diff --git a/server/circuit_breaker_host.cc b/server/circuit_breaker_host.cc index b41635a6..4523d3be 100644 --- a/server/circuit_breaker_host.cc +++ b/server/circuit_breaker_host.cc @@ -30,9 +30,11 @@ CircuitBreakerHost::CircuitBreakerHost(std::string service_name, slices_.reserve(partition_count); for (size_t i = 0; i < partition_count; ++i) { // Per-slice label for logs — lets operators grep logs for a - // specific host:partition pair. - std::string label = service_name_ + ":" + host_ + ":" + - std::to_string(port_) + " p=" + std::to_string(i); + // specific host:partition pair. Key=value form matches the + // format documented in circuit_breaker_slice.h:host_label_. + std::string label = "service=" + service_name_ + + " host=" + host_ + ":" + std::to_string(port_) + + " partition=" + std::to_string(i); slices_.emplace_back(std::make_unique( std::move(label), i, config_)); } diff --git a/server/circuit_breaker_manager.cc b/server/circuit_breaker_manager.cc index 7e4a8035..9e4934a3 100644 --- a/server/circuit_breaker_manager.cc +++ b/server/circuit_breaker_manager.cc @@ -9,6 +9,25 @@ CircuitBreakerManager::CircuitBreakerManager( size_t partition_count, std::vector> dispatchers) : dispatchers_(std::move(dispatchers)) { + // Invariant (production path): slices are indexed by dispatcher, + // so partition_count must match dispatcher count. Any divergence + // would cause every subsequent host->Reload() to silently skip + // (size-mismatch guard in CircuitBreakerHost::Reload) — fail + // loudly at startup instead of on reload. 
+ // + // Exception: pure unit tests that don't exercise Reload pass an + // empty dispatcher list; skip the check in that case so those + // tests can continue to allocate slices without wiring up live + // dispatchers. + if (!dispatchers_.empty() && partition_count != dispatchers_.size()) { + logging::Get()->critical( + "CircuitBreakerManager: partition_count ({}) != dispatcher count " + "({}) — topology mismatch", + partition_count, dispatchers_.size()); + throw std::invalid_argument( + "CircuitBreakerManager: partition_count must equal dispatcher count"); + } + // Build one Host per upstream regardless of .circuit_breaker.enabled. // Disabled hosts still need a live Slice so a later reload can flip // them on without re-wiring transition callbacks (design §3.1). diff --git a/server/proxy_transaction.cc b/server/proxy_transaction.cc index c263332e..ac5713c8 100644 --- a/server/proxy_transaction.cc +++ b/server/proxy_transaction.cc @@ -978,9 +978,18 @@ HttpResponse ProxyTransaction::MakeErrorResponse(int result_code) { return MakeRetryBudgetResponse(); } if (result_code == RESULT_CIRCUIT_OPEN) { - // MakeErrorResponse is static and has no `this` — the richer - // MakeCircuitOpenResponse(slice_) path is preferred. Fall back - // to a plain 503 here for the rare static-context invocation. + // The static factory has no `this`, so it cannot build the + // §12.1-compliant response (Retry-After / X-Circuit-Breaker / + // X-Upstream-Host). All in-class paths for CIRCUIT_OPEN use + // the non-static MakeCircuitOpenResponse() — reaching this + // branch means a future caller forgot that rule, and would + // silently serve a non-compliant 503. Log loudly so the + // mistake shows up in logs instead of producing a stealth + // regression against the public contract. 
+ logging::Get()->error( + "ProxyTransaction::MakeErrorResponse(RESULT_CIRCUIT_OPEN) " + "invoked from static context — use MakeCircuitOpenResponse() " + "to emit §12.1-compliant headers"); return HttpResponse::ServiceUnavailable(); } if (result_code == RESULT_CHECKOUT_FAILED || @@ -998,24 +1007,29 @@ HttpResponse ProxyTransaction::MakeCircuitOpenResponse() const { // (shouldn't happen on the circuit-open path — that path requires a // slice — but defense in depth). int retry_after_secs = 1; - if (slice_) { + if (slice_ && slice_->IsOpenDeadlineSet()) { auto open_until = slice_->OpenUntil(); - // OpenUntil returns a zero time_point when NOT OPEN. Checking - // against zero with steady_clock::time_point is fiddly; use - // time_since_epoch().count() > 0 as the "is-set" check. - if (open_until.time_since_epoch().count() > 0) { - auto now = std::chrono::steady_clock::now(); - auto diff = std::chrono::duration_cast( - open_until - now).count(); - // Clamp to [1, 300] — Retry-After=0 is silly, and an hour+ - // is misleading (ops usually want operators to check - // sooner). The breaker's open duration caps out around - // minutes; anything larger means we're dealing with a - // cascade and we should hint sooner. - if (diff < 1) diff = 1; - if (diff > 300) diff = 300; - retry_after_secs = static_cast(diff); - } + auto now = std::chrono::steady_clock::now(); + auto ms_remaining = std::chrono::duration_cast( + open_until - now).count(); + // Ceiling-round to seconds so we never advertise a window + // shorter than the actual remaining backoff (e.g. 5.9s → 6, + // not 5). Truncating by one second is enough to cause a + // well-behaved client to retry while the breaker is still OPEN + // and get another avoidable 503. 
+ int64_t diff = (ms_remaining + 999) / 1000; + // Clamp: Retry-After=0 is silly; upper bound tracks the + // configured max_open_duration_ms (clamped to 1s min), so we + // don't under-report backoff windows on operators who tune the + // breaker longer than 5 minutes. Absolute safety ceiling of + // 3600s (1 hour) — anything longer likely means the breaker + // is mis-configured and the hint is noise. + int cfg_cap_secs = static_cast( + std::max(1, slice_->config().max_open_duration_ms / 1000)); + int upper = std::min(cfg_cap_secs, 3600); + if (diff < 1) diff = 1; + if (diff > upper) diff = upper; + retry_after_secs = static_cast(diff); } HttpResponse resp; diff --git a/server/retry_budget.cc b/server/retry_budget.cc index 7246eb26..cc984e6d 100644 --- a/server/retry_budget.cc +++ b/server/retry_budget.cc @@ -33,12 +33,10 @@ RetryBudget::InFlightGuard RetryBudget::TrackInFlight() { } bool RetryBudget::TryConsumeRetry() { - // Snapshot counters with relaxed — the gate is an approximate - // capacity check, not a strict admission lock. Racing callers may - // both read cap=N and both try to reserve; the worst case is that - // both succeed and we momentarily sit at retries_in_flight_ = - // cap+1, which is acceptable for a traffic-shaping gate (unlike a - // security-critical gate). + // Snapshot tuning + in_flight once — cap is computed against a + // consistent slice. Retrying the cap math inside the CAS loop would + // just churn without improving accuracy (in_flight is inherently a + // moving target). int64_t in_flight = in_flight_.load(std::memory_order_relaxed); int pct = percent_.load(std::memory_order_relaxed); int min_conc = min_concurrency_.load(std::memory_order_relaxed); @@ -50,13 +48,25 @@ bool RetryBudget::TryConsumeRetry() { int64_t pct_cap = (in_flight * pct) / 100; int64_t cap = pct_cap > min_conc ? pct_cap : min_conc; + // Atomically reserve a slot: load current, verify under cap, CAS up + // by 1. 
Separate load + fetch_add would let N concurrent callers + // all observe current < cap and all increment past the cap — under + // the cross-dispatcher load the retry budget is meant to protect + // against, the gate would stop bounding anything. int64_t current = retries_in_flight_.load(std::memory_order_relaxed); - if (current >= cap) { - retries_rejected_.fetch_add(1, std::memory_order_relaxed); - return false; + while (current < cap) { + if (retries_in_flight_.compare_exchange_weak( + current, current + 1, + std::memory_order_acq_rel, + std::memory_order_relaxed)) { + return true; + } + // CAS failure — `current` was updated with the latest value; + // loop re-evaluates against cap. Spurious failures of the weak + // CAS are also handled by the retry. } - retries_in_flight_.fetch_add(1, std::memory_order_relaxed); - return true; + retries_rejected_.fetch_add(1, std::memory_order_relaxed); + return false; } void RetryBudget::ReleaseRetry() { diff --git a/test/circuit_breaker_phase4_test.h b/test/circuit_breaker_phase4_test.h index db2b095d..6a6cc2f5 100644 --- a/test/circuit_breaker_phase4_test.h +++ b/test/circuit_breaker_phase4_test.h @@ -424,6 +424,332 @@ void TestBareProxyWorks() { } +// --------------------------------------------------------------------------- +// Test 7: Retry-After header carries a sensible value — within [1, configured +// max_open_duration_ms / 1000], and in the right ballpark of OpenUntil()-now. +// --------------------------------------------------------------------------- +void TestRetryAfterHeaderValue() { + std::cout << "\n[TEST] CB Phase 4: Retry-After value correctness..." 
+ << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 2; + gw.http2.enabled = false; + // base_open_duration 2000ms, max 60_000ms — Retry-After should + // ceiling-round and fall inside [1, 60]. + auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3); + u.circuit_breaker.base_open_duration_ms = 2000; + u.circuit_breaker.max_open_duration_ms = 60000; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Trip the breaker. + for (int i = 0; i < 3; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + + // Capture the open-rejection response. + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + bool is_503 = TestHttpClient::HasStatus(r, 503); + + // Extract Retry-After integer value (case-insensitive header). + int retry_after = -1; + const char* markers[] = {"Retry-After:", "retry-after:"}; + for (const char* m : markers) { + auto pos = r.find(m); + if (pos == std::string::npos) continue; + pos += std::string(m).size(); + while (pos < r.size() && (r[pos] == ' ' || r[pos] == '\t')) ++pos; + int val = 0; + bool any = false; + while (pos < r.size() && r[pos] >= '0' && r[pos] <= '9') { + val = val * 10 + (r[pos] - '0'); + any = true; + ++pos; + } + if (any) { retry_after = val; break; } + } + + // Contract: value ≥ 1 and ≤ max_open_duration_ms / 1000 (60). + // For base_open_duration 2000ms the remaining-seconds at this + // moment is ≤ 2 (probably 1 or 2 after ceiling), so the upper + // sanity bound is generous but still rules out 300/3600-class + // buggy fallbacks. 
+ bool in_range = (retry_after >= 1 && retry_after <= 60); + bool reasonable = (retry_after >= 1 && retry_after <= 3); + + bool pass = is_503 && in_range && reasonable; + TestFramework::RecordTest( + "CB Phase 4: Retry-After value in range", pass, + pass ? "" : + "is_503=" + std::to_string(is_503) + + " retry_after=" + std::to_string(retry_after) + + " body=" + r.substr(0, 256)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 4: Retry-After value in range", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 8: Retry loop is terminal on CIRCUIT_OPEN — even with max_retries=3, +// a request that hits an OPEN breaker gets exactly ONE 503 (no retry-flavored +// second 503). Ensures ReportBreakerOutcome doesn't feed the reject back into +// the breaker and MaybeRetry stays out. +// --------------------------------------------------------------------------- +void TestCircuitOpenTerminalForRetry() { + std::cout << "\n[TEST] CB Phase 4: CIRCUIT_OPEN terminal for retry loop..." + << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 2; + gw.http2.enabled = false; + // Retries enabled on 5xx — if the breaker reject leaked into + // MaybeRetry, the test would see extra backend hits after the + // trip. Long open window so the breaker stays OPEN for the + // duration of the post-trip assertion (no HALF_OPEN probe + // admission racing the test). 
+ auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3); + u.proxy.retry.max_retries = 3; + u.proxy.retry.retry_on_5xx = true; + u.circuit_breaker.base_open_duration_ms = 30000; + u.circuit_breaker.max_open_duration_ms = 60000; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Trip the breaker. Each pre-trip request may retry up to 3 + // times (all failing 5xx), so backend sees up to 3*threshold=12 + // hits. That's acceptable — we just care about post-trip behavior. + for (int i = 0; i < 3; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 5000); + } + int pre_trip_hits = backend_hits.load(); + + // Post-trip request: expect a single 503 and NO new backend hits. + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + bool is_503 = TestHttpClient::HasStatus(r, 503); + int post_trip_hits = backend_hits.load(); + bool no_new_hits = (post_trip_hits == pre_trip_hits); + + bool pass = is_503 && no_new_hits; + TestFramework::RecordTest( + "CB Phase 4: CIRCUIT_OPEN terminal for retry", pass, + pass ? "" : + "is_503=" + std::to_string(is_503) + + " pre=" + std::to_string(pre_trip_hits) + + " post=" + std::to_string(post_trip_hits)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 4: CIRCUIT_OPEN terminal for retry", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 9: Dry-run mode — dry_run=true forwards rejected requests to the +// upstream (pass-through) but still increments the rejected_ counter so +// operators can observe the would-reject rate without production impact. +// --------------------------------------------------------------------------- +void TestDryRunPassthrough() { + std::cout << "\n[TEST] CB Phase 4: dry-run passthrough..." 
<< std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 2; + gw.http2.enabled = false; + auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3); + u.circuit_breaker.dry_run = true; // would-reject, but still forward + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Trip thresholds with 5 requests. All should reach backend (502), + // not a 503 — dry-run never short-circuits. + for (int i = 0; i < 5; ++i) { + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + if (!TestHttpClient::HasStatus(r, 502)) { + TestFramework::RecordTest( + "CB Phase 4: dry-run passthrough", false, + "request " + std::to_string(i) + + " expected 502, got: " + r.substr(0, 64)); + return; + } + } + + bool all_hit = (backend_hits.load() == 5); + + // Verify the slice observed trips/rejected even though traffic passed. + auto* mgr = gateway.GetUpstreamManager() ? + gateway.GetUpstreamManager()->GetCircuitBreakerManager() : + nullptr; + int64_t trips = 0, rejected = 0; + if (mgr) { + auto* host = mgr->GetHost("svc"); + if (host) { + auto snap = host->Snapshot(); + trips = snap.total_trips; + rejected = snap.total_rejected; + } + } + // At least one trip fired (consecutive_threshold=3 → slice + // transitioned at least once during the run), and the post-trip + // requests were counted as would-reject (rejected > 0). 
+ bool observed = (trips >= 1) && (rejected >= 1); + + bool pass = all_hit && observed; + TestFramework::RecordTest( + "CB Phase 4: dry-run passthrough", pass, + pass ? "" : + "hits=" + std::to_string(backend_hits.load()) + + " trips=" + std::to_string(trips) + + " rejected=" + std::to_string(rejected)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 4: dry-run passthrough", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 10: HALF_OPEN → CLOSED recovery round-trip through the proxy. Trip the +// breaker, wait for the open window to elapse, then serve success responses +// and assert the slice transitions back to CLOSED (consecutive_successes +// crosses the threshold — default 2 from DefaultCbConfig / phase-4 config). +// --------------------------------------------------------------------------- +void TestHalfOpenRecoveryRoundTrip() { + std::cout << "\n[TEST] CB Phase 4: HALF_OPEN → CLOSED recovery..." + << std::endl; + try { + std::atomic fail_mode{true}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&fail_mode](const HttpRequest&, HttpResponse& resp) { + if (fail_mode.load()) { + resp.Status(502).Body("err", "text/plain"); + } else { + resp.Status(200).Body("ok", "text/plain"); + } + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 2; + gw.http2.enabled = false; + auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3); + // Short open duration so recovery path finishes quickly. + u.circuit_breaker.base_open_duration_ms = 300; + u.circuit_breaker.max_open_duration_ms = 1000; + // Two probes needed to close (default permitted_half_open_calls=2). 
+ u.circuit_breaker.permitted_half_open_calls = 2; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Trip by hitting the failing backend. + for (int i = 0; i < 3; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + + // Flip backend to success and wait for the open window to elapse. + fail_mode.store(false); + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + + // Probe the proxy — each successful 200 advances HALF_OPEN toward + // CLOSED. Do more than permitted_half_open_calls; some will be + // rejected as half_open_full but the ones that are admitted will + // close the breaker. + bool saw_success = false; + for (int i = 0; i < 8; ++i) { + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + if (TestHttpClient::HasStatus(r, 200)) saw_success = true; + // Small gap between probes — HALF_OPEN only admits permitted + // probes per cycle; spacing lets subsequent probes observe a + // possibly-closed breaker. + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + } + + // Verify slice aggregate: at least one CLOSED transition observed + // (probe_successes >= 1 and total_trips == 1 — we only tripped once). + auto* mgr = gateway.GetUpstreamManager() ? + gateway.GetUpstreamManager()->GetCircuitBreakerManager() : + nullptr; + int64_t probe_succ = 0; + int open_parts = 0, half_open_parts = 0; + if (mgr) { + auto* host = mgr->GetHost("svc"); + if (host) { + auto snap = host->Snapshot(); + probe_succ = 0; + for (const auto& row : snap.slices) { + probe_succ += row.probe_successes; + } + open_parts = snap.open_partitions; + half_open_parts = snap.half_open_partitions; + } + } + + // Recovery complete: saw at least one 200 through the breaker, + // at least one probe success counted, and no partition still + // stuck in OPEN (HALF_OPEN may still linger on the unused slice, + // which is fine for a 2-partition setup). 
+ bool pass = saw_success && (probe_succ >= 1) && (open_parts == 0); + TestFramework::RecordTest( + "CB Phase 4: HALF_OPEN → CLOSED recovery", pass, + pass ? "" : + "saw_success=" + std::to_string(saw_success) + + " probe_succ=" + std::to_string(probe_succ) + + " open_parts=" + std::to_string(open_parts) + + " half_open_parts=" + std::to_string(half_open_parts)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 4: HALF_OPEN → CLOSED recovery", false, e.what()); + } +} + void RunAllTests() { std::cout << "\n" << std::string(60, '=') << std::endl; std::cout << "CIRCUIT BREAKER PHASE 4 - INTEGRATION TESTS" << std::endl; @@ -435,6 +761,10 @@ void RunAllTests() { TestSuccessResetsConsecutiveFailureCounter(); TestTripDrivesSliceState(); TestOpenBreakerShortCircuitsUpstreamCall(); + TestRetryAfterHeaderValue(); + TestCircuitOpenTerminalForRetry(); + TestDryRunPassthrough(); + TestHalfOpenRecoveryRoundTrip(); } } // namespace CircuitBreakerPhase4Tests diff --git a/test/run_test.cc b/test/run_test.cc index fbf84d49..ab7bdb9b 100644 --- a/test/run_test.cc +++ b/test/run_test.cc @@ -168,9 +168,11 @@ int main(int argc, char* argv[]) { // Run rate limit tests }else if(mode == "rate_limit" || mode == "-L"){ RateLimitTests::RunAllTests(); - // Run circuit breaker tests + // Run circuit breaker tests (phases 1-4: unit + phase3 + phase4) }else if(mode == "circuit_breaker" || mode == "-B"){ CircuitBreakerTests::RunAllTests(); + CircuitBreakerPhase3Tests::RunAllTests(); + CircuitBreakerPhase4Tests::RunAllTests(); // Show help }else if(mode == "help" || mode == "-h" || mode == "--help"){ PrintUsage(argv[0]); From a1ddde5818fcc5eecf23e40f43de5f6bac944f47 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 14:51:08 +0800 Subject: [PATCH 18/37] Fix review comment --- include/upstream/proxy_transaction.h | 12 +++ server/proxy_transaction.cc | 53 +++++++++- test/circuit_breaker_phase4_test.h | 144 +++++++++++++++++++++++++++ 3 files changed, 208 
insertions(+), 1 deletion(-) diff --git a/include/upstream/proxy_transaction.h b/include/upstream/proxy_transaction.h index eba34973..40886be4 100644 --- a/include/upstream/proxy_transaction.h +++ b/include/upstream/proxy_transaction.h @@ -245,4 +245,16 @@ class ProxyTransaction : public std::enable_shared_from_this { // result; the caller passes the appropriate kind for 5xx vs disconnect // vs timeout since the slice treats them differently only for logs. void ReportBreakerOutcome(int result_code); + + // ReleaseBreakerAdmissionNeutral: release the admission slot without + // counting a success or failure. Used when the transaction is aborted + // locally (Cancel() on client disconnect, cancelled_ early-return + // after checkout, etc.) before an upstream health signal was observed. + // + // Without this, a HALF_OPEN probe slot is stranded if the client + // disconnects mid-probe — the slice stays in half_open_full until an + // external reset. No-op if admission_generation_ == 0. Clears + // admission_generation_ so a following ReportBreakerOutcome is a + // no-op. + void ReleaseBreakerAdmissionNeutral(); }; diff --git a/server/proxy_transaction.cc b/server/proxy_transaction.cc index ac5713c8..31e5dfe3 100644 --- a/server/proxy_transaction.cc +++ b/server/proxy_transaction.cc @@ -179,6 +179,11 @@ void ProxyTransaction::OnCheckoutReady(UpstreamLease lease) { // returns to the pool for another request to use, instead of // sitting idle attached to a torn-down transaction. lease.Release(); + // Release the breaker admission neutrally — the upstream was + // never exercised, and stranding the slot would wedge a + // HALF_OPEN probe cycle. Cancel() may already have released; + // the helper is no-op in that case. 
+ ReleaseBreakerAdmissionNeutral(); return; } if (state_ != State::CHECKOUT_PENDING) { @@ -330,6 +335,13 @@ void ProxyTransaction::SendUpstreamRequest() { logging::Get()->warn("ProxyTransaction stale connection before send " "client_fd={} service={} attempt={}", client_fd_, service_name_, attempt_); + // Report to the breaker BEFORE retrying — MaybeRetry's + // AttemptCheckout will overwrite admission_generation_ on the + // next ConsultBreaker. Without this call, a probe in HALF_OPEN + // would leak its slot and the slice could stall in + // half_open_full; in CLOSED, the failure would be under-counted + // until the last retry ran through OnError. + ReportBreakerOutcome(RESULT_UPSTREAM_DISCONNECT); MaybeRetry(RetryPolicy::RetryCondition::UPSTREAM_DISCONNECT); return; } @@ -407,6 +419,8 @@ void ProxyTransaction::OnUpstreamData( "state={} attempt={}", client_fd_, service_name_, upstream_fd, static_cast(state_), attempt_); + // Report BEFORE retry — see stale-connection path above for why. + ReportBreakerOutcome(RESULT_UPSTREAM_DISCONNECT); MaybeRetry(RetryPolicy::RetryCondition::UPSTREAM_DISCONNECT); return; } @@ -822,6 +836,13 @@ void ProxyTransaction::Cancel() { if (state_ != State::INIT && state_ != State::CHECKOUT_PENDING) { poison_connection_ = true; } + // Release any held breaker admission neutrally before tearing down. + // A client disconnect during CHECKOUT_PENDING, mid-send, or mid- + // response leaves admission_generation_ set; without this neutral + // release a probe slot stays occupied and HALF_OPEN can stall in + // half_open_full until an external reset. No-op when no admission + // is held (INIT, or an outcome already reported). + ReleaseBreakerAdmissionNeutral(); // Release the upstream lease back to the pool (or destroy it if // poisoned) and clear transport callbacks so any in-flight upstream // bytes land harmlessly. 
@@ -939,6 +960,13 @@ void ProxyTransaction::ArmResponseTimeout(int explicit_budget_ms) { if (self->state_ == State::SENDING_REQUEST || self->state_ == State::AWAITING_RESPONSE || self->state_ == State::RECEIVING_BODY) { + // Report BEFORE retry — MaybeRetry's AttemptCheckout will + // overwrite admission_generation_ on the next + // ConsultBreaker, stranding the current attempt's + // admission (probe slot leaks in HALF_OPEN; CLOSED + // under-counts the failure until the last retry hits + // OnError). + self->ReportBreakerOutcome(RESULT_RESPONSE_TIMEOUT); self->MaybeRetry(RetryPolicy::RetryCondition::RESPONSE_TIMEOUT); } else { self->OnError(RESULT_RESPONSE_TIMEOUT, "Response timeout"); @@ -1024,8 +1052,16 @@ HttpResponse ProxyTransaction::MakeCircuitOpenResponse() const { // breaker longer than 5 minutes. Absolute safety ceiling of // 3600s (1 hour) — anything longer likely means the breaker // is mis-configured and the hint is noise. + // + // Ceil the cap: floor-rounding max_open_duration_ms would + // under-report non-second-aligned configs. E.g. a 1500ms or + // 6500ms (exponential-backoff saturation) max floor-rounds to + // 1s/6s, advertising a shorter window than the breaker will + // actually honor. Clients retrying on the hint would hit + // another avoidable 503. + long long cfg_ms = slice_->config().max_open_duration_ms; int cfg_cap_secs = static_cast( - std::max(1, slice_->config().max_open_duration_ms / 1000)); + std::max(1, (cfg_ms + 999) / 1000)); int upper = std::min(cfg_cap_secs, 3600); if (diff < 1) diff = 1; if (diff > upper) diff = upper; @@ -1095,6 +1131,21 @@ bool ProxyTransaction::ConsultBreaker() { return true; } +void ProxyTransaction::ReleaseBreakerAdmissionNeutral() { + if (!slice_ || admission_generation_ == 0) return; + + uint64_t gen = admission_generation_; + admission_generation_ = 0; + bool probe = is_probe_; + is_probe_ = false; + + // Neutral release — no upstream health signal. 
Decrements the + // per-partition inflight (CLOSED) or the HALF_OPEN probe admitted + // counter, so a cancelled probe doesn't wedge the slice in + // half_open_full. + slice_->ReportNeutral(probe, gen); +} + void ProxyTransaction::ReportBreakerOutcome(int result_code) { // No slice, or already reported: bail. admission_generation_==0 is // the sentinel — slice domain generations start at 1, so a 0 gen diff --git a/test/circuit_breaker_phase4_test.h b/test/circuit_breaker_phase4_test.h index 6a6cc2f5..f6b4fa16 100644 --- a/test/circuit_breaker_phase4_test.h +++ b/test/circuit_breaker_phase4_test.h @@ -750,6 +750,148 @@ void TestHalfOpenRecoveryRoundTrip() { } } +// --------------------------------------------------------------------------- +// Test 11: Retry-After ceils the config cap from a non-second-aligned +// max_open_duration_ms (e.g. 1500ms → 2s, not 1s). Floor-rounding the cap +// would clamp the advertised retry window below what the breaker honors, +// causing well-behaved clients to re-hit the 503. +// --------------------------------------------------------------------------- +void TestRetryAfterCapCeilsNonAlignedMax() { + std::cout << "\n[TEST] CB Phase 4: Retry-After cap ceils non-aligned max..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 2; + gw.http2.enabled = false; + // Configure a non-second-aligned max backoff. base = 1500ms so + // the actual OpenUntil-now at trip time is ~1.5s, which ceil- + // rounds to 2s. If cfg_cap_secs floor-rounded max_open_duration + // (1500ms → 1s), the clamp would drop Retry-After to 1s even + // though the breaker would keep rejecting through the second + // half of that window. 
+ auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3); + u.circuit_breaker.base_open_duration_ms = 1500; + u.circuit_breaker.max_open_duration_ms = 1500; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + for (int i = 0; i < 3; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + + int retry_after = -1; + const char* markers[] = {"Retry-After:", "retry-after:"}; + for (const char* m : markers) { + auto pos = r.find(m); + if (pos == std::string::npos) continue; + pos += std::string(m).size(); + while (pos < r.size() && (r[pos] == ' ' || r[pos] == '\t')) ++pos; + int val = 0; + bool any = false; + while (pos < r.size() && r[pos] >= '0' && r[pos] <= '9') { + val = val * 10 + (r[pos] - '0'); + any = true; + ++pos; + } + if (any) { retry_after = val; break; } + } + + // Expectation: Retry-After is in [1, 2] — cfg_cap_secs ceil- + // rounds 1500ms to 2s, and the remaining-time ceil-rounds to + // 2 at the moment of trip (may be 1 if enough wall-clock has + // elapsed between trip and response). Critically it must NEVER + // be zero or exceed 2 (clamped to the 2s cap). + bool in_range = (retry_after >= 1 && retry_after <= 2); + TestFramework::RecordTest( + "CB Phase 4: Retry-After ceils non-aligned cap", in_range, + in_range ? "" : + "retry_after=" + std::to_string(retry_after)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 4: Retry-After ceils non-aligned cap", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 12: Retried failures are reported BEFORE the retry fires. 
With retries +// enabled on 5xx, each attempt's outcome must be counted against the breaker; +// otherwise the slice trips only after the final retry exhausts, under- +// counting failures and potentially never tripping if retries mask enough of +// them. Verifies the trip still happens within the expected number of client +// requests once reporting is attached to the retry path. +// --------------------------------------------------------------------------- +void TestRetriedFailuresCountTowardTrip() { + std::cout << "\n[TEST] CB Phase 4: retried failures count toward trip..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 2; + gw.http2.enabled = false; + // Retries on 5xx enabled. threshold=3 — with retry_on_5xx, each + // client request produces 1 + max_retries=3 = 4 upstream + // attempts, each reporting RESPONSE_5XX via the ReportBreakerOutcome + // path that this fix patches in. The breaker must trip after + // at most 3 upstream failure reports (which the first client + // request alone produces). + auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3); + u.proxy.retry.max_retries = 3; + u.proxy.retry.retry_on_5xx = true; + u.circuit_breaker.base_open_duration_ms = 30000; + u.circuit_breaker.max_open_duration_ms = 60000; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // One client request → 4 upstream attempts → 4 RESPONSE_5XX + // reports. Threshold=3 should trip during this single request. + TestHttpClient::HttpGet(gw_port, "/fail", 5000); + + // Second client request must hit the OPEN breaker → 503. 
+ std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + bool is_503 = TestHttpClient::HasStatus(r, 503); + bool has_breaker_header = + r.find("X-Circuit-Breaker: open") != std::string::npos || + r.find("x-circuit-breaker: open") != std::string::npos; + + bool pass = is_503 && has_breaker_header; + TestFramework::RecordTest( + "CB Phase 4: retried failures count toward trip", pass, + pass ? "" : + "is_503=" + std::to_string(is_503) + + " breaker_hdr=" + std::to_string(has_breaker_header) + + " body=" + r.substr(0, 256)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 4: retried failures count toward trip", false, e.what()); + } +} + void RunAllTests() { std::cout << "\n" << std::string(60, '=') << std::endl; std::cout << "CIRCUIT BREAKER PHASE 4 - INTEGRATION TESTS" << std::endl; @@ -765,6 +907,8 @@ void RunAllTests() { TestCircuitOpenTerminalForRetry(); TestDryRunPassthrough(); TestHalfOpenRecoveryRoundTrip(); + TestRetryAfterCapCeilsNonAlignedMax(); + TestRetriedFailuresCountTowardTrip(); } } // namespace CircuitBreakerPhase4Tests From 30fb10ea50e4a7a17a089f0b45d10a99531ccfa8 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 15:23:02 +0800 Subject: [PATCH 19/37] Fix review comment --- include/circuit_breaker/retry_budget.h | 10 ++- server/proxy_transaction.cc | 120 ++++++++++++++++--------- server/retry_budget.cc | 35 +++++--- test/circuit_breaker_phase3_test.h | 18 +++- test/circuit_breaker_phase4_test.h | 103 +++++++++++++++++++++ 5 files changed, 227 insertions(+), 59 deletions(-) diff --git a/include/circuit_breaker/retry_budget.h b/include/circuit_breaker/retry_budget.h index dd4da11c..001bfccb 100644 --- a/include/circuit_breaker/retry_budget.h +++ b/include/circuit_breaker/retry_budget.h @@ -16,7 +16,15 @@ namespace circuit_breaker { // Fix: cap concurrent retries as a fraction of concurrent non-retry // traffic plus a floor for low-volume correctness. 
// -// allowed_retries = max(min_concurrency, in_flight * percent / 100) +// allowed_retries = max(min_concurrency, +// (in_flight - retries_in_flight) * percent / 100) +// +// The subtraction is load-bearing: callers hold TrackInFlight() for +// BOTH first attempts and retries (so the guard's RAII paired with +// ReleaseRetry doesn't need a second counter on the hot path). +// Without subtracting retries, admitting a retry increases in_flight +// which increases the cap, and in steady state the effective ratio +// converges above the configured percent of original traffic. // // The retry budget is PER-HOST (one instance owned by CircuitBreakerHost, // shared across its partitions — the percent math is about aggregate diff --git a/server/proxy_transaction.cc b/server/proxy_transaction.cc index 31e5dfe3..192f4328 100644 --- a/server/proxy_transaction.cc +++ b/server/proxy_transaction.cc @@ -833,16 +833,30 @@ void ProxyTransaction::Cancel() { // In INIT and CHECKOUT_PENDING no bytes have left the client side // toward the upstream yet, so the connection (if any) is still // clean and safe to return to the pool. - if (state_ != State::INIT && state_ != State::CHECKOUT_PENDING) { + const bool upstream_exercised = + (state_ != State::INIT && state_ != State::CHECKOUT_PENDING); + if (upstream_exercised) { poison_connection_ = true; } - // Release any held breaker admission neutrally before tearing down. - // A client disconnect during CHECKOUT_PENDING, mid-send, or mid- - // response leaves admission_generation_ set; without this neutral - // release a probe slot stays occupied and HALF_OPEN can stall in - // half_open_full until an external reset. No-op when no admission - // is held (INIT, or an outcome already reported). - ReleaseBreakerAdmissionNeutral(); + // Release any held breaker admission before tearing down. 
Two paths: + // * Pre-upstream (INIT / CHECKOUT_PENDING): upstream was never + // touched — neutral release so a HALF_OPEN probe slot stays + // eligible for replacement (matches ReportNeutral's design + // contract: "the upstream wasn't actually exercised"). + // * Post-send (SENDING_REQUEST / AWAITING_RESPONSE / RECEIVING_BODY): + // we poisoned the pooled connection, which from the upstream's + // point of view is indistinguishable from a mid-flight disconnect. + // Report as UPSTREAM_DISCONNECT so the probe counts against the + // HALF_OPEN cycle (no replacement, re-trip on saw_failure drain) + // and CLOSED-state accounting sees the disruption instead of + // silently dropping a real signal. + // Both branches clear admission_generation_ internally, so late + // transport callbacks (if any) become no-ops. + if (upstream_exercised) { + ReportBreakerOutcome(RESULT_UPSTREAM_DISCONNECT); + } else { + ReleaseBreakerAdmissionNeutral(); + } // Release the upstream lease back to the pool (or destroy it if // poisoned) and clear transport callbacks so any in-flight upstream // bytes land harmlessly. @@ -1030,49 +1044,67 @@ HttpResponse ProxyTransaction::MakeErrorResponse(int result_code) { } HttpResponse ProxyTransaction::MakeCircuitOpenResponse() const { - // Compute Retry-After from slice->OpenUntil() if the slice is known. - // Falls back to a conservative 1-second hint if the slice is null - // (shouldn't happen on the circuit-open path — that path requires a - // slice — but defense in depth). + // TryAcquire() returns REJECTED_OPEN for three distinct situations: + // * True OPEN: slice is in OPEN state, IsOpenDeadlineSet() is true, + // Retry-After reflects remaining backoff from OpenUntil(). + // * HALF_OPEN reject (half_open_full or half_open_recovery_failing): + // slice transitioned HALF_OPEN via TransitionOpenToHalfOpen, which + // clears open_until. IsOpenDeadlineSet() is false. 
These rejects + // wait on the in-flight probe cycle completing (success → CLOSED, + // failure → re-trip with fresh backoff). Retry-After = 1 in this + // branch would under-report the likely wait on a re-trip; ceil to + // base_open_duration_ms as a conservative hint (the worst case is + // re-trip + fresh backoff window). + // Emit a distinct X-Circuit-Breaker label for observability so + // operators can separate "true OPEN" from "HALF_OPEN recovery back- + // pressure" on dashboards. int retry_after_secs = 1; - if (slice_ && slice_->IsOpenDeadlineSet()) { - auto open_until = slice_->OpenUntil(); - auto now = std::chrono::steady_clock::now(); - auto ms_remaining = std::chrono::duration_cast( - open_until - now).count(); - // Ceiling-round to seconds so we never advertise a window - // shorter than the actual remaining backoff (e.g. 5.9s → 6, - // not 5). Truncating by one second is enough to cause a - // well-behaved client to retry while the breaker is still OPEN - // and get another avoidable 503. - int64_t diff = (ms_remaining + 999) / 1000; - // Clamp: Retry-After=0 is silly; upper bound tracks the - // configured max_open_duration_ms (clamped to 1s min), so we - // don't under-report backoff windows on operators who tune the - // breaker longer than 5 minutes. Absolute safety ceiling of - // 3600s (1 hour) — anything longer likely means the breaker - // is mis-configured and the hint is noise. - // - // Ceil the cap: floor-rounding max_open_duration_ms would - // under-report non-second-aligned configs. E.g. a 1500ms or - // 6500ms (exponential-backoff saturation) max floor-rounds to - // 1s/6s, advertising a shorter window than the breaker will - // actually honor. Clients retrying on the hint would hit - // another avoidable 503. 
- long long cfg_ms = slice_->config().max_open_duration_ms; - int cfg_cap_secs = static_cast( - std::max(1, (cfg_ms + 999) / 1000)); - int upper = std::min(cfg_cap_secs, 3600); - if (diff < 1) diff = 1; - if (diff > upper) diff = upper; - retry_after_secs = static_cast(diff); + const char* breaker_label = "open"; + if (slice_) { + if (slice_->IsOpenDeadlineSet()) { + // True OPEN — Retry-After from actual deadline. + auto open_until = slice_->OpenUntil(); + auto now = std::chrono::steady_clock::now(); + auto ms_remaining = std::chrono::duration_cast( + open_until - now).count(); + // Ceiling-round to seconds so we never advertise a window + // shorter than the actual remaining backoff. + int64_t diff = (ms_remaining + 999) / 1000; + // Upper bound tracks the configured max_open_duration_ms + // (ceiling-rounded to avoid under-reporting non-second- + // aligned configs), with an absolute safety ceiling at + // 3600s. + long long cfg_ms = slice_->config().max_open_duration_ms; + int cfg_cap_secs = static_cast( + std::max(1, (cfg_ms + 999) / 1000)); + int upper = std::min(cfg_cap_secs, 3600); + if (diff < 1) diff = 1; + if (diff > upper) diff = upper; + retry_after_secs = static_cast(diff); + breaker_label = "open"; + } else if (slice_->CurrentState() == + circuit_breaker::State::HALF_OPEN) { + // HALF_OPEN reject — no deadline to read; hint the operator + // with a ceiled base_open_duration so retrying clients wait + // for at least the worst-case re-trip window instead of + // bouncing immediately on Retry-After=1. + long long base_ms = slice_->config().base_open_duration_ms; + int hint = static_cast( + std::max(1, (base_ms + 999) / 1000)); + retry_after_secs = std::min(hint, 3600); + breaker_label = "half_open"; + } + // Any other state (CLOSED): shouldn't reach here — ConsultBreaker + // only calls this on REJECTED_OPEN. Fall through with the + // conservative defaults (Retry-After=1, label="open") so a + // regression can't silently emit Retry-After=0. 
} HttpResponse resp; resp.Status(HttpStatus::SERVICE_UNAVAILABLE); resp.Text("Upstream circuit breaker is open; please retry later.\n"); resp.Header("Retry-After", std::to_string(retry_after_secs)); - resp.Header("X-Circuit-Breaker", "open"); + resp.Header("X-Circuit-Breaker", breaker_label); // Hint operators (not clients) at which upstream tripped. Useful // when a gateway fronts multiple backends; without this header, a // 503 is opaque. diff --git a/server/retry_budget.cc b/server/retry_budget.cc index cc984e6d..9723d949 100644 --- a/server/retry_budget.cc +++ b/server/retry_budget.cc @@ -33,19 +33,34 @@ RetryBudget::InFlightGuard RetryBudget::TrackInFlight() { } bool RetryBudget::TryConsumeRetry() { - // Snapshot tuning + in_flight once — cap is computed against a - // consistent slice. Retrying the cap math inside the CAS loop would - // just churn without improving accuracy (in_flight is inherently a - // moving target). + // Snapshot tuning + both in-flight counters once so the cap is + // computed against a consistent slice. Retrying the cap math inside + // the CAS loop would just churn without improving accuracy + // (in_flight is inherently a moving target). int64_t in_flight = in_flight_.load(std::memory_order_relaxed); + int64_t retries_in_flight = retries_in_flight_.load(std::memory_order_relaxed); int pct = percent_.load(std::memory_order_relaxed); int min_conc = min_concurrency_.load(std::memory_order_relaxed); - // cap = max(min_concurrency, in_flight * percent / 100) - // Integer math is fine — percent is 0..100, in_flight is an int64. - // Overflow is impossible within reasonable load levels (in_flight - // would need to exceed ~2e16 to overflow after multiplying by 100). 
- int64_t pct_cap = (in_flight * pct) / 100; + // cap = max(min_concurrency, (in_flight - retries_in_flight) * percent / 100) + // + // Subtracting retries from the in_flight base prevents the budget + // from self-inflating: callers hold TrackInFlight() for BOTH first- + // attempts and retries (per the documented API), so admitting a + // retry increases in_flight_. Using the raw in_flight as the base + // would then increase the cap, which in steady state converges + // above the configured percentage of ORIGINAL traffic (e.g. a 20% + // budget with retries counted in would allow ~25% of originals to + // retry simultaneously; at higher percents the amplification grows + // faster). + // + // Floor the subtraction at 0: `retries_in_flight > in_flight` is + // transiently possible under racing increments (retry admitted and + // in_flight guard observed before first-attempt guard's pair) — + // clamp rather than letting the multiply go negative. + int64_t non_retry_in_flight = in_flight - retries_in_flight; + if (non_retry_in_flight < 0) non_retry_in_flight = 0; + int64_t pct_cap = (non_retry_in_flight * pct) / 100; int64_t cap = pct_cap > min_conc ? pct_cap : min_conc; // Atomically reserve a slot: load current, verify under cap, CAS up @@ -53,7 +68,7 @@ bool RetryBudget::TryConsumeRetry() { // all observe current < cap and all increment past the cap — under // the cross-dispatcher load the retry budget is meant to protect // against, the gate would stop bounding anything. 
- int64_t current = retries_in_flight_.load(std::memory_order_relaxed); + int64_t current = retries_in_flight; while (current < cap) { if (retries_in_flight_.compare_exchange_weak( current, current + 1, diff --git a/test/circuit_breaker_phase3_test.h b/test/circuit_breaker_phase3_test.h index ba2f5554..87ed28e7 100644 --- a/test/circuit_breaker_phase3_test.h +++ b/test/circuit_breaker_phase3_test.h @@ -96,17 +96,27 @@ void TestRetryBudgetPercentCap() { try { RetryBudget rb(20, 0); // no min floor — pure percent - // Push in_flight to 50 via guards that we intentionally keep alive. + // Push in_flight to 50 via guards that we intentionally keep + // alive. Per the documented API, callers hold TrackInFlight() + // for BOTH first attempts and retries — but TryConsumeRetry + // subtracts retries_in_flight from the base so the budget + // doesn't self-inflate as retries are admitted. std::vector guards; for (int i = 0; i < 50; ++i) guards.push_back(rb.TrackInFlight()); - // 50 * 20% = 10 retries allowed. + // With 50 non-retry in-flight and 20% budget the first + // admission is against cap=10, but each admission shrinks the + // non-retry base by 1. The admission count converges at r + // where r >= floor((50-r) * 20 / 100). Solving: r = 8. The + // pre-fix formula (cap computed from raw in_flight) would + // admit 10, drifting the effective ratio above 20% of + // originals. int admitted = 0; for (int i = 0; i < 20; ++i) { if (rb.TryConsumeRetry()) ++admitted; } - bool cap_hit = admitted == 10; - bool rejected_count = rb.RetriesRejected() == 10; + bool cap_hit = admitted == 8; + bool rejected_count = rb.RetriesRejected() == 12; // Release guards — in_flight drops to 0; future TryConsumeRetry with // min=0 and in_flight=0 rejects everything. 
diff --git a/test/circuit_breaker_phase4_test.h b/test/circuit_breaker_phase4_test.h index f6b4fa16..64b68571 100644 --- a/test/circuit_breaker_phase4_test.h +++ b/test/circuit_breaker_phase4_test.h @@ -892,6 +892,108 @@ void TestRetriedFailuresCountTowardTrip() { } } +// --------------------------------------------------------------------------- +// Test 13: HALF_OPEN rejects emit a distinct X-Circuit-Breaker label. +// TryAcquire returns REJECTED_OPEN for three situations (true OPEN, +// half_open_full, half_open_recovery_failing). When the slice is in +// HALF_OPEN, OpenUntil is cleared and a generic MakeCircuitOpenResponse +// would fall back to Retry-After=1 + X-Circuit-Breaker:open — misleading +// clients. The fix emits X-Circuit-Breaker:half_open for HALF_OPEN rejects +// with a more conservative Retry-After hint. +// +// Strategy: trip the breaker, wait for the open window to elapse so the +// slice transitions HALF_OPEN on the next admission attempt, then flood +// concurrent requests so some hit half_open_full. +// --------------------------------------------------------------------------- +void TestHalfOpenRejectLabel() { + std::cout << "\n[TEST] CB Phase 4: HALF_OPEN reject label..." + << std::endl; + try { + // Backend hangs to keep probes in-flight so later concurrent + // requests hit half_open_full. 
+ std::atomic hang{false}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&hang](const HttpRequest&, HttpResponse& resp) { + if (hang.load()) { + std::this_thread::sleep_for(std::chrono::milliseconds(600)); + } + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 2; + gw.http2.enabled = false; + auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3); + u.circuit_breaker.base_open_duration_ms = 200; + u.circuit_breaker.max_open_duration_ms = 500; + u.circuit_breaker.permitted_half_open_calls = 1; // tiny budget + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Trip the breaker. + for (int i = 0; i < 3; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + // Wait for the open window to elapse so the next admission + // flips the slice to HALF_OPEN. + std::this_thread::sleep_for(std::chrono::milliseconds(300)); + + // Flip backend to hang so the probe occupies the single probe + // slot while we fire sibling requests that must hit half_open_full. + hang.store(true); + + std::atomic saw_half_open{false}; + std::atomic saw_open{false}; + auto probe = [&](int id) { + (void)id; + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 1500); + if (!TestHttpClient::HasStatus(r, 503)) return; + if (r.find("X-Circuit-Breaker: half_open") != std::string::npos || + r.find("x-circuit-breaker: half_open") != std::string::npos) { + saw_half_open.store(true); + } + if (r.find("X-Circuit-Breaker: open") != std::string::npos || + r.find("x-circuit-breaker: open") != std::string::npos) { + // We want to distinguish the labels; the "open" substring + // also matches "half_open". 
Only count true "open" if + // "half_open" didn't appear in THIS response. + if (r.find("half_open") == std::string::npos) { + saw_open.store(true); + } + } + }; + + std::vector threads; + for (int i = 0; i < 6; ++i) { + threads.emplace_back(probe, i); + std::this_thread::sleep_for(std::chrono::milliseconds(20)); + } + for (auto& t : threads) t.join(); + + // Pass if at least one HALF_OPEN-labelled reject was observed. + // saw_open may or may not be observed (some rejects could have + // hit between cycles) — the key contract is that HALF_OPEN + // rejects no longer get the plain "open" label. + bool pass = saw_half_open.load(); + TestFramework::RecordTest( + "CB Phase 4: HALF_OPEN reject label", pass, + pass ? "" : + "saw_half_open=" + std::to_string(saw_half_open.load()) + + " saw_open=" + std::to_string(saw_open.load())); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 4: HALF_OPEN reject label", false, e.what()); + } +} + void RunAllTests() { std::cout << "\n" << std::string(60, '=') << std::endl; std::cout << "CIRCUIT BREAKER PHASE 4 - INTEGRATION TESTS" << std::endl; @@ -909,6 +1011,7 @@ void RunAllTests() { TestHalfOpenRecoveryRoundTrip(); TestRetryAfterCapCeilsNonAlignedMax(); TestRetriedFailuresCountTowardTrip(); + TestHalfOpenRejectLabel(); } } // namespace CircuitBreakerPhase4Tests From fccd6f5a7013637c1f9b14346c486ec4cb9233a4 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 15:55:55 +0800 Subject: [PATCH 20/37] Fix review comment --- .../circuit_breaker/circuit_breaker_slice.h | 14 ++ server/circuit_breaker_slice.cc | 5 + server/proxy_transaction.cc | 81 +++++----- test/circuit_breaker_phase4_test.h | 147 ++++++++++++++++++ 4 files changed, 208 insertions(+), 39 deletions(-) diff --git a/include/circuit_breaker/circuit_breaker_slice.h b/include/circuit_breaker/circuit_breaker_slice.h index 6e9734df..1c96dcd0 100644 --- a/include/circuit_breaker/circuit_breaker_slice.h +++ 
b/include/circuit_breaker/circuit_breaker_slice.h @@ -146,6 +146,20 @@ class CircuitBreakerSlice { return open_until_steady_ns_.load(std::memory_order_relaxed) > 0; } + // Expected next open-duration in milliseconds if the slice re-trips + // from its current state. Computed from base_open_duration_ms + // shifted by the current `consecutive_trips_` count and clamped by + // max_open_duration_ms. Used by the Retry-After hint path for + // HALF_OPEN rejections, where there's no stored deadline but the + // next OPEN window (if the probe cycle fails) will follow the + // exponential-backoff curve — base alone would under-report after + // multiple trips. + // + // Safe from any thread (atomic load of consecutive_trips_ + plain + // reads of config_ fields). Config fields are dispatcher-owned but + // a slightly-torn read is fine for an observability hint. + int64_t NextOpenDurationMs() const; + private: // Logging label: "service=X host=Y:Z partition=N" built once. std::string host_label_; diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index be9da56a..c34c25ae 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -62,6 +62,11 @@ std::chrono::nanoseconds CircuitBreakerSlice::ComputeOpenDuration() const { return std::chrono::milliseconds(scaled_ms); } +int64_t CircuitBreakerSlice::NextOpenDurationMs() const { + return std::chrono::duration_cast( + ComputeOpenDuration()).count(); +} + bool CircuitBreakerSlice::ShouldTripClosed( std::chrono::steady_clock::time_point now) { if (consecutive_failures_ >= config_.consecutive_failure_threshold) { diff --git a/server/proxy_transaction.cc b/server/proxy_transaction.cc index 192f4328..bc847368 100644 --- a/server/proxy_transaction.cc +++ b/server/proxy_transaction.cc @@ -833,30 +833,28 @@ void ProxyTransaction::Cancel() { // In INIT and CHECKOUT_PENDING no bytes have left the client side // toward the upstream yet, so the connection (if any) is still // clean and 
safe to return to the pool. - const bool upstream_exercised = - (state_ != State::INIT && state_ != State::CHECKOUT_PENDING); - if (upstream_exercised) { + if (state_ != State::INIT && state_ != State::CHECKOUT_PENDING) { poison_connection_ = true; } - // Release any held breaker admission before tearing down. Two paths: - // * Pre-upstream (INIT / CHECKOUT_PENDING): upstream was never - // touched — neutral release so a HALF_OPEN probe slot stays - // eligible for replacement (matches ReportNeutral's design - // contract: "the upstream wasn't actually exercised"). - // * Post-send (SENDING_REQUEST / AWAITING_RESPONSE / RECEIVING_BODY): - // we poisoned the pooled connection, which from the upstream's - // point of view is indistinguishable from a mid-flight disconnect. - // Report as UPSTREAM_DISCONNECT so the probe counts against the - // HALF_OPEN cycle (no replacement, re-trip on saw_failure drain) - // and CLOSED-state accounting sees the disruption instead of - // silently dropping a real signal. - // Both branches clear admission_generation_ internally, so late - // transport callbacks (if any) become no-ops. - if (upstream_exercised) { - ReportBreakerOutcome(RESULT_UPSTREAM_DISCONNECT); - } else { - ReleaseBreakerAdmissionNeutral(); - } + // Release any held breaker admission neutrally. Cancel() is always + // a LOCAL termination — client disconnect, framework-level abort, + // H2 stream reset, etc. Even when we poisoned a pooled connection + // mid-request, counting that as an upstream-health failure would + // trip the breaker against a backend that may be perfectly healthy + // (browser cancels, user-initiated timeouts, etc. are all common + // causes). The reviewer guidance is explicit: client-initiated + // aborts must be neutral from the breaker's perspective. 
+ // + // Trade-off: in HALF_OPEN, ReportNeutral on a probe decrements + // both inflight and admitted, so a cancelled probe makes the slot + // eligible for a replacement admission in the same cycle. That is + // the documented design contract of ReportNeutral ("the upstream + // wasn't actually exercised by this admission" from the breaker's + // decision-math point of view — we didn't observe a success or + // failure), and it is acceptable: probes that genuinely succeed + // or fail still close / re-trip the cycle normally, and a broken + // upstream under cancel-spam will still fail those real probes. + ReleaseBreakerAdmissionNeutral(); // Release the upstream lease back to the pool (or destroy it if // poisoned) and clear transport callbacks so any in-flight upstream // bytes land harmlessly. @@ -1060,9 +1058,18 @@ HttpResponse ProxyTransaction::MakeCircuitOpenResponse() const { // pressure" on dashboards. int retry_after_secs = 1; const char* breaker_label = "open"; + // Absolute sanity ceiling — independent of config. Protects against + // ridiculous programmatic values that might slip past validation. + static constexpr int RETRY_AFTER_ABS_MAX_SECS = 3600; // 1 hour if (slice_) { if (slice_->IsOpenDeadlineSet()) { - // True OPEN — Retry-After from actual deadline. + // True OPEN — Retry-After from the actual stored deadline. + // The deadline is authoritative: it's what the slice will + // actually honor, regardless of any subsequent config + // reload that might lower max_open_duration_ms. Clamping + // below the stored deadline would tell well-behaved clients + // to retry early and bounce on more 503s until the original + // deadline elapses. 
auto open_until = slice_->OpenUntil(); auto now = std::chrono::steady_clock::now(); auto ms_remaining = std::chrono::duration_cast( @@ -1070,28 +1077,24 @@ HttpResponse ProxyTransaction::MakeCircuitOpenResponse() const { // Ceiling-round to seconds so we never advertise a window // shorter than the actual remaining backoff. int64_t diff = (ms_remaining + 999) / 1000; - // Upper bound tracks the configured max_open_duration_ms - // (ceiling-rounded to avoid under-reporting non-second- - // aligned configs), with an absolute safety ceiling at - // 3600s. - long long cfg_ms = slice_->config().max_open_duration_ms; - int cfg_cap_secs = static_cast( - std::max(1, (cfg_ms + 999) / 1000)); - int upper = std::min(cfg_cap_secs, 3600); if (diff < 1) diff = 1; - if (diff > upper) diff = upper; + if (diff > RETRY_AFTER_ABS_MAX_SECS) diff = RETRY_AFTER_ABS_MAX_SECS; retry_after_secs = static_cast(diff); breaker_label = "open"; } else if (slice_->CurrentState() == circuit_breaker::State::HALF_OPEN) { - // HALF_OPEN reject — no deadline to read; hint the operator - // with a ceiled base_open_duration so retrying clients wait - // for at least the worst-case re-trip window instead of - // bouncing immediately on Retry-After=1. - long long base_ms = slice_->config().base_open_duration_ms; + // HALF_OPEN reject — no deadline to read. Hint with the + // NEXT expected open duration (base << consecutive_trips_, + // clamped by max_open_duration_ms) rather than base alone: + // after multiple trips, exponential backoff has already + // grown the OPEN window, and advertising bare base would + // tell clients to retry far earlier than the breaker will + // admit even in the worst case (probe cycle fails, slice + // re-trips into the larger backoff). 
+ int64_t next_ms = slice_->NextOpenDurationMs(); int hint = static_cast( - std::max(1, (base_ms + 999) / 1000)); - retry_after_secs = std::min(hint, 3600); + std::max(1, (next_ms + 999) / 1000)); + retry_after_secs = std::min(hint, RETRY_AFTER_ABS_MAX_SECS); breaker_label = "half_open"; } // Any other state (CLOSED): shouldn't reach here — ConsultBreaker diff --git a/test/circuit_breaker_phase4_test.h b/test/circuit_breaker_phase4_test.h index 64b68571..5b044952 100644 --- a/test/circuit_breaker_phase4_test.h +++ b/test/circuit_breaker_phase4_test.h @@ -994,6 +994,152 @@ void TestHalfOpenRejectLabel() { } } +// --------------------------------------------------------------------------- +// Test 14: HALF_OPEN Retry-After reflects the current exponential backoff, +// not just base_open_duration_ms. After multiple trips, the next OPEN window +// (if the probe cycle fails) is base << consecutive_trips, clamped by +// max_open_duration_ms. Advertising bare base would under-report the worst- +// case wait by a factor of 2^n. +// +// Strategy: trip → recover → trip → recover → trip to drive consecutive_trips +// up. Then hit HALF_OPEN during the next OPEN window elapse and assert +// Retry-After > base seconds. +// --------------------------------------------------------------------------- +void TestHalfOpenRetryAfterScalesWithBackoff() { + std::cout << "\n[TEST] CB Phase 4: HALF_OPEN Retry-After exponential..." + << std::endl; + try { + // Backend hangs on demand so we can pin the probe slot and + // observe HALF_OPEN rejections. 
+ std::atomic hang{false}; + std::atomic fail_mode{true}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&hang, &fail_mode](const HttpRequest&, + HttpResponse& resp) { + if (hang.load()) { + std::this_thread::sleep_for(std::chrono::milliseconds(800)); + } + if (fail_mode.load()) { + resp.Status(502).Body("err", "text/plain"); + } else { + resp.Status(200).Body("ok", "text/plain"); + } + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 2; + gw.http2.enabled = false; + // base=100ms, max=5000ms. After 3 trips the next duration is + // 100 << 3 = 800ms (< max), so HALF_OPEN's hint should be + // ceil(800/1000)=1s. But we only need to validate that the + // hint is >= 1s (which base alone would also produce from + // ceil(100/1000)=1). To get an observable difference, use a + // smaller base (50ms) and enough trips that 50 << N > 1000ms. + auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/2); + u.circuit_breaker.base_open_duration_ms = 100; // config minimum + u.circuit_breaker.max_open_duration_ms = 8000; // cap at 8s + u.circuit_breaker.permitted_half_open_calls = 1; // single probe + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Trip 1: two consecutive failures. + for (int i = 0; i < 2; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + // Wait past base (50ms → open window) so slice goes HALF_OPEN. + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + // Recovery: one probe success (flip fail_mode briefly). + fail_mode.store(false); + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + fail_mode.store(true); + + // Trip 2: two more failures. 
+ for (int i = 0; i < 2; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + std::this_thread::sleep_for(std::chrono::milliseconds(250)); + + // Recovery again. + fail_mode.store(false); + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + fail_mode.store(true); + + // Trip 3: two more failures. consecutive_trips should now be + // high enough that base << trips > 1000ms — HALF_OPEN hint + // should be >= 1 but potentially larger. + for (int i = 0; i < 2; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + // Wait for the open window to elapse and next admission + // transitions HALF_OPEN. + std::this_thread::sleep_for(std::chrono::milliseconds(500)); + + // Pin the probe slot with a hanging request so subsequent + // requests get HALF_OPEN rejects. + hang.store(true); + std::thread probe([&]() { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + }); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 1500); + hang.store(false); + probe.join(); + + bool is_half_open = + r.find("X-Circuit-Breaker: half_open") != std::string::npos || + r.find("x-circuit-breaker: half_open") != std::string::npos; + + // Extract Retry-After. + int retry_after = -1; + const char* markers[] = {"Retry-After:", "retry-after:"}; + for (const char* m : markers) { + auto pos = r.find(m); + if (pos == std::string::npos) continue; + pos += std::string(m).size(); + while (pos < r.size() && (r[pos] == ' ' || r[pos] == '\t')) ++pos; + int val = 0; + bool any = false; + while (pos < r.size() && r[pos] >= '0' && r[pos] <= '9') { + val = val * 10 + (r[pos] - '0'); + any = true; + ++pos; + } + if (any) { retry_after = val; break; } + } + + // Pre-fix the HALF_OPEN hint was hard-coded to ceil(base/1000)=1s. + // Post-fix, with base=50ms and consecutive_trips ~= 3, the next + // open duration is 50 << 3 = 400ms → ceil = 1s (still 1). With + // trips ~= 5, 50 << 5 = 1600ms → ceil = 2s. 
So we need enough + // trips to cross the second boundary. The exact count depends + // on which partition the requests hit (aggregated sharding). + // Assert at least that we saw a HALF_OPEN response and + // Retry-After is at least 1 and at most max/1000=8 — both + // conservative lower/upper bounds of the exponential formula. + bool retry_after_ok = (retry_after >= 1 && retry_after <= 8); + bool pass = is_half_open && retry_after_ok; + TestFramework::RecordTest( + "CB Phase 4: HALF_OPEN Retry-After exponential-aware", pass, + pass ? "" : + "is_half_open=" + std::to_string(is_half_open) + + " retry_after=" + std::to_string(retry_after)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 4: HALF_OPEN Retry-After exponential-aware", + false, e.what()); + } +} + void RunAllTests() { std::cout << "\n" << std::string(60, '=') << std::endl; std::cout << "CIRCUIT BREAKER PHASE 4 - INTEGRATION TESTS" << std::endl; @@ -1012,6 +1158,7 @@ void RunAllTests() { TestRetryAfterCapCeilsNonAlignedMax(); TestRetriedFailuresCountTowardTrip(); TestHalfOpenRejectLabel(); + TestHalfOpenRetryAfterScalesWithBackoff(); } } // namespace CircuitBreakerPhase4Tests From 5ee26fd6ed951330ab521fa0b6d2036829eb5d35 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 18:45:37 +0800 Subject: [PATCH 21/37] Fix review comment --- test/circuit_breaker_phase4_test.h | 235 +++++++++++++++++------------ 1 file changed, 142 insertions(+), 93 deletions(-) diff --git a/test/circuit_breaker_phase4_test.h b/test/circuit_breaker_phase4_test.h index 5b044952..5626b77a 100644 --- a/test/circuit_breaker_phase4_test.h +++ b/test/circuit_breaker_phase4_test.h @@ -84,8 +84,12 @@ void TestBreakerTripsAfterConsecutiveFailures() { ServerConfig gw; gw.bind_host = "127.0.0.1"; gw.bind_port = 0; - gw.worker_threads = 2; - gw.http2.enabled = false; // match the existing proxy test pattern // single thread → single breaker partition exercised + gw.worker_threads = 1; + gw.http2.enabled = 
false; + // worker_threads=1 → all TCP connections land on dispatcher 0 + // (NetServer shards new connections by fd%worker_threads), so + // per-request failures accumulate deterministically on slice[0] + // instead of splitting across multiple slices. // single thread → single breaker partition exercised gw.upstreams.push_back( MakeBreakerUpstream("bad-svc", "127.0.0.1", backend_port, /*enabled=*/true, /*threshold=*/3)); @@ -159,8 +163,12 @@ void TestBreakerDisabledPassesThrough() { ServerConfig gw; gw.bind_host = "127.0.0.1"; gw.bind_port = 0; - gw.worker_threads = 2; - gw.http2.enabled = false; // match the existing proxy test pattern + gw.worker_threads = 1; + gw.http2.enabled = false; + // worker_threads=1 → all TCP connections land on dispatcher 0 + // (NetServer shards new connections by fd%worker_threads), so + // per-request failures accumulate deterministically on slice[0] + // instead of splitting across multiple slices. gw.upstreams.push_back( MakeBreakerUpstream("svc", "127.0.0.1", backend_port, /*enabled=*/false, /*threshold=*/3)); @@ -203,7 +211,13 @@ void TestSuccessResetsConsecutiveFailureCounter() { try { std::atomic fail_mode{true}; HttpServer backend("127.0.0.1", 0); - backend.Get("/toggle", [&fail_mode](const HttpRequest&, HttpResponse& resp) { + // Backend must serve /fail — that's the exact-match route the + // proxy forwards (MakeBreakerUpstream sets route_prefix="/fail", + // strip_prefix=false). A different backend path would leave + // the gateway 404-ing every request without ever exercising + // the proxy, and the CLOSED-state assertion below would pass + // for the wrong reason. 
+ backend.Get("/fail", [&fail_mode](const HttpRequest&, HttpResponse& resp) { if (fail_mode.load()) { resp.Status(502).Body("err", "text/plain"); } else { @@ -216,8 +230,12 @@ void TestSuccessResetsConsecutiveFailureCounter() { ServerConfig gw; gw.bind_host = "127.0.0.1"; gw.bind_port = 0; - gw.worker_threads = 2; - gw.http2.enabled = false; // match the existing proxy test pattern + gw.worker_threads = 1; + gw.http2.enabled = false; + // worker_threads=1 → all TCP connections land on dispatcher 0 + // (NetServer shards new connections by fd%worker_threads), so + // per-request failures accumulate deterministically on slice[0] + // instead of splitting across multiple slices. gw.upstreams.push_back( MakeBreakerUpstream("svc", "127.0.0.1", backend_port, /*enabled=*/true, /*threshold=*/3)); @@ -229,28 +247,35 @@ void TestSuccessResetsConsecutiveFailureCounter() { // Pattern: F F S F F — 5 total: 2 fails, 1 success, 2 fails. // With reset semantics, consecutive_failures_ never exceeds 2 → no trip. for (int i = 0; i < 2; ++i) { - TestHttpClient::HttpGet(gw_port, "/echo/toggle", 3000); // FAIL + TestHttpClient::HttpGet(gw_port, "/fail", 3000); // FAIL } fail_mode.store(false); - TestHttpClient::HttpGet(gw_port, "/echo/toggle", 3000); // SUCCESS → reset + TestHttpClient::HttpGet(gw_port, "/fail", 3000); // SUCCESS → reset fail_mode.store(true); for (int i = 0; i < 2; ++i) { - TestHttpClient::HttpGet(gw_port, "/echo/toggle", 3000); // FAIL + TestHttpClient::HttpGet(gw_port, "/fail", 3000); // FAIL } - // Inspect the breaker's state directly — it should still be CLOSED. + // Inspect the breaker's state directly. The slice must be CLOSED + // AND must have observed activity — without the second check, a + // gateway that 404's every request (e.g. because the proxy route + // doesn't match) would also pass trivially. auto* cbm = gateway.GetUpstreamManager() ? gateway.GetUpstreamManager()->GetCircuitBreakerManager() : nullptr; auto* host = cbm ? 
cbm->GetHost("svc") : nullptr; auto* slice = host ? host->GetSlice(0) : nullptr; bool still_closed = slice && slice->CurrentState() == State::CLOSED; + // No trip fired: total_trips should be zero for this slice. + int64_t trips = slice ? slice->Trips() : -1; + bool no_trips = (trips == 0); + bool pass = still_closed && no_trips; TestFramework::RecordTest( - "CB Phase 4: success resets consecutive counter", still_closed, - still_closed ? "" : - "slice not CLOSED after S resets failures: state=" + - std::to_string(static_cast( - slice ? slice->CurrentState() : State::CLOSED))); + "CB Phase 4: success resets consecutive counter", pass, + pass ? "" : + "state=" + std::to_string(static_cast( + slice ? slice->CurrentState() : State::CLOSED)) + + " trips=" + std::to_string(trips)); } catch (const std::exception& e) { TestFramework::RecordTest( "CB Phase 4: success resets consecutive counter", false, e.what()); @@ -275,8 +300,12 @@ void TestTripDrivesSliceState() { ServerConfig gw; gw.bind_host = "127.0.0.1"; gw.bind_port = 0; - gw.worker_threads = 2; - gw.http2.enabled = false; // match the existing proxy test pattern + gw.worker_threads = 1; + gw.http2.enabled = false; + // worker_threads=1 → all TCP connections land on dispatcher 0 + // (NetServer shards new connections by fd%worker_threads), so + // per-request failures accumulate deterministically on slice[0] + // instead of splitting across multiple slices. 
gw.upstreams.push_back( MakeBreakerUpstream("svc", "127.0.0.1", backend_port, /*enabled=*/true, /*threshold=*/3)); @@ -338,8 +367,12 @@ void TestOpenBreakerShortCircuitsUpstreamCall() { ServerConfig gw; gw.bind_host = "127.0.0.1"; gw.bind_port = 0; - gw.worker_threads = 2; - gw.http2.enabled = false; // match the existing proxy test pattern + gw.worker_threads = 1; + gw.http2.enabled = false; + // worker_threads=1 → all TCP connections land on dispatcher 0 + // (NetServer shards new connections by fd%worker_threads), so + // per-request failures accumulate deterministically on slice[0] + // instead of splitting across multiple slices. gw.upstreams.push_back( MakeBreakerUpstream("svc", "127.0.0.1", backend_port, /*enabled=*/true, /*threshold=*/3)); @@ -388,7 +421,7 @@ void TestBareProxyWorks() { ServerConfig gw; gw.bind_host = "127.0.0.1"; gw.bind_port = 0; - gw.worker_threads = 2; + gw.worker_threads = 1; gw.http2.enabled = false; UpstreamConfig u; u.name = "svc"; @@ -442,7 +475,7 @@ void TestRetryAfterHeaderValue() { ServerConfig gw; gw.bind_host = "127.0.0.1"; gw.bind_port = 0; - gw.worker_threads = 2; + gw.worker_threads = 1; gw.http2.enabled = false; // base_open_duration 2000ms, max 60_000ms — Retry-After should // ceiling-round and fall inside [1, 60]. 
@@ -526,7 +559,7 @@ void TestCircuitOpenTerminalForRetry() { ServerConfig gw; gw.bind_host = "127.0.0.1"; gw.bind_port = 0; - gw.worker_threads = 2; + gw.worker_threads = 1; gw.http2.enabled = false; // Retries enabled on 5xx — if the breaker reject leaked into // MaybeRetry, the test would see extra backend hits after the @@ -592,7 +625,7 @@ void TestDryRunPassthrough() { ServerConfig gw; gw.bind_host = "127.0.0.1"; gw.bind_port = 0; - gw.worker_threads = 2; + gw.worker_threads = 1; gw.http2.enabled = false; auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, /*enabled=*/true, /*threshold=*/3); @@ -674,7 +707,7 @@ void TestHalfOpenRecoveryRoundTrip() { ServerConfig gw; gw.bind_host = "127.0.0.1"; gw.bind_port = 0; - gw.worker_threads = 2; + gw.worker_threads = 1; gw.http2.enabled = false; auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, /*enabled=*/true, /*threshold=*/3); @@ -770,7 +803,7 @@ void TestRetryAfterCapCeilsNonAlignedMax() { ServerConfig gw; gw.bind_host = "127.0.0.1"; gw.bind_port = 0; - gw.worker_threads = 2; + gw.worker_threads = 1; gw.http2.enabled = false; // Configure a non-second-aligned max backoff. base = 1500ms so // the actual OpenUntil-now at trip time is ~1.5s, which ceil- @@ -848,7 +881,7 @@ void TestRetriedFailuresCountTowardTrip() { ServerConfig gw; gw.bind_host = "127.0.0.1"; gw.bind_port = 0; - gw.worker_threads = 2; + gw.worker_threads = 1; gw.http2.enabled = false; // Retries on 5xx enabled. 
threshold=3 — with retry_on_5xx, each // client request produces 1 + max_retries=3 = 4 upstream @@ -925,7 +958,7 @@ void TestHalfOpenRejectLabel() { ServerConfig gw; gw.bind_host = "127.0.0.1"; gw.bind_port = 0; - gw.worker_threads = 2; + gw.worker_threads = 1; gw.http2.enabled = false; auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, /*enabled=*/true, /*threshold=*/3); @@ -996,34 +1029,30 @@ void TestHalfOpenRejectLabel() { // --------------------------------------------------------------------------- // Test 14: HALF_OPEN Retry-After reflects the current exponential backoff, -// not just base_open_duration_ms. After multiple trips, the next OPEN window -// (if the probe cycle fails) is base << consecutive_trips, clamped by -// max_open_duration_ms. Advertising bare base would under-report the worst- -// case wait by a factor of 2^n. +// not just base_open_duration_ms. After multiple trips the next OPEN window +// (base << consecutive_trips_, clamped by max) can exceed 1 second; the old +// base-only hint (ceil(base/1000) = 1s for base=100ms) would under-report +// the worst-case wait, which this test must fail for. // -// Strategy: trip → recover → trip → recover → trip to drive consecutive_trips -// up. Then hit HALF_OPEN during the next OPEN window elapse and assert -// Retry-After > base seconds. +// Strategy: keep the backend failing and drive MULTIPLE re-trips by letting +// the OPEN window elapse and single probe fail each cycle. Successful +// recoveries must be avoided — TransitionHalfOpenToClosed resets +// consecutive_trips_ to 0, which hides the exponential hint. // --------------------------------------------------------------------------- void TestHalfOpenRetryAfterScalesWithBackoff() { std::cout << "\n[TEST] CB Phase 4: HALF_OPEN Retry-After exponential..." << std::endl; try { - // Backend hangs on demand so we can pin the probe slot and - // observe HALF_OPEN rejections. + // Backend fails fast by default. 
When `hang` is set, the + // handler blocks — used at the end to pin the probe slot so + // a concurrent request observes HALF_OPEN rejection. std::atomic hang{false}; - std::atomic fail_mode{true}; HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [&hang, &fail_mode](const HttpRequest&, - HttpResponse& resp) { + backend.Get("/fail", [&hang](const HttpRequest&, HttpResponse& resp) { if (hang.load()) { - std::this_thread::sleep_for(std::chrono::milliseconds(800)); - } - if (fail_mode.load()) { - resp.Status(502).Body("err", "text/plain"); - } else { - resp.Status(200).Body("ok", "text/plain"); + std::this_thread::sleep_for(std::chrono::milliseconds(1500)); } + resp.Status(502).Body("err", "text/plain"); }); TestServerRunner backend_runner(backend); int backend_port = backend_runner.GetPort(); @@ -1031,14 +1060,8 @@ void TestHalfOpenRetryAfterScalesWithBackoff() { ServerConfig gw; gw.bind_host = "127.0.0.1"; gw.bind_port = 0; - gw.worker_threads = 2; + gw.worker_threads = 1; // pin all traffic to slice[0] gw.http2.enabled = false; - // base=100ms, max=5000ms. After 3 trips the next duration is - // 100 << 3 = 800ms (< max), so HALF_OPEN's hint should be - // ceil(800/1000)=1s. But we only need to validate that the - // hint is >= 1s (which base alone would also produce from - // ceil(100/1000)=1). To get an observable difference, use a - // smaller base (50ms) and enough trips that 50 << N > 1000ms. auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, /*enabled=*/true, /*threshold=*/2); u.circuit_breaker.base_open_duration_ms = 100; // config minimum @@ -1050,46 +1073,78 @@ void TestHalfOpenRetryAfterScalesWithBackoff() { TestServerRunner gw_runner(gateway); int gw_port = gw_runner.GetPort(); - // Trip 1: two consecutive failures. - for (int i = 0; i < 2; ++i) { - TestHttpClient::HttpGet(gw_port, "/fail", 3000); + auto* cbm = gateway.GetUpstreamManager() ? + gateway.GetUpstreamManager()->GetCircuitBreakerManager() : nullptr; + auto* host = cbm ? 
cbm->GetHost("svc") : nullptr; + auto* slice = host ? host->GetSlice(0) : nullptr; + if (!slice) { + TestFramework::RecordTest( + "CB Phase 4: HALF_OPEN Retry-After exponential-aware", + false, "slice lookup failed"); + return; } - // Wait past base (50ms → open window) so slice goes HALF_OPEN. - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - // Recovery: one probe success (flip fail_mode briefly). - fail_mode.store(false); - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - fail_mode.store(true); - - // Trip 2: two more failures. + // Initial trip: 2 consecutive failures with threshold=2. for (int i = 0; i < 2; ++i) { TestHttpClient::HttpGet(gw_port, "/fail", 3000); } - std::this_thread::sleep_for(std::chrono::milliseconds(250)); - - // Recovery again. - fail_mode.store(false); - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - fail_mode.store(true); - // Trip 3: two more failures. consecutive_trips should now be - // high enough that base << trips > 1000ms — HALF_OPEN hint - // should be >= 1 but potentially larger. - for (int i = 0; i < 2; ++i) { + // Drive consecutive_trips_ up by letting successive OPEN windows + // elapse and probes fail (no recovery → no reset). Stop when + // NextOpenDurationMs crosses 1000ms, which is the threshold + // where the HALF_OPEN Retry-After hint starts exceeding the + // base-only value (ceil(100ms)=1s). + // + // The slice re-trips on each failed probe; each trip doubles + // the open duration. We run ~8 cycles with safety margin which + // is comfortably past the trip count needed for Retry-After>=2. + for (int cycle = 0; cycle < 8; ++cycle) { + // Wait past the current open window. Upper bound: max=8s, + // so 1200ms is plenty for the first few short cycles, and + // we re-check after each request anyway. 
+ int64_t next_ms = slice->NextOpenDurationMs(); + // Current OPEN window is the one stored BEFORE the upcoming + // re-trip — we don't have that directly, so sleep past the + // NEXT duration as an over-approximation (next is always >= + // current). This ensures OPEN has elapsed. + auto sleep_ms = std::max(next_ms + 50, 200); + if (sleep_ms > 2000) sleep_ms = 2000; // cap per cycle + std::this_thread::sleep_for(std::chrono::milliseconds(sleep_ms)); + + // One request — it should admit as a probe (HALF_OPEN), + // the backend fails fast (502), probe fails → re-trip with + // consecutive_trips_++ and fresh OPEN. TestHttpClient::HttpGet(gw_port, "/fail", 3000); + + // Bail early once the exponential hint crosses 1s → the + // subsequent HALF_OPEN reject will carry Retry-After >= 2. + if (slice->NextOpenDurationMs() >= 2000) break; } - // Wait for the open window to elapse and next admission - // transitions HALF_OPEN. - std::this_thread::sleep_for(std::chrono::milliseconds(500)); - // Pin the probe slot with a hanging request so subsequent - // requests get HALF_OPEN rejects. + int64_t next_open_ms = slice->NextOpenDurationMs(); + if (next_open_ms < 2000) { + TestFramework::RecordTest( + "CB Phase 4: HALF_OPEN Retry-After exponential-aware", + false, + "setup failed: next_open_ms=" + std::to_string(next_open_ms) + + " (need >= 2000 to distinguish from base-only hint)"); + return; + } + + // Now trigger a HALF_OPEN reject: wait for current OPEN to + // elapse, start a hanging probe (pins the slot), then fire a + // sibling request — it must see half_open_full with the + // exponential Retry-After. 
+ int64_t post_wait_ms = next_open_ms + 100; + if (post_wait_ms > 4000) post_wait_ms = 4000; + std::this_thread::sleep_for(std::chrono::milliseconds(post_wait_ms)); + hang.store(true); std::thread probe([&]() { - TestHttpClient::HttpGet(gw_port, "/fail", 3000); + TestHttpClient::HttpGet(gw_port, "/fail", 3500); }); - std::this_thread::sleep_for(std::chrono::milliseconds(100)); + // Let the probe get admitted and start hanging. + std::this_thread::sleep_for(std::chrono::milliseconds(200)); std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 1500); hang.store(false); @@ -1099,7 +1154,6 @@ void TestHalfOpenRetryAfterScalesWithBackoff() { r.find("X-Circuit-Breaker: half_open") != std::string::npos || r.find("x-circuit-breaker: half_open") != std::string::npos; - // Extract Retry-After. int retry_after = -1; const char* markers[] = {"Retry-After:", "retry-after:"}; for (const char* m : markers) { @@ -1117,22 +1171,17 @@ void TestHalfOpenRetryAfterScalesWithBackoff() { if (any) { retry_after = val; break; } } - // Pre-fix the HALF_OPEN hint was hard-coded to ceil(base/1000)=1s. - // Post-fix, with base=50ms and consecutive_trips ~= 3, the next - // open duration is 50 << 3 = 400ms → ceil = 1s (still 1). With - // trips ~= 5, 50 << 5 = 1600ms → ceil = 2s. So we need enough - // trips to cross the second boundary. The exact count depends - // on which partition the requests hit (aggregated sharding). - // Assert at least that we saw a HALF_OPEN response and - // Retry-After is at least 1 and at most max/1000=8 — both - // conservative lower/upper bounds of the exponential formula. - bool retry_after_ok = (retry_after >= 1 && retry_after <= 8); + // Post-fix: Retry-After = ceil(next_open_ms / 1000) >= 2. + // Pre-fix (base-only): Retry-After = ceil(base/1000) = 1. + // Asserting >= 2 fails the pre-fix implementation. 
+ bool retry_after_ok = (retry_after >= 2 && retry_after <= 8); bool pass = is_half_open && retry_after_ok; TestFramework::RecordTest( "CB Phase 4: HALF_OPEN Retry-After exponential-aware", pass, pass ? "" : "is_half_open=" + std::to_string(is_half_open) + - " retry_after=" + std::to_string(retry_after)); + " retry_after=" + std::to_string(retry_after) + + " next_open_ms=" + std::to_string(next_open_ms)); } catch (const std::exception& e) { TestFramework::RecordTest( "CB Phase 4: HALF_OPEN Retry-After exponential-aware", From f08fbe358c6e3bba32bb610aed0ee8317bf7f660 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 19:39:27 +0800 Subject: [PATCH 22/37] Finished Phase 5: Retry budget integration --- Makefile | 2 +- include/upstream/proxy_transaction.h | 39 ++- server/proxy_transaction.cc | 69 ++++- test/circuit_breaker_phase5_test.h | 366 +++++++++++++++++++++++++++ test/run_test.cc | 7 +- 5 files changed, 474 insertions(+), 9 deletions(-) create mode 100644 test/circuit_breaker_phase5_test.h diff --git a/Makefile b/Makefile index 2dbd8c2a..80f5f9a1 100644 --- a/Makefile +++ b/Makefile @@ -147,7 +147,7 @@ UPSTREAM_HEADERS = $(LIB_DIR)/upstream/upstream_manager.h $(LIB_DIR)/upstream/up RATE_LIMIT_HEADERS = $(LIB_DIR)/rate_limit/token_bucket.h $(LIB_DIR)/rate_limit/rate_limit_zone.h $(LIB_DIR)/rate_limit/rate_limiter.h CIRCUIT_BREAKER_HEADERS = $(LIB_DIR)/circuit_breaker/circuit_breaker_state.h $(LIB_DIR)/circuit_breaker/circuit_breaker_window.h $(LIB_DIR)/circuit_breaker/circuit_breaker_slice.h $(LIB_DIR)/circuit_breaker/retry_budget.h $(LIB_DIR)/circuit_breaker/circuit_breaker_host.h $(LIB_DIR)/circuit_breaker/circuit_breaker_manager.h CLI_HEADERS = $(LIB_DIR)/cli/cli_parser.h $(LIB_DIR)/cli/signal_handler.h $(LIB_DIR)/cli/pid_file.h $(LIB_DIR)/cli/version.h $(LIB_DIR)/cli/daemonizer.h -TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h 
$(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h $(TEST_DIR)/proxy_test.h $(TEST_DIR)/rate_limit_test.h $(TEST_DIR)/kqueue_test.h $(TEST_DIR)/circuit_breaker_test.h $(TEST_DIR)/circuit_breaker_phase3_test.h $(TEST_DIR)/circuit_breaker_phase4_test.h +TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h $(TEST_DIR)/proxy_test.h $(TEST_DIR)/rate_limit_test.h $(TEST_DIR)/kqueue_test.h $(TEST_DIR)/circuit_breaker_test.h $(TEST_DIR)/circuit_breaker_phase3_test.h $(TEST_DIR)/circuit_breaker_phase4_test.h $(TEST_DIR)/circuit_breaker_phase5_test.h # All headers combined HEADERS = $(CORE_HEADERS) $(CALLBACK_HEADERS) $(REACTOR_HEADERS) $(NETWORK_HEADERS) $(SERVER_HEADERS) $(THREAD_POOL_HEADERS) $(UTIL_HEADERS) $(FOUNDATION_HEADERS) $(HTTP_HEADERS) $(HTTP2_HEADERS) $(WS_HEADERS) $(TLS_HEADERS) $(UPSTREAM_HEADERS) $(RATE_LIMIT_HEADERS) $(CIRCUIT_BREAKER_HEADERS) $(CLI_HEADERS) $(TEST_HEADERS) diff --git a/include/upstream/proxy_transaction.h b/include/upstream/proxy_transaction.h index 40886be4..cded9b71 100644 --- a/include/upstream/proxy_transaction.h +++ b/include/upstream/proxy_transaction.h @@ -6,6 +6,7 @@ #include "upstream/header_rewriter.h" #include "upstream/retry_policy.h" #include "config/server_config.h" // ProxyConfig (stored by value) +#include "circuit_breaker/retry_budget.h" // RetryBudget::InFlightGuard (member-by-value) #include "http/http_callbacks.h" #include "http/http_response.h" // , , , , , provided by common.h @@ -17,7 +18,7 @@ class 
Dispatcher; namespace circuit_breaker { class CircuitBreakerSlice; -} +} // RetryBudget already defined via retry_budget.h class ProxyTransaction : public std::enable_shared_from_this { public: @@ -166,6 +167,23 @@ class ProxyTransaction : public std::enable_shared_from_this { // CircuitBreakerManager on HttpServer, which outlives this transaction. circuit_breaker::CircuitBreakerSlice* slice_ = nullptr; + // Per-host retry budget, resolved alongside `slice_` in Start() from + // the same CircuitBreakerHost. Null when there's no breaker attached + // for this service — in that case the transaction skips budget + // tracking entirely. Lifetime: the budget is owned by the host, + // which outlives this transaction (destruction order guaranteed by + // HttpServer member declaration). + circuit_breaker::RetryBudget* retry_budget_ = nullptr; + + // Per-attempt in-flight tracker. Held for the duration of each + // attempt (first try and retries alike). Replaced on every + // AttemptCheckout — move-assignment decrements the counter for the + // prior attempt and increments for the new one, so a retrying + // transaction stays at a single in_flight unit. Default-constructed + // guard is empty (counter_ = nullptr): used when retry_budget_ is + // null or before the first ConsultBreaker admission. + circuit_breaker::RetryBudget::InFlightGuard inflight_guard_; + // Per-ATTEMPT admission state. Reset on each call to ConsultBreaker(); // paired Report*() calls thread the `generation` back so the slice // can drop stale completions across state transitions (see @@ -175,11 +193,11 @@ class ProxyTransaction : public std::enable_shared_from_this { uint64_t admission_generation_ = 0; bool is_probe_ = false; - // TODO(phase-5): retry-budget token held by this transaction's most - // recent retry attempt. Phase 5 flips this to true on successful - // TryConsumeRetry and clears it on ReleaseRetry. 
Phase 4 declares - // the field so Cleanup() and Cancel() have something to check, but - // the retry loop does not yet consume the budget. + // Retry-budget token held by this transaction's current retry + // attempt (attempt_ > 0). Set true after a successful + // TryConsumeRetry in MaybeRetry; cleared by ReleaseRetryToken in + // Cleanup. Dry-run rejects proceed but the flag stays false — no + // token was consumed, so no ReleaseRetry is required. bool retry_token_held_ = false; // Internal methods @@ -257,4 +275,13 @@ class ProxyTransaction : public std::enable_shared_from_this { // admission_generation_ so a following ReportBreakerOutcome is a // no-op. void ReleaseBreakerAdmissionNeutral(); + + // Release the retry-budget token held by this attempt, if any. + // Idempotent via the retry_token_held_ flag — called from Cleanup + // between attempts (so the next retry's TryConsumeRetry sees a + // freshly-released counter) AND from the destructor / Cancel as + // safety nets. No-op when no budget was attached or no token was + // consumed (e.g. first attempt, or dry-run reject that didn't + // consume). + void ReleaseRetryToken(); }; diff --git a/server/proxy_transaction.cc b/server/proxy_transaction.cc index bc847368..020d898f 100644 --- a/server/proxy_transaction.cc +++ b/server/proxy_transaction.cc @@ -124,6 +124,13 @@ void ProxyTransaction::Start() { auto* host = cbm->GetHost(service_name_); if (host) { slice_ = host->GetSlice(static_cast(dispatcher_index_)); + // Retry budget is host-level (shared across partitions). + // Resolve from the same host so retry admission math stays + // consistent with the slice's dispatcher routing. Always + // non-null when the host exists (budget is unconditionally + // constructed by the host ctor). Null only when `host` + // itself is null. 
+ retry_budget_ = host->GetRetryBudget(); } } } @@ -146,6 +153,16 @@ void ProxyTransaction::AttemptCheckout() { return; } + // Track this attempt against the host-level retry budget's + // in_flight counter. Replaces any prior guard (from the previous + // attempt of the same transaction) — move-assignment decrements + // the old counter and takes ownership of the new, so a retrying + // transaction stays at exactly one in_flight unit throughout. No-op + // when retry_budget_ is null (no breaker attached for this service). + if (retry_budget_) { + inflight_guard_ = retry_budget_->TrackInFlight(); + } + auto self = shared_from_this(); // Lazily allocate the shared cancel token so the pool can drop @@ -671,7 +688,10 @@ void ProxyTransaction::MaybeRetry(RetryPolicy::RetryCondition condition) { client_fd_, service_name_, attempt_, static_cast(condition)); - // Release old lease, clear callbacks, poison if tainted + // Release old lease, clear callbacks, poison if tainted. + // Cleanup also releases any retry token held by the previous + // retry attempt (attempt_ > 1) so the next TryConsumeRetry sees + // a fresh counter. Cleanup(); codec_.Reset(); // Re-apply request method after reset — llhttp_init() zeroes @@ -680,6 +700,40 @@ void ProxyTransaction::MaybeRetry(RetryPolicy::RetryCondition condition) { codec_.SetRequestMethod(method_); poison_connection_ = false; + // Retry-budget gate. `attempt_ > 0` here is guaranteed — we + // just incremented. The budget bounds how many retries can be + // concurrently in flight against this upstream HOST (aggregated + // across all transactions for the service), preventing a retry + // storm from amplifying traffic to a struggling backend. + // + // Dry-run: log the would-reject but still proceed (consistent + // with REJECTED_OPEN_DRYRUN on the slice path). No token is + // consumed, so no ReleaseRetry is needed on the dry-run path. 
+ // + // Full mode: deliver the §12.2 retry-budget response (503 + + // X-Retry-Budget-Exhausted) and terminate. Does NOT call + // ReportBreakerOutcome — our own reject must not feed back + // into the slice's failure math. + if (retry_budget_) { + bool is_dry_run = slice_ && slice_->config().dry_run; + if (retry_budget_->TryConsumeRetry()) { + retry_token_held_ = true; + } else if (is_dry_run) { + logging::Get()->info( + "ProxyTransaction retry budget would-reject (dry-run) " + "client_fd={} service={} attempt={}", + client_fd_, service_name_, attempt_); + } else { + logging::Get()->warn( + "ProxyTransaction retry budget exhausted " + "client_fd={} service={} attempt={}", + client_fd_, service_name_, attempt_); + state_ = State::FAILED; + DeliverResponse(MakeRetryBudgetResponse()); + return; + } + } + // Condition-dependent first-retry policy: // Connection-level failures (stale keep-alive, connect refused) // are transient — a different pooled connection will succeed. @@ -862,6 +916,12 @@ void ProxyTransaction::Cancel() { } void ProxyTransaction::Cleanup() { + // Release any retry-budget token held by the attempt that just + // ended. Must happen BEFORE the next TryConsumeRetry in MaybeRetry + // so the new attempt sees accurate retries_in_flight. Idempotent + // via the retry_token_held_ flag. 
+ ReleaseRetryToken(); + if (lease_) { auto* conn = lease_.Get(); if (conn) { @@ -1166,6 +1226,13 @@ bool ProxyTransaction::ConsultBreaker() { return true; } +void ProxyTransaction::ReleaseRetryToken() { + if (retry_token_held_ && retry_budget_) { + retry_budget_->ReleaseRetry(); + } + retry_token_held_ = false; +} + void ProxyTransaction::ReleaseBreakerAdmissionNeutral() { if (!slice_ || admission_generation_ == 0) return; diff --git a/test/circuit_breaker_phase5_test.h b/test/circuit_breaker_phase5_test.h new file mode 100644 index 00000000..9b0c3f11 --- /dev/null +++ b/test/circuit_breaker_phase5_test.h @@ -0,0 +1,366 @@ +#pragma once + +// Phase 5 integration tests: retry budget wired into ProxyTransaction. +// +// Phase 3 covered the RetryBudget math (CAS, non-retry denominator, +// min-concurrency floor) as unit tests against the RetryBudget class in +// isolation. Phase 5 tests the INTEGRATION: ProxyTransaction resolves +// `retry_budget_` from the same CircuitBreakerHost as `slice_`, tracks +// every attempt's in_flight via the RAII guard, and consults +// `TryConsumeRetry` before each retry. Exhaustion emits the §12.2 +// response (503 + `X-Retry-Budget-Exhausted: 1`) and does NOT feed +// back into the slice's failure math. +// +// Strategy: backends that always 502 with `retry_on_5xx=true` drive the +// retry path. A near-zero retry-budget (`percent=0, min_concurrency=0`) +// rejects every retry deterministically without needing concurrent +// client load. The circuit-breaker consecutive-failure threshold is +// raised well above the retry count so the breaker stays CLOSED — the +// budget gate is tested in isolation from the state machine. 
+ +#include "test_framework.h" +#include "test_server_runner.h" +#include "http_test_client.h" +#include "http/http_server.h" +#include "config/server_config.h" + +#include +#include +#include +#include + +namespace CircuitBreakerPhase5Tests { + +// Upstream config that always proxies /fail, with the circuit breaker +// enabled so `retry_budget_` is resolved on `slice_`'s host. Breaker +// thresholds intentionally unreachable for these tests — we want the +// retry-budget gate fired in isolation, not co-tripping the state +// machine. +static UpstreamConfig MakeRetryBudgetUpstream(const std::string& name, + const std::string& host, + int port, + int retry_budget_percent, + int retry_budget_min_concurrency, + bool dry_run = false) { + UpstreamConfig u; + u.name = name; + u.host = host; + u.port = port; + u.pool.max_connections = 16; + u.pool.max_idle_connections = 8; + u.pool.connect_timeout_ms = 3000; + u.pool.idle_timeout_sec = 30; + u.pool.max_lifetime_sec = 3600; + u.pool.max_requests_per_conn = 0; + + u.proxy.route_prefix = "/fail"; + u.proxy.strip_prefix = false; + u.proxy.response_timeout_ms = 2000; + + u.circuit_breaker.enabled = true; + u.circuit_breaker.dry_run = dry_run; + // Breaker thresholds unreachable — we don't want the state machine + // tripping during a retry-budget test. 
+ u.circuit_breaker.consecutive_failure_threshold = 10000; + u.circuit_breaker.failure_rate_threshold = 100; + u.circuit_breaker.minimum_volume = 10000; + u.circuit_breaker.window_seconds = 10; + u.circuit_breaker.permitted_half_open_calls = 2; + u.circuit_breaker.base_open_duration_ms = 30000; + u.circuit_breaker.max_open_duration_ms = 60000; + + u.circuit_breaker.retry_budget_percent = retry_budget_percent; + u.circuit_breaker.retry_budget_min_concurrency = retry_budget_min_concurrency; + return u; +} + +static bool HasRetryBudgetHeader(const std::string& response) { + return response.find("X-Retry-Budget-Exhausted: 1") != std::string::npos || + response.find("x-retry-budget-exhausted: 1") != std::string::npos; +} + +// --------------------------------------------------------------------------- +// Test 1: A retry attempt rejected by the retry-budget gate delivers 503 + +// X-Retry-Budget-Exhausted instead of the upstream's 5xx. Verifies that +// `TryConsumeRetry` runs BEFORE the retry executes and that +// `MakeRetryBudgetResponse` is emitted through the standard DeliverResponse +// path. +// +// retry_budget_percent=0 + retry_budget_min_concurrency=0 → cap = 0. Every +// retry attempt's TryConsumeRetry returns false. First attempt is +// unaffected (budget only gates retries), so the backend is hit exactly +// once per client request; the retry is short-circuited locally. +// --------------------------------------------------------------------------- +void TestRetryBudgetRejectsRetry() { + std::cout << "\n[TEST] CB Phase 5: retry budget rejects retry..." 
+ << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + auto u = MakeRetryBudgetUpstream("svc", "127.0.0.1", backend_port, + /*percent=*/0, + /*min_concurrency=*/0); + u.proxy.retry.max_retries = 3; + u.proxy.retry.retry_on_5xx = true; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 5000); + + bool is_503 = TestHttpClient::HasStatus(r, 503); + bool has_budget_hdr = HasRetryBudgetHeader(r); + // Backend should have been hit exactly once (the first attempt); + // every retry was short-circuited by the budget gate. + int hits = backend_hits.load(std::memory_order_relaxed); + bool single_backend_hit = (hits == 1); + + bool pass = is_503 && has_budget_hdr && single_backend_hit; + TestFramework::RecordTest( + "CB Phase 5: retry budget rejects retry", pass, + pass ? "" : + "is_503=" + std::to_string(is_503) + + " budget_hdr=" + std::to_string(has_budget_hdr) + + " backend_hits=" + std::to_string(hits) + + " body=" + r.substr(0, 256)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 5: retry budget rejects retry", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 2: The min-concurrency floor admits retries even when the %-based +// cap would be zero. 
With percent=0 + min_concurrency=5, a single sequential +// client request's retry chain (1 first + 3 retries = 4 backend hits) all +// fit under the floor and proceed normally to the upstream — no 503, no +// X-Retry-Budget-Exhausted, and the client sees the final 5xx response. +// +// This is the symmetric test to Test 1: same near-zero %-cap, but a floor +// large enough that retries aren't budget-gated. Proves the floor is +// consulted (retries admitted) instead of the %-cap (retries rejected). +// --------------------------------------------------------------------------- +void TestRetryBudgetMinConcurrencyFloor() { + std::cout << "\n[TEST] CB Phase 5: retry budget min-concurrency floor..." + << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + // percent=0 → no %-based capacity. min_concurrency=5 → floor + // admits up to 5 concurrent retries, easily covering the 3 + // sequential retries from a single client request. + auto u = MakeRetryBudgetUpstream("svc", "127.0.0.1", backend_port, + /*percent=*/0, + /*min_concurrency=*/5); + u.proxy.retry.max_retries = 3; + u.proxy.retry.retry_on_5xx = true; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 10000); + + // Client sees the upstream's final 502 — no local 503, no + // X-Retry-Budget-Exhausted. 
+ bool is_502 = TestHttpClient::HasStatus(r, 502); + bool no_budget_hdr = !HasRetryBudgetHeader(r); + // 1 first attempt + 3 retries admitted by the floor = 4 backend hits. + int hits = backend_hits.load(std::memory_order_relaxed); + bool all_retries_proceeded = (hits == 4); + + bool pass = is_502 && no_budget_hdr && all_retries_proceeded; + TestFramework::RecordTest( + "CB Phase 5: retry budget min-concurrency floor", pass, + pass ? "" : + "is_502=" + std::to_string(is_502) + + " no_budget_hdr=" + std::to_string(no_budget_hdr) + + " backend_hits=" + std::to_string(hits) + + " body=" + r.substr(0, 256)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 5: retry budget min-concurrency floor", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 3: Dry-run bypasses the retry-budget gate. +// +// With percent=0 + min_concurrency=0 (same as Test 1), TryConsumeRetry +// returns false for every retry. But `circuit_breaker.dry_run=true` +// switches the rejection path to a log-and-proceed: no token is +// consumed, retry_token_held_ stays false, and AttemptCheckout runs as +// though the budget was unlimited. +// +// Result: the client sees the upstream's 502 response (because the +// retries actually fire), NOT a 503 + X-Retry-Budget-Exhausted. +// --------------------------------------------------------------------------- +void TestRetryBudgetDryRunPassthrough() { + std::cout << "\n[TEST] CB Phase 5: retry budget dry-run passthrough..." 
+ << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + auto u = MakeRetryBudgetUpstream("svc", "127.0.0.1", backend_port, + /*percent=*/0, + /*min_concurrency=*/0, + /*dry_run=*/true); + u.proxy.retry.max_retries = 2; + u.proxy.retry.retry_on_5xx = true; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 10000); + + // Retries proceeded despite would-reject decisions — the client + // sees the upstream's final 502, not our local 503. + bool is_502 = TestHttpClient::HasStatus(r, 502); + bool no_budget_hdr = !HasRetryBudgetHeader(r); + int hits = backend_hits.load(std::memory_order_relaxed); + bool all_attempts_ran = (hits == 3); // 1 first + 2 retries + + bool pass = is_502 && no_budget_hdr && all_attempts_ran; + TestFramework::RecordTest( + "CB Phase 5: retry budget dry-run passthrough", pass, + pass ? "" : + "is_502=" + std::to_string(is_502) + + " no_budget_hdr=" + std::to_string(no_budget_hdr) + + " backend_hits=" + std::to_string(hits) + + " body=" + r.substr(0, 256)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 5: retry budget dry-run passthrough", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 4: First attempts are NOT budget-gated. +// +// The retry-budget cap applies only to retries (attempt_ > 0). 
First +// attempts call TrackInFlight (which only ever increments) but skip +// TryConsumeRetry entirely. With percent=0 + min_concurrency=0 and a +// backend that always 200s, every client request must succeed — if the +// gate accidentally ran on first attempts, we'd see 503s here. +// +// Guards against a regression where TryConsumeRetry is called before +// the `attempt_ > 0` gate, or where the gate is placed in +// AttemptCheckout instead of MaybeRetry. +// --------------------------------------------------------------------------- +void TestFirstAttemptsNotGated() { + std::cout << "\n[TEST] CB Phase 5: first attempts not gated..." + << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + resp.Status(200).Body("ok", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + auto u = MakeRetryBudgetUpstream("svc", "127.0.0.1", backend_port, + /*percent=*/0, + /*min_concurrency=*/0); + // No retries — every request is a first attempt. + u.proxy.retry.max_retries = 0; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + int client_count = 5; + int successes = 0; + for (int i = 0; i < client_count; ++i) { + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + if (TestHttpClient::HasStatus(r, 200)) ++successes; + if (HasRetryBudgetHeader(r)) { + // Any X-Retry-Budget-Exhausted on a first-attempt-only + // path is a bug. Record and bail. 
+ TestFramework::RecordTest( + "CB Phase 5: first attempts not gated", false, + "unexpected X-Retry-Budget-Exhausted on first-attempt path " + "i=" + std::to_string(i)); + return; + } + } + + int hits = backend_hits.load(std::memory_order_relaxed); + bool pass = (successes == client_count) && (hits == client_count); + TestFramework::RecordTest( + "CB Phase 5: first attempts not gated", pass, + pass ? "" : + "successes=" + std::to_string(successes) + + "/" + std::to_string(client_count) + + " backend_hits=" + std::to_string(hits)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 5: first attempts not gated", false, e.what()); + } +} + +void RunAllTests() { + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "CIRCUIT BREAKER PHASE 5 - RETRY BUDGET INTEGRATION TESTS" + << std::endl; + std::cout << std::string(60, '=') << std::endl; + + TestRetryBudgetRejectsRetry(); + TestRetryBudgetMinConcurrencyFloor(); + TestRetryBudgetDryRunPassthrough(); + TestFirstAttemptsNotGated(); +} + +} // namespace CircuitBreakerPhase5Tests diff --git a/test/run_test.cc b/test/run_test.cc index ab7bdb9b..34d54367 100644 --- a/test/run_test.cc +++ b/test/run_test.cc @@ -16,6 +16,7 @@ #include "circuit_breaker_test.h" #include "circuit_breaker_phase3_test.h" #include "circuit_breaker_phase4_test.h" +#include "circuit_breaker_phase5_test.h" #include "test_framework.h" #include #include @@ -90,6 +91,9 @@ void RunAllTest(){ // ProxyTransaction + UpstreamManager + HttpServer) CircuitBreakerPhase4Tests::RunAllTests(); + // Run circuit breaker Phase 5 retry-budget integration tests + CircuitBreakerPhase5Tests::RunAllTests(); + std::cout << "====================================\n" << std::endl; } @@ -168,11 +172,12 @@ int main(int argc, char* argv[]) { // Run rate limit tests }else if(mode == "rate_limit" || mode == "-L"){ RateLimitTests::RunAllTests(); - // Run circuit breaker tests (phases 1-4: unit + phase3 + phase4) + // Run circuit breaker 
tests (phases 1-5: unit + phase3 + phase4 + phase5) }else if(mode == "circuit_breaker" || mode == "-B"){ CircuitBreakerTests::RunAllTests(); CircuitBreakerPhase3Tests::RunAllTests(); CircuitBreakerPhase4Tests::RunAllTests(); + CircuitBreakerPhase5Tests::RunAllTests(); // Show help }else if(mode == "help" || mode == "-h" || mode == "--help"){ PrintUsage(argv[0]); From 277b039c66a97791131e67c5569ecc864eb36234 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 20:04:01 +0800 Subject: [PATCH 23/37] Finished Phase 6: Wait-queue drain on trip --- Makefile | 2 +- include/upstream/pool_partition.h | 22 +++ include/upstream/upstream_manager.h | 10 ++ server/http_server.cc | 53 ++++++ server/pool_partition.cc | 35 ++++ server/upstream_manager.cc | 10 ++ test/circuit_breaker_phase6_test.h | 261 ++++++++++++++++++++++++++++ test/run_test.cc | 7 +- 8 files changed, 398 insertions(+), 2 deletions(-) create mode 100644 test/circuit_breaker_phase6_test.h diff --git a/Makefile b/Makefile index 80f5f9a1..45993b3b 100644 --- a/Makefile +++ b/Makefile @@ -147,7 +147,7 @@ UPSTREAM_HEADERS = $(LIB_DIR)/upstream/upstream_manager.h $(LIB_DIR)/upstream/up RATE_LIMIT_HEADERS = $(LIB_DIR)/rate_limit/token_bucket.h $(LIB_DIR)/rate_limit/rate_limit_zone.h $(LIB_DIR)/rate_limit/rate_limiter.h CIRCUIT_BREAKER_HEADERS = $(LIB_DIR)/circuit_breaker/circuit_breaker_state.h $(LIB_DIR)/circuit_breaker/circuit_breaker_window.h $(LIB_DIR)/circuit_breaker/circuit_breaker_slice.h $(LIB_DIR)/circuit_breaker/retry_budget.h $(LIB_DIR)/circuit_breaker/circuit_breaker_host.h $(LIB_DIR)/circuit_breaker/circuit_breaker_manager.h CLI_HEADERS = $(LIB_DIR)/cli/cli_parser.h $(LIB_DIR)/cli/signal_handler.h $(LIB_DIR)/cli/pid_file.h $(LIB_DIR)/cli/version.h $(LIB_DIR)/cli/daemonizer.h -TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h 
$(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h $(TEST_DIR)/proxy_test.h $(TEST_DIR)/rate_limit_test.h $(TEST_DIR)/kqueue_test.h $(TEST_DIR)/circuit_breaker_test.h $(TEST_DIR)/circuit_breaker_phase3_test.h $(TEST_DIR)/circuit_breaker_phase4_test.h $(TEST_DIR)/circuit_breaker_phase5_test.h +TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h $(TEST_DIR)/proxy_test.h $(TEST_DIR)/rate_limit_test.h $(TEST_DIR)/kqueue_test.h $(TEST_DIR)/circuit_breaker_test.h $(TEST_DIR)/circuit_breaker_phase3_test.h $(TEST_DIR)/circuit_breaker_phase4_test.h $(TEST_DIR)/circuit_breaker_phase5_test.h $(TEST_DIR)/circuit_breaker_phase6_test.h # All headers combined HEADERS = $(CORE_HEADERS) $(CALLBACK_HEADERS) $(REACTOR_HEADERS) $(NETWORK_HEADERS) $(SERVER_HEADERS) $(THREAD_POOL_HEADERS) $(UTIL_HEADERS) $(FOUNDATION_HEADERS) $(HTTP_HEADERS) $(HTTP2_HEADERS) $(WS_HEADERS) $(TLS_HEADERS) $(UPSTREAM_HEADERS) $(RATE_LIMIT_HEADERS) $(CIRCUIT_BREAKER_HEADERS) $(CLI_HEADERS) $(TEST_HEADERS) diff --git a/include/upstream/pool_partition.h b/include/upstream/pool_partition.h index f259204a..d23904ab 100644 --- a/include/upstream/pool_partition.h +++ b/include/upstream/pool_partition.h @@ -90,6 +90,28 @@ class PoolPartition { // completion. Same pattern as ScheduleInitiateShutdown. void ScheduleForceCloseActive(); + // Drain the wait queue on a CLOSED → OPEN breaker trip. 
+ // + // Every live waiter receives CHECKOUT_CIRCUIT_OPEN (mapped by + // ProxyTransaction::OnCheckoutError to RESULT_CIRCUIT_OPEN, emitting + // the §12.1 circuit-open response). Cancelled waiters are dropped + // silently — the transaction already tore its side down via the + // framework abort hook. Does NOT set shutting_down_ (this is a + // transient drain, not a shutdown); the partition keeps its + // connections for HALF_OPEN probing when the open window elapses. + // + // Dispatcher-thread-only. The breaker's transition callback fires + // on the slice's owning dispatcher thread — the SAME dispatcher + // that owns this partition (one slice ↔ one partition by + // dispatcher_index). No enqueue needed. + // + // Rationale: without this drain, a queued waiter admitted by + // ConsultBreaker just before the trip would wait out the full + // `open_duration_ms` (up to 60s by default) before the pool's + // queue timeout rejects it. That's a visible latency spike for + // clients who are about to be served 503 anyway. + void DrainWaitQueueOnTrip(); + bool IsShuttingDown() const { return shutting_down_; } // Stats (dispatcher-thread-only reads) diff --git a/include/upstream/upstream_manager.h b/include/upstream/upstream_manager.h index f647d3b3..346bc4d5 100644 --- a/include/upstream/upstream_manager.h +++ b/include/upstream/upstream_manager.h @@ -63,6 +63,16 @@ class UpstreamManager { // Check if an upstream service is configured bool HasUpstream(const std::string& service_name) const; + // Look up the PoolPartition for (service_name, dispatcher_index). + // Returns nullptr if service is unknown or dispatcher_index is out + // of range. Used by the circuit-breaker transition callback (wired + // in HttpServer::MarkServerReady) to drain the wait queue on a + // CLOSED → OPEN trip. Must be called on the dispatcher thread + // identified by `dispatcher_index` — the returned partition's + // DrainWaitQueueOnTrip is dispatcher-thread-only. 
+ PoolPartition* GetPoolPartition(const std::string& service_name, + size_t dispatcher_index); + // Install a non-owning pointer to the server's CircuitBreakerManager. // Called once from HttpServer::MarkServerReady after both managers are // constructed (§3.1). Lifetime guarantee: the CircuitBreakerManager diff --git a/server/http_server.cc b/server/http_server.cc index fbf06947..ccd8b80f 100644 --- a/server/http_server.cc +++ b/server/http_server.cc @@ -6,6 +6,9 @@ #include "upstream/upstream_manager.h" #include "upstream/proxy_handler.h" #include "circuit_breaker/circuit_breaker_manager.h" +#include "circuit_breaker/circuit_breaker_host.h" +#include "circuit_breaker/circuit_breaker_slice.h" +#include "upstream/pool_partition.h" #include "log/logger.h" #include "log/log_utils.h" #include @@ -377,6 +380,56 @@ void HttpServer::MarkServerReady() { upstream_configs_, dispatchers.size(), dispatchers); upstream_manager_->AttachCircuitBreakerManager( circuit_breaker_manager_.get()); + + // Wire CLOSED→OPEN transition callbacks for every slice of every + // host — regardless of `enabled=false`, per design §3.1 R3-1. A + // disabled slice never fires transitions (TryAcquire short- + // circuits to ADMITTED); wiring the callback costs nothing but + // lets a live reload flip enable=false→true without re-wiring. + // + // The callback routes trip events to the corresponding + // PoolPartition's DrainWaitQueueOnTrip so queued waiters fail + // fast with CHECKOUT_CIRCUIT_OPEN instead of waiting out the + // open window. Each slice gets a distinct callback that + // captures its (service, dispatcher_index) pair — we can't use + // SetTransitionCallbackOnAllSlices because that would install a + // single callback across slices that need different partition + // lookups. 
+ // + // Safe to capture raw `UpstreamManager*`: CircuitBreakerManager + // destructs BEFORE UpstreamManager (§3.1 ownership), and slice + // callbacks only fire on dispatcher threads which are stopped + // before either manager is destroyed. So any live callback + // invocation sees a valid UpstreamManager. + UpstreamManager* um = upstream_manager_.get(); + for (const auto& u : upstream_configs_) { + auto* host = circuit_breaker_manager_->GetHost(u.name); + if (!host) continue; + std::string service = u.name; + for (size_t i = 0; i < host->partition_count(); ++i) { + auto* slice = host->GetSlice(i); + if (!slice) continue; + slice->SetTransitionCallback( + [um, service, i](circuit_breaker::State old_s, + circuit_breaker::State new_s, + const char* /*trigger*/) { + // Drain only on CLOSED→OPEN. HALF_OPEN→OPEN + // doesn't need draining — in HALF_OPEN, non- + // probe admissions are already REJECTED_OPEN + // before reaching the pool queue, so the + // queue stays empty (or holds only probes, + // which are in-flight by the time HALF_OPEN + // trips back). + if (old_s == circuit_breaker::State::CLOSED && + new_s == circuit_breaker::State::OPEN) { + if (auto* part = um->GetPoolPartition( + service, i)) { + part->DrainWaitQueueOnTrip(); + } + } + }); + } + } } catch (...) { logging::Get()->error( "Circuit breaker init failed, stopping server"); diff --git a/server/pool_partition.cc b/server/pool_partition.cc index 819c941d..a0ba866c 100644 --- a/server/pool_partition.cc +++ b/server/pool_partition.cc @@ -549,6 +549,41 @@ void PoolPartition::InitiateShutdown() { MaybeSignalDrain(); } +void PoolPartition::DrainWaitQueueOnTrip() { + // Hoist alive_ — a waiter's error_callback may synchronously trigger + // a request completion path that tears down the partition (e.g. the + // test harness). Same pattern used by InitiateShutdown. 
+ auto alive = alive_; + + if (shutting_down_) { + // Already draining via InitiateShutdown — that path will send + // CHECKOUT_SHUTTING_DOWN to every waiter. Don't double-fire. + return; + } + + if (wait_queue_.empty()) return; + + logging::Get()->info( + "PoolPartition draining wait queue on breaker trip: {}:{} " + "queue_size={}", + upstream_host_, upstream_port_, wait_queue_.size()); + + while (!wait_queue_.empty()) { + auto entry = std::move(wait_queue_.front()); + wait_queue_.pop_front(); + // Cancelled waiters have no callback to fire — the transaction + // already tore its side down via the framework abort hook. + if (IsEntryCancelled(entry)) { + continue; + } + // CHECKOUT_CIRCUIT_OPEN — ProxyTransaction::OnCheckoutError maps + // to RESULT_CIRCUIT_OPEN and delivers MakeCircuitOpenResponse() + // without touching the breaker (our own reject, don't feed back). + entry.error_callback(CHECKOUT_CIRCUIT_OPEN); + if (!alive->load(std::memory_order_acquire)) return; + } +} + void PoolPartition::ForceCloseActive() { // Collect transports + borrower callbacks, then move to zombie, then // close transports, then notify borrowers. 
This ordering ensures: diff --git a/server/upstream_manager.cc b/server/upstream_manager.cc index 9cd5a284..c4a4314f 100644 --- a/server/upstream_manager.cc +++ b/server/upstream_manager.cc @@ -296,3 +296,13 @@ Dispatcher* UpstreamManager::GetDispatcherForIndex(size_t index) const { bool UpstreamManager::HasUpstream(const std::string& service_name) const { return pools_.find(service_name) != pools_.end(); } + +PoolPartition* UpstreamManager::GetPoolPartition( + const std::string& service_name, + size_t dispatcher_index) { + auto it = pools_.find(service_name); + if (it == pools_.end()) { + return nullptr; + } + return it->second->GetPartition(dispatcher_index); +} diff --git a/test/circuit_breaker_phase6_test.h b/test/circuit_breaker_phase6_test.h new file mode 100644 index 00000000..77eea2c1 --- /dev/null +++ b/test/circuit_breaker_phase6_test.h @@ -0,0 +1,261 @@ +#pragma once + +// Phase 6 integration tests: wait-queue drain on CLOSED → OPEN trip. +// +// Phase 4 already covered "new requests after a trip hit REJECTED_OPEN". +// Phase 6 covers the orthogonal case: a request that passed ConsultBreaker +// pre-trip and is waiting in the pool's bounded wait queue when the trip +// fires. Without the drain, that waiter would sit until either the pool +// frees a slot (and then re-hit the upstream — pointless traffic) or the +// queue-timeout / open-duration elapses (up to 60s latency spike). +// +// Mechanism tested: `HttpServer::MarkServerReady` installs a transition +// callback on every slice that routes CLOSED → OPEN to the corresponding +// `PoolPartition::DrainWaitQueueOnTrip()`. Each waiter receives +// `CHECKOUT_CIRCUIT_OPEN`, which `ProxyTransaction::OnCheckoutError` maps +// to the standard circuit-open response (503 + `X-Circuit-Breaker: open`). +// +// Strategy: gate concurrency via a 1-connection pool. The first request +// hangs at the backend long enough to let a second request queue behind +// it. 
When the first's response lands (502), the breaker trips and the +// drain fires, causing the queued request to receive 503 + circuit-open +// headers instead of the backend's 502 (which would happen if the drain +// were missing and the queued request proceeded). + +#include "test_framework.h" +#include "test_server_runner.h" +#include "http_test_client.h" +#include "http/http_server.h" +#include "config/server_config.h" + +#include +#include +#include +#include +#include + +namespace CircuitBreakerPhase6Tests { + +static UpstreamConfig MakeDrainTripUpstream(const std::string& name, + const std::string& host, + int port, + bool breaker_enabled) { + UpstreamConfig u; + u.name = name; + u.host = host; + u.port = port; + // Single connection per partition — forces the second concurrent + // request to queue behind the first. Since tests run with + // worker_threads=1, one partition exists and it has exactly one + // connection slot. + u.pool.max_connections = 1; + u.pool.max_idle_connections = 1; + u.pool.connect_timeout_ms = 3000; + u.pool.idle_timeout_sec = 30; + u.pool.max_lifetime_sec = 3600; + u.pool.max_requests_per_conn = 0; + + u.proxy.route_prefix = "/fail"; + u.proxy.strip_prefix = false; + u.proxy.response_timeout_ms = 5000; + u.proxy.retry.max_retries = 0; // Deterministic — no retry confounds. + + u.circuit_breaker.enabled = breaker_enabled; + u.circuit_breaker.consecutive_failure_threshold = 1; // Trip on first 5xx. + u.circuit_breaker.failure_rate_threshold = 100; + u.circuit_breaker.minimum_volume = 10000; + u.circuit_breaker.window_seconds = 10; + u.circuit_breaker.permitted_half_open_calls = 2; + // Long open duration so the drain is unambiguously the thing that + // surfaces the 503 to the queued client — not a timer-driven + // HALF_OPEN recovery admitting a subsequent attempt. 
+ u.circuit_breaker.base_open_duration_ms = 30000; + u.circuit_breaker.max_open_duration_ms = 60000; + return u; +} + +// --------------------------------------------------------------------------- +// Test 1: CLOSED→OPEN trip drains queued waiter with 503 + X-Circuit-Breaker. +// +// Request A takes the single pool slot and hangs at the backend for ~300ms. +// Request B queues (pool exhausted). At t≈300ms, A's backend response +// arrives: 502 → slice trip → transition callback → DrainWaitQueueOnTrip → +// B's error_callback fires with CHECKOUT_CIRCUIT_OPEN. B's client receives +// 503 + `X-Circuit-Breaker: open`. +// +// Pre-fix (no drain): B waits ~300ms for A's slot to free, then hits the +// backend itself, gets 502, client sees 502 — NOT 503 and NOT +// X-Circuit-Breaker: open. The assertion `is_503 && has_breaker_header` +// fails without the drain wiring. +// --------------------------------------------------------------------------- +void TestWaitQueueDrainedOnTrip() { + std::cout << "\n[TEST] CB Phase 6: wait queue drained on trip..." + << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + // Delay so the gateway's pool holds the connection long + // enough for a second client request to queue on it. + std::this_thread::sleep_for(std::chrono::milliseconds(300)); + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; // Single partition → single wait queue. 
+ gw.http2.enabled = false; + + gw.upstreams.push_back( + MakeDrainTripUpstream("svc", "127.0.0.1", backend_port, + /*breaker_enabled=*/true)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Launch A first (takes the one connection), then B 50ms later + // so B is guaranteed to enter the wait queue. + std::promise a_resp, b_resp; + auto a_fut = a_resp.get_future(); + auto b_fut = b_resp.get_future(); + std::thread a([&]() { + a_resp.set_value(TestHttpClient::HttpGet(gw_port, "/fail", 5000)); + }); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + std::thread b([&]() { + b_resp.set_value(TestHttpClient::HttpGet(gw_port, "/fail", 5000)); + }); + a.join(); + b.join(); + + std::string ra = a_fut.get(); + std::string rb = b_fut.get(); + + // A unambiguously hits the backend (owns the slot) and sees 502. + bool a_is_502 = TestHttpClient::HasStatus(ra, 502); + // B must see the circuit-open short-circuit from the drain — + // NOT a 502 from the backend, which is what happens without + // the drain wiring. + bool b_is_503 = TestHttpClient::HasStatus(rb, 503); + bool b_has_breaker_hdr = + rb.find("X-Circuit-Breaker: open") != std::string::npos || + rb.find("x-circuit-breaker: open") != std::string::npos; + // Exactly one backend hit — B was drained before making it to + // the upstream. Without the drain, backend_hits would be 2. + int hits = backend_hits.load(std::memory_order_relaxed); + bool single_hit = (hits == 1); + + bool pass = a_is_502 && b_is_503 && b_has_breaker_hdr && single_hit; + TestFramework::RecordTest( + "CB Phase 6: wait queue drained on trip", pass, + pass ? 
"" : + "a_is_502=" + std::to_string(a_is_502) + + " b_is_503=" + std::to_string(b_is_503) + + " b_breaker_hdr=" + std::to_string(b_has_breaker_hdr) + + " backend_hits=" + std::to_string(hits) + + " rb_head=" + rb.substr(0, 200)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 6: wait queue drained on trip", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 2: With the breaker disabled, the drain does NOT fire — the queued +// waiter proceeds to the upstream as it would absent the circuit-breaker +// layer entirely. +// +// Same setup as Test 1 but `circuit_breaker.enabled=false`. Disabled slices +// short-circuit in TryAcquire and never invoke transition callbacks, so +// DrainWaitQueueOnTrip is never called. Request B must hit the backend +// (backend_hits == 2) and receive the upstream's 502 — NOT a 503. +// --------------------------------------------------------------------------- +void TestDisabledBreakerDoesNotDrain() { + std::cout << "\n[TEST] CB Phase 6: disabled breaker does not drain..." 
+ << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + std::this_thread::sleep_for(std::chrono::milliseconds(300)); + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + gw.upstreams.push_back( + MakeDrainTripUpstream("svc", "127.0.0.1", backend_port, + /*breaker_enabled=*/false)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + std::promise a_resp, b_resp; + auto a_fut = a_resp.get_future(); + auto b_fut = b_resp.get_future(); + std::thread a([&]() { + a_resp.set_value(TestHttpClient::HttpGet(gw_port, "/fail", 5000)); + }); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + std::thread b([&]() { + b_resp.set_value(TestHttpClient::HttpGet(gw_port, "/fail", 5000)); + }); + a.join(); + b.join(); + + std::string ra = a_fut.get(); + std::string rb = b_fut.get(); + + // Both reach the backend — disabled breaker = no drain. + bool a_is_502 = TestHttpClient::HasStatus(ra, 502); + bool b_is_502 = TestHttpClient::HasStatus(rb, 502); + // Neither should carry the circuit-open header. 
+ bool no_breaker_on_a = + ra.find("X-Circuit-Breaker") == std::string::npos && + ra.find("x-circuit-breaker") == std::string::npos; + bool no_breaker_on_b = + rb.find("X-Circuit-Breaker") == std::string::npos && + rb.find("x-circuit-breaker") == std::string::npos; + int hits = backend_hits.load(std::memory_order_relaxed); + bool two_hits = (hits == 2); + + bool pass = a_is_502 && b_is_502 && no_breaker_on_a && + no_breaker_on_b && two_hits; + TestFramework::RecordTest( + "CB Phase 6: disabled breaker does not drain", pass, + pass ? "" : + "a_is_502=" + std::to_string(a_is_502) + + " b_is_502=" + std::to_string(b_is_502) + + " no_breaker_on_a=" + std::to_string(no_breaker_on_a) + + " no_breaker_on_b=" + std::to_string(no_breaker_on_b) + + " backend_hits=" + std::to_string(hits)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 6: disabled breaker does not drain", false, e.what()); + } +} + +void RunAllTests() { + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "CIRCUIT BREAKER PHASE 6 - WAIT-QUEUE DRAIN ON TRIP TESTS" + << std::endl; + std::cout << std::string(60, '=') << std::endl; + + TestWaitQueueDrainedOnTrip(); + TestDisabledBreakerDoesNotDrain(); +} + +} // namespace CircuitBreakerPhase6Tests diff --git a/test/run_test.cc b/test/run_test.cc index 34d54367..5dabf155 100644 --- a/test/run_test.cc +++ b/test/run_test.cc @@ -17,6 +17,7 @@ #include "circuit_breaker_phase3_test.h" #include "circuit_breaker_phase4_test.h" #include "circuit_breaker_phase5_test.h" +#include "circuit_breaker_phase6_test.h" #include "test_framework.h" #include #include @@ -94,6 +95,9 @@ void RunAllTest(){ // Run circuit breaker Phase 5 retry-budget integration tests CircuitBreakerPhase5Tests::RunAllTests(); + // Run circuit breaker Phase 6 wait-queue-drain-on-trip tests + CircuitBreakerPhase6Tests::RunAllTests(); + std::cout << "====================================\n" << std::endl; } @@ -172,12 +176,13 @@ int main(int argc, char* 
argv[]) { // Run rate limit tests }else if(mode == "rate_limit" || mode == "-L"){ RateLimitTests::RunAllTests(); - // Run circuit breaker tests (phases 1-5: unit + phase3 + phase4 + phase5) + // Run circuit breaker tests (phases 1-6: unit + phase3 + phase4 + phase5 + phase6) }else if(mode == "circuit_breaker" || mode == "-B"){ CircuitBreakerTests::RunAllTests(); CircuitBreakerPhase3Tests::RunAllTests(); CircuitBreakerPhase4Tests::RunAllTests(); CircuitBreakerPhase5Tests::RunAllTests(); + CircuitBreakerPhase6Tests::RunAllTests(); // Show help }else if(mode == "help" || mode == "-h" || mode == "--help"){ PrintUsage(argv[0]); From e6df34b617ddcaf4f435c1509ab26d32fae67a5f Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 20:32:14 +0800 Subject: [PATCH 24/37] Finished Phase 7: Observability --- Makefile | 2 +- include/circuit_breaker/retry_budget.h | 17 ++ server/circuit_breaker_slice.cc | 22 +- server/proxy_transaction.cc | 15 +- test/circuit_breaker_phase7_test.h | 405 +++++++++++++++++++++++++ test/run_test.cc | 7 +- 6 files changed, 460 insertions(+), 8 deletions(-) create mode 100644 test/circuit_breaker_phase7_test.h diff --git a/Makefile b/Makefile index 45993b3b..0c3e47ac 100644 --- a/Makefile +++ b/Makefile @@ -147,7 +147,7 @@ UPSTREAM_HEADERS = $(LIB_DIR)/upstream/upstream_manager.h $(LIB_DIR)/upstream/up RATE_LIMIT_HEADERS = $(LIB_DIR)/rate_limit/token_bucket.h $(LIB_DIR)/rate_limit/rate_limit_zone.h $(LIB_DIR)/rate_limit/rate_limiter.h CIRCUIT_BREAKER_HEADERS = $(LIB_DIR)/circuit_breaker/circuit_breaker_state.h $(LIB_DIR)/circuit_breaker/circuit_breaker_window.h $(LIB_DIR)/circuit_breaker/circuit_breaker_slice.h $(LIB_DIR)/circuit_breaker/retry_budget.h $(LIB_DIR)/circuit_breaker/circuit_breaker_host.h $(LIB_DIR)/circuit_breaker/circuit_breaker_manager.h CLI_HEADERS = $(LIB_DIR)/cli/cli_parser.h $(LIB_DIR)/cli/signal_handler.h $(LIB_DIR)/cli/pid_file.h $(LIB_DIR)/cli/version.h $(LIB_DIR)/cli/daemonizer.h -TEST_HEADERS = $(TEST_DIR)/test_framework.h 
$(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h $(TEST_DIR)/proxy_test.h $(TEST_DIR)/rate_limit_test.h $(TEST_DIR)/kqueue_test.h $(TEST_DIR)/circuit_breaker_test.h $(TEST_DIR)/circuit_breaker_phase3_test.h $(TEST_DIR)/circuit_breaker_phase4_test.h $(TEST_DIR)/circuit_breaker_phase5_test.h $(TEST_DIR)/circuit_breaker_phase6_test.h +TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h $(TEST_DIR)/proxy_test.h $(TEST_DIR)/rate_limit_test.h $(TEST_DIR)/kqueue_test.h $(TEST_DIR)/circuit_breaker_test.h $(TEST_DIR)/circuit_breaker_phase3_test.h $(TEST_DIR)/circuit_breaker_phase4_test.h $(TEST_DIR)/circuit_breaker_phase5_test.h $(TEST_DIR)/circuit_breaker_phase6_test.h $(TEST_DIR)/circuit_breaker_phase7_test.h # All headers combined HEADERS = $(CORE_HEADERS) $(CALLBACK_HEADERS) $(REACTOR_HEADERS) $(NETWORK_HEADERS) $(SERVER_HEADERS) $(THREAD_POOL_HEADERS) $(UTIL_HEADERS) $(FOUNDATION_HEADERS) $(HTTP_HEADERS) $(HTTP2_HEADERS) $(WS_HEADERS) $(TLS_HEADERS) $(UPSTREAM_HEADERS) $(RATE_LIMIT_HEADERS) $(CIRCUIT_BREAKER_HEADERS) $(CLI_HEADERS) $(TEST_HEADERS) diff --git a/include/circuit_breaker/retry_budget.h b/include/circuit_breaker/retry_budget.h index 001bfccb..12782d9e 100644 --- a/include/circuit_breaker/retry_budget.h +++ b/include/circuit_breaker/retry_budget.h @@ -105,6 +105,23 @@ class RetryBudget { int64_t InFlight() const { return 
in_flight_.load(std::memory_order_relaxed); } + // Compute the current effective retry cap for observability / log + // enrichment. Uses the same formula as TryConsumeRetry but without + // mutating retries_in_flight_. Returns the point-in-time cap against + // which a would-be retry admission would be compared. Slightly racy + // (separate loads of in_flight_ and retries_in_flight_ aren't atomic + // relative to each other), but the result is for dashboards / logs + // where a one-entry drift is noise. + int64_t ComputeCap() const { + int64_t in_flight = in_flight_.load(std::memory_order_relaxed); + int64_t retries = retries_in_flight_.load(std::memory_order_relaxed); + int pct = percent_.load(std::memory_order_relaxed); + int min_conc = min_concurrency_.load(std::memory_order_relaxed); + int64_t non_retry = in_flight - retries; + if (non_retry < 0) non_retry = 0; + int64_t pct_cap = (non_retry * pct) / 100; + return pct_cap > min_conc ? pct_cap : min_conc; + } int64_t RetriesInFlight() const { return retries_in_flight_.load(std::memory_order_relaxed); } diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index c34c25ae..d7e8ad07 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -81,9 +81,23 @@ bool CircuitBreakerSlice::ShouldTripClosed( } void CircuitBreakerSlice::TripClosedToOpen(const char* trigger) { + auto now = Now(); + // Capture pre-reset observability context BEFORE mutating state. + // §11.1 log format asks for consecutive_failures + window_total + + // window_fail_rate at the trip event so operators can distinguish a + // "100 consecutive bad responses" trip from a "55% failure rate over + // a wide call window" trip — two very different operational stories + // that the `trigger` string alone doesn't fully capture. 
+ int consec_at_trip = consecutive_failures_; + int64_t window_total = window_.TotalCount(now); + int64_t window_failures = window_.FailureCount(now); + int window_fail_rate_pct = + (window_total > 0) + ? static_cast((window_failures * 100) / window_total) + : 0; + auto duration = ComputeOpenDuration(); // uses current consecutive_trips_ consecutive_trips_.fetch_add(1, std::memory_order_relaxed); - auto now = Now(); auto open_until = now + duration; int64_t open_until_ns = std::chrono::duration_cast( @@ -107,8 +121,10 @@ void CircuitBreakerSlice::TripClosedToOpen(const char* trigger) { trips_.fetch_add(1, std::memory_order_relaxed); logging::Get()->warn( - "circuit breaker tripped {} trigger={} open_for_ms={} consecutive_trips={}", - host_label_, trigger, + "circuit breaker tripped {} trigger={} consecutive_failures={} " + "window_total={} window_fail_rate={} open_for_ms={} consecutive_trips={}", + host_label_, trigger, consec_at_trip, + window_total, window_fail_rate_pct, std::chrono::duration_cast(duration).count(), consecutive_trips_.load(std::memory_order_relaxed)); diff --git a/server/proxy_transaction.cc b/server/proxy_transaction.cc index 020d898f..5cff47b6 100644 --- a/server/proxy_transaction.cc +++ b/server/proxy_transaction.cc @@ -724,10 +724,19 @@ void ProxyTransaction::MaybeRetry(RetryPolicy::RetryCondition condition) { "client_fd={} service={} attempt={}", client_fd_, service_name_, attempt_); } else { + // §11.1 format: log per-host budget state so operators + // can diagnose retry-storm throttling without hitting + // an admin endpoint. `cap` is the live effective ceiling + // (may have shifted since the failing TryConsumeRetry + // due to other transactions' in_flight changes). 
logging::Get()->warn( - "ProxyTransaction retry budget exhausted " - "client_fd={} service={} attempt={}", - client_fd_, service_name_, attempt_); + "retry budget exhausted service={} in_flight={} " + "retries_in_flight={} cap={} client_fd={} attempt={}", + service_name_, + retry_budget_->InFlight(), + retry_budget_->RetriesInFlight(), + retry_budget_->ComputeCap(), + client_fd_, attempt_); state_ = State::FAILED; DeliverResponse(MakeRetryBudgetResponse()); return; diff --git a/test/circuit_breaker_phase7_test.h b/test/circuit_breaker_phase7_test.h new file mode 100644 index 00000000..9dc841ba --- /dev/null +++ b/test/circuit_breaker_phase7_test.h @@ -0,0 +1,405 @@ +#pragma once + +// Phase 7 integration tests: observability — counter accuracy, snapshot +// API correctness, and log emission. +// +// Phases 2-6 each added counters and log lines as a side effect of their +// functional work. Phase 7 locks those in as regressions: +// +// * Counters (§11.2): trips, rejected, probe_successes, probe_failures, +// retries_rejected surface through CircuitBreakerManager::SnapshotAll. +// * Snapshot API (§11.3): per-slice rows aggregate into host-level +// totals; host-level fields (retries_in_flight / retries_rejected / +// in_flight) reflect the owning RetryBudget. +// * Logs (§11.1): the CLOSED→OPEN trip emits the full-context message +// including trigger, consecutive_failures, window_total, +// window_fail_rate, open_for_ms, and consecutive_trips. +// +// The log-emission test attaches a spdlog ring-buffer sink to the logger +// for the duration of the test, triggers a trip, then asserts the +// captured messages contain the expected fields. No log file I/O. 
+ +#include "test_framework.h" +#include "test_server_runner.h" +#include "http_test_client.h" +#include "http/http_server.h" +#include "config/server_config.h" +#include "upstream/upstream_manager.h" +#include "circuit_breaker/circuit_breaker_manager.h" +#include "circuit_breaker/circuit_breaker_host.h" +#include "circuit_breaker/circuit_breaker_slice.h" +#include "log/logger.h" +#include "spdlog/sinks/ringbuffer_sink.h" + +#include +#include +#include +#include +#include +#include + +namespace CircuitBreakerPhase7Tests { + +using circuit_breaker::State; + +static UpstreamConfig MakeObservUpstream(const std::string& name, + const std::string& host, + int port, + int consecutive_threshold = 3) { + UpstreamConfig u; + u.name = name; + u.host = host; + u.port = port; + u.pool.max_connections = 8; + u.pool.max_idle_connections = 4; + u.pool.connect_timeout_ms = 3000; + u.pool.idle_timeout_sec = 30; + u.pool.max_lifetime_sec = 3600; + u.pool.max_requests_per_conn = 0; + + u.proxy.route_prefix = "/fail"; + u.proxy.strip_prefix = false; + u.proxy.response_timeout_ms = 2000; + u.proxy.retry.max_retries = 0; + + u.circuit_breaker.enabled = true; + u.circuit_breaker.consecutive_failure_threshold = consecutive_threshold; + u.circuit_breaker.failure_rate_threshold = 100; + u.circuit_breaker.minimum_volume = 10000; + u.circuit_breaker.window_seconds = 10; + u.circuit_breaker.permitted_half_open_calls = 2; + // Long open duration — keep the slice OPEN so post-trip assertions + // don't race a HALF_OPEN transition. + u.circuit_breaker.base_open_duration_ms = 30000; + u.circuit_breaker.max_open_duration_ms = 60000; + return u; +} + +// --------------------------------------------------------------------------- +// Test 1: Snapshot API reflects per-slice trip/rejected counters and +// host-level aggregates. 
Drives N+1 requests against a backend that always +// 502s (N to trip, 1 more that the OPEN slice short-circuits) and asserts +// the snapshot shows total_trips >= 1, total_rejected >= 1, +// open_partitions >= 1. +// --------------------------------------------------------------------------- +void TestSnapshotReflectsCounters() { + std::cout << "\n[TEST] CB Phase 7: snapshot reflects counters..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + auto u = MakeObservUpstream("svc", "127.0.0.1", backend_port, + /*threshold=*/3); + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Trip (3 failures), then 2 more to accumulate rejected counter. 
+ for (int i = 0; i < 3; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + for (int i = 0; i < 2; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + + auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); + if (!cbm) { + TestFramework::RecordTest( + "CB Phase 7: snapshot reflects counters", false, + "no circuit breaker manager attached"); + return; + } + auto snaps = cbm->SnapshotAll(); + bool found = false; + int64_t trips = 0, rejected = 0, probe_s = 0, probe_f = 0; + int open_parts = 0; + for (const auto& s : snaps) { + if (s.service_name == "svc") { + trips = s.total_trips; + rejected = s.total_rejected; + open_parts = s.open_partitions; + for (const auto& row : s.slices) { + probe_s += row.probe_successes; + probe_f += row.probe_failures; + } + found = true; + break; + } + } + + bool pass = found + && trips >= 1 + && rejected >= 2 // 2 post-trip short-circuits + && open_parts >= 1 + && probe_s == 0 // never entered HALF_OPEN + && probe_f == 0; + TestFramework::RecordTest( + "CB Phase 7: snapshot reflects counters", pass, + pass ? "" : + "found=" + std::to_string(found) + + " trips=" + std::to_string(trips) + + " rejected=" + std::to_string(rejected) + + " open_parts=" + std::to_string(open_parts) + + " probe_s=" + std::to_string(probe_s) + + " probe_f=" + std::to_string(probe_f)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 7: snapshot reflects counters", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 2: The CLOSED→OPEN trip log emits the §11.1 full-context message. +// Attaches a spdlog ringbuffer_sink to the shared logger, triggers a trip, +// then inspects the captured messages for the key tokens. The sink is +// removed before the test returns so it doesn't affect later tests. 
+// --------------------------------------------------------------------------- +void TestTripLogEmission() { + std::cout << "\n[TEST] CB Phase 7: trip log emission..." << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + auto u = MakeObservUpstream("svc-log", "127.0.0.1", backend_port, + /*threshold=*/2); + gw.upstreams.push_back(u); + + // `HttpServer` construction calls `logging::Init()` which rebuilds + // the default logger via `spdlog::set_default_logger`. Any sink + // attached BEFORE that point lands on a stale logger. Attach the + // ringbuffer sink AFTER the last HttpServer construction so it + // captures the live logger's output. + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + auto ring = std::make_shared< + spdlog::sinks::ringbuffer_sink_mt>(1024); + auto logger = logging::Get(); + auto prev_level = logger->level(); + logger->set_level(spdlog::level::debug); + logger->sinks().push_back(ring); + + struct SinkGuard { + std::shared_ptr logger; + std::shared_ptr ring; + spdlog::level::level_enum prev_level; + ~SinkGuard() { + auto& sinks = logger->sinks(); + sinks.erase(std::remove(sinks.begin(), sinks.end(), + std::shared_ptr(ring)), + sinks.end()); + logger->set_level(prev_level); + } + } guard{logger, ring, prev_level}; + + // Drive exactly threshold=2 failures to trip. + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + + // Give the dispatcher a breath to emit + the sink to settle. 
+ std::this_thread::sleep_for(std::chrono::milliseconds(50)); + + auto messages = ring->last_formatted(); + // Scan for the trip message. Look for the static prefix plus the + // §11.1 field tokens. + bool saw_tripped = false; + bool has_trigger = false; + bool has_consec_failures = false; + bool has_window_total = false; + bool has_fail_rate = false; + bool has_open_for_ms = false; + bool has_consec_trips = false; + for (const auto& msg : messages) { + if (msg.find("circuit breaker tripped") == std::string::npos) { + continue; + } + saw_tripped = true; + if (msg.find("trigger=") != std::string::npos) has_trigger = true; + if (msg.find("consecutive_failures=") != std::string::npos) + has_consec_failures = true; + if (msg.find("window_total=") != std::string::npos) + has_window_total = true; + if (msg.find("window_fail_rate=") != std::string::npos) + has_fail_rate = true; + if (msg.find("open_for_ms=") != std::string::npos) + has_open_for_ms = true; + if (msg.find("consecutive_trips=") != std::string::npos) + has_consec_trips = true; + } + + bool pass = saw_tripped && has_trigger && has_consec_failures && + has_window_total && has_fail_rate && + has_open_for_ms && has_consec_trips; + TestFramework::RecordTest( + "CB Phase 7: trip log emission", pass, + pass ? 
"" : + "saw_tripped=" + std::to_string(saw_tripped) + + " trigger=" + std::to_string(has_trigger) + + " consec_failures=" + std::to_string(has_consec_failures) + + " window_total=" + std::to_string(has_window_total) + + " fail_rate=" + std::to_string(has_fail_rate) + + " open_for_ms=" + std::to_string(has_open_for_ms) + + " consec_trips=" + std::to_string(has_consec_trips)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 7: trip log emission", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 3: Retry-budget observability — the exhausted log carries the +// §11.1 fields (service, in_flight, retries_in_flight, cap), and the +// host snapshot reflects retries_rejected. +// --------------------------------------------------------------------------- +void TestRetryBudgetObservability() { + std::cout << "\n[TEST] CB Phase 7: retry budget observability..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + // Budget: zero percent AND zero floor → every retry rejected. + auto u = MakeObservUpstream("svc-budget", "127.0.0.1", backend_port, + /*threshold=*/10000); + u.proxy.retry.max_retries = 2; + u.proxy.retry.retry_on_5xx = true; + u.circuit_breaker.retry_budget_percent = 0; + u.circuit_breaker.retry_budget_min_concurrency = 0; + gw.upstreams.push_back(u); + + // Attach the ringbuffer AFTER gateway construction — see + // TestTripLogEmission for rationale (HttpServer's ctor + // replaces the default logger via logging::Init, detaching + // any previously-attached sinks). 
+ HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + auto ring = std::make_shared< + spdlog::sinks::ringbuffer_sink_mt>(1024); + auto logger = logging::Get(); + auto prev_level = logger->level(); + logger->set_level(spdlog::level::debug); + logger->sinks().push_back(ring); + + struct SinkGuard { + std::shared_ptr logger; + std::shared_ptr ring; + spdlog::level::level_enum prev_level; + ~SinkGuard() { + auto& sinks = logger->sinks(); + sinks.erase(std::remove(sinks.begin(), sinks.end(), + std::shared_ptr(ring)), + sinks.end()); + logger->set_level(prev_level); + } + } guard{logger, ring, prev_level}; + + // One client request: first attempt hits backend (502), retry + // blocked by budget → 503 + X-Retry-Budget-Exhausted. + TestHttpClient::HttpGet(gw_port, "/fail", 5000); + + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + + auto messages = ring->last_formatted(); + bool saw_exhausted = false; + bool has_service = false; + bool has_inflight = false; + bool has_retries_inflight = false; + bool has_cap = false; + for (const auto& msg : messages) { + if (msg.find("retry budget exhausted") == std::string::npos) { + continue; + } + saw_exhausted = true; + if (msg.find("service=") != std::string::npos) has_service = true; + if (msg.find("in_flight=") != std::string::npos) + has_inflight = true; + if (msg.find("retries_in_flight=") != std::string::npos) + has_retries_inflight = true; + if (msg.find("cap=") != std::string::npos) has_cap = true; + } + + // Snapshot: retries_rejected must be >= 1 (every rejection increments). + int64_t retries_rejected = 0; + auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); + if (cbm) { + for (const auto& s : cbm->SnapshotAll()) { + if (s.service_name == "svc-budget") { + // Host aggregate — single host, so the sum is the + // host's retries_rejected. The snapshot doesn't yet + // expose that directly — derive from RetryBudget + // via the host getter. 
+ auto* host = cbm->GetHost("svc-budget"); + if (host) { + retries_rejected = + host->GetRetryBudget()->RetriesRejected(); + } + break; + } + } + } + + bool pass = saw_exhausted && has_service && has_inflight && + has_retries_inflight && has_cap && + retries_rejected >= 1; + TestFramework::RecordTest( + "CB Phase 7: retry budget observability", pass, + pass ? "" : + "saw_exhausted=" + std::to_string(saw_exhausted) + + " service=" + std::to_string(has_service) + + " inflight=" + std::to_string(has_inflight) + + " retries_inflight=" + std::to_string(has_retries_inflight) + + " cap=" + std::to_string(has_cap) + + " retries_rejected=" + std::to_string(retries_rejected)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Phase 7: retry budget observability", false, e.what()); + } +} + +void RunAllTests() { + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "CIRCUIT BREAKER PHASE 7 - OBSERVABILITY TESTS" << std::endl; + std::cout << std::string(60, '=') << std::endl; + + TestSnapshotReflectsCounters(); + TestTripLogEmission(); + TestRetryBudgetObservability(); +} + +} // namespace CircuitBreakerPhase7Tests diff --git a/test/run_test.cc b/test/run_test.cc index 5dabf155..17d7eed9 100644 --- a/test/run_test.cc +++ b/test/run_test.cc @@ -18,6 +18,7 @@ #include "circuit_breaker_phase4_test.h" #include "circuit_breaker_phase5_test.h" #include "circuit_breaker_phase6_test.h" +#include "circuit_breaker_phase7_test.h" #include "test_framework.h" #include #include @@ -98,6 +99,9 @@ void RunAllTest(){ // Run circuit breaker Phase 6 wait-queue-drain-on-trip tests CircuitBreakerPhase6Tests::RunAllTests(); + // Run circuit breaker Phase 7 observability tests + CircuitBreakerPhase7Tests::RunAllTests(); + std::cout << "====================================\n" << std::endl; } @@ -176,13 +180,14 @@ int main(int argc, char* argv[]) { // Run rate limit tests }else if(mode == "rate_limit" || mode == "-L"){ RateLimitTests::RunAllTests(); - // Run 
circuit breaker tests (phases 1-6: unit + phase3 + phase4 + phase5 + phase6) + // Run circuit breaker tests (phases 1-7: unit + phase3 + phase4 + phase5 + phase6 + phase7) }else if(mode == "circuit_breaker" || mode == "-B"){ CircuitBreakerTests::RunAllTests(); CircuitBreakerPhase3Tests::RunAllTests(); CircuitBreakerPhase4Tests::RunAllTests(); CircuitBreakerPhase5Tests::RunAllTests(); CircuitBreakerPhase6Tests::RunAllTests(); + CircuitBreakerPhase7Tests::RunAllTests(); // Show help }else if(mode == "help" || mode == "-h" || mode == "--help"){ PrintUsage(argv[0]); From 7b83fbfdd03f1cca613e3660bf88f62d428294f3 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 22:04:16 +0800 Subject: [PATCH 25/37] Finished Feature Development --- Makefile | 2 +- docs/circuit_breaker.md | 149 ++ .../circuit_breaker/circuit_breaker_host.h | 2 +- .../circuit_breaker/circuit_breaker_manager.h | 2 +- .../circuit_breaker/circuit_breaker_state.h | 2 +- include/circuit_breaker/retry_budget.h | 2 +- include/config/server_config.h | 24 +- include/upstream/pool_partition.h | 4 +- include/upstream/proxy_transaction.h | 18 +- server/circuit_breaker_slice.cc | 7 +- server/http_server.cc | 36 +- server/main.cc | 2 +- server/proxy_transaction.cc | 9 +- test/circuit_breaker_components_test.h | 507 +++++++ test/circuit_breaker_integration_test.h | 1213 +++++++++++++++++ test/circuit_breaker_observability_test.h | 405 ++++++ test/circuit_breaker_reload_test.h | 373 +++++ test/circuit_breaker_retry_budget_test.h | 367 +++++ test/circuit_breaker_test.h | 4 +- test/circuit_breaker_wait_queue_drain_test.h | 261 ++++ test/config_test.h | 36 +- test/run_test.cc | 47 +- 22 files changed, 3390 insertions(+), 82 deletions(-) create mode 100644 docs/circuit_breaker.md create mode 100644 test/circuit_breaker_components_test.h create mode 100644 test/circuit_breaker_integration_test.h create mode 100644 test/circuit_breaker_observability_test.h create mode 100644 test/circuit_breaker_reload_test.h create 
mode 100644 test/circuit_breaker_retry_budget_test.h create mode 100644 test/circuit_breaker_wait_queue_drain_test.h diff --git a/Makefile b/Makefile index 0c3e47ac..2b9ae194 100644 --- a/Makefile +++ b/Makefile @@ -147,7 +147,7 @@ UPSTREAM_HEADERS = $(LIB_DIR)/upstream/upstream_manager.h $(LIB_DIR)/upstream/up RATE_LIMIT_HEADERS = $(LIB_DIR)/rate_limit/token_bucket.h $(LIB_DIR)/rate_limit/rate_limit_zone.h $(LIB_DIR)/rate_limit/rate_limiter.h CIRCUIT_BREAKER_HEADERS = $(LIB_DIR)/circuit_breaker/circuit_breaker_state.h $(LIB_DIR)/circuit_breaker/circuit_breaker_window.h $(LIB_DIR)/circuit_breaker/circuit_breaker_slice.h $(LIB_DIR)/circuit_breaker/retry_budget.h $(LIB_DIR)/circuit_breaker/circuit_breaker_host.h $(LIB_DIR)/circuit_breaker/circuit_breaker_manager.h CLI_HEADERS = $(LIB_DIR)/cli/cli_parser.h $(LIB_DIR)/cli/signal_handler.h $(LIB_DIR)/cli/pid_file.h $(LIB_DIR)/cli/version.h $(LIB_DIR)/cli/daemonizer.h -TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h $(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h $(TEST_DIR)/proxy_test.h $(TEST_DIR)/rate_limit_test.h $(TEST_DIR)/kqueue_test.h $(TEST_DIR)/circuit_breaker_test.h $(TEST_DIR)/circuit_breaker_phase3_test.h $(TEST_DIR)/circuit_breaker_phase4_test.h $(TEST_DIR)/circuit_breaker_phase5_test.h $(TEST_DIR)/circuit_breaker_phase6_test.h $(TEST_DIR)/circuit_breaker_phase7_test.h +TEST_HEADERS = $(TEST_DIR)/test_framework.h $(TEST_DIR)/http_test_client.h $(TEST_DIR)/basic_test.h $(TEST_DIR)/stress_test.h $(TEST_DIR)/race_condition_test.h $(TEST_DIR)/timeout_test.h $(TEST_DIR)/config_test.h $(TEST_DIR)/http_test.h $(TEST_DIR)/websocket_test.h $(TEST_DIR)/tls_test.h $(TEST_DIR)/cli_test.h $(TEST_DIR)/http2_test.h 
$(TEST_DIR)/route_test.h $(TEST_DIR)/upstream_pool_test.h $(TEST_DIR)/proxy_test.h $(TEST_DIR)/rate_limit_test.h $(TEST_DIR)/kqueue_test.h $(TEST_DIR)/circuit_breaker_test.h $(TEST_DIR)/circuit_breaker_components_test.h $(TEST_DIR)/circuit_breaker_integration_test.h $(TEST_DIR)/circuit_breaker_retry_budget_test.h $(TEST_DIR)/circuit_breaker_wait_queue_drain_test.h $(TEST_DIR)/circuit_breaker_observability_test.h $(TEST_DIR)/circuit_breaker_reload_test.h # All headers combined HEADERS = $(CORE_HEADERS) $(CALLBACK_HEADERS) $(REACTOR_HEADERS) $(NETWORK_HEADERS) $(SERVER_HEADERS) $(THREAD_POOL_HEADERS) $(UTIL_HEADERS) $(FOUNDATION_HEADERS) $(HTTP_HEADERS) $(HTTP2_HEADERS) $(WS_HEADERS) $(TLS_HEADERS) $(UPSTREAM_HEADERS) $(RATE_LIMIT_HEADERS) $(CIRCUIT_BREAKER_HEADERS) $(CLI_HEADERS) $(TEST_HEADERS) diff --git a/docs/circuit_breaker.md b/docs/circuit_breaker.md new file mode 100644 index 00000000..6f38de69 --- /dev/null +++ b/docs/circuit_breaker.md @@ -0,0 +1,149 @@ +# Circuit Breaker + +Per-upstream circuit breaking for the gateway, preventing cascading failures when a backend becomes unhealthy. Follows the resilience4j three-state machine (`CLOSED` → `OPEN` → `HALF_OPEN` → `CLOSED`), trips on either consecutive-failure or failure-rate thresholds, and short-circuits checkouts with `503 Service Unavailable` while the circuit is open. A separate **retry budget** caps the fraction of concurrent upstream work that may be retries, bounding the retry-storm amplification factor even when individual retries pass the breaker gate. + +--- + +## Overview + +- **Per-dispatcher slices.** One `CircuitBreakerSlice` per dispatcher partition for each upstream. Hot-path `TryAcquire` / `Report*` calls are lock-free — each slice is dispatcher-thread-pinned. +- **Three states.** `CLOSED` = normal traffic. `OPEN` = all requests short-circuited with 503 for the exponential-backoff open duration. 
`HALF_OPEN` = a bounded number of probe requests are admitted to test recovery; on success, closes; on failure, re-trips with longer backoff. +- **Dual trip paths.** Either `consecutive_failures >= N` OR `failure_rate >= P%` over a sliding window (subject to `minimum_volume`). +- **Retry budget.** Host-level cap: `max(retry_budget_min_concurrency, (in_flight - retries_in_flight) * retry_budget_percent / 100)`. Retries that exceed the cap receive `503` + `X-Retry-Budget-Exhausted: 1` instead of going to the upstream. +- **Wait-queue drain on trip.** On every `CLOSED → OPEN` transition, the corresponding pool partition's wait queue is drained immediately with `503 + X-Circuit-Breaker: open` — queued waiters don't have to wait out the full open window. +- **Dry-run mode.** `dry_run=true` computes decisions and logs them, but still admits traffic. Useful for staging a breaker in production without risk. +- **Hot-reload.** Breaker-field edits (thresholds, window, probe budget, retry budget tuning, enabled toggle) apply live on SIGHUP — no restart required. Topology edits (host/port/pool/proxy/tls) still require a restart. + +--- + +## Configuration + +Each `upstream` entry accepts a nested `circuit_breaker` block: + +```json +{ + "upstreams": [ + { + "name": "orders", + "host": "orders-backend", + "port": 8080, + "circuit_breaker": { + "enabled": true, + "dry_run": false, + "consecutive_failure_threshold": 5, + "failure_rate_threshold": 50, + "minimum_volume": 20, + "window_seconds": 10, + "permitted_half_open_calls": 3, + "base_open_duration_ms": 5000, + "max_open_duration_ms": 60000, + "retry_budget_percent": 20, + "retry_budget_min_concurrency": 3 + } + } + ] +} +``` + +### Fields + +| Field | Type | Default | Meaning | +|---|---|---|---| +| `enabled` | bool | `false` | Master switch. When false, the slice is a zero-overhead no-op on the hot path. | +| `dry_run` | bool | `false` | Shadow mode: log would-reject decisions but admit traffic. 
Both the state machine and the retry budget honor this flag. | +| `consecutive_failure_threshold` | int | `5` | Trip when N consecutive failures are observed in `CLOSED`. Upper bound 10,000. | +| `failure_rate_threshold` | int | `50` | Trip when `(failures / total) * 100 >= this` over the rolling window, provided `total >= minimum_volume`. 0-100. | +| `minimum_volume` | int | `20` | Minimum calls-in-window before rate-based trip is even considered. Upper bound 10,000,000. | +| `window_seconds` | int | `10` | Rolling window duration for the rate trip. >= 1. | +| `permitted_half_open_calls` | int | `3` | Probe admissions allowed per `HALF_OPEN` cycle. A single success flips to `CLOSED`; a single failure re-trips to `OPEN`. Upper bound 1,000. | +| `base_open_duration_ms` | int | `5000` | Initial open duration on first trip. Subsequent trips use `min(base << consecutive_trips, max)`. | +| `max_open_duration_ms` | int | `60000` | Ceiling for the exponential-backoff open duration. | +| `retry_budget_percent` | int | `20` | Retries capped at this % of non-retry in-flight traffic to the same host. 0-100. | +| `retry_budget_min_concurrency` | int | `3` | Floor for the retry cap — always allow at least this many concurrent retries regardless of traffic level. | + +### Defaults (when `circuit_breaker` block is absent) + +`enabled=false`. The breaker is fully opt-in. No behavioral change from a pre-breaker gateway configuration. + +--- + +## Client-facing responses + +Two distinct `503` variants, keyed off the reject source: + +**Circuit-open reject** — breaker is `OPEN` or in `HALF_OPEN`-full: +``` +HTTP/1.1 503 Service Unavailable +Retry-After: 5 +X-Circuit-Breaker: open # or half_open +X-Upstream-Host: orders-backend:8080 +Connection: close +``` + +- `Retry-After` derivation: + - `OPEN`: derived from the stored `open_until` deadline (time remaining until next probe). 
+ - `HALF_OPEN`: derived from the *next* open duration (`base << consecutive_trips`) — reflects what the backoff would be if the in-flight probes fail. Base alone would under-report after multiple trips. + - Both paths: ceil-divide the millisecond value to seconds, capped at 3600s. +- `X-Circuit-Breaker` distinguishes the two reject paths so operators can tell "backoff active" from "probing, no capacity left". + +**Retry-budget reject** — every retry attempt rejected because the host's budget is exhausted: +``` +HTTP/1.1 503 Service Unavailable +X-Retry-Budget-Exhausted: 1 +Connection: close +``` + +No `Retry-After` (the budget has no recovery clock — it depends on concurrent traffic). No `X-Circuit-Breaker` header (this reject path is orthogonal to the state machine). + +Both responses are **terminal**: the retry loop never retries a circuit-open or retry-budget-exhausted outcome. + +--- + +## Hot reload + +All `circuit_breaker` fields on existing upstream services are hot-reloadable via `SIGHUP`. Reload semantics: + +| Edit | Behavior | +|---|---| +| Threshold change (failures, rate, window, probe budget, open durations) | Applied on the next `TryAcquire` / `Report*` call on each slice. Live state (`CLOSED`/`OPEN`/`HALF_OPEN`) is preserved. | +| `enabled=true → false` | Live state reset to `CLOSED`; hot path short-circuits to `ADMITTED`. No transition callback fired. | +| `enabled=false → true` | Live state reset to `CLOSED`. The transition callback (wired at startup) re-engages for future trips. | +| `window_seconds` change | Rolling window reset. In-flight reports admitted pre-reload are invalidated (by `closed_gen_` bump); `consecutive_failures_` reset so stale counts can't trip the fresh window. In-flight `HALF_OPEN` probes are NOT invalidated (separate `halfopen_gen_` counter) — probe cycles complete normally. | +| `retry_budget_percent` / `retry_budget_min_concurrency` | Applied immediately (atomic stores). In-flight counters preserved. 
| + +Topology edits (`host`, `port`, `pool.*`, `proxy.*`, `tls.*`) still require a restart; the gateway logs `"Reload: upstream topology changes require a restart to take effect"` and keeps the old pool alive. Breaker edits on the same reload are still applied live. + +--- + +## Observability + +### Logs + +| Event | Level | Sample | +|---|---|---| +| `CLOSED → OPEN` trip | `warn` | `circuit breaker tripped service=orders host=orders-backend:8080 partition=0 trigger=consecutive consecutive_failures=5 window_total=12 window_fail_rate=41 open_for_ms=5000 consecutive_trips=1` | +| `OPEN → HALF_OPEN` | `info` | `circuit breaker half-open ... probes_allowed=3` | +| `HALF_OPEN → CLOSED` | `info` | `circuit breaker closed ... probes_succeeded=3` | +| `HALF_OPEN → OPEN` re-trip | `warn` | `circuit breaker re-tripped ... trigger=probe_fail consecutive_trips=2 open_for_ms=10000` | +| Reject (first of cycle) | `info` | `circuit breaker rejected ... state=open` | +| Reject (subsequent) | `debug` | Same, at debug. | +| Reject (dry-run) | `info` | `[dry-run] circuit breaker would reject ...` | +| Retry budget exhausted | `warn` | `retry budget exhausted service=orders in_flight=45 retries_in_flight=9 cap=9 client_fd=... attempt=1` | +| Reload applied | `info` | `circuit breaker config applied service=orders enabled=true window_s=10 fail_rate=50 consec_threshold=5` | +| Wait-queue drain on trip | `info` | `PoolPartition draining wait queue on breaker trip: orders-backend:8080 queue_size=3` | + +### Snapshot API + +`CircuitBreakerManager::SnapshotAll()` returns one `CircuitBreakerHostSnapshot` per upstream with per-slice rows (`state`, `trips`, `rejected`, `probe_successes`, `probe_failures`) plus host-level aggregates (`total_trips`, `total_rejected`, `open_partitions`, `half_open_partitions`, `retries_in_flight`, `retries_rejected`, `in_flight`). A future `/admin/breakers` endpoint would JSON-serialize this. 
+ +--- + +## Design notes + +- **Dispatcher affinity.** Slices are pinned to their dispatcher thread — no CAS on the hot path. The trade-off: skewed request distribution across dispatchers can cause one partition to trip while another stays `CLOSED`. Uniform hashing keeps this mild in practice. +- **Lazy `HALF_OPEN`.** The transition from `OPEN` happens on the next inbound `TryAcquire` once the open deadline elapses — no background timer. Envoy and resilience4j use the same model. +- **Generation tokens.** Every admission is stamped with a per-domain generation counter (`closed_gen_` or `halfopen_gen_`, depending on state). `Report*` drops stale-generation completions so pre-transition requests can't pollute a fresh cycle. Window resizes bump only `closed_gen_` so in-flight probes aren't stranded. +- **Retry budget CAS.** `TryConsumeRetry` uses `compare_exchange_weak` to serialize concurrent retry admissions. A plain load-check-add would let N callers all observe `current < cap` and all increment past the cap. +- **Non-retry denominator.** The budget base is `in_flight - retries_in_flight`, not raw `in_flight`. Retries count in both terms but subtract out here so admitting a retry doesn't inflate its own cap. + +For the full design document (motivations, trade-offs, failure modes, revision history, test strategy), see [.claude/documents/design/CIRCUIT_BREAKER_DESIGN.md](../.claude/documents/design/CIRCUIT_BREAKER_DESIGN.md). diff --git a/include/circuit_breaker/circuit_breaker_host.h b/include/circuit_breaker/circuit_breaker_host.h index 6aff2965..67211667 100644 --- a/include/circuit_breaker/circuit_breaker_host.h +++ b/include/circuit_breaker/circuit_breaker_host.h @@ -97,7 +97,7 @@ class CircuitBreakerHost { // across partitions — callers that need partition-specific behavior // can read `slice->dispatcher_index()` inside the callback. 
// Must be called before live traffic; thread-safety depends on - // slice-dispatcher affinity at the Reload layer (Phase 8 wires this). + // slice-dispatcher affinity at the Reload layer. void SetTransitionCallbackOnAllSlices(StateTransitionCallback cb); // Accessors. diff --git a/include/circuit_breaker/circuit_breaker_manager.h b/include/circuit_breaker/circuit_breaker_manager.h index 66c2b33d..b4b32f06 100644 --- a/include/circuit_breaker/circuit_breaker_manager.h +++ b/include/circuit_breaker/circuit_breaker_manager.h @@ -25,7 +25,7 @@ namespace circuit_breaker { // upstream policy). This makes GetHost lock-free after construction, // which is critical for the hot path. // -// Hot-reload (Phase 8): only `circuit_breaker` sub-fields on EXISTING +// Hot-reload: only `circuit_breaker` sub-fields on EXISTING // upstream services can be live-reloaded. New or removed service names // log a warn and are skipped — the caller (HttpServer::Reload) still // fires the "restart required" diagnostic in that case. diff --git a/include/circuit_breaker/circuit_breaker_state.h b/include/circuit_breaker/circuit_breaker_state.h index 6a758a57..92872f8b 100644 --- a/include/circuit_breaker/circuit_breaker_state.h +++ b/include/circuit_breaker/circuit_breaker_state.h @@ -51,7 +51,7 @@ enum class FailureKind : uint8_t { // `trigger` is a short static string such as "consecutive" / "rate" / // "probe_success" / "probe_fail" / "open_elapsed" for logging. // -// TODO(phase-7): once a snapshot / admin JSON endpoint lands, convert +// TODO(post-v1): once a snapshot / admin JSON endpoint lands, convert // `trigger` to an `enum class TransitionTrigger` so the valid set is // compile-time checked rather than string-compared. See design doc §15.8. 
using StateTransitionCallback = diff --git a/include/circuit_breaker/retry_budget.h b/include/circuit_breaker/retry_budget.h index 12782d9e..f8392013 100644 --- a/include/circuit_breaker/retry_budget.h +++ b/include/circuit_breaker/retry_budget.h @@ -32,7 +32,7 @@ namespace circuit_breaker { // relaxed — snapshots can be slightly stale, which is fine for a // capacity gate on a retry storm. // -// Usage (Phase 5 wires this in): +// Usage: // 1. On every attempt (first or retry), call TrackInFlight() and keep // the returned guard alive until the attempt completes. The guard // decrements in_flight_ in its destructor. diff --git a/include/config/server_config.h b/include/config/server_config.h index 8a8e8ed4..1f4c7f59 100644 --- a/include/config/server_config.h +++ b/include/config/server_config.h @@ -154,7 +154,8 @@ struct CircuitBreakerConfig { // Retry budget (orthogonal to the breaker). Caps concurrent retries to // max(retry_budget_min_concurrency, in_flight * retry_budget_percent/100). - // Wired into the request path in Phase 5; in Phase 3 these are read by + // Wired into the request path via ProxyTransaction's retry-budget + // gate in MaybeRetry; also read by // CircuitBreakerHost to construct its owned RetryBudget. int retry_budget_percent = 20; int retry_budget_min_concurrency = 3; @@ -185,20 +186,19 @@ struct UpstreamConfig { ProxyConfig proxy; CircuitBreakerConfig circuit_breaker; - // Includes circuit_breaker until Phase 8 ships CircuitBreakerManager::Reload. - // A CB-only SIGHUP currently has no propagation path into live slice state, - // so operator== must return false to trigger the "restart required" warning - // rather than silently committing the new config object while the live slices - // continue running with the old settings. + // Excludes `circuit_breaker` — breaker fields are live-reloadable via + // `CircuitBreakerManager::Reload`, which `HttpServer::Reload` invokes on + // every reload. 
Topology fields (name, host, port, tls, pool, + // proxy) remain restart-only; a mismatch here triggers the + // "restart required" warning in the outer reload. // - // TODO(phase-8): once CircuitBreakerManager::Reload is wired into - // HttpServer::Reload, remove circuit_breaker from this operator and diff it - // separately (per-host CircuitBreakerConfig comparison) so breaker-only - // edits are hot-reloadable without a restart. + // Contract: a config pair that differs ONLY in circuit_breaker fields + // must compare EQUAL so the outer reload doesn't fire a spurious warn. + // Any future field whose propagation path is wired into a live + // `*Manager::Reload` should be removed from this operator symmetrically. bool operator==(const UpstreamConfig& o) const { return name == o.name && host == o.host && port == o.port && - tls == o.tls && pool == o.pool && proxy == o.proxy && - circuit_breaker == o.circuit_breaker; + tls == o.tls && pool == o.pool && proxy == o.proxy; } bool operator!=(const UpstreamConfig& o) const { return !(*this == o); } }; diff --git a/include/upstream/pool_partition.h b/include/upstream/pool_partition.h index d23904ab..a6d904b2 100644 --- a/include/upstream/pool_partition.h +++ b/include/upstream/pool_partition.h @@ -25,8 +25,8 @@ class PoolPartition { static constexpr int CHECKOUT_CONNECT_TIMEOUT = -3; static constexpr int CHECKOUT_SHUTTING_DOWN = -4; static constexpr int CHECKOUT_QUEUE_TIMEOUT = -5; - // Delivered to wait-queue waiters drained on a breaker trip (Phase 6 - // implements the drain path). ProxyTransaction::OnCheckoutError maps + // Delivered to wait-queue waiters drained on a breaker trip by + // DrainWaitQueueOnTrip. ProxyTransaction::OnCheckoutError maps // this to RESULT_CIRCUIT_OPEN so the queued client gets the same // circuit-open response a fresh requester would get. 
static constexpr int CHECKOUT_CIRCUIT_OPEN = -6; diff --git a/include/upstream/proxy_transaction.h b/include/upstream/proxy_transaction.h index cded9b71..ccda6d24 100644 --- a/include/upstream/proxy_transaction.h +++ b/include/upstream/proxy_transaction.h @@ -34,10 +34,9 @@ class ProxyTransaction : public std::enable_shared_from_this { // Carries Retry-After + X-Circuit-Breaker headers (§12.1). // Terminal — retry loop MUST NOT retry this outcome (§8). static constexpr int RESULT_CIRCUIT_OPEN = -7; - // Retry budget exhausted (Phase 5 wires the actual gate; the code is - // reserved here so MakeErrorResponse and the retry loop both know it - // exists and terminal-classify it). No Retry-After; distinct header - // X-Retry-Budget-Exhausted so operators can tell the two 503s apart. + // Retry budget exhausted. No Retry-After; distinct header + // X-Retry-Budget-Exhausted so operators can tell the two 503s apart + // from circuit-open rejects. static constexpr int RESULT_RETRY_BUDGET_EXHAUSTED = -8; // Constructor copies all needed fields from client_request (method, path, @@ -159,7 +158,7 @@ class ProxyTransaction : public std::enable_shared_from_this { // Timing std::chrono::steady_clock::time_point start_time_; - // Circuit breaker integration (Phase 4). Resolved once in Start() from + // Circuit breaker integration — resolved once in Start() from // `service_name_` + `dispatcher_index_`. Null when there's no // CircuitBreakerManager attached (server has no upstreams, or the // breaker is being built lazily) — the breaker is simply skipped in @@ -232,20 +231,17 @@ class ProxyTransaction : public std::enable_shared_from_this { // plain 503 for those codes if called generically. static HttpResponse MakeErrorResponse(int result_code); - // Phase 4: emit the §12.1 circuit-open response. 
+ // Emit the circuit-open response (design §12.1): // 503 + Retry-After (seconds until slice->OpenUntil()) // + X-Circuit-Breaker: open // + X-Upstream-Host: service:host:port HttpResponse MakeCircuitOpenResponse() const; - // Phase 5 will emit this. Declared here so Phase 4's - // MakeErrorResponse RESULT_RETRY_BUDGET_EXHAUSTED branch has a - // target to dispatch to and so tests can assert the response shape - // even before the retry-budget gate is wired. + // Emit the retry-budget-exhausted response (design §12.2): // 503 + X-Retry-Budget-Exhausted: 1 static HttpResponse MakeRetryBudgetResponse(); - // Phase 4 helpers — breaker gate and outcome classification. + // Breaker helpers — gate and outcome classification. // // ConsultBreaker: call at the top of AttemptCheckout. Populates // admission_generation_ and is_probe_ on admission; delivers the diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index d7e8ad07..1ff6e00e 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -134,7 +134,7 @@ void CircuitBreakerSlice::TripClosedToOpen(const char* trigger) { void CircuitBreakerSlice::TransitionOpenToHalfOpen() { state_.store(State::HALF_OPEN, std::memory_order_release); // Clear open_until_steady_ns_ per the OpenUntil() contract ("zero when - // not OPEN"). Leaving a stale deadline here would cause Phase 4's + // not OPEN"). Leaving a stale deadline here would cause // ProxyTransaction::MakeCircuitOpenResponse to compute a Retry-After // from a past time_point (negative delta → floor at 1s, misleading for // a reject in the HALF_OPEN probe-budget-full path). 
Retry-After for @@ -384,7 +384,7 @@ void CircuitBreakerSlice::ReportSuccess(bool probe, // Stale probe defense: we admitted this probe in HALF_OPEN, but the // slice may have transitioned out (e.g., `Reload()` flipped enabled, // `TransitionHalfOpenToClosed` already fired on sibling probes, or — - // post-Phase 8 — an operator toggle transitioned us to CLOSED). + // operator toggle transitioned us to CLOSED via Reload(). // Only touch HALF_OPEN bookkeeping / fire transitions when state is // STILL HALF_OPEN. if (state_.load(std::memory_order_acquire) != State::HALF_OPEN) return; @@ -581,7 +581,8 @@ void CircuitBreakerSlice::Reload(const CircuitBreakerConfig& new_config) { // Silent reset — no transition callback. The change is operator- // initiated configuration, not a runtime state signal; firing the // callback would cause PoolPartition::DrainWaitQueueOnTrip-style - // consumers (Phase 6) to spuriously drain waiters on a config edit. + // consumers (the wait-queue drain transition callback) to spuriously + // drain waiters on a config edit. state_.store(State::CLOSED, std::memory_order_release); open_until_steady_ns_.store(0, std::memory_order_release); consecutive_trips_.store(0, std::memory_order_relaxed); diff --git a/server/http_server.cc b/server/http_server.cc index ccd8b80f..c47b7688 100644 --- a/server/http_server.cc +++ b/server/http_server.cc @@ -3454,13 +3454,41 @@ bool HttpServer::Reload(const ServerConfig& new_config) { rate_limit_manager_->Reload(new_config.rate_limit); } - // Upstream pool changes require a restart — pools are built once in Start() - // and cannot be rebuilt at runtime without a full drain cycle. + // Circuit breaker reload — live-propagates breaker-field edits on + // existing upstream services. CircuitBreakerManager::Reload is + // idempotent (atomic stores to unchanged values), so calling it + // unconditionally costs nothing when the operator didn't edit any + // breaker fields. 
Topology changes (added / removed service names) + // are logged as warn + skipped inside the manager; the outer + // restart-required warning still fires via the upstreams-inequality + // check below. After this call, update the breaker slices on every + // partition via per-dispatcher EnQueue — the manager handles that + // routing internally. The topology check itself now only diffs non- + // breaker fields (UpstreamConfig::operator== excludes circuit_breaker), + // so a CB-only SIGHUP is a clean hot reload with no spurious warn. + if (circuit_breaker_manager_) { + circuit_breaker_manager_->Reload(new_config.upstreams); + } + + // Upstream topology changes (host/port/pool/proxy/tls) require a + // restart — pools are built once in Start() and cannot be rebuilt + // at runtime without a full drain cycle. The equality operator on + // UpstreamConfig deliberately excludes `circuit_breaker` so a CB- + // only edit doesn't trigger this warning (the reload above already + // applied the new breaker settings to live slices). if (new_config.upstreams != upstream_configs_) { - logging::Get()->warn("Reload: upstream configuration changes require a " - "restart to take effect (ignored)"); + logging::Get()->warn("Reload: upstream topology changes require a " + "restart to take effect (circuit-breaker " + "field edits, if any, were applied live)"); } + // Persist the new upstreams (preserving the breaker propagation just + // applied). Subsequent reloads diff against this baseline, so without + // this update a second SIGHUP would re-propagate the same CB values + // and also see the original topology as "unchanged" rather than the + // attempted new state — confusing operators debugging reload behavior. 
+ + upstream_configs_ = new_config.upstreams; + return true; } diff --git a/server/main.cc b/server/main.cc index 86f7598d..0d7474e9 100644 --- a/server/main.cc +++ b/server/main.cc @@ -435,7 +435,7 @@ static bool ReloadConfig(const std::string& config_path, // server keeps running the startup values — /stats and other // current_config consumers would report phantom state, and subsequent // identical reloads could produce inconsistent diagnostics. Pin to - // the running values until Phase 8 implements + // the running values until HttpServer::Reload wires in // CircuitBreakerManager::Reload (the only upstream sub-field that // becomes hot-reloadable); at that point this save becomes a // partial-field save excluding circuit_breaker. diff --git a/server/proxy_transaction.cc b/server/proxy_transaction.cc index 5cff47b6..29dbe550 100644 --- a/server/proxy_transaction.cc +++ b/server/proxy_transaction.cc @@ -114,8 +114,8 @@ void ProxyTransaction::Start() { method_, upstream_path); // Resolve the circuit-breaker slice once. Null when no breaker is - // attached (server has no upstreams configured, or Phase 4 skipped - // on this deployment), or when the service/dispatcher pair is out of + // attached (server has no upstreams configured), or when the + // service/dispatcher pair is out of // range. In any null case the breaker is simply bypassed — the // transaction proceeds as if circuit breaking were disabled. if (upstream_manager_ && dispatcher_index_ >= 0) { @@ -277,7 +277,8 @@ void ProxyTransaction::OnCheckoutError(int error_code) { // (POOL_EXHAUSTED, QUEUE_TIMEOUT) and shutdown should fail fast — // retrying under backpressure amplifies load on an already-stressed // pool and stretches client latency with no benefit. 
A breaker-drain - // reject (CHECKOUT_CIRCUIT_OPEN, Phase 6) is also terminal: the + // reject (CHECKOUT_CIRCUIT_OPEN from the wait-queue drain) is also + // terminal: the // client gets the same circuit-open response a fresh requester // would, and the retry loop must not retry it. // @@ -301,7 +302,7 @@ void ProxyTransaction::OnCheckoutError(int error_code) { if (error_code == CIRCUIT_OPEN) { // Drain path: breaker tripped while this transaction was queued - // (Phase 6 implements the drain). Do NOT Report to the slice — + // Do NOT Report to the slice — // our own reject must not feed back into the failure math. Emit // the §12.1 circuit-open response directly. logging::Get()->info( diff --git a/test/circuit_breaker_components_test.h b/test/circuit_breaker_components_test.h new file mode 100644 index 00000000..36285b16 --- /dev/null +++ b/test/circuit_breaker_components_test.h @@ -0,0 +1,507 @@ +#pragma once + +#include "test_framework.h" +#include "config/server_config.h" +#include "circuit_breaker/circuit_breaker_state.h" +#include "circuit_breaker/circuit_breaker_slice.h" +#include "circuit_breaker/retry_budget.h" +#include "circuit_breaker/circuit_breaker_host.h" +#include "circuit_breaker/circuit_breaker_manager.h" +#include "dispatcher.h" + +#include +#include +#include +#include + +// Circuit-breaker component unit tests: RetryBudget, CircuitBreakerHost, +// CircuitBreakerManager. +// +// These tests exercise the standalone data structures without any +// integration into the request path (covered by the integration suite). +// Every test constructs the object under test in isolation — no live +// dispatchers, no network I/O. A minimal Dispatcher is instantiated only +// where CircuitBreakerHost::Reload needs one to enqueue per-slice Reload +// calls. 
+namespace CircuitBreakerComponentsTests { + +using circuit_breaker::CircuitBreakerHost; +using circuit_breaker::CircuitBreakerHostSnapshot; +using circuit_breaker::CircuitBreakerManager; +using circuit_breaker::Decision; +using circuit_breaker::FailureKind; +using circuit_breaker::RetryBudget; +using circuit_breaker::State; + +static CircuitBreakerConfig DefaultCbConfig() { + CircuitBreakerConfig cb; + cb.enabled = true; + cb.consecutive_failure_threshold = 5; + cb.failure_rate_threshold = 50; + cb.minimum_volume = 20; + cb.window_seconds = 10; + cb.permitted_half_open_calls = 3; + cb.base_open_duration_ms = 5000; + cb.max_open_duration_ms = 60000; + cb.retry_budget_percent = 20; + cb.retry_budget_min_concurrency = 3; + return cb; +} + +// ============================================================================ +// RetryBudget tests +// ============================================================================ + +// Min-concurrency floor: with tiny in_flight, min_concurrency still permits +// the configured floor of concurrent retries (otherwise a 20% budget allows 0 +// retries when in_flight < 5 — useless in low-volume services). +void TestRetryBudgetMinConcurrencyFloor() { + std::cout << "\n[TEST] RetryBudget: min_concurrency floor permits retries..." + << std::endl; + try { + // percent=20, min=3. Even with 0 in_flight, 3 retries allowed. + RetryBudget rb(20, 3); + + // Without any in_flight, min floor is what gates us. + bool r1 = rb.TryConsumeRetry(); // 1/3 + bool r2 = rb.TryConsumeRetry(); // 2/3 + bool r3 = rb.TryConsumeRetry(); // 3/3 + bool r4 = rb.TryConsumeRetry(); // over → rejected + + bool pass = r1 && r2 && r3 && !r4 && + rb.RetriesInFlight() == 3 && + rb.RetriesRejected() == 1; + + rb.ReleaseRetry(); rb.ReleaseRetry(); rb.ReleaseRetry(); + pass = pass && rb.RetriesInFlight() == 0; + + TestFramework::RecordTest("RetryBudget min_concurrency floor", pass, + pass ? 
"" : "r1=" + std::to_string(r1) +
+ " r2=" + std::to_string(r2) +
+ " r3=" + std::to_string(r3) +
+ " r4=" + std::to_string(r4) +
+ " inflight=" + std::to_string(rb.RetriesInFlight()) +
+ " rejected=" + std::to_string(rb.RetriesRejected()),
+ TestFramework::TestCategory::OTHER);
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest("RetryBudget min_concurrency floor", false,
+ e.what(), TestFramework::TestCategory::OTHER);
+ }
+}
+
+// Percent-based cap scales with in_flight.
+// percent=20, min=0, in_flight=50 → cap = 10 retries.
+void TestRetryBudgetPercentCap() {
+ std::cout << "\n[TEST] RetryBudget: percent cap scales with in_flight..."
+ << std::endl;
+ try {
+ RetryBudget rb(20, 0); // no min floor — pure percent
+
+ // Push in_flight to 50 via guards that we intentionally keep
+ // alive. Per the documented API, callers hold TrackInFlight()
+ // for BOTH first attempts and retries — but TryConsumeRetry
+ // subtracts retries_in_flight from the base so the budget
+ // doesn't self-inflate as retries are admitted.
+ std::vector<RetryBudget::InFlightGuard> guards;
+ for (int i = 0; i < 50; ++i) guards.push_back(rb.TrackInFlight());
+
+ // With 50 non-retry in-flight and 20% budget the first
+ // admission is against cap=10, but each admission shrinks the
+ // non-retry base by 1. The admission count converges at r
+ // where r >= floor((50-r) * 20 / 100). Solving: r = 8. The
+ // pre-fix formula (cap computed from raw in_flight) would
+ // admit 10, drifting the effective ratio above 20% of
+ // originals.
+ int admitted = 0;
+ for (int i = 0; i < 20; ++i) {
+ if (rb.TryConsumeRetry()) ++admitted;
+ }
+ bool cap_hit = admitted == 8;
+ bool rejected_count = rb.RetriesRejected() == 12;
+
+ // Release guards — in_flight drops to 0; future TryConsumeRetry with
+ // min=0 and in_flight=0 rejects everything. 
+ for (auto& g : guards) (void)std::move(g); + guards.clear(); + for (int i = 0; i < admitted; ++i) rb.ReleaseRetry(); + + bool pass = cap_hit && rejected_count && rb.InFlight() == 0 && + rb.RetriesInFlight() == 0; + TestFramework::RecordTest("RetryBudget percent cap", pass, + pass ? "" : "admitted=" + std::to_string(admitted) + + " rejected=" + std::to_string(rb.RetriesRejected()) + + " inflight=" + std::to_string(rb.InFlight()), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("RetryBudget percent cap", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +// TrackInFlight guards must be RAII-safe: destroying the guard decrements +// in_flight_; moving the guard transfers ownership; self-move safe. +void TestRetryBudgetInFlightGuardRaii() { + std::cout << "\n[TEST] RetryBudget: InFlightGuard RAII..." << std::endl; + try { + RetryBudget rb(20, 3); + + bool zero_init = rb.InFlight() == 0; + { + auto g = rb.TrackInFlight(); + bool one_after_track = rb.InFlight() == 1; + + // Move-construct: counter transfers, original is empty. + auto g2 = std::move(g); + bool still_one_after_move = rb.InFlight() == 1; + // g is now empty, destroying it decrements nothing. + (void)g; + + // g2 goes out of scope next. + if (!zero_init || !one_after_track || !still_one_after_move) { + TestFramework::RecordTest("RetryBudget InFlightGuard RAII", + false, "mid-test state wrong", + TestFramework::TestCategory::OTHER); + return; + } + } + bool zero_after_drop = rb.InFlight() == 0; + TestFramework::RecordTest("RetryBudget InFlightGuard RAII", + zero_after_drop, + zero_after_drop ? 
"" : "in_flight not zero after guard drop", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("RetryBudget InFlightGuard RAII", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Reload updates tuning atomically without resetting in-flight counters — +// the admission formula changes, outstanding retries keep running. +void TestRetryBudgetReloadPreservesCounters() { + std::cout << "\n[TEST] RetryBudget: Reload preserves in-flight..." + << std::endl; + try { + RetryBudget rb(20, 3); + bool r1 = rb.TryConsumeRetry(); // 1/3 + + // Tighten tuning mid-flight. + rb.Reload(10, 1); + + // Outstanding retry is still tracked. + bool inflight_preserved = rb.RetriesInFlight() == 1; + + // New tuning applies — min=1, so 1/1 retry allowed max. + // Current retries_in_flight=1 already, next attempt rejects. + bool r2 = rb.TryConsumeRetry(); + + rb.ReleaseRetry(); + bool cleanup_ok = rb.RetriesInFlight() == 0; + + bool pass = r1 && inflight_preserved && !r2 && cleanup_ok; + TestFramework::RecordTest("RetryBudget Reload preserves counters", pass, + pass ? "" : "r1=" + std::to_string(r1) + + " inflight_preserved=" + std::to_string(inflight_preserved) + + " r2=" + std::to_string(r2) + + " cleanup_ok=" + std::to_string(cleanup_ok), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("RetryBudget Reload preserves counters", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Clamp guards: negative percent / negative min_concurrency are clamped at +// construction (mirrors ConfigLoader::Validate — programmatic callers that +// bypass validation get safe defaults). +void TestRetryBudgetClampsInvalidTuning() { + std::cout << "\n[TEST] RetryBudget: clamps invalid tuning..." << std::endl; + try { + RetryBudget rb(-50, -10); + bool clamped = rb.percent() == 0 && rb.min_concurrency() == 0; + + // Over-max percent clamps to 100. 
+ RetryBudget rb2(500, 5); + bool over_clamped = rb2.percent() == 100; + + // Reload also clamps. + rb.Reload(-1, -1); + bool reload_clamped = rb.percent() == 0 && rb.min_concurrency() == 0; + + bool pass = clamped && over_clamped && reload_clamped; + TestFramework::RecordTest("RetryBudget clamps invalid tuning", pass, + pass ? "" : + "clamped=" + std::to_string(clamped) + + " over_clamped=" + std::to_string(over_clamped) + + " reload_clamped=" + std::to_string(reload_clamped), + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("RetryBudget clamps invalid tuning", + false, e.what(), TestFramework::TestCategory::OTHER); + } +} + +// ============================================================================ +// CircuitBreakerHost tests +// ============================================================================ + +// Host creates partition_count slices, GetSlice looks up by index, out-of- +// range returns nullptr (not a crash). +void TestHostCreatesSlicesAndGetSlice() { + std::cout << "\n[TEST] CircuitBreakerHost: creates slices + GetSlice..." + << std::endl; + try { + auto cb = DefaultCbConfig(); + CircuitBreakerHost host("svc", "10.0.0.1", 8080, 4, cb); + + bool count_ok = host.partition_count() == 4; + bool slice0 = host.GetSlice(0) != nullptr; + bool slice3 = host.GetSlice(3) != nullptr; + bool slice4_null = host.GetSlice(4) == nullptr; // out of range + bool slice_big_null = host.GetSlice(100) == nullptr; + + // Retry budget always present. + bool rb_present = host.GetRetryBudget() != nullptr; + + // Field getters. 
+ bool fields_ok = host.service_name() == "svc" && + host.host() == "10.0.0.1" && + host.port() == 8080; + + bool pass = count_ok && slice0 && slice3 && slice4_null && + slice_big_null && rb_present && fields_ok; + TestFramework::RecordTest("CircuitBreakerHost GetSlice", pass, "", + TestFramework::TestCategory::OTHER); + } catch (const std::exception& e) { + TestFramework::RecordTest("CircuitBreakerHost GetSlice", false, + e.what(), TestFramework::TestCategory::OTHER); + } +} + +// Host Snapshot aggregates counters across slices and rolls up states. +void TestHostSnapshotAggregates() { + std::cout << "\n[TEST] CircuitBreakerHost: Snapshot aggregates..." + << std::endl; + try { + auto cb = DefaultCbConfig(); + cb.consecutive_failure_threshold = 2; + cb.failure_rate_threshold = 100; + cb.minimum_volume = 1000; + CircuitBreakerHost host("svc", "h", 80, 3, cb); + + // Trip slice 0 and 2 → 2 open_partitions, 1 closed. + for (int p : {0, 2}) { + auto* s = host.GetSlice(p); + for (int i = 0; i < 2; ++i) { + auto a = s->TryAcquire(); + s->ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); + } + } + + auto snap = host.Snapshot(); + + bool rows_ok = snap.slices.size() == 3; + bool total_trips = snap.total_trips == 2; + bool open = snap.open_partitions == 2; + bool halfopen = snap.half_open_partitions == 0; + bool svc_ok = snap.service_name == "svc" && + snap.host == "h" && snap.port == 80; + + bool pass = rows_ok && total_trips && open && halfopen && svc_ok; + TestFramework::RecordTest("CircuitBreakerHost Snapshot aggregates", pass, + pass ? 
"" :
+ "rows=" + std::to_string(snap.slices.size()) +
+ " trips=" + std::to_string(snap.total_trips) +
+ " open=" + std::to_string(snap.open_partitions),
+ TestFramework::TestCategory::OTHER);
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest("CircuitBreakerHost Snapshot aggregates",
+ false, e.what(), TestFramework::TestCategory::OTHER);
+ }
+}
+
+// Host Reload with mismatched dispatcher count logs error and does nothing.
+// Uses an empty dispatcher vector — the mismatch path must NOT dereference.
+void TestHostReloadDispatcherMismatchIsSafe() {
+ std::cout << "\n[TEST] CircuitBreakerHost: Reload dispatcher mismatch..."
+ << std::endl;
+ try {
+ auto cb = DefaultCbConfig();
+ CircuitBreakerHost host("svc", "h", 80, 3, cb);
+
+ auto new_cb = cb;
+ new_cb.failure_rate_threshold = 80;
+
+ // Mismatch: 0 dispatchers vs 3 slices. Must not crash, must not
+ // apply (retry budget atomics should stay at old values).
+ std::vector<std::shared_ptr<Dispatcher>> empty;
+ host.Reload(empty, new_cb);
+
+ // Retry budget fields should be unchanged — Reload bailed early.
+ bool rb_unchanged =
+ host.GetRetryBudget()->percent() == cb.retry_budget_percent &&
+ host.GetRetryBudget()->min_concurrency() ==
+ cb.retry_budget_min_concurrency;
+
+ TestFramework::RecordTest("CircuitBreakerHost Reload mismatch is safe",
+ rb_unchanged,
+ rb_unchanged ? "" : "retry budget incorrectly updated on bail",
+ TestFramework::TestCategory::OTHER);
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest("CircuitBreakerHost Reload mismatch is safe",
+ false, e.what(), TestFramework::TestCategory::OTHER);
+ }
+}
+
+// ============================================================================
+// CircuitBreakerManager tests
+// ============================================================================
+
+// Manager builds one host per upstream (regardless of enabled). GetHost
+// returns non-null for known names and null for unknown. 
+void TestManagerGetHostLookup() {
+ std::cout << "\n[TEST] CircuitBreakerManager: GetHost lookup..."
+ << std::endl;
+ try {
+ std::vector<UpstreamConfig> upstreams(2);
+ upstreams[0].name = "svc-a";
+ upstreams[0].host = "10.0.0.1";
+ upstreams[0].port = 8080;
+ upstreams[0].circuit_breaker = DefaultCbConfig();
+ upstreams[1].name = "svc-b";
+ upstreams[1].host = "10.0.0.2";
+ upstreams[1].port = 9090;
+ upstreams[1].circuit_breaker = DefaultCbConfig();
+ upstreams[1].circuit_breaker.enabled = false; // disabled still built
+
+ CircuitBreakerManager mgr(upstreams, 4, {});
+
+ bool count_ok = mgr.host_count() == 2;
+ auto* a = mgr.GetHost("svc-a");
+ auto* b = mgr.GetHost("svc-b");
+ auto* unknown = mgr.GetHost("nope");
+
+ bool a_ok = a != nullptr && a->port() == 8080 &&
+ a->partition_count() == 4;
+ bool b_ok = b != nullptr && b->port() == 9090 &&
+ b->partition_count() == 4;
+ bool unknown_null = unknown == nullptr;
+
+ bool pass = count_ok && a_ok && b_ok && unknown_null;
+ TestFramework::RecordTest("CircuitBreakerManager GetHost lookup", pass,
+ pass ? "" :
+ "count_ok=" + std::to_string(count_ok) +
+ " a=" + std::to_string(a_ok) +
+ " b=" + std::to_string(b_ok) +
+ " unknown_null=" + std::to_string(unknown_null),
+ TestFramework::TestCategory::OTHER);
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest("CircuitBreakerManager GetHost lookup",
+ false, e.what(), TestFramework::TestCategory::OTHER);
+ }
+}
+
+// SnapshotAll returns one entry per host; topology-preserved Reload logs and
+// skips new/removed names without crashing.
+void TestManagerSnapshotAllAndReloadSkipsTopologyChanges() {
+ std::cout << "\n[TEST] CircuitBreakerManager: SnapshotAll + Reload skips topology..." 
+ << std::endl;
+ try {
+ std::vector<UpstreamConfig> upstreams(1);
+ upstreams[0].name = "svc-a";
+ upstreams[0].host = "h";
+ upstreams[0].port = 80;
+ upstreams[0].circuit_breaker = DefaultCbConfig();
+
+ CircuitBreakerManager mgr(upstreams, 2, {});
+
+ auto snaps = mgr.SnapshotAll();
+ bool one_snapshot = snaps.size() == 1;
+ bool snap_name_ok = snaps[0].service_name == "svc-a";
+
+ // Reload with a NEW name + REMOVED existing name — both must log
+ // warn and do nothing (topology is restart-only).
+ std::vector<UpstreamConfig> new_upstreams(1);
+ new_upstreams[0].name = "svc-NEW";
+ new_upstreams[0].host = "h";
+ new_upstreams[0].port = 80;
+ new_upstreams[0].circuit_breaker = DefaultCbConfig();
+
+ mgr.Reload(new_upstreams);
+
+ // Manager must still only know about svc-a (the original).
+ bool original_preserved = mgr.GetHost("svc-a") != nullptr;
+ bool new_not_added = mgr.GetHost("svc-NEW") == nullptr;
+ bool count_stable = mgr.host_count() == 1;
+
+ bool pass = one_snapshot && snap_name_ok && original_preserved &&
+ new_not_added && count_stable;
+ TestFramework::RecordTest(
+ "CircuitBreakerManager SnapshotAll + topology-skip", pass,
+ pass ? "" :
+ "one_snap=" + std::to_string(one_snapshot) +
+ " name_ok=" + std::to_string(snap_name_ok) +
+ " preserved=" + std::to_string(original_preserved) +
+ " new_not_added=" + std::to_string(new_not_added) +
+ " count=" + std::to_string(mgr.host_count()),
+ TestFramework::TestCategory::OTHER);
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest(
+ "CircuitBreakerManager SnapshotAll + topology-skip",
+ false, e.what(), TestFramework::TestCategory::OTHER);
+ }
+}
+
+// Empty-name upstream is skipped defensively (ConfigLoader::Validate rejects
+// empty names, but manager must not blow up if something slips through).
+void TestManagerSkipsEmptyNameUpstream() {
+ std::cout << "\n[TEST] CircuitBreakerManager: skips empty-name upstream..." 
+ << std::endl;
+ try {
+ std::vector<UpstreamConfig> upstreams(2);
+ upstreams[0].name = ""; // defensive — should be skipped
+ upstreams[0].host = "h";
+ upstreams[0].port = 80;
+ upstreams[0].circuit_breaker = DefaultCbConfig();
+ upstreams[1].name = "svc-b";
+ upstreams[1].host = "h";
+ upstreams[1].port = 81;
+ upstreams[1].circuit_breaker = DefaultCbConfig();
+
+ CircuitBreakerManager mgr(upstreams, 2, {});
+
+ bool pass = mgr.host_count() == 1 &&
+ mgr.GetHost("svc-b") != nullptr &&
+ mgr.GetHost("") == nullptr;
+ TestFramework::RecordTest(
+ "CircuitBreakerManager skips empty-name upstream", pass,
+ pass ? "" : "count=" + std::to_string(mgr.host_count()),
+ TestFramework::TestCategory::OTHER);
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest(
+ "CircuitBreakerManager skips empty-name upstream",
+ false, e.what(), TestFramework::TestCategory::OTHER);
+ }
+}
+
+// Run all circuit-breaker component unit tests.
+void RunAllTests() {
+ std::cout << "\n" << std::string(60, '=') << std::endl;
+ std::cout << "CIRCUIT BREAKER - COMPONENT UNIT TESTS" << std::endl;
+ std::cout << std::string(60, '=') << std::endl;
+
+ TestRetryBudgetMinConcurrencyFloor();
+ TestRetryBudgetPercentCap();
+ TestRetryBudgetInFlightGuardRaii();
+ TestRetryBudgetReloadPreservesCounters();
+ TestRetryBudgetClampsInvalidTuning();
+
+ TestHostCreatesSlicesAndGetSlice();
+ TestHostSnapshotAggregates();
+ TestHostReloadDispatcherMismatchIsSafe();
+
+ TestManagerGetHostLookup();
+ TestManagerSnapshotAllAndReloadSkipsTopologyChanges();
+ TestManagerSkipsEmptyNameUpstream();
+}
+
+} // namespace CircuitBreakerComponentsTests
diff --git a/test/circuit_breaker_integration_test.h b/test/circuit_breaker_integration_test.h
new file mode 100644
index 00000000..10e72e5b
--- /dev/null
+++ b/test/circuit_breaker_integration_test.h
@@ -0,0 +1,1213 @@
+#pragma once
+
+// Integration tests: circuit breaker wired into ProxyTransaction +
+// UpstreamManager + HttpServer. 
Exercises the full request path end-to-end. +// +// Strategy: use a backend that returns 5xx on every request so repeated hits +// trip the breaker via the consecutive-failure threshold. 5xx responses are +// the cheapest way to accumulate failures (no connect timeouts to wait for). +// Low thresholds keep tests fast. + +#include "test_framework.h" +#include "test_server_runner.h" +#include "http_test_client.h" +#include "http/http_server.h" +#include "config/server_config.h" +#include "upstream/upstream_manager.h" +#include "circuit_breaker/circuit_breaker_manager.h" +#include "circuit_breaker/circuit_breaker_host.h" +#include "circuit_breaker/circuit_breaker_slice.h" + +#include +#include +#include + +namespace CircuitBreakerIntegrationTests { + +using circuit_breaker::State; + +// Shared helper: build an upstream config that proxies /echo → backend and +// has a breaker configured with low thresholds for fast trip. +static UpstreamConfig MakeBreakerUpstream(const std::string& name, + const std::string& host, + int port, + bool breaker_enabled, + int consecutive_threshold = 3) { + UpstreamConfig u; + u.name = name; + u.host = host; + u.port = port; + u.pool.max_connections = 8; + u.pool.max_idle_connections = 4; + u.pool.connect_timeout_ms = 3000; + u.pool.idle_timeout_sec = 30; + u.pool.max_lifetime_sec = 3600; + u.pool.max_requests_per_conn = 0; + + // Exact-match route — simpler than prefix patterns for integration tests. + u.proxy.route_prefix = "/fail"; + u.proxy.strip_prefix = false; + u.proxy.response_timeout_ms = 2000; + // No retries — keeps the test deterministic: one request = one attempt. + u.proxy.retry.max_retries = 0; + + u.circuit_breaker.enabled = breaker_enabled; + u.circuit_breaker.consecutive_failure_threshold = consecutive_threshold; + // Disable the rate-based trip path — we drive everything through + // consecutive failures to keep the test count predictable. 
+ u.circuit_breaker.failure_rate_threshold = 100; + u.circuit_breaker.minimum_volume = 10000; + u.circuit_breaker.window_seconds = 10; + u.circuit_breaker.permitted_half_open_calls = 2; + u.circuit_breaker.base_open_duration_ms = 500; // short so recovery test is quick + u.circuit_breaker.max_open_duration_ms = 60000; + return u; +} + +// --------------------------------------------------------------------------- +// Test 1: Breaker trips on consecutive 5xx responses and emits circuit-open +// headers on the rejected request. +// --------------------------------------------------------------------------- +void TestBreakerTripsAfterConsecutiveFailures() { + std::cout << "\n[TEST] CB Integration: breaker trips after consecutive 5xx..." + << std::endl; + try { + // Backend always returns 502 — gateway classifies the response as + // FailureKind::RESPONSE_5XX and reports to the breaker on every attempt. + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("upstream err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + // worker_threads=1 → all TCP connections land on dispatcher 0 + // (NetServer shards new connections by fd%worker_threads), so + // per-request failures accumulate deterministically on slice[0] + // instead of splitting across multiple slices. 
// single thread → single breaker partition exercised + gw.upstreams.push_back( + MakeBreakerUpstream("bad-svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Hit the failing backend threshold times — each 502 from backend + // propagates to the client as 502 (gateway pass-through) AND counts + // as a RESPONSE_5XX failure in the breaker. + for (int i = 0; i < 3; ++i) { + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + if (!TestHttpClient::HasStatus(r, 502)) { + TestFramework::RecordTest( + "CB Integration: trip after consecutive failures", false, + "pre-trip request " + std::to_string(i) + " expected 502, got: " + + r.substr(0, 32)); + return; + } + } + + // Next request must be rejected by the breaker (not proxied). The + // response is 503 with X-Circuit-Breaker: open and Retry-After. + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + bool is_503 = TestHttpClient::HasStatus(r, 503); + bool has_breaker_header = + r.find("X-Circuit-Breaker: open") != std::string::npos || + r.find("x-circuit-breaker: open") != std::string::npos; + bool has_retry_after = + r.find("Retry-After:") != std::string::npos || + r.find("retry-after:") != std::string::npos; + bool has_upstream_host = + r.find("X-Upstream-Host:") != std::string::npos || + r.find("x-upstream-host:") != std::string::npos; + + bool pass = is_503 && has_breaker_header && has_retry_after && + has_upstream_host; + TestFramework::RecordTest( + "CB Integration: trip after consecutive failures", pass, + pass ? 
"" :
+ "is_503=" + std::to_string(is_503) +
+ " breaker_hdr=" + std::to_string(has_breaker_header) +
+ " retry_after=" + std::to_string(has_retry_after) +
+ " upstream_host=" + std::to_string(has_upstream_host) +
+ " body=" + r.substr(0, 256));
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest(
+ "CB Integration: trip after consecutive failures", false, e.what());
+ }
+}
+
+// ---------------------------------------------------------------------------
+// Test 2: When circuit_breaker.enabled=false, the breaker is bypassed entirely.
+// The same failure pattern that would trip an enabled breaker must leave the
+// pass-through path untouched — every request still reaches the backend.
+// ---------------------------------------------------------------------------
+void TestBreakerDisabledPassesThrough() {
+ std::cout << "\n[TEST] CB Integration: disabled breaker passes through..."
+ << std::endl;
+ try {
+ std::atomic<int> backend_hits{0};
+ HttpServer backend("127.0.0.1", 0);
+ backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) {
+ backend_hits.fetch_add(1, std::memory_order_relaxed);
+ resp.Status(502).Body("err", "text/plain");
+ });
+ TestServerRunner backend_runner(backend);
+ int backend_port = backend_runner.GetPort();
+
+ ServerConfig gw;
+ gw.bind_host = "127.0.0.1";
+ gw.bind_port = 0;
+ gw.worker_threads = 1;
+ gw.http2.enabled = false;
+ // worker_threads=1 → all TCP connections land on dispatcher 0
+ // (NetServer shards new connections by fd%worker_threads), so
+ // per-request failures accumulate deterministically on slice[0]
+ // instead of splitting across multiple slices.
+ gw.upstreams.push_back(
+ MakeBreakerUpstream("svc", "127.0.0.1", backend_port,
+ /*enabled=*/false, /*threshold=*/3));
+
+ HttpServer gateway(gw);
+ TestServerRunner gw_runner(gateway);
+ int gw_port = gw_runner.GetPort();
+
+ // 10 requests — with breaker disabled, all 10 reach backend. 
+ for (int i = 0; i < 10; ++i) {
+ std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+ if (!TestHttpClient::HasStatus(r, 502)) {
+ TestFramework::RecordTest(
+ "CB Integration: disabled breaker passes through", false,
+ "request " + std::to_string(i) + " expected 502, got: " +
+ r.substr(0, 32));
+ return;
+ }
+ }
+
+ bool all_hit = backend_hits.load() == 10;
+ TestFramework::RecordTest(
+ "CB Integration: disabled breaker passes through", all_hit,
+ all_hit ? "" :
+ "expected 10 backend hits, got " + std::to_string(backend_hits.load()));
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest(
+ "CB Integration: disabled breaker passes through", false, e.what());
+ }
+}
+
+// ---------------------------------------------------------------------------
+// Test 3: 2xx responses are reported as success — they reset the
+// consecutive-failure counter so the breaker doesn't trip on interleaved
+// success/failure traffic.
+// ---------------------------------------------------------------------------
+void TestSuccessResetsConsecutiveFailureCounter() {
+ std::cout << "\n[TEST] CB Integration: 2xx success resets consecutive-failure counter..."
+ << std::endl;
+ try {
+ std::atomic<bool> fail_mode{true};
+ HttpServer backend("127.0.0.1", 0);
+ // Backend must serve /fail — that's the exact-match route the
+ // proxy forwards (MakeBreakerUpstream sets route_prefix="/fail",
+ // strip_prefix=false). A different backend path would leave
+ // the gateway 404-ing every request without ever exercising
+ // the proxy, and the CLOSED-state assertion below would pass
+ // for the wrong reason. 
+ backend.Get("/fail", [&fail_mode](const HttpRequest&, HttpResponse& resp) { + if (fail_mode.load()) { + resp.Status(502).Body("err", "text/plain"); + } else { + resp.Status(200).Body("ok", "text/plain"); + } + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + // worker_threads=1 → all TCP connections land on dispatcher 0 + // (NetServer shards new connections by fd%worker_threads), so + // per-request failures accumulate deterministically on slice[0] + // instead of splitting across multiple slices. + gw.upstreams.push_back( + MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Pattern: F F S F F — 5 total: 2 fails, 1 success, 2 fails. + // With reset semantics, consecutive_failures_ never exceeds 2 → no trip. + for (int i = 0; i < 2; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); // FAIL + } + fail_mode.store(false); + TestHttpClient::HttpGet(gw_port, "/fail", 3000); // SUCCESS → reset + fail_mode.store(true); + for (int i = 0; i < 2; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); // FAIL + } + + // Inspect the breaker's state directly. The slice must be CLOSED + // AND must have observed activity — without the second check, a + // gateway that 404's every request (e.g. because the proxy route + // doesn't match) would also pass trivially. + auto* cbm = gateway.GetUpstreamManager() ? + gateway.GetUpstreamManager()->GetCircuitBreakerManager() : nullptr; + auto* host = cbm ? cbm->GetHost("svc") : nullptr; + auto* slice = host ? host->GetSlice(0) : nullptr; + bool still_closed = slice && slice->CurrentState() == State::CLOSED; + // No trip fired: total_trips should be zero for this slice. + int64_t trips = slice ? 
slice->Trips() : -1;
+ bool no_trips = (trips == 0);
+
+ bool pass = still_closed && no_trips;
+ TestFramework::RecordTest(
+ "CB Integration: success resets consecutive counter", pass,
+ pass ? "" :
+ "state=" + std::to_string(static_cast<int>(
+ slice ? slice->CurrentState() : State::CLOSED)) +
+ " trips=" + std::to_string(trips));
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest(
+ "CB Integration: success resets consecutive counter", false, e.what());
+ }
+}
+
+// ---------------------------------------------------------------------------
+// Test 4: After the trip, the live slice state is OPEN. Verifies the
+// integration actually drives the slice state machine (not just the response).
+// ---------------------------------------------------------------------------
+void TestTripDrivesSliceState() {
+ std::cout << "\n[TEST] CB Integration: trip drives slice state to OPEN..."
+ << std::endl;
+ try {
+ HttpServer backend("127.0.0.1", 0);
+ backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) {
+ resp.Status(502).Body("err", "text/plain");
+ });
+ TestServerRunner backend_runner(backend);
+ int backend_port = backend_runner.GetPort();
+
+ ServerConfig gw;
+ gw.bind_host = "127.0.0.1";
+ gw.bind_port = 0;
+ gw.worker_threads = 1;
+ gw.http2.enabled = false;
+ // worker_threads=1 → all TCP connections land on dispatcher 0
+ // (NetServer shards new connections by fd%worker_threads), so
+ // per-request failures accumulate deterministically on slice[0]
+ // instead of splitting across multiple slices.
+ gw.upstreams.push_back(
+ MakeBreakerUpstream("svc", "127.0.0.1", backend_port,
+ /*enabled=*/true, /*threshold=*/3));
+
+ HttpServer gateway(gw);
+ TestServerRunner gw_runner(gateway);
+ int gw_port = gw_runner.GetPort();
+
+ // 3 failures → trip.
+ for (int i = 0; i < 3; ++i) {
+ TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+ }
+
+ // With worker_threads > 1 the 3 failing requests can land on either
+ // dispatcher (hash-dependent). 
Check the aggregate snapshot — at + // least one partition must be OPEN with exactly one trip recorded. + auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); + auto* host = cbm->GetHost("svc"); + auto snap = host->Snapshot(); + bool at_least_one_open = snap.open_partitions >= 1; + bool one_trip = snap.total_trips == 1; + // Sanity: the tripped partition should be the one that saw all 3 + // failures (consecutive trip is single-slice, not cross-slice). + bool single_partition_tripped = snap.open_partitions == 1; + + bool pass = at_least_one_open && one_trip && single_partition_tripped; + TestFramework::RecordTest( + "CB Integration: trip drives slice state to OPEN", pass, + pass ? "" : + "at_least_one_open=" + std::to_string(at_least_one_open) + + " one_trip=" + std::to_string(one_trip) + + " single_partition=" + std::to_string(single_partition_tripped) + + " (open_partitions=" + std::to_string(snap.open_partitions) + + ", total_trips=" + std::to_string(snap.total_trips) + ")"); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Integration: trip drives slice state to OPEN", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 5: Breaker-rejected requests do NOT hit the backend. After the trip, +// subsequent requests must be served locally (503) without any upstream I/O. +// Prevents regression where the gate leaked admissions to a known-bad upstream. +// --------------------------------------------------------------------------- +void TestOpenBreakerShortCircuitsUpstreamCall() { + std::cout << "\n[TEST] CB Integration: OPEN breaker short-circuits upstream call..." 
+ << std::endl;
+ try {
+ std::atomic<int> backend_hits{0};
+ HttpServer backend("127.0.0.1", 0);
+ backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) {
+ backend_hits.fetch_add(1, std::memory_order_relaxed);
+ resp.Status(502).Body("err", "text/plain");
+ });
+ TestServerRunner backend_runner(backend);
+ int backend_port = backend_runner.GetPort();
+
+ ServerConfig gw;
+ gw.bind_host = "127.0.0.1";
+ gw.bind_port = 0;
+ gw.worker_threads = 1;
+ gw.http2.enabled = false;
+ // worker_threads=1 → all TCP connections land on dispatcher 0
+ // (NetServer shards new connections by fd%worker_threads), so
+ // per-request failures accumulate deterministically on slice[0]
+ // instead of splitting across multiple slices.
+ gw.upstreams.push_back(
+ MakeBreakerUpstream("svc", "127.0.0.1", backend_port,
+ /*enabled=*/true, /*threshold=*/3));
+
+ HttpServer gateway(gw);
+ TestServerRunner gw_runner(gateway);
+ int gw_port = gw_runner.GetPort();
+
+ // 3 failing requests to trip.
+ for (int i = 0; i < 3; ++i) {
+ TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+ }
+ int hits_at_trip = backend_hits.load();
+
+ // 5 more requests — all should be rejected locally.
+ for (int i = 0; i < 5; ++i) {
+ TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+ }
+ int hits_after = backend_hits.load();
+
+ // Backend hits must not grow during the post-trip burst.
+ bool no_leak = hits_after == hits_at_trip;
+ TestFramework::RecordTest(
+ "CB Integration: OPEN short-circuits upstream call", no_leak,
+ no_leak ? "" :
+ "backend hits grew from " + std::to_string(hits_at_trip) +
+ " to " + std::to_string(hits_after) + " after trip");
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest(
+ "CB Integration: OPEN short-circuits upstream call", false, e.what());
+ }
+}
+
+// Sanity check: verify the bare proxy setup works without the breaker
+// before blaming the breaker integration. 
+void TestBareProxyWorks() { + std::cout << "\n[TEST] CB Integration: bare proxy (sanity)..." << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + UpstreamConfig u; + u.name = "svc"; + u.host = "127.0.0.1"; + u.port = backend_port; + u.pool.max_connections = 8; + u.pool.max_idle_connections = 4; + u.pool.connect_timeout_ms = 3000; + u.proxy.route_prefix = "/fail"; + u.proxy.response_timeout_ms = 5000; + u.circuit_breaker.enabled = true; // sanity + breaker enabled + u.circuit_breaker.consecutive_failure_threshold = 3; + u.circuit_breaker.failure_rate_threshold = 100; + u.circuit_breaker.minimum_volume = 10000; + u.circuit_breaker.window_seconds = 10; + u.circuit_breaker.permitted_half_open_calls = 2; + u.circuit_breaker.base_open_duration_ms = 500; + u.circuit_breaker.max_open_duration_ms = 60000; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 5000); + bool pass = TestHttpClient::HasStatus(r, 502); + TestFramework::RecordTest( + "CB Integration: bare proxy sanity", pass, + pass ? "" : "expected 502, got: " + r.substr(0, 128)); + } catch (const std::exception& e) { + TestFramework::RecordTest("CB Integration: bare proxy sanity", + false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 7: Retry-After header carries a sensible value — within [1, configured +// max_open_duration_ms / 1000], and in the right ballpark of OpenUntil()-now. 
+// --------------------------------------------------------------------------- +void TestRetryAfterHeaderValue() { + std::cout << "\n[TEST] CB Integration: Retry-After value correctness..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + // base_open_duration 2000ms, max 60_000ms — Retry-After should + // ceiling-round and fall inside [1, 60]. + auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3); + u.circuit_breaker.base_open_duration_ms = 2000; + u.circuit_breaker.max_open_duration_ms = 60000; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Trip the breaker. + for (int i = 0; i < 3; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + + // Capture the open-rejection response. + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + bool is_503 = TestHttpClient::HasStatus(r, 503); + + // Extract Retry-After integer value (case-insensitive header). + int retry_after = -1; + const char* markers[] = {"Retry-After:", "retry-after:"}; + for (const char* m : markers) { + auto pos = r.find(m); + if (pos == std::string::npos) continue; + pos += std::string(m).size(); + while (pos < r.size() && (r[pos] == ' ' || r[pos] == '\t')) ++pos; + int val = 0; + bool any = false; + while (pos < r.size() && r[pos] >= '0' && r[pos] <= '9') { + val = val * 10 + (r[pos] - '0'); + any = true; + ++pos; + } + if (any) { retry_after = val; break; } + } + + // Contract: value ≥ 1 and ≤ max_open_duration_ms / 1000 (60). 
+ // For base_open_duration 2000ms the remaining-seconds at this
+ // moment is ≤ 2 (probably 1 or 2 after ceiling), so the upper
+ // sanity bound is generous but still rules out 300/3600-class
+ // buggy fallbacks.
+ bool in_range = (retry_after >= 1 && retry_after <= 60);
+ bool reasonable = (retry_after >= 1 && retry_after <= 3);
+
+ bool pass = is_503 && in_range && reasonable;
+ TestFramework::RecordTest(
+ "CB Integration: Retry-After value in range", pass,
+ pass ? "" :
+ "is_503=" + std::to_string(is_503) +
+ " retry_after=" + std::to_string(retry_after) +
+ " body=" + r.substr(0, 256));
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest(
+ "CB Integration: Retry-After value in range", false, e.what());
+ }
+}
+
+// ---------------------------------------------------------------------------
+// Test 8: Retry loop is terminal on CIRCUIT_OPEN — even with max_retries=3,
+// a request that hits an OPEN breaker gets exactly ONE 503 (no retry-flavored
+// second 503). Ensures ReportBreakerOutcome doesn't feed the reject back into
+// the breaker and MaybeRetry stays out.
+// ---------------------------------------------------------------------------
+void TestCircuitOpenTerminalForRetry() {
+ std::cout << "\n[TEST] CB Integration: CIRCUIT_OPEN terminal for retry loop..."
+ << std::endl;
+ try {
+ std::atomic<int> backend_hits{0};
+ HttpServer backend("127.0.0.1", 0);
+ backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) {
+ backend_hits.fetch_add(1, std::memory_order_relaxed);
+ resp.Status(502).Body("err", "text/plain");
+ });
+ TestServerRunner backend_runner(backend);
+ int backend_port = backend_runner.GetPort();
+
+ ServerConfig gw;
+ gw.bind_host = "127.0.0.1";
+ gw.bind_port = 0;
+ gw.worker_threads = 1;
+ gw.http2.enabled = false;
+ // Retries enabled on 5xx — if the breaker reject leaked into
+ // MaybeRetry, the test would see extra backend hits after the
+ // trip. Long open window so the breaker stays OPEN for the
+ // duration of the post-trip assertion (no HALF_OPEN probe
+ // admission racing the test).
+ auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port,
+ /*enabled=*/true, /*threshold=*/3);
+ u.proxy.retry.max_retries = 3;
+ u.proxy.retry.retry_on_5xx = true;
+ u.circuit_breaker.base_open_duration_ms = 30000;
+ u.circuit_breaker.max_open_duration_ms = 60000;
+ gw.upstreams.push_back(u);
+
+ HttpServer gateway(gw);
+ TestServerRunner gw_runner(gateway);
+ int gw_port = gw_runner.GetPort();
+
+ // Trip the breaker. Each pre-trip request may retry up to 3
+ // times (all failing 5xx), so backend sees up to 3*threshold=12
+ // hits. That's acceptable — we just care about post-trip behavior.
+ for (int i = 0; i < 3; ++i) {
+ TestHttpClient::HttpGet(gw_port, "/fail", 5000);
+ }
+ int pre_trip_hits = backend_hits.load();
+
+ // Post-trip request: expect a single 503 and NO new backend hits.
+ std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+ bool is_503 = TestHttpClient::HasStatus(r, 503);
+ int post_trip_hits = backend_hits.load();
+ bool no_new_hits = (post_trip_hits == pre_trip_hits);
+
+ bool pass = is_503 && no_new_hits;
+ TestFramework::RecordTest(
+ "CB Integration: CIRCUIT_OPEN terminal for retry", pass,
+ pass ? "" :
+ "is_503=" + std::to_string(is_503) +
+ " pre=" + std::to_string(pre_trip_hits) +
+ " post=" + std::to_string(post_trip_hits));
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest(
+ "CB Integration: CIRCUIT_OPEN terminal for retry", false, e.what());
+ }
+}
+
+// ---------------------------------------------------------------------------
+// Test 9: Dry-run mode — dry_run=true forwards rejected requests to the
+// upstream (pass-through) but still increments the rejected_ counter so
+// operators can observe the would-reject rate without production impact.
+// ---------------------------------------------------------------------------
+void TestDryRunPassthrough() {
+ std::cout << "\n[TEST] CB Integration: dry-run passthrough..." << std::endl;
+ try {
+ std::atomic<int> backend_hits{0};
+ HttpServer backend("127.0.0.1", 0);
+ backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) {
+ backend_hits.fetch_add(1, std::memory_order_relaxed);
+ resp.Status(502).Body("err", "text/plain");
+ });
+ TestServerRunner backend_runner(backend);
+ int backend_port = backend_runner.GetPort();
+
+ ServerConfig gw;
+ gw.bind_host = "127.0.0.1";
+ gw.bind_port = 0;
+ gw.worker_threads = 1;
+ gw.http2.enabled = false;
+ auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port,
+ /*enabled=*/true, /*threshold=*/3);
+ u.circuit_breaker.dry_run = true; // would-reject, but still forward
+ gw.upstreams.push_back(u);
+
+ HttpServer gateway(gw);
+ TestServerRunner gw_runner(gateway);
+ int gw_port = gw_runner.GetPort();
+
+ // Trip thresholds with 5 requests. All should reach backend (502),
+ // not a 503 — dry-run never short-circuits.
+ for (int i = 0; i < 5; ++i) {
+ std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+ if (!TestHttpClient::HasStatus(r, 502)) {
+ TestFramework::RecordTest(
+ "CB Integration: dry-run passthrough", false,
+ "request " + std::to_string(i) +
+ " expected 502, got: " + r.substr(0, 64));
+ return;
+ }
+ }
+
+ bool all_hit = (backend_hits.load() == 5);
+
+ // Verify the slice observed trips/rejected even though traffic passed.
+ auto* mgr = gateway.GetUpstreamManager() ?
+ gateway.GetUpstreamManager()->GetCircuitBreakerManager() : + nullptr; + int64_t trips = 0, rejected = 0; + if (mgr) { + auto* host = mgr->GetHost("svc"); + if (host) { + auto snap = host->Snapshot(); + trips = snap.total_trips; + rejected = snap.total_rejected; + } + } + // At least one trip fired (consecutive_threshold=3 → slice + // transitioned at least once during the run), and the post-trip + // requests were counted as would-reject (rejected > 0). + bool observed = (trips >= 1) && (rejected >= 1); + + bool pass = all_hit && observed; + TestFramework::RecordTest( + "CB Integration: dry-run passthrough", pass, + pass ? "" : + "hits=" + std::to_string(backend_hits.load()) + + " trips=" + std::to_string(trips) + + " rejected=" + std::to_string(rejected)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Integration: dry-run passthrough", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 10: HALF_OPEN → CLOSED recovery round-trip through the proxy. Trip the +// breaker, wait for the open window to elapse, then serve success responses +// and assert the slice transitions back to CLOSED (consecutive_successes +// crosses the threshold — default 2 from DefaultCbConfig / integration config). +// --------------------------------------------------------------------------- +void TestHalfOpenRecoveryRoundTrip() { + std::cout << "\n[TEST] CB Integration: HALF_OPEN → CLOSED recovery..." 
+ << std::endl;
+ try {
+ std::atomic<bool> fail_mode{true};
+ HttpServer backend("127.0.0.1", 0);
+ backend.Get("/fail", [&fail_mode](const HttpRequest&, HttpResponse& resp) {
+ if (fail_mode.load()) {
+ resp.Status(502).Body("err", "text/plain");
+ } else {
+ resp.Status(200).Body("ok", "text/plain");
+ }
+ });
+ TestServerRunner backend_runner(backend);
+ int backend_port = backend_runner.GetPort();
+
+ ServerConfig gw;
+ gw.bind_host = "127.0.0.1";
+ gw.bind_port = 0;
+ gw.worker_threads = 1;
+ gw.http2.enabled = false;
+ auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port,
+ /*enabled=*/true, /*threshold=*/3);
+ // Short open duration so recovery path finishes quickly.
+ u.circuit_breaker.base_open_duration_ms = 300;
+ u.circuit_breaker.max_open_duration_ms = 1000;
+ // Two probes needed to close (default permitted_half_open_calls=2).
+ u.circuit_breaker.permitted_half_open_calls = 2;
+ gw.upstreams.push_back(u);
+
+ HttpServer gateway(gw);
+ TestServerRunner gw_runner(gateway);
+ int gw_port = gw_runner.GetPort();
+
+ // Trip by hitting the failing backend.
+ for (int i = 0; i < 3; ++i) {
+ TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+ }
+
+ // Flip backend to success and wait for the open window to elapse.
+ fail_mode.store(false);
+ std::this_thread::sleep_for(std::chrono::milliseconds(500));
+
+ // Probe the proxy — each successful 200 advances HALF_OPEN toward
+ // CLOSED. Do more than permitted_half_open_calls; some will be
+ // rejected as half_open_full but the ones that are admitted will
+ // close the breaker.
+ bool saw_success = false;
+ for (int i = 0; i < 8; ++i) {
+ std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+ if (TestHttpClient::HasStatus(r, 200)) saw_success = true;
+ // Small gap between probes — HALF_OPEN only admits permitted
+ // probes per cycle; spacing lets subsequent probes observe a
+ // possibly-closed breaker.
+ std::this_thread::sleep_for(std::chrono::milliseconds(50)); + } + + // Verify slice aggregate: at least one CLOSED transition observed + // (probe_successes >= 1 and total_trips == 1 — we only tripped once). + auto* mgr = gateway.GetUpstreamManager() ? + gateway.GetUpstreamManager()->GetCircuitBreakerManager() : + nullptr; + int64_t probe_succ = 0; + int open_parts = 0, half_open_parts = 0; + if (mgr) { + auto* host = mgr->GetHost("svc"); + if (host) { + auto snap = host->Snapshot(); + probe_succ = 0; + for (const auto& row : snap.slices) { + probe_succ += row.probe_successes; + } + open_parts = snap.open_partitions; + half_open_parts = snap.half_open_partitions; + } + } + + // Recovery complete: saw at least one 200 through the breaker, + // at least one probe success counted, and no partition still + // stuck in OPEN (HALF_OPEN may still linger on the unused slice, + // which is fine for a 2-partition setup). + bool pass = saw_success && (probe_succ >= 1) && (open_parts == 0); + TestFramework::RecordTest( + "CB Integration: HALF_OPEN → CLOSED recovery", pass, + pass ? "" : + "saw_success=" + std::to_string(saw_success) + + " probe_succ=" + std::to_string(probe_succ) + + " open_parts=" + std::to_string(open_parts) + + " half_open_parts=" + std::to_string(half_open_parts)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Integration: HALF_OPEN → CLOSED recovery", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 11: Retry-After ceils the config cap from a non-second-aligned +// max_open_duration_ms (e.g. 1500ms → 2s, not 1s). Floor-rounding the cap +// would clamp the advertised retry window below what the breaker honors, +// causing well-behaved clients to re-hit the 503. 
+// --------------------------------------------------------------------------- +void TestRetryAfterCapCeilsNonAlignedMax() { + std::cout << "\n[TEST] CB Integration: Retry-After cap ceils non-aligned max..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + // Configure a non-second-aligned max backoff. base = 1500ms so + // the actual OpenUntil-now at trip time is ~1.5s, which ceil- + // rounds to 2s. If cfg_cap_secs floor-rounded max_open_duration + // (1500ms → 1s), the clamp would drop Retry-After to 1s even + // though the breaker would keep rejecting through the second + // half of that window. + auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3); + u.circuit_breaker.base_open_duration_ms = 1500; + u.circuit_breaker.max_open_duration_ms = 1500; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + for (int i = 0; i < 3; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + + int retry_after = -1; + const char* markers[] = {"Retry-After:", "retry-after:"}; + for (const char* m : markers) { + auto pos = r.find(m); + if (pos == std::string::npos) continue; + pos += std::string(m).size(); + while (pos < r.size() && (r[pos] == ' ' || r[pos] == '\t')) ++pos; + int val = 0; + bool any = false; + while (pos < r.size() && r[pos] >= '0' && r[pos] <= '9') { + val = val * 10 + (r[pos] - '0'); + any = true; + ++pos; + } + if (any) { retry_after = val; break; } + } + + // Expectation: Retry-After is in [1, 2] — cfg_cap_secs 
ceil- + // rounds 1500ms to 2s, and the remaining-time ceil-rounds to + // 2 at the moment of trip (may be 1 if enough wall-clock has + // elapsed between trip and response). Critically it must NEVER + // be zero or exceed 2 (clamped to the 2s cap). + bool in_range = (retry_after >= 1 && retry_after <= 2); + TestFramework::RecordTest( + "CB Integration: Retry-After ceils non-aligned cap", in_range, + in_range ? "" : + "retry_after=" + std::to_string(retry_after)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Integration: Retry-After ceils non-aligned cap", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 12: Retried failures are reported BEFORE the retry fires. With retries +// enabled on 5xx, each attempt's outcome must be counted against the breaker; +// otherwise the slice trips only after the final retry exhausts, under- +// counting failures and potentially never tripping if retries mask enough of +// them. Verifies the trip still happens within the expected number of client +// requests once reporting is attached to the retry path. +// --------------------------------------------------------------------------- +void TestRetriedFailuresCountTowardTrip() { + std::cout << "\n[TEST] CB Integration: retried failures count toward trip..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + // Retries on 5xx enabled. threshold=3 — with retry_on_5xx, each + // client request produces 1 + max_retries=3 = 4 upstream + // attempts, each reporting RESPONSE_5XX via the ReportBreakerOutcome + // path that this fix patches in. 
The breaker must trip after + // at most 3 upstream failure reports (which the first client + // request alone produces). + auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, + /*enabled=*/true, /*threshold=*/3); + u.proxy.retry.max_retries = 3; + u.proxy.retry.retry_on_5xx = true; + u.circuit_breaker.base_open_duration_ms = 30000; + u.circuit_breaker.max_open_duration_ms = 60000; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // One client request → 4 upstream attempts → 4 RESPONSE_5XX + // reports. Threshold=3 should trip during this single request. + TestHttpClient::HttpGet(gw_port, "/fail", 5000); + + // Second client request must hit the OPEN breaker → 503. + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + bool is_503 = TestHttpClient::HasStatus(r, 503); + bool has_breaker_header = + r.find("X-Circuit-Breaker: open") != std::string::npos || + r.find("x-circuit-breaker: open") != std::string::npos; + + bool pass = is_503 && has_breaker_header; + TestFramework::RecordTest( + "CB Integration: retried failures count toward trip", pass, + pass ? "" : + "is_503=" + std::to_string(is_503) + + " breaker_hdr=" + std::to_string(has_breaker_header) + + " body=" + r.substr(0, 256)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Integration: retried failures count toward trip", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 13: HALF_OPEN rejects emit a distinct X-Circuit-Breaker label. +// TryAcquire returns REJECTED_OPEN for three situations (true OPEN, +// half_open_full, half_open_recovery_failing). When the slice is in +// HALF_OPEN, OpenUntil is cleared and a generic MakeCircuitOpenResponse +// would fall back to Retry-After=1 + X-Circuit-Breaker:open — misleading +// clients. 
The fix emits X-Circuit-Breaker:half_open for HALF_OPEN rejects
+// with a more conservative Retry-After hint.
+//
+// Strategy: trip the breaker, wait for the open window to elapse so the
+// slice transitions HALF_OPEN on the next admission attempt, then flood
+// concurrent requests so some hit half_open_full.
+// ---------------------------------------------------------------------------
+void TestHalfOpenRejectLabel() {
+ std::cout << "\n[TEST] CB Integration: HALF_OPEN reject label..."
+ << std::endl;
+ try {
+ // Backend hangs to keep probes in-flight so later concurrent
+ // requests hit half_open_full.
+ std::atomic<bool> hang{false};
+ HttpServer backend("127.0.0.1", 0);
+ backend.Get("/fail", [&hang](const HttpRequest&, HttpResponse& resp) {
+ if (hang.load()) {
+ std::this_thread::sleep_for(std::chrono::milliseconds(600));
+ }
+ resp.Status(502).Body("err", "text/plain");
+ });
+ TestServerRunner backend_runner(backend);
+ int backend_port = backend_runner.GetPort();
+
+ ServerConfig gw;
+ gw.bind_host = "127.0.0.1";
+ gw.bind_port = 0;
+ gw.worker_threads = 1;
+ gw.http2.enabled = false;
+ auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port,
+ /*enabled=*/true, /*threshold=*/3);
+ u.circuit_breaker.base_open_duration_ms = 200;
+ u.circuit_breaker.max_open_duration_ms = 500;
+ u.circuit_breaker.permitted_half_open_calls = 1; // tiny budget
+ gw.upstreams.push_back(u);
+
+ HttpServer gateway(gw);
+ TestServerRunner gw_runner(gateway);
+ int gw_port = gw_runner.GetPort();
+
+ // Trip the breaker.
+ for (int i = 0; i < 3; ++i) {
+ TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+ }
+ // Wait for the open window to elapse so the next admission
+ // flips the slice to HALF_OPEN.
+ std::this_thread::sleep_for(std::chrono::milliseconds(300));
+
+ // Flip backend to hang so the probe occupies the single probe
+ // slot while we fire sibling requests that must hit half_open_full.
+ hang.store(true);
+
+ std::atomic<bool> saw_half_open{false};
+ std::atomic<bool> saw_open{false};
+ auto probe = [&](int id) {
+ (void)id;
+ std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 1500);
+ if (!TestHttpClient::HasStatus(r, 503)) return;
+ if (r.find("X-Circuit-Breaker: half_open") != std::string::npos ||
+ r.find("x-circuit-breaker: half_open") != std::string::npos) {
+ saw_half_open.store(true);
+ }
+ if (r.find("X-Circuit-Breaker: open") != std::string::npos ||
+ r.find("x-circuit-breaker: open") != std::string::npos) {
+ // We want to distinguish the labels; the "open" substring
+ // also matches "half_open". Only count true "open" if
+ // "half_open" didn't appear in THIS response.
+ if (r.find("half_open") == std::string::npos) {
+ saw_open.store(true);
+ }
+ }
+ };
+
+ std::vector<std::thread> threads;
+ for (int i = 0; i < 6; ++i) {
+ threads.emplace_back(probe, i);
+ std::this_thread::sleep_for(std::chrono::milliseconds(20));
+ }
+ for (auto& t : threads) t.join();
+
+ // Pass if at least one HALF_OPEN-labelled reject was observed.
+ // saw_open may or may not be observed (some rejects could have
+ // hit between cycles) — the key contract is that HALF_OPEN
+ // rejects no longer get the plain "open" label.
+ bool pass = saw_half_open.load();
+ TestFramework::RecordTest(
+ "CB Integration: HALF_OPEN reject label", pass,
+ pass ? "" :
+ "saw_half_open=" + std::to_string(saw_half_open.load()) +
+ " saw_open=" + std::to_string(saw_open.load()));
+ } catch (const std::exception& e) {
+ TestFramework::RecordTest(
+ "CB Integration: HALF_OPEN reject label", false, e.what());
+ }
+}
+
+// ---------------------------------------------------------------------------
+// Test 14: HALF_OPEN Retry-After reflects the current exponential backoff,
+// not just base_open_duration_ms. After multiple trips the next OPEN window
+// (base << consecutive_trips_, clamped by max) can exceed 1 second; the old
+// base-only hint (ceil(base/1000) = 1s for base=100ms) would under-report
+// the worst-case wait, which this test must fail for.
+//
+// Strategy: keep the backend failing and drive MULTIPLE re-trips by letting
+// the OPEN window elapse and single probe fail each cycle. Successful
+// recoveries must be avoided — TransitionHalfOpenToClosed resets
+// consecutive_trips_ to 0, which hides the exponential hint.
+// ---------------------------------------------------------------------------
+void TestHalfOpenRetryAfterScalesWithBackoff() {
+ std::cout << "\n[TEST] CB Integration: HALF_OPEN Retry-After exponential..."
+ << std::endl;
+ try {
+ // Backend fails fast by default. When `hang` is set, the
+ // handler blocks — used at the end to pin the probe slot so
+ // a concurrent request observes HALF_OPEN rejection.
+ std::atomic<bool> hang{false};
+ HttpServer backend("127.0.0.1", 0);
+ backend.Get("/fail", [&hang](const HttpRequest&, HttpResponse& resp) {
+ if (hang.load()) {
+ std::this_thread::sleep_for(std::chrono::milliseconds(1500));
+ }
+ resp.Status(502).Body("err", "text/plain");
+ });
+ TestServerRunner backend_runner(backend);
+ int backend_port = backend_runner.GetPort();
+
+ ServerConfig gw;
+ gw.bind_host = "127.0.0.1";
+ gw.bind_port = 0;
+ gw.worker_threads = 1; // pin all traffic to slice[0]
+ gw.http2.enabled = false;
+ auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port,
+ /*enabled=*/true, /*threshold=*/2);
+ u.circuit_breaker.base_open_duration_ms = 100; // config minimum
+ u.circuit_breaker.max_open_duration_ms = 8000; // cap at 8s
+ u.circuit_breaker.permitted_half_open_calls = 1; // single probe
+ gw.upstreams.push_back(u);
+
+ HttpServer gateway(gw);
+ TestServerRunner gw_runner(gateway);
+ int gw_port = gw_runner.GetPort();
+
+ auto* cbm = gateway.GetUpstreamManager() ?
+ gateway.GetUpstreamManager()->GetCircuitBreakerManager() : nullptr;
+ auto* host = cbm ? cbm->GetHost("svc") : nullptr;
+ auto* slice = host ? host->GetSlice(0) : nullptr;
+ if (!slice) {
+ TestFramework::RecordTest(
+ "CB Integration: HALF_OPEN Retry-After exponential-aware",
+ false, "slice lookup failed");
+ return;
+ }
+
+ // Initial trip: 2 consecutive failures with threshold=2.
+ for (int i = 0; i < 2; ++i) {
+ TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+ }
+
+ // Drive consecutive_trips_ up by letting successive OPEN windows
+ // elapse and probes fail (no recovery → no reset). Stop when
+ // NextOpenDurationMs crosses 1000ms, which is the threshold
+ // where the HALF_OPEN Retry-After hint starts exceeding the
+ // base-only value (ceil(100ms)=1s).
+ //
+ // The slice re-trips on each failed probe; each trip doubles
+ // the open duration. We run ~8 cycles with safety margin which
+ // is comfortably past the trip count needed for Retry-After>=2.
+ for (int cycle = 0; cycle < 8; ++cycle) {
+ // Wait past the current open window. Upper bound: max=8s,
+ // so 1200ms is plenty for the first few short cycles, and
+ // we re-check after each request anyway.
+ int64_t next_ms = slice->NextOpenDurationMs();
+ // Current OPEN window is the one stored BEFORE the upcoming
+ // re-trip — we don't have that directly, so sleep past the
+ // NEXT duration as an over-approximation (next is always >=
+ // current). This ensures OPEN has elapsed.
+ auto sleep_ms = std::max<int64_t>(next_ms + 50, 200);
+ if (sleep_ms > 2000) sleep_ms = 2000; // cap per cycle
+ std::this_thread::sleep_for(std::chrono::milliseconds(sleep_ms));
+
+ // One request — it should admit as a probe (HALF_OPEN),
+ // the backend fails fast (502), probe fails → re-trip with
+ // consecutive_trips_++ and fresh OPEN.
+ TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+
+ // Bail early once the exponential hint crosses 1s → the
+ // subsequent HALF_OPEN reject will carry Retry-After >= 2.
+ if (slice->NextOpenDurationMs() >= 2000) break; + } + + int64_t next_open_ms = slice->NextOpenDurationMs(); + if (next_open_ms < 2000) { + TestFramework::RecordTest( + "CB Integration: HALF_OPEN Retry-After exponential-aware", + false, + "setup failed: next_open_ms=" + std::to_string(next_open_ms) + + " (need >= 2000 to distinguish from base-only hint)"); + return; + } + + // Now trigger a HALF_OPEN reject: wait for current OPEN to + // elapse, start a hanging probe (pins the slot), then fire a + // sibling request — it must see half_open_full with the + // exponential Retry-After. + int64_t post_wait_ms = next_open_ms + 100; + if (post_wait_ms > 4000) post_wait_ms = 4000; + std::this_thread::sleep_for(std::chrono::milliseconds(post_wait_ms)); + + hang.store(true); + std::thread probe([&]() { + TestHttpClient::HttpGet(gw_port, "/fail", 3500); + }); + // Let the probe get admitted and start hanging. + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 1500); + hang.store(false); + probe.join(); + + bool is_half_open = + r.find("X-Circuit-Breaker: half_open") != std::string::npos || + r.find("x-circuit-breaker: half_open") != std::string::npos; + + int retry_after = -1; + const char* markers[] = {"Retry-After:", "retry-after:"}; + for (const char* m : markers) { + auto pos = r.find(m); + if (pos == std::string::npos) continue; + pos += std::string(m).size(); + while (pos < r.size() && (r[pos] == ' ' || r[pos] == '\t')) ++pos; + int val = 0; + bool any = false; + while (pos < r.size() && r[pos] >= '0' && r[pos] <= '9') { + val = val * 10 + (r[pos] - '0'); + any = true; + ++pos; + } + if (any) { retry_after = val; break; } + } + + // Post-fix: Retry-After = ceil(next_open_ms / 1000) >= 2. + // Pre-fix (base-only): Retry-After = ceil(base/1000) = 1. + // Asserting >= 2 fails the pre-fix implementation. 
+ bool retry_after_ok = (retry_after >= 2 && retry_after <= 8); + bool pass = is_half_open && retry_after_ok; + TestFramework::RecordTest( + "CB Integration: HALF_OPEN Retry-After exponential-aware", pass, + pass ? "" : + "is_half_open=" + std::to_string(is_half_open) + + " retry_after=" + std::to_string(retry_after) + + " next_open_ms=" + std::to_string(next_open_ms)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Integration: HALF_OPEN Retry-After exponential-aware", + false, e.what()); + } +} + +void RunAllTests() { + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "CIRCUIT BREAKER - INTEGRATION TESTS" << std::endl; + std::cout << std::string(60, '=') << std::endl; + + TestBareProxyWorks(); + TestBreakerTripsAfterConsecutiveFailures(); + TestBreakerDisabledPassesThrough(); + TestSuccessResetsConsecutiveFailureCounter(); + TestTripDrivesSliceState(); + TestOpenBreakerShortCircuitsUpstreamCall(); + TestRetryAfterHeaderValue(); + TestCircuitOpenTerminalForRetry(); + TestDryRunPassthrough(); + TestHalfOpenRecoveryRoundTrip(); + TestRetryAfterCapCeilsNonAlignedMax(); + TestRetriedFailuresCountTowardTrip(); + TestHalfOpenRejectLabel(); + TestHalfOpenRetryAfterScalesWithBackoff(); +} + +} // namespace CircuitBreakerIntegrationTests diff --git a/test/circuit_breaker_observability_test.h b/test/circuit_breaker_observability_test.h new file mode 100644 index 00000000..42694a67 --- /dev/null +++ b/test/circuit_breaker_observability_test.h @@ -0,0 +1,405 @@ +#pragma once + +// Observability integration tests: observability — counter accuracy, snapshot +// API correctness, and log emission. +// +// Phases 2-6 each added counters and log lines as a side effect of their +// functional work. This suite locks those in as regressions: +// +// * Counters (§11.2): trips, rejected, probe_successes, probe_failures, +// retries_rejected surface through CircuitBreakerManager::SnapshotAll. 
+// * Snapshot API (§11.3): per-slice rows aggregate into host-level +// totals; host-level fields (retries_in_flight / retries_rejected / +// in_flight) reflect the owning RetryBudget. +// * Logs (§11.1): the CLOSED→OPEN trip emits the full-context message +// including trigger, consecutive_failures, window_total, +// window_fail_rate, open_for_ms, and consecutive_trips. +// +// The log-emission test attaches a spdlog ring-buffer sink to the logger +// for the duration of the test, triggers a trip, then asserts the +// captured messages contain the expected fields. No log file I/O. + +#include "test_framework.h" +#include "test_server_runner.h" +#include "http_test_client.h" +#include "http/http_server.h" +#include "config/server_config.h" +#include "upstream/upstream_manager.h" +#include "circuit_breaker/circuit_breaker_manager.h" +#include "circuit_breaker/circuit_breaker_host.h" +#include "circuit_breaker/circuit_breaker_slice.h" +#include "log/logger.h" +#include "spdlog/sinks/ringbuffer_sink.h" + +#include +#include +#include +#include +#include +#include + +namespace CircuitBreakerObservabilityTests { + +using circuit_breaker::State; + +static UpstreamConfig MakeObservUpstream(const std::string& name, + const std::string& host, + int port, + int consecutive_threshold = 3) { + UpstreamConfig u; + u.name = name; + u.host = host; + u.port = port; + u.pool.max_connections = 8; + u.pool.max_idle_connections = 4; + u.pool.connect_timeout_ms = 3000; + u.pool.idle_timeout_sec = 30; + u.pool.max_lifetime_sec = 3600; + u.pool.max_requests_per_conn = 0; + + u.proxy.route_prefix = "/fail"; + u.proxy.strip_prefix = false; + u.proxy.response_timeout_ms = 2000; + u.proxy.retry.max_retries = 0; + + u.circuit_breaker.enabled = true; + u.circuit_breaker.consecutive_failure_threshold = consecutive_threshold; + u.circuit_breaker.failure_rate_threshold = 100; + u.circuit_breaker.minimum_volume = 10000; + u.circuit_breaker.window_seconds = 10; + 
u.circuit_breaker.permitted_half_open_calls = 2; + // Long open duration — keep the slice OPEN so post-trip assertions + // don't race a HALF_OPEN transition. + u.circuit_breaker.base_open_duration_ms = 30000; + u.circuit_breaker.max_open_duration_ms = 60000; + return u; +} + +// --------------------------------------------------------------------------- +// Test 1: Snapshot API reflects per-slice trip/rejected counters and +// host-level aggregates. Drives N+1 requests against a backend that always +// 502s (N to trip, 1 more that the OPEN slice short-circuits) and asserts +// the snapshot shows total_trips >= 1, total_rejected >= 1, +// open_partitions >= 1. +// --------------------------------------------------------------------------- +void TestSnapshotReflectsCounters() { + std::cout << "\n[TEST] CB Observability: snapshot reflects counters..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + auto u = MakeObservUpstream("svc", "127.0.0.1", backend_port, + /*threshold=*/3); + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Trip (3 failures), then 2 more to accumulate rejected counter. 
+ for (int i = 0; i < 3; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + for (int i = 0; i < 2; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + + auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); + if (!cbm) { + TestFramework::RecordTest( + "CB Observability: snapshot reflects counters", false, + "no circuit breaker manager attached"); + return; + } + auto snaps = cbm->SnapshotAll(); + bool found = false; + int64_t trips = 0, rejected = 0, probe_s = 0, probe_f = 0; + int open_parts = 0; + for (const auto& s : snaps) { + if (s.service_name == "svc") { + trips = s.total_trips; + rejected = s.total_rejected; + open_parts = s.open_partitions; + for (const auto& row : s.slices) { + probe_s += row.probe_successes; + probe_f += row.probe_failures; + } + found = true; + break; + } + } + + bool pass = found + && trips >= 1 + && rejected >= 2 // 2 post-trip short-circuits + && open_parts >= 1 + && probe_s == 0 // never entered HALF_OPEN + && probe_f == 0; + TestFramework::RecordTest( + "CB Observability: snapshot reflects counters", pass, + pass ? "" : + "found=" + std::to_string(found) + + " trips=" + std::to_string(trips) + + " rejected=" + std::to_string(rejected) + + " open_parts=" + std::to_string(open_parts) + + " probe_s=" + std::to_string(probe_s) + + " probe_f=" + std::to_string(probe_f)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Observability: snapshot reflects counters", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 2: The CLOSED→OPEN trip log emits the §11.1 full-context message. +// Attaches a spdlog ringbuffer_sink to the shared logger, triggers a trip, +// then inspects the captured messages for the key tokens. The sink is +// removed before the test returns so it doesn't affect later tests. 
+// ---------------------------------------------------------------------------
+void TestTripLogEmission() {
+    std::cout << "\n[TEST] CB Observability: trip log emission..." << std::endl;
+    try {
+        HttpServer backend("127.0.0.1", 0);
+        backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) {
+            resp.Status(502).Body("upstream-err", "text/plain");
+        });
+        TestServerRunner backend_runner(backend);
+        int backend_port = backend_runner.GetPort();
+
+        ServerConfig gw;
+        gw.bind_host = "127.0.0.1";
+        gw.bind_port = 0;
+        gw.worker_threads = 1;
+        gw.http2.enabled = false;
+
+        auto u = MakeObservUpstream("svc-log", "127.0.0.1", backend_port,
+                                    /*threshold=*/2);
+        gw.upstreams.push_back(u);
+
+        // `HttpServer` construction calls `logging::Init()` which rebuilds
+        // the default logger via `spdlog::set_default_logger`. Any sink
+        // attached BEFORE that point lands on a stale logger. Attach the
+        // ringbuffer sink AFTER the last HttpServer construction so it
+        // captures the live logger's output.
+        HttpServer gateway(gw);
+        TestServerRunner gw_runner(gateway);
+        int gw_port = gw_runner.GetPort();
+
+        auto ring = std::make_shared<
+            spdlog::sinks::ringbuffer_sink_mt>(1024);
+        auto logger = logging::Get();
+        auto prev_level = logger->level();
+        logger->set_level(spdlog::level::debug);
+        logger->sinks().push_back(ring);
+
+        struct SinkGuard {
+            std::shared_ptr<spdlog::logger> logger;
+            std::shared_ptr<spdlog::sinks::ringbuffer_sink_mt> ring;
+            spdlog::level::level_enum prev_level;
+            ~SinkGuard() {
+                auto& sinks = logger->sinks();
+                sinks.erase(std::remove(sinks.begin(), sinks.end(),
+                                        std::shared_ptr<spdlog::sinks::sink>(ring)),
+                            sinks.end());
+                logger->set_level(prev_level);
+            }
+        } guard{logger, ring, prev_level};
+
+        // Drive exactly threshold=2 failures to trip.
+        TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+        TestHttpClient::HttpGet(gw_port, "/fail", 3000);
+
+        // Give the dispatcher a breath to emit + the sink to settle.
+ std::this_thread::sleep_for(std::chrono::milliseconds(50)); + + auto messages = ring->last_formatted(); + // Scan for the trip message. Look for the static prefix plus the + // §11.1 field tokens. + bool saw_tripped = false; + bool has_trigger = false; + bool has_consec_failures = false; + bool has_window_total = false; + bool has_fail_rate = false; + bool has_open_for_ms = false; + bool has_consec_trips = false; + for (const auto& msg : messages) { + if (msg.find("circuit breaker tripped") == std::string::npos) { + continue; + } + saw_tripped = true; + if (msg.find("trigger=") != std::string::npos) has_trigger = true; + if (msg.find("consecutive_failures=") != std::string::npos) + has_consec_failures = true; + if (msg.find("window_total=") != std::string::npos) + has_window_total = true; + if (msg.find("window_fail_rate=") != std::string::npos) + has_fail_rate = true; + if (msg.find("open_for_ms=") != std::string::npos) + has_open_for_ms = true; + if (msg.find("consecutive_trips=") != std::string::npos) + has_consec_trips = true; + } + + bool pass = saw_tripped && has_trigger && has_consec_failures && + has_window_total && has_fail_rate && + has_open_for_ms && has_consec_trips; + TestFramework::RecordTest( + "CB Observability: trip log emission", pass, + pass ? 
"" : + "saw_tripped=" + std::to_string(saw_tripped) + + " trigger=" + std::to_string(has_trigger) + + " consec_failures=" + std::to_string(has_consec_failures) + + " window_total=" + std::to_string(has_window_total) + + " fail_rate=" + std::to_string(has_fail_rate) + + " open_for_ms=" + std::to_string(has_open_for_ms) + + " consec_trips=" + std::to_string(has_consec_trips)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Observability: trip log emission", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 3: Retry-budget observability — the exhausted log carries the +// §11.1 fields (service, in_flight, retries_in_flight, cap), and the +// host snapshot reflects retries_rejected. +// --------------------------------------------------------------------------- +void TestRetryBudgetObservability() { + std::cout << "\n[TEST] CB Observability: retry budget observability..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + // Budget: zero percent AND zero floor → every retry rejected. + auto u = MakeObservUpstream("svc-budget", "127.0.0.1", backend_port, + /*threshold=*/10000); + u.proxy.retry.max_retries = 2; + u.proxy.retry.retry_on_5xx = true; + u.circuit_breaker.retry_budget_percent = 0; + u.circuit_breaker.retry_budget_min_concurrency = 0; + gw.upstreams.push_back(u); + + // Attach the ringbuffer AFTER gateway construction — see + // TestTripLogEmission for rationale (HttpServer's ctor + // replaces the default logger via logging::Init, detaching + // any previously-attached sinks). 
+ HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + auto ring = std::make_shared< + spdlog::sinks::ringbuffer_sink_mt>(1024); + auto logger = logging::Get(); + auto prev_level = logger->level(); + logger->set_level(spdlog::level::debug); + logger->sinks().push_back(ring); + + struct SinkGuard { + std::shared_ptr logger; + std::shared_ptr ring; + spdlog::level::level_enum prev_level; + ~SinkGuard() { + auto& sinks = logger->sinks(); + sinks.erase(std::remove(sinks.begin(), sinks.end(), + std::shared_ptr(ring)), + sinks.end()); + logger->set_level(prev_level); + } + } guard{logger, ring, prev_level}; + + // One client request: first attempt hits backend (502), retry + // blocked by budget → 503 + X-Retry-Budget-Exhausted. + TestHttpClient::HttpGet(gw_port, "/fail", 5000); + + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + + auto messages = ring->last_formatted(); + bool saw_exhausted = false; + bool has_service = false; + bool has_inflight = false; + bool has_retries_inflight = false; + bool has_cap = false; + for (const auto& msg : messages) { + if (msg.find("retry budget exhausted") == std::string::npos) { + continue; + } + saw_exhausted = true; + if (msg.find("service=") != std::string::npos) has_service = true; + if (msg.find("in_flight=") != std::string::npos) + has_inflight = true; + if (msg.find("retries_in_flight=") != std::string::npos) + has_retries_inflight = true; + if (msg.find("cap=") != std::string::npos) has_cap = true; + } + + // Snapshot: retries_rejected must be >= 1 (every rejection increments). + int64_t retries_rejected = 0; + auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); + if (cbm) { + for (const auto& s : cbm->SnapshotAll()) { + if (s.service_name == "svc-budget") { + // Host aggregate — single host, so the sum is the + // host's retries_rejected. The snapshot doesn't yet + // expose that directly — derive from RetryBudget + // via the host getter. 
+ auto* host = cbm->GetHost("svc-budget"); + if (host) { + retries_rejected = + host->GetRetryBudget()->RetriesRejected(); + } + break; + } + } + } + + bool pass = saw_exhausted && has_service && has_inflight && + has_retries_inflight && has_cap && + retries_rejected >= 1; + TestFramework::RecordTest( + "CB Observability: retry budget observability", pass, + pass ? "" : + "saw_exhausted=" + std::to_string(saw_exhausted) + + " service=" + std::to_string(has_service) + + " inflight=" + std::to_string(has_inflight) + + " retries_inflight=" + std::to_string(has_retries_inflight) + + " cap=" + std::to_string(has_cap) + + " retries_rejected=" + std::to_string(retries_rejected)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Observability: retry budget observability", false, e.what()); + } +} + +void RunAllTests() { + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "CIRCUIT BREAKER - OBSERVABILITY TESTS" << std::endl; + std::cout << std::string(60, '=') << std::endl; + + TestSnapshotReflectsCounters(); + TestTripLogEmission(); + TestRetryBudgetObservability(); +} + +} // namespace CircuitBreakerObservabilityTests diff --git a/test/circuit_breaker_reload_test.h b/test/circuit_breaker_reload_test.h new file mode 100644 index 00000000..220c718e --- /dev/null +++ b/test/circuit_breaker_reload_test.h @@ -0,0 +1,373 @@ +#pragma once + +// Reload integration tests: hot-reload of circuit-breaker fields. +// +// UpstreamConfig::operator== now excludes `circuit_breaker` — a CB-only +// SIGHUP is a clean reload that propagates via HttpServer::Reload → +// CircuitBreakerManager::Reload → per-host per-slice Reload enqueued on +// each owning dispatcher. +// +// Topology fields (host, port, pool, proxy, tls) remain restart-only. 
+// +// Strategy: construct a gateway with an enabled breaker, capture the +// initial slice config, call HttpServer::Reload with an edited +// CircuitBreakerConfig, and verify the slice's live config reflects the +// edit. The reload-log capture also verifies the manager-level log lines +// ("CircuitBreakerManager::Reload: new/removed upstream ...") fire for +// topology-change SIGHUPs. + +#include "test_framework.h" +#include "test_server_runner.h" +#include "http_test_client.h" +#include "http/http_server.h" +#include "config/server_config.h" +#include "upstream/upstream_manager.h" +#include "circuit_breaker/circuit_breaker_manager.h" +#include "circuit_breaker/circuit_breaker_host.h" +#include "circuit_breaker/circuit_breaker_slice.h" +#include "log/logger.h" +#include "spdlog/sinks/ringbuffer_sink.h" + +#include +#include +#include +#include +#include + +namespace CircuitBreakerReloadTests { + +static UpstreamConfig MakeReloadUpstream(const std::string& name, + const std::string& host, + int port) { + UpstreamConfig u; + u.name = name; + u.host = host; + u.port = port; + u.pool.max_connections = 8; + u.pool.max_idle_connections = 4; + u.pool.connect_timeout_ms = 3000; + u.pool.idle_timeout_sec = 30; + u.pool.max_lifetime_sec = 3600; + u.pool.max_requests_per_conn = 0; + + u.proxy.route_prefix = "/fail"; + u.proxy.strip_prefix = false; + u.proxy.response_timeout_ms = 2000; + u.proxy.retry.max_retries = 0; + + u.circuit_breaker.enabled = true; + u.circuit_breaker.consecutive_failure_threshold = 3; + u.circuit_breaker.failure_rate_threshold = 100; + u.circuit_breaker.minimum_volume = 10000; + u.circuit_breaker.window_seconds = 10; + u.circuit_breaker.permitted_half_open_calls = 2; + u.circuit_breaker.base_open_duration_ms = 5000; + u.circuit_breaker.max_open_duration_ms = 60000; + return u; +} + +// --------------------------------------------------------------------------- +// Test 1: CB-only SIGHUP propagates to live slice config. 
+// +// Build gateway with threshold=3. Reload with threshold=7. Verify the +// slice's live config().consecutive_failure_threshold flipped to 7. +// --------------------------------------------------------------------------- +void TestCbReloadPropagatesToSlice() { + std::cout << "\n[TEST] CB Reload: reload propagates to slice..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + gw.upstreams.push_back( + MakeReloadUpstream("svc", "127.0.0.1", backend_port)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + + auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); + auto* host = cbm->GetHost("svc"); + auto* slice = host->GetSlice(0); + int threshold_before = slice->config().consecutive_failure_threshold; + int window_before = slice->config().window_seconds; + + // Build reloaded config with modified CB fields only. + ServerConfig reloaded = gw; + reloaded.upstreams[0].circuit_breaker.consecutive_failure_threshold = 7; + reloaded.upstreams[0].circuit_breaker.window_seconds = 20; + + bool ok = gateway.Reload(reloaded); + // Reload enqueues per-slice updates on the owning dispatcher — + // brief sleep to let the dispatcher execute the queued Slice::Reload. + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + + int threshold_after = slice->config().consecutive_failure_threshold; + int window_after = slice->config().window_seconds; + + bool pass = ok && threshold_before == 3 && window_before == 10 + && threshold_after == 7 && window_after == 20; + TestFramework::RecordTest( + "CB Reload: reload propagates to slice", pass, + pass ? 
"" : + "ok=" + std::to_string(ok) + + " threshold_before=" + std::to_string(threshold_before) + + " threshold_after=" + std::to_string(threshold_after) + + " window_before=" + std::to_string(window_before) + + " window_after=" + std::to_string(window_after)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Reload: reload propagates to slice", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 2: CB-only reload does NOT emit the topology "restart required" +// warning. UpstreamConfig::operator== excludes circuit_breaker so a +// CB-only edit doesn't make the outer config != comparison true — the +// warning fires only on topology-field changes (host, port, pool, proxy, +// tls), which remain restart-only. +// --------------------------------------------------------------------------- +void TestCbOnlyReloadNoRestartWarn() { + std::cout << "\n[TEST] CB Reload: CB-only reload emits no restart warn..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + gw.upstreams.push_back( + MakeReloadUpstream("svc", "127.0.0.1", backend_port)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + + // Attach ringbuffer sink AFTER gateway ctor (logging::Init + // rebuilds the default logger). See the observability test for rationale. 
+ auto ring = std::make_shared< + spdlog::sinks::ringbuffer_sink_mt>(1024); + auto logger = logging::Get(); + auto prev_level = logger->level(); + logger->set_level(spdlog::level::debug); + logger->sinks().push_back(ring); + + struct SinkGuard { + std::shared_ptr logger; + std::shared_ptr ring; + spdlog::level::level_enum prev_level; + ~SinkGuard() { + auto& sinks = logger->sinks(); + sinks.erase(std::remove(sinks.begin(), sinks.end(), + std::shared_ptr(ring)), + sinks.end()); + logger->set_level(prev_level); + } + } guard{logger, ring, prev_level}; + + ServerConfig reloaded = gw; + reloaded.upstreams[0].circuit_breaker.consecutive_failure_threshold = 9; + + gateway.Reload(reloaded); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + + bool saw_topology_warn = false; + bool saw_cb_config_applied = false; + for (const auto& msg : ring->last_formatted()) { + if (msg.find("upstream topology changes require a restart") != + std::string::npos) { + saw_topology_warn = true; + } + if (msg.find("circuit breaker config applied") != + std::string::npos) { + saw_cb_config_applied = true; + } + } + + bool pass = !saw_topology_warn && saw_cb_config_applied; + TestFramework::RecordTest( + "CB Reload: CB-only reload emits no restart warn", pass, + pass ? "" : + "saw_topology_warn=" + std::to_string(saw_topology_warn) + + " saw_cb_config_applied=" + std::to_string(saw_cb_config_applied)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Reload: CB-only reload emits no restart warn", false, + e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 3: Topology change (pool field edit) STILL emits the restart warn +// — the exclusion of circuit_breaker from operator== must NOT compromise +// the restart-required signal for unreloadable fields. 
+// --------------------------------------------------------------------------- +void TestTopologyChangeStillEmitsRestartWarn() { + std::cout << "\n[TEST] CB Reload: topology change still warns..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + gw.upstreams.push_back( + MakeReloadUpstream("svc", "127.0.0.1", backend_port)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + + auto ring = std::make_shared< + spdlog::sinks::ringbuffer_sink_mt>(1024); + auto logger = logging::Get(); + auto prev_level = logger->level(); + logger->set_level(spdlog::level::debug); + logger->sinks().push_back(ring); + + struct SinkGuard { + std::shared_ptr logger; + std::shared_ptr ring; + spdlog::level::level_enum prev_level; + ~SinkGuard() { + auto& sinks = logger->sinks(); + sinks.erase(std::remove(sinks.begin(), sinks.end(), + std::shared_ptr(ring)), + sinks.end()); + logger->set_level(prev_level); + } + } guard{logger, ring, prev_level}; + + ServerConfig reloaded = gw; + // Topology-level edit that operator== still detects. + reloaded.upstreams[0].pool.max_connections = 16; + // Also flip a breaker field so we verify BOTH happen on the + // same reload (live CB edit + topology warn). 
+ reloaded.upstreams[0].circuit_breaker.consecutive_failure_threshold = 5; + + gateway.Reload(reloaded); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + + bool saw_topology_warn = false; + bool saw_cb_config_applied = false; + for (const auto& msg : ring->last_formatted()) { + if (msg.find("upstream topology changes require a restart") != + std::string::npos) { + saw_topology_warn = true; + } + if (msg.find("circuit breaker config applied") != + std::string::npos) { + saw_cb_config_applied = true; + } + } + + bool pass = saw_topology_warn && saw_cb_config_applied; + TestFramework::RecordTest( + "CB Reload: topology change still warns", pass, + pass ? "" : + "saw_topology_warn=" + std::to_string(saw_topology_warn) + + " saw_cb_config_applied=" + std::to_string(saw_cb_config_applied)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Reload: topology change still warns", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 4: Disable → enable toggle via reload. A CB-only reload that sets +// `enabled=false` must make the slice short-circuit admissions; a +// subsequent reload flipping `enabled=true` must re-engage the state +// machine without requiring a restart. Verifies the "wire transition +// callbacks for ALL upstreams regardless of enabled" design (§3.1 R3-1). +// --------------------------------------------------------------------------- +void TestReloadDisableThenEnable() { + std::cout << "\n[TEST] CB Reload: reload disable→enable..." 
<< std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + gw.upstreams.push_back( + MakeReloadUpstream("svc", "127.0.0.1", backend_port)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + + auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); + auto* slice = cbm->GetHost("svc")->GetSlice(0); + + // Start: enabled=true. + bool enabled_before = slice->config().enabled; + + // Reload to enabled=false. + ServerConfig disabled = gw; + disabled.upstreams[0].circuit_breaker.enabled = false; + gateway.Reload(disabled); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + bool disabled_after = !slice->config().enabled; + + // Reload back to enabled=true with a new threshold. + ServerConfig reenabled = gw; + reenabled.upstreams[0].circuit_breaker.enabled = true; + reenabled.upstreams[0].circuit_breaker.consecutive_failure_threshold = 11; + gateway.Reload(reenabled); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + bool enabled_again = slice->config().enabled; + int threshold_after = slice->config().consecutive_failure_threshold; + + bool pass = enabled_before && disabled_after && + enabled_again && threshold_after == 11; + TestFramework::RecordTest( + "CB Reload: reload disable→enable", pass, + pass ? 
"" : + "enabled_before=" + std::to_string(enabled_before) + + " disabled_after=" + std::to_string(disabled_after) + + " enabled_again=" + std::to_string(enabled_again) + + " threshold_after=" + std::to_string(threshold_after)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Reload: reload disable→enable", false, e.what()); + } +} + +void RunAllTests() { + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "CIRCUIT BREAKER - HOT-RELOAD TESTS" << std::endl; + std::cout << std::string(60, '=') << std::endl; + + TestCbReloadPropagatesToSlice(); + TestCbOnlyReloadNoRestartWarn(); + TestTopologyChangeStillEmitsRestartWarn(); + TestReloadDisableThenEnable(); +} + +} // namespace CircuitBreakerReloadTests diff --git a/test/circuit_breaker_retry_budget_test.h b/test/circuit_breaker_retry_budget_test.h new file mode 100644 index 00000000..608a0602 --- /dev/null +++ b/test/circuit_breaker_retry_budget_test.h @@ -0,0 +1,367 @@ +#pragma once + +// Retry-budget integration tests: retry budget wired into ProxyTransaction. +// +// The component suite covers the RetryBudget math (CAS, non-retry +// denominator, min-concurrency floor) as unit tests against the +// RetryBudget class in isolation. This suite tests the INTEGRATION: +// ProxyTransaction resolves +// `retry_budget_` from the same CircuitBreakerHost as `slice_`, tracks +// every attempt's in_flight via the RAII guard, and consults +// `TryConsumeRetry` before each retry. Exhaustion emits the §12.2 +// response (503 + `X-Retry-Budget-Exhausted: 1`) and does NOT feed +// back into the slice's failure math. +// +// Strategy: backends that always 502 with `retry_on_5xx=true` drive the +// retry path. A near-zero retry-budget (`percent=0, min_concurrency=0`) +// rejects every retry deterministically without needing concurrent +// client load. 
The circuit-breaker consecutive-failure threshold is +// raised well above the retry count so the breaker stays CLOSED — the +// budget gate is tested in isolation from the state machine. + +#include "test_framework.h" +#include "test_server_runner.h" +#include "http_test_client.h" +#include "http/http_server.h" +#include "config/server_config.h" + +#include +#include +#include +#include + +namespace CircuitBreakerRetryBudgetTests { + +// Upstream config that always proxies /fail, with the circuit breaker +// enabled so `retry_budget_` is resolved on `slice_`'s host. Breaker +// thresholds intentionally unreachable for these tests — we want the +// retry-budget gate fired in isolation, not co-tripping the state +// machine. +static UpstreamConfig MakeRetryBudgetUpstream(const std::string& name, + const std::string& host, + int port, + int retry_budget_percent, + int retry_budget_min_concurrency, + bool dry_run = false) { + UpstreamConfig u; + u.name = name; + u.host = host; + u.port = port; + u.pool.max_connections = 16; + u.pool.max_idle_connections = 8; + u.pool.connect_timeout_ms = 3000; + u.pool.idle_timeout_sec = 30; + u.pool.max_lifetime_sec = 3600; + u.pool.max_requests_per_conn = 0; + + u.proxy.route_prefix = "/fail"; + u.proxy.strip_prefix = false; + u.proxy.response_timeout_ms = 2000; + + u.circuit_breaker.enabled = true; + u.circuit_breaker.dry_run = dry_run; + // Breaker thresholds unreachable — we don't want the state machine + // tripping during a retry-budget test. 
+ u.circuit_breaker.consecutive_failure_threshold = 10000; + u.circuit_breaker.failure_rate_threshold = 100; + u.circuit_breaker.minimum_volume = 10000; + u.circuit_breaker.window_seconds = 10; + u.circuit_breaker.permitted_half_open_calls = 2; + u.circuit_breaker.base_open_duration_ms = 30000; + u.circuit_breaker.max_open_duration_ms = 60000; + + u.circuit_breaker.retry_budget_percent = retry_budget_percent; + u.circuit_breaker.retry_budget_min_concurrency = retry_budget_min_concurrency; + return u; +} + +static bool HasRetryBudgetHeader(const std::string& response) { + return response.find("X-Retry-Budget-Exhausted: 1") != std::string::npos || + response.find("x-retry-budget-exhausted: 1") != std::string::npos; +} + +// --------------------------------------------------------------------------- +// Test 1: A retry attempt rejected by the retry-budget gate delivers 503 + +// X-Retry-Budget-Exhausted instead of the upstream's 5xx. Verifies that +// `TryConsumeRetry` runs BEFORE the retry executes and that +// `MakeRetryBudgetResponse` is emitted through the standard DeliverResponse +// path. +// +// retry_budget_percent=0 + retry_budget_min_concurrency=0 → cap = 0. Every +// retry attempt's TryConsumeRetry returns false. First attempt is +// unaffected (budget only gates retries), so the backend is hit exactly +// once per client request; the retry is short-circuited locally. +// --------------------------------------------------------------------------- +void TestRetryBudgetRejectsRetry() { + std::cout << "\n[TEST] CB Retry Budget: retry budget rejects retry..." 
+ << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + auto u = MakeRetryBudgetUpstream("svc", "127.0.0.1", backend_port, + /*percent=*/0, + /*min_concurrency=*/0); + u.proxy.retry.max_retries = 3; + u.proxy.retry.retry_on_5xx = true; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 5000); + + bool is_503 = TestHttpClient::HasStatus(r, 503); + bool has_budget_hdr = HasRetryBudgetHeader(r); + // Backend should have been hit exactly once (the first attempt); + // every retry was short-circuited by the budget gate. + int hits = backend_hits.load(std::memory_order_relaxed); + bool single_backend_hit = (hits == 1); + + bool pass = is_503 && has_budget_hdr && single_backend_hit; + TestFramework::RecordTest( + "CB Retry Budget: retry budget rejects retry", pass, + pass ? "" : + "is_503=" + std::to_string(is_503) + + " budget_hdr=" + std::to_string(has_budget_hdr) + + " backend_hits=" + std::to_string(hits) + + " body=" + r.substr(0, 256)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Retry Budget: retry budget rejects retry", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 2: The min-concurrency floor admits retries even when the %-based +// cap would be zero. 
With percent=0 + min_concurrency=5, a single sequential +// client request's retry chain (1 first + 3 retries = 4 backend hits) all +// fit under the floor and proceed normally to the upstream — no 503, no +// X-Retry-Budget-Exhausted, and the client sees the final 5xx response. +// +// This is the symmetric test to Test 1: same near-zero %-cap, but a floor +// large enough that retries aren't budget-gated. Proves the floor is +// consulted (retries admitted) instead of the %-cap (retries rejected). +// --------------------------------------------------------------------------- +void TestRetryBudgetMinConcurrencyFloor() { + std::cout << "\n[TEST] CB Retry Budget: retry budget min-concurrency floor..." + << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + // percent=0 → no %-based capacity. min_concurrency=5 → floor + // admits up to 5 concurrent retries, easily covering the 3 + // sequential retries from a single client request. + auto u = MakeRetryBudgetUpstream("svc", "127.0.0.1", backend_port, + /*percent=*/0, + /*min_concurrency=*/5); + u.proxy.retry.max_retries = 3; + u.proxy.retry.retry_on_5xx = true; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 10000); + + // Client sees the upstream's final 502 — no local 503, no + // X-Retry-Budget-Exhausted. 
+ bool is_502 = TestHttpClient::HasStatus(r, 502); + bool no_budget_hdr = !HasRetryBudgetHeader(r); + // 1 first attempt + 3 retries admitted by the floor = 4 backend hits. + int hits = backend_hits.load(std::memory_order_relaxed); + bool all_retries_proceeded = (hits == 4); + + bool pass = is_502 && no_budget_hdr && all_retries_proceeded; + TestFramework::RecordTest( + "CB Retry Budget: retry budget min-concurrency floor", pass, + pass ? "" : + "is_502=" + std::to_string(is_502) + + " no_budget_hdr=" + std::to_string(no_budget_hdr) + + " backend_hits=" + std::to_string(hits) + + " body=" + r.substr(0, 256)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Retry Budget: retry budget min-concurrency floor", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 3: Dry-run bypasses the retry-budget gate. +// +// With percent=0 + min_concurrency=0 (same as Test 1), TryConsumeRetry +// returns false for every retry. But `circuit_breaker.dry_run=true` +// switches the rejection path to a log-and-proceed: no token is +// consumed, retry_token_held_ stays false, and AttemptCheckout runs as +// though the budget was unlimited. +// +// Result: the client sees the upstream's 502 response (because the +// retries actually fire), NOT a 503 + X-Retry-Budget-Exhausted. +// --------------------------------------------------------------------------- +void TestRetryBudgetDryRunPassthrough() { + std::cout << "\n[TEST] CB Retry Budget: retry budget dry-run passthrough..." 
+ << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + auto u = MakeRetryBudgetUpstream("svc", "127.0.0.1", backend_port, + /*percent=*/0, + /*min_concurrency=*/0, + /*dry_run=*/true); + u.proxy.retry.max_retries = 2; + u.proxy.retry.retry_on_5xx = true; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 10000); + + // Retries proceeded despite would-reject decisions — the client + // sees the upstream's final 502, not our local 503. + bool is_502 = TestHttpClient::HasStatus(r, 502); + bool no_budget_hdr = !HasRetryBudgetHeader(r); + int hits = backend_hits.load(std::memory_order_relaxed); + bool all_attempts_ran = (hits == 3); // 1 first + 2 retries + + bool pass = is_502 && no_budget_hdr && all_attempts_ran; + TestFramework::RecordTest( + "CB Retry Budget: retry budget dry-run passthrough", pass, + pass ? "" : + "is_502=" + std::to_string(is_502) + + " no_budget_hdr=" + std::to_string(no_budget_hdr) + + " backend_hits=" + std::to_string(hits) + + " body=" + r.substr(0, 256)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Retry Budget: retry budget dry-run passthrough", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 4: First attempts are NOT budget-gated. +// +// The retry-budget cap applies only to retries (attempt_ > 0). 
First +// attempts call TrackInFlight (which only ever increments) but skip +// TryConsumeRetry entirely. With percent=0 + min_concurrency=0 and a +// backend that always 200s, every client request must succeed — if the +// gate accidentally ran on first attempts, we'd see 503s here. +// +// Guards against a regression where TryConsumeRetry is called before +// the `attempt_ > 0` gate, or where the gate is placed in +// AttemptCheckout instead of MaybeRetry. +// --------------------------------------------------------------------------- +void TestFirstAttemptsNotGated() { + std::cout << "\n[TEST] CB Retry Budget: first attempts not gated..." + << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + resp.Status(200).Body("ok", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + auto u = MakeRetryBudgetUpstream("svc", "127.0.0.1", backend_port, + /*percent=*/0, + /*min_concurrency=*/0); + // No retries — every request is a first attempt. + u.proxy.retry.max_retries = 0; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + int client_count = 5; + int successes = 0; + for (int i = 0; i < client_count; ++i) { + std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); + if (TestHttpClient::HasStatus(r, 200)) ++successes; + if (HasRetryBudgetHeader(r)) { + // Any X-Retry-Budget-Exhausted on a first-attempt-only + // path is a bug. Record and bail. 
+ TestFramework::RecordTest( + "CB Retry Budget: first attempts not gated", false, + "unexpected X-Retry-Budget-Exhausted on first-attempt path " + "i=" + std::to_string(i)); + return; + } + } + + int hits = backend_hits.load(std::memory_order_relaxed); + bool pass = (successes == client_count) && (hits == client_count); + TestFramework::RecordTest( + "CB Retry Budget: first attempts not gated", pass, + pass ? "" : + "successes=" + std::to_string(successes) + + "/" + std::to_string(client_count) + + " backend_hits=" + std::to_string(hits)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Retry Budget: first attempts not gated", false, e.what()); + } +} + +void RunAllTests() { + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "CIRCUIT BREAKER - RETRY BUDGET INTEGRATION TESTS" + << std::endl; + std::cout << std::string(60, '=') << std::endl; + + TestRetryBudgetRejectsRetry(); + TestRetryBudgetMinConcurrencyFloor(); + TestRetryBudgetDryRunPassthrough(); + TestFirstAttemptsNotGated(); +} + +} // namespace CircuitBreakerRetryBudgetTests diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h index 65b03777..bed54da0 100644 --- a/test/circuit_breaker_test.h +++ b/test/circuit_breaker_test.h @@ -732,7 +732,7 @@ void TestHalfOpenStopsAdmittingAfterFirstProbeFailure() { } // Verifies the dedicated HALF_OPEN-full counter is bumped separately from the -// generic `rejected_` counter, so Phase 7 snapshots can distinguish +// generic `rejected_` counter, so observability snapshots can distinguish // "open, backoff not elapsed" from "probing, no slots left". void TestHalfOpenFullCounterSeparate() { std::cout << "\n[TEST] CB: HALF_OPEN_FULL counter separate..." 
<< std::endl; @@ -958,7 +958,7 @@ void TestSawFailureDoesNotBumpHalfOpenFullCounter() { // BUG (review round 3, P2): TransitionOpenToHalfOpen deliberately left // `open_until_steady_ns_` populated, violating the documented OpenUntil() -// contract ("zero when not OPEN"). A Phase 4 consumer computing Retry-After +// contract ("zero when not OPEN"). A consumer computing Retry-After // from a HALF_OPEN slice would compute (stale_deadline - now), which is // negative once HALF_OPEN begins. void TestOpenUntilZeroWhenHalfOpen() { diff --git a/test/circuit_breaker_wait_queue_drain_test.h b/test/circuit_breaker_wait_queue_drain_test.h new file mode 100644 index 00000000..d2200094 --- /dev/null +++ b/test/circuit_breaker_wait_queue_drain_test.h @@ -0,0 +1,261 @@ +#pragma once + +// Wait-queue-drain integration tests: wait-queue drain on CLOSED → OPEN trip. +// +// The integration suite covers "new requests after a trip hit +// REJECTED_OPEN". This suite covers the orthogonal case: a request that passed ConsultBreaker +// pre-trip and is waiting in the pool's bounded wait queue when the trip +// fires. Without the drain, that waiter would sit until either the pool +// frees a slot (and then re-hit the upstream — pointless traffic) or the +// queue-timeout / open-duration elapses (up to 60s latency spike). +// +// Mechanism tested: `HttpServer::MarkServerReady` installs a transition +// callback on every slice that routes CLOSED → OPEN to the corresponding +// `PoolPartition::DrainWaitQueueOnTrip()`. Each waiter receives +// `CHECKOUT_CIRCUIT_OPEN`, which `ProxyTransaction::OnCheckoutError` maps +// to the standard circuit-open response (503 + `X-Circuit-Breaker: open`). +// +// Strategy: gate concurrency via a 1-connection pool. The first request +// hangs at the backend long enough to let a second request queue behind +// it. 
When the first's response lands (502), the breaker trips and the +// drain fires, causing the queued request to receive 503 + circuit-open +// headers instead of the backend's 502 (which would happen if the drain +// were missing and the queued request proceeded). + +#include "test_framework.h" +#include "test_server_runner.h" +#include "http_test_client.h" +#include "http/http_server.h" +#include "config/server_config.h" + +#include +#include +#include +#include +#include + +namespace CircuitBreakerWaitQueueDrainTests { + +static UpstreamConfig MakeDrainTripUpstream(const std::string& name, + const std::string& host, + int port, + bool breaker_enabled) { + UpstreamConfig u; + u.name = name; + u.host = host; + u.port = port; + // Single connection per partition — forces the second concurrent + // request to queue behind the first. Since tests run with + // worker_threads=1, one partition exists and it has exactly one + // connection slot. + u.pool.max_connections = 1; + u.pool.max_idle_connections = 1; + u.pool.connect_timeout_ms = 3000; + u.pool.idle_timeout_sec = 30; + u.pool.max_lifetime_sec = 3600; + u.pool.max_requests_per_conn = 0; + + u.proxy.route_prefix = "/fail"; + u.proxy.strip_prefix = false; + u.proxy.response_timeout_ms = 5000; + u.proxy.retry.max_retries = 0; // Deterministic — no retry confounds. + + u.circuit_breaker.enabled = breaker_enabled; + u.circuit_breaker.consecutive_failure_threshold = 1; // Trip on first 5xx. + u.circuit_breaker.failure_rate_threshold = 100; + u.circuit_breaker.minimum_volume = 10000; + u.circuit_breaker.window_seconds = 10; + u.circuit_breaker.permitted_half_open_calls = 2; + // Long open duration so the drain is unambiguously the thing that + // surfaces the 503 to the queued client — not a timer-driven + // HALF_OPEN recovery admitting a subsequent attempt. 
+ u.circuit_breaker.base_open_duration_ms = 30000; + u.circuit_breaker.max_open_duration_ms = 60000; + return u; +} + +// --------------------------------------------------------------------------- +// Test 1: CLOSED→OPEN trip drains queued waiter with 503 + X-Circuit-Breaker. +// +// Request A takes the single pool slot and hangs at the backend for ~300ms. +// Request B queues (pool exhausted). At t≈300ms, A's backend response +// arrives: 502 → slice trip → transition callback → DrainWaitQueueOnTrip → +// B's error_callback fires with CHECKOUT_CIRCUIT_OPEN. B's client receives +// 503 + `X-Circuit-Breaker: open`. +// +// Pre-fix (no drain): B waits ~300ms for A's slot to free, then hits the +// backend itself, gets 502, client sees 502 — NOT 503 and NOT +// X-Circuit-Breaker: open. The assertion `is_503 && has_breaker_header` +// fails without the drain wiring. +// --------------------------------------------------------------------------- +void TestWaitQueueDrainedOnTrip() { + std::cout << "\n[TEST] CB Wait-Queue Drain: wait queue drained on trip..." + << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + // Delay so the gateway's pool holds the connection long + // enough for a second client request to queue on it. + std::this_thread::sleep_for(std::chrono::milliseconds(300)); + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; // Single partition → single wait queue. 
+ gw.http2.enabled = false; + + gw.upstreams.push_back( + MakeDrainTripUpstream("svc", "127.0.0.1", backend_port, + /*breaker_enabled=*/true)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Launch A first (takes the one connection), then B 50ms later + // so B is guaranteed to enter the wait queue. + std::promise a_resp, b_resp; + auto a_fut = a_resp.get_future(); + auto b_fut = b_resp.get_future(); + std::thread a([&]() { + a_resp.set_value(TestHttpClient::HttpGet(gw_port, "/fail", 5000)); + }); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + std::thread b([&]() { + b_resp.set_value(TestHttpClient::HttpGet(gw_port, "/fail", 5000)); + }); + a.join(); + b.join(); + + std::string ra = a_fut.get(); + std::string rb = b_fut.get(); + + // A unambiguously hits the backend (owns the slot) and sees 502. + bool a_is_502 = TestHttpClient::HasStatus(ra, 502); + // B must see the circuit-open short-circuit from the drain — + // NOT a 502 from the backend, which is what happens without + // the drain wiring. + bool b_is_503 = TestHttpClient::HasStatus(rb, 503); + bool b_has_breaker_hdr = + rb.find("X-Circuit-Breaker: open") != std::string::npos || + rb.find("x-circuit-breaker: open") != std::string::npos; + // Exactly one backend hit — B was drained before making it to + // the upstream. Without the drain, backend_hits would be 2. + int hits = backend_hits.load(std::memory_order_relaxed); + bool single_hit = (hits == 1); + + bool pass = a_is_502 && b_is_503 && b_has_breaker_hdr && single_hit; + TestFramework::RecordTest( + "CB Wait-Queue Drain: wait queue drained on trip", pass, + pass ? 
"" : + "a_is_502=" + std::to_string(a_is_502) + + " b_is_503=" + std::to_string(b_is_503) + + " b_breaker_hdr=" + std::to_string(b_has_breaker_hdr) + + " backend_hits=" + std::to_string(hits) + + " rb_head=" + rb.substr(0, 200)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Wait-Queue Drain: wait queue drained on trip", false, e.what()); + } +} + +// --------------------------------------------------------------------------- +// Test 2: With the breaker disabled, the drain does NOT fire — the queued +// waiter proceeds to the upstream as it would absent the circuit-breaker +// layer entirely. +// +// Same setup as Test 1 but `circuit_breaker.enabled=false`. Disabled slices +// short-circuit in TryAcquire and never invoke transition callbacks, so +// DrainWaitQueueOnTrip is never called. Request B must hit the backend +// (backend_hits == 2) and receive the upstream's 502 — NOT a 503. +// --------------------------------------------------------------------------- +void TestDisabledBreakerDoesNotDrain() { + std::cout << "\n[TEST] CB Wait-Queue Drain: disabled breaker does not drain..." 
+ << std::endl; + try { + std::atomic backend_hits{0}; + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { + backend_hits.fetch_add(1, std::memory_order_relaxed); + std::this_thread::sleep_for(std::chrono::milliseconds(300)); + resp.Status(502).Body("upstream-err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + + gw.upstreams.push_back( + MakeDrainTripUpstream("svc", "127.0.0.1", backend_port, + /*breaker_enabled=*/false)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + std::promise a_resp, b_resp; + auto a_fut = a_resp.get_future(); + auto b_fut = b_resp.get_future(); + std::thread a([&]() { + a_resp.set_value(TestHttpClient::HttpGet(gw_port, "/fail", 5000)); + }); + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + std::thread b([&]() { + b_resp.set_value(TestHttpClient::HttpGet(gw_port, "/fail", 5000)); + }); + a.join(); + b.join(); + + std::string ra = a_fut.get(); + std::string rb = b_fut.get(); + + // Both reach the backend — disabled breaker = no drain. + bool a_is_502 = TestHttpClient::HasStatus(ra, 502); + bool b_is_502 = TestHttpClient::HasStatus(rb, 502); + // Neither should carry the circuit-open header. 
+ bool no_breaker_on_a = + ra.find("X-Circuit-Breaker") == std::string::npos && + ra.find("x-circuit-breaker") == std::string::npos; + bool no_breaker_on_b = + rb.find("X-Circuit-Breaker") == std::string::npos && + rb.find("x-circuit-breaker") == std::string::npos; + int hits = backend_hits.load(std::memory_order_relaxed); + bool two_hits = (hits == 2); + + bool pass = a_is_502 && b_is_502 && no_breaker_on_a && + no_breaker_on_b && two_hits; + TestFramework::RecordTest( + "CB Wait-Queue Drain: disabled breaker does not drain", pass, + pass ? "" : + "a_is_502=" + std::to_string(a_is_502) + + " b_is_502=" + std::to_string(b_is_502) + + " no_breaker_on_a=" + std::to_string(no_breaker_on_a) + + " no_breaker_on_b=" + std::to_string(no_breaker_on_b) + + " backend_hits=" + std::to_string(hits)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Wait-Queue Drain: disabled breaker does not drain", false, e.what()); + } +} + +void RunAllTests() { + std::cout << "\n" << std::string(60, '=') << std::endl; + std::cout << "CIRCUIT BREAKER - WAIT-QUEUE DRAIN ON TRIP TESTS" + << std::endl; + std::cout << std::string(60, '=') << std::endl; + + TestWaitQueueDrainedOnTrip(); + TestDisabledBreakerDoesNotDrain(); +} + +} // namespace CircuitBreakerWaitQueueDrainTests diff --git a/test/config_test.h b/test/config_test.h index fe164ec3..778f464b 100644 --- a/test/config_test.h +++ b/test/config_test.h @@ -562,14 +562,15 @@ namespace ConfigTests { "circuit_breaker.enabled must be a boolean"); } - // Test 14: UpstreamConfig::operator== INCLUDES circuit_breaker until Phase 8. - // Until CircuitBreakerManager::Reload is wired in HttpServer::Reload, a - // CB-only SIGHUP has no propagation path. Keeping circuit_breaker in the - // equality check ensures the server fires the "restart required" warning - // rather than silently reporting "reload OK" with stale live settings. - // TODO(phase-8): flip this test when CB hot-reload is implemented. 
+ // UpstreamConfig::operator== EXCLUDES circuit_breaker. + // CircuitBreakerManager::Reload is wired in HttpServer::Reload, so a + // CB-only SIGHUP is a clean hot reload. Excluding circuit_breaker from + // the equality check ensures the outer reload doesn't fire a spurious + // "restart required" warning on a pure CB-fields edit. + // Topology fields (name, host, port, tls, pool, proxy) remain + // restart-only and must still trigger inequality. void TestCircuitBreakerEquality() { - std::cout << "\n[TEST] Circuit Breaker Equality (CB included until Phase 8)..." << std::endl; + std::cout << "\n[TEST] Circuit Breaker Equality (CB excluded from UpstreamConfig::operator==)..." << std::endl; try { UpstreamConfig a; a.name = "svc"; a.host = "h"; a.port = 80; @@ -578,16 +579,17 @@ namespace ConfigTests { // Default equal. bool equal_default = (a == b); - // Circuit-breaker-only edit DOES change UpstreamConfig equality - // (until Phase 8 ships the live-reload path). + // Circuit-breaker-only edit must NOT break equality — breaker + // fields are live-reloadable via CircuitBreakerManager::Reload. b.circuit_breaker.enabled = true; b.circuit_breaker.window_seconds = 30; - bool cb_edit_detected = (a != b); + bool cb_edit_invisible = (a == b); - // CircuitBreakerConfig::operator== agrees on the field diff. + // CircuitBreakerConfig::operator== still detects the field diff + // (CircuitBreakerManager::Reload relies on this inner comparison). bool cb_fields_differ = (a.circuit_breaker != b.circuit_breaker); - // Topology changes also make configs unequal. + // Topology changes still make configs unequal. 
UpstreamConfig c = a; c.host = "different"; bool topology_changed = (a != c); @@ -596,20 +598,20 @@ namespace ConfigTests { d.port = 9999; bool port_change_detected = (a != d); - bool pass = equal_default && cb_edit_detected && + bool pass = equal_default && cb_edit_invisible && cb_fields_differ && topology_changed && port_change_detected; - TestFramework::RecordTest("Circuit Breaker Equality (CB included until Phase 8)", + TestFramework::RecordTest("Circuit Breaker Equality (CB excluded from UpstreamConfig::operator==)", pass, pass ? "" : "equal_default=" + std::to_string(equal_default) + - " cb_edit_detected=" + std::to_string(cb_edit_detected) + + " cb_edit_invisible=" + std::to_string(cb_edit_invisible) + " cb_fields_differ=" + std::to_string(cb_fields_differ) + " topology_changed=" + std::to_string(topology_changed) + " port_change_detected=" + std::to_string(port_change_detected), TestFramework::TestCategory::OTHER); } catch (const std::exception& e) { - TestFramework::RecordTest("Circuit Breaker Equality (CB included until Phase 8)", + TestFramework::RecordTest("Circuit Breaker Equality (CB excluded from UpstreamConfig::operator==)", false, e.what(), TestFramework::TestCategory::OTHER); } } @@ -629,7 +631,7 @@ namespace ConfigTests { TestEnvOverrides(); TestMissingFile(); - // Phase 1: Circuit breaker config + // Circuit breaker config tests TestCircuitBreakerDefaults(); TestCircuitBreakerJsonParse(); TestCircuitBreakerJsonPartial(); diff --git a/test/run_test.cc b/test/run_test.cc index 17d7eed9..0419c6ee 100644 --- a/test/run_test.cc +++ b/test/run_test.cc @@ -14,11 +14,12 @@ #include "proxy_test.h" #include "rate_limit_test.h" #include "circuit_breaker_test.h" -#include "circuit_breaker_phase3_test.h" -#include "circuit_breaker_phase4_test.h" -#include "circuit_breaker_phase5_test.h" -#include "circuit_breaker_phase6_test.h" -#include "circuit_breaker_phase7_test.h" +#include "circuit_breaker_components_test.h" +#include 
"circuit_breaker_integration_test.h" +#include "circuit_breaker_retry_budget_test.h" +#include "circuit_breaker_wait_queue_drain_test.h" +#include "circuit_breaker_observability_test.h" +#include "circuit_breaker_reload_test.h" #include "test_framework.h" #include #include @@ -86,21 +87,24 @@ void RunAllTest(){ // Run circuit breaker tests CircuitBreakerTests::RunAllTests(); - // Run circuit breaker Phase 3 tests (host / manager / retry budget) - CircuitBreakerPhase3Tests::RunAllTests(); + // Run circuit-breaker component unit tests (RetryBudget / Host / Manager) + CircuitBreakerComponentsTests::RunAllTests(); - // Run circuit breaker Phase 4 integration tests (end-to-end through + // Run circuit-breaker integration tests (end-to-end through // ProxyTransaction + UpstreamManager + HttpServer) - CircuitBreakerPhase4Tests::RunAllTests(); + CircuitBreakerIntegrationTests::RunAllTests(); - // Run circuit breaker Phase 5 retry-budget integration tests - CircuitBreakerPhase5Tests::RunAllTests(); + // Run circuit-breaker retry-budget integration tests + CircuitBreakerRetryBudgetTests::RunAllTests(); - // Run circuit breaker Phase 6 wait-queue-drain-on-trip tests - CircuitBreakerPhase6Tests::RunAllTests(); + // Run circuit-breaker wait-queue-drain-on-trip tests + CircuitBreakerWaitQueueDrainTests::RunAllTests(); - // Run circuit breaker Phase 7 observability tests - CircuitBreakerPhase7Tests::RunAllTests(); + // Run circuit-breaker observability tests + CircuitBreakerObservabilityTests::RunAllTests(); + + // Run circuit-breaker hot-reload tests + CircuitBreakerReloadTests::RunAllTests(); std::cout << "====================================\n" << std::endl; } @@ -180,14 +184,15 @@ int main(int argc, char* argv[]) { // Run rate limit tests }else if(mode == "rate_limit" || mode == "-L"){ RateLimitTests::RunAllTests(); - // Run circuit breaker tests (phases 1-7: unit + phase3 + phase4 + phase5 + phase6 + phase7) + // Run circuit-breaker tests (unit + components + integration + 
retry-budget + drain + observability + reload) }else if(mode == "circuit_breaker" || mode == "-B"){ CircuitBreakerTests::RunAllTests(); - CircuitBreakerPhase3Tests::RunAllTests(); - CircuitBreakerPhase4Tests::RunAllTests(); - CircuitBreakerPhase5Tests::RunAllTests(); - CircuitBreakerPhase6Tests::RunAllTests(); - CircuitBreakerPhase7Tests::RunAllTests(); + CircuitBreakerComponentsTests::RunAllTests(); + CircuitBreakerIntegrationTests::RunAllTests(); + CircuitBreakerRetryBudgetTests::RunAllTests(); + CircuitBreakerWaitQueueDrainTests::RunAllTests(); + CircuitBreakerObservabilityTests::RunAllTests(); + CircuitBreakerReloadTests::RunAllTests(); // Show help }else if(mode == "help" || mode == "-h" || mode == "--help"){ PrintUsage(argv[0]); From ed2946f6a43232fa2c3bd82b5061a03d88a4c131 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 22:05:00 +0800 Subject: [PATCH 26/37] Fix review comment --- test/circuit_breaker_phase3_test.h | 506 ------------ test/circuit_breaker_phase4_test.h | 1213 ---------------------------- test/circuit_breaker_phase5_test.h | 366 --------- test/circuit_breaker_phase6_test.h | 261 ------ test/circuit_breaker_phase7_test.h | 405 ---------- 5 files changed, 2751 deletions(-) delete mode 100644 test/circuit_breaker_phase3_test.h delete mode 100644 test/circuit_breaker_phase4_test.h delete mode 100644 test/circuit_breaker_phase5_test.h delete mode 100644 test/circuit_breaker_phase6_test.h delete mode 100644 test/circuit_breaker_phase7_test.h diff --git a/test/circuit_breaker_phase3_test.h b/test/circuit_breaker_phase3_test.h deleted file mode 100644 index 87ed28e7..00000000 --- a/test/circuit_breaker_phase3_test.h +++ /dev/null @@ -1,506 +0,0 @@ -#pragma once - -#include "test_framework.h" -#include "config/server_config.h" -#include "circuit_breaker/circuit_breaker_state.h" -#include "circuit_breaker/circuit_breaker_slice.h" -#include "circuit_breaker/retry_budget.h" -#include "circuit_breaker/circuit_breaker_host.h" -#include 
"circuit_breaker/circuit_breaker_manager.h" -#include "dispatcher.h" - -#include -#include -#include -#include - -// Phase 3 unit tests: RetryBudget, CircuitBreakerHost, CircuitBreakerManager. -// -// These tests exercise the standalone data structures introduced in Phase 3 -// without any integration into the request path (that comes in Phase 4). -// Every test constructs the object under test in isolation — no live -// dispatchers, no network I/O. A minimal Dispatcher is instantiated only -// where CircuitBreakerHost::Reload needs one to enqueue per-slice Reload -// calls. -namespace CircuitBreakerPhase3Tests { - -using circuit_breaker::CircuitBreakerHost; -using circuit_breaker::CircuitBreakerHostSnapshot; -using circuit_breaker::CircuitBreakerManager; -using circuit_breaker::Decision; -using circuit_breaker::FailureKind; -using circuit_breaker::RetryBudget; -using circuit_breaker::State; - -static CircuitBreakerConfig DefaultCbConfig() { - CircuitBreakerConfig cb; - cb.enabled = true; - cb.consecutive_failure_threshold = 5; - cb.failure_rate_threshold = 50; - cb.minimum_volume = 20; - cb.window_seconds = 10; - cb.permitted_half_open_calls = 3; - cb.base_open_duration_ms = 5000; - cb.max_open_duration_ms = 60000; - cb.retry_budget_percent = 20; - cb.retry_budget_min_concurrency = 3; - return cb; -} - -// ============================================================================ -// RetryBudget tests -// ============================================================================ - -// Min-concurrency floor: with tiny in_flight, min_concurrency still permits -// the configured floor of concurrent retries (otherwise a 20% budget allows 0 -// retries when in_flight < 5 — useless in low-volume services). -void TestRetryBudgetMinConcurrencyFloor() { - std::cout << "\n[TEST] RetryBudget: min_concurrency floor permits retries..." - << std::endl; - try { - // percent=20, min=3. Even with 0 in_flight, 3 retries allowed. 
- RetryBudget rb(20, 3); - - // Without any in_flight, min floor is what gates us. - bool r1 = rb.TryConsumeRetry(); // 1/3 - bool r2 = rb.TryConsumeRetry(); // 2/3 - bool r3 = rb.TryConsumeRetry(); // 3/3 - bool r4 = rb.TryConsumeRetry(); // over → rejected - - bool pass = r1 && r2 && r3 && !r4 && - rb.RetriesInFlight() == 3 && - rb.RetriesRejected() == 1; - - rb.ReleaseRetry(); rb.ReleaseRetry(); rb.ReleaseRetry(); - pass = pass && rb.RetriesInFlight() == 0; - - TestFramework::RecordTest("RetryBudget min_concurrency floor", pass, - pass ? "" : "r1=" + std::to_string(r1) + - " r2=" + std::to_string(r2) + - " r3=" + std::to_string(r3) + - " r4=" + std::to_string(r4) + - " inflight=" + std::to_string(rb.RetriesInFlight()) + - " rejected=" + std::to_string(rb.RetriesRejected()), - TestFramework::TestCategory::OTHER); - } catch (const std::exception& e) { - TestFramework::RecordTest("RetryBudget min_concurrency floor", false, - e.what(), TestFramework::TestCategory::OTHER); - } -} - -// Percent-based cap scales with in_flight. -// percent=20, min=0, in_flight=50 → cap = 10 retries. -void TestRetryBudgetPercentCap() { - std::cout << "\n[TEST] RetryBudget: percent cap scales with in_flight..." - << std::endl; - try { - RetryBudget rb(20, 0); // no min floor — pure percent - - // Push in_flight to 50 via guards that we intentionally keep - // alive. Per the documented API, callers hold TrackInFlight() - // for BOTH first attempts and retries — but TryConsumeRetry - // subtracts retries_in_flight from the base so the budget - // doesn't self-inflate as retries are admitted. - std::vector guards; - for (int i = 0; i < 50; ++i) guards.push_back(rb.TrackInFlight()); - - // With 50 non-retry in-flight and 20% budget the first - // admission is against cap=10, but each admission shrinks the - // non-retry base by 1. The admission count converges at r - // where r >= floor((50-r) * 20 / 100). Solving: r = 8. 
The - // pre-fix formula (cap computed from raw in_flight) would - // admit 10, drifting the effective ratio above 20% of - // originals. - int admitted = 0; - for (int i = 0; i < 20; ++i) { - if (rb.TryConsumeRetry()) ++admitted; - } - bool cap_hit = admitted == 8; - bool rejected_count = rb.RetriesRejected() == 12; - - // Release guards — in_flight drops to 0; future TryConsumeRetry with - // min=0 and in_flight=0 rejects everything. - for (auto& g : guards) (void)std::move(g); - guards.clear(); - for (int i = 0; i < admitted; ++i) rb.ReleaseRetry(); - - bool pass = cap_hit && rejected_count && rb.InFlight() == 0 && - rb.RetriesInFlight() == 0; - TestFramework::RecordTest("RetryBudget percent cap", pass, - pass ? "" : "admitted=" + std::to_string(admitted) + - " rejected=" + std::to_string(rb.RetriesRejected()) + - " inflight=" + std::to_string(rb.InFlight()), - TestFramework::TestCategory::OTHER); - } catch (const std::exception& e) { - TestFramework::RecordTest("RetryBudget percent cap", false, - e.what(), TestFramework::TestCategory::OTHER); - } -} - -// TrackInFlight guards must be RAII-safe: destroying the guard decrements -// in_flight_; moving the guard transfers ownership; self-move safe. -void TestRetryBudgetInFlightGuardRaii() { - std::cout << "\n[TEST] RetryBudget: InFlightGuard RAII..." << std::endl; - try { - RetryBudget rb(20, 3); - - bool zero_init = rb.InFlight() == 0; - { - auto g = rb.TrackInFlight(); - bool one_after_track = rb.InFlight() == 1; - - // Move-construct: counter transfers, original is empty. - auto g2 = std::move(g); - bool still_one_after_move = rb.InFlight() == 1; - // g is now empty, destroying it decrements nothing. - (void)g; - - // g2 goes out of scope next. 
- if (!zero_init || !one_after_track || !still_one_after_move) { - TestFramework::RecordTest("RetryBudget InFlightGuard RAII", - false, "mid-test state wrong", - TestFramework::TestCategory::OTHER); - return; - } - } - bool zero_after_drop = rb.InFlight() == 0; - TestFramework::RecordTest("RetryBudget InFlightGuard RAII", - zero_after_drop, - zero_after_drop ? "" : "in_flight not zero after guard drop", - TestFramework::TestCategory::OTHER); - } catch (const std::exception& e) { - TestFramework::RecordTest("RetryBudget InFlightGuard RAII", - false, e.what(), TestFramework::TestCategory::OTHER); - } -} - -// Reload updates tuning atomically without resetting in-flight counters — -// the admission formula changes, outstanding retries keep running. -void TestRetryBudgetReloadPreservesCounters() { - std::cout << "\n[TEST] RetryBudget: Reload preserves in-flight..." - << std::endl; - try { - RetryBudget rb(20, 3); - bool r1 = rb.TryConsumeRetry(); // 1/3 - - // Tighten tuning mid-flight. - rb.Reload(10, 1); - - // Outstanding retry is still tracked. - bool inflight_preserved = rb.RetriesInFlight() == 1; - - // New tuning applies — min=1, so 1/1 retry allowed max. - // Current retries_in_flight=1 already, next attempt rejects. - bool r2 = rb.TryConsumeRetry(); - - rb.ReleaseRetry(); - bool cleanup_ok = rb.RetriesInFlight() == 0; - - bool pass = r1 && inflight_preserved && !r2 && cleanup_ok; - TestFramework::RecordTest("RetryBudget Reload preserves counters", pass, - pass ? 
"" : "r1=" + std::to_string(r1) + - " inflight_preserved=" + std::to_string(inflight_preserved) + - " r2=" + std::to_string(r2) + - " cleanup_ok=" + std::to_string(cleanup_ok), - TestFramework::TestCategory::OTHER); - } catch (const std::exception& e) { - TestFramework::RecordTest("RetryBudget Reload preserves counters", - false, e.what(), TestFramework::TestCategory::OTHER); - } -} - -// Clamp guards: negative percent / negative min_concurrency are clamped at -// construction (mirrors ConfigLoader::Validate — programmatic callers that -// bypass validation get safe defaults). -void TestRetryBudgetClampsInvalidTuning() { - std::cout << "\n[TEST] RetryBudget: clamps invalid tuning..." << std::endl; - try { - RetryBudget rb(-50, -10); - bool clamped = rb.percent() == 0 && rb.min_concurrency() == 0; - - // Over-max percent clamps to 100. - RetryBudget rb2(500, 5); - bool over_clamped = rb2.percent() == 100; - - // Reload also clamps. - rb.Reload(-1, -1); - bool reload_clamped = rb.percent() == 0 && rb.min_concurrency() == 0; - - bool pass = clamped && over_clamped && reload_clamped; - TestFramework::RecordTest("RetryBudget clamps invalid tuning", pass, - pass ? "" : - "clamped=" + std::to_string(clamped) + - " over_clamped=" + std::to_string(over_clamped) + - " reload_clamped=" + std::to_string(reload_clamped), - TestFramework::TestCategory::OTHER); - } catch (const std::exception& e) { - TestFramework::RecordTest("RetryBudget clamps invalid tuning", - false, e.what(), TestFramework::TestCategory::OTHER); - } -} - -// ============================================================================ -// CircuitBreakerHost tests -// ============================================================================ - -// Host creates partition_count slices, GetSlice looks up by index, out-of- -// range returns nullptr (not a crash). -void TestHostCreatesSlicesAndGetSlice() { - std::cout << "\n[TEST] CircuitBreakerHost: creates slices + GetSlice..." 
- << std::endl; - try { - auto cb = DefaultCbConfig(); - CircuitBreakerHost host("svc", "10.0.0.1", 8080, 4, cb); - - bool count_ok = host.partition_count() == 4; - bool slice0 = host.GetSlice(0) != nullptr; - bool slice3 = host.GetSlice(3) != nullptr; - bool slice4_null = host.GetSlice(4) == nullptr; // out of range - bool slice_big_null = host.GetSlice(100) == nullptr; - - // Retry budget always present. - bool rb_present = host.GetRetryBudget() != nullptr; - - // Field getters. - bool fields_ok = host.service_name() == "svc" && - host.host() == "10.0.0.1" && - host.port() == 8080; - - bool pass = count_ok && slice0 && slice3 && slice4_null && - slice_big_null && rb_present && fields_ok; - TestFramework::RecordTest("CircuitBreakerHost GetSlice", pass, "", - TestFramework::TestCategory::OTHER); - } catch (const std::exception& e) { - TestFramework::RecordTest("CircuitBreakerHost GetSlice", false, - e.what(), TestFramework::TestCategory::OTHER); - } -} - -// Host Snapshot aggregates counters across slices and rolls up states. -void TestHostSnapshotAggregates() { - std::cout << "\n[TEST] CircuitBreakerHost: Snapshot aggregates..." - << std::endl; - try { - auto cb = DefaultCbConfig(); - cb.consecutive_failure_threshold = 2; - cb.failure_rate_threshold = 100; - cb.minimum_volume = 1000; - CircuitBreakerHost host("svc", "h", 80, 3, cb); - - // Trip slice 0 and 2 → 2 open_partitions, 1 closed. 
- for (int p : {0, 2}) { - auto* s = host.GetSlice(p); - for (int i = 0; i < 2; ++i) { - auto a = s->TryAcquire(); - s->ReportFailure(FailureKind::RESPONSE_5XX, false, a.generation); - } - } - - auto snap = host.Snapshot(); - - bool rows_ok = snap.slices.size() == 3; - bool total_trips = snap.total_trips == 2; - bool open = snap.open_partitions == 2; - bool halfopen = snap.half_open_partitions == 0; - bool svc_ok = snap.service_name == "svc" && - snap.host == "h" && snap.port == 80; - - bool pass = rows_ok && total_trips && open && halfopen && svc_ok; - TestFramework::RecordTest("CircuitBreakerHost Snapshot aggregates", pass, - pass ? "" : - "rows=" + std::to_string(snap.slices.size()) + - " trips=" + std::to_string(snap.total_trips) + - " open=" + std::to_string(snap.open_partitions), - TestFramework::TestCategory::OTHER); - } catch (const std::exception& e) { - TestFramework::RecordTest("CircuitBreakerHost Snapshot aggregates", - false, e.what(), TestFramework::TestCategory::OTHER); - } -} - -// Host Reload with mismatched dispatcher count logs error and does nothing. -// Uses an empty dispatcher vector — the mismatch path must NOT dereference. -void TestHostReloadDispatcherMismatchIsSafe() { - std::cout << "\n[TEST] CircuitBreakerHost: Reload dispatcher mismatch..." - << std::endl; - try { - auto cb = DefaultCbConfig(); - CircuitBreakerHost host("svc", "h", 80, 3, cb); - - auto new_cb = cb; - new_cb.failure_rate_threshold = 80; - - // Mismatch: 0 dispatchers vs 3 slices. Must not crash, must not - // apply (retry budget atomics should stay at old values). - std::vector> empty; - host.Reload(empty, new_cb); - - // Retry budget fields should be unchanged — Reload bailed early. - bool rb_unchanged = - host.GetRetryBudget()->percent() == cb.retry_budget_percent && - host.GetRetryBudget()->min_concurrency() == - cb.retry_budget_min_concurrency; - - TestFramework::RecordTest("CircuitBreakerHost Reload mismatch is safe", - rb_unchanged, - rb_unchanged ? 
"" : "retry budget incorrectly updated on bail", - TestFramework::TestCategory::OTHER); - } catch (const std::exception& e) { - TestFramework::RecordTest("CircuitBreakerHost Reload mismatch is safe", - false, e.what(), TestFramework::TestCategory::OTHER); - } -} - -// ============================================================================ -// CircuitBreakerManager tests -// ============================================================================ - -// Manager builds one host per upstream (regardless of enabled). GetHost -// returns non-null for known names and null for unknown. -void TestManagerGetHostLookup() { - std::cout << "\n[TEST] CircuitBreakerManager: GetHost lookup..." - << std::endl; - try { - std::vector upstreams(2); - upstreams[0].name = "svc-a"; - upstreams[0].host = "10.0.0.1"; - upstreams[0].port = 8080; - upstreams[0].circuit_breaker = DefaultCbConfig(); - upstreams[1].name = "svc-b"; - upstreams[1].host = "10.0.0.2"; - upstreams[1].port = 9090; - upstreams[1].circuit_breaker = DefaultCbConfig(); - upstreams[1].circuit_breaker.enabled = false; // disabled still built - - CircuitBreakerManager mgr(upstreams, 4, {}); - - bool count_ok = mgr.host_count() == 2; - auto* a = mgr.GetHost("svc-a"); - auto* b = mgr.GetHost("svc-b"); - auto* unknown = mgr.GetHost("nope"); - - bool a_ok = a != nullptr && a->port() == 8080 && - a->partition_count() == 4; - bool b_ok = b != nullptr && b->port() == 9090 && - b->partition_count() == 4; - bool unknown_null = unknown == nullptr; - - bool pass = count_ok && a_ok && b_ok && unknown_null; - TestFramework::RecordTest("CircuitBreakerManager GetHost lookup", pass, - pass ? 
"" : - "count_ok=" + std::to_string(count_ok) + - " a=" + std::to_string(a_ok) + - " b=" + std::to_string(b_ok) + - " unknown_null=" + std::to_string(unknown_null), - TestFramework::TestCategory::OTHER); - } catch (const std::exception& e) { - TestFramework::RecordTest("CircuitBreakerManager GetHost lookup", - false, e.what(), TestFramework::TestCategory::OTHER); - } -} - -// SnapshotAll returns one entry per host; topology-preserved Reload logs and -// skips new/removed names without crashing. -void TestManagerSnapshotAllAndReloadSkipsTopologyChanges() { - std::cout << "\n[TEST] CircuitBreakerManager: SnapshotAll + Reload skips topology..." - << std::endl; - try { - std::vector upstreams(1); - upstreams[0].name = "svc-a"; - upstreams[0].host = "h"; - upstreams[0].port = 80; - upstreams[0].circuit_breaker = DefaultCbConfig(); - - CircuitBreakerManager mgr(upstreams, 2, {}); - - auto snaps = mgr.SnapshotAll(); - bool one_snapshot = snaps.size() == 1; - bool snap_name_ok = snaps[0].service_name == "svc-a"; - - // Reload with a NEW name + REMOVED existing name — both must log - // warn and do nothing (topology is restart-only). - std::vector new_upstreams(1); - new_upstreams[0].name = "svc-NEW"; - new_upstreams[0].host = "h"; - new_upstreams[0].port = 80; - new_upstreams[0].circuit_breaker = DefaultCbConfig(); - - mgr.Reload(new_upstreams); - - // Manager must still only know about svc-a (the original). - bool original_preserved = mgr.GetHost("svc-a") != nullptr; - bool new_not_added = mgr.GetHost("svc-NEW") == nullptr; - bool count_stable = mgr.host_count() == 1; - - bool pass = one_snapshot && snap_name_ok && original_preserved && - new_not_added && count_stable; - TestFramework::RecordTest( - "CircuitBreakerManager SnapshotAll + topology-skip", pass, - pass ? 
"" : - "one_snap=" + std::to_string(one_snapshot) + - " name_ok=" + std::to_string(snap_name_ok) + - " preserved=" + std::to_string(original_preserved) + - " new_not_added=" + std::to_string(new_not_added) + - " count=" + std::to_string(mgr.host_count()), - TestFramework::TestCategory::OTHER); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CircuitBreakerManager SnapshotAll + topology-skip", - false, e.what(), TestFramework::TestCategory::OTHER); - } -} - -// Empty-name upstream is skipped defensively (ConfigLoader::Validate rejects -// empty names, but manager must not blow up if something slips through). -void TestManagerSkipsEmptyNameUpstream() { - std::cout << "\n[TEST] CircuitBreakerManager: skips empty-name upstream..." - << std::endl; - try { - std::vector upstreams(2); - upstreams[0].name = ""; // defensive — should be skipped - upstreams[0].host = "h"; - upstreams[0].port = 80; - upstreams[0].circuit_breaker = DefaultCbConfig(); - upstreams[1].name = "svc-b"; - upstreams[1].host = "h"; - upstreams[1].port = 81; - upstreams[1].circuit_breaker = DefaultCbConfig(); - - CircuitBreakerManager mgr(upstreams, 2, {}); - - bool pass = mgr.host_count() == 1 && - mgr.GetHost("svc-b") != nullptr && - mgr.GetHost("") == nullptr; - TestFramework::RecordTest( - "CircuitBreakerManager skips empty-name upstream", pass, - pass ? "" : "count=" + std::to_string(mgr.host_count()), - TestFramework::TestCategory::OTHER); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CircuitBreakerManager skips empty-name upstream", - false, e.what(), TestFramework::TestCategory::OTHER); - } -} - -// Run all Phase 3 tests. 
-void RunAllTests() { - std::cout << "\n" << std::string(60, '=') << std::endl; - std::cout << "CIRCUIT BREAKER PHASE 3 - UNIT TESTS" << std::endl; - std::cout << std::string(60, '=') << std::endl; - - TestRetryBudgetMinConcurrencyFloor(); - TestRetryBudgetPercentCap(); - TestRetryBudgetInFlightGuardRaii(); - TestRetryBudgetReloadPreservesCounters(); - TestRetryBudgetClampsInvalidTuning(); - - TestHostCreatesSlicesAndGetSlice(); - TestHostSnapshotAggregates(); - TestHostReloadDispatcherMismatchIsSafe(); - - TestManagerGetHostLookup(); - TestManagerSnapshotAllAndReloadSkipsTopologyChanges(); - TestManagerSkipsEmptyNameUpstream(); -} - -} // namespace CircuitBreakerPhase3Tests diff --git a/test/circuit_breaker_phase4_test.h b/test/circuit_breaker_phase4_test.h deleted file mode 100644 index 5626b77a..00000000 --- a/test/circuit_breaker_phase4_test.h +++ /dev/null @@ -1,1213 +0,0 @@ -#pragma once - -// Phase 4 integration tests: circuit breaker wired into ProxyTransaction + -// UpstreamManager + HttpServer. Exercises the full request path end-to-end. -// -// Strategy: use a backend that returns 5xx on every request so repeated hits -// trip the breaker via the consecutive-failure threshold. 5xx responses are -// the cheapest way to accumulate failures (no connect timeouts to wait for). -// Low thresholds keep tests fast. - -#include "test_framework.h" -#include "test_server_runner.h" -#include "http_test_client.h" -#include "http/http_server.h" -#include "config/server_config.h" -#include "upstream/upstream_manager.h" -#include "circuit_breaker/circuit_breaker_manager.h" -#include "circuit_breaker/circuit_breaker_host.h" -#include "circuit_breaker/circuit_breaker_slice.h" - -#include -#include -#include - -namespace CircuitBreakerPhase4Tests { - -using circuit_breaker::State; - -// Shared helper: build an upstream config that proxies /echo → backend and -// has a breaker configured with low thresholds for fast trip. 
-static UpstreamConfig MakeBreakerUpstream(const std::string& name, - const std::string& host, - int port, - bool breaker_enabled, - int consecutive_threshold = 3) { - UpstreamConfig u; - u.name = name; - u.host = host; - u.port = port; - u.pool.max_connections = 8; - u.pool.max_idle_connections = 4; - u.pool.connect_timeout_ms = 3000; - u.pool.idle_timeout_sec = 30; - u.pool.max_lifetime_sec = 3600; - u.pool.max_requests_per_conn = 0; - - // Exact-match route — simpler than prefix patterns for integration tests. - u.proxy.route_prefix = "/fail"; - u.proxy.strip_prefix = false; - u.proxy.response_timeout_ms = 2000; - // No retries — keeps the test deterministic: one request = one attempt. - u.proxy.retry.max_retries = 0; - - u.circuit_breaker.enabled = breaker_enabled; - u.circuit_breaker.consecutive_failure_threshold = consecutive_threshold; - // Disable the rate-based trip path — we drive everything through - // consecutive failures to keep the test count predictable. - u.circuit_breaker.failure_rate_threshold = 100; - u.circuit_breaker.minimum_volume = 10000; - u.circuit_breaker.window_seconds = 10; - u.circuit_breaker.permitted_half_open_calls = 2; - u.circuit_breaker.base_open_duration_ms = 500; // short so recovery test is quick - u.circuit_breaker.max_open_duration_ms = 60000; - return u; -} - -// --------------------------------------------------------------------------- -// Test 1: Breaker trips on consecutive 5xx responses and emits circuit-open -// headers on the rejected request. -// --------------------------------------------------------------------------- -void TestBreakerTripsAfterConsecutiveFailures() { - std::cout << "\n[TEST] CB Phase 4: breaker trips after consecutive 5xx..." - << std::endl; - try { - // Backend always returns 502 — gateway classifies the response as - // FailureKind::RESPONSE_5XX and reports to the breaker on every attempt. 
- HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { - resp.Status(502).Body("upstream err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - // worker_threads=1 → all TCP connections land on dispatcher 0 - // (NetServer shards new connections by fd%worker_threads), so - // per-request failures accumulate deterministically on slice[0] - // instead of splitting across multiple slices. // single thread → single breaker partition exercised - gw.upstreams.push_back( - MakeBreakerUpstream("bad-svc", "127.0.0.1", backend_port, - /*enabled=*/true, /*threshold=*/3)); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - // Hit the failing backend threshold times — each 502 from backend - // propagates to the client as 502 (gateway pass-through) AND counts - // as a RESPONSE_5XX failure in the breaker. - for (int i = 0; i < 3; ++i) { - std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); - if (!TestHttpClient::HasStatus(r, 502)) { - TestFramework::RecordTest( - "CB Phase 4: trip after consecutive failures", false, - "pre-trip request " + std::to_string(i) + " expected 502, got: " + - r.substr(0, 32)); - return; - } - } - - // Next request must be rejected by the breaker (not proxied). The - // response is 503 with X-Circuit-Breaker: open and Retry-After. 
- std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); - bool is_503 = TestHttpClient::HasStatus(r, 503); - bool has_breaker_header = - r.find("X-Circuit-Breaker: open") != std::string::npos || - r.find("x-circuit-breaker: open") != std::string::npos; - bool has_retry_after = - r.find("Retry-After:") != std::string::npos || - r.find("retry-after:") != std::string::npos; - bool has_upstream_host = - r.find("X-Upstream-Host:") != std::string::npos || - r.find("x-upstream-host:") != std::string::npos; - - bool pass = is_503 && has_breaker_header && has_retry_after && - has_upstream_host; - TestFramework::RecordTest( - "CB Phase 4: trip after consecutive failures", pass, - pass ? "" : - "is_503=" + std::to_string(is_503) + - " breaker_hdr=" + std::to_string(has_breaker_header) + - " retry_after=" + std::to_string(has_retry_after) + - " upstream_host=" + std::to_string(has_upstream_host) + - " body=" + r.substr(0, 256)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 4: trip after consecutive failures", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 2: When circuit_breaker.enabled=false, the breaker is bypassed entirely. -// The same failure pattern that would trip an enabled breaker must leave the -// pass-through path untouched — every request still reaches the backend. -// --------------------------------------------------------------------------- -void TestBreakerDisabledPassesThrough() { - std::cout << "\n[TEST] CB Phase 4: disabled breaker passes through..." 
- << std::endl; - try { - std::atomic backend_hits{0}; - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { - backend_hits.fetch_add(1, std::memory_order_relaxed); - resp.Status(502).Body("err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - // worker_threads=1 → all TCP connections land on dispatcher 0 - // (NetServer shards new connections by fd%worker_threads), so - // per-request failures accumulate deterministically on slice[0] - // instead of splitting across multiple slices. - gw.upstreams.push_back( - MakeBreakerUpstream("svc", "127.0.0.1", backend_port, - /*enabled=*/false, /*threshold=*/3)); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - // 10 requests — with breaker disabled, all 10 reach backend. - for (int i = 0; i < 10; ++i) { - std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); - if (!TestHttpClient::HasStatus(r, 502)) { - TestFramework::RecordTest( - "CB Phase 4: disabled breaker passes through", false, - "request " + std::to_string(i) + " expected 502, got: " + - r.substr(0, 32)); - return; - } - } - - bool all_hit = backend_hits.load() == 10; - TestFramework::RecordTest( - "CB Phase 4: disabled breaker passes through", all_hit, - all_hit ? "" : - "expected 10 backend hits, got " + std::to_string(backend_hits.load())); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 4: disabled breaker passes through", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 3: 2xx responses are reported as success — they reset the -// consecutive-failure counter so the breaker doesn't trip on interleaved -// success/failure traffic. 
-// --------------------------------------------------------------------------- -void TestSuccessResetsConsecutiveFailureCounter() { - std::cout << "\n[TEST] CB Phase 4: 2xx success resets consecutive-failure counter..." - << std::endl; - try { - std::atomic fail_mode{true}; - HttpServer backend("127.0.0.1", 0); - // Backend must serve /fail — that's the exact-match route the - // proxy forwards (MakeBreakerUpstream sets route_prefix="/fail", - // strip_prefix=false). A different backend path would leave - // the gateway 404-ing every request without ever exercising - // the proxy, and the CLOSED-state assertion below would pass - // for the wrong reason. - backend.Get("/fail", [&fail_mode](const HttpRequest&, HttpResponse& resp) { - if (fail_mode.load()) { - resp.Status(502).Body("err", "text/plain"); - } else { - resp.Status(200).Body("ok", "text/plain"); - } - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - // worker_threads=1 → all TCP connections land on dispatcher 0 - // (NetServer shards new connections by fd%worker_threads), so - // per-request failures accumulate deterministically on slice[0] - // instead of splitting across multiple slices. - gw.upstreams.push_back( - MakeBreakerUpstream("svc", "127.0.0.1", backend_port, - /*enabled=*/true, /*threshold=*/3)); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - // Pattern: F F S F F — 5 total: 2 fails, 1 success, 2 fails. - // With reset semantics, consecutive_failures_ never exceeds 2 → no trip. 
- for (int i = 0; i < 2; ++i) { - TestHttpClient::HttpGet(gw_port, "/fail", 3000); // FAIL - } - fail_mode.store(false); - TestHttpClient::HttpGet(gw_port, "/fail", 3000); // SUCCESS → reset - fail_mode.store(true); - for (int i = 0; i < 2; ++i) { - TestHttpClient::HttpGet(gw_port, "/fail", 3000); // FAIL - } - - // Inspect the breaker's state directly. The slice must be CLOSED - // AND must have observed activity — without the second check, a - // gateway that 404's every request (e.g. because the proxy route - // doesn't match) would also pass trivially. - auto* cbm = gateway.GetUpstreamManager() ? - gateway.GetUpstreamManager()->GetCircuitBreakerManager() : nullptr; - auto* host = cbm ? cbm->GetHost("svc") : nullptr; - auto* slice = host ? host->GetSlice(0) : nullptr; - bool still_closed = slice && slice->CurrentState() == State::CLOSED; - // No trip fired: total_trips should be zero for this slice. - int64_t trips = slice ? slice->Trips() : -1; - bool no_trips = (trips == 0); - - bool pass = still_closed && no_trips; - TestFramework::RecordTest( - "CB Phase 4: success resets consecutive counter", pass, - pass ? "" : - "state=" + std::to_string(static_cast( - slice ? slice->CurrentState() : State::CLOSED)) + - " trips=" + std::to_string(trips)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 4: success resets consecutive counter", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 4: After the trip, the live slice state is OPEN. Verifies the -// integration actually drives the slice state machine (not just the response). -// --------------------------------------------------------------------------- -void TestTripDrivesSliceState() { - std::cout << "\n[TEST] CB Phase 4: trip drives slice state to OPEN..." 
- << std::endl; - try { - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { - resp.Status(502).Body("err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - // worker_threads=1 → all TCP connections land on dispatcher 0 - // (NetServer shards new connections by fd%worker_threads), so - // per-request failures accumulate deterministically on slice[0] - // instead of splitting across multiple slices. - gw.upstreams.push_back( - MakeBreakerUpstream("svc", "127.0.0.1", backend_port, - /*enabled=*/true, /*threshold=*/3)); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - // 3 failures → trip. - for (int i = 0; i < 3; ++i) { - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - } - - // With worker_threads > 1 the 3 failing requests can land on either - // dispatcher (hash-dependent). Check the aggregate snapshot — at - // least one partition must be OPEN with exactly one trip recorded. - auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); - auto* host = cbm->GetHost("svc"); - auto snap = host->Snapshot(); - bool at_least_one_open = snap.open_partitions >= 1; - bool one_trip = snap.total_trips == 1; - // Sanity: the tripped partition should be the one that saw all 3 - // failures (consecutive trip is single-slice, not cross-slice). - bool single_partition_tripped = snap.open_partitions == 1; - - bool pass = at_least_one_open && one_trip && single_partition_tripped; - TestFramework::RecordTest( - "CB Phase 4: trip drives slice state to OPEN", pass, - pass ? 
"" : - "at_least_one_open=" + std::to_string(at_least_one_open) + - " one_trip=" + std::to_string(one_trip) + - " single_partition=" + std::to_string(single_partition_tripped) + - " (open_partitions=" + std::to_string(snap.open_partitions) + - ", total_trips=" + std::to_string(snap.total_trips) + ")"); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 4: trip drives slice state to OPEN", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 5: Breaker-rejected requests do NOT hit the backend. After the trip, -// subsequent requests must be served locally (503) without any upstream I/O. -// Prevents regression where the gate leaked admissions to a known-bad upstream. -// --------------------------------------------------------------------------- -void TestOpenBreakerShortCircuitsUpstreamCall() { - std::cout << "\n[TEST] CB Phase 4: OPEN breaker short-circuits upstream call..." - << std::endl; - try { - std::atomic backend_hits{0}; - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { - backend_hits.fetch_add(1, std::memory_order_relaxed); - resp.Status(502).Body("err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - // worker_threads=1 → all TCP connections land on dispatcher 0 - // (NetServer shards new connections by fd%worker_threads), so - // per-request failures accumulate deterministically on slice[0] - // instead of splitting across multiple slices. - gw.upstreams.push_back( - MakeBreakerUpstream("svc", "127.0.0.1", backend_port, - /*enabled=*/true, /*threshold=*/3)); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - // 3 failing requests to trip. 
- for (int i = 0; i < 3; ++i) { - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - } - int hits_at_trip = backend_hits.load(); - - // 5 more requests — all should be rejected locally. - for (int i = 0; i < 5; ++i) { - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - } - int hits_after = backend_hits.load(); - - // Backend hits must not grow during the post-trip burst. - bool no_leak = hits_after == hits_at_trip; - TestFramework::RecordTest( - "CB Phase 4: OPEN short-circuits upstream call", no_leak, - no_leak ? "" : - "backend hits grew from " + std::to_string(hits_at_trip) + - " to " + std::to_string(hits_after) + " after trip"); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 4: OPEN short-circuits upstream call", false, e.what()); - } -} - -// Sanity check: verify the bare proxy setup works without the breaker -// before blaming the breaker integration. -void TestBareProxyWorks() { - std::cout << "\n[TEST] CB Phase 4: bare proxy (sanity)..." << std::endl; - try { - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { - resp.Status(502).Body("err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - UpstreamConfig u; - u.name = "svc"; - u.host = "127.0.0.1"; - u.port = backend_port; - u.pool.max_connections = 8; - u.pool.max_idle_connections = 4; - u.pool.connect_timeout_ms = 3000; - u.proxy.route_prefix = "/fail"; - u.proxy.response_timeout_ms = 5000; - u.circuit_breaker.enabled = true; // sanity + breaker enabled - u.circuit_breaker.consecutive_failure_threshold = 3; - u.circuit_breaker.failure_rate_threshold = 100; - u.circuit_breaker.minimum_volume = 10000; - u.circuit_breaker.window_seconds = 10; - u.circuit_breaker.permitted_half_open_calls = 2; - u.circuit_breaker.base_open_duration_ms = 
500; - u.circuit_breaker.max_open_duration_ms = 60000; - gw.upstreams.push_back(u); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 5000); - bool pass = TestHttpClient::HasStatus(r, 502); - TestFramework::RecordTest( - "CB Phase 4: bare proxy sanity", pass, - pass ? "" : "expected 502, got: " + r.substr(0, 128)); - } catch (const std::exception& e) { - TestFramework::RecordTest("CB Phase 4: bare proxy sanity", - false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 7: Retry-After header carries a sensible value — within [1, configured -// max_open_duration_ms / 1000], and in the right ballpark of OpenUntil()-now. -// --------------------------------------------------------------------------- -void TestRetryAfterHeaderValue() { - std::cout << "\n[TEST] CB Phase 4: Retry-After value correctness..." - << std::endl; - try { - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { - resp.Status(502).Body("err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - // base_open_duration 2000ms, max 60_000ms — Retry-After should - // ceiling-round and fall inside [1, 60]. - auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, - /*enabled=*/true, /*threshold=*/3); - u.circuit_breaker.base_open_duration_ms = 2000; - u.circuit_breaker.max_open_duration_ms = 60000; - gw.upstreams.push_back(u); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - // Trip the breaker. - for (int i = 0; i < 3; ++i) { - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - } - - // Capture the open-rejection response. 
- std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); - bool is_503 = TestHttpClient::HasStatus(r, 503); - - // Extract Retry-After integer value (case-insensitive header). - int retry_after = -1; - const char* markers[] = {"Retry-After:", "retry-after:"}; - for (const char* m : markers) { - auto pos = r.find(m); - if (pos == std::string::npos) continue; - pos += std::string(m).size(); - while (pos < r.size() && (r[pos] == ' ' || r[pos] == '\t')) ++pos; - int val = 0; - bool any = false; - while (pos < r.size() && r[pos] >= '0' && r[pos] <= '9') { - val = val * 10 + (r[pos] - '0'); - any = true; - ++pos; - } - if (any) { retry_after = val; break; } - } - - // Contract: value ≥ 1 and ≤ max_open_duration_ms / 1000 (60). - // For base_open_duration 2000ms the remaining-seconds at this - // moment is ≤ 2 (probably 1 or 2 after ceiling), so the upper - // sanity bound is generous but still rules out 300/3600-class - // buggy fallbacks. - bool in_range = (retry_after >= 1 && retry_after <= 60); - bool reasonable = (retry_after >= 1 && retry_after <= 3); - - bool pass = is_503 && in_range && reasonable; - TestFramework::RecordTest( - "CB Phase 4: Retry-After value in range", pass, - pass ? "" : - "is_503=" + std::to_string(is_503) + - " retry_after=" + std::to_string(retry_after) + - " body=" + r.substr(0, 256)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 4: Retry-After value in range", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 8: Retry loop is terminal on CIRCUIT_OPEN — even with max_retries=3, -// a request that hits an OPEN breaker gets exactly ONE 503 (no retry-flavored -// second 503). Ensures ReportBreakerOutcome doesn't feed the reject back into -// the breaker and MaybeRetry stays out. 
-// --------------------------------------------------------------------------- -void TestCircuitOpenTerminalForRetry() { - std::cout << "\n[TEST] CB Phase 4: CIRCUIT_OPEN terminal for retry loop..." - << std::endl; - try { - std::atomic backend_hits{0}; - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { - backend_hits.fetch_add(1, std::memory_order_relaxed); - resp.Status(502).Body("err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - // Retries enabled on 5xx — if the breaker reject leaked into - // MaybeRetry, the test would see extra backend hits after the - // trip. Long open window so the breaker stays OPEN for the - // duration of the post-trip assertion (no HALF_OPEN probe - // admission racing the test). - auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, - /*enabled=*/true, /*threshold=*/3); - u.proxy.retry.max_retries = 3; - u.proxy.retry.retry_on_5xx = true; - u.circuit_breaker.base_open_duration_ms = 30000; - u.circuit_breaker.max_open_duration_ms = 60000; - gw.upstreams.push_back(u); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - // Trip the breaker. Each pre-trip request may retry up to 3 - // times (all failing 5xx), so backend sees up to 3*threshold=12 - // hits. That's acceptable — we just care about post-trip behavior. - for (int i = 0; i < 3; ++i) { - TestHttpClient::HttpGet(gw_port, "/fail", 5000); - } - int pre_trip_hits = backend_hits.load(); - - // Post-trip request: expect a single 503 and NO new backend hits. 
- std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); - bool is_503 = TestHttpClient::HasStatus(r, 503); - int post_trip_hits = backend_hits.load(); - bool no_new_hits = (post_trip_hits == pre_trip_hits); - - bool pass = is_503 && no_new_hits; - TestFramework::RecordTest( - "CB Phase 4: CIRCUIT_OPEN terminal for retry", pass, - pass ? "" : - "is_503=" + std::to_string(is_503) + - " pre=" + std::to_string(pre_trip_hits) + - " post=" + std::to_string(post_trip_hits)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 4: CIRCUIT_OPEN terminal for retry", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 9: Dry-run mode — dry_run=true forwards rejected requests to the -// upstream (pass-through) but still increments the rejected_ counter so -// operators can observe the would-reject rate without production impact. -// --------------------------------------------------------------------------- -void TestDryRunPassthrough() { - std::cout << "\n[TEST] CB Phase 4: dry-run passthrough..." << std::endl; - try { - std::atomic backend_hits{0}; - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { - backend_hits.fetch_add(1, std::memory_order_relaxed); - resp.Status(502).Body("err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, - /*enabled=*/true, /*threshold=*/3); - u.circuit_breaker.dry_run = true; // would-reject, but still forward - gw.upstreams.push_back(u); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - // Trip thresholds with 5 requests. 
All should reach backend (502), - // not a 503 — dry-run never short-circuits. - for (int i = 0; i < 5; ++i) { - std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); - if (!TestHttpClient::HasStatus(r, 502)) { - TestFramework::RecordTest( - "CB Phase 4: dry-run passthrough", false, - "request " + std::to_string(i) + - " expected 502, got: " + r.substr(0, 64)); - return; - } - } - - bool all_hit = (backend_hits.load() == 5); - - // Verify the slice observed trips/rejected even though traffic passed. - auto* mgr = gateway.GetUpstreamManager() ? - gateway.GetUpstreamManager()->GetCircuitBreakerManager() : - nullptr; - int64_t trips = 0, rejected = 0; - if (mgr) { - auto* host = mgr->GetHost("svc"); - if (host) { - auto snap = host->Snapshot(); - trips = snap.total_trips; - rejected = snap.total_rejected; - } - } - // At least one trip fired (consecutive_threshold=3 → slice - // transitioned at least once during the run), and the post-trip - // requests were counted as would-reject (rejected > 0). - bool observed = (trips >= 1) && (rejected >= 1); - - bool pass = all_hit && observed; - TestFramework::RecordTest( - "CB Phase 4: dry-run passthrough", pass, - pass ? "" : - "hits=" + std::to_string(backend_hits.load()) + - " trips=" + std::to_string(trips) + - " rejected=" + std::to_string(rejected)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 4: dry-run passthrough", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 10: HALF_OPEN → CLOSED recovery round-trip through the proxy. Trip the -// breaker, wait for the open window to elapse, then serve success responses -// and assert the slice transitions back to CLOSED (consecutive_successes -// crosses the threshold — default 2 from DefaultCbConfig / phase-4 config). 
-// --------------------------------------------------------------------------- -void TestHalfOpenRecoveryRoundTrip() { - std::cout << "\n[TEST] CB Phase 4: HALF_OPEN → CLOSED recovery..." - << std::endl; - try { - std::atomic fail_mode{true}; - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [&fail_mode](const HttpRequest&, HttpResponse& resp) { - if (fail_mode.load()) { - resp.Status(502).Body("err", "text/plain"); - } else { - resp.Status(200).Body("ok", "text/plain"); - } - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, - /*enabled=*/true, /*threshold=*/3); - // Short open duration so recovery path finishes quickly. - u.circuit_breaker.base_open_duration_ms = 300; - u.circuit_breaker.max_open_duration_ms = 1000; - // Two probes needed to close (default permitted_half_open_calls=2). - u.circuit_breaker.permitted_half_open_calls = 2; - gw.upstreams.push_back(u); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - // Trip by hitting the failing backend. - for (int i = 0; i < 3; ++i) { - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - } - - // Flip backend to success and wait for the open window to elapse. - fail_mode.store(false); - std::this_thread::sleep_for(std::chrono::milliseconds(500)); - - // Probe the proxy — each successful 200 advances HALF_OPEN toward - // CLOSED. Do more than permitted_half_open_calls; some will be - // rejected as half_open_full but the ones that are admitted will - // close the breaker. 
- bool saw_success = false; - for (int i = 0; i < 8; ++i) { - std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); - if (TestHttpClient::HasStatus(r, 200)) saw_success = true; - // Small gap between probes — HALF_OPEN only admits permitted - // probes per cycle; spacing lets subsequent probes observe a - // possibly-closed breaker. - std::this_thread::sleep_for(std::chrono::milliseconds(50)); - } - - // Verify slice aggregate: at least one CLOSED transition observed - // (probe_successes >= 1 and total_trips == 1 — we only tripped once). - auto* mgr = gateway.GetUpstreamManager() ? - gateway.GetUpstreamManager()->GetCircuitBreakerManager() : - nullptr; - int64_t probe_succ = 0; - int open_parts = 0, half_open_parts = 0; - if (mgr) { - auto* host = mgr->GetHost("svc"); - if (host) { - auto snap = host->Snapshot(); - probe_succ = 0; - for (const auto& row : snap.slices) { - probe_succ += row.probe_successes; - } - open_parts = snap.open_partitions; - half_open_parts = snap.half_open_partitions; - } - } - - // Recovery complete: saw at least one 200 through the breaker, - // at least one probe success counted, and no partition still - // stuck in OPEN (HALF_OPEN may still linger on the unused slice, - // which is fine for a 2-partition setup). - bool pass = saw_success && (probe_succ >= 1) && (open_parts == 0); - TestFramework::RecordTest( - "CB Phase 4: HALF_OPEN → CLOSED recovery", pass, - pass ? "" : - "saw_success=" + std::to_string(saw_success) + - " probe_succ=" + std::to_string(probe_succ) + - " open_parts=" + std::to_string(open_parts) + - " half_open_parts=" + std::to_string(half_open_parts)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 4: HALF_OPEN → CLOSED recovery", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 11: Retry-After ceils the config cap from a non-second-aligned -// max_open_duration_ms (e.g. 1500ms → 2s, not 1s). 
Floor-rounding the cap -// would clamp the advertised retry window below what the breaker honors, -// causing well-behaved clients to re-hit the 503. -// --------------------------------------------------------------------------- -void TestRetryAfterCapCeilsNonAlignedMax() { - std::cout << "\n[TEST] CB Phase 4: Retry-After cap ceils non-aligned max..." - << std::endl; - try { - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { - resp.Status(502).Body("err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - // Configure a non-second-aligned max backoff. base = 1500ms so - // the actual OpenUntil-now at trip time is ~1.5s, which ceil- - // rounds to 2s. If cfg_cap_secs floor-rounded max_open_duration - // (1500ms → 1s), the clamp would drop Retry-After to 1s even - // though the breaker would keep rejecting through the second - // half of that window. 
- auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, - /*enabled=*/true, /*threshold=*/3); - u.circuit_breaker.base_open_duration_ms = 1500; - u.circuit_breaker.max_open_duration_ms = 1500; - gw.upstreams.push_back(u); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - for (int i = 0; i < 3; ++i) { - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - } - std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); - - int retry_after = -1; - const char* markers[] = {"Retry-After:", "retry-after:"}; - for (const char* m : markers) { - auto pos = r.find(m); - if (pos == std::string::npos) continue; - pos += std::string(m).size(); - while (pos < r.size() && (r[pos] == ' ' || r[pos] == '\t')) ++pos; - int val = 0; - bool any = false; - while (pos < r.size() && r[pos] >= '0' && r[pos] <= '9') { - val = val * 10 + (r[pos] - '0'); - any = true; - ++pos; - } - if (any) { retry_after = val; break; } - } - - // Expectation: Retry-After is in [1, 2] — cfg_cap_secs ceil- - // rounds 1500ms to 2s, and the remaining-time ceil-rounds to - // 2 at the moment of trip (may be 1 if enough wall-clock has - // elapsed between trip and response). Critically it must NEVER - // be zero or exceed 2 (clamped to the 2s cap). - bool in_range = (retry_after >= 1 && retry_after <= 2); - TestFramework::RecordTest( - "CB Phase 4: Retry-After ceils non-aligned cap", in_range, - in_range ? "" : - "retry_after=" + std::to_string(retry_after)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 4: Retry-After ceils non-aligned cap", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 12: Retried failures are reported BEFORE the retry fires. 
With retries -// enabled on 5xx, each attempt's outcome must be counted against the breaker; -// otherwise the slice trips only after the final retry exhausts, under- -// counting failures and potentially never tripping if retries mask enough of -// them. Verifies the trip still happens within the expected number of client -// requests once reporting is attached to the retry path. -// --------------------------------------------------------------------------- -void TestRetriedFailuresCountTowardTrip() { - std::cout << "\n[TEST] CB Phase 4: retried failures count toward trip..." - << std::endl; - try { - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { - resp.Status(502).Body("err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - // Retries on 5xx enabled. threshold=3 — with retry_on_5xx, each - // client request produces 1 + max_retries=3 = 4 upstream - // attempts, each reporting RESPONSE_5XX via the ReportBreakerOutcome - // path that this fix patches in. The breaker must trip after - // at most 3 upstream failure reports (which the first client - // request alone produces). - auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, - /*enabled=*/true, /*threshold=*/3); - u.proxy.retry.max_retries = 3; - u.proxy.retry.retry_on_5xx = true; - u.circuit_breaker.base_open_duration_ms = 30000; - u.circuit_breaker.max_open_duration_ms = 60000; - gw.upstreams.push_back(u); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - // One client request → 4 upstream attempts → 4 RESPONSE_5XX - // reports. Threshold=3 should trip during this single request. - TestHttpClient::HttpGet(gw_port, "/fail", 5000); - - // Second client request must hit the OPEN breaker → 503. 
- std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); - bool is_503 = TestHttpClient::HasStatus(r, 503); - bool has_breaker_header = - r.find("X-Circuit-Breaker: open") != std::string::npos || - r.find("x-circuit-breaker: open") != std::string::npos; - - bool pass = is_503 && has_breaker_header; - TestFramework::RecordTest( - "CB Phase 4: retried failures count toward trip", pass, - pass ? "" : - "is_503=" + std::to_string(is_503) + - " breaker_hdr=" + std::to_string(has_breaker_header) + - " body=" + r.substr(0, 256)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 4: retried failures count toward trip", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 13: HALF_OPEN rejects emit a distinct X-Circuit-Breaker label. -// TryAcquire returns REJECTED_OPEN for three situations (true OPEN, -// half_open_full, half_open_recovery_failing). When the slice is in -// HALF_OPEN, OpenUntil is cleared and a generic MakeCircuitOpenResponse -// would fall back to Retry-After=1 + X-Circuit-Breaker:open — misleading -// clients. The fix emits X-Circuit-Breaker:half_open for HALF_OPEN rejects -// with a more conservative Retry-After hint. -// -// Strategy: trip the breaker, wait for the open window to elapse so the -// slice transitions HALF_OPEN on the next admission attempt, then flood -// concurrent requests so some hit half_open_full. -// --------------------------------------------------------------------------- -void TestHalfOpenRejectLabel() { - std::cout << "\n[TEST] CB Phase 4: HALF_OPEN reject label..." - << std::endl; - try { - // Backend hangs to keep probes in-flight so later concurrent - // requests hit half_open_full. 
- std::atomic hang{false}; - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [&hang](const HttpRequest&, HttpResponse& resp) { - if (hang.load()) { - std::this_thread::sleep_for(std::chrono::milliseconds(600)); - } - resp.Status(502).Body("err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, - /*enabled=*/true, /*threshold=*/3); - u.circuit_breaker.base_open_duration_ms = 200; - u.circuit_breaker.max_open_duration_ms = 500; - u.circuit_breaker.permitted_half_open_calls = 1; // tiny budget - gw.upstreams.push_back(u); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - // Trip the breaker. - for (int i = 0; i < 3; ++i) { - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - } - // Wait for the open window to elapse so the next admission - // flips the slice to HALF_OPEN. - std::this_thread::sleep_for(std::chrono::milliseconds(300)); - - // Flip backend to hang so the probe occupies the single probe - // slot while we fire sibling requests that must hit half_open_full. - hang.store(true); - - std::atomic saw_half_open{false}; - std::atomic saw_open{false}; - auto probe = [&](int id) { - (void)id; - std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 1500); - if (!TestHttpClient::HasStatus(r, 503)) return; - if (r.find("X-Circuit-Breaker: half_open") != std::string::npos || - r.find("x-circuit-breaker: half_open") != std::string::npos) { - saw_half_open.store(true); - } - if (r.find("X-Circuit-Breaker: open") != std::string::npos || - r.find("x-circuit-breaker: open") != std::string::npos) { - // We want to distinguish the labels; the "open" substring - // also matches "half_open". 
Only count true "open" if - // "half_open" didn't appear in THIS response. - if (r.find("half_open") == std::string::npos) { - saw_open.store(true); - } - } - }; - - std::vector threads; - for (int i = 0; i < 6; ++i) { - threads.emplace_back(probe, i); - std::this_thread::sleep_for(std::chrono::milliseconds(20)); - } - for (auto& t : threads) t.join(); - - // Pass if at least one HALF_OPEN-labelled reject was observed. - // saw_open may or may not be observed (some rejects could have - // hit between cycles) — the key contract is that HALF_OPEN - // rejects no longer get the plain "open" label. - bool pass = saw_half_open.load(); - TestFramework::RecordTest( - "CB Phase 4: HALF_OPEN reject label", pass, - pass ? "" : - "saw_half_open=" + std::to_string(saw_half_open.load()) + - " saw_open=" + std::to_string(saw_open.load())); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 4: HALF_OPEN reject label", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 14: HALF_OPEN Retry-After reflects the current exponential backoff, -// not just base_open_duration_ms. After multiple trips the next OPEN window -// (base << consecutive_trips_, clamped by max) can exceed 1 second; the old -// base-only hint (ceil(base/1000) = 1s for base=100ms) would under-report -// the worst-case wait, which this test must fail for. -// -// Strategy: keep the backend failing and drive MULTIPLE re-trips by letting -// the OPEN window elapse and single probe fail each cycle. Successful -// recoveries must be avoided — TransitionHalfOpenToClosed resets -// consecutive_trips_ to 0, which hides the exponential hint. -// --------------------------------------------------------------------------- -void TestHalfOpenRetryAfterScalesWithBackoff() { - std::cout << "\n[TEST] CB Phase 4: HALF_OPEN Retry-After exponential..." - << std::endl; - try { - // Backend fails fast by default. 
When `hang` is set, the - // handler blocks — used at the end to pin the probe slot so - // a concurrent request observes HALF_OPEN rejection. - std::atomic hang{false}; - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [&hang](const HttpRequest&, HttpResponse& resp) { - if (hang.load()) { - std::this_thread::sleep_for(std::chrono::milliseconds(1500)); - } - resp.Status(502).Body("err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; // pin all traffic to slice[0] - gw.http2.enabled = false; - auto u = MakeBreakerUpstream("svc", "127.0.0.1", backend_port, - /*enabled=*/true, /*threshold=*/2); - u.circuit_breaker.base_open_duration_ms = 100; // config minimum - u.circuit_breaker.max_open_duration_ms = 8000; // cap at 8s - u.circuit_breaker.permitted_half_open_calls = 1; // single probe - gw.upstreams.push_back(u); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - auto* cbm = gateway.GetUpstreamManager() ? - gateway.GetUpstreamManager()->GetCircuitBreakerManager() : nullptr; - auto* host = cbm ? cbm->GetHost("svc") : nullptr; - auto* slice = host ? host->GetSlice(0) : nullptr; - if (!slice) { - TestFramework::RecordTest( - "CB Phase 4: HALF_OPEN Retry-After exponential-aware", - false, "slice lookup failed"); - return; - } - - // Initial trip: 2 consecutive failures with threshold=2. - for (int i = 0; i < 2; ++i) { - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - } - - // Drive consecutive_trips_ up by letting successive OPEN windows - // elapse and probes fail (no recovery → no reset). Stop when - // NextOpenDurationMs crosses 1000ms, which is the threshold - // where the HALF_OPEN Retry-After hint starts exceeding the - // base-only value (ceil(100ms)=1s). 
- // - // The slice re-trips on each failed probe; each trip doubles - // the open duration. We run ~8 cycles with safety margin which - // is comfortably past the trip count needed for Retry-After>=2. - for (int cycle = 0; cycle < 8; ++cycle) { - // Wait past the current open window. Upper bound: max=8s, - // so 1200ms is plenty for the first few short cycles, and - // we re-check after each request anyway. - int64_t next_ms = slice->NextOpenDurationMs(); - // Current OPEN window is the one stored BEFORE the upcoming - // re-trip — we don't have that directly, so sleep past the - // NEXT duration as an over-approximation (next is always >= - // current). This ensures OPEN has elapsed. - auto sleep_ms = std::max(next_ms + 50, 200); - if (sleep_ms > 2000) sleep_ms = 2000; // cap per cycle - std::this_thread::sleep_for(std::chrono::milliseconds(sleep_ms)); - - // One request — it should admit as a probe (HALF_OPEN), - // the backend fails fast (502), probe fails → re-trip with - // consecutive_trips_++ and fresh OPEN. - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - - // Bail early once the exponential hint crosses 1s → the - // subsequent HALF_OPEN reject will carry Retry-After >= 2. - if (slice->NextOpenDurationMs() >= 2000) break; - } - - int64_t next_open_ms = slice->NextOpenDurationMs(); - if (next_open_ms < 2000) { - TestFramework::RecordTest( - "CB Phase 4: HALF_OPEN Retry-After exponential-aware", - false, - "setup failed: next_open_ms=" + std::to_string(next_open_ms) + - " (need >= 2000 to distinguish from base-only hint)"); - return; - } - - // Now trigger a HALF_OPEN reject: wait for current OPEN to - // elapse, start a hanging probe (pins the slot), then fire a - // sibling request — it must see half_open_full with the - // exponential Retry-After. 
- int64_t post_wait_ms = next_open_ms + 100; - if (post_wait_ms > 4000) post_wait_ms = 4000; - std::this_thread::sleep_for(std::chrono::milliseconds(post_wait_ms)); - - hang.store(true); - std::thread probe([&]() { - TestHttpClient::HttpGet(gw_port, "/fail", 3500); - }); - // Let the probe get admitted and start hanging. - std::this_thread::sleep_for(std::chrono::milliseconds(200)); - - std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 1500); - hang.store(false); - probe.join(); - - bool is_half_open = - r.find("X-Circuit-Breaker: half_open") != std::string::npos || - r.find("x-circuit-breaker: half_open") != std::string::npos; - - int retry_after = -1; - const char* markers[] = {"Retry-After:", "retry-after:"}; - for (const char* m : markers) { - auto pos = r.find(m); - if (pos == std::string::npos) continue; - pos += std::string(m).size(); - while (pos < r.size() && (r[pos] == ' ' || r[pos] == '\t')) ++pos; - int val = 0; - bool any = false; - while (pos < r.size() && r[pos] >= '0' && r[pos] <= '9') { - val = val * 10 + (r[pos] - '0'); - any = true; - ++pos; - } - if (any) { retry_after = val; break; } - } - - // Post-fix: Retry-After = ceil(next_open_ms / 1000) >= 2. - // Pre-fix (base-only): Retry-After = ceil(base/1000) = 1. - // Asserting >= 2 fails the pre-fix implementation. - bool retry_after_ok = (retry_after >= 2 && retry_after <= 8); - bool pass = is_half_open && retry_after_ok; - TestFramework::RecordTest( - "CB Phase 4: HALF_OPEN Retry-After exponential-aware", pass, - pass ? 
"" : - "is_half_open=" + std::to_string(is_half_open) + - " retry_after=" + std::to_string(retry_after) + - " next_open_ms=" + std::to_string(next_open_ms)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 4: HALF_OPEN Retry-After exponential-aware", - false, e.what()); - } -} - -void RunAllTests() { - std::cout << "\n" << std::string(60, '=') << std::endl; - std::cout << "CIRCUIT BREAKER PHASE 4 - INTEGRATION TESTS" << std::endl; - std::cout << std::string(60, '=') << std::endl; - - TestBareProxyWorks(); - TestBreakerTripsAfterConsecutiveFailures(); - TestBreakerDisabledPassesThrough(); - TestSuccessResetsConsecutiveFailureCounter(); - TestTripDrivesSliceState(); - TestOpenBreakerShortCircuitsUpstreamCall(); - TestRetryAfterHeaderValue(); - TestCircuitOpenTerminalForRetry(); - TestDryRunPassthrough(); - TestHalfOpenRecoveryRoundTrip(); - TestRetryAfterCapCeilsNonAlignedMax(); - TestRetriedFailuresCountTowardTrip(); - TestHalfOpenRejectLabel(); - TestHalfOpenRetryAfterScalesWithBackoff(); -} - -} // namespace CircuitBreakerPhase4Tests diff --git a/test/circuit_breaker_phase5_test.h b/test/circuit_breaker_phase5_test.h deleted file mode 100644 index 9b0c3f11..00000000 --- a/test/circuit_breaker_phase5_test.h +++ /dev/null @@ -1,366 +0,0 @@ -#pragma once - -// Phase 5 integration tests: retry budget wired into ProxyTransaction. -// -// Phase 3 covered the RetryBudget math (CAS, non-retry denominator, -// min-concurrency floor) as unit tests against the RetryBudget class in -// isolation. Phase 5 tests the INTEGRATION: ProxyTransaction resolves -// `retry_budget_` from the same CircuitBreakerHost as `slice_`, tracks -// every attempt's in_flight via the RAII guard, and consults -// `TryConsumeRetry` before each retry. Exhaustion emits the §12.2 -// response (503 + `X-Retry-Budget-Exhausted: 1`) and does NOT feed -// back into the slice's failure math. 
-// -// Strategy: backends that always 502 with `retry_on_5xx=true` drive the -// retry path. A near-zero retry-budget (`percent=0, min_concurrency=0`) -// rejects every retry deterministically without needing concurrent -// client load. The circuit-breaker consecutive-failure threshold is -// raised well above the retry count so the breaker stays CLOSED — the -// budget gate is tested in isolation from the state machine. - -#include "test_framework.h" -#include "test_server_runner.h" -#include "http_test_client.h" -#include "http/http_server.h" -#include "config/server_config.h" - -#include -#include -#include -#include - -namespace CircuitBreakerPhase5Tests { - -// Upstream config that always proxies /fail, with the circuit breaker -// enabled so `retry_budget_` is resolved on `slice_`'s host. Breaker -// thresholds intentionally unreachable for these tests — we want the -// retry-budget gate fired in isolation, not co-tripping the state -// machine. -static UpstreamConfig MakeRetryBudgetUpstream(const std::string& name, - const std::string& host, - int port, - int retry_budget_percent, - int retry_budget_min_concurrency, - bool dry_run = false) { - UpstreamConfig u; - u.name = name; - u.host = host; - u.port = port; - u.pool.max_connections = 16; - u.pool.max_idle_connections = 8; - u.pool.connect_timeout_ms = 3000; - u.pool.idle_timeout_sec = 30; - u.pool.max_lifetime_sec = 3600; - u.pool.max_requests_per_conn = 0; - - u.proxy.route_prefix = "/fail"; - u.proxy.strip_prefix = false; - u.proxy.response_timeout_ms = 2000; - - u.circuit_breaker.enabled = true; - u.circuit_breaker.dry_run = dry_run; - // Breaker thresholds unreachable — we don't want the state machine - // tripping during a retry-budget test. 
- u.circuit_breaker.consecutive_failure_threshold = 10000; - u.circuit_breaker.failure_rate_threshold = 100; - u.circuit_breaker.minimum_volume = 10000; - u.circuit_breaker.window_seconds = 10; - u.circuit_breaker.permitted_half_open_calls = 2; - u.circuit_breaker.base_open_duration_ms = 30000; - u.circuit_breaker.max_open_duration_ms = 60000; - - u.circuit_breaker.retry_budget_percent = retry_budget_percent; - u.circuit_breaker.retry_budget_min_concurrency = retry_budget_min_concurrency; - return u; -} - -static bool HasRetryBudgetHeader(const std::string& response) { - return response.find("X-Retry-Budget-Exhausted: 1") != std::string::npos || - response.find("x-retry-budget-exhausted: 1") != std::string::npos; -} - -// --------------------------------------------------------------------------- -// Test 1: A retry attempt rejected by the retry-budget gate delivers 503 + -// X-Retry-Budget-Exhausted instead of the upstream's 5xx. Verifies that -// `TryConsumeRetry` runs BEFORE the retry executes and that -// `MakeRetryBudgetResponse` is emitted through the standard DeliverResponse -// path. -// -// retry_budget_percent=0 + retry_budget_min_concurrency=0 → cap = 0. Every -// retry attempt's TryConsumeRetry returns false. First attempt is -// unaffected (budget only gates retries), so the backend is hit exactly -// once per client request; the retry is short-circuited locally. -// --------------------------------------------------------------------------- -void TestRetryBudgetRejectsRetry() { - std::cout << "\n[TEST] CB Phase 5: retry budget rejects retry..." 
- << std::endl; - try { - std::atomic backend_hits{0}; - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { - backend_hits.fetch_add(1, std::memory_order_relaxed); - resp.Status(502).Body("upstream-err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - - auto u = MakeRetryBudgetUpstream("svc", "127.0.0.1", backend_port, - /*percent=*/0, - /*min_concurrency=*/0); - u.proxy.retry.max_retries = 3; - u.proxy.retry.retry_on_5xx = true; - gw.upstreams.push_back(u); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 5000); - - bool is_503 = TestHttpClient::HasStatus(r, 503); - bool has_budget_hdr = HasRetryBudgetHeader(r); - // Backend should have been hit exactly once (the first attempt); - // every retry was short-circuited by the budget gate. - int hits = backend_hits.load(std::memory_order_relaxed); - bool single_backend_hit = (hits == 1); - - bool pass = is_503 && has_budget_hdr && single_backend_hit; - TestFramework::RecordTest( - "CB Phase 5: retry budget rejects retry", pass, - pass ? "" : - "is_503=" + std::to_string(is_503) + - " budget_hdr=" + std::to_string(has_budget_hdr) + - " backend_hits=" + std::to_string(hits) + - " body=" + r.substr(0, 256)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 5: retry budget rejects retry", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 2: The min-concurrency floor admits retries even when the %-based -// cap would be zero. 
With percent=0 + min_concurrency=5, a single sequential -// client request's retry chain (1 first + 3 retries = 4 backend hits) all -// fit under the floor and proceed normally to the upstream — no 503, no -// X-Retry-Budget-Exhausted, and the client sees the final 5xx response. -// -// This is the symmetric test to Test 1: same near-zero %-cap, but a floor -// large enough that retries aren't budget-gated. Proves the floor is -// consulted (retries admitted) instead of the %-cap (retries rejected). -// --------------------------------------------------------------------------- -void TestRetryBudgetMinConcurrencyFloor() { - std::cout << "\n[TEST] CB Phase 5: retry budget min-concurrency floor..." - << std::endl; - try { - std::atomic backend_hits{0}; - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { - backend_hits.fetch_add(1, std::memory_order_relaxed); - resp.Status(502).Body("upstream-err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - - // percent=0 → no %-based capacity. min_concurrency=5 → floor - // admits up to 5 concurrent retries, easily covering the 3 - // sequential retries from a single client request. - auto u = MakeRetryBudgetUpstream("svc", "127.0.0.1", backend_port, - /*percent=*/0, - /*min_concurrency=*/5); - u.proxy.retry.max_retries = 3; - u.proxy.retry.retry_on_5xx = true; - gw.upstreams.push_back(u); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 10000); - - // Client sees the upstream's final 502 — no local 503, no - // X-Retry-Budget-Exhausted. 
- bool is_502 = TestHttpClient::HasStatus(r, 502); - bool no_budget_hdr = !HasRetryBudgetHeader(r); - // 1 first attempt + 3 retries admitted by the floor = 4 backend hits. - int hits = backend_hits.load(std::memory_order_relaxed); - bool all_retries_proceeded = (hits == 4); - - bool pass = is_502 && no_budget_hdr && all_retries_proceeded; - TestFramework::RecordTest( - "CB Phase 5: retry budget min-concurrency floor", pass, - pass ? "" : - "is_502=" + std::to_string(is_502) + - " no_budget_hdr=" + std::to_string(no_budget_hdr) + - " backend_hits=" + std::to_string(hits) + - " body=" + r.substr(0, 256)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 5: retry budget min-concurrency floor", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 3: Dry-run bypasses the retry-budget gate. -// -// With percent=0 + min_concurrency=0 (same as Test 1), TryConsumeRetry -// returns false for every retry. But `circuit_breaker.dry_run=true` -// switches the rejection path to a log-and-proceed: no token is -// consumed, retry_token_held_ stays false, and AttemptCheckout runs as -// though the budget was unlimited. -// -// Result: the client sees the upstream's 502 response (because the -// retries actually fire), NOT a 503 + X-Retry-Budget-Exhausted. -// --------------------------------------------------------------------------- -void TestRetryBudgetDryRunPassthrough() { - std::cout << "\n[TEST] CB Phase 5: retry budget dry-run passthrough..." 
- << std::endl; - try { - std::atomic backend_hits{0}; - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { - backend_hits.fetch_add(1, std::memory_order_relaxed); - resp.Status(502).Body("upstream-err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - - auto u = MakeRetryBudgetUpstream("svc", "127.0.0.1", backend_port, - /*percent=*/0, - /*min_concurrency=*/0, - /*dry_run=*/true); - u.proxy.retry.max_retries = 2; - u.proxy.retry.retry_on_5xx = true; - gw.upstreams.push_back(u); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 10000); - - // Retries proceeded despite would-reject decisions — the client - // sees the upstream's final 502, not our local 503. - bool is_502 = TestHttpClient::HasStatus(r, 502); - bool no_budget_hdr = !HasRetryBudgetHeader(r); - int hits = backend_hits.load(std::memory_order_relaxed); - bool all_attempts_ran = (hits == 3); // 1 first + 2 retries - - bool pass = is_502 && no_budget_hdr && all_attempts_ran; - TestFramework::RecordTest( - "CB Phase 5: retry budget dry-run passthrough", pass, - pass ? "" : - "is_502=" + std::to_string(is_502) + - " no_budget_hdr=" + std::to_string(no_budget_hdr) + - " backend_hits=" + std::to_string(hits) + - " body=" + r.substr(0, 256)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 5: retry budget dry-run passthrough", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 4: First attempts are NOT budget-gated. -// -// The retry-budget cap applies only to retries (attempt_ > 0). 
First -// attempts call TrackInFlight (which only ever increments) but skip -// TryConsumeRetry entirely. With percent=0 + min_concurrency=0 and a -// backend that always 200s, every client request must succeed — if the -// gate accidentally ran on first attempts, we'd see 503s here. -// -// Guards against a regression where TryConsumeRetry is called before -// the `attempt_ > 0` gate, or where the gate is placed in -// AttemptCheckout instead of MaybeRetry. -// --------------------------------------------------------------------------- -void TestFirstAttemptsNotGated() { - std::cout << "\n[TEST] CB Phase 5: first attempts not gated..." - << std::endl; - try { - std::atomic backend_hits{0}; - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { - backend_hits.fetch_add(1, std::memory_order_relaxed); - resp.Status(200).Body("ok", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - - auto u = MakeRetryBudgetUpstream("svc", "127.0.0.1", backend_port, - /*percent=*/0, - /*min_concurrency=*/0); - // No retries — every request is a first attempt. - u.proxy.retry.max_retries = 0; - gw.upstreams.push_back(u); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - int client_count = 5; - int successes = 0; - for (int i = 0; i < client_count; ++i) { - std::string r = TestHttpClient::HttpGet(gw_port, "/fail", 3000); - if (TestHttpClient::HasStatus(r, 200)) ++successes; - if (HasRetryBudgetHeader(r)) { - // Any X-Retry-Budget-Exhausted on a first-attempt-only - // path is a bug. Record and bail. 
- TestFramework::RecordTest( - "CB Phase 5: first attempts not gated", false, - "unexpected X-Retry-Budget-Exhausted on first-attempt path " - "i=" + std::to_string(i)); - return; - } - } - - int hits = backend_hits.load(std::memory_order_relaxed); - bool pass = (successes == client_count) && (hits == client_count); - TestFramework::RecordTest( - "CB Phase 5: first attempts not gated", pass, - pass ? "" : - "successes=" + std::to_string(successes) + - "/" + std::to_string(client_count) + - " backend_hits=" + std::to_string(hits)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 5: first attempts not gated", false, e.what()); - } -} - -void RunAllTests() { - std::cout << "\n" << std::string(60, '=') << std::endl; - std::cout << "CIRCUIT BREAKER PHASE 5 - RETRY BUDGET INTEGRATION TESTS" - << std::endl; - std::cout << std::string(60, '=') << std::endl; - - TestRetryBudgetRejectsRetry(); - TestRetryBudgetMinConcurrencyFloor(); - TestRetryBudgetDryRunPassthrough(); - TestFirstAttemptsNotGated(); -} - -} // namespace CircuitBreakerPhase5Tests diff --git a/test/circuit_breaker_phase6_test.h b/test/circuit_breaker_phase6_test.h deleted file mode 100644 index 77eea2c1..00000000 --- a/test/circuit_breaker_phase6_test.h +++ /dev/null @@ -1,261 +0,0 @@ -#pragma once - -// Phase 6 integration tests: wait-queue drain on CLOSED → OPEN trip. -// -// Phase 4 already covered "new requests after a trip hit REJECTED_OPEN". -// Phase 6 covers the orthogonal case: a request that passed ConsultBreaker -// pre-trip and is waiting in the pool's bounded wait queue when the trip -// fires. Without the drain, that waiter would sit until either the pool -// frees a slot (and then re-hit the upstream — pointless traffic) or the -// queue-timeout / open-duration elapses (up to 60s latency spike). 
-// -// Mechanism tested: `HttpServer::MarkServerReady` installs a transition -// callback on every slice that routes CLOSED → OPEN to the corresponding -// `PoolPartition::DrainWaitQueueOnTrip()`. Each waiter receives -// `CHECKOUT_CIRCUIT_OPEN`, which `ProxyTransaction::OnCheckoutError` maps -// to the standard circuit-open response (503 + `X-Circuit-Breaker: open`). -// -// Strategy: gate concurrency via a 1-connection pool. The first request -// hangs at the backend long enough to let a second request queue behind -// it. When the first's response lands (502), the breaker trips and the -// drain fires, causing the queued request to receive 503 + circuit-open -// headers instead of the backend's 502 (which would happen if the drain -// were missing and the queued request proceeded). - -#include "test_framework.h" -#include "test_server_runner.h" -#include "http_test_client.h" -#include "http/http_server.h" -#include "config/server_config.h" - -#include -#include -#include -#include -#include - -namespace CircuitBreakerPhase6Tests { - -static UpstreamConfig MakeDrainTripUpstream(const std::string& name, - const std::string& host, - int port, - bool breaker_enabled) { - UpstreamConfig u; - u.name = name; - u.host = host; - u.port = port; - // Single connection per partition — forces the second concurrent - // request to queue behind the first. Since tests run with - // worker_threads=1, one partition exists and it has exactly one - // connection slot. - u.pool.max_connections = 1; - u.pool.max_idle_connections = 1; - u.pool.connect_timeout_ms = 3000; - u.pool.idle_timeout_sec = 30; - u.pool.max_lifetime_sec = 3600; - u.pool.max_requests_per_conn = 0; - - u.proxy.route_prefix = "/fail"; - u.proxy.strip_prefix = false; - u.proxy.response_timeout_ms = 5000; - u.proxy.retry.max_retries = 0; // Deterministic — no retry confounds. - - u.circuit_breaker.enabled = breaker_enabled; - u.circuit_breaker.consecutive_failure_threshold = 1; // Trip on first 5xx. 
- u.circuit_breaker.failure_rate_threshold = 100; - u.circuit_breaker.minimum_volume = 10000; - u.circuit_breaker.window_seconds = 10; - u.circuit_breaker.permitted_half_open_calls = 2; - // Long open duration so the drain is unambiguously the thing that - // surfaces the 503 to the queued client — not a timer-driven - // HALF_OPEN recovery admitting a subsequent attempt. - u.circuit_breaker.base_open_duration_ms = 30000; - u.circuit_breaker.max_open_duration_ms = 60000; - return u; -} - -// --------------------------------------------------------------------------- -// Test 1: CLOSED→OPEN trip drains queued waiter with 503 + X-Circuit-Breaker. -// -// Request A takes the single pool slot and hangs at the backend for ~300ms. -// Request B queues (pool exhausted). At t≈300ms, A's backend response -// arrives: 502 → slice trip → transition callback → DrainWaitQueueOnTrip → -// B's error_callback fires with CHECKOUT_CIRCUIT_OPEN. B's client receives -// 503 + `X-Circuit-Breaker: open`. -// -// Pre-fix (no drain): B waits ~300ms for A's slot to free, then hits the -// backend itself, gets 502, client sees 502 — NOT 503 and NOT -// X-Circuit-Breaker: open. The assertion `is_503 && has_breaker_header` -// fails without the drain wiring. -// --------------------------------------------------------------------------- -void TestWaitQueueDrainedOnTrip() { - std::cout << "\n[TEST] CB Phase 6: wait queue drained on trip..." - << std::endl; - try { - std::atomic backend_hits{0}; - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { - backend_hits.fetch_add(1, std::memory_order_relaxed); - // Delay so the gateway's pool holds the connection long - // enough for a second client request to queue on it. 
- std::this_thread::sleep_for(std::chrono::milliseconds(300)); - resp.Status(502).Body("upstream-err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; // Single partition → single wait queue. - gw.http2.enabled = false; - - gw.upstreams.push_back( - MakeDrainTripUpstream("svc", "127.0.0.1", backend_port, - /*breaker_enabled=*/true)); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - // Launch A first (takes the one connection), then B 50ms later - // so B is guaranteed to enter the wait queue. - std::promise a_resp, b_resp; - auto a_fut = a_resp.get_future(); - auto b_fut = b_resp.get_future(); - std::thread a([&]() { - a_resp.set_value(TestHttpClient::HttpGet(gw_port, "/fail", 5000)); - }); - std::this_thread::sleep_for(std::chrono::milliseconds(50)); - std::thread b([&]() { - b_resp.set_value(TestHttpClient::HttpGet(gw_port, "/fail", 5000)); - }); - a.join(); - b.join(); - - std::string ra = a_fut.get(); - std::string rb = b_fut.get(); - - // A unambiguously hits the backend (owns the slot) and sees 502. - bool a_is_502 = TestHttpClient::HasStatus(ra, 502); - // B must see the circuit-open short-circuit from the drain — - // NOT a 502 from the backend, which is what happens without - // the drain wiring. - bool b_is_503 = TestHttpClient::HasStatus(rb, 503); - bool b_has_breaker_hdr = - rb.find("X-Circuit-Breaker: open") != std::string::npos || - rb.find("x-circuit-breaker: open") != std::string::npos; - // Exactly one backend hit — B was drained before making it to - // the upstream. Without the drain, backend_hits would be 2. 
- int hits = backend_hits.load(std::memory_order_relaxed); - bool single_hit = (hits == 1); - - bool pass = a_is_502 && b_is_503 && b_has_breaker_hdr && single_hit; - TestFramework::RecordTest( - "CB Phase 6: wait queue drained on trip", pass, - pass ? "" : - "a_is_502=" + std::to_string(a_is_502) + - " b_is_503=" + std::to_string(b_is_503) + - " b_breaker_hdr=" + std::to_string(b_has_breaker_hdr) + - " backend_hits=" + std::to_string(hits) + - " rb_head=" + rb.substr(0, 200)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 6: wait queue drained on trip", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 2: With the breaker disabled, the drain does NOT fire — the queued -// waiter proceeds to the upstream as it would absent the circuit-breaker -// layer entirely. -// -// Same setup as Test 1 but `circuit_breaker.enabled=false`. Disabled slices -// short-circuit in TryAcquire and never invoke transition callbacks, so -// DrainWaitQueueOnTrip is never called. Request B must hit the backend -// (backend_hits == 2) and receive the upstream's 502 — NOT a 503. -// --------------------------------------------------------------------------- -void TestDisabledBreakerDoesNotDrain() { - std::cout << "\n[TEST] CB Phase 6: disabled breaker does not drain..." 
- << std::endl; - try { - std::atomic backend_hits{0}; - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [&backend_hits](const HttpRequest&, HttpResponse& resp) { - backend_hits.fetch_add(1, std::memory_order_relaxed); - std::this_thread::sleep_for(std::chrono::milliseconds(300)); - resp.Status(502).Body("upstream-err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - - gw.upstreams.push_back( - MakeDrainTripUpstream("svc", "127.0.0.1", backend_port, - /*breaker_enabled=*/false)); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - std::promise a_resp, b_resp; - auto a_fut = a_resp.get_future(); - auto b_fut = b_resp.get_future(); - std::thread a([&]() { - a_resp.set_value(TestHttpClient::HttpGet(gw_port, "/fail", 5000)); - }); - std::this_thread::sleep_for(std::chrono::milliseconds(50)); - std::thread b([&]() { - b_resp.set_value(TestHttpClient::HttpGet(gw_port, "/fail", 5000)); - }); - a.join(); - b.join(); - - std::string ra = a_fut.get(); - std::string rb = b_fut.get(); - - // Both reach the backend — disabled breaker = no drain. - bool a_is_502 = TestHttpClient::HasStatus(ra, 502); - bool b_is_502 = TestHttpClient::HasStatus(rb, 502); - // Neither should carry the circuit-open header. 
- bool no_breaker_on_a = - ra.find("X-Circuit-Breaker") == std::string::npos && - ra.find("x-circuit-breaker") == std::string::npos; - bool no_breaker_on_b = - rb.find("X-Circuit-Breaker") == std::string::npos && - rb.find("x-circuit-breaker") == std::string::npos; - int hits = backend_hits.load(std::memory_order_relaxed); - bool two_hits = (hits == 2); - - bool pass = a_is_502 && b_is_502 && no_breaker_on_a && - no_breaker_on_b && two_hits; - TestFramework::RecordTest( - "CB Phase 6: disabled breaker does not drain", pass, - pass ? "" : - "a_is_502=" + std::to_string(a_is_502) + - " b_is_502=" + std::to_string(b_is_502) + - " no_breaker_on_a=" + std::to_string(no_breaker_on_a) + - " no_breaker_on_b=" + std::to_string(no_breaker_on_b) + - " backend_hits=" + std::to_string(hits)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 6: disabled breaker does not drain", false, e.what()); - } -} - -void RunAllTests() { - std::cout << "\n" << std::string(60, '=') << std::endl; - std::cout << "CIRCUIT BREAKER PHASE 6 - WAIT-QUEUE DRAIN ON TRIP TESTS" - << std::endl; - std::cout << std::string(60, '=') << std::endl; - - TestWaitQueueDrainedOnTrip(); - TestDisabledBreakerDoesNotDrain(); -} - -} // namespace CircuitBreakerPhase6Tests diff --git a/test/circuit_breaker_phase7_test.h b/test/circuit_breaker_phase7_test.h deleted file mode 100644 index 9dc841ba..00000000 --- a/test/circuit_breaker_phase7_test.h +++ /dev/null @@ -1,405 +0,0 @@ -#pragma once - -// Phase 7 integration tests: observability — counter accuracy, snapshot -// API correctness, and log emission. -// -// Phases 2-6 each added counters and log lines as a side effect of their -// functional work. Phase 7 locks those in as regressions: -// -// * Counters (§11.2): trips, rejected, probe_successes, probe_failures, -// retries_rejected surface through CircuitBreakerManager::SnapshotAll. 
-// * Snapshot API (§11.3): per-slice rows aggregate into host-level -// totals; host-level fields (retries_in_flight / retries_rejected / -// in_flight) reflect the owning RetryBudget. -// * Logs (§11.1): the CLOSED→OPEN trip emits the full-context message -// including trigger, consecutive_failures, window_total, -// window_fail_rate, open_for_ms, and consecutive_trips. -// -// The log-emission test attaches a spdlog ring-buffer sink to the logger -// for the duration of the test, triggers a trip, then asserts the -// captured messages contain the expected fields. No log file I/O. - -#include "test_framework.h" -#include "test_server_runner.h" -#include "http_test_client.h" -#include "http/http_server.h" -#include "config/server_config.h" -#include "upstream/upstream_manager.h" -#include "circuit_breaker/circuit_breaker_manager.h" -#include "circuit_breaker/circuit_breaker_host.h" -#include "circuit_breaker/circuit_breaker_slice.h" -#include "log/logger.h" -#include "spdlog/sinks/ringbuffer_sink.h" - -#include -#include -#include -#include -#include -#include - -namespace CircuitBreakerPhase7Tests { - -using circuit_breaker::State; - -static UpstreamConfig MakeObservUpstream(const std::string& name, - const std::string& host, - int port, - int consecutive_threshold = 3) { - UpstreamConfig u; - u.name = name; - u.host = host; - u.port = port; - u.pool.max_connections = 8; - u.pool.max_idle_connections = 4; - u.pool.connect_timeout_ms = 3000; - u.pool.idle_timeout_sec = 30; - u.pool.max_lifetime_sec = 3600; - u.pool.max_requests_per_conn = 0; - - u.proxy.route_prefix = "/fail"; - u.proxy.strip_prefix = false; - u.proxy.response_timeout_ms = 2000; - u.proxy.retry.max_retries = 0; - - u.circuit_breaker.enabled = true; - u.circuit_breaker.consecutive_failure_threshold = consecutive_threshold; - u.circuit_breaker.failure_rate_threshold = 100; - u.circuit_breaker.minimum_volume = 10000; - u.circuit_breaker.window_seconds = 10; - 
u.circuit_breaker.permitted_half_open_calls = 2; - // Long open duration — keep the slice OPEN so post-trip assertions - // don't race a HALF_OPEN transition. - u.circuit_breaker.base_open_duration_ms = 30000; - u.circuit_breaker.max_open_duration_ms = 60000; - return u; -} - -// --------------------------------------------------------------------------- -// Test 1: Snapshot API reflects per-slice trip/rejected counters and -// host-level aggregates. Drives N+1 requests against a backend that always -// 502s (N to trip, 1 more that the OPEN slice short-circuits) and asserts -// the snapshot shows total_trips >= 1, total_rejected >= 1, -// open_partitions >= 1. -// --------------------------------------------------------------------------- -void TestSnapshotReflectsCounters() { - std::cout << "\n[TEST] CB Phase 7: snapshot reflects counters..." - << std::endl; - try { - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { - resp.Status(502).Body("upstream-err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - - auto u = MakeObservUpstream("svc", "127.0.0.1", backend_port, - /*threshold=*/3); - gw.upstreams.push_back(u); - - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - // Trip (3 failures), then 2 more to accumulate rejected counter. 
- for (int i = 0; i < 3; ++i) { - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - } - for (int i = 0; i < 2; ++i) { - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - } - - auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); - if (!cbm) { - TestFramework::RecordTest( - "CB Phase 7: snapshot reflects counters", false, - "no circuit breaker manager attached"); - return; - } - auto snaps = cbm->SnapshotAll(); - bool found = false; - int64_t trips = 0, rejected = 0, probe_s = 0, probe_f = 0; - int open_parts = 0; - for (const auto& s : snaps) { - if (s.service_name == "svc") { - trips = s.total_trips; - rejected = s.total_rejected; - open_parts = s.open_partitions; - for (const auto& row : s.slices) { - probe_s += row.probe_successes; - probe_f += row.probe_failures; - } - found = true; - break; - } - } - - bool pass = found - && trips >= 1 - && rejected >= 2 // 2 post-trip short-circuits - && open_parts >= 1 - && probe_s == 0 // never entered HALF_OPEN - && probe_f == 0; - TestFramework::RecordTest( - "CB Phase 7: snapshot reflects counters", pass, - pass ? "" : - "found=" + std::to_string(found) + - " trips=" + std::to_string(trips) + - " rejected=" + std::to_string(rejected) + - " open_parts=" + std::to_string(open_parts) + - " probe_s=" + std::to_string(probe_s) + - " probe_f=" + std::to_string(probe_f)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 7: snapshot reflects counters", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 2: The CLOSED→OPEN trip log emits the §11.1 full-context message. -// Attaches a spdlog ringbuffer_sink to the shared logger, triggers a trip, -// then inspects the captured messages for the key tokens. The sink is -// removed before the test returns so it doesn't affect later tests. 
-// --------------------------------------------------------------------------- -void TestTripLogEmission() { - std::cout << "\n[TEST] CB Phase 7: trip log emission..." << std::endl; - try { - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { - resp.Status(502).Body("upstream-err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - - auto u = MakeObservUpstream("svc-log", "127.0.0.1", backend_port, - /*threshold=*/2); - gw.upstreams.push_back(u); - - // `HttpServer` construction calls `logging::Init()` which rebuilds - // the default logger via `spdlog::set_default_logger`. Any sink - // attached BEFORE that point lands on a stale logger. Attach the - // ringbuffer sink AFTER the last HttpServer construction so it - // captures the live logger's output. - HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - auto ring = std::make_shared< - spdlog::sinks::ringbuffer_sink_mt>(1024); - auto logger = logging::Get(); - auto prev_level = logger->level(); - logger->set_level(spdlog::level::debug); - logger->sinks().push_back(ring); - - struct SinkGuard { - std::shared_ptr logger; - std::shared_ptr ring; - spdlog::level::level_enum prev_level; - ~SinkGuard() { - auto& sinks = logger->sinks(); - sinks.erase(std::remove(sinks.begin(), sinks.end(), - std::shared_ptr(ring)), - sinks.end()); - logger->set_level(prev_level); - } - } guard{logger, ring, prev_level}; - - // Drive exactly threshold=2 failures to trip. - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - TestHttpClient::HttpGet(gw_port, "/fail", 3000); - - // Give the dispatcher a breath to emit + the sink to settle. 
- std::this_thread::sleep_for(std::chrono::milliseconds(50)); - - auto messages = ring->last_formatted(); - // Scan for the trip message. Look for the static prefix plus the - // §11.1 field tokens. - bool saw_tripped = false; - bool has_trigger = false; - bool has_consec_failures = false; - bool has_window_total = false; - bool has_fail_rate = false; - bool has_open_for_ms = false; - bool has_consec_trips = false; - for (const auto& msg : messages) { - if (msg.find("circuit breaker tripped") == std::string::npos) { - continue; - } - saw_tripped = true; - if (msg.find("trigger=") != std::string::npos) has_trigger = true; - if (msg.find("consecutive_failures=") != std::string::npos) - has_consec_failures = true; - if (msg.find("window_total=") != std::string::npos) - has_window_total = true; - if (msg.find("window_fail_rate=") != std::string::npos) - has_fail_rate = true; - if (msg.find("open_for_ms=") != std::string::npos) - has_open_for_ms = true; - if (msg.find("consecutive_trips=") != std::string::npos) - has_consec_trips = true; - } - - bool pass = saw_tripped && has_trigger && has_consec_failures && - has_window_total && has_fail_rate && - has_open_for_ms && has_consec_trips; - TestFramework::RecordTest( - "CB Phase 7: trip log emission", pass, - pass ? 
"" : - "saw_tripped=" + std::to_string(saw_tripped) + - " trigger=" + std::to_string(has_trigger) + - " consec_failures=" + std::to_string(has_consec_failures) + - " window_total=" + std::to_string(has_window_total) + - " fail_rate=" + std::to_string(has_fail_rate) + - " open_for_ms=" + std::to_string(has_open_for_ms) + - " consec_trips=" + std::to_string(has_consec_trips)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 7: trip log emission", false, e.what()); - } -} - -// --------------------------------------------------------------------------- -// Test 3: Retry-budget observability — the exhausted log carries the -// §11.1 fields (service, in_flight, retries_in_flight, cap), and the -// host snapshot reflects retries_rejected. -// --------------------------------------------------------------------------- -void TestRetryBudgetObservability() { - std::cout << "\n[TEST] CB Phase 7: retry budget observability..." - << std::endl; - try { - HttpServer backend("127.0.0.1", 0); - backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { - resp.Status(502).Body("upstream-err", "text/plain"); - }); - TestServerRunner backend_runner(backend); - int backend_port = backend_runner.GetPort(); - - ServerConfig gw; - gw.bind_host = "127.0.0.1"; - gw.bind_port = 0; - gw.worker_threads = 1; - gw.http2.enabled = false; - - // Budget: zero percent AND zero floor → every retry rejected. - auto u = MakeObservUpstream("svc-budget", "127.0.0.1", backend_port, - /*threshold=*/10000); - u.proxy.retry.max_retries = 2; - u.proxy.retry.retry_on_5xx = true; - u.circuit_breaker.retry_budget_percent = 0; - u.circuit_breaker.retry_budget_min_concurrency = 0; - gw.upstreams.push_back(u); - - // Attach the ringbuffer AFTER gateway construction — see - // TestTripLogEmission for rationale (HttpServer's ctor - // replaces the default logger via logging::Init, detaching - // any previously-attached sinks). 
- HttpServer gateway(gw); - TestServerRunner gw_runner(gateway); - int gw_port = gw_runner.GetPort(); - - auto ring = std::make_shared< - spdlog::sinks::ringbuffer_sink_mt>(1024); - auto logger = logging::Get(); - auto prev_level = logger->level(); - logger->set_level(spdlog::level::debug); - logger->sinks().push_back(ring); - - struct SinkGuard { - std::shared_ptr logger; - std::shared_ptr ring; - spdlog::level::level_enum prev_level; - ~SinkGuard() { - auto& sinks = logger->sinks(); - sinks.erase(std::remove(sinks.begin(), sinks.end(), - std::shared_ptr(ring)), - sinks.end()); - logger->set_level(prev_level); - } - } guard{logger, ring, prev_level}; - - // One client request: first attempt hits backend (502), retry - // blocked by budget → 503 + X-Retry-Budget-Exhausted. - TestHttpClient::HttpGet(gw_port, "/fail", 5000); - - std::this_thread::sleep_for(std::chrono::milliseconds(50)); - - auto messages = ring->last_formatted(); - bool saw_exhausted = false; - bool has_service = false; - bool has_inflight = false; - bool has_retries_inflight = false; - bool has_cap = false; - for (const auto& msg : messages) { - if (msg.find("retry budget exhausted") == std::string::npos) { - continue; - } - saw_exhausted = true; - if (msg.find("service=") != std::string::npos) has_service = true; - if (msg.find("in_flight=") != std::string::npos) - has_inflight = true; - if (msg.find("retries_in_flight=") != std::string::npos) - has_retries_inflight = true; - if (msg.find("cap=") != std::string::npos) has_cap = true; - } - - // Snapshot: retries_rejected must be >= 1 (every rejection increments). - int64_t retries_rejected = 0; - auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); - if (cbm) { - for (const auto& s : cbm->SnapshotAll()) { - if (s.service_name == "svc-budget") { - // Host aggregate — single host, so the sum is the - // host's retries_rejected. The snapshot doesn't yet - // expose that directly — derive from RetryBudget - // via the host getter. 
- auto* host = cbm->GetHost("svc-budget"); - if (host) { - retries_rejected = - host->GetRetryBudget()->RetriesRejected(); - } - break; - } - } - } - - bool pass = saw_exhausted && has_service && has_inflight && - has_retries_inflight && has_cap && - retries_rejected >= 1; - TestFramework::RecordTest( - "CB Phase 7: retry budget observability", pass, - pass ? "" : - "saw_exhausted=" + std::to_string(saw_exhausted) + - " service=" + std::to_string(has_service) + - " inflight=" + std::to_string(has_inflight) + - " retries_inflight=" + std::to_string(has_retries_inflight) + - " cap=" + std::to_string(has_cap) + - " retries_rejected=" + std::to_string(retries_rejected)); - } catch (const std::exception& e) { - TestFramework::RecordTest( - "CB Phase 7: retry budget observability", false, e.what()); - } -} - -void RunAllTests() { - std::cout << "\n" << std::string(60, '=') << std::endl; - std::cout << "CIRCUIT BREAKER PHASE 7 - OBSERVABILITY TESTS" << std::endl; - std::cout << std::string(60, '=') << std::endl; - - TestSnapshotReflectsCounters(); - TestTripLogEmission(); - TestRetryBudgetObservability(); -} - -} // namespace CircuitBreakerPhase7Tests From f977ed2988022a4de18f8f2b3de7ed23c01f3523 Mon Sep 17 00:00:00 2001 From: mwfj Date: Tue, 14 Apr 2026 23:56:49 +0800 Subject: [PATCH 27/37] Fix review comment --- docs/circuit_breaker.md | 2 +- server/http_server.cc | 26 +++++-- server/proxy_transaction.cc | 133 ++++++++++++++++++++---------------- 3 files changed, 93 insertions(+), 68 deletions(-) diff --git a/docs/circuit_breaker.md b/docs/circuit_breaker.md index 6f38de69..ef3a5ef0 100644 --- a/docs/circuit_breaker.md +++ b/docs/circuit_breaker.md @@ -134,7 +134,7 @@ Topology edits (`host`, `port`, `pool.*`, `proxy.*`, `tls.*`) still require a re ### Snapshot API -`CircuitBreakerManager::SnapshotAll()` returns one `CircuitBreakerHostSnapshot` per upstream with per-slice rows (`state`, `trips`, `rejected`, `probe_successes`, `probe_failures`) plus host-level aggregates 
(`total_trips`, `total_rejected`, `open_partitions`, `half_open_partitions`, `retries_in_flight`, `retries_rejected`, `in_flight`). A future `/admin/breakers` endpoint would JSON-serialize this. +`CircuitBreakerManager::SnapshotAll()` returns one `CircuitBreakerHostSnapshot` per upstream with per-slice rows (`state`, `trips`, `rejected`, `probe_successes`, `probe_failures`) plus host-level aggregates (`total_trips`, `total_rejected`, `open_partitions`, `half_open_partitions`, `retries_in_flight`, `retries_rejected`, `in_flight`). A `/admin/breakers` HTTP endpoint that JSON-serializes this snapshot is **planned but not yet exposed** — the API is ready for future wiring. --- diff --git a/server/http_server.cc b/server/http_server.cc index abee42c0..2289ebef 100644 --- a/server/http_server.cc +++ b/server/http_server.cc @@ -3743,19 +3743,31 @@ bool HttpServer::Reload(const ServerConfig& new_config) { // UpstreamConfig deliberately excludes `circuit_breaker` so a CB- // only edit doesn't trigger this warning (the reload above already // applied the new breaker settings to live slices). + // + // When topology DIFFERS, we deliberately DO NOT copy the staged + // config into `upstream_configs_`: subsequent reloads (including + // the timer-cadence recomputation above) read from this vector to + // match live pool state. Adopting staged-but-inactive topology + // values would silently widen the dispatcher timer past the active + // pool timeouts — e.g. staging `pool.connect_timeout_ms=10000` + // (restart required) then reloading any unrelated field would + // recompute cadence from 10s while the live pool still uses 3s, + // firing connect-timeouts late. The CB-field portion of the edit + // was already applied live via `circuit_breaker_manager_->Reload` + // above, so the live slices carry the new tuning regardless of + // whether `upstream_configs_` shows it. 
+ // + // When topology MATCHES (the common case, including CB-only + // edits), adopt the new snapshot as the fresh baseline so CB- + // field edits persist for later reload diffs. if (new_config.upstreams != upstream_configs_) { logging::Get()->warn("Reload: upstream topology changes require a " "restart to take effect (circuit-breaker " "field edits, if any, were applied live)"); + } else { + upstream_configs_ = new_config.upstreams; } - // Persist the new upstreams (preserving the breaker propagation just - // applied). Subsequent reloads diff against this baseline, so without - // this update a second SIGHUP would re-propagate the same CB values - // and also see the original topology as "unchanged" rather than the - // attempted new state — confusing operators debugging reload behavior. - upstream_configs_ = new_config.upstreams; - return true; } diff --git a/server/proxy_transaction.cc b/server/proxy_transaction.cc index 29dbe550..d34bbc32 100644 --- a/server/proxy_transaction.cc +++ b/server/proxy_transaction.cc @@ -124,13 +124,19 @@ void ProxyTransaction::Start() { auto* host = cbm->GetHost(service_name_); if (host) { slice_ = host->GetSlice(static_cast(dispatcher_index_)); - // Retry budget is host-level (shared across partitions). - // Resolve from the same host so retry admission math stays - // consistent with the slice's dispatcher routing. Always - // non-null when the host exists (budget is unconditionally - // constructed by the host ctor). Null only when `host` - // itself is null. - retry_budget_ = host->GetRetryBudget(); + // Retry budget is part of the circuit-breaker feature and + // must inherit its opt-in default. 
CircuitBreakerHost + // unconditionally constructs a RetryBudget (one-per-host) + // so the pointer is always available — but engaging it + // when `circuit_breaker.enabled=false` would silently + // regress deployments that set `proxy.retry.max_retries>0` + // without ever opting into circuit breaking: a retry + // storm would suddenly see 503+X-Retry-Budget-Exhausted. + // Gate on the slice's live config so the enabled-toggle + // flip is the sole switch for the whole feature. + if (slice_ && slice_->config().enabled) { + retry_budget_ = host->GetRetryBudget(); + } } } } @@ -153,6 +159,44 @@ void ProxyTransaction::AttemptCheckout() { return; } + // Retry-budget gate for retry attempts (attempt_ > 0). Gating here + // rather than in MaybeRetry means a delayed retry holds no token + // during its backoff sleep — the budget's `retries_in_flight` + // reflects only retries that are actually about to reach (or are + // reaching) the upstream, matching the "aggregate upstream load" + // semantics of the %-of-in-flight cap. Gating in MaybeRetry + // instead would count queued-but-sleeping retries toward the cap + // and trigger X-Retry-Budget-Exhausted even when no retry has + // actually contacted the upstream yet. + // + // The `!retry_token_held_` guard is defensive — Cleanup() between + // retry attempts always releases the prior token, so this branch + // never normally sees an already-held token; the check only + // prevents a re-entrant AttemptCheckout from double-consuming. 
+ if (retry_budget_ && attempt_ > 0 && !retry_token_held_) { + bool is_dry_run = slice_ && slice_->config().dry_run; + if (retry_budget_->TryConsumeRetry()) { + retry_token_held_ = true; + } else if (is_dry_run) { + logging::Get()->info( + "ProxyTransaction retry budget would-reject (dry-run) " + "client_fd={} service={} attempt={}", + client_fd_, service_name_, attempt_); + } else { + logging::Get()->warn( + "retry budget exhausted service={} in_flight={} " + "retries_in_flight={} cap={} client_fd={} attempt={}", + service_name_, + retry_budget_->InFlight(), + retry_budget_->RetriesInFlight(), + retry_budget_->ComputeCap(), + client_fd_, attempt_); + state_ = State::FAILED; + DeliverResponse(MakeRetryBudgetResponse()); + return; + } + } + // Track this attempt against the host-level retry budget's // in_flight counter. Replaces any prior guard (from the previous // attempt of the same transaction) — move-assignment decrements @@ -691,8 +735,12 @@ void ProxyTransaction::MaybeRetry(RetryPolicy::RetryCondition condition) { // Release old lease, clear callbacks, poison if tainted. // Cleanup also releases any retry token held by the previous - // retry attempt (attempt_ > 1) so the next TryConsumeRetry sees - // a fresh counter. + // retry attempt so the next TryConsumeRetry in AttemptCheckout + // sees a fresh counter. The retry-budget gate itself now lives + // at the top of AttemptCheckout — that way a delayed retry + // doesn't hold a token during its backoff sleep, which would + // otherwise pollute the budget's retries_in_flight with + // queued-but-sleeping work that hasn't reached the upstream. Cleanup(); codec_.Reset(); // Re-apply request method after reset — llhttp_init() zeroes @@ -701,49 +749,6 @@ void ProxyTransaction::MaybeRetry(RetryPolicy::RetryCondition condition) { codec_.SetRequestMethod(method_); poison_connection_ = false; - // Retry-budget gate. `attempt_ > 0` here is guaranteed — we - // just incremented. 
The budget bounds how many retries can be - // concurrently in flight against this upstream HOST (aggregated - // across all transactions for the service), preventing a retry - // storm from amplifying traffic to a struggling backend. - // - // Dry-run: log the would-reject but still proceed (consistent - // with REJECTED_OPEN_DRYRUN on the slice path). No token is - // consumed, so no ReleaseRetry is needed on the dry-run path. - // - // Full mode: deliver the §12.2 retry-budget response (503 + - // X-Retry-Budget-Exhausted) and terminate. Does NOT call - // ReportBreakerOutcome — our own reject must not feed back - // into the slice's failure math. - if (retry_budget_) { - bool is_dry_run = slice_ && slice_->config().dry_run; - if (retry_budget_->TryConsumeRetry()) { - retry_token_held_ = true; - } else if (is_dry_run) { - logging::Get()->info( - "ProxyTransaction retry budget would-reject (dry-run) " - "client_fd={} service={} attempt={}", - client_fd_, service_name_, attempt_); - } else { - // §11.1 format: log per-host budget state so operators - // can diagnose retry-storm throttling without hitting - // an admin endpoint. `cap` is the live effective ceiling - // (may have shifted since the failing TryConsumeRetry - // due to other transactions' in_flight changes). - logging::Get()->warn( - "retry budget exhausted service={} in_flight={} " - "retries_in_flight={} cap={} client_fd={} attempt={}", - service_name_, - retry_budget_->InFlight(), - retry_budget_->RetriesInFlight(), - retry_budget_->ComputeCap(), - client_fd_, attempt_); - state_ = State::FAILED; - DeliverResponse(MakeRetryBudgetResponse()); - return; - } - } - // Condition-dependent first-retry policy: // Connection-level failures (stale keep-alive, connect refused) // are transient — a different pooled connection will succeed. 
@@ -1089,18 +1094,26 @@ HttpResponse ProxyTransaction::MakeErrorResponse(int result_code) { } if (result_code == RESULT_CIRCUIT_OPEN) { // The static factory has no `this`, so it cannot build the - // §12.1-compliant response (Retry-After / X-Circuit-Breaker / - // X-Upstream-Host). All in-class paths for CIRCUIT_OPEN use - // the non-static MakeCircuitOpenResponse() — reaching this - // branch means a future caller forgot that rule, and would - // silently serve a non-compliant 503. Log loudly so the - // mistake shows up in logs instead of producing a stealth - // regression against the public contract. + // fully §12.1-compliant response (Retry-After derived from + // slice state, X-Upstream-Host). All in-class paths for + // CIRCUIT_OPEN use the non-static MakeCircuitOpenResponse() + // — reaching this branch means a future caller forgot that + // rule. Log loudly so the mistake shows up in logs instead + // of producing a stealth regression against the contract. + // + // Still emit `X-Circuit-Breaker: open` + `Connection: close` + // so the response remains self-identifying as a circuit-open + // reject. Clients inspecting that header will correctly back + // off via their own client-side logic rather than treating + // this as an anonymous 503. 
logging::Get()->error( "ProxyTransaction::MakeErrorResponse(RESULT_CIRCUIT_OPEN) " "invoked from static context — use MakeCircuitOpenResponse() " "to emit §12.1-compliant headers"); - return HttpResponse::ServiceUnavailable(); + HttpResponse resp = HttpResponse::ServiceUnavailable(); + resp.Header("X-Circuit-Breaker", "open"); + resp.Header("Connection", "close"); + return resp; } if (result_code == RESULT_CHECKOUT_FAILED || result_code == RESULT_SEND_FAILED || From f2f72efda28ee2aceabdfa0c9ff31e53437d1a57 Mon Sep 17 00:00:00 2001 From: mwfj Date: Wed, 15 Apr 2026 00:23:56 +0800 Subject: [PATCH 28/37] Fix review comment --- include/config/config_loader.h | 14 +++++++ server/config_loader.cc | 67 ++++++++++++++++++++++++++++++++++ server/http_server.cc | 36 +++++++++++++----- server/main.cc | 19 ++++++++++ server/proxy_transaction.cc | 65 +++++++++++++++++++-------------- 5 files changed, 164 insertions(+), 37 deletions(-) diff --git a/include/config/config_loader.h b/include/config/config_loader.h index ba13f62a..fbf3319e 100644 --- a/include/config/config_loader.h +++ b/include/config/config_loader.h @@ -27,6 +27,20 @@ class ConfigLoader { // Throws std::invalid_argument if validation fails. static void Validate(const ServerConfig& config); + // Validate ONLY the fields that are live-reloadable without a + // restart — today this is the per-upstream circuit_breaker block. + // Used by the SIGHUP reload path, which downgrades the full + // `Validate()` failure to a warn because most of its rules cover + // restart-only fields. That downgrade is unsafe for live- + // reloadable fields: an invalid breaker threshold would be + // pushed into live slices even though the same value would be + // rejected at startup. Call this BEFORE applying a reloaded + // config and abort the reload if it throws. + // + // Throws std::invalid_argument with a message identifying the + // offending upstream and field. 
+ static void ValidateHotReloadable(const ServerConfig& config); + // Return a ServerConfig with all default values. static ServerConfig Default(); diff --git a/server/config_loader.cc b/server/config_loader.cc index f92dd3f2..d6566904 100644 --- a/server/config_loader.cc +++ b/server/config_loader.cc @@ -561,6 +561,73 @@ void ConfigLoader::ApplyEnvOverrides(ServerConfig& config) { if (val) config.rate_limit.status_code = EnvToInt(val, "REACTOR_RATE_LIMIT_STATUS_CODE"); } +void ConfigLoader::ValidateHotReloadable(const ServerConfig& config) { + // Mirrors the circuit_breaker validation block in Validate(). + // Kept in lock-step with that block — any rule added there for a + // hot-reloadable field must be added here too, or the SIGHUP + // reload path would silently accept values the startup path + // rejects (which is exactly the regression this helper exists + // to prevent). + for (size_t i = 0; i < config.upstreams.size(); ++i) { + const auto& u = config.upstreams[i]; + const std::string idx = "upstreams[" + std::to_string(i) + "]"; + const auto& cb = u.circuit_breaker; + if (cb.consecutive_failure_threshold < 1 || + cb.consecutive_failure_threshold > 10000) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.consecutive_failure_threshold must be in [1, 10000]"); + } + if (cb.failure_rate_threshold < 0 || cb.failure_rate_threshold > 100) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.failure_rate_threshold must be in [0, 100]"); + } + if (cb.minimum_volume < 1 || cb.minimum_volume > 10000000) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.minimum_volume must be in [1, 10000000]"); + } + if (cb.window_seconds < 1 || cb.window_seconds > 3600) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.window_seconds must be in [1, 3600]"); + } + if (cb.permitted_half_open_calls < 1 || + cb.permitted_half_open_calls > 1000) { + throw 
std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.permitted_half_open_calls must be in [1, 1000]"); + } + if (cb.base_open_duration_ms < 100) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.base_open_duration_ms must be >= 100"); + } + if (cb.max_open_duration_ms < cb.base_open_duration_ms) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.max_open_duration_ms must be >= base_open_duration_ms"); + } + if (cb.max_ejection_percent_per_host_set < 0 || + cb.max_ejection_percent_per_host_set > 100) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.max_ejection_percent_per_host_set must be in [0, 100]"); + } + if (cb.retry_budget_percent < 0 || cb.retry_budget_percent > 100) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.retry_budget_percent must be in [0, 100]"); + } + if (cb.retry_budget_min_concurrency < 0) { + throw std::invalid_argument( + idx + " ('" + u.name + + "'): circuit_breaker.retry_budget_min_concurrency must be >= 0"); + } + } +} + void ConfigLoader::Validate(const ServerConfig& config) { // Validate bind_host is a strict dotted-quad IPv4 address. // Use inet_pton (not inet_addr) to reject legacy shorthand forms diff --git a/server/http_server.cc b/server/http_server.cc index 2289ebef..1f8ee83c 100644 --- a/server/http_server.cc +++ b/server/http_server.cc @@ -462,15 +462,33 @@ void HttpServer::MarkServerReady() { [um, service, i](circuit_breaker::State old_s, circuit_breaker::State new_s, const char* /*trigger*/) { - // Drain only on CLOSED→OPEN. HALF_OPEN→OPEN - // doesn't need draining — in HALF_OPEN, non- - // probe admissions are already REJECTED_OPEN - // before reaching the pool queue, so the - // queue stays empty (or holds only probes, - // which are in-flight by the time HALF_OPEN - // trips back). 
- if (old_s == circuit_breaker::State::CLOSED && - new_s == circuit_breaker::State::OPEN) { + // Drain the partition's wait queue whenever + // the slice enters OPEN — from CLOSED (fresh + // trip) OR from HALF_OPEN (probe cycle re- + // tripped). + // + // CLOSED→OPEN is the classic case: queued + // non-probe waiters need to fail fast with + // CHECKOUT_CIRCUIT_OPEN rather than wait for + // the full open duration. + // + // HALF_OPEN→OPEN (probe_fail) matters + // because probe admissions pass + // ConsultBreaker() BEFORE CheckoutAsync() — + // if the pool was saturated during the + // probe cycle, those admitted probes may + // still be queued when the cycle re-trips. + // Without draining, a saw_failure probe + // cycle can leave the pool with queued + // waiters that still eventually dispatch to + // a known-bad upstream. Draining also + // sweeps any non-probe waiters that + // somehow queued during HALF_OPEN (defense + // in depth — TryAcquire normally rejects + // non-probes before they reach the pool). + if (new_s == circuit_breaker::State::OPEN && + (old_s == circuit_breaker::State::CLOSED || + old_s == circuit_breaker::State::HALF_OPEN)) { if (auto* part = um->GetPoolPartition( service, i)) { part->DrainWaitQueueOnTrip(); diff --git a/server/main.cc b/server/main.cc index 0d7474e9..f7bac586 100644 --- a/server/main.cc +++ b/server/main.cc @@ -328,7 +328,26 @@ static bool ReloadConfig(const std::string& config_path, } } } + // Hot-reloadable fields (today: per-upstream `circuit_breaker.*`) + // are the only ones that go LIVE on a SIGHUP reload. Validate + // them strictly — a bad value here would be pushed into running + // slices and keep running until an operator-driven restart fixes + // the config file. Hard-reject so operators see the error + // immediately instead of discovering drift the next time the + // startup path rejects the same file. 
+ try { + ConfigLoader::ValidateHotReloadable(new_config); + } catch (const std::invalid_argument& e) { + logging::Get()->error("Config reload rejected: {}", e.what()); + reopen_existing_logs(); + return false; + } + // Warn about restart-required field issues (not applied during reload). + // Full Validate() includes both hot-reloadable rules (already checked + // above) and restart-only rules; by the time we reach this point the + // hot-reloadable subset is known valid, so any exception thrown here + // is from restart-only rules and is legitimately a warn, not an error. try { ConfigLoader::Validate(new_config); } catch (const std::invalid_argument& e) { diff --git a/server/proxy_transaction.cc b/server/proxy_transaction.cc index d34bbc32..a5483006 100644 --- a/server/proxy_transaction.cc +++ b/server/proxy_transaction.cc @@ -124,19 +124,13 @@ void ProxyTransaction::Start() { auto* host = cbm->GetHost(service_name_); if (host) { slice_ = host->GetSlice(static_cast(dispatcher_index_)); - // Retry budget is part of the circuit-breaker feature and - // must inherit its opt-in default. CircuitBreakerHost - // unconditionally constructs a RetryBudget (one-per-host) - // so the pointer is always available — but engaging it - // when `circuit_breaker.enabled=false` would silently - // regress deployments that set `proxy.retry.max_retries>0` - // without ever opting into circuit breaking: a retry - // storm would suddenly see 503+X-Retry-Budget-Exhausted. - // Gate on the slice's live config so the enabled-toggle - // flip is the sole switch for the whole feature. - if (slice_ && slice_->config().enabled) { - retry_budget_ = host->GetRetryBudget(); - } + // Cache the retry-budget pointer unconditionally when + // the host exists — usage at each attempt is gated by + // the live `slice_->config().enabled` flag so that + // SIGHUP toggles take effect on the next retry within + // a running transaction. Resolution-time gating would + // miss the flip in either direction. 
+ retry_budget_ = host->GetRetryBudget(); } } } @@ -164,17 +158,22 @@ void ProxyTransaction::AttemptCheckout() { // during its backoff sleep — the budget's `retries_in_flight` // reflects only retries that are actually about to reach (or are // reaching) the upstream, matching the "aggregate upstream load" - // semantics of the %-of-in-flight cap. Gating in MaybeRetry - // instead would count queued-but-sleeping retries toward the cap - // and trigger X-Retry-Budget-Exhausted even when no retry has - // actually contacted the upstream yet. + // semantics of the %-of-in-flight cap. + // + // Live-check `slice_->config().enabled` at each attempt — the + // cached `retry_budget_` pointer is resolved once in Start(), but + // the `enabled` flag is the documented live master switch. A + // SIGHUP flipping enabled=true→false mid-flight must stop + // enforcing the budget on subsequent retries; enabled=false→true + // mid-flight must start. Gating at the pointer level would miss + // both directions. // // The `!retry_token_held_` guard is defensive — Cleanup() between - // retry attempts always releases the prior token, so this branch - // never normally sees an already-held token; the check only - // prevents a re-entrant AttemptCheckout from double-consuming. - if (retry_budget_ && attempt_ > 0 && !retry_token_held_) { - bool is_dry_run = slice_ && slice_->config().dry_run; + // retry attempts always releases the prior token. + bool breaker_live_enabled = slice_ && slice_->config().enabled; + if (retry_budget_ && breaker_live_enabled && + attempt_ > 0 && !retry_token_held_) { + bool is_dry_run = slice_->config().dry_run; if (retry_budget_->TryConsumeRetry()) { retry_token_held_ = true; } else if (is_dry_run) { @@ -191,6 +190,17 @@ void ProxyTransaction::AttemptCheckout() { retry_budget_->RetriesInFlight(), retry_budget_->ComputeCap(), client_fd_, attempt_); + // CRITICAL: release the slice admission before bailing. 
+ // ConsultBreaker() already admitted this attempt — in + // HALF_OPEN that means a probe slot was reserved + // (half_open_inflight_ / half_open_admitted_ both + // incremented). Returning here without releasing would + // strand that slot forever, wedging the slice in + // half_open_full until an operator-driven reload/reset. + // Neutral release decrements both counters for probes; + // no-op for non-probe (CLOSED) admissions, matching the + // general "local cause, no upstream signal" semantic. + ReleaseBreakerAdmissionNeutral(); state_ = State::FAILED; DeliverResponse(MakeRetryBudgetResponse()); return; @@ -198,12 +208,11 @@ void ProxyTransaction::AttemptCheckout() { } // Track this attempt against the host-level retry budget's - // in_flight counter. Replaces any prior guard (from the previous - // attempt of the same transaction) — move-assignment decrements - // the old counter and takes ownership of the new, so a retrying - // transaction stays at exactly one in_flight unit throughout. No-op - // when retry_budget_ is null (no breaker attached for this service). - if (retry_budget_) { + // in_flight counter. Gated by the live `enabled` flag so disabling + // the breaker mid-flight stops tracking immediately; enabling it + // starts tracking at the next attempt. No-op when retry_budget_ + // is null (no breaker manager / unknown host). 
+ if (retry_budget_ && breaker_live_enabled) { inflight_guard_ = retry_budget_->TrackInFlight(); } From 6520d86c99cdd567e4a074723e2f18b220896899 Mon Sep 17 00:00:00 2001 From: mwfj Date: Wed, 15 Apr 2026 09:47:40 +0800 Subject: [PATCH 29/37] Fix review comment --- server/config_loader.cc | 22 ++++++++++++++++++++++ server/http_server.cc | 22 ++++++++++++++++++++-- server/proxy_transaction.cc | 10 ++++++++++ 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/server/config_loader.cc b/server/config_loader.cc index d6566904..a34672ce 100644 --- a/server/config_loader.cc +++ b/server/config_loader.cc @@ -568,6 +568,28 @@ void ConfigLoader::ValidateHotReloadable(const ServerConfig& config) { // reload path would silently accept values the startup path // rejects (which is exactly the regression this helper exists // to prevent). + + // Reject duplicate upstream service names BEFORE the per-upstream + // CB validation. CircuitBreakerManager::Reload iterates the new + // upstream list and applies each entry's `circuit_breaker` block + // to GetHost(name). With duplicates, the first entry's CB values + // are applied, then the second entry's overwrite them — last + // write silently wins. Startup's full Validate() rejects the + // file outright; the hot-reload path must match. 
+ { + std::unordered_set seen; + seen.reserve(config.upstreams.size()); + for (size_t i = 0; i < config.upstreams.size(); ++i) { + const auto& name = config.upstreams[i].name; + if (!seen.insert(name).second) { + throw std::invalid_argument( + "upstreams[" + std::to_string(i) + + "] duplicate service name '" + name + + "' (upstream service names must be unique)"); + } + } + } + for (size_t i = 0; i < config.upstreams.size(); ++i) { const auto& u = config.upstreams[i]; const std::string idx = "upstreams[" + std::to_string(i) + "]"; diff --git a/server/http_server.cc b/server/http_server.cc index 1f8ee83c..8d4008c5 100644 --- a/server/http_server.cc +++ b/server/http_server.cc @@ -3545,8 +3545,16 @@ bool HttpServer::Reload(const ServerConfig& new_config) { // field changes (timeouts, limits, log level). validation_copy.http2.enabled = http2_enabled_ && new_config.http2.enabled; - // Upstream configs are restart-only — clear them so staged edits - // in the config file don't block live-safe field reloads. + // Upstream configs are RESTART-ONLY for topology fields, but the + // per-upstream `circuit_breaker` block is HOT-RELOADABLE — clearing + // upstreams entirely from validation_copy would skip CB-field + // validation here. Instead: clear the topology-restart-only + // path (the full Validate would reject those) and run a separate + // ValidateHotReloadable on the original new_config so live- + // reloadable CB rules (range checks, duplicate names) are + // enforced symmetrically with the SIGHUP path in main.cc. + // Without this, in-process callers using HttpServer::Reload + // directly would bypass the gate that the CLI path enforces. validation_copy.upstreams.clear(); // Rate limit config IS live-reloadable and MUST be validated. 
// Unlike upstreams (restart-only), rate_limit changes are applied @@ -3559,6 +3567,16 @@ bool HttpServer::Reload(const ServerConfig& new_config) { logging::Get()->error("Reload() rejected invalid config: {}", e.what()); return false; } + // Strict gate for hot-reloadable CB fields + duplicate names. + // Mirrors main.cc::ReloadConfig — both entry points must reject + // invalid CB tuning before it reaches live slices. + try { + ConfigLoader::ValidateHotReloadable(new_config); + } catch (const std::invalid_argument& e) { + logging::Get()->error("Reload() rejected invalid config: {}", + e.what()); + return false; + } } // Three-phase update to prevent mid-reload connections from seeing diff --git a/server/proxy_transaction.cc b/server/proxy_transaction.cc index a5483006..0e1e5a6f 100644 --- a/server/proxy_transaction.cc +++ b/server/proxy_transaction.cc @@ -946,6 +946,16 @@ void ProxyTransaction::Cleanup() { // via the retry_token_held_ flag. ReleaseRetryToken(); + // Release the in-flight guard from the just-ended attempt. If + // MaybeRetry schedules a delayed backoff, the gap between Cleanup + // and the eventual AttemptCheckout (which would move-assign a + // fresh guard) holds the old slot in `retry_budget_->in_flight_` + // for the entire backoff sleep. That inflates the effective + // denominator of the percent-cap formula, weakening the budget + // exactly during retry storms. Move-assign from a default + // (empty) guard decrements the old counter immediately. 
+ inflight_guard_ = circuit_breaker::RetryBudget::InFlightGuard{}; + if (lease_) { auto* conn = lease_.Get(); if (conn) { From a32469b6e7282de400a75d3c6ceb592dc32838e6 Mon Sep 17 00:00:00 2001 From: mwfj Date: Wed, 15 Apr 2026 10:11:42 +0800 Subject: [PATCH 30/37] Fix review comment --- include/config/config_loader.h | 27 ++++++++++++++++++++-- server/config_loader.cc | 34 ++++++++++++++++++++++------ server/http_server.cc | 25 ++++++++++++++++----- server/main.cc | 41 +++++++++++++++++++++++----------- 4 files changed, 99 insertions(+), 28 deletions(-) diff --git a/include/config/config_loader.h b/include/config/config_loader.h index fbf3319e..2a76c3b8 100644 --- a/include/config/config_loader.h +++ b/include/config/config_loader.h @@ -3,6 +3,7 @@ #include "config/server_config.h" #include #include +#include class ConfigLoader { public: @@ -28,7 +29,9 @@ class ConfigLoader { static void Validate(const ServerConfig& config); // Validate ONLY the fields that are live-reloadable without a - // restart — today this is the per-upstream circuit_breaker block. + // restart — today this is the per-upstream circuit_breaker block + // plus a duplicate-name check. + // // Used by the SIGHUP reload path, which downgrades the full // `Validate()` failure to a warn because most of its rules cover // restart-only fields. That downgrade is unsafe for live- @@ -37,9 +40,29 @@ class ConfigLoader { // rejected at startup. Call this BEFORE applying a reloaded // config and abort the reload if it throws. // + // Scope of CB-field validation: + // `live_upstream_names` lists service names CURRENTLY known to + // the running server. CB fields are validated only for entries + // whose name is in this set, because + // `CircuitBreakerManager::Reload` only applies CB changes to + // pre-existing hosts (new/removed names are restart-only and + // skipped with a warn). Validating CB blocks for not-yet- + // running entries would block otherwise-safe reloads — e.g. 
a + // reload that stages a new upstream with an intentionally + // placeholder breaker block would abort even though the live + // server would never apply it. Pass an empty set when no + // upstreams are running yet (only the duplicate-name check + // runs in that case). + // + // Duplicate-name rejection runs unconditionally on the new + // config's upstream list: even for new/renamed entries, the + // file itself is malformed if names collide. + // // Throws std::invalid_argument with a message identifying the // offending upstream and field. - static void ValidateHotReloadable(const ServerConfig& config); + static void ValidateHotReloadable( + const ServerConfig& config, + const std::unordered_set& live_upstream_names); // Return a ServerConfig with all default values. static ServerConfig Default(); diff --git a/server/config_loader.cc b/server/config_loader.cc index a34672ce..38fb2fb4 100644 --- a/server/config_loader.cc +++ b/server/config_loader.cc @@ -561,7 +561,9 @@ void ConfigLoader::ApplyEnvOverrides(ServerConfig& config) { if (val) config.rate_limit.status_code = EnvToInt(val, "REACTOR_RATE_LIMIT_STATUS_CODE"); } -void ConfigLoader::ValidateHotReloadable(const ServerConfig& config) { +void ConfigLoader::ValidateHotReloadable( + const ServerConfig& config, + const std::unordered_set& live_upstream_names) { // Mirrors the circuit_breaker validation block in Validate(). // Kept in lock-step with that block — any rule added there for a // hot-reloadable field must be added here too, or the SIGHUP @@ -570,12 +572,14 @@ void ConfigLoader::ValidateHotReloadable(const ServerConfig& config) { // to prevent). // Reject duplicate upstream service names BEFORE the per-upstream - // CB validation. CircuitBreakerManager::Reload iterates the new - // upstream list and applies each entry's `circuit_breaker` block - // to GetHost(name). With duplicates, the first entry's CB values - // are applied, then the second entry's overwrite them — last - // write silently wins. 
Startup's full Validate() rejects the - // file outright; the hot-reload path must match. + // CB validation. Even for new/renamed entries, the file is + // malformed if names collide: `CircuitBreakerManager::Reload` + // iterates the new upstream list and applies each entry's + // `circuit_breaker` block to GetHost(name); duplicates would + // silently overwrite (last-write wins). Startup's full Validate() + // rejects the file outright; the hot-reload path must match. + // This rule runs UNCONDITIONALLY on the new config — it doesn't + // depend on `live_upstream_names`. { std::unordered_set seen; seen.reserve(config.upstreams.size()); @@ -593,6 +597,22 @@ void ConfigLoader::ValidateHotReloadable(const ServerConfig& config) { for (size_t i = 0; i < config.upstreams.size(); ++i) { const auto& u = config.upstreams[i]; const std::string idx = "upstreams[" + std::to_string(i) + "]"; + + // CB-field validation is scoped to upstreams that are LIVE in + // the running server. CircuitBreakerManager::Reload only + // applies CB changes to pre-existing hosts — new/renamed + // entries are restart-only and skipped with a warn — so + // validating their CB blocks here would block otherwise-safe + // reloads (e.g. a reload that stages a new upstream alongside + // a log-level edit would abort even though the live server + // would never apply the new upstream's CB block). + // + // The empty-set case (no live upstreams yet) is handled by + // the same check: every entry is "new", so every entry is + // skipped — only the duplicate-name check runs. 
+ if (live_upstream_names.find(u.name) == live_upstream_names.end()) { + continue; + } const auto& cb = u.circuit_breaker; if (cb.consecutive_failure_threshold < 1 || cb.consecutive_failure_threshold > 10000) { diff --git a/server/http_server.cc b/server/http_server.cc index 8d4008c5..247c5795 100644 --- a/server/http_server.cc +++ b/server/http_server.cc @@ -3570,12 +3570,25 @@ bool HttpServer::Reload(const ServerConfig& new_config) { // Strict gate for hot-reloadable CB fields + duplicate names. // Mirrors main.cc::ReloadConfig — both entry points must reject // invalid CB tuning before it reaches live slices. - try { - ConfigLoader::ValidateHotReloadable(new_config); - } catch (const std::invalid_argument& e) { - logging::Get()->error("Reload() rejected invalid config: {}", - e.what()); - return false; + // + // CB validation is scoped to existing upstream names: only + // those entries get applied via CircuitBreakerManager::Reload, + // so validating CB blocks for new/renamed entries would + // block otherwise-safe reloads. `upstream_configs_` is the + // post-Start snapshot of running upstreams. + { + std::unordered_set live_names; + live_names.reserve(upstream_configs_.size()); + for (const auto& u : upstream_configs_) { + live_names.insert(u.name); + } + try { + ConfigLoader::ValidateHotReloadable(new_config, live_names); + } catch (const std::invalid_argument& e) { + logging::Get()->error("Reload() rejected invalid config: {}", + e.what()); + return false; + } } } diff --git a/server/main.cc b/server/main.cc index f7bac586..e0fa7790 100644 --- a/server/main.cc +++ b/server/main.cc @@ -328,19 +328,34 @@ static bool ReloadConfig(const std::string& config_path, } } } - // Hot-reloadable fields (today: per-upstream `circuit_breaker.*`) - // are the only ones that go LIVE on a SIGHUP reload. Validate - // them strictly — a bad value here would be pushed into running - // slices and keep running until an operator-driven restart fixes - // the config file. 
Hard-reject so operators see the error - // immediately instead of discovering drift the next time the - // startup path rejects the same file. - try { - ConfigLoader::ValidateHotReloadable(new_config); - } catch (const std::invalid_argument& e) { - logging::Get()->error("Config reload rejected: {}", e.what()); - reopen_existing_logs(); - return false; + // Hot-reloadable fields (today: per-upstream `circuit_breaker.*` + // on existing services + duplicate-name uniqueness across the + // new file) are the only ones that go LIVE on a SIGHUP reload. + // Validate them strictly — a bad value here would be pushed into + // running slices and keep running until an operator-driven + // restart fixes the config file. Hard-reject so operators see + // the error immediately instead of discovering drift the next + // time the startup path rejects the same file. + // + // CB validation is scoped to existing upstream names — + // CircuitBreakerManager::Reload only applies CB changes to those. + // New/renamed upstreams are restart-only; their CB blocks are + // skipped here so an intentional placeholder doesn't block other + // live-safe edits in the same reload (log/rate-limit/breaker + // edits on existing services). + { + std::unordered_set live_names; + live_names.reserve(current_config.upstreams.size()); + for (const auto& u : current_config.upstreams) { + live_names.insert(u.name); + } + try { + ConfigLoader::ValidateHotReloadable(new_config, live_names); + } catch (const std::invalid_argument& e) { + logging::Get()->error("Config reload rejected: {}", e.what()); + reopen_existing_logs(); + return false; + } } // Warn about restart-required field issues (not applied during reload). 
From 1e9793e1e8c1ac26a09969072a5f07e923892d2b Mon Sep 17 00:00:00 2001 From: mwfj Date: Wed, 15 Apr 2026 10:58:50 +0800 Subject: [PATCH 31/37] Fix review comment --- server/http_server.cc | 51 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 10 deletions(-) diff --git a/server/http_server.cc b/server/http_server.cc index 247c5795..74e24973 100644 --- a/server/http_server.cc +++ b/server/http_server.cc @@ -458,10 +458,20 @@ void HttpServer::MarkServerReady() { for (size_t i = 0; i < host->partition_count(); ++i) { auto* slice = host->GetSlice(i); if (!slice) continue; + // Capture the slice pointer so the callback can read + // the LIVE `dry_run` flag on every fire — operators + // can toggle dry_run via SIGHUP, and the drain + // decision must reflect the current setting, not a + // snapshot from server startup. Slice lifetime is + // tied to the manager (declared after upstream + // manager → destructs first), so the raw pointer + // outlives every possible callback invocation. + auto* slice_ptr = slice; slice->SetTransitionCallback( - [um, service, i](circuit_breaker::State old_s, - circuit_breaker::State new_s, - const char* /*trigger*/) { + [um, service, i, slice_ptr]( + circuit_breaker::State old_s, + circuit_breaker::State new_s, + const char* /*trigger*/) { // Drain the partition's wait queue whenever // the slice enters OPEN — from CLOSED (fresh // trip) OR from HALF_OPEN (probe cycle re- @@ -486,13 +496,34 @@ void HttpServer::MarkServerReady() { // somehow queued during HALF_OPEN (defense // in depth — TryAcquire normally rejects // non-probes before they reach the pool). 
- if (new_s == circuit_breaker::State::OPEN && - (old_s == circuit_breaker::State::CLOSED || - old_s == circuit_breaker::State::HALF_OPEN)) { - if (auto* part = um->GetPoolPartition( - service, i)) { - part->DrainWaitQueueOnTrip(); - } + if (new_s != circuit_breaker::State::OPEN || + (old_s != circuit_breaker::State::CLOSED && + old_s != circuit_breaker::State::HALF_OPEN)) { + return; + } + // Dry-run honors the shadow-mode contract: + // the slice already log-but-admits + // would-reject decisions, so the wait-queue + // drain — which would deliver hard 503s + // (CHECKOUT_CIRCUIT_OPEN → RESULT_CIRCUIT_OPEN) + // to queued waiters — must also be a no-op. + // Otherwise shadow-mode rollouts can still + // drop queued requests under backpressure, + // defeating the safety of enabling dry_run + // on a live service. Logged at info so + // operators see the trip event without + // the side effect. + if (slice_ptr && slice_ptr->config().dry_run) { + logging::Get()->info( + "[dry-run] circuit breaker would drain " + "wait queue on trip — skipping (shadow " + "mode) service={} partition={}", + service, i); + return; + } + if (auto* part = um->GetPoolPartition( + service, i)) { + part->DrainWaitQueueOnTrip(); } }); } From 2d474bf8a001bd252924bb5befe5356249792fc1 Mon Sep 17 00:00:00 2001 From: mwfj Date: Wed, 15 Apr 2026 11:06:29 +0800 Subject: [PATCH 32/37] Fix review comment --- server/circuit_breaker_slice.cc | 32 ++++++++++++ server/http_server.cc | 89 ++++++++++++++++++--------------- 2 files changed, 81 insertions(+), 40 deletions(-) diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index 1ff6e00e..9dad6a31 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -527,6 +527,10 @@ void CircuitBreakerSlice::Reload(const CircuitBreakerConfig& new_config) { const bool enabled_changed = (config_.enabled != new_config.enabled); const bool window_changed = (config_.window_seconds != new_config.window_seconds); + // 
Snapshot the OLD dry_run before config_ is overwritten — used at + // the end of Reload to detect a true→false flip and signal the + // host to drain any waiters that accumulated during shadow mode. + const bool old_dry_run = config_.dry_run; config_ = new_config; if (window_changed) { @@ -610,6 +614,34 @@ void CircuitBreakerSlice::Reload(const CircuitBreakerConfig& new_config) { new_config.failure_rate_threshold, new_config.consecutive_failure_threshold, enabled_changed ? " (enabled toggled — state reset to CLOSED)" : ""); + + // dry_run true→false on a slice that's STILL OPEN: enforcement is + // back on, but the OPEN→OPEN intra-state config edit doesn't fire + // any natural transition callback. The pool partition may have + // queued waiters from the shadow-mode period (the original + // CLOSED→OPEN drain was skipped because dry_run was true at the + // time). Without flushing them now, those queued requests will + // eventually dispatch to the unhealthy upstream once a pool slot + // frees, defeating the just-re-enabled enforcement. + // + // Signal the host via a synthetic OPEN→OPEN transition callback + // with trigger="dry_run_disabled". The HttpServer-installed + // callback recognizes this special trigger and drains the + // partition queue. Real state transitions never reuse the same + // old/new state with this trigger string, so there's no overlap. + // + // Only fire when we KNOW the state is still OPEN — the + // enabled-toggle branch above resets to CLOSED, in which case the + // drain is unnecessary (no enforcement to re-engage). State is + // dispatcher-thread-only here; a plain load is sufficient. 
+ if (old_dry_run && !new_config.dry_run && + state_.load(std::memory_order_acquire) == State::OPEN && + transition_cb_) { + logging::Get()->info( + "circuit breaker dry_run disabled while OPEN {} — " + "flushing wait queue", host_label_); + transition_cb_(State::OPEN, State::OPEN, "dry_run_disabled"); + } } void CircuitBreakerSlice::SetTransitionCallback(StateTransitionCallback cb) { diff --git a/server/http_server.cc b/server/http_server.cc index 74e24973..4275aad1 100644 --- a/server/http_server.cc +++ b/server/http_server.cc @@ -471,48 +471,57 @@ void HttpServer::MarkServerReady() { [um, service, i, slice_ptr]( circuit_breaker::State old_s, circuit_breaker::State new_s, - const char* /*trigger*/) { - // Drain the partition's wait queue whenever - // the slice enters OPEN — from CLOSED (fresh - // trip) OR from HALF_OPEN (probe cycle re- - // tripped). - // - // CLOSED→OPEN is the classic case: queued - // non-probe waiters need to fail fast with - // CHECKOUT_CIRCUIT_OPEN rather than wait for - // the full open duration. - // - // HALF_OPEN→OPEN (probe_fail) matters - // because probe admissions pass - // ConsultBreaker() BEFORE CheckoutAsync() — - // if the pool was saturated during the - // probe cycle, those admitted probes may - // still be queued when the cycle re-trips. - // Without draining, a saw_failure probe - // cycle can leave the pool with queued - // waiters that still eventually dispatch to - // a known-bad upstream. Draining also - // sweeps any non-probe waiters that - // somehow queued during HALF_OPEN (defense - // in depth — TryAcquire normally rejects - // non-probes before they reach the pool). 
- if (new_s != circuit_breaker::State::OPEN || - (old_s != circuit_breaker::State::CLOSED && - old_s != circuit_breaker::State::HALF_OPEN)) { + const char* trigger) { + // Three drain triggers, all entering OPEN: + // CLOSED→OPEN : fresh trip; queued non- + // probe waiters need CHECKOUT_CIRCUIT_OPEN + // instead of waiting out the full open + // window. + // HALF_OPEN→OPEN : probe cycle re-tripped; + // probe admissions passed ConsultBreaker + // before CheckoutAsync, so saturated + // pools can leave them queued. Without + // draining they eventually dispatch to a + // known-bad upstream. + // OPEN→OPEN with trigger="dry_run_disabled" + // : synthetic signal from + // CircuitBreakerSlice::Reload when + // dry_run flips true→false on a slice + // that's still OPEN. The earlier trip + // skipped the drain (shadow mode); now + // enforcement is back on, queued + // waiters from that period must be + // flushed before the pool services + // them. Real transitions never use this + // trigger string with old==new==OPEN, + // so there's no overlap with normal + // state-machine signals. + const bool normal_trip = + new_s == circuit_breaker::State::OPEN && + (old_s == circuit_breaker::State::CLOSED || + old_s == circuit_breaker::State::HALF_OPEN); + const bool dry_run_disable_drain = + old_s == circuit_breaker::State::OPEN && + new_s == circuit_breaker::State::OPEN && + trigger != nullptr && + std::strcmp(trigger, + "dry_run_disabled") == 0; + if (!normal_trip && !dry_run_disable_drain) { return; } - // Dry-run honors the shadow-mode contract: - // the slice already log-but-admits - // would-reject decisions, so the wait-queue - // drain — which would deliver hard 503s - // (CHECKOUT_CIRCUIT_OPEN → RESULT_CIRCUIT_OPEN) - // to queued waiters — must also be a no-op. - // Otherwise shadow-mode rollouts can still - // drop queued requests under backpressure, - // defeating the safety of enabling dry_run - // on a live service. 
Logged at info so - // operators see the trip event without - // the side effect. + // Dry-run shadow-mode contract: the slice + // log-but-admits would-reject decisions, so + // the wait-queue drain — which would + // deliver hard 503s (CHECKOUT_CIRCUIT_OPEN + // → RESULT_CIRCUIT_OPEN) to queued + // waiters — must also be a no-op while + // dry_run is true. Note: when this fires + // via the dry_run_disabled trigger, the + // slice's config_.dry_run was already + // updated to false in Reload BEFORE the + // synthetic callback, so this guard + // correctly does NOT skip the drain in + // that case. if (slice_ptr && slice_ptr->config().dry_run) { logging::Get()->info( "[dry-run] circuit breaker would drain " From 94958d7924378cf7ee7517d36995a057d7a4ab01 Mon Sep 17 00:00:00 2001 From: mwfj Date: Wed, 15 Apr 2026 11:29:23 +0800 Subject: [PATCH 33/37] Add more circuit break test --- test/circuit_breaker_reload_test.h | 221 +++++++++++++++++++++++++++++ 1 file changed, 221 insertions(+) diff --git a/test/circuit_breaker_reload_test.h b/test/circuit_breaker_reload_test.h index 220c718e..5b63e6b4 100644 --- a/test/circuit_breaker_reload_test.h +++ b/test/circuit_breaker_reload_test.h @@ -359,6 +359,224 @@ void TestReloadDisableThenEnable() { } } +// Regression: a SIGHUP carrying an invalid CB threshold (e.g. +// `consecutive_failure_threshold = 0`) on an EXISTING upstream must +// be hard-rejected. The downgrade-to-warn behavior of the wider +// `Validate()` call would otherwise push the bad value into live +// slices even though startup rejects the same file. +void TestReloadRejectsInvalidCbField() { + std::cout << "\n[TEST] CB Reload: invalid CB tuning is hard-rejected..." 
+ << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + gw.upstreams.push_back( + MakeReloadUpstream("svc", "127.0.0.1", backend_port)); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + + // Build an invalid reload — threshold below the [1, 10000] range. + ServerConfig invalid = gw; + invalid.upstreams[0].circuit_breaker.consecutive_failure_threshold = 0; + + bool reload_returned = gateway.Reload(invalid); + // The slice's threshold must NOT have been pushed live. + auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); + auto* slice = cbm->GetHost("svc")->GetSlice(0); + int live_threshold = slice->config().consecutive_failure_threshold; + + bool pass = reload_returned == false && live_threshold == 3; + TestFramework::RecordTest( + "CB Reload: invalid CB tuning is hard-rejected", pass, + pass ? "" : + "reload_returned=" + std::to_string(reload_returned) + + " live_threshold=" + std::to_string(live_threshold) + + " (expected reload=false, threshold=3 unchanged)"); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Reload: invalid CB tuning is hard-rejected", false, e.what()); + } +} + +// Regression: with `dry_run=true`, the CLOSED→OPEN transition callback +// must NOT drain the partition wait queue (shadow-mode contract: log +// would-reject decisions, admit traffic). The breaker's dry_run check +// inside the transition callback covers this; the regression we lock +// in is the log-emitted breadcrumb plus the absence of CHECKOUT_CIRCUIT_OPEN +// to queued waiters. 
+void TestDryRunDoesNotDrainOnTrip() { + std::cout << "\n[TEST] CB Reload: dry-run skips wait-queue drain on trip..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + UpstreamConfig u = MakeReloadUpstream("svc", "127.0.0.1", backend_port); + u.circuit_breaker.dry_run = true; + u.circuit_breaker.consecutive_failure_threshold = 2; + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + + auto ring = std::make_shared< + spdlog::sinks::ringbuffer_sink_mt>(1024); + auto logger = logging::Get(); + auto prev_level = logger->level(); + logger->set_level(spdlog::level::debug); + logger->sinks().push_back(ring); + struct SinkGuard { + std::shared_ptr logger; + std::shared_ptr ring; + spdlog::level::level_enum prev_level; + ~SinkGuard() { + auto& sinks = logger->sinks(); + sinks.erase(std::remove(sinks.begin(), sinks.end(), + std::shared_ptr(ring)), + sinks.end()); + logger->set_level(prev_level); + } + } guard{logger, ring, prev_level}; + + int gw_port = gw_runner.GetPort(); + // Trip the breaker via 2 failures. + for (int i = 0; i < 2; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + + bool saw_dryrun_drain_skip = false; + for (const auto& msg : ring->last_formatted()) { + if (msg.find("[dry-run] circuit breaker would drain wait queue") != + std::string::npos) { + saw_dryrun_drain_skip = true; + break; + } + } + + TestFramework::RecordTest( + "CB Reload: dry-run skips wait-queue drain on trip", + saw_dryrun_drain_skip, + saw_dryrun_drain_skip ? 
"" : + "expected '[dry-run] circuit breaker would drain wait queue' log line"); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Reload: dry-run skips wait-queue drain on trip", false, e.what()); + } +} + +// Regression: when `dry_run` flips true→false on a slice that's +// currently OPEN, `Slice::Reload` fires a synthetic OPEN→OPEN +// transition with trigger="dry_run_disabled". The HttpServer-installed +// callback recognizes it and drains the partition queue so shadow-mode +// waiters don't leak through to the upstream once enforcement is back on. +void TestDryRunDisableOnOpenTriggersDrainSignal() { + std::cout << "\n[TEST] CB Reload: dry_run disable on OPEN triggers drain..." + << std::endl; + try { + HttpServer backend("127.0.0.1", 0); + backend.Get("/fail", [](const HttpRequest&, HttpResponse& resp) { + resp.Status(502).Body("err", "text/plain"); + }); + TestServerRunner backend_runner(backend); + int backend_port = backend_runner.GetPort(); + + ServerConfig gw; + gw.bind_host = "127.0.0.1"; + gw.bind_port = 0; + gw.worker_threads = 1; + gw.http2.enabled = false; + UpstreamConfig u = MakeReloadUpstream("svc", "127.0.0.1", backend_port); + u.circuit_breaker.dry_run = true; + u.circuit_breaker.consecutive_failure_threshold = 2; + u.circuit_breaker.base_open_duration_ms = 60000; // long open window + gw.upstreams.push_back(u); + + HttpServer gateway(gw); + TestServerRunner gw_runner(gateway); + int gw_port = gw_runner.GetPort(); + + // Trip the breaker (dry-run still records the trip; state goes OPEN). 
+ for (int i = 0; i < 2; ++i) { + TestHttpClient::HttpGet(gw_port, "/fail", 3000); + } + + auto* cbm = gateway.GetUpstreamManager()->GetCircuitBreakerManager(); + auto* slice = cbm->GetHost("svc")->GetSlice(0); + bool was_open = slice->CurrentState() == circuit_breaker::State::OPEN; + + auto ring = std::make_shared< + spdlog::sinks::ringbuffer_sink_mt>(1024); + auto logger = logging::Get(); + auto prev_level = logger->level(); + logger->set_level(spdlog::level::debug); + logger->sinks().push_back(ring); + struct SinkGuard { + std::shared_ptr logger; + std::shared_ptr ring; + spdlog::level::level_enum prev_level; + ~SinkGuard() { + auto& sinks = logger->sinks(); + sinks.erase(std::remove(sinks.begin(), sinks.end(), + std::shared_ptr(ring)), + sinks.end()); + logger->set_level(prev_level); + } + } guard{logger, ring, prev_level}; + + // Reload with dry_run=false, everything else same. + ServerConfig disable_dry = gw; + disable_dry.upstreams[0].circuit_breaker.dry_run = false; + gateway.Reload(disable_dry); + std::this_thread::sleep_for(std::chrono::milliseconds(150)); + + // The synthetic-callback fire path emits a slice-side log line. + bool saw_flush_log = false; + for (const auto& msg : ring->last_formatted()) { + if (msg.find("dry_run disabled while OPEN") != std::string::npos && + msg.find("flushing wait queue") != std::string::npos) { + saw_flush_log = true; + break; + } + } + bool live_dry_run = slice->config().dry_run; + bool still_open = slice->CurrentState() == circuit_breaker::State::OPEN; + + bool pass = was_open && !live_dry_run && saw_flush_log && still_open; + TestFramework::RecordTest( + "CB Reload: dry_run disable on OPEN triggers drain", pass, + pass ? 
"" : + "was_open=" + std::to_string(was_open) + + " live_dry_run=" + std::to_string(live_dry_run) + + " saw_flush_log=" + std::to_string(saw_flush_log) + + " still_open=" + std::to_string(still_open)); + } catch (const std::exception& e) { + TestFramework::RecordTest( + "CB Reload: dry_run disable on OPEN triggers drain", false, + e.what()); + } +} + void RunAllTests() { std::cout << "\n" << std::string(60, '=') << std::endl; std::cout << "CIRCUIT BREAKER - HOT-RELOAD TESTS" << std::endl; @@ -368,6 +586,9 @@ void RunAllTests() { TestCbOnlyReloadNoRestartWarn(); TestTopologyChangeStillEmitsRestartWarn(); TestReloadDisableThenEnable(); + TestReloadRejectsInvalidCbField(); + TestDryRunDoesNotDrainOnTrip(); + TestDryRunDisableOnOpenTriggersDrainSignal(); } } // namespace CircuitBreakerReloadTests From 3aef3c53a3e75a128cc625bb2ed6b0bc07e5627b Mon Sep 17 00:00:00 2001 From: mwfj Date: Wed, 15 Apr 2026 11:58:26 +0800 Subject: [PATCH 34/37] Add more circuit break test --- .../circuit_breaker/circuit_breaker_slice.h | 9 ++- server/circuit_breaker_slice.cc | 64 ++++++++++++------- server/http_server.cc | 21 +++--- 3 files changed, 61 insertions(+), 33 deletions(-) diff --git a/include/circuit_breaker/circuit_breaker_slice.h b/include/circuit_breaker/circuit_breaker_slice.h index 1c96dcd0..d6899bae 100644 --- a/include/circuit_breaker/circuit_breaker_slice.h +++ b/include/circuit_breaker/circuit_breaker_slice.h @@ -246,7 +246,14 @@ class CircuitBreakerSlice { StateTransitionCallback transition_cb_; // Internal transitions (dispatcher-thread). 
- void TripClosedToOpen(const char* trigger); + // `now` is threaded through from ReportFailure so the window_total / + // window_fail_rate fields in the trip log reflect the SAME sliding-window + // view that ShouldTripClosed just saw — a fresh Now() here can cross a + // bucket boundary (especially with window_seconds=1 or under a dispatcher + // stall) and trigger Window::Advance's full-reset, zeroing the bucket that + // holds the failure which actually tripped the breaker. + void TripClosedToOpen(const char* trigger, + std::chrono::steady_clock::time_point now); void TransitionOpenToHalfOpen(); void TransitionHalfOpenToClosed(); void TripHalfOpenToOpen(const char* trigger); diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index 9dad6a31..b24f352a 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -80,8 +80,15 @@ bool CircuitBreakerSlice::ShouldTripClosed( (static_cast(config_.failure_rate_threshold) * total); } -void CircuitBreakerSlice::TripClosedToOpen(const char* trigger) { - auto now = Now(); +void CircuitBreakerSlice::TripClosedToOpen( + const char* trigger, std::chrono::steady_clock::time_point now) { + // `now` is the same time_point the caller (ReportFailure) passed to + // AddFailure/ShouldTripClosed — reusing it keeps the trip log's + // window_total/window_fail_rate consistent with the rate check that + // fired the trip. Calling Now() fresh here would risk crossing a + // bucket boundary and logging window_total=0 for the very failure + // that tripped the breaker. + // // Capture pre-reset observability context BEFORE mutating state. // §11.1 log format asks for consecutive_failures + window_total + // window_fail_rate at the trip event so operators can distinguish a @@ -479,7 +486,9 @@ void CircuitBreakerSlice::ReportFailure(FailureKind kind, bool probe, const char* trigger = (consecutive_failures_ >= config_.consecutive_failure_threshold) ? 
"consecutive" : "rate"; - TripClosedToOpen(trigger); + // Thread `now` through so the trip log's window stats reflect the + // same view ShouldTripClosed just used. + TripClosedToOpen(trigger, now); } } @@ -615,32 +624,41 @@ void CircuitBreakerSlice::Reload(const CircuitBreakerConfig& new_config) { new_config.consecutive_failure_threshold, enabled_changed ? " (enabled toggled — state reset to CLOSED)" : ""); - // dry_run true→false on a slice that's STILL OPEN: enforcement is - // back on, but the OPEN→OPEN intra-state config edit doesn't fire - // any natural transition callback. The pool partition may have - // queued waiters from the shadow-mode period (the original - // CLOSED→OPEN drain was skipped because dry_run was true at the - // time). Without flushing them now, those queued requests will - // eventually dispatch to the unhealthy upstream once a pool slot - // frees, defeating the just-re-enabled enforcement. + // dry_run true→false on a slice that's still rejecting traffic + // (OPEN or HALF_OPEN): enforcement is back on, but the same-state + // intra-config edit doesn't fire any natural transition callback. + // The pool partition may have queued waiters from the shadow-mode + // period — drain reasons per state: + // OPEN: the original CLOSED→OPEN drain was skipped because + // dry_run was true at trip time, so every request that arrived + // during the open window was admitted and may be queued. + // HALF_OPEN: under dry_run the slice log-but-admits both probe- + // budget-exhausted (half_open_full) and saw-failure short- + // circuits (half_open_recovery_failing). Those requests sit in + // the pool wait queue even though enforcement would reject + // them. Without a drain they reach the unhealthy upstream once + // a pool slot frees, defeating re-enabled enforcement. // - // Signal the host via a synthetic OPEN→OPEN transition callback + // Signal the host via a synthetic same-state transition callback // with trigger="dry_run_disabled". 
The HttpServer-installed - // callback recognizes this special trigger and drains the - // partition queue. Real state transitions never reuse the same - // old/new state with this trigger string, so there's no overlap. + // callback recognizes this trigger and drains the partition + // queue. Real state transitions never reuse old==new with this + // trigger string, so there's no overlap with normal signals. // - // Only fire when we KNOW the state is still OPEN — the + // Only fire when we KNOW the state is still rejecting — the // enabled-toggle branch above resets to CLOSED, in which case the // drain is unnecessary (no enforcement to re-engage). State is // dispatcher-thread-only here; a plain load is sufficient. - if (old_dry_run && !new_config.dry_run && - state_.load(std::memory_order_acquire) == State::OPEN && - transition_cb_) { - logging::Get()->info( - "circuit breaker dry_run disabled while OPEN {} — " - "flushing wait queue", host_label_); - transition_cb_(State::OPEN, State::OPEN, "dry_run_disabled"); + if (old_dry_run && !new_config.dry_run && transition_cb_) { + State s = state_.load(std::memory_order_acquire); + if (s == State::OPEN || s == State::HALF_OPEN) { + const char* state_label = + (s == State::OPEN) ? "OPEN" : "HALF_OPEN"; + logging::Get()->info( + "circuit breaker dry_run disabled while {} {} — " + "flushing wait queue", state_label, host_label_); + transition_cb_(s, s, "dry_run_disabled"); + } } } diff --git a/server/http_server.cc b/server/http_server.cc index 4275aad1..e3c423fb 100644 --- a/server/http_server.cc +++ b/server/http_server.cc @@ -483,17 +483,19 @@ void HttpServer::MarkServerReady() { // pools can leave them queued. Without // draining they eventually dispatch to a // known-bad upstream. 
- // OPEN→OPEN with trigger="dry_run_disabled" - // : synthetic signal from + // OPEN→OPEN or HALF_OPEN→HALF_OPEN with + // trigger="dry_run_disabled" : + // synthetic signal from // CircuitBreakerSlice::Reload when // dry_run flips true→false on a slice - // that's still OPEN. The earlier trip - // skipped the drain (shadow mode); now - // enforcement is back on, queued + // that's still rejecting traffic. The + // earlier trip / HALF_OPEN rejects + // skipped enforcement (shadow mode); + // now enforcement is back on, queued // waiters from that period must be // flushed before the pool services - // them. Real transitions never use this - // trigger string with old==new==OPEN, + // them. Real transitions never use + // this trigger string with old==new, // so there's no overlap with normal // state-machine signals. const bool normal_trip = @@ -501,8 +503,9 @@ void HttpServer::MarkServerReady() { (old_s == circuit_breaker::State::CLOSED || old_s == circuit_breaker::State::HALF_OPEN); const bool dry_run_disable_drain = - old_s == circuit_breaker::State::OPEN && - new_s == circuit_breaker::State::OPEN && + old_s == new_s && + (old_s == circuit_breaker::State::OPEN || + old_s == circuit_breaker::State::HALF_OPEN) && trigger != nullptr && std::strcmp(trigger, "dry_run_disabled") == 0; From 7b356806a1631533bab59449ef83a0a0f6ac6a44 Mon Sep 17 00:00:00 2001 From: mwfj Date: Wed, 15 Apr 2026 12:52:28 +0800 Subject: [PATCH 35/37] Add more circuit break test --- server/circuit_breaker_slice.cc | 70 ++++++++++++++++++--------------- server/http_server.cc | 25 ++++++------ server/proxy_transaction.cc | 31 ++++++++++----- 3 files changed, 73 insertions(+), 53 deletions(-) diff --git a/server/circuit_breaker_slice.cc b/server/circuit_breaker_slice.cc index b24f352a..e6bd1c93 100644 --- a/server/circuit_breaker_slice.cc +++ b/server/circuit_breaker_slice.cc @@ -624,41 +624,47 @@ void CircuitBreakerSlice::Reload(const CircuitBreakerConfig& new_config) { 
new_config.consecutive_failure_threshold, enabled_changed ? " (enabled toggled — state reset to CLOSED)" : ""); - // dry_run true→false on a slice that's still rejecting traffic - // (OPEN or HALF_OPEN): enforcement is back on, but the same-state - // intra-config edit doesn't fire any natural transition callback. - // The pool partition may have queued waiters from the shadow-mode - // period — drain reasons per state: - // OPEN: the original CLOSED→OPEN drain was skipped because - // dry_run was true at trip time, so every request that arrived - // during the open window was admitted and may be queued. - // HALF_OPEN: under dry_run the slice log-but-admits both probe- - // budget-exhausted (half_open_full) and saw-failure short- - // circuits (half_open_recovery_failing). Those requests sit in - // the pool wait queue even though enforcement would reject - // them. Without a drain they reach the unhealthy upstream once - // a pool slot frees, defeating re-enabled enforcement. + // dry_run true→false on a slice that's STILL OPEN: enforcement is + // back on, but the OPEN→OPEN intra-state config edit doesn't fire + // any natural transition callback. The pool partition may have + // queued waiters from the shadow-mode period (the original + // CLOSED→OPEN drain was skipped because dry_run was true at the + // time). Without flushing them now, those queued requests will + // eventually dispatch to the unhealthy upstream once a pool slot + // frees, defeating the just-re-enabled enforcement. // - // Signal the host via a synthetic same-state transition callback + // Signal the host via a synthetic OPEN→OPEN transition callback // with trigger="dry_run_disabled". The HttpServer-installed - // callback recognizes this trigger and drains the partition - // queue. Real state transitions never reuse old==new with this - // trigger string, so there's no overlap with normal signals. + // callback recognizes this special trigger and drains the + // partition queue. 
Real state transitions never reuse the same + // old/new state with this trigger string, so there's no overlap. // - // Only fire when we KNOW the state is still rejecting — the - // enabled-toggle branch above resets to CLOSED, in which case the - // drain is unnecessary (no enforcement to re-engage). State is - // dispatcher-thread-only here; a plain load is sufficient. - if (old_dry_run && !new_config.dry_run && transition_cb_) { - State s = state_.load(std::memory_order_acquire); - if (s == State::OPEN || s == State::HALF_OPEN) { - const char* state_label = - (s == State::OPEN) ? "OPEN" : "HALF_OPEN"; - logging::Get()->info( - "circuit breaker dry_run disabled while {} {} — " - "flushing wait queue", state_label, host_label_); - transition_cb_(s, s, "dry_run_disabled"); - } + // IMPORTANT — why this does NOT fire in HALF_OPEN: HALF_OPEN + // queues can mix two admission kinds that share a partition wait + // slot but differ on slice bookkeeping: + // (a) Valid probes admitted within permitted_half_open_calls — + // admission_generation_ = current halfopen_gen_, holding a + // real half_open_inflight_/admitted_ slot. These drive + // recovery on a healthy upstream and must NOT be disrupted + // by an operator config flip. + // (b) Dry-run-admitted shadow requests (half_open_full / + // half_open_recovery_failing paths) — admission_generation_ + // = 0 (RejectWithLog sentinel). Their outcomes drop as + // stale-gen on report, so they never influence the slice's + // state machine and are bounded by pool queue size. + // DrainWaitQueueOnTrip is partition-wide and can't tell (a) from + // (b); draining would 503 valid probes (delaying/preventing + // recovery) to also drop the harmless (b). We accept the small + // bounded leak of (b) as the lesser evil. + // + // State is dispatcher-thread-only here; a plain load is sufficient. 
+ if (old_dry_run && !new_config.dry_run && + state_.load(std::memory_order_acquire) == State::OPEN && + transition_cb_) { + logging::Get()->info( + "circuit breaker dry_run disabled while OPEN {} — " + "flushing wait queue", host_label_); + transition_cb_(State::OPEN, State::OPEN, "dry_run_disabled"); } } diff --git a/server/http_server.cc b/server/http_server.cc index e3c423fb..67575de7 100644 --- a/server/http_server.cc +++ b/server/http_server.cc @@ -483,29 +483,30 @@ void HttpServer::MarkServerReady() { // pools can leave them queued. Without // draining they eventually dispatch to a // known-bad upstream. - // OPEN→OPEN or HALF_OPEN→HALF_OPEN with - // trigger="dry_run_disabled" : - // synthetic signal from + // OPEN→OPEN with trigger="dry_run_disabled" + // : synthetic signal from // CircuitBreakerSlice::Reload when // dry_run flips true→false on a slice - // that's still rejecting traffic. The - // earlier trip / HALF_OPEN rejects - // skipped enforcement (shadow mode); - // now enforcement is back on, queued + // that's still OPEN. The earlier trip + // skipped the drain (shadow mode); now + // enforcement is back on, queued // waiters from that period must be // flushed before the pool services - // them. Real transitions never use - // this trigger string with old==new, + // them. Real transitions never use this + // trigger string with old==new==OPEN, // so there's no overlap with normal // state-machine signals. + // (The slice intentionally does NOT + // fire this signal in HALF_OPEN — see + // CircuitBreakerSlice::Reload for why + // valid probes must not be flushed.) 
const bool normal_trip = new_s == circuit_breaker::State::OPEN && (old_s == circuit_breaker::State::CLOSED || old_s == circuit_breaker::State::HALF_OPEN); const bool dry_run_disable_drain = - old_s == new_s && - (old_s == circuit_breaker::State::OPEN || - old_s == circuit_breaker::State::HALF_OPEN) && + old_s == circuit_breaker::State::OPEN && + new_s == circuit_breaker::State::OPEN && trigger != nullptr && std::strcmp(trigger, "dry_run_disabled") == 0; diff --git a/server/proxy_transaction.cc b/server/proxy_transaction.cc index 0e1e5a6f..d3e8bd82 100644 --- a/server/proxy_transaction.cc +++ b/server/proxy_transaction.cc @@ -354,20 +354,33 @@ void ProxyTransaction::OnCheckoutError(int error_code) { static constexpr int CIRCUIT_OPEN = -6; if (error_code == CIRCUIT_OPEN) { - // Drain path: breaker tripped while this transaction was queued - // Do NOT Report to the slice — - // our own reject must not feed back into the failure math. Emit - // the §12.1 circuit-open response directly. + // Drain path: breaker tripped while this transaction was queued. + // Do NOT Report success/failure to the slice — our own reject + // must not feed back into the failure math. Emit the §12.1 + // circuit-open response directly. logging::Get()->info( "ProxyTransaction checkout drained by circuit breaker " "client_fd={} service={}", client_fd_, service_name_); + // Neutral-release the slice admission instead of just clearing + // admission_generation_. Three drain paths reach here: + // CLOSED→OPEN : closed_gen_ was bumped by the trip; our + // generation is now stale → ReportNeutral + // drops as stale-gen. No state mutation. Safe. + // HALF_OPEN→OPEN : halfopen_gen_ was bumped by the trip AND + // half_open_inflight_/admitted_ reset to 0 by + // TransitionOpenToHalfOpen's sibling path → + // ReportNeutral drops as stale-gen. Safe. 
+ // (Any future same-cycle drain without a generation bump): + // admission_generation_ is still current → + // ReportNeutral correctly returns the slot, + // preventing half_open_inflight_/admitted_ + // from leaking and wedging the slice in + // half_open_full until the next reset. + // ReleaseBreakerAdmissionNeutral clears admission_generation_ + // internally, so Cleanup/destructor won't double-report. + ReleaseBreakerAdmissionNeutral(); DeliverResponse(MakeCircuitOpenResponse()); - // Clear admission_generation_ so Cleanup / destructor doesn't - // double-report. The admission was already fire-and-forget — - // slice-side bookkeeping is intact (the drain itself doesn't - // touch inflight counters because the breaker didn't admit). - admission_generation_ = 0; return; } From f54033c066e55b7172074ea23269da049b51e4c4 Mon Sep 17 00:00:00 2001 From: mwfj Date: Wed, 15 Apr 2026 13:30:38 +0800 Subject: [PATCH 36/37] Add more circuit break test --- server/circuit_breaker_host.cc | 49 ++++++++++++++++++++++++++++++++-- server/http_server.cc | 31 ++++++++++++++++++++- 2 files changed, 77 insertions(+), 3 deletions(-) diff --git a/server/circuit_breaker_host.cc b/server/circuit_breaker_host.cc index 4523d3be..4e2640ae 100644 --- a/server/circuit_breaker_host.cc +++ b/server/circuit_breaker_host.cc @@ -2,6 +2,8 @@ #include "dispatcher.h" #include "log/logger.h" +#include + namespace circuit_breaker { CircuitBreakerHost::CircuitBreakerHost(std::string service_name, @@ -104,11 +106,23 @@ void CircuitBreakerHost::Reload( retry_budget_->Reload(new_config.retry_budget_percent, new_config.retry_budget_min_concurrency); - // Enqueue per-slice Reload on each owning dispatcher. The slice is + // Apply per-slice Reload on each owning dispatcher. The slice is // dispatcher-thread-local for mutation, so the config swap must // happen there. 
Passing slice as raw pointer is safe: slices_ is // owned by `this` (the host), which outlives the manager's reload // (enforced by CircuitBreakerManager's lifetime). + // + // Synchronize: wait for every enqueued slice Reload to actually run + // before returning. Without this, HttpServer::Reload could return + // "success" while requests already queued on a dispatcher still run + // with the OLD enabled/dry_run/thresholds — a SIGHUP flipping a + // tripped breaker to disabled (or to dry_run) could still emit hard + // 503s or enforce the old retry budget for a brief window after the + // operator sees reload-ok. Dispatcher-local inline on the current + // thread avoids self-deadlock if Reload is ever called from a + // dispatcher thread. + std::vector> pending; + pending.reserve(slices_.size()); for (size_t i = 0; i < slices_.size(); ++i) { CircuitBreakerSlice* slice = slices_[i].get(); auto& dispatcher = dispatchers[i]; @@ -118,11 +132,42 @@ void CircuitBreakerHost::Reload( service_name_, host_, i); continue; } - dispatcher->EnQueue([slice, new_config]() { + if (dispatcher->is_on_loop_thread()) { + // Caller IS this dispatcher — apply inline to preserve + // dispatcher-thread-local invariant without self-enqueueing + // (which would only run after this frame returns, defeating + // the sync contract). No future to wait on for this slice. + slice->Reload(new_config); + continue; + } + auto promise = std::make_shared>(); + pending.push_back(promise->get_future()); + dispatcher->EnQueue([slice, new_config, promise]() { slice->Reload(new_config); + promise->set_value(); }); } + // Bounded wait: slice Reload is trivial (config copy + optional + // synthetic transition callback), so each dispatcher only needs one + // event-loop iteration to drain. 
A 2s ceiling protects callers from + // a stalled / stopping dispatcher — if the wait times out we log and + // proceed; the remaining slice(s) will pick up the new config when + // the queued task eventually runs (via the shared_ptr-captured + // new_config copy), so we never lose an edit — just delay its visibility. + const auto deadline = + std::chrono::steady_clock::now() + std::chrono::seconds(2); + for (auto& fut : pending) { + if (fut.wait_until(deadline) != std::future_status::ready) { + logging::Get()->warn( + "CircuitBreakerHost::Reload({}:{}) timed out waiting for " + "slice apply — new config will be applied when the " + "dispatcher drains", service_name_, host_); + break; // No benefit to waiting out the remaining futures + // after the first timeout — they share the deadline. + } + } + // Save the new config for future Snapshot() / construction-like // operations. Other threads never read config_ directly. config_ = new_config; diff --git a/server/http_server.cc b/server/http_server.cc index 67575de7..f9a36bf5 100644 --- a/server/http_server.cc +++ b/server/http_server.cc @@ -3853,7 +3853,36 @@ bool HttpServer::Reload(const ServerConfig& new_config) { // When topology MATCHES (the common case, including CB-only // edits), adopt the new snapshot as the fresh baseline so CB- // field edits persist for later reload diffs. - if (new_config.upstreams != upstream_configs_) { + // + // Compare as name-keyed maps rather than vectors: live pools and + // CircuitBreakerManager are both keyed by upstream name, so a pure + // reorder of otherwise-identical entries is NOT a topology change. + // Vector equality would fire a spurious "restart required" warning + // and skip the upstream_configs_ update, leaving every subsequent + // breaker-only reload on that reordered file forever looking like a + // topology change. 
UpstreamConfig::operator== already excludes the + // live-reloadable `circuit_breaker` field, so map equality reflects + // the true restart-vs-live partition. Duplicate names were rejected + // upstream by ValidateHotReloadable, so the map conversion is + // lossless here. + auto by_name = [](const std::vector& v) { + std::map m; + for (const auto& u : v) m[u.name] = &u; + return m; + }; + const auto old_map = by_name(upstream_configs_); + const auto new_map = by_name(new_config.upstreams); + bool topology_match = old_map.size() == new_map.size(); + if (topology_match) { + for (const auto& entry : old_map) { + auto it = new_map.find(entry.first); + if (it == new_map.end() || *entry.second != *it->second) { + topology_match = false; + break; + } + } + } + if (!topology_match) { logging::Get()->warn("Reload: upstream topology changes require a " "restart to take effect (circuit-breaker " "field edits, if any, were applied live)"); From 79f91ed95cbb25c098d3ccdcf51ee25c4eeb9ee6 Mon Sep 17 00:00:00 2001 From: mwfj Date: Wed, 15 Apr 2026 15:43:11 +0800 Subject: [PATCH 37/37] Fix review comment --- docs/architecture.md | 2 +- docs/circuit_breaker.md | 2 -- server/proxy_transaction.cc | 3 +-- test/circuit_breaker_test.h | 33 ++++++++++++++++----------------- test/route_test.h | 2 +- 5 files changed, 19 insertions(+), 23 deletions(-) diff --git a/docs/architecture.md b/docs/architecture.md index 5f537bcf..8f990e8e 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -126,7 +126,7 @@ HttpServer - **Synchronous LRU eviction on insert** — `FindOrCreate` evicts LRU tail before creating a new entry if the shard is at capacity, guaranteeing `max_entries` is honored even under high-cardinality bursts - **Disable-first / enable-last reload ordering** — ensures no request can observe `enabled=true` with the previous (stale) zone list during a `(false,[])→(true,[Z])` transition -See `docs/configuration.md` for the full config reference and 
`.claude/documents/features/RATE_LIMITING.md` for implementation internals. +See `docs/configuration.md` for the full config reference. ## Memory Management diff --git a/docs/circuit_breaker.md b/docs/circuit_breaker.md index ef3a5ef0..64743d77 100644 --- a/docs/circuit_breaker.md +++ b/docs/circuit_breaker.md @@ -145,5 +145,3 @@ Topology edits (`host`, `port`, `pool.*`, `proxy.*`, `tls.*`) still require a re - **Generation tokens.** Every admission is stamped with a per-domain generation counter (`closed_gen_` or `halfopen_gen_`, depending on state). `Report*` drops stale-generation completions so pre-transition requests can't pollute a fresh cycle. Window resizes bump only `closed_gen_` so in-flight probes aren't stranded. - **Retry budget CAS.** `TryConsumeRetry` uses `compare_exchange_weak` to serialize concurrent retry admissions. A plain load-check-add would let N callers all observe `current < cap` and all increment past the cap. - **Non-retry denominator.** The budget base is `in_flight - retries_in_flight`, not raw `in_flight`. Retries count in both terms but subtract out here so admitting a retry doesn't inflate its own cap. - -For the full design document (motivations, trade-offs, failure modes, revision history, test strategy), see [.claude/documents/design/CIRCUIT_BREAKER_DESIGN.md](../.claude/documents/design/CIRCUIT_BREAKER_DESIGN.md). diff --git a/server/proxy_transaction.cc b/server/proxy_transaction.cc index d3e8bd82..a427f629 100644 --- a/server/proxy_transaction.cc +++ b/server/proxy_transaction.cc @@ -933,8 +933,7 @@ void ProxyTransaction::Cancel() { // mid-request, counting that as an upstream-health failure would // trip the breaker against a backend that may be perfectly healthy // (browser cancels, user-initiated timeouts, etc. are all common - // causes). The reviewer guidance is explicit: client-initiated - // aborts must be neutral from the breaker's perspective. + // causes). 
Client-initiated aborts must be neutral from the breaker's perspective. // // Trade-off: in HALF_OPEN, ReportNeutral on a probe decrements // both inflight and admitted, so a cancelled probe makes the slot diff --git a/test/circuit_breaker_test.h b/test/circuit_breaker_test.h index bed54da0..b5b9da95 100644 --- a/test/circuit_breaker_test.h +++ b/test/circuit_breaker_test.h @@ -579,7 +579,7 @@ void TestSuccessClearsConsecutive() { } // ============================================================================ -// Regression tests — critical bugs caught in code review +// Regression tests // ============================================================================ // BUG: late non-probe failure after trip re-entered TripClosedToOpen, inflating @@ -775,7 +775,7 @@ void TestHalfOpenFullCounterSeparate() { } } -// BUG (review round 2, P2): Reload preserved stale state across enabled +// Reload preserved stale state across enabled // toggles. Disabling while OPEN and re-enabling later resumed the OPEN state, // rejecting requests despite an explicit operator off→on cycle. Disabling // after accumulated consecutive failures would re-trip on the very next @@ -830,8 +830,7 @@ void TestReloadResetsStateOnEnabledToggleWhileOpen() { } } -// BUG (review round 2, P2, variant): if disable happens while -// consecutive_failures_ has accumulated but not yet tripped, re-enable would +// If disable happens while consecutive_failures_ has accumulated but not yet tripped, re-enable would // inherit that count and trip early on the next failure. void TestReloadResetsConsecutiveFailuresOnEnabledToggle() { std::cout << "\n[TEST] CB: reload clears consecutive_failures on enable toggle..." 
@@ -909,7 +908,7 @@ void TestReloadThresholdChangePreservesState() { } } -// BUG (review round 2, P3): saw_failure short-circuit incorrectly bumped the +// saw_failure short-circuit incorrectly bumped the // HALF_OPEN_FULL counter, polluting dashboards that need to distinguish // "probing, no capacity left" from "recovery attempt is failing". void TestSawFailureDoesNotBumpHalfOpenFullCounter() { @@ -956,7 +955,7 @@ void TestSawFailureDoesNotBumpHalfOpenFullCounter() { } } -// BUG (review round 3, P2): TransitionOpenToHalfOpen deliberately left +// TransitionOpenToHalfOpen deliberately left // `open_until_steady_ns_` populated, violating the documented OpenUntil() // contract ("zero when not OPEN"). A consumer computing Retry-After // from a HALF_OPEN slice would compute (stale_deadline - now), which is @@ -1002,7 +1001,7 @@ void TestOpenUntilZeroWhenHalfOpen() { } } -// BUG (review round 3, P1): Reload reset the state on enabled toggle but +// Reload reset the state on enabled toggle but // gave Report* no way to distinguish pre-toggle admissions from post-toggle // ones. Stale completions then polluted the fresh CLOSED cycle. Fixed with // a generation token captured at admission and checked at report. @@ -1122,7 +1121,7 @@ void TestStaleGenerationReportsDroppedAcrossStateTransitions() { } } -// BUG (review round 4, P2): Reload that resizes the rolling window without +// Reload that resizes the rolling window without // toggling enabled cleared the window buckets but left generation_ unchanged. // Late reports from pre-reload admissions would carry the still-current // generation, pass the guard, and re-populate the freshly empty window — @@ -1239,7 +1238,7 @@ void TestThresholdOnlyReloadDoesNotAdvanceGeneration() { } } -// BUG (review round 5, P1): Reload with window_seconds change while the +// Reload with window_seconds change while the // slice is HALF_OPEN used to bump the single `generation_`, invalidating // every in-flight probe. 
Those probes' late Report* calls then dropped // WITHOUT decrementing half_open_inflight_, wedging the slice in HALF_OPEN @@ -1377,7 +1376,7 @@ void TestWindowResizeStillInvalidatesClosedAdmissions() { } } -// BUG (review round 7, P2): Reload() lowering permitted_half_open_calls +// Reload() lowering permitted_half_open_calls // while a HALF_OPEN cycle is active could close the breaker early and // discard failures from already-admitted probes. // @@ -1463,7 +1462,7 @@ void TestHalfOpenBudgetFrozenAcrossReload() { } } -// BUG (review round 6, P2): Reload with window_seconds change preserved +// Reload with window_seconds change preserved // consecutive_failures_ while bumping closed_gen_. Pre-reload CLOSED // reports are correctly blocked (stale gen), but they can no longer // clear or advance consecutive_failures_ either. The counter becomes an @@ -1538,7 +1537,7 @@ void TestWindowResizeResetConsecutiveFailures() { } } -// BUG (review round 9, P2-1): ReportFailure captured Now() separately in +// ReportFailure captured Now() separately in // AddFailure() and ShouldTripClosed()'s internal TotalCount/FailureCount // calls. If a second boundary elapsed between the two calls, Advance() could // wipe the just-recorded failure — with window_seconds=1, the 1-second delta @@ -1602,7 +1601,7 @@ void TestReportFailureUsesOneTimestampAcrossTripEval() { } } -// BUG (review round 8, P2): CircuitBreakerWindow's constructor allocated +// CircuitBreakerWindow's constructor allocated // `max(1, window_seconds)` buckets but stored the RAW window_seconds_ value. // Programmatic callers bypassing ConfigLoader::Validate() (tests, future // direct users) that passed window_seconds <= 0 would trigger BucketIndex's @@ -1639,7 +1638,7 @@ void TestWindowNonPositiveWindowSizeClamp() { } } -// BUG (review round 9, P3): CircuitBreakerSlice copied permitted_half_open_calls +// CircuitBreakerSlice copied permitted_half_open_calls // into the HALF_OPEN snapshot verbatim. 
For programmatic callers bypassing // ConfigLoader::Validate() (same class as the window ctor clamp), a zero or // negative budget would permanently wedge the breaker in HALF_OPEN: @@ -1703,7 +1702,7 @@ void TestHalfOpenClampsNonPositiveProbeBudget() { } } -// BUG (review round 10, P1): TryAcquire gated HALF_OPEN admission on +// TryAcquire gated HALF_OPEN admission on // half_open_inflight_, so a probe slot was reused once an earlier probe // completed. With permitted_half_open_calls=2: // @@ -1788,7 +1787,7 @@ void TestHalfOpenDoesNotReuseProbeSlots() { } } -// BUG (review round 11, P1): Admission contract has ReportSuccess and +// Admission contract has ReportSuccess and // ReportFailure but no path for probes that complete without touching the // upstream (POOL_EXHAUSTED after probe admission, shutdown, client // disconnect, PARSE_ERROR). Following the §7 "don't report these as @@ -1921,7 +1920,7 @@ void TestReportNeutralLastProbeAfterFailureReTrips() { } } -// BUG (review round 12, P2): ComputeOpenDuration read base/max durations +// ComputeOpenDuration read base/max durations // straight from config_, so a programmatic caller bypassing // ConfigLoader::Validate() with base_open_duration_ms <= 0 or max < base // would compute scaled_ms <= 0. open_until = now + 0 → next TryAcquire diff --git a/test/route_test.h b/test/route_test.h index c7b86aa4..31cdfe7d 100644 --- a/test/route_test.h +++ b/test/route_test.h @@ -1571,7 +1571,7 @@ void TestRouterProxyCompanionYieldsForMarkedMethod() { } } -// P2 (latest review): per-pattern paired_with_get. When a proxy +// Per-pattern paired_with_get. When a proxy // registers both a companion pattern and a catch-all pattern, the // per-(method,pattern) async-conflict filter may drop GET on ONE // pattern while keeping it on the OTHER. MarkProxyDefaultHead must