Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/reference.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ DeployBot resolves the pull request for the current branch.
| `deploybot status [--json]` | Read active thread metadata, unbound open PRs, pending native notifications, PR stages, exact-head deploy intent, pre-queue intent overlaps, request-stage timing alerts, queue state, exact-main CI, deployment, and pipeline control state. |
| `deploybot plan [--json]` | Read the ordered queue, dependencies, blockers, and source-overlap groups. |
| `deploybot inspect [PR] [--json]` | Evaluate one exact PR head without granting merge authority. |
| `deploybot metrics [--limit N] [--json]` | Summarize p50, p95, and maximum delivery timings for recent merged PRs. The default limit is 25. |
| `deploybot metrics [--limit N] [--json]` | Summarize p50, p95, and maximum delivery timings for recent merged PRs, each stage's attainment against the configured speed targets, and the first-pass merge rate (deliveries that merged without a recorded repair handoff). The default limit is 25. |

`status`, `plan`, `inspect`, `doctor`, and `metrics` are read-only. In
particular, do not use `freeze` as a status command because it writes a durable
Expand Down
54 changes: 49 additions & 5 deletions src/agent_merge_queue/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5869,12 +5869,15 @@ def delivery_metrics(client: GitHub, *, limit: int) -> dict[str, Any]:
comments_by_number = client.comments_for_pull_requests(
int(pull["number"]) for pull in pulls
)
repair_logins = coordinator_logins(client) | set(client.trusted_logins)

def sample(pull: dict[str, Any]) -> dict[str, Any]:
number = int(pull["number"])
comments = comments_by_number.get(number, [])
intent = latest_intent(comments, client.trusted_logins)
marker = queue_marker_for_client(client, comments)
repair = latest_payload(comments, REPAIR_MARKER, repair_logins)
repaired = bool(repair) and not repair_marker_is_transitional(repair)
merged_at = str(pull.get("merged_at") or "") or None
merge_sha = str(pull.get("merge_commit_sha") or "")
live_at: str | None = None
Expand All @@ -5897,6 +5900,7 @@ def sample(pull: dict[str, Any]) -> dict[str, Any]:
"queued_at": queued_at,
"merged_at": merged_at,
"live_at": live_at,
"repaired": repaired,
"request_to_queue_seconds": seconds_between(requested_at, queued_at),
"queue_to_merge_seconds": seconds_between(queued_at, merged_at),
"merge_to_live_seconds": seconds_between(merged_at, live_at),
Expand All @@ -5910,7 +5914,27 @@ def sample(pull: dict[str, Any]) -> dict[str, Any]:
max_workers=min(client.config.pipeline.promotion_workers, len(pulls))
) as executor:
samples = list(executor.map(sample, pulls))
return summarize_metrics(samples)
# Compare each stage against the operator's own speed targets so the
# historical view answers "are PRs going out the door quickly enough?",
# mirroring how `status` applies these same targets to live work.
ready_target = client.config.pipeline.ready_to_merge_target_minutes * 60
live_target = client.config.pipeline.merge_to_live_target_minutes * 60
targets = {
"queue_to_merge_seconds": ready_target,
"merge_to_live_seconds": live_target,
"request_to_live_seconds": ready_target + live_target,
}
summary = summarize_metrics(samples, targets=targets)
repaired_merges = sum(1 for value in samples if value.get("repaired"))
summary["reliability"] = {
"merged_samples": len(samples),
"first_pass_merges": len(samples) - repaired_merges,
"repaired_merges": repaired_merges,
"first_pass_rate": (
(len(samples) - repaired_merges) / len(samples) if samples else None
),
}
return summary


def print_doctor(rows: list[dict[str, Any]], *, json_output: bool) -> None:
Expand Down Expand Up @@ -6164,12 +6188,32 @@ def main(argv: list[str] | None = None) -> int:
print(json.dumps(value, indent=2, sort_keys=True))
else:
print(f"delivery samples: {value['sample_count']}")
reliability = value.get("reliability") or {}
if reliability.get("merged_samples"):
rate = reliability.get("first_pass_rate")
rate_text = f"{rate * 100:.0f}%" if rate is not None else "n/a"
print(
"first-pass merges: "
f"{reliability['first_pass_merges']}/"
f"{reliability['merged_samples']} ({rate_text})"
)
for name, summary in value.items():
if name.endswith("_seconds"):
print(
f"{name}: p50={summary['p50']}s "
f"p95={summary['p95']}s max={summary['max']}s"
if not name.endswith("_seconds"):
continue
line = (
f"{name}: p50={summary['p50']}s "
f"p95={summary['p95']}s max={summary['max']}s"
)
if summary.get("target_seconds") is not None:
within = summary.get("within_target_rate")
within_text = (
f"{within * 100:.0f}%" if within is not None else "n/a"
)
line += (
f" target={summary['target_seconds']}s "
f"within={within_text}"
)
print(line)
elif arguments.command == "block":
command_block(client, arguments.pr, arguments.reason)
elif arguments.command == "unblock":
Expand Down
18 changes: 16 additions & 2 deletions src/agent_merge_queue/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,24 +183,38 @@ def percentile(values: Iterable[float], fraction: float) -> float | None:
return ordered[index]


def summarize_metrics(samples: list[dict[str, Any]]) -> dict[str, Any]:
def summarize_metrics(
samples: list[dict[str, Any]],
*,
targets: dict[str, float] | None = None,
) -> dict[str, Any]:
stages = (
"request_to_queue_seconds",
"queue_to_merge_seconds",
"merge_to_live_seconds",
"request_to_live_seconds",
)
targets = targets or {}
summary: dict[str, Any] = {"sample_count": len(samples), "samples": samples}
for stage in stages:
values = [
float(sample[stage]) for sample in samples if sample.get(stage) is not None
]
summary[stage] = {
stage_summary: dict[str, Any] = {
"count": len(values),
"p50": percentile(values, 0.50),
"p95": percentile(values, 0.95),
"max": max(values) if values else None,
}
target = targets.get(stage)
if target is not None:
within = sum(1 for value in values if value <= target)
stage_summary["target_seconds"] = target
stage_summary["within_target"] = within
stage_summary["within_target_rate"] = (
within / len(values) if values else None
)
summary[stage] = stage_summary
return summary


Expand Down
103 changes: 103 additions & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3902,6 +3902,109 @@ def test_metrics_skip_deployments_that_finished_before_the_merge(self) -> None:
datetime(2026, 6, 20, 1, tzinfo=timezone.utc),
)

def test_metrics_report_target_attainment_and_first_pass_rate(self) -> None:
clean_merge = "c" * 40
repaired_merge = "f" * 40
clean_deploy = "d" * 40
repaired_deploy = "e" * 40
head_42 = "a" * 40
head_43 = "b" * 40
client = Mock()
client.config = CONFIG
client.repository = "example/repo"
client.trusted_logins = {"trusted"}
client.coordinator_logins = {"trusted"}
client.successful_workflow_runs.return_value = [
{
"name": "Deploy",
"conclusion": "success",
"head_sha": clean_deploy,
"created_at": "2026-06-20T00:09:00Z",
"updated_at": "2026-06-20T00:09:00Z",
},
{
"name": "Deploy",
"conclusion": "success",
"head_sha": repaired_deploy,
"created_at": "2026-06-20T00:25:00Z",
"updated_at": "2026-06-20T00:25:00Z",
},
]
client.recent_merged_pull_requests.return_value = [
{
"number": 42,
"merged_at": "2026-06-20T00:05:00Z",
"merge_commit_sha": clean_merge,
},
{
"number": 43,
"merged_at": "2026-06-20T00:05:00Z",
"merge_commit_sha": repaired_merge,
},
]

def comment(login: str, body: str, created_at: str) -> dict[str, Any]:
return {"user": {"login": login}, "body": body, "created_at": created_at}

def intent_comment(head: str) -> dict[str, Any]:
return comment(
"trusted",
intent_body(
intent_id=f"intent-{head[:1]}",
state="requested",
requested_at="2026-06-20T00:00:00Z",
requested_head=head,
),
"2026-06-20T00:00:00Z",
)

def queue_comment(head: str) -> dict[str, Any]:
return comment(
"trusted",
queue_state_body("queued", head, queued_at="2026-06-20T00:01:00Z"),
"2026-06-20T00:01:00Z",
)

client.comments_for_pull_requests.return_value = {
42: [intent_comment(head_42), queue_comment(head_42)],
43: [
intent_comment(head_43),
queue_comment(head_43),
comment(
"trusted",
repair_body({"head_sha": head_43, "reason": "CI failed"}),
"2026-06-20T00:02:00Z",
),
],
}

def is_ancestor(merge_sha: str, deployed_sha: str) -> bool:
return (merge_sha, deployed_sha) in {
(clean_merge, clean_deploy),
(repaired_merge, repaired_deploy),
}

client.is_ancestor.side_effect = is_ancestor

result = delivery_metrics(client, limit=25)

self.assertEqual(result["sample_count"], 2)
self.assertEqual(result["reliability"]["first_pass_merges"], 1)
self.assertEqual(result["reliability"]["repaired_merges"], 1)
self.assertEqual(result["reliability"]["first_pass_rate"], 0.5)
merge_to_live = result["merge_to_live_seconds"]
self.assertEqual(merge_to_live["target_seconds"], 600)
self.assertEqual(merge_to_live["within_target"], 1)
self.assertEqual(merge_to_live["within_target_rate"], 0.5)
request_to_live = result["request_to_live_seconds"]
self.assertEqual(request_to_live["target_seconds"], 1500)
self.assertEqual(request_to_live["within_target"], 2)
self.assertNotIn("target_seconds", result["request_to_queue_seconds"])
repaired = {
value["pull_request"]: value["repaired"] for value in result["samples"]
}
self.assertEqual(repaired, {42: False, 43: True})

def test_integration_seal_round_trips_through_git_commit(self) -> None:
parent_sha = "p" * 40
seal_sha = "s" * 40
Expand Down
22 changes: 22 additions & 0 deletions tests/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,28 @@ def test_metrics_report_p50_and_p95(self) -> None:
self.assertEqual(summary["request_to_queue_seconds"]["p50"], 1)
self.assertEqual(summary["request_to_queue_seconds"]["p95"], 3)

def test_metrics_report_target_attainment(self) -> None:
summary = summarize_metrics(
[
{"merge_to_live_seconds": 100},
{"merge_to_live_seconds": 700},
],
targets={"merge_to_live_seconds": 600},
)
stage = summary["merge_to_live_seconds"]
self.assertEqual(stage["target_seconds"], 600)
self.assertEqual(stage["within_target"], 1)
self.assertEqual(stage["within_target_rate"], 0.5)

def test_metrics_omit_attainment_without_targets(self) -> None:
summary = summarize_metrics([{"merge_to_live_seconds": 100}])
self.assertNotIn("target_seconds", summary["merge_to_live_seconds"])
self.assertIsNone(
summarize_metrics([], targets={"merge_to_live_seconds": 600})[
"merge_to_live_seconds"
]["within_target_rate"]
)

def test_webhook_failure_never_blocks_delivery_state(self) -> None:
config = parse_config(
{
Expand Down