diff --git a/docs/reference.md b/docs/reference.md index 6a6d078..4670b53 100644 --- a/docs/reference.md +++ b/docs/reference.md @@ -26,7 +26,7 @@ DeployBot resolves the pull request for the current branch. | `deploybot status [--json]` | Read active thread metadata, unbound open PRs, pending native notifications, PR stages, exact-head deploy intent, pre-queue intent overlaps, request-stage timing alerts, queue state, exact-main CI, deployment, and pipeline control state. | | `deploybot plan [--json]` | Read the ordered queue, dependencies, blockers, and source-overlap groups. | | `deploybot inspect [PR] [--json]` | Evaluate one exact PR head without granting merge authority. | -| `deploybot metrics [--limit N] [--json]` | Summarize p50, p95, and maximum delivery timings for recent merged PRs. The default limit is 25. | +| `deploybot metrics [--limit N] [--json]` | Summarize p50, p95, and maximum delivery timings for recent merged PRs, each stage's attainment against the configured speed targets, and the first-pass merge rate (deliveries that merged without a recorded repair handoff). The default limit is 25. | `status`, `plan`, `inspect`, `doctor`, and `metrics` are read-only. In particular, do not use `freeze` as a status command because it writes a durable diff --git a/src/agent_merge_queue/cli.py b/src/agent_merge_queue/cli.py index cac0e07..be47858 100755 --- a/src/agent_merge_queue/cli.py +++ b/src/agent_merge_queue/cli.py @@ -5869,12 +5869,15 @@ def delivery_metrics(client: GitHub, *, limit: int) -> dict[str, Any]: comments_by_number = client.comments_for_pull_requests( int(pull["number"]) for pull in pulls ) + repair_logins = coordinator_logins(client) | set(client.trusted_logins) def sample(pull: dict[str, Any]) -> dict[str, Any]: number = int(pull["number"]) comments = comments_by_number.get(number, []) intent = latest_intent(comments, client.trusted_logins) marker = queue_marker_for_client(client, comments) + repair = latest_payload(comments, REPAIR_MARKER, repair_logins) + repaired = bool(repair) and not repair_marker_is_transitional(repair) merged_at = str(pull.get("merged_at") or "") or None merge_sha = str(pull.get("merge_commit_sha") or "") live_at: str | None = None @@ -5897,6 +5900,7 @@ def sample(pull: dict[str, Any]) -> dict[str, Any]: "queued_at": queued_at, "merged_at": merged_at, "live_at": live_at, + "repaired": repaired, "request_to_queue_seconds": seconds_between(requested_at, queued_at), "queue_to_merge_seconds": seconds_between(queued_at, merged_at), "merge_to_live_seconds": seconds_between(merged_at, live_at), @@ -5910,7 +5914,27 @@ def sample(pull: dict[str, Any]) -> dict[str, Any]: max_workers=min(client.config.pipeline.promotion_workers, len(pulls)) ) as executor: samples = list(executor.map(sample, pulls)) - return summarize_metrics(samples) + # Compare each stage against the operator's own speed targets so the + # historical view answers "are PRs going out the door quickly enough?", + # mirroring how `status` applies these same targets to live work. + ready_target = client.config.pipeline.ready_to_merge_target_minutes * 60 + live_target = client.config.pipeline.merge_to_live_target_minutes * 60 + targets = { + "queue_to_merge_seconds": ready_target, + "merge_to_live_seconds": live_target, + "request_to_live_seconds": ready_target + live_target, + } + summary = summarize_metrics(samples, targets=targets) + repaired_merges = sum(1 for value in samples if value.get("repaired")) + summary["reliability"] = { + "merged_samples": len(samples), + "first_pass_merges": len(samples) - repaired_merges, + "repaired_merges": repaired_merges, + "first_pass_rate": ( + (len(samples) - repaired_merges) / len(samples) if samples else None + ), + } + return summary def print_doctor(rows: list[dict[str, Any]], *, json_output: bool) -> None: @@ -6164,12 +6188,32 @@ def main(argv: list[str] | None = None) -> int: print(json.dumps(value, indent=2, sort_keys=True)) else: print(f"delivery samples: {value['sample_count']}") + reliability = value.get("reliability") or {} + if reliability.get("merged_samples"): + rate = reliability.get("first_pass_rate") + rate_text = f"{rate * 100:.0f}%" if rate is not None else "n/a" + print( + "first-pass merges: " + f"{reliability['first_pass_merges']}/" + f"{reliability['merged_samples']} ({rate_text})" + ) for name, summary in value.items(): - if name.endswith("_seconds"): - print( - f"{name}: p50={summary['p50']}s " - f"p95={summary['p95']}s max={summary['max']}s" + if not name.endswith("_seconds"): + continue + line = ( + f"{name}: p50={summary['p50']}s " + f"p95={summary['p95']}s max={summary['max']}s" + ) + if summary.get("target_seconds") is not None: + within = summary.get("within_target_rate") + within_text = ( + f"{within * 100:.0f}%" if within is not None else "n/a" + ) + line += ( + f" target={summary['target_seconds']}s " + f"within={within_text}" ) + print(line) elif arguments.command == "block": command_block(client, arguments.pr, arguments.reason) elif arguments.command == "unblock": diff --git a/src/agent_merge_queue/pipeline.py b/src/agent_merge_queue/pipeline.py index 88b0e0d..469c530 100644 --- a/src/agent_merge_queue/pipeline.py +++ b/src/agent_merge_queue/pipeline.py @@ -183,24 +183,38 @@ def percentile(values: Iterable[float], fraction: float) -> float | None: return ordered[index] -def summarize_metrics(samples: list[dict[str, Any]]) -> dict[str, Any]: +def summarize_metrics( + samples: list[dict[str, Any]], + *, + targets: dict[str, float] | None = None, +) -> dict[str, Any]: stages = ( "request_to_queue_seconds", "queue_to_merge_seconds", "merge_to_live_seconds", "request_to_live_seconds", ) + targets = targets or {} summary: dict[str, Any] = {"sample_count": len(samples), "samples": samples} for stage in stages: values = [ float(sample[stage]) for sample in samples if sample.get(stage) is not None ] - summary[stage] = { + stage_summary: dict[str, Any] = { "count": len(values), "p50": percentile(values, 0.50), "p95": percentile(values, 0.95), "max": max(values) if values else None, } + target = targets.get(stage) + if target is not None: + within = sum(1 for value in values if value <= target) + stage_summary["target_seconds"] = target + stage_summary["within_target"] = within + stage_summary["within_target_rate"] = ( + within / len(values) if values else None + ) + summary[stage] = stage_summary return summary diff --git a/tests/test_cli.py b/tests/test_cli.py index 2fef07c..4f40488 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -3902,6 +3902,109 @@ def test_metrics_skip_deployments_that_finished_before_the_merge(self) -> None: datetime(2026, 6, 20, 1, tzinfo=timezone.utc), ) + def test_metrics_report_target_attainment_and_first_pass_rate(self) -> None: + clean_merge = "c" * 40 + repaired_merge = "f" * 40 + clean_deploy = "d" * 40 + repaired_deploy = "e" * 40 + head_42 = "a" * 40 + head_43 = "b" * 40 + client = Mock() + client.config = CONFIG + client.repository = "example/repo" + client.trusted_logins = {"trusted"} + client.coordinator_logins = {"trusted"} + client.successful_workflow_runs.return_value = [ + { + "name": "Deploy", + "conclusion": "success", + "head_sha": clean_deploy, + "created_at": "2026-06-20T00:09:00Z", + "updated_at": "2026-06-20T00:09:00Z", + }, + { + "name": "Deploy", + "conclusion": "success", + "head_sha": repaired_deploy, + "created_at": "2026-06-20T00:25:00Z", + "updated_at": "2026-06-20T00:25:00Z", + }, + ] + client.recent_merged_pull_requests.return_value = [ + { + "number": 42, + "merged_at": "2026-06-20T00:05:00Z", + "merge_commit_sha": clean_merge, + }, + { + "number": 43, + "merged_at": "2026-06-20T00:05:00Z", + "merge_commit_sha": repaired_merge, + }, + ] + + def comment(login: str, body: str, created_at: str) -> dict[str, Any]: + return {"user": {"login": login}, "body": body, "created_at": created_at} + + def intent_comment(head: str) -> dict[str, Any]: + return comment( + "trusted", + intent_body( + intent_id=f"intent-{head[:1]}", + state="requested", + requested_at="2026-06-20T00:00:00Z", + requested_head=head, + ), + "2026-06-20T00:00:00Z", + ) + + def queue_comment(head: str) -> dict[str, Any]: + return comment( + "trusted", + queue_state_body("queued", head, queued_at="2026-06-20T00:01:00Z"), + "2026-06-20T00:01:00Z", + ) + + client.comments_for_pull_requests.return_value = { + 42: [intent_comment(head_42), queue_comment(head_42)], + 43: [ + intent_comment(head_43), + queue_comment(head_43), + comment( + "trusted", + repair_body({"head_sha": head_43, "reason": "CI failed"}), + "2026-06-20T00:02:00Z", + ), + ], + } + + def is_ancestor(merge_sha: str, deployed_sha: str) -> bool: + return (merge_sha, deployed_sha) in { + (clean_merge, clean_deploy), + (repaired_merge, repaired_deploy), + } + + client.is_ancestor.side_effect = is_ancestor + + result = delivery_metrics(client, limit=25) + + self.assertEqual(result["sample_count"], 2) + self.assertEqual(result["reliability"]["first_pass_merges"], 1) + self.assertEqual(result["reliability"]["repaired_merges"], 1) + self.assertEqual(result["reliability"]["first_pass_rate"], 0.5) + merge_to_live = result["merge_to_live_seconds"] + self.assertEqual(merge_to_live["target_seconds"], 600) + self.assertEqual(merge_to_live["within_target"], 1) + self.assertEqual(merge_to_live["within_target_rate"], 0.5) + request_to_live = result["request_to_live_seconds"] + self.assertEqual(request_to_live["target_seconds"], 1500) + self.assertEqual(request_to_live["within_target"], 2) + self.assertNotIn("target_seconds", result["request_to_queue_seconds"]) + repaired = { + value["pull_request"]: value["repaired"] for value in result["samples"] + } + self.assertEqual(repaired, {42: False, 43: True}) + def test_integration_seal_round_trips_through_git_commit(self) -> None: parent_sha = "p" * 40 seal_sha = "s" * 40 diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py index c062175..cb06f64 100644 --- a/tests/test_pipeline.py +++ b/tests/test_pipeline.py @@ -306,6 +306,28 @@ def test_metrics_report_p50_and_p95(self) -> None: self.assertEqual(summary["request_to_queue_seconds"]["p50"], 1) self.assertEqual(summary["request_to_queue_seconds"]["p95"], 3) + def test_metrics_report_target_attainment(self) -> None: + summary = summarize_metrics( + [ + {"merge_to_live_seconds": 100}, + {"merge_to_live_seconds": 700}, + ], + targets={"merge_to_live_seconds": 600}, + ) + stage = summary["merge_to_live_seconds"] + self.assertEqual(stage["target_seconds"], 600) + self.assertEqual(stage["within_target"], 1) + self.assertEqual(stage["within_target_rate"], 0.5) + + def test_metrics_omit_attainment_without_targets(self) -> None: + summary = summarize_metrics([{"merge_to_live_seconds": 100}]) + self.assertNotIn("target_seconds", summary["merge_to_live_seconds"]) + self.assertIsNone( + summarize_metrics([], targets={"merge_to_live_seconds": 600})[ + "merge_to_live_seconds" + ]["within_target_rate"] + ) + def test_webhook_failure_never_blocks_delivery_state(self) -> None: config = parse_config( {