From 8d0317a071f6ef8beb1795edf489075283e70d95 Mon Sep 17 00:00:00 2001 From: Pengfei Hu Date: Fri, 29 May 2026 23:56:59 -0700 Subject: [PATCH 1/2] Surface the self-approval prohibition at the top of verifier.json When a PR weakens the release policy or touches a trust root, a coding agent must not silently self-approve a change to its own gate. That prohibition was only present inside a fix_task instruction (PR #146); promote it to the two fields an agent reads first. - Add _self_approval_note(): the explicit "a coding agent cannot self-approve that change - a human must review it" message for policy_weakened (taking precedence) and trust_root_touched. - verifier.json headline leads with the note when present. - human_review.why leads with the note, and a self-approval note forces human_review.required=True regardless of the verdict path. Full suite: 2346 passed, 4 skipped. No schema change. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../cli/verify/orchestrator.py | 62 ++++++++++-- tests/test_self_approval_signal.py | 99 +++++++++++++++++++ 2 files changed, 153 insertions(+), 8 deletions(-) create mode 100644 tests/test_self_approval_signal.py diff --git a/src/agents_shipgate/cli/verify/orchestrator.py b/src/agents_shipgate/cli/verify/orchestrator.py index 3b5a57e5..eb6ee255 100644 --- a/src/agents_shipgate/cli/verify/orchestrator.py +++ b/src/agents_shipgate/cli/verify/orchestrator.py @@ -20,6 +20,7 @@ MergeVerdict, VerifierArtifact, VerifierBaseStatus, + VerifierCapabilityReview, VerifierFixTask, VerifierHumanReview, VerifierNextAction, @@ -536,10 +537,39 @@ def _can_merge_without_human( return not (release_decision.blockers or release_decision.review_items) +def _self_approval_note(capability_review: VerifierCapabilityReview | None) -> str | None: + """The explicit self-approval prohibition when this PR edits the rules that + evaluate it. + + A coding agent must never silently self-approve a change to its own release + gate (reward hacking). When the head scan flags a weakened policy or a + touched trust root, that prohibition is surfaced as the verifier headline + and the human-review reason — not left implicit in a fix_task instruction. + """ + if capability_review is None: + return None + if capability_review.policy_weakened: + return ( + "This PR weakens the release policy that evaluates it; a coding " + "agent cannot self-approve that change — a human must review it." + ) + if capability_review.trust_root_touched: + return ( + "This PR edits a release trust root (the manifest, CI gate, agent " + "instructions, or trigger catalog used to evaluate it); a coding " + "agent cannot self-approve that change — a human must review it." + ) + return None + + def _human_review( - *, merge_verdict: MergeVerdict, release_decision: ReleaseDecision | None + *, + merge_verdict: MergeVerdict, + release_decision: ReleaseDecision | None, + capability_review: VerifierCapabilityReview | None = None, ) -> VerifierHumanReview: - required = merge_verdict in { + note = _self_approval_note(capability_review) + required = note is not None or merge_verdict in { "human_review_required", "blocked", "insufficient_evidence", @@ -547,10 +577,14 @@ def _human_review( } if not required: return VerifierHumanReview(required=False, why=None) - why = release_decision.reason if release_decision is not None else None + # A self-approval prohibition is the most important reason a human is + # needed; surface it ahead of the generic release-decision reason. + reason = release_decision.reason if release_decision is not None else None return VerifierHumanReview( required=True, - why=why or "A human must review this agent-capability change before merge.", + why=note + or reason + or "A human must review this agent-capability change before merge.", ) @@ -611,8 +645,17 @@ def _first_next_action( def _verifier_headline( - *, report: ReadinessReport | None, merge_verdict: MergeVerdict, head_status: str + *, + report: ReadinessReport | None, + merge_verdict: MergeVerdict, + head_status: str, + capability_review: VerifierCapabilityReview | None = None, ) -> str | None: + # An agent editing the rules that evaluate its own change must see the + # self-approval prohibition first, ahead of the generic scan headline. + note = _self_approval_note(capability_review) + if note is not None: + return note if report is not None and report.agent_summary is not None: return report.agent_summary.headline if head_status == "skipped": @@ -671,11 +714,13 @@ def _build_verifier( ) decision = release_decision_model.decision if release_decision_model else None merge_verdict = merge_verdict_for(decision=decision, head_status=head_status) - human_review = _human_review( - merge_verdict=merge_verdict, release_decision=release_decision_model - ) agent_summary_model = report.agent_summary if report is not None else None capability_review = build_capability_review(report) if report is not None else None + human_review = _human_review( + merge_verdict=merge_verdict, + release_decision=release_decision_model, + capability_review=capability_review, + ) fix_task = build_fix_task( report, merge_verdict=merge_verdict, @@ -728,6 +773,7 @@ def _build_verifier( report=report, merge_verdict=merge_verdict, head_status=head_status, + capability_review=capability_review, ), human_review=human_review, first_next_action=_first_next_action( diff --git a/tests/test_self_approval_signal.py b/tests/test_self_approval_signal.py new file mode 100644 index 00000000..a159cfa0 --- /dev/null +++ b/tests/test_self_approval_signal.py @@ -0,0 +1,99 @@ +"""PR-D: the self-approval prohibition is surfaced at the top of verifier.json. + +When a PR edits the rules that evaluate it — a weakened release policy or a +touched trust root — a coding agent must not silently self-approve (reward +hacking). That prohibition is the verifier headline and the human-review +reason, not buried in a fix_task instruction. +""" + +from __future__ import annotations + +from agents_shipgate.cli.verify.orchestrator import ( + _human_review, + _self_approval_note, + _verifier_headline, +) +from agents_shipgate.schemas.verifier import VerifierCapabilityReview + + +def _cr(**kwargs) -> VerifierCapabilityReview: + return VerifierCapabilityReview(**kwargs) + + +# --- the note itself -------------------------------------------------------- + + +def test_policy_weakened_note_calls_out_self_approval() -> None: + note = _self_approval_note(_cr(policy_weakened=True)) + assert note is not None + assert "self-approve" in note + assert "policy" in note + + +def test_trust_root_note_calls_out_self_approval() -> None: + note = _self_approval_note(_cr(trust_root_touched=True)) + assert note is not None + assert "self-approve" in note + assert "trust root" in note + + +def test_policy_weakening_takes_precedence_over_trust_root() -> None: + note = _self_approval_note(_cr(policy_weakened=True, trust_root_touched=True)) + assert note is not None + assert "policy" in note + + +def test_clean_review_has_no_note() -> None: + assert _self_approval_note(_cr()) is None + assert _self_approval_note(None) is None + + +# --- human_review surfacing ------------------------------------------------- + + +def test_human_review_why_leads_with_self_approval_note() -> None: + review = _human_review( + merge_verdict="human_review_required", + release_decision=None, + capability_review=_cr(policy_weakened=True), + ) + assert review.required is True + assert review.why is not None + assert "self-approve" in review.why + + +def test_self_approval_forces_human_review_even_if_verdict_not_human() -> None: + # Defensive: a weakened policy must require a human even if some other path + # produced a non-human verdict — the agent can never clear its own gate. + review = _human_review( + merge_verdict="mergeable", + release_decision=None, + capability_review=_cr(trust_root_touched=True), + ) + assert review.required is True + assert review.why is not None + assert "self-approve" in review.why + + +# --- headline surfacing ----------------------------------------------------- + + +def test_headline_leads_with_self_approval_note() -> None: + headline = _verifier_headline( + report=None, + merge_verdict="human_review_required", + head_status="succeeded", + capability_review=_cr(trust_root_touched=True), + ) + assert headline is not None + assert "self-approve" in headline + + +def test_headline_without_note_falls_back_to_default() -> None: + headline = _verifier_headline( + report=None, + merge_verdict="unknown", + head_status="failed", + capability_review=_cr(), + ) + assert headline == "Shipgate could not complete the scan; human review required." From 9ef25cd7385f61e1e3691341d23fcac394e2c33c Mon Sep 17 00:00:00 2001 From: Pengfei Hu Date: Sat, 30 May 2026 14:42:19 -0700 Subject: [PATCH 2/2] Self-approval: keep all top-level convenience fields consistent (review fix) Addresses review of #148: a self-approval note forced human_review.required=True, but can_merge_without_human and first_next_action still keyed only off merge_verdict, so the defensive (mergeable + note) path could emit "human review required" and "safe to merge" at once. - _can_merge_without_human returns False whenever a self-approval note exists. - _first_next_action routes to a human review (never the "safe to merge" action) when a self-approval note is present, including the fix_task-None defensive case. - Both thread capability_review from _build_verifier. Clean mergeable behavior (no note) is unchanged; covered by a regression test. Full suite: 2349 passed, 4 skipped. No schema change. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../cli/verify/orchestrator.py | 22 +++++++- tests/test_self_approval_signal.py | 50 +++++++++++++++++++ 2 files changed, 70 insertions(+), 2 deletions(-) diff --git a/src/agents_shipgate/cli/verify/orchestrator.py b/src/agents_shipgate/cli/verify/orchestrator.py index eb6ee255..7cc1ed7e 100644 --- a/src/agents_shipgate/cli/verify/orchestrator.py +++ b/src/agents_shipgate/cli/verify/orchestrator.py @@ -526,8 +526,15 @@ def _map_optional_tree_path( def _can_merge_without_human( - *, merge_verdict: MergeVerdict, release_decision: ReleaseDecision | None + *, + merge_verdict: MergeVerdict, + release_decision: ReleaseDecision | None, + capability_review: VerifierCapabilityReview | None = None, ) -> bool: + # A self-approval change can never clear its own gate, even in the + # defensive case where the verdict was somehow not human-routed. + if _self_approval_note(capability_review) is not None: + return False if merge_verdict != "mergeable": return False if release_decision is None: @@ -594,14 +601,23 @@ def _first_next_action( fix_task: VerifierFixTask | None, agent_summary: AgentSummary | None, reason: str | None, + capability_review: VerifierCapabilityReview | None = None, ) -> VerifierNextAction: - if merge_verdict == "mergeable": + self_approval = _self_approval_note(capability_review) + if merge_verdict == "mergeable" and self_approval is None: return VerifierNextAction( actor="coding_agent", kind="none", command=None, why="No agent-capability changes gate this PR; safe to merge.", ) + if self_approval is not None and fix_task is None: + # Defensive self-approval path (e.g. a 'mergeable' verdict that still + # carries a self-approval note): a human must review — never emit the + # "safe to merge" action. + return VerifierNextAction( + actor="human", kind="review", command=None, why=self_approval + ) # The fix_task is the single repair contract; the headline next-step must # not contradict it. Borrow the agent summary's concrete action (e.g. an # apply-patches command) only when its implied actor agrees with the @@ -768,6 +784,7 @@ def _build_verifier( can_merge_without_human=_can_merge_without_human( merge_verdict=merge_verdict, release_decision=release_decision_model, + capability_review=capability_review, ), headline=_verifier_headline( report=report, @@ -781,6 +798,7 @@ def _build_verifier( fix_task=fix_task, agent_summary=agent_summary_model, reason=release_decision_model.reason if release_decision_model else None, + capability_review=capability_review, ), fix_task=fix_task, artifacts=artifacts, diff --git a/tests/test_self_approval_signal.py b/tests/test_self_approval_signal.py index a159cfa0..081c6bfd 100644 --- a/tests/test_self_approval_signal.py +++ b/tests/test_self_approval_signal.py @@ -9,6 +9,8 @@ from __future__ import annotations from agents_shipgate.cli.verify.orchestrator import ( + _can_merge_without_human, + _first_next_action, _human_review, _self_approval_note, _verifier_headline, @@ -97,3 +99,51 @@ def test_headline_without_note_falls_back_to_default() -> None: capability_review=_cr(), ) assert headline == "Shipgate could not complete the scan; human review required." + + +# --- convenience fields stay consistent in the defensive case --------------- + + +def test_self_approval_blocks_can_merge_without_human() -> None: + assert ( + _can_merge_without_human( + merge_verdict="mergeable", + release_decision=None, + capability_review=_cr(policy_weakened=True), + ) + is False + ) + + +def test_self_approval_first_next_action_routes_to_human_when_mergeable() -> None: + # The defensive path: a 'mergeable' verdict carrying a self-approval note + # must not emit "safe to merge"; the next step is a human review. + action = _first_next_action( + merge_verdict="mergeable", + fix_task=None, + agent_summary=None, + reason=None, + capability_review=_cr(trust_root_touched=True), + ) + assert action.actor == "human" + assert action.kind == "review" + assert "self-approve" in action.why + + +def test_clean_mergeable_still_merges_and_keeps_safe_action() -> None: + # Regression: with no self-approval note, mergeable behaves as before. + assert ( + _can_merge_without_human( + merge_verdict="mergeable", release_decision=None, capability_review=_cr() + ) + is True + ) + action = _first_next_action( + merge_verdict="mergeable", + fix_task=None, + agent_summary=None, + reason=None, + capability_review=_cr(), + ) + assert action.actor == "coding_agent" + assert action.kind == "none"