ThreeMoonsLab · pengfei-threemoonslab · May 30, 2026 · May 30, 2026 · May 30, 2026
diff --git a/src/agents_shipgate/cli/verify/orchestrator.py b/src/agents_shipgate/cli/verify/orchestrator.py
@@ -20,6 +20,7 @@
     MergeVerdict,
     VerifierArtifact,
     VerifierBaseStatus,
+    VerifierCapabilityReview,
     VerifierFixTask,
     VerifierHumanReview,
     VerifierNextAction,
@@ -525,8 +526,15 @@ def _map_optional_tree_path(
 
 
 def _can_merge_without_human(
-    *, merge_verdict: MergeVerdict, release_decision: ReleaseDecision | None
+    *,
+    merge_verdict: MergeVerdict,
+    release_decision: ReleaseDecision | None,
+    capability_review: VerifierCapabilityReview | None = None,
 ) -> bool:
+    # A self-approval change can never clear its own gate, even in the
+    # defensive case where the verdict was somehow not human-routed.
+    if _self_approval_note(capability_review) is not None:
+        return False
     if merge_verdict != "mergeable":
         return False
     if release_decision is None:
@@ -536,21 +544,54 @@ def _can_merge_without_human(
     return not (release_decision.blockers or release_decision.review_items)
 
 
+def _self_approval_note(capability_review: VerifierCapabilityReview | None) -> str | None:
+    """The explicit self-approval prohibition when this PR edits the rules that
+    evaluate it.
+
+    A coding agent must never silently self-approve a change to its own release
+    gate (reward hacking). When the head scan flags a weakened policy or a
+    touched trust root, that prohibition is surfaced as the verifier headline
+    and the human-review reason — not left implicit in a fix_task instruction.
+    """
+    if capability_review is None:
+        return None
+    if capability_review.policy_weakened:
+        return (
+            "This PR weakens the release policy that evaluates it; a coding "
+            "agent cannot self-approve that change — a human must review it."
+        )
+    if capability_review.trust_root_touched:
+        return (
+            "This PR edits a release trust root (the manifest, CI gate, agent "
+            "instructions, or trigger catalog used to evaluate it); a coding "
+            "agent cannot self-approve that change — a human must review it."
+        )
+    return None
+
+
 def _human_review(
-    *, merge_verdict: MergeVerdict, release_decision: ReleaseDecision | None
+    *,
+    merge_verdict: MergeVerdict,
+    release_decision: ReleaseDecision | None,
+    capability_review: VerifierCapabilityReview | None = None,
 ) -> VerifierHumanReview:
-    required = merge_verdict in {
+    note = _self_approval_note(capability_review)
+    required = note is not None or merge_verdict in {
         "human_review_required",
         "blocked",
         "insufficient_evidence",
         "unknown",
     }
     if not required:
         return VerifierHumanReview(required=False, why=None)
-    why = release_decision.reason if release_decision is not None else None
+    # A self-approval prohibition is the most important reason a human is
+    # needed; surface it ahead of the generic release-decision reason.
+    reason = release_decision.reason if release_decision is not None else None
     return VerifierHumanReview(
         required=True,
-        why=why or "A human must review this agent-capability change before merge.",
+        why=note
+        or reason
+        or "A human must review this agent-capability change before merge.",
     )
 
 
@@ -560,14 +601,23 @@ def _first_next_action(
     fix_task: VerifierFixTask | None,
     agent_summary: AgentSummary | None,
     reason: str | None,
+    capability_review: VerifierCapabilityReview | None = None,
 ) -> VerifierNextAction:
-    if merge_verdict == "mergeable":
+    self_approval = _self_approval_note(capability_review)
+    if merge_verdict == "mergeable" and self_approval is None:
         return VerifierNextAction(
             actor="coding_agent",
             kind="none",
             command=None,
             why="No agent-capability changes gate this PR; safe to merge.",
         )
+    if self_approval is not None and fix_task is None:
+        # Defensive self-approval path (e.g. a 'mergeable' verdict that still
+        # carries a self-approval note): a human must review — never emit the
+        # "safe to merge" action.
+        return VerifierNextAction(
+            actor="human", kind="review", command=None, why=self_approval
+        )
     # The fix_task is the single repair contract; the headline next-step must
     # not contradict it. Borrow the agent summary's concrete action (e.g. an
     # apply-patches command) only when its implied actor agrees with the
@@ -611,8 +661,17 @@ def _first_next_action(
 
 
 def _verifier_headline(
-    *, report: ReadinessReport | None, merge_verdict: MergeVerdict, head_status: str
+    *,
+    report: ReadinessReport | None,
+    merge_verdict: MergeVerdict,
+    head_status: str,
+    capability_review: VerifierCapabilityReview | None = None,
 ) -> str | None:
+    # An agent editing the rules that evaluate its own change must see the
+    # self-approval prohibition first, ahead of the generic scan headline.
+    note = _self_approval_note(capability_review)
+    if note is not None:
+        return note
     if report is not None and report.agent_summary is not None:
         return report.agent_summary.headline
     if head_status == "skipped":
@@ -671,11 +730,13 @@ def _build_verifier(
     )
     decision = release_decision_model.decision if release_decision_model else None
     merge_verdict = merge_verdict_for(decision=decision, head_status=head_status)
-    human_review = _human_review(
-        merge_verdict=merge_verdict, release_decision=release_decision_model
-    )
     agent_summary_model = report.agent_summary if report is not None else None
     capability_review = build_capability_review(report) if report is not None else None
+    human_review = _human_review(
+        merge_verdict=merge_verdict,
+        release_decision=release_decision_model,
+        capability_review=capability_review,
+    )
     fix_task = build_fix_task(
         report,
         merge_verdict=merge_verdict,
@@ -723,18 +784,21 @@ def _build_verifier(
         can_merge_without_human=_can_merge_without_human(
             merge_verdict=merge_verdict,
             release_decision=release_decision_model,
+            capability_review=capability_review,
         ),
         headline=_verifier_headline(
             report=report,
             merge_verdict=merge_verdict,
             head_status=head_status,
+            capability_review=capability_review,
         ),
         human_review=human_review,
         first_next_action=_first_next_action(
             merge_verdict=merge_verdict,
             fix_task=fix_task,
             agent_summary=agent_summary_model,
             reason=release_decision_model.reason if release_decision_model else None,
+            capability_review=capability_review,
         ),
         fix_task=fix_task,
         artifacts=artifacts,

diff --git a/tests/test_self_approval_signal.py b/tests/test_self_approval_signal.py
@@ -0,0 +1,149 @@
+"""PR-D: the self-approval prohibition is surfaced at the top of verifier.json.
+
+When a PR edits the rules that evaluate it — a weakened release policy or a
+touched trust root — a coding agent must not silently self-approve (reward
+hacking). That prohibition is the verifier headline and the human-review
+reason, not buried in a fix_task instruction.
+"""
+
+from __future__ import annotations
+
+from agents_shipgate.cli.verify.orchestrator import (
+    _can_merge_without_human,
+    _first_next_action,
+    _human_review,
+    _self_approval_note,
+    _verifier_headline,
+)
+from agents_shipgate.schemas.verifier import VerifierCapabilityReview
+
+
+def _cr(**kwargs) -> VerifierCapabilityReview:
+    return VerifierCapabilityReview(**kwargs)
+
+
+# --- the note itself --------------------------------------------------------
+
+
+def test_policy_weakened_note_calls_out_self_approval() -> None:
+    note = _self_approval_note(_cr(policy_weakened=True))
+    assert note is not None
+    assert "self-approve" in note
+    assert "policy" in note
+
+
+def test_trust_root_note_calls_out_self_approval() -> None:
+    note = _self_approval_note(_cr(trust_root_touched=True))
+    assert note is not None
+    assert "self-approve" in note
+    assert "trust root" in note
+
+
+def test_policy_weakening_takes_precedence_over_trust_root() -> None:
+    note = _self_approval_note(_cr(policy_weakened=True, trust_root_touched=True))
+    assert note is not None
+    assert "policy" in note
+
+
+def test_clean_review_has_no_note() -> None:
+    assert _self_approval_note(_cr()) is None
+    assert _self_approval_note(None) is None
+
+
+# --- human_review surfacing -------------------------------------------------
+
+
+def test_human_review_why_leads_with_self_approval_note() -> None:
+    review = _human_review(
+        merge_verdict="human_review_required",
+        release_decision=None,
+        capability_review=_cr(policy_weakened=True),
+    )
+    assert review.required is True
+    assert review.why is not None
+    assert "self-approve" in review.why
+
+
+def test_self_approval_forces_human_review_even_if_verdict_not_human() -> None:
+    # Defensive: a weakened policy must require a human even if some other path
+    # produced a non-human verdict — the agent can never clear its own gate.
+    review = _human_review(
+        merge_verdict="mergeable",
+        release_decision=None,
+        capability_review=_cr(trust_root_touched=True),
+    )
+    assert review.required is True
+    assert review.why is not None
+    assert "self-approve" in review.why
+
+
+# --- headline surfacing -----------------------------------------------------
+
+
+def test_headline_leads_with_self_approval_note() -> None:
+    headline = _verifier_headline(
+        report=None,
+        merge_verdict="human_review_required",
+        head_status="succeeded",
+        capability_review=_cr(trust_root_touched=True),
+    )
+    assert headline is not None
+    assert "self-approve" in headline
+
+
+def test_headline_without_note_falls_back_to_default() -> None:
+    headline = _verifier_headline(
+        report=None,
+        merge_verdict="unknown",
+        head_status="failed",
+        capability_review=_cr(),
+    )
+    assert headline == "Shipgate could not complete the scan; human review required."
+
+
+# --- convenience fields stay consistent in the defensive case ---------------
+
+
+def test_self_approval_blocks_can_merge_without_human() -> None:
+    assert (
+        _can_merge_without_human(
+            merge_verdict="mergeable",
+            release_decision=None,
+            capability_review=_cr(policy_weakened=True),
+        )
+        is False
+    )
+
+
+def test_self_approval_first_next_action_routes_to_human_when_mergeable() -> None:
+    # The defensive path: a 'mergeable' verdict carrying a self-approval note
+    # must not emit "safe to merge"; the next step is a human review.
+    action = _first_next_action(
+        merge_verdict="mergeable",
+        fix_task=None,
+        agent_summary=None,
+        reason=None,
+        capability_review=_cr(trust_root_touched=True),
+    )
+    assert action.actor == "human"
+    assert action.kind == "review"
+    assert "self-approve" in action.why
+
+
+def test_clean_mergeable_still_merges_and_keeps_safe_action() -> None:
+    # Regression: with no self-approval note, mergeable behaves as before.
+    assert (
+        _can_merge_without_human(
+            merge_verdict="mergeable", release_decision=None, capability_review=_cr()
+        )
+        is True
+    )
+    action = _first_next_action(
+        merge_verdict="mergeable",
+        fix_task=None,
+        agent_summary=None,
+        reason=None,
+        capability_review=_cr(),
+    )
+    assert action.actor == "coding_agent"
+    assert action.kind == "none"