From 8d0317a071f6ef8beb1795edf489075283e70d95 Mon Sep 17 00:00:00 2001
From: Pengfei Hu <pengfei@threemoonslab.com>
Date: Fri, 29 May 2026 23:56:59 -0700
Subject: [PATCH 1/2] Surface the self-approval prohibition at the top of
 verifier.json

When a PR weakens the release policy or touches a trust root, a coding agent
must not silently self-approve a change to its own gate. That prohibition was
only present inside a fix_task instruction (PR #146); promote it to the two
fields an agent reads first.

- Add _self_approval_note(): the explicit "a coding agent cannot self-approve
  that change - a human must review it" message for policy_weakened (taking
  precedence) and trust_root_touched.
- verifier.json headline leads with the note when present.
- human_review.why leads with the note, and a self-approval note forces
  human_review.required=True regardless of the verdict path.

Full suite: 2346 passed, 4 skipped. No schema change.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../cli/verify/orchestrator.py                | 62 ++++++++++--
 tests/test_self_approval_signal.py            | 99 +++++++++++++++++++
 2 files changed, 153 insertions(+), 8 deletions(-)
 create mode 100644 tests/test_self_approval_signal.py

diff --git a/src/agents_shipgate/cli/verify/orchestrator.py b/src/agents_shipgate/cli/verify/orchestrator.py
index 3b5a57e5..eb6ee255 100644
--- a/src/agents_shipgate/cli/verify/orchestrator.py
+++ b/src/agents_shipgate/cli/verify/orchestrator.py
@@ -20,6 +20,7 @@
     MergeVerdict,
     VerifierArtifact,
     VerifierBaseStatus,
+    VerifierCapabilityReview,
     VerifierFixTask,
     VerifierHumanReview,
     VerifierNextAction,
@@ -536,10 +537,39 @@ def _can_merge_without_human(
     return not (release_decision.blockers or release_decision.review_items)
 
 
+def _self_approval_note(capability_review: VerifierCapabilityReview | None) -> str | None:
+    """The explicit self-approval prohibition when this PR edits the rules that
+    evaluate it.
+
+    A coding agent must never silently self-approve a change to its own release
+    gate (reward hacking). When the head scan flags a weakened policy or a
+    touched trust root, that prohibition is surfaced as the verifier headline
+    and the human-review reason — not left implicit in a fix_task instruction.
+    """
+    if capability_review is None:
+        return None
+    if capability_review.policy_weakened:
+        return (
+            "This PR weakens the release policy that evaluates it; a coding "
+            "agent cannot self-approve that change — a human must review it."
+        )
+    if capability_review.trust_root_touched:
+        return (
+            "This PR edits a release trust root (the manifest, CI gate, agent "
+            "instructions, or trigger catalog used to evaluate it); a coding "
+            "agent cannot self-approve that change — a human must review it."
+        )
+    return None
+
+
 def _human_review(
-    *, merge_verdict: MergeVerdict, release_decision: ReleaseDecision | None
+    *,
+    merge_verdict: MergeVerdict,
+    release_decision: ReleaseDecision | None,
+    capability_review: VerifierCapabilityReview | None = None,
 ) -> VerifierHumanReview:
-    required = merge_verdict in {
+    note = _self_approval_note(capability_review)
+    required = note is not None or merge_verdict in {
         "human_review_required",
         "blocked",
         "insufficient_evidence",
@@ -547,10 +577,14 @@ def _human_review(
     }
     if not required:
         return VerifierHumanReview(required=False, why=None)
-    why = release_decision.reason if release_decision is not None else None
+    # A self-approval prohibition is the most important reason a human is
+    # needed; surface it ahead of the generic release-decision reason.
+    reason = release_decision.reason if release_decision is not None else None
     return VerifierHumanReview(
         required=True,
-        why=why or "A human must review this agent-capability change before merge.",
+        why=note
+        or reason
+        or "A human must review this agent-capability change before merge.",
     )
 
 
@@ -611,8 +645,17 @@ def _first_next_action(
 
 
 def _verifier_headline(
-    *, report: ReadinessReport | None, merge_verdict: MergeVerdict, head_status: str
+    *,
+    report: ReadinessReport | None,
+    merge_verdict: MergeVerdict,
+    head_status: str,
+    capability_review: VerifierCapabilityReview | None = None,
 ) -> str | None:
+    # An agent editing the rules that evaluate its own change must see the
+    # self-approval prohibition first, ahead of the generic scan headline.
+    note = _self_approval_note(capability_review)
+    if note is not None:
+        return note
     if report is not None and report.agent_summary is not None:
         return report.agent_summary.headline
     if head_status == "skipped":
@@ -671,11 +714,13 @@ def _build_verifier(
     )
     decision = release_decision_model.decision if release_decision_model else None
     merge_verdict = merge_verdict_for(decision=decision, head_status=head_status)
-    human_review = _human_review(
-        merge_verdict=merge_verdict, release_decision=release_decision_model
-    )
     agent_summary_model = report.agent_summary if report is not None else None
     capability_review = build_capability_review(report) if report is not None else None
+    human_review = _human_review(
+        merge_verdict=merge_verdict,
+        release_decision=release_decision_model,
+        capability_review=capability_review,
+    )
     fix_task = build_fix_task(
         report,
         merge_verdict=merge_verdict,
@@ -728,6 +773,7 @@ def _build_verifier(
             report=report,
             merge_verdict=merge_verdict,
             head_status=head_status,
+            capability_review=capability_review,
         ),
         human_review=human_review,
         first_next_action=_first_next_action(
diff --git a/tests/test_self_approval_signal.py b/tests/test_self_approval_signal.py
new file mode 100644
index 00000000..a159cfa0
--- /dev/null
+++ b/tests/test_self_approval_signal.py
@@ -0,0 +1,99 @@
+"""PR-D: the self-approval prohibition is surfaced at the top of verifier.json.
+
+When a PR edits the rules that evaluate it — a weakened release policy or a
+touched trust root — a coding agent must not silently self-approve (reward
+hacking). That prohibition is the verifier headline and the human-review
+reason, not buried in a fix_task instruction.
+"""
+
+from __future__ import annotations
+
+from agents_shipgate.cli.verify.orchestrator import (
+    _human_review,
+    _self_approval_note,
+    _verifier_headline,
+)
+from agents_shipgate.schemas.verifier import VerifierCapabilityReview
+
+
+def _cr(**kwargs) -> VerifierCapabilityReview:
+    return VerifierCapabilityReview(**kwargs)
+
+
+# --- the note itself --------------------------------------------------------
+
+
+def test_policy_weakened_note_calls_out_self_approval() -> None:
+    note = _self_approval_note(_cr(policy_weakened=True))
+    assert note is not None
+    assert "self-approve" in note
+    assert "policy" in note
+
+
+def test_trust_root_note_calls_out_self_approval() -> None:
+    note = _self_approval_note(_cr(trust_root_touched=True))
+    assert note is not None
+    assert "self-approve" in note
+    assert "trust root" in note
+
+
+def test_policy_weakening_takes_precedence_over_trust_root() -> None:
+    note = _self_approval_note(_cr(policy_weakened=True, trust_root_touched=True))
+    assert note is not None
+    assert "policy" in note
+
+
+def test_clean_review_has_no_note() -> None:
+    assert _self_approval_note(_cr()) is None
+    assert _self_approval_note(None) is None
+
+
+# --- human_review surfacing -------------------------------------------------
+
+
+def test_human_review_why_leads_with_self_approval_note() -> None:
+    review = _human_review(
+        merge_verdict="human_review_required",
+        release_decision=None,
+        capability_review=_cr(policy_weakened=True),
+    )
+    assert review.required is True
+    assert review.why is not None
+    assert "self-approve" in review.why
+
+
+def test_self_approval_forces_human_review_even_if_verdict_not_human() -> None:
+    # Defensive: a weakened policy must require a human even if some other path
+    # produced a non-human verdict — the agent can never clear its own gate.
+    review = _human_review(
+        merge_verdict="mergeable",
+        release_decision=None,
+        capability_review=_cr(trust_root_touched=True),
+    )
+    assert review.required is True
+    assert review.why is not None
+    assert "self-approve" in review.why
+
+
+# --- headline surfacing -----------------------------------------------------
+
+
+def test_headline_leads_with_self_approval_note() -> None:
+    headline = _verifier_headline(
+        report=None,
+        merge_verdict="human_review_required",
+        head_status="succeeded",
+        capability_review=_cr(trust_root_touched=True),
+    )
+    assert headline is not None
+    assert "self-approve" in headline
+
+
+def test_headline_without_note_falls_back_to_default() -> None:
+    headline = _verifier_headline(
+        report=None,
+        merge_verdict="unknown",
+        head_status="failed",
+        capability_review=_cr(),
+    )
+    assert headline == "Shipgate could not complete the scan; human review required."

From 9ef25cd7385f61e1e3691341d23fcac394e2c33c Mon Sep 17 00:00:00 2001
From: Pengfei Hu <pengfei@threemoonslab.com>
Date: Sat, 30 May 2026 14:42:19 -0700
Subject: [PATCH 2/2] Self-approval: keep all top-level convenience fields
 consistent (review fix)

Addresses review of #148: a self-approval note forced human_review.required=True,
but can_merge_without_human and first_next_action still keyed only off
merge_verdict, so the defensive (mergeable + note) path could emit "human review
required" and "safe to merge" at once.

- _can_merge_without_human returns False whenever a self-approval note exists.
- _first_next_action routes to a human review (never the "safe to merge" action)
  when a self-approval note is present, including the fix_task-None defensive
  case.
- Both thread capability_review from _build_verifier. Clean mergeable behavior
  (no note) is unchanged; covered by a regression test.

Full suite: 2349 passed, 4 skipped. No schema change.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../cli/verify/orchestrator.py                | 22 +++++++-
 tests/test_self_approval_signal.py            | 50 +++++++++++++++++++
 2 files changed, 70 insertions(+), 2 deletions(-)

diff --git a/src/agents_shipgate/cli/verify/orchestrator.py b/src/agents_shipgate/cli/verify/orchestrator.py
index eb6ee255..7cc1ed7e 100644
--- a/src/agents_shipgate/cli/verify/orchestrator.py
+++ b/src/agents_shipgate/cli/verify/orchestrator.py
@@ -526,8 +526,15 @@ def _map_optional_tree_path(
 
 
 def _can_merge_without_human(
-    *, merge_verdict: MergeVerdict, release_decision: ReleaseDecision | None
+    *,
+    merge_verdict: MergeVerdict,
+    release_decision: ReleaseDecision | None,
+    capability_review: VerifierCapabilityReview | None = None,
 ) -> bool:
+    # A self-approval change can never clear its own gate, even in the
+    # defensive case where the verdict was somehow not human-routed.
+    if _self_approval_note(capability_review) is not None:
+        return False
     if merge_verdict != "mergeable":
         return False
     if release_decision is None:
@@ -594,14 +601,23 @@ def _first_next_action(
     fix_task: VerifierFixTask | None,
     agent_summary: AgentSummary | None,
     reason: str | None,
+    capability_review: VerifierCapabilityReview | None = None,
 ) -> VerifierNextAction:
-    if merge_verdict == "mergeable":
+    self_approval = _self_approval_note(capability_review)
+    if merge_verdict == "mergeable" and self_approval is None:
         return VerifierNextAction(
             actor="coding_agent",
             kind="none",
             command=None,
             why="No agent-capability changes gate this PR; safe to merge.",
         )
+    if self_approval is not None and fix_task is None:
+        # Defensive self-approval path (e.g. a 'mergeable' verdict that still
+        # carries a self-approval note): a human must review — never emit the
+        # "safe to merge" action.
+        return VerifierNextAction(
+            actor="human", kind="review", command=None, why=self_approval
+        )
     # The fix_task is the single repair contract; the headline next-step must
     # not contradict it. Borrow the agent summary's concrete action (e.g. an
     # apply-patches command) only when its implied actor agrees with the
@@ -768,6 +784,7 @@ def _build_verifier(
         can_merge_without_human=_can_merge_without_human(
             merge_verdict=merge_verdict,
             release_decision=release_decision_model,
+            capability_review=capability_review,
         ),
         headline=_verifier_headline(
             report=report,
@@ -781,6 +798,7 @@ def _build_verifier(
             fix_task=fix_task,
             agent_summary=agent_summary_model,
             reason=release_decision_model.reason if release_decision_model else None,
+            capability_review=capability_review,
         ),
         fix_task=fix_task,
         artifacts=artifacts,
diff --git a/tests/test_self_approval_signal.py b/tests/test_self_approval_signal.py
index a159cfa0..081c6bfd 100644
--- a/tests/test_self_approval_signal.py
+++ b/tests/test_self_approval_signal.py
@@ -9,6 +9,8 @@
 from __future__ import annotations
 
 from agents_shipgate.cli.verify.orchestrator import (
+    _can_merge_without_human,
+    _first_next_action,
     _human_review,
     _self_approval_note,
     _verifier_headline,
@@ -97,3 +99,51 @@ def test_headline_without_note_falls_back_to_default() -> None:
         capability_review=_cr(),
     )
     assert headline == "Shipgate could not complete the scan; human review required."
+
+
+# --- convenience fields stay consistent in the defensive case ---------------
+
+
+def test_self_approval_blocks_can_merge_without_human() -> None:
+    assert (
+        _can_merge_without_human(
+            merge_verdict="mergeable",
+            release_decision=None,
+            capability_review=_cr(policy_weakened=True),
+        )
+        is False
+    )
+
+
+def test_self_approval_first_next_action_routes_to_human_when_mergeable() -> None:
+    # The defensive path: a 'mergeable' verdict carrying a self-approval note
+    # must not emit "safe to merge"; the next step is a human review.
+    action = _first_next_action(
+        merge_verdict="mergeable",
+        fix_task=None,
+        agent_summary=None,
+        reason=None,
+        capability_review=_cr(trust_root_touched=True),
+    )
+    assert action.actor == "human"
+    assert action.kind == "review"
+    assert "self-approve" in action.why
+
+
+def test_clean_mergeable_still_merges_and_keeps_safe_action() -> None:
+    # Regression: with no self-approval note, mergeable behaves as before.
+    assert (
+        _can_merge_without_human(
+            merge_verdict="mergeable", release_decision=None, capability_review=_cr()
+        )
+        is True
+    )
+    action = _first_next_action(
+        merge_verdict="mergeable",
+        fix_task=None,
+        agent_summary=None,
+        reason=None,
+        capability_review=_cr(),
+    )
+    assert action.actor == "coding_agent"
+    assert action.kind == "none"