Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 74 additions & 10 deletions src/agents_shipgate/cli/verify/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
MergeVerdict,
VerifierArtifact,
VerifierBaseStatus,
VerifierCapabilityReview,
VerifierFixTask,
VerifierHumanReview,
VerifierNextAction,
Expand Down Expand Up @@ -525,8 +526,15 @@ def _map_optional_tree_path(


def _can_merge_without_human(
*, merge_verdict: MergeVerdict, release_decision: ReleaseDecision | None
*,
merge_verdict: MergeVerdict,
release_decision: ReleaseDecision | None,
capability_review: VerifierCapabilityReview | None = None,
) -> bool:
# A self-approval change can never clear its own gate, even in the
# defensive case where the verdict was somehow not human-routed.
if _self_approval_note(capability_review) is not None:
return False
if merge_verdict != "mergeable":
return False
if release_decision is None:
Expand All @@ -536,21 +544,54 @@ def _can_merge_without_human(
return not (release_decision.blockers or release_decision.review_items)


def _self_approval_note(capability_review: VerifierCapabilityReview | None) -> str | None:
"""The explicit self-approval prohibition when this PR edits the rules that
evaluate it.

A coding agent must never silently self-approve a change to its own release
gate (reward hacking). When the head scan flags a weakened policy or a
touched trust root, that prohibition is surfaced as the verifier headline
and the human-review reason — not left implicit in a fix_task instruction.
"""
if capability_review is None:
return None
if capability_review.policy_weakened:
return (
"This PR weakens the release policy that evaluates it; a coding "
"agent cannot self-approve that change — a human must review it."
)
if capability_review.trust_root_touched:
return (
"This PR edits a release trust root (the manifest, CI gate, agent "
"instructions, or trigger catalog used to evaluate it); a coding "
"agent cannot self-approve that change — a human must review it."
)
return None


def _human_review(
*, merge_verdict: MergeVerdict, release_decision: ReleaseDecision | None
*,
merge_verdict: MergeVerdict,
release_decision: ReleaseDecision | None,
capability_review: VerifierCapabilityReview | None = None,
) -> VerifierHumanReview:
required = merge_verdict in {
note = _self_approval_note(capability_review)
required = note is not None or merge_verdict in {
"human_review_required",
"blocked",
"insufficient_evidence",
"unknown",
}
if not required:
return VerifierHumanReview(required=False, why=None)
why = release_decision.reason if release_decision is not None else None
# A self-approval prohibition is the most important reason a human is
# needed; surface it ahead of the generic release-decision reason.
reason = release_decision.reason if release_decision is not None else None
return VerifierHumanReview(
required=True,
why=why or "A human must review this agent-capability change before merge.",
why=note
or reason
or "A human must review this agent-capability change before merge.",
)


Expand All @@ -560,14 +601,23 @@ def _first_next_action(
fix_task: VerifierFixTask | None,
agent_summary: AgentSummary | None,
reason: str | None,
capability_review: VerifierCapabilityReview | None = None,
) -> VerifierNextAction:
if merge_verdict == "mergeable":
self_approval = _self_approval_note(capability_review)
if merge_verdict == "mergeable" and self_approval is None:
return VerifierNextAction(
actor="coding_agent",
kind="none",
command=None,
why="No agent-capability changes gate this PR; safe to merge.",
)
if self_approval is not None and fix_task is None:
# Defensive self-approval path (e.g. a 'mergeable' verdict that still
# carries a self-approval note): a human must review — never emit the
# "safe to merge" action.
return VerifierNextAction(
actor="human", kind="review", command=None, why=self_approval
)
# The fix_task is the single repair contract; the headline next-step must
# not contradict it. Borrow the agent summary's concrete action (e.g. an
# apply-patches command) only when its implied actor agrees with the
Expand Down Expand Up @@ -611,8 +661,17 @@ def _first_next_action(


def _verifier_headline(
*, report: ReadinessReport | None, merge_verdict: MergeVerdict, head_status: str
*,
report: ReadinessReport | None,
merge_verdict: MergeVerdict,
head_status: str,
capability_review: VerifierCapabilityReview | None = None,
) -> str | None:
# An agent editing the rules that evaluate its own change must see the
# self-approval prohibition first, ahead of the generic scan headline.
note = _self_approval_note(capability_review)
if note is not None:
return note
if report is not None and report.agent_summary is not None:
return report.agent_summary.headline
if head_status == "skipped":
Expand Down Expand Up @@ -671,11 +730,13 @@ def _build_verifier(
)
decision = release_decision_model.decision if release_decision_model else None
merge_verdict = merge_verdict_for(decision=decision, head_status=head_status)
human_review = _human_review(
merge_verdict=merge_verdict, release_decision=release_decision_model
)
agent_summary_model = report.agent_summary if report is not None else None
capability_review = build_capability_review(report) if report is not None else None
human_review = _human_review(
merge_verdict=merge_verdict,
release_decision=release_decision_model,
capability_review=capability_review,
)
fix_task = build_fix_task(
report,
merge_verdict=merge_verdict,
Expand Down Expand Up @@ -723,18 +784,21 @@ def _build_verifier(
can_merge_without_human=_can_merge_without_human(
merge_verdict=merge_verdict,
release_decision=release_decision_model,
capability_review=capability_review,
),
headline=_verifier_headline(
report=report,
merge_verdict=merge_verdict,
head_status=head_status,
capability_review=capability_review,
),
human_review=human_review,
first_next_action=_first_next_action(
merge_verdict=merge_verdict,
fix_task=fix_task,
agent_summary=agent_summary_model,
reason=release_decision_model.reason if release_decision_model else None,
capability_review=capability_review,
),
fix_task=fix_task,
artifacts=artifacts,
Expand Down
149 changes: 149 additions & 0 deletions tests/test_self_approval_signal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
"""PR-D: the self-approval prohibition is surfaced at the top of verifier.json.

When a PR edits the rules that evaluate it — a weakened release policy or a
touched trust root — a coding agent must not silently self-approve (reward
hacking). That prohibition is the verifier headline and the human-review
reason, not buried in a fix_task instruction.
"""

from __future__ import annotations

from agents_shipgate.cli.verify.orchestrator import (
_can_merge_without_human,
_first_next_action,
_human_review,
_self_approval_note,
_verifier_headline,
)
from agents_shipgate.schemas.verifier import VerifierCapabilityReview


def _cr(**kwargs) -> VerifierCapabilityReview:
return VerifierCapabilityReview(**kwargs)


# --- the note itself --------------------------------------------------------


def test_policy_weakened_note_calls_out_self_approval() -> None:
note = _self_approval_note(_cr(policy_weakened=True))
assert note is not None
assert "self-approve" in note
assert "policy" in note


def test_trust_root_note_calls_out_self_approval() -> None:
note = _self_approval_note(_cr(trust_root_touched=True))
assert note is not None
assert "self-approve" in note
assert "trust root" in note


def test_policy_weakening_takes_precedence_over_trust_root() -> None:
note = _self_approval_note(_cr(policy_weakened=True, trust_root_touched=True))
assert note is not None
assert "policy" in note


def test_clean_review_has_no_note() -> None:
assert _self_approval_note(_cr()) is None
assert _self_approval_note(None) is None


# --- human_review surfacing -------------------------------------------------


def test_human_review_why_leads_with_self_approval_note() -> None:
review = _human_review(
merge_verdict="human_review_required",
release_decision=None,
capability_review=_cr(policy_weakened=True),
)
assert review.required is True
assert review.why is not None
assert "self-approve" in review.why


def test_self_approval_forces_human_review_even_if_verdict_not_human() -> None:
# Defensive: a weakened policy must require a human even if some other path
# produced a non-human verdict — the agent can never clear its own gate.
review = _human_review(
merge_verdict="mergeable",
release_decision=None,
capability_review=_cr(trust_root_touched=True),
)
assert review.required is True
assert review.why is not None
assert "self-approve" in review.why


# --- headline surfacing -----------------------------------------------------


def test_headline_leads_with_self_approval_note() -> None:
headline = _verifier_headline(
report=None,
merge_verdict="human_review_required",
head_status="succeeded",
capability_review=_cr(trust_root_touched=True),
)
assert headline is not None
assert "self-approve" in headline


def test_headline_without_note_falls_back_to_default() -> None:
headline = _verifier_headline(
report=None,
merge_verdict="unknown",
head_status="failed",
capability_review=_cr(),
)
assert headline == "Shipgate could not complete the scan; human review required."


# --- convenience fields stay consistent in the defensive case ---------------


def test_self_approval_blocks_can_merge_without_human() -> None:
assert (
_can_merge_without_human(
merge_verdict="mergeable",
release_decision=None,
capability_review=_cr(policy_weakened=True),
)
is False
)


def test_self_approval_first_next_action_routes_to_human_when_mergeable() -> None:
# The defensive path: a 'mergeable' verdict carrying a self-approval note
# must not emit "safe to merge"; the next step is a human review.
action = _first_next_action(
merge_verdict="mergeable",
fix_task=None,
agent_summary=None,
reason=None,
capability_review=_cr(trust_root_touched=True),
)
assert action.actor == "human"
assert action.kind == "review"
assert "self-approve" in action.why


def test_clean_mergeable_still_merges_and_keeps_safe_action() -> None:
# Regression: with no self-approval note, mergeable behaves as before.
assert (
_can_merge_without_human(
merge_verdict="mergeable", release_decision=None, capability_review=_cr()
)
is True
)
action = _first_next_action(
merge_verdict="mergeable",
fix_task=None,
agent_summary=None,
reason=None,
capability_review=_cr(),
)
assert action.actor == "coding_agent"
assert action.kind == "none"