Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
211 changes: 211 additions & 0 deletions artifacts/multi_family_admissibility_results.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,211 @@
{
"artifact_id": "multi_family_admissibility_results_v1",
"families": [
{
"curve": {
"curve_id": "coding_workflow_pr_review_curve_v1",
"generated_by": "DegradationCurveGenerator",
"points": [
{
"expected_admissible": true,
"failed_contracts": [],
"failure_labels": [],
"fixture_id": "coding_workflow_pr_review_v1",
"fixture_path": "fixtures/coding_workflow_pr_review_v1",
"fixture_version": "1.0.0",
"governance_score": 1.0,
"observed_admissible": true,
"operational_score": 1.0,
"overall_admissibility_score": 1.0,
"passed_contracts": [
"no_orphan_tool_calls",
"pre_merge_review",
"recovery_path_available",
"security_causal_block"
],
"relational_score": 1.0,
"structural_score": 1.0
},
{
"expected_admissible": false,
"failed_contracts": [
"recovery_path_available"
],
"failure_labels": [
"RECOVERY_PATH_INVALID"
],
"fixture_id": "coding_workflow_pr_review_mild_v1",
"fixture_path": "fixtures/coding_workflow_pr_review_mild_v1",
"fixture_version": "1.0.0",
"governance_score": 1.0,
"observed_admissible": false,
"operational_score": 1.0,
"overall_admissibility_score": 0.9166666666666666,
"passed_contracts": [
"no_orphan_tool_calls",
"pre_merge_review",
"security_causal_block"
],
"relational_score": 0.6666666666666666,
"structural_score": 1.0
},
{
"expected_admissible": false,
"failed_contracts": [
"recovery_path_available",
"security_causal_block"
],
"failure_labels": [
"CAUSAL_DEPENDENCY_LOSS",
"RECOVERY_PATH_INVALID"
],
"fixture_id": "coding_workflow_pr_review_moderate_v1",
"fixture_path": "fixtures/coding_workflow_pr_review_moderate_v1",
"fixture_version": "1.0.0",
"governance_score": 1.0,
"observed_admissible": false,
"operational_score": 1.0,
"overall_admissibility_score": 0.8333333333333334,
"passed_contracts": [
"no_orphan_tool_calls",
"pre_merge_review"
],
"relational_score": 0.3333333333333333,
"structural_score": 1.0
},
{
"expected_admissible": false,
"failed_contracts": [
"no_orphan_tool_calls",
"pre_merge_review",
"recovery_path_available",
"security_causal_block"
],
"failure_labels": [
"CAUSAL_DEPENDENCY_LOSS",
"INVARIANT_VIOLATION",
"POLICY_ORDER_BROKEN",
"RECOVERY_PATH_INVALID"
],
"fixture_id": "coding_workflow_pr_review_degraded_v1",
"fixture_path": "fixtures/coding_workflow_pr_review_degraded_v1",
"fixture_version": "1.0.0",
"governance_score": 1.0,
"observed_admissible": false,
"operational_score": 0.0,
"overall_admissibility_score": 0.5,
"passed_contracts": [],
"relational_score": 0.0,
"structural_score": 1.0
}
],
"version": "1.0"
},
"family": "coding_workflow_pr_review"
},
{
"curve": {
"curve_id": "incident_response_page_triage_curve_v1",
"generated_by": "DegradationCurveGenerator",
"points": [
{
"expected_admissible": true,
"failed_contracts": [],
"failure_labels": [],
"fixture_id": "incident_response_page_triage_v1",
"fixture_path": "fixtures/incident_response_page_triage_v1",
"fixture_version": "1.0.0",
"governance_score": 1.0,
"observed_admissible": true,
"operational_score": 1.0,
"overall_admissibility_score": 1.0,
"passed_contracts": [
"alert_ack_before_mitigation",
"no_orphan_mitigation_steps",
"rollback_reachable",
"root_cause_links_incident"
],
"relational_score": 1.0,
"structural_score": 1.0
},
{
"expected_admissible": false,
"failed_contracts": [
"rollback_reachable"
],
"failure_labels": [
"RECOVERY_PATH_INVALID"
],
"fixture_id": "incident_response_page_triage_mild_v1",
"fixture_path": "fixtures/incident_response_page_triage_mild_v1",
"fixture_version": "1.0.0",
"governance_score": 1.0,
"observed_admissible": false,
"operational_score": 1.0,
"overall_admissibility_score": 0.9166666666666666,
"passed_contracts": [
"alert_ack_before_mitigation",
"no_orphan_mitigation_steps",
"root_cause_links_incident"
],
"relational_score": 0.6666666666666666,
"structural_score": 1.0
},
{
"expected_admissible": false,
"failed_contracts": [
"no_orphan_mitigation_steps",
"rollback_reachable"
],
"failure_labels": [
"INVARIANT_VIOLATION",
"RECOVERY_PATH_INVALID"
],
"fixture_id": "incident_response_page_triage_moderate_v1",
"fixture_path": "fixtures/incident_response_page_triage_moderate_v1",
"fixture_version": "1.0.0",
"governance_score": 1.0,
"observed_admissible": false,
"operational_score": 1.0,
"overall_admissibility_score": 0.8333333333333334,
"passed_contracts": [
"alert_ack_before_mitigation",
"root_cause_links_incident"
],
"relational_score": 0.3333333333333333,
"structural_score": 1.0
},
Comment on lines +154 to +177
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The moderate degradation point for the incident_response_page_triage family is identical to the mild point (lines 131-154) in terms of scores, failed contracts, and failure labels. For a benchmark artifact intended to represent a degradation curve, these levels should ideally show progressive loss of quality. Please check if the fixtures for this family need to be updated to provide more granular degradation steps.

{
"expected_admissible": false,
"failed_contracts": [
"alert_ack_before_mitigation",
"no_orphan_mitigation_steps",
"rollback_reachable",
"root_cause_links_incident"
],
"failure_labels": [
"CAUSAL_DEPENDENCY_LOSS",
"INVARIANT_VIOLATION",
"POLICY_ORDER_BROKEN",
"RECOVERY_PATH_INVALID"
],
"fixture_id": "incident_response_page_triage_degraded_v1",
"fixture_path": "fixtures/incident_response_page_triage_degraded_v1",
"fixture_version": "1.0.0",
"governance_score": 1.0,
"observed_admissible": false,
"operational_score": 0.0,
"overall_admissibility_score": 0.5,
"passed_contracts": [],
"relational_score": 0.0,
"structural_score": 1.0
}
],
"version": "1.0"
},
"family": "incident_response_page_triage"
}
],
"generated_by": "DegradationCurveGenerator",
"version": "1.0"
}
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,12 @@
"target": "escalate_to_human",
"relation": "RECOVERY",
"metadata": {}
},
{
"source": "incident_classified",
"target": "rollback_available",
"relation": "RECOVERY",
"metadata": {}
}
]
}
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"test:replay": "pytest tests/test_paper_replay_bench.py tests/test_agent_trace_replay.py tests/test_replay_continuity.py -q",
"layout": "python scripts/check_repo_layout.py",
"check": "npm run layout && npm run typecheck && npm run validate && npm run build && npm run test",
"generate:layered-admissibility": "python scripts/generate_layered_admissibility_artifact.py"
"generate:layered-admissibility": "python scripts/generate_layered_admissibility_artifact.py",
"generate:multi-family-admissibility": "python scripts/generate_multi_family_admissibility_artifact.py"
}
}
72 changes: 72 additions & 0 deletions scripts/generate_multi_family_admissibility_artifact.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
"""Deterministic entrypoint for multi-family admissibility artifact regeneration."""

from __future__ import annotations

import json
import sys
from pathlib import Path
from typing import Any

REPO_ROOT = Path(__file__).resolve().parents[1]
if str(REPO_ROOT) not in sys.path:
sys.path.insert(0, str(REPO_ROOT))

from src.validation.degradation_curve_generator import (
LAYERED_CURVE_LEVELS,
MANIFEST_PATH,
DegradationCurveGenerator,
)

# Identifier stored under the "artifact_id" key of the generated payload.
ARTIFACT_ID = "multi_family_admissibility_results_v1"
# Default destination of the generated artifact. The path is relative, so it is
# resolved against the current working directory of the calling process.
OUTPUT_PATH = Path("artifacts/multi_family_admissibility_results.json")


def _families_with_standard_levels(
    manifest_path: Path = MANIFEST_PATH,
    levels: tuple[str, ...] = LAYERED_CURVE_LEVELS,
) -> tuple[str, ...]:
    """Return the sorted manifest families that declare every level in ``levels``.

    Args:
        manifest_path: Path to the JSON fixture manifest. The document must be
            an object carrying a ``"fixtures"`` list of entry objects.
        levels: Degradation levels a family must provide to be selected.

    Returns:
        A sorted tuple of family names whose manifest entries, taken together,
        cover all of ``levels``. Entries with a missing or empty ``family`` or
        ``degradation_level`` value are ignored.

    Raises:
        ValueError: If the manifest is not a JSON object, lacks a ``fixtures``
            list, or contains a fixture entry that is not an object.
    """
    manifest = json.loads(manifest_path.read_text(encoding="utf-8"))
    # json.loads may return a list/str/number; calling .get() on those would
    # raise AttributeError instead of the format error reported below.
    fixtures = manifest.get("fixtures") if isinstance(manifest, dict) else None
    if not isinstance(fixtures, list):
        raise ValueError(f"invalid fixture manifest format: {manifest_path}")

    family_to_levels: dict[str, set[str]] = {}
    for entry in fixtures:
        if not isinstance(entry, dict):
            raise ValueError(
                f"invalid fixture manifest entry in {manifest_path}: {entry!r}"
            )
        family = entry.get("family")
        level = entry.get("degradation_level")
        if not family or not level:
            continue
        family_to_levels.setdefault(str(family), set()).add(str(level))

    # Build the required-level set once instead of once per family.
    required_levels = frozenset(levels)
    return tuple(
        sorted(
            family
            for family, family_levels in family_to_levels.items()
            if required_levels <= family_levels
        )
    )


def generate_multi_family_admissibility_artifact(output_path: Path = OUTPUT_PATH) -> Path:
    """Write the multi-family admissibility artifact and return its path.

    One curve is generated per family reported by
    ``_families_with_standard_levels()``, using the curve id
    ``"<family>_curve_v1"``. The payload is serialised as JSON with two-space
    indentation, sorted keys and a trailing newline.

    Args:
        output_path: Destination file; missing parent directories are created.

    Returns:
        ``output_path``, unchanged.
    """
    generator = DegradationCurveGenerator()

    families_payload: list[dict[str, Any]] = [
        {
            "family": family_name,
            "curve": generator.to_dict(
                generator.generate(
                    generator.fixtures_for_manifest_family(family_name),
                    curve_id=f"{family_name}_curve_v1",
                )
            ),
        }
        for family_name in _families_with_standard_levels()
    ]

    payload = {
        "artifact_id": ARTIFACT_ID,
        "generated_by": DegradationCurveGenerator.__name__,
        "version": "1.0",
        "families": families_payload,
    }
    serialized = json.dumps(payload, indent=2, sort_keys=True) + "\n"

    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(serialized, encoding="utf-8")
    return output_path


def main() -> int:
    """Regenerate the artifact at its default path, print that path in POSIX form, and return 0."""
    print(generate_multi_family_admissibility_artifact().as_posix())
    return 0


# Script entry point: exit the process with the status returned by main().
if __name__ == "__main__":
    sys.exit(main())
8 changes: 8 additions & 0 deletions tests/test_artifact_reproducibility.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@
from scripts.generate_layered_admissibility_artifact import (
generate_layered_admissibility_artifact,
)
from scripts.generate_multi_family_admissibility_artifact import (
generate_multi_family_admissibility_artifact,
)


@dataclass(frozen=True)
Expand All @@ -25,6 +28,11 @@ class DeterministicArtifactSpec:
committed_path=Path("artifacts/layered_admissibility_results.json"),
regenerate=generate_layered_admissibility_artifact,
),
DeterministicArtifactSpec(
name="multi_family_admissibility_results",
committed_path=Path("artifacts/multi_family_admissibility_results.json"),
regenerate=generate_multi_family_admissibility_artifact,
),
)


Expand Down
Loading
Loading