diff --git a/docs/failure_taxonomy.md b/docs/failure_taxonomy.md
new file mode 100644
index 0000000..2c59969
--- /dev/null
+++ b/docs/failure_taxonomy.md
@@ -0,0 +1,32 @@
+# Deterministic Operational Failure Taxonomy
+
+This taxonomy defines stable replay/admissibility failure labels with explicit operational semantics. Every label maps to an observable deterministic condition, a failed contract/invariant type, or explicit artifact/metric drift.
+
+Non-goals:
+- no semantic-only labels
+- no fuzzy-matching labels
+- no model-judged labels
+
+Canonical source for registered labels and field definitions: `src/validation/failure_taxonomy.py`.
+
+## Required fields per label
+
+Each registered label includes:
+- label name (the registry key)
+- operational meaning (`operational_meaning`)
+- observable trigger (`observable_trigger`)
+- linked contract or invariant type (`contract_or_invariant_type`)
+- severity class (`severity_class`)
+- explicit non-goal (`non_goal`: what it must not mean)
+
+## Preferred labels hardened in this taxonomy
+
+- `TOOL_ORDER_VIOLATION`
+- `RECOVERY_PATH_LOSS`
+- `BLOCKER_DETACHMENT`
+- `GOVERNANCE_DRIFT`
+- `DEPENDENCY_CHAIN_BREAK`
+- `EVIDENCE_SURVIVAL_LOSS`
+- `HIGH_CRITICAL_EVIDENCE_LOSS`
+
+These preferred labels are operationally defined in the canonical registry, regardless of whether a given fixture family currently emits each one.
diff --git a/src/validation/failure_taxonomy.py b/src/validation/failure_taxonomy.py
new file mode 100644
index 0000000..cf7960b
--- /dev/null
+++ b/src/validation/failure_taxonomy.py
@@ -0,0 +1,164 @@
+"""Deterministic operational failure taxonomy for replay admissibility validation."""
+
+from __future__ import annotations
+
+from typing import Final
+
+# Substrings that must not appear in any registered label name; the taxonomy tests
+# compare each lowercased label against every term in this tuple.
+BANNED_FUZZY_TERMS: Final[tuple[str, ...]] = (
+    "ambiguous",
+    "semantic",
+    "fuzzy",
+    "llm_judge",
+    "reasoning",
+    "confusion",
+)
+
+# Registry keyed by label name. Every entry carries the same five string fields:
+# operational_meaning, observable_trigger, contract_or_invariant_type,
+# severity_class, and non_goal.
+FAILURE_TAXONOMY: Final[dict[str, dict[str, str]]] = {
+    # Preferred labels, in the order listed in docs/failure_taxonomy.md.
+    "TOOL_ORDER_VIOLATION": {
+        "operational_meaning": "Observed tool execution sequence diverges from required deterministic step ordering.",
+        "observable_trigger": "A strict order assertion fails when replayed tool-call indices are compared to contract order.",
+        "contract_or_invariant_type": "ordering",
+        "severity_class": "high",
+        "non_goal": "Not a semantic quality judgment; only deterministic ordering mismatch.",
+    },
+    "RECOVERY_PATH_LOSS": {
+        "operational_meaning": "At least one required recovery route is not preserved in reconstructed dependency paths.",
+        "observable_trigger": "Reachability from a required failure node to one or more recovery targets is absent.",
+        "contract_or_invariant_type": "reachability",
+        "severity_class": "high",
+        "non_goal": "Not a probabilistic prediction about future recovery behavior.",
+    },
+    "BLOCKER_DETACHMENT": {
+        "operational_meaning": "Blocking constraints no longer remain attached at full survival during replay.",
+        "observable_trigger": "blocker_survival_rate < 1.0 in deterministic replay metrics.",
+        "contract_or_invariant_type": "operational_metric",
+        "severity_class": "high",
+        "non_goal": "Not a narrative claim about intent; metric-derived only.",
+    },
+    "GOVERNANCE_DRIFT": {
+        "operational_meaning": "Governance-linked constraints drift below full deterministic preservation.",
+        "observable_trigger": "Governance layer contract score or required governance metric falls below 1.0.",
+        "contract_or_invariant_type": "governance",
+        "severity_class": "medium",
+        "non_goal": "Not a policy interpretation beyond explicit contract outputs.",
+    },
+    "DEPENDENCY_CHAIN_BREAK": {
+        "operational_meaning": "Required dependency edges or causal chains are missing in reconstructed graphs.",
+        "observable_trigger": "Comparator reports missing dependency or causal-edge preservation below full coverage.",
+        "contract_or_invariant_type": "relational_dependency",
+        "severity_class": "high",
+        "non_goal": "Not an inferred semantic relation; strictly graph-structural.",
+    },
+    "EVIDENCE_SURVIVAL_LOSS": {
+        "operational_meaning": "Evidence units expected in replay are not fully preserved.",
+        "observable_trigger": "has_evidence is true and (evidence_survived < evidence_total or evidence_survival_rate < 1.0).",
+        "contract_or_invariant_type": "operational_metric",
+        "severity_class": "medium",
+        "non_goal": "Not a free-form evidence relevance judgement.",
+    },
+    "HIGH_CRITICAL_EVIDENCE_LOSS": {
+        "operational_meaning": "High-critical evidence survival is below full preservation.",
+        "observable_trigger": "has_high_critical_evidence is true and high_critical_evidence_survival_rate < 1.0.",
+        "contract_or_invariant_type": "operational_metric",
+        "severity_class": "critical",
+        "non_goal": "Not any low-priority evidence drop; only high-critical metric-gated loss.",
+    },
+    # Additional registered labels.
+    "POLICY_ORDER_BROKEN": {
+        "operational_meaning": "Ordering contract failed for policy-ordered replay steps.",
+        "observable_trigger": "ContractValidator ordering contract returns passed == false.",
+        "contract_or_invariant_type": "ordering",
+        "severity_class": "high",
+        "non_goal": "Not an evaluation of policy correctness, only order conformance.",
+    },
+    "RECOVERY_PATH_INVALID": {
+        "operational_meaning": "Reachability contract indicates required recovery path set is not satisfied.",
+        "observable_trigger": "ContractValidator reachability contract returns passed == false.",
+        "contract_or_invariant_type": "reachability",
+        "severity_class": "high",
+        "non_goal": "Not a runtime incident-resolution claim outside replay graph checks.",
+    },
+    "CAUSAL_DEPENDENCY_LOSS": {
+        "operational_meaning": "Required causal dependency edges are missing in reconstruction.",
+        "observable_trigger": "Causality contract or comparator reports missing required causal edges.",
+        "contract_or_invariant_type": "causality",
+        "severity_class": "high",
+        "non_goal": "Not temporal speculation; explicit causal-edge checks only.",
+    },
+    "INVARIANT_VIOLATION": {
+        "operational_meaning": "A predefined deterministic invariant is violated.",
+        "observable_trigger": "Invariant contract evaluation returns passed == false.",
+        "contract_or_invariant_type": "invariant",
+        "severity_class": "high",
+        "non_goal": "Not a broad quality label; only declared invariant failure.",
+    },
+    "ORPHAN_DEPENDENCY": {
+        "operational_meaning": "Nodes requiring upstream dependencies became orphaned after replay.",
+        "observable_trigger": "Comparator orphan_rate > 0 with affected node evidence.",
+        "contract_or_invariant_type": "relational_dependency",
+        "severity_class": "high",
+        "non_goal": "Not missing optional links; only required incoming dependency loss.",
+    },
+    "DETACHED_DEPENDENCY": {
+        "operational_meaning": "Required dependency edges are detached in reconstructed graph.",
+        "observable_trigger": "Comparator detached_dependency_rate > 0.",
+        "contract_or_invariant_type": "relational_dependency",
+        "severity_class": "high",
+        "non_goal": "Not a semantic mismatch; edge-presence based only.",
+    },
+    "CYCLE_INTRODUCED": {
+        "operational_meaning": "Reconstructed graph introduces cyclic dependency where baseline is acyclic.",
+        "observable_trigger": "Comparator acyclicity_preserved == false.",
+        "contract_or_invariant_type": "relational_dependency",
+        "severity_class": "high",
+        "non_goal": "Not a cycle severity estimate; binary structural condition only.",
+    },
+    "GRAPH_FRAGMENTATION": {
+        "operational_meaning": "Connected dependency structure fragments across replayed graph segments.",
+        "observable_trigger": "Comparator dependency_integrity/reachability evidence indicates fragmentation failure.",
+        "contract_or_invariant_type": "relational_dependency",
+        "severity_class": "medium",
+        "non_goal": "Not about organizational context loss; graph-connectivity only.",
+    },
+    "TEMPORAL_ORDER_VIOLATION": {
+        "operational_meaning": "Relative deterministic topological order over shared nodes is violated.",
+        "observable_trigger": "Comparator temporal_order_violation_rate > 0.",
+        "contract_or_invariant_type": "relational_temporal",
+        "severity_class": "medium",
+        "non_goal": "Not wall-clock latency drift; ordering relation only.",
+    },
+    "ARTIFACT_INTEGRITY_VIOLATION": {
+        "operational_meaning": "Expected artifact fields drift from checked deterministic contract bundle.",
+        "observable_trigger": "Artifact drift or expected artifact parity checks fail.",
+        "contract_or_invariant_type": "artifact_integrity",
+        "severity_class": "critical",
+        "non_goal": "Not formatting-only differences; contract-relevant drift.",
+    },
+    "REPLAY_NON_REPRODUCIBLE": {
+        "operational_meaning": "Replay output fails reproducibility requirements under fixed deterministic inputs.",
+        "observable_trigger": "Reproducibility check reports non-stable artifact or score output.",
+        "contract_or_invariant_type": "reproducibility",
+        "severity_class": "critical",
+        "non_goal": "Not a single-run runtime fault outside deterministic replay validation.",
+    },
+    "CONSTRAINT_DRIFT": {
+        "operational_meaning": "Constraint preservation falls below full deterministic survival.",
+        "observable_trigger": "constraint_survival_rate < 1.0 in replay metrics.",
+        "contract_or_invariant_type": "operational_metric",
+        "severity_class": "medium",
+        "non_goal": "Not policy reinterpretation; metric threshold only.",
+    },
+    "EVIDENCE_LOSS": {
+        "operational_meaning": "Evidence preservation falls below required full survival.",
+        "observable_trigger": "has_evidence is true and evidence metrics indicate < 1.0 survival.",
+        "contract_or_invariant_type": "operational_metric",
+        "severity_class": "medium",
+        "non_goal": "Not semantic evidence relevance scoring.",
+    },
+}
diff --git a/tests/test_failure_taxonomy.py b/tests/test_failure_taxonomy.py
new file mode 100644
index 0000000..bda138f
--- /dev/null
+++ b/tests/test_failure_taxonomy.py
@@ -0,0 +1,78 @@
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+from src.validation.failure_taxonomy import BANNED_FUZZY_TERMS, FAILURE_TAXONOMY
+
+
+ROOT = Path(__file__).resolve().parents[1]
+
+
+def _load_json(path: Path) -> object:
+    """Parse the UTF-8 JSON document stored at *path*."""
+    return json.loads(path.read_text(encoding="utf-8"))
+
+
+def _collect_fixture_failure_labels() -> set[str]:
+    """Gather labels from the three failure lists in fixtures/**/expected/failures.json."""
+    found: set[str] = set()
+    label_keys = ("expected_failures", "allowed_failures", "disallowed_failures")
+    for failures_file in sorted((ROOT / "fixtures").glob("**/expected/failures.json")):
+        document = _load_json(failures_file)
+        if isinstance(document, dict):
+            for entries in (document.get(key, []) for key in label_keys):
+                if isinstance(entries, list):
+                    found.update(map(str, entries))
+    return found
+
+
+def _collect_artifact_failure_labels() -> set[str]:
+    """Gather every `failure_labels` list entry found at any depth in artifacts/*.json."""
+    found: set[str] = set()
+    for artifact_file in sorted((ROOT / "artifacts").glob("*.json")):
+        pending: list[object] = [_load_json(artifact_file)]
+        # Explicit stack instead of recursion; a set makes traversal order irrelevant.
+        while pending:
+            node = pending.pop()
+            if isinstance(node, dict):
+                for key, nested in node.items():
+                    if key == "failure_labels" and isinstance(nested, list):
+                        found.update(map(str, nested))
+                    pending.append(nested)
+            elif isinstance(node, list):
+                pending.extend(node)
+    return found
+
+
+def test_fixture_expected_failure_labels_are_registered() -> None:
+    """Every label collected from the fixture files must be a FAILURE_TAXONOMY key."""
+    missing = sorted(_collect_fixture_failure_labels().difference(FAILURE_TAXONOMY))
+    assert not missing, f"fixture labels missing from failure taxonomy: {missing}"
+
+
+def test_artifact_failure_labels_are_registered() -> None:
+    """Every label collected from the artifact files must be a FAILURE_TAXONOMY key."""
+    missing = sorted(_collect_artifact_failure_labels().difference(FAILURE_TAXONOMY))
+    assert not missing, f"artifact labels missing from failure taxonomy: {missing}"
+
+
+def test_registered_labels_have_required_operational_fields() -> None:
+    """Each registry entry must hold a non-blank string for every required field."""
+    required_fields = (
+        "operational_meaning", "observable_trigger", "contract_or_invariant_type",
+        "severity_class", "non_goal",
+    )
+    for label, spec in FAILURE_TAXONOMY.items():
+        for field in required_fields:
+            value = spec.get(field, "")
+            is_text = isinstance(value, str)
+            has_content = is_text and bool(value.strip())
+            assert has_content, f"label {label} missing required field {field}"
+
+
+def test_registered_labels_do_not_use_banned_fuzzy_terms() -> None:
+    """No registered label name may contain a banned fuzzy term (case-insensitive)."""
+    pairs = ((label, banned) for label in FAILURE_TAXONOMY for banned in BANNED_FUZZY_TERMS)
+    for label, banned in pairs:
+        assert banned not in label.lower(), f"label '{label}' contains banned fuzzy term '{banned}'"