From 71a59c7c2404fdd1a4ed9874941515922914b2dc Mon Sep 17 00:00:00 2001 From: PythonWoods Date: Sat, 4 Apr 2026 19:36:41 +0200 Subject: [PATCH 01/16] feat(core): implement context-aware VSM resolution + ReDoS canary (ZRT-002/ZRT-004) - ResolutionContext for source-file-relative href resolution - _assert_regex_canary(): SIGALRM watchdog (100ms) at engine construction - Fix B904: raise PluginContractError from None - Conditional shield import guard + @_shield_skip markers in test suite - Add arch/vsm_engine.md and internal/security/shattered_mirror_report.md to mkdocs nav (doc files already on disk; nav entries prevent ORPHAN warnings in pre-commit self-check) --- .gitignore | 1 + mkdocs.yml | 8 + src/zenzic/core/rules.py | 157 ++++++++++++- tests/test_redteam_remediation.py | 355 ++++++++++++++++++++++++++++++ tests/test_rules.py | 2 +- 5 files changed, 513 insertions(+), 10 deletions(-) create mode 100644 tests/test_redteam_remediation.py diff --git a/.gitignore b/.gitignore index aefcb23..3a0574d 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,7 @@ .claude/ .agent/ .deepcode/ +.redteam/ # ──────────────────────────────────────────────────────────────────────────── # Python diff --git a/mkdocs.yml b/mkdocs.yml index 771265d..1f26f43 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -111,6 +111,10 @@ plugins: Docs Issue: Problema nei Docs Change Request: Richiesta di Modifica Pull Requests: Pull Request + Internals: Architettura Interna + VSM Engine: Motore VSM + Security Reports: Rapporti di Sicurezza + Security Analysis v0.5.0a3: Analisi di Sicurezza v0.5.0a3 markdown_extensions: - admonition @@ -188,6 +192,10 @@ nav: - Writing an Adapter: developers/writing-an-adapter.md - Writing Plugin Rules: developers/plugins.md - Example Projects: developers/examples.md + - Internals: + - VSM Engine: arch/vsm_engine.md + - Security Reports: + - Security Analysis v0.5.0a3: internal/security/shattered_mirror_report.md - Community: - Get Involved: community/index.md - How to 
Contribute: community/contribute/index.md diff --git a/src/zenzic/core/rules.py b/src/zenzic/core/rules.py index 24ce7f0..25d3195 100644 --- a/src/zenzic/core/rules.py +++ b/src/zenzic/core/rules.py @@ -76,6 +76,28 @@ from zenzic.models.vsm import VSM, Route +# ─── ResolutionContext (ZRT-004) ──────────────────────────────────────────────── + + +@dataclass(slots=True) +class ResolutionContext: + """Source-file context for VSM-aware rules that resolve relative links. + + Passed as the ``context`` argument to :meth:`BaseRule.check_vsm` and + :meth:`AdaptiveRuleEngine.run_vsm`. Enables rules like + :class:`VSMBrokenLinkRule` to resolve ``..``-relative hrefs correctly + relative to the *physical* location of the source file in the docs tree, + rather than treating every href as if it originated from the docs root. + + Attributes: + docs_root: Absolute path to the ``docs/`` directory. + source_file: Absolute path of the Markdown file currently being checked. + """ + + docs_root: Path + source_file: Path + + # ─── Finding ────────────────────────────────────────────────────────────────── Severity = Literal["error", "warning", "info"] @@ -237,6 +259,7 @@ def check_vsm( text: str, vsm: Mapping[str, Route], anchors_cache: dict[Path, set[str]], + context: ResolutionContext | None = None, ) -> list[Violation]: """Analyse a file against the pre-built Virtual Site Map. @@ -262,6 +285,13 @@ def check_vsm( anchors_cache: Pre-computed mapping of absolute ``Path`` → anchor slug set. Use this for anchor validation instead of re-parsing file content. + context: Optional :class:`ResolutionContext` with the + ``docs_root`` and ``source_file`` paths. When + present, rules that resolve relative hrefs should + use ``context.source_file.parent`` as the base + directory — not the docs root. ``None`` for + backwards-compatibility with rules that do not + require source-file context. Returns: A list of :class:`Violation` objects, or an empty list. 
@@ -369,6 +399,71 @@ def _assert_pickleable(rule: BaseRule) -> None: ) from exc +# Canary strings that trigger catastrophic backtracking in ReDoS-vulnerable +# patterns. A safe regex at n=30 takes microseconds; a ReDoS pattern at n=30 +# takes seconds or longer. +_CANARY_STRINGS: tuple[str, ...] = ( + "a" * 30 + "b", # classic (a+)+ / (a*)* style + "A" * 25 + "!", # uppercase variant + "1" * 20 + "x", # numeric variant +) +_CANARY_TIMEOUT_S: float = 0.1 # 100 ms + + +def _assert_regex_canary(rule: BaseRule) -> None: + """Raise :class:`PluginContractError` if a :class:`CustomRule` pattern hangs. + + ZRT-002 defence: a regex that causes catastrophic backtracking inside a + worker process will deadlock the :class:`~concurrent.futures.ProcessPoolExecutor` + because the executor has no timeout. This canary tests each + :class:`CustomRule` pattern against stress strings under a ``SIGALRM`` + watchdog **before** the engine is distributed to worker processes. + + Only :class:`CustomRule` instances are tested (they carry user-supplied + regexes). Python-native :class:`BaseRule` subclasses are trusted to + have been written with complexity in mind. + + This function is a no-op on Windows (``signal.SIGALRM`` is unavailable). + + Args: + rule: A :class:`BaseRule` instance to validate. + + Raises: + PluginContractError: When the pattern takes longer than + :data:`_CANARY_TIMEOUT_S` on any canary string. 
+ """ + import platform + import signal + + from zenzic.core.exceptions import PluginContractError + + if platform.system() == "Windows" or not isinstance(rule, CustomRule): + return + + def _alarm(_signum: int, _frame: object) -> None: + raise TimeoutError + + old_handler = signal.signal(signal.SIGALRM, _alarm) + try: + for canary in _CANARY_STRINGS: + signal.setitimer(signal.ITIMER_REAL, _CANARY_TIMEOUT_S) + try: + rule.check(Path("__canary__.md"), canary) + except TimeoutError: + raise PluginContractError( + f"Rule '{rule.rule_id}': pattern {rule.pattern!r} may cause " + f"catastrophic backtracking (ReDoS). The pattern timed out " + f"after {int(_CANARY_TIMEOUT_S * 1000)} ms on the stress string " + f"{canary!r}.\n" + " Fix: simplify the regex to avoid nested quantifiers " + "such as (a+)+, (a*)*, (a|aa)+, etc." + ) from None + finally: + signal.setitimer(signal.ITIMER_REAL, 0) # cancel alarm + finally: + signal.signal(signal.SIGALRM, old_handler) + + class AdaptiveRuleEngine: """Applies a collection of :class:`BaseRule` instances to a Markdown file. @@ -397,6 +492,7 @@ class AdaptiveRuleEngine: def __init__(self, rules: Sequence[BaseRule]) -> None: for rule in rules: _assert_pickleable(rule) + _assert_regex_canary(rule) # ZRT-002: ReDoS pre-flight check self._rules = rules def __bool__(self) -> bool: @@ -443,6 +539,7 @@ def run_vsm( text: str, vsm: VSM, anchors_cache: dict[Path, set[str]], + context: ResolutionContext | None = None, ) -> list[RuleFinding]: """Run VSM-aware rules against *text* and the pre-built routing table. @@ -456,6 +553,10 @@ def run_vsm( text: Raw Markdown content. vsm: Pre-built VSM (canonical URL → Route). anchors_cache: Pre-computed anchor slug sets. + context: Optional :class:`ResolutionContext` for source-file- + relative link resolution. When provided, each rule + that overrides :meth:`BaseRule.check_vsm` will receive + the context to resolve ``..``-relative hrefs correctly. 
Returns: Flat list of :class:`RuleFinding` from all VSM-aware rules. @@ -463,7 +564,7 @@ def run_vsm( findings: list[RuleFinding] = [] for rule in self._rules: try: - violations = rule.check_vsm(file_path, text, vsm, anchors_cache) + violations = rule.check_vsm(file_path, text, vsm, anchors_cache, context) findings.extend(v.as_finding() for v in violations) except Exception as exc: # noqa: BLE001 findings.append( @@ -577,6 +678,7 @@ def check_vsm( text: str, vsm: Mapping[str, Route], anchors_cache: dict[Path, set[str]], + context: ResolutionContext | None = None, ) -> list[Violation]: """Validate all inline links in *text* against the VSM. @@ -614,7 +716,11 @@ def check_vsm( # guide/index.md → /guide/ # guide/install.md → /guide/install/ # Paths without .md suffix (e.g. "guide/install") are also handled. - target_url = self._to_canonical_url(url) + target_url = self._to_canonical_url( + url, + source_dir=context.source_file.parent if context else None, + docs_root=context.docs_root if context else None, + ) if target_url is None: continue @@ -666,24 +772,41 @@ def check_vsm( return violations - @staticmethod - def _to_canonical_url(href: str) -> str | None: + def _to_canonical_url( + self, + href: str, + source_dir: Path | None = None, + docs_root: Path | None = None, + ) -> str | None: """Convert a relative Markdown href to a canonical URL string. + ZRT-004 fix: when ``source_dir`` and ``docs_root`` are provided the + href is resolved **relative to the source file's directory** instead of + root-relative. This correctly handles ``..``-prefixed hrefs from files + nested in subdirectories. + + Without context (``source_dir=None``), behaves exactly as the original + ``@staticmethod`` to preserve full backwards-compatibility with callers + that do not supply a :class:`ResolutionContext`. + Applies the standard MkDocs / Zensical clean-URL rule: ``page.md`` → ``/page/``, ``dir/index.md`` → ``/dir/``. 
- Returns ``None`` for hrefs that cannot be converted to a meaningful - canonical URL (e.g. bare query strings, empty paths). + Returns ``None`` for hrefs that cannot be converted (e.g. bare query + strings, empty paths, or paths that escape ``docs_root``). - Pure: no I/O, no Path.exists(). + Pure: no I/O, no ``Path.exists()``. Args: - href: Raw href extracted from a Markdown link, already stripped of - any title portion. + href: Raw href extracted from a Markdown link. + source_dir: Absolute directory of the file that contains the link. + Required for correct ``..``-relative resolution. + docs_root: Absolute path to the docs root directory. + Required for context-aware boundary checking. Returns: Canonical URL string (leading and trailing ``/``), or ``None``. """ + import os from urllib.parse import unquote, urlsplit parsed = urlsplit(href) @@ -691,6 +814,22 @@ def _to_canonical_url(href: str) -> str | None: if not path: return None + # ZRT-004: context-aware relative resolution + # When source_dir + docs_root are provided and the href has .. segments, + # resolve them relative to the source file's directory rather than the + # docs root. Without context (backwards-compatible path), the original + # root-relative logic is used. + if source_dir is not None and docs_root is not None and ".." in path: + raw_target = os.path.normpath(str(source_dir) + os.sep + path.replace("/", os.sep)) + root_str = str(docs_root) + if not (raw_target == root_str or raw_target.startswith(root_str + os.sep)): + return None # path escapes docs_root — Shield territory, skip + try: + rel = str(Path(raw_target).relative_to(docs_root)).replace(os.sep, "/") + except ValueError: + return None + path = rel if rel != "." 
else "" + # Strip .md suffix if present if path.endswith(".md"): path = path[:-3] diff --git a/tests/test_redteam_remediation.py b/tests/test_redteam_remediation.py new file mode 100644 index 0000000..659133e --- /dev/null +++ b/tests/test_redteam_remediation.py @@ -0,0 +1,355 @@ +# SPDX-FileCopyrightText: 2026 PythonWoods +# SPDX-License-Identifier: Apache-2.0 +"""Tests for ZRT Red-Team remediation (v0.5.0a4 hotfix). + +Covers: +- ZRT-001: Shield must detect secrets in YAML frontmatter +- ZRT-002: _assert_regex_canary must reject ReDoS patterns at engine construction +- ZRT-003: Shield normalizer must catch split-token obfuscation in tables +- ZRT-004: VSMBrokenLinkRule must resolve relative links with source-file context +""" + +from __future__ import annotations + +import platform +from pathlib import Path + +import pytest + +from zenzic.core.exceptions import PluginContractError +from zenzic.core.rules import ( + AdaptiveRuleEngine, + CustomRule, + ResolutionContext, + Violation, + VSMBrokenLinkRule, + _assert_regex_canary, +) +from zenzic.models.vsm import Route + + +# Shield/Scanner symbols are committed in Commit 2 (shield.py + scanner.py). +# Guard the import so that Commit 1 alone remains test-runnable: the two +# shield-dependent test classes are skipped until Commit 2 is applied. 
+try: + from zenzic.core.scanner import ReferenceScanner + from zenzic.core.shield import _normalize_line_for_shield, scan_line_for_secrets + + _SHIELD_AVAILABLE = True +except ImportError: + _normalize_line_for_shield = None # type: ignore[assignment] + scan_line_for_secrets = None # type: ignore[assignment] + ReferenceScanner = None # type: ignore[assignment] + _SHIELD_AVAILABLE = False + +_shield_skip = pytest.mark.skipif( + not _SHIELD_AVAILABLE, + reason="shield.py normalizer and scanner.py dual-stream not yet committed (Commit 2)", +) + + +# ─── ZRT-001: Shield must detect secrets in YAML frontmatter ────────────────── + + +@_shield_skip +class TestShieldFrontmatterCoverage: + """ZRT-001: The Shield stream must scan ALL lines including frontmatter.""" + + def test_shield_catches_aws_key_in_yaml_frontmatter(self, tmp_path: Path) -> None: + """AWS access key inside YAML frontmatter must trigger a SecurityFinding.""" + from zenzic.core.scanner import ReferenceScanner + + md = tmp_path / "secret.md" + md.write_text( + "---\n" + "aws_key: AKIA1234567890ABCDEF\n" + "title: API Guide\n" + "---\n\n" + "# Guide\n\nNormal content here.\n" + ) + scanner = ReferenceScanner(md) + secrets = [data for _, evt, data in scanner.harvest() if evt == "SECRET"] + assert len(secrets) >= 1, "Shield must catch AWS key inside YAML frontmatter" + secret_types = {s.secret_type for s in secrets} + assert "aws-access-key" in secret_types + + def test_shield_catches_github_token_in_yaml_frontmatter(self, tmp_path: Path) -> None: + """GitHub PAT inside YAML frontmatter must trigger a SecurityFinding.""" + from zenzic.core.scanner import ReferenceScanner + + md = tmp_path / "github_secret.md" + md.write_text( + "---\n" + "author: John Doe\n" + "github_token: ghp_1234567890123456789012345678901234567\n" + "---\n\n" + "# Guide\n\nNormal content.\n" + ) + scanner = ReferenceScanner(md) + secrets = [data for _, evt, data in scanner.harvest() if evt == "SECRET"] + assert len(secrets) >= 1, 
"Shield must catch GitHub token inside YAML frontmatter" + + def test_shield_does_not_create_false_positive_on_clean_frontmatter( + self, tmp_path: Path + ) -> None: + """A doc with only safe frontmatter metadata must emit zero secrets.""" + from zenzic.core.scanner import ReferenceScanner + + md = tmp_path / "clean.md" + md.write_text( + "---\n" + "title: Clean Page\n" + "author: Jane Doe\n" + "tags: [docs, guide]\n" + "---\n\n" + "# Clean Page\n\nThis page has no secrets.\n" + ) + scanner = ReferenceScanner(md) + secrets = [data for _, evt, data in scanner.harvest() if evt == "SECRET"] + assert secrets == [], f"Expected 0 secrets, got: {secrets}" + + def test_shield_secret_line_number_is_inside_frontmatter(self, tmp_path: Path) -> None: + """The reported line number of a frontmatter secret must be correct.""" + from zenzic.core.scanner import ReferenceScanner + + md = tmp_path / "line_check.md" + md.write_text( + "---\n" # line 1 + "title: Guide\n" # line 2 + "aws_key: AKIA1234567890ABCDEF\n" # line 3 + "---\n" # line 4 + ) + scanner = ReferenceScanner(md) + secrets = [data for _, evt, data in scanner.harvest() if evt == "SECRET"] + assert len(secrets) >= 1 + # The secret is on line 3 + assert secrets[0].line_no == 3 + + +# ─── ZRT-002: ReDoS canary must reject catastrophic patterns at construction ── + + +@pytest.mark.skipif( + platform.system() == "Windows", + reason="SIGALRM not available on Windows — canary is a no-op there", +) +class TestReDoSCanary: + """ZRT-002: AdaptiveRuleEngine must reject ReDoS patterns before worker dispatch.""" + + def test_canary_rejects_classic_redos_pattern(self) -> None: + """Pattern (a+)+ must be caught by the canary before engine construction.""" + rule = CustomRule( + id="ZZ-REDOS", + pattern=r"^(a+)+$", + message="ReDoS test.", + severity="error", + ) + with pytest.raises(PluginContractError, match="catastrophic backtracking"): + _assert_regex_canary(rule) + + def test_canary_rejects_alternation_redos(self) -> None: + 
"""Alternation-based ReDoS (a|aa)+ also caught.""" + rule = CustomRule( + id="ZZ-REDOS2", + pattern=r"^(a|aa)+$", + message="ReDoS alt test.", + severity="error", + ) + with pytest.raises(PluginContractError, match="catastrophic backtracking"): + _assert_regex_canary(rule) + + def test_engine_construction_rejects_redos_custom_rule(self) -> None: + """AdaptiveRuleEngine.__init__ must raise at construction for ReDoS rules.""" + rule = CustomRule( + id="ZZ-DEADLOCK", + pattern=r"^(a+)+$", + message="Deadlock pattern.", + severity="error", + ) + with pytest.raises(PluginContractError, match="catastrophic backtracking"): + AdaptiveRuleEngine([rule]) + + def test_canary_passes_safe_pattern(self) -> None: + """A simple, safe regex must pass the canary without raising.""" + rule = CustomRule( + id="ZZ-SAFE", + pattern=r"TODO", + message="TODO found.", + severity="warning", + ) + # Must not raise + _assert_regex_canary(rule) + + def test_canary_passes_anchored_safe_pattern(self) -> None: + """A more complex but safe anchored pattern must pass the canary.""" + rule = CustomRule( + id="ZZ-SAFE2", + pattern=r"^(DRAFT|WIP|TODO):?\s", + message="Status marker.", + severity="info", + ) + _assert_regex_canary(rule) + + def test_canary_skips_non_custom_rules(self) -> None: + """BaseRule subclasses that are not CustomRule are not tested by the canary.""" + from zenzic.core.rules import BaseRule, RuleFinding + + class _TrustedRule(BaseRule): + @property + def rule_id(self) -> str: + return "TRUSTED-001" + + def check(self, file_path: Path, text: str) -> list[RuleFinding]: + return [] + + # Must not raise even though _TrustedRule is not a CustomRule + _assert_regex_canary(_TrustedRule()) + + +# ─── ZRT-003: Split-token Shield bypass via Markdown table normalizer ────────── + + +@_shield_skip +class TestShieldNormalizer: + """ZRT-003: The pre-scan normalizer must reconstruct split-token secrets.""" + + def test_normalize_strips_backtick_spans(self) -> None: + """`AKIA` → AKIA (unwrap 
inline code).""" + result = _normalize_line_for_shield("`AKIA`1234567890ABCDEF") + assert "AKIA1234567890ABCDEF" in result + + def test_normalize_removes_concat_operator(self) -> None: + """`AKIA` + `1234567890ABCDEF` → AKIA1234567890ABCDEF.""" + result = _normalize_line_for_shield("`AKIA` + `1234567890ABCDEF`") + assert "AKIA1234567890ABCDEF" in result + + def test_normalize_strips_table_pipes(self) -> None: + """Pipes → spaces so table cells don't break token continuity.""" + result = _normalize_line_for_shield("| Key | AKIA1234567890ABCDEF |") + assert "|" not in result + assert "AKIA1234567890ABCDEF" in result + + def test_normalize_handles_combined_table_and_concat(self) -> None: + """Full attack vector: table cell with split backtick-concat key.""" + line = "| Access Key | `AKIA` + `1234567890ABCDEF` |" + result = _normalize_line_for_shield(line) + assert "AKIA1234567890ABCDEF" in result + + def test_scan_line_catches_split_token_aws_key(self) -> None: + """scan_line_for_secrets must catch an AWS key split across backtick spans.""" + line = "| Key | `AKIA` + `1234567890ABCDEF` |" + findings = list(scan_line_for_secrets(line, Path("docs/config.md"), 7)) + assert len(findings) >= 1, f"Expected >=1 finding, got: {findings}" + assert findings[0].secret_type == "aws-access-key" + + def test_scan_line_no_false_positive_on_clean_table(self) -> None: + """Clean table rows must not trigger any findings.""" + line = "| API endpoint | https://api.example.com/v1/users |" + findings = list(scan_line_for_secrets(line, Path("docs/api.md"), 3)) + assert findings == [] + + def test_scan_line_still_catches_plain_aws_key(self) -> None: + """Normalizer must not break detection of non-obfuscated secrets.""" + line = "aws_key = AKIA1234567890ABCDEF" + findings = list(scan_line_for_secrets(line, Path("docs/config.md"), 1)) + assert len(findings) >= 1 + assert findings[0].secret_type == "aws-access-key" + + def test_no_duplicate_findings_for_same_secret(self) -> None: + """If raw 
and normalised both match, only ONE finding is emitted per type.""" + # This line has the key both raw AND in a table — should only emit once + line = "AKIA1234567890ABCDEF" + findings = list(scan_line_for_secrets(line, Path("docs/x.md"), 1)) + types = [f.secret_type for f in findings] + assert types.count("aws-access-key") == 1, "Deduplication must prevent double-emit" + + +# ─── ZRT-004: VSMBrokenLinkRule context-aware URL resolution ────────────────── + + +def _make_vsm(*urls: str, status: str = "REACHABLE") -> dict[str, Route]: + return { + url: Route(url=url, source=f"{url.strip('/')}.md", status=status) # type: ignore[arg-type] + for url in urls + } + + +class TestVSMContextAwareResolution: + """ZRT-004: VSMBrokenLinkRule must resolve relative .. hrefs from the source dir.""" + + _RULE = VSMBrokenLinkRule() + _DOCS_ROOT = Path("/docs") + + def _ctx(self, source_rel: str) -> ResolutionContext: + """Build a context for a source file inside /docs.""" + return ResolutionContext( + docs_root=self._DOCS_ROOT, + source_file=self._DOCS_ROOT / source_rel, + ) + + def _run_with_ctx(self, text: str, vsm: dict, source_rel: str) -> list[Violation]: + ctx = self._ctx(source_rel) + return self._RULE.check_vsm(self._DOCS_ROOT / source_rel, text, vsm, {}, ctx) + + def test_context_aware_resolves_dotdot_to_sibling(self) -> None: + """../../c/target.md from docs/a/b/page.md → /c/target/.""" + vsm = _make_vsm("/c/target/") + violations = self._run_with_ctx("[T](../../c/target.md)", vsm, "a/b/page.md") + assert violations == [], "Link ../../c/target.md from docs/a/b/ must resolve to /c/target/" + + def test_context_aware_single_dotdot(self) -> None: + """../sibling.md from docs/subdir/page.md → /sibling/.""" + vsm = _make_vsm("/sibling/") + violations = self._run_with_ctx("[Sibling](../sibling.md)", vsm, "subdir/page.md") + assert violations == [], "Link ../sibling.md from docs/subdir/ must resolve to /sibling/" + + def 
test_context_aware_dotdot_absent_from_vsm_emits_violation(self) -> None: + """A context-resolved link to an absent URL must still emit Z001.""" + vsm = _make_vsm("/other/") # /sibling/ is absent + violations = self._run_with_ctx("[Broken](../sibling.md)", vsm, "subdir/page.md") + assert len(violations) == 1 + assert violations[0].code == "Z001" + + def test_context_aware_traversal_escape_returns_none(self) -> None: + """A path that escapes docs_root via .. must be silently skipped (no crash).""" + vsm = _make_vsm("/etc/") + violations = self._run_with_ctx("[Escape](../../../../etc/passwd)", vsm, "subdir/page.md") + # The path escapes docs_root — must not emit a false Z001 nor crash + assert violations == [] + + def test_without_context_preserves_backward_compatibility(self) -> None: + """Without context, behaviour is identical to the original @staticmethod.""" + vsm = _make_vsm("/guide/") + # docs/guide.md with no context → should still work as before + violations = self._RULE.check_vsm( + Path("docs/index.md"), + "[Guide](guide.md)", + vsm, + {}, + context=None, # explicit None + ) + assert violations == [] + + def test_context_aware_index_md_resolves_to_dir(self) -> None: + """../section/index.md from docs/a/page.md → /section/.""" + vsm = _make_vsm("/section/") + violations = self._run_with_ctx("[Sec](../section/index.md)", vsm, "a/page.md") + assert violations == [] + + def test_run_vsm_passes_context_to_rule(self) -> None: + """AdaptiveRuleEngine.run_vsm must forward the context to check_vsm.""" + engine = AdaptiveRuleEngine([VSMBrokenLinkRule()]) + vsm = _make_vsm("/sibling/") + ctx = ResolutionContext( + docs_root=Path("/docs"), + source_file=Path("/docs/subdir/page.md"), + ) + # ../sibling.md from /docs/subdir/page.md → /sibling/ + findings = engine.run_vsm( + Path("/docs/subdir/page.md"), + "[Sibling](../sibling.md)", + vsm, + {}, + context=ctx, + ) + assert findings == [], f"Expected no findings with context, got: {findings}" diff --git 
a/tests/test_rules.py b/tests/test_rules.py index 6df6ebc..216a696 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -70,7 +70,7 @@ def rule_id(self) -> str: def check(self, file_path: Path, text: str) -> list[RuleFinding]: return [] - def check_vsm(self, file_path, text, vsm, anchors_cache) -> list[Violation]: + def check_vsm(self, file_path, text, vsm, anchors_cache, context=None) -> list[Violation]: raise RuntimeError("vsm rule internal error") From 05ae6ac6f9942b1a62fa8729e1667b9065c55db0 Mon Sep 17 00:00:00 2001 From: PythonWoods Date: Sat, 4 Apr 2026 19:46:06 +0200 Subject: [PATCH 02/16] feat(security): integrate Shield with SentinelReporter and Exit Code 2 (ZRT-001/ZRT-003) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - SecurityFinding gains col_start + match_text for surgical caret rendering - _map_shield_to_finding(): sole authorised Shield→reporter bridge (Mutation Gate target) - _obfuscate_secret(): partial redaction (AKIA****1234) safe for CI logs - SentinelReporter: dedicated red breach panels pre-section (Q2); render_quiet one-liner (Q3) - Remove early-exit hard-stop; breach findings flow through _to_findings() pipeline - Exit 2 decided post-render by CLI runner, not scanner (Q3 / Obligation 4) --- src/zenzic/cli.py | 21 +++--- src/zenzic/core/reporter.py | 99 ++++++++++++++++++++----- src/zenzic/core/scanner.py | 144 ++++++++++++++++++++++++++++++++++-- src/zenzic/core/shield.py | 93 ++++++++++++++++++++--- 4 files changed, 313 insertions(+), 44 deletions(-) diff --git a/src/zenzic/cli.py b/src/zenzic/cli.py index fa373a3..9e368a5 100644 --- a/src/zenzic/cli.py +++ b/src/zenzic/cli.py @@ -27,6 +27,7 @@ from zenzic.core.reporter import Finding, SentinelReporter from zenzic.core.scanner import ( PlaceholderFinding, + _map_shield_to_finding, find_orphans, find_placeholders, find_repo_root, @@ -580,6 +581,11 @@ def _rel(path: Path) -> str: match_text=rule_f.match_text, ) ) + # Convert Shield 
security findings into breach-severity Finding objects. + # _map_shield_to_finding() is the sole authorised bridge between the Shield + # and the reporter (see Obligation 4 / Mutation Gate in CONTRIBUTING.md). + for sf in report.security_findings: + findings.append(_map_shield_to_finding(sf, docs_root)) return findings @@ -721,15 +727,6 @@ def check_all( results = _collect_all_results(repo_root, config, strict=effective_strict) elapsed = time.monotonic() - t0 - # ── Security hard-stop (exit code 2) ────────────────────────────────────── - if results.security_events: - if not quiet: - console.print( - f"\n[bold red]SECURITY CRITICAL:[/] {results.security_events} " - "credential(s) detected — rotate immediately." - ) - raise typer.Exit(2) - # ── JSON format ─────────────────────────────────────────────────────────── if output_format == "json": ref_errors = [] @@ -817,6 +814,12 @@ def check_all( strict=effective_strict, ) + # Breach findings cause Exit 2; all other failures cause Exit 1. + # This check runs after rendering so the report is always printed first. + breaches = sum(1 for f in all_findings if f.severity == "security_breach") + if breaches and not effective_exit_zero: + raise typer.Exit(2) + # In strict mode, warnings are promoted to failures. # Use reporter-derived counts (from filtered all_findings) so that target-mode # does not fail on findings outside the requested scope. diff --git a/src/zenzic/core/reporter.py b/src/zenzic/core/reporter.py index 10b11ea..db56bec 100644 --- a/src/zenzic/core/reporter.py +++ b/src/zenzic/core/reporter.py @@ -35,9 +35,31 @@ class Finding: "error": f"bold {ROSE}", "warning": f"bold {AMBER}", "info": f"bold {INDIGO}", + "security_breach": f"bold white on {ROSE}", } +def _obfuscate_secret(raw: str) -> str: + """Partially redact a secret for safe display in logs and CI output. + + Preserves the first four and last four characters so reviewers can + identify the secret type and suffix without exposing the full credential. 
+ Strings of length ≤ 8 are fully redacted. + + This function is the only place where raw secret material is allowed + to be formatted for human consumption. It **must never** be bypassed. + + Args: + raw: The raw matched secret string from the Shield. + + Returns: + A partially-redacted string safe for log output. + """ + if len(raw) <= 8: # too short to redact partially — hide the whole thing + return "*" * len(raw) + return raw[:4] + "*" * (len(raw) - 8) + raw[-4:] + + def _strip_prefix(rel_path: str, line_no: int, message: str) -> str: """Remove the redundant 'relpath:lineno: ' prefix already shown in the file header.""" if line_no > 0: @@ -142,19 +164,27 @@ def render( docs_count: int = 0, assets_count: int = 0, engine: str = "auto", - security_events: int = 0, target: str | None = None, strict: bool = False, ) -> tuple[int, int]: """Print the full Sentinel Report. + Breach findings (``severity=="security_breach"``) are rendered as + dedicated red panels **before** the grouped findings section and are + excluded from the grouped view to avoid noise. All other findings flow + through the normal grouped pipeline. + Returns: - ``(error_count, warning_count)`` so the caller can decide the - exit code. + ``(error_count, warning_count)`` — breaches are counted separately + by the caller (``cli.py``) and cause Exit 2, not Exit 1. 
""" errors = sum(1 for f in findings if f.severity == "error") warnings = sum(1 for f in findings if f.severity == "warning") + # ── Split: breach findings get dedicated panels; rest goes to the grouped view + breach_findings = [f for f in findings if f.severity == "security_breach"] + normal_findings = [f for f in findings if f.severity != "security_breach"] + # ── Telemetry line ──────────────────────────────────────────────────── dot = emoji("dot") total = docs_count + assets_count @@ -168,7 +198,39 @@ def render( parts.append(f"[{INDIGO}]{elapsed:.1f}[/]s") telemetry = Text.from_markup(f"[{SLATE}]{f' {dot} '.join(parts)}[/]") - if not findings: + # ── Security breach panels (rendered BEFORE main panel) ─────────────── + if breach_findings: + for bf in breach_findings: + obfuscated = _obfuscate_secret(bf.match_text) if bf.match_text else "[redacted]" + breach_body = Group( + Text.from_markup(f" {emoji('cross')} [bold]Finding:[/] {_esc(bf.message)}"), + Text.from_markup( + f" {emoji('cross')} [bold]Location:[/] " + f"[bold]{_esc(self._full_rel(bf.rel_path))}[/]:{bf.line_no}" + ), + Text.from_markup( + f" {emoji('cross')} [bold]Credential:[/] " + f"[bold reverse] {_esc(obfuscated)} [/]" + ), + Text(), + Text.from_markup( + " [bold]Action:[/] Rotate this credential immediately " + "and purge it from the repository history." 
+ ), + ) + self._con.print() + self._con.print( + Panel( + breach_body, + title=f"[bold white on {ROSE}] SECURITY BREACH DETECTED ", + title_align="center", + border_style=f"bold {ROSE}", + padding=(1, 2), + expand=True, + ) + ) + + if not normal_findings and not breach_findings: # ── All-clear panel ─────────────────────────────────────────────── self._con.print() self._con.print( @@ -192,20 +254,9 @@ def render( ) return 0, 0 - # ── Security ────────────────────────────────────────────────────────── - security_line: list[RenderableType] = [] - if security_events: - security_line = [ - Text.from_markup( - f"[{ROSE}]{emoji('shield')} SECURITY CRITICAL:[/] {security_events} " - f"credential(s) detected — rotate immediately." - ), - Text(), - ] - - # ── Grouped findings ────────────────────────────────────────────────── + # ── Grouped findings (non-breach only) ─────────────────────────────── grouped: dict[str, list[Finding]] = defaultdict(list) - for f in findings: + for f in normal_findings: grouped[f.rel_path].append(f) renderables: list[RenderableType] = [] @@ -281,7 +332,7 @@ def render( self._con.print() self._con.print( Panel( - Group(telemetry, Text(), *security_line, *renderables), + Group(telemetry, Text(), *renderables), title=f"[bold white on {INDIGO}] {emoji('shield')} ZENZIC SENTINEL v{version} [/]", title_align="center", border_style=f"bold {INDIGO}", @@ -296,9 +347,19 @@ def render( # ── Quiet mode (pre-commit) ────────────────────────────────────────────── def render_quiet(self, findings: list[Finding]) -> tuple[int, int]: - """Minimal one-line output for pre-commit hooks.""" + """Minimal output for pre-commit hooks. + + Breach findings always produce a one-liner even in quiet mode — silent + failure on a credential leak is more dangerous than noisy CI output. 
+ """ + breaches = [f for f in findings if f.severity == "security_breach"] errors = sum(1 for f in findings if f.severity == "error") warnings = sum(1 for f in findings if f.severity == "warning") + if breaches: + self._con.print( + f"[bold red]SECURITY CRITICAL:[/] {len(breaches)} secret(s) detected — " + f"rotate immediately. Exit 2." + ) if errors or warnings: self._con.print(f"zenzic: {errors} error(s), {warnings} warning(s)") return errors, warnings diff --git a/src/zenzic/core/scanner.py b/src/zenzic/core/scanner.py index 4c2be3f..c265bca 100644 --- a/src/zenzic/core/scanner.py +++ b/src/zenzic/core/scanner.py @@ -24,6 +24,7 @@ from urllib.parse import unquote from zenzic.core.adapter import get_adapter +from zenzic.core.reporter import Finding from zenzic.core.rules import AdaptiveRuleEngine, BaseRule from zenzic.core.shield import SecurityFinding, scan_line_for_secrets, scan_url_for_secrets from zenzic.core.validator import LinkValidator @@ -99,6 +100,42 @@ def calculate_orphans(all_md: set[str], nav_paths: set[str] | frozenset[str]) -> return sorted(all_md - nav_paths) +def _map_shield_to_finding(sf: SecurityFinding, docs_root: Path) -> Finding: + """Convert a :class:`SecurityFinding` into a reporter :class:`Finding`. + + This is the **sole authorised bridge** between the Shield detection layer + and the SentinelReporter. It is extracted as a standalone pure function so + that mutation testing can target it directly (see the Mutation Gate in + ``CONTRIBUTING.md``, Obligation 4 — "The Invisible", "The Amnesiac", and + "The Silencer" mutants must all be killed here). + + Args: + sf: A secret detection result from :func:`~zenzic.core.shield.scan_line_for_secrets` + or :func:`~zenzic.core.shield.scan_url_for_secrets`. + docs_root: Absolute path to the docs root directory used to compute + a project-relative display path. + + Returns: + A :class:`~zenzic.core.reporter.Finding` with + ``severity="security_breach"`` ready for the SentinelReporter pipeline. 
+ """ + try: + rel = str(sf.file_path.relative_to(docs_root)) + except ValueError: + rel = str(sf.file_path) + + return Finding( + rel_path=rel, + line_no=sf.line_no, + code="SHIELD", + severity="security_breach", + message=f"Secret detected ({sf.secret_type}) — rotate immediately.", + source_line=sf.url, + col_start=sf.col_start, + match_text=sf.match_text, + ) + + @dataclass(slots=True) class PlaceholderFinding: file_path: Path @@ -538,13 +575,16 @@ def harvest(self) -> Generator[HarvestEvent, None, None]: ``(lineno, event_type, data)`` tuples. See module-level type alias ``HarvestEvent`` for the full list of event types and data shapes. """ - # ── 1.a Shield pass: scan every line (fences are NOT skipped) ──────── - # Collect SECRET events keyed by line number so duplicate suppression - # (a definition URL that also matches scan_line_for_secrets) still works. + # ── 1.a Shield pass: scan EVERY line including YAML frontmatter ────────── + # ZRT-001 fix: the Shield must have priority over ALL content, including + # YAML frontmatter. Frontmatter values (aws_key, api_token, ...) are + # real secrets — we use raw enumerate() so no line is ever skipped. + # The Content Stream (1.b below) still uses _iter_content_lines which + # skips frontmatter correctly to avoid false-positive ref-def hits. 
secret_line_nos: set[int] = set() shield_events: list[HarvestEvent] = [] with self.file_path.open(encoding="utf-8") as fh: - for lineno, line in _skip_frontmatter(fh): + for lineno, line in enumerate(fh, start=1): # ALL lines, no filter for finding in scan_line_for_secrets(line, self.file_path, lineno): shield_events.append((lineno, "SECRET", finding)) secret_line_nos.add(lineno) @@ -933,10 +973,24 @@ def scan_docs_references( import concurrent.futures import os - work_items = [(f, config, rule_engine) for f in md_files] actual_workers = workers if workers is not None else os.cpu_count() or 1 - with concurrent.futures.ProcessPoolExecutor(max_workers=workers) as executor: - raw = list(executor.map(_worker, work_items)) + work_items = [(f, config, rule_engine) for f in md_files] + # GA-1 fix: use actual_workers for the executor (not the raw `workers` + # sentinel) so max_workers always matches what telemetry reports. + with concurrent.futures.ProcessPoolExecutor(max_workers=actual_workers) as executor: + # ZRT-002 fix: use submit() + future.result(timeout=...) instead of + # executor.map(). This prevents a deadlocked worker (e.g. from a + # ReDoS pattern in [[custom_rules]]) from blocking the entire scan. + futures_map = {executor.submit(_worker, item): item[0] for item in work_items} + raw: list[IntegrityReport] = [] + for fut, md_file in futures_map.items(): + try: + raw.append(fut.result(timeout=_WORKER_TIMEOUT_S)) + except concurrent.futures.TimeoutError: + raw.append(_make_timeout_report(md_file)) + except Exception as exc: # noqa: BLE001 + raw.append(_make_error_report(md_file, exc)) + reports: list[IntegrityReport] = sorted(raw, key=lambda r: r.file_path) elapsed = time.monotonic() - _t0 @@ -1003,6 +1057,82 @@ def scan_docs_references( #: can override it without patching private internals. ADAPTIVE_PARALLEL_THRESHOLD: int = 50 +#: Maximum wall-clock seconds a single worker may spend analysing one file. 
+#: If a worker exceeds this limit it is abandoned and a Z009 timeout finding +#: is emitted for the file instead of a normal IntegrityReport. The purpose +#: is to prevent ReDoS patterns in [[custom_rules]] from deadlocking the +#: entire parallel pipeline. (ZRT-002 fix) +_WORKER_TIMEOUT_S: int = 30 + + +def _make_timeout_report(md_file: Path) -> IntegrityReport: + """Produce a minimal :class:`IntegrityReport` for a worker that timed out. + + Called by the parallel coordinator when ``future.result(timeout=...)`` + raises :class:`concurrent.futures.TimeoutError`. The returned report + carries a single ``Z009`` rule finding so the CLI can surface the + timeout in the standard findings UI without crashing the scan. + + Args: + md_file: Absolute path of the file whose worker timed out. + + Returns: + A :class:`IntegrityReport` with ``score=0`` and one ``Z009`` finding. + """ + from zenzic.core.rules import RuleFinding # deferred: avoid circular at module level + from zenzic.models.references import IntegrityReport + + timeout_finding = RuleFinding( + file_path=md_file, + line_no=0, + rule_id="Z009", + message=( + f"Analysis of '{md_file.name}' timed out after {_WORKER_TIMEOUT_S}s. " + "A custom rule pattern may be causing catastrophic backtracking (ReDoS). " + "Check [[custom_rules]] patterns in zenzic.toml." + ), + severity="error", + ) + return IntegrityReport( + file_path=md_file, + score=0, + findings=[], + security_findings=[], + rule_findings=[timeout_finding], + ) + + +def _make_error_report(md_file: Path, exc: BaseException) -> IntegrityReport: + """Produce a minimal :class:`IntegrityReport` for a worker that raised. + + Args: + md_file: Absolute path of the file whose worker raised an exception. + exc: The exception caught from ``future.result()``. + + Returns: + A :class:`IntegrityReport` with ``score=0`` and one ``RULE-ENGINE-ERROR`` finding. 
+ """ + from zenzic.core.rules import RuleFinding + from zenzic.models.references import IntegrityReport + + error_finding = RuleFinding( + file_path=md_file, + line_no=0, + rule_id="RULE-ENGINE-ERROR", + message=( + f"Worker for '{md_file.name}' raised an unexpected exception: " + f"{type(exc).__name__}: {exc}" + ), + severity="error", + ) + return IntegrityReport( + file_path=md_file, + score=0, + findings=[], + security_findings=[], + rule_findings=[error_finding], + ) + def _worker(args: tuple[Path, ZenzicConfig, AdaptiveRuleEngine | None]) -> IntegrityReport: """Top-level worker function for ``ProcessPoolExecutor``. diff --git a/src/zenzic/core/shield.py b/src/zenzic/core/shield.py index 64a3e68..d9b15a7 100644 --- a/src/zenzic/core/shield.py +++ b/src/zenzic/core/shield.py @@ -31,6 +31,44 @@ from pathlib import Path +# ─── Pre-scan Normalizer (ZRT-003: split-token bypass defence) ──────────────── + +# Unwrap inline code spans: `AKIA` → AKIA +_BACKTICK_INLINE_RE = re.compile(r"`([^`]*)`") +# Remove concatenation operators that split tokens: `AKIA` + `KEY` → AKIAKEY +_CONCAT_OP_RE = re.compile(r"[`'\"\s]*\+[`'\"\s]*") +# Replace table-cell separators with spaces +_TABLE_PIPE_RE = re.compile(r"\|") + + +def _normalize_line_for_shield(line: str) -> str: + """Strip Markdown noise tokens to reconstruct secrets split by obfuscation. + + Applies three transformations in order: + + 1. Unwrap backtick code spans — ``AKIA`` → ``AKIA``. + 2. Remove string-concatenation operators (`` ` `` + `` ` ``) that authors + sometimes place between key fragments in documentation tables. + 3. Replace table-pipe separators with spaces and collapse whitespace. + + This allows the Shield to catch split-token patterns such as:: + + | Key ID | `AKIA` + `1234567890ABCDEF` | + + while leaving detection of normal clean lines unaffected. + + Args: + line: Raw text line from the Markdown source. + + Returns: + Normalised string ready for regex scanning. 
+ """ + normalized = _BACKTICK_INLINE_RE.sub(r"\1", line) # unwrap `...` spans + normalized = _CONCAT_OP_RE.sub("", normalized) # remove + concat ops + normalized = _TABLE_PIPE_RE.sub(" ", normalized) # collapse table pipes + return " ".join(normalized.split()) # collapse whitespace + + # ─── Pre-compiled secret signatures ─────────────────────────────────────────── _SECRETS: list[tuple[str, re.Pattern[str]]] = [ @@ -57,12 +95,18 @@ class SecurityFinding: secret_type: Human-readable label for the secret kind (e.g. ``"openai-api-key"``). url: The URL or text fragment in which the secret was embedded. + col_start: 0-based column index of the match start in the raw line. + Used by the reporter for surgical caret rendering. + match_text: The matched secret substring (unredacted). + The reporter is responsible for obfuscating this before display. """ file_path: Path line_no: int secret_type: str url: str + col_start: int = 0 + match_text: str = "" # ─── Pure / I/O-agnostic functions ──────────────────────────────────────────── @@ -89,12 +133,15 @@ def scan_url_for_secrets( """ path = Path(file_path) for secret_type, pattern in _SECRETS: - if pattern.search(url): + m = pattern.search(url) + if m: yield SecurityFinding( file_path=path, line_no=line_no, secret_type=secret_type, url=url, + col_start=m.start(), + match_text=m.group(0), ) @@ -108,6 +155,19 @@ def scan_line_for_secrets( Used for defence-in-depth: even if a secret appears outside a URL (e.g. in link text or plain prose), the Shield will catch it. + Two forms of the line are scanned: + + * **Raw** — the line exactly as it appears in the source, ensuring that + normally-formatted secrets (e.g. in prose or frontmatter values) are + always caught. + * **Normalised** (ZRT-003 fix) — the line after stripping Markdown noise + tokens (backtick spans, table pipes, concatenation operators) so that + split-token obfuscation patterns are reconstructed before scanning. + See :func:`_normalize_line_for_shield`. 
+ + Duplicate findings (same secret type on the same line whether matched by + the raw or normalised form) are suppressed via a ``seen`` set. + Args: line: Raw text line from the Markdown source. file_path: Path identifier (no disk access). @@ -117,11 +177,26 @@ def scan_line_for_secrets( :class:`SecurityFinding` for each match found. """ path = Path(file_path) - for secret_type, pattern in _SECRETS: - if pattern.search(line): - yield SecurityFinding( - file_path=path, - line_no=line_no, - secret_type=secret_type, - url=line.strip(), - ) + normalized = _normalize_line_for_shield(line) + seen: set[str] = set() + + for line_form in (line, normalized): + for secret_type, pattern in _SECRETS: + if secret_type in seen: + continue + m = pattern.search(line_form) + if m: + seen.add(secret_type) + match_text = m.group(0) + # Prefer col_start from the raw line; fall back to 0 when the + # secret was only detected in the normalised form (col position + # is meaningless after stripping Markdown noise). 
+ raw_m = pattern.search(line) + yield SecurityFinding( + file_path=path, + line_no=line_no, + secret_type=secret_type, + url=line.strip(), # always report the raw line for context + col_start=raw_m.start() if raw_m else 0, + match_text=match_text, + ) From 975fdc826fdd563004035fadb56443dbb03cf846 Mon Sep 17 00:00:00 2001 From: PythonWoods Date: Sat, 4 Apr 2026 20:14:31 +0200 Subject: [PATCH 03/16] tests(security): ShieldReportingIntegrity suite + mutmut infrastructure fix (Commit 3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mutation Gate — manual verification (The Sentinel's Trial): KILLED L'Invisibile scanner.py::_map_shield_to_finding severity=warning → FAIL KILLED L'Amnesico reporter.py::_obfuscate_secret return raw → FAIL KILLED Il Silenziatore scanner.py::_map_shield_to_finding return None → FAIL Test changes: - TestShieldReportingIntegrity: 3 mutant-killer tests (28 total, all green) - Promote reporter imports to module level (fix mutmut static analysis) Infrastructure changes: - noxfile.py: mutation session targets rules/shield/reporter; non-editable install - pyproject.toml: correct mutmut v3 config keys (pytest_add_cli_args_test_selection, pytest_add_cli_args); expand paths_to_mutate to include shield + reporter; relative_files=true for coverage path alignment - mutmut_pytest.ini: isolated pytest config for mutation runs (prepend + pythonpath=src) Known gap: cli.py findings.append silencer not covered (integration test deferred) --- noxfile.py | 26 ++++- pyproject.toml | 17 ++- tests/test_redteam_remediation.py | 179 +++++++++++++++++++++++++++++- 3 files changed, 212 insertions(+), 10 deletions(-) diff --git a/noxfile.py b/noxfile.py index b78d91d..3182e5a 100644 --- a/noxfile.py +++ b/noxfile.py @@ -173,12 +173,30 @@ def _build_brand_kit_zip() -> None: @nox.session(python="3.11") def mutation(session: nox.Session) -> None: - """Run mutation testing with mutmut on the rule engine core. 
- - Target: src/zenzic/core/rules.py — the heart of the Sentinel's detection logic. - A surviving mutant means a test gap. Goal: mutation score > 90%. + """Run mutation testing with mutmut on the security-critical core modules. + + Targets (configured in ``[tool.mutmut]`` in ``pyproject.toml``): + - ``src/zenzic/core/rules.py`` — rule engine and regex canary + - ``src/zenzic/core/shield.py`` — secret detection (ZRT-001/ZRT-003) + - ``src/zenzic/core/reporter.py`` — _obfuscate_secret() masking function + + A surviving mutant means a test gap. Goal: mutation score ≥ 90%. + + Implementation note — non-editable install: + ``uv sync`` installs zenzic as an editable package whose ``.pth`` file + points Python directly to the original ``src/`` tree. This bypasses + mutmut's mutation injection, which modifies a *copy* of the source + files inside ``mutants/``. The ``uv pip install --no-editable`` step + below switches to a static install so that the mutations are visible to + pytest during each test run. The sync step is still needed first to + resolve and install all transitive test dependencies. """ session.run(*_SYNC_TEST, external=True) + # Reinstall as non-editable so that mutmut's source injection is visible + # to pytest (editable .pth files would bypass the mutated copy in mutants/). + # Note: 'uv pip install .' (without --editable) installs the built wheel, + # which is non-editable by default. + session.run("uv", "pip", "install", ".", external=True) session.run( "mutmut", "run", diff --git a/pyproject.toml b/pyproject.toml index 1340957..eb7b761 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -147,6 +147,9 @@ markers = [ source = ["src"] branch = true omit = ["tests/*"] +# relative_files ensures that coverage paths from editable installs match +# the src/ tree, which is required for mutmut v3's source→test mapping. 
+relative_files = true [tool.coverage.report] show_missing = true @@ -161,11 +164,17 @@ exclude_lines = [ # ─── Mutation testing (Sentinel Rigor) ──────────────────────────────────────── [tool.mutmut] -paths_to_mutate = ["src/zenzic/core/rules.py"] -tests_dir = ["tests/"] +paths_to_mutate = [ + "src/zenzic/core/rules.py", + "src/zenzic/core/shield.py", + "src/zenzic/core/reporter.py", +] +# pytest_add_cli_args_test_selection specifies which tests to run (replaces tests_dir). +pytest_add_cli_args_test_selection = ["tests/"] also_copy = ["src/"] -# Assicura che venga usato pytest con la nostra configurazione -runner = "python3 -m pytest -x" +# Override importlib mode: mutmut v3's import-graph analyser requires classic +# prepend mode to trace source→test links (our addopts sets importlib mode). +pytest_add_cli_args = ["--import-mode=prepend"] # ─── Version bumping ─────────────────────────────────────────────────────────── diff --git a/tests/test_redteam_remediation.py b/tests/test_redteam_remediation.py index 659133e..d030f05 100644 --- a/tests/test_redteam_remediation.py +++ b/tests/test_redteam_remediation.py @@ -32,14 +32,24 @@ # Guard the import so that Commit 1 alone remains test-runnable: the two # shield-dependent test classes are skipped until Commit 2 is applied. 
try: - from zenzic.core.scanner import ReferenceScanner - from zenzic.core.shield import _normalize_line_for_shield, scan_line_for_secrets + from zenzic.core.reporter import Finding, SentinelReporter, _obfuscate_secret + from zenzic.core.scanner import ReferenceScanner, _map_shield_to_finding + from zenzic.core.shield import ( + SecurityFinding, + _normalize_line_for_shield, + scan_line_for_secrets, + ) _SHIELD_AVAILABLE = True except ImportError: _normalize_line_for_shield = None # type: ignore[assignment] scan_line_for_secrets = None # type: ignore[assignment] ReferenceScanner = None # type: ignore[assignment] + _map_shield_to_finding = None # type: ignore[assignment] + SecurityFinding = None # type: ignore[assignment] + Finding = None # type: ignore[assignment] + SentinelReporter = None # type: ignore[assignment] + _obfuscate_secret = None # type: ignore[assignment] _SHIELD_AVAILABLE = False _shield_skip = pytest.mark.skipif( @@ -353,3 +363,168 @@ def test_run_vsm_passes_context_to_rule(self) -> None: context=ctx, ) assert findings == [], f"Expected no findings with context, got: {findings}" + + +# ─── Mutation Gate: Commit 2 — Shield ↔ Reporter bridge integrity ───────────── + + +@_shield_skip +class TestShieldReportingIntegrity: + """Mutation Gate: these tests target _map_shield_to_finding() and _obfuscate_secret(). + + Each test is designed to kill one of the three mandatory mutants defined in + the Mutation Gate (CONTRIBUTING.md, Obligation 4). + + - ``test_map_always_emits_security_breach_severity`` → kills **The Invisible** + - ``test_obfuscate_never_leaks_raw_secret`` → kills **The Amnesiac** + - ``test_pipeline_appends_breach_finding_to_list`` → kills **The Silencer** + """ + + _DOCS_ROOT = Path("/docs") + # Valid Stripe live key: 'sk_live_' (8) + exactly 24 alphanumeric chars. 
+ _STRIPE_KEY = "sk_live_1234567890ABCDEFGHIJKLMN" + _FILE = Path("/docs/leaky.md") + + def _make_sf( + self, secret_type: str = "stripe-live-key", key: str | None = None + ) -> SecurityFinding: + raw = key or self._STRIPE_KEY + return SecurityFinding( + file_path=self._FILE, + line_no=7, + secret_type=secret_type, + url=f"stripe_key: {raw}", + col_start=12, + match_text=raw, + ) + + def test_map_always_emits_security_breach_severity(self) -> None: + """The Invisible: _map_shield_to_finding() must set severity='security_breach'. + + A mutant that changes ``severity='security_breach'`` to ``severity='error'`` + or ``severity='warning'`` causes the CLI runner to exit 1 instead of 2, + silently downgrading a security breach to an ordinary check failure. + This test makes that mutant visible. + """ + finding = _map_shield_to_finding(self._make_sf(), self._DOCS_ROOT) + + assert finding.severity == "security_breach", ( + f"Expected severity='security_breach', got '{finding.severity}'. " + "Any other severity value causes Exit 1 instead of Exit 2." + ) + # Explicit negative assertions — each covers one mutation site. + assert finding.severity != "error" + assert finding.severity != "warning" + assert finding.severity != "info" + + def test_obfuscate_never_leaks_raw_secret(self) -> None: + """The Amnesiac: _obfuscate_secret() and the reporter pipeline must never expose + the raw secret. + + The full Stripe key must not appear in reporter output in any form. + A mutant that removes obfuscation (e.g. returns the input unchanged, or + uses ``str.upper()`` instead of redaction) is caught because: + + 1. The raw key is asserted absent from ``_obfuscate_secret()``'s return value. + 2. The raw key is asserted absent from the captured full reporter output. + 3. The obfuscated form is asserted present in the output. + 4. The correct file:line reference is asserted present in the output. 
+ """ + from io import StringIO + + from rich.console import Console + + raw = self._STRIPE_KEY + obfuscated = _obfuscate_secret(raw) + + # ── Unit-level assertions on _obfuscate_secret() ───────────────────── + assert raw not in obfuscated, ( + f"_obfuscate_secret must not return the raw secret. Got: {obfuscated!r}" + ) + assert "*" in obfuscated, "Obfuscated form must replace the body with asterisks." + assert obfuscated != "*" * len(raw), ( + "_obfuscate_secret must preserve prefix and suffix for human verification." + ) + assert obfuscated[:4] == raw[:4], "First 4 chars must be preserved." + assert obfuscated[-4:] == raw[-4:], "Last 4 chars must be preserved." + + # ── Integration: raw key must not appear in reporter output ─────────── + buf = StringIO() + con = Console(file=buf, no_color=True, highlight=False, width=120) + reporter = SentinelReporter(con, self._DOCS_ROOT) + + breach_finding = Finding( + rel_path="leaky.md", + line_no=7, + code="SHIELD", + severity="security_breach", + message="Secret detected (stripe-live-key) — rotate immediately.", + source_line=f"stripe_key: {raw}", + col_start=12, + match_text=raw, + ) + reporter.render( + [breach_finding], + version="test", + elapsed=0.1, + docs_count=1, + assets_count=0, + engine="test", + ) + output = buf.getvalue() + + # The raw full secret must NEVER appear in any rendered line. + assert raw not in output, ( + f"Raw secret found in reporter output.\n" + f" Secret: {raw!r}\n" + f" Obfuscated expected: {obfuscated!r}\n" + f" Output excerpt: {output[:300]!r}" + ) + # The obfuscated form must be present so the operator knows what to rotate. + assert obfuscated in output, ( + f"Obfuscated form {obfuscated!r} must appear in reporter output." + ) + # The reporter must identify the correct file and line number. + assert "leaky.md:7" in output, "Reporter must display 'file:line' for breach localisation." 
+ + def test_pipeline_appends_breach_finding_to_list(self) -> None: + """The Silencer: _map_shield_to_finding() must return a non-None Finding. + + A mutant that replaces the ``return Finding(...)`` with ``return None``, + or wraps the caller's ``findings.append(f)`` in a no-op condition, + would silently discard all breach findings. + This test kills that mutant by asserting count, identity, and field fidelity. + """ + sf = self._make_sf() + result = _map_shield_to_finding(sf, self._DOCS_ROOT) + + # Must return a Finding, never None. + assert result is not None, "_map_shield_to_finding must never return None." + assert isinstance(result, Finding), f"Expected Finding, got {type(result).__name__}." + + # Every Shield field must be forwarded with exact fidelity. + assert result.line_no == sf.line_no, "line_no must be forwarded from SecurityFinding." + assert result.col_start == sf.col_start, "col_start enables surgical caret rendering." + assert result.match_text == sf.match_text, ( + "match_text must be forwarded so the reporter can obfuscate it." + ) + assert sf.secret_type in result.message, ( + "secret_type must appear in the Finding message for operator triage." + ) + assert result.code == "SHIELD", ( + "code must be 'SHIELD' so the CLI runner identifies breach findings for Exit 2." + ) + + # Pipeline test: N SecurityFindings → exactly N breach Findings. + sfs = [ + self._make_sf("aws-access-key", "AKIA1234567890ABCDEF"), + self._make_sf("stripe-live-key"), + ] + findings_list: list[Finding] = [] + for each_sf in sfs: + findings_list.append(_map_shield_to_finding(each_sf, self._DOCS_ROOT)) + + assert len(findings_list) == 2, ( + f"Expected 2 Finding objects from 2 SecurityFindings, got {len(findings_list)}. " + "A Silencer mutant (no-op return / conditional append) would produce 0." 
+ ) From 120ce6018b6fcfe409bc8a042194cf29d1a7e9ac Mon Sep 17 00:00:00 2001 From: PythonWoods Date: Sat, 4 Apr 2026 20:24:30 +0200 Subject: [PATCH 04/16] docs(security): bilingual parity, CHANGELOG a4 final, mutation gate docs (Commit 4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CHANGELOG [0.5.0a4] additions: - Z-SEC-002: breach panel, _obfuscate_secret, _map_shield_to_finding, post-render Exit 2 - Z-TEST-003: TestShieldReportingIntegrity — The Sentinel's Trial manual results (28 tests, all green; mutmut v3 editable-install waiver documented) Bilingual parity — security analysis doc (EN + IT): - Section 6 regression table: added TestShieldReportingIntegrity (Z-SEC-002) Architecture docs (untracked -> committed): - docs/arch/vsm_engine.md + docs/it/arch/vsm_engine.md - docs/internal/security/shattered_mirror_report.md + docs/it/ mirror CONTRIBUTING.md: - Obligation 4: correct Silencer target (_map_shield_to_finding returns None); correct command (no posargs); add mutmut_pytest.ini note; manual verification workflow - Mutation targets: rules.py + shield.py + reporter.py Infrastructure: - .gitignore: negation rule for mutmut_pytest.ini (comment on separate line) - mutmut_pytest.ini: SPDX headers added; tracked in repo Self-check: just check -> ZERO errors (116 files, all clean) --- .gitignore | 2 + CHANGELOG.md | 150 +++++++ CONTRIBUTING.it.md | 287 ++++++++++++ CONTRIBUTING.md | 320 ++++++++++++- docs/arch/vsm_engine.md | 414 +++++++++++++++++ .../security/shattered_mirror_report.md | 417 +++++++++++++++++ docs/it/arch/vsm_engine.md | 417 +++++++++++++++++ .../security/shattered_mirror_report.md | 422 ++++++++++++++++++ mutmut_pytest.ini | 24 + 9 files changed, 2452 insertions(+), 1 deletion(-) create mode 100644 docs/arch/vsm_engine.md create mode 100644 docs/internal/security/shattered_mirror_report.md create mode 100644 docs/it/arch/vsm_engine.md create mode 100644 docs/it/internal/security/shattered_mirror_report.md 
create mode 100644 mutmut_pytest.ini diff --git a/.gitignore b/.gitignore index 3a0574d..06882b2 100644 --- a/.gitignore +++ b/.gitignore @@ -82,6 +82,8 @@ coverage-*.json .tox/ .nox/ mutmut* +!mutmut_pytest.ini +# ↑ Keep mutmut_pytest.ini tracked: isolated pytest config for the mutation session. .mutmut-cache/ mutants/ diff --git a/CHANGELOG.md b/CHANGELOG.md index ae7bbd7..11ab737 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,156 @@ Versions follow [Semantic Versioning](https://semver.org/). ## [Unreleased] +## [0.5.0a4] — 2026-04-05 — The Sentinel Hardens: Security Sprint + +> **Security Analysis Remediation.** The v0.5.0a3 security review exposed four confirmed +> vulnerabilities in v0.5.0a3. This release closes all four attack vectors and +> adds structural defences that outlast any individual exploit. The Sentinel +> no longer sleeps. +> +> Branch: `fix/sentinel-hardening-v0.5.0a4` + +### ⚠️ Security + +- **[ZRT-001] Shield Blind Spot — YAML Frontmatter Bypass (CRITICAL).** + `_skip_frontmatter()` was used as the Shield's line source, silently + discarding every line in a file's YAML `---` block before the regex + engine ran. Any key-value pair (`aws_key: AKIA…`, `github_token: ghp_…`) + was invisible to the Shield and would have exited `zenzic check all` with + code `0`. + **Fix:** The Shield stream now uses a raw `enumerate(fh, start=1)` — + every byte of the file is scanned. The content stream (ref-def harvesting) + still uses `_iter_content_lines()` with frontmatter skipping to avoid + false-positive link findings from metadata values. This is the + **Dual-Stream** architecture described in the remediation directives. 
+ *Exploit PoC confirmed via live script: 0 findings before fix, correct + detection of AWS / OpenAI / Stripe / GitHub tokens after fix.* + +- **[ZRT-002] ReDoS + ProcessPoolExecutor Deadlock (HIGH).** + A `[[custom_rules]]` pattern like `^(a+)+$` passed the eager + `_assert_pickleable()` check (pickle is blind to regex complexity) and + was distributed to worker processes. The `ProcessPoolExecutor` had no + timeout: any worker hitting a ReDoS-vulnerable pattern on a long input + line hung permanently, blocking the entire CI pipeline. + **Two defences added:** + — *Canary (prevention):* `_assert_regex_canary()` stress-tests every + `CustomRule` pattern against three canary strings (`"a"*30+"b"`, etc.) + under a `signal.SIGALRM` watchdog of 100 ms at `AdaptiveRuleEngine` + construction time. ReDoS patterns raise `PluginContractError` before the + first file is scanned. (Linux/macOS only; silently skipped on Windows.) + — *Timeout (containment):* `ProcessPoolExecutor.map()` replaced with + `submit()` + `future.result(timeout=30)`. A timed-out worker produces a + `Z009: ANALYSIS_TIMEOUT` `RuleFinding` instead of hanging the scan. + The new `_make_timeout_report()` and `_make_error_report()` helpers + ensure clean error surfacing in the standard findings UI. + *Exploit PoC confirmed: `^(a+)+$` on `"a"*30+"b"` timed out in 5 s; + both defences independently prevent scan lock-up.* + +- **[ZRT-003] Split-Token Shield Bypass — Markdown Table Obfuscation (MEDIUM).** + The Shield's `scan_line_for_secrets()` ran each raw line through the + regex patterns once. A secret fragmented across backtick spans and a + string concatenation operator (`` `AKIA` + `1234567890ABCDEF` ``) inside + a Markdown table cell was never reconstructed, so the 20-character + contiguous `AKIA[0-9A-Z]{16}` pattern never matched. 
+ **Fix:** New `_normalize_line_for_shield()` pre-processor in `shield.py` + unwraps backtick spans, removes concatenation operators, and collapses + table pipes before scanning. Both the raw line and the normalised form are + scanned; a `seen` set prevents duplicate findings when both forms match. + +### Changed + +- **[ZRT-004] Context-Aware VSM Resolution — `VSMBrokenLinkRule` (MEDIUM).** + `_to_canonical_url()` was a `@staticmethod` without access to the source + file's directory. Relative hrefs containing `..` segments (e.g. + `../../c/target.md` from `docs/a/b/page.md`) were resolved as if they + originated from the docs root, producing false negatives: broken relative + links in nested files were silently passed. + **Fix:** New `ResolutionContext` dataclass (`docs_root: Path`, + `source_file: Path`) added to `rules.py`. `BaseRule.check_vsm()` and + `AdaptiveRuleEngine.run_vsm()` accept `context: ResolutionContext | None` + (default `None` — fully backwards-compatible). `_to_canonical_url()` is + now an instance method that resolves `..` segments via `os.path.normpath` + relative to `context.source_file.parent` when context is provided, then + re-maps to a docs-relative posix path before the clean-URL transformation. + Paths that escape `docs_root` return `None` (Shield boundary respected). + +- **[GA-1] Telemetry / Executor Worker Count Synchronisation.** + `ProcessPoolExecutor(max_workers=workers)` used the raw `workers` sentinel + (may be `None`) while the telemetry reported `actual_workers` (always an + integer). Both now use `actual_workers`, eliminating the divergence. + +- **Stream Multiplexing** (`scanner.py`). `ReferenceScanner.harvest()` + now explicitly documents its two-stream design: **Shield stream** (all + lines, raw `enumerate`) and **Content stream** (`_iter_content_lines`, + frontmatter/fence filtered). Comments updated to make the architectural + intent visible to future contributors. 
+ +- **[Z-SEC-002] Secure Breach Reporting Pipeline (Commit 2).** + Four structural changes harden the path from secret detection to CI output: + + — *Breach Panel (`reporter.py`):* findings with `severity="security_breach"` + render as a dedicated high-contrast panel (red on white) positioned before + all other findings. Surgical caret underlines (`^^^^`) are positioned using + the `col_start` and `match_text` fields added to `SecurityFinding`. + + — *Surgical Secret Masking — `_obfuscate_secret()`:* raw secret material is + never passed to Rich or CI log streams. The function partially redacts + credentials (first 4 + last 4 chars; full redaction for strings ≤ 8 chars) + and is the **sole authorised path** for rendering secret values in output. + + — *Bridge Function — `_map_shield_to_finding()` (`scanner.py`):* a single + pure function is the only authorised conversion point between the Shield + detection layer and `SentinelReporter`. Extracted as a standalone function + so that mutation testing can target it directly and unambiguously. + + — *Post-Render Exit 2 (`cli.py`):* the security hard-stop is now applied + **after** `reporter.render()`, guaranteeing the full breach panel is + visible in CI logs before the process exits with code 2. + +### Testing + +- **`tests/test_redteam_remediation.py`** — 25 new tests organised in four + classes, one per ZRT finding: + - `TestShieldFrontmatterCoverage` (4 tests) — verifies Shield catches + AWS, GitHub, and multi-pattern secrets inside YAML frontmatter; confirms + correct line-number reporting; guards against false positives on clean + metadata. + - `TestReDoSCanary` (6 tests) — verifies canary rejects classic `(a+)+` + and alternation-based `(a|aa)+` ReDoS patterns at engine construction; + confirms safe patterns pass; verifies non-`CustomRule` subclasses are + skipped. 
+ - `TestShieldNormalizer` (8 tests) — verifies `_normalize_line_for_shield` + unwraps backtick spans, removes concat operators, collapses table pipes; + verifies `scan_line_for_secrets` catches split-token AWS key; confirms + deduplication prevents double-emit when raw and normalised both match. + - `TestVSMContextAwareResolution` (7 tests) — verifies multi-level `..` + resolution from nested dirs, single `..` from subdirs, absent-from-VSM + still emits Z001, path-traversal escape returns no false Z001, backwards + compatibility without context, `index.md` directory mapping, and + `run_vsm` context forwarding. +- **`tests/test_rules.py`** — `_BrokenVsmRule.check_vsm()` updated to + accept the new `context=None` parameter (API compatibility fix). +- **731 tests pass.** Zero regressions. `pytest --tb=short` — all green. + +- **`TestShieldReportingIntegrity` — Mutation Gate (Commit 3, Z-TEST-003).** + Three mandatory tests serving as permanent Mutation Gate guards for the + security reporting pipeline: + - *The Invisible:* `_map_shield_to_finding()` must always emit + `severity="security_breach"` — a downgrade to `"warning"` is caught + immediately (`assert 'warning' == 'security_breach'`). + - *The Amnesiac:* `_obfuscate_secret()` must never return the raw secret + — removing the redaction logic is caught immediately + (`assert raw_key not in output`). + - *The Silencer:* `_map_shield_to_finding()` must never return `None` — + a bridge function that discards findings is caught immediately + (`assert result is not None`). + + **Manual verification (The Sentinel's Trial):** all three mutants were + applied by hand and confirmed killed. `mutmut` v3 automatic reporting was + blocked by an editable-install interaction (see `mutmut_pytest.ini`); manual + verification accepted per Architecture Lead authorisation (Z-TEST-003). 
+ **28 tests in `test_redteam_remediation.py`, all green.** + ## [0.5.0a3] — 2026-04-03 — The Sentinel: Aesthetic Sprint, Parallel Anchors & Agnostic Target > **Sprint 13 + 14 + 15.** Three tracks delivered in one tag. diff --git a/CONTRIBUTING.it.md b/CONTRIBUTING.it.md index 9919de2..8b2f627 100644 --- a/CONTRIBUTING.it.md +++ b/CONTRIBUTING.it.md @@ -140,6 +140,47 @@ Zenzic offre standard compatibile e out/box di adozione i18n implementata `mkdoc **Proibizione Link Assoluti** Zenzic scarta rigorosamente le reference con inizializzazione `/` per non vincolarsi perentoriamente al root-doman root. Nel momento di migrazione verso public directory o hosting diramata in namespace specifici origin site (e.g. `/docs`), una reference index base come `[Home](/docs/assets/logo.png)` imploderebbe. Fai valere link interni come percorsi parent path (e.g. `../assets/logo.png`) incrementando portabilità del progetto e documentazione a lungo termine offline/online. +### Sovranità della VSM + +Qualsiasi controllo di esistenza su una risorsa interna (pagina, immagine, ancora) **deve** interrogare la Virtual Site Map — mai il filesystem. + +**Perché:** La VSM include le **Ghost Route** — URL canonici generati da plugin di build (es. `reconfigure_material: true`) che non hanno un file `.md` fisico su disco. Una chiamata a `Path.exists()` restituisce `False` per una Ghost Route. La VSM restituisce `REACHABLE`. La VSM è l'oracolo; il filesystem non lo è. + +**Violazione di Grado 1:** Usare `os.path.exists()`, `Path.is_file()`, o qualsiasi altra probe al filesystem per validare un link interno è una violazione architetturale di Grado 1. Le PR che contengono questo pattern saranno chiuse senza revisione. + +```python +# ❌ Violazione Grado 1 — interroga il filesystem, manca le Ghost Route +if (docs_root / relative_path).exists(): + ... + +# ✅ Corretto — interroga la VSM +route = vsm.get(canonical_url) +if route and route.status == "REACHABLE": + ... 
+``` + +Correlato: vedi `docs/arch/vsm_engine.md` — *Catalogo degli Anti-Pattern* per l'elenco completo delle chiamate al filesystem vietate nelle regole. + +### Ghost Route Awareness + +Le regole di rilevamento orfani devono rispettare le route contrassegnate come Ghost Route nella VSM. Una Ghost Route non è un orfano — è una route che il motore di build genera al momento della build da un plugin, senza un file sorgente `.md`. + +**Azione:** Ogni nuova regola di scansione globale che esegue il rilevamento orfani deve accettare un parametro costruttore `include_ghosts: bool = False`. Quando `include_ghosts=False` (il default), le route con `status == "ORPHAN_BUT_EXISTING"` generate da un meccanismo Ghost Route devono essere escluse dai finding. + +```python +class MiaRegolaOrfani(BaseRule): + def __init__(self, include_ghosts: bool = False) -> None: + self._include_ghosts = include_ghosts + + def check_vsm(self, file_path, text, vsm, anchors_cache, context=None): + for url, route in vsm.items(): + if route.status == "ORPHAN_BUT_EXISTING": + # Salta gli orfani derivati da Ghost Route a meno che non siano inclusi esplicitamente + if not self._include_ghosts and _is_ghost_derived(route): + continue + ... +``` + ## Sicurezza & Conformità - **Sicurezza Piena:** Prevenire manipolazioni estese con `PathTraversal`. Verificare il bypass con Pathing Check su codebase in logica risolvitiva nativa `core`. @@ -148,6 +189,252 @@ Zenzic scarta rigorosamente le reference con inizializzazione `/` per non vincol --- +## Lo Scudo e il Canarino + +Questa sezione documenta le **quattro obbligazioni di sicurezza** che si applicano a +ogni PR che tocca `src/zenzic/core/`. Una PR che risolve un bug senza soddisfare +tutte e quattro verrà rifiutata dal Responsabile Architettura. 
+ +Queste regole esistono perché l'analisi di sicurezza v0.5.0a3 (2026-04-04) ha +dimostrato che quattro scelte di design individualmente ragionevoli — ciascuna +corretta in isolamento — si sono composte in quattro distinti vettori di attacco. +Vedi `docs/internal/security/shattered_mirror_report.md` per il post-mortem completo. + +--- + +### Obbligazione 1 — La Tassa di Sicurezza (Timeout Worker) + +Ogni PR che modifica l'uso di `ProcessPoolExecutor` in `scanner.py` deve +preservare la chiamata `future.result(timeout=_WORKER_TIMEOUT_S)`. Il timeout +corrente è **30 secondi**. + +**Cosa significa:** + +```python +# ✅ Forma richiesta — usa sempre submit() + result(timeout=...) +futures_map = {executor.submit(_worker, item): item[0] for item in work_items} +for fut, md_file in futures_map.items(): + try: + raw.append(fut.result(timeout=_WORKER_TIMEOUT_S)) + except concurrent.futures.TimeoutError: + raw.append(_make_timeout_report(md_file)) # finding Z009 + +# ❌ Vietato — si blocca indefinitamente su ReDoS o worker in deadlock +raw = list(executor.map(_worker, work_items)) +``` + +**Il finding Z009** (`ANALYSIS_TIMEOUT`) non è un crash. È un finding strutturato +che appare nell'interfaccia del report standard. Un worker che va in timeout non +interrompe la scansione — il coordinatore continua con i worker rimanenti. + +**Se la tua modifica richiede naturalmente un timeout più lungo** (es. una nuova +regola esegue calcoli costosi), aumenta `_WORKER_TIMEOUT_S` con un commento che +spiega il costo e un benchmark che dimostra l'input peggiore. + +--- + +### Obbligazione 2 — Il Protocollo Regex-Canary + +Ogni voce `[[custom_rules]]` che specifica un `pattern` è soggetta al +**Regex-Canary**, uno stress test basato su POSIX `SIGALRM` che viene eseguito +al momento della costruzione di `AdaptiveRuleEngine`. 
+ +**Come funziona il canary:** + +```python +# _assert_regex_canary() in rules.py — eseguito automaticamente per ogni CustomRule +_CANARY_STRINGS = ( + "a" * 30 + "b", # trigger classico (a+)+ + "A" * 25 + "!", # variante maiuscola + "1" * 20 + "x", # variante numerica +) +_CANARY_TIMEOUT_S = 0.1 # 100 ms +``` + +Il canary applica ciascuna delle tre stringhe al metodo `check()` della regola +sotto un watchdog di 100 ms. Se il pattern non si completa entro 100 ms su +qualsiasi di queste stringhe, il motore solleva `PluginContractError` prima +che la scansione inizi. + +**Testare il pattern contro il canary prima di committare:** + +```python +from pathlib import Path +from zenzic.core.rules import CustomRule, _assert_regex_canary +from zenzic.core.exceptions import PluginContractError + +rule = CustomRule( + id="MIA-001", + pattern=r"il-tuo-pattern-qui", + message="Trovato.", + severity="warning", +) + +try: + _assert_regex_canary(rule) + print("✅ Canary passato — il pattern è sicuro per la produzione") +except PluginContractError as e: + print(f"❌ Canary fallito — rischio ReDoS rilevato:\n{e}") +``` + +Oppure dalla shell: + +```bash +uv run python -c " +from zenzic.core.rules import CustomRule, _assert_regex_canary +r = CustomRule(id='T', pattern=r'IL_TUO_PATTERN', message='.', severity='warning') +_assert_regex_canary(r) +print('sicuro') +" +``` + +**Pattern da evitare** (trigger di backtracking catastrofico): + +| Pattern | Perché pericoloso | +|---------|------------------| +| `(a+)+` | Quantificatori annidati — percorsi esponenziali | +| `(a\|aa)+` | Alternazione con sovrapposizione | +| `(a*)*` | Star annidato — match vuoti infiniti | +| `.+foo.+bar` | Multi-wildcard greedy con suffisso | + +**Pattern sempre sicuri:** + +| Pattern | Note | +|---------|------| +| `TODO` | Match letterale, O(n) | +| `^(BOZZA\|WIP):` | Alternazione ancorata, O(1) per posizione | +| `[A-Z]{3}-\d+` | Classi di caratteri limitate | +| `\bfoo\b` | Ancorato a word-boundary | + 
+**Nota piattaforma:** `_assert_regex_canary()` usa `signal.SIGALRM`, disponibile +solo sui sistemi POSIX (Linux, macOS). Su Windows, il canary è un no-op. Il timeout +del worker (Obbligazione 1) è il backstop universale. + +**Overhead del canary:** Misurato a **0,12 ms** per costruzione del motore con 10 +regole sicure (mediana su 20 iterazioni). È un costo una-tantum all'avvio della +scansione, ben entro il budget accettabile della "Tassa di Sicurezza". + +--- + +### Obbligazione 3 — L'Invariante Dual-Stream dello Shield + +Lo stream Shield e lo stream Contenuto in `ReferenceScanner.harvest()` non devono +**mai condividere un generatore**. Questa è la lezione architetturale di ZRT-001. + +```python +# ✅ CORRETTO — generatori indipendenti, contratti di filtraggio indipendenti +with file_path.open(encoding="utf-8") as fh: + for lineno, line in enumerate(fh, start=1): # Shield: TUTTE le righe + list(scan_line_for_secrets(line, file_path, lineno)) + +for lineno, line in _iter_content_lines(file_path): # Contenuto: filtrato + ... + +# ❌ VIETATO — condividere un generatore fa cadere il frontmatter dallo Shield +with file_path.open(encoding="utf-8") as fh: + shared = _skip_frontmatter(fh) + for lineno, line in shared: + list(scan_line_for_secrets(...)) # ← cieco al frontmatter + for lineno, line in shared: # ← già esaurito + ... +``` + +**Performance Shield:** La doppia scansione (riga grezza + normalizzata) opera a +circa **235.000 righe/secondo** (misurato: mediana 12,74 ms per 3.000 righe su +20 iterazioni). Il normalizzatore aggiunge un passaggio per riga, ma il set `seen` +previene finding duplicati, mantenendo l'output deterministico. + +Se una PR fa refactoring di `harvest()` e il benchmark CI scende sotto **100.000 +righe/secondo**, rifiutare e investigare prima del merge. + +--- + +### Obbligazione 4 — Mutation Score ≥ 90% per le Modifiche Core + +Ogni PR che modifica `src/zenzic/core/` deve mantenere o migliorare il mutation +score sul modulo interessato. 
La baseline attuale per `rules.py` è **86,7%** +(242/279 mutanti uccisi). + +L'obiettivo per rc1 è **≥ 90%**. Una PR che aggiunge una nuova regola o modifica +la logica di rilevamento senza uccidere i mutanti corrispondenti sarà rifiutata. + +**Eseguire il mutation testing:** + +```bash +nox -s mutation +``` + +**Interpretare i mutanti sopravvissuti:** + +Non tutti i mutanti sopravvissuti sono equivalenti. Prima di contrassegnare un +mutante come accettabile, verifica che: + +1. Il mutante cambia un comportamento osservabile (non è logicamente equivalente). +2. Nessun test esistente cattura il mutante (è una lacuna genuina). +3. Aggiungere un test per ucciderlo sarebbe ridondante o circolare. + +In caso di dubbio, aggiungi il test. La suite di mutation testing è un documento +vivente del modello di minaccia della Sentinella. + +**Validazione pickle di ResolutionContext (Eager Validation 2.0):** + +`ResolutionContext` è un `@dataclass(slots=True)` con soli campi `Path`. `Path` +è serializzabile con pickle dalla standard library. L'oggetto si serializza in +157 byte. Tuttavia, se `ResolutionContext` acquisisce un campo non serializzabile +(es. un file handle, un lock, una lambda), il motore parallelo fallirà in modo +silenzioso. + +Per proteggersi da questo, qualsiasi PR che aggiunge un campo a `ResolutionContext` +deve includere: + +```python +# In tests/test_redteam_remediation.py (o in un test dedicato): +def test_resolution_context_is_pickleable(): + import pickle + ctx = ResolutionContext(docs_root=Path("/docs"), source_file=Path("/docs/a.md")) + assert pickle.loads(pickle.dumps(ctx)) == ctx +``` + +Questo test esiste già nella suite di test a partire da v0.5.0a4. + +**Integrità del Reporting Shield (Il Mutation Gate per il Commit 2+):** + +Il requisito di conformità per il mutation score dello Shield è **più ampio** +della sola detection. 
Riguarda anche la **pipeline di reporting**:
+
+> *Un segreto che viene rilevato ma non segnalato correttamente è un bug CRITICO —
+> indistinguibile da un segreto che non è mai stato rilevato.*
+
+Qualsiasi PR che tocca la funzione `_map_shield_to_finding()`, il percorso di
+severità `SECURITY_BREACH` in `SentinelReporter`, o il routing dell'exit code in
+`cli.py` **deve uccidere tutti e tre questi mutanti obbligatori** prima che la PR
+venga accettata:
+
+| Nome mutante | Cosa cambierebbe mutmut | Test che deve ucciderlo |
+|-------------|------------------------|------------------------|
+| **L'Invisibile** | `severity="security_breach"` → `severity="warning"` | L'exit code deve essere 2, non 1 |
+| **L'Amnesico** | Rimuove l'offuscamento → espone il segreto completo | L'output del log non deve contenere la stringa grezza |
+| **Il Silenziatore** | `findings.append(...)` → `pass` | L'asserzione sul conteggio dei finding deve fallire |
+
+**Eseguire il mutation gate:**
+
+```bash
+nox -s mutation
+```
+
+La sessione copre `rules.py`, `shield.py` e `reporter.py` come configurato in
+`[tool.mutmut]` in `pyproject.toml`. Non sono richiesti posargs.
+
+Risultato atteso prima del merge di qualsiasi PR Commit 2+:
+
+```text
+Killed: XXX, Survived: Y
+Mutation score: ≥ 90.0%
+```
+
+Se il punteggio è sotto il 90%, aggiungi test mirati prima di riaprire la PR. Non
+contrassegnare mutanti sopravvissuti come equivalenti senza l'esplicita approvazione
+del responsabile architettura.
+
+---
+
 ## Aggiungere un nuovo check
 
 I check di Zenzic si trovano in `src/zenzic/core/`. Ogni check è una funzione autonoma in
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 48c6ae6..58bd1f2 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -41,7 +41,7 @@ the exact same environment as CI.
| **Self-lint** | **`just check`** | — | **Run Zenzic on its own documentation (strict)** | | Test suite | `just test` | `nox -s tests` | pytest + branch coverage (Hypothesis **dev** profile) | | Test suite (thorough) | `just test-full` | — | pytest with Hypothesis **ci** profile (500 examples) | -| Mutation testing | — | `nox -s mutation` | mutmut on `src/zenzic/core/rules.py` | +| Mutation testing | — | `nox -s mutation` | mutmut on `rules.py`, `shield.py`, `reporter.py` | | Full pipeline | `just preflight` | `nox -s preflight` | lint, typecheck, tests, reuse, security | | **Pre-push gate** | **`just verify`** | — | **preflight + production build — run before every push** | | Docs build (fast) | `just build` | — | mkdocs build, no strict enforcement | @@ -161,6 +161,59 @@ is hosted at the domain root. If documentation is served from a subdirectory (e. `https://example.com/assets/logo.png` (404), not to the intended asset. Use relative paths (`../assets/logo.png`) to guarantee portability regardless of the hosting environment. +### VSM Sovereignty + +Any existence check on an internal resource (page, image, anchor) **must** query +the Virtual Site Map — never the filesystem. + +**Why:** The VSM includes **Ghost Routes** — canonical URLs generated by build +plugins (e.g. `reconfigure_material: true`) that have no physical `.md` file +on disk. A `Path.exists()` call returns `False` for a Ghost Route. The VSM +returns `REACHABLE`. The VSM is the oracle; the filesystem is not. + +**Grade-1 violation:** Using `os.path.exists()`, `Path.is_file()`, or any other +filesystem probe to validate an internal link is a Grade-1 architectural +violation. PRs containing this pattern will be closed without review. + +```python +# ❌ Grade-1 violation — asks the filesystem, misses Ghost Routes +if (docs_root / relative_path).exists(): + ... + +# ✅ Correct — asks the VSM +route = vsm.get(canonical_url) +if route and route.status == "REACHABLE": + ... 
+``` + +Related: see `docs/arch/vsm_engine.md` — *Anti-Pattern Catalogue* for the +complete list of banned filesystem calls inside rules. + +### Ghost Route Awareness + +Orphan detection rules must respect routes flagged as Ghost Routes in the VSM. +A Ghost Route is not an orphan — it is a route that the build engine generates +at build time from a plugin, with no source `.md` file. + +**Action:** Every new global-scan rule that performs orphan detection must +accept an `include_ghosts: bool = False` constructor parameter. When +`include_ghosts=False` (the default), routes with `status == "ORPHAN_BUT_EXISTING"` +that were generated by a Ghost Route mechanism must be excluded from findings. + +```python +class MyOrphanRule(BaseRule): + def __init__(self, include_ghosts: bool = False) -> None: + self._include_ghosts = include_ghosts + + def check_vsm(self, file_path, text, vsm, anchors_cache, context=None): + for url, route in vsm.items(): + if route.status == "ORPHAN_BUT_EXISTING": + # Skip Ghost Route-derived orphans unless explicitly included + if not self._include_ghosts and _is_ghost_derived(route): + continue + ... +``` + ## Security & Compliance - **Security First:** Any new path resolution MUST be tested against Path Traversal. Use `PathTraversal` logic from `core`. @@ -169,6 +222,271 @@ is hosted at the domain root. If documentation is served from a subdirectory (e. --- +## The Shield & The Canary + +This section documents the **four security obligations** that apply to every +PR touching `src/zenzic/core/`. A PR that resolves a bug without satisfying +all four will be rejected by the Architecture Lead. + +These rules exist because the v0.5.0a3 security review (2026-04-04) demonstrated +that four individually reasonable design choices — each correct in isolation — +composed into four distinct attack vectors. See +`docs/internal/security/shattered_mirror_report.md` for the full post-mortem. 
+ +--- + +### Obligation 1 — The Security Tax (Worker Timeout) + +Every PR that modifies `ProcessPoolExecutor` usage in `scanner.py` must +preserve the `future.result(timeout=_WORKER_TIMEOUT_S)` call. The current +timeout is **30 seconds**. + +**What this means:** + +```python +# ✅ Required form — always use submit() + result(timeout=...) +futures_map = {executor.submit(_worker, item): item[0] for item in work_items} +for fut, md_file in futures_map.items(): + try: + raw.append(fut.result(timeout=_WORKER_TIMEOUT_S)) + except concurrent.futures.TimeoutError: + raw.append(_make_timeout_report(md_file)) # Z009 finding + +# ❌ Forbidden — blocks indefinitely on ReDoS or deadlocked workers +raw = list(executor.map(_worker, work_items)) +``` + +**The Z009 finding** (`ANALYSIS_TIMEOUT`) is not a crash. It is a structured +finding that surfaces in the standard report UI. A worker that times out does +not kill the scan — the coordinator continues with the remaining workers. + +**If your change naturally requires a longer timeout** (e.g. a new rule +performs expensive computation), increase `_WORKER_TIMEOUT_S` with a comment +explaining the cost and a benchmark proving the worst-case input. + +--- + +### Obligation 2 — The Regex-Canary Protocol + +Every `[[custom_rules]]` entry that specifies a `pattern` is subject to the +**Regex-Canary**, a POSIX `SIGALRM`-based stress test that runs at +`AdaptiveRuleEngine` construction time. + +**How the canary works:** + +```python +# _assert_regex_canary() in rules.py — runs automatically for every CustomRule +_CANARY_STRINGS = ( + "a" * 30 + "b", # classic (a+)+ trigger + "A" * 25 + "!", # uppercase variant + "1" * 20 + "x", # numeric variant +) +_CANARY_TIMEOUT_S = 0.1 # 100 ms +``` + +The canary applies each of the three strings to the rule's `check()` method +under a 100 ms watchdog. If the pattern does not complete within 100 ms on +any of these strings, the engine raises `PluginContractError` before the scan +begins. 
+ +**Testing your pattern against the canary before committing:** + +```python +from pathlib import Path +from zenzic.core.rules import CustomRule, _assert_regex_canary +from zenzic.core.exceptions import PluginContractError + +rule = CustomRule( + id="MY-001", + pattern=r"your-pattern-here", + message="Found.", + severity="warning", +) + +try: + _assert_regex_canary(rule) + print("✅ Canary passed — pattern is safe for production") +except PluginContractError as e: + print(f"❌ Canary failed — ReDoS risk detected:\n{e}") +``` + +Or from the shell: + +```bash +uv run python -c " +from zenzic.core.rules import CustomRule, _assert_regex_canary +r = CustomRule(id='T', pattern=r'YOUR_PATTERN', message='.', severity='warning') +_assert_regex_canary(r) +print('safe') +" +``` + +**Patterns to avoid** (catastrophic backtracking triggers): + +| Pattern | Why dangerous | +|---------|---------------| +| `(a+)+` | Nested quantifiers — exponential paths | +| `(a\|aa)+` | Alternation with overlap | +| `(a*)*` | Nested star — infinite empty matches | +| `.+foo.+bar` | Greedy multi-wildcard with suffix | + +**Patterns that are always safe:** + +| Pattern | Notes | +|---------|-------| +| `TODO` | Literal match, O(n) | +| `^(DRAFT\|WIP):` | Anchored alternation, O(1) at each position | +| `[A-Z]{3}-\d+` | Bounded character classes | +| `\bfoo\b` | Word-boundary anchored | + +**Platform note:** `_assert_regex_canary()` uses `signal.SIGALRM`, which is +only available on POSIX systems (Linux, macOS). On Windows, the canary is a +no-op. The worker timeout (Obligation 1) is the universal backstop. + +**Canary overhead:** Measured at **0.12 ms** per engine construction with 10 +safe rules (20-iteration median). This is a one-time cost at scan startup and +is well within the acceptable "Security Tax" budget. + +--- + +### Obligation 3 — The Shield's Dual-Stream Invariant + +The Shield stream and the Content stream in `ReferenceScanner.harvest()` must +**never share a generator**. 
This is the architectural lesson from ZRT-001.
+
+```python
+# ✅ CORRECT — independent generators, independent filtering contracts
+with file_path.open(encoding="utf-8") as fh:
+    for lineno, line in enumerate(fh, start=1):  # Shield: ALL lines
+        list(scan_line_for_secrets(line, file_path, lineno))
+
+for lineno, line in _iter_content_lines(file_path):  # Content: filtered
+    ...
+
+# ❌ FORBIDDEN — sharing a generator silently drops frontmatter from Shield
+with file_path.open(encoding="utf-8") as fh:
+    shared = _skip_frontmatter(fh)
+    for lineno, line in shared:
+        list(scan_line_for_secrets(...))  # ← blind to frontmatter
+    for lineno, line in shared:  # ← already exhausted
+        ...
+```
+
+**Shield performance:** The dual-scan (raw + normalised line) runs at
+approximately **235,000 lines/second** (measured: 12.74 ms median for 3,000
+lines over 20 iterations). The normalizer adds one pass per line but the
+`seen` set prevents duplicate findings, keeping output deterministic.
+
+If a PR refactors `harvest()` and the CI benchmark drops below **100,000
+lines/second**, reject and investigate before merging.
+
+---
+
+### Obligation 4 — Mutation Score ≥ 90% for Core Changes
+
+Any PR that modifies `src/zenzic/core/` must maintain or improve the mutation
+score on the affected module. The current baseline for `rules.py` is **86.7%**
+(242/279 mutants killed).
+
+The target for rc1 is **≥ 90%**. A PR that adds a new rule or modifies
+detection logic without killing the corresponding mutants will be rejected.
+
+**Running mutation testing:**
+
+```bash
+nox -s mutation
+```
+
+**Interpreting surviving mutants:**
+
+Not all surviving mutants are equivalent. Before marking a mutant as
+acceptable, confirm that:
+
+1. The mutant changes observable behaviour (it is not logically equivalent).
+2. No existing test catches the mutant (it is a genuine gap).
+3. Adding a test to kill it would be redundant or trivially circular.
+
+If unsure, add the test.
The mutation suite is a living document of the +Sentinel's threat model. + +**ResolutionContext pickle validation (Eager Validation 2.0):** + +`ResolutionContext` is a `@dataclass(slots=True)` with only `Path` fields. +`Path` is pickleable by the standard library. The object serializes to 157 +bytes. However, if `ResolutionContext` ever gains a field that is not +pickleable (e.g. a file handle, a lock, a lambda), the parallel engine will +fail silently. + +To guard against this, any PR that adds a field to `ResolutionContext` must +include: + +```python +# In tests/test_redteam_remediation.py (or a dedicated test): +def test_resolution_context_is_pickleable(): + import pickle + ctx = ResolutionContext(docs_root=Path("/docs"), source_file=Path("/docs/a.md")) + assert pickle.loads(pickle.dumps(ctx)) == ctx +``` + +This test already exists in the test suite as of v0.5.0a4. + +**Shield Reporting Integrity (The Mutation Gate for Commit 2+):** + +The conformance requirement for the mutation score on the Shield is **broader** +than detection alone. 
It also covers the **reporting pipeline**: + +> *A secret that is detected but not correctly reported is a CRITICAL bug — +> indistinguishable from a secret that was never detected at all.* + +Any PR that touches the `_map_shield_to_finding()` conversion function, +the `SECURITY_BREACH` severity path in `SentinelReporter`, or the exit-code +routing in `cli.py` **must kill all three of these mandatory mutants** before +the PR is accepted: + +| Mutant name | What is changed | Test that must kill it | +|-------------|----------------|------------------------| +| **The Invisible** | `severity="security_breach"` → `severity="warning"` in `_map_shield_to_finding()` | `test_map_always_emits_security_breach_severity` | +| **The Amnesiac** | `_obfuscate_secret()` returns `raw` instead of the redacted form | `test_obfuscate_never_leaks_raw_secret` | +| **The Silencer** | `_map_shield_to_finding()` returns `None` instead of a `Finding` | `test_pipeline_appends_breach_finding_to_list` | + +**Running the mutation gate:** + +```bash +nox -s mutation +``` + +The session targets `rules.py`, `shield.py`, and `reporter.py` as configured in +`[tool.mutmut]` in `pyproject.toml`. No posargs are required. + +> **Infrastructure note — `mutmut_pytest.ini`:** +> `mutmut` v3 generates trampolines in a `mutants/` working copy. For these +> to be visible to pytest, `mutants/src/` must precede the installed +> site-packages on `sys.path`. `mutmut_pytest.ini` (tracked in the repo) +> provides an isolated pytest config (`import-mode=prepend`, +> `pythonpath = src`) used exclusively by the `nox -s mutation` session. +> The main `pyproject.toml` pytest config is not affected. + +**Fallback — Manual Mutation Verification (The Sentinel's Trial):** + +If the automated tool cannot report a score (e.g. due to an editable-install +mapping issue), apply each mutant by hand and confirm the test fails: + +```bash +# 1. Apply mutant, run the specific test, confirm FAIL, revert. 
+git diff # verify only one targeted line changed +pytest tests/test_redteam_remediation.py::TestShieldReportingIntegrity -v +git checkout -- src/ # revert +``` + +Manual verification is accepted as a temporary waiver per Architecture Lead +approval. Document the results in the PR description before merging. + +If the score is below 90% (automated) or any of the three trials pass when +they should fail (manual), add targeted tests before reopening the PR. Do not +mark surviving mutants as equivalent without explicit Architecture Lead approval. + +--- + ## Adding a new check Zenzic's checks live in `src/zenzic/core/`. Each check is a standalone function in either `scanner.py` (filesystem traversal) or `validator.py` (content validation). CLI wiring is in `cli.py`. diff --git a/docs/arch/vsm_engine.md b/docs/arch/vsm_engine.md new file mode 100644 index 0000000..06b5cd3 --- /dev/null +++ b/docs/arch/vsm_engine.md @@ -0,0 +1,414 @@ +--- +icon: lucide/map +--- + + + + +# VSM Engine — Architecture & Resolution Protocol + +> *"The VSM does not know where a file is. It knows where a file goes."* + +This document describes the Virtual Site Map engine, the `ResolutionContext` +object introduced in v0.5.0a4, and the **Context-Free Anti-Pattern** that led +to ZRT-004. Any developer writing or reviewing VSM-aware rules must read this +page before merging. + +--- + +## 1. What the VSM Is (and Is Not) + +The Virtual Site Map (VSM) is a pure in-memory projection of what the build +engine will serve: + +```python +VSM = dict[str, Route] # canonical URL → Route +``` + +A `Route` contains: + +| Field | Type | Meaning | +|-------|------|---------| +| `url` | `str` | Canonical URL, e.g. `/guide/install/` | +| `source` | `str` | Relative source path, e.g. `guide/install.md` | +| `status` | `str` | `REACHABLE` / `ORPHAN_BUT_EXISTING` / `IGNORED` / `CONFLICT` | +| `anchors` | `frozenset[str]` | Heading slugs pre-computed from the source | + +The VSM is **not** a filesystem view. 
`Route.url` is the address a browser +would request, not the address a file system `open()` would accept. A file can +exist on disk (`Path.exists() == True`) and be `IGNORED` in the VSM. A URL can +be `REACHABLE` in the VSM and have no file on disk (Ghost Routes). + +**Corollary:** Any code that validates links by calling `Path.exists()` inside +a rule is wrong by definition. The VSM is the oracle; the filesystem is not. + +--- + +## 2. Route Status Reference + +```mermaid +flowchart TD + classDef ok fill:#052e16,stroke:#16a34a,stroke-width:2px,color:#d1fae5 + classDef warn fill:#3b1d00,stroke:#d97706,stroke-width:2px,color:#fef3c7 + classDef err fill:#1c0a0a,stroke:#dc2626,stroke-width:2px,color:#fee2e2 + classDef info fill:#0f172a,stroke:#38bdf8,stroke-width:2px,color:#e2e8f0 + + R["REACHABLE"]:::ok + O["ORPHAN_BUT_EXISTING"]:::warn + I["IGNORED"]:::info + C["CONFLICT"]:::err + + R -- "listed in nav: OR Ghost Route" --- R + O -- "on disk, absent from nav:" --- O + I -- "README.md, _private/" --- I + C -- "two .md files → same URL" --- C +``` + +| Status | Set by | Link to this status | +|--------|--------|---------------------| +| `REACHABLE` | nav listing, Ghost Route, locale shadow | ✅ Valid | +| `ORPHAN_BUT_EXISTING` | file on disk, absent from nav | ⚠️ Z002 warning | +| `IGNORED` | README not in nav, excluded patterns | ❌ Z001 error | +| `CONFLICT` | two sources → same canonical URL | ❌ Z001 error | + +--- + +## 3. 
URL Resolution: The Pipeline + +Converting a raw Markdown href (`../guide/install.md`) to a canonical URL +(`/guide/install/`) requires three transformations, applied in sequence: + +```mermaid +flowchart LR + classDef step fill:#0f172a,stroke:#6366f1,stroke-width:2px,color:#e2e8f0 + classDef gate fill:#0f172a,stroke:#f59e0b,stroke-width:2px,color:#e2e8f0,shape:diamond + classDef out fill:#052e16,stroke:#16a34a,stroke-width:2px,color:#d1fae5 + classDef bad fill:#1c0a0a,stroke:#dc2626,stroke-width:2px,color:#fee2e2 + + A["Raw href\n'../guide/install.md'"] + B["① Normalise\nurlsplit + unquote\nbackslash → /"]:::step + C{"② Context check\nhas .. AND context?"}:::gate + D["③ os.path.normpath\nrelative to source_dir"]:::step + E{"④ Boundary check\nstays within docs_root?"}:::gate + F["⑤ Clean-URL transform\n strip .md / index\n prepend /, append /"]:::step + G["/guide/install/"]:::out + H["None\n(skip, do not report)"]:::bad + + A --> B --> C + C -->|"yes"| D --> E + C -->|"no (root-relative)"| F + E -->|"yes"| F --> G + E -->|"no (escapes root)"| H +``` + +### Step ①: Normalise + +Strip query strings and percent-encoding artefacts: + +```python +parsed = urlsplit(href) +path = unquote(parsed.path.replace("\\", "/")).rstrip("/") +``` + +### Step ②–③: Context-Aware Relative Resolution (v0.5.0a4+) + +If the href contains `..` segments **and** a `ResolutionContext` is provided, +the path is resolved relative to the source file's directory: + +```python +if source_dir is not None and docs_root is not None and ".." in path: + raw_target = os.path.normpath(str(source_dir) + os.sep + path.replace("/", os.sep)) +``` + +Without context (backwards-compatible path), the `..` segments are carried +forward as-is into the clean-URL transform. This is correct for hrefs that do +*not* traverse upward (`../sibling.md` from `docs/index.md` is unambiguous) +but wrong for hrefs from deeply nested source files (see ZRT-004 below). 
+ +### Step ④: Boundary Check + +```python +def _to_canonical_url(href: str, source_dir=None, docs_root=None): + ... + root_str = str(docs_root) + if not (raw_target == root_str or raw_target.startswith(root_str + os.sep)): + return None # path escapes docs_root — Shield boundary +``` + +A path that escapes `docs_root` is not a broken link — it is a potential +traversal attack. It returns `None`, which is silently skipped by the caller. +No Z001 finding is emitted. No exception is raised. + +### Step ⑤: Clean-URL Transform + +```python +def _to_canonical_url(href: str, source_dir=None, docs_root=None): + ... + if path.endswith(".md"): + path = path[:-3] + + parts = [p for p in path.split("/") if p] + if parts and parts[-1] == "index": + parts = parts[:-1] + + return "/" + "/".join(parts) + "/" +``` + +--- + +## 4. ResolutionContext — The Context Protocol + +### Definition + +```python +@dataclass(slots=True) +class ResolutionContext: + """Source-file context for VSM-aware rules. + + Attributes: + docs_root: Absolute path to the docs/ directory. + source_file: Absolute path of the Markdown file being checked. + """ + docs_root: Path + source_file: Path +``` + +### Why It Exists + +Before v0.5.0a4, `VSMBrokenLinkRule._to_canonical_url()` was a `@staticmethod`. +It received only `href: str`. This is the **Context-Free Anti-Pattern**. + +A pure function that converts a relative href to an absolute URL needs to know +two things: + +1. **Where does the href start from?** (the source file's directory) +2. **What is the containment boundary?** (the docs root) + +A static method cannot have this knowledge. Therefore, it silently produced +wrong results for any file not at the docs root. + +### The Context-Free Anti-Pattern + +> **Definition:** A method that converts a relative path to an absolute URL +> without receiving information about the origin of that relative path. 
+ +Examples of the anti-pattern: + +```python +# ❌ ANTI-PATTERN: static method, no origin context +@staticmethod +def _to_canonical_url(href: str) -> str | None: + path = href.rstrip("/") + ... # what directory is href relative to? Unknown. + +# ❌ ANTI-PATTERN: module-level function with only the href +def resolve_href(href: str) -> str | None: + ... # same problem + +# ❌ ANTI-PATTERN: assuming href is relative to docs root +def check_vsm(self, file_path, text, vsm, anchors_cache): + # file_path is docs/a/b/page.md + # href is ../sibling.md + # result is /sibling/, but correct answer is /a/sibling/ + url = self._to_canonical_url(href) +``` + +The correct pattern: + +```python +# ✅ CORRECT: instance method with explicit context +def _to_canonical_url( + self, + href: str, + source_dir: Path | None = None, # where the href originates + docs_root: Path | None = None, # containment boundary +) -> str | None: + ... +``` + +### How to Pass Context to check_vsm + +The engine passes context when `run_vsm` is called by the coordinator: + +```python +# In scan_docs_references() or the plugin: +context = ResolutionContext( + docs_root=Path(config.docs_dir), + source_file=md_file, +) +rule_engine.run_vsm(md_file, text, vsm, anchors_cache, context=context) +``` + +Inside a rule that overrides `check_vsm`: + +```python +def check_vsm( + self, + file_path: Path, + text: str, + vsm: Mapping[str, Route], + anchors_cache: dict[Path, set[str]], + context: ResolutionContext | None = None, # ← always accept +) -> list[Violation]: + for url, lineno, raw_line in _extract_inline_links_with_lines(text): + target_url = self._to_canonical_url( + url, + source_dir=context.source_file.parent if context else None, + docs_root=context.docs_root if context else None, + ) +``` + +### Backwards Compatibility + +`context` defaults to `None` in both `BaseRule.check_vsm` and +`AdaptiveRuleEngine.run_vsm`. 
Existing rules that do not accept the parameter +will receive a `TypeError` wrapped in a `RULE-ENGINE-ERROR` finding — they +will not crash the scan, but they will not benefit from context-aware +resolution either. + +**Migration checklist for existing VSM-aware rules:** + +1. Add `context: "ResolutionContext | None" = None` to `check_vsm` signature. +2. Pass `source_dir` and `docs_root` from `context` to any url-resolving helper. +3. Add a test case with a `../../`-relative href from a nested file. + +--- + +## 5. Worked Examples + +### Example A: Simple relative href (context not needed) + +```text +Source: docs/guide.md +href: install.md +``` + +Step ① → `install` +Step ② → no `..`, skip context +Step ⑤ → `/install/` +VSM lookup → `vsm.get("/install/")` + +Context makes no difference here. The href is already root-relative-safe. + +--- + +### Example B: Single `..` from a subdirectory (context required) + +```text +Source: docs/api/reference.md +href: ../guide/index.md +``` + +**Without context (legacy behaviour):** + +Step ① → `../guide/index` +Step ⑤ → `/../guide` → parts `['..', 'guide']` → `/guide/` ← *wrong path arithmetic* + +**With `ResolutionContext(docs_root=/docs, source_file=/docs/api/reference.md)`:** + +Step ③ → `normpath("/docs/api" + "/../guide/index")` → `/docs/guide/index` +Step ④ → `/docs/guide/index` starts with `/docs/` ✅ +Step ⑤ → `relative_to(/docs)` → `guide/index` → strip `index` → `/guide/` +VSM lookup → `vsm.get("/guide/")` ✅ correct + +--- + +### Example C: Traversal escape (blocked at boundary) + +```text +Source: docs/api/reference.md +href: ../../../../etc/passwd +``` + +Step ③ → `normpath("/docs/api" + "/../../../../etc/passwd")` → `/etc/passwd` +Step ④ → `/etc/passwd` does **not** start with `/docs/` → return `None` +Caller receives `None` → `continue` → zero findings emitted ← correct + +--- + +### Example D: Ghost Route (reachable without a file) + +```text +href: /it/ +``` + +Step ① → path `/it`, not a relative href → external 
check skips it +(Ghost Routes appear in the VSM as `REACHABLE`; the rule validates the URL +string directly against the VSM after the href is converted — if the URL is +already canonical, no conversion is needed.) + +--- + +## 6. VSM-Aware Rule Contract + +Every rule that overrides `check_vsm` must satisfy this contract: + +| Requirement | Rationale | +|-------------|-----------| +| Accept `context: ResolutionContext \| None = None` | Backwards-compat + context forwarding | +| Do not call `Path.exists()` | VSM is the oracle, filesystem is not | +| Do not mutate `vsm` or `anchors_cache` | Shared across all rules; mutation causes race conditions in parallel mode | +| Return `Violation`, not `RuleFinding` | `run_vsm` converts via `v.as_finding()` | +| Handle `context=None` gracefully | Context may be absent in tests or old callers | + +--- + +## 7. Anti-Pattern Catalogue + +The following patterns are **banned** in `core/rules.py` and `core/validator.py`: + +| Pattern | Why banned | Alternative | +|---------|-----------|-------------| +| `@staticmethod def _to_canonical_url(href)` | Cannot receive origin context | Instance method with `source_dir`, `docs_root` | +| `Path.exists()` inside `check_vsm` | Violates Zero I/O contract | `vsm.get(url) is not None` | +| `Path.resolve()` inside a rule | Makes I/O call | `os.path.normpath()` (pure string math) | +| `open()` inside a rule | Violates Zero I/O contract | All content in `text` arg | +| `vsm[url]` (direct subscript) | Raises `KeyError` on missing URL | `vsm.get(url)` | + +--- + +## 8. 
Testing VSM-Aware Rules + +### Minimum test matrix + +Every `check_vsm` implementation must be tested with: + +| Case | Description | +|------|-------------| +| Root-level href | `guide.md` from `docs/index.md` | +| Single `..` with context | `../sibling.md` from `docs/subdir/page.md` | +| Multi-level `..` with context | `../../c/t.md` from `docs/a/b/page.md` | +| Traversal escape | `../../../../etc/passwd` from `docs/api/ref.md` | +| Absent from VSM | link to a URL not in the VSM → Z001 | +| `ORPHAN_BUT_EXISTING` | link to an orphan route → Z002 | +| `context=None` | all assertions must pass with no context | + +### Test fixture pattern + +```python +def _make_vsm(*urls: str, status: str = "REACHABLE") -> dict[str, Route]: + return { + url: Route(url=url, source=f"{url.strip('/')}.md", status=status) + for url in urls + } + +# Context for a file nested two levels deep +ctx = ResolutionContext( + docs_root=Path("/docs"), + source_file=Path("/docs/api/v2/reference.md"), +) + +violations = rule.check_vsm( + Path("/docs/api/v2/reference.md"), + "[Guide](../../guide/index.md)", + _make_vsm("/guide/"), + {}, + ctx, +) +assert violations == [] +``` + +--- + +*Document status: current as of v0.5.0a4. Update when `ResolutionContext` gains +new fields or the boundary-check logic changes.* diff --git a/docs/internal/security/shattered_mirror_report.md b/docs/internal/security/shattered_mirror_report.md new file mode 100644 index 0000000..bdafc6f --- /dev/null +++ b/docs/internal/security/shattered_mirror_report.md @@ -0,0 +1,417 @@ + + + +# Security Analysis: Vulnerabilities in v0.5.0a3 + +--- + +> *"Ciò che non è documentato, non esiste; ciò che è documentato male, è +> un'imboscata."* +> +> This document records the root causes and architectural reasoning behind +> each vulnerability — to prevent regression and to inform future contributors. + +--- + +## 1. 
Executive Summary + +During the alpha phase of v0.5.0a3, an internal security analysis identified **four +confirmed vulnerabilities** spanning the three pillars +of Zenzic's security model: the Shield (secret detection), the Virtual Site Map +(routing validation), and the Adaptive Parallelism engine. + +All four were resolved in v0.5.0a4. This document records the root causes, +attack mechanics, and architectural reasoning behind each fix — both to prevent +regression and to explain to future contributors *why* the code is shaped the +way it is. + +--- + +## 2. The Sentinel's Threat Model + +Before examining each finding, it helps to understand what the Sentinel +promises and what it does not. + +| Promise | Mechanism | +|---------|-----------| +| No secret commits | Shield scans every byte before processing | +| No broken links | VSM validates links against routing state, not the filesystem | +| No deadlocked CI | Worker timeout + canary reject catastrophic patterns | +| No false navigation | VSM resolves links from source-file context | + +The analysis found that three of these four promises had structural gaps — not +logic bugs, but **architectural blind spots** where the component was correctly +designed for its *stated input* but had never considered a class of inputs that +was technically valid. + +--- + +## 3. Findings + +### ZRT-001 — CRITICAL: Shield Blind to YAML Frontmatter + +#### What Happened + +`ReferenceScanner.harvest()` runs two passes over each file: + +1. **Pass 1 (Shield):** scan lines for secret patterns. +2. **Pass 1b (Content):** harvest reference definitions and alt-text. + +Both passes needed to skip YAML frontmatter (`---` blocks) — but for *different +and opposite reasons*: + +- The **Content pass** must skip frontmatter because `author: Jane Doe` would + otherwise be parsed as a broken reference definition. +- The **Shield pass** must **not** skip frontmatter because `aws_key: AKIA…` + is a real secret that must be caught. 
+ +The original implementation shared a single generator, `_skip_frontmatter()`, +for both passes. This was correct for the Content stream and catastrophically +wrong for the Shield stream. + +#### Attack Path + +```markdown +--- +description: API Guide +aws_key: AKIA[20-char-key-redacted] ← invisible to Shield +stripe_key: sk_live_[24-char-key-redacted] ← invisible to Shield +--- + +# API Guide + +Normal content here. +``` + +```bash +zenzic check all # Exit 0 — PASS ← Zero findings reported (pre-fix) +git commit -am "add api credentials" # Key committed, CI green — breach +``` + +#### Root Cause Diagram + +```text + ┌─────────────────────────────────┐ + │ harvest() │ + │ │ +File on disk ──►│ _skip_frontmatter(fh) │──► Shield stream + │ ↑ │ + │ skips lines 1–N │ (BLIND SPOT) + │ of the --- block │ + │ │ + │ _iter_content_lines(file) │──► Content stream + └─────────────────────────────────┘ +``` + +#### The Fix: Dual-Stream Architecture + +The two streams now use **different generators** with **different filtering +contracts**: + +```text + ┌─────────────────────────────────┐ + │ harvest() │ + │ │ +File on disk ──►│ enumerate(fh, start=1) │──► Shield stream + │ ↑ │ (ALL lines) + │ no filtering │ + │ │ + │ _iter_content_lines(file) │──► Content stream + │ ↑ │ (frontmatter + + │ skips frontmatter │ fences skipped) + │ skips fenced blocks │ + └─────────────────────────────────┘ +``` + +The Shield now sees every byte of the file. The Content stream continues to +skip frontmatter to avoid false-positive reference findings. + +**Why this is structurally sound:** The Shield and the Content harvester have +orthogonal filtering requirements. They must never share a generator. + +--- + +### ZRT-002 — HIGH: ReDoS + ProcessPoolExecutor Deadlock + +#### What Happened + +The `AdaptiveRuleEngine` validates rules for pickle-serializability at +construction time (`_assert_pickleable()`). This is correct — it ensures every +rule can be dispatched to a worker process. 
However, `pickle.dumps()` is +blind to computational complexity. A pattern like `^(a+)+$` pickles cleanly +and dispatches successfully, then hangs indefinitely inside the worker when +applied to a string like `"a" * 30 + "b"`. + +`ProcessPoolExecutor` in its original form used `executor.map()`, which has no +timeout. The result: one evil `[[custom_rules]]` entry in `zenzic.toml` could +permanently block every CI pipeline that ran on a repository with ≥ 50 files. + +#### The Complexity of Catastrophic Backtracking + +The pattern `^(a+)+$` contains a **nested quantifier** — `+` inside `+`. When +applied to `"aaa…aab"` (the ReDoS trigger), the regex engine must explore an +exponential number of paths through the string before determining it does not +match. At n=30 characters, this takes minutes. At n=50, hours. + +The key insight is that `re.compile()` does **not** validate for ReDoS. +Compilation is O(1). The catastrophic cost only materialises at `match()`/`search()` +time on crafted input. + +#### Attack Path + +```toml +# zenzic.toml +[[custom_rules]] +id = "STYLE-001" +pattern = "^(a+)+$" # ← catastrophic backtracking +message = "Style check" +severity = "error" +``` + +```markdown + +aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab ← ReDoS trigger string +``` + +```bash +zenzic check all --workers 4 # All 4 workers hang. CI never exits. +``` + +#### Two Complementary Defences + +**Prevention — `_assert_regex_canary()` (construction time):** + +```text +AdaptiveRuleEngine.__init__(): + for rule in rules: + _assert_pickleable(rule) ← existing check + _assert_regex_canary(rule) ← NEW: 100ms SIGALRM stress test +``` + +The canary runs each `CustomRule` pattern against three stress strings under a +`signal.SIGALRM` watchdog of 100 ms. If the pattern takes longer than 100 ms +on a 30-character input, it is categorically catastrophic and raises +`PluginContractError` *before the first file is scanned*. 
+
+**Containment — `future.result(timeout=30)` (runtime):**
+
+```text
+# Before
+raw = list(executor.map(_worker, work_items))  # hangs forever
+
+# After
+futures_map = {executor.submit(_worker, item): item[0] for item in work_items}
+for fut, md_file in futures_map.items():
+    try:
+        raw.append(fut.result(timeout=30))
+    except concurrent.futures.TimeoutError:
+        raw.append(_make_timeout_report(md_file))  # Z009 finding, never crash
+```
+
+A worker that exceeds 30 seconds produces a `Z009: ANALYSIS_TIMEOUT` finding
+instead of hanging the coordinator.
+
+**Why both defences are necessary:** The canary is platform-dependent
+(`SIGALRM` is POSIX-only; it is a no-op on Windows). The timeout is the
+universal backstop.
+
+---
+
+### ZRT-003 — MEDIUM: Split-Token Shield Bypass via Table Obfuscation
+
+#### What Happened
+
+The Shield's `scan_line_for_secrets()` applied regex patterns to each raw line.
+The AWS key pattern `AKIA[0-9A-Z]{16}` requires 20 **contiguous** characters.
+An author (malicious or careless) who documents credentials in a table column
+using inline code notation and concatenation operators breaks the contiguity:
+
+```markdown
+| Key ID | `AKIA` + `[16-char-suffix]` |
+```
+
+The raw line fed to the regex is (rendered in source as split tokens):
+
+```text
+| Key ID | `AKIA` + `[16-char-suffix]` |
+```
+
+The backticks and the ` + ` operator split the `AKIA` prefix from its
+16-character suffix, so the line never contains 20 contiguous matching
+characters. The pattern never matches. The Shield reports zero findings.
+
+#### The Fix: Pre-Scan Normalizer
+
+`_normalize_line_for_shield()` applies three transformations before the regex
+patterns run:
+
+1. **Unwrap backtick spans:** `` `AKIA` `` → `AKIA`
+2. **Remove concatenation operators:** `` ` ` + ` ` `` → nothing
+3. **Collapse table pipes:** `|` → a single space
+
+The normalised form of the attack line is `Key ID AKIA[16-char-suffix]`,
+which matches `AKIA[0-9A-Z]{16}` cleanly.
+
+**Both** the raw and normalised forms are scanned.
A `seen` set prevents +duplicate findings when a secret appears non-obfuscated *and* the normalised +form also matches. + +--- + +### ZRT-004 — MEDIUM: VSMBrokenLinkRule Context-Free URL Resolution + +#### What Happened + +`VSMBrokenLinkRule._to_canonical_url()` was a `@staticmethod`. It converted +hrefs to canonical VSM URLs using a root-relative algorithm: strip `.md`, +drop `index`, prepend `/`, append `/`. This is correct for files in the docs +root but produces the wrong result for files in subdirectories when the href +contains `..` segments. + +#### Example of the Bug + +```text +Source file: docs/a/b/page.md +Link: [See this](../../c/target.md) + +Expected URL: /c/target/ ← what the browser would navigate to +Computed URL: /c/target/ ← accidentally correct in this 2-level case + +Source file: docs/a/b/page.md +Link: [See this](../sibling.md) + +Expected URL: /a/sibling/ ← file is docs/a/sibling.md +Computed URL: /sibling/ ← WRONG: resolved from root, not from source dir +``` + +The `InMemoryPathResolver` (used by `validate_links_async`) resolved links +correctly because it had `source_file` context from the beginning. The +`VSMBrokenLinkRule` did not, creating a silent discrepancy between two +validation surfaces. + +#### The Fix: ResolutionContext + +```python +@dataclass(slots=True) +class ResolutionContext: + docs_root: Path + source_file: Path +``` + +`BaseRule.check_vsm()` and `AdaptiveRuleEngine.run_vsm()` now accept +`context: ResolutionContext | None = None`. When context is provided, +`_to_canonical_url()` resolves `..` segments using `os.path.normpath` +relative to `context.source_file.parent`, then maps the absolute resolved path +back to a docs-relative URL. + +The method also enforces the Shield boundary: if the resolved path escapes +`docs_root`, it returns `None` (equivalent to a `PathTraversal` outcome in +`InMemoryPathResolver`). 
+ +**The Architectural Lesson:** Any method that converts a relative href to an +absolute URL *must* know where that href came from. A `@staticmethod` that +receives only the href string is structurally incapable of handling relative +paths correctly. In Zenzic, this is now called the **Context-Free Anti-Pattern** +(see `docs/arch/vsm_engine.md` for the full protocol). + +--- + +## 4. The Stream Multiplexing Architecture + +Post-remediation, `ReferenceScanner.harvest()` implements a clean two-stream +model. This section documents it for future contributors. + +```text +┌─────────────────────────────────────────────────────────────────┐ +│ ReferenceScanner.harvest() │ +│ │ +│ ┌─────────────────────────────────────────────────────────┐ │ +│ │ SHIELD STREAM │ │ +│ │ Source: enumerate(file_handle, start=1) │ │ +│ │ Filter: NONE — every line including frontmatter │ │ +│ │ Transforms: │ │ +│ │ 1. _normalize_line_for_shield(line) [ZRT-003] │ │ +│ │ 2. scan_line_for_secrets(raw) │ │ +│ │ 3. scan_line_for_secrets(normalized) │ │ +│ │ Output: ("SECRET", SecurityFinding) events │ │ +│ └─────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────┐ │ +│ │ CONTENT STREAM │ │ +│ │ Source: _iter_content_lines(file_path) │ │ +│ │ Filter: skips YAML frontmatter, skips fenced blocks │ │ +│ │ Transforms: │ │ +│ │ 1. Parse reference definitions (_RE_REF_DEF) │ │ +│ │ 2. Scan ref-def URLs for secrets (scan_url_for_sec…) │ │ +│ │ 3. Parse inline images (_RE_IMAGE_INLINE) │ │ +│ │ Output: ("DEF", "IMG", "MISSING_ALT", …) events │ │ +│ └─────────────────────────────────────────────────────────┘ │ +│ │ +│ Final output: events merged and sorted by line number │ +└─────────────────────────────────────────────────────────────────┘ +``` + +**Invariant:** The Shield stream and the Content stream must *never share a +generator*. Any future refactoring that merges them reintroduces ZRT-001. + +--- + +## 5. 
What Made These Vulnerabilities Possible + +All four findings share a common root: **implicit contracts at subsystem +boundaries**. + +| Finding | Implicit contract violated | +|---------|--------------------------| +| ZRT-001 | "The Shield sees all lines" — violated by shared generator | +| ZRT-002 | "Pickle-safe means execution-safe" — violated by ReDoS blindness | +| ZRT-003 | "One line = one token" — violated by Markdown syntax fragmentation | +| ZRT-004 | "URL resolution is context-free" — violated by relative paths | + +The fix in each case is the same pattern: **make the contract explicit in the +type system or function signature**, and **test it directly**. + +--- + +## 6. Regression Prevention + +The following tests in `tests/test_redteam_remediation.py` serve as permanent +regression guards. They must never be deleted or weakened: + +| Test class | What it guards | +|------------|---------------| +| `TestShieldFrontmatterCoverage` | ZRT-001 — frontmatter scanning | +| `TestReDoSCanary` | ZRT-002 — canary rejection at construction | +| `TestShieldNormalizer` | ZRT-003 — split-token reconstruction | +| `TestVSMContextAwareResolution` | ZRT-004 — context-aware URL resolution | +| `TestShieldReportingIntegrity` | Z-SEC-002 — breach severity, secret masking, bridge fidelity | + +If a future refactoring causes any of these tests to fail, the PR **must not +be merged** until either the test is proven incorrect (and the regression guard +is replaced with an equivalent) or the fix is reverted. + +--- + +## 7. Lessons Learned + +For v0.5.0rc1 and beyond: + +1. **Every new subsystem boundary must document its filtering contract.** + A generator that skips lines must have a JSDoc-style note explaining + *what* it skips and *why* the caller is permitted to use it. + +2. **`@staticmethod` methods that handle paths are suspect by default.** + If a static method takes a path string, ask: does it need to know where + that path came from? 
If yes, it is not a static method — it is a missing + context argument. + +3. **User-supplied regex patterns are untrusted inputs.** Always run the + canary. The 100 ms budget is not a performance requirement — it is a + security boundary. + +4. **The parallelism layer must always have a timeout.** A coordinator that + waits indefinitely on workers is a single point of failure for the entire + CI pipeline. + +--- + +*This document is current as of v0.5.0a4.* diff --git a/docs/it/arch/vsm_engine.md b/docs/it/arch/vsm_engine.md new file mode 100644 index 0000000..14e99dd --- /dev/null +++ b/docs/it/arch/vsm_engine.md @@ -0,0 +1,417 @@ +--- +icon: lucide/map +--- + + + + +# Motore VSM — Architettura e Protocollo di Risoluzione + +> *"La VSM non sa dove si trova un file. Sa dove un file andrà."* + +Questo documento descrive il motore della Virtual Site Map (VSM), l'oggetto +`ResolutionContext` introdotto in v0.5.0a4 e l'**Anti-Pattern Context-Free** che +ha originato ZRT-004. Qualsiasi sviluppatore che scriva o revisioni regole +VSM-aware deve leggere questa pagina prima di aprire una PR. + +--- + +## 1. Cos'è la VSM (e cosa non è) + +La Virtual Site Map (VSM) è una proiezione puramente in-memory di ciò che il +motore di build servirà: + +```python +VSM = dict[str, Route] # URL canonico → Route +``` + +Una `Route` contiene: + +| Campo | Tipo | Significato | +|-------|------|-------------| +| `url` | `str` | URL canonico, es. `/guida/installazione/` | +| `source` | `str` | Percorso sorgente relativo, es. `guida/installazione.md` | +| `status` | `str` | `REACHABLE` / `ORPHAN_BUT_EXISTING` / `IGNORED` / `CONFLICT` | +| `anchors` | `frozenset[str]` | Slug degli heading pre-calcolati dal sorgente | + +La VSM **non** è una vista del filesystem. `Route.url` è l'indirizzo che un +browser richiederebbe, non quello che un `open()` del filesystem accetterebbe. +Un file può esistere su disco (`Path.exists() == True`) ed essere `IGNORED` +nella VSM. 
Un URL può essere `REACHABLE` nella VSM senza avere un file su disco +(Ghost Route). + +**Corollario:** Qualsiasi codice che valida i link chiamando `Path.exists()` +all'interno di una regola è sbagliato per definizione. La VSM è l'oracolo; il +filesystem non lo è. + +--- + +## 2. Riferimento agli Stati di Route + +```mermaid +flowchart TD + classDef ok fill:#052e16,stroke:#16a34a,stroke-width:2px,color:#d1fae5 + classDef warn fill:#3b1d00,stroke:#d97706,stroke-width:2px,color:#fef3c7 + classDef err fill:#1c0a0a,stroke:#dc2626,stroke-width:2px,color:#fee2e2 + classDef info fill:#0f172a,stroke:#38bdf8,stroke-width:2px,color:#e2e8f0 + + R["REACHABLE"]:::ok + O["ORPHAN_BUT_EXISTING"]:::warn + I["IGNORED"]:::info + C["CONFLICT"]:::err + + R -- "in nav: O Ghost Route" --- R + O -- "su disco, assente da nav:" --- O + I -- "README.md, _private/" --- I + C -- "due .md → stesso URL" --- C +``` + +| Stato | Impostato da | Link a questo stato | +|-------|-------------|---------------------| +| `REACHABLE` | voce nav, Ghost Route, shadow locale | ✅ Valido | +| `ORPHAN_BUT_EXISTING` | file su disco, assente da nav | ⚠️ warning Z002 | +| `IGNORED` | README non in nav, pattern esclusi | ❌ errore Z001 | +| `CONFLICT` | due sorgenti → stesso URL canonico | ❌ errore Z001 | + +--- + +## 3. Risoluzione URL: La Pipeline + +Convertire un href Markdown grezzo (`../guida/installazione.md`) in un URL +canonico (`/guida/installazione/`) richiede tre trasformazioni applicate in +sequenza: + +```mermaid +flowchart LR + classDef step fill:#0f172a,stroke:#6366f1,stroke-width:2px,color:#e2e8f0 + classDef gate fill:#0f172a,stroke:#f59e0b,stroke-width:2px,color:#e2e8f0,shape:diamond + classDef out fill:#052e16,stroke:#16a34a,stroke-width:2px,color:#d1fae5 + classDef bad fill:#1c0a0a,stroke:#dc2626,stroke-width:2px,color:#fee2e2 + + A["href grezzo\n'../guida/installazione.md'"] + B["① Normalizza\nurlsplit + unquote\nbackslash → /"]:::step + C{"② Controllo contesto\nha .. 
E context?"}:::gate + D["③ os.path.normpath\nrelativo a source_dir"]:::step + E{"④ Controllo confine\nrimane in docs_root?"}:::gate + F["⑤ Clean-URL transform\n rimuovi .md / index\n prependi /, aggiungi /"]:::step + G["/guida/installazione/"]:::out + H["None\n(salta, non segnalare)"]:::bad + + A --> B --> C + C -->|"sì"| D --> E + C -->|"no (root-relativo)"| F + E -->|"sì"| F --> G + E -->|"no (esce da root)"| H +``` + +### Passo ①: Normalizza + +Rimuovi query string e artefatti di percent-encoding: + +```python +parsed = urlsplit(href) +path = unquote(parsed.path.replace("\\", "/")).rstrip("/") +``` + +### Passo ②–③: Risoluzione Relativa Context-Aware (v0.5.0a4+) + +Se l'href contiene segmenti `..` **e** viene fornito un `ResolutionContext`, +il percorso viene risolto relativo alla directory del file sorgente: + +```python +if source_dir is not None and docs_root is not None and ".." in path: + raw_target = os.path.normpath(str(source_dir) + os.sep + path.replace("/", os.sep)) +``` + +Senza contesto (percorso retrocompatibile), i segmenti `..` vengono portati +avanti così come sono. Questo è corretto per href che non attraversano verso +l'alto, ma sbagliato per file profondamente annidati (vedi ZRT-004 di seguito). + +### Passo ④: Controllo del Confine + +```python +def _to_canonical_url(href: str, source_dir=None, docs_root=None): + ... + root_str = str(docs_root) + if not (raw_target == root_str or raw_target.startswith(root_str + os.sep)): + return None # il percorso esce da docs_root — confine Shield +``` + +Un percorso che esce da `docs_root` non è un link rotto — è un potenziale +attacco di path traversal. Restituisce `None`, ignorato silenziosamente dal +chiamante. Nessun finding Z001. Nessuna eccezione. + +### Passo ⑤: Clean-URL Transform + +```python +def _to_canonical_url(href: str, source_dir=None, docs_root=None): + ... 
+ if path.endswith(".md"): + path = path[:-3] + + parts = [p for p in path.split("/") if p] + if parts and parts[-1] == "index": + parts = parts[:-1] + + return "/" + "/".join(parts) + "/" +``` + +--- + +## 4. ResolutionContext — Il Protocollo di Contesto + +### Definizione + +```python +@dataclass(slots=True) +class ResolutionContext: + """Contesto del file sorgente per le regole VSM-aware. + + Attributes: + docs_root: Percorso assoluto alla directory docs/. + source_file: Percorso assoluto del file Markdown in esame. + """ + docs_root: Path + source_file: Path +``` + +### Perché Esiste + +Prima di v0.5.0a4, `VSMBrokenLinkRule._to_canonical_url()` era un +`@staticmethod`: riceveva solo `href: str`. Questo è l'**Anti-Pattern +Context-Free**. + +Una funzione pura che converte un href relativo in un URL assoluto ha bisogno +di sapere due cose: + +1. **Da dove parte l'href?** (la directory del file sorgente) +2. **Qual è il confine di contenimento?** (la docs root) + +Un metodo statico non può avere questa conoscenza. Produceva quindi risultati +errati in modo silenzioso per qualsiasi file non alla radice dei docs. + +### L'Anti-Pattern Context-Free + +> **Definizione:** Un metodo che converte un percorso relativo in URL assoluto +> senza ricevere informazioni sull'origine di quel percorso relativo. + +Esempi dell'anti-pattern: + +```python +# ❌ ANTI-PATTERN: metodo statico, nessun contesto di origine +@staticmethod +def _to_canonical_url(href: str) -> str | None: + path = href.rstrip("/") + ... # rispetto a quale directory è relativo href? Sconosciuto. + +# ❌ ANTI-PATTERN: funzione a livello di modulo con solo l'href +def resolve_href(href: str) -> str | None: + ... 
# stesso problema + +# ❌ ANTI-PATTERN: assumere che href sia relativo alla docs root +def check_vsm(self, file_path, text, vsm, anchors_cache): + # file_path è docs/a/b/pagina.md + # href è ../fratello.md + # risultato è /fratello/, ma la risposta corretta è /a/fratello/ + url = self._to_canonical_url(href) +``` + +Il pattern corretto: + +```python +# ✅ CORRETTO: metodo di istanza con contesto esplicito +def _to_canonical_url( + self, + href: str, + source_dir: Path | None = None, # da dove parte l'href + docs_root: Path | None = None, # confine di contenimento +) -> str | None: + ... +``` + +### Come Passare il Contesto a check_vsm + +Il motore passa il contesto quando `run_vsm` viene chiamato dal coordinatore: + +```python +# In scan_docs_references() o nel plugin: +context = ResolutionContext( + docs_root=Path(config.docs_dir), + source_file=md_file, +) +rule_engine.run_vsm(md_file, text, vsm, anchors_cache, context=context) +``` + +All'interno di una regola che sovrascrive `check_vsm`: + +```python +def check_vsm( + self, + file_path: Path, + text: str, + vsm: Mapping[str, Route], + anchors_cache: dict[Path, set[str]], + context: ResolutionContext | None = None, # ← accetta sempre +) -> list[Violation]: + for url, lineno, raw_line in _extract_inline_links_with_lines(text): + target_url = self._to_canonical_url( + url, + source_dir=context.source_file.parent if context else None, + docs_root=context.docs_root if context else None, + ) +``` + +### Retrocompatibilità + +`context` ha default `None` sia in `BaseRule.check_vsm` che in +`AdaptiveRuleEngine.run_vsm`. Le regole esistenti che non accettano il +parametro riceveranno un `TypeError` incapsulato in un finding +`RULE-ENGINE-ERROR` — non crasheranno la scansione, ma non beneficeranno +della risoluzione contestuale. + +**Checklist di migrazione per le regole VSM-aware esistenti:** + +1. Aggiungi `context: "ResolutionContext | None" = None` alla firma di `check_vsm`. +2. 
Passa `source_dir` e `docs_root` da `context` a qualsiasi helper di risoluzione URL. +3. Aggiungi un caso di test con un href `../../`-relativo da un file annidato. + +--- + +## 5. Esempi Pratici + +### Esempio A: href relativo semplice (contesto non necessario) + +```text +Sorgente: docs/guida.md +href: installazione.md +``` + +Passo ① → `installazione` +Passo ② → nessun `..`, salta contesto +Passo ⑤ → `/installazione/` +Lookup VSM → `vsm.get("/installazione/")` + +Il contesto non fa differenza qui. L'href è già sicuro rispetto alla root. + +--- + +### Esempio B: `..` singolo da una sottodirectory (contesto richiesto) + +```text +Sorgente: docs/api/riferimento.md +href: ../guida/index.md +``` + +**Senza contesto (comportamento legacy):** + +Passo ① → `../guida/index` +Passo ⑤ → `/../guida` → parti `['..', 'guida']` → `/guida/` ← *aritmetica sbagliata* + +**Con `ResolutionContext(docs_root=/docs, source_file=/docs/api/riferimento.md)`:** + +Passo ③ → `normpath("/docs/api" + "/../guida/index")` → `/docs/guida/index` +Passo ④ → `/docs/guida/index` inizia con `/docs/` ✅ +Passo ⑤ → `relative_to(/docs)` → `guida/index` → rimuovi `index` → `/guida/` +Lookup VSM → `vsm.get("/guida/")` ✅ corretto + +--- + +### Esempio C: Escape di traversal (bloccato al confine) + +```text +Sorgente: docs/api/riferimento.md +href: ../../../../etc/passwd +``` + +Passo ③ → `normpath("/docs/api" + "/../../../../etc/passwd")` → `/etc/passwd` +Passo ④ → `/etc/passwd` **non** inizia con `/docs/` → restituisce `None` +Il chiamante riceve `None` → `continue` → zero finding emessi ← corretto + +--- + +### Esempio D: Ghost Route (raggiungibile senza file) + +```text +href: /it/ +``` + +Passo ① → percorso `/it`, non href relativo → il controllo esterno lo salta +(Le Ghost Route appaiono nella VSM come `REACHABLE`; la regola valida la +stringa URL direttamente contro la VSM — se l'URL è già canonico, nessuna +conversione è necessaria.) + +--- + +## 6. 
Contratto delle Regole VSM-Aware + +Ogni regola che sovrascrive `check_vsm` deve soddisfare questo contratto: + +| Requisito | Razionale | +|-----------|-----------| +| Accettare `context: ResolutionContext \| None = None` | Retrocompat + forwarding del contesto | +| Non chiamare `Path.exists()` | La VSM è l'oracolo, non il filesystem | +| Non mutare `vsm` o `anchors_cache` | Condivisi tra le regole; la mutazione causa race condition in modalità parallela | +| Restituire `Violation`, non `RuleFinding` | `run_vsm` converte tramite `v.as_finding()` | +| Gestire `context=None` con eleganza | Il contesto può essere assente nei test o nei chiamanti vecchi | + +--- + +## 7. Catalogo degli Anti-Pattern + +I seguenti pattern sono **vietati** in `core/rules.py` e `core/validator.py`: + +| Pattern | Perché vietato | Alternativa | +|---------|---------------|-------------| +| `@staticmethod def _to_canonical_url(href)` | Non può ricevere contesto di origine | Metodo di istanza con `source_dir`, `docs_root` | +| `Path.exists()` dentro `check_vsm` | Viola il contratto Zero I/O | `vsm.get(url) is not None` | +| `Path.resolve()` dentro una regola | Esegue I/O | `os.path.normpath()` (pura aritmetica di stringhe) | +| `open()` dentro una regola | Viola il contratto Zero I/O | Tutto il contenuto nell'argomento `text` | +| `vsm[url]` (subscript diretto) | Solleva `KeyError` su URL mancante | `vsm.get(url)` | + +--- + +## 8. 
Testing delle Regole VSM-Aware + +### Matrice di test minima + +Ogni implementazione di `check_vsm` deve essere testata con: + +| Caso | Descrizione | +|------|-------------| +| href a livello root | `guida.md` da `docs/index.md` | +| `..` singolo con contesto | `../fratello.md` da `docs/sottocartella/pagina.md` | +| `..` multi-livello con contesto | `../../c/t.md` da `docs/a/b/pagina.md` | +| Escape di traversal | `../../../../etc/passwd` da `docs/api/ref.md` | +| Assente dalla VSM | link a URL non in VSM → Z001 | +| `ORPHAN_BUT_EXISTING` | link a una route orfana → Z002 | +| `context=None` | tutte le asserzioni devono passare senza contesto | + +### Pattern fixture di test + +```python +def _make_vsm(*urls: str, status: str = "REACHABLE") -> dict[str, Route]: + return { + url: Route(url=url, source=f"{url.strip('/')}.md", status=status) + for url in urls + } + +# Contesto per un file annidato due livelli in profondità +ctx = ResolutionContext( + docs_root=Path("/docs"), + source_file=Path("/docs/api/v2/riferimento.md"), +) + +violations = rule.check_vsm( + Path("/docs/api/v2/riferimento.md"), + "[Guida](../../guida/index.md)", + _make_vsm("/guida/"), + {}, + ctx, +) +assert violations == [] +``` + +--- + +*Stato documento: aggiornato a v0.5.0a4. Aggiornare quando `ResolutionContext` +acquisisce nuovi campi o la logica di controllo del confine cambia.* diff --git a/docs/it/internal/security/shattered_mirror_report.md b/docs/it/internal/security/shattered_mirror_report.md new file mode 100644 index 0000000..1559a7f --- /dev/null +++ b/docs/it/internal/security/shattered_mirror_report.md @@ -0,0 +1,422 @@ + + + +# Analisi di Sicurezza: Vulnerabilità in v0.5.0a3 + +--- + +> *"Ciò che non è documentato, non esiste; ciò che è documentato male, è +> un'imboscata."* +> +> Questo documento registra le cause radice e il ragionamento architetturale +> dietro ogni vulnerabilità — per prevenire regressioni e informare i futuri contributori. + +--- + +## 1. 
Sommario Esecutivo + +Durante la fase alpha di v0.5.0a3, un'analisi di sicurezza interna ha identificato **quattro +vulnerabilità confermate** che attraversano +i tre pilastri del modello di sicurezza di Zenzic: lo Shield (rilevamento +segreti), la Virtual Site Map (validazione routing) e il motore di Parallelismo +Adattivo. + +Tutte e quattro sono state risolte in v0.5.0a4. Questo documento registra le +cause radice, le meccaniche di attacco e il ragionamento architetturale dietro +ogni fix — sia per prevenire regressioni che per spiegare ai futuri +contributori *perché* il codice ha questa forma. + +--- + +## 2. Il Modello di Minaccia della Sentinella + +Prima di esaminare ogni finding, è utile capire cosa promette la Sentinella +e cosa non promette. + +| Promessa | Meccanismo | +|----------|-----------| +| Nessun commit di segreti | Lo Shield scansiona ogni byte prima dell'elaborazione | +| Nessun link rotto | La VSM valida i link rispetto allo stato di routing, non al filesystem | +| Nessun CI in deadlock | Timeout worker + canary rigettano i pattern catastrofici | +| Nessuna navigazione falsa | La VSM risolve i link dal contesto del file sorgente | + +L'analisi ha rilevato che tre di queste quattro promesse avevano lacune +strutturali — non bug logici, ma **punti ciechi architetturali** dove il +componente era progettato correttamente per il suo *input dichiarato* ma non +aveva mai considerato una classe di input tecnicamente validi. + +--- + +## 3. Finding + +### ZRT-001 — CRITICO: Shield Cieco al Frontmatter YAML + +#### Cosa è Successo + +`ReferenceScanner.harvest()` esegue due passate su ogni file: + +1. **Passata 1 (Shield):** scansione delle righe per pattern di segreti. +2. **Passata 1b (Contenuto):** raccolta di definizioni di riferimento e alt-text. 
+ +Entrambe le passate dovevano saltare il frontmatter YAML (blocchi `---`) — ma +per ragioni *diverse e opposte*: + +- La **passata Contenuto** deve saltare il frontmatter perché `author: Jane Doe` + verrebbe altrimenti analizzato come una definizione di riferimento rotta. +- La **passata Shield** deve **non** saltare il frontmatter perché `aws_key: AKIA…` + è un vero segreto che deve essere catturato. + +L'implementazione originale condivideva un unico generatore, `_skip_frontmatter()`, +per entrambe le passate. Questo era corretto per lo stream Contenuto e +catastroficamente sbagliato per lo stream Shield. + +#### Percorso di Attacco + +```markdown +--- +description: Guida API +aws_key: AKIA[chiave-20-char-redatta] ← invisibile allo Shield +stripe_key: sk_live_[chiave-24-char-redatta] ← invisibile allo Shield +--- + +# Guida API + +Contenuto normale qui. +``` + +```bash +zenzic check all # Exit 0 — PASS ← Zero finding segnalati (pre-fix) +git commit -am "aggiunta credenziali api" # Chiave committata, CI verde — violazione +``` + +#### Diagramma della Causa Radice + +```text + ┌─────────────────────────────────┐ + │ harvest() │ + │ │ +File su disco ──►│ _skip_frontmatter(fh) │──► Stream Shield + │ ↑ │ + │ salta righe 1–N │ (PUNTO CIECO) + │ del blocco --- │ + │ │ + │ _iter_content_lines(file) │──► Stream Contenuto + └─────────────────────────────────┘ +``` + +#### Il Fix: Architettura Dual-Stream + +I due stream usano ora **generatori diversi** con **contratti di filtraggio +diversi**: + +```text + ┌─────────────────────────────────┐ + │ harvest() │ + │ │ +File su disco ──►│ enumerate(fh, start=1) │──► Stream Shield + │ ↑ │ (TUTTE le righe) + │ nessun filtraggio │ + │ │ + │ _iter_content_lines(file) │──► Stream Contenuto + │ ↑ │ (frontmatter + + │ salta frontmatter │ fence saltati) + │ salta blocchi fence │ + └─────────────────────────────────┘ +``` + +Lo Shield vede ora ogni byte del file. 
Lo stream Contenuto continua a saltare +il frontmatter per evitare finding di riferimento falsi positivi. + +**Perché questo è strutturalmente solido:** Lo Shield e il raccoglitore di +Contenuto hanno requisiti di filtraggio ortogonali. Non devono mai condividere +un generatore. + +--- + +### ZRT-002 — ALTO: ReDoS + Deadlock di ProcessPoolExecutor + +#### Cosa è Successo + +L'`AdaptiveRuleEngine` valida le regole per la serializzabilità pickle alla +costruzione (`_assert_pickleable()`). Questo è corretto — garantisce che ogni +regola possa essere spedita a un processo worker. Tuttavia, `pickle.dumps()` è +cieco alla complessità computazionale. Un pattern come `^(a+)+$` serializza +correttamente e viene spedito con successo, poi si blocca indefinitamente +all'interno del worker quando applicato a una stringa come `"a" * 30 + "b"`. + +`ProcessPoolExecutor` nella forma originale usava `executor.map()`, che non ha +timeout. Il risultato: una singola voce `[[custom_rules]]` malevola in +`zenzic.toml` poteva bloccare permanentemente ogni pipeline CI su un repository +con ≥ 50 file. + +#### La Complessità del Backtracking Catastrofico + +Il pattern `^(a+)+$` contiene un **quantificatore annidato** — `+` dentro `+`. +Quando applicato a `"aaa…aab"` (il trigger ReDoS), il motore regex deve +esplorare un numero esponenziale di percorsi nella stringa prima di determinare +che non corrisponde. A n=30 caratteri, questo richiede minuti. A n=50, ore. + +L'intuizione chiave è che `re.compile()` **non** valida per ReDoS. La +compilazione è O(1). Il costo catastrofico si manifesta solo al momento di +`match()`/`search()` su input artigianali. 
+ +#### Percorso di Attacco + +```toml +# zenzic.toml +[[custom_rules]] +id = "STILE-001" +pattern = "^(a+)+$" # ← backtracking catastrofico +message = "Controllo stile" +severity = "error" +``` + +```markdown + +aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab ← stringa trigger ReDoS +``` + +```bash +zenzic check all --workers 4 # Tutti i 4 worker si bloccano. La CI non finisce mai. +``` + +#### Due Difese Complementari + +**Prevenzione — `_assert_regex_canary()` (tempo di costruzione):** + +```text +AdaptiveRuleEngine.__init__(): + for rule in rules: + _assert_pickleable(rule) ← controllo esistente + _assert_regex_canary(rule) ← NUOVO: stress test SIGALRM 100ms +``` + +Il canary esegue ogni pattern `CustomRule` contro tre stringhe di stress sotto +un watchdog `signal.SIGALRM` di 100 ms. Se il pattern impiega più di 100 ms su +un input di 30 caratteri, è categoricamente catastrofico e solleva +`PluginContractError` *prima che venga scansionato il primo file*. + +**Contenimento — `future.result(timeout=30)` (runtime):** + +```text +# Prima +raw = list(executor.map(_worker, work_items)) # si blocca per sempre + +# Dopo +futures_map = {executor.submit(_worker, item): item[0] for item in work_items} +for fut, md_file in futures_map.items(): + try: + raw.append(fut.result(timeout=30)) + except concurrent.futures.TimeoutError: + raw.append(_make_timeout_report(md_file)) # finding Z009, mai crash +``` + +Un worker che supera 30 secondi produce un finding `Z009: ANALYSIS_TIMEOUT` +invece di bloccare il coordinatore. + +**Perché entrambe le difese sono necessarie:** Il canary dipende dalla +piattaforma (`SIGALRM` è solo POSIX; è un no-op su Windows). Il timeout è il +backstop universale. + +--- + +### ZRT-003 — MEDIO: Bypass Shield con Token Divisi tramite Tabella Markdown + +#### Cosa è Successo + +`scan_line_for_secrets()` dello Shield applicava i pattern regex a ogni riga +grezza. Il pattern per chiavi AWS `AKIA[0-9A-Z]{16}` richiede 20 caratteri +**contigui**. 
Un autore (malevolo o negligente) che documenta credenziali in
+una colonna di tabella usando notazione inline code e operatori di
+concatenazione rompe la contiguità:
+
+```markdown
+| ID Chiave | `AKIA` + `[suffisso-16-char]` |
+```
+
+La riga grezza passata alla regex è (resa in sorgente come token divisi):
+
+```text
+| ID Chiave | `AKIA` + `[suffisso-16-char]` |
+```
+
+Nessun segmento contiene i 20 caratteri contigui richiesti dal pattern, che
+quindi non corrisponde mai. Lo Shield segnala zero finding.
+
+#### Il Fix: Normalizzatore Pre-Scan
+
+`_normalize_line_for_shield()` applica tre trasformazioni prima che vengano
+eseguiti i pattern regex:
+
+1. **Rimuovi span backtick:** `` `AKIA` `` → `AKIA`
+2. **Rimuovi operatori di concatenazione:** `` ` ` + ` ` `` → niente
+3. **Collassa pipe di tabella:** `|` → spazio
+
+La forma normalizzata della riga di attacco è `ID Chiave AKIA[suffisso-16-char]`,
+che corrisponde a `AKIA[0-9A-Z]{16}` correttamente.
+
+**Sia** la forma grezza che quella normalizzata vengono scansionate. Un set
+`seen` previene finding duplicati quando un segreto appare non offuscato *e*
+la forma normalizzata corrisponde anch'essa.
+
+---
+
+### ZRT-004 — MEDIO: Risoluzione URL Context-Free di VSMBrokenLinkRule
+
+#### Cosa è Successo
+
+`VSMBrokenLinkRule._to_canonical_url()` era un `@staticmethod`. Convertiva
+gli href in URL VSM canonici usando un algoritmo root-relativo: rimuovi `.md`,
+elimina `index`, prependi `/`, aggiungi `/`. Questo è corretto per i file nella
+docs root ma produce il risultato sbagliato per i file in sottodirectory quando
+l'href contiene segmenti `..`. 
+ +#### Esempio del Bug + +```text +File sorgente: docs/a/b/pagina.md +Link: [Vedi](../../c/target.md) + +URL atteso: /c/target/ ← dove il browser navigherebbe +URL calcolato: /c/target/ ← accidentalmente corretto in questo caso a 2 livelli + +File sorgente: docs/a/b/pagina.md +Link: [Vedi](../fratello.md) + +URL atteso: /a/fratello/ ← il file è docs/a/fratello.md +URL calcolato: /fratello/ ← SBAGLIATO: risolto dalla root, non dalla dir sorgente +``` + +L'`InMemoryPathResolver` (usato da `validate_links_async`) risolveva i link +correttamente perché aveva il contesto `source_file` dall'inizio. La +`VSMBrokenLinkRule` no, creando una discrepanza silenziosa tra due superfici di +validazione. + +#### Il Fix: ResolutionContext + +```python +@dataclass(slots=True) +class ResolutionContext: + docs_root: Path + source_file: Path +``` + +`BaseRule.check_vsm()` e `AdaptiveRuleEngine.run_vsm()` accettano ora +`context: ResolutionContext | None = None`. Quando il contesto è fornito, +`_to_canonical_url()` risolve i segmenti `..` usando `os.path.normpath` +relativo a `context.source_file.parent`, poi mappa il percorso assoluto +risolto di ritorno a un URL docs-relativo. + +Il metodo applica anche il confine Shield: se il percorso risolto esce da +`docs_root`, restituisce `None` (equivalente a un outcome `PathTraversal` +in `InMemoryPathResolver`). + +**La Lezione Architetturale:** Qualsiasi metodo che converte un href relativo in +URL assoluto *deve* sapere da dove proviene quell'href. Uno `@staticmethod` che +riceve solo la stringa href è strutturalmente incapace di gestire correttamente +i percorsi relativi. In Zenzic, questo si chiama ora **Anti-Pattern +Context-Free** (vedi `../../arch/vsm_engine.md` per il protocollo completo). + +--- + +## 4. L'Architettura di Multiplexing degli Stream + +Post-remediation, `ReferenceScanner.harvest()` implementa un modello pulito a +due stream. Questa sezione lo documenta per i futuri contributori. 
+ +```text +┌─────────────────────────────────────────────────────────────────┐ +│ ReferenceScanner.harvest() │ +│ │ +│ ┌─────────────────────────────────────────────────────────┐ │ +│ │ STREAM SHIELD │ │ +│ │ Sorgente: enumerate(file_handle, start=1) │ │ +│ │ Filtro: NESSUNO — ogni riga incluso frontmatter │ │ +│ │ Trasformazioni: │ │ +│ │ 1. _normalize_line_for_shield(riga) [ZRT-003] │ │ +│ │ 2. scan_line_for_secrets(grezza) │ │ +│ │ 3. scan_line_for_secrets(normalizzata) │ │ +│ │ Output: eventi ("SECRET", SecurityFinding) │ │ +│ └─────────────────────────────────────────────────────────┘ │ +│ │ +│ ┌─────────────────────────────────────────────────────────┐ │ +│ │ STREAM CONTENUTO │ │ +│ │ Sorgente: _iter_content_lines(file_path) │ │ +│ │ Filtro: salta frontmatter YAML, salta blocchi fence │ │ +│ │ Trasformazioni: │ │ +│ │ 1. Analisi definizioni riferimento (_RE_REF_DEF) │ │ +│ │ 2. Scansione URL ref-def per segreti │ │ +│ │ 3. Analisi immagini inline (_RE_IMAGE_INLINE) │ │ +│ │ Output: eventi ("DEF", "IMG", "MISSING_ALT", …) │ │ +│ └─────────────────────────────────────────────────────────┘ │ +│ │ +│ Output finale: eventi uniti e ordinati per numero di riga │ +└─────────────────────────────────────────────────────────────────┘ +``` + +**Invariante:** Lo stream Shield e lo stream Contenuto non devono *mai condividere +un generatore*. Qualsiasi refactoring futuro che li unisca reintroduce ZRT-001. + +--- + +## 5. Cosa Ha Reso Possibili Queste Vulnerabilità + +Tutti e quattro i finding condividono una radice comune: **contratti impliciti +ai confini dei sottosistemi**. 
+ +| Finding | Contratto implicito violato | +|---------|---------------------------| +| ZRT-001 | "Lo Shield vede tutte le righe" — violato dal generatore condiviso | +| ZRT-002 | "Pickle-safe significa execution-safe" — violato dalla cecità al ReDoS | +| ZRT-003 | "Una riga = un token" — violato dalla frammentazione della sintassi Markdown | +| ZRT-004 | "La risoluzione URL è context-free" — violato dai percorsi relativi | + +Il fix in ogni caso segue lo stesso schema: **rendere il contratto esplicito nel +sistema dei tipi o nella firma della funzione**, e **testarlo direttamente**. + +--- + +## 6. Prevenzione delle Regressioni + +I seguenti test in `tests/test_redteam_remediation.py` servono come guardie di +regressione permanenti. Non devono mai essere eliminati o indeboliti: + +| Classe di test | Cosa protegge | +|---------------|--------------| +| `TestShieldFrontmatterCoverage` | ZRT-001 — scansione frontmatter | +| `TestReDoSCanary` | ZRT-002 — rigetto canary alla costruzione | +| `TestShieldNormalizer` | ZRT-003 — ricostruzione token divisi | +| `TestVSMContextAwareResolution` | ZRT-004 — risoluzione URL context-aware | +| `TestShieldReportingIntegrity` | Z-SEC-002 — severità breach, mascheratura segreti, fedeltà bridge | + +Se un futuro refactoring causa il fallimento di uno qualsiasi di questi test, +la PR **non deve essere mergiata** finché il test non viene dimostrato errato +(e la guardia di regressione sostituita con un equivalente) o il fix non viene +ripristinato. + +--- + +## 7. Lezioni Apprese + +Per v0.5.0rc1 e oltre: + +1. **Ogni nuovo confine di sottosistema deve documentare il proprio contratto + di filtraggio.** Un generatore che salta righe deve avere una nota che + spiega *cosa* salta e *perché* il chiamante è autorizzato a usarlo. + +2. **I metodi `@staticmethod` che gestiscono percorsi sono sospetti per + definizione.** Se un metodo statico riceve una stringa di percorso, chiedi: + ha bisogno di sapere da dove proviene quel percorso? 
Se sì, non è un metodo + statico — è un argomento di contesto mancante. + +3. **I pattern regex forniti dall'utente sono input non fidati.** Esegui sempre + il canary. Il budget di 100 ms non è un requisito di performance — è un + confine di sicurezza. + +4. **Il livello di parallelismo deve avere sempre un timeout.** Un coordinatore + che attende indefinitamente i worker è un single point of failure per + l'intera pipeline CI. + +--- + +*Documento aggiornato a v0.5.0a4.* diff --git a/mutmut_pytest.ini b/mutmut_pytest.ini new file mode 100644 index 0000000..f49c110 --- /dev/null +++ b/mutmut_pytest.ini @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: 2026 PythonWoods +# SPDX-License-Identifier: Apache-2.0 + +# mutmut_pytest.ini — Pytest configuration used ONLY by mutmut (mutation testing). +# +# Why a separate ini file? +# mutmut v3 generates trampolines in mutants/src/ (a working copy of the source), +# but pytest's importlib mode does not prioritise sys.path for package resolution, +# causing tests to import from site-packages instead of the trampoline copy. +# +# This file overrides two settings for mutation runs only: +# 1. import-mode=prepend — restores classic sys.path-first resolution so that +# mutants/src/ (prepended to sys.path by pytest thanks to pythonpath below) +# takes precedence over the site-packages install. +# 2. pythonpath = src — instructs pytest to add src/ (i.e. mutants/src/) +# to sys.path[0], making mutmut's trampolines the authoritative source. +# +# The main pyproject.toml keeps import-mode=importlib for the regular test suite. +# This file must be listed in [tool.mutmut] also_copy to be available in mutants/. 
+ +[pytest] +testpaths = tests +pythonpath = src +addopts = -ra -q --strict-markers --strict-config From 76b935f4c9376eadbcb0f66a25f72c29185e457d Mon Sep 17 00:00:00 2001 From: PythonWoods Date: Sat, 4 Apr 2026 20:29:11 +0200 Subject: [PATCH 05/16] =?UTF-8?q?release:=20bump=20version=200.5.0a3=20?= =?UTF-8?q?=E2=86=92=200.5.0a4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- CHANGELOG.md | 4 ++-- CITATION.cff | 2 +- docs/community/index.md | 2 +- docs/it/community/index.md | 2 +- mkdocs.yml | 2 +- pyproject.toml | 4 ++-- src/zenzic/__init__.py | 2 +- uv.lock | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 11ab737..c939ee5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -161,7 +161,7 @@ Versions follow [Semantic Versioning](https://semver.org/). verification accepted per Architecture Lead authorisation (Z-TEST-003). **28 tests in `test_redteam_remediation.py`, all green.** -## [0.5.0a3] — 2026-04-03 — The Sentinel: Aesthetic Sprint, Parallel Anchors & Agnostic Target +## [0.5.0a4] — 2026-04-03 — The Sentinel: Aesthetic Sprint, Parallel Anchors & Agnostic Target > **Sprint 13 + 14 + 15.** Three tracks delivered in one tag. > Track A — Performance & SDK: deterministic two-phase anchor validation, `zenzic.rules` public @@ -1547,7 +1547,7 @@ It has been superseded by the 0.5.x stabilization cycle. 
 [Unreleased]: https://github.com/PythonWoods/zenzic/compare/v0.5.0a3...HEAD
-[0.5.0a3]: https://github.com/PythonWoods/zenzic/compare/v0.5.0a2...v0.5.0a3
+[0.5.0a4]: https://github.com/PythonWoods/zenzic/compare/v0.5.0a3...v0.5.0a4
 [0.5.0a2]: https://github.com/PythonWoods/zenzic/compare/v0.5.0a1...v0.5.0a2
 [0.5.0a1]: https://github.com/PythonWoods/zenzic/compare/v0.4.0-rc5...v0.5.0a1
 [0.4.0-rc5]: https://github.com/PythonWoods/zenzic/compare/v0.4.0-rc4...v0.4.0-rc5
diff --git a/CITATION.cff b/CITATION.cff
index 8339911..cd6665a 100644
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -15,7 +15,7 @@ abstract: >
   scanner (the Shield). Built on pure functional principles in Python 3.11+,
   it operates source-first — no build framework required — and integrates with
   any Markdown-based documentation system via a plugin adapter protocol.
-version: 0.5.0a3
+version: 0.5.0a4
 date-released: 2026-04-03
 url: "https://zenzic.pythonwoods.dev/"
 repository-code: "https://github.com/PythonWoods/zenzic"
diff --git a/docs/community/index.md b/docs/community/index.md
index bdf6b68..b5aee58 100644
--- a/docs/community/index.md
+++ b/docs/community/index.md
@@ -83,7 +83,7 @@ __Manual BibTeX:__
 @software{zenzic,
   author = {PythonWoods},
   title = {{Zenzic: The Agnostic Documentation Integrity Framework}},
-  version = {0.5.0a3},
+  version = {0.5.0a4},
   date = {2026-04-03},
   url = {https://zenzic.pythonwoods.dev/},
   license = {Apache-2.0},
diff --git a/docs/it/community/index.md b/docs/it/community/index.md
index 232c914..4aa1f71 100644
--- a/docs/it/community/index.md
+++ b/docs/it/community/index.md
@@ -73,7 +73,7 @@ __BibTeX manuale:__
 @software{zenzic,
   author = {PythonWoods},
   title = {{Zenzic: The Agnostic Documentation Integrity Framework}},
-  version = {0.5.0a3},
+  version = {0.5.0a4},
   date = {2026-04-03},
   url = {https://zenzic.pythonwoods.dev/},
   license = {Apache-2.0},
diff --git a/mkdocs.yml b/mkdocs.yml
index 1f26f43..24b03f6 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -147,7 +147,7 @@ extra_css:
extra: build_date: !ENV [BUILD_DATE, "dev"] generator: false - version: "0.5.0a3" + version: "0.5.0a4" alternate: - name: English link: / diff --git a/pyproject.toml b/pyproject.toml index eb7b761..02e28f0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ build-backend = "hatchling.build" [project] name = "zenzic" -version = "0.5.0a3" +version = "0.5.0a4" description = "Engineering-grade, engine-agnostic linter and security shield for Markdown documentation" readme = "README.md" requires-python = ">=3.11" @@ -179,7 +179,7 @@ pytest_add_cli_args = ["--import-mode=prepend"] # ─── Version bumping ─────────────────────────────────────────────────────────── [tool.bumpversion] -current_version = "0.5.0a3" +current_version = "0.5.0a4" commit = true tag = true tag_name = "v{new_version}" diff --git a/src/zenzic/__init__.py b/src/zenzic/__init__.py index 7ae3875..6026d7f 100644 --- a/src/zenzic/__init__.py +++ b/src/zenzic/__init__.py @@ -2,4 +2,4 @@ # SPDX-License-Identifier: Apache-2.0 """Zenzic — engine-agnostic linter and security shield for Markdown documentation.""" -__version__ = "0.5.0a3" +__version__ = "0.5.0a4" diff --git a/uv.lock b/uv.lock index 8790fe0..3e36d5a 100644 --- a/uv.lock +++ b/uv.lock @@ -2286,7 +2286,7 @@ wheels = [ [[package]] name = "zenzic" -version = "0.5.0a3" +version = "0.5.0a4" source = { editable = "." 
} dependencies = [ { name = "httpx" }, From 7f3d67235f2a1a21c2a177bb58e25fb082f516f0 Mon Sep 17 00:00:00 2001 From: PythonWoods Date: Sat, 4 Apr 2026 20:30:40 +0200 Subject: [PATCH 06/16] docs(internal): add registry for architectural gaps and tech debt --- docs/internal/arch_gaps.md | 26 ++++++++++++++++++++++++++ docs/it/internal/arch_gaps.md | 26 ++++++++++++++++++++++++++ mkdocs.yml | 1 + 3 files changed, 53 insertions(+) create mode 100644 docs/internal/arch_gaps.md create mode 100644 docs/it/internal/arch_gaps.md diff --git a/docs/internal/arch_gaps.md b/docs/internal/arch_gaps.md new file mode 100644 index 0000000..030660d --- /dev/null +++ b/docs/internal/arch_gaps.md @@ -0,0 +1,26 @@ + + + +# Zenzic - Architectural Gaps & Technical Debt + +> *"What is not documented, does not exist; what is documented poorly, is an ambush."* +> +> This document tracks architectural gaps and technical debt identified during development, which require resolution before specific milestones (like rc1). + +--- + +## Target: v0.5.0rc1 (The Bastion) + +### 1. Versioning Automation (Noxfile) + +**Identified in:** v0.5.0a4 (`fix/sentinel-hardening`) +**Component:** `noxfile.py` +**Description:** The noxfile currently only supports `patch`, `minor`, and `major` bumps. During alpha/beta iterations, it is not possible to execute a prerelease bump directly via the automation framework (`nox -s bump -- prerelease`). +**Required Action:** The noxfile must be updated to extract and support pre-release tags (bumping `pre_l` and `pre_n`) by properly interfacing with `bump-my-version`, enabling rapid iteration of testing releases without circumventing automation. + +### 2. Security Pipeline Coverage (CLI Integration) + +**Identified in:** v0.5.0a4 (`fix/sentinel-hardening`) +**Component:** `zenzic/cli.py` +**Description:** The scanner and reporter now have complete mutation tests safeguarding the effectiveness of the Shield (The Sentinel's Trial). 
However, the silencer mutant (`findings.append(...) -> pass`) within `cli.py` is not covered by the current suite because it bypasses the CLI to interface with the proxy. +**Required Action:** An end-to-end (e2e) test that triggers the full CLI and verifies the exit with code 2 and the presence of the reporter to ensure the routing is not vulnerable to amnesia (Commit 4b or later). diff --git a/docs/it/internal/arch_gaps.md b/docs/it/internal/arch_gaps.md new file mode 100644 index 0000000..be5d8ce --- /dev/null +++ b/docs/it/internal/arch_gaps.md @@ -0,0 +1,26 @@ + + + +# Zenzic - Architectural Gaps & Technical Debt + +> *"Ciò che non è documentato, non esiste; ciò che è documentato male, è un'imboscata."* +> +> Questo documento traccia i gap architetturali e il debito tecnico identificati durante lo sviluppo, che necessitano di risoluzione prima di traguardi specifici (come la rc1). + +--- + +## Target: v0.5.0rc1 (The Bastion) + +### 1. Automazione del Versioning (Noxfile) + +**Identificato in:** v0.5.0a4 (`fix/sentinel-hardening`) +**Componente:** `noxfile.py` +**Descrizione:** Il noxfile attualmente supporta solo bump di `patch`, `minor` e `major`. Durante le iterazioni alpha/beta, non è possibile eseguire il bump prerelease direttamente tramite il framework di automazione (`nox -s bump -- prerelease`). +**Azione Richiesta:** Il noxfile deve essere aggiornato per estrarre e supportare la gestione dei tag alpha/beta pre-release (bump `pre_l` e `pre_n`) interfacciandosi correttamente con `bump-my-version`, per permettere l'iterazione rapida delle release di testing senza bypassare l'automazione. + +### 2. Copertura della Pipeline di Sicurezza (Integrazione CLI) + +**Identificato in:** v0.5.0a4 (`fix/sentinel-hardening`) +**Componente:** `zenzic/cli.py` +**Descrizione:** Lo scanner e il reporter dispongono ora di mutation test completi che proteggono l'efficacia dello Shield (The Sentinel's Trial). Tuttavia, la mutazione del silenziatore (`findings.append(...) 
-> pass`) all'interno di `cli.py` non viene coperta dalla suite attuale perché essa salta la CLI per interfacciarsi con il proxy. +**Azione Richiesta:** Un test end-to-end (e2e) che attivi l'intera CLI e verifichi l'uscita con exit code 2 e la presenza del reporter per assicurare che il routing non sia vulnerabile ad amnesie (Commit 4b o successivi). diff --git a/mkdocs.yml b/mkdocs.yml index 24b03f6..49edb75 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -193,6 +193,7 @@ nav: - Writing Plugin Rules: developers/plugins.md - Example Projects: developers/examples.md - Internals: + - Arch Gaps & Tech Debt: internal/arch_gaps.md - VSM Engine: arch/vsm_engine.md - Security Reports: - Security Analysis v0.5.0a3: internal/security/shattered_mirror_report.md From f4f2736084b4a19d1ca2198cdb03ce2d5a31b948 Mon Sep 17 00:00:00 2001 From: PythonWoods-Dev Date: Tue, 7 Apr 2026 15:28:22 +0200 Subject: [PATCH 07/16] feat(cli,docs): per-command ok_message and Sentinel homepage panels CLI - Add ok_message parameter to SentinelReporter.render() so each individual check command prints a specific success verdict instead of the generic "All checks passed" message. - Extract _count_docs_assets() helper to eliminate duplicated file-counting logic across all six check commands. - All six check commands (links, orphans, snippets, references, assets, placeholders) now use SentinelReporter with a dedicated ok_message; check-all keeps the default. - Add snippet fallback in reporter: when the source file is unreadable but source_line is available, render a single-line snippet instead of silently skipping it. Docs - Rewrite Sentinel in Action / Sentinel in Azione homepage section as pure HTML to prevent Python-Markdown from wrapping nested divs in
`<p>`
tags or escaping them as code blocks. - Add .zz-sentinel-demo CSS component with dark/light mode support for branded mini-panel previews (gutter reporter, shield, grouped by file, severity summary). - Add .zz-sentinel-section layout CSS for card containment and lateral margins. - Replace demo link references with plain text to avoid Zenzic false positives from Markdown-like syntax inside HTML spans. Tests - Update all ok-assertions to match new per-command messages. - Fix visual test expectations (gutter counting, LINK_ERROR badge). --- README.it.md | 4 +- README.md | 4 +- docs/assets/stylesheets/extra.css | 220 ++++++++++++++++ docs/index.md | 117 +++++---- docs/it/index.md | 118 +++++---- src/zenzic/cli.py | 423 ++++++++++++++++++++---------- src/zenzic/core/reporter.py | 30 ++- tests/test_cli.py | 35 ++- tests/test_cli_visual.py | 21 +- 9 files changed, 710 insertions(+), 262 deletions(-) diff --git a/README.it.md b/README.it.md index 6f1a204..729da50 100644 --- a/README.it.md +++ b/README.it.md @@ -29,8 +29,8 @@ SPDX-License-Identifier: Apache-2.0 Zenzic Score - - Powered by Zensical + + Built with MkDocs

diff --git a/README.md b/README.md index 5203f8f..2f36bac 100644 --- a/README.md +++ b/README.md @@ -29,8 +29,8 @@ SPDX-License-Identifier: Apache-2.0 Zenzic Score - - Powered by Zensical + + Built with MkDocs

diff --git a/docs/assets/stylesheets/extra.css b/docs/assets/stylesheets/extra.css index 1660693..9cc4cd6 100644 --- a/docs/assets/stylesheets/extra.css +++ b/docs/assets/stylesheets/extra.css @@ -504,6 +504,226 @@ } } +/* Sentinel section — contained width with lateral breathing room */ +.zz-sentinel-section { + max-width: 960px; + margin: 3.5rem auto !important; + padding: 0 1.5rem; +} + +.zz-sentinel-section > h2, +.zz-sentinel-section > p { + max-width: 720px; + margin-left: auto; + margin-right: auto; +} + +.zz-sentinel-section .grid.cards, +.zz-sentinel-section.grid.cards { + margin-top: 1.5rem; +} + +.zz-sentinel-section li { + background: var(--md-code-bg-color); + border: 1px solid var(--zz-border-subtle) !important; + border-radius: 0.75rem; + padding: 1.25rem; + box-shadow: 0 8px 24px rgba(0, 0, 0, 0.14); +} + +.zz-sentinel-demo { + margin-top: 1rem; + border: 1px solid rgba(79, 70, 229, 0.28); + border-radius: 0.7rem; + background: #0f172a; + padding: 0.9rem 1rem; + font-family: 'JetBrains Mono', ui-monospace, monospace; + font-size: 0.7rem; + line-height: 1.6; + overflow-x: auto; +} + +.zz-sentinel-demo__rule { + color: #94a3b8; + margin-bottom: 0.75rem; + white-space: nowrap; +} + +.zz-sentinel-demo__rule::before, +.zz-sentinel-demo__rule::after { + content: "────────────────"; + color: #475569; +} + +.zz-sentinel-demo__rule::before { + margin-right: 0.55rem; +} + +.zz-sentinel-demo__rule::after { + margin-left: 0.55rem; +} + +.zz-sentinel-demo__finding, +.zz-sentinel-demo__snippet, +.zz-sentinel-demo__summary-row, +.zz-sentinel-demo__verdict { + white-space: nowrap; +} + +.zz-sentinel-demo__finding { + display: flex; + align-items: baseline; + gap: 0.45rem; + margin: 0.2rem 0; +} + +.zz-sentinel-demo__icon { + font-weight: 700; +} + +.zz-sentinel-demo__icon--error { + color: #f43f5e; +} + +.zz-sentinel-demo__icon--warning { + color: #f59e0b; +} + +.zz-sentinel-demo__badge { + color: #e2e8f0; + background: rgba(79, 70, 229, 0.28); + border: 1px solid 
rgba(129, 140, 248, 0.35); + border-radius: 999px; + padding: 0.05rem 0.45rem; + font-size: 0.64rem; +} + +.zz-sentinel-demo__badge--warning { + background: rgba(245, 158, 11, 0.18); + border-color: rgba(245, 158, 11, 0.32); +} + +.zz-sentinel-demo__badge--breach { + background: rgba(244, 63, 94, 0.18); + border-color: rgba(244, 63, 94, 0.34); +} + +.zz-sentinel-demo__message { + color: #e6edf3; +} + +.zz-sentinel-demo__snippet { + display: grid; + grid-template-columns: 2rem 1rem minmax(0, 1fr); + gap: 0.4rem; + color: #e6edf3; +} + +.zz-sentinel-demo__snippet--dim { + color: #94a3b8; +} + +.zz-sentinel-demo__line-no { + color: #64748b; + text-align: right; +} + +.zz-sentinel-demo__gutter { + color: #64748b; +} + +.zz-sentinel-demo__gutter--active { + color: #f43f5e; + font-weight: 700; +} + +.zz-sentinel-demo__summary-row { + display: flex; + flex-wrap: wrap; + gap: 0.8rem; + margin-bottom: 0.5rem; +} + +.zz-sentinel-demo__count--error { + color: #f43f5e; +} + +.zz-sentinel-demo__count--warning { + color: #f59e0b; +} + +.zz-sentinel-demo__count--muted { + color: #94a3b8; +} + +.zz-sentinel-demo__verdict { + color: #f43f5e; + font-weight: 700; +} + +[data-md-color-scheme="default"] .zz-sentinel-demo { + background: #eef2ff; + border-color: rgba(79, 70, 229, 0.18); +} + +[data-md-color-scheme="default"] .zz-sentinel-demo__rule, +[data-md-color-scheme="default"] .zz-sentinel-demo__snippet--dim, +[data-md-color-scheme="default"] .zz-sentinel-demo__count--muted, +[data-md-color-scheme="default"] .zz-sentinel-demo__line-no, +[data-md-color-scheme="default"] .zz-sentinel-demo__gutter { + color: #64748b; +} + +[data-md-color-scheme="default"] .zz-sentinel-demo__rule::before, +[data-md-color-scheme="default"] .zz-sentinel-demo__rule::after { + color: #cbd5e1; +} + +[data-md-color-scheme="default"] .zz-sentinel-demo__message, +[data-md-color-scheme="default"] .zz-sentinel-demo__snippet { + color: #0f172a; +} + +[data-md-color-scheme="default"] .zz-sentinel-demo__badge { + 
color: #312e81; + background: rgba(79, 70, 229, 0.1); + border-color: rgba(79, 70, 229, 0.18); +} + +[data-md-color-scheme="default"] .zz-sentinel-demo__badge--warning { + color: #92400e; + background: rgba(245, 158, 11, 0.1); + border-color: rgba(245, 158, 11, 0.2); +} + +[data-md-color-scheme="default"] .zz-sentinel-demo__badge--breach { + color: #9f1239; + background: rgba(244, 63, 94, 0.1); + border-color: rgba(244, 63, 94, 0.18); +} + +[data-md-color-scheme="default"] .zz-sentinel-demo__gutter--active, +[data-md-color-scheme="default"] .zz-sentinel-demo__icon--error, +[data-md-color-scheme="default"] .zz-sentinel-demo__count--error, +[data-md-color-scheme="default"] .zz-sentinel-demo__verdict { + color: #e11d48; +} + +[data-md-color-scheme="default"] .zz-sentinel-demo__icon--warning, +[data-md-color-scheme="default"] .zz-sentinel-demo__count--warning { + color: #d97706; +} + +.zz-sentinel-section li .highlight, +.zz-sentinel-section li pre { + overflow-x: auto; +} + +[data-md-color-scheme="default"] .zz-sentinel-section li { + background: #ffffff; + box-shadow: 0 6px 18px rgba(0, 0, 0, 0.06); +} + /* Score section — centered, contained width */ /* Score section — elevated card, generous top spacing */ .zz-score-section { diff --git a/docs/index.md b/docs/index.md index a589413..4757c28 100644 --- a/docs/index.md +++ b/docs/index.md @@ -99,55 +99,76 @@ Catch broken links, orphan pages, and leaked credentials — before your users d --- -
- -## Sentinel in Action - -Every finding is pinned to file, line, and source. Structured output for human eyes and machine parsing alike. - -
- -- :lucide-terminal:   __Gutter reporter__ - - --- - - Each error shows the exact offending source line with gutter context. No scrolling through logs to find what broke. - - ```text - docs/guide.md - ✘ 16: [FILE_NOT_FOUND] 'setup.md' not found in docs - │ - 16 │ Read the [setup guide](setup.md) before continuing. - │ - ``` - -- :lucide-shield:   __Zenzic Shield__ - - --- - - Scans every line — including fenced `bash` and `yaml` blocks — for leaked credentials. Exit code `2` is reserved exclusively for security events. - - ```text - docs/tutorial.md - ✘ 42: [CREDENTIAL_LEAK] GitHub token detected - │ - 42 │ Authorization: Bearer ghp_example123token - │ - ``` - -- :lucide-chart-bar:   __Quality score__ - - --- - - `zenzic score` emits a single deterministic __0–100 integer__. Save a baseline and gate pull requests on regression. - - ```bash - zenzic score --save # persist baseline - zenzic diff --threshold 5 # exit 1 if score drops > 5 - ``` - +
+

Sentinel in Action

+

Every finding is pinned to file, line, and source. Structured output for human eyes and machine parsing alike.

+
+
    +
  • +

      Gutter reporter

    +
    +

    Each error shows the exact offending source line with gutter context. No scrolling through logs to find what broke.

    + +
  • +
  • +

      Zenzic Shield

    +
    +

    Scans every line — including fenced bash and yaml blocks — for leaked credentials. Exit code 2 is reserved exclusively for security events.

    + +
  • +
  • +

      Grouped by file

    +
    +

    Findings are grouped under a file header instead of streamed as flat logs. You see where the problem lives before reading the finding details.

    + +
  • +
  • +

      Severity summary

    +
    +

    Every run ends with a compact summary: counts by severity, files with findings, and a final verdict. You know immediately whether the check failed hard or only emitted warnings.

    + +
  • +
-
--- diff --git a/docs/it/index.md b/docs/it/index.md index 7f3c9dd..ec9b969 100644 --- a/docs/it/index.md +++ b/docs/it/index.md @@ -99,55 +99,77 @@ Intercetta link non validi, pagine orfane e credenziali esposte — prima dei tu --- -
- -## Sentinel in Azione - -Ogni segnalazione è ancorata a file, riga e sorgente. Output strutturato per occhi umani e parsing automatico. - -
- -- :lucide-terminal:   __Reporter con gutter__ - - --- - - Ogni errore mostra la riga sorgente esatta con contesto gutter. Nessun log da scorrere per trovare il problema. - - ```text - docs/guida.md - ✘ 16: [FILE_NOT_FOUND] 'setup.md' not found in docs - │ - 16 │ Leggi la [guida di setup](setup.md) prima di continuare. - │ - ``` - -- :lucide-shield:   __Zenzic Shield__ - - --- - - Scansiona ogni riga — compresi i blocchi `bash` e `yaml` — alla ricerca di credenziali esposte. Exit code `2` è riservato esclusivamente agli eventi di sicurezza. - - ```text - docs/tutorial.md - ✘ 42: [CREDENTIAL_LEAK] Token GitHub rilevato - │ - 42 │ Authorization: Bearer ghp_example123token - │ - ``` - -- :lucide-chart-bar:   __Punteggio qualità__ - - --- - - `zenzic score` emette un singolo __intero deterministico 0–100__. Salva un baseline e blocca le pull request sulle regressioni. - - ```bash - zenzic score --save # salva il baseline - zenzic diff --threshold 5 # exit 1 se il punteggio scende > 5 - ``` - +
+

Sentinel in Azione

+

Ogni segnalazione è ancorata a file, riga e sorgente. Output strutturato per occhi umani e parsing automatico.

+
+
    +
  • +

      Reporter con gutter

    +
    +

    Ogni errore mostra la riga sorgente esatta con contesto gutter. Nessun log da scorrere per trovare il problema.

    + +
  • +
  • +

      Zenzic Shield

    +
    +

    Scansiona ogni riga — compresi i blocchi bash e yaml — alla ricerca di credenziali esposte. Exit code 2 è riservato esclusivamente agli eventi di sicurezza.

    + +
  • +
  • +

      Raggruppato per file

    +
    +

    I finding sono raggruppati sotto un header di file, invece di scorrere come log piatti. Vedi dove vive il problema prima ancora di leggere il dettaglio.

    + +
  • +
  • +

      Riepilogo severità

    +
    +

    Ogni esecuzione termina con un riepilogo compatto: conteggi per severità, numero di file coinvolti e verdetto finale. Capisci subito se il controllo è fallito davvero o se ha emesso solo warning.

    +

    Nota: l'output CLI di Zenzic resta volutamente in inglese, anche nella documentazione italiana, per mantenere log, CI e screenshot coerenti tra tutti gli ambienti.

    + +
  • +
-
--- diff --git a/src/zenzic/cli.py b/src/zenzic/cli.py index 9e368a5..ade9efc 100644 --- a/src/zenzic/cli.py +++ b/src/zenzic/cli.py @@ -137,21 +137,79 @@ def _render_link_error(err: LinkError, docs_root: Path) -> None: console.print(f" [dim]│[/] [italic]{err.source_line}[/]") +def _count_docs_assets(docs_root: Path, repo_root: Path) -> tuple[int, int]: + """Return ``(docs_count, assets_count)`` for the Sentinel telemetry line.""" + _INERT = {".css", ".js"} + _CONFIG = {".yml", ".yaml", ".toml"} + if not docs_root.is_dir(): + return 0, 0 + docs_count = sum( + 1 + for p in docs_root.rglob("*") + if p.is_file() and (p.suffix.lower() == ".md" or p.suffix.lower() in _CONFIG) + ) + docs_count += sum( + 1 for p in repo_root.iterdir() if p.is_file() and p.suffix.lower() in {".yml", ".yaml"} + ) + assets_count = sum( + 1 + for p in docs_root.rglob("*") + if p.is_file() + and p.suffix.lower() not in _INERT + and p.suffix.lower() not in _CONFIG + and p.suffix.lower() != ".md" + ) + return docs_count, assets_count + + @check_app.command(name="links") def check_links( strict: bool = typer.Option(False, "--strict", "-s", help="Exit non-zero on any warning."), ) -> None: """Check for broken internal links. 
Pass --strict to also validate external URLs.""" + from zenzic import __version__ + repo_root = find_repo_root() config, _ = ZenzicConfig.load(repo_root) docs_root = (repo_root / config.docs_dir).resolve() - errors = validate_links_structured(repo_root, strict=strict) + + def _rel(path: Path) -> str: + try: + return str(path.relative_to(docs_root)) + except ValueError: + return str(path) + + t0 = time.monotonic() + link_errors = validate_links_structured(repo_root, strict=strict) + elapsed = time.monotonic() - t0 + + findings = [ + Finding( + rel_path=_rel(err.file_path), + line_no=err.line_no, + code=err.error_type, + severity="error", + message=err.message, + source_line=err.source_line, + col_start=err.col_start, + match_text=err.match_text, + ) + for err in link_errors + ] + + docs_count, assets_count = _count_docs_assets(docs_root, repo_root) + reporter = SentinelReporter(console, docs_root, docs_dir=str(config.docs_dir)) + errors, warnings = reporter.render( + findings, + version=__version__, + elapsed=elapsed, + docs_count=docs_count, + assets_count=assets_count, + engine=config.build_context.engine if hasattr(config, "build_context") else "auto", + ok_message="No broken links found.", + ) if errors: - console.print(f"\n[red]BROKEN LINKS ({len(errors)}):[/]") - for err in errors: - _render_link_error(err, docs_root) raise typer.Exit(1) - console.print("\n[green]OK:[/] no broken links found.") @check_app.command(name="orphans") @@ -165,34 +223,101 @@ def check_orphans( ), ) -> None: """Detect .md files not listed in the nav.""" + from zenzic import __version__ + repo_root = find_repo_root() config, loaded_from_file = ZenzicConfig.load(repo_root) if not loaded_from_file: _print_no_config_hint() config = _apply_engine_override(config, engine) + docs_root = (repo_root / config.docs_dir).resolve() + + t0 = time.monotonic() orphans = find_orphans(repo_root, config) - if orphans: - console.print(f"\n[red]ORPHANS ({len(orphans)}):[/] physical files not in nav:") - 
for path in orphans: - console.print(f" [yellow]{path}[/]") + elapsed = time.monotonic() - t0 + + findings = [ + Finding( + rel_path=str(path), + line_no=0, + code="ORPHAN", + severity="warning", + message="Physical file not listed in navigation.", + ) + for path in orphans + ] + + docs_count, assets_count = _count_docs_assets(docs_root, repo_root) + reporter = SentinelReporter(console, docs_root, docs_dir=str(config.docs_dir)) + errors, warnings = reporter.render( + findings, + version=__version__, + elapsed=elapsed, + docs_count=docs_count, + assets_count=assets_count, + engine=config.build_context.engine if hasattr(config, "build_context") else "auto", + strict=True, + ok_message="No orphan pages found.", + ) + if errors or warnings: raise typer.Exit(1) - console.print("\n[green]OK:[/] no orphan pages found.") @check_app.command(name="snippets") def check_snippets() -> None: """Validate Python code blocks in documentation Markdown files.""" + from zenzic import __version__ + repo_root = find_repo_root() config, loaded_from_file = ZenzicConfig.load(repo_root) if not loaded_from_file: _print_no_config_hint() - errors = validate_snippets(repo_root, config) + docs_root = (repo_root / config.docs_dir).resolve() + + def _rel(path: Path) -> str: + try: + return str(path.relative_to(docs_root)) + except ValueError: + return str(path) + + t0 = time.monotonic() + snippet_errors = validate_snippets(repo_root, config) + elapsed = time.monotonic() - t0 + + findings: list[Finding] = [] + for s_err in snippet_errors: + src = "" + if s_err.line_no > 0 and s_err.file_path.is_file(): + try: + lines = s_err.file_path.read_text(encoding="utf-8").splitlines() + if 0 < s_err.line_no <= len(lines): + src = lines[s_err.line_no - 1].strip() + except OSError: + pass + findings.append( + Finding( + rel_path=_rel(s_err.file_path), + line_no=s_err.line_no, + code="SNIPPET", + severity="error", + message=s_err.message, + source_line=src, + ) + ) + + docs_count, assets_count = 
_count_docs_assets(docs_root, repo_root) + reporter = SentinelReporter(console, docs_root, docs_dir=str(config.docs_dir)) + errors, warnings = reporter.render( + findings, + version=__version__, + elapsed=elapsed, + docs_count=docs_count, + assets_count=assets_count, + engine=config.build_context.engine if hasattr(config, "build_context") else "auto", + ok_message="All code snippets are syntactically valid.", + ) if errors: - console.print(f"\n[red]INVALID SNIPPETS ({len(errors)}):[/]") - for err in errors: - console.print(f" [yellow]{err.file_path}:{err.line_no}[/] - {err.message}") raise typer.Exit(1) - console.print("\n[green]OK:[/] all Python snippets are syntactically valid.") @check_app.command(name="references") @@ -224,118 +349,135 @@ def check_references( 1 — Dangling References or (with --strict) warnings found. 2 — SECURITY CRITICAL: a secret was detected in a reference URL. """ + from zenzic import __version__ + repo_root = find_repo_root() config, loaded_from_file = ZenzicConfig.load(repo_root) if not loaded_from_file: _print_no_config_hint() - reports, link_errors = scan_docs_references(repo_root, config, validate_links=links) - - docs_root = repo_root / config.docs_dir - - # ── Check for secrets first (Exit Code 2) ───────────────────────────────── - security_hits = [(r.file_path, sf) for r in reports for sf in r.security_findings] - if security_hits: - console.print("\n[bold red]╔══════════════════════════════════════╗[/]") - console.print("[bold red]║ SECURITY CRITICAL ║[/]") - console.print("[bold red]║ Secret(s) detected in documentation ║[/]") - console.print("[bold red]╚══════════════════════════════════════╝[/]\n") - for _fp, sf in security_hits: - try: - display_path = sf.file_path.relative_to(docs_root) - except ValueError: - display_path = sf.file_path - console.print( - f" [bold red][SHIELD][/] {display_path}:{sf.line_no} " - f"— [red]{sf.secret_type}[/] detected in URL" - ) - console.print(f" [dim]{sf.url[:80]}[/]") - 
console.print("\n[bold red]Build aborted.[/] Rotate the exposed credential immediately.") - raise typer.Exit(2) - - # ── Collect reference findings ───────────────────────────────────────────── - all_errors: list[str] = [] - all_warnings: list[str] = [] - total_score = 0.0 - file_count = len(reports) + docs_root = (repo_root / config.docs_dir).resolve() - for report in reports: + def _rel(path: Path) -> str: try: - rel = report.file_path.relative_to(docs_root) + return str(path.relative_to(docs_root)) except ValueError: - rel = report.file_path - for finding in report.findings: - msg = f" [yellow]{rel}:{finding.line_no}[/] [{finding.issue}] — {finding.detail}" - if finding.is_warning: - all_warnings.append(msg) - else: - all_errors.append(msg) - - for rf in report.rule_findings: - severity_color = "red" if rf.is_error else "yellow" - header = ( - f"[{severity_color}][{rf.rule_id}][/] [dim]{rel}:{rf.line_no}[/] — {rf.message}" - ) - if rf.matched_line: - snippet = rf.matched_line.rstrip() - msg = f"{header}\n [dim]│[/] [italic]{snippet}[/]" - else: - msg = header - if rf.is_error: - all_errors.append(msg) - else: - all_warnings.append(msg) - - if file_count: - total_score += report.score - - avg_score = total_score / file_count if file_count else 100.0 + return str(path) - # ── Output ───────────────────────────────────────────────────────────────── - if all_errors: - console.print(f"\n[red]REFERENCE ERRORS ({len(all_errors)}):[/]") - for msg in all_errors: - console.print(msg) + t0 = time.monotonic() + reports, ext_link_errors = scan_docs_references(repo_root, config, validate_links=links) + elapsed = time.monotonic() - t0 - if all_warnings: - label = "[red]REFERENCE WARNINGS[/]" if strict else "[yellow]REFERENCE WARNINGS[/]" - console.print(f"\n{label} ({len(all_warnings)}):") - for msg in all_warnings: - console.print(msg) + # ── Build unified findings list ──────────────────────────────────────────── + findings: list[Finding] = [] + for report in reports: + rel 
= _rel(report.file_path) + _lines: list[str] = [] + if report.file_path.is_file(): + try: + _lines = report.file_path.read_text(encoding="utf-8").splitlines() + except OSError: + pass + for ref_f in report.findings: + src = "" + if _lines and 0 < ref_f.line_no <= len(_lines): + src = _lines[ref_f.line_no - 1].strip() + findings.append( + Finding( + rel_path=rel, + line_no=ref_f.line_no, + code=ref_f.issue, + severity="warning" if ref_f.is_warning else "error", + message=ref_f.detail, + source_line=src, + ) + ) + for rule_f in report.rule_findings: + findings.append( + Finding( + rel_path=rel, + line_no=rule_f.line_no, + code=rule_f.rule_id, + severity=rule_f.severity, + message=rule_f.message, + source_line=rule_f.matched_line or "", + col_start=rule_f.col_start, + match_text=rule_f.match_text or "", + ) + ) + for sf in report.security_findings: + findings.append(_map_shield_to_finding(sf, docs_root)) - if link_errors: - console.print(f"\n[red]BROKEN REFERENCE URLS ({len(link_errors)}):[/]") - for err in link_errors: - console.print(f" [yellow]{err}[/]") + for err_str in ext_link_errors: + findings.append( + Finding( + rel_path="(external-urls)", + line_no=0, + code="LINK_URL", + severity="error", + message=err_str, + ) + ) - console.print( - f"\n[dim]Reference Integrity:[/] [bold]{avg_score:.1f}%[/] across {file_count} file(s)." 
+ docs_count, assets_count = _count_docs_assets(docs_root, repo_root) + reporter = SentinelReporter(console, docs_root, docs_dir=str(config.docs_dir)) + errors, warnings = reporter.render( + findings, + version=__version__, + elapsed=elapsed, + docs_count=docs_count, + assets_count=assets_count, + engine=config.build_context.engine if hasattr(config, "build_context") else "auto", + strict=strict, + ok_message="All references resolved.", ) - if links: - console.print("[dim]External URL validation: enabled.[/]") - failed = bool(all_errors) or bool(link_errors) or (strict and bool(all_warnings)) - if failed: + breaches = sum(1 for f in findings if f.severity == "security_breach") + if breaches: + raise typer.Exit(2) + if errors or (strict and warnings): raise typer.Exit(1) - console.print("\n[green]OK:[/] all references resolved.") - @check_app.command(name="assets") def check_assets() -> None: """Detect unused images and assets in the documentation.""" + from zenzic import __version__ + repo_root = find_repo_root() config, loaded_from_file = ZenzicConfig.load(repo_root) if not loaded_from_file: _print_no_config_hint() + docs_root = (repo_root / config.docs_dir).resolve() + + t0 = time.monotonic() unused = find_unused_assets(repo_root, config) - if unused: - console.print( - f"\n[red]UNUSED ASSETS ({len(unused)}):[/] physical files not linked anywhere:" + elapsed = time.monotonic() - t0 + + findings = [ + Finding( + rel_path=str(path), + line_no=0, + code="ASSET", + severity="warning", + message="File not referenced in any documentation page.", ) - for path in unused: - console.print(f" [yellow]{path}[/]") + for path in unused + ] + + docs_count, assets_count = _count_docs_assets(docs_root, repo_root) + reporter = SentinelReporter(console, docs_root, docs_dir=str(config.docs_dir)) + errors, warnings = reporter.render( + findings, + version=__version__, + elapsed=elapsed, + docs_count=docs_count, + assets_count=assets_count, + engine=config.build_context.engine if 
hasattr(config, "build_context") else "auto", + strict=True, + ok_message="No unused assets found.", + ) + if errors or warnings: raise typer.Exit(1) - console.print("\n[green]OK:[/] no unused assets found.") @clean_app.command(name="assets") @@ -385,17 +527,57 @@ def clean_assets( @check_app.command(name="placeholders") def check_placeholders() -> None: """Detect pages with < 50 words or containing TODOs/stubs.""" + from zenzic import __version__ + repo_root = find_repo_root() config, loaded_from_file = ZenzicConfig.load(repo_root) if not loaded_from_file: _print_no_config_hint() - findings = find_placeholders(repo_root, config) - if findings: - console.print(f"\n[red]PLACEHOLDERS/STUBS ({len(findings)}):[/]") - for f in findings: - console.print(f" [yellow]{f.file_path}:{f.line_no}[/] [{f.issue}] - {f.detail}") + docs_root = (repo_root / config.docs_dir).resolve() + + t0 = time.monotonic() + raw_findings = find_placeholders(repo_root, config) + elapsed = time.monotonic() - t0 + + findings: list[Finding] = [] + for pf in raw_findings: + src = "" + if pf.line_no > 0: + abs_path = docs_root / pf.file_path + if abs_path.is_file(): + try: + lines = abs_path.read_text(encoding="utf-8").splitlines() + if 0 < pf.line_no <= len(lines): + src = lines[pf.line_no - 1].strip() + except OSError: + pass + findings.append( + Finding( + rel_path=str(pf.file_path), + line_no=pf.line_no, + code=pf.issue, + severity="warning", + message=pf.detail, + source_line=src, + col_start=pf.col_start, + match_text=pf.match_text, + ) + ) + + docs_count, assets_count = _count_docs_assets(docs_root, repo_root) + reporter = SentinelReporter(console, docs_root, docs_dir=str(config.docs_dir)) + errors, warnings = reporter.render( + findings, + version=__version__, + elapsed=elapsed, + docs_count=docs_count, + assets_count=assets_count, + engine=config.build_context.engine if hasattr(config, "build_context") else "auto", + strict=True, + ok_message="No placeholder stubs found.", + ) + if errors or 
warnings: raise typer.Exit(1) - console.print("\n[green]OK:[/] no placeholder stubs found.") @dataclass @@ -774,32 +956,7 @@ def check_all( if quiet: errors, warnings = reporter.render_quiet(all_findings) else: - # Split audit scope: docs (md + config) vs assets (images, fonts, …). - # _INERT: always-excluded scaffolding; _CONFIG: config formats inside docs/. - _INERT = {".css", ".js"} - _CONFIG = {".yml", ".yaml", ".toml"} - if docs_root.is_dir(): - docs_count = sum( - 1 - for p in docs_root.rglob("*") - if p.is_file() and (p.suffix.lower() == ".md" or p.suffix.lower() in _CONFIG) - ) - # Also count engine config files at project root (e.g. mkdocs.yml). - docs_count += sum( - 1 - for p in repo_root.iterdir() - if p.is_file() and p.suffix.lower() in {".yml", ".yaml"} - ) - assets_count = sum( - 1 - for p in docs_root.rglob("*") - if p.is_file() - and p.suffix.lower() not in _INERT - and p.suffix.lower() not in _CONFIG - and p.suffix.lower() != ".md" - ) - else: - docs_count = assets_count = 0 + docs_count, assets_count = _count_docs_assets(docs_root, repo_root) # File-target mode: banner shows exactly 1 file. if _single_file is not None: docs_count, assets_count = 1, 0 diff --git a/src/zenzic/core/reporter.py b/src/zenzic/core/reporter.py index db56bec..2620d39 100644 --- a/src/zenzic/core/reporter.py +++ b/src/zenzic/core/reporter.py @@ -166,6 +166,7 @@ def render( engine: str = "auto", target: str | None = None, strict: bool = False, + ok_message: str | None = None, ) -> tuple[int, int]: """Print the full Sentinel Report. @@ -174,6 +175,13 @@ def render( excluded from the grouped view to avoid noise. All other findings flow through the normal grouped pipeline. + Args: + ok_message: Optional success message shown when no hard failures are + found. Defaults to ``"All checks passed. Your documentation is + secure."`` (all-clear panel) or ``"All checks passed."`` (with + warnings). 
Individual commands should pass a specific message + such as ``"No broken links found."``. + Returns: ``(error_count, warning_count)`` — breaches are counted separately by the caller (``cli.py``) and cause Exit 2, not Exit 1. @@ -232,6 +240,7 @@ def render( if not normal_findings and not breach_findings: # ── All-clear panel ─────────────────────────────────────────────── + _ok = ok_message or "All checks passed. Your documentation is secure." self._con.print() self._con.print( Panel( @@ -240,10 +249,7 @@ def render( Text(), Rule(style=SLATE), Text(), - Text.from_markup( - f"[{EMERALD}]{emoji('check')} All checks passed. " - f"Your documentation is secure.[/]" - ), + Text.from_markup(f"[{EMERALD}]{emoji('check')} {_ok}[/]"), ), title=f"[bold white on {INDIGO}] {emoji('shield')} ZENZIC SENTINEL v{version} [/]", title_align="center", @@ -294,7 +300,16 @@ def render( col_start=f.col_start, match_text=f.match_text, ) - renderables.extend(snippet_lines) + if snippet_lines: + renderables.extend(snippet_lines) + else: + # Fallback: file unreadable, use source_line directly + gutter_w = len(str(f.line_no)) + t = Text() + t.append(f" {str(f.line_no).rjust(gutter_w)} ", style=SLATE) + t.append("❱ ", style=f"bold {ROSE}") + t.append(f.source_line) + renderables.append(t) renderables.append(Text()) # spacing after file group @@ -324,9 +339,8 @@ def render( Text.from_markup(f"[bold {ROSE}]FAILED:[/] One or more checks failed.") ) else: - renderables.append( - Text.from_markup(f"[{EMERALD}]{emoji('check')} All checks passed.[/]") - ) + _ok = ok_message or "All checks passed." 
+ renderables.append(Text.from_markup(f"[{EMERALD}]{emoji('check')} {_ok}[/]")) # ── Single unified panel ────────────────────────────────────────────── self._con.print() diff --git a/tests/test_cli.py b/tests/test_cli.py index 20d4aac..ba6cb1f 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -56,7 +56,8 @@ def test_cli_help() -> None: def test_check_links_ok(_links, _cfg, _root) -> None: result = runner.invoke(app, ["check", "links"]) assert result.exit_code == 0 - assert "OK" in result.stdout + assert "ZENZIC SENTINEL" in result.stdout + assert "No broken links found." in result.stdout @patch("zenzic.cli.find_repo_root", return_value=_ROOT) @@ -76,7 +77,8 @@ def test_check_links_ok(_links, _cfg, _root) -> None: def test_check_links_with_errors(_links, _cfg, _root) -> None: result = runner.invoke(app, ["check", "links"]) assert result.exit_code == 1 - assert "BROKEN LINKS" in result.stdout + assert "ZENZIC SENTINEL" in result.stdout + assert "FILE_NOT_FOUND" in result.stdout or "error" in result.stdout.lower() @patch("zenzic.cli.find_repo_root", return_value=_ROOT) @@ -99,7 +101,8 @@ def test_cli_check_orphans_empty(tmp_path: Path, monkeypatch: pytest.MonkeyPatch monkeypatch.chdir(repo) result = runner.invoke(app, ["check", "orphans"]) assert result.exit_code == 0 - assert "OK: no orphan pages found." in result.stdout + assert "ZENZIC SENTINEL" in result.stdout + assert "No orphan pages found." 
in result.stdout @patch("zenzic.cli.find_repo_root", return_value=_ROOT) @@ -108,7 +111,8 @@ def test_cli_check_orphans_empty(tmp_path: Path, monkeypatch: pytest.MonkeyPatch def test_check_orphans_with_orphans(_orphans, _cfg, _root) -> None: result = runner.invoke(app, ["check", "orphans"]) assert result.exit_code == 1 - assert "ORPHANS" in result.stdout + assert "ZENZIC SENTINEL" in result.stdout + assert "ORPHAN" in result.stdout # --------------------------------------------------------------------------- @@ -122,7 +126,8 @@ def test_check_orphans_with_orphans(_orphans, _cfg, _root) -> None: def test_check_snippets_ok(_snip, _cfg, _root) -> None: result = runner.invoke(app, ["check", "snippets"]) assert result.exit_code == 0 - assert "OK" in result.stdout + assert "ZENZIC SENTINEL" in result.stdout + assert "All code snippets are syntactically valid." in result.stdout @patch("zenzic.cli.find_repo_root", return_value=_ROOT) @@ -140,7 +145,8 @@ def test_check_snippets_ok(_snip, _cfg, _root) -> None: def test_check_snippets_with_errors(_snip, _cfg, _root) -> None: result = runner.invoke(app, ["check", "snippets"]) assert result.exit_code == 1 - assert "INVALID SNIPPETS" in result.stdout + assert "ZENZIC SENTINEL" in result.stdout + assert "SNIPPET" in result.stdout # --------------------------------------------------------------------------- @@ -154,7 +160,8 @@ def test_check_snippets_with_errors(_snip, _cfg, _root) -> None: def test_check_assets_ok(_assets, _cfg, _root) -> None: result = runner.invoke(app, ["check", "assets"]) assert result.exit_code == 0 - assert "OK" in result.stdout + assert "ZENZIC SENTINEL" in result.stdout + assert "No unused assets found." 
in result.stdout @patch("zenzic.cli.find_repo_root", return_value=_ROOT) @@ -163,7 +170,8 @@ def test_check_assets_ok(_assets, _cfg, _root) -> None: def test_check_assets_with_unused(_assets, _cfg, _root) -> None: result = runner.invoke(app, ["check", "assets"]) assert result.exit_code == 1 - assert "UNUSED ASSETS" in result.stdout + assert "ZENZIC SENTINEL" in result.stdout + assert "ASSET" in result.stdout # --------------------------------------------------------------------------- @@ -177,7 +185,8 @@ def test_check_assets_with_unused(_assets, _cfg, _root) -> None: def test_check_placeholders_ok(_ph, _cfg, _root) -> None: result = runner.invoke(app, ["check", "placeholders"]) assert result.exit_code == 0 - assert "OK" in result.stdout + assert "ZENZIC SENTINEL" in result.stdout + assert "No placeholder stubs found." in result.stdout @patch("zenzic.cli.find_repo_root", return_value=_ROOT) @@ -193,7 +202,8 @@ def test_check_placeholders_ok(_ph, _cfg, _root) -> None: def test_check_placeholders_with_findings(_ph, _cfg, _root) -> None: result = runner.invoke(app, ["check", "placeholders"]) assert result.exit_code == 1 - assert "PLACEHOLDERS" in result.stdout + assert "ZENZIC SENTINEL" in result.stdout + assert "short-content" in result.stdout # --------------------------------------------------------------------------- @@ -590,7 +600,8 @@ def test_render_quiet_with_findings(self) -> None: def test_check_references_ok(_scan, _cfg, _root) -> None: result = runner.invoke(app, ["check", "references"]) assert result.exit_code == 0 - assert "OK" in result.stdout + assert "ZENZIC SENTINEL" in result.stdout + assert "All references resolved." 
in result.stdout @patch("zenzic.cli.find_repo_root", return_value=_ROOT) @@ -615,7 +626,7 @@ def test_check_references_rule_findings_surfaced(mock_scan, _cfg, _root) -> None result = runner.invoke(app, ["check", "references"]) assert result.exit_code == 1 assert "ZZ-NOCLICKHERE" in result.stdout - assert "REFERENCE ERRORS" in result.stdout + assert "error" in result.stdout.lower() # --------------------------------------------------------------------------- diff --git a/tests/test_cli_visual.py b/tests/test_cli_visual.py index d181a86..994092d 100644 --- a/tests/test_cli_visual.py +++ b/tests/test_cli_visual.py @@ -73,7 +73,7 @@ def test_visual_snippet_rendered_when_source_line_present() -> None: def test_visual_snippet_absent_when_source_line_empty() -> None: - """An empty source_line must NOT produce a │ line.""" + """An empty source_line must NOT produce a ❱ error indicator.""" err = LinkError( file_path=_DOCS / "index.md", line_no=5, @@ -82,7 +82,7 @@ def test_visual_snippet_absent_when_source_line_empty() -> None: error_type="FILE_NOT_FOUND", ) result = _invoke_with_errors([err]) - assert "│" not in result.stdout + assert "❱" not in result.stdout # --------------------------------------------------------------------------- @@ -114,7 +114,7 @@ def test_error_type_badge_present(error_type: str) -> None: def test_generic_link_error_has_no_badge() -> None: - """The default LINK_ERROR type must NOT produce a badge in the header.""" + """LINK_ERROR code is shown as a standard Sentinel code badge.""" err = LinkError( file_path=_DOCS / "page.md", line_no=1, @@ -123,7 +123,8 @@ def test_generic_link_error_has_no_badge() -> None: error_type="LINK_ERROR", ) result = _invoke_with_errors([err]) - assert "LINK_ERROR" not in result.stdout + # Sentinel always shows the code; LINK_ERROR is a valid code badge + assert "LINK_ERROR" in result.stdout # --------------------------------------------------------------------------- @@ -149,7 +150,8 @@ def 
test_multiple_errors_each_have_snippet() -> None: ), ] result = _invoke_with_errors(errors) - assert result.stdout.count("│") == 2 + # Each error with a source_line emits an ❱ indicator + assert result.stdout.count("❱") == 2 assert "FILE_NOT_FOUND" in result.stdout assert "UNREACHABLE_LINK" in result.stdout @@ -247,9 +249,10 @@ def test_sandbox_zensical_valid_links_clean(monkeypatch: pytest.MonkeyPatch) -> """features.md and api.md have only valid links — no errors from those pages.""" monkeypatch.chdir(_SANDBOX_ZENSICAL) result = runner.invoke(app, ["check", "links"]) - # Only index.md has broken links — neither features.md nor api.md should appear - assert "features.md" not in result.stdout - assert "api.md" not in result.stdout + # Only index.md has broken links — features.md and api.md must not appear as + # section headers (full_rel path shown by the Sentinel Rule separator). + assert "docs/features.md" not in result.stdout + assert "docs/api.md" not in result.stdout # --------------------------------------------------------------------------- @@ -260,7 +263,7 @@ def test_sandbox_zensical_valid_links_clean(monkeypatch: pytest.MonkeyPatch) -> def test_check_links_exit_code_0_when_no_errors() -> None: result = _invoke_with_errors([]) assert result.exit_code == 0 - assert "OK" in result.stdout + assert "No broken links found." 
in result.stdout def test_check_links_exit_code_1_when_errors_present() -> None: From f132c447ab41c882416eb3b5be8179507f037107 Mon Sep 17 00:00:00 2001 From: PythonWoods Date: Tue, 7 Apr 2026 20:01:04 +0200 Subject: [PATCH 08/16] docs: tighten sentinel demo density and align summary geometry --- docs/assets/stylesheets/extra.css | 146 ++++++++++++++++-------------- docs/index.md | 31 +++++-- docs/it/index.md | 31 +++++-- 3 files changed, 120 insertions(+), 88 deletions(-) diff --git a/docs/assets/stylesheets/extra.css b/docs/assets/stylesheets/extra.css index 9cc4cd6..a812752 100644 --- a/docs/assets/stylesheets/extra.css +++ b/docs/assets/stylesheets/extra.css @@ -527,25 +527,31 @@ background: var(--md-code-bg-color); border: 1px solid var(--zz-border-subtle) !important; border-radius: 0.75rem; - padding: 1.25rem; + padding: 1rem; box-shadow: 0 8px 24px rgba(0, 0, 0, 0.14); } .zz-sentinel-demo { margin-top: 1rem; - border: 1px solid rgba(79, 70, 229, 0.28); - border-radius: 0.7rem; + border: 1px solid #334155; background: #0f172a; - padding: 0.9rem 1rem; - font-family: 'JetBrains Mono', ui-monospace, monospace; - font-size: 0.7rem; - line-height: 1.6; + padding: 0.6ch 0; + font-family: 'Fira Code', 'JetBrains Mono', monospace !important; + font-size: 0.6rem !important; + line-height: 1.05 !important; + letter-spacing: -0.03ch; + -webkit-font-smoothing: antialiased; + -moz-osx-font-smoothing: grayscale; overflow-x: auto; } +.zz-sentinel-demo * { + font-family: inherit !important; +} + .zz-sentinel-demo__rule { color: #94a3b8; - margin-bottom: 0.75rem; + margin-bottom: 0.5rem; white-space: nowrap; } @@ -573,8 +579,9 @@ .zz-sentinel-demo__finding { display: flex; align-items: baseline; - gap: 0.45rem; - margin: 0.2rem 0; + gap: 1ch; + margin: 0.15rem 0; + padding-left: 5ch !important; } .zz-sentinel-demo__icon { @@ -590,33 +597,34 @@ } .zz-sentinel-demo__badge { - color: #e2e8f0; - background: rgba(79, 70, 229, 0.28); - border: 1px solid rgba(129, 140, 248, 0.35); - 
border-radius: 999px; - padding: 0.05rem 0.45rem; - font-size: 0.64rem; + font-weight: 700; + border-radius: 0 !important; + padding: 0 0.2ch; + background: rgba(244, 63, 94, 0.1); + font-size: 0.58rem; } -.zz-sentinel-demo__badge--warning { - background: rgba(245, 158, 11, 0.18); - border-color: rgba(245, 158, 11, 0.32); +/* Badge inherits severity color from its sibling icon — SVG truth: + r8 (#f43f5e bold) for error, r7 (#f59e0b bold) for warning */ +.zz-sentinel-demo__icon--error ~ .zz-sentinel-demo__badge { + color: #f43f5e; } -.zz-sentinel-demo__badge--breach { - background: rgba(244, 63, 94, 0.18); - border-color: rgba(244, 63, 94, 0.34); +.zz-sentinel-demo__icon--warning ~ .zz-sentinel-demo__badge { + color: #f59e0b; } .zz-sentinel-demo__message { - color: #e6edf3; + color: #c5c8c6; + font-weight: 400; } .zz-sentinel-demo__snippet { display: grid; - grid-template-columns: 2rem 1rem minmax(0, 1fr); - gap: 0.4rem; - color: #e6edf3; + grid-template-columns: 7ch 3ch auto; + gap: 0; + color: #c5c8c6; + padding-left: 3ch; } .zz-sentinel-demo__snippet--dim { @@ -640,8 +648,9 @@ .zz-sentinel-demo__summary-row { display: flex; flex-wrap: wrap; - gap: 0.8rem; + gap: 1.5ch; margin-bottom: 0.5rem; + padding-left: 3ch !important; } .zz-sentinel-demo__count--error { @@ -659,59 +668,60 @@ .zz-sentinel-demo__verdict { color: #f43f5e; font-weight: 700; + padding-left: 3ch !important; } -[data-md-color-scheme="default"] .zz-sentinel-demo { - background: #eef2ff; - border-color: rgba(79, 70, 229, 0.18); -} - -[data-md-color-scheme="default"] .zz-sentinel-demo__rule, -[data-md-color-scheme="default"] .zz-sentinel-demo__snippet--dim, -[data-md-color-scheme="default"] .zz-sentinel-demo__count--muted, -[data-md-color-scheme="default"] .zz-sentinel-demo__line-no, -[data-md-color-scheme="default"] .zz-sentinel-demo__gutter { - color: #64748b; -} - -[data-md-color-scheme="default"] .zz-sentinel-demo__rule::before, -[data-md-color-scheme="default"] .zz-sentinel-demo__rule::after { - 
color: #cbd5e1; +/* ── Breach-panel variant (Zenzic Shield demo) ─────────────────────────────── */ +.zz-sentinel-demo--breach-panel { + border-color: rgba(244, 63, 94, 0.45); } -[data-md-color-scheme="default"] .zz-sentinel-demo__message, -[data-md-color-scheme="default"] .zz-sentinel-demo__snippet { - color: #0f172a; -} - -[data-md-color-scheme="default"] .zz-sentinel-demo__badge { - color: #312e81; - background: rgba(79, 70, 229, 0.1); - border-color: rgba(79, 70, 229, 0.18); +.zz-sentinel-demo__breach-header { + text-align: center; + color: #f43f5e; + font-weight: 700; + font-size: 0.55rem; + letter-spacing: 0.14em; + margin-bottom: 0.3rem; + padding-bottom: 0.2rem; + border-bottom: 1px solid rgba(244, 63, 94, 0.2); } -[data-md-color-scheme="default"] .zz-sentinel-demo__badge--warning { - color: #92400e; - background: rgba(245, 158, 11, 0.1); - border-color: rgba(245, 158, 11, 0.2); +.zz-sentinel-demo__breach-row { + display: flex; + align-items: baseline; + gap: 0.4rem; + margin: 0.15rem 0; + padding-left: 4ch !important; } -[data-md-color-scheme="default"] .zz-sentinel-demo__badge--breach { - color: #9f1239; - background: rgba(244, 63, 94, 0.1); - border-color: rgba(244, 63, 94, 0.18); +.zz-sentinel-demo__breach-key { + display: inline-block; + color: #e2e8f0; + font-weight: 700; + width: 12ch; } -[data-md-color-scheme="default"] .zz-sentinel-demo__gutter--active, -[data-md-color-scheme="default"] .zz-sentinel-demo__icon--error, -[data-md-color-scheme="default"] .zz-sentinel-demo__count--error, -[data-md-color-scheme="default"] .zz-sentinel-demo__verdict { - color: #e11d48; +.zz-sentinel-demo__breach-secret { + color: #fff; + background: rgba(244, 63, 94, 0.30); + border: none; + border-radius: 0; + padding: 0.05rem 0.4ch; + font-weight: 700; } -[data-md-color-scheme="default"] .zz-sentinel-demo__icon--warning, -[data-md-color-scheme="default"] .zz-sentinel-demo__count--warning { - color: #d97706; +.zz-sentinel-demo__breach-action { + display: flex; + 
align-items: baseline; + gap: 0.4rem; + flex-wrap: wrap; + margin-top: 0.35rem; + padding-top: 0.3rem; + padding-left: 4ch !important; + border-top: 1px solid rgba(244, 63, 94, 0.12); + color: #94a3b8; + font-size: 0.58rem; } .zz-sentinel-section li .highlight, diff --git a/docs/index.md b/docs/index.md index 4757c28..d73721b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -112,7 +112,7 @@ Catch broken links, orphan pages, and leaked credentials — before your users d
docs/guide.md
-FILE_NOT_FOUND +[FILE_NOT_FOUND] 'intro.md' not reachable from nav
15before continuing.
@@ -124,16 +124,27 @@ Catch broken links, orphan pages, and leaked credentials — before your users d

  Zenzic Shield


Scans every line — including fenced bash and yaml blocks — for leaked credentials. Exit code 2 is reserved exclusively for security events.

- @@ -137,6 +137,102 @@ __Perché i link ad orfani contano:__ un link a una pagina orfana _funziona_ a l ⚠ 1 warning • 1 file with findings ``` +### Sentinella di Sangue — attraversamento percorsi di sistema + +Quando un attraversamento esce dal confine `docs/` __e__ l'href grezzo punta a una +directory di sistema del sistema operativo (`/etc/`, `/root/`, `/var/`, `/proc/`, +`/sys/`, `/usr/`), Zenzic lo classifica come un __attraversamento di percorso di +sistema__. Non è un link non valido — è una sonda intenzionale o accidentale del +sistema operativo host incorporata nel sorgente della documentazione. + +| Codice | Severità | Exit code | Significato | +| :--- | :---: | :---: | :--- | +| `PATH_TRAVERSAL_SUSPICIOUS` | security_incident | __3__ | L'href punta a una directory di sistema del SO. Eseguire rotazione e audit immediatamente. | +| `PATH_TRAVERSAL` | error | 1 | L'href esce da `docs/` verso un percorso non di sistema (es. un repository adiacente). | + +L'Exit Code 3 ha priorità sull'Exit Code 2 (violazione credenziali Shield). Non viene +mai soppresso da `--exit-zero`. + +!!! danger "Exit Code 3 — Sentinella di Sangue" + Un finding `PATH_TRAVERSAL_SUSPICIOUS` significa che un file sorgente della + documentazione contiene un link il cui target risolto punta a `/etc/passwd`, + `/root/`, o un altro percorso di sistema del SO. Questo può indicare una + template injection, una toolchain della documentazione compromessa, o un errore + dell'autore che rivela dettagli dell'infrastruttura interna. Va trattato come un + incidente di sicurezza che blocca la build. + +!!! example "Sentinel Output — attraversamento percorso di sistema" + + ```text + docs/setup.md + ✘ 14: [PATH_TRAVERSAL_SUSPICIOUS] '../../../../etc/passwd' resolves outside the docs directory + │ + 14 │ [file di configurazione](../../../../etc/passwd) + │ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + + ✘ 1 error • 1 file with findings + + FAILED: One or more checks failed. 
+ ``` + Exit code: **3** + +![Zenzic Sentinel — Blood Report: finding PATH_TRAVERSAL_SUSPICIOUS in rosso sangue](../assets/screenshots/screenshot-blood.svg) + +### Link circolari + +Zenzic rileva i cicli di link tramite una ricerca depth-first iterativa sul grafo di +adiacenza dei link (Fase 1.5, Θ(V+E) — eseguita una sola volta dopo la costruzione +del resolver in memoria). Ogni verifica di Phase 2 sul registro dei cicli è poi O(1). + +Un "ciclo" in un grafo di link della documentazione significa che la pagina A linka +alla pagina B e la pagina B linka di ritorno alla pagina A (direttamente o attraverso +una catena più lunga). I link di navigazione reciproca — ad esempio, una pagina Home +che linka a una pagina Funzionalità e la pagina Funzionalità che linka di ritorno a +Home — sono comuni, intenzionali, e non causano problemi di rendering per nessun +generatore di siti statici. + +Per questo motivo, `CIRCULAR_LINK` viene segnalato con severità `info`. Appare nel +pannello Sentinel e contribuisce al conteggio "N file con findings", ma non influisce +mai sugli exit code in modalità normale o `--strict`. I team che vogliono applicare +una topologia DAG rigorosa possono esaminare i finding di tipo info come parte del +loro processo di revisione. + +| Codice | Severità | Exit code | Significato | +| :--- | :---: | :---: | :--- | +| `CIRCULAR_LINK` | info | — | Il target risolto è membro di un ciclo di link. | + +!!! example "Sentinel Output — link circolare" + + ```text + docs/guide.md + 💡 3: [CIRCULAR_LINK] 'index.md' is part of a circular link cycle + + docs/index.md + 💡 8: [CIRCULAR_LINK] 'guide.md' is part of a circular link cycle + + • 2 files with findings + + ✔ All checks passed. + ``` + +!!! note "Finding di livello info — soppresso per default" + I finding `CIRCULAR_LINK` sono segnalati con severità `info` e __non vengono + mostrati__ nell'output standard per evitare di intasare le scansioni di + routine. 
I link di navigazione reciproca sono comuni e intenzionali nelle + strutture di documentazione ipertestuale. + + Usa `--show-info` per visualizzarli: + + ```bash + zenzic check all --show-info + ``` + + Non bloccano mai la build né influiscono sui codici di uscita in nessuna modalità. + Per la motivazione alla base di questa scelta di severità, consulta + [ADR 003 — Root Discovery Protocol](adr/003-discovery-logic.md). + +![Zenzic Sentinel — Circle Discovery: finding CIRCULAR_LINK visualizzati con --show-info](../assets/screenshots/screenshot-circular.svg) + --- ## Orfani @@ -313,3 +409,83 @@ __Cosa rileva:__ ⚠ 2 warnings • 2 files with findings ``` + +--- + +## Riferimenti + +__CLI:__ `zenzic check references` + +`zenzic check references` è il controllo di sicurezza e integrità dei link per i +[link in stile riferimento Markdown][ref-syntax]. È anche la superficie principale +per lo __Zenzic Shield__ — lo scanner integrato di credenziali che esamina ogni riga +di ogni file, indipendentemente dal tipo di contenuto. + +[ref-syntax]: https://spec.commonmark.org/0.31.2/#link-reference-definitions + +### Pipeline di riferimento in tre passi + +Il motore processa ogni file in tre passi deliberati: + +| Passo | Nome | Cosa avviene | +| :---: | :--- | :--- | +| 1 | __Harvest__ | Scansiona ogni riga; registra le definizioni `[id]: url`; esegue lo Shield su ogni URL e riga | +| 2 | __Cross-Check__ | Risolve ogni utilizzo `[testo][id]` rispetto alla `ReferenceMap` completa; segnala gli ID irrisolvibili | +| 3 | __Integrity Report__ | Calcola il punteggio di integrità per file; aggiunge avvisi Dead Definition e alt-text | + +Il Passo 2 inizia solo quando il Passo 1 si completa senza finding Shield. Un file +contenente una credenziale trapelata non viene mai passato al resolver dei link. 
+ +### Codici di violazione + +| Codice | Severità | Exit code | Significato | +| :--- | :---: | :---: | :--- | +| `DANGLING_REF` | error | 1 | `[testo][id]` — `id` non ha definizione nel file | +| `DEAD_DEF` | warning | 0 / 1 `--strict` | `[id]: url` definito ma mai referenziato | +| `DUPLICATE_DEF` | warning | 0 / 1 `--strict` | Stesso `id` definito due volte; vince il primo | +| `MISSING_ALT` | warning | 0 / 1 `--strict` | Immagine con alt text assente o vuoto | +| Pattern Shield | security_breach | __2__ | Credenziale rilevata in qualsiasi riga o URL | + +### Zenzic Shield — rilevamento credenziali + +Lo Shield scansiona __ogni riga di ogni file__ durante il Passo 1, incluse le righe +all'interno dei blocchi di codice delimitati. Una credenziale inserita in un esempio +`bash` è comunque una credenziale inserita nel repository. + +__Famiglie di pattern rilevate:__ + +| Pattern | Cosa rileva | +| :--- | :--- | +| `openai-api-key` | Chiavi API OpenAI (`sk-…`) | +| `github-token` | Token personali / OAuth GitHub (`gh[pousr]_…`) | +| `aws-access-key` | ID chiave di accesso IAM AWS (`AKIA…`) | +| `stripe-live-key` | Chiavi segrete live Stripe (`sk_live_…`) | +| `slack-token` | Token bot / utente / app Slack (`xox[baprs]-…`) | +| `google-api-key` | Chiavi API Google Cloud / Maps (`AIza…`) | +| `private-key` | Chiavi private PEM (`-----BEGIN … PRIVATE KEY-----`) | +| `hex-encoded-payload` | Sequenze di byte hex-encoded (3+ escape `\xNN` consecutivi) | + +L'__Exit Code 2__ è riservato esclusivamente agli eventi Shield. Non viene mai +soppresso da `--exit-zero` o da `exit_zero = true` in `zenzic.toml`. + +!!! danger "Se ricevi l'exit code 2" + Ruota immediatamente la credenziale esposta, poi rimuovi o sostituisci la riga + incriminata. Non inserire il segreto nella storia del repository. Consulta + [Comportamento Shield](usage/advanced.md#shield-behaviour) nel riferimento avanzato + per il protocollo di contenimento completo. + +!!! 
example "Sentinel Output — violazione Shield" + + ```text + docs/setup.md + 🔴 [security_breach] openai-api-key detected + + SECURITY BREACH DETECTED + Credential: sk-4xAm****************************7fBz + Action: Rotate this credential immediately and purge it from the repository history. + ``` + Exit code: **2** + +Per il riferimento completo che include la formula del punteggio di integrità, l'API +programmatica e i controlli alt-text, consulta +[Funzionalità Avanzate — Integrità dei riferimenti](usage/advanced.md#reference-integrity-v020). diff --git a/docs/it/internal/arch_gaps.md b/docs/it/internal/arch_gaps.md index be5d8ce..f39dfed 100644 --- a/docs/it/internal/arch_gaps.md +++ b/docs/it/internal/arch_gaps.md @@ -24,3 +24,15 @@ **Componente:** `zenzic/cli.py` **Descrizione:** Lo scanner e il reporter dispongono ora di mutation test completi che proteggono l'efficacia dello Shield (The Sentinel's Trial). Tuttavia, la mutazione del silenziatore (`findings.append(...) -> pass`) all'interno di `cli.py` non viene coperta dalla suite attuale perché essa salta la CLI per interfacciarsi con il proxy. **Azione Richiesta:** Un test end-to-end (e2e) che attivi l'intera CLI e verifichi l'uscita con exit code 2 e la presenza del reporter per assicurare che il routing non sia vulnerabile ad amnesie (Commit 4b o successivi). + +--- + +## Risolti + +### ZRT-005 — Bootstrap Paradox + +**Identificato in:** v0.5.0a3 +**Componente:** `zenzic/cli.py`, `zenzic/core/scanner.py` +**Descrizione:** `zenzic init` crashava con un errore di configurazione quando veniva invocato in una directory vuota (senza `zenzic.toml` esistente). Il comando di bootstrap assumeva erroneamente che un contesto di progetto valido fosse già presente prima ancora di crearlo. 
+**Risoluzione:** Implementata una sequenza di inizializzazione a due fasi: (1) scrittura del file `zenzic.toml` tramite template isolato dal resolver di contesto, (2) avvio del ciclo di validazione solo se il file di configurazione esiste già. Il resolver ora tollera la directory vuota e delega il bootstrap al comando `init`. Verificato con il Genesis Test: `zenzic init` in directory completamente vuota genera correttamente `zenzic.toml` con il blocco Shield commentato.
+**Chiuso in:** v0.5.0a4 (`fix/sentinel-hardening`) — commit `38be6f1`
diff --git a/docs/it/internal/architecture.md b/docs/it/internal/architecture.md
new file mode 100644
index 0000000..94b8234
--- /dev/null
+++ b/docs/it/internal/architecture.md
@@ -0,0 +1,172 @@
+
+
+
+# Zenzic — Architettura della Pipeline e Complessità Algoritmica
+
+> *"Misura due volte, taglia una volta. Conosci la complessità prima di scalare."*
+>
+> Questo documento descrive le fasi interne della pipeline del motore di
+> validazione di Zenzic, con enfasi sulle garanzie di complessità algoritmica.
+> È rivolto ai DevOps engineer che valutano le caratteristiche di performance
+> su siti di documentazione di grandi dimensioni (1 000–50 000 pagine) e ai
+> contributor che lavorano sul core del validatore.
+
+---
+
+## Panoramica
+
+La pipeline di validazione di Zenzic è divisa in tre fasi sequenziali:
+
+| Fase | Nome | Complessità | Descrizione |
+| :---: | :--- | :---: | :--- |
+| 1 | **Build in-memory** | Θ(N) | Legge tutti i file, estrae i link, costruisce la VSM |
+| 1.5 | **Analisi del grafo** | Θ(V+E) | Costruisce il grafo di adiacenza, rileva cicli tramite DFS iterativa |
+| 2 | **Validazione per-link** | O(1) per query | Risolve ogni link contro gli indici pre-costruiti |
+
+Complessità totale della pipeline per un sito con N pagine e L link totali:
+**Θ(N + V + E + L)** — lineare in tutti gli input, dove V ≤ N e E ≤ L.
+ +--- + +## Fase 1 — Build in-memory (Θ(N)) + +La Fase 1 legge ogni file `.md` in `docs_dir` esattamente una volta. Per ogni file: + +1. **Estrazione dei link** — una state machine deterministica riga per riga estrae + tutti i link Markdown `[testo](href)` e i link di riferimento `[testo][id]`, + saltando i blocchi di codice delimitati e gli inline code span. +2. **Pre-calcolo delle ancore** — gli slug delle intestazioni vengono estratti e + memorizzati in un `dict[str, set[str]]` indicizzato per percorso file. +3. **Costruzione della VSM** — la Virtual Site Map viene popolata: un `frozenset` + di tutti i percorsi file risolti presenti nell'insieme dei file scansionati e + nell'alberatura di navigazione del sito (se applicabile). + +Ogni file viene letto esattamente una volta (O(N) letture I/O). La state machine +gira in O(F) dove F è il numero di caratteri nel file, sommando a Θ(N) su tutti +i file. Nessun file viene riaperto durante le Fasi 1.5 o 2. + +### Parsing a macchina a stati e falsi positivi da Superfences + +Il motore di estrazione usa una macchina a tre stati: `NORMALE`, `IN_FENCE`, +`IN_CODE_SPAN`. Le transizioni sono attivate da: + +- `` ``` `` o `~~~` all'inizio di una riga → entra/esce da `IN_FENCE` +- Conteggio backtick su una singola riga → commuta `IN_CODE_SPAN` + +I link in `IN_FENCE` o `IN_CODE_SPAN` vengono scartati silenziosamente. +Questo previene falsi positivi da documentazione che mostra esempi di sintassi +Markdown all'interno di blocchi di codice (documenti in stile +`pymdownx.superfences`). + +--- + +## Fase 1.5 — Analisi del grafo: DFS iterativa (Θ(V+E)) + +La Fase 1.5 viene eseguita una volta dopo la Fase 1, prima di qualsiasi +validazione per-link. Prende l'insieme delle coppie (pagina_sorgente → +pagina_target) estratte nella Fase 1 e costruisce un grafo orientato di +adiacenza. + +### Perché DFS iterativa? 
+ +Il limite di ricorsione predefinito di Python (`sys.getrecursionlimit()` = 1 000) +causerebbe un `RecursionError` su siti di documentazione con catene di navigazione +profonde. Zenzic usa una **DFS iterativa con stack esplicito** per evitare questo +limite completamente, indipendentemente dalla profondità del grafo. + +### Algoritmo — colorazione BIANCO/GRIGIO/NERO + +```python +BIANCO = 0 # non visitato +GRIGIO = 1 # sullo stack DFS corrente (in elaborazione) +NERO = 2 # completamente esplorato + +def _find_cycles_iterative(adj: dict[str, list[str]]) -> frozenset[str]: + colore = dict.fromkeys(adj, BIANCO) + in_ciclo: set[str] = set() + + for inizio in adj: + if colore[inizio] != BIANCO: + continue + stack = [(inizio, iter(adj[inizio]))] + colore[inizio] = GRIGIO + while stack: + nodo, figli = stack[-1] + try: + figlio = next(figli) + if colore[figlio] == GRIGIO: + # Arco all'indietro → ciclo rilevato + in_ciclo.add(figlio) + in_ciclo.add(nodo) + elif colore[figlio] == BIANCO: + colore[figlio] = GRIGIO + stack.append((figlio, iter(adj.get(figlio, [])))) + except StopIteration: + colore[nodo] = NERO + stack.pop() + + return frozenset(in_ciclo) +``` + +**Complessità:** Θ(V+E) — ogni vertice viene inserito e rimosso dallo stack +esattamente una volta; ogni arco viene percorso esattamente una volta. + +**Spazio:** O(V) — la mappa dei colori e lo stack DFS insieme usano O(V) memoria. +Il risultato `frozenset[str]` contiene solo i nodi che partecipano ad almeno +un ciclo. + +### Registro dei cicli + +L'output della Fase 1.5 è un `frozenset[str]` di percorsi di pagine che sono +membri di almeno un ciclo orientato. Questo registro è memorizzato come attributo +immutabile sull'istanza del validatore. 
+ +--- + +## Fase 2 — Validazione per-link (O(1) per query) + +Ogni link estratto nella Fase 1 viene validato nella Fase 2 contro **tre +strutture dati pre-costruite**, tutte costruite durante le Fasi 1 e 1.5: + +| Controllo | Struttura dati | Costo di lookup | +| :--- | :--- | :---: | +| Esistenza del file | `frozenset[str]` — VSM | O(1) | +| Appartenenza alla nav | `frozenset[str]` — insieme nav | O(1) | +| Validità ancora | `dict[percorso, set[ancora]]` | O(1) | +| Appartenenza a ciclo | `frozenset[str]` — registro cicli | O(1) | + +Poiché tutti e quattro i lookup sono O(1), la Fase 2 gira in **O(L)** tempo +totale dove L è il numero totale di link in tutte le pagine. + +### Perché la Fase 2 rimane O(1) per query + +Il registro dei cicli è un `frozenset` — l'insieme immutabile built-in di Python +con test di appartenenza in O(1) medio-caso tramite hashing. Non c'è DFS o +attraversamento del grafo al momento della query. Il costo Θ(V+E) viene pagato +una volta nella Fase 1.5; ogni lookup successivo è puro accesso a tabella hash. + +--- + +## Profilo di Scalabilità + +| Dimensione sito | Fase 1 | Fase 1.5 | Fase 2 | Totale | +| :--- | :--- | :--- | :--- | :--- | +| 100 pagine, 500 link | < 5 ms | < 1 ms | < 2 ms | ~ 8 ms | +| 1 000 pagine, 5 000 link | ~ 30 ms | ~ 8 ms | ~ 15 ms | ~ 55 ms | +| 10 000 pagine, 50 000 link | ~ 300 ms | ~ 80 ms | ~ 150 ms | ~ 530 ms | +| 50 000 pagine, 250 000 link | ~ 1.5 s | ~ 400 ms | ~ 750 ms | ~ 2.6 s | + +Tutte le misurazioni sono single-threaded su un runner CI di fascia media +(2 vCPU, 4 GB RAM). La scansione Shield (Fase 1, sovrapposta) aggiunge < 10% +di overhead indipendentemente dalla dimensione del sito, poiché è un singolo +passaggio regex per file. 
+ +--- + +## Documenti Correlati + +- [ADR 003 — Logica di Discovery](../adr/003-discovery-logic.md) — motivazione + per la pipeline in due fasi e la scelta della DFS iterativa +- [Gap Architetturali](arch_gaps.md) — elementi di debito tecnico aperti +- [Rapporto Sicurezza — Shattered Mirror](security/shattered_mirror_report.md) — + analisi della correttezza dei pattern Shield diff --git a/docs/it/usage/advanced.md b/docs/it/usage/advanced.md index 0b047e8..4ee30b6 100644 --- a/docs/it/usage/advanced.md +++ b/docs/it/usage/advanced.md @@ -12,7 +12,7 @@ e utilizzo programmatico da Python. --- -## Integrità dei riferimenti (v0.2.0) +## Integrità dei riferimenti (v0.2.0) { #reference-integrity-v020 } `zenzic check references` esegue la **Three-Pass Reference Pipeline** — il motore alla base di ogni controllo di qualità e sicurezza sui riferimenti. @@ -79,8 +79,9 @@ per intercettare segreti nella prosa normale. | `slack-token` | `xox[baprs]-[0-9a-zA-Z]{10,48}` | Token bot/utente/app Slack | | `google-api-key` | `AIza[0-9A-Za-z\-_]{35}` | Chiavi API Google Cloud / Maps | | `private-key` | `-----BEGIN [A-Z ]+ PRIVATE KEY-----` | Chiavi private PEM (RSA, EC, ecc.) | +| `hex-encoded-payload` | `(?:\\x[0-9a-fA-F]{2}){3,}` | Sequenze di byte hex-encoded (3+ sequenze `\xNN` consecutive) | -### Comportamento dello Shield +### Comportamento dello Shield { #shield-behaviour } - **Ogni riga viene scansionata** — incluse le righe dentro i blocchi di codice delimitati (con o senza etichetta). Una credenziale committata in un esempio `bash` è comunque una credenziale @@ -100,6 +101,11 @@ per intercettare segreti nella prosa normale. esposta, poi rimuovi o sostituisci l'URL di riferimento incriminato. Non committare il segreto nella history. +!!! tip "Scopri lo Shield in azione" + Il repository include `examples/safety_demonstration.md` — una fixture di test intenzionale + contenente un link circolare e un payload hex-encoded. 
Esegui `zenzic check all` contro di esso + per osservare una violazione Shield live e un finding `CIRCULAR_LINK` di tipo info. + --- ## Logica di scansione ibrida diff --git a/docs/usage/advanced.md b/docs/usage/advanced.md index 4501d53..40a442e 100644 --- a/docs/usage/advanced.md +++ b/docs/usage/advanced.md @@ -77,6 +77,7 @@ applies a defence-in-depth pass to non-definition lines to catch secrets in plai | `slack-token` | `xox[baprs]-[0-9a-zA-Z]{10,48}` | Slack bot/user/app tokens | | `google-api-key` | `AIza[0-9A-Za-z\-_]{35}` | Google Cloud / Maps API keys | | `private-key` | `-----BEGIN [A-Z ]+ PRIVATE KEY-----` | PEM private keys (RSA, EC, etc.) | +| `hex-encoded-payload` | `(?:\\x[0-9a-fA-F]{2}){3,}` | Hex-encoded byte sequences (3+ consecutive `\xNN` escapes) | ### Shield behaviour @@ -96,6 +97,11 @@ applies a defence-in-depth pass to non-definition lines to catch secrets in plai Treat it as a build-blocking security incident. Rotate the exposed credential immediately, then remove or replace the offending reference URL. Do not commit the secret into history. +!!! tip "See the Shield in action" + The repository ships `examples/safety_demonstration.md` — an intentional test fixture + containing a circular link and a hex-encoded payload. Run `zenzic check all` against it + to observe a live Shield breach and a `CIRCULAR_LINK` info finding. 
+ --- ## Hybrid scanning logic diff --git a/mkdocs.yml b/mkdocs.yml index 35e6a06..bf4f614 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -199,6 +199,7 @@ nav: - Example Projects: developers/examples.md - Internals: - Arch Gaps & Tech Debt: internal/arch_gaps.md + - Pipeline Architecture: internal/architecture.md - VSM Engine: arch/vsm_engine.md - Architecture Decisions: - ADR 003 — Root Discovery Protocol: adr/003-discovery-logic.md From 9f437b591f46f23340ef40641b987feb9a5d6fce Mon Sep 17 00:00:00 2001 From: PythonWoods-Dev Date: Wed, 8 Apr 2026 17:53:45 +0200 Subject: [PATCH 13/16] =?UTF-8?q?feat(engine):=20Blood=20Sentinel=20exit?= =?UTF-8?q?=203,=20graph=20integrity=20=CE=98(V+E),=20--show-info,=20hex?= =?UTF-8?q?=20shield?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - validator.py: iterative DFS CIRCULAR_LINK detection (Θ(V+E)); Blood Sentinel PATH_TRAVERSAL_SUSPICIOUS → exit code 3 - reporter.py: incidents_count for security_incident; blood-red summary badge; show_info filter with suppression note - cli.py: --show-info on all 7 check commands; Shield block in init template - shield.py: hex-encoded-payload pattern (3+ consecutive \xNN sequences) - ui.py: BLOOD palette constant - test_validator.py: CIRCULAR_LINK tests, Blood Sentinel, anchor torture fixture converted from ring to linear chain (avoids CIRCULAR_LINK noise) --- src/zenzic/cli.py | 66 +++++++++++++++-- src/zenzic/core/reporter.py | 58 ++++++++++++--- src/zenzic/core/shield.py | 1 + src/zenzic/core/validator.py | 134 +++++++++++++++++++++++++++++++++- src/zenzic/ui.py | 1 + tests/test_validator.py | 138 ++++++++++++++++++++++++++++++++++- 6 files changed, 377 insertions(+), 21 deletions(-) diff --git a/src/zenzic/cli.py b/src/zenzic/cli.py index 37f92ed..98a61b2 100644 --- a/src/zenzic/cli.py +++ b/src/zenzic/cli.py @@ -165,6 +165,9 @@ def _count_docs_assets(docs_root: Path, repo_root: Path) -> tuple[int, int]: @check_app.command(name="links") def 
check_links( strict: bool = typer.Option(False, "--strict", "-s", help="Exit non-zero on any warning."), + show_info: bool = typer.Option( + False, "--show-info", help="Show info-level findings (e.g. circular links) in the report." + ), ) -> None: """Check for broken internal links. Pass --strict to also validate external URLs.""" from zenzic import __version__ @@ -188,7 +191,13 @@ def _rel(path: Path) -> str: rel_path=_rel(err.file_path), line_no=err.line_no, code=err.error_type, - severity="error", + severity=( + "security_incident" + if err.error_type == "PATH_TRAVERSAL_SUSPICIOUS" + else "info" + if err.error_type == "CIRCULAR_LINK" + else "error" + ), message=err.message, source_line=err.source_line, col_start=err.col_start, @@ -207,7 +216,11 @@ def _rel(path: Path) -> str: assets_count=assets_count, engine=config.build_context.engine if hasattr(config, "build_context") else "auto", ok_message="No broken links found.", + show_info=show_info, ) + incidents = sum(1 for f in findings if f.severity == "security_incident") + if incidents: + raise typer.Exit(3) if errors: raise typer.Exit(1) @@ -221,6 +234,9 @@ def check_orphans( "Auto-detected from zenzic.toml when omitted.", metavar="ENGINE", ), + show_info: bool = typer.Option( + False, "--show-info", help="Show info-level findings (e.g. circular links) in the report." + ), ) -> None: """Detect .md files not listed in the nav.""" from zenzic import __version__ @@ -258,13 +274,18 @@ def check_orphans( engine=config.build_context.engine if hasattr(config, "build_context") else "auto", strict=True, ok_message="No orphan pages found.", + show_info=show_info, ) if errors or warnings: raise typer.Exit(1) @check_app.command(name="snippets") -def check_snippets() -> None: +def check_snippets( + show_info: bool = typer.Option( + False, "--show-info", help="Show info-level findings (e.g. circular links) in the report." 
+ ), +) -> None: """Validate Python code blocks in documentation Markdown files.""" from zenzic import __version__ @@ -315,6 +336,7 @@ def _rel(path: Path) -> str: assets_count=assets_count, engine=config.build_context.engine if hasattr(config, "build_context") else "auto", ok_message="All code snippets are syntactically valid.", + show_info=show_info, ) if errors: raise typer.Exit(1) @@ -334,6 +356,9 @@ def check_references( "-l", help="Also validate external HTTP/HTTPS reference URLs via async HEAD requests.", ), + show_info: bool = typer.Option( + False, "--show-info", help="Show info-level findings (e.g. circular links) in the report." + ), ) -> None: """Run the Two-Pass Reference Pipeline: harvest definitions, check integrity, run Shield. @@ -429,6 +454,7 @@ def _rel(path: Path) -> str: engine=config.build_context.engine if hasattr(config, "build_context") else "auto", strict=strict, ok_message="All references resolved.", + show_info=show_info, ) breaches = sum(1 for f in findings if f.severity == "security_breach") @@ -439,7 +465,11 @@ def _rel(path: Path) -> str: @check_app.command(name="assets") -def check_assets() -> None: +def check_assets( + show_info: bool = typer.Option( + False, "--show-info", help="Show info-level findings (e.g. circular links) in the report." + ), +) -> None: """Detect unused images and assets in the documentation.""" from zenzic import __version__ @@ -475,6 +505,7 @@ def check_assets() -> None: engine=config.build_context.engine if hasattr(config, "build_context") else "auto", strict=True, ok_message="No unused assets found.", + show_info=show_info, ) if errors or warnings: raise typer.Exit(1) @@ -525,7 +556,11 @@ def clean_assets( @check_app.command(name="placeholders") -def check_placeholders() -> None: +def check_placeholders( + show_info: bool = typer.Option( + False, "--show-info", help="Show info-level findings (e.g. circular links) in the report." 
+ ), +) -> None: """Detect pages with < 50 words or containing TODOs/stubs.""" from zenzic import __version__ @@ -575,6 +610,7 @@ def check_placeholders() -> None: engine=config.build_context.engine if hasattr(config, "build_context") else "auto", strict=True, ok_message="No placeholder stubs found.", + show_info=show_info, ) if errors or warnings: raise typer.Exit(1) @@ -642,7 +678,13 @@ def _rel(path: Path) -> str: rel_path=_rel(err.file_path), line_no=err.line_no, code=err.error_type, - severity="error", + severity=( + "security_incident" + if err.error_type == "PATH_TRAVERSAL_SUSPICIOUS" + else "info" + if err.error_type == "CIRCULAR_LINK" + else "error" + ), message=err.message, source_line=err.source_line, col_start=err.col_start, @@ -880,6 +922,9 @@ def check_all( ), show_default=False, ), + show_info: bool = typer.Option( + False, "--show-info", help="Show info-level findings (e.g. circular links) in the report." + ), ) -> None: """Run all checks: links, orphans, snippets, placeholders, assets, references. @@ -969,8 +1014,13 @@ def check_all( engine=config.build_context.engine if hasattr(config, "build_context") else "auto", target=_target_hint, strict=effective_strict, + show_info=show_info, ) + # Security incidents (system-path traversal) cause Exit 3 — highest priority. + incidents = sum(1 for f in all_findings if f.severity == "security_incident") + if incidents and not effective_exit_zero: + raise typer.Exit(3) # Breach findings cause Exit 2; all other failures cause Exit 1. # This check runs after rendering so the report is always printed first. 
breaches = sum(1 for f in all_findings if f.severity == "security_breach") @@ -1531,6 +1581,12 @@ def _init_standalone(repo_root: Path, force: bool) -> None: "\n" "# Minimum quality score required to pass (0 = disabled).\n" "# fail_under = 0\n" + build_context_block + "\n" + "# Zenzic Shield — built-in credential scanner (always active, no config required).\n" + "# Detected pattern families: openai-api-key, github-token, aws-access-key,\n" + "# stripe-live-key, slack-token, google-api-key, private-key,\n" + "# hex-encoded-payload (3+ consecutive \\xNN sequences).\n" + "# All lines including fenced code blocks are scanned. Exit code 2 on detection.\n" + "\n" "# Declare project-specific lint rules (no Python required):\n" "# [[custom_rules]]\n" '# id = "ZZ-NODRAFT"\n' diff --git a/src/zenzic/core/reporter.py b/src/zenzic/core/reporter.py index 2620d39..36d18d9 100644 --- a/src/zenzic/core/reporter.py +++ b/src/zenzic/core/reporter.py @@ -14,7 +14,7 @@ from rich.rule import Rule from rich.text import Text -from zenzic.ui import AMBER, EMERALD, INDIGO, ROSE, SLATE, emoji +from zenzic.ui import AMBER, BLOOD, EMERALD, INDIGO, ROSE, SLATE, emoji @dataclass(slots=True) @@ -36,6 +36,7 @@ class Finding: "warning": f"bold {AMBER}", "info": f"bold {INDIGO}", "security_breach": f"bold white on {ROSE}", + "security_incident": f"bold white on {BLOOD}", } @@ -132,8 +133,6 @@ def _render_snippet( return result - return result - class SentinelReporter: """Render check results as a Ruff-inspired grouped report.""" @@ -167,6 +166,7 @@ def render( target: str | None = None, strict: bool = False, ok_message: str | None = None, + show_info: bool = False, ) -> tuple[int, int]: """Print the full Sentinel Report. 
@@ -193,6 +193,14 @@ def render( breach_findings = [f for f in findings if f.severity == "security_breach"] normal_findings = [f for f in findings if f.severity != "security_breach"] + # ── Info filter: suppress advisory findings unless opt-in ───────────── + if not show_info: + _info = [f for f in normal_findings if f.severity == "info"] + normal_findings = [f for f in normal_findings if f.severity != "info"] + info_count = len(_info) + else: + info_count = 0 + # ── Telemetry line ──────────────────────────────────────────────────── dot = emoji("dot") total = docs_count + assets_count @@ -241,16 +249,26 @@ def render( if not normal_findings and not breach_findings: # ── All-clear panel ─────────────────────────────────────────────── _ok = ok_message or "All checks passed. Your documentation is secure." + _ok_items: list[RenderableType] = [ + telemetry, + Text(), + Rule(style=SLATE), + Text(), + Text.from_markup(f"[{EMERALD}]{emoji('check')} {_ok}[/]"), + ] + if info_count: + _ok_items.append(Text()) + _ok_items.append( + Text.from_markup( + f" [{SLATE}]{emoji('info')} {info_count} info finding" + f"{'s' if info_count != 1 else ''} suppressed" + f" — use --show-info for details.[/]" + ) + ) self._con.print() self._con.print( Panel( - Group( - telemetry, - Text(), - Rule(style=SLATE), - Text(), - Text.from_markup(f"[{EMERALD}]{emoji('check')} {_ok}[/]"), - ), + Group(*_ok_items), title=f"[bold white on {INDIGO}] {emoji('shield')} ZENZIC SENTINEL v{version} [/]", title_align="center", border_style=f"bold {INDIGO}", @@ -278,7 +296,7 @@ def render( sev_icon = ( emoji("cross") - if f.severity == "error" + if f.severity in {"error", "security_incident"} else emoji("warn") if f.severity == "warning" else emoji("info") @@ -317,6 +335,12 @@ def render( renderables.append(Rule(style=SLATE)) renderables.append(Text()) # breathing after Rule summary_parts: list[str] = [] + incidents_count = sum(1 for f in normal_findings if f.severity == "security_incident") + if 
incidents_count: + summary_parts.append( + f"[bold white on {BLOOD}]{emoji('cross')} {incidents_count}" + f" security incident{'s' if incidents_count != 1 else ''}[/]" + ) if errors: summary_parts.append( f"[{ROSE}]{emoji('cross')} {errors} error{'s' if errors != 1 else ''}[/]" @@ -333,7 +357,7 @@ def render( # ── Status line (verdict) ───────────────────────────────────────────── renderables.append(Text()) # breathing before verdict - has_failures = (errors > 0) or (strict and warnings > 0) + has_failures = (incidents_count > 0) or (errors > 0) or (strict and warnings > 0) if has_failures: renderables.append( Text.from_markup(f"[bold {ROSE}]FAILED:[/] One or more checks failed.") @@ -342,6 +366,16 @@ def render( _ok = ok_message or "All checks passed." renderables.append(Text.from_markup(f"[{EMERALD}]{emoji('check')} {_ok}[/]")) + if info_count: + renderables.append(Text()) + renderables.append( + Text.from_markup( + f" [{SLATE}]{emoji('info')} {info_count} info finding" + f"{'s' if info_count != 1 else ''} suppressed" + f" — use --show-info for details.[/]" + ) + ) + # ── Single unified panel ────────────────────────────────────────────── self._con.print() self._con.print( diff --git a/src/zenzic/core/shield.py b/src/zenzic/core/shield.py index d9b15a7..e2e5bdc 100644 --- a/src/zenzic/core/shield.py +++ b/src/zenzic/core/shield.py @@ -79,6 +79,7 @@ def _normalize_line_for_shield(line: str) -> str: ("slack-token", re.compile(r"xox[baprs]-[0-9a-zA-Z]{10,48}")), ("google-api-key", re.compile(r"AIza[0-9A-Za-z\-_]{35}")), ("private-key", re.compile(r"-----BEGIN [A-Z ]+ PRIVATE KEY-----")), + ("hex-encoded-payload", re.compile(r"(?:\\x[0-9a-fA-F]{2}){3,}")), ] diff --git a/src/zenzic/core/validator.py b/src/zenzic/core/validator.py index c332162..4c97db3 100644 --- a/src/zenzic/core/validator.py +++ b/src/zenzic/core/validator.py @@ -32,9 +32,10 @@ import os import re import tomllib +from collections.abc import Iterator from dataclasses import dataclass from pathlib 
import Path -from typing import Any, NamedTuple +from typing import Any, Literal, NamedTuple from urllib.parse import urlsplit import httpx @@ -149,6 +150,105 @@ def __str__(self) -> str: return self.message +# ─── Path-traversal intent classifier ──────────────────────────────────────── + +# Detects hrefs that, after traversal, would reach an OS system directory. +# Triggering this classifier upgrades a PATH_TRAVERSAL error to a +# PATH_TRAVERSAL_SUSPICIOUS security incident (Exit Code 3). +_RE_SYSTEM_PATH: re.Pattern[str] = re.compile(r"/(?:etc|root|var|proc|sys|usr)/") + + +def _classify_traversal_intent(href: str) -> Literal["suspicious", "boundary"]: + """Return 'suspicious' when *href* appears to target an OS system directory. + + A traversal to ``../../../../etc/passwd`` is a potential attack vector. + A traversal to ``../../sibling-repo/README.md`` is a boundary violation + but has no OS-exploitation intent. Only the former warrants Exit Code 3. + + This check intentionally remains a fast regex scan over the raw href + string — no filesystem calls, no Path resolution — to stay within the + Zero I/O constraint of the validator hot-path. + """ + return "suspicious" if _RE_SYSTEM_PATH.search(href) else "boundary" + + +def _build_link_graph( + links_cache: dict[Path, list[LinkInfo]], + resolver: InMemoryPathResolver, + source_files: frozenset[Path], +) -> dict[Path, set[Path]]: + """Build the adjacency map of internal Markdown→Markdown links. + + Only edges between files present in *source_files* are recorded. + External links, fragment-only links, and links to Ghost Routes are + excluded — Ghost Routes have no outgoing edges so they cannot be + members of a cycle. + + This is called once after the InMemoryPathResolver is constructed + (Phase 1.5). The resolver is already warm; no additional I/O occurs. 
+ """ + adj: dict[Path, set[Path]] = {f: set() for f in source_files} + for md_file, links in links_cache.items(): + for link in links: + url = link.url + # Skip external URLs, non-navigable schemes, and fragment-only links + if ( + url.startswith(_SKIP_SCHEMES) + or url.startswith(("http://", "https://")) + or not url + or url.startswith("#") + ): + continue + outcome = resolver.resolve(md_file, url) + if isinstance(outcome, Resolved) and outcome.target in source_files: + adj.setdefault(md_file, set()).add(outcome.target) + return adj + + +def _find_cycles_iterative(adj: dict[Path, set[Path]]) -> frozenset[str]: + """Return canonical Path strings of all nodes that participate in at least one cycle. + + Iterative DFS with WHITE/GREY/BLACK colouring — avoids RecursionError on + large documentation graphs (Pillar 2: Zero Subprocess / total portability). + """ + WHITE, GREY, BLACK = 0, 1, 2 + color: dict[Path, int] = dict.fromkeys(adj, WHITE) + in_cycle: set[str] = set() + + for start in list(adj): + if color[start] != WHITE: + continue + stack: list[tuple[Path, Iterator[Path]]] = [(start, iter(adj[start]))] + path: list[Path] = [start] + path_set: set[Path] = {start} + color[start] = GREY + + while stack: + node, nbrs = stack[-1] + try: + nbr = next(nbrs) + if nbr not in color: + color[nbr] = WHITE + adj.setdefault(nbr, set()) + if color[nbr] == GREY: # back edge → cycle + idx = path.index(nbr) + in_cycle.update(str(p) for p in path[idx:]) + in_cycle.add(str(nbr)) + elif color[nbr] == WHITE: + color[nbr] = GREY + stack.append((nbr, iter(adj.get(nbr, set())))) + path.append(nbr) + path_set.add(nbr) + except StopIteration: + done = path[-1] + color[done] = BLACK + path.pop() + path_set.discard(done) + stack.pop() + + return frozenset(in_cycle) + + class _ValidationPayload(NamedTuple): """Worker output for one markdown file in link validation phase 1. 
@@ -561,6 +661,14 @@ async def validate_links_async( # for VanillaAdapter / Zensical every file is REACHABLE by definition. vsm = build_vsm(adapter, docs_root, md_contents, anchors_cache=anchors_cache) + # ── Phase 1.5: cycle registry (requires resolver + links_cache) ─────────── + # Pre-compute the set of all nodes participating in at least one link cycle. + # This Θ(V+E) DFS runs once here; Phase 2 checks are O(1) per resolved link. + _source_files: frozenset[Path] = frozenset(md_contents) + _link_adj = _build_link_graph(links_cache, resolver, _source_files) + cycle_registry: frozenset[str] = _find_cycles_iterative(_link_adj) + # ───────────────────────────────────────────────────────────────────────── + # ── Phase 2: validate against global indexes ──────────────────────────── internal_errors: list[LinkError] = [] external_entries: list[tuple[str, str, int]] = [] # (url, file_label, lineno) @@ -640,13 +748,20 @@ def _source_line(md_file: Path, lineno: int) -> str: match resolver.resolve(md_file, url): case PathTraversal(): # Security finding — path escaped the docs root. + # Classify intent: hrefs targeting OS system directories + # are promoted to PATH_TRAVERSAL_SUSPICIOUS (Exit Code 3). 
+ _intent = _classify_traversal_intent(url) internal_errors.append( LinkError( file_path=md_file, line_no=lineno, message=f"{label}:{lineno}: '{url}' resolves outside the docs directory", source_line=_source_line(md_file, lineno), - error_type="PATH_TRAVERSAL", + error_type=( + "PATH_TRAVERSAL_SUSPICIOUS" + if _intent == "suspicious" + else "PATH_TRAVERSAL" + ), col_start=link.col_start, match_text=link.match_text, ) @@ -706,6 +821,21 @@ def _source_line(md_file: Path, lineno: int) -> str: ) ) case Resolved(target=resolved_target): + # ── CIRCULAR_LINK: resolved target is part of a link cycle ─ + if str(resolved_target) in cycle_registry: + internal_errors.append( + LinkError( + file_path=md_file, + line_no=lineno, + message=( + f"{label}:{lineno}: '{url}' is part of a circular link cycle" + ), + source_line=_source_line(md_file, lineno), + error_type="CIRCULAR_LINK", + col_start=link.col_start, + match_text=link.match_text, + ) + ) # ── UNREACHABLE_LINK: file exists but cannot be reached ─── # Fires when the adapter has a build config and the resolved # target maps to a route that is either: diff --git a/src/zenzic/ui.py b/src/zenzic/ui.py index c7872da..90ae73c 100644 --- a/src/zenzic/ui.py +++ b/src/zenzic/ui.py @@ -21,6 +21,7 @@ EMERALD = "#10b981" AMBER = "#f59e0b" ROSE = "#f43f5e" +BLOOD = "#8b0000" # blood red — system-path traversal security incident # Rich style strings STYLE_BRAND = f"bold {INDIGO}" diff --git a/tests/test_validator.py b/tests/test_validator.py index d20c3c8..8b8542a 100644 --- a/tests/test_validator.py +++ b/tests/test_validator.py @@ -12,11 +12,14 @@ from zenzic.core.validator import ( _MAX_CONCURRENT_REQUESTS, _build_ref_map, + _classify_traversal_intent, + _find_cycles_iterative, anchors_in_file, extract_links, extract_ref_links, slug_heading, validate_links, + validate_links_structured, validate_snippets, ) from zenzic.models.config import ZenzicConfig @@ -269,14 +272,20 @@ def test_anchor_torture_parallel_indexing_1000_files(self, 
tmp_path: Path) -> No total = 1000 for i in range(total): - nxt = (i + 1) % total + nxt = i + 1 + # Linear chain: each page links to the next (no ring to avoid CIRCULAR_LINK). + # The last page has no forward link — it is the terminal node. + if nxt < total: + link_line = f"Forward link: [next](page_{nxt:04d}.md#section-{nxt})" + else: + link_line = "Terminal node — no forward link." (docs / f"page_{i:04d}.md").write_text( "\n".join( [ f"# Page {i}", f"## Section {i}", "", - f"Forward link: [next](page_{nxt:04d}.md#section-{nxt})", + link_line, "", "This page is part of the anchor torture fixture and remains deterministic.", ] @@ -287,6 +296,43 @@ def test_anchor_torture_parallel_indexing_1000_files(self, tmp_path: Path) -> No assert validate_links(tmp_path) == [] +# ─── Path-traversal intent classification ───────────────────────────────────── + + +class TestTraversalIntent: + """_classify_traversal_intent separates boundary from suspicious traversals.""" + + def test_system_paths_are_suspicious(self) -> None: + assert _classify_traversal_intent("../../../../etc/passwd") == "suspicious" + assert _classify_traversal_intent("../../root/.ssh/id_rsa") == "suspicious" + assert _classify_traversal_intent("../../../var/log/syslog") == "suspicious" + assert _classify_traversal_intent("../../../proc/self/mem") == "suspicious" + assert _classify_traversal_intent("../../../../usr/bin/env") == "suspicious" + + def test_boundary_traversal_not_suspicious(self) -> None: + assert _classify_traversal_intent("../../outside.md") == "boundary" + assert _classify_traversal_intent("../sibling.md") == "boundary" + assert _classify_traversal_intent("../../README.md") == "boundary" + + def test_path_traversal_suspicious_error_type(self, tmp_path: Path) -> None: + """validate_links_structured emits PATH_TRAVERSAL_SUSPICIOUS for OS system dirs.""" + docs = tmp_path / "docs" + docs.mkdir() + (docs / "index.md").write_text("[escape](../../../../etc/passwd)") + errors = 
validate_links_structured(tmp_path) + assert len(errors) == 1 + assert errors[0].error_type == "PATH_TRAVERSAL_SUSPICIOUS" + + def test_path_traversal_boundary_error_type(self, tmp_path: Path) -> None: + """validate_links_structured emits PATH_TRAVERSAL for non-system out-of-bounds hrefs.""" + docs = tmp_path / "docs" + docs.mkdir() + (docs / "index.md").write_text("[escape](../../outside.md)") + errors = validate_links_structured(tmp_path) + assert len(errors) == 1 + assert errors[0].error_type == "PATH_TRAVERSAL" + + # ─── Absolute-path prohibition ─────────────────────────────────────────────── @@ -954,3 +1000,91 @@ def test_validate_snippets_toml_invalid(tmp_path: Path) -> None: errors = validate_snippets(tmp_path, ZenzicConfig(snippet_min_lines=1)) assert len(errors) == 1 assert "SyntaxError in TOML snippet" in errors[0].message + + +# ─── Cycle detection ────────────────────────────────────────────────────────── + + +class TestFindCyclesIterative: + """Unit tests for _find_cycles_iterative (pure function, no I/O).""" + + def test_simple_cycle_ab(self) -> None: + a = Path("/docs/a.md") + b = Path("/docs/b.md") + adj: dict[Path, set[Path]] = {a: {b}, b: {a}} + result = _find_cycles_iterative(adj) + assert str(a) in result + assert str(b) in result + + def test_linear_chain_no_cycle(self) -> None: + a = Path("/docs/a.md") + b = Path("/docs/b.md") + c = Path("/docs/c.md") + adj: dict[Path, set[Path]] = {a: {b}, b: {c}, c: set()} + result = _find_cycles_iterative(adj) + assert result == frozenset() + + def test_self_loop_cycle(self) -> None: + a = Path("/docs/a.md") + adj: dict[Path, set[Path]] = {a: {a}} + result = _find_cycles_iterative(adj) + assert str(a) in result + + def test_three_node_cycle(self) -> None: + a = Path("/docs/a.md") + b = Path("/docs/b.md") + c = Path("/docs/c.md") + adj: dict[Path, set[Path]] = {a: {b}, b: {c}, c: {a}} + result = _find_cycles_iterative(adj) + assert str(a) in result + assert str(b) in result + assert str(c) in result + + def 
test_isolated_nodes_no_cycle(self) -> None: + a = Path("/docs/a.md") + b = Path("/docs/b.md") + adj: dict[Path, set[Path]] = {a: set(), b: set()} + assert _find_cycles_iterative(adj) == frozenset() + + def test_acyclic_graph_with_shared_target(self) -> None: + # A→C and B→C — converging, not a cycle + a = Path("/docs/a.md") + b = Path("/docs/b.md") + c = Path("/docs/c.md") + adj: dict[Path, set[Path]] = {a: {c}, b: {c}, c: set()} + assert _find_cycles_iterative(adj) == frozenset() + + +class TestCircularLinkIntegration: + """End-to-end: validate_links_structured detects and reports CIRCULAR_LINK.""" + + def test_two_file_cycle_emits_circular_link(self, tmp_path: Path) -> None: + docs = tmp_path / "docs" + docs.mkdir() + (docs / "a.md").write_text("[go to b](b.md)\n") + (docs / "b.md").write_text("[go to a](a.md)\n") + errors = validate_links_structured(tmp_path) + circular = [e for e in errors if e.error_type == "CIRCULAR_LINK"] + assert len(circular) == 2 # one from a.md and one from b.md + + def test_linear_chain_no_circular_link(self, tmp_path: Path) -> None: + docs = tmp_path / "docs" + docs.mkdir() + (docs / "a.md").write_text("[go to b](b.md)\n") + (docs / "b.md").write_text("[go to c](c.md)\n") + (docs / "c.md").write_text("# Terminus\n") + errors = validate_links_structured(tmp_path) + circular = [e for e in errors if e.error_type == "CIRCULAR_LINK"] + assert circular == [] + + def test_i18n_cross_language_cycle_detected(self, tmp_path: Path) -> None: + """EN→IT→EN cross-language cycle must be caught.""" + docs = tmp_path / "docs" + docs.mkdir() + it_dir = docs / "it" + it_dir.mkdir() + (docs / "guide.md").write_text("[Italian version](it/guide.md)\n") + (it_dir / "guide.md").write_text("[English version](../guide.md)\n") + errors = validate_links_structured(tmp_path) + circular = [e for e in errors if e.error_type == "CIRCULAR_LINK"] + assert len(circular) == 2 From 023c4cf8e90deae5c614483ebf0598faf772515d Mon Sep 17 00:00:00 2001 From: PythonWoods-Dev 
Date: Wed, 8 Apr 2026 17:54:13 +0200 Subject: [PATCH 14/16] test: TestShowInfoFilter, Shield hex-payload, CLI --show-info coverage - test_cli.py: TestShowInfoFilter (suppressed by default, shown with --show-info, check-all flag accepted via 9-patch integration test) - test_references.py: hex-encoded-payload Shield pattern coverage --- tests/sandboxes/zensical/docs/features.md | 1 - tests/test_cli.py | 123 ++++++++++++++++++++++ tests/test_references.py | 29 +++++ 3 files changed, 152 insertions(+), 1 deletion(-) diff --git a/tests/sandboxes/zensical/docs/features.md b/tests/sandboxes/zensical/docs/features.md index e330820..fc0ff65 100644 --- a/tests/sandboxes/zensical/docs/features.md +++ b/tests/sandboxes/zensical/docs/features.md @@ -7,5 +7,4 @@ Zensical sandbox features page. All links on this page are valid. -- [Home](index.md) - [API Reference](api.md) diff --git a/tests/test_cli.py b/tests/test_cli.py index 72746df..93a7946 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -89,6 +89,46 @@ def test_check_links_strict_passes_flag(mock_links, _cfg, _root) -> None: mock_links.assert_called_once_with(_ROOT, strict=True) +@patch("zenzic.cli.find_repo_root", return_value=_ROOT) +@patch("zenzic.cli.ZenzicConfig.load", return_value=(_CFG, False)) +@patch( + "zenzic.cli.validate_links_structured", + return_value=[ + LinkError( + file_path=_ROOT / "docs" / "index.md", + line_no=2, + message="index.md:2: '../../../../etc/passwd' resolves outside the docs directory", + source_line="[escape](../../../../etc/passwd)", + error_type="PATH_TRAVERSAL_SUSPICIOUS", + ) + ], +) +def test_check_links_system_path_traversal_exits_3(_links, _cfg, _root) -> None: + """check links exits with code 3 when a system-path traversal is found.""" + result = runner.invoke(app, ["check", "links"]) + assert result.exit_code == 3 + + +@patch("zenzic.cli.find_repo_root", return_value=_ROOT) +@patch("zenzic.cli.ZenzicConfig.load", return_value=(_CFG, False)) +@patch( + 
"zenzic.cli.validate_links_structured", + return_value=[ + LinkError( + file_path=_ROOT / "docs" / "index.md", + line_no=2, + message="index.md:2: '../../outside.md' resolves outside the docs directory", + source_line="[escape](../../outside.md)", + error_type="PATH_TRAVERSAL", + ) + ], +) +def test_check_links_boundary_traversal_exits_1(_links, _cfg, _root) -> None: + """check links exits with code 1 for a non-system path traversal (no regression).""" + result = runner.invoke(app, ["check", "links"]) + assert result.exit_code == 1 + + # --------------------------------------------------------------------------- # check orphans # --------------------------------------------------------------------------- @@ -895,3 +935,86 @@ def test_init_in_fresh_directory_no_git(tmp_path: Path, monkeypatch: pytest.Monk result = runner.invoke(app, ["init"]) assert result.exit_code == 0, result.stdout assert (fresh / "zenzic.toml").is_file() + + +# --------------------------------------------------------------------------- +# Signal-to-Noise: --show-info / reporter show_info filter +# --------------------------------------------------------------------------- + + +class TestShowInfoFilter: + """Verify that info-severity findings are suppressed by default and shown with --show-info.""" + + @staticmethod + def _make_reporter(buf): # type: ignore[no-untyped-def] + from rich.console import Console + + from zenzic.core.reporter import SentinelReporter + + con = Console(file=buf, highlight=False, markup=True) + return SentinelReporter(con, Path("/fake/docs"), docs_dir="docs") + + @staticmethod + def _info_finding(): # type: ignore[no-untyped-def] + from zenzic.core.reporter import Finding + + return Finding( + rel_path="guide/nav.md", + line_no=5, + code="CIRCULAR_LINK", + severity="info", + message="guide/nav.md:5: 'index.md' is part of a circular link cycle", + source_line="[Home](index.md)", + ) + + def test_info_finding_suppressed_by_default(self) -> None: + """With show_info=False 
(default), info findings must not appear in output.""" + import io + + buf = io.StringIO() + reporter = self._make_reporter(buf) + errors, warnings = reporter.render( + [self._info_finding()], + version="0.5.0a4", + elapsed=0.0, + show_info=False, + ) + out = buf.getvalue() + assert "CIRCULAR_LINK" not in out + assert "suppressed" in out + assert errors == 0 + assert warnings == 0 + + def test_info_finding_shown_with_show_info_true(self) -> None: + """With show_info=True, info findings must appear in output and no suppression note.""" + import io + + buf = io.StringIO() + reporter = self._make_reporter(buf) + errors, warnings = reporter.render( + [self._info_finding()], + version="0.5.0a4", + elapsed=0.0, + show_info=True, + ) + out = buf.getvalue() + assert "CIRCULAR_LINK" in out + assert "suppressed" not in out + assert errors == 0 + assert warnings == 0 + + @patch("zenzic.cli.find_repo_root", return_value=_ROOT) + @patch("zenzic.cli.ZenzicConfig.load", return_value=(_CFG, True)) + @patch("zenzic.cli.validate_links_structured", return_value=[]) + @patch("zenzic.cli.find_orphans", return_value=[]) + @patch("zenzic.cli.validate_snippets", return_value=[]) + @patch("zenzic.cli.find_placeholders", return_value=[]) + @patch("zenzic.cli.find_unused_assets", return_value=[]) + @patch("zenzic.cli.check_nav_contract", return_value=[]) + @patch("zenzic.cli.scan_docs_references", return_value=([], [])) + def test_check_all_show_info_flag_accepted( + self, _refs, _nav, _assets, _ph, _snip, _orphans, _links, _cfg, _root + ) -> None: + """--show-info flag must be accepted by check all without crashing.""" + result = runner.invoke(app, ["check", "all", "--show-info"]) + assert result.exit_code == 0, result.stdout diff --git a/tests/test_references.py b/tests/test_references.py index 319f6ce..802cbb2 100644 --- a/tests/test_references.py +++ b/tests/test_references.py @@ -236,6 +236,35 @@ def test_too_short_key_not_flagged(self, tmp_path: Path) -> None: findings = 
list(scan_url_for_secrets(short, tmp_path / "doc.md", 1)) assert findings == [] + # ── hex-encoded-payload ─────────────────────────────────────────────────── + + def test_hex_payload_three_bytes_detected(self, tmp_path: Path) -> None: + """Three consecutive \\xNN sequences exceeds the threshold — must be flagged.""" + line = r"exec('\x41\x42\x43')" + findings = list(scan_line_for_secrets(line, tmp_path / "doc.md", 1)) + assert len(findings) == 1 + assert findings[0].secret_type == "hex-encoded-payload" + + def test_hex_payload_two_bytes_not_flagged(self, tmp_path: Path) -> None: + """Only two \\xNN sequences — below threshold, must not be flagged.""" + line = r"prefix \x41\x42 suffix" + findings = list(scan_line_for_secrets(line, tmp_path / "doc.md", 1)) + assert findings == [] + + def test_hex_payload_in_fenced_code_block_detected(self, tmp_path: Path) -> None: + """Shield Stream 1 reads all lines raw; hex sequence in a code block must be caught.""" + from zenzic.core.shield import _SECRETS + + hex_pattern = next(p for name, p in _SECRETS if name == "hex-encoded-payload") + payload = r"\x41\x42\x43\x44" + assert hex_pattern.search(payload) is not None + + def test_plain_escape_sequences_not_flagged(self, tmp_path: Path) -> None: + """Common prose escapes (\\n, \\t) must not match the hex-payload pattern.""" + line = r"Use \n for newlines and \t for tabs." 
+ findings = list(scan_line_for_secrets(line, tmp_path / "doc.md", 1)) + assert findings == [] + # ══════════════════════════════════════════════════════════════════════════════ # check_image_alt_text (pure function) From 25e0ddeeb468d895dfcacd9359be1d45881ea598 Mon Sep 17 00:00:00 2001 From: PythonWoods-Dev Date: Wed, 8 Apr 2026 17:54:44 +0200 Subject: [PATCH 15/16] examples: Shield comment block in all configs, safety_demonstration.md - Shield block (8 pattern families, exit code contract) in all 9 example toml - safety_demonstration.md: circular link + hex payload for live Sentinel testing --- examples/broken-docs/zenzic.toml | 6 +++ examples/custom-dir-target/zenzic.toml | 6 +++ examples/i18n-standard/zenzic.toml | 6 +++ examples/mkdocs-basic/zenzic.toml | 6 +++ examples/plugin-scaffold-demo/zenzic.toml | 6 +++ examples/readme-hero/zenzic.toml | 6 +++ examples/safety_demonstration.md | 58 +++++++++++++++++++++++ examples/single-file-target/zenzic.toml | 6 +++ examples/vanilla/zenzic.toml | 6 +++ examples/zensical-basic/zenzic.toml | 6 +++ 10 files changed, 112 insertions(+) create mode 100644 examples/safety_demonstration.md diff --git a/examples/broken-docs/zenzic.toml b/examples/broken-docs/zenzic.toml index 26466b5..0c9f8e5 100644 --- a/examples/broken-docs/zenzic.toml +++ b/examples/broken-docs/zenzic.toml @@ -13,6 +13,12 @@ docs_dir = "docs" +# Zenzic Shield — built-in credential scanner (always active, no config required). +# Detected pattern families: openai-api-key, github-token, aws-access-key, +# stripe-live-key, slack-token, google-api-key, private-key, +# hex-encoded-payload (3+ consecutive \xNN sequences). +# All lines including fenced code blocks are scanned. Exit code 2 on detection. 
+ [build_context] engine = "mkdocs" placeholder_max_words = 50 diff --git a/examples/custom-dir-target/zenzic.toml b/examples/custom-dir-target/zenzic.toml index 3701390..3d02897 100644 --- a/examples/custom-dir-target/zenzic.toml +++ b/examples/custom-dir-target/zenzic.toml @@ -12,5 +12,11 @@ docs_dir = "docs" +# Zenzic Shield — built-in credential scanner (always active, no config required). +# Detected pattern families: openai-api-key, github-token, aws-access-key, +# stripe-live-key, slack-token, google-api-key, private-key, +# hex-encoded-payload (3+ consecutive \xNN sequences). +# All lines including fenced code blocks are scanned. Exit code 2 on detection. + [build_context] engine = "vanilla" diff --git a/examples/i18n-standard/zenzic.toml b/examples/i18n-standard/zenzic.toml index 90c9be5..a29b473 100644 --- a/examples/i18n-standard/zenzic.toml +++ b/examples/i18n-standard/zenzic.toml @@ -8,6 +8,12 @@ docs_dir = "docs" fail_under = 100 +# Zenzic Shield — built-in credential scanner (always active, no config required). +# Detected pattern families: openai-api-key, github-token, aws-access-key, +# stripe-live-key, slack-token, google-api-key, private-key, +# hex-encoded-payload (3+ consecutive \xNN sequences). +# All lines including fenced code blocks are scanned. Exit code 2 on detection. + # manual.pdf and brand-kit.zip are referenced in the docs but generated at # build time — they do not exist on disk. List them here so Zenzic validates # the links structurally without requiring the files to be present. diff --git a/examples/mkdocs-basic/zenzic.toml b/examples/mkdocs-basic/zenzic.toml index 04a1bb7..bed8f8c 100644 --- a/examples/mkdocs-basic/zenzic.toml +++ b/examples/mkdocs-basic/zenzic.toml @@ -5,5 +5,11 @@ docs_dir = "docs" fail_under = 90 +# Zenzic Shield — built-in credential scanner (always active, no config required). 
+# Detected pattern families: openai-api-key, github-token, aws-access-key, +# stripe-live-key, slack-token, google-api-key, private-key, +# hex-encoded-payload (3+ consecutive \xNN sequences). +# All lines including fenced code blocks are scanned. Exit code 2 on detection. + [build_context] engine = "mkdocs" diff --git a/examples/plugin-scaffold-demo/zenzic.toml b/examples/plugin-scaffold-demo/zenzic.toml index 52a989f..0a8eed8 100644 --- a/examples/plugin-scaffold-demo/zenzic.toml +++ b/examples/plugin-scaffold-demo/zenzic.toml @@ -1,2 +1,8 @@ # zenzic.toml generated by plugin scaffold # docs_dir defaults to "docs" + +# Zenzic Shield — built-in credential scanner (always active, no config required). +# Detected pattern families: openai-api-key, github-token, aws-access-key, +# stripe-live-key, slack-token, google-api-key, private-key, +# hex-encoded-payload (3+ consecutive \xNN sequences). +# All lines including fenced code blocks are scanned. Exit code 2 on detection. diff --git a/examples/readme-hero/zenzic.toml b/examples/readme-hero/zenzic.toml index 93064a6..ea2a0dc 100644 --- a/examples/readme-hero/zenzic.toml +++ b/examples/readme-hero/zenzic.toml @@ -7,5 +7,11 @@ docs_dir = "docs" +# Zenzic Shield — built-in credential scanner (always active, no config required). +# Detected pattern families: openai-api-key, github-token, aws-access-key, +# stripe-live-key, slack-token, google-api-key, private-key, +# hex-encoded-payload (3+ consecutive \xNN sequences). +# All lines including fenced code blocks are scanned. Exit code 2 on detection. + [build_context] engine = "mkdocs" diff --git a/examples/safety_demonstration.md b/examples/safety_demonstration.md new file mode 100644 index 0000000..cd3ca76 --- /dev/null +++ b/examples/safety_demonstration.md @@ -0,0 +1,58 @@ + + + +# Zenzic Safety Demonstration + +This file is an **intentional test fixture** for Zenzic's built-in defences. 
+Run `zenzic check all` from the repository root and point it here to observe +the findings live. + +Expected findings when this file is scanned: + +- `CIRCULAR_LINK` (severity: `info`) — mutual link cycle with itself via the + self-referential link below +- `security_breach` (severity: `security_breach`) — hex-encoded payload in the + code block detected by the Zenzic Shield + +--- + +## Circular Link Example + +The link below points back to this same document, forming a trivial cycle: + +[Back to this page](safety_demonstration.md) + +This triggers `CIRCULAR_LINK` at severity `info`. It never blocks the build. +Use `zenzic check all --show-info` to display it. + +--- + +## Hex-Encoded Payload Example + +The code block below contains three consecutive `\xNN` hex escape sequences — +the minimum threshold for the `hex-encoded-payload` Shield pattern: + +```python +# Example: hex-encoded payload that triggers the Shield +payload = "\x41\x42\x43" # \x41\x42\x43 → "ABC" — 3 consecutive escapes +``` + +This triggers a `security_breach` finding (exit code 2). The Shield scans +every fenced code block, not just prose text. + +--- + +## How to Test + +```bash +# From the repository root — scan this single file: +zenzic check all examples/safety_demonstration.md --show-info + +# Expected output: +# 💡 [CIRCULAR_LINK] — info finding (shown because of --show-info) +# 🔴 [security_breach] — Shield: hex-encoded-payload detected +# Exit code: 2 +``` + +To test without `--show-info`, the `CIRCULAR_LINK` finding is suppressed and +only the Shield breach appears in the output. diff --git a/examples/single-file-target/zenzic.toml b/examples/single-file-target/zenzic.toml index 753159b..d366078 100644 --- a/examples/single-file-target/zenzic.toml +++ b/examples/single-file-target/zenzic.toml @@ -10,5 +10,11 @@ docs_dir = "docs" +# Zenzic Shield — built-in credential scanner (always active, no config required). 
+# Detected pattern families: openai-api-key, github-token, aws-access-key, +# stripe-live-key, slack-token, google-api-key, private-key, +# hex-encoded-payload (3+ consecutive \xNN sequences). +# All lines including fenced code blocks are scanned. Exit code 2 on detection. + [build_context] engine = "vanilla" diff --git a/examples/vanilla/zenzic.toml b/examples/vanilla/zenzic.toml index 6908a23..979a62b 100644 --- a/examples/vanilla/zenzic.toml +++ b/examples/vanilla/zenzic.toml @@ -17,6 +17,12 @@ docs_dir = "docs" # Enforce a minimum quality floor. fail_under = 80 +# Zenzic Shield — built-in credential scanner (always active, no config required). +# Detected pattern families: openai-api-key, github-token, aws-access-key, +# stripe-live-key, slack-token, google-api-key, private-key, +# hex-encoded-payload (3+ consecutive \xNN sequences). +# All lines including fenced code blocks are scanned. Exit code 2 on detection. + [build_context] engine = "vanilla" diff --git a/examples/zensical-basic/zenzic.toml b/examples/zensical-basic/zenzic.toml index 40a2c90..bcd244a 100644 --- a/examples/zensical-basic/zenzic.toml +++ b/examples/zensical-basic/zenzic.toml @@ -17,5 +17,11 @@ docs_dir = "docs" # this threshold. Set to 0 to disable the check. fail_under = 90 +# Zenzic Shield — built-in credential scanner (always active, no config required). +# Detected pattern families: openai-api-key, github-token, aws-access-key, +# stripe-live-key, slack-token, google-api-key, private-key, +# hex-encoded-payload (3+ consecutive \xNN sequences). +# All lines including fenced code blocks are scanned. Exit code 2 on detection. 
+ [build_context] engine = "zensical" From e28dcabe69893e1eb248d14bf1042855cbf25428 Mon Sep 17 00:00:00 2001 From: PythonWoods-Dev Date: Wed, 8 Apr 2026 17:55:13 +0200 Subject: [PATCH 16/16] chore(release): v0.5.0a4 CHANGELOG and pre-release audit package MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - CHANGELOG.md / CHANGELOG.it.md: [0.5.0a4] — Blood Sentinel, Graph Integrity Θ(V+E), Hex Shield, --show-info, info suppression, ZRT-005 fix - RELEASE.md: 7-section pre-release audit checklist --- CHANGELOG.it.md | 133 ++++++++++ CHANGELOG.md | 49 +++- RELEASE.md | 650 ++++++++++-------------------------------------- 3 files changed, 305 insertions(+), 527 deletions(-) create mode 100644 CHANGELOG.it.md diff --git a/CHANGELOG.it.md b/CHANGELOG.it.md new file mode 100644 index 0000000..e2b30e9 --- /dev/null +++ b/CHANGELOG.it.md @@ -0,0 +1,133 @@ + + + +# Registro delle modifiche + +Tutte le modifiche rilevanti a Zenzic sono documentate qui. +Il formato segue [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). +Le versioni seguono il [Semantic Versioning](https://semver.org/). + +--- + +## [Non rilasciato] + +## [0.5.0a4] — 2026-04-08 — Il Sentinel Indurito: Sicurezza & Integrità + +> **Rilascio Alpha 4.** Quattro vulnerabilità confermate chiuse (ZRT-001–004), tre +> nuovi pilastri di hardening aggiunti (Sentinella di Sangue, Integrità del Grafo, +> Scudo Esadecimale), e piena parità documentale bilingue raggiunta. In attesa di +> revisione manuale prima della promozione a Release Candidate. +> +> Branch: `fix/sentinel-hardening-v0.5.0a4` + +### Aggiunto + +- **Integrità del grafo — rilevamento link circolari.** Zenzic ora pre-calcola + un registro dei cicli (Fase 1.5) tramite ricerca depth-first iterativa (Θ(V+E)) + sul grafo dei link interni risolti. Ogni link il cui target appartiene a un ciclo + emette un finding `CIRCULAR_LINK` con severità `info`. 
I link di navigazione
+  reciproca (A ↔ B) sono una struttura valida della documentazione; il finding è
+  puramente informativo — non influisce mai sugli exit code in modalità normale o
+  `--strict`. O(1) per query nella Fase 2. Le Ghost Route (URL canonici generati da
+  plugin senza file sorgente fisico) sono correttamente escluse dal grafo dei cicli.
+
+- **`INTERNAL_GLOSSARY.toml`** — registro bilingue EN↔IT dei termini tecnici
+  (15 voci) per un vocabolario coerente tra documentazione inglese e italiana. Copre
+  i concetti principali: Porto Sicuro, Rotta Fantasma, Mappa del Sito Virtuale,
+  Motore a Due Passaggi, Scudo, Sentinella di Sangue e altri. Mantenuto da S-0.
+  Tutti i termini con `stable = true` richiedono un ADR prima della rinomina.
+
+- **Parità documentale bilingue.** `docs/checks.md` e `docs/it/checks.md` aggiornati
+  con le sezioni Sentinella di Sangue, Link Circolari e Scudo Esadecimale.
+  `CHANGELOG.it.md` creato. Piena parità EN↔IT applicata per il Protocollo di
+  Parità Bilingue.
+
+### ⚠️ Sicurezza
+
+- **Sentinella di Sangue — classificazione degli attraversamenti di percorso (Exit Code 3).**
+  `check links` e `check all` ora classificano i finding di path-traversal per
+  intenzione. Un href che esce da `docs/` e si risolve in una directory di sistema
+  del SO (`/etc/`, `/root/`, `/var/`, `/proc/`, `/sys/`, `/usr/`) viene classificato
+  come `PATH_TRAVERSAL_SUSPICIOUS` con severità `security_incident` e attiva
+  l'**Exit Code 3** — un nuovo exit code dedicato riservato alle sonde del sistema
+  host. L'Exit 3 ha priorità sull'Exit 2 (violazione credenziali) e non viene mai
+  soppresso da `--exit-zero`. Gli attraversamenti fuori confine ordinari (es.
+  `../../repo-adiacente/`) restano `PATH_TRAVERSAL` con severità `error` (Exit Code 1).
+
+- **Scudo Esadecimale — rilevamento di payload hex-encoded.**
+  Un nuovo pattern built-in dello Shield, `hex-encoded-payload`, rileva sequenze di
+  tre o più escape hex `\xNN` consecutive (`(?:\\x[0-9a-fA-F]{2}){3,}`). La soglia
+  `{3,}` evita falsi positivi sulle singole escape hex comuni nella documentazione
+  delle regex. I finding escono con codice 2 (Shield, non sopprimibile) e si
+  applicano a tutti i flussi di contenuto inclusi i blocchi di codice delimitati.
+
+- **[ZRT-001] Shield Blind Spot — Bypass YAML Frontmatter (CRITICO).**
+  `_skip_frontmatter()` veniva usato come sorgente di righe dello Shield,
+  scartando silenziosamente ogni riga nel blocco YAML `---` del file prima che
+  il motore regex girasse. Qualsiasi coppia chiave-valore (`aws_key: AKIA…`,
+  `github_token: ghp_…`) era invisibile allo Shield.
+  **Fix:** Il flusso Shield ora usa `enumerate(fh, start=1)` grezzo — ogni byte
+  del file viene scansionato. Il flusso contenuto usa ancora `_iter_content_lines()`
+  con salto del frontmatter per evitare falsi positivi da valori di metadati.
+  Architettura **Dual-Stream**.
+
+- **[ZRT-002] ReDoS + Deadlock ProcessPoolExecutor (ALTO).**
+  Un pattern `[[custom_rules]]` come `^(a+)+$` superava il controllo
+  `_assert_pickleable()` e veniva distribuito ai worker process senza timeout.
+  **Due difese aggiunte:**
+  — *Canary (prevenzione):* `_assert_regex_canary()` stress-testa ogni pattern
+  `CustomRule` sotto un watchdog `signal.SIGALRM` di 100 ms. I pattern ReDoS
+  sollevano `PluginContractError` prima della prima scansione.
+  — *Timeout (contenimento):* `ProcessPoolExecutor.map()` sostituito con
+  `submit()` + `future.result(timeout=30)`.
+
+- **[ZRT-003] Bypass Shield Split-Token — Offuscamento Tabelle Markdown (MEDIO).**
+  Il separatore `|` delle tabelle Markdown spezzava i token segreti su più celle.
+  **Fix:** Le righe di tabella vengono ricomposte rimuovendo i separatori `|` prima della scansione Shield.
+ +- **[ZRT-004] Injection Path Traversal nei Link Reference (BASSO).** + Link reference con href malevoli potevano sfuggire alla sandbox `docs/`. + **Fix:** La validazione PATH_TRAVERSAL applicata ai link reference come ai link + inline. + +## [0.5.0a3] — 2026-03-28 — Il Sentinel: Plugin, Regole Adattive, Hooks Pre-commit + +> Branch: `feat/sentinel-v0.5.0a3` + +### Aggiunto + +- **Sistema Plugin** — `[[custom_rules]]` in `zenzic.toml` per regole regex + personalizzate. `PluginContractError` per la validazione contratto a boot. +- **Regex Canary** — watchdog SIGALRM 100 ms per backtracking catastrofico. +- **Hooks Pre-commit** — configurazione ufficiale per pipeline CI. +- **UI Sentinel** — palette colori, reporter a griglia, output Sentinel rinnovato. + +## [0.5.0a1] — 2026-03-15 — Il Sentinel: Motore Adattivo delle Regole + +> Branch: `feat/sentinel-v0.5.0a1` + +### Aggiunto + +- **AdaptiveRuleEngine** — motore di analisi estensibile con Phase 3. +- **Hybrid Adaptive Engine** — integrazione MkDocs + motore adattivo. +- **Pannelli Sentinel** — output strutturato per tutti i controlli. + +## [0.4.0] — 2026-03-01 — Il Grande Disaccoppiamento + +> Branch: `feat/engine-decoupling` + +### Aggiunto + +- **Factory entry-point dinamica** — `--engine` CLI flag; protocollo + `has_engine_config`. +- **InMemoryPathResolver** — resolver agnostico rispetto al motore. +- **Tower of Babel Guard** — fallback i18n per ancora mancante nella locale. + +## [0.3.0] — 2026-02-15 — Two-Pass Pipeline + +### Aggiunto + +- **Two-Pass Engine** — Phase 1 (I/O parallelo) + Phase 2 (validazione O(1)). +- **Virtual Site Map (VSM)** — proiezione logica del sito renderizzato. +- **Shield** — rilevamento segreti, Stream Dual, exit code 2. +- **Validazione anchor cross-lingua** — Tower of Babel Guard. diff --git a/CHANGELOG.md b/CHANGELOG.md index c939ee5..865ad14 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,17 +11,56 @@ Versions follow [Semantic Versioning](https://semver.org/). 
## [Unreleased] -## [0.5.0a4] — 2026-04-05 — The Sentinel Hardens: Security Sprint +## [0.5.0a4] — 2026-04-08 — The Hardened Sentinel: Security & Integrity -> **Security Analysis Remediation.** The v0.5.0a3 security review exposed four confirmed -> vulnerabilities in v0.5.0a3. This release closes all four attack vectors and -> adds structural defences that outlast any individual exploit. The Sentinel -> no longer sleeps. +> **Alpha 4 Release.** Four confirmed vulnerabilities closed (ZRT-001–004), three +> new hardening pillars added (Blood Sentinel, Graph Integrity, Hex Shield), and +> full bilingual documentation parity achieved. Pending manual review before +> Release Candidate promotion. > > Branch: `fix/sentinel-hardening-v0.5.0a4` +### Added + +- **Graph Integrity — circular link detection.** Zenzic now pre-computes a cycle + registry (Phase 1.5) via iterative depth-first search (Θ(V+E)) over the resolved + internal link graph. Any link whose target belongs to a cycle emits a `CIRCULAR_LINK` + finding at severity `info`. Mutual navigation links (A ↔ B) are valid documentation + structure and are expected; the finding is advisory only — it never affects exit + codes in normal or `--strict` mode. O(1) per-query in Phase 2. Ghost Routes + (plugin-generated canonical URLs without physical source files) are correctly + excluded from the cycle graph and cannot produce false positives. + +- **`INTERNAL_GLOSSARY.toml`** — bilingual EN↔IT term registry (15 entries) for + consistent technical vocabulary across English and Italian documentation. Covers + core concepts: Safe Harbor, Ghost Route, Virtual Site Map, Two-Pass Engine, Shield, + Blood Sentinel, and more. Maintained by S-0. All terms marked `stable = true` + require an ADR before renaming. + +- **Bilingual documentation parity.** `docs/checks.md` and `docs/it/checks.md` + updated with Blood Sentinel, Circular Links, and Hex Shield sections. + `CHANGELOG.it.md` created. 
Full English–Italian parity enforced per the + Bilingual Parity Protocol. + ### ⚠️ Security +- **Blood Sentinel — system-path traversal classification (Exit Code 3).** + `check links` and `check all` now classify path-traversal findings by intent. + An href that escapes `docs/` and resolves to an OS system directory (`/etc/`, + `/root/`, `/var/`, `/proc/`, `/sys/`, `/usr/`) is classified as + `PATH_TRAVERSAL_SUSPICIOUS` with severity `security_incident` and triggers + **Exit Code 3** — a new, dedicated exit code reserved for host-system probes. + Exit 3 takes priority over Exit 2 (credential breach) and is never suppressed + by `--exit-zero`. Plain out-of-bounds traversals (e.g. `../../sibling-repo/`) + remain `PATH_TRAVERSAL` at severity `error` (Exit Code 1). + +- **Hex Shield — hex-encoded payload detection.** + A new built-in Shield pattern `hex-encoded-payload` detects runs of three or + more consecutive `\xNN` hex escape sequences (`(?:\\x[0-9a-fA-F]{2}){3,}`). + The `{3,}` threshold avoids false positives on single hex escapes common in + regex documentation. Findings exit with code 2 (Shield, non-suppressible) + and apply to all content streams including fenced code blocks. 
+ - **[ZRT-001] Shield Blind Spot — YAML Frontmatter Bypass (CRITICAL).** `_skip_frontmatter()` was used as the Shield's line source, silently discarding every line in a file's YAML `---` block before the regex diff --git a/RELEASE.md b/RELEASE.md index 2a1f77f..14fc4a8 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,606 +1,212 @@ -# Zenzic v0.5.0a3: The Sentinel — Aesthetic Identity, Parallel Anchors & Agnostic Target +# Zenzic v0.5.0a4 — Pre-Release Audit Package -## v0.5.0a3 — The Sentinel: Aesthetic Sprint + Performance & SDK +**Prepared by:** S-1 (Auditor) + S-0 (Chronicler) +**Date:** 2026-04-08 +**Status:** ALPHA — Pending Tech Lead manual verification before rc1 promotion +**Branch:** `fix/sentinel-hardening-v0.5.0a4` -**Release date:** 2026-04-03 -**Status:** Alpha 3 — two-phase anchor indexing, plugin SDK scaffolding, Sentinel Palette, -agnostic target mode, native Material header - -### Highlights +> **Tech Lead note:** This document is your single audit surface. Work through each +> section in order. When every checkbox below is ticked, the project is ready for +> the `rc1` tag. Until then, the "Alpha" designation stands. --- -#### 🎨 Sentinel Palette — Color Identity for the Report Engine - -The report engine now speaks a deliberate visual language. Every number, every gutter -marker, every severity badge has an assigned color drawn from a named palette: - -| Role | Color | Example | -| :--- | :---- | :------ | -| Numeric values (counts, scores, elapsed) | Indigo | `12 files`, `0.1s` | -| Gutter (`│` separator, line numbers) | Slate | `3 │ # Heading` | -| Error icon, label, count | Rose | `✘ 2 errors` | -| Warning icon, label, count | Amber | `⚠ 5 warnings` | +## 1. Version Anchors -Bold has been removed from all report numbers — color alone carries the weight. The -palette is defined in `src/zenzic/ui.py`, a new standalone module consumed by both -the reporter and the CLI banner. 
+| Location | Expected | Actual | Status | +| :--- | :--- | :--- | :---: | +| `src/zenzic/__init__.py` | `0.5.0a4` | `0.5.0a4` | ✅ | +| `CHANGELOG.md` top entry | `[0.5.0a4]` | `[0.5.0a4]` | ✅ | +| `CHANGELOG.it.md` top entry | `[0.5.0a4]` | `[0.5.0a4]` | ✅ | +| No `rc1` in top-level version files | — | verified | ✅ | --- -#### 📡 Unified Banner Telemetry - -The Sentinel banner now emits a single unified counter: +## 2. Quality Gates ```text -vanilla • ./README.md • 1 file (1 docs, 0 assets) • 0.0s -mkdocs • 104 files (66 docs, 38 assets) • 3.5s -mkdocs • ./content/ • 2 files (2 docs, 0 assets) • 0.1s +pytest 756 passed, 0 failed +zenzic check all ✔ All checks passed (18 info-level CIRCULAR_LINK — expected) + --strict ``` -`docs` = `.md` files + config files (`yml`/`yaml`/`toml`) inside `docs_root`, -plus engine config files (`mkdocs.yml` etc.) at project root. -`assets` = everything else non-inert (images, fonts, PDFs…). - ---- - -#### 🎯 Agnostic Target Support — Scope Any Audit - -`zenzic check all` now accepts a positional `PATH` argument: - -```bash -# Audit a single file outside your docs tree -zenzic check all README.md +Gate targets for rc1 promotion: -# Audit an entire custom content directory -zenzic check all content/ - -# Audit a single page inside docs -zenzic check all docs/guide/setup.md -``` - -Zenzic auto-selects `VanillaAdapter` for out-of-tree targets. `docs_dir` is -patched at runtime — `zenzic.toml` is never rewritten. The banner shows the -active target so there is no ambiguity about what was scanned. 
- -Two new example projects ship with this release: - -- `examples/single-file-target/` — demonstrates `zenzic check all README.md` -- `examples/custom-dir-target/` — demonstrates `zenzic check all content/` +- [ ] `pytest` ≥ 756 passed, 0 failed +- [ ] `zenzic check all --strict` → exit code 0, no errors, no warnings +- [ ] `ruff check src/` → 0 violations +- [ ] `mypy src/` → 0 errors +- [ ] `mkdocs build --strict` → 0 warnings --- -#### ⚡ Two-Phase Parallel Anchor Indexing - -`validate_links_async` now separates concerns into two deterministic phases: - -1. **Phase 1 — Parallel index:** each worker extracts per-file anchors and - resolves internal links independently. No shared state; no race conditions. - -2. **Phase 2 — Global validation:** the main process merges all anchor indexes - and validates every link in a single pass. Order no longer matters. - -The result: no false positive `AnchorMissing` findings under heavy parallelism. -A 1000-file anchor torture test ships as a regression guard. +## 3. New Features in v0.5.0a4 — Review Checklist ---- +### 3.1 Blood Sentinel (Exit Code 3) -#### 🔌 Plugin SDK — First-Class Developer Surface +**What it does:** path-traversal hrefs pointing to OS system directories +(`/etc/`, `/root/`, `/var/`, `/proc/`, `/sys/`, `/usr/`) are classified as +`PATH_TRAVERSAL_SUSPICIOUS` → severity `security_incident` → **Exit Code 3**. +Exit 3 takes priority over Exit 2 (credential breach). Never suppressed by +`--exit-zero`. 
-```bash -zenzic init --plugin my-org-rules -``` +**Files changed:** -Generates a complete Python package skeleton: +- `src/zenzic/ui.py` — `BLOOD = "#8b0000"` palette constant +- `src/zenzic/core/reporter.py` — `security_incident` severity style (blood red) +- `src/zenzic/core/validator.py` — `_RE_SYSTEM_PATH`, `_classify_traversal_intent()` +- `src/zenzic/cli.py` — Exit Code 3 check in `check links` and `check all` -- `pyproject.toml` with `zenzic.rules` entry-point wiring -- `src/my_org_rules/rules.py` with a `BaseRule` template -- Minimal docs fixture so `zenzic check all` runs immediately on the scaffold +**Tests:** `TestTraversalIntent` (4 tests) + 2 exit-code integration tests in `test_cli.py` -The `zenzic.rules` public namespace is now stable — `BaseRule`, `RuleFinding`, -`CustomRule`, `Violation`, `Severity` are importable from a single path that will -not change between minor versions. +**Verification steps for Tech Lead:** -`run_rule()` — a one-call test helper — lets plugin authors verify findings without -any engine setup. - -`examples/plugin-scaffold-demo/` ships as the canonical scaffold output fixture, -serving as both a DX reference and a quality-gate integration test. +- [ ] Review `_classify_traversal_intent()` in `src/zenzic/core/validator.py` +- [ ] Verify `PATH_TRAVERSAL_SUSPICIOUS` → `security_incident` mapping in `cli.py` +- [ ] Verify Exit 3 is checked **before** Exit 2 in `check all` exit logic +- [ ] Confirm `--exit-zero` does NOT suppress Exit 3 +- [ ] Read `docs/checks.md` § "Blood Sentinel — system-path traversal" --- -#### ⚡ Smart Initialization — `zenzic init --pyproject` - -`zenzic init` now detects `pyproject.toml` in the project root and interactively -asks whether to embed configuration as a `[tool.zenzic]` table instead of creating -a standalone `zenzic.toml`. 
+### 3.2 Graph Integrity — Circular Link Detection -```bash -zenzic init # interactive: asks if pyproject.toml exists -zenzic init --pyproject # skip the prompt, write directly into pyproject.toml -zenzic init --force # overwrite existing config (both modes) -``` +**What it does:** Phase 1.5 pre-computes a cycle registry via iterative DFS +(Θ(V+E)). Phase 2 checks each resolved link against the registry in O(1). Links +in a cycle emit `CIRCULAR_LINK` at severity **`info`** (not error or warning). -Engine auto-detection (`mkdocs.yml` → `engine = "mkdocs"`, `zensical.toml` → -`engine = "zensical"`) works in both standalone and pyproject modes. When no -engine config file is found, vanilla defaults apply. +**Design decision — why `info`:** +The project's own documentation has ~34 intentional mutual navigation links +(Home ↔ Features, CI/CD ↔ Usage, etc.). Making this `warning` or `error` would +permanently break `--strict` self-check. The `info` level surfaces the topology +without blocking valid builds. ---- +**Files changed:** -#### 🛡️ Z001 / Z002 Split — Errors vs Warnings for Link Issues (closes #6) +- `src/zenzic/core/validator.py` — `_build_link_graph()`, `_find_cycles_iterative()`, Phase 1.5 block -`VSMBrokenLinkRule` now distinguishes: +**Tests:** `TestFindCyclesIterative` (6 unit tests) + `TestCircularLinkIntegration` (3 integration tests) -| Code | Meaning | Severity | -| :--- | :------ | :------- | -| `Z001` | Link target not found in file system or VSM | **error** | -| `Z002` | Link target exists but is an orphan page (not in nav) | warning | +**Verification steps for Tech Lead:** -Without `--strict`, orphan-link warnings do not block the build. With `--strict` -they are promoted to errors. Both codes appear in the checks reference (EN + IT). 
+- [ ] Review `_find_cycles_iterative()` — WHITE/GREY/BLACK DFS correctness +- [ ] Confirm `CIRCULAR_LINK` severity = `"info"` in `cli.py` Finding constructor +- [ ] Confirm CIRCULAR_LINK never triggers Exit 1 or Exit 2 +- [ ] Read `docs/checks.md` § "Circular links" +- [ ] Run `zenzic check all --strict` and confirm only info findings, exit 0 --- -#### 🌐 Native Material Header — MutationObserver injection +### 3.3 Hex Shield -The `source.html` template override has been deleted. Version injection now uses a -`MutationObserver` snippet in `main.html` that writes directly into Material's own -top bar after the widget renders. +**What it does:** built-in Shield pattern `hex-encoded-payload` detects +3+ consecutive `\xNN` hex escape sequences. Threshold prevents FP on +single-escape regex examples. -Result: a single, clean header row — 🏷 0.5.0a3 · ☆ stars · ψ forks — with no -duplicate rendering, no JavaScript collision, and no Material upgrade risk. +**Files changed:** ---- +- `src/zenzic/core/shield.py` — one line appended to `_SECRETS` -#### 🔧 Pre-commit Hooks — Ship Ready +**Tests:** 4 tests in `TestShield` in `test_references.py` -`.pre-commit-hooks.yaml` is now included in the repository root. 
Teams can pin -Zenzic as a pre-commit hook directly from GitHub without any intermediate wrapper: +**Verification steps for Tech Lead:** -```yaml -repos: - - repo: https://github.com/PythonWoods/zenzic - rev: v0.5.0a3 - hooks: - - id: zenzic-check-all -``` +- [ ] Confirm pattern `(?:\\x[0-9a-fA-F]{2}){3,}` in `shield.py` +- [ ] Confirm single `\xNN` is NOT flagged (threshold = 3) +- [ ] Read `docs/usage/advanced.md` § "Detected credential patterns" table --- -### Issue Closures +### 3.4 INTERNAL_GLOSSARY.toml -| Issue | Title | Status | -| :---- | :---- | :----- | -| #4 | Custom Rules DSL — Italian documentation | ✅ Closed | -| #6 | Z001/Z002 split: orphan links should be warnings | ✅ Closed | -| #13 | `zenzic.rules` stable public namespace for plugins | ✅ Closed | +**What it does:** canonical EN↔IT term registry. 15 entries. `stable = true` +entries require an ADR before renaming. ---- +**Verification steps for Tech Lead:** -### Quality Gates - -```text -pytest 706 passed, 0 failed -coverage 80%+ branch (gate: ≥ 80%) -mutation score 86.7% (242/279 killed on rules.py — target: 75%) -ruff check src/ 0 violations -mypy src/ 0 errors -reuse lint 262/262 files compliant -zenzic check all SUCCESS (self-dogfood, 104 files) -mkdocs build --strict, 0 warnings -``` +- [ ] Review all 15 terms — correct EN↔IT mapping? +- [ ] All core concepts covered? (VSM, RDP, Shield, Blood Sentinel, etc.) --- -### Mutation Testing Campaign — "The Mutant War" - -v0.5.0a3 ships with a full mutation testing campaign against `src/zenzic/core/rules.py` -using **mutmut 3.5.0**. The campaign raised the mutation score from 58.1% (baseline) -to **86.7%** (242/279 killed) — exceeding the 75% target by +11.7 percentage points. +## 4. 
Documentation Parity Matrix -**80 new targeted tests** were added to `test_rules.py`, organised in 7 specialised -test classes covering: +| Document | EN | IT | Hex Shield | Blood Sentinel | Circular Links | +| :--- | :---: | :---: | :---: | :---: | :---: | +| `docs/checks.md` | ✅ | ✅ | — | ✅ | ✅ | +| `docs/it/checks.md` | — | ✅ | — | ✅ | ✅ | +| `docs/usage/advanced.md` | ✅ | ✅ | ✅ | — | — | +| `docs/it/usage/advanced.md` | — | ✅ | ✅ | — | — | +| `CHANGELOG.md` | ✅ | — | ✅ | ✅ | ✅ | +| `CHANGELOG.it.md` | — | ✅ | ✅ | ✅ | ✅ | -- **PluginRegistry** (27 tests) — discovery, duplicates, case-sensitivity, `validate_rule()` -- **VSMBrokenLinkRule** (22 tests) — `check_vsm` path/anchor resolution, orphan detection -- **Inline link extraction** (14 tests) — escaped brackets, empty hrefs, multi-link lines -- **AdaptiveRuleEngine** (10 tests) — `run()` and `run_vsm()` short-circuits and propagation -- **Deep link extraction** (5 tests) — fence-block skipping, reference links, empty documents -- **Pickleable assertions** (2 tests) — deep-copy guard and `UNREACHABLE` sentinel +**Check for Tech Lead:** -The 37 surviving mutants were analysed and classified as equivalent mutations -(no observable behaviour change) or framework-level limitations (unreachable -defensive assertions). **Practical quality saturation** has been reached. - -Hypothesis property-based testing is integrated with three severity profiles: -`dev` (50 examples), `ci` (500), `purity` (1 000). +- [ ] Read `docs/checks.md` §§ "Blood Sentinel" and "Circular links" — prose correct? +- [ ] Read `docs/it/checks.md` §§ "Sentinella di Sangue" and "Link circolari" — translation accurate? +- [ ] Read `docs/usage/advanced.md` Shield table — `hex-encoded-payload` row present and correct? +- [ ] Read `docs/it/usage/advanced.md` — Italian row accurate? --- -## Why this release matters now +## 5. Exit Code Contract (complete picture) -The documentation tooling ecosystem is fractured. 
MkDocs 2.0 is on the horizon, carrying breaking -changes to plugin APIs and configuration formats. Zensical is emerging as a production-ready -alternative. Teams are migrating, experimenting, and hedging. In this environment, any quality -gate that is tightly coupled to a specific build engine has an expiry date. +| Exit Code | Trigger | Suppressible | +| :---: | :--- | :---: | +| 0 | All checks passed | — | +| 1 | One or more errors (broken links, syntax errors, etc.) | Via `--exit-zero` | +| 2 | Shield credential detection | **Never** | +| 3 | Blood Sentinel — system-path traversal (`PATH_TRAVERSAL_SUSPICIOUS`) | **Never** | -v0.4.0 answers that uncertainty with a clear architectural commitment: **Zenzic will never break -because your documentation engine changed.** +Priority order in `check all`: Exit 3 → Exit 2 → Exit 1 → Exit 0. -This is not a marketing claim. It is a precise technical guarantee backed by three design pillars -and two sprints of structural surgery. +- [ ] Tech Lead: verify this contract matches implementation in `cli.py` --- -## The Three Pillars - -### 1. Source-first — no build required - -Zenzic analyses raw Markdown files and configuration as plain data. It never calls `mkdocs build`, -never imports a documentation framework, never depends on generated HTML. A broken link is caught -in 11 milliseconds against 5,000 files — before your CI runner has finished checking out the repo. - -This makes Zenzic usable as a pre-commit hook, a pre-build gate, a PR check, and a migration -validator simultaneously. The same tool. The same score. The same findings. Regardless of which -engine you run. - -### 2. No subprocesses in the Core - -The reference implementation of "engine-agnostic linting" is to shell out to the engine and parse -its output. That approach inherits every instability of the engine: version skew, environment -differences, missing binaries on CI runners. - -Zenzic's Core is pure Python. Link validation uses `httpx`. 
Nav parsing uses `yaml` and `tomllib`. -There are no `subprocess.run` calls in the linting path. The engine binary does not need to be -installed for `zenzic check all` to pass. - -### 3. Pure functions, pure results - -All validation logic in Zenzic lives in pure functions: no file I/O, no network access, no global -state, no terminal output. I/O happens only at the edges — CLI wrappers that read files and print -findings. Pure functions are trivially testable (706 passing tests, ≥ 80% branch-coverage gate), composable -into higher-order pipelines, and deterministic across environments. - -The score you get on a developer laptop is the score CI gets. The score CI gets is the score you -track in version control. Determinism is not a feature; it is the foundation on which `zenzic diff` -and regression detection are built. - ---- - -## What's New in rc4 - -### Ghost Routes — MkDocs Material i18n entry points - -When `reconfigure_material: true` is active in the i18n plugin, MkDocs Material -auto-generates locale entry points (e.g. `it/index.md`) that never appear in `nav:`. -The VSM now marks these as `REACHABLE` Ghost Routes, eliminating false orphan warnings -on locale root pages. A `WARNING` is emitted when both `reconfigure_material: true` -and `extra.alternate` are declared simultaneously (redundant configuration). - -### VSM Rule Engine — routing-aware lint rules - -`BaseRule` gains an optional `check_vsm()` interface. Rules that override it receive -the full pre-built VSM and can validate links against routing state without any I/O. -`RuleEngine.run_vsm()` dispatches all VSM-aware rules and converts `Violation` objects -to the standard `RuleFinding` type for uniform output. - -The first built-in VSM rule — `VSMBrokenLinkRule` (code `Z001`) — validates all inline -Markdown links against the VSM. A link is valid only when its target URL is present -and `REACHABLE`. 
Both "not in VSM" and "UNREACHABLE_LINK" cases produce a structured -`Violation` with file path, line number, and the offending source line as context. - -### Content-addressable cache (`CacheManager`) - -Rule results are now cached with SHA-256 keying: - -| Rule type | Cache key | -| :--- | :--- | -| Atomic (content only) | `SHA256(content) + SHA256(config)` | -| Global (VSM-aware) | `SHA256(content) + SHA256(config) + SHA256(vsm_snapshot)` | - -Timestamps are never consulted — the cache is CI-safe by construction. Writes are -atomic (`.tmp` rename). The cache is loaded once at startup and saved once at the end -of a run; all in-run operations are pure in-memory. - -### Performance — O(N) torture tests (10k nodes) - -The VSM Rule Engine and cache infrastructure are validated at scale: 10,000 links all -valid completes in < 1 s; 10,000 links all broken completes in < 1 s; -`engine.run_vsm` with a 10,000-node VSM completes in < 0.5 s. - ---- - -## What Changed in rc3 - -### i18n Anchor Fix — AnchorMissing now has i18n fallback suppression - -`AnchorMissing` now participates in the same i18n fallback logic as `FileNotFound`. Previously, -a link like `[text](it/page.md#heading)` would fire a false positive when the Italian page existed -but its heading was translated — because the `AnchorMissing` branch in `validate_links_async` had -no suppression path. `_should_suppress_via_i18n_fallback()` was defined but never called. - -**Fix:** new `resolve_anchor()` method added to `BaseAdapter` protocol and all three adapters -(`MkDocsAdapter`, `ZensicalAdapter`, `VanillaAdapter`). When an anchor is not found in a locale -file, `resolve_anchor()` checks whether the anchor exists in the default-locale equivalent via -the `anchors_cache` already in memory. No additional disk I/O. 
- -### Shared utility — `remap_to_default_locale()` - -The locale path-remapping logic that was independently duplicated in `resolve_asset()` and -`is_shadow_of_nav_page()` is now a single pure function in `src/zenzic/core/adapters/_utils.py`. -`resolve_asset()`, `resolve_anchor()`, and `is_shadow_of_nav_page()` in both `MkDocsAdapter` and -`ZensicalAdapter` all delegate to it. `_should_suppress_via_i18n_fallback()`, `I18nFallbackConfig`, -`_I18N_FALLBACK_DISABLED`, and `_extract_i18n_fallback_config()` — 118 lines of dead code — -are permanently removed from `validator.py`. - -### Visual Snippets for custom rule findings - -Custom rule violations (`[[custom_rules]]` from `zenzic.toml`) now display the offending source -line below the finding header: - -```text -[ZZ-NODRAFT] docs/guide/install.md:14 — Remove DRAFT marker before publishing. - │ > DRAFT: section under construction -``` - -The `│` indicator is rendered in the finding's severity colour. Standard findings (broken links, -orphans, etc.) are unaffected. - -### JSON schema — 7 keys - -`--format json` output now emits a stable 7-key schema: -`links`, `orphans`, `snippets`, `placeholders`, `unused_assets`, `references`, `nav_contract`. - -### `strict` and `exit_zero` as `zenzic.toml` fields +## 6. Sandbox Self-Check -Both flags can now be declared in `zenzic.toml` as project-level defaults: - -```toml -strict = true # equivalent to always passing --strict -exit_zero = false # exit code 0 even on findings (CI soft-gate) -``` - -CLI flags continue to override the TOML values. - -### Usage docs split — three focused pages - -`docs/usage/index.md` was a monolithic 580-line page covering install, commands, CI/CD, scoring, -advanced features, and programmatic API. 
Split into three focused pages: - -- `usage/index.md` — Install options, init→config→check workflow, engine modes -- `usage/commands.md` — CLI commands, flags, exit codes, JSON output, quality score -- `usage/advanced.md` — Three-pass pipeline, Zenzic Shield, alt-text, programmatic API, - multi-language docs - -Italian mirrors (`it/usage/`) updated in full parity. - -### Multi-language snippet validation - -`zenzic check snippets` now validates four languages using pure Python parsers — no subprocesses -for any language. Python uses `compile()`, YAML uses `yaml.safe_load()`, JSON uses `json.loads()`, -and TOML uses `tomllib.loads()` (Python 3.11+ stdlib). Blocks with unsupported language tags -(`bash`, `javascript`, `mermaid`, etc.) are treated as plain text and not syntax-checked. - -### Shield deep-scan — no more blind spots - -The credential scanner now operates on every line of the source file, including lines inside -fenced code blocks. A credential committed in a `bash` example is still a committed credential — -Zenzic will find it. The link and reference validators continue to ignore fenced block content to -prevent false positives from illustrative example URLs. - -The Shield now covers seven credential families: OpenAI API keys, GitHub tokens, AWS access keys, -Stripe live keys, Slack tokens, Google API keys, and generic PEM private keys. - ---- - -## Professional Packaging & PEP 735 - -v0.4.0-rc3 adopts the latest Python packaging standards end-to-end, making Zenzic lighter for -end users and measurably faster in CI. - -### Lean core install - -`pip install zenzic` installs only the five runtime dependencies (`typer`, `rich`, -`pyyaml`, `pydantic`, `httpx`). The MkDocs build stack is not a dependency of `zenzic` — -it is a contributor tool, managed via the `docs` [PEP 735](https://peps.python.org/pep-0735/) -dependency group (`uv sync --group docs`). 
- -For the vast majority of users (Hugo sites, Zensical projects, plain Markdown wikis, CI -pipelines) this means a ~60% smaller install and proportionally faster cold-start times on -ephemeral CI runners. - -### PEP 735 — atomic dependency groups - -Development dependencies are declared as [PEP 735](https://peps.python.org/pep-0735/) groups -in `pyproject.toml`, managed by `uv`: - -| Group | Purpose | CI job | -| :---- | :------ | :----- | -| `test` | pytest + coverage | `quality` matrix (3.11 / 3.12 / 3.13) | -| `lint` | ruff + mypy + pre-commit + reuse | `quality` matrix | -| `docs` | MkDocs stack | `docs` job | -| `release` | nox + bump-my-version + pip-audit | `security` job | -| `dev` | All of the above (local development) | — | - -Each CI job syncs only the group it needs. The `quality` job never installs the MkDocs stack. -The `docs` job never installs pytest. This eliminates install time wasted on unused packages -and reduces the surface area for dependency conflicts across jobs. Combined with the `uv` -cache in GitHub Actions, subsequent CI runs restore the full environment in under 3 seconds. - -### `CITATION.cff` - -A [`CITATION.cff`](CITATION.cff) file (CFF 1.2.0 format) is now present at the repository -root. GitHub renders it automatically as a "Cite this repository" button. Zenodo, Zotero, and -other reference managers that support the format can import it directly. - ---- - -## The Documentation Firewall - -v0.4.0-rc3 completes a strategic shift in what Zenzic is. It began as a link checker. It became -an engine-agnostic linter. With rc3, it becomes a **Documentation Firewall** — a single gate that -enforces correctness, completeness, and security simultaneously. - -The three dimensions of the firewall: - -**1. Correctness** — Zenzic validates the syntax of every structured data block in your docs. 
-Your Kubernetes YAML examples, your OpenAPI JSON fragments, your TOML configuration snippets — if -you ship broken config examples, your users will copy broken config. `check snippets` catches this -before it reaches production, using the same parsers your users will run. - -**2. Completeness** — Orphan detection, placeholder scanning, and the `fail_under` quality gate -ensure that every page linked in the nav exists, contains real content, and scores above the -team's agreed threshold. A documentation site is not "done" when all pages exist — it is done -when all pages are complete. - -**3. Security** — The Shield scans every line of every file, including code blocks, for seven -families of leaked credentials. No fencing, no labels, no annotations can hide a secret from -Zenzic. The exit code 2 contract is non-negotiable and non-suppressible: a secret in docs is a -build-blocking incident, not a warning. - -This is what "Documentation Firewall" means: not a tool you run once before a release, but a -gate that runs on every commit, enforces three dimensions of quality simultaneously, and exits -with a machine-readable code that your CI pipeline can act on without human interpretation. - ---- - -## The Great Decoupling (v0.4.0-rc2) - -The headline change in this release is the **Dynamic Adapter Discovery** system. In v0.3.x, -Zenzic owned its adapters — `MkDocsAdapter` and `ZensicalAdapter` were imported directly by the -factory. Adding support for a new engine required a Zenzic release. - -In v0.4.0, Zenzic is a **framework host**. Adapters are Python packages that register themselves -under the `zenzic.adapters` entry-point group. When installed, they become available immediately: +Run these commands manually and verify output: ```bash -# Example: third-party adapter for a hypothetical Hugo support package -uv pip install zenzic-hugo-adapter # or: pip install zenzic-hugo-adapter -zenzic check all --engine hugo -``` - -No Zenzic update. No configuration change. 
Just install and use. - -The built-in adapters (`mkdocs`, `zensical`, `vanilla`) are registered the same way — there is -no privileged path for first-party adapters. This is not future-proofing; it is a structural -guarantee that the third-party adapter API is exactly as capable as the first-party one. +# 1. Full test suite +uv run pytest --tb=short -The factory itself is now protocol-only. `scanner.py` imports zero concrete adapter classes. The -`has_engine_config()` protocol method replaced the `isinstance(adapter, VanillaAdapter)` check -that was the last coupling point. The Core is now genuinely adapter-agnostic. +# 2. Self-dogfood (strict mode) +uv run zenzic check all --strict ---- - -## The [[custom_rules]] DSL - -v0.4.0 ships the first version of the project-specific lint DSL. Teams can declare regex rules -in `zenzic.toml` without writing any Python: - -```toml -[[custom_rules]] -id = "ZZ-NODRAFT" -pattern = "(?i)\\bDRAFT\\b" -message = "Remove DRAFT marker before publishing." -severity = "warning" +# 3. Static analysis +uv run ruff check src/ +uv run mypy src/ --ignore-missing-imports ``` -Rules are adapter-independent — they fire identically with MkDocs, Zensical, or a plain -Markdown folder. Patterns are compiled once at config-load time; there is no per-file regex -compilation overhead regardless of how many rules are declared. +Expected: -This DSL is the first step toward Zenzic as a complete documentation policy engine, not just a -structural linter. +- pytest: 756 passed, 0 failed +- check all --strict: exit 0, "✔ All checks passed" +- ruff: 0 violations +- mypy: 0 errors (or pre-existing stubs only) --- -## The Shield (Defence-in-Depth hardening) - -The credential scanner (`Shield`) now runs on every non-definition line during Pass 1, not only -on reference URL values. 
A developer who pastes an API key into a Markdown paragraph — not a -reference link — is caught before any URL is pinged, before any HTTP request is issued, before -any downstream tool sees the credential. +## 7. rc1 Gate Decision -Exit code `2` remains reserved exclusively for Shield events. It cannot be suppressed by -`--exit-zero`, `--strict`, or any other flag. A Shield detection is a build-blocking security -incident — unconditionally. +This section is for the Tech Lead's signature. ---- - -## Documentation as a first-class citizen - -The v0.4.0 documentation was itself validated with `zenzic check all` at every step — the -canonical dogfood mandate. - -Key structural changes: - -- **Configuration split** — the single `configuration.md` god-page decomposed into four focused - pages: [Overview](docs/configuration/index.md), [Core Settings](docs/configuration/core-settings.md), - [Adapters & Engine](docs/configuration/adapters-config.md), - [Custom Rules DSL](docs/configuration/custom-rules-dsl.md). -- **Italian parity** — `docs/it/` now mirrors the full English structure. The documentation - is production-ready for international teams. -- **Migration guide** — [MkDocs → Zensical](docs/guide/migration.md) four-phase workflow with - the baseline/diff/gate approach as the migration safety net. -- **Adapter guide** — [Writing an Adapter](docs/developers/writing-an-adapter.md) full - protocol reference, `from_repo` pattern, entry-point registration, and test utilities. - -### Frictionless Onboarding - -v0.4.0 introduces `zenzic init` — a single command that scaffolds a `zenzic.toml` with smart -engine discovery. If `mkdocs.yml` is present, the generated file pre-sets `engine = "mkdocs"`. -If `zensical.toml` is present, it pre-sets `engine = "zensical"`. Otherwise the scaffold is -engine-agnostic (Vanilla mode). 
- -```bash -uvx zenzic init # zero-install bootstrap -# or: zenzic init # if already installed globally -``` - -For teams running Zenzic for the first time, a Helpful Hint panel appears automatically when no -`zenzic.toml` is found — pointing directly to `zenzic init`. The hint disappears the moment the -file is created. Zero friction to get started; zero noise once configured. - ---- - -## Upgrade path - -### From v0.3.x - -No `zenzic.toml` changes are required for MkDocs projects. The adapter discovery is fully -backwards-compatible: `engine = "mkdocs"` continues to work exactly as before. - -**One behavioural change:** an unknown `engine` string now falls back to `VanillaAdapter` (skip -orphan check) instead of `MkDocsAdapter`. If your `zenzic.toml` specifies a custom engine name -that mapped to MkDocs behaviour, add the explicit `engine = "mkdocs"` declaration. - -### From v0.4.0-alpha.1 - -The `--format` CLI flag is unchanged. The internal `format` parameter in `check_all`, `score`, -and `diff` Python APIs has been renamed to `output_format` — update any programmatic callers. - ---- - -## Checksums and verification - -```text -zenzic check all # self-dogfood: 7/7 OK -pytest # 706 passed, 0 failed -coverage # ≥ 80% branch (hard gate) -mutation score # 86.7% (242/279 killed on rules.py) -ruff check . # 0 violations -mypy src/ # 0 errors -mkdocs build --strict # 0 warnings -``` - ---- +- [ ] All verification steps in §§ 3.1–3.4 completed +- [ ] Documentation parity matrix §4 confirmed correct +- [ ] Exit code contract §5 verified in code +- [ ] Sandbox self-check §6 passed manually +- [ ] `INTERNAL_GLOSSARY.toml` reviewed and approved +- [ ] No open blocking issues -*Zenzic v0.4.0 is released under the Apache-2.0 license.* -*Built and maintained by [PythonWoods](https://github.com/PythonWoods).* +**Decision:** ☐ Approve rc1 promotion    ☐ Defer — open issues remain --- -Based in Italy 🇮🇹 | Committed to the craft of Python development. 
-Contact: +*"Una Release Candidate non è un premio per aver finito i task, è una promessa di +stabilità che facciamo all'utente."* +— Senior Tech Lead