From 134f11f438b746a3f0f0304905e4a7744cfcc94b Mon Sep 17 00:00:00 2001 From: Eric W Cochran Date: Thu, 21 May 2026 07:30:52 -0500 Subject: [PATCH] Summarize unsupported blockers --- CHANGELOG.md | 8 ++++ README.md | 8 +++- ROADMAP.md | 23 +++++++++ pdf_mutation/engine.py | 47 ++++++++++++++++++ tests/test_pdf_glyph_replace.py | 85 +++++++++++++++++++++++++++++++++ 5 files changed, 169 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c4b521e..c7ba037 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,13 @@ # Changelog +## Unreleased + +### Added + +- Plan and audit JSON now include a non-sensitive `blocker_summary` that + aggregates unpatchable text-object reasons, split kinds, blocker reasons, and + blocker fonts. + ## v0.1.6 | Length Position Coverage Maintenance release that broadens public length-changing alignment evidence diff --git a/README.md b/README.md index d3a030e..8811d0b 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,9 @@ whether a mutation is structurally patchable: Audit JSON omits full decoded document text and literal search/replacement strings. It records text object indexes, stream objects, font resources, decoded lengths, short decoded-text hashes, match counts, patchability, and -split matches across text objects or font resources. +split matches across text objects or font resources. The `blocker_summary` +section aggregates unpatchable text-object reasons, split kinds, blocker +reasons, and blocker fonts without including decoded document text. To write a reviewable mutation plan without editing the PDF: @@ -91,7 +93,9 @@ Plan JSON is non-sensitive by default. It includes input fingerprint metadata, font resources, expected candidate counts, patchable match entries, glyph CID spans, replacement CIDs, and split candidates. Split candidates include ordered segment metadata and font-specific blockers, but remain unpatchable until a -separate segmented-plan schema is implemented. +separate segmented-plan schema is implemented. The top-level `blocker_summary` +aggregates split kinds and blocker reasons so unsupported plans can be triaged +without scanning every candidate. To apply a reviewed same-glyph-count plan later: diff --git a/ROADMAP.md b/ROADMAP.md index 6c037cf..a8be7d5 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -51,6 +51,29 @@ inventory, and release evidence exist only to validate and explain that product. ## Forward Milestones +### M15 | Structured Unsupported-Case Summary + +Status: DONE + +Goal: Make unsupported split-object and mixed-font plans easier to triage +without exposing decoded document text. + +Scope: +- Add a compact summary of unpatchable text-object reasons. +- Aggregate split candidate kinds and font-specific blocker reasons. +- Reuse the same non-sensitive summary in audit and plan JSON. + +Acceptance Criteria: +- Plan and audit outputs expose blocker counts by reason, split kind, and font. +- Existing detailed segment metadata remains available for deeper review. +- Reports continue to omit decoded document text and literal + search/replacement strings. + +Completed: +- Added top-level `blocker_summary` to plan and audit JSON. +- Covered cross-object, cross-font, and missing-glyph split blockers in tests. +- Documented the summary fields in README. + ### M7 | Mutation Planner Status: DONE diff --git a/pdf_mutation/engine.py b/pdf_mutation/engine.py index 4b1267f..731ae1c 100644 --- a/pdf_mutation/engine.py +++ b/pdf_mutation/engine.py @@ -614,6 +614,27 @@ def split_segments_for_match( return segments, blockers +def blocker_summary( + *, + text_object_reasons: list[str], + split_kinds: list[str], + split_blockers: list[dict[str, object]], +) -> dict[str, object]: + blocker_reason_counts = Counter(str(blocker["reason"]) for blocker in split_blockers) + blocker_font_counts = Counter(str(blocker["font"]) for blocker in split_blockers) + return { + "text_object_reason_counts": dict(sorted(Counter(text_object_reasons).items())), + "split_kind_counts": dict(sorted(Counter(split_kinds).items())), + "split_blocker_reason_counts": dict(sorted(blocker_reason_counts.items())), + "split_blocker_font_counts": dict(sorted(blocker_font_counts.items())), + "split_blocker_count": len(split_blockers), + "privacy": { + "decoded_text_included": False, + "literal_search_replacement_included": False, + }, + } + + def plan_qdf( qdf: bytes, search: str, @@ -746,6 +767,12 @@ def plan_qdf( patchable_matches = sum(1 for match in matches if match.patchable) unpatchable_matches = sum(1 for match in matches if not match.patchable) + len(split_candidates) + unpatchable_reasons = [match.reason for match in matches if not match.patchable] + split_blockers = [ + blocker + for candidate in split_candidates + for blocker in candidate.blockers + ] payload: dict[str, object] = { "schema": "pdf-mutation-plan", "schema_version": 1, @@ -769,6 +796,11 @@ def plan_qdf( }, "matches": [dataclasses.asdict(match) for match in matches], "split_candidates": [dataclasses.asdict(candidate) for candidate in split_candidates], + "blocker_summary": blocker_summary( + text_object_reasons=unpatchable_reasons, + split_kinds=[candidate.split_kind for candidate in split_candidates], + split_blockers=split_blockers, + ), "privacy": { "decoded_text_included": False, "literal_search_replacement_included": False, @@ -896,6 +928,16 @@ def audit_qdf(qdf: bytes, search: str, replacement: str, *, align: str) -> dict[ total_matches = sum(obj.match_count for obj in text_objects) + sum(match.match_count for match in split_matches) patchable_matches = sum(obj.match_count for obj in text_objects if obj.patchable) unpatchable_matches = total_matches - patchable_matches + unpatchable_reasons = [ + obj.reason + for obj in text_objects + if obj.match_count and not obj.patchable + ] + split_blockers = [ + blocker + for match in split_matches + for blocker in match.blockers + ] return { "version": __version__, "mode": "audit", @@ -915,6 +957,11 @@ def audit_qdf(qdf: bytes, search: str, replacement: str, *, align: str) -> dict[ "split_match_count": sum(match.match_count for match in split_matches), "text_objects": [dataclasses.asdict(obj) for obj in text_objects], "split_matches": [dataclasses.asdict(match) for match in split_matches], + "blocker_summary": blocker_summary( + text_object_reasons=unpatchable_reasons, + split_kinds=[match.split_kind for match in split_matches], + split_blockers=split_blockers, + ), "privacy": { "decoded_text_included": False, "literal_search_replacement_included": False, diff --git a/tests/test_pdf_glyph_replace.py b/tests/test_pdf_glyph_replace.py index 3ec7f9c..100c4b7 100644 --- a/tests/test_pdf_glyph_replace.py +++ b/tests/test_pdf_glyph_replace.py @@ -451,6 +451,20 @@ def test_audit_qdf_reports_all_text_objects_and_split_font_match_without_text(se [(1, "F4", 0, 2), (2, "F5", 0, 2)], ) self.assertEqual(payload["split_matches"][0]["blockers"], []) + self.assertEqual( + payload["blocker_summary"], + { + "text_object_reason_counts": {}, + "split_kind_counts": {"cross_text_object_and_font": 1}, + "split_blocker_reason_counts": {}, + "split_blocker_font_counts": {}, + "split_blocker_count": 0, + "privacy": { + "decoded_text_included": False, + "literal_search_replacement_included": False, + }, + }, + ) self.assertEqual([obj["font"] for obj in payload["text_objects"]], ["F4", "F5", "F5"]) self.assertEqual([obj["match_count"] for obj in payload["text_objects"]], [0, 0, 0]) self.assertFalse(payload["privacy"]["decoded_text_included"]) @@ -474,6 +488,36 @@ def test_audit_qdf_reports_patchable_mixed_font_object_match(self): self.assertTrue(payload["text_objects"][1]["patchable"]) self.assertEqual(payload["text_objects"][1]["alignment_contract"], "exact glyph-count replacement preserves existing layout operators") + def test_audit_qdf_summarizes_split_blockers_without_literal_text(self): + f5_map = dict(f.DEFAULT_CIDS) + del f5_map["4"] + qdf = mixed_font_qdf( + f.text_object("38", font="F4", x="100", y="10"), + f.text_object("07", font="F5", x="140", y="10", cid_map=f5_map), + f5_cid_map=f5_map, + ) + + payload = p.audit_qdf(qdf, "3807", "8304", align="exact") + + self.assertEqual( + payload["blocker_summary"], + { + "text_object_reason_counts": {}, + "split_kind_counts": {"cross_text_object_and_font": 1}, + "split_blocker_reason_counts": { + "replacement character(s) not present in active font": 1 + }, + "split_blocker_font_counts": {"F5": 1}, + "split_blocker_count": 1, + "privacy": { + "decoded_text_included": False, + "literal_search_replacement_included": False, + }, + }, + ) + self.assertNotIn("3807", str(payload)) + self.assertNotIn("8304", str(payload)) + def test_audit_exit_status_distinguishes_patchable_missing_and_unpatchable(self): patchable = { "total_matches": 1, @@ -504,6 +548,20 @@ def test_plan_qdf_records_patchable_exact_match_without_literal_text(self): self.assertEqual(payload["expected"]["patchable_matches"], 1) self.assertEqual(payload["expected"]["unpatchable_candidates"], 0) self.assertEqual(payload["expected"]["split_candidates"], 0) + self.assertEqual( + payload["blocker_summary"], + { + "text_object_reason_counts": {}, + "split_kind_counts": {}, + "split_blocker_reason_counts": {}, + "split_blocker_font_counts": {}, + "split_blocker_count": 0, + "privacy": { + "decoded_text_included": False, + "literal_search_replacement_included": False, + }, + }, + ) self.assertRegex(payload["plan_id"], r"^[0-9a-f]{16}$") self.assertFalse(payload["privacy"]["decoded_text_included"]) self.assertFalse(payload["privacy"]["literal_search_replacement_included"]) @@ -550,6 +608,11 @@ def test_plan_qdf_records_split_candidate_as_unpatchable(self): [(1, "F4", 0, 2), (2, "F5", 0, 2)], ) self.assertEqual(payload["split_candidates"][0]["blockers"], []) + self.assertEqual( + payload["blocker_summary"]["split_kind_counts"], + {"cross_text_object_and_font": 1}, + ) + self.assertEqual(payload["blocker_summary"]["split_blocker_count"], 0) def test_plan_qdf_records_adjacent_same_font_split_as_unpatchable_segmented_candidate(self): qdf = f.qdf_document( @@ -567,6 +630,8 @@ def test_plan_qdf_records_adjacent_same_font_split_as_unpatchable_segmented_cand self.assertEqual(split["fonts"], ["F4", "F4"]) self.assertEqual(split["blockers"], []) self.assertTrue(all(segment["replacement_glyphs_available"] for segment in split["segments"])) + self.assertEqual(payload["blocker_summary"]["split_kind_counts"], {"cross_text_object": 1}) + self.assertEqual(payload["blocker_summary"]["split_blocker_count"], 0) def test_plan_qdf_records_missing_replacement_glyph_without_literal_replacement(self): qdf = f.synthetic_qdf("3807") @@ -580,6 +645,10 @@ def test_plan_qdf_records_missing_replacement_glyph_without_literal_replacement( self.assertFalse(match["patchable"]) self.assertEqual(match["reason"], "replacement character(s) not present in active font") self.assertEqual(match["replacement_cids"], []) + self.assertEqual( + payload["blocker_summary"]["text_object_reason_counts"], + {"replacement character(s) not present in active font": 1}, + ) self.assertNotIn("3807", str(payload)) self.assertNotIn("ZZZZ", str(payload)) @@ -611,6 +680,22 @@ def test_plan_qdf_records_split_blocker_when_replacement_glyph_missing_in_one_fo } ], ) + self.assertEqual( + payload["blocker_summary"], + { + "text_object_reason_counts": {}, + "split_kind_counts": {"cross_text_object_and_font": 1}, + "split_blocker_reason_counts": { + "replacement character(s) not present in active font": 1 + }, + "split_blocker_font_counts": {"F5": 1}, + "split_blocker_count": 1, + "privacy": { + "decoded_text_included": False, + "literal_search_replacement_included": False, + }, + }, + ) self.assertNotIn("3807", str(payload)) self.assertNotIn("8304", str(payload))