Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
# Changelog

## Unreleased

### Added

- Plan and audit JSON now include a non-sensitive `blocker_summary` that
aggregates unpatchable text-object reasons, split kinds, blocker reasons, and
blocker fonts.

## v0.1.6 | Length Position Coverage

Maintenance release that broadens public length-changing alignment evidence
Expand Down
8 changes: 6 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,9 @@ whether a mutation is structurally patchable:
Audit JSON omits full decoded document text and literal search/replacement
strings. It records text object indexes, stream objects, font resources,
decoded lengths, short decoded-text hashes, match counts, patchability, and
split matches across text objects or font resources.
split matches across text objects or font resources. The `blocker_summary`
section aggregates unpatchable text-object reasons, split kinds, blocker
reasons, and blocker fonts without including decoded document text.

To write a reviewable mutation plan without editing the PDF:

Expand All @@ -91,7 +93,9 @@ Plan JSON is non-sensitive by default. It includes input fingerprint metadata,
font resources, expected candidate counts, patchable match entries, glyph CID
spans, replacement CIDs, and split candidates. Split candidates include ordered
segment metadata and font-specific blockers, but remain unpatchable until a
separate segmented-plan schema is implemented.
separate segmented-plan schema is implemented. The top-level `blocker_summary`
aggregates unpatchable match reasons, split kinds, blocker reasons, and blocker
fonts so unsupported plans can be triaged without scanning every candidate.

To apply a reviewed same-glyph-count plan later:

Expand Down
23 changes: 23 additions & 0 deletions ROADMAP.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,29 @@ inventory, and release evidence exist only to validate and explain that product.

## Forward Milestones

### M15 | Structured Unsupported-Case Summary

Status: DONE

Goal: Make unsupported split-object and mixed-font plans easier to triage
without exposing decoded document text.

Scope:
- Add a compact summary of unpatchable text-object reasons.
- Aggregate split candidate kinds and font-specific blocker reasons.
- Reuse the same non-sensitive summary in audit and plan JSON.

Acceptance Criteria:
- Plan and audit outputs expose blocker counts by reason, split kind, and font.
- Existing detailed segment metadata remains available for deeper review.
- Reports continue to omit decoded document text and literal
search/replacement strings.

Completed:
- Added top-level `blocker_summary` to plan and audit JSON.
- Covered cross-object, cross-font, and missing-glyph split blockers in tests.
- Documented the summary fields in README.

### M7 | Mutation Planner

Status: DONE
Expand Down
47 changes: 47 additions & 0 deletions pdf_mutation/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -614,6 +614,27 @@ def split_segments_for_match(
return segments, blockers


def blocker_summary(
    *,
    text_object_reasons: list[str],
    split_kinds: list[str],
    split_blockers: list[dict[str, object]],
) -> dict[str, object]:
    """Build the aggregate ``blocker_summary`` section of a report.

    Every ``*_counts`` entry is a plain dict mapping a value to how many
    times it occurred, with keys in sorted order.

    Args:
        text_object_reasons: One reason per unpatchable item to be counted.
        split_kinds: One split kind per split candidate/match to be counted.
        split_blockers: Blocker records; each must provide ``"reason"`` and
            ``"font"`` entries, which are stringified before counting.

    Returns:
        A dict with the four count tables, the total number of blocker
        records, and a ``privacy`` sub-dict whose flags are both ``False``.

    Raises:
        KeyError: If a blocker record lacks ``"reason"`` or ``"font"``.
    """

    def tally(values):
        # Count occurrences, then freeze into a key-sorted plain dict.
        return dict(sorted(Counter(values).items()))

    # Blocker tables are computed first, each with its own pass over the
    # records, before any of the other inputs are touched.
    reason_table = tally(str(entry["reason"]) for entry in split_blockers)
    font_table = tally(str(entry["font"]) for entry in split_blockers)

    summary: dict[str, object] = {}
    summary["text_object_reason_counts"] = tally(text_object_reasons)
    summary["split_kind_counts"] = tally(split_kinds)
    summary["split_blocker_reason_counts"] = reason_table
    summary["split_blocker_font_counts"] = font_table
    summary["split_blocker_count"] = len(split_blockers)
    summary["privacy"] = {
        "decoded_text_included": False,
        "literal_search_replacement_included": False,
    }
    return summary


def plan_qdf(
qdf: bytes,
search: str,
Expand Down Expand Up @@ -746,6 +767,12 @@ def plan_qdf(

patchable_matches = sum(1 for match in matches if match.patchable)
unpatchable_matches = sum(1 for match in matches if not match.patchable) + len(split_candidates)
unpatchable_reasons = [match.reason for match in matches if not match.patchable]
split_blockers = [
blocker
for candidate in split_candidates
for blocker in candidate.blockers
]
payload: dict[str, object] = {
"schema": "pdf-mutation-plan",
"schema_version": 1,
Expand All @@ -769,6 +796,11 @@ def plan_qdf(
},
"matches": [dataclasses.asdict(match) for match in matches],
"split_candidates": [dataclasses.asdict(candidate) for candidate in split_candidates],
"blocker_summary": blocker_summary(
text_object_reasons=unpatchable_reasons,
split_kinds=[candidate.split_kind for candidate in split_candidates],
split_blockers=split_blockers,
),
"privacy": {
"decoded_text_included": False,
"literal_search_replacement_included": False,
Expand Down Expand Up @@ -896,6 +928,16 @@ def audit_qdf(qdf: bytes, search: str, replacement: str, *, align: str) -> dict[
total_matches = sum(obj.match_count for obj in text_objects) + sum(match.match_count for match in split_matches)
patchable_matches = sum(obj.match_count for obj in text_objects if obj.patchable)
unpatchable_matches = total_matches - patchable_matches
unpatchable_reasons = [
obj.reason
for obj in text_objects
if obj.match_count and not obj.patchable
]
split_blockers = [
blocker
for match in split_matches
for blocker in match.blockers
]
return {
"version": __version__,
"mode": "audit",
Expand All @@ -915,6 +957,11 @@ def audit_qdf(qdf: bytes, search: str, replacement: str, *, align: str) -> dict[
"split_match_count": sum(match.match_count for match in split_matches),
"text_objects": [dataclasses.asdict(obj) for obj in text_objects],
"split_matches": [dataclasses.asdict(match) for match in split_matches],
"blocker_summary": blocker_summary(
text_object_reasons=unpatchable_reasons,
split_kinds=[match.split_kind for match in split_matches],
split_blockers=split_blockers,
),
"privacy": {
"decoded_text_included": False,
"literal_search_replacement_included": False,
Expand Down
85 changes: 85 additions & 0 deletions tests/test_pdf_glyph_replace.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,20 @@ def test_audit_qdf_reports_all_text_objects_and_split_font_match_without_text(se
[(1, "F4", 0, 2), (2, "F5", 0, 2)],
)
self.assertEqual(payload["split_matches"][0]["blockers"], [])
self.assertEqual(
payload["blocker_summary"],
{
"text_object_reason_counts": {},
"split_kind_counts": {"cross_text_object_and_font": 1},
"split_blocker_reason_counts": {},
"split_blocker_font_counts": {},
"split_blocker_count": 0,
"privacy": {
"decoded_text_included": False,
"literal_search_replacement_included": False,
},
},
)
self.assertEqual([obj["font"] for obj in payload["text_objects"]], ["F4", "F5", "F5"])
self.assertEqual([obj["match_count"] for obj in payload["text_objects"]], [0, 0, 0])
self.assertFalse(payload["privacy"]["decoded_text_included"])
Expand All @@ -474,6 +488,36 @@ def test_audit_qdf_reports_patchable_mixed_font_object_match(self):
self.assertTrue(payload["text_objects"][1]["patchable"])
self.assertEqual(payload["text_objects"][1]["alignment_contract"], "exact glyph-count replacement preserves existing layout operators")

def test_audit_qdf_summarizes_split_blockers_without_literal_text(self):
    """Audit summary counts a split blocker and leaks neither literal."""
    search_text = "3807"
    replacement_text = "8304"

    # F5 gets a copy of the default CID map with the "4" entry removed.
    reduced_f5_cids = dict(f.DEFAULT_CIDS)
    reduced_f5_cids.pop("4")

    leading_object = f.text_object("38", font="F4", x="100", y="10")
    trailing_object = f.text_object(
        "07", font="F5", x="140", y="10", cid_map=reduced_f5_cids
    )
    qdf = mixed_font_qdf(
        leading_object,
        trailing_object,
        f5_cid_map=reduced_f5_cids,
    )

    payload = p.audit_qdf(qdf, search_text, replacement_text, align="exact")

    expected_summary = {
        "text_object_reason_counts": {},
        "split_kind_counts": {"cross_text_object_and_font": 1},
        "split_blocker_reason_counts": {
            "replacement character(s) not present in active font": 1
        },
        "split_blocker_font_counts": {"F5": 1},
        "split_blocker_count": 1,
        "privacy": {
            "decoded_text_included": False,
            "literal_search_replacement_included": False,
        },
    }
    self.assertEqual(payload["blocker_summary"], expected_summary)

    # Neither the search nor the replacement literal may appear anywhere
    # in the stringified payload.
    rendered = str(payload)
    for literal in (search_text, replacement_text):
        self.assertNotIn(literal, rendered)

def test_audit_exit_status_distinguishes_patchable_missing_and_unpatchable(self):
patchable = {
"total_matches": 1,
Expand Down Expand Up @@ -504,6 +548,20 @@ def test_plan_qdf_records_patchable_exact_match_without_literal_text(self):
self.assertEqual(payload["expected"]["patchable_matches"], 1)
self.assertEqual(payload["expected"]["unpatchable_candidates"], 0)
self.assertEqual(payload["expected"]["split_candidates"], 0)
self.assertEqual(
payload["blocker_summary"],
{
"text_object_reason_counts": {},
"split_kind_counts": {},
"split_blocker_reason_counts": {},
"split_blocker_font_counts": {},
"split_blocker_count": 0,
"privacy": {
"decoded_text_included": False,
"literal_search_replacement_included": False,
},
},
)
self.assertRegex(payload["plan_id"], r"^[0-9a-f]{16}$")
self.assertFalse(payload["privacy"]["decoded_text_included"])
self.assertFalse(payload["privacy"]["literal_search_replacement_included"])
Expand Down Expand Up @@ -550,6 +608,11 @@ def test_plan_qdf_records_split_candidate_as_unpatchable(self):
[(1, "F4", 0, 2), (2, "F5", 0, 2)],
)
self.assertEqual(payload["split_candidates"][0]["blockers"], [])
self.assertEqual(
payload["blocker_summary"]["split_kind_counts"],
{"cross_text_object_and_font": 1},
)
self.assertEqual(payload["blocker_summary"]["split_blocker_count"], 0)

def test_plan_qdf_records_adjacent_same_font_split_as_unpatchable_segmented_candidate(self):
qdf = f.qdf_document(
Expand All @@ -567,6 +630,8 @@ def test_plan_qdf_records_adjacent_same_font_split_as_unpatchable_segmented_cand
self.assertEqual(split["fonts"], ["F4", "F4"])
self.assertEqual(split["blockers"], [])
self.assertTrue(all(segment["replacement_glyphs_available"] for segment in split["segments"]))
self.assertEqual(payload["blocker_summary"]["split_kind_counts"], {"cross_text_object": 1})
self.assertEqual(payload["blocker_summary"]["split_blocker_count"], 0)

def test_plan_qdf_records_missing_replacement_glyph_without_literal_replacement(self):
qdf = f.synthetic_qdf("3807")
Expand All @@ -580,6 +645,10 @@ def test_plan_qdf_records_missing_replacement_glyph_without_literal_replacement(
self.assertFalse(match["patchable"])
self.assertEqual(match["reason"], "replacement character(s) not present in active font")
self.assertEqual(match["replacement_cids"], [])
self.assertEqual(
payload["blocker_summary"]["text_object_reason_counts"],
{"replacement character(s) not present in active font": 1},
)
self.assertNotIn("3807", str(payload))
self.assertNotIn("ZZZZ", str(payload))

Expand Down Expand Up @@ -611,6 +680,22 @@ def test_plan_qdf_records_split_blocker_when_replacement_glyph_missing_in_one_fo
}
],
)
self.assertEqual(
payload["blocker_summary"],
{
"text_object_reason_counts": {},
"split_kind_counts": {"cross_text_object_and_font": 1},
"split_blocker_reason_counts": {
"replacement character(s) not present in active font": 1
},
"split_blocker_font_counts": {"F5": 1},
"split_blocker_count": 1,
"privacy": {
"decoded_text_included": False,
"literal_search_replacement_included": False,
},
},
)
self.assertNotIn("3807", str(payload))
self.assertNotIn("8304", str(payload))

Expand Down