From 311fc504356b1187d807946829ef5793dc8aecc0 Mon Sep 17 00:00:00 2001 From: PDF Dieline Processor Date: Wed, 20 May 2026 17:26:53 +0200 Subject: [PATCH 1/2] fix: remove duplicate font methods in PDFUtils embed_all_fonts, outline_all_fonts and has_unembedded_fonts were each defined twice in the class. Python kept the second (later) copies, which lacked the q/Q graphics-state imbalance fix and the more thorough font-descriptor scan. The module-level aliases therefore bound to the inferior versions. Removed the second set so the q/Q-fixing implementations win. Co-Authored-By: Claude Opus 4.7 --- app/utils/pdf_utils.py | 115 ----------------------------------------- 1 file changed, 115 deletions(-) diff --git a/app/utils/pdf_utils.py b/app/utils/pdf_utils.py index 16240a2..974f3aa 100644 --- a/app/utils/pdf_utils.py +++ b/app/utils/pdf_utils.py @@ -577,121 +577,6 @@ def get_pdf_info(pdf_path: str) -> Dict: print(f"Error getting PDF info: {e}") return {} - @staticmethod - def embed_all_fonts(pdf_path: str) -> bool: - """ - Ensure all fonts are embedded using Ghostscript (if available). - Rewrites the PDF in place when successful. - """ - gs = shutil.which('gs') or shutil.which('ghostscript') - if not gs: - return False - try: - tmp = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) - tmp.close() - args = [ - gs, - '-sDEVICE=pdfwrite', - '-dCompatibilityLevel=1.6', - '-dNOPAUSE', - '-dQUIET', - '-dBATCH', - '-dAutoRotatePages=/None', # Preserve original page rotation - '-dDetectDuplicateImages=true', - '-dCompressFonts=true', - '-dSubsetFonts=true', - '-dEmbedAllFonts=true', - '-sOutputFile=' + tmp.name, - pdf_path, - ] - res = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - if res.returncode == 0 and os.path.getsize(tmp.name) > 0: - os.replace(tmp.name, pdf_path) - return True - except Exception: - pass - return False - - @staticmethod - def outline_all_fonts(pdf_path: str) -> bool: - """ - Convert all text to vector outlines using Ghostscript (pdfwrite). - Rewrites the PDF in place when successful. - """ - gs = shutil.which('gs') or shutil.which('ghostscript') - if not gs: - return False - try: - tmp = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) - tmp.close() - args = [ - gs, - '-sDEVICE=pdfwrite', - '-dCompatibilityLevel=1.6', - '-dNOPAUSE', - '-dQUIET', - '-dBATCH', - '-dAutoRotatePages=/None', # Preserve original page rotation - '-dNoOutputFonts', - '-sOutputFile=' + tmp.name, - pdf_path, - ] - res = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE) - if res.returncode == 0 and os.path.getsize(tmp.name) > 0: - os.replace(tmp.name, pdf_path) - return True - except Exception: - pass - return False - - @staticmethod - def has_unembedded_fonts(pdf_path: str) -> bool: - """ - Heuristic check: returns True if any font on any page lacks an embedded - font file (FontFile/FontFile2/FontFile3) in its FontDescriptor. - Also scans Form XObjects recursively. - """ - try: - reader = PdfReader(pdf_path) - - def check_resources(resources): - if resources is None: - return False - font_dict = resources.get('/Font') - if font_dict: - for font_name, font_obj in font_dict.items(): - try: - font = font_obj.get_object() if hasattr(font_obj, 'get_object') else font_obj - descriptor = font.get('/FontDescriptor') - if descriptor: - desc = descriptor.get_object() if hasattr(descriptor, 'get_object') else descriptor - # Check for embedded font file - if not any(key in desc for key in ['/FontFile', '/FontFile2', '/FontFile3']): - return True - except Exception: - pass - # Recurse into XObjects - xobject_dict = resources.get('/XObject') - if xobject_dict: - for xobj_name, xobj in xobject_dict.items(): - try: - xobj_resolved = xobj.get_object() if hasattr(xobj, 'get_object') else xobj - if xobj_resolved.get('/Subtype') == '/Form': - xobj_resources = xobj_resolved.get('/Resources') - if check_resources(xobj_resources): - return True - except Exception: - pass - return False - - for page in reader.pages: - resources = page.get('/Resources') - if check_resources(resources): - return True - return False - except Exception: - return False - # Module-level aliases for the PDFUtils staticmethods. The package __init__ # imports these as bare names (`from .pdf_utils import merge_pdfs, ...`); without From d97ec059c1ee0cfcb5ca06900b14fc8d66785714 Mon Sep 17 00:00:00 2001 From: PDF Dieline Processor Date: Wed, 20 May 2026 17:26:53 +0200 Subject: [PATCH 2/2] feat: wire retention storage into /process endpoints /api/pdf/process and /process-with-json-file now persist the original upload and the processed output into settings.storage_dir via FileManager, and prune files past the retention window. Retention is best-effort: failures are logged and never break a successful processing response. - Add _retain_pdf_files helper (best-effort save_original + save_processed + cleanup_old_files; never raises). - Un-xfail test_pdf_retention_flow now that the storage half is implemented. - gitignore pdf_storage/ and test_pdf_storage/ so retained PDFs are never accidentally committed. Co-Authored-By: Claude Opus 4.7 --- .gitignore | 4 ++++ app/api/endpoints/pdf.py | 39 +++++++++++++++++++++++++++++++++++++++ tests/test_retention.py | 7 ------- 3 files changed, 43 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index 5faca1e..b0b148c 100644 --- a/.gitignore +++ b/.gitignore @@ -222,5 +222,9 @@ temp/ *.tmp *pdf +# Retention storage (originals/processed persisted at runtime) +pdf_storage/ +test_pdf_storage/ + # UV package manager uv.lock diff --git a/app/api/endpoints/pdf.py b/app/api/endpoints/pdf.py index f2e8a8c..27aec4b 100644 --- a/app/api/endpoints/pdf.py +++ b/app/api/endpoints/pdf.py @@ -7,6 +7,7 @@ import base64 import json +import logging import os import shutil import tempfile @@ -20,6 +21,7 @@ from ...core.color_analyzer import ColorAnalyzer from ...core.pdf_analyzer import PDFAnalyzer from ...core.pdf_processor import PDFProcessor +from ...utils.file_manager import FileManager from ...models.schemas import ( ColorAnalysis, ErrorResponse, @@ -43,6 +45,31 @@ router.include_router(batch_router) router.include_router(repair_router) +logger = logging.getLogger(__name__) + + +def _retain_pdf_files( + input_path: str | None, + input_filename: str, + output_path: str | None, + reference: str | None, +) -> None: + """Best-effort retention of the original upload and processed output. + + Persists both into ``settings.storage_dir`` and prunes files past the + retention window. Never raises — retention must not break a successful + processing response. + """ + try: + fm = FileManager() + if input_path and os.path.exists(input_path): + fm.save_original(input_path, input_filename) + if output_path and os.path.exists(output_path): + fm.save_processed(output_path, f"{reference or 'processed'}.pdf") + fm.cleanup_old_files() + except Exception: + logger.warning("PDF retention storage failed (non-fatal)", exc_info=True) + @router.post("/analyze", response_model=PDFAnalysisResult) async def analyze_pdf( @@ -163,6 +190,12 @@ async def process_pdf( result = processor.process_pdf(temp_input_path, job_config_obj) if result["success"]: + _retain_pdf_files( + temp_input_path, + pdf_file.filename, + result.get("output_path"), + job_config_obj.reference, + ) return _build_process_response( result, job_config_obj, pdf_file.filename, return_json ) @@ -254,6 +287,12 @@ async def process_pdf_with_json_file( result = processor.process_pdf(temp_input_path, job_config_obj) if result["success"]: + _retain_pdf_files( + temp_input_path, + pdf_file.filename, + result.get("output_path"), + job_config_obj.reference, + ) return _build_process_response( result, job_config_obj, pdf_file.filename, return_json ) diff --git a/tests/test_retention.py b/tests/test_retention.py index 6c90247..b751137 100644 --- a/tests/test_retention.py +++ b/tests/test_retention.py @@ -32,13 +32,6 @@ def setup_teardown(): shutil.rmtree(TEST_STORAGE_DIR) settings.storage_dir = original_storage -@pytest.mark.xfail( - reason="Retention storage is not wired into /api/pdf/process yet — the endpoint " - "processes PDFs but does not persist originals/processed to settings.storage_dir. " - "Pre-existing gap, tracked separately. The /process TypeError (tuple->FileResponse) " - "is fixed; only the storage-persistence half remains.", - strict=False, -) def test_pdf_retention_flow(): # 1. Create a dummy PDF pdf_content = b"%PDF-1.4\n1 0 obj\n<<\n/Type /Catalog\n/Pages 2 0 R\n>>\nendobj\n2 0 obj\n<<\n/Kids [3 0 R]\n/Count 1\n/Type /Pages\n>>\nendobj\n3 0 obj\n<<\n/Type /Page\n/MediaBox [0 0 595 842]\n/Resources <<\n/Font <<\n/F1 4 0 R\n>>\n>>\n/Parent 2 0 R\n/Contents 5 0 R\n>>\nendobj\n4 0 obj\n<<\n/Type /Font\n/Subtype /Type1\n/BaseFont /Helvetica\n>>\nendobj\n5 0 obj\n<<\n/Length 44\n>>\nstream\nBT\n/F1 24 Tf\n100 100 Td\n(Hello World) Tj\nET\nendstream\nendobj\nxref\n0 6\n0000000000 65535 f\n0000000010 00000 n\n0000000060 00000 n\n0000000157 00000 n\n0000000304 00000 n\n0000000392 00000 n\ntrailer\n<<\n/Size 6\n/Root 1 0 R\n>>\nstartxref\n487\n%%EOF"