Whateverdoa · Whateverdoa · May 20, 2026 · May 20, 2026 · cursor · May 20, 2026
diff --git a/.gitignore b/.gitignore
@@ -222,5 +222,9 @@ temp/
 *.tmp
 *pdf
 
+# Retention storage (originals/processed persisted at runtime)
+pdf_storage/
+test_pdf_storage/
+
 # UV package manager
 uv.lock
diff --git a/app/api/endpoints/pdf.py b/app/api/endpoints/pdf.py
@@ -7,6 +7,7 @@
 
 import base64
 import json
+import logging
 import os
 import shutil
 import tempfile
@@ -20,6 +21,7 @@
 from ...core.color_analyzer import ColorAnalyzer
 from ...core.pdf_analyzer import PDFAnalyzer
 from ...core.pdf_processor import PDFProcessor
+from ...utils.file_manager import FileManager
 from ...models.schemas import (
     ColorAnalysis,
     ErrorResponse,
@@ -43,6 +45,31 @@
 router.include_router(batch_router)
 router.include_router(repair_router)
 
+logger = logging.getLogger(__name__)
+
+
+def _retain_pdf_files(
+    input_path: str | None,
+    input_filename: str | None,
+    output_path: str | None,
+    reference: str | None,
+) -> None:
+    """Best-effort retention of the original upload and processed output.
+
+    Uses only the final path component of client-supplied names. Never raises.
+    """
+    try:
+        fm = FileManager()
+        if input_path and os.path.exists(input_path):
+            upload_name = os.path.basename(input_filename or "") or "original.pdf"
+            fm.save_original(input_path, upload_name)
+        if output_path and os.path.exists(output_path):
+            stem = os.path.basename(reference or "") or "processed"
+            fm.save_processed(output_path, f"{stem}.pdf")
+        fm.cleanup_old_files()  # prune files past the retention window
+    except Exception:  # retention must not break a successful response
+        logger.warning("PDF retention storage failed (non-fatal)", exc_info=True)
+
 
 @router.post("/analyze", response_model=PDFAnalysisResult)
 async def analyze_pdf(
@@ -163,6 +190,12 @@ async def process_pdf(
         result = processor.process_pdf(temp_input_path, job_config_obj)
 
         if result["success"]:
+            _retain_pdf_files(
+                temp_input_path,
+                pdf_file.filename,
+                result.get("output_path"),
+                job_config_obj.reference,
+            )
             return _build_process_response(
                 result, job_config_obj, pdf_file.filename, return_json
             )
@@ -254,6 +287,12 @@ async def process_pdf_with_json_file(
         result = processor.process_pdf(temp_input_path, job_config_obj)
 
         if result["success"]:
+            _retain_pdf_files(
+                temp_input_path,
+                pdf_file.filename,
+                result.get("output_path"),
+                job_config_obj.reference,
+            )
             return _build_process_response(
                 result, job_config_obj, pdf_file.filename, return_json
             )

diff --git a/app/utils/pdf_utils.py b/app/utils/pdf_utils.py
@@ -577,121 +577,6 @@ def get_pdf_info(pdf_path: str) -> Dict:
             print(f"Error getting PDF info: {e}")
             return {}
 
-    @staticmethod
-    def embed_all_fonts(pdf_path: str) -> bool:
-        """
-        Ensure all fonts are embedded using Ghostscript (if available).
-        Rewrites the PDF in place when successful.
-        """
-        gs = shutil.which('gs') or shutil.which('ghostscript')
-        if not gs:
-            return False
-        try:
-            tmp = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
-            tmp.close()
-            args = [
-                gs,
-                '-sDEVICE=pdfwrite',
-                '-dCompatibilityLevel=1.6',
-                '-dNOPAUSE',
-                '-dQUIET',
-                '-dBATCH',
-                '-dAutoRotatePages=/None',  # Preserve original page rotation
-                '-dDetectDuplicateImages=true',
-                '-dCompressFonts=true',
-                '-dSubsetFonts=true',
-                '-dEmbedAllFonts=true',
-                '-sOutputFile=' + tmp.name,
-                pdf_path,
-            ]
-            res = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-            if res.returncode == 0 and os.path.getsize(tmp.name) > 0:
-                os.replace(tmp.name, pdf_path)
-                return True
-        except Exception:
-            pass
-        return False
-
-    @staticmethod
-    def outline_all_fonts(pdf_path: str) -> bool:
-        """
-        Convert all text to vector outlines using Ghostscript (pdfwrite).
-        Rewrites the PDF in place when successful.
-        """
-        gs = shutil.which('gs') or shutil.which('ghostscript')
-        if not gs:
-            return False
-        try:
-            tmp = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
-            tmp.close()
-            args = [
-                gs,
-                '-sDEVICE=pdfwrite',
-                '-dCompatibilityLevel=1.6',
-                '-dNOPAUSE',
-                '-dQUIET',
-                '-dBATCH',
-                '-dAutoRotatePages=/None',  # Preserve original page rotation
-                '-dNoOutputFonts',
-                '-sOutputFile=' + tmp.name,
-                pdf_path,
-            ]
-            res = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-            if res.returncode == 0 and os.path.getsize(tmp.name) > 0:
-                os.replace(tmp.name, pdf_path)
-                return True
-        except Exception:
-            pass
-        return False
-
-    @staticmethod
-    def has_unembedded_fonts(pdf_path: str) -> bool:
-        """
-        Heuristic check: returns True if any font on any page lacks an embedded
-        font file (FontFile/FontFile2/FontFile3) in its FontDescriptor.
-        Also scans Form XObjects recursively.
-        """
-        try:
-            reader = PdfReader(pdf_path)
-
-            def check_resources(resources):
-                if resources is None:
-                    return False
-                font_dict = resources.get('/Font')
-                if font_dict:
-                    for font_name, font_obj in font_dict.items():
-                        try:
-                            font = font_obj.get_object() if hasattr(font_obj, 'get_object') else font_obj
-                            descriptor = font.get('/FontDescriptor')
-                            if descriptor:
-                                desc = descriptor.get_object() if hasattr(descriptor, 'get_object') else descriptor
-                                # Check for embedded font file
-                                if not any(key in desc for key in ['/FontFile', '/FontFile2', '/FontFile3']):
-                                    return True
-                        except Exception:
-                            pass
-                # Recurse into XObjects
-                xobject_dict = resources.get('/XObject')
-                if xobject_dict:
-                    for xobj_name, xobj in xobject_dict.items():
-                        try:
-                            xobj_resolved = xobj.get_object() if hasattr(xobj, 'get_object') else xobj
-                            if xobj_resolved.get('/Subtype') == '/Form':
-                                xobj_resources = xobj_resolved.get('/Resources')
-                                if check_resources(xobj_resources):
-                                    return True
-                        except Exception:
-                            pass
-                return False
-
-            for page in reader.pages:
-                resources = page.get('/Resources')
-                if check_resources(resources):
-                    return True
-            return False
-        except Exception:
-            return False
-
 
 # Module-level aliases for the PDFUtils staticmethods. The package __init__
 # imports these as bare names (`from .pdf_utils import merge_pdfs, ...`); without

diff --git a/tests/test_retention.py b/tests/test_retention.py
@@ -32,13 +32,6 @@ def setup_teardown():
         shutil.rmtree(TEST_STORAGE_DIR)
     settings.storage_dir = original_storage
 
-@pytest.mark.xfail(
-    reason="Retention storage is not wired into /api/pdf/process yet — the endpoint "
-    "processes PDFs but does not persist originals/processed to settings.storage_dir. "
-    "Pre-existing gap, tracked separately. The /process TypeError (tuple->FileResponse) "
-    "is fixed; only the storage-persistence half remains.",
-    strict=False,
-)
 def test_pdf_retention_flow():
     # 1. Create a dummy PDF
     pdf_content = b"%PDF-1.4\n1 0 obj\n<<\n/Type /Catalog\n/Pages 2 0 R\n>>\nendobj\n2 0 obj\n<<\n/Kids [3 0 R]\n/Count 1\n/Type /Pages\n>>\nendobj\n3 0 obj\n<<\n/Type /Page\n/MediaBox [0 0 595 842]\n/Resources <<\n/Font <<\n/F1 4 0 R\n>>\n>>\n/Parent 2 0 R\n/Contents 5 0 R\n>>\nendobj\n4 0 obj\n<<\n/Type /Font\n/Subtype /Type1\n/BaseFont /Helvetica\n>>\nendobj\n5 0 obj\n<<\n/Length 44\n>>\nstream\nBT\n/F1 24 Tf\n100 100 Td\n(Hello World) Tj\nET\nendstream\nendobj\nxref\n0 6\n0000000000 65535 f\n0000000010 00000 n\n0000000060 00000 n\n0000000157 00000 n\n0000000304 00000 n\n0000000392 00000 n\ntrailer\n<<\n/Size 6\n/Root 1 0 R\n>>\nstartxref\n487\n%%EOF"