Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -222,5 +222,9 @@ temp/
*.tmp
*pdf

# Retention storage (originals/processed persisted at runtime)
pdf_storage/
test_pdf_storage/

# UV package manager
uv.lock
39 changes: 39 additions & 0 deletions app/api/endpoints/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

import base64
import json
import logging
import os
import shutil
import tempfile
Expand All @@ -20,6 +21,7 @@
from ...core.color_analyzer import ColorAnalyzer
from ...core.pdf_analyzer import PDFAnalyzer
from ...core.pdf_processor import PDFProcessor
from ...utils.file_manager import FileManager
from ...models.schemas import (
ColorAnalysis,
ErrorResponse,
Expand All @@ -43,6 +45,31 @@
router.include_router(batch_router)
router.include_router(repair_router)

logger = logging.getLogger(__name__)


def _retain_pdf_files(
    input_path: str | None,
    input_filename: str,
    output_path: str | None,
    reference: str | None,
) -> None:
    """Best-effort retention of the original upload and processed output.

    Hands both files to ``FileManager`` for persistence and then asks it to
    prune old files. Never raises: retention must not break a successful
    processing response.

    Args:
        input_path: Path of the uploaded file on disk; skipped when falsy or
            when the path does not exist.
        input_filename: Client-supplied upload name. Untrusted; only its
            final path component is used as the stored name.
        output_path: Path of the processed file; skipped when falsy or when
            the path does not exist.
        reference: Client-supplied job reference used to name the processed
            copy. Untrusted; only its final path component is used, and
            ``"processed"`` is substituted when nothing usable remains.
    """

    def _leaf_name(candidate: str | None, fallback: str) -> str:
        # Both values come straight from the request. Left as-is, a value
        # such as "../../x" or "sub/dir/x" would be joined onto the storage
        # directory and could land outside the intended retention folder.
        # Normalise Windows separators first so "..\\x" is caught on POSIX.
        leaf = os.path.basename((candidate or "").replace("\\", "/")).strip()
        return fallback if leaf in ("", ".", "..") else leaf

    try:
        fm = FileManager()
        if input_path and os.path.exists(input_path):
            fm.save_original(input_path, _leaf_name(input_filename, "original.pdf"))
        if output_path and os.path.exists(output_path):
            fm.save_processed(
                output_path, f"{_leaf_name(reference, 'processed')}.pdf"
            )
        fm.cleanup_old_files()
    except Exception:
        # Deliberately broad: any storage failure is logged and ignored so the
        # caller can still return the processing result.
        logger.warning("PDF retention storage failed (non-fatal)", exc_info=True)


@router.post("/analyze", response_model=PDFAnalysisResult)
async def analyze_pdf(
Expand Down Expand Up @@ -163,6 +190,12 @@ async def process_pdf(
result = processor.process_pdf(temp_input_path, job_config_obj)

if result["success"]:
_retain_pdf_files(
temp_input_path,
pdf_file.filename,
result.get("output_path"),
job_config_obj.reference,
)
return _build_process_response(
result, job_config_obj, pdf_file.filename, return_json
)
Expand Down Expand Up @@ -254,6 +287,12 @@ async def process_pdf_with_json_file(
result = processor.process_pdf(temp_input_path, job_config_obj)

if result["success"]:
_retain_pdf_files(
temp_input_path,
pdf_file.filename,
result.get("output_path"),
job_config_obj.reference,
)
return _build_process_response(
result, job_config_obj, pdf_file.filename, return_json
)
Expand Down
115 changes: 0 additions & 115 deletions app/utils/pdf_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,121 +577,6 @@ def get_pdf_info(pdf_path: str) -> Dict:
print(f"Error getting PDF info: {e}")
return {}

@staticmethod
def embed_all_fonts(pdf_path: str) -> bool:
"""
Ensure all fonts are embedded using Ghostscript (if available).
Rewrites the PDF in place when successful.
"""
gs = shutil.which('gs') or shutil.which('ghostscript')
if not gs:
return False
try:
tmp = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
tmp.close()
args = [
gs,
'-sDEVICE=pdfwrite',
'-dCompatibilityLevel=1.6',
'-dNOPAUSE',
'-dQUIET',
'-dBATCH',
'-dAutoRotatePages=/None', # Preserve original page rotation
'-dDetectDuplicateImages=true',
'-dCompressFonts=true',
'-dSubsetFonts=true',
'-dEmbedAllFonts=true',
'-sOutputFile=' + tmp.name,
pdf_path,
]
res = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if res.returncode == 0 and os.path.getsize(tmp.name) > 0:
os.replace(tmp.name, pdf_path)
return True
except Exception:
pass
return False

@staticmethod
def outline_all_fonts(pdf_path: str) -> bool:
"""
Convert all text to vector outlines using Ghostscript (pdfwrite).
Rewrites the PDF in place when successful.
"""
gs = shutil.which('gs') or shutil.which('ghostscript')
if not gs:
return False
try:
tmp = tempfile.NamedTemporaryFile(suffix='.pdf', delete=False)
tmp.close()
args = [
gs,
'-sDEVICE=pdfwrite',
'-dCompatibilityLevel=1.6',
'-dNOPAUSE',
'-dQUIET',
'-dBATCH',
'-dAutoRotatePages=/None', # Preserve original page rotation
'-dNoOutputFonts',
'-sOutputFile=' + tmp.name,
pdf_path,
]
res = subprocess.run(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
if res.returncode == 0 and os.path.getsize(tmp.name) > 0:
os.replace(tmp.name, pdf_path)
return True
except Exception:
pass
return False

@staticmethod
def has_unembedded_fonts(pdf_path: str) -> bool:
"""
Heuristic check: returns True if any font on any page lacks an embedded
font file (FontFile/FontFile2/FontFile3) in its FontDescriptor.
Also scans Form XObjects recursively.
"""
try:
reader = PdfReader(pdf_path)

def check_resources(resources):
if resources is None:
return False
font_dict = resources.get('/Font')
if font_dict:
for font_name, font_obj in font_dict.items():
try:
font = font_obj.get_object() if hasattr(font_obj, 'get_object') else font_obj
descriptor = font.get('/FontDescriptor')
if descriptor:
desc = descriptor.get_object() if hasattr(descriptor, 'get_object') else descriptor
# Check for embedded font file
if not any(key in desc for key in ['/FontFile', '/FontFile2', '/FontFile3']):
return True
except Exception:
pass
# Recurse into XObjects
xobject_dict = resources.get('/XObject')
if xobject_dict:
for xobj_name, xobj in xobject_dict.items():
try:
xobj_resolved = xobj.get_object() if hasattr(xobj, 'get_object') else xobj
if xobj_resolved.get('/Subtype') == '/Form':
xobj_resources = xobj_resolved.get('/Resources')
if check_resources(xobj_resources):
return True
except Exception:
pass
return False

for page in reader.pages:
resources = page.get('/Resources')
if check_resources(resources):
return True
return False
except Exception:
return False


# Module-level aliases for the PDFUtils staticmethods. The package __init__
# imports these as bare names (`from .pdf_utils import merge_pdfs, ...`); without
Expand Down
7 changes: 0 additions & 7 deletions tests/test_retention.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,6 @@ def setup_teardown():
shutil.rmtree(TEST_STORAGE_DIR)
settings.storage_dir = original_storage

@pytest.mark.xfail(
reason="Retention storage is not wired into /api/pdf/process yet — the endpoint "
"processes PDFs but does not persist originals/processed to settings.storage_dir. "
"Pre-existing gap, tracked separately. The /process TypeError (tuple->FileResponse) "
"is fixed; only the storage-persistence half remains.",
strict=False,
)
def test_pdf_retention_flow():
# 1. Create a dummy PDF
pdf_content = b"%PDF-1.4\n1 0 obj\n<<\n/Type /Catalog\n/Pages 2 0 R\n>>\nendobj\n2 0 obj\n<<\n/Kids [3 0 R]\n/Count 1\n/Type /Pages\n>>\nendobj\n3 0 obj\n<<\n/Type /Page\n/MediaBox [0 0 595 842]\n/Resources <<\n/Font <<\n/F1 4 0 R\n>>\n>>\n/Parent 2 0 R\n/Contents 5 0 R\n>>\nendobj\n4 0 obj\n<<\n/Type /Font\n/Subtype /Type1\n/BaseFont /Helvetica\n>>\nendobj\n5 0 obj\n<<\n/Length 44\n>>\nstream\nBT\n/F1 24 Tf\n100 100 Td\n(Hello World) Tj\nET\nendstream\nendobj\nxref\n0 6\n0000000000 65535 f\n0000000010 00000 n\n0000000060 00000 n\n0000000157 00000 n\n0000000304 00000 n\n0000000392 00000 n\ntrailer\n<<\n/Size 6\n/Root 1 0 R\n>>\nstartxref\n487\n%%EOF"
Expand Down