Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 104 additions & 1 deletion app/my_practice/management/commands/compress_media.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,14 @@
./dev.py manage compress_media
./dev.py manage compress_media --path taxes/2025
./dev.py manage compress_media --dry-run
./dev.py manage compress_media --force # bypass size threshold for images
./dev.py manage compress_media --rotate-pages 180 --path clients/ml # fix upside-down PDFs
"""

import io
from pathlib import Path

import pypdf
from django.conf import settings
from django.core.management.base import BaseCommand, CommandError

Expand All @@ -27,6 +31,26 @@
_PROCESSABLE = _IMAGE_EXTENSIONS | _PDF_EXTENSIONS


def _rotate_pdf_pages(path: Path, degrees: int) -> bool:
"""
Set /Rotate on every page of the PDF to the given degrees.
Writes the file in-place. Returns True if the file was modified.
"""
data = path.read_bytes()
reader = pypdf.PdfReader(io.BytesIO(data))
writer = pypdf.PdfWriter()
writer.append(reader)
for page in writer.pages:
page[pypdf.generic.NameObject("/Rotate")] = pypdf.generic.NumberObject(degrees)
buf = io.BytesIO()
writer.write(buf)
result = buf.getvalue()
if result != data:
path.write_bytes(result)
return True
return False


class Command(BaseCommand):
help = "Compress existing media files (images via Pillow, PDFs via Ghostscript)"

Expand All @@ -41,6 +65,21 @@ def add_arguments(self, parser):
action="store_true",
help="List files that would be processed without modifying them",
)
parser.add_argument(
"--force",
action="store_true",
help="Process all images regardless of size (use to fix EXIF orientation on already-compressed files)",
)
parser.add_argument(
"--rotate-pages",
type=int,
metavar="DEGREES",
help=(
"Set /Rotate on every page of all PDFs under --path to DEGREES "
"(e.g. 180 to fix upside-down scans whose rotation metadata was stripped). "
"Skips compression; only modifies PDFs."
),
)

def handle(self, *args, **options):
media_root = Path(settings.MEDIA_ROOT)
Expand All @@ -50,9 +89,20 @@ def handle(self, *args, **options):
if not root.exists():
raise CommandError(f"Path not found: {root}")

rotate_degrees = options.get("rotate_pages")

if rotate_degrees is not None:
self._handle_rotate(root, media_root, rotate_degrees, options["dry_run"])
return

dry_run = options["dry_run"]
force = options["force"]
if dry_run:
self.stdout.write(self.style.WARNING("DRY RUN — no files will be modified\n"))
if force:
self.stdout.write(
self.style.WARNING("FORCE mode — size threshold bypassed for images\n")
)

total_files = 0
total_compressed = 0
Expand Down Expand Up @@ -82,7 +132,7 @@ def handle(self, *args, **options):

try:
if ext in _IMAGE_EXTENSIONS:
saved = compress_image_inplace(str(filepath))
saved = compress_image_inplace(str(filepath), force=force)
else:
saved = compress_pdf_inplace(str(filepath))

Expand Down Expand Up @@ -113,6 +163,9 @@ def handle(self, *args, **options):
if dry_run:
self.stdout.write(f"\n{total_files} processable files found.")
self.stdout.write("Re-run without --dry-run to compress them.")
self.stdout.write(
"Add --force to also reprocess small images (e.g. to fix EXIF orientation)."
)
return

saved_kb = total_saved / 1024
Expand All @@ -130,3 +183,53 @@ def handle(self, *args, **options):

if total_errors:
self.stdout.write(self.style.WARNING("Check logs above for error details."))

def _handle_rotate(self, root: Path, media_root: Path, degrees: int, dry_run: bool) -> None:
    """
    Apply a fixed /Rotate value to every PDF under ``root``.

    ``root`` may be a single file or a directory; directories are walked
    recursively in sorted order and only files whose suffix is in
    ``_PDF_EXTENSIONS`` are considered. Paths are reported relative to
    ``media_root``.

    With ``dry_run`` the matching PDFs are only listed. Otherwise each one is
    passed to ``_rotate_pdf_pages``; a failure on one file is reported and
    counted, and processing continues with the next file.

    Raises CommandError if ``degrees`` is not a multiple of 90.
    """
    # Fail fast: an invalid angle would otherwise be attempted on every file.
    if degrees % 90 != 0:
        raise CommandError(f"--rotate-pages must be a multiple of 90, got {degrees}")

    rel_root = root.relative_to(media_root)
    if dry_run:
        banner = f"DRY RUN — would set /Rotate {degrees} on all PDF pages under {rel_root}\n"
    else:
        banner = f"Setting /Rotate {degrees} on all PDF pages under {rel_root}\n"
    self.stdout.write(self.style.WARNING(banner))

    modified = 0
    skipped = 0  # in dry-run mode this doubles as the "found" counter
    errors = 0

    candidates = [root] if root.is_file() else sorted(root.rglob("*"))
    for filepath in candidates:
        if not filepath.is_file() or filepath.suffix.lower() not in _PDF_EXTENSIONS:
            continue
        rel = filepath.relative_to(media_root)

        if dry_run:
            self.stdout.write(f"  [pdf] {rel}")
            skipped += 1
            continue

        # Per-file boundary: one unreadable PDF must not abort the whole run.
        try:
            changed = _rotate_pdf_pages(filepath, degrees)
        except Exception as exc:
            errors += 1
            self.stdout.write(self.style.ERROR(f"  ✗ {rel}: {exc}"))
            continue

        if changed:
            modified += 1
            self.stdout.write(f"  ✓ {rel}")
        else:
            skipped += 1
            self.stdout.write(f"  – {rel} (already correct)")

    if dry_run:
        self.stdout.write(
            f"\n{skipped} PDF(s) found. Re-run without --dry-run to apply /Rotate {degrees}."
        )
    else:
        self.stdout.write(
            self.style.SUCCESS(
                f"\nDone: {modified} modified, {skipped} unchanged, {errors} errors"
            )
        )
70 changes: 65 additions & 5 deletions app/my_practice/utils/file_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@
import tempfile
from pathlib import Path

import pypdf
from django.core.files.base import ContentFile
from PIL import Image
from PIL import Image, ImageOps

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -45,6 +46,7 @@ def compress_image_upload(upload) -> ContentFile:
Returns a ContentFile with a .jpg filename.
"""
img = Image.open(upload)
img = ImageOps.exif_transpose(img) # honour scanner EXIF orientation before stripping EXIF
if img.mode != "RGB":
img = img.convert("RGB")
if max(img.size) > MAX_IMAGE_PX:
Expand All @@ -55,11 +57,52 @@ def compress_image_upload(upload) -> ContentFile:
return ContentFile(buf.getvalue(), name=f"{stem}.jpg")


def _read_page_rotations(data: bytes) -> list[int]:
    """
    Return the /Rotate value for each page in the given PDF bytes.

    Ghostscript's pdfwrite device strips /Rotate entries, so callers read
    them before compression and restore them afterwards. A page without a
    /Rotate entry is reported as 0.

    Best-effort: if the PDF cannot be parsed or a rotation value cannot be
    converted to an int, a warning is logged and an empty list is returned,
    which means no rotations will be restored.
    """
    try:
        reader = pypdf.PdfReader(io.BytesIO(data))
        rotations: list[int] = []
        for page in reader.pages:
            raw = page.get("/Rotate")
            # Dict-style .get() may hand back an unresolved indirect
            # reference; resolve it so int() sees the actual number.
            if hasattr(raw, "get_object"):
                raw = raw.get_object()
            rotations.append(int(raw or 0))
        return rotations
    except Exception:
        # Deliberately broad: compression must still proceed, but say why the
        # rotations were lost instead of dropping them silently.
        logger.warning(
            "Could not read PDF page rotations; they will not be restored",
            exc_info=True,
        )
        return []


def _restore_page_rotations(data: bytes, rotations: list[int]) -> bytes:
"""
Write /Rotate back onto each page of the PDF. Used to undo Ghostscript's
rotation-stripping after compression.
Returns unmodified data if rotations are all 0, empty, or pypdf fails.
"""
if not rotations or all(r == 0 for r in rotations):
return data
try:
reader = pypdf.PdfReader(io.BytesIO(data))
writer = pypdf.PdfWriter()
writer.append(reader)
for i, page in enumerate(writer.pages):
rot = rotations[i] if i < len(rotations) else 0
if rot:
page[pypdf.generic.NameObject("/Rotate")] = pypdf.generic.NumberObject(rot)
buf = io.BytesIO()
writer.write(buf)
return buf.getvalue()
except Exception:
logger.warning("Could not restore PDF page rotations; returning as-is")
return data


def _compress_pdf_bytes(data: bytes) -> bytes:
"""
Compress PDF bytes via Ghostscript. Returns compressed bytes, or the
original bytes if gs is unavailable, fails, or makes the file larger.
Page /Rotate attributes are preserved — Ghostscript strips them, so we
read them beforehand and restore them after compression.
"""
rotations = _read_page_rotations(data)

with (
tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f_in,
tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f_out,
Expand Down Expand Up @@ -97,7 +140,7 @@ def _compress_pdf_bytes(data: bytes) -> bytes:
logger.warning("gs failed (upload): %s", result.stderr.strip())
return data

compressed = Path(tmp_out).read_bytes()
compressed = _restore_page_rotations(Path(tmp_out).read_bytes(), rotations)
return compressed if len(compressed) < len(data) else data
except FileNotFoundError:
logger.warning("ghostscript not found; storing PDF uncompressed")
Expand Down Expand Up @@ -162,20 +205,31 @@ def process_upload(upload) -> ContentFile:
# ---------------------------------------------------------------------------


def compress_image_inplace(path: str) -> int:
def compress_image_inplace(path: str, force: bool = False) -> int:
"""
Compress an image file in-place.
Resizes to MAX_IMAGE_PX on the longest side and re-encodes.
Resizes to MAX_IMAGE_PX on the longest side, fixes EXIF orientation, and re-encodes.
Preserves the original format (no extension rename, so DB paths stay valid).
Returns bytes saved (0 if skipped or no improvement).
Pass force=True to bypass the size threshold (e.g. to fix orientation on small files).
"""
original_size = os.path.getsize(path)
ext = Path(path).suffix.lower()

img = Image.open(path)
# Physically rotate pixels to match EXIF orientation before stripping EXIF.
# Without this, re-saving strips the Orientation tag and viewers see raw scanner pixels.
img_transposed = ImageOps.exif_transpose(img)
needs_orientation_fix = img_transposed is not img
img = img_transposed
needs_resize = max(img.size) > MAX_IMAGE_PX

if original_size <= IMAGE_SKIP_BYTES and not needs_resize:
if (
not force
and original_size <= IMAGE_SKIP_BYTES
and not needs_resize
and not needs_orientation_fix
):
return 0

if img.mode not in ("RGB", "RGBA", "L"):
Expand Down Expand Up @@ -207,6 +261,8 @@ def compress_pdf_inplace(path: str) -> int:
if original_size <= PDF_SKIP_BYTES:
return 0

rotations = _read_page_rotations(Path(path).read_bytes())

parent = Path(path).parent
tmp_path: str | None = None
try:
Expand Down Expand Up @@ -242,6 +298,10 @@ def compress_pdf_inplace(path: str) -> int:
logger.warning("gs failed on %s: %s", path, result.stderr.strip())
return 0

compressed = Path(tmp_path).read_bytes()
fixed = _restore_page_rotations(compressed, rotations)
if fixed is not compressed: # _restore_page_rotations returns same object if no-op
Path(tmp_path).write_bytes(fixed)
compressed_size = os.path.getsize(tmp_path)
saved = original_size - compressed_size
if saved > 0:
Expand Down