diff --git a/app/my_practice/management/commands/compress_media.py b/app/my_practice/management/commands/compress_media.py index 692cee8..3bd9144 100644 --- a/app/my_practice/management/commands/compress_media.py +++ b/app/my_practice/management/commands/compress_media.py @@ -8,10 +8,14 @@ ./dev.py manage compress_media ./dev.py manage compress_media --path taxes/2025 ./dev.py manage compress_media --dry-run + ./dev.py manage compress_media --force # bypass size threshold for images + ./dev.py manage compress_media --rotate-pages 180 --path clients/ml # fix upside-down PDFs """ +import io from pathlib import Path +import pypdf from django.conf import settings from django.core.management.base import BaseCommand, CommandError @@ -27,6 +31,26 @@ _PROCESSABLE = _IMAGE_EXTENSIONS | _PDF_EXTENSIONS +def _rotate_pdf_pages(path: Path, degrees: int) -> bool: + """ + Set /Rotate on every page of the PDF to the given degrees. + Writes the file in-place. Returns True if the file was modified. + """ + data = path.read_bytes() + reader = pypdf.PdfReader(io.BytesIO(data)) + writer = pypdf.PdfWriter() + writer.append(reader) + for page in writer.pages: + page[pypdf.generic.NameObject("/Rotate")] = pypdf.generic.NumberObject(degrees) + buf = io.BytesIO() + writer.write(buf) + result = buf.getvalue() + if result != data: + path.write_bytes(result) + return True + return False + + class Command(BaseCommand): help = "Compress existing media files (images via Pillow, PDFs via Ghostscript)" @@ -41,6 +65,21 @@ def add_arguments(self, parser): action="store_true", help="List files that would be processed without modifying them", ) + parser.add_argument( + "--force", + action="store_true", + help="Process all images regardless of size (use to fix EXIF orientation on already-compressed files)", + ) + parser.add_argument( + "--rotate-pages", + type=int, + metavar="DEGREES", + help=( + "Set /Rotate on every page of all PDFs under --path to DEGREES " + "(e.g. 180 to fix upside-down scans whose rotation metadata was stripped). " + "Skips compression; only modifies PDFs." + ), + ) def handle(self, *args, **options): media_root = Path(settings.MEDIA_ROOT) @@ -50,9 +89,20 @@ def handle(self, *args, **options): if not root.exists(): raise CommandError(f"Path not found: {root}") + rotate_degrees = options.get("rotate_pages") + + if rotate_degrees is not None: + self._handle_rotate(root, media_root, rotate_degrees, options["dry_run"]) + return + dry_run = options["dry_run"] + force = options["force"] if dry_run: self.stdout.write(self.style.WARNING("DRY RUN — no files will be modified\n")) + if force: + self.stdout.write( + self.style.WARNING("FORCE mode — size threshold bypassed for images\n") + ) total_files = 0 total_compressed = 0 @@ -82,7 +132,7 @@ def handle(self, *args, **options): try: if ext in _IMAGE_EXTENSIONS: - saved = compress_image_inplace(str(filepath)) + saved = compress_image_inplace(str(filepath), force=force) else: saved = compress_pdf_inplace(str(filepath)) @@ -113,6 +163,9 @@ def handle(self, *args, **options): if dry_run: self.stdout.write(f"\n{total_files} processable files found.") self.stdout.write("Re-run without --dry-run to compress them.") + self.stdout.write( + "Add --force to also reprocess small images (e.g. to fix EXIF orientation)." + ) return saved_kb = total_saved / 1024 @@ -130,3 +183,53 @@ def handle(self, *args, **options): if total_errors: self.stdout.write(self.style.WARNING("Check logs above for error details.")) + + def _handle_rotate(self, root: Path, media_root: Path, degrees: int, dry_run: bool) -> None: + if dry_run: + self.stdout.write( + self.style.WARNING( + f"DRY RUN — would set /Rotate {degrees} on all PDF pages under {root.relative_to(media_root)}\n" + ) + ) + else: + self.stdout.write( + self.style.WARNING( + f"Setting /Rotate {degrees} on all PDF pages under {root.relative_to(media_root)}\n" + ) + ) + + modified = 0 + skipped = 0 + errors = 0 + + candidates = [root] if root.is_file() else sorted(root.rglob("*")) + for filepath in candidates: + if not filepath.is_file() or filepath.suffix.lower() not in _PDF_EXTENSIONS: + continue + rel = filepath.relative_to(media_root) + if dry_run: + self.stdout.write(f" [pdf] {rel}") + skipped += 1 + continue + try: + changed = _rotate_pdf_pages(filepath, degrees) + if changed: + modified += 1 + self.stdout.write(f" ✓ {rel}") + else: + skipped += 1 + self.stdout.write(f" – {rel} (already correct)") + except Exception as exc: + errors += 1 + self.stdout.write(self.style.ERROR(f" ✗ {rel}: {exc}")) + + if dry_run: + self.stdout.write( + f"\n{skipped} PDF(s) found. Re-run without --dry-run to apply /Rotate {degrees}." + ) + else: + self.stdout.write( + self.style.SUCCESS( + f"\nDone: {modified} modified, {skipped} unchanged, {errors} errors" + ) + ) diff --git a/app/my_practice/utils/file_processing.py b/app/my_practice/utils/file_processing.py index be11cb7..c704bb5 100644 --- a/app/my_practice/utils/file_processing.py +++ b/app/my_practice/utils/file_processing.py @@ -15,8 +15,9 @@ import tempfile from pathlib import Path +import pypdf from django.core.files.base import ContentFile -from PIL import Image +from PIL import Image, ImageOps logger = logging.getLogger(__name__) @@ -45,6 +46,7 @@ def compress_image_upload(upload) -> ContentFile: Returns a ContentFile with a .jpg filename. """ img = Image.open(upload) + img = ImageOps.exif_transpose(img) # honour scanner EXIF orientation before stripping EXIF if img.mode != "RGB": img = img.convert("RGB") if max(img.size) > MAX_IMAGE_PX: @@ -55,11 +57,52 @@ def compress_image_upload(upload) -> ContentFile: return ContentFile(buf.getvalue(), name=f"{stem}.jpg") +def _read_page_rotations(data: bytes) -> list[int]: + """ + Return the /Rotate value for each page in the given PDF bytes. + Ghostscript's pdfwrite device strips /Rotate entries, so we read them + before compression and restore them afterwards. + """ + try: + reader = pypdf.PdfReader(io.BytesIO(data)) + return [int(page.get("/Rotate", 0) or 0) for page in reader.pages] + except Exception: + return [] + + +def _restore_page_rotations(data: bytes, rotations: list[int]) -> bytes: + """ + Write /Rotate back onto each page of the PDF. Used to undo Ghostscript's + rotation-stripping after compression. + Returns unmodified data if rotations are all 0, empty, or pypdf fails. + """ + if not rotations or all(r == 0 for r in rotations): + return data + try: + reader = pypdf.PdfReader(io.BytesIO(data)) + writer = pypdf.PdfWriter() + writer.append(reader) + for i, page in enumerate(writer.pages): + rot = rotations[i] if i < len(rotations) else 0 + if rot: + page[pypdf.generic.NameObject("/Rotate")] = pypdf.generic.NumberObject(rot) + buf = io.BytesIO() + writer.write(buf) + return buf.getvalue() + except Exception: + logger.warning("Could not restore PDF page rotations; returning as-is") + return data + + def _compress_pdf_bytes(data: bytes) -> bytes: """ Compress PDF bytes via Ghostscript. Returns compressed bytes, or the original bytes if gs is unavailable, fails, or makes the file larger. + Page /Rotate attributes are preserved — Ghostscript strips them, so we + read them beforehand and restore them after compression. """ + rotations = _read_page_rotations(data) + with ( tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f_in, tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as f_out, @@ -97,7 +140,7 @@ def _compress_pdf_bytes(data: bytes) -> bytes: logger.warning("gs failed (upload): %s", result.stderr.strip()) return data - compressed = Path(tmp_out).read_bytes() + compressed = _restore_page_rotations(Path(tmp_out).read_bytes(), rotations) return compressed if len(compressed) < len(data) else data except FileNotFoundError: logger.warning("ghostscript not found; storing PDF uncompressed") @@ -162,20 +205,31 @@ def process_upload(upload) -> ContentFile: # --------------------------------------------------------------------------- -def compress_image_inplace(path: str) -> int: +def compress_image_inplace(path: str, force: bool = False) -> int: """ Compress an image file in-place. - Resizes to MAX_IMAGE_PX on the longest side and re-encodes. + Resizes to MAX_IMAGE_PX on the longest side, fixes EXIF orientation, and re-encodes. Preserves the original format (no extension rename, so DB paths stay valid). Returns bytes saved (0 if skipped or no improvement). + Pass force=True to bypass the size threshold (e.g. to fix orientation on small files). """ original_size = os.path.getsize(path) ext = Path(path).suffix.lower() img = Image.open(path) + # Physically rotate pixels to match EXIF orientation before stripping EXIF. + # Without this, re-saving strips the Orientation tag and viewers see raw scanner pixels. + img_transposed = ImageOps.exif_transpose(img) + needs_orientation_fix = img_transposed is not img + img = img_transposed needs_resize = max(img.size) > MAX_IMAGE_PX - if original_size <= IMAGE_SKIP_BYTES and not needs_resize: + if ( + not force + and original_size <= IMAGE_SKIP_BYTES + and not needs_resize + and not needs_orientation_fix + ): return 0 if img.mode not in ("RGB", "RGBA", "L"): @@ -207,6 +261,8 @@ def compress_pdf_inplace(path: str) -> int: if original_size <= PDF_SKIP_BYTES: return 0 + rotations = _read_page_rotations(Path(path).read_bytes()) + parent = Path(path).parent tmp_path: str | None = None try: @@ -242,6 +298,10 @@ def compress_pdf_inplace(path: str) -> int: logger.warning("gs failed on %s: %s", path, result.stderr.strip()) return 0 + compressed = Path(tmp_path).read_bytes() + fixed = _restore_page_rotations(compressed, rotations) + if fixed is not compressed: # _restore_page_rotations returns same object if no-op + Path(tmp_path).write_bytes(fixed) compressed_size = os.path.getsize(tmp_path) saved = original_size - compressed_size if saved > 0: