diff --git a/CHANGELOG.md b/CHANGELOG.md index 6300553..5cd6451 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,18 @@ Changelog ========= +Version 2.2.0 +------------- + +- Adding Ogg scanner to distinguish Vorbis, Opus, Theora, FLAC, Speex, and OGM codecs +- Adding ASF scanner to distinguish WMV (video) from WMA (audio) files +- Adding EBML scanner to distinguish Matroska (.mkv) from WebM (.webm) files +- Adding MSI (Windows Installer) and MPP (Microsoft Project) detection to CFBF scanner +- Fixing #146 OOXML detection now uses `[Content_Types].xml` content types as primary method, correctly identifying docx/xlsx/pptx files from LibreOffice, Google Docs, and other non-Microsoft tools (thanks to jonasdeboeck79) +- Fixing ZIP deep scan now inspects all ZIP files instead of short-circuiting on .zip extension +- Fixing text scanner now treats files containing NUL bytes as binary data instead of misidentifying them as text +- Fixing mz5 HDF5 scanner typo in chromatogram dataset name + Version 2.1.1 ------------- diff --git a/puremagic/main.py b/puremagic/main.py index 56942c6..e2f6419 100644 --- a/puremagic/main.py +++ b/puremagic/main.py @@ -32,10 +32,13 @@ mpeg_audio_scanner, hdf5_scanner, cfbf_scanner, + ogg_scanner, + asf_scanner, + ebml_scanner, ) __author__ = "Chris Griffith" -__version__ = "2.1.1" +__version__ = "2.2.0" __all__ = [ "magic_file", "magic_string", @@ -206,10 +209,12 @@ def identify_all(header: bytes, footer: bytes, ext=None) -> list[PureMagicWithCo return determine_confidence(matches, ext) -def perform_magic(header: bytes, footer: bytes, mime: bool, ext=None, filename=None) -> str: +def perform_magic(header: bytes | None, footer: bytes | None, mime: bool | None, ext=None, filename=None) -> str: """Discover what type of file it is based on the incoming string""" if not header: raise PureValueError("Input was empty") + if not footer: + footer = b"" infos = identify_all(header, footer, ext) if filename and os.path.isfile(filename) and os.getenv("PUREMAGIC_DEEPSCAN") != "0": results = run_deep_scan(infos, filename, header, footer, raise_on_none=not infos) @@ -466,6 +471,14 @@ def single_deep_scan( return result case cfbf_scanner.match_bytes | cfbf_scanner.match_bytes_short: return cfbf_scanner.main(filename, head, foot) + case ogg_scanner.match_bytes: + result = ogg_scanner.main(filename, head, foot) + if result and result.confidence > confidence: + return result + case asf_scanner.match_bytes: + return asf_scanner.main(filename, head, foot) + case ebml_scanner.match_bytes: + return ebml_scanner.main(filename, head, foot) if eml_result := text_scanner.eml_check(head): return eml_result diff --git a/puremagic/scanners/asf_scanner.py b/puremagic/scanners/asf_scanner.py new file mode 100644 index 0000000..a85848f --- /dev/null +++ b/puremagic/scanners/asf_scanner.py @@ -0,0 +1,59 @@ +import os +import struct + +from puremagic.scanners.helpers import Match + +# ASF Header Object GUID +match_bytes = b"\x30\x26\xb2\x75\x8e\x66\xcf\x11\xa6\xd9\x00\xaa\x00\x62\xce\x6c" + +_STREAM_PROPS_GUID = b"\x91\x07\xdc\xb7\xb7\xa9\xcf\x11\x8e\xe6\x00\xc0\x0c\x20\x53\x65" +_AUDIO_MEDIA_GUID = b"\x40\x9e\x69\xf8\x4d\x5b\xcf\x11\xa8\xfd\x00\x80\x5f\x5c\x44\x2b" +_VIDEO_MEDIA_GUID = b"\xc0\xef\x19\xbc\x4d\x5b\xcf\x11\xa8\xfd\x00\x80\x5f\x5c\x44\x2b" + + +def main(file_path: os.PathLike, head: bytes, foot: bytes) -> Match | None: + if not head or len(head) < 30: + return None + if head[:16] != match_bytes: + return None + + header_size = struct.unpack_from(" len(head): + try: + with open(file_path, "rb") as f: + data = f.read(min(int(header_size), 65536)) + except (OSError, ValueError): + return None + else: + data = head + + has_audio = False + has_video = False + offset = 30 # Past header GUID(16) + size(8) + count(4) + reserved(2) + + for _ in range(min(obj_count, 50)): + if offset + 24 > len(data): + break + obj_guid = data[offset : offset + 16] + obj_size = struct.unpack_from(" set[str]: """Parse CFBF directory entries and return the set of stream/storage names.""" @@ -45,8 +71,19 @@ def _extract_stream_names(dir_data: bytes) -> set[str]: return names -def _identify_format(stream_names: set[str]) -> Match | None: - """Match stream names against known CFBF format signatures.""" +def _extract_root_clsid(dir_data: bytes) -> bytes | None: + """Extract the CLSID from the root directory entry (obj_type 5).""" + for i in range(0, len(dir_data), 128): + entry = dir_data[i : i + 128] + if len(entry) < 96: + break + if entry[66] == 5: # Root storage + return entry[80:96] + return None + + +def _identify_format(stream_names: set[str], dir_data: bytes) -> Match | None: + """Match stream names and CLSIDs against known CFBF format signatures.""" # Check prefix matches first (e.g. __substg1.0_ for MSG) for name in stream_names: for prefix, ext, fmt_name, mime in _PREFIX_MATCHES: @@ -58,6 +95,18 @@ def _identify_format(stream_names: set[str]) -> Match | None: if stream_name in stream_names: return Match(ext, fmt_name, mime) + # Check multi-stream matches (all required streams must be present) + for required_streams, ext, fmt_name, mime in _MULTI_STREAM_MATCHES: + if all(s in stream_names for s in required_streams): + return Match(ext, fmt_name, mime) + + # Check root CLSID + root_clsid = _extract_root_clsid(dir_data) + if root_clsid: + for clsid, ext, fmt_name, mime in _CLSID_MATCHES: + if root_clsid == clsid: + return Match(ext, fmt_name, mime) + return None @@ -94,4 +143,4 @@ def main(file_path: os.PathLike, head: bytes, foot: bytes) -> Match | None: return None stream_names = _extract_stream_names(dir_data) - return _identify_format(stream_names) + return _identify_format(stream_names, dir_data) diff --git a/puremagic/scanners/ebml_scanner.py b/puremagic/scanners/ebml_scanner.py new file mode 100644 index 0000000..dcc3fb2 --- /dev/null +++ b/puremagic/scanners/ebml_scanner.py @@ -0,0 +1,21 @@ +import os + +from puremagic.scanners.helpers import Match + +match_bytes = b"\x1a\x45\xdf\xa3" + + +def main(file_path: os.PathLike, head: bytes, foot: bytes) -> Match | None: + if not head or len(head) < 8: + return None + if head[:4] != match_bytes: + return None + + # Search for DocType string in the EBML header (first 64 bytes) + search_area = head[:64] + if b"webm" in search_area: + return Match(".webm", "WebM Video", "video/webm") + if b"matroska" in search_area: + return Match(".mkv", "Matroska Video", "video/x-matroska") + + return None diff --git a/puremagic/scanners/hdf5_scanner.py b/puremagic/scanners/hdf5_scanner.py index 1ca102a..77a702e 100644 --- a/puremagic/scanners/hdf5_scanner.py +++ b/puremagic/scanners/hdf5_scanner.py @@ -32,7 +32,7 @@ "application/x-biom2", ), # mz5 - mass spectrometry - ([], [b"/SpectrumMetaData", b"/ChomatogramMetaData"], 1, ".mz5", "mz5 mass spectrometry data", "application/x-mz5"), + ([], [b"/SpectrumMetaData", b"/ChromatogramList"], 1, ".mz5", "mz5 mass spectrometry data", "application/x-mz5"), # h5mlm - ML model ([], [b"model_type", b"h5mlm"], 1, ".h5mlm", "HDF5 ML model", "application/x-h5mlm"), ] diff --git a/puremagic/scanners/ogg_scanner.py b/puremagic/scanners/ogg_scanner.py new file mode 100644 index 0000000..e642f5b --- /dev/null +++ b/puremagic/scanners/ogg_scanner.py @@ -0,0 +1,39 @@ +import os + +from puremagic.scanners.helpers import Match + +match_bytes = b"OggS" + +# Ogg codec identification signatures found at the start of the first page payload. +# Each entry: (codec_id_bytes, extension, name, mime_type) +_OGG_CODEC_MAP = [ + (b"\x01vorbis", ".ogg", "Ogg Vorbis Audio", "audio/ogg"), + (b"OpusHead", ".opus", "Ogg Opus Audio", "audio/ogg"), + (b"\x80theora", ".ogv", "Ogg Theora Video", "video/ogg"), + (b"\x7fFLAC", ".oga", "Ogg FLAC Audio", "audio/ogg"), + (b"Speex ", ".spx", "Ogg Speex Audio", "audio/ogg"), + (b"fishead\x00", ".ogv", "Ogg Annodex", "video/ogg"), + (b"\x01video", ".ogm", "OGM Video", "video/x-ogm+ogg"), +] + + +def main(file_path: os.PathLike, head: bytes, foot: bytes) -> Match | None: + if not head or len(head) < 28: + return None + + # Verify OggS capture pattern, version 0, and beginning-of-stream flag + if head[:4] != match_bytes or head[4] != 0 or not (head[5] & 0x02): + return None + + seg_count = head[26] + payload_start = 27 + seg_count + + if payload_start >= len(head): + return None + + payload = head[payload_start:] + for codec_id, ext, name, mime in _OGG_CODEC_MAP: + if payload.startswith(codec_id): + return Match(ext, name, mime, confidence=0.9) + + return None diff --git a/puremagic/scanners/text_scanner.py b/puremagic/scanners/text_scanner.py index 9136bec..5e55b15 100644 --- a/puremagic/scanners/text_scanner.py +++ b/puremagic/scanners/text_scanner.py @@ -200,6 +200,10 @@ def main(file_path: os.PathLike | str, _, __) -> Match | None: if len(head) < 8: return Match("", "very short file", "application/octet-stream", confidence=0.5) + # NUL bytes indicate binary data, but skip this check for UTF-16 (which has NUL bytes naturally) + if b"\x00" in head and head[:2] not in (b"\xff\xfe", b"\xfe\xff"): + return Match("", "data", "application/octet-stream", confidence=0.5) + try: text, encoding = decode_any(head) except TypeError: diff --git a/puremagic/scanners/zip_scanner.py b/puremagic/scanners/zip_scanner.py index f8c30d3..1fa1ff0 100644 --- a/puremagic/scanners/zip_scanner.py +++ b/puremagic/scanners/zip_scanner.py @@ -8,6 +8,120 @@ office_macro_enable_match = b"macroEnabled" application_re = re.compile(b"(.*)") +_content_type_re = re.compile(rb'ContentType="([^"]*main[^"]*)"') + +# Maps OOXML main content type fragments (from [Content_Types].xml) to +# (extension, name, mime_type). Based on ECMA-376 Part 2. +_OOXML_CONTENT_TYPE_MAP: dict[str, tuple[str, str, str]] = { + # Word + "wordprocessingml.document.main+xml": ( + ".docx", + "Word Document", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ), + "wordprocessingml.template.main+xml": ( + ".dotx", + "Word Template", + "application/vnd.openxmlformats-officedocument.wordprocessingml.template", + ), + "ms-word.document.macroEnabled.main+xml": ( + ".docm", + "Word Document (Macro-Enabled)", + "application/vnd.ms-word.document.macroEnabled.12", + ), + "ms-word.template.macroEnabledTemplate.main+xml": ( + ".dotm", + "Word Template (Macro-Enabled)", + "application/vnd.ms-word.template.macroEnabled.12", + ), + # Excel + "spreadsheetml.sheet.main+xml": ( + ".xlsx", + "Excel Spreadsheet", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ), + "spreadsheetml.template.main+xml": ( + ".xltx", + "Excel Template", + "application/vnd.openxmlformats-officedocument.spreadsheetml.template", + ), + "ms-excel.sheet.macroEnabled.main+xml": ( + ".xlsm", + "Excel Spreadsheet (Macro-Enabled)", + "application/vnd.ms-excel.sheet.macroEnabled.12", + ), + "ms-excel.template.macroEnabled.main+xml": ( + ".xltm", + "Excel Template (Macro-Enabled)", + "application/vnd.ms-excel.template.macroEnabled.12", + ), + "ms-excel.sheet.binary.macroEnabled.main": ( + ".xlsb", + "Excel Binary Workbook", + "application/vnd.ms-excel.sheet.binary.macroEnabled.12", + ), + "ms-excel.addin.macroEnabled.main+xml": ( + ".xlam", + "Excel Add-In (Macro-Enabled)", + "application/vnd.ms-excel.addin.macroEnabled.12", + ), + # PowerPoint + "presentationml.presentation.main+xml": ( + ".pptx", + "PowerPoint Presentation", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + ), + "presentationml.template.main+xml": ( + ".potx", + "PowerPoint Template", + "application/vnd.openxmlformats-officedocument.presentationml.template", + ), + "ms-powerpoint.presentation.macroEnabled.main+xml": ( + ".pptm", + "PowerPoint Presentation (Macro-Enabled)", + "application/vnd.ms-powerpoint.presentation.macroEnabled.12", + ), + "ms-powerpoint.template.macroEnabled.main+xml": ( + ".potm", + "PowerPoint Template (Macro-Enabled)", + "application/vnd.ms-powerpoint.template.macroEnabled.12", + ), + "presentationml.slideshow.main+xml": ( + ".ppsx", + "PowerPoint Slideshow", + "application/vnd.openxmlformats-officedocument.presentationml.slideshow", + ), + "ms-powerpoint.slideshow.macroEnabled.main+xml": ( + ".ppsm", + "PowerPoint Slideshow (Macro-Enabled)", + "application/vnd.ms-powerpoint.slideshow.macroEnabled.12", + ), + "ms-powerpoint.addin.macroEnabled.main+xml": ( + ".ppam", + "PowerPoint Add-In (Macro-Enabled)", + "application/vnd.ms-powerpoint.addin.macroEnabled", + ), +} + + +def _detect_from_content_types(zip_file: ZipFile) -> Match | None: + """Detect OOXML format from [Content_Types].xml main content types. + + This is the ECMA-376 standard approach, working with all compliant + implementations (Microsoft Office, LibreOffice, Google Docs, etc.). + """ + try: + ct_data = zip_file.read("[Content_Types].xml") + except KeyError: + return None + + for match in _content_type_re.finditer(ct_data): + content_type = match.group(1).decode("utf-8") + for key, (ext, name, mime) in _OOXML_CONTENT_TYPE_MAP.items(): + if key in content_type: + return Match(ext, name, mime) + + return None def open_office_check(internal_files: list[str], zip_file: ZipFile, extension: str | None = None) -> Match | None: @@ -32,9 +146,10 @@ def open_office_check(internal_files: list[str], zip_file: ZipFile, extension: s return None -def office_check(internal_files: list[str], zip_file: ZipFile, extension: str | None = None) -> Match | None: - if "[Content_Types].xml" not in internal_files: - return None +def _detect_from_application( + internal_files: list[str], zip_file: ZipFile, extension: str | None = None +) -> Match | None: + """Fallback detection using docProps/app.xml Application tag.""" if "docProps/app.xml" not in internal_files: return None app_type_matches = application_re.search(zip_file.read("docProps/app.xml")) @@ -44,22 +159,30 @@ def office_check(internal_files: list[str], zip_file: ZipFile, extension: str | if "PowerPoint" in application_type: if extension: + if extension == "pptm": + return Match(".pptm", application_type, "application/vnd.ms-powerpoint.presentation.macroEnabled.12") if extension == "ppsm": return Match(".ppsm", application_type, "application/vnd.ms-powerpoint.slideshow.macroEnabled.12") + if extension == "ppsx": + return Match( + ".ppsx", + application_type, + "application/vnd.openxmlformats-officedocument.presentationml.slideshow", + ) if extension == "potm": return Match(".potm", application_type, "application/vnd.ms-powerpoint.template.macroEnabled.12") if extension == "potx": return Match( - "potx", + ".potx", application_type, "application/vnd.openxmlformats-officedocument.presentationml.template", ) if extension == "ppam": return Match(".ppam", application_type, "application/vnd.ms-powerpoint.addin.macroEnabled") if office_macro_enable_match in zip_file.read("[Content_Types].xml"): - return Match(".ppsm", application_type, "application/vnd.ms-powerpoint.slideshow.macroEnabled.12") + return Match(".pptm", application_type, "application/vnd.ms-powerpoint.presentation.macroEnabled.12") return Match( - "pptx", + ".pptx", application_type, "application/vnd.openxmlformats-officedocument.presentationml.presentation", ) @@ -75,7 +198,7 @@ def office_check(internal_files: list[str], zip_file: ZipFile, extension: str | return Match(".xltm", application_type, "application/vnd.ms-excel.template.macroEnabled.12") if extension == "xltx": return Match( - "xltx", + ".xltx", application_type, "application/vnd.openxmlformats-officedocument.spreadsheetml.template", ) @@ -91,19 +214,32 @@ def office_check(internal_files: list[str], zip_file: ZipFile, extension: str | return Match(".dotm", application_type, "application/vnd.ms-word.template.macroEnabled.12") if extension == "dotx": return Match( - "dotx", + ".dotx", application_type, "application/vnd.openxmlformats-officedocument.wordprocessingml.template", ) if office_macro_enable_match in zip_file.read("[Content_Types].xml"): return Match(".docm", application_type, "application/vnd.ms-word.document.macroEnabled.12") return Match( - "docx", application_type, "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + ".docx", application_type, "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ) return None +def office_check(internal_files: list[str], zip_file: ZipFile, extension: str | None = None) -> Match | None: + if "[Content_Types].xml" not in internal_files: + return None + + # Primary: content-type-based detection (works with all OOXML creators) + result = _detect_from_content_types(zip_file) + if result: + return result + + # Fallback: application-based detection for non-standard OOXML packages + return _detect_from_application(internal_files, zip_file, extension) + + def jar_check(internal_files: list[str], zip_file: ZipFile) -> Match | None: if "META-INF/MANIFEST.MF" not in internal_files: return None @@ -156,8 +292,6 @@ def cbz_check(internal_files: list[str], extension: str) -> Match | None: def main(file_path: os.PathLike, _, __) -> Match | None: extension = str(file_path).split(".")[-1].lower() - if extension == "zip" and not str(file_path).endswith(".fb2.zip"): - return Match(".zip", "ZIP archive", "application/zip") with ZipFile(file_path) as myzip: internal_files = myzip.namelist() @@ -189,4 +323,5 @@ def main(file_path: os.PathLike, _, __) -> Match | None: if cbz_result: return cbz_result - return None + # No specific format detected — return generic ZIP (same confidence as other matches) + return Match(".zip", "ZIP archive", "application/zip") diff --git a/test/resources/audio/test.oga b/test/resources/audio/test.oga new file mode 100644 index 0000000..597e32d Binary files /dev/null and b/test/resources/audio/test.oga differ diff --git a/test/resources/audio/test.ogg b/test/resources/audio/test.ogg new file mode 100644 index 0000000..c0ef97e Binary files /dev/null and b/test/resources/audio/test.ogg differ diff --git a/test/resources/audio/test.wma b/test/resources/audio/test.wma new file mode 100644 index 0000000..9a374a9 Binary files /dev/null and b/test/resources/audio/test.wma differ diff --git a/test/resources/video/test.ogv b/test/resources/video/test.ogv new file mode 100644 index 0000000..7d8af80 Binary files /dev/null and b/test/resources/video/test.ogv differ diff --git a/test/resources/video/test.webm b/test/resources/video/test.webm new file mode 100644 index 0000000..015d701 Binary files /dev/null and b/test/resources/video/test.webm differ diff --git a/test/test_scanners.py b/test/test_scanners.py index 4d4a0cc..918b2b8 100644 --- a/test/test_scanners.py +++ b/test/test_scanners.py @@ -1,5 +1,10 @@ +import os +import tempfile +from pathlib import Path +from zipfile import ZipFile + import puremagic -from test.common import IMAGE_DIR, OFFICE_DIR, SYSTEM_DIR, AUDIO_DIR +from test.common import IMAGE_DIR, OFFICE_DIR, SYSTEM_DIR, AUDIO_DIR, VIDEO_DIR from puremagic.scanners import python_scanner, json_scanner, sndhdr_scanner sample_text = b"""Lorem ipsum dolor sit amet, consectetur adipiscing elit,{ending} @@ -135,3 +140,743 @@ def test_sndhdr_scanner(): assert result.name.startswith("Macintosh SNDR Resource") assert result.mime_type == "audio/x-sndr" assert result.confidence == 0.1 + + +def test_ooxml_content_type_detection(): + # GH #146: All OOXML files should be detected with correct extension and MIME type + expected = { + "test.docx": (".docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"), + "test.docm": (".docm", "application/vnd.ms-word.document.macroEnabled.12"), + "test.dotx": (".dotx", "application/vnd.openxmlformats-officedocument.wordprocessingml.template"), + "test.dotm": (".dotm", "application/vnd.ms-word.template.macroEnabled.12"), + "test.xlsx": (".xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"), + "test.xlsm": (".xlsm", "application/vnd.ms-excel.sheet.macroEnabled.12"), + "test.xlsb": (".xlsb", "application/vnd.ms-excel.sheet.binary.macroEnabled.12"), + "test.xltx": (".xltx", "application/vnd.openxmlformats-officedocument.spreadsheetml.template"), + "test.xltm": (".xltm", "application/vnd.ms-excel.template.macroEnabled.12"), + "test.pptx": (".pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"), + "test.pptm": (".pptm", "application/vnd.ms-powerpoint.presentation.macroEnabled.12"), + "test.potx": (".potx", "application/vnd.openxmlformats-officedocument.presentationml.template"), + "test.potm": (".potm", "application/vnd.ms-powerpoint.template.macroEnabled.12"), + } + for filename, (exp_ext, exp_mime) in expected.items(): + filepath = os.path.join(OFFICE_DIR, filename) + ext = puremagic.from_file(filepath) + mime = puremagic.from_file(filepath, mime=True) + assert ext == exp_ext, f"{filename}: expected ext {exp_ext}, got {ext}" + assert mime == exp_mime, f"{filename}: expected mime {exp_mime}, got {mime}" + + +def test_ooxml_without_app_xml(): + # GH #146: OOXML files without docProps/app.xml should still be detected + # (e.g., Google Docs exports) + content_types = b""" + + +""" + + with tempfile.NamedTemporaryFile(suffix=".docx", delete=False) as f: + with ZipFile(f, "w") as zf: + zf.writestr("[Content_Types].xml", content_types) + zf.writestr("word/document.xml", "") + tmppath = f.name + + try: + ext = puremagic.from_file(tmppath) + assert ext == ".docx" + mime = puremagic.from_file(tmppath, mime=True) + assert mime == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + finally: + os.unlink(tmppath) + + +def test_ooxml_libreoffice_application(): + # GH #146: OOXML files with non-Microsoft Application tag should still be detected + content_types = b""" + + +""" + + app_xml = b""" + +LibreOffice/24.8.5.2 +""" + + with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as f: + with ZipFile(f, "w") as zf: + zf.writestr("[Content_Types].xml", content_types) + zf.writestr("docProps/app.xml", app_xml) + zf.writestr("xl/workbook.xml", "") + tmppath = f.name + + try: + ext = puremagic.from_file(tmppath) + assert ext == ".xlsx" + mime = puremagic.from_file(tmppath, mime=True) + assert mime == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + finally: + os.unlink(tmppath) + + +def _make_ogg_bos_page(codec_id: bytes) -> bytes: + """Build a minimal Ogg beginning-of-stream page with the given codec ID payload.""" + # OggS capture pattern + version 0 + BOS header type + header = b"OggS\x00\x02" + # granule(8) + serial(4) + page_seq(4) + crc(4) = 20 bytes + header += b"\x00" * 20 + # 1 segment, segment size = len(codec_id) + header += bytes([1, len(codec_id)]) + return header + codec_id + + +def test_ogg_opus_scanner(): + opus_file = AUDIO_DIR / "test.opus" + results = puremagic.magic_file(opus_file) + assert results[0].extension == ".opus" + assert results[0].mime_type == "audio/ogg" + assert results[0].name == "Ogg Opus Audio" + + +def test_ogg_vorbis_scanner(): + ogg_file = AUDIO_DIR / "test.ogg" + results = puremagic.magic_file(ogg_file) + assert results[0].extension == ".ogg" + assert results[0].mime_type == "audio/ogg" + assert results[0].name == "Ogg Vorbis Audio" + + +def test_ogg_theora_scanner(): + ogv_file = VIDEO_DIR / "test.ogv" + results = puremagic.magic_file(ogv_file) + assert results[0].extension == ".ogv" + assert results[0].mime_type == "video/ogg" + assert results[0].name == "Ogg Theora Video" + + +def test_ogg_flac_scanner(): + oga_file = AUDIO_DIR / "test.oga" + results = puremagic.magic_file(oga_file) + assert results[0].extension == ".oga" + assert results[0].mime_type == "audio/ogg" + assert results[0].name == "Ogg FLAC Audio" + + +def test_ogg_scanner_direct(): + from puremagic.scanners import ogg_scanner + + # Test all codecs via real files + for path, expected_ext in [ + (AUDIO_DIR / "test.opus", ".opus"), + (AUDIO_DIR / "test.ogg", ".ogg"), + (VIDEO_DIR / "test.ogv", ".ogv"), + (AUDIO_DIR / "test.oga", ".oga"), + ]: + with open(path, "rb") as f: + head = f.read(256) + result = ogg_scanner.main(path, head, b"") + assert result is not None, f"{path}: expected {expected_ext}, got None" + assert result.extension == expected_ext, f"{path}: expected {expected_ext}, got {result.extension}" + assert result.confidence == 0.9 + + +def test_ogg_scanner_synthetic_codecs(): + """Test codec detection for formats without real test files (Speex, Annodex, OGM).""" + from puremagic.scanners import ogg_scanner + + cases = [ + (b"Speex ", ".spx", "Ogg Speex Audio", "audio/ogg"), + (b"fishead\x00", ".ogv", "Ogg Annodex", "video/ogg"), + (b"\x01video\x00\x00\x00", ".ogm", "OGM Video", "video/x-ogm+ogg"), + ] + for codec_id, expected_ext, expected_name, expected_mime in cases: + head = _make_ogg_bos_page(codec_id) + result = ogg_scanner.main(Path("fake.ogg"), head, b"") + assert result is not None, f"codec {codec_id!r}: expected {expected_ext}, got None" + assert result.extension == expected_ext + assert result.name == expected_name + assert result.mime_type == expected_mime + assert result.confidence == 0.9 + + +def test_ogg_scanner_rejects_non_ogg(): + from puremagic.scanners import ogg_scanner + + assert ogg_scanner.main(Path("fake.ogg"), b"not ogg data at all", b"") is None + assert ogg_scanner.main(Path("fake.ogg"), b"", b"") is None + # Valid OggS but wrong version + assert ogg_scanner.main(Path("fake.ogg"), b"OggS\x01\x02" + b"\x00" * 50, b"") is None + # Valid OggS but not BOS page + assert ogg_scanner.main(Path("fake.ogg"), b"OggS\x00\x00" + b"\x00" * 50, b"") is None + # Valid BOS page but unknown codec + head = _make_ogg_bos_page(b"UnknownCodecXYZ") + assert ogg_scanner.main(Path("fake.ogg"), head, b"") is None + + +def test_asf_wmv_scanner(): + wmv_file = VIDEO_DIR / "test.wmv" + results = puremagic.magic_file(wmv_file) + assert results[0].extension == ".wmv" + assert results[0].mime_type == "video/x-ms-wmv" + assert results[0].name == "Windows Media Video" + + +def test_asf_wma_scanner(): + wma_file = AUDIO_DIR / "test.wma" + results = puremagic.magic_file(wma_file) + assert results[0].extension == ".wma" + assert results[0].mime_type == "audio/x-ms-wma" + assert results[0].name == "Windows Media Audio" + + +def test_asf_scanner_direct(): + from puremagic.scanners import asf_scanner + + # WMV (has video) + wmv_file = VIDEO_DIR / "test.wmv" + with open(wmv_file, "rb") as f: + head = f.read(256) + result = asf_scanner.main(wmv_file, head, b"") + assert result is not None + assert result.extension == ".wmv" + assert result.mime_type == "video/x-ms-wmv" + + # WMA (audio only) + wma_file = AUDIO_DIR / "test.wma" + with open(wma_file, "rb") as f: + head = f.read(256) + result = asf_scanner.main(wma_file, head, b"") + assert result is not None + assert result.extension == ".wma" + assert result.mime_type == "audio/x-ms-wma" + + +def test_asf_scanner_generic_fallback(): + """ASF with no recognized stream types should return .asf.""" + import struct + from puremagic.scanners import asf_scanner + + # Build minimal ASF header: GUID(16) + size(8) + count(4) + reserved(2) = 30 bytes + # With 0 sub-objects so no streams are found + header_guid = asf_scanner.match_bytes + header_size = struct.pack("= len(head).""" + from puremagic.scanners import ogg_scanner + + # Valid OggS BOS header with seg_count=200 → payload_start=227, but head is only 50 bytes + head = bytearray(50) + head[0:4] = b"OggS" + head[4] = 0 # version + head[5] = 0x02 # BOS flag + head[26] = 200 # seg_count → payload at offset 227 + assert ogg_scanner.main(Path("fake.ogg"), bytes(head), b"") is None + + +def test_json_scanner_array(): + """json_scanner: valid JSON array file.""" + json_file = SYSTEM_DIR / "test_array.json" + json_file.write_bytes(b"[1, 2, 3]") + try: + result = json_scanner.main(json_file, b"[1, 2, 3]", b"[1, 2, 3]") + assert result is not None + assert result.extension == ".json" + finally: + json_file.unlink() + + +def test_json_scanner_malformed(): + """json_scanner lines 17-18: passes structural check but fails json.load().""" + json_file = SYSTEM_DIR / "test_bad.json" + json_file.write_bytes(b"{invalid json content}") + try: + result = json_scanner.main(json_file, b"{invalid json content}", b"{invalid json content}") + assert result is None + finally: + json_file.unlink() + + +def test_sndhdr_hcom_detection(): + """sndhdr_scanner line 25: HCOM format detected via FSSD+HCOM markers.""" + from puremagic.scanners import sndhdr_scanner + + head = bytearray(133) + head[65:69] = b"FSSD" + head[128:132] = b"HCOM" + result = sndhdr_scanner.main(None, bytes(head), None) + assert result is not None + assert result.extension == ".hcom" + assert result.mime_type == "audio/x-hcom" + assert result.confidence == 1.0 + + +def test_sndhdr_short_head(): + """sndhdr_scanner lines 44-45: head too short for struct.unpack.""" + from puremagic.scanners import sndhdr_scanner + + # 2 bytes is too short for get_short_le(head[2:4]) + result = sndhdr_scanner.main(None, b"\x00\x00", None) + # Should not crash — except catches IndexError, then test_hcom also fails (head too short) + # test_hcom will raise IndexError on head[65:69] check but that returns None since slicing doesn't raise + assert result is None + + +def test_asf_scanner_wrong_magic_30_bytes(): + """asf_scanner line 18: 30+ bytes but wrong magic.""" + from puremagic.scanners import asf_scanner + + assert asf_scanner.main(Path("fake.asf"), b"\x00" * 30, b"") is None + + +def test_asf_scanner_file_io_error(): + """asf_scanner lines 28-29: OSError when header_size > len(head) and file doesn't exist.""" + import struct + from puremagic.scanners import asf_scanner + + # Valid ASF magic + header_size=99999 (much larger than head) + obj_count=0 + head = asf_scanner.match_bytes + head += struct.pack(" len(data).""" + import struct + from puremagic.scanners import asf_scanner + + # Valid header with obj_count=1 but no actual object data after the header + head = asf_scanner.match_bytes # 16 bytes + head += struct.pack(" 30 → break + result = asf_scanner.main(Path("fake.asf"), head, b"") + assert result is not None + assert result.extension == ".asf" # Falls through to generic ASF + + +def test_asf_scanner_bad_object_size(): + """asf_scanner line 43: break when obj_size < 24.""" + import struct + from puremagic.scanners import asf_scanner + + header_body = struct.pack(" 1MB return None.""" + large_file = SYSTEM_DIR / "test_large.py" + large_file.write_bytes(b"import os\n" * 200_000) # ~2MB + try: + result = python_scanner.main(large_file, None, None) + assert result is None + finally: + large_file.unlink() + + +def test_python_scanner_no_constructs(): + """python_scanner lines 31-37, 54-55: non-.py file that parses as Python but lacks constructs.""" + # This file parses as valid Python (just assignments) but has no imports/defs/control flow + no_constructs = SYSTEM_DIR / "test_noconstructs.txt" + no_constructs.write_bytes(b"a = 1\nb = 2\nc = 3\nd = 4\ne = 5\nf = 6\ng = 7\n" * 20) + try: + result = python_scanner.main(no_constructs, None, None) + assert result is None + finally: + no_constructs.unlink() + + +def test_python_scanner_few_constructs(): + """python_scanner lines 34-37: non-.py with some constructs but below threshold of 4.""" + few_file = SYSTEM_DIR / "test_few.txt" + # Only 2 imports + padding to exceed 100 byte minimum — below threshold of 4 constructs + few_file.write_bytes(b"import os\nimport sys\nx = 1\ny = 2\n" + b"z = 0\n" * 30) + try: + result = python_scanner.main(few_file, None, None) + assert result is None + finally: + few_file.unlink() + + +# ── Medium tier: main.py coverage ───────────────────────────────────── + + +def test_deepscan_disabled_magic_file(monkeypatch): + """main.py line 351: magic_file returns without deep scan when PUREMAGIC_DEEPSCAN=0.""" + monkeypatch.setenv("PUREMAGIC_DEEPSCAN", "0") + # Force reimport so the env var takes effect at module level + # But scanners are imported at module load time; the env check is in run_deep_scan/single_deep_scan + results = puremagic.magic_file(OFFICE_DIR / "test.docx") + # Without deep scan, all OOXML formats share the same PK magic bytes + assert len(results) > 1 # Multiple matches, not narrowed by scanner + + +def test_deepscan_disabled_magic_stream(monkeypatch): + """main.py line 395: magic_stream returns without deep scan when PUREMAGIC_DEEPSCAN=0.""" + monkeypatch.setenv("PUREMAGIC_DEEPSCAN", "0") + with open(OFFICE_DIR / "test.docx", "rb") as f: + results = puremagic.magic_stream(f, filename=OFFICE_DIR / "test.docx") + assert len(results) > 1 + + +def test_single_deep_scan_disabled(monkeypatch): + """main.py line 451: single_deep_scan returns None when PUREMAGIC_DEEPSCAN=0.""" + monkeypatch.setenv("PUREMAGIC_DEEPSCAN", "0") + result = puremagic.main.single_deep_scan(b"PK\x03\x04", Path("fake.zip"), head=b"\x00", foot=b"\x00") + assert result is None + + +def test_single_deep_scan_none_head(): + """main.py line 453: single_deep_scan returns None when head is None.""" + result = puremagic.main.single_deep_scan(b"PK\x03\x04", Path("fake.zip"), head=None, foot=b"\x00") + assert result is None + result = puremagic.main.single_deep_scan(b"PK\x03\x04", Path("fake.zip"), head=b"\x00", foot=None) + assert result is None + + +def test_catch_all_deep_scan_disabled(monkeypatch): + """main.py line 498: catch_all_deep_scan returns None when PUREMAGIC_DEEPSCAN=0.""" + monkeypatch.setenv("PUREMAGIC_DEEPSCAN", "0") + result = puremagic.main.catch_all_deep_scan(Path("fake.txt"), head=b"\x00", foot=b"\x00") + assert result is None + + +def test_catch_all_deep_scan_none_head(): + """main.py line 500: catch_all_deep_scan returns None when head is None.""" + result = puremagic.main.catch_all_deep_scan(Path("fake.txt"), head=None, foot=b"\x00") + assert result is None + + +def test_file_details_non_regular_file(): + """main.py line 235: file_details raises PureError for directories.""" + import pytest + + with pytest.raises(puremagic.main.PureError, match="Not a regular file"): + puremagic.main.file_details(SYSTEM_DIR) + + +def test_magic_file_no_matches(): + """main.py lines 346-347: identify_all raises PureError, caught and info set to [].""" + # File with random bytes that don't match any magic signature + random_file = SYSTEM_DIR / "test_random.bin" + random_file.write_bytes(bytes(range(256)) * 4) + try: + results = puremagic.magic_file(random_file) + # Should not raise — either returns results from deep scan or empty-ish list + assert isinstance(results, list) + finally: + random_file.unlink() + + +def test_run_deep_scan_no_matches_raises(monkeypatch): + """main.py lines 546-547: run_deep_scan raises PureError when no matches and raise_on_none=True.""" + import pytest + from puremagic.scanners import text_scanner + + random_file = SYSTEM_DIR / "test_unrecognizable.bin" + random_file.write_bytes(b"\x00" * 100) + # Patch catch-all text scanner to return None so we reach the raise + monkeypatch.setattr(text_scanner, "main", lambda *a, **kw: None) + try: + with pytest.raises(puremagic.main.PureError, match="Could not identify file"): + puremagic.main.run_deep_scan([], random_file, b"\x00" * 40, b"\x00" * 40, raise_on_none=True) + finally: + random_file.unlink() + + +# ── Medium tier: hdf5_scanner coverage ──────────────────────────────── + + +def test_hdf5_scanner_no_subtype(): + """hdf5_scanner lines 46-61: valid HDF5 magic but no matching subtype → None.""" + from puremagic.scanners import hdf5_scanner + + hdf5_file = SYSTEM_DIR / "test_generic.hdf5" + hdf5_file.write_bytes(b"\x89HDF\r\n\x1a\n" + b"\x00" * 1024) + try: + result = hdf5_scanner.main(hdf5_file, b"\x89HDF\r\n\x1a\n" + b"\x00" * 100, b"") + assert result is None + finally: + hdf5_file.unlink() + + +def test_hdf5_scanner_anndata_match(): + """hdf5_scanner lines 53-58: HDF5 with AnnData signatures → .h5ad.""" + from puremagic.scanners import hdf5_scanner + + # Create file with HDF5 magic + group paths that look like AnnData + content = b"\x89HDF\r\n\x1a\n" + b"\x00" * 100 + b"/obs" + b"\x00" * 100 + b"/var" + b"\x00" * 100 + hdf5_file = SYSTEM_DIR / "test_anndata.h5ad" + hdf5_file.write_bytes(content) + try: + result = hdf5_scanner.main(hdf5_file, content[:20], b"") + assert result is not None + assert result.extension == ".h5ad" + assert result.name == "AnnData" + assert result.confidence == 0.9 + finally: + hdf5_file.unlink() + + +# ── Medium tier: cfbf_scanner coverage ──────────────────────────────── + + +def test_cfbf_extract_stream_names_incomplete_entry(): + """cfbf_scanner line 60: break on incomplete trailing entry.""" + from puremagic.scanners.cfbf_scanner import _extract_stream_names + + # One valid 128-byte entry + 50 bytes of trailing garbage + entry = bytearray(128) + name = "TestStream".encode("utf-16-le") + b"\x00\x00" + entry[: len(name)] = name + import struct + + struct.pack_into("