iscc · ash-0x00 · Jun 8, 2026 · Jun 8, 2026 · Jun 8, 2026 · Jun 8, 2026
diff --git a/README.md b/README.md
@@ -62,6 +62,37 @@ uv run twinspect info             # Show data folder information
 uv run twinspect checksum <path>  # Compute folder checksum
 ```
 
+## Bioimage conversion benchmark
+
+The `bioimage_convert_1000` dataset builds 1000 same-source conversion clusters from public Broad
+Bioimage Benchmark Collection sources. Each cluster contains the selected source bioimage plus
+OME-TIFF, TIFF, PNG, JPEG 2000, DICOM and ICS variants converted with pinned
+OME Bio-Formats bftools `8.5.0` using default conversion settings only.
+
+TwinSpect downloads and verifies the bftools archive automatically on first use:
+
+- URL: `https://downloads.openmicroscopy.org/bio-formats/8.5.0/artifacts/bftools.zip`
+- SHA-256: `07a3bb1d3de84da3a709655a1008cb2d9b19becc5bad4ae4112633aec9380478`
+- Default cache: `~/.cache/twinspect/bioformats`
+
+Bio-Formats bftools is a Java distribution with Unix and Windows launchers (`bfconvert` and
+`bfconvert.bat`). A working Java runtime is required. Custom converters are still supported via
+`TWINSPECT_BIOIMAGE_CONVERT_TEMPLATE` or `TWINSPECT_BIOIMAGE_CONVERT_BIN`; the binary override uses
+the legacy `imgcnv -i INPUT -o OUTPUT -t FORMAT` argument shape, while arbitrary CLIs should use the
+template override.
+
+The benchmark measures same-source conversion robustness. It intentionally uses default
+format conversions rather than synthetic edits, brightness shifts, blur, or custom compression
+flags. On BBBC smoke runs with Bio-Formats defaults, OME-TIFF/TIFF/PNG/DICOM/ICS
+generally preserve identical IMAGEWALK Data-Codes, while JPEG 2000 introduces converter drift
+on some BMP sources. A stronger real-world stress case is lossy proprietary microscopy input,
+especially Zeiss CZI files using JPEG-XR compression converted to OME-Zarr with `bioformats2raw`:
+different valid decoder/conversion paths can preserve about 96% exact pixels while producing
+non-identical IMAGEWALK codes. TwinSpect treats `.zarr` directories as single benchmark media
+inputs so such CZI-to-OME-Zarr clusters can be added without hashing internal chunk files.
+This mix of exact-match conversions and genuine drift cases is part of the benchmark pressure and
+keeps the dataset semantics converter-induced rather than hand-edited.
+
 ## Documentation
 
 The benchmark results and methodology are documented at **https://eval.iscc.codes**, including:

diff --git a/config.yml b/config.yml
@@ -18,6 +18,16 @@ algorithms:
     url: https://github.com/iscc/iscc-core/blob/main/iscc_core/code_content_image.py
     dependencies:
       - iscc-sdk>=0.8.5
+  - name: ISCC BioImage Data-Code IW 64-Bit
+    label: bioimage_data_code_iw_64
+    mode: image
+    function: twinspect.algos.iscc_bio:bioimage_data_code_iw_64
+    info: IMAGEWALK-based bioimage Data-Code using iscc-bio. Decodes bioimage pixel planes via
+      BioIO/OME-NGFF, canonicalizes plane bytes, and generates a 64-bit ISCC Data-Code for
+      same-image/different-format bioimage matching.
+    url: https://github.com/bio-codes/iscc-bio
+    dependencies:
+      - iscc-bio[readers] @ git+https://github.com/bio-codes/iscc-bio.git@940bbbbb961e4b5316d95a5619b9d797fe70c16d
   - name: ISCC Audio-Code V0 64-Bit
     label: audio_code_v0_64
     mode: audio
@@ -95,6 +105,21 @@ datasets:
     url: https://mfnd.similarity.eu/data/truthfiles/polito/IND_clusters.txt
     mode: image
     installer: twinspect.datasets.mfnd:install
+  - name: BioImage Convert 1000
+    label: bioimage_convert_1000
+    info: Reproducible 1000-cluster public bioimage benchmark built from Broad Bioimage
+      Benchmark Collection archives. Each selected source image is converted with pinned OME
+      Bio-Formats bftools defaults into common, exotic, and microscopy-oriented single-file
+      formats, producing same-source clusters for evaluating iscc-bio IMAGEWALK Data-Code
+      matching under converter-induced format drift. Directory-backed media such as
+      OME-Zarr are treated as single benchmark inputs so CZI/JPEG-XR to OME-Zarr decoder
+      drift cases can be added without hashing internal chunks.
+    url: https://bbbc.broadinstitute.org/
+    mode: image
+    installer: twinspect.datasets.bioimage_convert:install
+    samples: 1000
+    clusters: 1000
+    seed: 0
   - name: ISCC-FMA-10k
     label: iscc_fma_10k
     info: The ISCC-FMA-10k benchmark is a subset of [Free Music Archive 
@@ -206,6 +231,20 @@ metrics:
     label: distribution
     function: twinspect.metrics.distribution:distribution
 benchmarks:
+  - algorithm_label: bioimage_data_code_iw_64
+    dataset_label: bioimage_convert_1000
+    metric_labels:
+      - speed
+      - effectiveness
+      - distribution
+    active: true
+  - algorithm_label: image_code_v0_64
+    dataset_label: bioimage_convert_1000
+    metric_labels:
+      - speed
+      - effectiveness
+      - distribution
+    active: true
   - algorithm_label: image_code_v0_64
     dataset_label: mirflickr_mfnd
     metric_labels:

diff --git a/pyproject.toml b/pyproject.toml
@@ -84,3 +84,8 @@ format-code = { cmd = "ruff format .", help = "Code style formatting with ruff"
 format-md = { script = "twinspect.dev:format_md", help = "Markdown formating with mdformat" }
 fix-line-endings = { script = "twinspect.dev:fix_line_endings", help = "Convert line endings to LF"}
 all = ["format-yaml", "validate-schema", "generate-code", "format-code", "format-md", "fix-line-endings"]
+
+[dependency-groups]
+dev = [
+    "pytest>=8.0.0",
+]
diff --git a/tests/test_bioimage_config.py b/tests/test_bioimage_config.py
@@ -0,0 +1,22 @@
+import twinspect as ts
+
+
+def test_bioimage_dataset_algorithm_and_benchmark_are_configured():
+    """The bioimage dataset, algorithm and benchmark entries are present in the loaded config."""
+    ds = ts.Dataset.from_label("bioimage_convert_1000")
+    assert ds is not None
+    assert (ds.samples, ds.clusters) == (1000, 1000)
+    assert ds.installer == "twinspect.datasets.bioimage_convert:install"
+
+    algo = next(entry for entry in ts.cnf.algorithms if entry.label == "bioimage_data_code_iw_64")
+    assert algo.function == "twinspect.algos.iscc_bio:bioimage_data_code_iw_64"
+    assert algo.mode.value == "image"
+
+    bench = next(
+        item
+        for item in ts.cnf.benchmarks
+        if item.algorithm_label == "bioimage_data_code_iw_64"
+        and item.dataset_label == "bioimage_convert_1000"
+    )
+    assert bench.active is True
+    assert "effectiveness" in bench.metric_labels
diff --git a/tests/test_bioimage_convert_dataset.py b/tests/test_bioimage_convert_dataset.py
@@ -0,0 +1,229 @@
+from pathlib import Path
+from zipfile import ZipInfo
+
+import pytest
+
+from twinspect.datasets import bioimage_convert as bic
+from twinspect.metrics.eff import load_simprints
+
+
+def zinfo(name, size=100):
+    member = ZipInfo(name)  # stand-in archive entry; no real zip file is opened
+    member.file_size = size
+    return member
+
+
+def test_is_eligible_member_filters_non_images_empty_and_oversized():
+    """Only non-empty image members within the size cap are eligible."""
+    for name in ("a/plate.tif", "a/plate.BMP"):
+        assert bic.is_eligible_member(zinfo(name, 42), max_file_size=100)
+    for name, size in (("a/readme.txt", 42), ("a/empty.tif", 0), ("a/huge.tif", 101)):
+        assert not bic.is_eligible_member(zinfo(name, size), max_file_size=100)
+
+
+def test_select_samples_is_reproducible_and_size_capped(monkeypatch):
+    """Selecting twice with the same seed gives equal samples and forwards the size cap."""
+    seen = []
+
+    def fake_archive_image_infos(url, max_file_size):
+        seen.append((url, max_file_size))
+        return [zinfo(f"{Path(url).stem}-{n}.tif", n + 1) for n in range(5)]
+
+    monkeypatch.setattr(bic, "archive_image_infos", fake_archive_image_infos)
+    sources = (
+        {"label": "a", "url": "https://example.test/a.zip"},
+        {"label": "b", "url": "https://example.test/b.zip"},
+    )
+    kwargs = {"samples": 6, "seed": 7, "max_file_size": 123, "sources": sources}
+    first, second = bic.select_samples(**kwargs), bic.select_samples(**kwargs)
+
+    assert first == second
+    assert len(first) == 6
+    assert seen[0][1] == 123
+    assert {picked.source_label for picked in first}.issubset({"a", "b"})
+
+
+def test_select_samples_fails_when_not_enough_candidates(monkeypatch):
+    """Asking for more samples than the archives can supply raises a RuntimeError."""
+    lone_source = ({"label": "a", "url": "https://example.test/a.zip"},)
+    monkeypatch.setattr(bic, "archive_image_infos", lambda url, max_file_size: [zinfo("one.tif")])
+    with pytest.raises(RuntimeError, match="need 2"):
+        bic.select_samples(samples=2, sources=lone_source)
+
+
+def test_manifest_roundtrip(tmp_path):
+    """A manifest written to disk loads back equal to the sample list that was written."""
+    entry = bic.BioimageSample(
+        source_label="bbbc-test",
+        archive_url="https://example.test/archive.zip",
+        member_name="folder/source.TIF",
+        file_size=10,
+    )
+    csv_path = tmp_path / "manifest.csv"
+    written = [entry]
+    bic.write_manifest(written, csv_path)
+
+    assert bic.load_manifest(csv_path) == written
+
+
+def test_build_cluster_keeps_original_first_and_conversion_labels(monkeypatch, tmp_path):
+    """A built cluster holds the original plus one labelled file per conversion format."""
+
+    def fake_extract_member(sample, output_path):
+        output_path.write_bytes(b"original")
+        return output_path
+
+    def fake_convert_file(input_path, output_path, format_name):
+        output_path.write_bytes(format_name.encode("utf-8"))
+        return output_path
+
+    monkeypatch.setattr(bic, "extract_member", fake_extract_member)
+    monkeypatch.setattr(bic, "convert_file", fake_convert_file)
+    source = bic.BioimageSample(
+        source_label="bbbc-test",
+        archive_url="https://example.test/archive.zip",
+        member_name="folder/source.TIF",
+        file_size=10,
+    )
+    cluster_dir = tmp_path / "0000000"
+    bic.build_cluster(source, cluster_dir)
+
+    produced = sorted(entry.name for entry in cluster_dir.iterdir())
+    assert produced == [
+        "0original.TIF",
+        "1variant_ome-tiff.ome.tiff",
+        "2variant_tiff.tiff",
+        "3variant_png.png",
+        "4variant_jp2.jp2",
+        "5variant_dicom.dcm",
+        "6variant_ics.ics",
+    ]
+    bic.validate_cluster(cluster_dir)
+
+
+def test_bioimage_convert_command_allows_template(monkeypatch):
+    """The template override is expanded into an argument list for the custom converter.
+
+    The binary override is cleared first so an ambient value cannot influence the result.
+    """
+    monkeypatch.delenv("TWINSPECT_BIOIMAGE_CONVERT_BIN", raising=False)
+    monkeypatch.setenv(
+        "TWINSPECT_BIOIMAGE_CONVERT_TEMPLATE",
+        "converter --input {input} --output {output} --format {format}",
+    )
+
+    command = bic.bioimage_convert_command(Path("in.tif"), Path("out.png"), "png")
+
+    # Placeholders are replaced with the stringified paths and the format name.
+    assert command == ["converter", "--input", "in.tif", "--output", "out.png", "--format", "png"]
+
+
+def test_bioformats_tools_archive_is_pinned():
+    assert bic.BIOFORMATS_VERSION == "8.5.0"  # pinned bftools release
+    assert bic.BFTOOLS_URL.endswith("/bio-formats/8.5.0/artifacts/bftools.zip")  # path suffix only
+    assert bic.BFTOOLS_SHA256 == "07a3bb1d3de84da3a709655a1008cb2d9b19becc5bad4ae4112633aec9380478"
+
+
+def test_default_converter_command_uses_pinned_bfconvert(monkeypatch, tmp_path):
+    """Without overrides, the command is the bfconvert launcher followed by input and output."""
+    for override in ("TWINSPECT_BIOIMAGE_CONVERT_TEMPLATE", "TWINSPECT_BIOIMAGE_CONVERT_BIN"):
+        monkeypatch.delenv(override, raising=False)
+    launcher = tmp_path / ("bfconvert.bat" if bic.platform.system() == "Windows" else "bfconvert")
+    launcher.write_text("", encoding="utf-8")
+    monkeypatch.setattr(bic, "ensure_bioformats_tools", lambda cache_dir=None: launcher)
+
+    argv = bic.bioimage_convert_command(Path("in.tif"), Path("out.ome.tiff"), "ome-tiff")
+    assert argv == [str(launcher), "in.tif", "out.ome.tiff"]
+
+
+
+def test_ensure_bioformats_tools_downloads_verifies_and_extracts(monkeypatch, tmp_path):
+    """With the digest pinned to a fixture archive, the launcher and jar end up extracted."""
+    import hashlib
+    from zipfile import ZipFile
+    on_windows = bic.platform.system() == "Windows"
+    launcher_name = "bfconvert.bat" if on_windows else "bfconvert"
+    fixture = tmp_path / "fixture-bftools.zip"
+    with ZipFile(fixture, "w") as bundle:
+        bundle.writestr(f"bftools/{launcher_name}", "echo bfconvert")
+        bundle.writestr("bftools/bf.sh", "echo bf")
+        bundle.writestr("bftools/bioformats_package.jar", "jar")
+    payload = fixture.read_bytes()
+    monkeypatch.setattr(bic, "BFTOOLS_SHA256", hashlib.sha256(payload).hexdigest())
+    monkeypatch.setattr(bic, "download_file", lambda url, path: path.write_bytes(payload))
+
+    launcher = bic.ensure_bioformats_tools(tmp_path / "tools")
+
+    assert launcher.name == launcher_name
+    assert launcher.exists()
+    assert (launcher.parent / "bioformats_package.jar").exists()
+    if not on_windows:
+        assert launcher.stat().st_mode & 0o111
+        assert (launcher.parent / "bf.sh").stat().st_mode & 0o111
+
+
+def test_ensure_bioformats_tools_selects_windows_launcher(monkeypatch, tmp_path):
+    """When the platform reports Windows, the returned launcher is `bfconvert.bat`."""
+    import hashlib
+    from zipfile import ZipFile
+
+    fixture = tmp_path / "fixture-bftools.zip"
+    with ZipFile(fixture, "w") as bundle:
+        bundle.writestr("bftools/bfconvert.bat", "call bf.bat")
+        bundle.writestr("bftools/bioformats_package.jar", "jar")
+    payload = fixture.read_bytes()
+
+    monkeypatch.setattr(bic.platform, "system", lambda: "Windows")
+    monkeypatch.setattr(bic, "BFTOOLS_SHA256", hashlib.sha256(payload).hexdigest())
+    monkeypatch.setattr(bic, "download_file", lambda url, path: path.write_bytes(payload))
+
+    launcher = bic.ensure_bioformats_tools(tmp_path / "tools")
+    assert launcher.name == "bfconvert.bat"
+
+
+def test_safe_extract_zip_rejects_path_traversal(tmp_path):
+    """An archive holding a `../evil` member is refused with an "Unsafe archive" error."""
+    from zipfile import ZipFile
+
+    hostile = tmp_path / "evil.zip"
+    with ZipFile(hostile, "w") as bundle:
+        bundle.writestr("../evil", "nope")
+    with pytest.raises(RuntimeError, match="Unsafe archive"):
+        bic.safe_extract_zip(hostile, tmp_path / "extract")
+
+
+def test_ensure_bioformats_tools_rejects_checksum_mismatch(monkeypatch, tmp_path):
+    """A download that does not hash to the pinned digest raises a checksum RuntimeError."""
+    monkeypatch.setattr(bic, "BFTOOLS_SHA256", "0" * 64)
+    monkeypatch.setattr(bic, "download_file", lambda url, path: path.write_bytes(b"wrong"))
+    with pytest.raises(RuntimeError, match="checksum"):
+        bic.ensure_bioformats_tools(tmp_path / "tools")
+
+
+def test_validate_file_size_rejects_empty_and_oversized(tmp_path):
+    """Zero-byte files and files above the size cap are both rejected."""
+    blank_file, big_file = tmp_path / "empty.tif", tmp_path / "oversized.tif"
+    blank_file.write_bytes(b"")
+    big_file.write_bytes(b"abc")
+
+    with pytest.raises(RuntimeError, match="Empty"):
+        bic.validate_file_size(blank_file)
+    with pytest.raises(RuntimeError, match="exceeds"):
+        bic.validate_file_size(big_file, max_file_size=2)
+
+
+def test_cluster_layout_matches_twinspect_ground_truth_parser(tmp_path):
+    """Cluster folder and file naming parse into cluster, original flag and transform columns."""
+    csv_text = (
+        "id;code;file;size;time\n"
+        "0;0000000000000000;0000000/0original.TIF;1;1\n"
+        "1;0000000000000000;0000000/1variant_ome-tiff.ome.tiff;1;1\n"
+        "2;ffffffffffffffff;distractor.TIF;1;1\n"
+    )
+    csv_path = tmp_path / "simprint.csv"
+    csv_path.write_text(csv_text, encoding="utf-8")
+    frame = load_simprints(csv_path)
+    assert frame.loc[0, "cluster"] == "0000000"
+    assert bool(frame.loc[0, "is_original"]) is True
+    assert frame.loc[1, "transform"] == "ome-tiff"
+    assert frame.loc[2, "cluster"] is None