From a106dfc77bcf05926119e2dc552edc51f15870c2 Mon Sep 17 00:00:00 2001 From: Martin Etzrodt <31589705+etzm@users.noreply.github.com> Date: Wed, 30 Jul 2025 08:23:46 +0200 Subject: [PATCH 1/2] Add contributor to CITATION.cff and revise paper.md text - Add Martin Etzrodt as contributor in CITATION.cff (fix YAML list item) - Rewrite Summary and Statement of Need sections in paper.md - Add three new citations to paper.bib: - Aksenova et al. (2024) - Data stewardship tools review - Chen et al. (2022) - FAIR Higgs boson dataset - Murray et al. (2021) - Citizen science data curation --- CITATION.cff | 2 +- paper/paper.bib | 35 +++++++++++++++++++++++++++++++++++ paper/paper.md | 12 ++++++++---- 3 files changed, 44 insertions(+), 5 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index a90415e..d030a7c 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -6,7 +6,7 @@ authors: given-names: Titusz email: tp@iscc.io orcid: https://orcid.org/0000-0002-0521-4214 - family-names: Etzrodt + - family-names: Etzrodt given-names: Martin email: etzrodt.martin@gmail.com orcid: https://orcid.org/0000-0003-1928-3904 diff --git a/paper/paper.bib b/paper/paper.bib index cd07928..c1e1021 100644 --- a/paper/paper.bib +++ b/paper/paper.bib @@ -1,3 +1,38 @@ +@article{aksenova2024data, + title = {Current state of data stewardship tools in life science}, + author = {Aksenova, Anna and Johny, Anoop and Adams, Tim and Gribbon, Phil and Jacobs, Marc and Hofmann-Apitius, Martin}, + journal = {Frontiers in Big Data}, + year = {2024}, + volume = {7}, + pages = {1428568}, + doi = {10.3389/fdata.2024.1428568}, + url = {https://www.frontiersin.org/journals/big-data/articles/10.3389/fdata.2024.1428568/full} +} + +@article{chen2022fair, + title = {A {FAIR} and {AI}-ready {Higgs} boson decay dataset}, + author = {Chen, Yifan and Huerta, E. A. and Duarte, Javier and Harris, Philip and Katz, Daniel S. and Neubauer, Mark S. and Diaz, Daniel and Mokhtar, Farouk and Kansal, Raghav and Park, Sang Eon and Kindratenko, Volodymyr V. 
and Zhao, Zhizhen and Rusack, Roger}, + journal = {Scientific Data}, + year = {2022}, + volume = {9}, + number = {31}, + pages = {1--12}, + doi = {10.1038/s41597-021-01109-0}, + url = {https://www.nature.com/articles/s41597-021-01109-0} +} + +@article{murray2021accessible, + title = {Accessible data curation and analytics for international-scale citizen science datasets}, + author = {Murray, Benjamin and Kerfoot, Eric and Chen, Liyuan and Deng, Jie and Graham, Mark S. and Sudre, Carole H. and Molteni, Erika and Canas, Liane S. and Antonelli, Michela and Klaser, Kerstin and Visconti, Alessia and Hammers, Alexander and Chan, Andrew T. and Franks, Paul W. and Davies, Richard and Wolf, Jonathan and Spector, Tim D. and Steves, Claire J. and Modat, Marc and Ourselin, Sebastien}, + journal = {Scientific Data}, + year = {2021}, + volume = {8}, + number = {297}, + pages = {1--12}, + doi = {10.1038/s41597-021-01071-x}, + url = {https://www.nature.com/articles/s41597-021-01071-x} +} + @misc{iso24138:2024, title = {{ISO} 24138:2024 Information and documentation — International Standard Content Code}, author = {{International Organization for Standardization}}, diff --git a/paper/paper.md b/paper/paper.md index 4f9607a..f7f86c0 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -24,15 +24,19 @@ bibliography: paper.bib # Summary -Scientific data management faces unprecedented challenges as research instruments generate ever-larger datasets. In fields like bioimaging, where individual experiments can produce terabytes of data, traditional checksums prove inadequate for both performance and functionality. The International Standard Content Code (ISCC), standardized as ISO 24138:2024 [@iso24138:2024], offers a content-derived identification system that combines data integrity verification with similarity detection capabilities. However, existing implementations process data too slowly for practical use with large scientific datasets. 
ISCC-SUM addresses this performance gap through a Rust implementation [@rust2024] achieving 50-130× speedup over reference implementations, processing data at over 1 GB/s while maintaining full standard compliance. +To facilitate sharing and reuse scientific data management needs effective technologies to allow for identification, tracking, and integrity verification of datasets. As research instruments generate ever-larger datasets, automation of these processes is essential. However, traditional checksums prove inadequate in both performance and functionality. The International Standard Content Code (ISCC), standardized as ISO 24138:2024 [@iso24138:2024], offers a content-derived identification system that combines data integrity verification with similarity detection capabilities, yet existing implementations of the ISCC process data too slowly for practical use with large datasets. Here we implement a Rust version of the ISCC [@rust2024], “ISCC-SUM”, closing the performance gap and achieving a 50-130× speedup over reference implementations, processing data at over 1 GB/s while maintaining full ISO 24138 standard compliance. We demonstrate that ISCC-SUM can effectively handle large bioimaging datasets that are terabytes in size. + # Statement of Need -ISCC-SUM [@pan2025isccsum] provides high-performance implementations of ISCC Data-Code and Instance-Code generation, the two fundamental components for media-agnostic content identification. The Data-Code employs content-defined chunking and MinHash algorithms to create similarity-preserving hashes, enabling researchers to identify near-duplicate datasets even when files have minor variations. The Instance-Code generates cryptographic checksums using BLAKE3 [@blake3:2020], ensuring data integrity while supporting efficient verified streaming through its tree-based structure. +Modern scientific instruments routinely generate datasets exceeding hundreds of gigabytes. 
A manual identification, tracking, and integrity verification of such large amounts of data is impossible, yet essential for achievement of FAIR [@aksenova2024data] [@chen2022fair] [@murray2021accessible] data use. A performant automated and standardized approach across various scientific domains with potential for easy workflow integration is lacking. + +ISCC-SUM directly addresses these requirements through a high-performance implementation of ISCC Data-Code and Instance-Code generation, the two fundamental components for media-agnostic content identification. While the reference implementation of the ISCC can process 7-8 MB/s for pure Python implementations [@iscccore2024], the Rust based ISCC-SUM implementation achieved 950-1050 MB/s reducing processing time from hours to minutes. -This tool directly addresses requirements from the BIO-CODES project [@oscars2024biocodes], part of the European OSCARS initiative for enhancing AI-readiness of bioimaging data. Modern microscopy facilities routinely generate datasets exceeding hundreds of gigabytes, making performance critical for workflow integration. ISCC-SUM processes these files at 950-1050 MB/s, compared to 7-8 MB/s for pure Python implementations [@iscccore2024], reducing processing time from hours to minutes. The familiar checksum-style command-line interface ensures easy adoption, while Python bindings [@pyo3:2024] enable integration into existing data pipelines. Support for container formats like ZARR and HDF5, common in scientific computing, allows direct processing of complex hierarchical datasets. +In ISCC-SUM [@pan2025isccsum], +the Data-Code employs content-defined chunking and MinHash algorithms to create similarity-preserving hashes, enabling researchers to identify near-duplicate datasets even when files have minor variations. 
The Instance-Code generates cryptographic checksums using BLAKE3 [@blake3:2020], ensuring data integrity while supporting efficient verified streaming through its tree-based structure. The familiar checksum-style command-line interface ensures easy adoption, while Python bindings [@pyo3:2024] enable integration into existing data pipelines. Support for container formats like ZARR and HDF5, common in scientific computing, allows direct processing of complex hierarchical datasets. -Beyond bioimaging, ISCC-SUM serves diverse scientific communities requiring robust content identification. The similarity detection capability helps identify redundant submissions in data repositories, track dataset evolution across research projects, and verify exact dataset versions for computational reproducibility. By implementing the ISO 24138:2024 standard [@iso24138:2024], ISCC-SUM ensures global interoperability while introducing extensions like TREEWALK for deterministic directory hashing and wider hash formats for enhanced security. The tool's open-source nature and comprehensive test coverage (100%) provide the reliability essential for scientific infrastructure. +ISCC-SUM can serve diverse scientific communities requiring robust content identification. The similarity detection capability helps identify redundant submissions in data repositories, track dataset evolution across research projects, and verify exact dataset versions for computational reproducibility. By implementing the ISO 24138:2024 standard [@iso24138:2024], ISCC-SUM ensures global interoperability while introducing extensions like TREEWALK for deterministic directory hashing and wider hash formats for enhanced security. The tool's open-source nature and comprehensive test coverage (100%) provide the reliability essential for scientific infrastructure. 
# Acknowledgements From 8aef194bcc17a2c5580bda33d52888aca950cc64 Mon Sep 17 00:00:00 2001 From: Martin Etzrodt <31589705+etzm@users.noreply.github.com> Date: Wed, 30 Jul 2025 08:33:54 +0200 Subject: [PATCH 2/2] Adding-contributor changed citation position --- paper/paper.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paper/paper.md b/paper/paper.md index f7f86c0..58d7e01 100644 --- a/paper/paper.md +++ b/paper/paper.md @@ -29,7 +29,7 @@ To facilitate sharing and reuse scientific data management needs effective techn # Statement of Need -Modern scientific instruments routinely generate datasets exceeding hundreds of gigabytes. A manual identification, tracking, and integrity verification of such large amounts of data is impossible, yet essential for achievement of FAIR [@aksenova2024data] [@chen2022fair] [@murray2021accessible] data use. A performant automated and standardized approach across various scientific domains with potential for easy workflow integration is lacking. +Modern scientific instruments routinely generate datasets exceeding hundreds of gigabytes. Manual identification, tracking, and integrity verification of such large amounts of data are impossible, yet essential for achieving FAIR data use [@aksenova2024data; @chen2022fair; @murray2021accessible]. A performant, automated, and standardized approach across various scientific domains with potential for easy workflow integration is lacking. ISCC-SUM directly addresses these requirements through a high-performance implementation of ISCC Data-Code and Instance-Code generation, the two fundamental components for media-agnostic content identification. While the reference implementation of the ISCC can process 7-8 MB/s for pure Python implementations [@iscccore2024], the Rust based ISCC-SUM implementation achieved 950-1050 MB/s reducing processing time from hours to minutes.