From 57eef18b457ab37cf7b294fb15678b75bc8deca8 Mon Sep 17 00:00:00 2001
From: Julian Pollmann
Date: Wed, 18 Feb 2026 17:28:42 +0100
Subject: [PATCH 1/2] Load Data from registry (Zenodo) and fix DF issue

---
Review notes (text between "---" and the diffstat is ignored by git am):
* load_collection: the ValueError was constructed but never raised; it is
  now raised.
* load_collection: dropped the startswith("doi") requirement, which would
  have rejected the bare DOI used in the README example; _from_registry
  already adds the "doi:" prefix when it is missing.
* _from_local_file: column labels go through str() before lower(), and a
  missing column is detected with "is None".
* _from_registry: replaced the "Raises" entries, which described errors
  this method does not raise itself.
* chemap/data_loader.py now ends with a newline.
* Line breaks, indentation and whitespace-only context lines were
  reconstructed from the hunk line counts; verify them against the target
  tree before applying. The index hashes are those of the unreviewed patch.

 chemap/data_loader.py | 62 +++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 60 insertions(+), 2 deletions(-)

diff --git a/chemap/data_loader.py b/chemap/data_loader.py
index af95218..1453bc5 100644
--- a/chemap/data_loader.py
+++ b/chemap/data_loader.py
@@ -1,5 +1,6 @@
 import os
 import pathlib
+import re
 
 import pandas as pd
 import pooch
@@ -33,6 +34,30 @@ def load(self, source: str, **kwargs) -> list:
         else:
             raise ValueError(f"Source {source} unknown.")
 
+    def load_collection(self, source: str, **kwargs) -> list:
+        """
+        Loads a dataset collection from a DOI-based registry (e.g. Zenodo).
+
+        Parameters
+        -------------
+        source:
+            A DOI, with or without a leading "doi:" prefix.
+
+        Returns
+        -------------
+        list of paths to the files downloaded from the registry.
+
+        Raises
+        -------------
+        ValueError if no DOI can be detected in source.
+        """
+        doi_pattern = r'(10\.\d{4,9}/[-._;()/:a-zA-Z0-9]+)'
+
+        if re.search(doi_pattern, source) is None:
+            raise ValueError(f"Could not detect DOI in source {source}.")
+
+        return self._from_registry(source, **kwargs)
+
     def _from_local_file(self, path, smiles_column: str = "smiles") -> list:
         """
         Loads a dataset from local file.
@@ -67,10 +92,13 @@ def _from_local_file(self, path, smiles_column: str = "smiles") -> list:
         else:
             raise ValueError(f"Fileformat {suffix} not supported.")
 
-        if smiles_column not in df.columns:
+        column_map = {str(col).lower(): col for col in df.columns}
+        target_col = column_map.get(smiles_column.lower())
+
+        if target_col is None:
             raise ValueError(f"Smiles column {smiles_column} not in dataframe.")
 
-        return df[smiles_column].tolist()
+        return df[target_col].tolist()
 
     def _from_web(self, url: str, **kwargs) -> list:
         """
@@ -93,3 +121,33 @@ def _from_web(self, url: str, **kwargs) -> list:
         )
 
         return self._from_local_file(file_path, **kwargs)
+
+    def _from_registry(self, doi: str, **kwargs) -> list:
+        """
+        Loads a dataset collection from DOI-based registry (e.g., Zenodo).
+
+        Parameters
+        -------------
+        doi:
+            A valid DOI string.
+
+        Returns
+        -------------
+        list of strings with absolute path for all downloaded files.
+
+        Notes
+        -------------
+        Files are downloaded into self.cache_dir.
+        Extra keyword arguments are accepted but currently unused.
+        """
+        if not doi.startswith("doi"):
+            doi = f"doi:{doi}"
+
+        client = pooch.create(
+            path=self.cache_dir,
+            base_url=f"{doi}/",
+            registry=None,
+        )
+        client.load_registry_from_doi()
+
+        return [client.fetch(f, progressbar=True) for f in client.registry]

From eca3832226eb5aa434877bea26e5ff53d37d821e Mon Sep 17 00:00:00 2001
From: Julian Pollmann
Date: Wed, 18 Feb 2026 17:34:26 +0100
Subject: [PATCH 2/2] Update README

---
 README.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/README.md b/README.md
index 5226c55..e1ce203 100644
--- a/README.md
+++ b/README.md
@@ -50,7 +50,12 @@ from chemap import compute_fingerprints, DatasetLoader, FingerprintConfig
 
 
 ds_loader = DatasetLoader()
+# Load a single dataset from a local file
 smiles = ds_loader.load("tests/data/smiles.csv")
+# or load a dataset collection from a DOI-based registry (e.g., Zenodo)
+files = ds_loader.load_collection("10.5281/zenodo.18682050")
+# pass one of the absolute file paths from files
+smiles = ds_loader.load(files[0])
 
 # ----------------------------
 # RDKit: Morgan (folded, dense)