14,528 changes: 14,528 additions & 0 deletions examples/notebooks/benchmarking1.ipynb

Large diffs are not rendered by default.

1,514 changes: 1,514 additions & 0 deletions examples/notebooks/data/bace.csv

Large diffs are not rendered by default.

82,256 changes: 82,256 additions & 0 deletions examples/notebooks/data/hiv.csv

Large diffs are not rendered by default.

@@ -0,0 +1,2 @@
ChemBERTa_Model,Algorithm,ROC_AUC,Accuracy,F1_Score,Best_Hyperparams
ChemBERTa-zinc-base,RandomForest,0.7646254784034993,0.7644628099173554,0.7615062761506276,"{'n_estimators': 200, 'max_depth': 20, 'min_samples_split': 10, 'min_samples_leaf': 1, 'class_weight': 'balanced'}"
@@ -0,0 +1,2 @@
ChemBERTa_Model,Algorithm,Best_Hyperparameters
ChemBERTa-zinc-base,RandomForest,"{'n_estimators': 200, 'max_depth': 20, 'min_samples_split': 10, 'min_samples_leaf': 1, 'class_weight': 'balanced'}"
@@ -0,0 +1,2 @@
ChemBERTa_Model,Model_Name,Description,Feature_Dimension,Algorithm,ROC_AUC,Accuracy,F1_Score,Precision,Recall,Best_Hyperparams,Featurization_Time_Seconds
ChemBERTa-zinc-base,seyonec/ChemBERTa-zinc-base-v1,Base model trained on ZINC dataset (768 dim),768,RandomForest,0.7646254784034993,0.7644628099173554,0.7615062761506276,0.7520661157024794,0.7711864406779662,"{'n_estimators': 200, 'max_depth': 20, 'min_samples_split': 10, 'min_samples_leaf': 1, 'class_weight': 'balanced'}",7.359382629394531
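
The Best_Hyperparams column and the ROC_AUC/Accuracy/F1 metrics in these rows follow the usual scikit-learn cross-validated search pattern. A minimal sketch of how such a row could be produced, assuming GridSearchCV over a RandomForest fit on ChemBERTa embeddings (X, y, and the parameter grid below are illustrative placeholders, not values taken from the notebook):

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score

# X: ChemBERTa embeddings (n_molecules x 768); y: binary activity labels
X, y = np.random.rand(200, 768), np.random.randint(0, 2, 200)  # placeholder data
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 10],
    "min_samples_leaf": [1, 2],
    "class_weight": ["balanced", None],
}
search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid,
                      scoring="roc_auc", cv=5, n_jobs=-1)
search.fit(X_train, y_train)

pred = search.predict(X_test)
proba = search.predict_proba(X_test)[:, 1]
print(search.best_params_)  # e.g. {'n_estimators': 200, 'max_depth': 20, ...}
print(roc_auc_score(y_test, proba), accuracy_score(y_test, pred), f1_score(y_test, pred))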
@@ -0,0 +1,4 @@
,ROC_AUC,ROC_AUC,ROC_AUC,ROC_AUC,Accuracy,F1_Score,Precision,Recall,Feature_Dimension,Featurization_Time_Seconds
,mean,std,max,min,mean,mean,mean,mean,first,first
ChemBERTa_Model,,,,,,,,,,
ChemBERTa-zinc-base,0.7646,,0.7646,0.7646,0.7645,0.7615,0.7521,0.7712,768,7.3594
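
The two stacked header rows above are the flattened CSV form of pandas MultiIndex columns, which is what a groupby/agg summary writes out. A minimal sketch of how this summary might be generated, assuming the detailed per-run results were saved to a hypothetical results.csv:

import pandas as pd

# results.csv: one row per (model, algorithm) run, as in the detailed CSV above
results = pd.read_csv("results.csv")  # hypothetical path
summary = results.groupby("ChemBERTa_Model").agg({
    "ROC_AUC": ["mean", "std", "max", "min"],
    "Accuracy": "mean",
    "F1_Score": "mean",
    "Precision": "mean",
    "Recall": "mean",
    "Feature_Dimension": "first",
    "Featurization_Time_Seconds": "first",
}).round(4)
summary.to_csv("summary.csv")  # writes the two stacked header rows seen above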
169 changes: 169 additions & 0 deletions examples/test_HuggingFaceFeaturizer_standalone.py
@@ -0,0 +1,169 @@
import logging
import os

# Prefer SafeTensors checkpoints over pickle-based .bin weights when loading
os.environ["TRANSFORMERS_USE_SAFETENSORS"] = "true"

logger = logging.getLogger(__name__)

try:
from transformers import AutoModel, AutoTokenizer
import torch
_transformers_available = True
print(f"PyTorch version: {torch.__version__}")
except ImportError:
_transformers_available = False
logger.warning("Transformers not available.")


class HuggingFaceFeaturizer:
    """
    Standalone ChemBERTa featurizer that loads HuggingFace models with
    SafeTensors weights (tested with torch 2.5.1).
    """

    def __init__(self,
                 model_name: str = "seyonec/ChemBERTa-zinc-base-v1",  # ChemBERTa base model pretrained on ZINC
                 pooling: str = "mean",
                 max_length: int = 512):

if not _transformers_available:
raise ImportError("Transformers and torch are required for HuggingFaceFeaturizer")

self.model_name = model_name
self.pooling = pooling
self.max_length = max_length

print(f"Loading model: {model_name}")
print(" Using SafeTensors format to avoid torch.load issues...")

try:
# Load tokenizer
self.tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load model with explicit safetensors preference
self.model = AutoModel.from_pretrained(
model_name,
trust_remote_code=True,
low_cpu_mem_usage=True,
use_safetensors=True # Explicitly force safetensors
)
self.model.eval()

            # One feature name per embedding dimension
            embedding_dim = self.model.config.hidden_size
            self.feature_names = [f'chemberta_{i}' for i in range(embedding_dim)]

            print(f"Successfully loaded ChemBERTa model: {model_name}")
            print(f"Embedding dimension: {embedding_dim}")

        except Exception as e:
            print(f"Failed to load model: {e}")
            print("Trying alternative model...")
            self._try_alternative_model()

    def _try_alternative_model(self):
        """Fall back to alternative chemistry language models if the requested one fails to load."""
        alternative_models = [
            "seyonec/ChemBERTa-zinc-base-v1",
            "seyonec/PubChem10M_SMILES_BPE_396_250",
            "laituan245/molt5-base"
        ]

        for model_name in alternative_models:
            if model_name == self.model_name:
                continue  # skip the model that already failed in __init__
            try:
                print(f"Trying alternative model: {model_name}")
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
self.model = AutoModel.from_pretrained(
model_name,
trust_remote_code=True,
use_safetensors=True
)
self.model.eval()

                embedding_dim = self.model.config.hidden_size
                self.feature_names = [f'chemberta_{i}' for i in range(embedding_dim)]
                self.model_name = model_name

                print(f"Successfully loaded alternative model: {model_name}")
return

            except Exception as e:
                print(f"Failed with {model_name}: {e}")
                continue

        raise RuntimeError("Could not load any ChemBERTa model with the current torch version")

def featurize_smiles(self, smiles: str):
"""Featurize a single SMILES string."""
try:
if not smiles or not isinstance(smiles, str):
return None

# Tokenize
inputs = self.tokenizer(
smiles,
padding=True,
truncation=True,
max_length=self.max_length,
return_tensors="pt"
)

# Generate embeddings
with torch.no_grad():
outputs = self.model(**inputs)
token_embeddings = outputs.last_hidden_state

            # Mean pooling: average token embeddings, masking out padding tokens
            if self.pooling == "mean":
                attention_mask = inputs['attention_mask']
                # Expand the mask to embedding size so padded positions contribute zero
                input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
                sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
                # Clamp guards against division by zero if a sequence were all padding
                sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
                embedding = sum_embeddings / sum_mask
            elif self.pooling == "cls":
                # CLS pooling: the first token's hidden state represents the molecule
                embedding = token_embeddings[:, 0, :]
else:
raise ValueError(f"Unsupported pooling: {self.pooling}")

return embedding.cpu().numpy().flatten()

except Exception as e:
logger.warning(f"Failed to featurize SMILES {smiles}: {str(e)}")
return None


# Test the featurizer
if __name__ == "__main__":
print(" Testing HuggingFaceFeaturizer with Torch 2.5.1...")
print("=" * 50)

# Test molecules
test_smiles = [
"CCO", # Ethanol
"C1=CC=CC=C1", # Benzene
"CC(=O)O", # Acetic acid
]

try:
featurizer = HuggingFaceFeaturizer()

print(f"\n Testing {len(test_smiles)} molecules:")
print("=" * 50)

for i, smiles in enumerate(test_smiles, 1):
print(f"\n{i}. Processing: {smiles}")
embedding = featurizer.featurize_smiles(smiles)
if embedding is not None:
print(f" Embedding shape: {embedding.shape}")
print(f" First 5 values: {[f'{x:.4f}' for x in embedding[:5]]}")
else:
print(" Failed to generate embedding")

    except Exception as e:
        print(f"Error: {e}")
        print("Alternative solutions:")
        print("  1. Upgrade torch: pip install --upgrade torch")
        print("  2. Clear the HuggingFace cache: huggingface-cli delete-cache")
        print("  3. Use a local model path if available")
3 changes: 2 additions & 1 deletion requirements.txt
@@ -32,4 +32,5 @@ timeout_decorator==0.5.0
IPython
deepchem==2.8.0
pandas<=2.0.0
biosynfoni
biosynfoni
transformers
2 changes: 2 additions & 0 deletions src/deepmol/compound_featurization/__init__.py
@@ -41,3 +41,5 @@
from .mhfp import MHFP

from .biosynfoni import BiosynfoniKeys

from .huggingface_featurizer import HuggingFaceFeaturizer
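
With the package-level export in place, the featurizer becomes importable alongside the other compound featurizers. A hypothetical usage sketch, assuming the packaged class exposes the same constructor and featurize_smiles interface as the standalone script above:

from deepmol.compound_featurization import HuggingFaceFeaturizer

featurizer = HuggingFaceFeaturizer(model_name="seyonec/ChemBERTa-zinc-base-v1", pooling="mean")
embedding = featurizer.featurize_smiles("CCO")  # hypothetical call, mirroring the standalone API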