14,528 changes: 14,528 additions & 0 deletions examples/notebooks/benchmarking1.ipynb

Large diffs are not rendered by default.

1,514 changes: 1,514 additions & 0 deletions examples/notebooks/data/bace.csv

Large diffs are not rendered by default.

82,256 changes: 82,256 additions & 0 deletions examples/notebooks/data/hiv.csv

Large diffs are not rendered by default.

@@ -0,0 +1,2 @@
ChemBERTa_Model,Algorithm,ROC_AUC,Accuracy,F1_Score,Best_Hyperparams
ChemBERTa-zinc-base,RandomForest,0.7646254784034993,0.7644628099173554,0.7615062761506276,"{'n_estimators': 200, 'max_depth': 20, 'min_samples_split': 10, 'min_samples_leaf': 1, 'class_weight': 'balanced'}"
@@ -0,0 +1,2 @@
ChemBERTa_Model,Algorithm,Best_Hyperparameters
ChemBERTa-zinc-base,RandomForest,"{'n_estimators': 200, 'max_depth': 20, 'min_samples_split': 10, 'min_samples_leaf': 1, 'class_weight': 'balanced'}"
@@ -0,0 +1,2 @@
ChemBERTa_Model,Model_Name,Description,Feature_Dimension,Algorithm,ROC_AUC,Accuracy,F1_Score,Precision,Recall,Best_Hyperparams,Featurization_Time_Seconds
ChemBERTa-zinc-base,seyonec/ChemBERTa-zinc-base-v1,Base model trained on ZINC dataset (768 dim),768,RandomForest,0.7646254784034993,0.7644628099173554,0.7615062761506276,0.7520661157024794,0.7711864406779662,"{'n_estimators': 200, 'max_depth': 20, 'min_samples_split': 10, 'min_samples_leaf': 1, 'class_weight': 'balanced'}",7.359382629394531
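
The Best_Hyperparams column and the ROC_AUC/Accuracy/F1 metrics in these rows follow the usual scikit-learn cross-validated search pattern. A minimal sketch of how such a row could be produced, assuming GridSearchCV over a RandomForest fit on ChemBERTa embeddings (X, y, and the parameter grid below are illustrative placeholders, not values taken from the notebook):

import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score

# X: ChemBERTa embeddings (n_molecules x 768); y: binary activity labels
X, y = np.random.rand(200, 768), np.random.randint(0, 2, 200)  # placeholder data
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

param_grid = {
    "n_estimators": [100, 200],
    "max_depth": [10, 20, None],
    "min_samples_split": [2, 10],
    "min_samples_leaf": [1, 2],
    "class_weight": ["balanced", None],
}
search = GridSearchCV(RandomForestClassifier(random_state=42), param_grid,
                      scoring="roc_auc", cv=5, n_jobs=-1)
search.fit(X_train, y_train)

pred = search.predict(X_test)
proba = search.predict_proba(X_test)[:, 1]
print(search.best_params_)  # e.g. {'n_estimators': 200, 'max_depth': 20, ...}
print(roc_auc_score(y_test, proba), accuracy_score(y_test, pred), f1_score(y_test, pred))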
@@ -0,0 +1,4 @@
,ROC_AUC,ROC_AUC,ROC_AUC,ROC_AUC,Accuracy,F1_Score,Precision,Recall,Feature_Dimension,Featurization_Time_Seconds
,mean,std,max,min,mean,mean,mean,mean,first,first
ChemBERTa_Model,,,,,,,,,,
ChemBERTa-zinc-base,0.7646,,0.7646,0.7646,0.7645,0.7615,0.7521,0.7712,768,7.3594
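
The two stacked header rows above are the flattened CSV form of pandas MultiIndex columns, which is what a groupby/agg summary writes out. A minimal sketch of how this summary might be generated, assuming the detailed per-run results were saved to a hypothetical results.csv:

import pandas as pd

# results.csv: one row per (model, algorithm) run, as in the detailed CSV above
results = pd.read_csv("results.csv")  # hypothetical path
summary = results.groupby("ChemBERTa_Model").agg({
    "ROC_AUC": ["mean", "std", "max", "min"],
    "Accuracy": "mean",
    "F1_Score": "mean",
    "Precision": "mean",
    "Recall": "mean",
    "Feature_Dimension": "first",
    "Featurization_Time_Seconds": "first",
}).round(4)
summary.to_csv("summary.csv")  # writes the two stacked header rows seen above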
169 changes: 169 additions & 0 deletions examples/test_HuggingFaceFeaturizer_standalone.py
@@ -0,0 +1,169 @@
import logging
import os

# Prefer SafeTensors checkpoints over pickle-based .bin weights when loading
os.environ["TRANSFORMERS_USE_SAFETENSORS"] = "true"

logger = logging.getLogger(__name__)

try:
from transformers import AutoModel, AutoTokenizer
import torch
_transformers_available = True
print(f"PyTorch version: {torch.__version__}")
except ImportError:
_transformers_available = False
logger.warning("Transformers not available.")


class HuggingFaceFeaturizer:
    """
    Standalone ChemBERTa featurizer that loads HuggingFace models with
    SafeTensors weights (tested with torch 2.5.1).
    """

    def __init__(self,
                 model_name: str = "seyonec/ChemBERTa-zinc-base-v1",  # ChemBERTa base model pretrained on ZINC
                 pooling: str = "mean",
                 max_length: int = 512):

if not _transformers_available:
raise ImportError("Transformers and torch are required for HuggingFaceFeaturizer")

self.model_name = model_name
self.pooling = pooling
self.max_length = max_length

print(f"Loading model: {model_name}")
print(" Using SafeTensors format to avoid torch.load issues...")

try:
# Load tokenizer
self.tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load model with explicit safetensors preference
self.model = AutoModel.from_pretrained(
model_name,
trust_remote_code=True,
low_cpu_mem_usage=True,
use_safetensors=True # Explicitly force safetensors
)
self.model.eval()

            # One feature name per embedding dimension
            embedding_dim = self.model.config.hidden_size
            self.feature_names = [f'chemberta_{i}' for i in range(embedding_dim)]

            print(f"Successfully loaded ChemBERTa model: {model_name}")
            print(f"Embedding dimension: {embedding_dim}")

        except Exception as e:
            print(f"Failed to load model: {e}")
            print("Trying alternative model...")
            self._try_alternative_model()

    def _try_alternative_model(self):
        """Fall back to alternative chemistry language models if the requested one fails to load."""
        alternative_models = [
            "seyonec/ChemBERTa-zinc-base-v1",
            "seyonec/PubChem10M_SMILES_BPE_396_250",
            "laituan245/molt5-base"
        ]

        for model_name in alternative_models:
            if model_name == self.model_name:
                continue  # skip the model that already failed in __init__
            try:
                print(f"Trying alternative model: {model_name}")
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
self.model = AutoModel.from_pretrained(
model_name,
trust_remote_code=True,
use_safetensors=True
)
self.model.eval()

                embedding_dim = self.model.config.hidden_size
                self.feature_names = [f'chemberta_{i}' for i in range(embedding_dim)]
                self.model_name = model_name

                print(f"Successfully loaded alternative model: {model_name}")
return

            except Exception as e:
                print(f"Failed with {model_name}: {e}")
                continue

        raise RuntimeError("Could not load any ChemBERTa model with the current torch version")

def featurize_smiles(self, smiles: str):
"""Featurize a single SMILES string."""
try:
if not smiles or not isinstance(smiles, str):
return None

# Tokenize
inputs = self.tokenizer(
smiles,
padding=True,
truncation=True,
max_length=self.max_length,
return_tensors="pt"
)

# Generate embeddings
with torch.no_grad():
outputs = self.model(**inputs)
token_embeddings = outputs.last_hidden_state

            # Mean pooling: average token embeddings, masking out padding tokens
            if self.pooling == "mean":
                attention_mask = inputs['attention_mask']
                # Expand the mask to embedding size so padded positions contribute zero
                input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
                sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
                # Clamp guards against division by zero if a sequence were all padding
                sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
                embedding = sum_embeddings / sum_mask
            elif self.pooling == "cls":
                # CLS pooling: the first token's hidden state represents the molecule
                embedding = token_embeddings[:, 0, :]
else:
raise ValueError(f"Unsupported pooling: {self.pooling}")

return embedding.cpu().numpy().flatten()

except Exception as e:
logger.warning(f"Failed to featurize SMILES {smiles}: {str(e)}")
return None


# Test the featurizer
if __name__ == "__main__":
print(" Testing HuggingFaceFeaturizer with Torch 2.5.1...")
print("=" * 50)

# Test molecules
test_smiles = [
"CCO", # Ethanol
"C1=CC=CC=C1", # Benzene
"CC(=O)O", # Acetic acid
]

try:
featurizer = HuggingFaceFeaturizer()

print(f"\n Testing {len(test_smiles)} molecules:")
print("=" * 50)

for i, smiles in enumerate(test_smiles, 1):
print(f"\n{i}. Processing: {smiles}")
embedding = featurizer.featurize_smiles(smiles)
if embedding is not None:
print(f" Embedding shape: {embedding.shape}")
print(f" First 5 values: {[f'{x:.4f}' for x in embedding[:5]]}")
else:
print(" Failed to generate embedding")

    except Exception as e:
        print(f"Error: {e}")
        print("Alternative solutions:")
        print("  1. Upgrade torch: pip install --upgrade torch")
        print("  2. Clear the HuggingFace cache: huggingface-cli delete-cache")
        print("  3. Use a local model path if available")
3 changes: 2 additions & 1 deletion requirements.txt
@@ -32,4 +32,5 @@ timeout_decorator==0.5.0
IPython
deepchem==2.8.0
pandas<=2.0.0
biosynfoni
biosynfoni
transformers
2 changes: 2 additions & 0 deletions src/deepmol/compound_featurization/__init__.py
@@ -41,3 +41,5 @@
from .mhfp import MHFP

from .biosynfoni import BiosynfoniKeys

from .huggingface_featurizer import HuggingFaceFeaturizer
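
With the package-level export in place, the featurizer becomes importable alongside the other compound featurizers. A hypothetical usage sketch, assuming the packaged class exposes the same constructor and featurize_smiles interface as the standalone script above:

from deepmol.compound_featurization import HuggingFaceFeaturizer

featurizer = HuggingFaceFeaturizer(model_name="seyonec/ChemBERTa-zinc-base-v1", pooling="mean")
embedding = featurizer.featurize_smiles("CCO")  # hypothetical call, mirroring the standalone API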