Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/CI_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ jobs:
fail-fast: false
matrix:
os: ['ubuntu-latest', 'macos-latest', 'windows-latest', 'macos-14']
python-version: ['3.10', '3.11', '3.12']
python-version: ['3.11', '3.12']
exclude:
# already tested in first_check job
- python-version: 3.12
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/CI_publish_pypi.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: 3.9
python-version: 3.11
- name: Install dependencies
run: |
python -m pip install --upgrade pip
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ bioRxiv 2024.03.25.586580; doi: https://doi.org/10.1101/2024.03.25.586580
## Setup
### Requirements

Python 3.10, 3.11, 3.12 (higher will likely work but is not tested systematically).
Python 3.11, 3.12 (higher will likely work, but is not tested systematically).

### Installation
Installation is expected to take 10-20 minutes.
Expand Down Expand Up @@ -61,7 +61,7 @@ Alternatively, simply install in the environment of your choice by `pip install
We recommend to run the complete tutorial in [notebooks/MS2DeepScore_tutorial.ipynb](https://github.com/matchms/ms2deepscore/blob/main/notebooks/tutorials/ms2deepscore_tutorial.ipynb)
for a more extensive fully-working example on test data, including explanations on how to visualize the results.
The expected run time on a laptop is less than 5 minutes, including automatic model and dummy data download.
Alternatively there are some example scripts below.
Alternatively, there are some example scripts below.

## 1) Compute spectral similarities
We provide a model which was trained on > 500,000 MS/MS combined spectra from [GNPS](https://gnps.ucsd.edu/), [Mona](https://mona.fiehnlab.ucdavis.edu/), MassBank and MSnLib.
Expand Down
2 changes: 1 addition & 1 deletion ms2deepscore/SettingsMS2Deepscore.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,7 @@ def __init__(self, validate_settings=True, **settings):
self.max_pair_resampling = 10000000

# Tanimoto score settings
self.fingerprint_type: str = "daylight"
self.fingerprint_type: str = "rdkit_binary"
self.fingerprint_nbits: int = 4096

# Data augmentation
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def __init__(
model_file_name,
positive_validation_spectra,
negative_validation_spectra,
fingerprint_type="daylight",
fingerprint_type="rdkit_binary",
n_bits_fingerprint=2048,
):
self.model_file_name = model_file_name
Expand Down Expand Up @@ -50,7 +50,7 @@ def __init__(
del self.model

def get_tanimoto_and_prediction_pairs(
self, spectra_1, spectra_2=None, label="", fingerprint_type="daylight", n_bits=2048
self, spectra_1, spectra_2=None, label="", fingerprint_type="rdkit_binary", n_bits=2048
) -> PredictionsAndTanimotoScores:
symmetric = False
if spectra_2 is None:
Expand Down
117 changes: 117 additions & 0 deletions ms2deepscore/fingerprint_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
from typing import List
import numpy as np
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from chemap import compute_fingerprints, FingerprintConfig


SUPPORTED_FINGERPRINT_TYPES = {
"rdkit_binary",
"rdkit_count",
"rdkit_binary_unfolded",
"rdkit_count_unfolded",
}


def _inchi_to_smiles(inchi: str) -> str | None:
    """Convert an InChI string to canonical SMILES; return None if parsing fails."""
    molecule = Chem.MolFromInchi(inchi)
    return None if molecule is None else Chem.MolToSmiles(molecule)


def normalize_to_smiles(smiles_or_inchi: str | List[str]) -> str | List[str | None]:
"""
Convert InChI entries to SMILES. Leave SMILES unchanged.
Invalid InChI entries return None.
"""
if isinstance(smiles_or_inchi, str):
if smiles_or_inchi.startswith("InChI="):
return _inchi_to_smiles(smiles_or_inchi)
return smiles_or_inchi

normalized = []
for entry in smiles_or_inchi:
if entry is None:
normalized.append(None)
elif entry.startswith("InChI="):
normalized.append(_inchi_to_smiles(entry))
else:
normalized.append(entry)
return normalized


def matchms_spectrum_to_smiles(spectrum) -> str | None:
if spectrum is None:
return None
if spectrum.get("smiles") is not None:
return spectrum.get("smiles")
if spectrum.get("inchi") is not None:
return _inchi_to_smiles(spectrum.get("inchi"))
return None


def derive_fingerprint_from_smiles(
    smiles: str | List[str],
    fingerprint_type: str = "rdkit_binary",
    nbits: int = 2048,
    policy_invalid_smiles: str = "raise",
) -> np.ndarray:
    """Compute RDKit fingerprints for one SMILES string or a list of them.

    Parameters
    ----------
    smiles:
        A single SMILES string or a list of SMILES strings.
    fingerprint_type:
        One of SUPPORTED_FINGERPRINT_TYPES: "rdkit_binary", "rdkit_count",
        "rdkit_binary_unfolded", "rdkit_count_unfolded".
    nbits:
        Fingerprint length used for the folded variants.
    policy_invalid_smiles:
        Forwarded to chemap's ``invalid_policy`` (e.g. "raise").

    Returns
    -------
    A single fingerprint for a string input, or a batch of fingerprints for
    a list input.

    Raises
    ------
    ValueError
        If the fingerprint type is unsupported or computation failed.
    """
    if fingerprint_type not in SUPPORTED_FINGERPRINT_TYPES:
        raise ValueError(f"Unsupported fingerprint type: {fingerprint_type}")

    generator = rdFingerprintGenerator.GetRDKitFPGenerator(fpSize=nbits)

    # Normalize to a list so chemap always receives a batch; unwrap at the end.
    is_single = isinstance(smiles, str)
    inputs = [smiles] if is_single else smiles

    # The variant is encoded in the type name: "count" selects count
    # fingerprints, "unfolded" disables folding down to nbits.
    fingerprints = compute_fingerprints(
        inputs,
        generator,
        config=FingerprintConfig(
            count=("count" in fingerprint_type),
            folded=("unfolded" not in fingerprint_type),
            return_csr=False,
            invalid_policy=policy_invalid_smiles,
        ),
    )

    # Folded results are expected as a dense ndarray; anything else signals a
    # failed computation. NOTE(review): unfolded results are presumably
    # returned in a different container type — confirm against the chemap API.
    if not isinstance(fingerprints, np.ndarray) and ("unfolded" not in fingerprint_type):
        raise ValueError("Fingerprint computation failed.")

    return fingerprints[0] if is_single else fingerprints


def derive_fingerprint_from_smiles_or_inchi(
    smiles_or_inchi: str | List[str],
    fingerprint_type: str = "rdkit_binary",
    nbits: int = 2048,
    policy_invalid: str = "raise",
) -> np.ndarray:
    """Compute fingerprints for SMILES and/or InChI input.

    InChI entries are first converted to SMILES; SMILES pass through unchanged.

    Parameters
    ----------
    smiles_or_inchi:
        A single SMILES/InChI string or a list of them.
    fingerprint_type:
        One of SUPPORTED_FINGERPRINT_TYPES (see derive_fingerprint_from_smiles).
    nbits:
        Fingerprint length used for the folded variants.
    policy_invalid:
        "raise" raises ValueError on any entry that cannot be converted to
        SMILES; other policies are forwarded to the fingerprint computation,
        and unconvertible list entries are dropped from the output.

    Returns
    -------
    A single fingerprint for a string input, or a 2D batch for a list input.

    Raises
    ------
    ValueError
        Under the "raise" policy, when any input entry cannot be converted,
        or when no valid entries remain.
    """
    normalized = normalize_to_smiles(smiles_or_inchi)

    # Single-string input that failed InChI conversion.
    if normalized is None:
        if policy_invalid == "raise":
            raise ValueError("Could not convert input structure to SMILES.")
        return np.zeros((nbits,), dtype=np.float32)

    if isinstance(normalized, str):
        return derive_fingerprint_from_smiles(
            normalized,
            fingerprint_type=fingerprint_type,
            nbits=nbits,
            policy_invalid_smiles=policy_invalid,
        )

    # List input: entries that failed normalization are None.
    valid_smiles = [x for x in normalized if x is not None]
    if policy_invalid == "raise" and len(valid_smiles) < len(normalized):
        # Bug fix: previously, partially-invalid lists were silently filtered
        # even under the "raise" policy (losing row alignment with the input,
        # and raising only when ALL entries were invalid). Raise instead,
        # consistent with the single-string path above.
        raise ValueError(
            f"{len(normalized) - len(valid_smiles)} of {len(normalized)} "
            "entries could not be converted to SMILES."
        )
    if len(valid_smiles) == 0:
        if policy_invalid == "raise":
            raise ValueError("No valid SMILES/InChI entries available for fingerprinting.")
        return np.zeros((0, nbits), dtype=np.float32)

    # NOTE: under non-"raise" policies invalid entries are dropped, so the
    # output may contain fewer rows than the input list.
    return derive_fingerprint_from_smiles(
        valid_smiles,
        fingerprint_type=fingerprint_type,
        nbits=nbits,
        policy_invalid_smiles=policy_invalid,
    )
2 changes: 1 addition & 1 deletion ms2deepscore/train_new_model/data_augmentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def peak_removal_for_data_augmentation(
bin_indices_below_removal_intensity = where(
(spectrum_tensor > 0) & (spectrum_tensor < augment_removal_intensity)
)[0]
fraction_of_noise_to_remove = random_number_generator.random(1) * augment_removal_max
fraction_of_noise_to_remove = random_number_generator.random() * augment_removal_max
number_of_peaks_to_remove = int(
np.ceil((1 - fraction_of_noise_to_remove) * len(bin_indices_below_removal_intensity))
)
Expand Down
60 changes: 37 additions & 23 deletions ms2deepscore/train_new_model/inchikey_pair_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,12 @@
import heapq
import numpy as np
from matchms import Spectrum
from matchms.filtering import add_fingerprint
from matchms.similarity.vector_similarity_functions import jaccard_index
from chemap.metrics import tanimoto_similarity_dense
from numba import jit, prange
from tqdm import tqdm
from ms2deepscore.SettingsMS2Deepscore import SettingsMS2Deepscore
from ms2deepscore.train_new_model import SpectrumPairGenerator
from ms2deepscore.fingerprint_utils import derive_fingerprint_from_smiles_or_inchi


def create_spectrum_pair_generator(
Expand Down Expand Up @@ -60,7 +60,7 @@ def create_spectrum_pair_generator(

def compute_fingerprints_for_training(
spectra: List[Spectrum],
fingerprint_type: str = "daylight",
fingerprint_type: str = "rdkit_binary",
nbits: int = 2048
) -> Tuple[np.ndarray, List[str]]:
"""Calculates fingerprints for each unique inchikey.
Expand All @@ -83,22 +83,42 @@ def compute_fingerprints_for_training(
print(f"Selected {len(spectra_selected)} spectra with unique inchikeys for calculating tanimoto scores "
f"(out of {len(spectra)} spectra)")

# Compute fingerprints using matchms
spectra_selected = [add_fingerprint(s, fingerprint_type, nbits) \
if s.get("fingerprint") is None else s for s in spectra_selected]
if len(spectra_selected) == 0:
raise ValueError("No spectra with valid structural annotations were found")

# Ignore missing / not-computed fingerprints
fingerprints = [s.get("fingerprint") for s in tqdm(spectra_selected,
desc="Calculating fingerprints")]
idx = np.array([i for i, x in enumerate(fingerprints) if x is not None]).astype(int)
if len(idx) == 0:
structure_list = []
valid_inchikeys = []

for spectrum, inchikey14 in zip(spectra_selected, inchikeys14_unique):
structure = spectrum.get("smiles")
if structure is None:
structure = spectrum.get("inchi")

if structure is None:
continue

structure_list.append(structure)
valid_inchikeys.append(inchikey14)

if len(structure_list) == 0:
raise ValueError("No valid SMILES/InChI entries available for fingerprint calculation")

fingerprints = derive_fingerprint_from_smiles_or_inchi(
structure_list,
fingerprint_type=fingerprint_type,
nbits=nbits,
policy_invalid="keep",
)

if not isinstance(fingerprints, np.ndarray) or fingerprints.shape[0] == 0:
raise ValueError("No fingerprints could be computed")
if len(idx) < len(fingerprints):
print(f"Successfully generated fingerprints for {len(idx)} of {len(fingerprints)} spectra")

fingerprints = np.array([fingerprints[i] for i in idx])
inchikeys14_unique = [inchikeys14_unique[i] for i in idx]
return fingerprints, inchikeys14_unique
if len(valid_inchikeys) != fingerprints.shape[0]:
raise ValueError(
f"Mismatch between inchikeys ({len(valid_inchikeys)}) and fingerprints ({fingerprints.shape[0]})."
)

return fingerprints, valid_inchikeys


@jit(nopython=True, parallel=True)
Expand Down Expand Up @@ -407,37 +427,31 @@ def tanimoto_scores_row(single_fingerprint, list_of_fingerprints):

for idx_fingerprint_j in range(size):
fingerprint_j = list_of_fingerprints[idx_fingerprint_j, :]
tanimoto_score = jaccard_index(single_fingerprint, fingerprint_j)
tanimoto_score = tanimoto_similarity_dense(single_fingerprint, fingerprint_j)
tanimoto_scores[idx_fingerprint_j] = tanimoto_score
return tanimoto_scores



def select_inchi_for_unique_inchikeys(
list_of_spectra: List['Spectrum']
) -> Tuple[List['Spectrum'], List[str]]:
"""Select spectra with most frequent inchi for unique inchikeys.

Method needed to calculate Tanimoto scores.
"""
# Extract inchi's and inchikeys from spectra metadata
inchikeys_list = [s.get("inchikey") for s in list_of_spectra]
inchi_list = [s.get("inchi") for s in list_of_spectra]

inchi_array = np.array(inchi_list)
inchikeys14_array = np.array([x[:14] for x in inchikeys_list])

# Find unique inchikeys
inchikeys14_unique = sorted(set(inchikeys14_array))

spectra_selected = []
for inchikey14 in inchikeys14_unique:
# Indices of matching inchikeys
idx = np.where(inchikeys14_array == inchikey14)[0]

# Find the most frequent inchi for the inchikey
most_common_inchi = Counter(inchi_array[idx]).most_common(1)[0][0]

# ID of the spectrum with the most frequent inchi
ID = idx[np.where(inchi_array[idx] == most_common_inchi)[0][0]]

Expand Down
Loading
Loading