Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,15 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## 2.8.0

### Fixed
- progress bars can be switched off
- adjust random generator for test split

### Changed
- expand tests

## 2.7.2

### Changed
Expand Down
2 changes: 1 addition & 1 deletion ms2deepscore/MS2DeepScore.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def __init__(self, model: SiameseSpectralModel, progress_bar: bool = True):
self.progress_bar = progress_bar

def get_embedding_array(self, spectrums):
return compute_embedding_array(self.model, spectrums)
return compute_embedding_array(self.model, spectrums, progress_bar=self.progress_bar)

def pair(self, reference: Spectrum, query: Spectrum) -> float:
"""Calculate the MS2DeepScore similarity between a reference and a query spectrum.
Expand Down
2 changes: 1 addition & 1 deletion ms2deepscore/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = '2.7.2'
__version__ = '2.8.0'
16 changes: 14 additions & 2 deletions ms2deepscore/models/SiameseSpectralModel.py
Original file line number Diff line number Diff line change
Expand Up @@ -344,7 +344,13 @@ def dense_layer(input_size, output_size, activation="lrelu"):
return nn.Sequential(nn.Linear(input_size, output_size), activations[activation])


def compute_embedding_array(model: SiameseSpectralModel, spectra, datatype="numpy", device=None):
def compute_embedding_array(
model: SiameseSpectralModel,
spectra,
datatype="numpy",
device=None,
progress_bar: bool = True,
):
"""
Compute the embeddings of all given spectra (list of matchms Spectrum objects).

Expand All @@ -361,6 +367,8 @@ def compute_embedding_array(model: SiameseSpectralModel, spectra, datatype="nump
device:
The device on which to perform the computation.
If None, it automatically uses CUDA if available, otherwise CPU.
progress_bar:
Whether to display a progress bar during embedding computation.
"""
if datatype.lower() not in ["numpy", "pytorch"]:
raise ValueError("datatype can only be 'numpy' or 'pytorch'.")
Expand All @@ -372,7 +380,11 @@ def compute_embedding_array(model: SiameseSpectralModel, spectra, datatype="nump
if device is None:
device = torch_device("cuda" if cuda.is_available() else "cpu")
model.to(device)
for i, spec in tqdm(enumerate(spectra), total=len(spectra), desc="Computing spectral embeddings ..."):
for i, spec in tqdm(
enumerate(spectra),
total=len(spectra),
desc="Computing spectral embeddings ...",
disable=not progress_bar):
X = tensorize_spectra([spec], model.model_settings)
with no_grad():
if datatype.lower() == "numpy":
Expand Down
6 changes: 3 additions & 3 deletions ms2deepscore/train_new_model/validation_and_test_split.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import random
import numpy as np
from typing import List, Tuple
from matchms import Spectrum
from tqdm import tqdm
Expand Down Expand Up @@ -32,8 +32,8 @@ def split_spectra_in_random_inchikey_sets(
"""Splits a set of spectra into a val, test and train set. The sizes of the val and test sets are n/k, where n is the number of unique inchikeys.
"""
unique_inchikeys = select_unique_inchikeys(spectra)
random.seed(random_seed)
random.shuffle(unique_inchikeys)
rng = np.random.default_rng(random_seed)
rng.shuffle(unique_inchikeys)
fraction_size = len(unique_inchikeys) // k

validation_inchikeys = unique_inchikeys[-fraction_size:]
Expand Down
131 changes: 115 additions & 16 deletions tests/test_validation_and_test_split.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,137 @@
import numpy as np
import pytest
from matchms import Spectrum

from ms2deepscore.train_new_model.validation_and_test_split import (
select_spectra_belonging_to_inchikey, select_unique_inchikeys,
split_spectra_in_random_inchikey_sets)
select_spectra_belonging_to_inchikey,
select_unique_inchikeys,
split_spectra_in_random_inchikey_sets,
)


def _inchikey(letter: str) -> str:
return 14 * letter


def _make_spectrum(letter: str, idx: int = 0) -> Spectrum:
    """Build a single-peak spectrum whose inchikey is derived from ``letter``.

    ``idx`` is added to the peak's m/z value (100.1), so spectra that share
    an inchikey can still be given different peaks.
    """
    peak_mz = np.array([100.1 + idx])
    peak_intensity = np.array([0.9])
    spectrum_metadata = {"inchikey": _inchikey(letter)}
    return Spectrum(mz=peak_mz, intensities=peak_intensity, metadata=spectrum_metadata)


@pytest.fixture
def sample_spectra():
return [
Spectrum(mz=np.array([100.1]), intensities=np.array([0.9]),
metadata={"inchikey": 14 * "A"}),
Spectrum(mz=np.array([100.1]), intensities=np.array([0.9]),
metadata={"inchikey": 14 * "B"}),
Spectrum(mz=np.array([100.1]), intensities=np.array([0.9]),
metadata={"inchikey": 14 * "B"}),
Spectrum(mz=np.array([100.1]), intensities=np.array([0.9]),
metadata={"inchikey": 14 * "C"}),
_make_spectrum("A", 0),
_make_spectrum("B", 1),
_make_spectrum("B", 2),
_make_spectrum("C", 3),
]


@pytest.fixture
def larger_sample_spectra():
    """Provide 16 spectra: 8 unique inchikeys (letters A-H), 2 spectra each."""
    # Outer loop over letters, inner loop over idx, so the two spectra of
    # one inchikey are adjacent in the returned list.
    return [
        _make_spectrum(letter, idx)
        for letter in "ABCDEFGH"
        for idx in (0, 1)
    ]


def _unique_inchikeys_in_spectra(spectra):
return sorted({s.get("inchikey")[:14] for s in spectra})


def test_select_unique_inchikeys(sample_spectra):
result = select_unique_inchikeys(sample_spectra)
assert result == [14 * "A", 14 * "B", 14 * "C"]
assert result == [_inchikey("A"), _inchikey("B"), _inchikey("C")]


def test_select_spectra_belonging_to_inchikey(sample_spectra):
inchikeys = [14 * "A", 14 * "B"]
inchikeys = [_inchikey("A"), _inchikey("B")]
result = select_spectra_belonging_to_inchikey(sample_spectra, inchikeys)
assert len(result) == 3
assert result[0].get("inchikey") == 14 * "A"
assert result[0].get("inchikey") == _inchikey("A")
assert all(s.get("inchikey")[:14] in inchikeys for s in result)


def test_select_spectra_belonging_to_inchikey_empty_match(sample_spectra):
    """Selecting on an inchikey that no spectrum carries yields an empty list."""
    unknown_inchikeys = [_inchikey("Z")]
    selected = select_spectra_belonging_to_inchikey(sample_spectra, unknown_inchikeys)
    assert selected == []


def test_split_spectra_in_random_inchikey_sets(sample_spectra):
# TODO: this is still a dummy test mostly
def test_split_spectra_in_random_inchikey_sets_preserves_all_spectra(sample_spectra):
val, test, train = split_spectra_in_random_inchikey_sets(sample_spectra, 2, 42)
assert len(val) + len(test) + len(train) == 4
assert len(val) + len(test) + len(train) == len(sample_spectra)


def test_split_spectra_in_random_inchikey_sets_splits_by_inchikey_group(larger_sample_spectra):
    """No inchikey may end up in more than one split, and none may be lost."""
    val, test, train = split_spectra_in_random_inchikey_sets(larger_sample_spectra, 4, 42)

    keys_val = set(_unique_inchikeys_in_spectra(val))
    keys_test = set(_unique_inchikeys_in_spectra(test))
    keys_train = set(_unique_inchikeys_in_spectra(train))

    # Pairwise intersections must all be empty.
    assert not keys_val & keys_test
    assert not keys_val & keys_train
    assert not keys_test & keys_train

    # Together the three splits must cover every inchikey of the input.
    expected_keys = set(_unique_inchikeys_in_spectra(larger_sample_spectra))
    assert keys_val.union(keys_test, keys_train) == expected_keys


def test_split_spectra_in_random_inchikey_sets_expected_unique_group_sizes(larger_sample_spectra):
    """Check the number of inchikeys and spectra in each split for k=4."""
    splits = split_spectra_in_random_inchikey_sets(larger_sample_spectra, 4, 42)
    val, test, train = splits

    # 8 unique inchikeys, k=4 -> fraction_size = 2
    inchikey_counts = [len(_unique_inchikeys_in_spectra(part)) for part in (val, test, train)]
    assert inchikey_counts == [2, 2, 4]

    # two spectra per inchikey
    spectrum_counts = [len(part) for part in (val, test, train)]
    assert spectrum_counts == [4, 4, 8]


def test_split_spectra_in_random_inchikey_sets_same_seed_is_stable(larger_sample_spectra):
    """Two runs with the same seed must assign the same inchikeys to each split."""
    val_a, test_a, train_a = split_spectra_in_random_inchikey_sets(larger_sample_spectra, 4, 42)
    val_b, test_b, train_b = split_spectra_in_random_inchikey_sets(larger_sample_spectra, 4, 42)

    # Compare split by split: validation, then test, then train.
    for first_run, second_run in ((val_a, val_b), (test_a, test_b), (train_a, train_b)):
        assert _unique_inchikeys_in_spectra(first_run) == _unique_inchikeys_in_spectra(second_run)


def test_split_spectra_in_random_inchikey_sets_different_seed_can_change_split(larger_sample_spectra):
    """Different seeds must be able to produce different splits.

    Several seeds are compared rather than one fixed pair, so the test does
    not depend on two particular seeds happening to give different shuffles.
    It only requires that the seed influences the outcome at all.
    """
    distinct_splits = set()
    for seed in (1, 2, 3, 4, 5):
        val, test, train = split_spectra_in_random_inchikey_sets(larger_sample_spectra, 4, seed)
        # Tuples are hashable, so each split can be collected in a set.
        distinct_splits.add((
            tuple(_unique_inchikeys_in_spectra(val)),
            tuple(_unique_inchikeys_in_spectra(test)),
            tuple(_unique_inchikeys_in_spectra(train)),
        ))

    assert len(distinct_splits) > 1


def test_split_spectra_in_random_inchikey_sets_none_seed_still_preserves_partition(larger_sample_spectra):
    """Passing None as the seed must still give disjoint splits that keep every spectrum."""
    val, test, train = split_spectra_in_random_inchikey_sets(larger_sample_spectra, 4, None)
    parts = (val, test, train)

    key_sets = [set(_unique_inchikeys_in_spectra(part)) for part in parts]
    # Check each unordered pair of splits once: (val, test), (val, train), (test, train).
    for position, keys in enumerate(key_sets):
        for other_keys in key_sets[position + 1:]:
            assert keys.isdisjoint(other_keys)

    assert sum(len(part) for part in parts) == len(larger_sample_spectra)
Loading