From d26cdb9906403becff8535a755df94ea76770a51 Mon Sep 17 00:00:00 2001
From: Florian Huber
Date: Fri, 27 Mar 2026 12:31:13 +0100
Subject: [PATCH 01/12] add logcount fingerprints (rdkit)

---
 ms2deepscore/fingerprint_utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/ms2deepscore/fingerprint_utils.py b/ms2deepscore/fingerprint_utils.py
index d05082f8..c38ea7be 100644
--- a/ms2deepscore/fingerprint_utils.py
+++ b/ms2deepscore/fingerprint_utils.py
@@ -8,8 +8,10 @@
 SUPPORTED_FINGERPRINT_TYPES = {
     "rdkit_binary",
     "rdkit_count",
+    "rdkit_logcount",
     "rdkit_binary_unfolded",
     "rdkit_count_unfolded",
+    "rdkit_logcount_unfolded",
 }
 
 
@@ -71,6 +73,7 @@ def derive_fingerprint_from_smiles(
         config=FingerprintConfig(
             count=("count" in fingerprint_type),
             folded=("unfolded" not in fingerprint_type),
+            scaling="log" if "logcount" in fingerprint_type else None,
             return_csr=False,
             invalid_policy=policy_invalid_smiles,
         ),

From 372a8c542e0df000604c2cbadf0eeacb52792e69 Mon Sep 17 00:00:00 2001
From: Florian Huber
Date: Fri, 27 Mar 2026 12:31:28 +0100
Subject: [PATCH 02/12] add logcount fingerprints (rdkit)

---
 tests/test_fingerprint_utils.py | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/tests/test_fingerprint_utils.py b/tests/test_fingerprint_utils.py
index ea09fa4d..24d11120 100644
--- a/tests/test_fingerprint_utils.py
+++ b/tests/test_fingerprint_utils.py
@@ -246,7 +246,8 @@ def test_derive_fingerprint_from_smiles_or_inchi_single_inchi_count_unfolded():
     _assert_unfolded_count_single(fp)
 
 
-@pytest.mark.parametrize("fingerprint_type", ["rdkit_binary", "rdkit_count"])
+@pytest.mark.parametrize("fingerprint_type",
+                         ["rdkit_binary", "rdkit_count", "rdkit_logcount"])
 def test_derive_fingerprint_from_smiles_or_inchi_list_mixed_valid_folded(fingerprint_type):
     fps = derive_fingerprint_from_smiles_or_inchi(
         [VALID_SMILES, VALID_INCHI, VALID_SMILES_2],
@@ -258,6 +259,22 @@ def test_derive_fingerprint_from_smiles_or_inchi_list_mixed_valid_folded(fingerp
     assert np.all(fps.sum(axis=1) > 0)
 
 
+def test_derive_fingerprint_count_vs_logcount():
+    # Check that a logcount fingerprint is (approximately) log1p of the count fingerprint.
+    fps_count = derive_fingerprint_from_smiles_or_inchi(
+        [VALID_SMILES, VALID_INCHI, VALID_SMILES_2],
+        fingerprint_type="rdkit_count",
+        nbits=256,
+    )
+    fps_logcount = derive_fingerprint_from_smiles_or_inchi(
+        [VALID_SMILES, VALID_INCHI, VALID_SMILES_2],
+        fingerprint_type="rdkit_logcount",
+        nbits=256,
+    )
+    assert fps_count.shape == fps_logcount.shape
+    assert np.log1p(fps_count) == pytest.approx(fps_logcount, rel=1e-5)
+
+
 def test_derive_fingerprint_from_smiles_or_inchi_list_mixed_valid_binary_unfolded():
     fps = derive_fingerprint_from_smiles_or_inchi(
         [VALID_SMILES, VALID_INCHI, VALID_SMILES_2],
@@ -267,10 +284,12 @@ def test_derive_fingerprint_from_smiles_or_inchi_list_mixed_valid_binary_unfolde
     _assert_unfolded_binary_list(fps, expected_len=3)
 
 
-def test_derive_fingerprint_from_smiles_or_inchi_list_mixed_valid_count_unfolded():
+@pytest.mark.parametrize("fp_type",
+                         ["rdkit_count_unfolded", "rdkit_logcount_unfolded"])
+def test_derive_fingerprint_count_and_logcount_unfolded(fp_type):
     fps = derive_fingerprint_from_smiles_or_inchi(
         [VALID_SMILES, VALID_INCHI, VALID_SMILES_2],
-        fingerprint_type="rdkit_count_unfolded",
+        fingerprint_type=fp_type,
         nbits=256,
     )
     _assert_unfolded_count_list(fps, expected_len=3)
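
Note on patches 01-02: the new "rdkit_logcount" variants reuse the count fingerprints but pass scaling="log" to chemap's FingerprintConfig. The test above pins down the intended relationship between the two layouts. A minimal sketch of that relationship in plain numpy (independent of chemap; the count values below are made up for illustration, and it is an assumption from the test that the log scaling is log1p):

    import numpy as np

    counts = np.array([0.0, 1.0, 2.0, 7.0], dtype=np.float32)  # hypothetical count fingerprint
    logcounts = np.log1p(counts)  # log(1 + x): zero entries stay exactly zero

    assert logcounts[0] == 0.0                       # absent features remain zero
    assert np.allclose(np.expm1(logcounts), counts)  # log1p is invertible via expm1

Using log1p rather than a plain log keeps empty bits at zero and compresses large counts, which is presumably the motivation for the log-scaled variant.
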
From 9d6671d675b71c4545038695aea3d025a4385cc1 Mon Sep 17 00:00:00 2001
From: Florian Huber
Date: Fri, 27 Mar 2026 12:36:27 +0100
Subject: [PATCH 03/12] central place for fingerprint similarity computations
 (+tests)

---
 .../fingerprint_similarity_computations.py    | 484 ++++++++++++++++++
 ...est_fingerprint_similarity_computations.py | 361 +++++++++++++
 2 files changed, 845 insertions(+)
 create mode 100644 ms2deepscore/fingerprint_similarity_computations.py
 create mode 100644 tests/test_fingerprint_similarity_computations.py

diff --git a/ms2deepscore/fingerprint_similarity_computations.py b/ms2deepscore/fingerprint_similarity_computations.py
new file mode 100644
index 00000000..95afe6d9
--- /dev/null
+++ b/ms2deepscore/fingerprint_similarity_computations.py
@@ -0,0 +1,484 @@
+from typing import Tuple
+
+import numpy as np
+from numba import jit, prange
+from chemap.metrics import (
+    tanimoto_similarity_dense,
+    tanimoto_similarity_sparse,
+    tanimoto_similarity_sparse_binary,
+    tanimoto_similarity_matrix_dense,
+    tanimoto_similarity_matrix_sparse_binary,
+    tanimoto_similarity_matrix_sparse,
+)
+
+
+DENSE_FINGERPRINT_TYPES = {"rdkit_binary", "rdkit_count", "rdkit_logcount"}
+UNFOLDED_BINARY_FINGERPRINT_TYPES = {"rdkit_binary_unfolded"}
+UNFOLDED_COUNT_FINGERPRINT_TYPES = {"rdkit_count_unfolded", "rdkit_logcount_unfolded"}
+SUPPORTED_FINGERPRINT_TYPES = (
+    DENSE_FINGERPRINT_TYPES
+    | UNFOLDED_BINARY_FINGERPRINT_TYPES
+    | UNFOLDED_COUNT_FINGERPRINT_TYPES
+)
+
+
+def is_dense_fingerprint_type(fingerprint_type: str) -> bool:
+    return fingerprint_type in DENSE_FINGERPRINT_TYPES
+
+
+def is_unfolded_binary_fingerprint_type(fingerprint_type: str) -> bool:
+    return fingerprint_type in UNFOLDED_BINARY_FINGERPRINT_TYPES
+
+
+def is_unfolded_count_fingerprint_type(fingerprint_type: str) -> bool:
+    return fingerprint_type in UNFOLDED_COUNT_FINGERPRINT_TYPES
+
+
+def compute_fingerprint_similarity_matrix(
+    fingerprints_1,
+    fingerprints_2,
+    fingerprint_type: str,
+) -> np.ndarray:
+    """Compute pairwise Tanimoto similarities for any supported fingerprint type."""
+    if fingerprint_type not in SUPPORTED_FINGERPRINT_TYPES:
+        raise ValueError(f"Unsupported fingerprint type: {fingerprint_type}")
+
+    if is_dense_fingerprint_type(fingerprint_type):
+        return tanimoto_similarity_matrix_dense(fingerprints_1, fingerprints_2)
+
+    if is_unfolded_binary_fingerprint_type(fingerprint_type):
+        return tanimoto_similarity_matrix_sparse_binary(fingerprints_1, fingerprints_2)
+
+    if is_unfolded_count_fingerprint_type(fingerprint_type):
+        return tanimoto_similarity_matrix_sparse(
+            [x[0] for x in fingerprints_1],
+            [x[1] for x in fingerprints_1],
+            [x[0] for x in fingerprints_2],
+            [x[1] for x in fingerprints_2],
+        )
+
+    raise ValueError(f"Unsupported fingerprint type: {fingerprint_type}")
+
+
+def compute_fingerprint_similarity_row(
+    single_fingerprint,
+    fingerprints,
+    fingerprint_type: str,
+) -> np.ndarray:
+    """Compute similarities of one fingerprint to a collection of fingerprints."""
+    if fingerprint_type not in SUPPORTED_FINGERPRINT_TYPES:
+        raise ValueError(f"Unsupported fingerprint type: {fingerprint_type}")
+
+    if is_dense_fingerprint_type(fingerprint_type):
+        size = fingerprints.shape[0]
+        tanimoto_scores = np.zeros(size, dtype=np.float32)
+        for idx_fingerprint_j in range(size):
+            fingerprint_j = fingerprints[idx_fingerprint_j, :]
+            tanimoto_scores[idx_fingerprint_j] = tanimoto_similarity_dense(
+                single_fingerprint, fingerprint_j
+            )
+        return tanimoto_scores
+
+    # For unfolded fingerprints, use the matrix function and take row 0.
+    return compute_fingerprint_similarity_matrix(
+        [single_fingerprint], fingerprints, fingerprint_type
+    )[0]
+
+
+# Row-based similarity computations
+# ---------------------------------
+
+
+@jit(nopython=True, fastmath=True)
+def tanimoto_scores_row_dense(single_fingerprint, list_of_fingerprints):
+    size = list_of_fingerprints.shape[0]
+    tanimoto_scores = np.zeros(size)
+
+    for idx_fingerprint_j in range(size):
+        fingerprint_j = list_of_fingerprints[idx_fingerprint_j, :]
+        tanimoto_score = tanimoto_similarity_dense(single_fingerprint, fingerprint_j)
+        tanimoto_scores[idx_fingerprint_j] = tanimoto_score
+    return tanimoto_scores
+
+
+@jit(nopython=True, fastmath=True)
+def tanimoto_scores_row_sparse_binary(single_fingerprint, list_of_fingerprints):
+    size = len(list_of_fingerprints)
+    tanimoto_scores = np.zeros(size, dtype=np.float32)
+
+    for idx_fingerprint_j in range(size):
+        fingerprint_j = list_of_fingerprints[idx_fingerprint_j]
+        tanimoto_scores[idx_fingerprint_j] = tanimoto_similarity_sparse_binary(
+            single_fingerprint, fingerprint_j
+        )
+    return tanimoto_scores
+
+
+@jit(nopython=True, fastmath=True)
+def tanimoto_scores_row_sparse_count(
+    single_bins,
+    single_counts,
+    list_of_bins,
+    list_of_counts,
+):
+    size = len(list_of_bins)
+    tanimoto_scores = np.zeros(size, dtype=np.float32)
+
+    for idx_fingerprint_j in range(size):
+        bins_j = list_of_bins[idx_fingerprint_j]
+        counts_j = list_of_counts[idx_fingerprint_j]
+        tanimoto_scores[idx_fingerprint_j] = tanimoto_similarity_sparse(
+            single_bins, single_counts, bins_j, counts_j
+        )
+    return tanimoto_scores
+
+
+@jit(nopython=True, parallel=True)
+def _compute_tanimoto_similarity_per_bin_dense(
+    fingerprints,
+    max_pairs_per_bin,
+    selection_bins=np.array([(x / 10, x / 10 + 0.1) for x in range(10)], dtype=np.float32),
+    include_diagonal=True,
+) -> Tuple[np.ndarray, np.ndarray]:
+    size = fingerprints.shape[0]
+    num_bins = len(selection_bins)
+
+    selected_pairs_per_bin = -1 * np.ones((num_bins, size, max_pairs_per_bin), dtype=np.int32)
+    selected_scores_per_bin = np.zeros((num_bins, size, max_pairs_per_bin), dtype=np.float32)
+
+    for idx_fingerprint_i in prange(size):
+        fingerprint_i = fingerprints[idx_fingerprint_i, :]
+        tanimoto_scores = tanimoto_scores_row_dense(fingerprint_i, fingerprints)
+
+        for bin_number in range(num_bins):
+            selection_bin = selection_bins[bin_number]
+            indices = np.nonzero(
+                (tanimoto_scores > selection_bin[0]) & (tanimoto_scores <= selection_bin[1])
+            )[0]
+
+            if not include_diagonal and idx_fingerprint_i in indices:
+                indices = indices[indices != idx_fingerprint_i]
+
+            np.random.shuffle(indices)
+            indices = indices[:max_pairs_per_bin]
+            num_indices = len(indices)
+
+            selected_pairs_per_bin[bin_number, idx_fingerprint_i, :num_indices] = indices
+            selected_scores_per_bin[bin_number, idx_fingerprint_i, :num_indices] = tanimoto_scores[indices]
+
+    return selected_pairs_per_bin, selected_scores_per_bin
+
+
+@jit(nopython=True, parallel=True)
+def _compute_tanimoto_similarity_per_bin_sparse_binary(
+    fingerprints,
+    max_pairs_per_bin,
+    selection_bins=np.array([(x / 10, x / 10 + 0.1) for x in range(10)], dtype=np.float32),
+    include_diagonal=True,
+) -> Tuple[np.ndarray, np.ndarray]:
+    size = len(fingerprints)
+    num_bins = len(selection_bins)
+
+    selected_pairs_per_bin = -1 * np.ones((num_bins, size, max_pairs_per_bin), dtype=np.int32)
+    selected_scores_per_bin = np.zeros((num_bins, size, max_pairs_per_bin), dtype=np.float32)
+
+    for idx_fingerprint_i in prange(size):
+        fingerprint_i = fingerprints[idx_fingerprint_i]
+        tanimoto_scores = tanimoto_scores_row_sparse_binary(fingerprint_i, fingerprints)
+
+        for bin_number in range(num_bins):
+            selection_bin = selection_bins[bin_number]
+            indices = np.nonzero(
+                (tanimoto_scores > selection_bin[0]) & (tanimoto_scores <= selection_bin[1])
+            )[0]
+
+            if not include_diagonal and idx_fingerprint_i in indices:
+                indices = indices[indices != idx_fingerprint_i]
+
+            np.random.shuffle(indices)
+            indices = indices[:max_pairs_per_bin]
+            num_indices = len(indices)
+
+            selected_pairs_per_bin[bin_number, idx_fingerprint_i, :num_indices] = indices
+            selected_scores_per_bin[bin_number, idx_fingerprint_i, :num_indices] = tanimoto_scores[indices]
+
+    return selected_pairs_per_bin, selected_scores_per_bin
+
+
+@jit(nopython=True, parallel=True)
+def _compute_tanimoto_similarity_per_bin_sparse_count(
+    fingerprints_bins,
+    fingerprints_counts,
+    max_pairs_per_bin,
+    selection_bins=np.array([(x / 10, x / 10 + 0.1) for x in range(10)], dtype=np.float32),
+    include_diagonal=True,
+) -> Tuple[np.ndarray, np.ndarray]:
+    size = len(fingerprints_bins)
+    num_bins = len(selection_bins)
+
+    selected_pairs_per_bin = -1 * np.ones((num_bins, size, max_pairs_per_bin), dtype=np.int32)
+    selected_scores_per_bin = np.zeros((num_bins, size, max_pairs_per_bin), dtype=np.float32)
+
+    for idx_fingerprint_i in prange(size):
+        bins_i = fingerprints_bins[idx_fingerprint_i]
+        counts_i = fingerprints_counts[idx_fingerprint_i]
+        tanimoto_scores = tanimoto_scores_row_sparse_count(
+            bins_i, counts_i, fingerprints_bins, fingerprints_counts
+        )
+
+        for bin_number in range(num_bins):
+            selection_bin = selection_bins[bin_number]
+            indices = np.nonzero(
+                (tanimoto_scores > selection_bin[0]) & (tanimoto_scores <= selection_bin[1])
+            )[0]
+
+            if not include_diagonal and idx_fingerprint_i in indices:
+                indices = indices[indices != idx_fingerprint_i]
+
+            np.random.shuffle(indices)
+            indices = indices[:max_pairs_per_bin]
+            num_indices = len(indices)
+
+            selected_pairs_per_bin[bin_number, idx_fingerprint_i, :num_indices] = indices
+            selected_scores_per_bin[bin_number, idx_fingerprint_i, :num_indices] = tanimoto_scores[indices]
+
+    return selected_pairs_per_bin, selected_scores_per_bin
+
+
+def _split_sparse_count_fingerprints(fingerprints):
+    fingerprint_bins = [x[0] for x in fingerprints]
+    fingerprint_counts = [x[1] for x in fingerprints]
+    return fingerprint_bins, fingerprint_counts
+
+
+def compute_tanimoto_similarity_per_bin(
+    fingerprints,
+    max_pairs_per_bin,
+    fingerprint_type: str,
+    selection_bins=np.array([(x / 10, x / 10 + 0.1) for x in range(10)], dtype=np.float32),
+    include_diagonal=True,
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Dispatch to the appropriate pairwise-per-bin Tanimoto implementation."""
+    if fingerprint_type not in SUPPORTED_FINGERPRINT_TYPES:
+        raise ValueError(f"Unsupported fingerprint type: {fingerprint_type}")
+
+    if is_dense_fingerprint_type(fingerprint_type):
+        return _compute_tanimoto_similarity_per_bin_dense(
+            fingerprints,
+            max_pairs_per_bin=max_pairs_per_bin,
+            selection_bins=selection_bins,
+            include_diagonal=include_diagonal,
+        )
+
+    if is_unfolded_binary_fingerprint_type(fingerprint_type):
+        return _compute_tanimoto_similarity_per_bin_sparse_binary(
+            fingerprints,
+            max_pairs_per_bin=max_pairs_per_bin,
+            selection_bins=selection_bins,
+            include_diagonal=include_diagonal,
+        )
+
+    if is_unfolded_count_fingerprint_type(fingerprint_type):
+        fingerprint_bins, fingerprint_counts = _split_sparse_count_fingerprints(fingerprints)
+        return _compute_tanimoto_similarity_per_bin_sparse_count(
+            fingerprint_bins,
+            fingerprint_counts,
+            max_pairs_per_bin=max_pairs_per_bin,
+            selection_bins=selection_bins,
+            include_diagonal=include_diagonal,
+        )
+
+    raise ValueError(f"Unsupported fingerprint type: {fingerprint_type}")
+
+
+@jit(nopython=True, parallel=True)
+def _compute_tanimoto_similarity_per_bin_between_sets_dense(
+    fingerprints_1,
+    fingerprints_2,
+    max_pairs_per_bin,
+    selection_bins=np.array([(x / 10, x / 10 + 0.1) for x in range(10)], dtype=np.float32),
+) -> Tuple[np.ndarray, np.ndarray]:
+    size_1 = fingerprints_1.shape[0]
+    size_2 = fingerprints_2.shape[0]
+    num_bins = len(selection_bins)
+
+    selected_pairs_per_bin = -1 * np.ones((num_bins, size_1 + size_2, max_pairs_per_bin), dtype=np.int32)
+    selected_scores_per_bin = np.zeros((num_bins, size_1 + size_2, max_pairs_per_bin), dtype=np.float32)
+
+    for idx_fingerprint_i in prange(size_1):
+        fingerprint_i = fingerprints_1[idx_fingerprint_i, :]
+        tanimoto_scores = tanimoto_scores_row_dense(fingerprint_i, fingerprints_2)
+
+        for bin_number in range(num_bins):
+            selection_bin = selection_bins[bin_number]
+            indices = np.nonzero((tanimoto_scores > selection_bin[0]) & (tanimoto_scores <= selection_bin[1]))[0]
+
+            np.random.shuffle(indices)
+            indices = indices[:max_pairs_per_bin]
+            num_indices = len(indices)
+
+            selected_scores_per_bin[bin_number, idx_fingerprint_i, :num_indices] = tanimoto_scores[indices]
+            selected_pairs_per_bin[bin_number, idx_fingerprint_i, :num_indices] = indices + size_1
+
+    for idx_fingerprint_j in prange(size_2):
+        fingerprint_j = fingerprints_2[idx_fingerprint_j, :]
+        idx_fingerprint_corrected = idx_fingerprint_j + size_1
+        tanimoto_scores = tanimoto_scores_row_dense(fingerprint_j, fingerprints_1)
+
+        for bin_number in range(num_bins):
+            selection_bin = selection_bins[bin_number]
+            indices = np.nonzero((tanimoto_scores > selection_bin[0]) & (tanimoto_scores <= selection_bin[1]))[0]
+
+            np.random.shuffle(indices)
+            indices = indices[:max_pairs_per_bin]
+            num_indices = len(indices)
+
+            selected_pairs_per_bin[bin_number, idx_fingerprint_corrected, :num_indices] = indices
+            selected_scores_per_bin[bin_number, idx_fingerprint_corrected, :num_indices] = tanimoto_scores[indices]
+
+    return selected_pairs_per_bin, selected_scores_per_bin
+
+
+@jit(nopython=True, parallel=True)
+def _compute_tanimoto_similarity_per_bin_between_sets_sparse_binary(
+    fingerprints_1,
+    fingerprints_2,
+    max_pairs_per_bin,
+    selection_bins=np.array([(x / 10, x / 10 + 0.1) for x in range(10)], dtype=np.float32),
+) -> Tuple[np.ndarray, np.ndarray]:
+    size_1 = len(fingerprints_1)
+    size_2 = len(fingerprints_2)
+    num_bins = len(selection_bins)
+
+    selected_pairs_per_bin = -1 * np.ones((num_bins, size_1 + size_2, max_pairs_per_bin), dtype=np.int32)
+    selected_scores_per_bin = np.zeros((num_bins, size_1 + size_2, max_pairs_per_bin), dtype=np.float32)
+
+    for idx_fingerprint_i in prange(size_1):
+        fingerprint_i = fingerprints_1[idx_fingerprint_i]
+        tanimoto_scores = tanimoto_scores_row_sparse_binary(fingerprint_i, fingerprints_2)
+
+        for bin_number in range(num_bins):
+            selection_bin = selection_bins[bin_number]
+            indices = np.nonzero((tanimoto_scores > selection_bin[0]) & (tanimoto_scores <= selection_bin[1]))[0]
+
+            np.random.shuffle(indices)
+            indices = indices[:max_pairs_per_bin]
+            num_indices = len(indices)
+
+            selected_scores_per_bin[bin_number, idx_fingerprint_i, :num_indices] = tanimoto_scores[indices]
+            selected_pairs_per_bin[bin_number, idx_fingerprint_i, :num_indices] = indices + size_1
+
+    for idx_fingerprint_j in prange(size_2):
+        fingerprint_j = fingerprints_2[idx_fingerprint_j]
+        idx_fingerprint_corrected = idx_fingerprint_j + size_1
+        tanimoto_scores = tanimoto_scores_row_sparse_binary(fingerprint_j, fingerprints_1)
+
+        for bin_number in range(num_bins):
+            selection_bin = selection_bins[bin_number]
+            indices = np.nonzero((tanimoto_scores > selection_bin[0]) & (tanimoto_scores <= selection_bin[1]))[0]
+
+            np.random.shuffle(indices)
+            indices = indices[:max_pairs_per_bin]
+            num_indices = len(indices)
+
+            selected_pairs_per_bin[bin_number, idx_fingerprint_corrected, :num_indices] = indices
+            selected_scores_per_bin[bin_number, idx_fingerprint_corrected, :num_indices] = tanimoto_scores[indices]
+
+    return selected_pairs_per_bin, selected_scores_per_bin
+
+
+@jit(nopython=True, parallel=True)
+def _compute_tanimoto_similarity_per_bin_between_sets_sparse_count(
+    fingerprints_1_bins,
+    fingerprints_1_counts,
+    fingerprints_2_bins,
+    fingerprints_2_counts,
+    max_pairs_per_bin,
+    selection_bins=np.array([(x / 10, x / 10 + 0.1) for x in range(10)], dtype=np.float32),
+) -> Tuple[np.ndarray, np.ndarray]:
+    size_1 = len(fingerprints_1_bins)
+    size_2 = len(fingerprints_2_bins)
+    num_bins = len(selection_bins)
+
+    selected_pairs_per_bin = -1 * np.ones((num_bins, size_1 + size_2, max_pairs_per_bin), dtype=np.int32)
+    selected_scores_per_bin = np.zeros((num_bins, size_1 + size_2, max_pairs_per_bin), dtype=np.float32)
+
+    for idx_fingerprint_i in prange(size_1):
+        bins_i = fingerprints_1_bins[idx_fingerprint_i]
+        counts_i = fingerprints_1_counts[idx_fingerprint_i]
+        tanimoto_scores = tanimoto_scores_row_sparse_count(
+            bins_i, counts_i, fingerprints_2_bins, fingerprints_2_counts
+        )
+
+        for bin_number in range(num_bins):
+            selection_bin = selection_bins[bin_number]
+            indices = np.nonzero((tanimoto_scores > selection_bin[0]) & (tanimoto_scores <= selection_bin[1]))[0]
+
+            np.random.shuffle(indices)
+            indices = indices[:max_pairs_per_bin]
+            num_indices = len(indices)
+
+            selected_scores_per_bin[bin_number, idx_fingerprint_i, :num_indices] = tanimoto_scores[indices]
+            selected_pairs_per_bin[bin_number, idx_fingerprint_i, :num_indices] = indices + size_1
+
+    for idx_fingerprint_j in prange(size_2):
+        bins_j = fingerprints_2_bins[idx_fingerprint_j]
+        counts_j = fingerprints_2_counts[idx_fingerprint_j]
+        idx_fingerprint_corrected = idx_fingerprint_j + size_1
+        tanimoto_scores = tanimoto_scores_row_sparse_count(
+            bins_j, counts_j, fingerprints_1_bins, fingerprints_1_counts
+        )
+
+        for bin_number in range(num_bins):
+            selection_bin = selection_bins[bin_number]
+            indices = np.nonzero((tanimoto_scores > selection_bin[0]) & (tanimoto_scores <= selection_bin[1]))[0]
+
+            np.random.shuffle(indices)
+            indices = indices[:max_pairs_per_bin]
+            num_indices = len(indices)
+
+            selected_pairs_per_bin[bin_number, idx_fingerprint_corrected, :num_indices] = indices
+            selected_scores_per_bin[bin_number, idx_fingerprint_corrected, :num_indices] = tanimoto_scores[indices]
+
+    return selected_pairs_per_bin, selected_scores_per_bin
+
+
+def compute_tanimoto_similarity_per_bin_between_sets(
+    fingerprints_1,
+    fingerprints_2,
+    max_pairs_per_bin,
+    fingerprint_type: str,
+    selection_bins=np.array([(x / 10, x / 10 + 0.1) for x in range(10)], dtype=np.float32),
+) -> Tuple[np.ndarray, np.ndarray]:
+    """Compute cross-set Tanimoto per bin for all supported fingerprint types."""
+    if fingerprint_type not in SUPPORTED_FINGERPRINT_TYPES:
+        raise ValueError(f"Unsupported fingerprint type: {fingerprint_type}")
+
+    if is_dense_fingerprint_type(fingerprint_type):
+        return _compute_tanimoto_similarity_per_bin_between_sets_dense(
+            fingerprints_1,
+            fingerprints_2,
+            max_pairs_per_bin=max_pairs_per_bin,
+            selection_bins=selection_bins,
+        )
+
+    if is_unfolded_binary_fingerprint_type(fingerprint_type):
+        return _compute_tanimoto_similarity_per_bin_between_sets_sparse_binary(
+            fingerprints_1,
+            fingerprints_2,
+            max_pairs_per_bin=max_pairs_per_bin,
+            selection_bins=selection_bins,
+        )
+
+    if is_unfolded_count_fingerprint_type(fingerprint_type):
+        fingerprints_1_bins, fingerprints_1_counts = _split_sparse_count_fingerprints(fingerprints_1)
+        fingerprints_2_bins, fingerprints_2_counts = _split_sparse_count_fingerprints(fingerprints_2)
+        return _compute_tanimoto_similarity_per_bin_between_sets_sparse_count(
+            fingerprints_1_bins,
+            fingerprints_1_counts,
+            fingerprints_2_bins,
+            fingerprints_2_counts,
+            max_pairs_per_bin=max_pairs_per_bin,
+            selection_bins=selection_bins,
+        )
+
+    raise ValueError(f"Unsupported fingerprint type: {fingerprint_type}")
diff --git a/tests/test_fingerprint_similarity_computations.py b/tests/test_fingerprint_similarity_computations.py
new file mode 100644
index 00000000..0d93bbc9
--- /dev/null
+++ b/tests/test_fingerprint_similarity_computations.py
@@ -0,0 +1,361 @@
+import numpy as np
+import pytest
+
+from ms2deepscore.fingerprint_similarity_computations import (
+    compute_tanimoto_similarity_per_bin,
+    compute_tanimoto_similarity_per_bin_between_sets,
+)
+
+
+@pytest.fixture
+def simple_binary_fingerprints():
+    return np.array([
+        [1, 1, 0, 0],
+        [1, 0, 1, 0],
+        [0, 1, 1, 1],
+        [0, 0, 1, 1],
+    ], dtype=np.bool_)
+
+
+@pytest.fixture
+def simple_count_fingerprints():
+    return np.array([
+        [2, 1, 0, 0],
+        [1, 0, 2, 0],
+        [0, 1, 2, 1],
+        [0, 0, 1, 2],
+    ], dtype=np.float32)
+
+
+@pytest.fixture
+def simple_sparse_binary_fingerprints():
+    return [
+        np.array([0, 1], dtype=np.int64),
+        np.array([0, 2], dtype=np.int64),
+        np.array([1, 2, 3], dtype=np.int64),
+        np.array([2, 3], dtype=np.int64),
+    ]
+
+
+@pytest.fixture
+def simple_sparse_count_fingerprints():
+    return [
+        (np.array([0, 1], dtype=np.int64), np.array([2.0, 1.0], dtype=np.float32)),
+        (np.array([0, 2], dtype=np.int64), np.array([1.0, 2.0], dtype=np.float32)),
+        (np.array([1, 2, 3], dtype=np.int64), np.array([1.0, 2.0, 1.0], dtype=np.float32)),
+        (np.array([2, 3], dtype=np.int64), np.array([1.0, 2.0], dtype=np.float32)),
+    ]
+
+
+@pytest.fixture
+def simple_binary_fingerprints_between_sets():
+    fingerprints_1 = np.array([
+        [1, 0, 0, 0],
+        [0, 1, 1, 0],
+    ], dtype=np.bool_)
+    fingerprints_2 = np.array([
+        [0, 1, 1, 0],
+        [1, 0, 0, 0],
+    ], dtype=np.bool_)
+    return fingerprints_1, fingerprints_2
+
+
+@pytest.fixture
+def simple_count_fingerprints_between_sets():
+    fingerprints_1 = np.array([
+        [2, 0, 0, 0],
+        [0, 1, 2, 0],
+    ], dtype=np.float32)
+    fingerprints_2 = np.array([
+        [0, 1, 2, 0],
+        [2, 0, 0, 0],
+    ], dtype=np.float32)
+    return fingerprints_1, fingerprints_2
+
+
+@pytest.fixture
+def simple_sparse_binary_fingerprints_between_sets():
+    fingerprints_1 = [
+        np.array([0], dtype=np.int64),
+        np.array([1, 2], dtype=np.int64),
+    ]
+    fingerprints_2 = [
+        np.array([1, 2], dtype=np.int64),
+        np.array([0], dtype=np.int64),
+    ]
+    return fingerprints_1, fingerprints_2
+
+
+@pytest.fixture
+def simple_sparse_count_fingerprints_between_sets():
+    fingerprints_1 = [
+        (np.array([0], dtype=np.int64), np.array([2.0], dtype=np.float32)),
+        (np.array([1, 2], dtype=np.int64), np.array([1.0, 2.0], dtype=np.float32)),
+    ]
+    fingerprints_2 = [
+        (np.array([1, 2], dtype=np.int64), np.array([1.0, 2.0], dtype=np.float32)),
+        (np.array([0], dtype=np.int64), np.array([2.0], dtype=np.float32)),
+    ]
+    return fingerprints_1, fingerprints_2
+
+
+def _check_similarity_per_bin_outputs(selected_pairs_per_bin, selected_scores_per_bin, nr_of_items, nr_of_bins, max_pairs_per_bin):
+    assert selected_pairs_per_bin.shape == (nr_of_bins, nr_of_items, max_pairs_per_bin)
+    assert selected_scores_per_bin.shape == (nr_of_bins, nr_of_items, max_pairs_per_bin)
+    assert np.all(selected_scores_per_bin[selected_pairs_per_bin == -1] == 0)
+    assert np.all(selected_scores_per_bin >= 0.0)
+    assert np.all(selected_scores_per_bin <= 1.0)
+
+
+def _check_between_sets_similarity_per_bin_outputs(
+    selected_pairs_per_bin,
+    selected_scores_per_bin,
+    nr_of_items_1,
+    nr_of_items_2,
+    nr_of_bins,
+    max_pairs_per_bin,
+):
+    total_items = nr_of_items_1 + nr_of_items_2
+    assert selected_pairs_per_bin.shape == (nr_of_bins, total_items, max_pairs_per_bin)
+    assert selected_scores_per_bin.shape == (nr_of_bins, total_items, max_pairs_per_bin)
+    assert np.all(selected_scores_per_bin[selected_pairs_per_bin == -1] == 0)
+    assert np.all(selected_scores_per_bin >= 0.0)
+    assert np.all(selected_scores_per_bin <= 1.0)
+
+
+@pytest.mark.parametrize(
+    "fingerprints_fixture,fingerprint_type",
+    [
+        ("simple_binary_fingerprints", "rdkit_binary"),
+        ("simple_count_fingerprints", "rdkit_count"),
+        ("simple_sparse_binary_fingerprints", "rdkit_binary_unfolded"),
+        ("simple_sparse_count_fingerprints", "rdkit_count_unfolded"),
+    ],
+)
+def test_compute_tanimoto_similarity_per_bin_all_supported_types(
+    request, fingerprints_fixture, fingerprint_type
+):
+    fingerprints = request.getfixturevalue(fingerprints_fixture)
+    max_pairs_per_bin = 5
+    nr_of_bins = 10
+    selection_bins = np.array(
+        [(x / nr_of_bins, x / nr_of_bins + 1 / nr_of_bins) for x in range(nr_of_bins)],
+        dtype=np.float32,
+    )
+
+    selected_pairs_per_bin, selected_scores_per_bin = compute_tanimoto_similarity_per_bin(
+        fingerprints,
+        max_pairs_per_bin=max_pairs_per_bin,
+        fingerprint_type=fingerprint_type,
+        selection_bins=selection_bins,
+        include_diagonal=True,
+    )
+
+    _check_similarity_per_bin_outputs(
+        selected_pairs_per_bin,
+        selected_scores_per_bin,
+        nr_of_items=len(fingerprints),
+        nr_of_bins=nr_of_bins,
+        max_pairs_per_bin=max_pairs_per_bin,
+    )
+
+
+@pytest.mark.parametrize(
+    "fingerprints_fixture,fingerprint_type",
+    [
+        ("simple_binary_fingerprints", "rdkit_binary"),
+        ("simple_count_fingerprints", "rdkit_count"),
+        ("simple_sparse_binary_fingerprints", "rdkit_binary_unfolded"),
+        ("simple_sparse_count_fingerprints", "rdkit_count_unfolded"),
+    ],
+)
+def test_compute_tanimoto_similarity_per_bin_exclude_diagonal_all_supported_types(
+    request, fingerprints_fixture, fingerprint_type
+):
+    fingerprints = request.getfixturevalue(fingerprints_fixture)
+    max_pairs_per_bin = 5
+    nr_of_bins = 10
+    selection_bins = np.array(
+        [(x / nr_of_bins, x / nr_of_bins + 1 / nr_of_bins) for x in range(nr_of_bins)],
+        dtype=np.float32,
+    )
+
+    selected_pairs_per_bin, _ = compute_tanimoto_similarity_per_bin(
+        fingerprints,
+        max_pairs_per_bin=max_pairs_per_bin,
+        fingerprint_type=fingerprint_type,
+        selection_bins=selection_bins,
+        include_diagonal=False,
+    )
+
+    for bin_id, pairs_matrix in enumerate(selected_pairs_per_bin):
+        for inchikey_idx, row in enumerate(pairs_matrix):
+            assert len(np.where(row == inchikey_idx)[0]) == 0, (
+                f"Diagonal pair found in bin {bin_id} for item {inchikey_idx}"
+            )
+
+
+def test_compute_tanimoto_similarity_per_bin_binary_dense_expected_pattern(simple_binary_fingerprints):
+    max_pairs_per_bin = 5
+    nr_of_bins = 10
+    selection_bins = np.array(
+        [(x / nr_of_bins, x / nr_of_bins + 1 / nr_of_bins) for x in range(nr_of_bins)],
+        dtype=np.float32,
+    )
+
+    selected_pairs_per_bin, selected_scores_per_bin = compute_tanimoto_similarity_per_bin(
+        simple_binary_fingerprints,
+        max_pairs_per_bin=max_pairs_per_bin,
+        fingerprint_type="rdkit_binary",
+        selection_bins=selection_bins,
+        include_diagonal=True,
+    )
+
+    expected_nr_of_pairs_per_bin = np.array([0, 0, 4, 4, 0, 0, 2, 0, 0, 4])
+
+    for bin_id, pairs_matrix in enumerate(selected_pairs_per_bin):
+        number_of_pairs_in_bin = len(np.where(pairs_matrix != -1)[0])
+        assert expected_nr_of_pairs_per_bin[bin_id] == number_of_pairs_in_bin
+
+        assert np.all(selected_scores_per_bin[bin_id][pairs_matrix == -1] == 0)
+        assert np.all(selected_scores_per_bin[bin_id][pairs_matrix != -1] > 0.0)
+
+        if selection_bins[bin_id][1] == 1:
+            for inchikey_idx, row in enumerate(pairs_matrix):
+                assert len(np.where(row == inchikey_idx)[0]) == 1
+                assert selected_scores_per_bin[bin_id][inchikey_idx][row == inchikey_idx] == 1.0
+        else:
+            for inchikey_idx, row in enumerate(pairs_matrix):
+                assert len(np.where(row == inchikey_idx)[0]) == 0
+
+
+def test_compute_tanimoto_similarity_per_bin_count_dense_zero_scores_for_invalid_slots(simple_count_fingerprints):
+    max_pairs_per_bin = 5
+    nr_of_bins = 10
+    selection_bins = np.array(
+        [(x / nr_of_bins, x / nr_of_bins + 1 / nr_of_bins) for x in range(nr_of_bins)],
+        dtype=np.float32,
+    )
+
+    selected_pairs_per_bin, selected_scores_per_bin = compute_tanimoto_similarity_per_bin(
+        simple_count_fingerprints,
+        max_pairs_per_bin=max_pairs_per_bin,
+        fingerprint_type="rdkit_count",
+        selection_bins=selection_bins,
+        include_diagonal=True,
+    )
+
+    assert np.all(selected_scores_per_bin[selected_pairs_per_bin == -1] == 0)
+
+
+@pytest.mark.parametrize(
+    "fingerprints_fixture,fingerprint_type",
+    [
+        ("simple_binary_fingerprints_between_sets", "rdkit_binary"),
+        ("simple_count_fingerprints_between_sets", "rdkit_count"),
+        ("simple_sparse_binary_fingerprints_between_sets", "rdkit_binary_unfolded"),
+        ("simple_sparse_count_fingerprints_between_sets", "rdkit_count_unfolded"),
+    ],
+)
+def test_compute_tanimoto_similarity_per_bin_between_sets_all_supported_types(
+    request, fingerprints_fixture, fingerprint_type
+):
+    fingerprints_1, fingerprints_2 = request.getfixturevalue(fingerprints_fixture)
+    max_pairs_per_bin = 2
+    selection_bins = np.array([(0.99, 1.0)], dtype=np.float32)
+
+    selected_pairs_per_bin, selected_scores_per_bin = compute_tanimoto_similarity_per_bin_between_sets(
+        fingerprints_1,
+        fingerprints_2,
+        max_pairs_per_bin=max_pairs_per_bin,
+        fingerprint_type=fingerprint_type,
+        selection_bins=selection_bins,
+    )
+
+    _check_between_sets_similarity_per_bin_outputs(
+        selected_pairs_per_bin,
+        selected_scores_per_bin,
+        nr_of_items_1=len(fingerprints_1),
+        nr_of_items_2=len(fingerprints_2),
+        nr_of_bins=1,
+        max_pairs_per_bin=max_pairs_per_bin,
+    )
+
+
+@pytest.mark.parametrize(
+    "fingerprints_fixture,fingerprint_type",
+    [
+        ("simple_binary_fingerprints_between_sets", "rdkit_binary"),
+        ("simple_count_fingerprints_between_sets", "rdkit_count"),
+        ("simple_sparse_binary_fingerprints_between_sets", "rdkit_binary_unfolded"),
+        ("simple_sparse_count_fingerprints_between_sets", "rdkit_count_unfolded"),
+    ],
+)
+def test_compute_tanimoto_similarity_per_bin_between_sets_uses_cross_set_similarity_for_both_directions(
+    request, fingerprints_fixture, fingerprint_type
+):
+    """
+    This catches the old bug where the second loop compared set2 against set2
+    instead of set2 against set1.
+    """
+    fingerprints_1, fingerprints_2 = request.getfixturevalue(fingerprints_fixture)
+
+    selection_bins = np.array([(0.99, 1.0)], dtype=np.float32)
+
+    selected_pairs_per_bin, selected_scores_per_bin = compute_tanimoto_similarity_per_bin_between_sets(
+        fingerprints_1,
+        fingerprints_2,
+        max_pairs_per_bin=2,
+        fingerprint_type=fingerprint_type,
+        selection_bins=selection_bins,
+    )
+
+    pairs = selected_pairs_per_bin[0]
+    scores = selected_scores_per_bin[0]
+
+    # rows 0..1 belong to set 1
+    # rows 2..3 belong to set 2
+    #
+    # fingerprints are arranged so that:
+    #     set1[0] <-> set2[1]
+    #     set1[1] <-> set2[0]
+
+    assert pairs[0, 0] == 3
+    assert scores[0, 0] == 1.0
+
+    assert pairs[1, 0] == 2
+    assert scores[1, 0] == 1.0
+
+    assert pairs[2, 0] == 1
+    assert scores[2, 0] == 1.0
+
+    assert pairs[3, 0] == 0
+    assert scores[3, 0] == 1.0
+
+    assert np.all(pairs[:, 1] == -1)
+    assert np.all(scores[:, 1] == 0.0)
+
+
+def test_compute_tanimoto_similarity_per_bin_invalid_fingerprint_type_raises(simple_binary_fingerprints):
+    with pytest.raises(ValueError, match="Unsupported fingerprint type"):
+        compute_tanimoto_similarity_per_bin(
+            simple_binary_fingerprints,
+            max_pairs_per_bin=5,
+            fingerprint_type="daylight",
+            selection_bins=np.array([(-0.01, 1.0)], dtype=np.float32),
+            include_diagonal=True,
+        )
+
+
+def test_compute_tanimoto_similarity_per_bin_between_sets_invalid_fingerprint_type_raises(
+    simple_binary_fingerprints_between_sets,
+):
+    fingerprints_1, fingerprints_2 = simple_binary_fingerprints_between_sets
+    with pytest.raises(ValueError, match="Unsupported fingerprint type"):
+        compute_tanimoto_similarity_per_bin_between_sets(
+            fingerprints_1,
+            fingerprints_2,
+            max_pairs_per_bin=5,
+            fingerprint_type="daylight",
+            selection_bins=np.array([(-0.01, 1.0)], dtype=np.float32),
+        )
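
Note on patch 03: every *_per_bin function returns two arrays of shape (num_bins, n_fingerprints, max_pairs_per_bin); a slot holds the index of a sampled partner, and -1 marks an unused slot (whose score stays 0). A hypothetical decoding helper (not part of the patch) showing how that encoding reads back into (i, j, score) triples:

    import numpy as np

    def decode_selected_pairs(selected_pairs_per_bin, selected_scores_per_bin):
        """Yield (bin_idx, i, j, score) for every filled slot."""
        num_bins, size, max_pairs = selected_pairs_per_bin.shape
        for bin_idx in range(num_bins):
            for i in range(size):
                for slot in range(max_pairs):
                    j = selected_pairs_per_bin[bin_idx, i, slot]
                    if j == -1:  # -1 means: no (further) pair found in this bin
                        continue
                    yield bin_idx, i, int(j), float(selected_scores_per_bin[bin_idx, i, slot])

For the between-sets variants, rows 0..size_1-1 describe set-1 fingerprints (their partner indices are offset by size_1 so they point into set 2), while rows from size_1 onward describe set-2 fingerprints with plain set-1 indices — exactly the convention the cross-set direction test above pins down.
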
"rdkit_count"), + ("simple_sparse_binary_fingerprints_between_sets", "rdkit_binary_unfolded"), + ("simple_sparse_count_fingerprints_between_sets", "rdkit_count_unfolded"), + ], +) +def test_compute_tanimoto_similarity_per_bin_between_sets_uses_cross_set_similarity_for_both_directions( + request, fingerprints_fixture, fingerprint_type +): + """ + This catches the old bug where the second loop compared set2 against set2 + instead of set2 against set1. + """ + fingerprints_1, fingerprints_2 = request.getfixturevalue(fingerprints_fixture) + + selection_bins = np.array([(0.99, 1.0)], dtype=np.float32) + + selected_pairs_per_bin, selected_scores_per_bin = compute_tanimoto_similarity_per_bin_between_sets( + fingerprints_1, + fingerprints_2, + max_pairs_per_bin=2, + fingerprint_type=fingerprint_type, + selection_bins=selection_bins, + ) + + pairs = selected_pairs_per_bin[0] + scores = selected_scores_per_bin[0] + + # rows 0..1 belong to set 1 + # rows 2..3 belong to set 2 + # + # fingerprints are arranged so that: + # set1[0] <-> set2[1] + # set1[1] <-> set2[0] + + assert pairs[0, 0] == 3 + assert scores[0, 0] == 1.0 + + assert pairs[1, 0] == 2 + assert scores[1, 0] == 1.0 + + assert pairs[2, 0] == 1 + assert scores[2, 0] == 1.0 + + assert pairs[3, 0] == 0 + assert scores[3, 0] == 1.0 + + assert np.all(pairs[:, 1] == -1) + assert np.all(scores[:, 1] == 0.0) + + +def test_compute_tanimoto_similarity_per_bin_invalid_fingerprint_type_raises(simple_binary_fingerprints): + with pytest.raises(ValueError, match="Unsupported fingerprint type"): + compute_tanimoto_similarity_per_bin( + simple_binary_fingerprints, + max_pairs_per_bin=5, + fingerprint_type="daylight", + selection_bins=np.array([(-0.01, 1.0)], dtype=np.float32), + include_diagonal=True, + ) + + +def test_compute_tanimoto_similarity_per_bin_between_sets_invalid_fingerprint_type_raises( + simple_binary_fingerprints_between_sets, +): + fingerprints_1, fingerprints_2 = simple_binary_fingerprints_between_sets + with pytest.raises(ValueError, match="Unsupported fingerprint type"): + compute_tanimoto_similarity_per_bin_between_sets( + fingerprints_1, + fingerprints_2, + max_pairs_per_bin=5, + fingerprint_type="daylight", + selection_bins=np.array([(-0.01, 1.0)], dtype=np.float32), + ) From d2e5d05074bd732534e209881d9a3c73f7bcd518 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Fri, 27 Mar 2026 12:39:36 +0100 Subject: [PATCH 04/12] adjust to central similarity computations and expand/adjust tests --- .../inchikey_pair_selection.py | 80 +--- tests/test_inchikey_pair_selection.py | 438 +++++++++--------- 2 files changed, 217 insertions(+), 301 deletions(-) diff --git a/ms2deepscore/train_new_model/inchikey_pair_selection.py b/ms2deepscore/train_new_model/inchikey_pair_selection.py index 99fc142d..d4625635 100644 --- a/ms2deepscore/train_new_model/inchikey_pair_selection.py +++ b/ms2deepscore/train_new_model/inchikey_pair_selection.py @@ -3,12 +3,11 @@ import heapq import numpy as np from matchms import Spectrum -from chemap.metrics import tanimoto_similarity_dense -from numba import jit, prange from tqdm import tqdm from ms2deepscore.SettingsMS2Deepscore import SettingsMS2Deepscore from ms2deepscore.train_new_model import SpectrumPairGenerator from ms2deepscore.fingerprint_utils import derive_fingerprint_from_smiles_or_inchi +from ms2deepscore.fingerprint_similarity_computations import compute_tanimoto_similarity_per_bin def create_spectrum_pair_generator( @@ -40,13 +39,13 @@ def create_spectrum_pair_generator( if len(inchikeys14_unique) < 
settings.batch_size: raise ValueError("The number of unique inchikeys must be larger than the batch size.") - available_pairs_per_bin_matrix, available_scores_per_bin_matrix = compute_jaccard_similarity_per_bin( + available_pairs_per_bin_matrix, available_scores_per_bin_matrix = compute_tanimoto_similarity_per_bin( fingerprints, settings.max_pairs_per_bin, - settings.same_prob_bins, - settings.include_diagonal - ) - + fingerprint_type=settings.fingerprint_type, + selection_bins=settings.same_prob_bins, + include_diagonal=settings.include_diagonal, + ) pair_frequency_matrixes = balanced_selection_of_pairs_per_bin( available_pairs_per_bin_matrix, settings) @@ -110,64 +109,17 @@ def compute_fingerprints_for_training( policy_invalid="keep", ) - if not isinstance(fingerprints, np.ndarray) or fingerprints.shape[0] == 0: + if len(fingerprints) == 0: raise ValueError("No fingerprints could be computed") - if len(valid_inchikeys) != fingerprints.shape[0]: + if len(valid_inchikeys) != len(fingerprints): raise ValueError( - f"Mismatch between inchikeys ({len(valid_inchikeys)}) and fingerprints ({fingerprints.shape[0]})." + f"Mismatch between inchikeys ({len(valid_inchikeys)}) and fingerprints ({len(fingerprints)})." ) return fingerprints, valid_inchikeys -@jit(nopython=True, parallel=True) -def compute_jaccard_similarity_per_bin( - fingerprints, - max_pairs_per_bin, - selection_bins=np.array([(x / 10, x / 10 + 0.1) for x in range(10)]), - include_diagonal=True) -> Tuple[np.ndarray, np.ndarray]: - """Randomly selects compound pairs per tanimoto bin, up to max_pairs_per_bin - - returns: - 2 3d numpy arrays are returned, the first encodes the pairs per bin and the second the corresponding scores. - A 3D numpy array with shape [nr_of_bins, nr_of_fingerprints, max_pairs_per_bin]. - An example structure for bin 1, with 3 fingerprints and max_pairs_per_bin =4 would be: - [[1,2,-1,-1], - [0,3,-1,-1], - [0,2,-1,-1],] - The pairs are encoded by the index and the value. - So the first row encodes pairs between fingerpint 0 and 1, fingerprint 0 and 2. - The -1 encode that no more pairs were found for this fingerprint in this bin. - """ - - size = fingerprints.shape[0] - num_bins = len(selection_bins) - - selected_pairs_per_bin = -1 * np.ones((num_bins, size, max_pairs_per_bin), dtype=np.int32) - selected_scores_per_bin = np.zeros((num_bins, size, max_pairs_per_bin), dtype=np.float32) - - for idx_fingerprint_i in prange(size): - fingerprint_i = fingerprints[idx_fingerprint_i, :] - tanimoto_scores = tanimoto_scores_row(fingerprint_i, fingerprints) - - for bin_number in range(num_bins): - selection_bin = selection_bins[bin_number] - indices = np.nonzero((tanimoto_scores > selection_bin[0]) & (tanimoto_scores <= selection_bin[1]))[0] - - if not include_diagonal and idx_fingerprint_i in indices: - indices = indices[indices != idx_fingerprint_i] - - np.random.shuffle(indices) - indices = indices[:max_pairs_per_bin] - num_indices = len(indices) - - selected_pairs_per_bin[bin_number, idx_fingerprint_i, :num_indices] = indices - selected_scores_per_bin[bin_number, idx_fingerprint_i, :num_indices] = tanimoto_scores[indices] - - return selected_pairs_per_bin, selected_scores_per_bin - - def determine_nr_of_pairs_per_bin(settings, nr_of_inchikeys): """Calculate the target number of pairs per bin based on nr of unique inchikeys and given settings. @@ -304,7 +256,7 @@ def select_balanced_pairs(available_pairs_for_bin_matrix: np.ndarray, max_resampling: The maximum number of times a pair can be resampled. 
Resampling means that the exact same inchikey pair is added multiple times to the list of pairs. - required_number_of_pairs: + max_inchikey_count: The number of pairs to sample. Returns @@ -420,18 +372,6 @@ def get_nr_of_available_pairs_in_bin(selected_pairs_per_bin_matrix: np.ndarray) return nr_of_unique_pairs_per_bin -@jit(nopython=True) -def tanimoto_scores_row(single_fingerprint, list_of_fingerprints): - size = list_of_fingerprints.shape[0] - tanimoto_scores = np.zeros(size) - - for idx_fingerprint_j in range(size): - fingerprint_j = list_of_fingerprints[idx_fingerprint_j, :] - tanimoto_score = tanimoto_similarity_dense(single_fingerprint, fingerprint_j) - tanimoto_scores[idx_fingerprint_j] = tanimoto_score - return tanimoto_scores - - def select_inchi_for_unique_inchikeys( list_of_spectra: List['Spectrum'] ) -> Tuple[List['Spectrum'], List[str]]: diff --git a/tests/test_inchikey_pair_selection.py b/tests/test_inchikey_pair_selection.py index a9f367c6..9daf8f04 100644 --- a/tests/test_inchikey_pair_selection.py +++ b/tests/test_inchikey_pair_selection.py @@ -4,151 +4,109 @@ import pytest from matchms import Spectrum - from ms2deepscore import SettingsMS2Deepscore from ms2deepscore.train_new_model.inchikey_pair_selection import ( - compute_jaccard_similarity_per_bin, select_inchi_for_unique_inchikeys, - create_spectrum_pair_generator, compute_fingerprints_for_training) + select_inchi_for_unique_inchikeys, + create_spectrum_pair_generator, + compute_fingerprints_for_training, +) from ms2deepscore.train_new_model import SpectrumPairGenerator from tests.create_test_spectra import create_test_spectra -@pytest.fixture -def simple_fingerprints(): - return np.array([ - [1, 1, 0, 0], - [1, 0, 1, 0], - [0, 1, 1, 1], - [0, 0, 1, 1], - ], dtype=bool) - - -@pytest.fixture -def fingerprints(): - return np.array([ - [1, 1, 0, 0, 1, 1], - [1, 0, 1, 0, 1, 1], - [0, 1, 1, 1, 1, 1], - [0, 0, 1, 1, 1, 1], - [1, 1, 0, 0, 0, 1], - [1, 0, 1, 0, 0, 1], - [0, 1, 1, 1, 1, 0], - [0, 0, 1, 1, 1, 0], - ], dtype=bool) +def _make_training_settings( + fingerprint_type="rdkit_binary", + bins=None, + batch_size=8, + average_inchikey_sampling_count=10, + max_pair_resampling=10, + max_inchikey_sampling=280, +): + if bins is None: + bins = [ + (0.8, 0.9), (0.7, 0.8), (0.9, 1.0), (0.6, 0.7), + (0.5, 0.6), (0.4, 0.5), (0.3, 0.4), (-0.01, 0.3) + ] + return SettingsMS2Deepscore( + same_prob_bins=np.array(bins, dtype="float32"), + average_inchikey_sampling_count=average_inchikey_sampling_count, + batch_size=batch_size, + max_pair_resampling=max_pair_resampling, + max_inchikey_sampling=max_inchikey_sampling, + fingerprint_type=fingerprint_type, + fingerprint_nbits=256, + ) @pytest.fixture def test_spectra(): - metadata = {"precursor_mz": 101.1, - "inchikey": "ABCABCABCABCAB-nonsense", - "inchi": "InChI=1/C6H8O6/c7-1-2(8)5-3(9)4(10)6(11)12-5/h2,5,7-10H,1H2/t2-,5+/m0/s1"} - spectrum_1 = Spectrum(mz=np.array([100.]), - intensities=np.array([0.7]), - metadata=metadata) - spectrum_2 = Spectrum(mz=np.array([90.]), - intensities=np.array([0.4]), - metadata=metadata) - spectrum_3 = Spectrum(mz=np.array([90.]), - intensities=np.array([0.4]), - metadata=metadata) - spectrum_4 = Spectrum(mz=np.array([90.]), - intensities=np.array([0.4]), - metadata={"inchikey": 14 * "X", - "inchi": "InChI=1S/C8H10N4O2/c1-10-4-9-6-5(10)7(13)12(3)8(14)11(6)2/h4H,1-3H3"}) + metadata = { + "precursor_mz": 101.1, + "inchikey": "ABCABCABCABCAB-nonsense", + "inchi": "InChI=1/C6H8O6/c7-1-2(8)5-3(9)4(10)6(11)12-5/h2,5,7-10H,1H2/t2-,5+/m0/s1" + } + spectrum_1 = 
Spectrum( + mz=np.array([100.]), + intensities=np.array([0.7]), + metadata=metadata + ) + spectrum_2 = Spectrum( + mz=np.array([90.]), + intensities=np.array([0.4]), + metadata=metadata + ) + spectrum_3 = Spectrum( + mz=np.array([90.]), + intensities=np.array([0.4]), + metadata=metadata + ) + spectrum_4 = Spectrum( + mz=np.array([90.]), + intensities=np.array([0.4]), + metadata={ + "inchikey": 14 * "X", + "inchi": "InChI=1S/C8H10N4O2/c1-10-4-9-6-5(10)7(13)12(3)8(14)11(6)2/h4H,1-3H3" + } + ) return [spectrum_1, spectrum_2, spectrum_3, spectrum_4] -def test_compute_jaccard_similarity_per_bin(simple_fingerprints): - max_pairs_per_bin = 5 - nr_of_bins = 10 - selection_bins = np.array([(x / nr_of_bins, x / nr_of_bins + 1/ nr_of_bins) for x in range(nr_of_bins)]) - selected_pairs_per_bin_numba, selected_scores_per_bin_numba = compute_jaccard_similarity_per_bin( - simple_fingerprints, max_pairs_per_bin=max_pairs_per_bin, - selection_bins=selection_bins) - - # Uncompiled - selected_pairs_per_bin_py, selected_scores_per_bin_py = compute_jaccard_similarity_per_bin.py_func( - simple_fingerprints, max_pairs_per_bin=max_pairs_per_bin, - selection_bins=selection_bins) - - def check_correct_matrixes(selected_pairs_per_bin, selected_scores_per_bin): - assert selected_pairs_per_bin.shape == (nr_of_bins, len(simple_fingerprints), max_pairs_per_bin) - expected_nr_of_pairs_per_bin = np.array([0, 0, 4, 4, 0, 0, 2, 0, 0, 4]) - - for bin_id, pairs_matrix in enumerate(selected_pairs_per_bin): - - number_of_pairs_in_bin = len(np.where(pairs_matrix != -1)[0]) - assert expected_nr_of_pairs_per_bin[bin_id] == number_of_pairs_in_bin - - assert np.all(selected_scores_per_bin[bin_id][pairs_matrix == -1] == 0), \ - "If no pair is available the score should be 0" - assert np.all(selected_scores_per_bin[bin_id][pairs_matrix != -1] > 0.0), \ - "If a pair is found the score should not be 0 (in principle it could be, but not the case for these fingerprints)" - - if selection_bins[bin_id][1] == 1: - for inchikey_idx, row in enumerate(pairs_matrix): - assert len(np.where(row == inchikey_idx)[0]) == 1, \ - "When select_diagonal is True there should be a pair with itself in the bin including 1.0" - assert selected_scores_per_bin[bin_id][inchikey_idx][row == inchikey_idx] == 1.0 - else: - for inchikey_idx, row in enumerate(pairs_matrix): - assert len(np.where(row == inchikey_idx)[0]) == 0, \ - "The bins not including 1.0, should not have pairs between the same inchikey" - - check_correct_matrixes(selected_pairs_per_bin_numba, selected_scores_per_bin_numba) - check_correct_matrixes(selected_pairs_per_bin_py, selected_scores_per_bin_py) - - -def test_compute_jaccard_similarity_per_bin_exclude_diagonal(simple_fingerprints): - max_pairs_per_bin = 5 - nr_of_bins = 10 - selection_bins = np.array([(x / nr_of_bins, x / nr_of_bins + 1 / nr_of_bins) for x in range(nr_of_bins)]) - selected_pairs_per_bin_numba, selected_scores_per_bin_numba = compute_jaccard_similarity_per_bin( - simple_fingerprints, max_pairs_per_bin=max_pairs_per_bin, - selection_bins=selection_bins, include_diagonal=False) - - # Uncompiled - selected_pairs_per_bin_py, selected_scores_per_bin_py = compute_jaccard_similarity_per_bin.py_func( - simple_fingerprints, max_pairs_per_bin=max_pairs_per_bin, - selection_bins=selection_bins, include_diagonal=False) - - def check_correct_matrixes(selected_pairs_per_bin): - assert selected_pairs_per_bin.shape == (nr_of_bins, len(simple_fingerprints), max_pairs_per_bin) - for bin_id, pairs_matrix in enumerate(selected_pairs_per_bin): - 
for inchikey_idx, row in enumerate(pairs_matrix): - assert len(np.where(row == inchikey_idx)[0]) == 0, \ - "When include_diagonal is False there should not have pairs between the same inchikey" - - check_correct_matrixes(selected_pairs_per_bin_numba) - check_correct_matrixes(selected_pairs_per_bin_py) - - def test_select_inchi_for_unique_inchikeys(test_spectra): test_spectra[2].set("inchikey", "ABCABCABCABCAB-nonsense2") test_spectra[3].set("inchikey", "ABCABCABCABCAB-nonsense3") - (spectrums_selected, inchikey14s) = select_inchi_for_unique_inchikeys(test_spectra) - assert inchikey14s == ['ABCABCABCABCAB'] + spectrums_selected, inchikey14s = select_inchi_for_unique_inchikeys(test_spectra) + assert inchikey14s == ["ABCABCABCABCAB"] assert spectrums_selected[0].get("inchi").startswith("InChI=1/C6H8O6/") def test_select_inchi_for_unique_inchikeys_two_inchikeys(test_spectra): - # Test for two different inchikeys - (spectrums_selected, inchikey14s) = select_inchi_for_unique_inchikeys(test_spectra) - assert inchikey14s == ['ABCABCABCABCAB', 'XXXXXXXXXXXXXX'] - assert [s.get("inchi")[:15] for s in spectrums_selected] == ['InChI=1/C6H8O6/', 'InChI=1S/C8H10N'] + spectrums_selected, inchikey14s = select_inchi_for_unique_inchikeys(test_spectra) + assert inchikey14s == ["ABCABCABCABCAB", "XXXXXXXXXXXXXX"] + assert [s.get("inchi")[:15] for s in spectrums_selected] == [ + "InChI=1/C6H8O6/", + "InChI=1S/C8H10N", + ] def test_SelectedInchikeyPairs_generator_with_shuffle(): - dummy_inchikey_pair_generator = SpectrumPairGenerator( [ - ("Inchikey0", "Inchikey1", 0.8), ("Inchikey0", "Inchikey2", 0.6), - ("Inchikey2", "Inchikey1", 0.3), ("Inchikey2", "Inchikey2", 1.0)], [ - Spectrum(mz=np.array([90.]), intensities=np.array([0.4]), metadata={"inchikey": "Inchikey0"}), - Spectrum(mz=np.array([90.]), intensities=np.array([0.4]), metadata={"inchikey": "Inchikey1"}), - Spectrum(mz=np.array([90.]), intensities=np.array([0.4]), metadata={"inchikey": "Inchikey2"}),], - True, 0) + dummy_inchikey_pair_generator = SpectrumPairGenerator( + [ + ("Inchikey0", "Inchikey1", 0.8), + ("Inchikey0", "Inchikey2", 0.6), + ("Inchikey2", "Inchikey1", 0.3), + ("Inchikey2", "Inchikey2", 1.0), + ], + [ + Spectrum(mz=np.array([90.]), intensities=np.array([0.4]), metadata={"inchikey": "Inchikey0"}), + Spectrum(mz=np.array([90.]), intensities=np.array([0.4]), metadata={"inchikey": "Inchikey1"}), + Spectrum(mz=np.array([90.]), intensities=np.array([0.4]), metadata={"inchikey": "Inchikey2"}), + ], + True, + 0, + ) + found_pairs = [] - # do one complete loop - for i in range(len(dummy_inchikey_pair_generator)): + for _ in range(len(dummy_inchikey_pair_generator)): spectrum_1, spectrum_2, score = next(dummy_inchikey_pair_generator) found_pairs.append((spectrum_1.get("inchikey"), spectrum_2.get("inchikey"), score)) @@ -156,8 +114,7 @@ def test_SelectedInchikeyPairs_generator_with_shuffle(): assert sorted(found_pairs) == sorted(dummy_inchikey_pair_generator.selected_inchikey_pairs) found_pairs = [] - # do one complete loop - for i in range(len(dummy_inchikey_pair_generator)): + for _ in range(len(dummy_inchikey_pair_generator)): spectrum_1, spectrum_2, score = next(dummy_inchikey_pair_generator) found_pairs.append((spectrum_1.get("inchikey"), spectrum_2.get("inchikey"), score)) @@ -166,135 +123,154 @@ def test_SelectedInchikeyPairs_generator_with_shuffle(): def test_SelectedInchikeyPairs_generator_without_shuffle(): - dummy_inchikey_pair_generator = SpectrumPairGenerator( [ - ("Inchikey0", "Inchikey1", 0.8), ("Inchikey0", "Inchikey2", 0.6), - 
("Inchikey2", "Inchikey1", 0.3), ("Inchikey2", "Inchikey2", 1.0)], [ - Spectrum(mz=np.array([90.]), intensities=np.array([0.4]), metadata={"inchikey": "Inchikey0"}), - Spectrum(mz=np.array([90.]), intensities=np.array([0.4]), metadata={"inchikey": "Inchikey1"}), - Spectrum(mz=np.array([90.]), intensities=np.array([0.4]), metadata={"inchikey": "Inchikey2"}),], - True, 0) - - for _, expected_pair in enumerate(dummy_inchikey_pair_generator.selected_inchikey_pairs): + dummy_inchikey_pair_generator = SpectrumPairGenerator( + [ + ("Inchikey0", "Inchikey1", 0.8), + ("Inchikey0", "Inchikey2", 0.6), + ("Inchikey2", "Inchikey1", 0.3), + ("Inchikey2", "Inchikey2", 1.0), + ], + [ + Spectrum(mz=np.array([90.]), intensities=np.array([0.4]), metadata={"inchikey": "Inchikey0"}), + Spectrum(mz=np.array([90.]), intensities=np.array([0.4]), metadata={"inchikey": "Inchikey1"}), + Spectrum(mz=np.array([90.]), intensities=np.array([0.4]), metadata={"inchikey": "Inchikey2"}), + ], + True, + 0, + ) + + for expected_pair in dummy_inchikey_pair_generator.selected_inchikey_pairs: spectrum_1, spectrum_2, score = next(dummy_inchikey_pair_generator) assert expected_pair == (spectrum_1.get("inchikey"), spectrum_2.get("inchikey"), score) -def test_select_compound_pairs_wrapper_no_resampling(): +def check_correct_oversampling(selected_inchikey_pairs: SpectrumPairGenerator, max_resampling: int): + pair_counts = Counter(selected_inchikey_pairs.selected_inchikey_pairs) + for count in pair_counts.values(): + assert count <= max_resampling, "the resampling was done too frequently" + + +def check_balanced_scores_selecting_inchikey_pairs(selected_inchikey_pairs: SpectrumPairGenerator, score_bins): + scores = selected_inchikey_pairs.get_scores() + score_bins = np.array(score_bins, dtype="float32") + score_bin_counts = {tuple(score_bin): 0 for score_bin in score_bins} + for score in scores: + for min_bound, max_bound in score_bins: + if score > min_bound and score <= max_bound: + score_bin_counts[(min_bound, max_bound)] += 1 + assert len(set(score_bin_counts.values())) == 1 + + +@pytest.mark.parametrize("fingerprint_type", ["rdkit_binary", "rdkit_count"]) +def test_compute_fingerprints_for_training_dense_types(fingerprint_type): + spectrums = create_test_spectra(num_of_unique_inchikeys=10, num_of_spectra_per_inchikey=2) + + fingerprints, inchikeys14_unique = compute_fingerprints_for_training( + spectrums, + fingerprint_type=fingerprint_type, + nbits=256, + ) + + assert isinstance(inchikeys14_unique, list) + assert len(inchikeys14_unique) == 10 + assert len(fingerprints) == 10 + assert fingerprints.shape[0] == 10 + assert np.all(np.sum(fingerprints, axis=1) > 0) + + +@pytest.mark.parametrize("fingerprint_type", ["rdkit_binary_unfolded", "rdkit_count_unfolded"]) +def test_compute_fingerprints_for_training_unfolded_types_return_expected_length(fingerprint_type): + spectrums = create_test_spectra(num_of_unique_inchikeys=10, num_of_spectra_per_inchikey=2) + + fingerprints, inchikeys14_unique = compute_fingerprints_for_training( + spectrums, + fingerprint_type=fingerprint_type, + nbits=256, + ) + + assert isinstance(inchikeys14_unique, list) + assert len(inchikeys14_unique) == 10 + assert len(fingerprints) == 10 + + +@pytest.mark.parametrize( + "fingerprint_type,bins,max_pair_resampling", + [ + ("rdkit_binary", [(0.5, 0.75), (0.25, 0.5), (0.75, 1), (-0.000001, 0.25)], 1), + ("rdkit_count", [(0.5, 0.75), (0.25, 0.5), (0.75, 1), (-0.000001, 0.25)], 1), + ("rdkit_binary_unfolded", [(-0.01, 1.0)], 1), + ("rdkit_count_unfolded", [(-0.01, 
1.0)], 1), + ], +) +def test_select_compound_pairs_wrapper_no_resampling_supported_types( + fingerprint_type, bins, max_pair_resampling +): spectrums = create_test_spectra(num_of_unique_inchikeys=26, num_of_spectra_per_inchikey=2) - bins = [(0.5, 0.75), (0.25, 0.5), (0.75, 1), (-0.000001, 0.25)] - max_pair_resampling = 1 - settings = SettingsMS2Deepscore(same_prob_bins=np.array(bins), - average_inchikey_sampling_count=10, - batch_size=8, - max_pair_resampling=max_pair_resampling) + settings = SettingsMS2Deepscore( + same_prob_bins=np.array(bins, dtype="float32"), + average_inchikey_sampling_count=10, + batch_size=8, + max_pair_resampling=max_pair_resampling, + fingerprint_type=fingerprint_type, + ) inchikey_pair_generator = create_spectrum_pair_generator(spectrums, settings) check_balanced_scores_selecting_inchikey_pairs(inchikey_pair_generator, bins) check_correct_oversampling(inchikey_pair_generator, max_pair_resampling) - # Currently doesn't check anything, but prints badly distributed pairs and the available pairs. It is hard to write - # a good test, since the balancing behaviour we would like to see only happens when you have a lot more pairs - # (and inchikeys) which is not suitable for a test. - print_balanced_bins_per_inchikey(inchikey_pair_generator, settings, spectrums) - -def test_select_compound_pairs_wrapper_with_resampling(): +@pytest.mark.parametrize( + "fingerprint_type,bins,max_pair_resampling", + [ + ("rdkit_binary", [(0.8, 0.9), (0.7, 0.8), (0.9, 1.0), (0.6, 0.7), (0.5, 0.6), + (0.4, 0.5), (0.3, 0.4), (-0.01, 0.3)], 10), + ("rdkit_count", [(0.8, 0.9), (0.7, 0.8), (0.9, 1.0), (0.6, 0.7), (0.5, 0.6), + (0.4, 0.5), (0.3, 0.4), (-0.01, 0.3)], 10), + ("rdkit_binary_unfolded", [(-0.01, 1.0)], 10), + ("rdkit_count_unfolded", [(-0.01, 1.0)], 10), + ], +) +def test_select_compound_pairs_wrapper_with_resampling_supported_types( + fingerprint_type, bins, max_pair_resampling +): spectrums = create_test_spectra(num_of_unique_inchikeys=26, num_of_spectra_per_inchikey=1) - bins = [(0.8, 0.9), (0.7, 0.8), (0.9, 1.0), (0.6, 0.7), (0.5, 0.6), - (0.4, 0.5), (0.3, 0.4), (-0.01, 0.3)] - max_pair_resampling = 10 - settings = SettingsMS2Deepscore(same_prob_bins=np.array(bins, dtype="float32"), - average_inchikey_sampling_count=10, - batch_size=8, - max_pair_resampling=max_pair_resampling) + settings = SettingsMS2Deepscore( + same_prob_bins=np.array(bins, dtype="float32"), + average_inchikey_sampling_count=10, + batch_size=8, + max_pair_resampling=max_pair_resampling, + fingerprint_type=fingerprint_type, + ) inchikey_pair_generator = create_spectrum_pair_generator(spectrums, settings) check_balanced_scores_selecting_inchikey_pairs(inchikey_pair_generator, bins) check_correct_oversampling(inchikey_pair_generator, max_pair_resampling) - # Currently doesn't check anything, but prints badly distributed pairs and the available pairs. It is hard to write - # a good test, since the balancing behaviour we would like to see only happens when you have a lot more pairs - # (and inchikeys) which is not suitable for a test. 
- print_balanced_bins_per_inchikey(inchikey_pair_generator, settings, spectrums) -def test_select_compound_pairs_wrapper_maximum_inchikey_count(): +@pytest.mark.parametrize( + "fingerprint_type,bins", + [ + ("rdkit_binary", [(0.8, 0.9), (0.7, 0.8), (0.9, 1.0), (0.6, 0.7), (0.5, 0.6), + (0.4, 0.5), (0.3, 0.4), (-0.01, 0.3)]), + ("rdkit_count", [(0.8, 0.9), (0.7, 0.8), (0.9, 1.0), (0.6, 0.7), (0.5, 0.6), + (0.4, 0.5), (0.3, 0.4), (-0.01, 0.3)]), + ("rdkit_binary_unfolded", [(-0.01, 1.0)]), + ("rdkit_count_unfolded", [(-0.01, 1.0)]), + ], +) +def test_select_compound_pairs_wrapper_maximum_inchikey_count_supported_types(fingerprint_type, bins): spectrums = create_test_spectra(num_of_unique_inchikeys=26, num_of_spectra_per_inchikey=1) - bins = [(0.8, 0.9), (0.7, 0.8), (0.9, 1.0), (0.6, 0.7), (0.5, 0.6), - (0.4, 0.5), (0.3, 0.4), (-0.01, 0.3)] max_pair_resampling = 1000 max_inchikey_sampling = 280 - settings = SettingsMS2Deepscore(same_prob_bins=np.array(bins, dtype="float32"), - average_inchikey_sampling_count=200, - batch_size=8, - max_pair_resampling=max_pair_resampling, - max_inchikey_sampling=max_inchikey_sampling - ) + settings = SettingsMS2Deepscore( + same_prob_bins=np.array(bins, dtype="float32"), + average_inchikey_sampling_count=200, + batch_size=8, + max_pair_resampling=max_pair_resampling, + max_inchikey_sampling=max_inchikey_sampling, + fingerprint_type=fingerprint_type, + ) inchikey_pair_generator = create_spectrum_pair_generator(spectrums, settings) highest_inchikey_count = max(inchikey_pair_generator.get_inchikey_counts().values()) - assert highest_inchikey_count <= max_inchikey_sampling + 1 # +1 because there is a chance that the last added inchikey is a pair to itself... - - -def check_correct_oversampling(selected_inchikey_pairs: SpectrumPairGenerator, max_resampling: int): - pair_counts = Counter(selected_inchikey_pairs.selected_inchikey_pairs) - for count in pair_counts.values(): - assert count <= max_resampling, "the resampling was done too frequently" - - -def get_available_score_distribution(settings, spectra): - """Gets the score distribution for the available pairs (before doing balanced selection)""" - fingerprints, inchikeys14_unique = compute_fingerprints_for_training(spectra, settings.fingerprint_type, - settings.fingerprint_nbits) - - available_pairs_per_bin_matrix, available_scores_per_bin_matrix = compute_jaccard_similarity_per_bin( - fingerprints, - settings.max_pairs_per_bin, - settings.same_prob_bins, - settings.include_diagonal) - - score_distribution_per_inchikey = {inchikey: [0]*len(settings.same_prob_bins) for inchikey in inchikeys14_unique} - for bin_id, available_pairs in enumerate(available_pairs_per_bin_matrix): - for inchikey_1_idx, row_of_pairs in enumerate(available_pairs): - inchikey_1 = inchikeys14_unique[inchikey_1_idx] - score_distribution_per_inchikey[inchikey_1][bin_id] = len(np.where(row_of_pairs != -1)[0]) - return score_distribution_per_inchikey - - -def print_balanced_bins_per_inchikey(selected_inchikey_pairs: SpectrumPairGenerator, settings, spectra): - """Prints the available distribution and the balanced distribution - - Currently doesn't do any checks, because it is hard to check if the wanted behaviour is achieved, - since it is different for small test sets compared to large test sets.""" - score_distribution_per_inchikey = {inchikey: [0]*len(settings.same_prob_bins) for inchikey in selected_inchikey_pairs.get_inchikey_counts().keys()} - for inchikey_1, inchikey_2, score in selected_inchikey_pairs.selected_inchikey_pairs: - for i, 
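
Note on patch 04: pair selection now goes through the shared compute_tanimoto_similarity_per_bin, with the fingerprint layout chosen via settings.fingerprint_type, and the tests exercise all four supported layouts. The same_prob_bins used throughout are half-open (low, high] intervals; an illustrative check (not from the patch) of the properties the selection relies on:

    import numpy as np

    bins = np.array([(0.8, 0.9), (0.7, 0.8), (0.9, 1.0), (0.6, 0.7), (0.5, 0.6),
                     (0.4, 0.5), (0.3, 0.4), (-0.01, 0.3)], dtype="float32")

    ordered = bins[np.argsort(bins[:, 0])]
    assert ordered[0, 0] < 0.0                           # exact-zero scores land in the lowest bin
    assert ordered[-1, 1] == 1.0                         # identical fingerprints land in the top bin
    assert np.allclose(ordered[1:, 0], ordered[:-1, 1])  # bins tile (low, high] without gaps

Building same_prob_bins with dtype="float32" also matters: the Tanimoto scores themselves are float32, and (as the old helper's comment noted) comparing them against float64 bin edges can push borderline scores into the wrong bin.
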
score_bin in enumerate(settings.same_prob_bins):
-            if score > score_bin[0] and score <= score_bin[1]:
-                score_distribution_per_inchikey[inchikey_1][i] += 1
-                score_distribution_per_inchikey[inchikey_2][i] += 1
-    available_score_distribution = get_available_score_distribution(settings, spectra)
-
-    for inchikey in score_distribution_per_inchikey.keys():
-        balanced_distribution = score_distribution_per_inchikey[inchikey]
-        average_balanced_distribution = sum(balanced_distribution)/len(balanced_distribution)
-        if min(balanced_distribution)*2 < average_balanced_distribution:
-            available_distribution = available_score_distribution[inchikey]
-            _ = balanced_distribution.index(min(balanced_distribution))
-            print(available_distribution, balanced_distribution)
-            # assert minimum_available_distribution*settings.max_pair_resampling == min(balanced_distribution)
-
-
-def check_balanced_scores_selecting_inchikey_pairs(selected_inchikey_pairs: SpectrumPairGenerator,
-                                                   score_bins):
-    """Test if SpectrumPairGenerator has an equal inchikey distribution
-    """
-    scores = selected_inchikey_pairs.get_scores()
-    # converting to float32 is required, since the scores are float32, otherwise equal numbers are seen as not equal
-    # and put in the wrong bin.
-    score_bins = np.array(score_bins, dtype="float32")
-    score_bin_counts = {tuple(score_bin): 0 for score_bin in score_bins}
-    for score in scores:
-        for min_bound, max_bound in score_bins:
-            if score > min_bound and score <= max_bound:
-                score_bin_counts[(min_bound, max_bound)] += 1
-    # Check that the number of pairs per bin is equal for all bins
-    assert len(set(score_bin_counts.values())) == 1
+    assert highest_inchikey_count <= max_inchikey_sampling + 1

From 538bde2720c7239b8e80f9a3bc46a9d88a5f2656 Mon Sep 17 00:00:00 2001
From: Florian Huber
Date: Fri, 27 Mar 2026 12:41:51 +0100
Subject: [PATCH 05/12] adjust to new code location

---
 .../calculate_scores_for_validation.py       | 25 +++++--------------
 1 file changed, 6 insertions(+), 19 deletions(-)

diff --git a/ms2deepscore/validation_loss_calculation/calculate_scores_for_validation.py b/ms2deepscore/validation_loss_calculation/calculate_scores_for_validation.py
index f90de348..45092c37 100644
--- a/ms2deepscore/validation_loss_calculation/calculate_scores_for_validation.py
+++ b/ms2deepscore/validation_loss_calculation/calculate_scores_for_validation.py
@@ -2,16 +2,11 @@
 import pandas as pd
 from matchms import Spectrum
 
-from chemap.metrics import (
-    tanimoto_similarity_matrix_dense,
-    tanimoto_similarity_matrix_sparse_binary,
-    tanimoto_similarity_matrix_sparse
-)
-
 from ms2deepscore.train_new_model.inchikey_pair_selection import select_inchi_for_unique_inchikeys
 from ms2deepscore.vector_operations import cosine_similarity_matrix
 from ms2deepscore.fingerprint_utils import derive_fingerprint_from_smiles, matchms_spectrum_to_smiles
+from ms2deepscore.fingerprint_similarity_computations import compute_fingerprint_similarity_matrix
 
 
 def create_embedding_matrix_symmetric(model, spectra) -> pd.DataFrame:
@@ -113,18 +108,10 @@ def calculate_tanimoto_scores_unique_inchikey(
     )
 
     print("Calculating tanimoto scores")
-    if "unfolded" in fingerprint_type:
-        if "count" in fingerprint_type:
-            tanimoto_scores = tanimoto_similarity_matrix_sparse(
-                [x[0] for x in fingerprints_1],
-                [x[1] for x in fingerprints_1],
-                [x[0] for x in fingerprints_2],
-                [x[1] for x in fingerprints_2],
-            )
-        else:
-            tanimoto_scores = tanimoto_similarity_matrix_sparse_binary(fingerprints_1, fingerprints_2)
-
-    else:
-        tanimoto_scores = 
tanimoto_similarity_matrix_dense(fingerprints_1, fingerprints_2) + tanimoto_scores = compute_fingerprint_similarity_matrix( + fingerprints_1, + fingerprints_2, + fingerprint_type=fingerprint_type, + ) tanimoto_df = pd.DataFrame(tanimoto_scores, index=unique_inchikeys_1, columns=unique_inchikeys_2) return tanimoto_df From d7bf4857fd1dc0fcfebb8eb79038d23f532795a3 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Fri, 27 Mar 2026 12:42:11 +0100 Subject: [PATCH 06/12] refactor code and fix small sampling bug --- .../inchikey_pair_selection_cross_ionmode.py | 179 +++++------ ...t_inchikey_pair_selection_cross_ionmode.py | 299 ++++++++++++++++++ 2 files changed, 375 insertions(+), 103 deletions(-) create mode 100644 tests/test_inchikey_pair_selection_cross_ionmode.py diff --git a/ms2deepscore/train_new_model/inchikey_pair_selection_cross_ionmode.py b/ms2deepscore/train_new_model/inchikey_pair_selection_cross_ionmode.py index d12ada34..aada66b8 100644 --- a/ms2deepscore/train_new_model/inchikey_pair_selection_cross_ionmode.py +++ b/ms2deepscore/train_new_model/inchikey_pair_selection_cross_ionmode.py @@ -1,28 +1,44 @@ import json from typing import List, Tuple from collections import Counter + import numpy as np from matchms import Spectrum -from numba import jit, prange + from ms2deepscore.SettingsMS2Deepscore import SettingsMS2Deepscore from ms2deepscore.train_new_model.TrainingBatchGenerator import TrainingBatchGenerator from ms2deepscore.train_new_model.SpectrumPairGenerator import SpectrumPairGenerator -from ms2deepscore.train_new_model.inchikey_pair_selection import compute_fingerprints_for_training, \ - balanced_selection_of_pairs_per_bin, convert_to_selected_pairs_list, tanimoto_scores_row, \ - create_spectrum_pair_generator +from ms2deepscore.train_new_model.inchikey_pair_selection import ( + compute_fingerprints_for_training, + balanced_selection_of_pairs_per_bin, + convert_to_selected_pairs_list, + create_spectrum_pair_generator, +) +from ms2deepscore.fingerprint_similarity_computations import ( + compute_tanimoto_similarity_per_bin_between_sets, +) from ms2deepscore.utils import split_by_ionmode -def create_data_generator_across_ionmodes(training_spectra, - settings: SettingsMS2Deepscore) -> TrainingBatchGenerator: + +def create_data_generator_across_ionmodes( + training_spectra, + settings: SettingsMS2Deepscore, +) -> TrainingBatchGenerator: pos_spectra, neg_spectra = split_by_ionmode(training_spectra) pos_spectrum_pair_generator = create_spectrum_pair_generator(pos_spectra, settings=settings) neg_spectrum_pair_generator = create_spectrum_pair_generator(neg_spectra, settings=settings) - pos_neg_spectrum_pair_generator = select_compound_pairs_wrapper_across_ionmode(pos_spectra, neg_spectra, settings) + pos_neg_spectrum_pair_generator = select_compound_pairs_wrapper_across_ionmode( + pos_spectra, neg_spectra, settings + ) - spectrum_pair_generator = CombinedSpectrumGenerator([pos_spectrum_pair_generator, neg_spectrum_pair_generator, pos_neg_spectrum_pair_generator]) + spectrum_pair_generator = CombinedSpectrumGenerator( + [pos_spectrum_pair_generator, neg_spectrum_pair_generator, pos_neg_spectrum_pair_generator] + ) - train_generator = TrainingBatchGenerator(spectrum_pair_generator=spectrum_pair_generator, settings=settings) + train_generator = TrainingBatchGenerator( + spectrum_pair_generator=spectrum_pair_generator, settings=settings + ) return train_generator @@ -31,8 +47,10 @@ def select_compound_pairs_wrapper_across_ionmode( spectra_2: List[Spectrum], settings: 
SettingsMS2Deepscore, ) -> "SpectrumPairGeneratorAcrossIonmodes": - """Returns a SpectrumPairGenerator object containing equally balanced pairs over the different bins + """Returns a SpectrumPairGeneratorAcrossIonmodes object containing equally balanced cross-ionmode pairs. + Parameters + ---------- spectra: A list of spectra settings: @@ -50,108 +68,62 @@ def select_compound_pairs_wrapper_across_ionmode( fingerprints_1, inchikeys14_unique_1 = compute_fingerprints_for_training( spectra_1, settings.fingerprint_type, - settings.fingerprint_nbits - ) + settings.fingerprint_nbits, + ) fingerprints_2, inchikeys14_unique_2 = compute_fingerprints_for_training( spectra_2, settings.fingerprint_type, - settings.fingerprint_nbits - ) + settings.fingerprint_nbits, + ) if len(inchikeys14_unique_1) < settings.batch_size or len(inchikeys14_unique_2) < settings.batch_size: raise ValueError("The number of unique inchikeys must be larger than the batch size.") - available_pairs_per_bin_matrix, available_scores_per_bin_matrix = compute_jaccard_similarity_per_bin_across_ionmodes( - fingerprints_1, fingerprints_2, settings.max_pairs_per_bin, settings.same_prob_bins) + available_pairs_per_bin_matrix, available_scores_per_bin_matrix = ( + compute_tanimoto_similarity_per_bin_between_sets( + fingerprints_1, + fingerprints_2, + max_pairs_per_bin=settings.max_pairs_per_bin, + fingerprint_type=settings.fingerprint_type, + selection_bins=settings.same_prob_bins, + ) + ) pair_frequency_matrixes = balanced_selection_of_pairs_per_bin( - available_pairs_per_bin_matrix, settings) + available_pairs_per_bin_matrix, settings + ) selected_pairs_per_bin = convert_to_selected_pairs_list( - pair_frequency_matrixes, available_pairs_per_bin_matrix, - available_scores_per_bin_matrix, inchikeys14_unique_1 + inchikeys14_unique_2) - return SpectrumPairGeneratorAcrossIonmodes([pair for pairs in selected_pairs_per_bin for pair in pairs], - spectra_1, spectra_2, settings.shuffle, settings.random_seed) - - -@jit(nopython=True, parallel=True) -def compute_jaccard_similarity_per_bin_across_ionmodes( - fingerprints_1, - fingerprints_2, - max_pairs_per_bin, - selection_bins=np.array([(x / 10, x / 10 + 0.1) for x in range(10)]) -) -> Tuple[np.ndarray, np.ndarray]: - """Randomly selects compound pairs per tanimoto bin, up to max_pairs_per_bin - - returns: - 2 3d numpy arrays are returned, the first encodes the pairs per bin and the second the corresponding scores. - A 3D numpy array with shape [nr_of_bins, nr_of_fingerprints, max_pairs_per_bin]. - An example structure for bin 1, with 3 fingerprints and max_pairs_per_bin =4 would be: - [[1,2,-1,-1], - [0,3,-1,-1], - [0,2,-1,-1],] - The pairs are encoded by the index and the value. - So the first row encodes pairs between fingerpint 0 and 1, fingerprint 0 and 2. - The -1 encode that no more pairs were found for this fingerprint in this bin. 
- """ - - size_1 = fingerprints_1.shape[0] - size_2 = fingerprints_2.shape[0] - - num_bins = len(selection_bins) - - selected_pairs_per_bin = -1 * np.ones((num_bins, size_1 + size_2, max_pairs_per_bin), dtype=np.int32) - selected_scores_per_bin = np.zeros((num_bins, size_1 + size_2, max_pairs_per_bin), dtype=np.float32) - - for idx_fingerprint_i in prange(size_1): - fingerprint_i = fingerprints_1[idx_fingerprint_i, :] - tanimoto_scores = tanimoto_scores_row(fingerprint_i, fingerprints_2) - - for bin_number in range(num_bins): - selection_bin = selection_bins[bin_number] - indices = np.nonzero((tanimoto_scores > selection_bin[0]) & (tanimoto_scores <= selection_bin[1]))[0] - - np.random.shuffle(indices) - indices = indices[:max_pairs_per_bin] - num_indices = len(indices) - selected_scores_per_bin[bin_number, idx_fingerprint_i, :num_indices] = tanimoto_scores[indices] - selected_pairs_per_bin[bin_number, idx_fingerprint_i, :num_indices] = indices + size_1 - - for idx_fingerprint_2 in prange(size_2): - fingerprint_i = fingerprints_2[idx_fingerprint_2, :] - idx_fingerprint_corrected = idx_fingerprint_2 + size_1 - tanimoto_scores = tanimoto_scores_row(fingerprint_i, fingerprints_2) - - for bin_number in range(num_bins): - selection_bin = selection_bins[bin_number] - indices = np.nonzero((tanimoto_scores > selection_bin[0]) & (tanimoto_scores <= selection_bin[1]))[0] - - np.random.shuffle(indices) - indices = indices[:max_pairs_per_bin] - num_indices = len(indices) - - selected_pairs_per_bin[bin_number, idx_fingerprint_corrected, :num_indices] = indices - selected_scores_per_bin[bin_number, idx_fingerprint_corrected, :num_indices] = tanimoto_scores[indices] - - return selected_pairs_per_bin, selected_scores_per_bin + pair_frequency_matrixes, + available_pairs_per_bin_matrix, + available_scores_per_bin_matrix, + inchikeys14_unique_1 + inchikeys14_unique_2, + ) + + return SpectrumPairGeneratorAcrossIonmodes( + [pair for pairs in selected_pairs_per_bin for pair in pairs], + spectra_1, + spectra_2, + settings.shuffle, + settings.random_seed, + ) class SpectrumPairGeneratorAcrossIonmodes: - def __init__(self, selected_inchikey_pairs: List[Tuple[str, str, float]], - spectra_pos: List[Spectrum], spectra_neg: List[Spectrum], - shuffle: bool = True, random_seed: int = 0): - """ - Parameters - ---------- - selected_inchikey_pairs: - A list with tuples encoding inchikey pairs like: (inchikey1, inchikey2, tanimoto_score) - """ + def __init__( + self, + selected_inchikey_pairs: List[Tuple[str, str, float]], + spectra_pos: List[Spectrum], + spectra_neg: List[Spectrum], + shuffle: bool = True, + random_seed: int = 0, + ): self.selected_inchikey_pairs = selected_inchikey_pairs self.spectra_pos = spectra_pos self.spectra_neg = spectra_neg self.pos_inchikeys = np.array([s.get("inchikey")[:14] for s in self.spectra_pos]) - self.neg_inchikeys= np.array([s.get("inchikey")[:14] for s in self.spectra_neg]) + self.neg_inchikeys = np.array([s.get("inchikey")[:14] for s in self.spectra_neg]) self.shuffle = shuffle self.random_nr_generator = np.random.default_rng(random_seed) @@ -198,38 +170,39 @@ def get_scores_per_inchikey(self): if inchikey_1 in inchikey_scores: inchikey_scores[inchikey_1].append(score) else: - inchikey_scores[inchikey_1] = [] + inchikey_scores[inchikey_1] = [score] + if inchikey_2 in inchikey_scores: inchikey_scores[inchikey_2].append(score) else: - inchikey_scores[inchikey_2] = [] + inchikey_scores[inchikey_2] = [score] return inchikey_scores def save_as_json(self, file_name): data_for_json = 
[(item[0], item[1], float(item[2])) for item in self.selected_inchikey_pairs] - with open(file_name, "w", encoding="utf-8") as f: json.dump(data_for_json, f) def _get_pos_spectrum_with_inchikey(self, inchikey: str, random_number_generator) -> Spectrum: matching_spectrum_id = np.where(self.pos_inchikeys == inchikey)[0] if len(matching_spectrum_id) <= 0: - raise ValueError("No matching inchikey found (note: expected first 14 characters), " - "likely switched pos and neg in entry") + raise ValueError( + "No matching inchikey found (note: expected first 14 characters), likely switched pos and neg in entry" + ) return self.spectra_pos[random_number_generator.choice(matching_spectrum_id)] def _get_neg_spectrum_with_inchikey(self, inchikey: str, random_number_generator) -> Spectrum: matching_spectrum_id = np.where(self.neg_inchikeys == inchikey)[0] if len(matching_spectrum_id) <= 0: - raise ValueError("No matching inchikey found (note: expected first 14 characters), " - "likely switched pos and neg in entry") + raise ValueError( + "No matching inchikey found (note: expected first 14 characters), likely switched pos and neg in entry" + ) return self.spectra_neg[random_number_generator.choice(matching_spectrum_id)] class CombinedSpectrumGenerator: - """Combines multiple SpectrumPairGenerators into a single generator + """Combines multiple SpectrumPairGenerators into a single generator.""" - This is used to combine different iterators for each ionmode pair""" def __init__(self, spectrum_pair_generators: List[SpectrumPairGenerator]): self.generators = spectrum_pair_generators self._idx = 0 @@ -245,4 +218,4 @@ def __next__(self): return next(current_generator) def __len__(self): - return sum([len(generator) for generator in self.generators]) \ No newline at end of file + return sum(len(generator) for generator in self.generators) diff --git a/tests/test_inchikey_pair_selection_cross_ionmode.py b/tests/test_inchikey_pair_selection_cross_ionmode.py new file mode 100644 index 00000000..1057b073 --- /dev/null +++ b/tests/test_inchikey_pair_selection_cross_ionmode.py @@ -0,0 +1,299 @@ +import numpy as np +import pytest +from matchms import Spectrum + +from ms2deepscore import SettingsMS2Deepscore +from ms2deepscore.train_new_model.inchikey_pair_selection_cross_ionmode import ( + create_data_generator_across_ionmodes, + select_compound_pairs_wrapper_across_ionmode, + SpectrumPairGeneratorAcrossIonmodes, + CombinedSpectrumGenerator, +) +from ms2deepscore.train_new_model import SpectrumPairGenerator +from tests.create_test_spectra import create_test_spectra + + +def _make_cross_ionmode_settings( + fingerprint_type="rdkit_binary", + bins=None, + batch_size=2, + average_inchikey_sampling_count=4, + max_pair_resampling=10, + max_inchikey_sampling=100, +): + if bins is None: + bins = [(-0.01, 1.0)] + return SettingsMS2Deepscore( + min_mz=10, + max_mz=1000, + mz_bin_width=0.1, + intensity_scaling=0.5, + additional_metadata=[], + same_prob_bins=np.array(bins, dtype="float32"), + batch_size=batch_size, + num_turns=4, + average_inchikey_sampling_count=average_inchikey_sampling_count, + max_pair_resampling=max_pair_resampling, + max_inchikey_sampling=max_inchikey_sampling, + fingerprint_type=fingerprint_type, + fingerprint_nbits=256, + augment_removal_max=0.0, + augment_removal_intensity=0.0, + augment_intensity=0.0, + augment_noise_max=0.0, + ) + + +@pytest.fixture +def pos_neg_spectra(): + test_spectra = create_test_spectra(20, 2) + + pos_spectra = [] + for spectrum in test_spectra[:20]: + spectrum.set("ionmode", 
"positive") + pos_spectra.append(spectrum) + + neg_spectra = [] + for spectrum in test_spectra[20:]: + spectrum.set("ionmode", "negative") + neg_spectra.append(spectrum) + + return pos_spectra, neg_spectra + + +@pytest.fixture +def small_pos_neg_spectra(): + pos_spectra = [ + Spectrum( + mz=np.array([100.0]), + intensities=np.array([1.0]), + metadata={"inchikey": "AAAAAAAAAAAAAA-AAAAAAAAAA-N", "ionmode": "positive"}, + ), + Spectrum( + mz=np.array([110.0]), + intensities=np.array([0.8]), + metadata={"inchikey": "BBBBBBBBBBBBBB-BBBBBBBBBB-N", "ionmode": "positive"}, + ), + ] + neg_spectra = [ + Spectrum( + mz=np.array([120.0]), + intensities=np.array([0.7]), + metadata={"inchikey": "CCCCCCCCCCCCCC-CCCCCCCCCC-N", "ionmode": "negative"}, + ), + Spectrum( + mz=np.array([130.0]), + intensities=np.array([0.6]), + metadata={"inchikey": "DDDDDDDDDDDDDD-DDDDDDDDDD-N", "ionmode": "negative"}, + ), + ] + return pos_spectra, neg_spectra + + +@pytest.mark.parametrize( + "fingerprint_type,bins", + [ + ("rdkit_binary", [(-0.01, 0.6), (0.6, 1.0)]), + ("rdkit_count", [(-0.01, 0.6), (0.6, 1.0)]), + ("rdkit_binary_unfolded", [(-0.01, 0.6), (0.6, 1.0)]), + ("rdkit_count_unfolded", [(-0.01, 0.6), (0.6, 1.0)]), + ], +) +def test_select_compound_pairs_wrapper_across_ionmode_supported_types( + pos_neg_spectra, fingerprint_type, bins +): + pos_spectra, neg_spectra = pos_neg_spectra + settings = _make_cross_ionmode_settings( + fingerprint_type=fingerprint_type, + bins=bins, + batch_size=2, + average_inchikey_sampling_count=4, + ) + + spectrum_pair_generator = select_compound_pairs_wrapper_across_ionmode( + pos_spectra, neg_spectra, settings + ) + + assert len(spectrum_pair_generator) > 0 + + for _ in range(len(spectrum_pair_generator)): + spectrum_1, spectrum_2, score = next(spectrum_pair_generator) + assert spectrum_1.get("ionmode") == "positive" + assert spectrum_2.get("ionmode") == "negative" + assert 0.0 <= score <= 1.0 + + # Infinite generator behavior + spectrum_1, spectrum_2, score = next(spectrum_pair_generator) + assert spectrum_1.get("ionmode") == "positive" + assert spectrum_2.get("ionmode") == "negative" + assert 0.0 <= score <= 1.0 + + +@pytest.mark.parametrize( + "fingerprint_type,bins", + [ + ("rdkit_binary", [(-0.01, 1.0)]), + ("rdkit_count", [(-0.01, 1.0)]), + ("rdkit_binary_unfolded", [(-0.01, 1.0)]), + ("rdkit_count_unfolded", [(-0.01, 1.0)]), + ], +) +def test_create_data_generator_across_ionmodes_supported_types( + pos_neg_spectra, fingerprint_type, bins +): + pos_spectra, neg_spectra = pos_neg_spectra + settings = _make_cross_ionmode_settings( + fingerprint_type=fingerprint_type, + bins=bins, + batch_size=2, + average_inchikey_sampling_count=4, + ) + + data_generator = create_data_generator_across_ionmodes(pos_spectra + neg_spectra, settings) + + assert len(data_generator) > 0 + + for _ in range(len(data_generator)): + spec1, spec2, meta1, meta2, targets = next(data_generator) + assert spec1.shape[0] == settings.batch_size + assert spec2.shape[0] == settings.batch_size + assert meta1.shape[0] == settings.batch_size + assert meta2.shape[0] == settings.batch_size + assert targets.shape[0] == settings.batch_size + + +def test_spectrum_pair_generator_across_ionmodes_get_scores_and_counts(small_pos_neg_spectra): + pos_spectra, neg_spectra = small_pos_neg_spectra + selected_inchikey_pairs = [ + ("AAAAAAAAAAAAAA", "CCCCCCCCCCCCCC", 0.2), + ("AAAAAAAAAAAAAA", "DDDDDDDDDDDDDD", 0.6), + ("BBBBBBBBBBBBBB", "CCCCCCCCCCCCCC", 0.8), + ] + + generator = SpectrumPairGeneratorAcrossIonmodes( + 
selected_inchikey_pairs=selected_inchikey_pairs, + spectra_pos=pos_spectra, + spectra_neg=neg_spectra, + shuffle=False, + random_seed=0, + ) + + assert generator.get_scores() == [0.2, 0.6, 0.8] + + counts = generator.get_inchikey_counts() + assert counts["AAAAAAAAAAAAAA"] == 2 + assert counts["BBBBBBBBBBBBBB"] == 1 + assert counts["CCCCCCCCCCCCCC"] == 2 + assert counts["DDDDDDDDDDDDDD"] == 1 + + +def test_spectrum_pair_generator_across_ionmodes_get_scores_per_inchikey(small_pos_neg_spectra): + pos_spectra, neg_spectra = small_pos_neg_spectra + selected_inchikey_pairs = [ + ("AAAAAAAAAAAAAA", "CCCCCCCCCCCCCC", 0.2), + ("AAAAAAAAAAAAAA", "DDDDDDDDDDDDDD", 0.6), + ("BBBBBBBBBBBBBB", "CCCCCCCCCCCCCC", 0.8), + ] + + generator = SpectrumPairGeneratorAcrossIonmodes( + selected_inchikey_pairs=selected_inchikey_pairs, + spectra_pos=pos_spectra, + spectra_neg=neg_spectra, + shuffle=False, + random_seed=0, + ) + + scores_per_inchikey = generator.get_scores_per_inchikey() + + assert scores_per_inchikey["AAAAAAAAAAAAAA"] == [0.2, 0.6] + assert scores_per_inchikey["BBBBBBBBBBBBBB"] == [0.8] + assert scores_per_inchikey["CCCCCCCCCCCCCC"] == [0.2, 0.8] + assert scores_per_inchikey["DDDDDDDDDDDDDD"] == [0.6] + + +def test_spectrum_pair_generator_across_ionmodes_without_shuffle_order(small_pos_neg_spectra): + pos_spectra, neg_spectra = small_pos_neg_spectra + selected_inchikey_pairs = [ + ("AAAAAAAAAAAAAA", "CCCCCCCCCCCCCC", 0.2), + ("BBBBBBBBBBBBBB", "DDDDDDDDDDDDDD", 0.6), + ] + + generator = SpectrumPairGeneratorAcrossIonmodes( + selected_inchikey_pairs=selected_inchikey_pairs, + spectra_pos=pos_spectra, + spectra_neg=neg_spectra, + shuffle=False, + random_seed=0, + ) + + spectrum_1, spectrum_2, score = next(generator) + assert spectrum_1.get("inchikey")[:14] == "AAAAAAAAAAAAAA" + assert spectrum_2.get("inchikey")[:14] == "CCCCCCCCCCCCCC" + assert score == 0.2 + + spectrum_1, spectrum_2, score = next(generator) + assert spectrum_1.get("inchikey")[:14] == "BBBBBBBBBBBBBB" + assert spectrum_2.get("inchikey")[:14] == "DDDDDDDDDDDDDD" + assert score == 0.6 + + +def test_spectrum_pair_generator_across_ionmodes_missing_positive_inchikey_raises(small_pos_neg_spectra): + pos_spectra, neg_spectra = small_pos_neg_spectra + generator = SpectrumPairGeneratorAcrossIonmodes( + selected_inchikey_pairs=[("ZZZZZZZZZZZZZZ", "CCCCCCCCCCCCCC", 0.2)], + spectra_pos=pos_spectra, + spectra_neg=neg_spectra, + shuffle=False, + random_seed=0, + ) + + with pytest.raises(ValueError, match="No matching inchikey found"): + next(generator) + + +def test_spectrum_pair_generator_across_ionmodes_missing_negative_inchikey_raises(small_pos_neg_spectra): + pos_spectra, neg_spectra = small_pos_neg_spectra + generator = SpectrumPairGeneratorAcrossIonmodes( + selected_inchikey_pairs=[("AAAAAAAAAAAAAA", "ZZZZZZZZZZZZZZ", 0.2)], + spectra_pos=pos_spectra, + spectra_neg=neg_spectra, + shuffle=False, + random_seed=0, + ) + + with pytest.raises(ValueError, match="No matching inchikey found"): + next(generator) + + +def test_combined_spectrum_generator_cycles_through_generators(): + gen1 = SpectrumPairGenerator( + [("A", "B", 0.1)], + [ + Spectrum(mz=np.array([100.0]), intensities=np.array([1.0]), metadata={"inchikey": "A"}), + Spectrum(mz=np.array([101.0]), intensities=np.array([1.0]), metadata={"inchikey": "B"}), + ], + shuffle=False, + random_seed=0, + ) + gen2 = SpectrumPairGenerator( + [("C", "D", 0.2)], + [ + Spectrum(mz=np.array([102.0]), intensities=np.array([1.0]), metadata={"inchikey": "C"}), + Spectrum(mz=np.array([103.0]), 
intensities=np.array([1.0]), metadata={"inchikey": "D"}),
+        ],
+        shuffle=False,
+        random_seed=0,
+    )
+
+    combined = CombinedSpectrumGenerator([gen1, gen2])
+
+    spectrum_1, spectrum_2, score = next(combined)
+    assert score == 0.1
+
+    spectrum_1, spectrum_2, score = next(combined)
+    assert score == 0.2
+
+    spectrum_1, spectrum_2, score = next(combined)
+    assert score == 0.1
+
+    assert len(combined) == len(gen1) + len(gen2)

From d58454c20fe3cbb7e8e129777d807c160bc58a37 Mon Sep 17 00:00:00 2001
From: Florian Huber
Date: Fri, 27 Mar 2026 12:42:24 +0100
Subject: [PATCH 07/12] minor test adjustment

---
 tests/test_data_generators.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/test_data_generators.py b/tests/test_data_generators.py
index 65b363b0..6b84bb2e 100644
--- a/tests/test_data_generators.py
+++ b/tests/test_data_generators.py
@@ -274,7 +274,8 @@ def test_create_data_generator_across_ionmodes():
         mz_bin_width=0.1,
         intensity_scaling=0.5,
         additional_metadata=[],
-        same_prob_bins=np.array([(-0.01, 0.75), (0.75, 1)]),
+        same_prob_bins=np.array([(-0.01, 0.6), (0.6, 1)]),
+        max_inchikey_sampling=300,
         batch_size=2,
         num_turns=4,
     )
@@ -299,7 +300,8 @@ def test_select_compound_pairs_wrapper_across_ionmode():
         mz_bin_width=0.1,
         intensity_scaling=0.5,
         additional_metadata=[],
-        same_prob_bins=np.array([(-0.01, 0.75), (0.75, 1)]),
+        same_prob_bins=np.array([(-0.01, 0.6), (0.6, 1)]),
+        max_inchikey_sampling=300,
         batch_size=2,
         num_turns=4,
     )

From ea7bc78ed0cf375dfb35fc3d2de094404a1cfa8a Mon Sep 17 00:00:00 2001
From: Florian Huber
Date: Fri, 27 Mar 2026 13:08:09 +0100
Subject: [PATCH 08/12] bump version

---
 ms2deepscore/__version__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ms2deepscore/__version__.py b/ms2deepscore/__version__.py
index f2df444a..387cfacc 100644
--- a/ms2deepscore/__version__.py
+++ b/ms2deepscore/__version__.py
@@ -1 +1 @@
-__version__ = '2.8.0'
+__version__ = '2.9.0'

From c4718c57f44a1b832666f176fe9c2d0c734d70d4 Mon Sep 17 00:00:00 2001
From: Florian Huber
Date: Fri, 27 Mar 2026 13:58:33 +0100
Subject: [PATCH 09/12] large test expansion to increase coverage

---
 ...est_fingerprint_similarity_computations.py | 245 ++++++++++++++++--
 1 file changed, 230 insertions(+), 15 deletions(-)

diff --git a/tests/test_fingerprint_similarity_computations.py b/tests/test_fingerprint_similarity_computations.py
index 0d93bbc9..29437522 100644
--- a/tests/test_fingerprint_similarity_computations.py
+++ b/tests/test_fingerprint_similarity_computations.py
@@ -2,6 +2,15 @@
 import pytest
 
 from ms2deepscore.fingerprint_similarity_computations import (
+    is_dense_fingerprint_type,
+    is_unfolded_binary_fingerprint_type,
+    is_unfolded_count_fingerprint_type,
+    compute_fingerprint_similarity_matrix,
+    compute_fingerprint_similarity_row,
+    tanimoto_scores_row_dense,
+    tanimoto_scores_row_sparse_binary,
+    tanimoto_scores_row_sparse_count,
+    _split_sparse_count_fingerprints,
     compute_tanimoto_similarity_per_bin,
     compute_tanimoto_similarity_per_bin_between_sets,
 )
@@ -123,13 +132,209 @@ def _check_between_sets_similarity_per_bin_outputs(
     assert np.all(selected_scores_per_bin <= 1.0)
 
 
+# ------------------------------------------------------------------
+# Type helper coverage
+# ------------------------------------------------------------------
+
+@pytest.mark.parametrize(
+    "fingerprint_type,expected",
+    [
+        ("rdkit_binary", True),
+        ("rdkit_count", True),
+        ("rdkit_logcount", True),
+        ("rdkit_binary_unfolded", False),
+        ("rdkit_count_unfolded", False),
+        
("rdkit_logcount_unfolded", False), + ("daylight", False), + ], +) +def test_is_dense_fingerprint_type(fingerprint_type, expected): + assert is_dense_fingerprint_type(fingerprint_type) is expected + + +@pytest.mark.parametrize( + "fingerprint_type,expected", + [ + ("rdkit_binary", False), + ("rdkit_count", False), + ("rdkit_logcount", False), + ("rdkit_binary_unfolded", True), + ("rdkit_count_unfolded", False), + ("rdkit_logcount_unfolded", False), + ("daylight", False), + ], +) +def test_is_unfolded_binary_fingerprint_type(fingerprint_type, expected): + assert is_unfolded_binary_fingerprint_type(fingerprint_type) is expected + + +@pytest.mark.parametrize( + "fingerprint_type,expected", + [ + ("rdkit_binary", False), + ("rdkit_count", False), + ("rdkit_logcount", False), + ("rdkit_binary_unfolded", False), + ("rdkit_count_unfolded", True), + ("rdkit_logcount_unfolded", True), + ("daylight", False), + ], +) +def test_is_unfolded_count_fingerprint_type(fingerprint_type, expected): + assert is_unfolded_count_fingerprint_type(fingerprint_type) is expected + + +# ------------------------------------------------------------------ +# Low-level helper coverage +# ------------------------------------------------------------------ + +def test_split_sparse_count_fingerprints(simple_sparse_count_fingerprints): + bins, counts = _split_sparse_count_fingerprints(simple_sparse_count_fingerprints) + + assert isinstance(bins, list) + assert isinstance(counts, list) + assert len(bins) == len(simple_sparse_count_fingerprints) + assert len(counts) == len(simple_sparse_count_fingerprints) + + for i, (expected_bins, expected_counts) in enumerate(simple_sparse_count_fingerprints): + assert np.array_equal(bins[i], expected_bins) + assert np.array_equal(counts[i], expected_counts) + + +# ------------------------------------------------------------------ +# Matrix similarity coverage +# ------------------------------------------------------------------ + +@pytest.mark.parametrize( + "fingerprints_fixture_1,fingerprints_fixture_2,fingerprint_type", + [ + ("simple_binary_fingerprints_between_sets", "simple_binary_fingerprints_between_sets", "rdkit_binary"), + ("simple_count_fingerprints_between_sets", "simple_count_fingerprints_between_sets", "rdkit_count"), + ("simple_count_fingerprints_between_sets", "simple_count_fingerprints_between_sets", "rdkit_logcount"), + ("simple_sparse_binary_fingerprints_between_sets", "simple_sparse_binary_fingerprints_between_sets", "rdkit_binary_unfolded"), + ("simple_sparse_count_fingerprints_between_sets", "simple_sparse_count_fingerprints_between_sets", "rdkit_count_unfolded"), + ("simple_sparse_count_fingerprints_between_sets", "simple_sparse_count_fingerprints_between_sets", "rdkit_logcount_unfolded"), + ], +) +def test_compute_fingerprint_similarity_matrix_all_supported_types( + request, fingerprints_fixture_1, fingerprints_fixture_2, fingerprint_type +): + fingerprints_1, fingerprints_2 = request.getfixturevalue(fingerprints_fixture_1) + + result = compute_fingerprint_similarity_matrix( + fingerprints_1, + fingerprints_2, + fingerprint_type=fingerprint_type, + ) + + assert isinstance(result, np.ndarray) + assert result.shape == (len(fingerprints_1), len(fingerprints_2)) + assert np.all(result >= 0.0) + assert np.all(result <= 1.0) + + +def test_compute_fingerprint_similarity_matrix_invalid_type_raises(simple_binary_fingerprints_between_sets): + fingerprints_1, fingerprints_2 = simple_binary_fingerprints_between_sets + with pytest.raises(ValueError, match="Unsupported fingerprint 
type"): + compute_fingerprint_similarity_matrix( + fingerprints_1, + fingerprints_2, + fingerprint_type="daylight", + ) + + +# ------------------------------------------------------------------ +# Row similarity coverage +# ------------------------------------------------------------------ + +@pytest.mark.parametrize( + "fingerprints_fixture,fingerprint_type", + [ + ("simple_binary_fingerprints", "rdkit_binary"), + ("simple_count_fingerprints", "rdkit_count"), + ("simple_count_fingerprints", "rdkit_logcount"), + ("simple_sparse_binary_fingerprints", "rdkit_binary_unfolded"), + ("simple_sparse_count_fingerprints", "rdkit_count_unfolded"), + ("simple_sparse_count_fingerprints", "rdkit_logcount_unfolded"), + ], +) +def test_compute_fingerprint_similarity_row_all_supported_types(request, fingerprints_fixture, fingerprint_type): + fingerprints = request.getfixturevalue(fingerprints_fixture) + single_fingerprint = fingerprints[0] + + row = compute_fingerprint_similarity_row( + single_fingerprint, + fingerprints, + fingerprint_type=fingerprint_type, + ) + + assert isinstance(row, np.ndarray) + assert row.shape == (len(fingerprints),) + assert np.all(row >= 0.0) + assert np.all(row <= 1.0) + assert row[0] == pytest.approx(1.0) + + +def test_compute_fingerprint_similarity_row_invalid_type_raises(simple_binary_fingerprints): + with pytest.raises(ValueError, match="Unsupported fingerprint type"): + compute_fingerprint_similarity_row( + simple_binary_fingerprints[0], + simple_binary_fingerprints, + fingerprint_type="daylight", + ) + + +# ------------------------------------------------------------------ +# Explicit row-kernel coverage, compiled and uncompiled +# ------------------------------------------------------------------ + +def test_tanimoto_scores_row_dense_compiled_and_py_func(simple_binary_fingerprints): + single = simple_binary_fingerprints[0] + + compiled = tanimoto_scores_row_dense(single, simple_binary_fingerprints) + uncompiled = tanimoto_scores_row_dense.py_func(single, simple_binary_fingerprints) + + assert compiled.shape == (len(simple_binary_fingerprints),) + assert np.allclose(compiled, uncompiled) + assert compiled[0] == pytest.approx(1.0) + + +def test_tanimoto_scores_row_sparse_binary_compiled_and_py_func(simple_sparse_binary_fingerprints): + single = simple_sparse_binary_fingerprints[0] + + compiled = tanimoto_scores_row_sparse_binary(single, simple_sparse_binary_fingerprints) + uncompiled = tanimoto_scores_row_sparse_binary.py_func(single, simple_sparse_binary_fingerprints) + + assert compiled.shape == (len(simple_sparse_binary_fingerprints),) + assert np.allclose(compiled, uncompiled) + assert compiled[0] == pytest.approx(1.0) + + +def test_tanimoto_scores_row_sparse_count_compiled_and_py_func(simple_sparse_count_fingerprints): + bins, counts = simple_sparse_count_fingerprints[0] + list_of_bins, list_of_counts = _split_sparse_count_fingerprints(simple_sparse_count_fingerprints) + + compiled = tanimoto_scores_row_sparse_count(bins, counts, list_of_bins, list_of_counts) + uncompiled = tanimoto_scores_row_sparse_count.py_func(bins, counts, list_of_bins, list_of_counts) + + assert compiled.shape == (len(simple_sparse_count_fingerprints),) + assert np.allclose(compiled, uncompiled) + assert compiled[0] == pytest.approx(1.0) + + +# ------------------------------------------------------------------ +# Same-set per-bin dispatcher coverage +# ------------------------------------------------------------------ + @pytest.mark.parametrize( "fingerprints_fixture,fingerprint_type", [ 
("simple_binary_fingerprints", "rdkit_binary"), ("simple_count_fingerprints", "rdkit_count"), + ("simple_count_fingerprints", "rdkit_logcount"), ("simple_sparse_binary_fingerprints", "rdkit_binary_unfolded"), ("simple_sparse_count_fingerprints", "rdkit_count_unfolded"), + ("simple_sparse_count_fingerprints", "rdkit_logcount_unfolded"), ], ) def test_compute_tanimoto_similarity_per_bin_all_supported_types( @@ -165,8 +370,10 @@ def test_compute_tanimoto_similarity_per_bin_all_supported_types( [ ("simple_binary_fingerprints", "rdkit_binary"), ("simple_count_fingerprints", "rdkit_count"), + ("simple_count_fingerprints", "rdkit_logcount"), ("simple_sparse_binary_fingerprints", "rdkit_binary_unfolded"), ("simple_sparse_count_fingerprints", "rdkit_count_unfolded"), + ("simple_sparse_count_fingerprints", "rdkit_logcount_unfolded"), ], ) def test_compute_tanimoto_similarity_per_bin_exclude_diagonal_all_supported_types( @@ -248,13 +455,30 @@ def test_compute_tanimoto_similarity_per_bin_count_dense_zero_scores_for_invalid assert np.all(selected_scores_per_bin[selected_pairs_per_bin == -1] == 0) +def test_compute_tanimoto_similarity_per_bin_invalid_fingerprint_type_raises(simple_binary_fingerprints): + with pytest.raises(ValueError, match="Unsupported fingerprint type"): + compute_tanimoto_similarity_per_bin( + simple_binary_fingerprints, + max_pairs_per_bin=5, + fingerprint_type="daylight", + selection_bins=np.array([(-0.01, 1.0)], dtype=np.float32), + include_diagonal=True, + ) + + +# ------------------------------------------------------------------ +# Between-sets per-bin dispatcher coverage +# ------------------------------------------------------------------ + @pytest.mark.parametrize( "fingerprints_fixture_1,fingerprints_fixture_2,fingerprint_type", [ ("simple_binary_fingerprints_between_sets", "simple_binary_fingerprints_between_sets", "rdkit_binary"), ("simple_count_fingerprints_between_sets", "simple_count_fingerprints_between_sets", "rdkit_count"), + ("simple_count_fingerprints_between_sets", "simple_count_fingerprints_between_sets", "rdkit_logcount"), ("simple_sparse_binary_fingerprints_between_sets", "simple_sparse_binary_fingerprints_between_sets", "rdkit_binary_unfolded"), ("simple_sparse_count_fingerprints_between_sets", "simple_sparse_count_fingerprints_between_sets", "rdkit_count_unfolded"), + ("simple_sparse_count_fingerprints_between_sets", "simple_sparse_count_fingerprints_between_sets", "rdkit_logcount_unfolded"), ], ) def test_compute_tanimoto_similarity_per_bin_between_sets_all_supported_types( @@ -287,8 +511,10 @@ def test_compute_tanimoto_similarity_per_bin_between_sets_all_supported_types( [ ("simple_binary_fingerprints_between_sets", "rdkit_binary"), ("simple_count_fingerprints_between_sets", "rdkit_count"), + ("simple_count_fingerprints_between_sets", "rdkit_logcount"), ("simple_sparse_binary_fingerprints_between_sets", "rdkit_binary_unfolded"), ("simple_sparse_count_fingerprints_between_sets", "rdkit_count_unfolded"), + ("simple_sparse_count_fingerprints_between_sets", "rdkit_logcount_unfolded"), ], ) def test_compute_tanimoto_similarity_per_bin_between_sets_uses_cross_set_similarity_for_both_directions( @@ -321,32 +547,21 @@ def test_compute_tanimoto_similarity_per_bin_between_sets_uses_cross_set_similar # set1[1] <-> set2[0] assert pairs[0, 0] == 3 - assert scores[0, 0] == 1.0 + assert scores[0, 0] == pytest.approx(1.0) assert pairs[1, 0] == 2 - assert scores[1, 0] == 1.0 + assert scores[1, 0] == pytest.approx(1.0) assert pairs[2, 0] == 1 - assert scores[2, 0] == 1.0 + 
assert scores[2, 0] == pytest.approx(1.0) assert pairs[3, 0] == 0 - assert scores[3, 0] == 1.0 + assert scores[3, 0] == pytest.approx(1.0) assert np.all(pairs[:, 1] == -1) assert np.all(scores[:, 1] == 0.0) -def test_compute_tanimoto_similarity_per_bin_invalid_fingerprint_type_raises(simple_binary_fingerprints): - with pytest.raises(ValueError, match="Unsupported fingerprint type"): - compute_tanimoto_similarity_per_bin( - simple_binary_fingerprints, - max_pairs_per_bin=5, - fingerprint_type="daylight", - selection_bins=np.array([(-0.01, 1.0)], dtype=np.float32), - include_diagonal=True, - ) - - def test_compute_tanimoto_similarity_per_bin_between_sets_invalid_fingerprint_type_raises( simple_binary_fingerprints_between_sets, ): From 19fa7fe4796919ddb3197cf7374c34bfac8099e8 Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Fri, 27 Mar 2026 14:16:32 +0100 Subject: [PATCH 10/12] also test helper functions --- ...est_fingerprint_similarity_computations.py | 210 ++++++++++++++++++ 1 file changed, 210 insertions(+) diff --git a/tests/test_fingerprint_similarity_computations.py b/tests/test_fingerprint_similarity_computations.py index 29437522..6a0890fa 100644 --- a/tests/test_fingerprint_similarity_computations.py +++ b/tests/test_fingerprint_similarity_computations.py @@ -13,6 +13,12 @@ _split_sparse_count_fingerprints, compute_tanimoto_similarity_per_bin, compute_tanimoto_similarity_per_bin_between_sets, + _compute_tanimoto_similarity_per_bin_dense, + _compute_tanimoto_similarity_per_bin_sparse_binary, + _compute_tanimoto_similarity_per_bin_sparse_count, + _compute_tanimoto_similarity_per_bin_between_sets_sparse_count, + _compute_tanimoto_similarity_per_bin_between_sets_dense, + _compute_tanimoto_similarity_per_bin_between_sets_sparse_binary ) @@ -108,6 +114,15 @@ def simple_sparse_count_fingerprints_between_sets(): return fingerprints_1, fingerprints_2 +def _to_numba_sparse_count_lists(fingerprints): + bins = [] + counts = [] + for fp_bins, fp_counts in fingerprints: + bins.append(fp_bins) + counts.append(fp_counts) + return bins, counts + + def _check_similarity_per_bin_outputs(selected_pairs_per_bin, selected_scores_per_bin, nr_of_items, nr_of_bins, max_pairs_per_bin): assert selected_pairs_per_bin.shape == (nr_of_bins, nr_of_items, max_pairs_per_bin) assert selected_scores_per_bin.shape == (nr_of_bins, nr_of_items, max_pairs_per_bin) @@ -574,3 +589,198 @@ def test_compute_tanimoto_similarity_per_bin_between_sets_invalid_fingerprint_ty fingerprint_type="daylight", selection_bins=np.array([(-0.01, 1.0)], dtype=np.float32), ) + + +# Test underlying helper function +# -------------------------------- + +def test__compute_tanimoto_similarity_per_bin_dense_compiled_and_py_func(simple_binary_fingerprints): + max_pairs_per_bin = 5 + selection_bins = np.array([(0.0, 0.5), (0.5, 1.0)], dtype=np.float32) + + compiled_pairs, compiled_scores = _compute_tanimoto_similarity_per_bin_dense( + simple_binary_fingerprints, + max_pairs_per_bin=max_pairs_per_bin, + selection_bins=selection_bins, + include_diagonal=True, + ) + py_pairs, py_scores = _compute_tanimoto_similarity_per_bin_dense.py_func( + simple_binary_fingerprints, + max_pairs_per_bin=max_pairs_per_bin, + selection_bins=selection_bins, + include_diagonal=True, + ) + + assert compiled_pairs.shape == py_pairs.shape == (2, len(simple_binary_fingerprints), max_pairs_per_bin) + assert compiled_scores.shape == py_scores.shape == (2, len(simple_binary_fingerprints), max_pairs_per_bin) + assert np.all(compiled_scores[compiled_pairs == -1] == 0) + assert 
np.all(py_scores[py_pairs == -1] == 0) + assert np.all(compiled_scores >= 0.0) + assert np.all(compiled_scores <= 1.0) + assert np.all(py_scores >= 0.0) + assert np.all(py_scores <= 1.0) + + +def test__compute_tanimoto_similarity_per_bin_sparse_binary_compiled_and_py_func(simple_sparse_binary_fingerprints): + max_pairs_per_bin = 5 + selection_bins = np.array([(0.0, 0.5), (0.5, 1.0)], dtype=np.float32) + + compiled_pairs, compiled_scores = _compute_tanimoto_similarity_per_bin_sparse_binary( + simple_sparse_binary_fingerprints, + max_pairs_per_bin=max_pairs_per_bin, + selection_bins=selection_bins, + include_diagonal=True, + ) + py_pairs, py_scores = _compute_tanimoto_similarity_per_bin_sparse_binary.py_func( + simple_sparse_binary_fingerprints, + max_pairs_per_bin=max_pairs_per_bin, + selection_bins=selection_bins, + include_diagonal=True, + ) + + assert compiled_pairs.shape == py_pairs.shape == (2, len(simple_sparse_binary_fingerprints), max_pairs_per_bin) + assert compiled_scores.shape == py_scores.shape == (2, len(simple_sparse_binary_fingerprints), max_pairs_per_bin) + assert np.all(compiled_scores[compiled_pairs == -1] == 0) + assert np.all(py_scores[py_pairs == -1] == 0) + assert np.all(compiled_scores >= 0.0) + assert np.all(compiled_scores <= 1.0) + assert np.all(py_scores >= 0.0) + assert np.all(py_scores <= 1.0) + + +def test__compute_tanimoto_similarity_per_bin_sparse_count_compiled_and_py_func(simple_sparse_count_fingerprints): + bins, counts = _to_numba_sparse_count_lists(simple_sparse_count_fingerprints) + max_pairs_per_bin = 5 + selection_bins = np.array([(0.0, 0.5), (0.5, 1.0)], dtype=np.float32) + + compiled_pairs, compiled_scores = _compute_tanimoto_similarity_per_bin_sparse_count( + bins, + counts, + max_pairs_per_bin=max_pairs_per_bin, + selection_bins=selection_bins, + include_diagonal=True, + ) + py_pairs, py_scores = _compute_tanimoto_similarity_per_bin_sparse_count.py_func( + bins, + counts, + max_pairs_per_bin=max_pairs_per_bin, + selection_bins=selection_bins, + include_diagonal=True, + ) + + assert compiled_pairs.shape == py_pairs.shape == (2, len(simple_sparse_count_fingerprints), max_pairs_per_bin) + assert compiled_scores.shape == py_scores.shape == (2, len(simple_sparse_count_fingerprints), max_pairs_per_bin) + assert np.all(compiled_scores[compiled_pairs == -1] == 0) + assert np.all(py_scores[py_pairs == -1] == 0) + assert np.all(compiled_scores >= 0.0) + assert np.all(compiled_scores <= 1.0) + assert np.all(py_scores >= 0.0) + assert np.all(py_scores <= 1.0) + + +def test__compute_tanimoto_similarity_per_bin_between_sets_dense_compiled_and_py_func( + simple_binary_fingerprints_between_sets, +): + fingerprints_1, fingerprints_2 = simple_binary_fingerprints_between_sets + max_pairs_per_bin = 2 + selection_bins = np.array([(0.99, 1.0)], dtype=np.float32) + + compiled_pairs, compiled_scores = _compute_tanimoto_similarity_per_bin_between_sets_dense( + fingerprints_1, + fingerprints_2, + max_pairs_per_bin=max_pairs_per_bin, + selection_bins=selection_bins, + ) + py_pairs, py_scores = _compute_tanimoto_similarity_per_bin_between_sets_dense.py_func( + fingerprints_1, + fingerprints_2, + max_pairs_per_bin=max_pairs_per_bin, + selection_bins=selection_bins, + ) + + assert compiled_pairs.shape == py_pairs.shape == (1, len(fingerprints_1) + len(fingerprints_2), max_pairs_per_bin) + assert compiled_scores.shape == py_scores.shape == (1, len(fingerprints_1) + len(fingerprints_2), max_pairs_per_bin) + assert np.all(compiled_scores[compiled_pairs == -1] == 0) + assert 
np.all(py_scores[py_pairs == -1] == 0) + assert compiled_pairs[0, 0, 0] == 3 + assert compiled_pairs[0, 1, 0] == 2 + assert compiled_pairs[0, 2, 0] == 1 + assert compiled_pairs[0, 3, 0] == 0 + assert py_pairs[0, 0, 0] == 3 + assert py_pairs[0, 1, 0] == 2 + assert py_pairs[0, 2, 0] == 1 + assert py_pairs[0, 3, 0] == 0 + + +def test__compute_tanimoto_similarity_per_bin_between_sets_sparse_binary_compiled_and_py_func( + simple_sparse_binary_fingerprints_between_sets, +): + fingerprints_1, fingerprints_2 = simple_sparse_binary_fingerprints_between_sets + max_pairs_per_bin = 2 + selection_bins = np.array([(0.99, 1.0)], dtype=np.float32) + + compiled_pairs, compiled_scores = _compute_tanimoto_similarity_per_bin_between_sets_sparse_binary( + fingerprints_1, + fingerprints_2, + max_pairs_per_bin=max_pairs_per_bin, + selection_bins=selection_bins, + ) + py_pairs, py_scores = _compute_tanimoto_similarity_per_bin_between_sets_sparse_binary.py_func( + fingerprints_1, + fingerprints_2, + max_pairs_per_bin=max_pairs_per_bin, + selection_bins=selection_bins, + ) + + assert compiled_pairs.shape == py_pairs.shape == (1, len(fingerprints_1) + len(fingerprints_2), max_pairs_per_bin) + assert compiled_scores.shape == py_scores.shape == (1, len(fingerprints_1) + len(fingerprints_2), max_pairs_per_bin) + assert np.all(compiled_scores[compiled_pairs == -1] == 0) + assert np.all(py_scores[py_pairs == -1] == 0) + assert compiled_pairs[0, 0, 0] == 3 + assert compiled_pairs[0, 1, 0] == 2 + assert compiled_pairs[0, 2, 0] == 1 + assert compiled_pairs[0, 3, 0] == 0 + assert py_pairs[0, 0, 0] == 3 + assert py_pairs[0, 1, 0] == 2 + assert py_pairs[0, 2, 0] == 1 + assert py_pairs[0, 3, 0] == 0 + + +def test__compute_tanimoto_similarity_per_bin_between_sets_sparse_count_compiled_and_py_func( + simple_sparse_count_fingerprints_between_sets, +): + fingerprints_1, fingerprints_2 = simple_sparse_count_fingerprints_between_sets + bins_1, counts_1 = _to_numba_sparse_count_lists(fingerprints_1) + bins_2, counts_2 = _to_numba_sparse_count_lists(fingerprints_2) + max_pairs_per_bin = 2 + selection_bins = np.array([(0.99, 1.0)], dtype=np.float32) + + compiled_pairs, compiled_scores = _compute_tanimoto_similarity_per_bin_between_sets_sparse_count( + bins_1, + counts_1, + bins_2, + counts_2, + max_pairs_per_bin=max_pairs_per_bin, + selection_bins=selection_bins, + ) + py_pairs, py_scores = _compute_tanimoto_similarity_per_bin_between_sets_sparse_count.py_func( + bins_1, + counts_1, + bins_2, + counts_2, + max_pairs_per_bin=max_pairs_per_bin, + selection_bins=selection_bins, + ) + + assert compiled_pairs.shape == py_pairs.shape == (1, len(fingerprints_1) + len(fingerprints_2), max_pairs_per_bin) + assert compiled_scores.shape == py_scores.shape == (1, len(fingerprints_1) + len(fingerprints_2), max_pairs_per_bin) + assert np.all(compiled_scores[compiled_pairs == -1] == 0) + assert np.all(py_scores[py_pairs == -1] == 0) + assert compiled_pairs[0, 0, 0] == 3 + assert compiled_pairs[0, 1, 0] == 2 + assert compiled_pairs[0, 2, 0] == 1 + assert compiled_pairs[0, 3, 0] == 0 + assert py_pairs[0, 0, 0] == 3 + assert py_pairs[0, 1, 0] == 2 + assert py_pairs[0, 2, 0] == 1 + assert py_pairs[0, 3, 0] == 0 From bb80d474c8527043e1322a8b41e29e597bed2bdb Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Fri, 27 Mar 2026 14:39:52 +0100 Subject: [PATCH 11/12] reduce code duplication (mostly to make sonarcloud happy) --- .../fingerprint_similarity_computations.py | 261 ++++++++++-------- 1 file changed, 151 insertions(+), 110 deletions(-) diff --git 
a/ms2deepscore/fingerprint_similarity_computations.py b/ms2deepscore/fingerprint_similarity_computations.py index 95afe6d9..7f363571 100644 --- a/ms2deepscore/fingerprint_similarity_computations.py +++ b/ms2deepscore/fingerprint_similarity_computations.py @@ -132,6 +132,39 @@ def tanimoto_scores_row_sparse_count( return tanimoto_scores +from numba import jit +import numpy as np + + +@jit(nopython=True, fastmath=True) +def _fill_pairs_for_row_same_set( + selected_pairs_per_bin, + selected_scores_per_bin, + tanimoto_scores, + idx_fingerprint_i, + max_pairs_per_bin, + selection_bins, + include_diagonal, +): + num_bins = len(selection_bins) + + for bin_number in range(num_bins): + selection_bin = selection_bins[bin_number] + indices = np.nonzero( + (tanimoto_scores > selection_bin[0]) & (tanimoto_scores <= selection_bin[1]) + )[0] + + if not include_diagonal and idx_fingerprint_i in indices: + indices = indices[indices != idx_fingerprint_i] + + np.random.shuffle(indices) + indices = indices[:max_pairs_per_bin] + num_indices = len(indices) + + selected_pairs_per_bin[bin_number, idx_fingerprint_i, :num_indices] = indices + selected_scores_per_bin[bin_number, idx_fingerprint_i, :num_indices] = tanimoto_scores[indices] + + @jit(nopython=True, parallel=True) def _compute_tanimoto_similarity_per_bin_dense( fingerprints, @@ -149,21 +182,15 @@ def _compute_tanimoto_similarity_per_bin_dense( fingerprint_i = fingerprints[idx_fingerprint_i, :] tanimoto_scores = tanimoto_scores_row_dense(fingerprint_i, fingerprints) - for bin_number in range(num_bins): - selection_bin = selection_bins[bin_number] - indices = np.nonzero( - (tanimoto_scores > selection_bin[0]) & (tanimoto_scores <= selection_bin[1]) - )[0] - - if not include_diagonal and idx_fingerprint_i in indices: - indices = indices[indices != idx_fingerprint_i] - - np.random.shuffle(indices) - indices = indices[:max_pairs_per_bin] - num_indices = len(indices) - - selected_pairs_per_bin[bin_number, idx_fingerprint_i, :num_indices] = indices - selected_scores_per_bin[bin_number, idx_fingerprint_i, :num_indices] = tanimoto_scores[indices] + _fill_pairs_for_row_same_set( + selected_pairs_per_bin, + selected_scores_per_bin, + tanimoto_scores, + idx_fingerprint_i, + max_pairs_per_bin, + selection_bins, + include_diagonal, + ) return selected_pairs_per_bin, selected_scores_per_bin @@ -185,21 +212,15 @@ def _compute_tanimoto_similarity_per_bin_sparse_binary( fingerprint_i = fingerprints[idx_fingerprint_i] tanimoto_scores = tanimoto_scores_row_sparse_binary(fingerprint_i, fingerprints) - for bin_number in range(num_bins): - selection_bin = selection_bins[bin_number] - indices = np.nonzero( - (tanimoto_scores > selection_bin[0]) & (tanimoto_scores <= selection_bin[1]) - )[0] - - if not include_diagonal and idx_fingerprint_i in indices: - indices = indices[indices != idx_fingerprint_i] - - np.random.shuffle(indices) - indices = indices[:max_pairs_per_bin] - num_indices = len(indices) - - selected_pairs_per_bin[bin_number, idx_fingerprint_i, :num_indices] = indices - selected_scores_per_bin[bin_number, idx_fingerprint_i, :num_indices] = tanimoto_scores[indices] + _fill_pairs_for_row_same_set( + selected_pairs_per_bin, + selected_scores_per_bin, + tanimoto_scores, + idx_fingerprint_i, + max_pairs_per_bin, + selection_bins, + include_diagonal, + ) return selected_pairs_per_bin, selected_scores_per_bin @@ -225,21 +246,15 @@ def _compute_tanimoto_similarity_per_bin_sparse_count( bins_i, counts_i, fingerprints_bins, fingerprints_counts ) - for bin_number in 
range(num_bins): - selection_bin = selection_bins[bin_number] - indices = np.nonzero( - (tanimoto_scores > selection_bin[0]) & (tanimoto_scores <= selection_bin[1]) - )[0] - - if not include_diagonal and idx_fingerprint_i in indices: - indices = indices[indices != idx_fingerprint_i] - - np.random.shuffle(indices) - indices = indices[:max_pairs_per_bin] - num_indices = len(indices) - - selected_pairs_per_bin[bin_number, idx_fingerprint_i, :num_indices] = indices - selected_scores_per_bin[bin_number, idx_fingerprint_i, :num_indices] = tanimoto_scores[indices] + _fill_pairs_for_row_same_set( + selected_pairs_per_bin, + selected_scores_per_bin, + tanimoto_scores, + idx_fingerprint_i, + max_pairs_per_bin, + selection_bins, + include_diagonal, + ) return selected_pairs_per_bin, selected_scores_per_bin @@ -290,6 +305,32 @@ def compute_tanimoto_similarity_per_bin( raise ValueError(f"Unsupported fingerprint type: {fingerprint_type}") +@jit(nopython=True, fastmath=True) +def _fill_pairs_for_row_between_sets( + selected_pairs_per_bin, + selected_scores_per_bin, + tanimoto_scores, + row_index, + target_offset, + max_pairs_per_bin, + selection_bins, +): + num_bins = len(selection_bins) + + for bin_number in range(num_bins): + selection_bin = selection_bins[bin_number] + indices = np.nonzero( + (tanimoto_scores > selection_bin[0]) & (tanimoto_scores <= selection_bin[1]) + )[0] + + np.random.shuffle(indices) + indices = indices[:max_pairs_per_bin] + num_indices = len(indices) + + selected_pairs_per_bin[bin_number, row_index, :num_indices] = indices + target_offset + selected_scores_per_bin[bin_number, row_index, :num_indices] = tanimoto_scores[indices] + + @jit(nopython=True, parallel=True) def _compute_tanimoto_similarity_per_bin_between_sets_dense( fingerprints_1, @@ -308,32 +349,30 @@ def _compute_tanimoto_similarity_per_bin_between_sets_dense( fingerprint_i = fingerprints_1[idx_fingerprint_i, :] tanimoto_scores = tanimoto_scores_row_dense(fingerprint_i, fingerprints_2) - for bin_number in range(num_bins): - selection_bin = selection_bins[bin_number] - indices = np.nonzero((tanimoto_scores > selection_bin[0]) & (tanimoto_scores <= selection_bin[1]))[0] - - np.random.shuffle(indices) - indices = indices[:max_pairs_per_bin] - num_indices = len(indices) - - selected_scores_per_bin[bin_number, idx_fingerprint_i, :num_indices] = tanimoto_scores[indices] - selected_pairs_per_bin[bin_number, idx_fingerprint_i, :num_indices] = indices + size_1 + _fill_pairs_for_row_between_sets( + selected_pairs_per_bin, + selected_scores_per_bin, + tanimoto_scores, + idx_fingerprint_i, + size_1, + max_pairs_per_bin, + selection_bins, + ) for idx_fingerprint_j in prange(size_2): fingerprint_j = fingerprints_2[idx_fingerprint_j, :] - idx_fingerprint_corrected = idx_fingerprint_j + size_1 + row_index = idx_fingerprint_j + size_1 tanimoto_scores = tanimoto_scores_row_dense(fingerprint_j, fingerprints_1) - for bin_number in range(num_bins): - selection_bin = selection_bins[bin_number] - indices = np.nonzero((tanimoto_scores > selection_bin[0]) & (tanimoto_scores <= selection_bin[1]))[0] - - np.random.shuffle(indices) - indices = indices[:max_pairs_per_bin] - num_indices = len(indices) - - selected_pairs_per_bin[bin_number, idx_fingerprint_corrected, :num_indices] = indices - selected_scores_per_bin[bin_number, idx_fingerprint_corrected, :num_indices] = tanimoto_scores[indices] + _fill_pairs_for_row_between_sets( + selected_pairs_per_bin, + selected_scores_per_bin, + tanimoto_scores, + row_index, + 0, + max_pairs_per_bin, + 
selection_bins, + ) return selected_pairs_per_bin, selected_scores_per_bin @@ -356,32 +395,30 @@ def _compute_tanimoto_similarity_per_bin_between_sets_sparse_binary( fingerprint_i = fingerprints_1[idx_fingerprint_i] tanimoto_scores = tanimoto_scores_row_sparse_binary(fingerprint_i, fingerprints_2) - for bin_number in range(num_bins): - selection_bin = selection_bins[bin_number] - indices = np.nonzero((tanimoto_scores > selection_bin[0]) & (tanimoto_scores <= selection_bin[1]))[0] - - np.random.shuffle(indices) - indices = indices[:max_pairs_per_bin] - num_indices = len(indices) - - selected_scores_per_bin[bin_number, idx_fingerprint_i, :num_indices] = tanimoto_scores[indices] - selected_pairs_per_bin[bin_number, idx_fingerprint_i, :num_indices] = indices + size_1 + _fill_pairs_for_row_between_sets( + selected_pairs_per_bin, + selected_scores_per_bin, + tanimoto_scores, + idx_fingerprint_i, + size_1, + max_pairs_per_bin, + selection_bins, + ) for idx_fingerprint_j in prange(size_2): fingerprint_j = fingerprints_2[idx_fingerprint_j] - idx_fingerprint_corrected = idx_fingerprint_j + size_1 + row_index = idx_fingerprint_j + size_1 tanimoto_scores = tanimoto_scores_row_sparse_binary(fingerprint_j, fingerprints_1) - for bin_number in range(num_bins): - selection_bin = selection_bins[bin_number] - indices = np.nonzero((tanimoto_scores > selection_bin[0]) & (tanimoto_scores <= selection_bin[1]))[0] - - np.random.shuffle(indices) - indices = indices[:max_pairs_per_bin] - num_indices = len(indices) - - selected_pairs_per_bin[bin_number, idx_fingerprint_corrected, :num_indices] = indices - selected_scores_per_bin[bin_number, idx_fingerprint_corrected, :num_indices] = tanimoto_scores[indices] + _fill_pairs_for_row_between_sets( + selected_pairs_per_bin, + selected_scores_per_bin, + tanimoto_scores, + row_index, + 0, + max_pairs_per_bin, + selection_bins, + ) return selected_pairs_per_bin, selected_scores_per_bin @@ -406,38 +443,42 @@ def _compute_tanimoto_similarity_per_bin_between_sets_sparse_count( bins_i = fingerprints_1_bins[idx_fingerprint_i] counts_i = fingerprints_1_counts[idx_fingerprint_i] tanimoto_scores = tanimoto_scores_row_sparse_count( - bins_i, counts_i, fingerprints_2_bins, fingerprints_2_counts + bins_i, + counts_i, + fingerprints_2_bins, + fingerprints_2_counts, ) - for bin_number in range(num_bins): - selection_bin = selection_bins[bin_number] - indices = np.nonzero((tanimoto_scores > selection_bin[0]) & (tanimoto_scores <= selection_bin[1]))[0] - - np.random.shuffle(indices) - indices = indices[:max_pairs_per_bin] - num_indices = len(indices) - - selected_scores_per_bin[bin_number, idx_fingerprint_i, :num_indices] = tanimoto_scores[indices] - selected_pairs_per_bin[bin_number, idx_fingerprint_i, :num_indices] = indices + size_1 + _fill_pairs_for_row_between_sets( + selected_pairs_per_bin, + selected_scores_per_bin, + tanimoto_scores, + idx_fingerprint_i, + size_1, + max_pairs_per_bin, + selection_bins, + ) for idx_fingerprint_j in prange(size_2): bins_j = fingerprints_2_bins[idx_fingerprint_j] counts_j = fingerprints_2_counts[idx_fingerprint_j] - idx_fingerprint_corrected = idx_fingerprint_j + size_1 + row_index = idx_fingerprint_j + size_1 tanimoto_scores = tanimoto_scores_row_sparse_count( - bins_j, counts_j, fingerprints_1_bins, fingerprints_1_counts + bins_j, + counts_j, + fingerprints_1_bins, + fingerprints_1_counts, ) - for bin_number in range(num_bins): - selection_bin = selection_bins[bin_number] - indices = np.nonzero((tanimoto_scores > selection_bin[0]) & (tanimoto_scores 
<= selection_bin[1]))[0] - - np.random.shuffle(indices) - indices = indices[:max_pairs_per_bin] - num_indices = len(indices) - - selected_pairs_per_bin[bin_number, idx_fingerprint_corrected, :num_indices] = indices - selected_scores_per_bin[bin_number, idx_fingerprint_corrected, :num_indices] = tanimoto_scores[indices] + _fill_pairs_for_row_between_sets( + selected_pairs_per_bin, + selected_scores_per_bin, + tanimoto_scores, + row_index, + 0, + max_pairs_per_bin, + selection_bins, + ) return selected_pairs_per_bin, selected_scores_per_bin From a947b7ff0b1ab7a42c61f704df927ba71b592c5e Mon Sep 17 00:00:00 2001 From: Florian Huber Date: Fri, 27 Mar 2026 14:46:46 +0100 Subject: [PATCH 12/12] linting --- ms2deepscore/fingerprint_similarity_computations.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/ms2deepscore/fingerprint_similarity_computations.py b/ms2deepscore/fingerprint_similarity_computations.py index 7f363571..bf450b26 100644 --- a/ms2deepscore/fingerprint_similarity_computations.py +++ b/ms2deepscore/fingerprint_similarity_computations.py @@ -132,10 +132,6 @@ def tanimoto_scores_row_sparse_count( return tanimoto_scores -from numba import jit -import numpy as np - - @jit(nopython=True, fastmath=True) def _fill_pairs_for_row_same_set( selected_pairs_per_bin,