matchms · niekdejonge · Mar 20, 2026 · Mar 20, 2026 · Mar 20, 2026 · Mar 20, 2026
diff --git a/README.md b/README.md
@@ -2,28 +2,11 @@
 
 
 # MS2Query 2.0
-more to come...
-
-## Basic workflow (so far):
-
-### Library generation
-```python
-from ms2query.create_new_library import create_new_library
-
-ms2query_lib = create_new_library(
-    spectra_files=["spectra.mgf"],
-    annotation_files=[],
-    output_folder="my_ms2query_folder/",
-    model_path="models/ms2deepscore.pt"
-)
-```
-
-### Loading already generated library
-```python
-from ms2query.create_new_library import load_created_library
-
-lib = load_created_library("my_ms2query_folder/")
-```
+A first basic implementation is out now, more to follow soon...
+The new MS2Query approach has a higher accuracy and a much simpler and faster underlying algorithm. We hope to share a first preprint soon, showing all the benchmarking. 
 
+The current runnable version still requires creating the library files, which takes some time on the first run. 
 
+Soon this will be much easier and faster. We will add downloadable precomputed files, make MS2Query pip installable, add a database and allow faster MS2DeepScore searching. 
 
+The tutorial for the current prototype can be found in notebooks/tutorial. 
diff --git a/ms2query/benchmarking/reference_methods/readme.md b/ms2query/benchmarking/reference_methods/readme.md
diff --git a/...uery/benchmarking/AnnotatedSpectrumSet.py → ...query_development/AnnotatedSpectrumSet.py b/...uery/benchmarking/AnnotatedSpectrumSet.py → ...query_development/AnnotatedSpectrumSet.py
@@ -6,7 +6,7 @@
 from matchms.importing import load_spectra
 from ms2deepscore.models import SiameseSpectralModel
 from tqdm import tqdm
-from ms2query.benchmarking.Embeddings import Embeddings
+from ms2query.ms2query_development.Embeddings import Embeddings
 
 
 class AnnotatedSpectrumSet:

diff --git a/ms2query/benchmarking/Embeddings.py → ms2query/ms2query_development/Embeddings.py b/ms2query/benchmarking/Embeddings.py → ms2query/ms2query_development/Embeddings.py
diff --git a/ms2query/benchmarking/Fingerprints.py → ...uery/ms2query_development/Fingerprints.py b/ms2query/benchmarking/Fingerprints.py → ...uery/ms2query_development/Fingerprints.py
@@ -5,8 +5,8 @@
 from matchms.filtering.metadata_processing.add_fingerprint import _derive_fingerprint_from_inchi
 from numpy.typing import NDArray
 from tqdm import tqdm
-from ms2query.benchmarking.AnnotatedSpectrumSet import AnnotatedSpectrumSet
 from ms2query.metrics import generalized_tanimoto_similarity_matrix
+from ms2query.ms2query_development.AnnotatedSpectrumSet import AnnotatedSpectrumSet
 
 
 class Fingerprints:

diff --git a/ms2query/ms2query_development/ReferenceLibrary.py b/ms2query/ms2query_development/ReferenceLibrary.py
@@ -0,0 +1,220 @@
+from collections import defaultdict
+from pathlib import Path
+from typing import Sequence
+import numpy as np
+import pandas as pd
+from matchms.importing import load_spectra
+from matchms.Spectrum import Spectrum
+from ms2deepscore.models import SiameseSpectralModel, load_model
+from ms2deepscore.vector_operations import cosine_similarity_matrix
+from tqdm import tqdm
+from ms2query.ms2query_development.AnnotatedSpectrumSet import AnnotatedSpectrumSet
+from ms2query.ms2query_development.Embeddings import Embeddings, _to_json_serializable
+from ms2query.ms2query_development.Fingerprints import Fingerprints
+from ms2query.ms2query_development.TopKTanimotoScores import TopKTanimotoScores
+
+
+class ReferenceLibrary:
+    # Set default file names to enable save and load per library
+    embedding_file_name = "embeddings.npz"
+    top_k_tanimoto_scores_file_name = "top_k_tanimoto_scores.parquet"
+    reference_metadata_file_name = "library_metadata.parquet"
+    ms2deepscore_model_file_name = "ms2deepscore_model.pt"
+    metadata_to_store = [
+        "precursor_mz",
+        "retention_time",
+        "collision_energy",
+        "compound_name",
+        "smiles",
+        "inchikey",
+    ]
+    fingerprint_type = "daylight"
+    fingerprint_nbits = 4096
+    top_k_inchikeys = 8
+
+    def __init__(
+        self,
+        ms2deepscore_model: SiameseSpectralModel,
+        reference_embeddings: Embeddings,
+        top_k_tanimoto_scores: TopKTanimotoScores,
+        reference_metadata: pd.DataFrame,
+    ):
+        self.ms2deepscore_model = ms2deepscore_model
+        self.reference_embeddings = reference_embeddings
+        self.top_k_tanimoto_scores = top_k_tanimoto_scores
+        self.reference_metadata = reference_metadata
+
+        # Check that the loaded files match
+        if _to_json_serializable(ms2deepscore_model.model_settings.get_dict()) != reference_embeddings.model_settings:
+            raise ValueError(
+                "The settings of the ms2deepscore model do not match the model used for creating the library embeddings"
+            )
+        if list(self.reference_metadata["spectrum_hashes"]) != [
+            str(spectrum_hash) for spectrum_hash in reference_embeddings.index_to_spectrum_hash
+        ]:
+            raise ValueError("The loaded metadata does not match the used embeddings")
+        if {inchikey[:14] for inchikey in reference_metadata["inchikey"]} != set(
+            top_k_tanimoto_scores.top_k_inchikeys_and_scores.index
+        ):
+            raise ValueError("The inchikeys in the metadata and in the top_k_tanimoto_scores do not match")
+
+        # Get the spectrum_indices_per_inchikey
+        self.spectrum_indices_per_inchikey = defaultdict(list)
+        for lib_spec_index, inchikey in enumerate(reference_metadata["inchikey"]):
+            self.spectrum_indices_per_inchikey[inchikey[:14]].append(lib_spec_index)
+
+    @classmethod
+    def load_from_directory(cls, library_file_directory) -> "ReferenceLibrary":
+        reference_embeddings_file = library_file_directory / cls.embedding_file_name
+        top_k_tanimoto_scores_file = library_file_directory / cls.top_k_tanimoto_scores_file_name
+        reference_metadata_file = library_file_directory / cls.reference_metadata_file_name
+        ms2deepscore_model_file_name = library_file_directory / cls.ms2deepscore_model_file_name
+        return cls.load_from_files(
+            ms2deepscore_model_file_name, reference_embeddings_file, top_k_tanimoto_scores_file, reference_metadata_file
+        )
+
+    @classmethod
+    def load_from_files(
+        cls,
+        ms2deepscore_model_file_name,
+        reference_embeddings_file,
+        top_k_tanimoto_scores_file,
+        reference_metadata_file,
+    ) -> "ReferenceLibrary":
+        return cls(
+            load_model(ms2deepscore_model_file_name),
+            Embeddings.load(reference_embeddings_file),
+            TopKTanimotoScores.load(top_k_tanimoto_scores_file),
+            pd.read_parquet(reference_metadata_file),
+        )
+
+    @classmethod
+    def create_from_spectra(
+        cls,
+        library_spectra: Sequence[Spectrum],
+        ms2deepscore_model_file_name: str,
+        store_file_directory=None,
+        store_files=True,
+    ) -> "ReferenceLibrary":
+        """Creates all the files needed for MS2Query and stores them"""
+        if store_file_directory is None:
+            store_file_directory = Path(ms2deepscore_model_file_name).parent
+        else:
+            store_file_directory = Path(store_file_directory)
+        if store_files:
+            # Check the files don't exist yet
+            for file in (
+                store_file_directory / cls.embedding_file_name,
+                store_file_directory / cls.top_k_tanimoto_scores_file_name,
+                store_file_directory / cls.reference_metadata_file_name,
+            ):
+                if file.exists():
+                    raise FileExistsError(f"There is already a file stored with the name {file}")
+
+        # library_spectra = list(tqdm(load_spectra(library_spectra_file), "Loading library spectra"))
+        library_spectrum_set = AnnotatedSpectrumSet.create_spectrum_set(library_spectra)
+        ms2deepscore_model = load_model(ms2deepscore_model_file_name)
+        library_spectrum_set.add_embeddings(ms2deepscore_model)
+
+        fingerprints = Fingerprints.from_spectrum_set(library_spectrum_set, cls.fingerprint_type, cls.fingerprint_nbits)
+        top_k_tanimoto_scores = TopKTanimotoScores.calculate_from_fingerprints(
+            fingerprints, fingerprints, cls.top_k_inchikeys
+        )
+        reference_metadata = extract_metadata_from_library(
+            library_spectrum_set,
+            cls.metadata_to_store,
+        )
+
+        if store_files:
+            reference_metadata.to_parquet(store_file_directory / cls.reference_metadata_file_name)
+            top_k_tanimoto_scores.save(store_file_directory / cls.top_k_tanimoto_scores_file_name)
+            library_spectrum_set.embeddings.save(store_file_directory / cls.embedding_file_name)
+        return cls(ms2deepscore_model, library_spectrum_set.embeddings, top_k_tanimoto_scores, reference_metadata)
+
+    def run_ms2query(
+        self,
+        query_spectra: Sequence[Spectrum],
+        batch_size: int = 1000,
+    ) -> pd.DataFrame:
+
+        query_embeddings = Embeddings.create_from_spectra(query_spectra, self.ms2deepscore_model)
+
+        num_of_query_embeddings = query_embeddings.embeddings.shape[0]
+
+        library_index_highest_ms2deepscore = np.zeros((num_of_query_embeddings), dtype=int)
+        ms2query_scores = []
+        for start_idx in tqdm(
+            range(0, num_of_query_embeddings, batch_size),
+            desc="Predicting highest ms2deepscore per batch of "
+            + str(min(batch_size, num_of_query_embeddings))
+            + " embeddings",
+        ):
+            # Do MS2DeepScore predictions for batch
+            end_idx = min(start_idx + batch_size, num_of_query_embeddings)
+            selected_query_embeddings = query_embeddings.embeddings[start_idx:end_idx]
+            score_matrix = cosine_similarity_matrix(selected_query_embeddings, self.reference_embeddings.embeddings)
+            highest_score_idx = np.argmax(score_matrix, axis=1)
+            library_index_highest_ms2deepscore[start_idx:end_idx] = highest_score_idx
+
+            # get predicted inchikeys
+            predicted_inchikeys = self.reference_metadata.iloc[highest_score_idx]["inchikey"]
+            # Compute MS2Query reliability score
+            ms2query_scores.extend(
+                get_ms2query_reliability_prediction(
+                    predicted_inchikeys, self.spectrum_indices_per_inchikey, self.top_k_tanimoto_scores, score_matrix
+                )
+            )
+
+        # construct results df
+        results = self.reference_metadata.iloc[library_index_highest_ms2deepscore]
+        results["ms2query_reliability_prediction"] = ms2query_scores
+        return results
+
+
+def run_ms2query_from_files(
+    query_spectrum_file,
+    ms2deepscore_model_file_name,
+    reference_embeddings_file,
+    top_k_tanimoto_scores_file,
+    reference_metadata_file,
+    save_file_location,
+):
+    ms2query_library = ReferenceLibrary.load_from_files(
+        ms2deepscore_model_file_name,
+        reference_embeddings_file,
+        top_k_tanimoto_scores_file,
+        reference_metadata_file,
+    )
+
+    query_spectra = list(tqdm(load_spectra(query_spectrum_file), desc="loading_in_query_spectra"))
+    results_df = ms2query_library.run_ms2query(query_spectra)
+    results_df.to_csv(save_file_location)
+
+
+def get_ms2query_reliability_prediction(
+    predicted_inchikeys: list[str],
+    spectrum_indices_per_inchikey,
+    top_k_tanimoto_scores: TopKTanimotoScores,
+    ms2deepscore_score_matrix,
+) -> list[float]:
+    ms2query_scores = []
+    for query_spectrum_index, library_inchikey in enumerate(predicted_inchikeys):
+        top_k_inchikeys = top_k_tanimoto_scores.select_top_k_inchikeys(library_inchikey[:14])
+        maximum_ms2deepscores = np.zeros(top_k_tanimoto_scores.k, dtype=float)
+        for i, inchikey in enumerate(top_k_inchikeys):
+            spectrum_indexes = spectrum_indices_per_inchikey[inchikey]
+            highest_ms2deepscore = np.max(ms2deepscore_score_matrix[query_spectrum_index, spectrum_indexes])
+            maximum_ms2deepscores[i] = highest_ms2deepscore
+        ms2query_scores.append(np.mean(maximum_ms2deepscores))
+    # todo get the spectrum hashes instead of the indexes for lookup later.
+    return ms2query_scores
+
+
+def extract_metadata_from_library(spectra: AnnotatedSpectrumSet, metadata_to_collect: list):
+    collected_metadata = {key: [] for key in metadata_to_collect}
+    collected_metadata["spectrum_hashes"] = []
+    for spectrum in tqdm(spectra.spectra, desc="Extracting metadata df from spectra"):
+        for metadata_key in metadata_to_collect:
+            collected_metadata[metadata_key].append(spectrum.get(metadata_key))
+        collected_metadata["spectrum_hashes"].append(str(spectrum.__hash__()))
+    return pd.DataFrame(collected_metadata)
diff --git a/ms2query/benchmarking/TopKTanimotoScores.py → ...s2query_development/TopKTanimotoScores.py b/ms2query/benchmarking/TopKTanimotoScores.py → ...s2query_development/TopKTanimotoScores.py
@@ -1,7 +1,8 @@
+from pathlib import Path
 import numpy as np
 import pandas as pd
-from ms2query.benchmarking.Fingerprints import Fingerprints
 from ms2query.metrics import generalized_tanimoto_similarity_matrix
+from ms2query.ms2query_development.Fingerprints import Fingerprints
 
 
 class TopKTanimotoScores:
@@ -27,13 +28,21 @@ def _create_multi_index(
         combined_data = np.empty((len(inchikey_indexes), self.k * 2), dtype=object)
         combined_data[:, 0::2] = top_k_inchikeys
         combined_data[:, 1::2] = tanimoto_scores_for_top_k
-        return pd.DataFrame(combined_data, index=inchikey_indexes, columns=columns)
+        df = pd.DataFrame(combined_data, index=inchikey_indexes, columns=columns)
+
+        # Cast score columns to float64
+        score_cols = [(rank, "score") for rank in [f"Rank_{i + 1}" for i in range(self.k)]]
+        df[score_cols] = df[score_cols].astype(float)
+
+        return df
 
     @classmethod
     def calculate_from_fingerprints(cls, query_fingerprints: Fingerprints, target_fingerprints: Fingerprints, k):
         """
         Gets the top k highest inchikeys and scores for each inchikey in query_fingerprints from target_fingerprints
         """
+        if target_fingerprints.fingerprints.shape[0] < k:
+            raise ValueError("K cannot be larger than the number of fingerprints")
         similarity_scores = generalized_tanimoto_similarity_matrix(
             query_fingerprints.fingerprints, target_fingerprints.fingerprints
         )
@@ -67,3 +76,30 @@ def get_all_average_tanimoto_scores(self) -> dict[str, float]:
 
         average_per_inchikey_df = scores_df.mean(axis=1)
         return average_per_inchikey_df.to_dict()
+
+    def save(self, path: str | Path) -> None:
+        """Save the TopKTanimotoScores to disk as a parquet file.
+
+        Args:
+            path: File path without extension, e.g. "/data/top_k_scores".
+        """
+        Path(path).with_suffix(".parquet").parent.mkdir(parents=True, exist_ok=True)
+        self.top_k_inchikeys_and_scores.to_parquet(Path(path).with_suffix(".parquet"))
+
+    @classmethod
+    def load(cls, path: str | Path) -> "TopKTanimotoScores":
+        """Load a previously saved TopKTanimotoScores from disk.
+
+        Args:
+            path: File path without extension, e.g. "/data/top_k_scores".
+
+        Returns:
+            A fully reconstructed TopKTanimotoScores instance.
+        """
+        df = pd.read_parquet(Path(path).with_suffix(".parquet"))
+        df.columns.names = ["result_rank", "attribute"]
+
+        instance = cls.__new__(cls)
+        instance.k = len(df.columns.get_level_values("result_rank").unique())
+        instance.top_k_inchikeys_and_scores = df
+        return instance
diff --git a/ms2query/benchmarking/__init__.py → ms2query/ms2query_development/__init__.py b/ms2query/benchmarking/__init__.py → ms2query/ms2query_development/__init__.py
diff --git a/...rence_methods/EvaluateExactMatchSearch.py → ...rence_methods/EvaluateExactMatchSearch.py b/...rence_methods/EvaluateExactMatchSearch.py → ...rence_methods/EvaluateExactMatchSearch.py
@@ -1,7 +1,7 @@
 import random
 from typing import Callable, List, Tuple
 from tqdm import tqdm
-from ms2query.benchmarking.AnnotatedSpectrumSet import AnnotatedSpectrumSet
+from ms2query.ms2query_development.AnnotatedSpectrumSet import AnnotatedSpectrumSet
 
 
 class EvaluateExactMatchSearchAcrossIonmodes:

diff --git a/...chmarking/MS2DeepScoresForTopInChikeys.py → ...e_methods/MS2DeepScoresForTopInChikeys.py b/...chmarking/MS2DeepScoresForTopInChikeys.py → ...e_methods/MS2DeepScoresForTopInChikeys.py
@@ -1,12 +1,12 @@
 import numpy as np
 from ms2deepscore.vector_operations import cosine_similarity_matrix
 from tqdm import tqdm
-from ms2query.benchmarking.AnnotatedSpectrumSet import AnnotatedSpectrumSet
-from ms2query.benchmarking.Fingerprints import Fingerprints
-from ms2query.benchmarking.predict_top_k_ms2deepscore import (
+from ms2query.ms2query_development.AnnotatedSpectrumSet import AnnotatedSpectrumSet
+from ms2query.ms2query_development.Fingerprints import Fingerprints
+from ms2query.ms2query_development.reference_methods.predict_top_k_ms2deepscore import (
     select_inchikeys_with_highest_ms2deepscore,
 )
-from ms2query.benchmarking.TopKTanimotoScores import TopKTanimotoScores
+from ms2query.ms2query_development.TopKTanimotoScores import TopKTanimotoScores
 
 
 def calculate_MS2DeepScoresForTopKInChikeys_from_spectra(
@@ -77,6 +77,9 @@ def calculate_MS2DeepScoresForTopKInChikeys(
 class MS2DeepScoresForTopKInChikeys:
     """Stores the MS2DeepScores and Tanimoto scores for the top k closest lib spectra
 
+    This is only needed for the benchmarking and development (in the notebooks)
+    and is not used for running the final version of MS2Query
+
     This allows for quick testing of different reranking strategies. E.g. get_mean is similar to the original MS2Query,
     but it can also be used to make matrixes with both MS2DeepScore and tanimoto scores to train small reranking models.
 

diff --git a/...enchmarking/reference_methods/__init__.py → ...development/reference_methods/__init__.py b/...enchmarking/reference_methods/__init__.py → ...development/reference_methods/__init__.py
diff --git a/...ce_methods/predict_best_possible_match.py → ...ce_methods/predict_best_possible_match.py b/...ce_methods/predict_best_possible_match.py → ...ce_methods/predict_best_possible_match.py
@@ -1,7 +1,7 @@
 from typing import Dict
 from matchms.similarity.vector_similarity_functions import jaccard_similarity_matrix
-from ms2query.benchmarking.AnnotatedSpectrumSet import AnnotatedSpectrumSet
-from ms2query.benchmarking.Fingerprints import Fingerprints
+from ms2query.ms2query_development.AnnotatedSpectrumSet import AnnotatedSpectrumSet
+from ms2query.ms2query_development.Fingerprints import Fingerprints
 
 
 def predict_best_possible_match(

diff --git a/...enchmarking/predict_top_k_ms2deepscore.py → ...nce_methods/predict_top_k_ms2deepscore.py b/...enchmarking/predict_top_k_ms2deepscore.py → ...nce_methods/predict_top_k_ms2deepscore.py
@@ -2,8 +2,8 @@
 import numpy as np
 from ms2deepscore.vector_operations import cosine_similarity_matrix
 from tqdm import tqdm
-from ms2query.benchmarking.AnnotatedSpectrumSet import AnnotatedSpectrumSet
-from ms2query.benchmarking.Embeddings import Embeddings
+from ms2query.ms2query_development.AnnotatedSpectrumSet import AnnotatedSpectrumSet
+from ms2query.ms2query_development.Embeddings import Embeddings
 
 
 def predict_top_k_ms2deepscores(

diff --git a/ms2query/ms2query_development/reference_methods/readme.md b/ms2query/ms2query_development/reference_methods/readme.md
@@ -0,0 +1 @@
+The files here are not part of the core functionality and are not used for running the final version of MS2Query. However, MS2DeepScoresForTopInChikeys and predict_top_k_ms2deepscore are core to the benchmarking and experimenting in the notebooks. predict_best_possible_match and EvaluateExactMatchSearch are not yet used for the benchmarking, but both benchmarks should still be done, so these files could be used for that.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		The files here are not part of the core functionality and are not used for running the final version of MS2Query. However, MS2DeepScoresForTopInChikeys and predict_top_k_ms2deepscore are core to the benchmarking and experimenting in the notebooks. predict_best_possible_match and EvaluateExactMatchSearch are not yet used for the benchmarking, but both benchmarks should still be done, so these files could be used for that.