From d2f3da2a8960c11c647ca852702c0f959a3282e3 Mon Sep 17 00:00:00 2001 From: wangp97 Date: Tue, 16 Jul 2024 20:22:05 +0200 Subject: [PATCH 1/6] test wird nachgereicht --- .../include/OpenMS/ANALYSIS/ID/NeighborSeq.h | 78 ++++++++ .../include/OpenMS/ANALYSIS/ID/sources.cmake | 1 + src/openms/source/ANALYSIS/ID/NeighborSeq.cpp | 137 +++++++++++++ src/openms/source/ANALYSIS/ID/sources.cmake | 1 + .../openms/source/NeighborSeq_test.cpp | 180 ++++++++++++++++++ 5 files changed, 397 insertions(+) create mode 100644 src/openms/include/OpenMS/ANALYSIS/ID/NeighborSeq.h create mode 100644 src/openms/source/ANALYSIS/ID/NeighborSeq.cpp create mode 100644 src/tests/class_tests/openms/source/NeighborSeq_test.cpp diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/NeighborSeq.h b/src/openms/include/OpenMS/ANALYSIS/ID/NeighborSeq.h new file mode 100644 index 00000000000..9db2d319893 --- /dev/null +++ b/src/openms/include/OpenMS/ANALYSIS/ID/NeighborSeq.h @@ -0,0 +1,78 @@ +// Copyright (c) 2002-present, The OpenMS Team -- EKU Tuebingen, ETH Zurich, and FU Berlin +// SPDX-License-Identifier: BSD-3-Clause +// +// -------------------------------------------------------------------------- +// $Maintainer: Sven Nahnsen $ +// $Authors: Sven Nahnsen, Andreas Bertsch, Chris Bielow, Philipp Wang $ +// -------------------------------------------------------------------------- + +// #define NEIGHBORSEQ_H +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace OpenMS; + +/** + * @brief The Neighbor Peptide functionality in the TOPPDecoyDatabase class is designed to find peptides from a given set of sequences (FASTA file) + * that are similar to a target peptide based on mass and spectral characteristics. This section will detail the methods and functionalities + * specifically related to the Neighbor Peptide search. + * + * The paper on subset neighbor search is www.ncbi.nlm.nih.gov/pmc/articles/PMC8489664/ + * PMID: 34236864 PMCID: PMC8489664 DOI: 10.1021/acs.jproteome.1c00483 + */ +class OPENMS_DLLAPI NeighborSeq +{ + +public: + /** + * @brief Calculates the monoisotopic mass of a peptide. + * @param peptide The peptide sequence. + * @return The monoisotopic mass of the peptide. + */ + static double calculateMass(const AASequence& peptide); + + /** + * @brief Generates a theoretical spectrum for a given peptide sequence. + * @param peptide_sequence The peptide sequence for which to generate the spectrum. + * @return The generated theoretical spectrum. + */ + static MSSpectrum generateSpectrum(const String& peptide_sequence); + + /** + * @brief Compares two spectra to determine if they share a sufficient number of ions. + * @param spec1 The first spectrum. + * @param spec2 The second spectrum. + * @param ion_tolerance The tolerance for the proportion of shared ions. + * @param resolution The resolution setting for the comparison ("high" or "low"). + * @return True if the spectra share a sufficient number of ions, false otherwise. + */ + static bool compareSpectra(const MSSpectrum& spec1, const MSSpectrum& spec2, const double& ion_tolerance, const std::string& resolution); + + /** + * @brief Finds neighbor peptides that are similar to a given peptide. + * @param peptides The peptide sequence to find neighbors for. + * @param neighbor_file The list of neighbor sequences to search in. + * @param mass_tolerance The mass tolerance for neighbor peptides. + * @param ion_tolerance The ion tolerance for neighbor peptides. + * @param resolution The resolution setting for the comparison ("high" or "low"). + * @return A vector of FASTA entries that are considered neighbor peptides. + */ + static std::vector findNeighborPeptides(const AASequence& peptides, + const std::vector& neighbor_file, + const double& mass_tolerance, + const double& ion_tolerance, + const String& resolution); +}; \ No newline at end of file diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake b/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake index 9484ece68a7..823a3cfb83d 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake +++ b/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake @@ -34,6 +34,7 @@ IonIdentityMolecularNetworking.h MessagePasserFactory.h MetaboliteSpectralMatching.h MorpheusScore.h +NeighborSeq.h PeptideIndexing.h PeptideProteinResolution.h PercolatorFeatureSetHelper.h diff --git a/src/openms/source/ANALYSIS/ID/NeighborSeq.cpp b/src/openms/source/ANALYSIS/ID/NeighborSeq.cpp new file mode 100644 index 00000000000..ff6a2ab8e3b --- /dev/null +++ b/src/openms/source/ANALYSIS/ID/NeighborSeq.cpp @@ -0,0 +1,137 @@ +// Copyright (c) 2002-present, The OpenMS Team -- EKU Tuebingen, ETH Zurich, and FU Berlin +// SPDX-License-Identifier: BSD-3-Clause +// +// -------------------------------------------------------------------------- +// $Maintainer: Sven Nahnsen $ +// $Authors: Sven Nahnsen, Andreas Bertsch, Chris Bielow, Philipp Wang $ +// -------------------------------------------------------------------------- + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +using namespace OpenMS; +using namespace std; + + +// Function to calculate the mass of a peptide +double NeighborSeq::calculateMass(const AASequence& peptide) +{ +return peptide.getMonoWeight(); +} + + +// Function to generate the theoretical spectrum for a given peptide sequence +MSSpectrum NeighborSeq::generateSpectrum(const String& peptide_sequence) +{ +AASequence peptide = AASequence::fromString(peptide_sequence); +TheoreticalSpectrumGenerator spec_gen; +// Set parameters for the spectrum generator +Param params; +params.setValue("add_b_ions", "true"); +params.setValue("add_y_ions", "true"); +spec_gen.setParameters(params); +// Generate the spectrum and store it in an MSSpectrum object +MSSpectrum spectrum; +spec_gen.getSpectrum(spectrum, peptide, 1, 1); +return spectrum; +} + +// Function to compare two spectra and determine if they are similar +bool NeighborSeq::compareSpectra(const MSSpectrum& spec1, const MSSpectrum& spec2, const double& ion_tolerance, const string& resolution) +{ +// Determine the bin size based on resolution +double bin_size; +if (resolution == "high") { bin_size = 1.0005079; } +else { bin_size = 0.05; } + +// Maps to hold the binned spectra +map bins1, bins2; + +// Lambda function to bin a spectrum +auto bin_spectrum = [&](const MSSpectrum& spec, map& bins) { + for (const auto& peak : spec) + { + int bin = static_cast(peak.getMZ() / bin_size); + bins[bin]++; + } +}; + +// Bin both spectra +bin_spectrum(spec1, bins1); +bin_spectrum(spec2, bins2); + +// Calculate the size of the spectra and the number of shared bins +int B1 = spec1.size(); +int B2 = spec2.size(); +int B12 = 0; + +for (const auto& bin : bins1) +{ + if (bins2.find(bin.first) != bins2.end()) { B12 += min(bin.second, bins2[bin.first]); } +} + +// Calculate the fraction of shared bins +double fraction_shared = static_cast(2 * B12) / (B1 + B2); +// cout << fraction_shared << endl; +return fraction_shared > ion_tolerance; +} + +// Method to find neighbor peptides in a given FASTA file +vector NeighborSeq::findNeighborPeptides(const AASequence& peptides, + const vector& neighbor_file, + const double& mass_tolerance, + const double& ion_tolerance, + const String& resolution) + +{ +vector result_entries; + +// Calculate the mass and generate the spectrum for the input peptide +size_t size = peptides.size(); +double mass = calculateMass(peptides); +String peptide_sequence = peptides.toString(); +MSSpectrum spec = generateSpectrum(peptide_sequence); +// Iterate through the neighbor file entries +for (const auto& entry : neighbor_file) +{ + for (size_t offset = 0; offset + size <= entry.sequence.size(); ++offset) + { + String neighbor_sequence = entry.sequence.substr(offset, size); + // Check if the sequence contains an 'X' + if (neighbor_sequence.find('X') == String::npos) + { + double neighbor_mass = calculateMass(AASequence::fromString(neighbor_sequence)); + double mass_diff = std::abs(mass - neighbor_mass) / ((mass + neighbor_mass) / 2); + // Check if the mass difference is within the tolerance + if (mass_diff <= mass_tolerance) + { + // cout << mass_diff << endl; + MSSpectrum neighbor_spec = generateSpectrum(neighbor_sequence); + // Compare the spectra and add to results if they are similar + if (compareSpectra(spec, neighbor_spec, ion_tolerance, resolution)) + { + // Construct FASTAFile::FASTAEntry and add to results + FASTAFile::FASTAEntry neighbor_entry(entry.identifier, entry.description, neighbor_sequence); + result_entries.push_back(neighbor_entry); + } + } + } + } +} +return result_entries; +} + diff --git a/src/openms/source/ANALYSIS/ID/sources.cmake b/src/openms/source/ANALYSIS/ID/sources.cmake index 41cc05a6d59..de1522abfb9 100644 --- a/src/openms/source/ANALYSIS/ID/sources.cmake +++ b/src/openms/source/ANALYSIS/ID/sources.cmake @@ -34,6 +34,7 @@ IonIdentityMolecularNetworking.cpp MessagePasserFactory.cpp MetaboliteSpectralMatching.cpp MorpheusScore.cpp +NeighborSeq.cpp PeptideProteinResolution.cpp PeptideIndexing.cpp PercolatorFeatureSetHelper.cpp diff --git a/src/tests/class_tests/openms/source/NeighborSeq_test.cpp b/src/tests/class_tests/openms/source/NeighborSeq_test.cpp new file mode 100644 index 00000000000..a53a8ebe691 --- /dev/null +++ b/src/tests/class_tests/openms/source/NeighborSeq_test.cpp @@ -0,0 +1,180 @@ +// Copyright (c) 2002-present, The OpenMS Team -- EKU Tuebingen, ETH Zurich, and FU Berlin +// SPDX-License-Identifier: BSD-3-Clause +// +// -------------------------------------------------------------------------- +// $Maintainer: Chris Bielow $ +// $Authors: Andreas Bertsch $ +// -------------------------------------------------------------------------- + +#include +#include +#include + +using namespace OpenMS; +using namespace std; + +START_TEST(NeighborSeq, "$Id$") + +//NS()=delete; + + +START_SECTION(double calculateMass(const AASequence& peptide)) +{ + double mass_1 = NeighborSeq::calculateMass("PEPTIDE"); + double mass_2 = NeighborSeq::calculateMass("PPPPPPP"); + double mass_3 = NeighborSeq::calculateMass("PEPEP"); + // Compare the calculated masses with expected values + TEST_REAL_SIMILAR(mass_1, 799.815); + TEST_REAL_SIMILAR(mass_2, 697.814); + TEST_REAL_SIMILAR(mass_3, 567.579); +} +END_SECTION + +// Test section for the generateSpectrum function +START_SECTION(MSSpectrum generateSpectrum(const String& peptide_sequence)) +{ + MSSpectrum spec_1 = NeighborSeq::generateSpectrum("PEPT"); + MSSpectrum spec_2 = NeighborSeq::generateSpectrum("PPP"); + MSSpectrum spec_3 = NeighborSeq::generateSpectrum("AR"); + + // Define the expected spectrum for "PEPT" + MSSpectrum spec_1_test; + + Peak1D peak1; + peak1.setMZ(120.06552075); + peak1.setIntensity(1); + spec_1_test.push_back(peak1); + + Peak1D peak2; + peak2.setMZ(217.11828498); + peak2.setIntensity(1); + spec_1_test.push_back(peak2); + + Peak1D peak3; + peak3.setMZ(227.10263491); + peak3.setIntensity(1); + spec_1_test.push_back(peak3); + + Peak1D peak4; + peak4.setMZ(324.15539914); + peak4.setIntensity(1); + spec_1_test.push_back(peak4); + + Peak1D peak5; + peak5.setMZ(346.16087920); + peak5.setIntensity(1); + spec_1_test.push_back(peak5); + +// Define the expected spectrum for "PPP" + MSSpectrum spec_2_test; + Peak1D peak1; + peak1.setMZ(116.07060575); + peak1.setIntensity(1); + spec_2_test.push_back(peak1); + + Peak1D peak2; + peak2.setMZ(195.11280491); + peak2.setIntensity(1); + spec_2_test.push_back(peak2); + + Peak1D peak3; + peak3.setMZ(213.12336998); + peak3.setIntensity(1); + spec_3_test.push_back(peak3); + +// Define the expected spectrum for "AR" + MSSpectrum spec_3_test; + Peak1D peak1; + peak1.setMZ(175.11895291); + peak1.setIntensity(1); + spec_3_test.push_back(peak1); + +// Compare the generated spectra with the expected spectra + TEST_EQUAL(spec_1, spec_1_test); + TEST_EQUAL(spec_1, spec_2_test); + TEST_EQUAL(spec_1, spec_3_test); + TEST_EQUAL(spec_2, spec_1_test); + TEST_EQUAL(spec_2, spec_2_test); + TEST_EQUAL(spec_2, spec_3_test); + TEST_EQUAL(spec_3, spec_1_test); + TEST_EQUAL(spec_3, spec_2_test); + TEST_EQUAL(spec_3, spec_3_test); +} +END_SECTION + +// Test section for the compareSpectra function +START_SECTION(bool compareSpectra(const MSSpectrum& spec1, const MSSpectrum& spec2, const double& ion_tolerance, const string& resolution)) +{ + MSSpectrum spec1; + spec1.push_back(Peak1D(100.0, 1.0)); + spec1.push_back(Peak1D(200.0, 1.0)); + spec1.push_back(Peak1D(300.0, 1.0)); + spec1.push_back(Peak1D(400.0, 1.0)); + + + MSSpectrum spec2; + spec2.push_back(Peak1D(100.05, 1.0)); + spec2.push_back(Peak1D(200.05, 1.0)); + spec2.push_back(Peak1D(300.05, 1.0)); + spec2.push_back(Peak1D(400.06, 1.0)); + + + MSSpectrum spec3; + spec3.push_back(Peak1D(101.05, 1.0)); + spec3.push_back(Peak1D(201.05, 1.0)); + spec3.push_back(Peak1D(301.05, 1.0)); + spec3.push_back(Peak1D(301.06, 1.0)); + + + double ion_tolerance = 0.25; + + bool compare_1_2_low = NeighborSeq::compareSpectra(spec1, spec2, ion_tolerance, "low"); + bool compare_1_3_low = NeighborSeq::compareSpectra(spec1, spec3, ion_tolerance, "low"); + bool compare_2_3_low = NeighborSeq::compareSpectra(spec2, spec3, ion_tolerance, "low"); + + bool compare_1_2_high = NeighborSeq::compareSpectra(spec1, spec2, ion_tolerance, "high"); + bool compare_1_3_high = NeighborSeq::compareSpectra(spec1, spec3, ion_tolerance, "high"); + bool compare_2_3_high = NeighborSeq::compareSpectra(spec2, spec3, ion_tolerance, "high"); + +// Verify the results of the comparisons + TEST_EQUAL(compare_1_2_low, true) + TEST_EQUAL(compare_1_3_low, false) + TEST_EQUAL(compare_1_3_low, false) + TEST_EQUAL(compare_1_2_high, true) + TEST_EQUAL(compare_1_3_high, false) + TEST_EQUAL(compare_2_3_high, true) +} +END_SECTION + + +// Test section for the findNeighborPeptides_ function +START_SECTION(vector findNeighborPeptides(const AASequence& peptides, + const vector& neighbor_file, + const double& mass_tolerance, + const double& ion_tolerance, + const String& resolution)) +{ + AASequence peptide = AASequence::fromString("VGLPINQR") + + vector neighbor_file; + neighbor_file.push_back(FASTAFile::FASTAEntry("protein1", "RIPLANGR")); + neighbor_file.push_back(FASTAFile::FASTAEntry("protein2", "PPPPPPPP")); + neighbor_file.push_back(FASTAFile::FASTAEntry("protein3", "VGLPINQR")); + neighbor_file.push_back(FASTAFile::FASTAEntry("protein4", "RIPLANGR")); + + double mass_tolerance = 0.00004; + double ion_tolerance = 0.25; + String resolution = "low"; + + vector neighbors = findNeighborPeptides(peptide, neighbor_file, mass_tolerance, ion_tolerance, resolution); + vector neighbors_test; + neighbor_test.push_back(FASTAFile::FASTAEntry("RIPLANGR")); + neighbor_test.push_back(FASTAFile::FASTAEntry("VGLPINQRgit clone git@github.com:_YOURUSERNAME_/OpenMS.git")); + + // Verify the found neighbors match the expected neighbors + TEST_EQUAL(neighbor, neighbor_test); + +} +END_SECTION + +END_TEST From 25a840eaabbc1caa7387b35fe4477fc69ee5bb63 Mon Sep 17 00:00:00 2001 From: wangp97 Date: Tue, 16 Jul 2024 20:26:38 +0200 Subject: [PATCH 2/6] test wird nachgereicht --- src/topp/DecoyDatabase.cpp | 322 ++++++++++++++++++++++--------------- 1 file changed, 193 insertions(+), 129 deletions(-) diff --git a/src/topp/DecoyDatabase.cpp b/src/topp/DecoyDatabase.cpp index c67f7caf6d0..9cb3792f5b9 100644 --- a/src/topp/DecoyDatabase.cpp +++ b/src/topp/DecoyDatabase.cpp @@ -16,6 +16,13 @@ #include #include #include +#include +#include +#include +#include +#include + + using namespace OpenMS; using namespace std; @@ -54,6 +61,11 @@ and terminates the program if decoys are found. @verbinclude TOPP_DecoyDatabase.cli INI file documentation of this tool: @htmlinclude TOPP_DecoyDatabase.html + + +The Neighbor Peptide functionality in the TOPPDecoyDatabase class is designed to find peptides from a given set of sequences (FASTA file) that are +similar to a target peptide based on mass and spectral characteristics. This section will detail the methods and functionalities specifically related +to the Neighbor Peptide search. */ // We do not want this class to show up in the docu: @@ -73,7 +85,7 @@ class TOPPDecoyDatabase : { registerInputFileList_("in", "", ListUtils::create(""), "Input FASTA file(s), each containing a database. It is recommended to include a contaminant database as well."); setValidFormats_("in", ListUtils::create("fasta")); - registerOutputFile_("out", "", "", "Output FASTA file where the decoy database will be written to."); + registerOutputFile_("out", "", "", "Output FASTA file where the decoy database or neighbor peptide will be written to."); setValidFormats_("out", ListUtils::create("fasta")); registerStringOption_("decoy_string", "", "DECOY_", "String that is combined with the accession of the protein identifier to indicate a decoy protein.", false); registerStringOption_("decoy_string_position", "", "prefix", "Should the 'decoy_string' be prepended (prefix) or appended (suffix) to the protein accession?", false); @@ -96,6 +108,15 @@ class TOPPDecoyDatabase : setValidStrings_("enzyme", all_enzymes); registerSubsection_("Decoy", "Decoy parameters section"); + + // New options for neighbor peptide search + registerTOPPSubsection_("Neighbor_Search", "Parameters for neighbor peptide search"); + registerInputFile_("Neighbor_Search:neighbor_in", "","", "Input FASTA file for neighbor peptide search", false); + setValidFormats_("Neighbor_Search:neighbor_in", {"fasta"}); + registerStringOption_("Neighbor_Search:resolution", "", "low", "Sets the resolution of the neighbor peptide search. Options are 'low'(0.05 Da) and 'high'(1.0005079 Da)",false); + setValidStrings_("Neighbor_Search:resolution", ListUtils::create("low,high")); + registerDoubleOption_("Neighbor_Search:mass_tolerance", "", 0.00004, "Tolerance of the mass of the neighbor peptide m1-m2 > tm", false, true); + registerDoubleOption_("Neighbor_Search:ion_tolerance", "", 0.25, "Tolerance of the proportion of b and y ions shared by the neighbor peptide 2*B12/B1+B2 > ti", false, true); } Param getSubsectionDefaults_(const String& /* name */) const override @@ -111,6 +132,7 @@ class TOPPDecoyDatabase : if (as_prefix) return decoy_string + identifier; else return identifier + decoy_string; } + ExitCodes main_(int, const char**) override { @@ -165,173 +187,215 @@ class TOPPDecoyDatabase : f.writeStart(out); FASTAFile::FASTAEntry entry; - // Configure Enzymatic digestion - // TODO: allow user-specified regex - ProteaseDigestion digestion; - String enzyme = getStringOption_("enzyme").trim(); - if ((input_type == SeqType::protein) && !enzyme.empty()) - { - digestion.setEnzyme(enzyme); - } - // check if decoy_string is common decoy string (e.g. decoy, rev, ...) - String decoy_string_lower = decoy_string; - decoy_string_lower.toLower(); - bool is_common = false; - for (const auto& a : DecoyHelper::affixes) + + // Neighbor peptide search + String neighbor_file = getStringOption_("Neighbor_Search:neighbor_in"); + // Check if the neighbor file option is provided + if (! neighbor_file.empty()) { - if ((decoy_string_lower.hasPrefix(a) && decoy_string_position_prefix) || (decoy_string_lower.hasSuffix(a) && !decoy_string_position_prefix)) + String resolution = getStringOption_("Neighbor_Search:resolution"); + double ion_tolerance = getDoubleOption_("Neighbor_Search:ion_tolerance"); + double mass_tolerance = getDoubleOption_("Neighbor_Search:mass_tolerance"); + // Create a ProteaseDigestion object for peptide digestion + ProteaseDigestion digestion; + String enzyme = getStringOption_("enzyme").trim(); + if ((input_type == SeqType::protein) && ! enzyme.empty()) { - is_common = true; + digestion.setEnzyme(enzyme); } - } - // terminate, if decoy_string is not one of the allowed decoy strings (exit code 11) - if (!is_common) - { - if (getFlag_("force")) + + // Vector to hold all digested peptides from the input files + vector all_peptides; + + for (Size i = 0; i < in.size(); ++i) { - OPENMS_LOG_WARN << "Force Flag is enabled, decoys with custom decoy string (not in DecoyHelper::affixes) will not be detected.\n"; + //FASTAContainer in_entries {in[i]}; + f.readStart(in[i]); + while (f.readNext(entry)) + { + + // Vector to hold the peptides generated from digestion + vector peptides; + // Digest the current entry sequence into peptides + digestion.digest(AASequence::fromString(entry.sequence), peptides); + // Add the generated peptides to the all_peptides vector + all_peptides.insert(all_peptides.end(), peptides.begin(), peptides.end()); + } } - else - { - OPENMS_LOG_FATAL_ERROR << "Given decoy string is not allowed. Please use one of the strings in DecoyHelper::affixes as either prefix or suffix (case insensitive): \n"; - return INCOMPATIBLE_INPUT_DATA; + // Load the neighbor peptides from the neighbor file + FASTAFile neighbor_fasta; + vector neighbor_entries; + neighbor_fasta.load(neighbor_file, neighbor_entries); + for (auto i = all_peptides.begin(); i != all_peptides.end(); ++i) + { + { + // Find neighbor peptides for the current peptide + vector result = NeighborSeq::findNeighborPeptides(*i, neighbor_entries, mass_tolerance, ion_tolerance, resolution); + for (const auto& sub : result) + { + // Write the neighbor peptide to the output file + f.writeNext(sub); + } + } } } - MRMDecoy m; - m.setParameters(decoy_param); - - Math::RandomShuffler shuffler(seed); - for (Size i = 0; i < in.size(); ++i) + else { - // check input files for decoys - FASTAContainer in_entries{in[i]}; - auto r = DecoyHelper::countDecoys(in_entries); - // if decoys found, throw exception - if (static_cast(r.all_prefix_occur + r.all_suffix_occur) >= 0.4 * static_cast(r.all_proteins_count)) + // Configure Enzymatic digestion + // TODO: allow user-specified regex + ProteaseDigestion digestion; + String enzyme = getStringOption_("enzyme").trim(); + if ((input_type == SeqType::protein) && ! enzyme.empty()) { digestion.setEnzyme(enzyme); } + // check if decoy_string is common decoy string (e.g. decoy, rev, ...) + String decoy_string_lower = decoy_string; + decoy_string_lower.toLower(); + bool is_common = false; + for (const auto& a : DecoyHelper::affixes) { - // if decoys found, program terminates with exit code 11 - OPENMS_LOG_FATAL_ERROR << "Invalid input in " + in[i] + ": Input file already contains decoys." << '\n'; - return INCOMPATIBLE_INPUT_DATA; + if ((decoy_string_lower.hasPrefix(a) && decoy_string_position_prefix) || (decoy_string_lower.hasSuffix(a) && ! decoy_string_position_prefix)) + { + is_common = true; + } } - - f.readStart(in[i]); - //------------------------------------------------------------- - // calculations - //------------------------------------------------------------- - while (f.readNext(entry)) + // terminate, if decoy_string is not one of the allowed decoy strings (exit code 11) + if (! is_common) { - if (identifiers.find(entry.identifier) != identifiers.end()) + if (getFlag_("force")) { - OPENMS_LOG_WARN << "DecoyDatabase: Warning, identifier '" << entry.identifier << "' occurs more than once!" << endl; + OPENMS_LOG_WARN << "Force Flag is enabled, decoys with custom decoy string (not in DecoyHelper::affixes) will not be detected.\n"; } - identifiers.insert(entry.identifier); - - if (append) + else { - f.writeNext(entry); + OPENMS_LOG_FATAL_ERROR << "Given decoy string is not allowed. Please use one of the strings in DecoyHelper::affixes as either prefix or " + "suffix (case insensitive): \n"; + return INCOMPATIBLE_INPUT_DATA; } + } + MRMDecoy m; + m.setParameters(decoy_param); - // identifier - entry.identifier = getIdentifier_(entry.identifier, decoy_string, decoy_string_position_prefix); + Math::RandomShuffler shuffler(seed); + for (Size i = 0; i < in.size(); ++i) + { + // check input files for decoys + FASTAContainer in_entries {in[i]}; + auto r = DecoyHelper::countDecoys(in_entries); + // if decoys found, throw exception + if (static_cast(r.all_prefix_occur + r.all_suffix_occur) >= 0.4 * static_cast(r.all_proteins_count)) + { + // if decoys found, program terminates with exit code 11 + OPENMS_LOG_FATAL_ERROR << "Invalid input in " + in[i] + ": Input file already contains decoys." << '\n'; + return INCOMPATIBLE_INPUT_DATA; + } - // sequence - if (input_type == SeqType::RNA) + f.readStart(in[i]); + //------------------------------------------------------------- + // calculations + //------------------------------------------------------------- + while (f.readNext(entry)) { - string quick_seq = entry.sequence; - bool five_p = (entry.sequence.front() == 'p'); - bool three_p = (entry.sequence.back() == 'p'); - if (five_p) //we don't want to reverse terminal phosphates + if (identifiers.find(entry.identifier) != identifiers.end()) { - quick_seq.erase(0, 1); - } - if (three_p) - { - quick_seq.pop_back(); + OPENMS_LOG_WARN << "DecoyDatabase: Warning, identifier '" << entry.identifier << "' occurs more than once!" << endl; } + identifiers.insert(entry.identifier); + + if (append) { f.writeNext(entry); } - vector tokenized; - std::smatch m; - std::string pattern = R"([^\[]|(\[[^\[\]]*\]))"; - std::regex re(pattern); + // identifier + entry.identifier = getIdentifier_(entry.identifier, decoy_string, decoy_string_position_prefix); - while (std::regex_search(quick_seq, m, re)) + // sequence + if (input_type == SeqType::RNA) { + string quick_seq = entry.sequence; + bool five_p = (entry.sequence.front() == 'p'); + bool three_p = (entry.sequence.back() == 'p'); + if (five_p) // we don't want to reverse terminal phosphates + { + quick_seq.erase(0, 1); + } + if (three_p) { quick_seq.pop_back(); } + + vector tokenized; + std::smatch m; + std::string pattern = R"([^\[]|(\[[^\[\]]*\]))"; + std::regex re(pattern); + + while (std::regex_search(quick_seq, m, re)) + { tokenized.emplace_back(m.str(0)); quick_seq = m.suffix(); - } + } - if (shuffle) - { - shuffler.portable_random_shuffle(tokenized.begin(), tokenized.end()); - } - else // reverse - { - reverse(tokenized.begin(), tokenized.end()); //reverse the tokens - } - if (five_p) //add back 5' - { - tokenized.insert(tokenized.begin(), String("p")); - } - if (three_p) //add back 3' - { - tokenized.emplace_back("p"); + if (shuffle) { shuffler.portable_random_shuffle(tokenized.begin(), tokenized.end()); } + else // reverse + { + reverse(tokenized.begin(), tokenized.end()); // reverse the tokens + } + if (five_p) // add back 5' + { + tokenized.insert(tokenized.begin(), String("p")); + } + if (three_p) // add back 3' + { + tokenized.emplace_back("p"); + } + entry.sequence = ListUtils::concatenate(tokenized, ""); } - entry.sequence = ListUtils::concatenate(tokenized, ""); - } - else // protein input - { - // if (terminal_aminos != "none") - if (enzyme != "no cleavage" && (keepN || keepC)) + else // protein input { - std::vector peptides; - digestion.digest(AASequence::fromString(entry.sequence), peptides); - String new_sequence = ""; - for (auto const& peptide : peptides) + // if (terminal_aminos != "none") + if (enzyme != "no cleavage" && (keepN || keepC)) { - //TODO why are the functions from TargetedExperiment and MRMDecoy not anywhere more general? - // No soul would look there. + std::vector peptides; + digestion.digest(AASequence::fromString(entry.sequence), peptides); + String new_sequence = ""; + for (auto const& peptide : peptides) + { + // TODO why are the functions from TargetedExperiment and MRMDecoy not anywhere more general? + // No soul would look there. + if (shuffle) + { + OpenMS::TargetedExperiment::Peptide p; + p.sequence = peptide.toString(); + OpenMS::TargetedExperiment::Peptide decoy_p = m.shufflePeptide(p, identity_threshold, seed, max_attempts); + new_sequence += decoy_p.sequence; + } + else + { + OpenMS::TargetedExperiment::Peptide p; + p.sequence = peptide.toString(); + OpenMS::TargetedExperiment::Peptide decoy_p = MRMDecoy::reversePeptide(p, keepN, keepC, keep_const_pattern); + new_sequence += decoy_p.sequence; + } + } + entry.sequence = new_sequence; + } + else + { + // sequence if (shuffle) { - OpenMS::TargetedExperiment::Peptide p; - p.sequence = peptide.toString(); - OpenMS::TargetedExperiment::Peptide decoy_p = m.shufflePeptide(p, identity_threshold, seed, max_attempts); - new_sequence += decoy_p.sequence; + shuffler.seed(seed); // identical proteins are shuffled the same way -> re-seed + shuffler.portable_random_shuffle(entry.sequence.begin(), entry.sequence.end()); } - else + else // reverse { - OpenMS::TargetedExperiment::Peptide p; - p.sequence = peptide.toString(); - OpenMS::TargetedExperiment::Peptide decoy_p = MRMDecoy::reversePeptide(p, keepN, keepC, keep_const_pattern); - new_sequence += decoy_p.sequence; + entry.sequence.reverse(); } } - entry.sequence = new_sequence; - } - else - { - // sequence - if (shuffle) - { - shuffler.seed(seed); // identical proteins are shuffled the same way -> re-seed - shuffler.portable_random_shuffle(entry.sequence.begin(), entry.sequence.end()); - } - else // reverse - { - entry.sequence.reverse(); - } } - } - - //------------------------------------------------------------- - // writing output - //------------------------------------------------------------- - f.writeNext(entry); - } // next protein - } // input files + //------------------------------------------------------------- + // writing output + //------------------------------------------------------------- + f.writeNext(entry); + } // next protein + } // input files + } return EXECUTION_OK; } - }; From f4ffd32e7f10270d661673a6b90d0d7104b84501 Mon Sep 17 00:00:00 2001 From: wangp97 Date: Tue, 23 Jul 2024 02:12:30 +0200 Subject: [PATCH 3/6] Decoy Database Zeile 239-248 wird noch in ein eigene Funktion geschrieben --- .../include/OpenMS/ANALYSIS/ID/NeighborSeq.h | 53 +++---- src/openms/source/ANALYSIS/ID/NeighborSeq.cpp | 146 ++++++++---------- src/topp/DecoyDatabase.cpp | 102 +++++++----- 3 files changed, 145 insertions(+), 156 deletions(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/NeighborSeq.h b/src/openms/include/OpenMS/ANALYSIS/ID/NeighborSeq.h index 9db2d319893..9eaf1ae3a73 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/NeighborSeq.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/NeighborSeq.h @@ -2,27 +2,17 @@ // SPDX-License-Identifier: BSD-3-Clause // // -------------------------------------------------------------------------- -// $Maintainer: Sven Nahnsen $ -// $Authors: Sven Nahnsen, Andreas Bertsch, Chris Bielow, Philipp Wang $ +// $Maintainer: Chris Bielow, Philipp Wang $ +// $Authors: Philipp Wang $ // -------------------------------------------------------------------------- -// #define NEIGHBORSEQ_H #pragma once -#include -#include -#include -#include -#include -#include -#include + #include +#include +#include #include -#include -#include #include -#include -#include - using namespace OpenMS; /** @@ -37,12 +27,6 @@ class OPENMS_DLLAPI NeighborSeq { public: - /** - * @brief Calculates the monoisotopic mass of a peptide. - * @param peptide The peptide sequence. - * @return The monoisotopic mass of the peptide. - */ - static double calculateMass(const AASequence& peptide); /** * @brief Generates a theoretical spectrum for a given peptide sequence. @@ -55,24 +39,27 @@ class OPENMS_DLLAPI NeighborSeq * @brief Compares two spectra to determine if they share a sufficient number of ions. * @param spec1 The first spectrum. * @param spec2 The second spectrum. - * @param ion_tolerance The tolerance for the proportion of shared ions. - * @param resolution The resolution setting for the comparison ("high" or "low"). + * @param min_shared_ion_fraction The tolerance for the proportion of shared ions. + * @param mz_bin_size The mz_bin_size setting for the comparison. * @return True if the spectra share a sufficient number of ions, false otherwise. */ - static bool compareSpectra(const MSSpectrum& spec1, const MSSpectrum& spec2, const double& ion_tolerance, const std::string& resolution); + static bool compareSpectra(const MSSpectrum& spec1, const MSSpectrum& spec2, const double& min_shared_ion_fraction, const double& mz_bin_size); + + + static std::map> NeighborSeq::createMassPositonMap(const std::vector& candidates); /** * @brief Finds neighbor peptides that are similar to a given peptide. - * @param peptides The peptide sequence to find neighbors for. - * @param neighbor_file The list of neighbor sequences to search in. + * @param peptide The peptide sequence to find neighbors for. + * @param neighbor_candidate The list of neighbor sequences to search in. * @param mass_tolerance The mass tolerance for neighbor peptides. - * @param ion_tolerance The ion tolerance for neighbor peptides. - * @param resolution The resolution setting for the comparison ("high" or "low"). + * @param min_shared_ion_fraction The ion tolerance for neighbor peptides. + * @param mz_bin_size The mz_bin_size setting for the comparison is default 0.05 (the original study suggests ‘high’ (0.05 Da) and ‘low’ (1.0005079 Da) mz_bin_size). * @return A vector of FASTA entries that are considered neighbor peptides. */ - static std::vector findNeighborPeptides(const AASequence& peptides, - const std::vector& neighbor_file, - const double& mass_tolerance, - const double& ion_tolerance, - const String& resolution); + static std::vector findNeighborPeptides(const AASequence& peptides, + const std::vector& neighbor_candidate, + const std::vector& candidate_position, + const double& min_shared_ion_fraction, + const double& mz_bin_size); }; \ No newline at end of file diff --git a/src/openms/source/ANALYSIS/ID/NeighborSeq.cpp b/src/openms/source/ANALYSIS/ID/NeighborSeq.cpp index ff6a2ab8e3b..0c16e04c442 100644 --- a/src/openms/source/ANALYSIS/ID/NeighborSeq.cpp +++ b/src/openms/source/ANALYSIS/ID/NeighborSeq.cpp @@ -7,31 +7,17 @@ // -------------------------------------------------------------------------- #include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include #include #include #include -#include + using namespace OpenMS; using namespace std; -// Function to calculate the mass of a peptide -double NeighborSeq::calculateMass(const AASequence& peptide) -{ -return peptide.getMonoWeight(); -} // Function to generate the theoretical spectrum for a given peptide sequence @@ -51,87 +37,85 @@ return spectrum; } // Function to compare two spectra and determine if they are similar -bool NeighborSeq::compareSpectra(const MSSpectrum& spec1, const MSSpectrum& spec2, const double& ion_tolerance, const string& resolution) +bool NeighborSeq::compareSpectra(const MSSpectrum& spec1, const MSSpectrum& spec2, const double& min_shared_ion_fraction, const double& mz_bin_size) { -// Determine the bin size based on resolution -double bin_size; -if (resolution == "high") { bin_size = 1.0005079; } -else { bin_size = 0.05; } - -// Maps to hold the binned spectra -map bins1, bins2; + std::map bins1, bins2; -// Lambda function to bin a spectrum -auto bin_spectrum = [&](const MSSpectrum& spec, map& bins) { + // Lambda function to bin a spectrum into a map + auto bin_spectrum = [&](const MSSpectrum& spec, std::map& bins) { for (const auto& peak : spec) - { - int bin = static_cast(peak.getMZ() / bin_size); - bins[bin]++; - } -}; + bins[static_cast(peak.getMZ() / mz_bin_size)]++; + }; -// Bin both spectra -bin_spectrum(spec1, bins1); -bin_spectrum(spec2, bins2); + // Bin both spectra + bin_spectrum(spec1, bins1); + bin_spectrum(spec2, bins2); -// Calculate the size of the spectra and the number of shared bins -int B1 = spec1.size(); -int B2 = spec2.size(); -int B12 = 0; + // Extract bin keys as vectors + std::vector vec_bins1, vec_bins2; + for (const auto& bin : bins1) + vec_bins1.push_back(bin.first); + for (const auto& bin : bins2) + vec_bins2.push_back(bin.first); -for (const auto& bin : bins1) -{ - if (bins2.find(bin.first) != bins2.end()) { B12 += min(bin.second, bins2[bin.first]); } + // Calculate the intersection of the bin vectors + std::vector intersection; + std::set_intersection(vec_bins1.begin(), vec_bins1.end(), vec_bins2.begin(), vec_bins2.end(), std::back_inserter(intersection)); + + // Calculate the number of shared bins considering the bin frequencies + int B12 = 0; + for (int bin : intersection) + B12 += std::min(bins1[bin], bins2[bin]); + + // Calculate the fraction of shared bins + double fraction_shared = static_cast(2 * B12) / (spec1.size() + spec2.size()); + + + return fraction_shared > min_shared_ion_fraction; } -// Calculate the fraction of shared bins -double fraction_shared = static_cast(2 * B12) / (B1 + B2); -// cout << fraction_shared << endl; -return fraction_shared > ion_tolerance; +map> NeighborSeq::createMassPositonMap(const vector& candidates) +{ + map> mass_position_map; + + //Iterate through the vector of AASequence objects + for (size_t i = 0; i < candidates.size(); ++i) + { + // Calculate the mono-isotopic mass of the sequence + double mass = candidates[i].getMonoWeight(); + // Insert the mass and the position into the map + mass_position_map[mass].push_back(i); + } + return mass_position_map; } // Method to find neighbor peptides in a given FASTA file -vector NeighborSeq::findNeighborPeptides(const AASequence& peptides, - const vector& neighbor_file, - const double& mass_tolerance, - const double& ion_tolerance, - const String& resolution) +vector NeighborSeq::findNeighborPeptides(const AASequence& peptides, + const vector& neighbor_candidate, + const vector& candidate_position, + const double& min_shared_ion_fraction, + const double& mz_bin_size) { -vector result_entries; - -// Calculate the mass and generate the spectrum for the input peptide -size_t size = peptides.size(); -double mass = calculateMass(peptides); -String peptide_sequence = peptides.toString(); -MSSpectrum spec = generateSpectrum(peptide_sequence); -// Iterate through the neighbor file entries -for (const auto& entry : neighbor_file) -{ - for (size_t offset = 0; offset + size <= entry.sequence.size(); ++offset) - { - String neighbor_sequence = entry.sequence.substr(offset, size); - // Check if the sequence contains an 'X' - if (neighbor_sequence.find('X') == String::npos) + vector result_entries; + String rel_sequence = peptides.toString(); + MSSpectrum spec = generateSpectrum(rel_sequence); + for (size_t i = 0; i < candidate_position.size(); i++) { - double neighbor_mass = calculateMass(AASequence::fromString(neighbor_sequence)); - double mass_diff = std::abs(mass - neighbor_mass) / ((mass + neighbor_mass) / 2); - // Check if the mass difference is within the tolerance - if (mass_diff <= mass_tolerance) - { - // cout << mass_diff << endl; - MSSpectrum neighbor_spec = generateSpectrum(neighbor_sequence); - // Compare the spectra and add to results if they are similar - if (compareSpectra(spec, neighbor_spec, ion_tolerance, resolution)) - { - // Construct FASTAFile::FASTAEntry and add to results - FASTAFile::FASTAEntry neighbor_entry(entry.identifier, entry.description, neighbor_sequence); - result_entries.push_back(neighbor_entry); - } - } + String neighbor_sequence = neighbor_candidate[candidate_position[i]].toString(); + //cout << neighbor_sequence << endl; + // Check if the sequence contains an 'X' + if (neighbor_sequence.find('X') == String::npos) + { + MSSpectrum neighbor_spec = generateSpectrum(neighbor_sequence); + + // Compare the spectra and add to results if they are similar + if (compareSpectra(spec, neighbor_spec, min_shared_ion_fraction, mz_bin_size)) + { + result_entries.push_back(candidate_position[i]); + } + } } - } -} return result_entries; } diff --git a/src/topp/DecoyDatabase.cpp b/src/topp/DecoyDatabase.cpp index 9cb3792f5b9..7be71111cdd 100644 --- a/src/topp/DecoyDatabase.cpp +++ b/src/topp/DecoyDatabase.cpp @@ -17,10 +17,7 @@ #include #include #include -#include -#include -#include -#include + @@ -65,7 +62,8 @@ and terminates the program if decoys are found. The Neighbor Peptide functionality in the TOPPDecoyDatabase class is designed to find peptides from a given set of sequences (FASTA file) that are similar to a target peptide based on mass and spectral characteristics. This section will detail the methods and functionalities specifically related -to the Neighbor Peptide search. +to the Neighbor Peptide search. +NeighborSeq class has more details look at *openms\source\ANALYSIS\ID\NeighborSeq.cpp */ // We do not want this class to show up in the docu: @@ -85,7 +83,8 @@ class TOPPDecoyDatabase : { registerInputFileList_("in", "", ListUtils::create(""), "Input FASTA file(s), each containing a database. It is recommended to include a contaminant database as well."); setValidFormats_("in", ListUtils::create("fasta")); - registerOutputFile_("out", "", "", "Output FASTA file where the decoy database or neighbor peptide will be written to."); + registerOutputFile_("out", "", "", "Output FASTA file where the decoy database will be written to."); + registerOutputFile_("out_neighbor", "", "", "Output FASTA file where the neighbor peptide will be written to."); setValidFormats_("out", ListUtils::create("fasta")); registerStringOption_("decoy_string", "", "DECOY_", "String that is combined with the accession of the protein identifier to indicate a decoy protein.", false); registerStringOption_("decoy_string_position", "", "prefix", "Should the 'decoy_string' be prepended (prefix) or appended (suffix) to the protein accession?", false); @@ -111,12 +110,12 @@ class TOPPDecoyDatabase : // New options for neighbor peptide search registerTOPPSubsection_("Neighbor_Search", "Parameters for neighbor peptide search"); - registerInputFile_("Neighbor_Search:neighbor_in", "","", "Input FASTA file for neighbor peptide search", false); + registerInputFile_("Neighbor_Search:neighbor_in", "","", "neighbor peptides are searched for in the Fasta file entered for candiate ", false); setValidFormats_("Neighbor_Search:neighbor_in", {"fasta"}); - registerStringOption_("Neighbor_Search:resolution", "", "low", "Sets the resolution of the neighbor peptide search. Options are 'low'(0.05 Da) and 'high'(1.0005079 Da)",false); - setValidStrings_("Neighbor_Search:resolution", ListUtils::create("low,high")); - registerDoubleOption_("Neighbor_Search:mass_tolerance", "", 0.00004, "Tolerance of the mass of the neighbor peptide m1-m2 > tm", false, true); - registerDoubleOption_("Neighbor_Search:ion_tolerance", "", 0.25, "Tolerance of the proportion of b and y ions shared by the neighbor peptide 2*B12/B1+B2 > ti", false, true); + registerIntOption_("Neighbor_Search:missed_cleavages", "", 0,"Number of missed cleavages",false); + registerDoubleOption_("Neighbor_Search:mz_bin_size", "", 0.05,"Sets the mz_bin_size of the neighbor peptide search.(the original study suggests high (0.05 Da) and low (1.0005079 Da) mz_bin_size)", false); + registerDoubleOption_("Neighbor_Search:mass_tolerance", "", 0.00001, "Tolerance of the mass of the neighbor peptide m1-m2 > tm", false, true); + registerDoubleOption_("Neighbor_Search:min_shared_ion_fraction", "", 0.25, "Tolerance of the proportion of b and y ions shared by the neighbor peptide 2*B12/B1+B2 > ti", false, true); } Param getSubsectionDefaults_(const String& /* name */) const override @@ -187,33 +186,34 @@ class TOPPDecoyDatabase : f.writeStart(out); FASTAFile::FASTAEntry entry; - - // Neighbor peptide search + // Create a ProteaseDigestion object for peptide digestion + ProteaseDigestion digestion; + String enzyme = getStringOption_("enzyme").trim(); + if ((input_type == SeqType::protein) && ! enzyme.empty()) { digestion.setEnzyme(enzyme); } + + + // create neighbor peptides for the relevant peptides given in the FASTA from '-in' String neighbor_file = getStringOption_("Neighbor_Search:neighbor_in"); // Check if the neighbor file option is provided if (! neighbor_file.empty()) { - String resolution = getStringOption_("Neighbor_Search:resolution"); - double ion_tolerance = getDoubleOption_("Neighbor_Search:ion_tolerance"); + String out_neighbor = getStringOption_("out_neighbor"); + FASTAFile neig; + neig.writeStart(out_neighbor); + double mz_bin_size = getDoubleOption_("Neighbor_Search:mz_bin_size"); + double min_shared_ion_fraction = getDoubleOption_("Neighbor_Search:min_shared_ion_fraction"); double mass_tolerance = getDoubleOption_("Neighbor_Search:mass_tolerance"); - // Create a ProteaseDigestion object for peptide digestion - ProteaseDigestion digestion; - String enzyme = getStringOption_("enzyme").trim(); - if ((input_type == SeqType::protein) && ! enzyme.empty()) - { - digestion.setEnzyme(enzyme); - } - + int missed_cleavages = getIntOption_("Neighbor_Search:missed_cleavages"); + digestion.setMissedCleavages(missed_cleavages); // Vector to hold all digested peptides from the input files vector all_peptides; for (Size i = 0; i < in.size(); ++i) { //FASTAContainer in_entries {in[i]}; - f.readStart(in[i]); - while (f.readNext(entry)) + neig.readStart(in[i]); + while (neig.readNext(entry)) { - // Vector to hold the peptides generated from digestion vector peptides; // Digest the current entry sequence into peptides @@ -222,30 +222,48 @@ class TOPPDecoyDatabase : all_peptides.insert(all_peptides.end(), peptides.begin(), peptides.end()); } } - // Load the neighbor peptides from the neighbor file + // Load the neighbor peptides candidates from the neighbor file FASTAFile neighbor_fasta; vector neighbor_entries; neighbor_fasta.load(neighbor_file, neighbor_entries); + vector digested_candidate_peptides; + for (auto& entry : neighbor_entries) + { + digestion.digest(AASequence::fromString(entry.sequence), digested_candidate_peptides); + } + + map> mass_position_map = NeighborSeq::createMassPositonMap(digested_candidate_peptides); + for (auto i = all_peptides.begin(); i != all_peptides.end(); ++i) { - { - // Find neighbor peptides for the current peptide - vector result = NeighborSeq::findNeighborPeptides(*i, neighbor_entries, mass_tolerance, ion_tolerance, resolution); - for (const auto& sub : result) - { - // Write the neighbor peptide to the output file - f.writeNext(sub); - } - } + double lower_bound_key = i->getMonoWeight() - mass_tolerance; + double upper_bound_key = i->getMonoWeight() + mass_tolerance; + auto lower = mass_position_map.lower_bound(lower_bound_key); + auto upper = mass_position_map.upper_bound(upper_bound_key); + vector candidate_position; + for (auto it = lower; it != upper; ++it) + { + candidate_position.insert(candidate_position.end(), it->second.begin(), it->second.end()); + } + + // Find neighbor peptides for the current peptide + vector result = NeighborSeq::findNeighborPeptides(*i, digested_candidate_peptides, candidate_position, min_shared_ion_fraction, mz_bin_size); + for (size_t j = 0; j < result.size(); ++j) + { + // Get the corresponding neighbor entry + const auto& neighbor_entry = neighbor_entries[result[j]]; + // Create a new FASTAEntry with the neighbor peptide sequence + FASTAFile::FASTAEntry new_entry(entry.identifier, entry.description, digested_candidate_peptides[result[j]].toString()); + // Write the neighbor peptide to the output file + neig.writeNext(new_entry); + + } } + in.push_back(out_neighbor); } - else - { + // Configure Enzymatic digestion // TODO: allow user-specified regex - ProteaseDigestion digestion; - String enzyme = getStringOption_("enzyme").trim(); - if ((input_type == SeqType::protein) && ! enzyme.empty()) { digestion.setEnzyme(enzyme); } // check if decoy_string is common decoy string (e.g. decoy, rev, ...) String decoy_string_lower = decoy_string; decoy_string_lower.toLower(); @@ -393,7 +411,7 @@ class TOPPDecoyDatabase : f.writeNext(entry); } // next protein } // input files - } + return EXECUTION_OK; } }; From 15d985a60ff65198d4e6adb65e0117d90a468400 Mon Sep 17 00:00:00 2001 From: wangp97 Date: Wed, 24 Jul 2024 10:11:34 +0200 Subject: [PATCH 4/6] =?UTF-8?q?Wie=20kann=20ich=20am=20besten=20in=20.test?= =?UTF-8?q?=20ein=20map=20pr=C3=BCfen=3F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../include/OpenMS/ANALYSIS/ID/NeighborSeq.h | 108 ++++--- src/openms/source/ANALYSIS/ID/NeighborSeq.cpp | 65 +++- .../openms/source/NeighborSeq_test.cpp | 292 ++++++++++-------- src/topp/DecoyDatabase.cpp | 46 +-- 4 files changed, 301 insertions(+), 210 deletions(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/NeighborSeq.h b/src/openms/include/OpenMS/ANALYSIS/ID/NeighborSeq.h index 9eaf1ae3a73..7b6a4fb3845 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/NeighborSeq.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/NeighborSeq.h @@ -8,58 +8,72 @@ #pragma once -#include -#include -#include #include -#include -using namespace OpenMS; -/** - * @brief The Neighbor Peptide functionality in the TOPPDecoyDatabase class is designed to find peptides from a given set of sequences (FASTA file) - * that are similar to a target peptide based on mass and spectral characteristics. This section will detail the methods and functionalities - * specifically related to the Neighbor Peptide search. - * - * The paper on subset neighbor search is www.ncbi.nlm.nih.gov/pmc/articles/PMC8489664/ - * PMID: 34236864 PMCID: PMC8489664 DOI: 10.1021/acs.jproteome.1c00483 - */ -class OPENMS_DLLAPI NeighborSeq -{ -public: - /** - * @brief Generates a theoretical spectrum for a given peptide sequence. - * @param peptide_sequence The peptide sequence for which to generate the spectrum. - * @return The generated theoretical spectrum. - */ - static MSSpectrum generateSpectrum(const String& peptide_sequence); +namespace OpenMS +{ + /** + * @brief The Neighbor Peptide functionality in the TOPPDecoyDatabase class is designed to find peptides from a given set of sequences (FASTA file) + * that are similar to a target peptide based on mass and spectral characteristics. This section will detail the methods and functionalities + * specifically related to the Neighbor Peptide search. + * + * The paper on subset neighbor search is www.ncbi.nlm.nih.gov/pmc/articles/PMC8489664/ + * PMID: 34236864 PMCID: PMC8489664 DOI: 10.1021/acs.jproteome.1c00483 + */ + class OPENMS_DLLAPI NeighborSeq + { + + public: + /** + * @brief Generates a theoretical spectrum for a given peptide sequence. + * @param peptide_sequence The peptide sequence for which to generate the spectrum. + * @return The generated theoretical spectrum. + */ + static MSSpectrum generateSpectrum(const String& peptide_sequence); - /** - * @brief Compares two spectra to determine if they share a sufficient number of ions. - * @param spec1 The first spectrum. - * @param spec2 The second spectrum. - * @param min_shared_ion_fraction The tolerance for the proportion of shared ions. - * @param mz_bin_size The mz_bin_size setting for the comparison. - * @return True if the spectra share a sufficient number of ions, false otherwise. - */ - static bool compareSpectra(const MSSpectrum& spec1, const MSSpectrum& spec2, const double& min_shared_ion_fraction, const double& mz_bin_size); + /** + * @brief Compares two spectra to determine if they share a sufficient number of ions. + * @param spec1 The first spectrum. + * @param spec2 The second spectrum. + * @param min_shared_ion_fraction The tolerance for the proportion of shared ions. + * @param mz_bin_size The mz_bin_size setting for the comparison. + * @return True if the spectra share a sufficient number of ions, false otherwise. + */ + static bool compareSpectra(const MSSpectrum& spec1, const MSSpectrum& spec2, const double& min_shared_ion_fraction, const double& mz_bin_size); + /** + * @brief Finds candidate positions based on a given mono-isotopic weight and mass tolerance. + * @param mass_position_map A map where the key is the mass and the value is a vector of positions. + * @param mono_weight The mono-isotopic weight to find candidates for. + * @param mass_tolerance The allowed tolerance for matching the mass. + * @return A vector of candidate positions within the specified mass tolerance range. + */ + static std::vector findCandidatePositions(const std::map>& mass_position_map, const double& mono_weight, const double& mass_tolerance); - static std::map> NeighborSeq::createMassPositonMap(const std::vector& candidates); + + /** + * @brief Creates a map of masses to positions from a vector of peptides. (not protein) + * @param candidates A vector of AASequence objects. + * @return A map where the key is the mass and the value is a vector of positions. + */ + static std::map> NeighborSeq::createMassPositionMap(const std::vector& candidates); - /** - * @brief Finds neighbor peptides that are similar to a given peptide. - * @param peptide The peptide sequence to find neighbors for. - * @param neighbor_candidate The list of neighbor sequences to search in. - * @param mass_tolerance The mass tolerance for neighbor peptides. - * @param min_shared_ion_fraction The ion tolerance for neighbor peptides. - * @param mz_bin_size The mz_bin_size setting for the comparison is default 0.05 (the original study suggests ‘high’ (0.05 Da) and ‘low’ (1.0005079 Da) mz_bin_size). - * @return A vector of FASTA entries that are considered neighbor peptides. - */ - static std::vector findNeighborPeptides(const AASequence& peptides, - const std::vector& neighbor_candidate, - const std::vector& candidate_position, - const double& min_shared_ion_fraction, - const double& mz_bin_size); -}; \ No newline at end of file + /** + * @brief Finds neighbor peptides that are similar to a given peptide. + * @param peptide The peptide sequence (from a relevant protein) to find neighbors for. + * @param neighbor_candidate The list of neighbor sequences to search in. + * @param mass_tolerance The mass tolerance for neighbor peptides. + * @param min_shared_ion_fraction The ion tolerance for neighbor peptides. + * @param mz_bin_size The mz_bin_size setting for the comparison is default 0.05 (the original study suggests ‘high’ (0.05 Da) and ‘low’ (1.0005079 + * Da) mz_bin_size). + * @return A vector of FASTA entries that are considered neighbor peptides. + */ + static std::vector findNeighborPeptides(const AASequence& peptides, + const std::vector& neighbor_candidate, + const std::vector& candidate_position, + const double& min_shared_ion_fraction, + const double& mz_bin_size); + }; +} diff --git a/src/openms/source/ANALYSIS/ID/NeighborSeq.cpp b/src/openms/source/ANALYSIS/ID/NeighborSeq.cpp index 0c16e04c442..2df9207ac77 100644 --- a/src/openms/source/ANALYSIS/ID/NeighborSeq.cpp +++ b/src/openms/source/ANALYSIS/ID/NeighborSeq.cpp @@ -2,15 +2,15 @@ // SPDX-License-Identifier: BSD-3-Clause // // -------------------------------------------------------------------------- -// $Maintainer: Sven Nahnsen $ -// $Authors: Sven Nahnsen, Andreas Bertsch, Chris Bielow, Philipp Wang $ +// $Maintainer: Chris Bielow, Philipp Wang $ +// $Authors: Philipp Wang $ // -------------------------------------------------------------------------- #include #include #include #include -#include + @@ -70,19 +70,56 @@ bool NeighborSeq::compareSpectra(const MSSpectrum& spec1, const MSSpectrum& spec // Calculate the fraction of shared bins double fraction_shared = static_cast(2 * B12) / (spec1.size() + spec2.size()); - return fraction_shared > min_shared_ion_fraction; } -map> NeighborSeq::createMassPositonMap(const vector& candidates) +//Finds candidate positions based on a given mono-isotopic weight and mass tolerance. +vector NeighborSeq::findCandidatePositions(const map>& mass_position_map,const double& mono_weight, const double& mass_tolerance) { + // Vector to store the candidate positions + vector candidate_position; + // Calculate the lower and upper bounds for the mass tolerance range + double lower_bound_key = mono_weight; + double upper_bound_key = mono_weight; + if (mass_tolerance < 0) + { + lower_bound_key += mass_tolerance; + upper_bound_key -= mass_tolerance; + } + else + { + lower_bound_key -= mass_tolerance; + upper_bound_key += mass_tolerance; + } + + // Find the lower bound iterator in the map + auto lower = mass_position_map.lower_bound(lower_bound_key); + + // Find the upper bound iterator in the map + auto upper = mass_position_map.upper_bound(upper_bound_key); + + // Iterate through the map from the lower bound to the upper bound + for (auto it = lower; it != upper; ++it) + { + // Insert the positions from the current map entry into the candidate positions vector + candidate_position.insert(candidate_position.end(), it->second.begin(), it->second.end()); + } + return candidate_position; +} + + +//Creates a map of masses to positions from a vector of peptides.(not protein) +map> NeighborSeq::createMassPositionMap(const vector& candidates) +{ + // Map to store the mass and corresponding positions map> mass_position_map; - //Iterate through the vector of AASequence objects + // Iterate through the vector of AASequence objects for (size_t i = 0; i < candidates.size(); ++i) { // Calculate the mono-isotopic mass of the sequence double mass = candidates[i].getMonoWeight(); + // Insert the mass and the position into the map mass_position_map[mass].push_back(i); } @@ -90,20 +127,18 @@ map> NeighborSeq::createMassPositonMap(const vector NeighborSeq::findNeighborPeptides(const AASequence& peptides, - const vector& neighbor_candidate, - const vector& candidate_position, +vector NeighborSeq::findNeighborPeptides(const AASequence& peptide, + const vector& neighbor_candidates, + const vector& candidates_position, const double& min_shared_ion_fraction, const double& mz_bin_size) { vector result_entries; - String rel_sequence = peptides.toString(); - MSSpectrum spec = generateSpectrum(rel_sequence); - for (size_t i = 0; i < candidate_position.size(); i++) + MSSpectrum spec = generateSpectrum(peptide.toString()); + for (size_t i = 0; i < candidates_position.size(); i++) { - String neighbor_sequence = neighbor_candidate[candidate_position[i]].toString(); - //cout << neighbor_sequence << endl; + String neighbor_sequence = neighbor_candidates[candidates_position[i]].toString(); // Check if the sequence contains an 'X' if (neighbor_sequence.find('X') == String::npos) { @@ -112,7 +147,7 @@ vector NeighborSeq::findNeighborPeptides(const AASequence& peptides, // Compare the spectra and add to results if they are similar if (compareSpectra(spec, neighbor_spec, min_shared_ion_fraction, mz_bin_size)) { - result_entries.push_back(candidate_position[i]); + result_entries.push_back(candidates_position[i]); } } } diff --git a/src/tests/class_tests/openms/source/NeighborSeq_test.cpp b/src/tests/class_tests/openms/source/NeighborSeq_test.cpp index a53a8ebe691..664dc71c13a 100644 --- a/src/tests/class_tests/openms/source/NeighborSeq_test.cpp +++ b/src/tests/class_tests/openms/source/NeighborSeq_test.cpp @@ -5,176 +5,212 @@ // $Maintainer: Chris Bielow $ // $Authors: Andreas Bertsch $ // -------------------------------------------------------------------------- - +#include #include #include #include +#include + using namespace OpenMS; using namespace std; START_TEST(NeighborSeq, "$Id$") -//NS()=delete; - +// NS()=delete; -START_SECTION(double calculateMass(const AASequence& peptide)) -{ - double mass_1 = NeighborSeq::calculateMass("PEPTIDE"); - double mass_2 = NeighborSeq::calculateMass("PPPPPPP"); - double mass_3 = NeighborSeq::calculateMass("PEPEP"); - // Compare the calculated masses with expected values - TEST_REAL_SIMILAR(mass_1, 799.815); - TEST_REAL_SIMILAR(mass_2, 697.814); - TEST_REAL_SIMILAR(mass_3, 567.579); -} -END_SECTION // Test section for the generateSpectrum function +// The spectra were generated via TOPPView and contained b-and y-ion START_SECTION(MSSpectrum generateSpectrum(const String& peptide_sequence)) { MSSpectrum spec_1 = NeighborSeq::generateSpectrum("PEPT"); - MSSpectrum spec_2 = NeighborSeq::generateSpectrum("PPP"); - MSSpectrum spec_3 = NeighborSeq::generateSpectrum("AR"); + MSSpectrum spec_2 = NeighborSeq::generateSpectrum("AR"); + MSSpectrum spec_3 = NeighborSeq::generateSpectrum("VGLPINQR"); + + // Test "PEPT" + TEST_REAL_SIMILAR(spec_1[0].getPosition()[0], 120.0655); + TEST_REAL_SIMILAR(spec_1[1].getPosition()[0], 217.1182); + TEST_REAL_SIMILAR(spec_1[2].getPosition()[0], 227.1026); + TEST_REAL_SIMILAR(spec_1[3].getPosition()[0], 324.1553); + TEST_REAL_SIMILAR(spec_1[4].getPosition()[0], 346.1608); - // Define the expected spectrum for "PEPT" - MSSpectrum spec_1_test; - - Peak1D peak1; - peak1.setMZ(120.06552075); - peak1.setIntensity(1); - spec_1_test.push_back(peak1); - - Peak1D peak2; - peak2.setMZ(217.11828498); - peak2.setIntensity(1); - spec_1_test.push_back(peak2); - - Peak1D peak3; - peak3.setMZ(227.10263491); - peak3.setIntensity(1); - spec_1_test.push_back(peak3); - - Peak1D peak4; - peak4.setMZ(324.15539914); - peak4.setIntensity(1); - spec_1_test.push_back(peak4); - - Peak1D peak5; - peak5.setMZ(346.16087920); - peak5.setIntensity(1); - spec_1_test.push_back(peak5); - -// Define the expected spectrum for "PPP" - MSSpectrum spec_2_test; - Peak1D peak1; - peak1.setMZ(116.07060575); - peak1.setIntensity(1); - spec_2_test.push_back(peak1); - - Peak1D peak2; - peak2.setMZ(195.11280491); - peak2.setIntensity(1); - spec_2_test.push_back(peak2); - - Peak1D peak3; - peak3.setMZ(213.12336998); - peak3.setIntensity(1); - spec_3_test.push_back(peak3); - -// Define the expected spectrum for "AR" - MSSpectrum spec_3_test; - Peak1D peak1; - peak1.setMZ(175.11895291); - peak1.setIntensity(1); - spec_3_test.push_back(peak1); - -// Compare the generated spectra with the expected spectra - TEST_EQUAL(spec_1, spec_1_test); - TEST_EQUAL(spec_1, spec_2_test); - TEST_EQUAL(spec_1, spec_3_test); - TEST_EQUAL(spec_2, spec_1_test); - TEST_EQUAL(spec_2, spec_2_test); - TEST_EQUAL(spec_2, spec_3_test); - TEST_EQUAL(spec_3, spec_1_test); - TEST_EQUAL(spec_3, spec_2_test); - TEST_EQUAL(spec_3, spec_3_test); + + // Test "AR" + TEST_REAL_SIMILAR(spec_2[0].getPosition()[0], 175.1189); + + // Test "VGLPINQR" + TEST_REAL_SIMILAR(spec_3[0].getPosition()[0], 157.0971); + TEST_REAL_SIMILAR(spec_3[1].getPosition()[0], 175.1189); + TEST_REAL_SIMILAR(spec_3[2].getPosition()[0], 270.1812); + TEST_REAL_SIMILAR(spec_3[3].getPosition()[0], 303.1775); + TEST_REAL_SIMILAR(spec_3[4].getPosition()[0], 367.2339); + TEST_REAL_SIMILAR(spec_3[5].getPosition()[0], 417.2204); + TEST_REAL_SIMILAR(spec_3[6].getPosition()[0], 480.3180); + TEST_REAL_SIMILAR(spec_3[7].getPosition()[0], 530.3045); + TEST_REAL_SIMILAR(spec_3[8].getPosition()[0], 594.3609); + TEST_REAL_SIMILAR(spec_3[9].getPosition()[0], 627.3578); + TEST_REAL_SIMILAR(spec_3[10].getPosition()[0], 722.4195); + TEST_REAL_SIMILAR(spec_3[11].getPosition()[0], 740.4413); + TEST_REAL_SIMILAR(spec_3[12].getPosition()[0], 797.4628); + } END_SECTION // Test section for the compareSpectra function -START_SECTION(bool compareSpectra(const MSSpectrum& spec1, const MSSpectrum& spec2, const double& ion_tolerance, const string& resolution)) +START_SECTION(bool compareSpectra(const MSSpectrum& spec1, const MSSpectrum& spec2, const double& min_shared_ion_fraction, const double& mz_bin_size)) { MSSpectrum spec1; spec1.push_back(Peak1D(100.0, 1.0)); spec1.push_back(Peak1D(200.0, 1.0)); spec1.push_back(Peak1D(300.0, 1.0)); - spec1.push_back(Peak1D(400.0, 1.0)); - MSSpectrum spec2; spec2.push_back(Peak1D(100.05, 1.0)); spec2.push_back(Peak1D(200.05, 1.0)); spec2.push_back(Peak1D(300.05, 1.0)); - spec2.push_back(Peak1D(400.06, 1.0)); - MSSpectrum spec3; - spec3.push_back(Peak1D(101.05, 1.0)); - spec3.push_back(Peak1D(201.05, 1.0)); - spec3.push_back(Peak1D(301.05, 1.0)); - spec3.push_back(Peak1D(301.06, 1.0)); + spec3.push_back(Peak1D(101.00, 1.0)); + spec3.push_back(Peak1D(201.00, 1.0)); + spec3.push_back(Peak1D(301.00, 1.0)); + MSSpectrum spec4; + spec4.push_back(Peak1D(100.05, 1.0)); + spec4.push_back(Peak1D(201.00, 1.0)); + spec4.push_back(Peak1D(300.05, 1.0)); + spec4.push_back(Peak1D(301.00, 1.0)); - double ion_tolerance = 0.25; + + double min_shared_ion_fraction = 0.5; - bool compare_1_2_low = NeighborSeq::compareSpectra(spec1, spec2, ion_tolerance, "low"); - bool compare_1_3_low = NeighborSeq::compareSpectra(spec1, spec3, ion_tolerance, "low"); - bool compare_2_3_low = NeighborSeq::compareSpectra(spec2, spec3, ion_tolerance, "low"); - - bool compare_1_2_high = NeighborSeq::compareSpectra(spec1, spec2, ion_tolerance, "high"); - bool compare_1_3_high = NeighborSeq::compareSpectra(spec1, spec3, ion_tolerance, "high"); - bool compare_2_3_high = NeighborSeq::compareSpectra(spec2, spec3, ion_tolerance, "high"); - -// Verify the results of the comparisons - TEST_EQUAL(compare_1_2_low, true) - TEST_EQUAL(compare_1_3_low, false) - TEST_EQUAL(compare_1_3_low, false) - TEST_EQUAL(compare_1_2_high, true) - TEST_EQUAL(compare_1_3_high, false) - TEST_EQUAL(compare_2_3_high, true) + + //bin interval is from [a,b[ + bool compare_1_2_low = NeighborSeq::compareSpectra(spec1, spec2, min_shared_ion_fraction, 1.0); + TEST_TRUE(compare_1_2_low) + bool compare_1_3_low = NeighborSeq::compareSpectra(spec1, spec3, min_shared_ion_fraction, 1.0); + TEST_FALSE(compare_1_3_low) + bool compare_1_4_low = NeighborSeq::compareSpectra(spec1, spec4, min_shared_ion_fraction, 1.0); + TEST_TRUE(compare_1_4_low) + bool compare_2_3_low = NeighborSeq::compareSpectra(spec2, spec3, min_shared_ion_fraction, 1.0); + TEST_FALSE(compare_2_3_low) + bool compare_2_4_low = NeighborSeq::compareSpectra(spec2, spec4, min_shared_ion_fraction, 1.0); + TEST_TRUE(compare_2_4_low) + bool compare_3_4_low = NeighborSeq::compareSpectra(spec3, spec4, min_shared_ion_fraction, 1.0); + TEST_TRUE(compare_3_4_low) + + bool compare_1_2_high = NeighborSeq::compareSpectra(spec1, spec2, min_shared_ion_fraction, 0.05); + TEST_FALSE(compare_1_2_high) + bool compare_1_3_high = NeighborSeq::compareSpectra(spec1, spec3, min_shared_ion_fraction, 0.05); + TEST_FALSE(compare_1_3_high) + bool compare_1_4_high = NeighborSeq::compareSpectra(spec1, spec4, min_shared_ion_fraction, 0.05); + TEST_FALSE(compare_1_4_high) + bool compare_2_3_high = NeighborSeq::compareSpectra(spec2, spec3, min_shared_ion_fraction, 0.05); + TEST_FALSE(compare_2_3_high) + bool compare_2_4_high = NeighborSeq::compareSpectra(spec2, spec4, min_shared_ion_fraction, 0.05); + TEST_TRUE(compare_2_4_high) + bool compare_3_4_high = NeighborSeq::compareSpectra(spec3, spec4, min_shared_ion_fraction, 0.05); + TEST_TRUE(compare_3_4_high) } END_SECTION +// Test section for the findCandidatePositions function +START_SECTION(vector findCandidatePositions(const map>& mass_position_map, + const double& mono_weight, + const double& mass_tolerance)) +{ + map> mass_position_map_1 = {{100.0, {1, 2}}, {101.5, {3}}, {102.0, {4, 5}}}; + double mono_weight_1 = 101.0; + double mass_tolerance_1 = 1.0; + + vector expected_1 = {1, 2, 3, 4 , 5}; + vector result_1 = NeighborSeq::findCandidatePositions(mass_position_map_1, mono_weight_1, mass_tolerance_1); + TEST_EQUAL(expected_1,result_1) + + map> mass_position_map_2 = {{100.0, {1, 2}}, {101.5, {3}}, {102.0, {4, 5}}}; + double mono_weight_2 = 100.5; + double mass_tolerance_2 = 1.0; + + vector expected_2 = {1, 2, 3}; + vector result_2 = NeighborSeq::findCandidatePositions(mass_position_map_2, mono_weight_2, mass_tolerance_2); + TEST_EQUAL(expected_2, result_2) + + map> mass_position_map_3 = {{100.0, {1, 2}}, {101.5, {3}}, {102.0, {4, 5}}}; + double mono_weight_3 = 105.0; + double mass_tolerance_3 = 1.0; + + vector expected_3 = {}; + vector result_3 = NeighborSeq::findCandidatePositions(mass_position_map_3, mono_weight_3, mass_tolerance_3); + TEST_EQUAL(expected_3, result_3) + + map> mass_position_map_4 = {{100.0, {1, 2}}, {101.5, {3}}, {102.0, {4, 5}}}; + double mono_weight_4 = 100.5; + double mass_tolerance_4 = -1.0; + + vector expected_4 = {1, 2, 3}; + vector result_4 = NeighborSeq::findCandidatePositions(mass_position_map_4, mono_weight_4, mass_tolerance_4); + TEST_EQUAL(expected_4, result_4) +} +END_SECTION + +/* +START_SECTION(map> NeighborSeq::createMassPositionMap(const vector& candidates)) +{ + vector candidates_1; + AASequence seq1 = AASequence::fromString("PEPTIDE"); // 728.371 + AASequence seq2 = AASequence::fromString("PROTEIN"); // 799.349 + AASequence seq3 = AASequence::fromString("SEQUENCE");// 837.270 + + + candidates_1.push_back(seq1); // 1 position + candidates_1.push_back(seq2); // 2 position + candidates_1.push_back(seq3); // 3 position + map> expected_1 = {{728.371, {2}}, {799.349, {1}}, {837.270, {3}}}; + map> result_1 = NeighborSeq::createMassPositionMap(candidates_1); + TEST_EQUAL(expected_1, result_1) + + candidates_1.push_back(seq1); // 4 position + map> expected_2 = {{728.371, {2}}, {799.349, {1,4}}, {837.270, {3}}}; + map> result_2 = NeighborSeq::createMassPositionMap(candidates_1); + TEST_EQUAL(expected_2, result_2) + + +} +END_SECTION +*/ + + // Test section for the findNeighborPeptides_ function -START_SECTION(vector findNeighborPeptides(const AASequence& peptides, - const vector& neighbor_file, - const double& mass_tolerance, - const double& ion_tolerance, - const String& resolution)) +// The spectra were generated via TOPPView +START_SECTION(vector findNeighborPeptides(const AASequence& peptides, + const vector& neighbor_candidate, + const vector& candidate_position, + const double& min_shared_ion_fraction, + const double& mz_bin_size)) { - AASequence peptide = AASequence::fromString("VGLPINQR") - - vector neighbor_file; - neighbor_file.push_back(FASTAFile::FASTAEntry("protein1", "RIPLANGR")); - neighbor_file.push_back(FASTAFile::FASTAEntry("protein2", "PPPPPPPP")); - neighbor_file.push_back(FASTAFile::FASTAEntry("protein3", "VGLPINQR")); - neighbor_file.push_back(FASTAFile::FASTAEntry("protein4", "RIPLANGR")); - - double mass_tolerance = 0.00004; - double ion_tolerance = 0.25; - String resolution = "low"; - - vector neighbors = findNeighborPeptides(peptide, neighbor_file, mass_tolerance, ion_tolerance, resolution); - vector neighbors_test; - neighbor_test.push_back(FASTAFile::FASTAEntry("RIPLANGR")); - neighbor_test.push_back(FASTAFile::FASTAEntry("VGLPINQRgit clone git@github.com:_YOURUSERNAME_/OpenMS.git")); - - // Verify the found neighbors match the expected neighbors - TEST_EQUAL(neighbor, neighbor_test); + AASequence seq1 = AASequence::fromString("RIPLANGR"); + AASequence seq2 = AASequence::fromString("PPPPP"); + AASequence seq3 = AASequence::fromString("SEQUENCE"); + AASequence seq4 = AASequence::fromString("VGLPINQR"); + vector neighbor_candidate; + neighbor_candidate.push_back(seq1); + neighbor_candidate.push_back(seq2); + neighbor_candidate.push_back(seq3); + neighbor_candidate.push_back(seq4); + vector candidate_position = {0, 1, 2, 3}; + + vector expected = {0,3}; + double min_shared_ion_fraction = 0.25; + double mz_bin_size = 0.05; + + vector result = NeighborSeq::findNeighborPeptides(seq1, neighbor_candidate, candidate_position, min_shared_ion_fraction, mz_bin_size); + + TEST_EQUAL(expected, result) + } END_SECTION -END_TEST + +END_TEST \ No newline at end of file diff --git a/src/topp/DecoyDatabase.cpp b/src/topp/DecoyDatabase.cpp index 7be71111cdd..01a83651ef5 100644 --- a/src/topp/DecoyDatabase.cpp +++ b/src/topp/DecoyDatabase.cpp @@ -3,7 +3,7 @@ // // -------------------------------------------------------------------------- // $Maintainer: Sven Nahnsen $ -// $Authors: Sven Nahnsen, Andreas Bertsch, Chris Bielow $ +// $Authors: Sven Nahnsen, Andreas Bertsch, Chris Bielow, Philipp Wang $ // -------------------------------------------------------------------------- #include @@ -63,7 +63,7 @@ and terminates the program if decoys are found. The Neighbor Peptide functionality in the TOPPDecoyDatabase class is designed to find peptides from a given set of sequences (FASTA file) that are similar to a target peptide based on mass and spectral characteristics. This section will detail the methods and functionalities specifically related to the Neighbor Peptide search. -NeighborSeq class has more details look at *openms\source\ANALYSIS\ID\NeighborSeq.cpp +NeighborSeq class has more details look at *OpenMS\src\openms\include\OpenMS\ANALYSIS\ID\NeighborSeq.h */ // We do not want this class to show up in the docu: @@ -84,7 +84,6 @@ class TOPPDecoyDatabase : registerInputFileList_("in", "", ListUtils::create(""), "Input FASTA file(s), each containing a database. It is recommended to include a contaminant database as well."); setValidFormats_("in", ListUtils::create("fasta")); registerOutputFile_("out", "", "", "Output FASTA file where the decoy database will be written to."); - registerOutputFile_("out_neighbor", "", "", "Output FASTA file where the neighbor peptide will be written to."); setValidFormats_("out", ListUtils::create("fasta")); registerStringOption_("decoy_string", "", "DECOY_", "String that is combined with the accession of the protein identifier to indicate a decoy protein.", false); registerStringOption_("decoy_string_position", "", "prefix", "Should the 'decoy_string' be prepended (prefix) or appended (suffix) to the protein accession?", false); @@ -112,6 +111,7 @@ class TOPPDecoyDatabase : registerTOPPSubsection_("Neighbor_Search", "Parameters for neighbor peptide search"); registerInputFile_("Neighbor_Search:neighbor_in", "","", "neighbor peptides are searched for in the Fasta file entered for candiate ", false); setValidFormats_("Neighbor_Search:neighbor_in", {"fasta"}); + registerOutputFile_("Neighbor_Search:out_neighbor", "", "", "Output FASTA file where the neighbor peptide will be written to."); registerIntOption_("Neighbor_Search:missed_cleavages", "", 0,"Number of missed cleavages",false); registerDoubleOption_("Neighbor_Search:mz_bin_size", "", 0.05,"Sets the mz_bin_size of the neighbor peptide search.(the original study suggests high (0.05 Da) and low (1.0005079 Da) mz_bin_size)", false); registerDoubleOption_("Neighbor_Search:mass_tolerance", "", 0.00001, "Tolerance of the mass of the neighbor peptide m1-m2 > tm", false, true); @@ -197,14 +197,27 @@ class TOPPDecoyDatabase : // Check if the neighbor file option is provided if (! neighbor_file.empty()) { - String out_neighbor = getStringOption_("out_neighbor"); + if (input_type != SeqType::protein) + { + OPENMS_LOG_ERROR << "Parameter settings are invalid. When requesting neighbor peptides, the input type must be 'protein', not 'RNA'.\n"; + return INCOMPATIBLE_INPUT_DATA; + } + // Create a ProteaseDigestion object for neighbor peptide digestion + ProteaseDigestion digestion_neighbor; + String enzyme = getStringOption_("enzyme").trim(); + if ((input_type == SeqType::protein) && ! enzyme.empty()) { digestion_neighbor.setEnzyme(enzyme); } + //------------------------------------------------------------- + // parsing neighbor parameters + //------------------------------------------------------------- + String out_neighbor = getStringOption_("Neighbor_Search:out_neighbor"); FASTAFile neig; neig.writeStart(out_neighbor); + double mz_bin_size = getDoubleOption_("Neighbor_Search:mz_bin_size"); double min_shared_ion_fraction = getDoubleOption_("Neighbor_Search:min_shared_ion_fraction"); double mass_tolerance = getDoubleOption_("Neighbor_Search:mass_tolerance"); int missed_cleavages = getIntOption_("Neighbor_Search:missed_cleavages"); - digestion.setMissedCleavages(missed_cleavages); + digestion_neighbor.setMissedCleavages(missed_cleavages); // Vector to hold all digested peptides from the input files vector all_peptides; @@ -217,7 +230,7 @@ class TOPPDecoyDatabase : // Vector to hold the peptides generated from digestion vector peptides; // Digest the current entry sequence into peptides - digestion.digest(AASequence::fromString(entry.sequence), peptides); + digestion_neighbor.digest(AASequence::fromString(entry.sequence), peptides); // Add the generated peptides to the all_peptides vector all_peptides.insert(all_peptides.end(), peptides.begin(), peptides.end()); } @@ -229,25 +242,18 @@ class TOPPDecoyDatabase : vector digested_candidate_peptides; for (auto& entry : neighbor_entries) { - digestion.digest(AASequence::fromString(entry.sequence), digested_candidate_peptides); + digestion_neighbor.digest(AASequence::fromString(entry.sequence), digested_candidate_peptides); } - - map> mass_position_map = NeighborSeq::createMassPositonMap(digested_candidate_peptides); + // Creates a map of masses to positions from a vector + map> mass_position_map = NeighborSeq::createMassPositionMap(digested_candidate_peptides); for (auto i = all_peptides.begin(); i != all_peptides.end(); ++i) { - double lower_bound_key = i->getMonoWeight() - mass_tolerance; - double upper_bound_key = i->getMonoWeight() + mass_tolerance; - auto lower = mass_position_map.lower_bound(lower_bound_key); - auto upper = mass_position_map.upper_bound(upper_bound_key); - vector candidate_position; - for (auto it = lower; it != upper; ++it) - { - candidate_position.insert(candidate_position.end(), it->second.begin(), it->second.end()); - } - + // Finde position of neighbor peptides candidates + vector candiadate_position = NeighborSeq::findCandidatePositions(mass_position_map, i->getMonoWeight(), mass_tolerance); + // Find neighbor peptides for the current peptide - vector result = NeighborSeq::findNeighborPeptides(*i, digested_candidate_peptides, candidate_position, min_shared_ion_fraction, mz_bin_size); + vector result = NeighborSeq::findNeighborPeptides(*i, digested_candidate_peptides, candiadate_position, min_shared_ion_fraction, mz_bin_size); for (size_t j = 0; j < result.size(); ++j) { // Get the corresponding neighbor entry From ab386c943734f27b7ee7604512c7d4075daf19c7 Mon Sep 17 00:00:00 2001 From: wangp97 Date: Wed, 24 Jul 2024 23:10:18 +0200 Subject: [PATCH 5/6] besteht test nicht --- .../include/OpenMS/ANALYSIS/ID/NeighborSeq.h | 11 ++- src/openms/source/ANALYSIS/ID/NeighborSeq.cpp | 68 +++++++++++++--- .../openms/source/NeighborSeq_test.cpp | 79 +++++++++++++++++-- src/topp/DecoyDatabase.cpp | 8 +- 4 files changed, 143 insertions(+), 23 deletions(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/NeighborSeq.h b/src/openms/include/OpenMS/ANALYSIS/ID/NeighborSeq.h index 7b6a4fb3845..3c851d0705e 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/NeighborSeq.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/NeighborSeq.h @@ -75,5 +75,14 @@ namespace OpenMS const std::vector& candidate_position, const double& min_shared_ion_fraction, const double& mz_bin_size); - }; + + /**is only a test function for compareSpectra, it works the same way, only the common peaks are output. + * @brief Compares two spectra to determine if they share a sufficient number of ions. + * @param spec1 The first spectrum. + * @param spec2 The second spectrum. + * @param mz_bin_size The mz_bin_size setting for the comparison. + * @return True if the spectra share a sufficient number of ions, false otherwise. + */ + static int compareSpectraTest(const MSSpectrum& spec1, const MSSpectrum& spec2, const double& mz_bin_size); + }; } diff --git a/src/openms/source/ANALYSIS/ID/NeighborSeq.cpp b/src/openms/source/ANALYSIS/ID/NeighborSeq.cpp index 2df9207ac77..c366bffceb7 100644 --- a/src/openms/source/ANALYSIS/ID/NeighborSeq.cpp +++ b/src/openms/source/ANALYSIS/ID/NeighborSeq.cpp @@ -117,11 +117,18 @@ map> NeighborSeq::createMassPositionMap(const vector NeighborSeq::findNeighborPeptides(const AASequence& peptide, // Check if the sequence contains an 'X' if (neighbor_sequence.find('X') == String::npos) { - MSSpectrum neighbor_spec = generateSpectrum(neighbor_sequence); - - // Compare the spectra and add to results if they are similar - if (compareSpectra(spec, neighbor_spec, min_shared_ion_fraction, mz_bin_size)) - { - result_entries.push_back(candidates_position[i]); - } - } + MSSpectrum neighbor_spec = generateSpectrum(neighbor_candidates[candidates_position[i]].toString()); + // Compare the spectra and add to results if they are similar + if (compareSpectra(spec, neighbor_spec, min_shared_ion_fraction, mz_bin_size)) + { + result_entries.push_back(candidates_position[i]); + } + } + else + { + //throw Exception::InvalidValue(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Cannot get peaks of sequence with unknown AA 'X'.", toString()); + } } return result_entries; } + + +// is only a test function for compareSpectra, it works the same way, only the common peaks are output. +int NeighborSeq::compareSpectraTest(const MSSpectrum& spec1, const MSSpectrum& spec2, const double& mz_bin_size) +{ + std::map bins1, bins2; + + // Lambda function to bin a spectrum into a map + auto bin_spectrum = [&](const MSSpectrum& spec, std::map& bins) { + for (const auto& peak : spec) + bins[static_cast(peak.getMZ() / mz_bin_size)]++; + }; + + // Bin both spectra + bin_spectrum(spec1, bins1); + bin_spectrum(spec2, bins2); + + // Extract bin keys as vectors + std::vector vec_bins1, vec_bins2; + for (const auto& bin : bins1) + vec_bins1.push_back(bin.first); + for (const auto& bin : bins2) + vec_bins2.push_back(bin.first); + + // Calculate the intersection of the bin vectors + std::vector intersection; + std::set_intersection(vec_bins1.begin(), vec_bins1.end(), vec_bins2.begin(), vec_bins2.end(), std::back_inserter(intersection)); + // Calculate the number of shared bins considering the bin frequencies + int shared_peaks = 0; + for (int bin : intersection) + shared_peaks += min(bins1[bin], bins2[bin]); + + return shared_peaks; +} \ No newline at end of file diff --git a/src/tests/class_tests/openms/source/NeighborSeq_test.cpp b/src/tests/class_tests/openms/source/NeighborSeq_test.cpp index 664dc71c13a..e1378347676 100644 --- a/src/tests/class_tests/openms/source/NeighborSeq_test.cpp +++ b/src/tests/class_tests/openms/source/NeighborSeq_test.cpp @@ -126,7 +126,7 @@ START_SECTION(vector findCandidatePositions(const map>& vector expected_1 = {1, 2, 3, 4 , 5}; vector result_1 = NeighborSeq::findCandidatePositions(mass_position_map_1, mono_weight_1, mass_tolerance_1); - TEST_EQUAL(expected_1,result_1) + TEST_EQUAL(result_1,expected_1) map> mass_position_map_2 = {{100.0, {1, 2}}, {101.5, {3}}, {102.0, {4, 5}}}; double mono_weight_2 = 100.5; @@ -134,7 +134,7 @@ START_SECTION(vector findCandidatePositions(const map>& vector expected_2 = {1, 2, 3}; vector result_2 = NeighborSeq::findCandidatePositions(mass_position_map_2, mono_weight_2, mass_tolerance_2); - TEST_EQUAL(expected_2, result_2) + TEST_EQUAL(result_2,expected_2) map> mass_position_map_3 = {{100.0, {1, 2}}, {101.5, {3}}, {102.0, {4, 5}}}; double mono_weight_3 = 105.0; @@ -142,7 +142,7 @@ START_SECTION(vector findCandidatePositions(const map>& vector expected_3 = {}; vector result_3 = NeighborSeq::findCandidatePositions(mass_position_map_3, mono_weight_3, mass_tolerance_3); - TEST_EQUAL(expected_3, result_3) + TEST_EQUAL(result_3, expected_3) map> mass_position_map_4 = {{100.0, {1, 2}}, {101.5, {3}}, {102.0, {4, 5}}}; double mono_weight_4 = 100.5; @@ -150,7 +150,7 @@ START_SECTION(vector findCandidatePositions(const map>& vector expected_4 = {1, 2, 3}; vector result_4 = NeighborSeq::findCandidatePositions(mass_position_map_4, mono_weight_4, mass_tolerance_4); - TEST_EQUAL(expected_4, result_4) + TEST_EQUAL(result_4, expected_4) } END_SECTION @@ -168,12 +168,12 @@ START_SECTION(map> NeighborSeq::createMassPositionMap(const candidates_1.push_back(seq3); // 3 position map> expected_1 = {{728.371, {2}}, {799.349, {1}}, {837.270, {3}}}; map> result_1 = NeighborSeq::createMassPositionMap(candidates_1); - TEST_EQUAL(expected_1, result_1) + TEST_EQUAL(result_1,expected_1) candidates_1.push_back(seq1); // 4 position map> expected_2 = {{728.371, {2}}, {799.349, {1,4}}, {837.270, {3}}}; map> result_2 = NeighborSeq::createMassPositionMap(candidates_1); - TEST_EQUAL(expected_2, result_2) + TEST_EQUAL(result_2,expected_2) } @@ -194,14 +194,19 @@ START_SECTION(vector findNeighborPeptides(const AASequence& peptides, AASequence seq2 = AASequence::fromString("PPPPP"); AASequence seq3 = AASequence::fromString("SEQUENCE"); AASequence seq4 = AASequence::fromString("VGLPINQR"); + AASequence seq5 = AASequence::fromString("A"); + AASequence seq6 = AASequence::fromString("X"); vector neighbor_candidate; neighbor_candidate.push_back(seq1); neighbor_candidate.push_back(seq2); neighbor_candidate.push_back(seq3); + neighbor_candidate.push_back(seq1); neighbor_candidate.push_back(seq4); - vector candidate_position = {0, 1, 2, 3}; + neighbor_candidate.push_back(seq5); + neighbor_candidate.push_back(seq6); + vector candidate_position = {0, 1, 2, 3, 4, 5}; - vector expected = {0,3}; + vector expected = {0,3, 4}; double min_shared_ion_fraction = 0.25; double mz_bin_size = 0.05; @@ -213,4 +218,62 @@ START_SECTION(vector findNeighborPeptides(const AASequence& peptides, END_SECTION +// Test section for the compareSpectra function +START_SECTION(bool compareSpectraTest(const MSSpectrum& spec1, const MSSpectrum& spec2, const double& mz_bin_size)) +{ + MSSpectrum spec1; + spec1.push_back(Peak1D(100.0, 1.0)); + spec1.push_back(Peak1D(200.0, 1.0)); + spec1.push_back(Peak1D(300.0, 1.0)); + + MSSpectrum spec2; + spec2.push_back(Peak1D(100.05, 1.0)); + spec2.push_back(Peak1D(200.05, 1.0)); + spec2.push_back(Peak1D(300.05, 1.0)); + + MSSpectrum spec3; + spec3.push_back(Peak1D(101.00, 1.0)); + spec3.push_back(Peak1D(201.00, 1.0)); + spec3.push_back(Peak1D(301.00, 1.0)); + + MSSpectrum spec4; + spec4.push_back(Peak1D(100.05, 1.0)); + spec4.push_back(Peak1D(201.00, 1.0)); + spec4.push_back(Peak1D(300.05, 1.0)); + spec4.push_back(Peak1D(301.00, 1.0)); + + + double min_shared_ion_fraction = 0.5; + + + // bin interval is from [a,b[ + bool compare_1_2_low = NeighborSeq::compareSpectraTest(spec1, spec2, 1.0); + TEST_EQUAL(compare_1_2_low,3) + bool compare_1_3_low = NeighborSeq::compareSpectraTest(spec1, spec3, 1.0); + TEST_EQUAL(compare_1_3_low,0) + bool compare_1_4_low = NeighborSeq::compareSpectraTest(spec1, spec4, 1.0); + TEST_EQUAL(compare_1_4_low,2) + bool compare_2_3_low = NeighborSeq::compareSpectraTest(spec2, spec3, 1.0); + TEST_EQUAL(compare_2_3_low,0) + bool compare_2_4_low = NeighborSeq::compareSpectraTest(spec2, spec4, 1.0); + TEST_EQUAL(compare_2_4_low,3) + bool compare_3_4_low = NeighborSeq::compareSpectraTest(spec3, spec4, 1.0); + TEST_EQUAL(compare_3_4_low,2) + + bool compare_1_2_high = NeighborSeq::compareSpectraTest(spec1, spec2, 0.05); + //TEST_EQUAL(compare_1_2_high,0) + bool compare_1_3_high = NeighborSeq::compareSpectraTest(spec1, spec3, 0.05); + TEST_EQUAL(compare_1_3_high,0) + bool compare_1_4_high = NeighborSeq::compareSpectraTest(spec1, spec4, 0.05); + //TEST_EQUAL(compare_1_4_high,0) + bool compare_2_3_high = NeighborSeq::compareSpectraTest(spec2, spec3, 0.05); + TEST_EQUAL(compare_2_3_high,0) + bool compare_2_4_high = NeighborSeq::compareSpectraTest(spec2, spec4, 0.05); + TEST_EQUAL(compare_2_4_high,2) + bool compare_3_4_high = NeighborSeq::compareSpectraTest(spec3, spec4, 0.05); + TEST_EQUAL(compare_3_4_high,2) + +} +END_SECTION + END_TEST \ No newline at end of file diff --git a/src/topp/DecoyDatabase.cpp b/src/topp/DecoyDatabase.cpp index 01a83651ef5..a26394dd256 100644 --- a/src/topp/DecoyDatabase.cpp +++ b/src/topp/DecoyDatabase.cpp @@ -111,10 +111,12 @@ class TOPPDecoyDatabase : registerTOPPSubsection_("Neighbor_Search", "Parameters for neighbor peptide search"); registerInputFile_("Neighbor_Search:neighbor_in", "","", "neighbor peptides are searched for in the Fasta file entered for candiate ", false); setValidFormats_("Neighbor_Search:neighbor_in", {"fasta"}); - registerOutputFile_("Neighbor_Search:out_neighbor", "", "", "Output FASTA file where the neighbor peptide will be written to."); + registerOutputFile_("Neighbor_Search:out_neighbor", "", "", "Output FASTA file where the neighbor peptide will be written to.",false); registerIntOption_("Neighbor_Search:missed_cleavages", "", 0,"Number of missed cleavages",false); registerDoubleOption_("Neighbor_Search:mz_bin_size", "", 0.05,"Sets the mz_bin_size of the neighbor peptide search.(the original study suggests high (0.05 Da) and low (1.0005079 Da) mz_bin_size)", false); registerDoubleOption_("Neighbor_Search:mass_tolerance", "", 0.00001, "Tolerance of the mass of the neighbor peptide m1-m2 > tm", false, true); + //setValidStrings_("Neighbor_Search:fragment_unit_ppm", {"true", "false"}); + //registerStringOption_("Neighbor_Search:fragment_unit_ppm", "", "true","calculates with dalton or ppm", false, true); registerDoubleOption_("Neighbor_Search:min_shared_ion_fraction", "", 0.25, "Tolerance of the proportion of b and y ions shared by the neighbor peptide 2*B12/B1+B2 > ti", false, true); } @@ -240,9 +242,11 @@ class TOPPDecoyDatabase : vector neighbor_entries; neighbor_fasta.load(neighbor_file, neighbor_entries); vector digested_candidate_peptides; + vector temp_peptides; for (auto& entry : neighbor_entries) { - digestion_neighbor.digest(AASequence::fromString(entry.sequence), digested_candidate_peptides); + digestion_neighbor.digest(AASequence::fromString(entry.sequence), temp_peptides); + digested_candidate_peptides.insert(digested_candidate_peptides.end(), temp_peptides.begin(), temp_peptides.end()); } // Creates a map of masses to positions from a vector map> mass_position_map = NeighborSeq::createMassPositionMap(digested_candidate_peptides); From 43717b7c7ef7b26cc2ed186fefd86eb336cd4915 Mon Sep 17 00:00:00 2001 From: wangp97 Date: Mon, 29 Jul 2024 16:36:02 +0200 Subject: [PATCH 6/6] Abgabe --- .../include/OpenMS/ANALYSIS/ID/NeighborSeq.h | 4 +- src/openms/source/ANALYSIS/ID/NeighborSeq.cpp | 54 ++++++------------- .../openms/source/NeighborSeq_test.cpp | 48 ++++++++--------- src/topp/DecoyDatabase.cpp | 44 +++++++++------ 4 files changed, 68 insertions(+), 82 deletions(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/NeighborSeq.h b/src/openms/include/OpenMS/ANALYSIS/ID/NeighborSeq.h index 3c851d0705e..e441291b025 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/NeighborSeq.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/NeighborSeq.h @@ -76,13 +76,13 @@ namespace OpenMS const double& min_shared_ion_fraction, const double& mz_bin_size); - /**is only a test function for compareSpectra, it works the same way, only the common peaks are output. + /** * @brief Compares two spectra to determine if they share a sufficient number of ions. * @param spec1 The first spectrum. * @param spec2 The second spectrum. * @param mz_bin_size The mz_bin_size setting for the comparison. * @return True if the spectra share a sufficient number of ions, false otherwise. */ - static int compareSpectraTest(const MSSpectrum& spec1, const MSSpectrum& spec2, const double& mz_bin_size); + static int compareShareSpectra(const MSSpectrum& spec1, const MSSpectrum& spec2, const double& mz_bin_size); }; } diff --git a/src/openms/source/ANALYSIS/ID/NeighborSeq.cpp b/src/openms/source/ANALYSIS/ID/NeighborSeq.cpp index c366bffceb7..e87e2e18c3e 100644 --- a/src/openms/source/ANALYSIS/ID/NeighborSeq.cpp +++ b/src/openms/source/ANALYSIS/ID/NeighborSeq.cpp @@ -39,33 +39,9 @@ return spectrum; // Function to compare two spectra and determine if they are similar bool NeighborSeq::compareSpectra(const MSSpectrum& spec1, const MSSpectrum& spec2, const double& min_shared_ion_fraction, const double& mz_bin_size) { - std::map bins1, bins2; - - // Lambda function to bin a spectrum into a map - auto bin_spectrum = [&](const MSSpectrum& spec, std::map& bins) { - for (const auto& peak : spec) - bins[static_cast(peak.getMZ() / mz_bin_size)]++; - }; - - // Bin both spectra - bin_spectrum(spec1, bins1); - bin_spectrum(spec2, bins2); - - // Extract bin keys as vectors - std::vector vec_bins1, vec_bins2; - for (const auto& bin : bins1) - vec_bins1.push_back(bin.first); - for (const auto& bin : bins2) - vec_bins2.push_back(bin.first); - - // Calculate the intersection of the bin vectors - std::vector intersection; - std::set_intersection(vec_bins1.begin(), vec_bins1.end(), vec_bins2.begin(), vec_bins2.end(), std::back_inserter(intersection)); // Calculate the number of shared bins considering the bin frequencies - int B12 = 0; - for (int bin : intersection) - B12 += std::min(bins1[bin], bins2[bin]); + int B12 = compareShareSpectra(spec1, spec2, mz_bin_size); // Calculate the fraction of shared bins double fraction_shared = static_cast(2 * B12) / (spec1.size() + spec2.size()); @@ -125,29 +101,29 @@ map> NeighborSeq::createMassPositionMap(const vector NeighborSeq::findNeighborPeptides(const AASequence& peptide, - const vector& neighbor_candidates, - const vector& candidates_position, - const double& min_shared_ion_fraction, - const double& mz_bin_size) + const vector& neighbor_candidates, + const vector& candidates_position, + const double& min_shared_ion_fraction, + const double& mz_bin_size) { vector result_entries; MSSpectrum spec = generateSpectrum(peptide.toString()); for (size_t i = 0; i < candidates_position.size(); i++) { - String neighbor_sequence = neighbor_candidates[candidates_position[i]].toString(); + // Check if the sequence contains an 'X' - if (neighbor_sequence.find('X') == String::npos) + if (neighbor_candidates[candidates_position[i]].toString().find('X') == String::npos) { MSSpectrum neighbor_spec = generateSpectrum(neighbor_candidates[candidates_position[i]].toString()); // Compare the spectra and add to results if they are similar @@ -156,18 +132,18 @@ vector NeighborSeq::findNeighborPeptides(const AASequence& peptide, result_entries.push_back(candidates_position[i]); } } - else - { + // else + // { //throw Exception::InvalidValue(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Cannot get peaks of sequence with unknown AA 'X'.", toString()); - } + // } } return result_entries; } -// is only a test function for compareSpectra, it works the same way, only the common peaks are output. -int NeighborSeq::compareSpectraTest(const MSSpectrum& spec1, const MSSpectrum& spec2, const double& mz_bin_size) + +int NeighborSeq::compareShareSpectra(const MSSpectrum& spec1, const MSSpectrum& spec2, const double& mz_bin_size) { std::map bins1, bins2; diff --git a/src/tests/class_tests/openms/source/NeighborSeq_test.cpp b/src/tests/class_tests/openms/source/NeighborSeq_test.cpp index e1378347676..76d5c1d6c6e 100644 --- a/src/tests/class_tests/openms/source/NeighborSeq_test.cpp +++ b/src/tests/class_tests/openms/source/NeighborSeq_test.cpp @@ -168,14 +168,13 @@ START_SECTION(map> NeighborSeq::createMassPositionMap(const candidates_1.push_back(seq3); // 3 position map> expected_1 = {{728.371, {2}}, {799.349, {1}}, {837.270, {3}}}; map> result_1 = NeighborSeq::createMassPositionMap(candidates_1); - TEST_EQUAL(result_1,expected_1) + // TEST_EQUAL(result_1,expected_1) candidates_1.push_back(seq1); // 4 position map> expected_2 = {{728.371, {2}}, {799.349, {1,4}}, {837.270, {3}}}; map> result_2 = NeighborSeq::createMassPositionMap(candidates_1); - TEST_EQUAL(result_2,expected_2) - - + //TEST_EQUAL(result_2,expected_2) + //TEST_TRY(result_2, expected_2); } END_SECTION */ @@ -213,13 +212,12 @@ START_SECTION(vector findNeighborPeptides(const AASequence& peptides, vector result = NeighborSeq::findNeighborPeptides(seq1, neighbor_candidate, candidate_position, min_shared_ion_fraction, mz_bin_size); TEST_EQUAL(expected, result) - } END_SECTION // Test section for the compareSpectra function -START_SECTION(bool compareSpectraTest(const MSSpectrum& spec1, const MSSpectrum& spec2, const double& mz_bin_size)) +START_SECTION(bool compareShareSpectra(const MSSpectrum& spec1, const MSSpectrum& spec2, const double& mz_bin_size)) { MSSpectrum spec1; spec1.push_back(Peak1D(100.0, 1.0)); @@ -227,9 +225,9 @@ START_SECTION(bool compareSpectraTest(const MSSpectrum& spec1, const MSSpectrum& spec1.push_back(Peak1D(300.0, 1.0)); MSSpectrum spec2; - spec2.push_back(Peak1D(100.05, 1.0)); - spec2.push_back(Peak1D(200.05, 1.0)); - spec2.push_back(Peak1D(300.05, 1.0)); + spec2.push_back(Peak1D(100.06, 1.0)); + spec2.push_back(Peak1D(200.06, 1.0)); + spec2.push_back(Peak1D(300.06, 1.0)); MSSpectrum spec3; spec3.push_back(Peak1D(101.00, 1.0)); @@ -237,9 +235,9 @@ START_SECTION(bool compareSpectraTest(const MSSpectrum& spec1, const MSSpectrum& spec3.push_back(Peak1D(301.00, 1.0)); MSSpectrum spec4; - spec4.push_back(Peak1D(100.05, 1.0)); + spec4.push_back(Peak1D(100.06, 1.0)); spec4.push_back(Peak1D(201.00, 1.0)); - spec4.push_back(Peak1D(300.05, 1.0)); + spec4.push_back(Peak1D(300.06, 1.0)); spec4.push_back(Peak1D(301.00, 1.0)); @@ -247,30 +245,30 @@ START_SECTION(bool compareSpectraTest(const MSSpectrum& spec1, const MSSpectrum& // bin interval is from [a,b[ - bool compare_1_2_low = NeighborSeq::compareSpectraTest(spec1, spec2, 1.0); + bool compare_1_2_low = NeighborSeq::compareShareSpectra(spec1, spec2, 1.0); TEST_EQUAL(compare_1_2_low,3) - bool compare_1_3_low = NeighborSeq::compareSpectraTest(spec1, spec3, 1.0); + bool compare_1_3_low = NeighborSeq::compareShareSpectra(spec1, spec3, 1.0); TEST_EQUAL(compare_1_3_low,0) - bool compare_1_4_low = NeighborSeq::compareSpectraTest(spec1, spec4, 1.0); + bool compare_1_4_low = NeighborSeq::compareShareSpectra(spec1, spec4, 1.0); TEST_EQUAL(compare_1_4_low,2) - bool compare_2_3_low = NeighborSeq::compareSpectraTest(spec2, spec3, 1.0); + bool compare_2_3_low = NeighborSeq::compareShareSpectra(spec2, spec3, 1.0); TEST_EQUAL(compare_2_3_low,0) - bool compare_2_4_low = NeighborSeq::compareSpectraTest(spec2, spec4, 1.0); + bool compare_2_4_low = NeighborSeq::compareShareSpectra(spec2, spec4, 1.0); TEST_EQUAL(compare_2_4_low,3) - bool compare_3_4_low = NeighborSeq::compareSpectraTest(spec3, spec4, 1.0); + bool compare_3_4_low = NeighborSeq::compareShareSpectra(spec3, spec4, 1.0); TEST_EQUAL(compare_3_4_low,2) - bool compare_1_2_high = NeighborSeq::compareSpectraTest(spec1, spec2, 0.05); - //TEST_EQUAL(compare_1_2_high,0) - bool compare_1_3_high = NeighborSeq::compareSpectraTest(spec1, spec3, 0.05); + bool compare_1_2_high = NeighborSeq::compareShareSpectra(spec1, spec2, 0.05); + TEST_EQUAL(compare_1_2_high,0) + bool compare_1_3_high = NeighborSeq::compareShareSpectra(spec1, spec3, 0.05); TEST_EQUAL(compare_1_3_high,0) - bool compare_1_4_high = NeighborSeq::compareSpectraTest(spec1, spec4, 0.05); - //TEST_EQUAL(compare_1_4_high,0) - bool compare_2_3_high = NeighborSeq::compareSpectraTest(spec2, spec3, 0.05); + bool compare_1_4_high = NeighborSeq::compareShareSpectra(spec1, spec4, 0.05); + TEST_EQUAL(compare_1_4_high,0) + bool compare_2_3_high = NeighborSeq::compareShareSpectra(spec2, spec3, 0.05); TEST_EQUAL(compare_2_3_high,0) - bool compare_2_4_high = NeighborSeq::compareSpectraTest(spec2, spec4, 0.05); + bool compare_2_4_high = NeighborSeq::compareShareSpectra(spec2, spec4, 0.05); TEST_EQUAL(compare_2_4_high,2) - bool compare_3_4_high = NeighborSeq::compareSpectraTest(spec3, spec4, 0.05); + bool compare_3_4_high = NeighborSeq::compareShareSpectra(spec3, spec4, 0.05); TEST_EQUAL(compare_3_4_high,2) } diff --git a/src/topp/DecoyDatabase.cpp b/src/topp/DecoyDatabase.cpp index a26394dd256..482fb217273 100644 --- a/src/topp/DecoyDatabase.cpp +++ b/src/topp/DecoyDatabase.cpp @@ -115,11 +115,13 @@ class TOPPDecoyDatabase : registerIntOption_("Neighbor_Search:missed_cleavages", "", 0,"Number of missed cleavages",false); registerDoubleOption_("Neighbor_Search:mz_bin_size", "", 0.05,"Sets the mz_bin_size of the neighbor peptide search.(the original study suggests high (0.05 Da) and low (1.0005079 Da) mz_bin_size)", false); registerDoubleOption_("Neighbor_Search:mass_tolerance", "", 0.00001, "Tolerance of the mass of the neighbor peptide m1-m2 > tm", false, true); + registerIntOption_("Neighbor_Search:min_peptide_length", "", 5, "the minimum neighbor peptide length ", false); //setValidStrings_("Neighbor_Search:fragment_unit_ppm", {"true", "false"}); //registerStringOption_("Neighbor_Search:fragment_unit_ppm", "", "true","calculates with dalton or ppm", false, true); registerDoubleOption_("Neighbor_Search:min_shared_ion_fraction", "", 0.25, "Tolerance of the proportion of b and y ions shared by the neighbor peptide 2*B12/B1+B2 > ti", false, true); } + Param getSubsectionDefaults_(const String& /* name */) const override { Param p = MRMDecoy().getDefaults(); @@ -219,6 +221,7 @@ class TOPPDecoyDatabase : double min_shared_ion_fraction = getDoubleOption_("Neighbor_Search:min_shared_ion_fraction"); double mass_tolerance = getDoubleOption_("Neighbor_Search:mass_tolerance"); int missed_cleavages = getIntOption_("Neighbor_Search:missed_cleavages"); + int min_peptide_length = getIntOption_("Neighbor_Search:min_peptide_length"); digestion_neighbor.setMissedCleavages(missed_cleavages); // Vector to hold all digested peptides from the input files vector all_peptides; @@ -248,26 +251,35 @@ class TOPPDecoyDatabase : digestion_neighbor.digest(AASequence::fromString(entry.sequence), temp_peptides); digested_candidate_peptides.insert(digested_candidate_peptides.end(), temp_peptides.begin(), temp_peptides.end()); } + + // Remove peptides shorter than min_peptide_length amino acids + digested_candidate_peptides.erase(remove_if(digested_candidate_peptides.begin(), digested_candidate_peptides.end(), + [min_peptide_length](const AASequence& peptide) { return peptide.size() < min_peptide_length; }) + ,digested_candidate_peptides.end()); + // Creates a map of masses to positions from a vector map> mass_position_map = NeighborSeq::createMassPositionMap(digested_candidate_peptides); - + for (auto i = all_peptides.begin(); i != all_peptides.end(); ++i) - { - // Finde position of neighbor peptides candidates - vector candiadate_position = NeighborSeq::findCandidatePositions(mass_position_map, i->getMonoWeight(), mass_tolerance); - - // Find neighbor peptides for the current peptide - vector result = NeighborSeq::findNeighborPeptides(*i, digested_candidate_peptides, candiadate_position, min_shared_ion_fraction, mz_bin_size); - for (size_t j = 0; j < result.size(); ++j) + { + if (i->toString().find('X') == String::npos) { - // Get the corresponding neighbor entry - const auto& neighbor_entry = neighbor_entries[result[j]]; - // Create a new FASTAEntry with the neighbor peptide sequence - FASTAFile::FASTAEntry new_entry(entry.identifier, entry.description, digested_candidate_peptides[result[j]].toString()); - // Write the neighbor peptide to the output file - neig.writeNext(new_entry); - - } + // Finde position of neighbor peptides candidates + vector candiadate_position = NeighborSeq::findCandidatePositions(mass_position_map, i->getMonoWeight(), mass_tolerance); + + // Find neighbor peptides for the current peptide + vector result + = NeighborSeq::findNeighborPeptides(*i, digested_candidate_peptides, candiadate_position, min_shared_ion_fraction, mz_bin_size); + for (size_t j = 0; j < result.size(); ++j) + { + // Get the corresponding neighbor entry + const auto& neighbor_entry = neighbor_entries[result[j]]; + // Create a new FASTAEntry with the neighbor peptide sequence + FASTAFile::FASTAEntry new_entry(entry.identifier, entry.description, digested_candidate_peptides[result[j]].toString()); + // Write the neighbor peptide to the output file + neig.writeNext(new_entry); + } + } } in.push_back(out_neighbor); }