diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/NeighborSeq.h b/src/openms/include/OpenMS/ANALYSIS/ID/NeighborSeq.h new file mode 100644 index 00000000000..e441291b025 --- /dev/null +++ b/src/openms/include/OpenMS/ANALYSIS/ID/NeighborSeq.h @@ -0,0 +1,88 @@ +// Copyright (c) 2002-present, The OpenMS Team -- EKU Tuebingen, ETH Zurich, and FU Berlin +// SPDX-License-Identifier: BSD-3-Clause +// +// -------------------------------------------------------------------------- +// $Maintainer: Chris Bielow, Philipp Wang $ +// $Authors: Philipp Wang $ +// -------------------------------------------------------------------------- + +#pragma once + +#include + + + +namespace OpenMS +{ + /** + * @brief The Neighbor Peptide functionality in the TOPPDecoyDatabase class is designed to find peptides from a given set of sequences (FASTA file) + * that are similar to a target peptide based on mass and spectral characteristics. This section will detail the methods and functionalities + * specifically related to the Neighbor Peptide search. + * + * The paper on subset neighbor search is www.ncbi.nlm.nih.gov/pmc/articles/PMC8489664/ + * PMID: 34236864 PMCID: PMC8489664 DOI: 10.1021/acs.jproteome.1c00483 + */ + class OPENMS_DLLAPI NeighborSeq + { + + public: + /** + * @brief Generates a theoretical spectrum for a given peptide sequence. + * @param peptide_sequence The peptide sequence for which to generate the spectrum. + * @return The generated theoretical spectrum. + */ + static MSSpectrum generateSpectrum(const String& peptide_sequence); + + /** + * @brief Compares two spectra to determine if they share a sufficient number of ions. + * @param spec1 The first spectrum. + * @param spec2 The second spectrum. + * @param min_shared_ion_fraction The tolerance for the proportion of shared ions. + * @param mz_bin_size The mz_bin_size setting for the comparison. + * @return True if the spectra share a sufficient number of ions, false otherwise. + */ + static bool compareSpectra(const MSSpectrum& spec1, const MSSpectrum& spec2, const double& min_shared_ion_fraction, const double& mz_bin_size); + + /** + * @brief Finds candidate positions based on a given mono-isotopic weight and mass tolerance. + * @param mass_position_map A map where the key is the mass and the value is a vector of positions. + * @param mono_weight The mono-isotopic weight to find candidates for. + * @param mass_tolerance The allowed tolerance for matching the mass. + * @return A vector of candidate positions within the specified mass tolerance range. + */ + static std::vector findCandidatePositions(const std::map>& mass_position_map, const double& mono_weight, const double& mass_tolerance); + + + /** + * @brief Creates a map of masses to positions from a vector of peptides. (not protein) + * @param candidates A vector of AASequence objects. + * @return A map where the key is the mass and the value is a vector of positions. + */ + static std::map> NeighborSeq::createMassPositionMap(const std::vector& candidates); + + /** + * @brief Finds neighbor peptides that are similar to a given peptide. + * @param peptide The peptide sequence (from a relevant protein) to find neighbors for. + * @param neighbor_candidate The list of neighbor sequences to search in. + * @param mass_tolerance The mass tolerance for neighbor peptides. + * @param min_shared_ion_fraction The ion tolerance for neighbor peptides. + * @param mz_bin_size The mz_bin_size setting for the comparison is default 0.05 (the original study suggests ‘high’ (0.05 Da) and ‘low’ (1.0005079 + * Da) mz_bin_size). + * @return A vector of FASTA entries that are considered neighbor peptides. + */ + static std::vector findNeighborPeptides(const AASequence& peptides, + const std::vector& neighbor_candidate, + const std::vector& candidate_position, + const double& min_shared_ion_fraction, + const double& mz_bin_size); + + /** + * @brief Compares two spectra to determine if they share a sufficient number of ions. + * @param spec1 The first spectrum. + * @param spec2 The second spectrum. + * @param mz_bin_size The mz_bin_size setting for the comparison. + * @return True if the spectra share a sufficient number of ions, false otherwise. + */ + static int compareShareSpectra(const MSSpectrum& spec1, const MSSpectrum& spec2, const double& mz_bin_size); + }; +} diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake b/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake index 9484ece68a7..823a3cfb83d 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake +++ b/src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake @@ -34,6 +34,7 @@ IonIdentityMolecularNetworking.h MessagePasserFactory.h MetaboliteSpectralMatching.h MorpheusScore.h +NeighborSeq.h PeptideIndexing.h PeptideProteinResolution.h PercolatorFeatureSetHelper.h diff --git a/src/openms/source/ANALYSIS/ID/NeighborSeq.cpp b/src/openms/source/ANALYSIS/ID/NeighborSeq.cpp new file mode 100644 index 00000000000..e87e2e18c3e --- /dev/null +++ b/src/openms/source/ANALYSIS/ID/NeighborSeq.cpp @@ -0,0 +1,176 @@ +// Copyright (c) 2002-present, The OpenMS Team -- EKU Tuebingen, ETH Zurich, and FU Berlin +// SPDX-License-Identifier: BSD-3-Clause +// +// -------------------------------------------------------------------------- +// $Maintainer: Chris Bielow, Philipp Wang $ +// $Authors: Philipp Wang $ +// -------------------------------------------------------------------------- + +#include +#include +#include +#include + + + + +using namespace OpenMS; +using namespace std; + + + + +// Function to generate the theoretical spectrum for a given peptide sequence +MSSpectrum NeighborSeq::generateSpectrum(const String& peptide_sequence) +{ +AASequence peptide = AASequence::fromString(peptide_sequence); +TheoreticalSpectrumGenerator spec_gen; +// Set parameters for the spectrum generator +Param params; +params.setValue("add_b_ions", "true"); +params.setValue("add_y_ions", "true"); +spec_gen.setParameters(params); +// Generate the spectrum and store it in an MSSpectrum object +MSSpectrum spectrum; +spec_gen.getSpectrum(spectrum, peptide, 1, 1); +return spectrum; +} + +// Function to compare two spectra and determine if they are similar +bool NeighborSeq::compareSpectra(const MSSpectrum& spec1, const MSSpectrum& spec2, const double& min_shared_ion_fraction, const double& mz_bin_size) +{ + + // Calculate the number of shared bins considering the bin frequencies + int B12 = compareShareSpectra(spec1, spec2, mz_bin_size); + + // Calculate the fraction of shared bins + double fraction_shared = static_cast(2 * B12) / (spec1.size() + spec2.size()); + + return fraction_shared > min_shared_ion_fraction; +} + +//Finds candidate positions based on a given mono-isotopic weight and mass tolerance. +vector NeighborSeq::findCandidatePositions(const map>& mass_position_map,const double& mono_weight, const double& mass_tolerance) +{ + // Vector to store the candidate positions + vector candidate_position; + // Calculate the lower and upper bounds for the mass tolerance range + double lower_bound_key = mono_weight; + double upper_bound_key = mono_weight; + if (mass_tolerance < 0) + { + lower_bound_key += mass_tolerance; + upper_bound_key -= mass_tolerance; + } + else + { + lower_bound_key -= mass_tolerance; + upper_bound_key += mass_tolerance; + } + + // Find the lower bound iterator in the map + auto lower = mass_position_map.lower_bound(lower_bound_key); + + // Find the upper bound iterator in the map + auto upper = mass_position_map.upper_bound(upper_bound_key); + + // Iterate through the map from the lower bound to the upper bound + for (auto it = lower; it != upper; ++it) + { + // Insert the positions from the current map entry into the candidate positions vector + candidate_position.insert(candidate_position.end(), it->second.begin(), it->second.end()); + } + return candidate_position; +} + + +//Creates a map of masses to positions from a vector of peptides.(not protein) +map> NeighborSeq::createMassPositionMap(const vector& candidates) +{ + // Map to store the mass and corresponding positions + map> mass_position_map; + + // Iterate through the vector of AASequence objects + for (size_t i = 0; i < candidates.size(); ++i) + { + if (candidates[i].toString().find('X') == String::npos) + { + // Calculate the mono-isotopic mass of the sequence + double mass = candidates[i].getMonoWeight(); + + // Insert the mass and the position into the map + mass_position_map[mass].push_back(i); + } + // else + // { + //throw Exception::InvalidValue(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Cannot get weight of sequence with unknown AA 'X' with unknown mass.", toString()); + //} + } + return mass_position_map; +} + +// Method to find neighbor peptides in a given FASTA file +vector NeighborSeq::findNeighborPeptides(const AASequence& peptide, + const vector& neighbor_candidates, + const vector& candidates_position, + const double& min_shared_ion_fraction, + const double& mz_bin_size) + +{ + vector result_entries; + MSSpectrum spec = generateSpectrum(peptide.toString()); + for (size_t i = 0; i < candidates_position.size(); i++) + { + + // Check if the sequence contains an 'X' + if (neighbor_candidates[candidates_position[i]].toString().find('X') == String::npos) + { + MSSpectrum neighbor_spec = generateSpectrum(neighbor_candidates[candidates_position[i]].toString()); + // Compare the spectra and add to results if they are similar + if (compareSpectra(spec, neighbor_spec, min_shared_ion_fraction, mz_bin_size)) + { + result_entries.push_back(candidates_position[i]); + } + } + // else + // { + //throw Exception::InvalidValue(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "Cannot get peaks of sequence with unknown AA 'X'.", toString()); + // } + } +return result_entries; +} + + + + +int NeighborSeq::compareShareSpectra(const MSSpectrum& spec1, const MSSpectrum& spec2, const double& mz_bin_size) +{ + std::map bins1, bins2; + + // Lambda function to bin a spectrum into a map + auto bin_spectrum = [&](const MSSpectrum& spec, std::map& bins) { + for (const auto& peak : spec) + bins[static_cast(peak.getMZ() / mz_bin_size)]++; + }; + + // Bin both spectra + bin_spectrum(spec1, bins1); + bin_spectrum(spec2, bins2); + + // Extract bin keys as vectors + std::vector vec_bins1, vec_bins2; + for (const auto& bin : bins1) + vec_bins1.push_back(bin.first); + for (const auto& bin : bins2) + vec_bins2.push_back(bin.first); + + // Calculate the intersection of the bin vectors + std::vector intersection; + std::set_intersection(vec_bins1.begin(), vec_bins1.end(), vec_bins2.begin(), vec_bins2.end(), std::back_inserter(intersection)); + // Calculate the number of shared bins considering the bin frequencies + int shared_peaks = 0; + for (int bin : intersection) + shared_peaks += min(bins1[bin], bins2[bin]); + + return shared_peaks; +} \ No newline at end of file diff --git a/src/openms/source/ANALYSIS/ID/sources.cmake b/src/openms/source/ANALYSIS/ID/sources.cmake index 41cc05a6d59..de1522abfb9 100644 --- a/src/openms/source/ANALYSIS/ID/sources.cmake +++ b/src/openms/source/ANALYSIS/ID/sources.cmake @@ -34,6 +34,7 @@ IonIdentityMolecularNetworking.cpp MessagePasserFactory.cpp MetaboliteSpectralMatching.cpp MorpheusScore.cpp +NeighborSeq.cpp PeptideProteinResolution.cpp PeptideIndexing.cpp PercolatorFeatureSetHelper.cpp diff --git a/src/tests/class_tests/openms/source/NeighborSeq_test.cpp b/src/tests/class_tests/openms/source/NeighborSeq_test.cpp new file mode 100644 index 00000000000..76d5c1d6c6e --- /dev/null +++ b/src/tests/class_tests/openms/source/NeighborSeq_test.cpp @@ -0,0 +1,277 @@ +// Copyright (c) 2002-present, The OpenMS Team -- EKU Tuebingen, ETH Zurich, and FU Berlin +// SPDX-License-Identifier: BSD-3-Clause +// +// -------------------------------------------------------------------------- +// $Maintainer: Chris Bielow $ +// $Authors: Andreas Bertsch $ +// -------------------------------------------------------------------------- +#include +#include +#include +#include +#include + + +using namespace OpenMS; +using namespace std; + +START_TEST(NeighborSeq, "$Id$") + +// NS()=delete; + + +// Test section for the generateSpectrum function +// The spectra were generated via TOPPView and contained b-and y-ion +START_SECTION(MSSpectrum generateSpectrum(const String& peptide_sequence)) +{ + MSSpectrum spec_1 = NeighborSeq::generateSpectrum("PEPT"); + MSSpectrum spec_2 = NeighborSeq::generateSpectrum("AR"); + MSSpectrum spec_3 = NeighborSeq::generateSpectrum("VGLPINQR"); + + // Test "PEPT" + TEST_REAL_SIMILAR(spec_1[0].getPosition()[0], 120.0655); + TEST_REAL_SIMILAR(spec_1[1].getPosition()[0], 217.1182); + TEST_REAL_SIMILAR(spec_1[2].getPosition()[0], 227.1026); + TEST_REAL_SIMILAR(spec_1[3].getPosition()[0], 324.1553); + TEST_REAL_SIMILAR(spec_1[4].getPosition()[0], 346.1608); + + + // Test "AR" + TEST_REAL_SIMILAR(spec_2[0].getPosition()[0], 175.1189); + + // Test "VGLPINQR" + TEST_REAL_SIMILAR(spec_3[0].getPosition()[0], 157.0971); + TEST_REAL_SIMILAR(spec_3[1].getPosition()[0], 175.1189); + TEST_REAL_SIMILAR(spec_3[2].getPosition()[0], 270.1812); + TEST_REAL_SIMILAR(spec_3[3].getPosition()[0], 303.1775); + TEST_REAL_SIMILAR(spec_3[4].getPosition()[0], 367.2339); + TEST_REAL_SIMILAR(spec_3[5].getPosition()[0], 417.2204); + TEST_REAL_SIMILAR(spec_3[6].getPosition()[0], 480.3180); + TEST_REAL_SIMILAR(spec_3[7].getPosition()[0], 530.3045); + TEST_REAL_SIMILAR(spec_3[8].getPosition()[0], 594.3609); + TEST_REAL_SIMILAR(spec_3[9].getPosition()[0], 627.3578); + TEST_REAL_SIMILAR(spec_3[10].getPosition()[0], 722.4195); + TEST_REAL_SIMILAR(spec_3[11].getPosition()[0], 740.4413); + TEST_REAL_SIMILAR(spec_3[12].getPosition()[0], 797.4628); + +} +END_SECTION + +// Test section for the compareSpectra function +START_SECTION(bool compareSpectra(const MSSpectrum& spec1, const MSSpectrum& spec2, const double& min_shared_ion_fraction, const double& mz_bin_size)) +{ + MSSpectrum spec1; + spec1.push_back(Peak1D(100.0, 1.0)); + spec1.push_back(Peak1D(200.0, 1.0)); + spec1.push_back(Peak1D(300.0, 1.0)); + + MSSpectrum spec2; + spec2.push_back(Peak1D(100.05, 1.0)); + spec2.push_back(Peak1D(200.05, 1.0)); + spec2.push_back(Peak1D(300.05, 1.0)); + + MSSpectrum spec3; + spec3.push_back(Peak1D(101.00, 1.0)); + spec3.push_back(Peak1D(201.00, 1.0)); + spec3.push_back(Peak1D(301.00, 1.0)); + + MSSpectrum spec4; + spec4.push_back(Peak1D(100.05, 1.0)); + spec4.push_back(Peak1D(201.00, 1.0)); + spec4.push_back(Peak1D(300.05, 1.0)); + spec4.push_back(Peak1D(301.00, 1.0)); + + + double min_shared_ion_fraction = 0.5; + + + //bin interval is from [a,b[ + bool compare_1_2_low = NeighborSeq::compareSpectra(spec1, spec2, min_shared_ion_fraction, 1.0); + TEST_TRUE(compare_1_2_low) + bool compare_1_3_low = NeighborSeq::compareSpectra(spec1, spec3, min_shared_ion_fraction, 1.0); + TEST_FALSE(compare_1_3_low) + bool compare_1_4_low = NeighborSeq::compareSpectra(spec1, spec4, min_shared_ion_fraction, 1.0); + TEST_TRUE(compare_1_4_low) + bool compare_2_3_low = NeighborSeq::compareSpectra(spec2, spec3, min_shared_ion_fraction, 1.0); + TEST_FALSE(compare_2_3_low) + bool compare_2_4_low = NeighborSeq::compareSpectra(spec2, spec4, min_shared_ion_fraction, 1.0); + TEST_TRUE(compare_2_4_low) + bool compare_3_4_low = NeighborSeq::compareSpectra(spec3, spec4, min_shared_ion_fraction, 1.0); + TEST_TRUE(compare_3_4_low) + + bool compare_1_2_high = NeighborSeq::compareSpectra(spec1, spec2, min_shared_ion_fraction, 0.05); + TEST_FALSE(compare_1_2_high) + bool compare_1_3_high = NeighborSeq::compareSpectra(spec1, spec3, min_shared_ion_fraction, 0.05); + TEST_FALSE(compare_1_3_high) + bool compare_1_4_high = NeighborSeq::compareSpectra(spec1, spec4, min_shared_ion_fraction, 0.05); + TEST_FALSE(compare_1_4_high) + bool compare_2_3_high = NeighborSeq::compareSpectra(spec2, spec3, min_shared_ion_fraction, 0.05); + TEST_FALSE(compare_2_3_high) + bool compare_2_4_high = NeighborSeq::compareSpectra(spec2, spec4, min_shared_ion_fraction, 0.05); + TEST_TRUE(compare_2_4_high) + bool compare_3_4_high = NeighborSeq::compareSpectra(spec3, spec4, min_shared_ion_fraction, 0.05); + TEST_TRUE(compare_3_4_high) +} +END_SECTION + + +// Test section for the findCandidatePositions function +START_SECTION(vector findCandidatePositions(const map>& mass_position_map, + const double& mono_weight, + const double& mass_tolerance)) +{ + map> mass_position_map_1 = {{100.0, {1, 2}}, {101.5, {3}}, {102.0, {4, 5}}}; + double mono_weight_1 = 101.0; + double mass_tolerance_1 = 1.0; + + vector expected_1 = {1, 2, 3, 4 , 5}; + vector result_1 = NeighborSeq::findCandidatePositions(mass_position_map_1, mono_weight_1, mass_tolerance_1); + TEST_EQUAL(result_1,expected_1) + + map> mass_position_map_2 = {{100.0, {1, 2}}, {101.5, {3}}, {102.0, {4, 5}}}; + double mono_weight_2 = 100.5; + double mass_tolerance_2 = 1.0; + + vector expected_2 = {1, 2, 3}; + vector result_2 = NeighborSeq::findCandidatePositions(mass_position_map_2, mono_weight_2, mass_tolerance_2); + TEST_EQUAL(result_2,expected_2) + + map> mass_position_map_3 = {{100.0, {1, 2}}, {101.5, {3}}, {102.0, {4, 5}}}; + double mono_weight_3 = 105.0; + double mass_tolerance_3 = 1.0; + + vector expected_3 = {}; + vector result_3 = NeighborSeq::findCandidatePositions(mass_position_map_3, mono_weight_3, mass_tolerance_3); + TEST_EQUAL(result_3, expected_3) + + map> mass_position_map_4 = {{100.0, {1, 2}}, {101.5, {3}}, {102.0, {4, 5}}}; + double mono_weight_4 = 100.5; + double mass_tolerance_4 = -1.0; + + vector expected_4 = {1, 2, 3}; + vector result_4 = NeighborSeq::findCandidatePositions(mass_position_map_4, mono_weight_4, mass_tolerance_4); + TEST_EQUAL(result_4, expected_4) +} +END_SECTION + +/* +START_SECTION(map> NeighborSeq::createMassPositionMap(const vector& candidates)) +{ + vector candidates_1; + AASequence seq1 = AASequence::fromString("PEPTIDE"); // 728.371 + AASequence seq2 = AASequence::fromString("PROTEIN"); // 799.349 + AASequence seq3 = AASequence::fromString("SEQUENCE");// 837.270 + + + candidates_1.push_back(seq1); // 1 position + candidates_1.push_back(seq2); // 2 position + candidates_1.push_back(seq3); // 3 position + map> expected_1 = {{728.371, {2}}, {799.349, {1}}, {837.270, {3}}}; + map> result_1 = NeighborSeq::createMassPositionMap(candidates_1); + // TEST_EQUAL(result_1,expected_1) + + candidates_1.push_back(seq1); // 4 position + map> expected_2 = {{728.371, {2}}, {799.349, {1,4}}, {837.270, {3}}}; + map> result_2 = NeighborSeq::createMassPositionMap(candidates_1); + //TEST_EQUAL(result_2,expected_2) + //TEST_TRY(result_2, expected_2); +} +END_SECTION +*/ + + +// Test section for the findNeighborPeptides_ function +// The spectra were generated via TOPPView +START_SECTION(vector findNeighborPeptides(const AASequence& peptides, + const vector& neighbor_candidate, + const vector& candidate_position, + const double& min_shared_ion_fraction, + const double& mz_bin_size)) +{ + + AASequence seq1 = AASequence::fromString("RIPLANGR"); + AASequence seq2 = AASequence::fromString("PPPPP"); + AASequence seq3 = AASequence::fromString("SEQUENCE"); + AASequence seq4 = AASequence::fromString("VGLPINQR"); + AASequence seq5 = AASequence::fromString("A"); + AASequence seq6 = AASequence::fromString("X"); + vector neighbor_candidate; + neighbor_candidate.push_back(seq1); + neighbor_candidate.push_back(seq2); + neighbor_candidate.push_back(seq3); + neighbor_candidate.push_back(seq1); + neighbor_candidate.push_back(seq4); + neighbor_candidate.push_back(seq5); + neighbor_candidate.push_back(seq6); + vector candidate_position = {0, 1, 2, 3, 4, 5}; + + vector expected = {0,3, 4}; + double min_shared_ion_fraction = 0.25; + double mz_bin_size = 0.05; + + vector result = NeighborSeq::findNeighborPeptides(seq1, neighbor_candidate, candidate_position, min_shared_ion_fraction, mz_bin_size); + + TEST_EQUAL(expected, result) +} +END_SECTION + + +// Test section for the compareSpectra function +START_SECTION(bool compareShareSpectra(const MSSpectrum& spec1, const MSSpectrum& spec2, const double& mz_bin_size)) +{ + MSSpectrum spec1; + spec1.push_back(Peak1D(100.0, 1.0)); + spec1.push_back(Peak1D(200.0, 1.0)); + spec1.push_back(Peak1D(300.0, 1.0)); + + MSSpectrum spec2; + spec2.push_back(Peak1D(100.06, 1.0)); + spec2.push_back(Peak1D(200.06, 1.0)); + spec2.push_back(Peak1D(300.06, 1.0)); + + MSSpectrum spec3; + spec3.push_back(Peak1D(101.00, 1.0)); + spec3.push_back(Peak1D(201.00, 1.0)); + spec3.push_back(Peak1D(301.00, 1.0)); + + MSSpectrum spec4; + spec4.push_back(Peak1D(100.06, 1.0)); + spec4.push_back(Peak1D(201.00, 1.0)); + spec4.push_back(Peak1D(300.06, 1.0)); + spec4.push_back(Peak1D(301.00, 1.0)); + + + double min_shared_ion_fraction = 0.5; + + + // bin interval is from [a,b[ + bool compare_1_2_low = NeighborSeq::compareShareSpectra(spec1, spec2, 1.0); + TEST_EQUAL(compare_1_2_low,3) + bool compare_1_3_low = NeighborSeq::compareShareSpectra(spec1, spec3, 1.0); + TEST_EQUAL(compare_1_3_low,0) + bool compare_1_4_low = NeighborSeq::compareShareSpectra(spec1, spec4, 1.0); + TEST_EQUAL(compare_1_4_low,2) + bool compare_2_3_low = NeighborSeq::compareShareSpectra(spec2, spec3, 1.0); + TEST_EQUAL(compare_2_3_low,0) + bool compare_2_4_low = NeighborSeq::compareShareSpectra(spec2, spec4, 1.0); + TEST_EQUAL(compare_2_4_low,3) + bool compare_3_4_low = NeighborSeq::compareShareSpectra(spec3, spec4, 1.0); + TEST_EQUAL(compare_3_4_low,2) + + bool compare_1_2_high = NeighborSeq::compareShareSpectra(spec1, spec2, 0.05); + TEST_EQUAL(compare_1_2_high,0) + bool compare_1_3_high = NeighborSeq::compareShareSpectra(spec1, spec3, 0.05); + TEST_EQUAL(compare_1_3_high,0) + bool compare_1_4_high = NeighborSeq::compareShareSpectra(spec1, spec4, 0.05); + TEST_EQUAL(compare_1_4_high,0) + bool compare_2_3_high = NeighborSeq::compareShareSpectra(spec2, spec3, 0.05); + TEST_EQUAL(compare_2_3_high,0) + bool compare_2_4_high = NeighborSeq::compareShareSpectra(spec2, spec4, 0.05); + TEST_EQUAL(compare_2_4_high,2) + bool compare_3_4_high = NeighborSeq::compareShareSpectra(spec3, spec4, 0.05); + TEST_EQUAL(compare_3_4_high,2) + +} +END_SECTION + +END_TEST \ No newline at end of file diff --git a/src/topp/DecoyDatabase.cpp b/src/topp/DecoyDatabase.cpp index c67f7caf6d0..482fb217273 100644 --- a/src/topp/DecoyDatabase.cpp +++ b/src/topp/DecoyDatabase.cpp @@ -3,7 +3,7 @@ // // -------------------------------------------------------------------------- // $Maintainer: Sven Nahnsen $ -// $Authors: Sven Nahnsen, Andreas Bertsch, Chris Bielow $ +// $Authors: Sven Nahnsen, Andreas Bertsch, Chris Bielow, Philipp Wang $ // -------------------------------------------------------------------------- #include @@ -16,6 +16,10 @@ #include #include #include +#include + + + using namespace OpenMS; using namespace std; @@ -54,6 +58,12 @@ and terminates the program if decoys are found. @verbinclude TOPP_DecoyDatabase.cli INI file documentation of this tool: @htmlinclude TOPP_DecoyDatabase.html + + +The Neighbor Peptide functionality in the TOPPDecoyDatabase class is designed to find peptides from a given set of sequences (FASTA file) that are +similar to a target peptide based on mass and spectral characteristics. This section will detail the methods and functionalities specifically related +to the Neighbor Peptide search. +NeighborSeq class has more details look at *OpenMS\src\openms\include\OpenMS\ANALYSIS\ID\NeighborSeq.h */ // We do not want this class to show up in the docu: @@ -96,8 +106,22 @@ class TOPPDecoyDatabase : setValidStrings_("enzyme", all_enzymes); registerSubsection_("Decoy", "Decoy parameters section"); + + // New options for neighbor peptide search + registerTOPPSubsection_("Neighbor_Search", "Parameters for neighbor peptide search"); + registerInputFile_("Neighbor_Search:neighbor_in", "","", "neighbor peptides are searched for in the Fasta file entered for candiate ", false); + setValidFormats_("Neighbor_Search:neighbor_in", {"fasta"}); + registerOutputFile_("Neighbor_Search:out_neighbor", "", "", "Output FASTA file where the neighbor peptide will be written to.",false); + registerIntOption_("Neighbor_Search:missed_cleavages", "", 0,"Number of missed cleavages",false); + registerDoubleOption_("Neighbor_Search:mz_bin_size", "", 0.05,"Sets the mz_bin_size of the neighbor peptide search.(the original study suggests high (0.05 Da) and low (1.0005079 Da) mz_bin_size)", false); + registerDoubleOption_("Neighbor_Search:mass_tolerance", "", 0.00001, "Tolerance of the mass of the neighbor peptide m1-m2 > tm", false, true); + registerIntOption_("Neighbor_Search:min_peptide_length", "", 5, "the minimum neighbor peptide length ", false); + //setValidStrings_("Neighbor_Search:fragment_unit_ppm", {"true", "false"}); + //registerStringOption_("Neighbor_Search:fragment_unit_ppm", "", "true","calculates with dalton or ppm", false, true); + registerDoubleOption_("Neighbor_Search:min_shared_ion_fraction", "", 0.25, "Tolerance of the proportion of b and y ions shared by the neighbor peptide 2*B12/B1+B2 > ti", false, true); } + Param getSubsectionDefaults_(const String& /* name */) const override { Param p = MRMDecoy().getDefaults(); @@ -111,6 +135,7 @@ class TOPPDecoyDatabase : if (as_prefix) return decoy_string + identifier; else return identifier + decoy_string; } + ExitCodes main_(int, const char**) override { @@ -165,173 +190,252 @@ class TOPPDecoyDatabase : f.writeStart(out); FASTAFile::FASTAEntry entry; - // Configure Enzymatic digestion - // TODO: allow user-specified regex + // Create a ProteaseDigestion object for peptide digestion ProteaseDigestion digestion; String enzyme = getStringOption_("enzyme").trim(); - if ((input_type == SeqType::protein) && !enzyme.empty()) - { - digestion.setEnzyme(enzyme); - } - // check if decoy_string is common decoy string (e.g. decoy, rev, ...) - String decoy_string_lower = decoy_string; - decoy_string_lower.toLower(); - bool is_common = false; - for (const auto& a : DecoyHelper::affixes) + if ((input_type == SeqType::protein) && ! enzyme.empty()) { digestion.setEnzyme(enzyme); } + + + // create neighbor peptides for the relevant peptides given in the FASTA from '-in' + String neighbor_file = getStringOption_("Neighbor_Search:neighbor_in"); + // Check if the neighbor file option is provided + if (! neighbor_file.empty()) { - if ((decoy_string_lower.hasPrefix(a) && decoy_string_position_prefix) || (decoy_string_lower.hasSuffix(a) && !decoy_string_position_prefix)) + if (input_type != SeqType::protein) { - is_common = true; + OPENMS_LOG_ERROR << "Parameter settings are invalid. When requesting neighbor peptides, the input type must be 'protein', not 'RNA'.\n"; + return INCOMPATIBLE_INPUT_DATA; } - } - // terminate, if decoy_string is not one of the allowed decoy strings (exit code 11) - if (!is_common) - { - if (getFlag_("force")) + // Create a ProteaseDigestion object for neighbor peptide digestion + ProteaseDigestion digestion_neighbor; + String enzyme = getStringOption_("enzyme").trim(); + if ((input_type == SeqType::protein) && ! enzyme.empty()) { digestion_neighbor.setEnzyme(enzyme); } + //------------------------------------------------------------- + // parsing neighbor parameters + //------------------------------------------------------------- + String out_neighbor = getStringOption_("Neighbor_Search:out_neighbor"); + FASTAFile neig; + neig.writeStart(out_neighbor); + + double mz_bin_size = getDoubleOption_("Neighbor_Search:mz_bin_size"); + double min_shared_ion_fraction = getDoubleOption_("Neighbor_Search:min_shared_ion_fraction"); + double mass_tolerance = getDoubleOption_("Neighbor_Search:mass_tolerance"); + int missed_cleavages = getIntOption_("Neighbor_Search:missed_cleavages"); + int min_peptide_length = getIntOption_("Neighbor_Search:min_peptide_length"); + digestion_neighbor.setMissedCleavages(missed_cleavages); + // Vector to hold all digested peptides from the input files + vector all_peptides; + + for (Size i = 0; i < in.size(); ++i) { - OPENMS_LOG_WARN << "Force Flag is enabled, decoys with custom decoy string (not in DecoyHelper::affixes) will not be detected.\n"; + //FASTAContainer in_entries {in[i]}; + neig.readStart(in[i]); + while (neig.readNext(entry)) + { + // Vector to hold the peptides generated from digestion + vector peptides; + // Digest the current entry sequence into peptides + digestion_neighbor.digest(AASequence::fromString(entry.sequence), peptides); + // Add the generated peptides to the all_peptides vector + all_peptides.insert(all_peptides.end(), peptides.begin(), peptides.end()); + } } - else + // Load the neighbor peptides candidates from the neighbor file + FASTAFile neighbor_fasta; + vector neighbor_entries; + neighbor_fasta.load(neighbor_file, neighbor_entries); + vector digested_candidate_peptides; + vector temp_peptides; + for (auto& entry : neighbor_entries) { - OPENMS_LOG_FATAL_ERROR << "Given decoy string is not allowed. Please use one of the strings in DecoyHelper::affixes as either prefix or suffix (case insensitive): \n"; - return INCOMPATIBLE_INPUT_DATA; + digestion_neighbor.digest(AASequence::fromString(entry.sequence), temp_peptides); + digested_candidate_peptides.insert(digested_candidate_peptides.end(), temp_peptides.begin(), temp_peptides.end()); } - } - MRMDecoy m; - m.setParameters(decoy_param); - Math::RandomShuffler shuffler(seed); - for (Size i = 0; i < in.size(); ++i) - { - // check input files for decoys - FASTAContainer in_entries{in[i]}; - auto r = DecoyHelper::countDecoys(in_entries); - // if decoys found, throw exception - if (static_cast(r.all_prefix_occur + r.all_suffix_occur) >= 0.4 * static_cast(r.all_proteins_count)) + // Remove peptides shorter than min_peptide_length amino acids + digested_candidate_peptides.erase(remove_if(digested_candidate_peptides.begin(), digested_candidate_peptides.end(), + [min_peptide_length](const AASequence& peptide) { return peptide.size() < min_peptide_length; }) + ,digested_candidate_peptides.end()); + + // Creates a map of masses to positions from a vector + map> mass_position_map = NeighborSeq::createMassPositionMap(digested_candidate_peptides); + + for (auto i = all_peptides.begin(); i != all_peptides.end(); ++i) { - // if decoys found, program terminates with exit code 11 - OPENMS_LOG_FATAL_ERROR << "Invalid input in " + in[i] + ": Input file already contains decoys." << '\n'; - return INCOMPATIBLE_INPUT_DATA; - } + if (i->toString().find('X') == String::npos) + { + // Finde position of neighbor peptides candidates + vector candiadate_position = NeighborSeq::findCandidatePositions(mass_position_map, i->getMonoWeight(), mass_tolerance); - f.readStart(in[i]); - //------------------------------------------------------------- - // calculations - //------------------------------------------------------------- - while (f.readNext(entry)) + // Find neighbor peptides for the current peptide + vector result + = NeighborSeq::findNeighborPeptides(*i, digested_candidate_peptides, candiadate_position, min_shared_ion_fraction, mz_bin_size); + for (size_t j = 0; j < result.size(); ++j) + { + // Get the corresponding neighbor entry + const auto& neighbor_entry = neighbor_entries[result[j]]; + // Create a new FASTAEntry with the neighbor peptide sequence + FASTAFile::FASTAEntry new_entry(entry.identifier, entry.description, digested_candidate_peptides[result[j]].toString()); + // Write the neighbor peptide to the output file + neig.writeNext(new_entry); + } + } + } + in.push_back(out_neighbor); + } + + // Configure Enzymatic digestion + // TODO: allow user-specified regex + // check if decoy_string is common decoy string (e.g. decoy, rev, ...) + String decoy_string_lower = decoy_string; + decoy_string_lower.toLower(); + bool is_common = false; + for (const auto& a : DecoyHelper::affixes) { - if (identifiers.find(entry.identifier) != identifiers.end()) + if ((decoy_string_lower.hasPrefix(a) && decoy_string_position_prefix) || (decoy_string_lower.hasSuffix(a) && ! decoy_string_position_prefix)) { - OPENMS_LOG_WARN << "DecoyDatabase: Warning, identifier '" << entry.identifier << "' occurs more than once!" << endl; + is_common = true; } - identifiers.insert(entry.identifier); - - if (append) + } + // terminate, if decoy_string is not one of the allowed decoy strings (exit code 11) + if (! is_common) + { + if (getFlag_("force")) { - f.writeNext(entry); + OPENMS_LOG_WARN << "Force Flag is enabled, decoys with custom decoy string (not in DecoyHelper::affixes) will not be detected.\n"; + } + else + { + OPENMS_LOG_FATAL_ERROR << "Given decoy string is not allowed. Please use one of the strings in DecoyHelper::affixes as either prefix or " + "suffix (case insensitive): \n"; + return INCOMPATIBLE_INPUT_DATA; } + } + MRMDecoy m; + m.setParameters(decoy_param); - // identifier - entry.identifier = getIdentifier_(entry.identifier, decoy_string, decoy_string_position_prefix); + Math::RandomShuffler shuffler(seed); + for (Size i = 0; i < in.size(); ++i) + { + // check input files for decoys + FASTAContainer in_entries {in[i]}; + auto r = DecoyHelper::countDecoys(in_entries); + // if decoys found, throw exception + if (static_cast(r.all_prefix_occur + r.all_suffix_occur) >= 0.4 * static_cast(r.all_proteins_count)) + { + // if decoys found, program terminates with exit code 11 + OPENMS_LOG_FATAL_ERROR << "Invalid input in " + in[i] + ": Input file already contains decoys." << '\n'; + return INCOMPATIBLE_INPUT_DATA; + } - // sequence - if (input_type == SeqType::RNA) + f.readStart(in[i]); + //------------------------------------------------------------- + // calculations + //------------------------------------------------------------- + while (f.readNext(entry)) { - string quick_seq = entry.sequence; - bool five_p = (entry.sequence.front() == 'p'); - bool three_p = (entry.sequence.back() == 'p'); - if (five_p) //we don't want to reverse terminal phosphates - { - quick_seq.erase(0, 1); - } - if (three_p) + if (identifiers.find(entry.identifier) != identifiers.end()) { - quick_seq.pop_back(); + OPENMS_LOG_WARN << "DecoyDatabase: Warning, identifier '" << entry.identifier << "' occurs more than once!" << endl; } + identifiers.insert(entry.identifier); + + if (append) { f.writeNext(entry); } - vector tokenized; - std::smatch m; - std::string pattern = R"([^\[]|(\[[^\[\]]*\]))"; - std::regex re(pattern); + // identifier + entry.identifier = getIdentifier_(entry.identifier, decoy_string, decoy_string_position_prefix); - while (std::regex_search(quick_seq, m, re)) + // sequence + if (input_type == SeqType::RNA) { + string quick_seq = entry.sequence; + bool five_p = (entry.sequence.front() == 'p'); + bool three_p = (entry.sequence.back() == 'p'); + if (five_p) // we don't want to reverse terminal phosphates + { + quick_seq.erase(0, 1); + } + if (three_p) { quick_seq.pop_back(); } + + vector tokenized; + std::smatch m; + std::string pattern = R"([^\[]|(\[[^\[\]]*\]))"; + std::regex re(pattern); + + while (std::regex_search(quick_seq, m, re)) + { tokenized.emplace_back(m.str(0)); quick_seq = m.suffix(); - } + } - if (shuffle) - { - shuffler.portable_random_shuffle(tokenized.begin(), tokenized.end()); - } - else // reverse - { - reverse(tokenized.begin(), tokenized.end()); //reverse the tokens - } - if (five_p) //add back 5' - { - tokenized.insert(tokenized.begin(), String("p")); - } - if (three_p) //add back 3' - { - tokenized.emplace_back("p"); + if (shuffle) { shuffler.portable_random_shuffle(tokenized.begin(), tokenized.end()); } + else // reverse + { + reverse(tokenized.begin(), tokenized.end()); // reverse the tokens + } + if (five_p) // add back 5' + { + tokenized.insert(tokenized.begin(), String("p")); + } + if (three_p) // add back 3' + { + tokenized.emplace_back("p"); + } + entry.sequence = ListUtils::concatenate(tokenized, ""); } - entry.sequence = ListUtils::concatenate(tokenized, ""); - } - else // protein input - { - // if (terminal_aminos != "none") - if (enzyme != "no cleavage" && (keepN || keepC)) + else // protein input { - std::vector peptides; - digestion.digest(AASequence::fromString(entry.sequence), peptides); - String new_sequence = ""; - for (auto const& peptide : peptides) + // if (terminal_aminos != "none") + if (enzyme != "no cleavage" && (keepN || keepC)) { - //TODO why are the functions from TargetedExperiment and MRMDecoy not anywhere more general? - // No soul would look there. + std::vector peptides; + digestion.digest(AASequence::fromString(entry.sequence), peptides); + String new_sequence = ""; + for (auto const& peptide : peptides) + { + // TODO why are the functions from TargetedExperiment and MRMDecoy not anywhere more general? + // No soul would look there. + if (shuffle) + { + OpenMS::TargetedExperiment::Peptide p; + p.sequence = peptide.toString(); + OpenMS::TargetedExperiment::Peptide decoy_p = m.shufflePeptide(p, identity_threshold, seed, max_attempts); + new_sequence += decoy_p.sequence; + } + else + { + OpenMS::TargetedExperiment::Peptide p; + p.sequence = peptide.toString(); + OpenMS::TargetedExperiment::Peptide decoy_p = MRMDecoy::reversePeptide(p, keepN, keepC, keep_const_pattern); + new_sequence += decoy_p.sequence; + } + } + entry.sequence = new_sequence; + } + else + { + // sequence if (shuffle) { - OpenMS::TargetedExperiment::Peptide p; - p.sequence = peptide.toString(); - OpenMS::TargetedExperiment::Peptide decoy_p = m.shufflePeptide(p, identity_threshold, seed, max_attempts); - new_sequence += decoy_p.sequence; + shuffler.seed(seed); // identical proteins are shuffled the same way -> re-seed + shuffler.portable_random_shuffle(entry.sequence.begin(), entry.sequence.end()); } - else + else // reverse { - OpenMS::TargetedExperiment::Peptide p; - p.sequence = peptide.toString(); - OpenMS::TargetedExperiment::Peptide decoy_p = MRMDecoy::reversePeptide(p, keepN, keepC, keep_const_pattern); - new_sequence += decoy_p.sequence; + entry.sequence.reverse(); } } - entry.sequence = new_sequence; } - else - { - // sequence - if (shuffle) - { - shuffler.seed(seed); // identical proteins are shuffled the same way -> re-seed - shuffler.portable_random_shuffle(entry.sequence.begin(), entry.sequence.end()); - } - else // reverse - { - entry.sequence.reverse(); - } - } - } - - //------------------------------------------------------------- - // writing output - //------------------------------------------------------------- - f.writeNext(entry); - } // next protein - } // input files + //------------------------------------------------------------- + // writing output + //------------------------------------------------------------- + f.writeNext(entry); + } // next protein + } // input files + return EXECUTION_OK; } - };