Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
437 changes: 437 additions & 0 deletions src/openms/include/OpenMS/ANALYSIS/ID/AhoCorasickDA.h

Large diffs are not rendered by default.

78 changes: 51 additions & 27 deletions src/openms/include/OpenMS/ANALYSIS/ID/PeptideIndexing.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@
#pragma once


#include <OpenMS/ANALYSIS/ID/AhoCorasickAmbiguous.h>
//#include <OpenMS/ANALYSIS/ID/AhoCorasickAmbiguous.h>
#include <OpenMS/ANALYSIS/ID/AhoCorasickDA.h>
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you get rid of the seqan includes?

#include <OpenMS/CHEMISTRY/ProteaseDigestion.h>
#include <OpenMS/CHEMISTRY/ProteaseDB.h>
#include <OpenMS/CONCEPT/LogStream.h>
Expand All @@ -56,6 +57,8 @@
#include <atomic>
#include <algorithm>
#include <fstream>
#include <chrono>



namespace OpenMS
Expand Down Expand Up @@ -330,7 +333,8 @@ namespace OpenMS
BUILD Peptide DB
*/
bool has_illegal_AAs(false);
AhoCorasickAmbiguous::PeptideDB pep_DB;
//AhoCorasickAmbiguous::PeptideDB pep_DB;
std::vector<String> pep_DB;
for (std::vector<PeptideIdentification>::const_iterator it1 = pep_ids.begin(); it1 != pep_ids.end(); ++it1)
{
//String run_id = it1->getIdentifier();
Expand All @@ -342,27 +346,30 @@ namespace OpenMS
// do not skip over peptides here, since the results are iterated in the same way
//
String seq = it2->getSequence().toUnmodifiedString().remove('*'); // make a copy, i.e. do NOT change the peptide sequence!
if (seqan::isAmbiguous(seqan::AAString(seq.c_str())))
/* if (seqan::isAmbiguous(seqan::AAString(seq.c_str())))
{ // do not quit here, to show the user all sequences .. only quit after loop
OPENMS_LOG_ERROR << "Peptide sequence '" << it2->getSequence() << "' contains one or more ambiguous amino acids (B|J|Z|X).\n";
has_illegal_AAs = true;
}
}*/
if (IL_equivalent_) // convert L to I;
{
seq.substitute('L', 'I');
}
appendValue(pep_DB, seq.c_str());

//appendValue(pep_DB, seq.c_str());
pep_DB.push_back(seq);
}
}
if (has_illegal_AAs)
{
OPENMS_LOG_ERROR << "One or more peptides contained illegal amino acids. This is not allowed!"
<< "\nPlease either remove the peptide or replace it with one of the unambiguous ones (while allowing for ambiguous AA's to match the protein)." << std::endl;;
}
//length(pep_DB)

OPENMS_LOG_INFO << "Mapping " << length(pep_DB) << " peptides to " << (proteins.size() == PROTEIN_CACHE_SIZE ? "? (unknown number of)" : String(proteins.size())) << " proteins." << std::endl;
OPENMS_LOG_INFO << "Mapping " << pep_DB.size() << " peptides to " << (proteins.size() == PROTEIN_CACHE_SIZE ? "? (unknown number of)" : String(proteins.size())) << " proteins." << std::endl;

if (length(pep_DB) == 0)
if (pep_DB.size() == 0)
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pep_DB.empty() ?

{ // Aho-Corasick will crash if given empty needles as input
OPENMS_LOG_WARN << "Warning: Peptide identifications have no hits inside! Output will be empty as well." << std::endl;
return PEPTIDE_IDS_EMPTY;
Expand All @@ -376,8 +383,17 @@ namespace OpenMS
OPENMS_LOG_INFO << "Building trie ...";
StopWatch s;
s.start();
AhoCorasickAmbiguous::FuzzyACPattern pattern;
AhoCorasickAmbiguous::initPattern(pep_DB, aaa_max_, mm_max_, pattern);
auto s1 = std::chrono::high_resolution_clock::now();


//AhoCorasickAmbiguous::FuzzyACPattern pattern;
//AhoCorasickAmbiguous::initPattern(pep_DB, aaa_max_, mm_max_, pattern);
Size pos = 0;
Size idx = 0;
AhoCorasickDA fuzzyAC(pep_DB);
auto s2 = std::chrono::high_resolution_clock::now();
std::chrono::duration<double,std::milli> elapsed = s2 - s1;
std::cout << "Construction time " << elapsed.count() << " ms"<< std::endl;
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why the extra chrono?
StopWatch is a high-resolution clock
(just don't convert to int() as done below to retain its accuracy)

s.stop();
OPENMS_LOG_INFO << " done (" << int(s.getClockTime()) << "s)" << std::endl;
s.reset();
Expand All @@ -389,40 +405,44 @@ namespace OpenMS
this->startProgress(0, proteins.size() == PROTEIN_CACHE_SIZE ? std::numeric_limits<SignedSize>::max() : proteins.size(), "Aho-Corasick");
std::atomic<int> progress_prots(0);
#ifdef _OPENMP
#pragma omp parallel
//#pragma omp parallel
#endif
{
FoundProteinFunctor func_threads(enzyme, xtandem_fix_parameters);
Map<String, Size> acc_to_prot_thread; // map: accessions --> FASTA protein index
AhoCorasickAmbiguous fuzzyAC;
//AhoCorasickAmbiguous fuzzyAC;

String prot;

while (true)
{
#pragma omp barrier // all threads need to be here, since we are about to swap protein data
#pragma omp single
{
DEBUG_ONLY std::cerr << " activating cache ...\n";
//DEBUG_ONLY std::cerr << " activating cache ...\n";
has_active_data = proteins.activateCache(); // swap in last cache
protein_accessions.resize(proteins.getChunkOffset() + proteins.chunkSize());
} // implicit barrier here

if (!has_active_data) break; // leave while-loop

SignedSize prot_count = (SignedSize)proteins.chunkSize();
std::cout << "Prot Count " << prot_count << std::endl;

#pragma omp master
{
DEBUG_ONLY std::cerr << "Filling Protein Cache ...";
//DEBUG_ONLY std::cerr << "Filling Protein Cache ...";
proteins.cacheChunk(PROTEIN_CACHE_SIZE);
protein_is_decoy.resize(proteins.getChunkOffset() + prot_count);
for (SignedSize i = 0; i < prot_count; ++i)
{ // do this in master only, to avoid false sharing
const String& seq = proteins.chunkAt(i).identifier;
protein_is_decoy[i + proteins.getChunkOffset()] = (prefix_ ? seq.hasPrefix(decoy_string_) : seq.hasSuffix(decoy_string_));
}
DEBUG_ONLY std::cerr << " done" << std::endl;
//DEBUG_ONLY std::cerr << " done" << std::endl;
}
DEBUG_ONLY std::cerr << " starting for loop \n";
auto t1 = std::chrono::high_resolution_clock::now();
//DEBUG_ONLY std::cerr << " starting for loop \n";
// search all peptides in each protein
#pragma omp for schedule(dynamic, 100) nowait
for (SignedSize i = 0; i < prot_count; ++i)
Expand Down Expand Up @@ -471,7 +491,7 @@ namespace OpenMS
while ((offset = prot.find(jumpX, offset + 1)) != std::string::npos)
{
//std::cout << "found X..X at " << offset << " in protein " << proteins[i].identifier << "\n";
addHits_(fuzzyAC, pattern, pep_DB, prot.substr(start, offset + jumpX.size() - start), prot, prot_idx, (int)start, func_threads);
addHits_(fuzzyAC, pos, idx, aaa_max_, pep_DB, prot.substr(start, offset + jumpX.size() - start), prot, prot_idx, (int)start, func_threads);
// skip ahead while we encounter more X...
while (offset + jumpX.size() < prot.size() && prot[offset + jumpX.size()] == 'X') ++offset;
start = offset;
Expand All @@ -480,12 +500,12 @@ namespace OpenMS
// last chunk
if (start < prot.size())
{
addHits_(fuzzyAC, pattern, pep_DB, prot.substr(start), prot, prot_idx, (int)start, func_threads);
addHits_(fuzzyAC, pos, idx, aaa_max_, pep_DB, prot.substr(start), prot, prot_idx, (int)start, func_threads);
}
}
else
{
addHits_(fuzzyAC, pattern, pep_DB, prot, prot, prot_idx, 0, func_threads);
addHits_(fuzzyAC, pos, idx, aaa_max_, pep_DB, prot, prot, prot_idx, 0, func_threads);
}
// was protein found?
if (hits_total < func_threads.filter_passed + func_threads.filter_rejected)
Expand All @@ -495,11 +515,16 @@ namespace OpenMS
}
} // end parallel FOR

auto t2 = std::chrono::high_resolution_clock::now();
std::chrono::duration<double,std::milli> elapsed = t2 - t1;
std::cout << "Retrieval time of doubel-array based AC " << elapsed.count()/prot_count << " ms per Protein"<< std::endl;

// join results again
DEBUG_ONLY std::cerr << " critical now \n";
// DEBUG_ONLY std::cerr << " critical now \n";
#ifdef _OPENMP
#pragma omp critical(PeptideIndexer_joinAC)
#endif

{
s.start();
// hits
Expand All @@ -512,11 +537,11 @@ namespace OpenMS
} // end readChunk
} // OMP end parallel
this->endProgress();

std::cout << "Merge took: " << s.toString() << "\n";
mu.after();
std::cout << mu.delta("Aho-Corasick") << "\n\n";

OPENMS_LOG_INFO << "\nAho-Corasick done:\n found " << func.filter_passed << " hits for " << func.pep_to_prot.size() << " of " << length(pep_DB) << " peptides.\n";
OPENMS_LOG_INFO << "\nAho-Corasick done:\n found " << func.filter_passed << " hits for " << func.pep_to_prot.size() << " of " << pep_DB.size() << " peptides.\n";

// write some stats
OPENMS_LOG_INFO << "Peptide hits passing enzyme filter: " << func.filter_passed << "\n"
Expand Down Expand Up @@ -902,16 +927,15 @@ namespace OpenMS

};

inline void addHits_(AhoCorasickAmbiguous& fuzzyAC, const AhoCorasickAmbiguous::FuzzyACPattern& pattern, const AhoCorasickAmbiguous::PeptideDB& pep_DB, const String& prot, const String& full_prot, SignedSize idx_prot, Int offset, FoundProteinFunctor& func_threads) const
inline void addHits_(AhoCorasickDA& fuzzyAC, Size& pos_in_protein, Size& peptide_index, const Int amb_max, const std::vector<String>& pep_DB, const String& prot, const String& full_prot, SignedSize idx_prot, Int offset, FoundProteinFunctor& func_threads) const
{
fuzzyAC.setProtein(prot);
while (fuzzyAC.findNext(pattern))
fuzzyAC.setProtein(prot.c_str(), amb_max);
while (fuzzyAC.findNext(pos_in_protein, peptide_index))
{
const seqan::Peptide& tmp_pep = pep_DB[fuzzyAC.getHitDBIndex()];
func_threads.addHit(fuzzyAC.getHitDBIndex(), idx_prot, length(tmp_pep), full_prot, fuzzyAC.getHitProteinPosition() + offset);
const String& tmp_pep = pep_DB[peptide_index];
func_threads.addHit(peptide_index, idx_prot, tmp_pep.length(), full_prot, pos_in_protein + offset);
}
}

void updateMembers_() override;

String decoy_string_{};
Expand Down
1 change: 1 addition & 0 deletions src/openms/include/OpenMS/ANALYSIS/ID/sources.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ set(directory include/OpenMS/ANALYSIS/ID)
set(sources_list_h
AccurateMassSearchEngine.h
AhoCorasickAmbiguous.h
AhoCorasickDA.h
AScore.h
BasicProteinInferenceAlgorithm.h
BayesianProteinInferenceAlgorithm.h
Expand Down
Loading