-
Notifications
You must be signed in to change notification settings - Fork 5
Aho corasick #102
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Aho corasick #102
Changes from all commits
d48135f
f3d584d
71ab0db
9bc3f9b
6e41056
4fa9a2c
085c814
312379b
ce7066e
0b8fda2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -35,7 +35,8 @@ | |
| #pragma once | ||
|
|
||
|
|
||
| #include <OpenMS/ANALYSIS/ID/AhoCorasickAmbiguous.h> | ||
| //#include <OpenMS/ANALYSIS/ID/AhoCorasickAmbiguous.h> | ||
| #include <OpenMS/ANALYSIS/ID/AhoCorasickDA.h> | ||
| #include <OpenMS/CHEMISTRY/ProteaseDigestion.h> | ||
| #include <OpenMS/CHEMISTRY/ProteaseDB.h> | ||
| #include <OpenMS/CONCEPT/LogStream.h> | ||
|
|
@@ -56,6 +57,8 @@ | |
| #include <atomic> | ||
| #include <algorithm> | ||
| #include <fstream> | ||
| #include <chrono> | ||
|
|
||
|
|
||
|
|
||
| namespace OpenMS | ||
|
|
@@ -330,7 +333,8 @@ namespace OpenMS | |
| BUILD Peptide DB | ||
| */ | ||
| bool has_illegal_AAs(false); | ||
| AhoCorasickAmbiguous::PeptideDB pep_DB; | ||
| //AhoCorasickAmbiguous::PeptideDB pep_DB; | ||
| std::vector<String> pep_DB; | ||
| for (std::vector<PeptideIdentification>::const_iterator it1 = pep_ids.begin(); it1 != pep_ids.end(); ++it1) | ||
| { | ||
| //String run_id = it1->getIdentifier(); | ||
|
|
@@ -342,27 +346,30 @@ namespace OpenMS | |
| // do not skip over peptides here, since the results are iterated in the same way | ||
| // | ||
| String seq = it2->getSequence().toUnmodifiedString().remove('*'); // make a copy, i.e. do NOT change the peptide sequence! | ||
| if (seqan::isAmbiguous(seqan::AAString(seq.c_str()))) | ||
| /* if (seqan::isAmbiguous(seqan::AAString(seq.c_str()))) | ||
| { // do not quit here, to show the user all sequences .. only quit after loop | ||
| OPENMS_LOG_ERROR << "Peptide sequence '" << it2->getSequence() << "' contains one or more ambiguous amino acids (B|J|Z|X).\n"; | ||
| has_illegal_AAs = true; | ||
| } | ||
| }*/ | ||
| if (IL_equivalent_) // convert L to I; | ||
| { | ||
| seq.substitute('L', 'I'); | ||
| } | ||
| appendValue(pep_DB, seq.c_str()); | ||
|
|
||
| //appendValue(pep_DB, seq.c_str()); | ||
| pep_DB.push_back(seq); | ||
| } | ||
| } | ||
| if (has_illegal_AAs) | ||
| { | ||
| OPENMS_LOG_ERROR << "One or more peptides contained illegal amino acids. This is not allowed!" | ||
| << "\nPlease either remove the peptide or replace it with one of the unambiguous ones (while allowing for ambiguous AA's to match the protein)." << std::endl;; | ||
| } | ||
| //length(pep_DB) | ||
|
|
||
| OPENMS_LOG_INFO << "Mapping " << length(pep_DB) << " peptides to " << (proteins.size() == PROTEIN_CACHE_SIZE ? "? (unknown number of)" : String(proteins.size())) << " proteins." << std::endl; | ||
| OPENMS_LOG_INFO << "Mapping " << pep_DB.size() << " peptides to " << (proteins.size() == PROTEIN_CACHE_SIZE ? "? (unknown number of)" : String(proteins.size())) << " proteins." << std::endl; | ||
|
|
||
| if (length(pep_DB) == 0) | ||
| if (pep_DB.size() == 0) | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| { // Aho-Corasick will crash if given empty needles as input | ||
| OPENMS_LOG_WARN << "Warning: Peptide identifications have no hits inside! Output will be empty as well." << std::endl; | ||
| return PEPTIDE_IDS_EMPTY; | ||
|
|
@@ -376,8 +383,17 @@ namespace OpenMS | |
| OPENMS_LOG_INFO << "Building trie ..."; | ||
| StopWatch s; | ||
| s.start(); | ||
| AhoCorasickAmbiguous::FuzzyACPattern pattern; | ||
| AhoCorasickAmbiguous::initPattern(pep_DB, aaa_max_, mm_max_, pattern); | ||
| auto s1 = std::chrono::high_resolution_clock::now(); | ||
|
|
||
|
|
||
| //AhoCorasickAmbiguous::FuzzyACPattern pattern; | ||
| //AhoCorasickAmbiguous::initPattern(pep_DB, aaa_max_, mm_max_, pattern); | ||
| Size pos = 0; | ||
| Size idx = 0; | ||
| AhoCorasickDA fuzzyAC(pep_DB); | ||
| auto s2 = std::chrono::high_resolution_clock::now(); | ||
| std::chrono::duration<double,std::milli> elapsed = s2 - s1; | ||
| std::cout << "Construction time " << elapsed.count() << " ms"<< std::endl; | ||
|
Owner
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why the extra chrono? |
||
| s.stop(); | ||
| OPENMS_LOG_INFO << " done (" << int(s.getClockTime()) << "s)" << std::endl; | ||
| s.reset(); | ||
|
|
@@ -389,40 +405,44 @@ namespace OpenMS | |
| this->startProgress(0, proteins.size() == PROTEIN_CACHE_SIZE ? std::numeric_limits<SignedSize>::max() : proteins.size(), "Aho-Corasick"); | ||
| std::atomic<int> progress_prots(0); | ||
| #ifdef _OPENMP | ||
| #pragma omp parallel | ||
| //#pragma omp parallel | ||
| #endif | ||
| { | ||
| FoundProteinFunctor func_threads(enzyme, xtandem_fix_parameters); | ||
| Map<String, Size> acc_to_prot_thread; // map: accessions --> FASTA protein index | ||
| AhoCorasickAmbiguous fuzzyAC; | ||
| //AhoCorasickAmbiguous fuzzyAC; | ||
|
|
||
| String prot; | ||
|
|
||
| while (true) | ||
| { | ||
| #pragma omp barrier // all threads need to be here, since we are about to swap protein data | ||
| #pragma omp single | ||
| { | ||
| DEBUG_ONLY std::cerr << " activating cache ...\n"; | ||
| //DEBUG_ONLY std::cerr << " activating cache ...\n"; | ||
| has_active_data = proteins.activateCache(); // swap in last cache | ||
| protein_accessions.resize(proteins.getChunkOffset() + proteins.chunkSize()); | ||
| } // implicit barrier here | ||
|
|
||
| if (!has_active_data) break; // leave while-loop | ||
|
|
||
| SignedSize prot_count = (SignedSize)proteins.chunkSize(); | ||
| std::cout << "Prot Count " << prot_count << std::endl; | ||
|
|
||
| #pragma omp master | ||
| { | ||
| DEBUG_ONLY std::cerr << "Filling Protein Cache ..."; | ||
| //DEBUG_ONLY std::cerr << "Filling Protein Cache ..."; | ||
| proteins.cacheChunk(PROTEIN_CACHE_SIZE); | ||
| protein_is_decoy.resize(proteins.getChunkOffset() + prot_count); | ||
| for (SignedSize i = 0; i < prot_count; ++i) | ||
| { // do this in master only, to avoid false sharing | ||
| const String& seq = proteins.chunkAt(i).identifier; | ||
| protein_is_decoy[i + proteins.getChunkOffset()] = (prefix_ ? seq.hasPrefix(decoy_string_) : seq.hasSuffix(decoy_string_)); | ||
| } | ||
| DEBUG_ONLY std::cerr << " done" << std::endl; | ||
| //DEBUG_ONLY std::cerr << " done" << std::endl; | ||
| } | ||
| DEBUG_ONLY std::cerr << " starting for loop \n"; | ||
| auto t1 = std::chrono::high_resolution_clock::now(); | ||
| //DEBUG_ONLY std::cerr << " starting for loop \n"; | ||
| // search all peptides in each protein | ||
| #pragma omp for schedule(dynamic, 100) nowait | ||
| for (SignedSize i = 0; i < prot_count; ++i) | ||
|
|
@@ -471,7 +491,7 @@ namespace OpenMS | |
| while ((offset = prot.find(jumpX, offset + 1)) != std::string::npos) | ||
| { | ||
| //std::cout << "found X..X at " << offset << " in protein " << proteins[i].identifier << "\n"; | ||
| addHits_(fuzzyAC, pattern, pep_DB, prot.substr(start, offset + jumpX.size() - start), prot, prot_idx, (int)start, func_threads); | ||
| addHits_(fuzzyAC, pos, idx, aaa_max_, pep_DB, prot.substr(start, offset + jumpX.size() - start), prot, prot_idx, (int)start, func_threads); | ||
| // skip ahead while we encounter more X... | ||
| while (offset + jumpX.size() < prot.size() && prot[offset + jumpX.size()] == 'X') ++offset; | ||
| start = offset; | ||
|
|
@@ -480,12 +500,12 @@ namespace OpenMS | |
| // last chunk | ||
| if (start < prot.size()) | ||
| { | ||
| addHits_(fuzzyAC, pattern, pep_DB, prot.substr(start), prot, prot_idx, (int)start, func_threads); | ||
| addHits_(fuzzyAC, pos, idx, aaa_max_, pep_DB, prot.substr(start), prot, prot_idx, (int)start, func_threads); | ||
| } | ||
| } | ||
| else | ||
| { | ||
| addHits_(fuzzyAC, pattern, pep_DB, prot, prot, prot_idx, 0, func_threads); | ||
| addHits_(fuzzyAC, pos, idx, aaa_max_, pep_DB, prot, prot, prot_idx, 0, func_threads); | ||
| } | ||
| // was protein found? | ||
| if (hits_total < func_threads.filter_passed + func_threads.filter_rejected) | ||
|
|
@@ -495,11 +515,16 @@ namespace OpenMS | |
| } | ||
| } // end parallel FOR | ||
|
|
||
| auto t2 = std::chrono::high_resolution_clock::now(); | ||
| std::chrono::duration<double,std::milli> elapsed = t2 - t1; | ||
| std::cout << "Retrieval time of doubel-array based AC " << elapsed.count()/prot_count << " ms per Protein"<< std::endl; | ||
|
|
||
| // join results again | ||
| DEBUG_ONLY std::cerr << " critical now \n"; | ||
| // DEBUG_ONLY std::cerr << " critical now \n"; | ||
| #ifdef _OPENMP | ||
| #pragma omp critical(PeptideIndexer_joinAC) | ||
| #endif | ||
|
|
||
| { | ||
| s.start(); | ||
| // hits | ||
|
|
@@ -512,11 +537,11 @@ namespace OpenMS | |
| } // end readChunk | ||
| } // OMP end parallel | ||
| this->endProgress(); | ||
|
|
||
| std::cout << "Merge took: " << s.toString() << "\n"; | ||
| mu.after(); | ||
| std::cout << mu.delta("Aho-Corasick") << "\n\n"; | ||
|
|
||
| OPENMS_LOG_INFO << "\nAho-Corasick done:\n found " << func.filter_passed << " hits for " << func.pep_to_prot.size() << " of " << length(pep_DB) << " peptides.\n"; | ||
| OPENMS_LOG_INFO << "\nAho-Corasick done:\n found " << func.filter_passed << " hits for " << func.pep_to_prot.size() << " of " << pep_DB.size() << " peptides.\n"; | ||
|
|
||
| // write some stats | ||
| OPENMS_LOG_INFO << "Peptide hits passing enzyme filter: " << func.filter_passed << "\n" | ||
|
|
@@ -902,16 +927,15 @@ namespace OpenMS | |
|
|
||
| }; | ||
|
|
||
| inline void addHits_(AhoCorasickAmbiguous& fuzzyAC, const AhoCorasickAmbiguous::FuzzyACPattern& pattern, const AhoCorasickAmbiguous::PeptideDB& pep_DB, const String& prot, const String& full_prot, SignedSize idx_prot, Int offset, FoundProteinFunctor& func_threads) const | ||
| inline void addHits_(AhoCorasickDA& fuzzyAC, Size& pos_in_protein, Size& peptide_index, const Int amb_max, const std::vector<String>& pep_DB, const String& prot, const String& full_prot, SignedSize idx_prot, Int offset, FoundProteinFunctor& func_threads) const | ||
| { | ||
| fuzzyAC.setProtein(prot); | ||
| while (fuzzyAC.findNext(pattern)) | ||
| fuzzyAC.setProtein(prot.c_str(), amb_max); | ||
| while (fuzzyAC.findNext(pos_in_protein, peptide_index)) | ||
| { | ||
| const seqan::Peptide& tmp_pep = pep_DB[fuzzyAC.getHitDBIndex()]; | ||
| func_threads.addHit(fuzzyAC.getHitDBIndex(), idx_prot, length(tmp_pep), full_prot, fuzzyAC.getHitProteinPosition() + offset); | ||
| const String& tmp_pep = pep_DB[peptide_index]; | ||
| func_threads.addHit(peptide_index, idx_prot, tmp_pep.length(), full_prot, pos_in_protein + offset); | ||
| } | ||
| } | ||
|
|
||
| void updateMembers_() override; | ||
|
|
||
| String decoy_string_{}; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you get rid of the seqan includes?