From b93b2648683d261457571f7ff41fe2917132b412 Mon Sep 17 00:00:00 2001 From: kasrat93 Date: Wed, 31 Mar 2021 23:19:58 +0200 Subject: [PATCH 01/53] added myself as author --- AUTHORS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/AUTHORS b/AUTHORS index 0bfb4d827f3..0be2d875021 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,3 +1,4 @@ + ========================================================================= OpenMS -- Open Source Mass Spectrometry ========================================================================= @@ -85,6 +86,7 @@ the authors tag in the respective file header. - Swenja Wagner - Taraneh Strunk - Timo Sachsenberg + - Tinatin Kasradze - Tom Lukas Lankenau - Tom Waschischeck - Uwe Schmitt @@ -92,3 +94,4 @@ the authors tag in the respective file header. - Volker Mosthaf - Witold Wolski - Xiao Liang + From b12f802f07dbb899c49c0ff9f88223b6fe6f00cd Mon Sep 17 00:00:00 2001 From: kasrat93 Date: Wed, 31 Mar 2021 23:23:17 +0200 Subject: [PATCH 02/53] added myself as author --- AUTHORS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/AUTHORS b/AUTHORS index 0be2d875021..fc8da732d91 100644 --- a/AUTHORS +++ b/AUTHORS @@ -95,3 +95,5 @@ the authors tag in the respective file header. - Witold Wolski - Xiao Liang + + From 257a6b90237acfa5cf18de6dabe1b8393339356f Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Sat, 17 Apr 2021 16:58:42 +0200 Subject: [PATCH 03/53] FASTA-Reader-Files without using Seqan --- FASTAReader/FASTAFile_testnew.cpp | 163 +++++++++++++++++++++ FASTAReader/FASTAFile_testnew.fasta | 34 +++++ FASTAReader/FASTAFilenew.cpp | 214 ++++++++++++++++++++++++++++ FASTAReader/FASTAFilenew.h | 179 +++++++++++++++++++++++ 4 files changed, 590 insertions(+) create mode 100644 FASTAReader/FASTAFile_testnew.cpp create mode 100644 FASTAReader/FASTAFile_testnew.fasta create mode 100644 FASTAReader/FASTAFilenew.cpp create mode 100644 FASTAReader/FASTAFilenew.h diff --git a/FASTAReader/FASTAFile_testnew.cpp b/FASTAReader/FASTAFile_testnew.cpp new file mode 100644 index 00000000000..49a3738f195 --- /dev/null +++ b/FASTAReader/FASTAFile_testnew.cpp @@ -0,0 +1,163 @@ +#include +#include + +/////////////////////////// + +#include + +#include +#include +#include +#include + +#include + +/////////////////////////// + +START_TEST(FASTAFile, "$Id$") + +///////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////// + +using namespace OpenMS; +using namespace std; + +FASTAFile* ptr = nullptr; //wird der konstruktor getestet +START_SECTION((FASTAFile())) + ptr = new FASTAFile(); + TEST_EQUAL(ptr == nullptr, false) +END_SECTION + +START_SECTION((~FASTAFile())) //wird der destruktor getestet + delete(ptr); +END_SECTION + +START_SECTION([FASTAFile::FASTAEntry] FASTAEntry())//fasta entry wird getestet + FASTAFile::FASTAEntry * ptr_e; + ptr_e = new FASTAFile::FASTAEntry(); + TEST_EQUAL(ptr_e == nullptr, false) +END_SECTION + +START_SECTION([FASTAFile::FASTAEntry] FASTAEntry(String id, String desc, String seq)) + FASTAFile::FASTAEntry entry("ID", "DESC", "DAVLDELNER"); //fasta entry wird befüllt + TEST_EQUAL(entry.identifier, "ID") //hat das befüllen richtig geklappt + TEST_EQUAL(entry.description, "DESC") + TEST_EQUAL(entry.sequence, "DAVLDELNER") +END_SECTION + +START_SECTION([FASTAFile::FASTAEntry] bool operator==(const FASTAEntry &rhs) const)// == operator wird getestet + FASTAFile::FASTAEntry entry1("ID", "DESC", "DAV*LDELNER"); + FASTAFile::FASTAEntry entry2("ID", "DESC", "DAV*LDELNER"); + FASTAFile::FASTAEntry entry3("ID2", "DESC", "DAV*LDELNER"); + TEST_EQUAL(entry1==entry2, true) + TEST_EQUAL(entry1==entry3, false) +END_SECTION + + +START_SECTION((void load(const String& filename, std::vector< FASTAEntry > &data)))//load funktion wird getestet + vector data; + FASTAFile file; + + TEST_EXCEPTION(Exception::FileNotFound, file.load("FASTAFile_test_this_file_does_not_exist",data)) + + file.load(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta"),data); //in diese Datei auch falsche Zeichen einfügen und einen peff header?//in data ligt jetzt die test fasta datei + vector< FASTAFile::FASTAEntry >::const_iterator sequences_iterator = data.begin();//zum ersten protein gehen + TEST_EQUAL(data.size(), 5)//5 proteine + TEST_EQUAL(sequences_iterator->identifier, String("P68509|1433F_BOVIN")) //erstes protein vergleichen + TEST_EQUAL(sequences_iterator->description, String("This is the description of the first protein")) + TEST_EQUAL(sequences_iterator->sequence, String("GDREQLLQRARLAEQAERYDDMASAMKAVTEL") + + String("NEPLSNEDRNLLSVAYKNVVGARRSSWRVISSIEQKTMADGNEKKLEKVKAYREKIEKELETVC") + + String("NDVLALLDKFLIKNCNDFQYESKVFYLKMKGDYYRYLAEVASGEKKNSVVEASEAAYKEAFEIS") + + String("KEHMQPTHPIRLGLALNFSVFYYEIQNAPEQACLLAKQAFDDAIAELDTLNEDSYKDSTLIMQL") + + String("LRDNLTLWTSDQQDEEAGEGN")) + sequences_iterator++; //zum nächsten protein + TEST_EQUAL(sequences_iterator->identifier, "Q9CQV8|1433B_MOUSE") + TEST_EQUAL(sequences_iterator->sequence, String("TMDKSELVQKAKLAEQAERYDDMAAAMKAVTE") + + String("QGHELSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICND") + + String("VLELLDKYLILNATQAESKVFYLKMKGDYFRYLSEVASGENKQTTVSNSQQAYQEAFEISKKEMQ") + + String("PTHPIRLGLALNFSVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLT") + + String("LWTSENQGDEGDAGEGEN")) + sequences_iterator++; //in diese sequenz wurden falsche zeichen eingefügt und es wird getestet ob sie entfernt wurden + TEST_EQUAL(sequences_iterator->identifier, "sp|P31946|1433B_HUMAN") + TEST_EQUAL(sequences_iterator->description, String("14-3-3 protein beta/alpha OS=Homo sapiens GN=YWHAB PE=1 SV=3")) + TEST_EQUAL(sequences_iterator->sequence, String("MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") + + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") + + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") + + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") + + String("AGEGEN")) + + sequences_iterator++;//hier wurde ein PEFF header angefügt, der übersprungen werden soll + TEST_EQUAL(sequences_iterator->identifier, "sp|P00000|0000A_UNKNOWN") + TEST_EQUAL(sequences_iterator->description, String("Artificially modified version of sp|P31946|1433B_HUMAN")) + TEST_EQUAL(sequences_iterator->sequence, String("(ICPL:13C(6))MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") + + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") + + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") + + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") + + String("AGEGEN")) + + // test if the modifed sequence is convertable + AASequence aa = AASequence::fromString(sequences_iterator->sequence);//wieso war die modifiziert vorher? woran erkennt man das?-> im header! + TEST_EQUAL(aa.toUnmodifiedString(), String("MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") + + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") + + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") + + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") + + String("AGEGEN")) + + TEST_EQUAL(aa.isModified(), true) //wieso soll das true sein wenn es vorher zu unmodified converted wurde + String expectedModification = ModificationsDB::getInstance()->getModification("ICPL:13C(6)", "", ResidueModification::N_TERM)->getId(); + TEST_EQUAL(aa.getNTerminalModificationName(), expectedModification) + + sequences_iterator++; //last sequence, wenn fasta format keine zeilenumbrüche hat + TEST_EQUAL(sequences_iterator->identifier, "test") + TEST_EQUAL(sequences_iterator->description, String(" ##0")) + TEST_EQUAL(sequences_iterator->sequence, String("GSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVS") + + String("PAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSVGSMTVDMQEIGSTEMPYEVPTQ") + + String("PNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVSPAVYTFGL") + + String("FVQNASESLTSDDPSDVPTQRTFKSDFQSVAXXSTFDFYQRRLVTLAESPRAPSPGSMTV") + + String("DMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSF") + + String("KEKVSELVSPAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSV")) + +END_SECTION + +START_SECTION((void store(const String& filename, const std::vector< FASTAEntry > &data) const)) + vector data, data2; + String tmp_filename; + NEW_TMP_FILE(tmp_filename); + FASTAFile file; + + file.load(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta"),data); + TEST_EXCEPTION(Exception::UnableToCreateFile, file.store("/bla/bluff/blblb/sdfhsdjf/test.txt",data)) + + file.store(tmp_filename,data); + file.load(tmp_filename,data2); + TEST_EQUAL(data==data2,true); // vectoren mit fasta entries auf gleichheit testen +END_SECTION + +/* +START_SECTION([EXTRA] test_strange_symbols_in_sequence) + // test if * is read correctly (not changed into something weird like 'X') //diesen Teil hier anpassen bzw löschen, weil das oben getestet wurde und * nicht erlaubt sein sollte und außerdem während des einlesens gecheckt wird und nicht erst danach beim pushbacken + String tmp_filename; + NEW_TMP_FILE(tmp_filename); + FASTAFile file; + vector data, data2; + FASTAFile::FASTAEntry temp_entry; + temp_entry.identifier = String("P68509|1433F_BOVIN"); + temp_entry.description = String("This is the description of the first protein"); + temp_entry.sequence = String("GDREQLLQRAR*LAEQ*AERYDDMASAMKAVTEL"); + data.push_back(temp_entry); + data.push_back(temp_entry); // twice + + file.store(tmp_filename, data); + file.load(tmp_filename, data2); + + ABORT_IF(data2.size() != 2); + TEST_EQUAL(data2[0] == temp_entry, true); + TEST_EQUAL(data2[1] == temp_entry, true); + */ + +END_SECTION + +///////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////// +END_TEST + diff --git a/FASTAReader/FASTAFile_testnew.fasta b/FASTAReader/FASTAFile_testnew.fasta new file mode 100644 index 00000000000..16d892be41a --- /dev/null +++ b/FASTAReader/FASTAFile_testnew.fasta @@ -0,0 +1,34 @@ +>P68509|1433F_BOVIN This is the description of the first protein +GDREQLLQRARLAEQAERYDDMASAMKAVTELNEPLSNEDRNLLSVAYKNVVGARRSSWR +VISSIEQKTMADGNEKKLEKVKAYREKIEKELETVCNDVLALLDKFLIKNCNDFQYESKV +FYLKMKGDYYRYLAEVASGEKKNSVVEASEAAYKEAFEISKEHMQPTHPIRLGLALNFSV +FYYEIQNAPEQACLLAKQAFDDAIAELDTLNEDSYKDSTLIMQLLRDNLTLWTSDQQDEE +AGEGN + +>Q9CQV8|1433B_MOUSE This is the description of the second protein +TMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSSW +RVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLILNATQAESKVFYL +KMKGDYFRYLSEVASGENKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFYY +EILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGDA +GEGEN +>sp|P31946|1433B_HUMAN 14-3-3 protein beta/alpha OS=Homo sapiens GN=YWHAB PE=1 SV=3 +MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS +WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY +LKMKGDYFRYLSEVASGDNKQTT VSNS*QQAY5QEAFE!ISKKEMQPTHPIRLGLALNF +SVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQG +DEGDAGEGEN + +# PEFF Description block +# Decoy=false +# DbDescription=extract of neXtProt with manual modifications +# GeneralComment= A selection of protein entries +>sp|P00000|0000A_UNKNOWN Artificially modified version of sp|P31946|1433B_HUMAN +(ICPL:13C(6))MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL +SVAYKNVVGARRSSWRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKY +LIPNATQPESKVFYLKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTH +PIRLGLALNFSVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDN +LTLWTSENQGDEGDAGEGEN + +> test ##0 +GSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVSPAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSVGSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVSPAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSVAXXSTFDFYQRRLVTLAESPRAPSPGSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVSPAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSV + diff --git a/FASTAReader/FASTAFilenew.cpp b/FASTAReader/FASTAFilenew.cpp new file mode 100644 index 00000000000..f1b07d130b0 --- /dev/null +++ b/FASTAReader/FASTAFilenew.cpp @@ -0,0 +1,214 @@ +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +//welche includes können jetzt raus? + +//testen mit datei FASTAFile_test.cpp + + +namespace OpenMS +{ + using namespace std; + //typedef seqan::RecordReader > FASTARecordReader; //FASTARecordReader selbst definieren ohne Seqan + //record reader war ne klasse von seqan implementiert in src/openms/thirdparty/seqan/stream/record_reader_single.h + + + FASTAFile::FASTAFile() + : entries_read_(0) + { + } + + FASTAFile::~FASTAFile() + { + // infile_ and outfile_ will close automatically when going out of scope. No need to do it explicitly here. + } + + bool FASTAFile::readRecordNew(std::string & id, std::string & seq) + { + std::string line; + if(TextFile::getLine(infile_, line)) + { + id.push_back(line) + } + else return false; + while(TextFile::getLine(infile_, line) && line[0] != ">") + { + for(unsigned i=0; i<=line.length();++i)//removing illegal characters//besser erst nach dem einlesen vor dem assignen zur sequenz? + { + if(line[i]<='A' || line[i]>='Z')//ambigous akzeptieren wir hier + { + line.erase(i,i+1); + } + } + seq.push_back(line); + } + if(seq.empty()) return false; + return true; + } + + void FASTAFile::readStart(const String& filename) + { + if (!File::exists(filename)) + { + throw Exception::FileNotFound(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename); + } + + if (!File::readable(filename)) + { + throw Exception::FileNotReadable(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename); + } + + if (infile_.is_open()) infile_.close(); // precaution + + infile_.open(filename.c_str(), std::ios::binary | std::ios::in); + + infile_.seekg(0, infile_.end); + fileSize_ = infile_.tellg();//tell g returns input position + infile_.seekg(0, infile_.beg);//wieder position zurück zum anfang setzen + + + // Skip the header of PEFF files (http://www.psidev.info/peff) + std::string line; + std::streampos firstline = 0; + while (TextFile::getLine(infile_, line)) + { + if (!line.empty() && line[0] != '#')//wenn es kein header of PEFF files gibt + { + break; + } + firstline = infile_.tellg(); // tellg(): returns:The current position of the get pointer on success, pos_type(-1) on failure//wenn es einen header gibt wird dieser geskipped + } + infile_.seekg(firstline); + + entries_read_ = 0; + } + + bool FASTAFile::readNext(FASTAEntry& protein) + { + if (infile_.eof())// //könnte man sonst auch mit fileSize_ prüfen + { + return false; + } + + String id, s; + if (readRecordNew(id, s) != 0)//fehler beim einlesen + { + if (entries_read_ == 0) s = "The first entry could not be read!"; + else s = "Only " + String(entries_read_) + " proteins could be read. The record after failed."; + throw Exception::ParseError(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "", "Error while parsing FASTA file! " + s + " Please check the file!"); + } + ++entries_read_; + //s.removeWhitespaces();//funktion von openms? wird oben beim einlesen schon gecheckt + protein.sequence = s; // assign here, since 's' might have higher capacity, thus wasting memory (usually 10-15%) + + // handle id + id.trim(); + String::size_type position = id.find_first_of(" \v\t"); + if (position == String::npos) + { + protein.identifier = std::move(id); + protein.description = ""; + } + else + { + protein.identifier = id.substr(0, position); + protein.description = id.suffix(id.size() - position - 1); + } + return true; + } + + std::streampos FASTAFile::position() const + { + return infile_.tellg(); //ist das dann überhaupt vom typ streampos? -> kann umgewandelt werden + } + + bool FASTAFile::setPosition(const std::streampos& pos) + { + if(pos <= fileSize_) + { + infile_.seekg(pos);//relative to the beginning of the stream + return true; + } + return false; //expeption throwen dass es nicht in dem erlaubten bereich ist? + } + + bool FASTAFile::atEnd() const + { + return infile_.eof(); + } + + void FASTAFile::load(const String& filename, vector& data) + { + data.clear(); + FASTAEntry p; + FASTAFile f; + f.readStart(filename); + while (f.readNext(p)) + { + data.push_back(std::move(p)); + } + return; + } + + + void FASTAFile::writeStart(const String& filename) + { + if (!FileHandler::hasValidExtension(filename, FileTypes::FASTA)) + { + throw Exception::UnableToCreateFile(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename, "invalid file extension; expected '" + FileTypes::typeToName(FileTypes::FASTA) + "'"); + } + + outfile_.open(filename.c_str(), ofstream::out); + + if (!outfile_.good()) + { + throw Exception::UnableToCreateFile(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename); + } + } + + void FASTAFile::writeNext(const FASTAEntry& protein) + { + outfile_ << ">" << protein.identifier << " " << protein.description << "\n"; + const String& tmp(protein.sequence); + + int chunks( tmp.size()/80 ); // number of complete chunks + Size chunk_pos(0); + while (--chunks >= 0) + { + outfile_.write(&tmp[chunk_pos], 80); + outfile_ << "\n"; + chunk_pos += 80; + } + if (tmp.size() > chunk_pos) + { + outfile_.write(&tmp[chunk_pos], tmp.size() - chunk_pos); + outfile_ << "\n"; + } + } + + void FASTAFile::writeEnd() + { + outfile_.close(); + } + + void FASTAFile::store(const String& filename, const vector& data) + { + FASTAFile f; + f.writeStart(filename); + for (vector::const_iterator it = data.begin(); it != data.end(); ++it) + { + f.writeNext(*it); + } + f.writeEnd(); // close file + } + +} // namespace OpenMS + diff --git a/FASTAReader/FASTAFilenew.h b/FASTAReader/FASTAFilenew.h new file mode 100644 index 00000000000..5d99138b661 --- /dev/null +++ b/FASTAReader/FASTAFilenew.h @@ -0,0 +1,179 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +namespace OpenMS +{ + /** + @brief This class serves for reading in and writing FASTA files + If the protein/gene sequence contains unusual symbols (such as translation end (*)), + they will be kept! + You can use aggregate methods load() and store() to read/write a + set of protein sequences at the cost of memory. + + Or use single read/write of protein sequences using readStart(), readNext() + and writeStart(), writeNext(), writeEnd() for more memory efficiency. + Reading from one and writing to another FASTA file can be handled by + one single FASTAFile instance. + */ + + class OPENMS_DLLAPI FASTAFile + { +public: + /** + @brief FASTA entry type (identifier, description and sequence) + The first String corresponds to the identifier that is + written after the > in the FASTA file. The part after the + first whitespace is stored in description and the text + from the next line until the next > (exclusive) is stored + in sequence. + */ + struct FASTAEntry + { + String identifier; + String description; + String sequence; + + FASTAEntry() : + identifier(), + description(), + sequence() + { + } + + FASTAEntry(String id, String desc, String seq) : + identifier(id), + description(desc), + sequence(seq) + { + } + + FASTAEntry(const FASTAEntry& rhs) + : + identifier(rhs.identifier), + description(rhs.description), + sequence(rhs.sequence) + { + } + + FASTAEntry(FASTAEntry&& rhs) noexcept + : + identifier(::std::move(rhs.identifier)), + description(::std::move(rhs.description)), + sequence(::std::move(rhs.sequence)) + { + } + + FASTAEntry& operator=(const FASTAEntry& rhs) + { + if (*this == rhs) return *this; + identifier = rhs.identifier; + description = rhs.description; + sequence = rhs.sequence; + return *this; + } + + bool operator==(const FASTAEntry& rhs) const + { + return identifier == rhs.identifier + && description == rhs.description + && sequence == rhs.sequence; + } + + bool headerMatches(const FASTAEntry& rhs) const + { + return identifier == rhs.identifier && + description == rhs.description; + } + + bool sequenceMatches(const FASTAEntry& rhs) const + { + return sequence == rhs.sequence; + } + }; + + /// Default constructor + FASTAFile(); + + /// Destructor + virtual ~FASTAFile(); + + /** + @brief Prepares a FASTA file given by 'filename' for streamed reading using readNext(). + @exception Exception::FileNotFound is thrown if the file does not exists. + @exception Exception::ParseError is thrown if the file does not suit to the standard. + */ + void readStart(const String& filename); + + /** + @brief Reads the next FASTA entry from file. + If you want to read all entries in one go, use load(). + @return true if entry was read; false if eof was reached + @exception Exception::FileNotFound is thrown if the file does not exists. + @exception Exception::ParseError is thrown if the file does not suit to the standard. + */ + bool readNext(FASTAEntry& protein); + + /// current stream position + std::streampos position() const; + + /// is stream at EOF? + bool atEnd() const; + + /// seek stream to @p pos + bool setPosition(const std::streampos& pos); + + /** + @brief Prepares a FASTA file given by 'filename' for streamed writing using writeNext(). + @exception Exception::UnableToCreateFile is thrown if the process is not able to write to the file (disk full?). + */ + void writeStart(const String& filename); + + /** + @brief Stores the data given by @p protein. Call writeStart() once before calling writeNext(). + Call writeEnd() when done to close the file! + @exception Exception::UnableToCreateFile is thrown if the process is not able to write the file. + */ + void writeNext(const FASTAEntry& protein); + + /** + @brief Closes the file (flush). Called implicitly when FASTAFile object does out of scope. + */ + void writeEnd(); + + + /** + @brief loads a FASTA file given by 'filename' and stores the information in 'data' + This uses more RAM than readStart() and readNext(). + @exception Exception::FileNotFound is thrown if the file does not exists. + @exception Exception::ParseError is thrown if the file does not suit to the standard. + */ + void static load(const String& filename, std::vector& data); + + /** + @brief stores the data given by 'data' at the file 'filename' + + This uses more RAM than writeStart() and writeNext(). + @exception Exception::UnableToCreateFile is thrown if the process is not able to write the file. + */ + void static store(const String& filename, const std::vector& data); + + //eigene Implementierung des readRecord + bool FASTAFile::readRecordNew(std::string & id, std::string & seq); //die soll den vorhandenen infile_ benutzen + +protected: + std::fstream infile_; ///< filestream for reading; init using FastaFile::readStart() + std::ofstream outfile_; ///< filestream for writing; init using FastaFile::writeStart() + Size entries_read_; ///< some internal book-keeping during reading + unsigned fileSize_{}; + }; + +} // namespace OpenMS + From e8a38372f25d9accfd5f4ffbb8b5adf555ad90aa Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Mon, 19 Apr 2021 11:04:58 +0200 Subject: [PATCH 04/53] Deleted the file from the git repository --- FASTAReader/FASTAFilenew.cpp | 214 ----------------------------------- 1 file changed, 214 deletions(-) delete mode 100644 FASTAReader/FASTAFilenew.cpp diff --git a/FASTAReader/FASTAFilenew.cpp b/FASTAReader/FASTAFilenew.cpp deleted file mode 100644 index f1b07d130b0..00000000000 --- a/FASTAReader/FASTAFilenew.cpp +++ /dev/null @@ -1,214 +0,0 @@ -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -//welche includes können jetzt raus? - -//testen mit datei FASTAFile_test.cpp - - -namespace OpenMS -{ - using namespace std; - //typedef seqan::RecordReader > FASTARecordReader; //FASTARecordReader selbst definieren ohne Seqan - //record reader war ne klasse von seqan implementiert in src/openms/thirdparty/seqan/stream/record_reader_single.h - - - FASTAFile::FASTAFile() - : entries_read_(0) - { - } - - FASTAFile::~FASTAFile() - { - // infile_ and outfile_ will close automatically when going out of scope. No need to do it explicitly here. - } - - bool FASTAFile::readRecordNew(std::string & id, std::string & seq) - { - std::string line; - if(TextFile::getLine(infile_, line)) - { - id.push_back(line) - } - else return false; - while(TextFile::getLine(infile_, line) && line[0] != ">") - { - for(unsigned i=0; i<=line.length();++i)//removing illegal characters//besser erst nach dem einlesen vor dem assignen zur sequenz? - { - if(line[i]<='A' || line[i]>='Z')//ambigous akzeptieren wir hier - { - line.erase(i,i+1); - } - } - seq.push_back(line); - } - if(seq.empty()) return false; - return true; - } - - void FASTAFile::readStart(const String& filename) - { - if (!File::exists(filename)) - { - throw Exception::FileNotFound(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename); - } - - if (!File::readable(filename)) - { - throw Exception::FileNotReadable(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename); - } - - if (infile_.is_open()) infile_.close(); // precaution - - infile_.open(filename.c_str(), std::ios::binary | std::ios::in); - - infile_.seekg(0, infile_.end); - fileSize_ = infile_.tellg();//tell g returns input position - infile_.seekg(0, infile_.beg);//wieder position zurück zum anfang setzen - - - // Skip the header of PEFF files (http://www.psidev.info/peff) - std::string line; - std::streampos firstline = 0; - while (TextFile::getLine(infile_, line)) - { - if (!line.empty() && line[0] != '#')//wenn es kein header of PEFF files gibt - { - break; - } - firstline = infile_.tellg(); // tellg(): returns:The current position of the get pointer on success, pos_type(-1) on failure//wenn es einen header gibt wird dieser geskipped - } - infile_.seekg(firstline); - - entries_read_ = 0; - } - - bool FASTAFile::readNext(FASTAEntry& protein) - { - if (infile_.eof())// //könnte man sonst auch mit fileSize_ prüfen - { - return false; - } - - String id, s; - if (readRecordNew(id, s) != 0)//fehler beim einlesen - { - if (entries_read_ == 0) s = "The first entry could not be read!"; - else s = "Only " + String(entries_read_) + " proteins could be read. The record after failed."; - throw Exception::ParseError(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "", "Error while parsing FASTA file! " + s + " Please check the file!"); - } - ++entries_read_; - //s.removeWhitespaces();//funktion von openms? wird oben beim einlesen schon gecheckt - protein.sequence = s; // assign here, since 's' might have higher capacity, thus wasting memory (usually 10-15%) - - // handle id - id.trim(); - String::size_type position = id.find_first_of(" \v\t"); - if (position == String::npos) - { - protein.identifier = std::move(id); - protein.description = ""; - } - else - { - protein.identifier = id.substr(0, position); - protein.description = id.suffix(id.size() - position - 1); - } - return true; - } - - std::streampos FASTAFile::position() const - { - return infile_.tellg(); //ist das dann überhaupt vom typ streampos? -> kann umgewandelt werden - } - - bool FASTAFile::setPosition(const std::streampos& pos) - { - if(pos <= fileSize_) - { - infile_.seekg(pos);//relative to the beginning of the stream - return true; - } - return false; //expeption throwen dass es nicht in dem erlaubten bereich ist? - } - - bool FASTAFile::atEnd() const - { - return infile_.eof(); - } - - void FASTAFile::load(const String& filename, vector& data) - { - data.clear(); - FASTAEntry p; - FASTAFile f; - f.readStart(filename); - while (f.readNext(p)) - { - data.push_back(std::move(p)); - } - return; - } - - - void FASTAFile::writeStart(const String& filename) - { - if (!FileHandler::hasValidExtension(filename, FileTypes::FASTA)) - { - throw Exception::UnableToCreateFile(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename, "invalid file extension; expected '" + FileTypes::typeToName(FileTypes::FASTA) + "'"); - } - - outfile_.open(filename.c_str(), ofstream::out); - - if (!outfile_.good()) - { - throw Exception::UnableToCreateFile(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename); - } - } - - void FASTAFile::writeNext(const FASTAEntry& protein) - { - outfile_ << ">" << protein.identifier << " " << protein.description << "\n"; - const String& tmp(protein.sequence); - - int chunks( tmp.size()/80 ); // number of complete chunks - Size chunk_pos(0); - while (--chunks >= 0) - { - outfile_.write(&tmp[chunk_pos], 80); - outfile_ << "\n"; - chunk_pos += 80; - } - if (tmp.size() > chunk_pos) - { - outfile_.write(&tmp[chunk_pos], tmp.size() - chunk_pos); - outfile_ << "\n"; - } - } - - void FASTAFile::writeEnd() - { - outfile_.close(); - } - - void FASTAFile::store(const String& filename, const vector& data) - { - FASTAFile f; - f.writeStart(filename); - for (vector::const_iterator it = data.begin(); it != data.end(); ++it) - { - f.writeNext(*it); - } - f.writeEnd(); // close file - } - -} // namespace OpenMS - From c5d89511f1e8680f8217e1a6a11dcdace8b13d1e Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Mon, 19 Apr 2021 11:07:02 +0200 Subject: [PATCH 05/53] Deleted the file from the git repository --- FASTAReader/FASTAFilenew.h | 179 ------------------------------------- 1 file changed, 179 deletions(-) delete mode 100644 FASTAReader/FASTAFilenew.h diff --git a/FASTAReader/FASTAFilenew.h b/FASTAReader/FASTAFilenew.h deleted file mode 100644 index 5d99138b661..00000000000 --- a/FASTAReader/FASTAFilenew.h +++ /dev/null @@ -1,179 +0,0 @@ -#pragma once - -#include -#include - -#include -#include -#include -#include -#include - -namespace OpenMS -{ - /** - @brief This class serves for reading in and writing FASTA files - If the protein/gene sequence contains unusual symbols (such as translation end (*)), - they will be kept! - You can use aggregate methods load() and store() to read/write a - set of protein sequences at the cost of memory. - - Or use single read/write of protein sequences using readStart(), readNext() - and writeStart(), writeNext(), writeEnd() for more memory efficiency. - Reading from one and writing to another FASTA file can be handled by - one single FASTAFile instance. - */ - - class OPENMS_DLLAPI FASTAFile - { -public: - /** - @brief FASTA entry type (identifier, description and sequence) - The first String corresponds to the identifier that is - written after the > in the FASTA file. The part after the - first whitespace is stored in description and the text - from the next line until the next > (exclusive) is stored - in sequence. - */ - struct FASTAEntry - { - String identifier; - String description; - String sequence; - - FASTAEntry() : - identifier(), - description(), - sequence() - { - } - - FASTAEntry(String id, String desc, String seq) : - identifier(id), - description(desc), - sequence(seq) - { - } - - FASTAEntry(const FASTAEntry& rhs) - : - identifier(rhs.identifier), - description(rhs.description), - sequence(rhs.sequence) - { - } - - FASTAEntry(FASTAEntry&& rhs) noexcept - : - identifier(::std::move(rhs.identifier)), - description(::std::move(rhs.description)), - sequence(::std::move(rhs.sequence)) - { - } - - FASTAEntry& operator=(const FASTAEntry& rhs) - { - if (*this == rhs) return *this; - identifier = rhs.identifier; - description = rhs.description; - sequence = rhs.sequence; - return *this; - } - - bool operator==(const FASTAEntry& rhs) const - { - return identifier == rhs.identifier - && description == rhs.description - && sequence == rhs.sequence; - } - - bool headerMatches(const FASTAEntry& rhs) const - { - return identifier == rhs.identifier && - description == rhs.description; - } - - bool sequenceMatches(const FASTAEntry& rhs) const - { - return sequence == rhs.sequence; - } - }; - - /// Default constructor - FASTAFile(); - - /// Destructor - virtual ~FASTAFile(); - - /** - @brief Prepares a FASTA file given by 'filename' for streamed reading using readNext(). - @exception Exception::FileNotFound is thrown if the file does not exists. - @exception Exception::ParseError is thrown if the file does not suit to the standard. - */ - void readStart(const String& filename); - - /** - @brief Reads the next FASTA entry from file. - If you want to read all entries in one go, use load(). - @return true if entry was read; false if eof was reached - @exception Exception::FileNotFound is thrown if the file does not exists. - @exception Exception::ParseError is thrown if the file does not suit to the standard. - */ - bool readNext(FASTAEntry& protein); - - /// current stream position - std::streampos position() const; - - /// is stream at EOF? - bool atEnd() const; - - /// seek stream to @p pos - bool setPosition(const std::streampos& pos); - - /** - @brief Prepares a FASTA file given by 'filename' for streamed writing using writeNext(). - @exception Exception::UnableToCreateFile is thrown if the process is not able to write to the file (disk full?). - */ - void writeStart(const String& filename); - - /** - @brief Stores the data given by @p protein. Call writeStart() once before calling writeNext(). - Call writeEnd() when done to close the file! - @exception Exception::UnableToCreateFile is thrown if the process is not able to write the file. - */ - void writeNext(const FASTAEntry& protein); - - /** - @brief Closes the file (flush). Called implicitly when FASTAFile object does out of scope. - */ - void writeEnd(); - - - /** - @brief loads a FASTA file given by 'filename' and stores the information in 'data' - This uses more RAM than readStart() and readNext(). - @exception Exception::FileNotFound is thrown if the file does not exists. - @exception Exception::ParseError is thrown if the file does not suit to the standard. - */ - void static load(const String& filename, std::vector& data); - - /** - @brief stores the data given by 'data' at the file 'filename' - - This uses more RAM than writeStart() and writeNext(). - @exception Exception::UnableToCreateFile is thrown if the process is not able to write the file. - */ - void static store(const String& filename, const std::vector& data); - - //eigene Implementierung des readRecord - bool FASTAFile::readRecordNew(std::string & id, std::string & seq); //die soll den vorhandenen infile_ benutzen - -protected: - std::fstream infile_; ///< filestream for reading; init using FastaFile::readStart() - std::ofstream outfile_; ///< filestream for writing; init using FastaFile::writeStart() - Size entries_read_; ///< some internal book-keeping during reading - unsigned fileSize_{}; - }; - -} // namespace OpenMS - From 5ceedcbfc9fcd29ebfb5d9e1cff587f8acca6da4 Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Mon, 19 Apr 2021 11:07:49 +0200 Subject: [PATCH 06/53] Deleted the file from the git repository --- FASTAReader/FASTAFile_testnew.cpp | 163 ------------------------------ 1 file changed, 163 deletions(-) delete mode 100644 FASTAReader/FASTAFile_testnew.cpp diff --git a/FASTAReader/FASTAFile_testnew.cpp b/FASTAReader/FASTAFile_testnew.cpp deleted file mode 100644 index 49a3738f195..00000000000 --- a/FASTAReader/FASTAFile_testnew.cpp +++ /dev/null @@ -1,163 +0,0 @@ -#include -#include - -/////////////////////////// - -#include - -#include -#include -#include -#include - -#include - -/////////////////////////// - -START_TEST(FASTAFile, "$Id$") - -///////////////////////////////////////////////////////////// -///////////////////////////////////////////////////////////// - -using namespace OpenMS; -using namespace std; - -FASTAFile* ptr = nullptr; //wird der konstruktor getestet -START_SECTION((FASTAFile())) - ptr = new FASTAFile(); - TEST_EQUAL(ptr == nullptr, false) -END_SECTION - -START_SECTION((~FASTAFile())) //wird der destruktor getestet - delete(ptr); -END_SECTION - -START_SECTION([FASTAFile::FASTAEntry] FASTAEntry())//fasta entry wird getestet - FASTAFile::FASTAEntry * ptr_e; - ptr_e = new FASTAFile::FASTAEntry(); - TEST_EQUAL(ptr_e == nullptr, false) -END_SECTION - -START_SECTION([FASTAFile::FASTAEntry] FASTAEntry(String id, String desc, String seq)) - FASTAFile::FASTAEntry entry("ID", "DESC", "DAVLDELNER"); //fasta entry wird befüllt - TEST_EQUAL(entry.identifier, "ID") //hat das befüllen richtig geklappt - TEST_EQUAL(entry.description, "DESC") - TEST_EQUAL(entry.sequence, "DAVLDELNER") -END_SECTION - -START_SECTION([FASTAFile::FASTAEntry] bool operator==(const FASTAEntry &rhs) const)// == operator wird getestet - FASTAFile::FASTAEntry entry1("ID", "DESC", "DAV*LDELNER"); - FASTAFile::FASTAEntry entry2("ID", "DESC", "DAV*LDELNER"); - FASTAFile::FASTAEntry entry3("ID2", "DESC", "DAV*LDELNER"); - TEST_EQUAL(entry1==entry2, true) - TEST_EQUAL(entry1==entry3, false) -END_SECTION - - -START_SECTION((void load(const String& filename, std::vector< FASTAEntry > &data)))//load funktion wird getestet - vector data; - FASTAFile file; - - TEST_EXCEPTION(Exception::FileNotFound, file.load("FASTAFile_test_this_file_does_not_exist",data)) - - file.load(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta"),data); //in diese Datei auch falsche Zeichen einfügen und einen peff header?//in data ligt jetzt die test fasta datei - vector< FASTAFile::FASTAEntry >::const_iterator sequences_iterator = data.begin();//zum ersten protein gehen - TEST_EQUAL(data.size(), 5)//5 proteine - TEST_EQUAL(sequences_iterator->identifier, String("P68509|1433F_BOVIN")) //erstes protein vergleichen - TEST_EQUAL(sequences_iterator->description, String("This is the description of the first protein")) - TEST_EQUAL(sequences_iterator->sequence, String("GDREQLLQRARLAEQAERYDDMASAMKAVTEL") + - String("NEPLSNEDRNLLSVAYKNVVGARRSSWRVISSIEQKTMADGNEKKLEKVKAYREKIEKELETVC") + - String("NDVLALLDKFLIKNCNDFQYESKVFYLKMKGDYYRYLAEVASGEKKNSVVEASEAAYKEAFEIS") + - String("KEHMQPTHPIRLGLALNFSVFYYEIQNAPEQACLLAKQAFDDAIAELDTLNEDSYKDSTLIMQL") + - String("LRDNLTLWTSDQQDEEAGEGN")) - sequences_iterator++; //zum nächsten protein - TEST_EQUAL(sequences_iterator->identifier, "Q9CQV8|1433B_MOUSE") - TEST_EQUAL(sequences_iterator->sequence, String("TMDKSELVQKAKLAEQAERYDDMAAAMKAVTE") + - String("QGHELSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICND") + - String("VLELLDKYLILNATQAESKVFYLKMKGDYFRYLSEVASGENKQTTVSNSQQAYQEAFEISKKEMQ") + - String("PTHPIRLGLALNFSVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLT") + - String("LWTSENQGDEGDAGEGEN")) - sequences_iterator++; //in diese sequenz wurden falsche zeichen eingefügt und es wird getestet ob sie entfernt wurden - TEST_EQUAL(sequences_iterator->identifier, "sp|P31946|1433B_HUMAN") - TEST_EQUAL(sequences_iterator->description, String("14-3-3 protein beta/alpha OS=Homo sapiens GN=YWHAB PE=1 SV=3")) - TEST_EQUAL(sequences_iterator->sequence, String("MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") - + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") - + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") - + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") - + String("AGEGEN")) - - sequences_iterator++;//hier wurde ein PEFF header angefügt, der übersprungen werden soll - TEST_EQUAL(sequences_iterator->identifier, "sp|P00000|0000A_UNKNOWN") - TEST_EQUAL(sequences_iterator->description, String("Artificially modified version of sp|P31946|1433B_HUMAN")) - TEST_EQUAL(sequences_iterator->sequence, String("(ICPL:13C(6))MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") - + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") - + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") - + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") - + String("AGEGEN")) - - // test if the modifed sequence is convertable - AASequence aa = AASequence::fromString(sequences_iterator->sequence);//wieso war die modifiziert vorher? woran erkennt man das?-> im header! - TEST_EQUAL(aa.toUnmodifiedString(), String("MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") - + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") - + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") - + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") - + String("AGEGEN")) - - TEST_EQUAL(aa.isModified(), true) //wieso soll das true sein wenn es vorher zu unmodified converted wurde - String expectedModification = ModificationsDB::getInstance()->getModification("ICPL:13C(6)", "", ResidueModification::N_TERM)->getId(); - TEST_EQUAL(aa.getNTerminalModificationName(), expectedModification) - - sequences_iterator++; //last sequence, wenn fasta format keine zeilenumbrüche hat - TEST_EQUAL(sequences_iterator->identifier, "test") - TEST_EQUAL(sequences_iterator->description, String(" ##0")) - TEST_EQUAL(sequences_iterator->sequence, String("GSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVS") - + String("PAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSVGSMTVDMQEIGSTEMPYEVPTQ") - + String("PNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVSPAVYTFGL") - + String("FVQNASESLTSDDPSDVPTQRTFKSDFQSVAXXSTFDFYQRRLVTLAESPRAPSPGSMTV") - + String("DMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSF") - + String("KEKVSELVSPAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSV")) - -END_SECTION - -START_SECTION((void store(const String& filename, const std::vector< FASTAEntry > &data) const)) - vector data, data2; - String tmp_filename; - NEW_TMP_FILE(tmp_filename); - FASTAFile file; - - file.load(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta"),data); - TEST_EXCEPTION(Exception::UnableToCreateFile, file.store("/bla/bluff/blblb/sdfhsdjf/test.txt",data)) - - file.store(tmp_filename,data); - file.load(tmp_filename,data2); - TEST_EQUAL(data==data2,true); // vectoren mit fasta entries auf gleichheit testen -END_SECTION - -/* -START_SECTION([EXTRA] test_strange_symbols_in_sequence) - // test if * is read correctly (not changed into something weird like 'X') //diesen Teil hier anpassen bzw löschen, weil das oben getestet wurde und * nicht erlaubt sein sollte und außerdem während des einlesens gecheckt wird und nicht erst danach beim pushbacken - String tmp_filename; - NEW_TMP_FILE(tmp_filename); - FASTAFile file; - vector data, data2; - FASTAFile::FASTAEntry temp_entry; - temp_entry.identifier = String("P68509|1433F_BOVIN"); - temp_entry.description = String("This is the description of the first protein"); - temp_entry.sequence = String("GDREQLLQRAR*LAEQ*AERYDDMASAMKAVTEL"); - data.push_back(temp_entry); - data.push_back(temp_entry); // twice - - file.store(tmp_filename, data); - file.load(tmp_filename, data2); - - ABORT_IF(data2.size() != 2); - TEST_EQUAL(data2[0] == temp_entry, true); - TEST_EQUAL(data2[1] == temp_entry, true); - */ - -END_SECTION - -///////////////////////////////////////////////////////////// -///////////////////////////////////////////////////////////// -END_TEST - From 30c420f2937ca7b0b99e3aa5b093b3afabe472b6 Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Mon, 19 Apr 2021 11:08:26 +0200 Subject: [PATCH 07/53] Deleted the file from the git repository --- FASTAReader/FASTAFile_testnew.fasta | 34 ----------------------------- 1 file changed, 34 deletions(-) delete mode 100644 FASTAReader/FASTAFile_testnew.fasta diff --git a/FASTAReader/FASTAFile_testnew.fasta b/FASTAReader/FASTAFile_testnew.fasta deleted file mode 100644 index 16d892be41a..00000000000 --- a/FASTAReader/FASTAFile_testnew.fasta +++ /dev/null @@ -1,34 +0,0 @@ ->P68509|1433F_BOVIN This is the description of the first protein -GDREQLLQRARLAEQAERYDDMASAMKAVTELNEPLSNEDRNLLSVAYKNVVGARRSSWR -VISSIEQKTMADGNEKKLEKVKAYREKIEKELETVCNDVLALLDKFLIKNCNDFQYESKV -FYLKMKGDYYRYLAEVASGEKKNSVVEASEAAYKEAFEISKEHMQPTHPIRLGLALNFSV -FYYEIQNAPEQACLLAKQAFDDAIAELDTLNEDSYKDSTLIMQLLRDNLTLWTSDQQDEE -AGEGN - ->Q9CQV8|1433B_MOUSE This is the description of the second protein -TMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSSW -RVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLILNATQAESKVFYL -KMKGDYFRYLSEVASGENKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFYY -EILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGDA -GEGEN ->sp|P31946|1433B_HUMAN 14-3-3 protein beta/alpha OS=Homo sapiens GN=YWHAB PE=1 SV=3 -MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS -WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY -LKMKGDYFRYLSEVASGDNKQTT VSNS*QQAY5QEAFE!ISKKEMQPTHPIRLGLALNF -SVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQG -DEGDAGEGEN - -# PEFF Description block -# Decoy=false -# DbDescription=extract of neXtProt with manual modifications -# GeneralComment= A selection of protein entries ->sp|P00000|0000A_UNKNOWN Artificially modified version of sp|P31946|1433B_HUMAN -(ICPL:13C(6))MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL -SVAYKNVVGARRSSWRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKY -LIPNATQPESKVFYLKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTH -PIRLGLALNFSVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDN -LTLWTSENQGDEGDAGEGEN - -> test ##0 -GSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVSPAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSVGSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVSPAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSVAXXSTFDFYQRRLVTLAESPRAPSPGSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVSPAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSV - From 87d5034c76b19b053fda3a94a3d37a5046ba3c14 Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Mon, 19 Apr 2021 12:18:24 +0200 Subject: [PATCH 08/53] Fasta-Reader without using Seqan --- src/openms/include/OpenMS/FORMAT/FASTAFile.h | 320 +++++++++--------- src/openms/source/FORMAT/FASTAFile.cpp | 266 +++++++-------- .../openms/data/FASTAFile_test.fasta | 11 +- .../openms/source/FASTAFile_test.cpp | 270 +++++++-------- 4 files changed, 436 insertions(+), 431 deletions(-) diff --git a/src/openms/include/OpenMS/FORMAT/FASTAFile.h b/src/openms/include/OpenMS/FORMAT/FASTAFile.h index bb314c4a762..429e0c676f8 100644 --- a/src/openms/include/OpenMS/FORMAT/FASTAFile.h +++ b/src/openms/include/OpenMS/FORMAT/FASTAFile.h @@ -45,180 +45,180 @@ namespace OpenMS { - /** - @brief This class serves for reading in and writing FASTA files - - If the protein/gene sequence contains unusual symbols (such as translation end (*)), - they will be kept! - - You can use aggregate methods load() and store() to read/write a - set of protein sequences at the cost of memory. - - Or use single read/write of protein sequences using readStart(), readNext() - and writeStart(), writeNext(), writeEnd() for more memory efficiency. - Reading from one and writing to another FASTA file can be handled by - one single FASTAFile instance. - - */ - - class OPENMS_DLLAPI FASTAFile - { -public: - /** - @brief FASTA entry type (identifier, description and sequence) - - The first String corresponds to the identifier that is - written after the > in the FASTA file. The part after the - first whitespace is stored in description and the text - from the next line until the next > (exclusive) is stored - in sequence. - */ - struct FASTAEntry - { - String identifier; - String description; - String sequence; - - FASTAEntry() : - identifier(), - description(), - sequence() - { - } - - FASTAEntry(String id, String desc, String seq) : - identifier(id), - description(desc), - sequence(seq) - { - } - - FASTAEntry(const FASTAEntry& rhs) - : - identifier(rhs.identifier), - description(rhs.description), - sequence(rhs.sequence) - { - } - - FASTAEntry(FASTAEntry&& rhs) noexcept - : - identifier(::std::move(rhs.identifier)), - description(::std::move(rhs.description)), - sequence(::std::move(rhs.sequence)) - { - } - - FASTAEntry& operator=(const FASTAEntry& rhs) - { - if (*this == rhs) return *this; - identifier = rhs.identifier; - description = rhs.description; - sequence = rhs.sequence; - return *this; - } - - bool operator==(const FASTAEntry& rhs) const - { - return identifier == rhs.identifier - && description == rhs.description - && sequence == rhs.sequence; - } - - bool headerMatches(const FASTAEntry& rhs) const - { - return identifier == rhs.identifier && - description == rhs.description; - } - - bool sequenceMatches(const FASTAEntry& rhs) const - { - return sequence == rhs.sequence; - } - }; - - /// Default constructor - FASTAFile(); - - /// Destructor - virtual ~FASTAFile(); - - /** - @brief Prepares a FASTA file given by 'filename' for streamed reading using readNext(). - - @exception Exception::FileNotFound is thrown if the file does not exists. - @exception Exception::ParseError is thrown if the file does not suit to the standard. - */ - void readStart(const String& filename); - /** - @brief Reads the next FASTA entry from file. - - If you want to read all entries in one go, use load(). - - @return true if entry was read; false if eof was reached - @exception Exception::FileNotFound is thrown if the file does not exists. - @exception Exception::ParseError is thrown if the file does not suit to the standard. - */ - bool readNext(FASTAEntry& protein); + @brief This class serves for reading in and writing FASTA files - /// current stream position - std::streampos position() const; + If the protein/gene sequence contains unusual symbols (such as translation end (*)), + they will be kept! - /// is stream at EOF? - bool atEnd() const; + You can use aggregate methods load() and store() to read/write a + set of protein sequences at the cost of memory. - /// seek stream to @p pos - bool setPosition(const std::streampos& pos); + Or use single read/write of protein sequences using readStart(), readNext() + and writeStart(), writeNext(), writeEnd() for more memory efficiency. + Reading from one and writing to another FASTA file can be handled by + one single FASTAFile instance. - /** - @brief Prepares a FASTA file given by 'filename' for streamed writing using writeNext(). - - @exception Exception::UnableToCreateFile is thrown if the process is not able to write to the file (disk full?). */ - void writeStart(const String& filename); - - /** - @brief Stores the data given by @p protein. Call writeStart() once before calling writeNext(). - - Call writeEnd() when done to close the file! - @exception Exception::UnableToCreateFile is thrown if the process is not able to write the file. - */ - void writeNext(const FASTAEntry& protein); + class OPENMS_DLLAPI FASTAFile + { + public: + /** + @brief FASTA entry type (identifier, description and sequence) + + The first String corresponds to the identifier that is + written after the > in the FASTA file. The part after the + first whitespace is stored in description and the text + from the next line until the next > (exclusive) is stored + in sequence. + */ + struct FASTAEntry + { + String identifier; + String description; + String sequence; + + FASTAEntry() : + identifier(), + description(), + sequence() + { + } + + FASTAEntry(String id, String desc, String seq) : + identifier(id), + description(desc), + sequence(seq) + { + } + + FASTAEntry(const FASTAEntry& rhs) + : + identifier(rhs.identifier), + description(rhs.description), + sequence(rhs.sequence) + { + } + + FASTAEntry(FASTAEntry&& rhs) noexcept + : + identifier(::std::move(rhs.identifier)), + description(::std::move(rhs.description)), + sequence(::std::move(rhs.sequence)) + { + } + + FASTAEntry& operator=(const FASTAEntry& rhs) + { + if (*this == rhs) return *this; + identifier = rhs.identifier; + description = rhs.description; + sequence = rhs.sequence; + return *this; + } + + bool operator==(const FASTAEntry& rhs) const + { + return identifier == rhs.identifier + && description == rhs.description + && sequence == rhs.sequence; + } + + bool headerMatches(const FASTAEntry& rhs) const + { + return identifier == rhs.identifier && + description == rhs.description; + } + + bool sequenceMatches(const FASTAEntry& rhs) const + { + return sequence == rhs.sequence; + } + }; + + /// Default constructor + FASTAFile(); + + /// Destructor + virtual ~FASTAFile(); + + /** + @brief Prepares a FASTA file given by 'filename' for streamed reading using readNext(). + + @exception Exception::FileNotFound is thrown if the file does not exists. + @exception Exception::ParseError is thrown if the file does not suit to the standard. + */ + void readStart(const String& filename); + + /** + @brief Reads the next FASTA entry from file. + + If you want to read all entries in one go, use load(). + + @return true if entry was read; false if eof was reached + @exception Exception::FileNotFound is thrown if the file does not exists. + @exception Exception::ParseError is thrown if the file does not suit to the standard. + */ + bool readNext(FASTAEntry& protein); + + /// current stream position + std::streampos position() const; + + /// is stream at EOF? + bool atEnd() const; + + /// seek stream to @p pos + bool setPosition(const std::streampos& pos); + + /** + @brief Prepares a FASTA file given by 'filename' for streamed writing using writeNext(). + + @exception Exception::UnableToCreateFile is thrown if the process is not able to write to the file (disk full?). + */ + void writeStart(const String& filename); + + /** + @brief Stores the data given by @p protein. Call writeStart() once before calling writeNext(). + + Call writeEnd() when done to close the file! + + @exception Exception::UnableToCreateFile is thrown if the process is not able to write the file. + */ + void writeNext(const FASTAEntry& protein); + + /** + @brief Closes the file (flush). Called implicitly when FASTAFile object does out of scope. + + */ + void writeEnd(); - /** - @brief Closes the file (flush). Called implicitly when FASTAFile object does out of scope. - */ - void writeEnd(); - + /** + @brief loads a FASTA file given by 'filename' and stores the information in 'data' + + This uses more RAM than readStart() and readNext(). - /** - @brief loads a FASTA file given by 'filename' and stores the information in 'data' + @exception Exception::FileNotFound is thrown if the file does not exists. + @exception Exception::ParseError is thrown if the file does not suit to the standard. + */ + void static load(const String& filename, std::vector& data); - This uses more RAM than readStart() and readNext(). + /** + @brief stores the data given by 'data' at the file 'filename' - @exception Exception::FileNotFound is thrown if the file does not exists. - @exception Exception::ParseError is thrown if the file does not suit to the standard. - */ - void static load(const String& filename, std::vector& data); + This uses more RAM than writeStart() and writeNext(). - /** - @brief stores the data given by 'data' at the file 'filename' - - This uses more RAM than writeStart() and writeNext(). + @exception Exception::UnableToCreateFile is thrown if the process is not able to write the file. + */ + void static store(const String& filename, const std::vector& data); - @exception Exception::UnableToCreateFile is thrown if the process is not able to write the file. - */ - void static store(const String& filename, const std::vector& data); - -protected: - std::fstream infile_; ///< filestream for reading; init using FastaFile::readStart() - std::ofstream outfile_; ///< filestream for writing; init using FastaFile::writeStart() - std::unique_ptr > reader_; ///< filestream for reading; init using FastaFile::readStart(); needs to be a pointer, since its not copy-constructable; we use void* here, to avoid pulling in seqan includes - Size entries_read_; ///< some internal book-keeping during reading - }; + protected: + std::fstream infile_; ///< filestream for reading; init using FastaFile::readStart() + std::ofstream outfile_; ///< filestream for writing; init using FastaFile::writeStart() + std::unique_ptr > reader_; ///< filestream for reading; init using FastaFile::readStart(); needs to be a pointer, since its not copy-constructable; we use void* here, to avoid pulling in seqan includes + Size entries_read_; ///< some internal book-keeping during reading + }; } // namespace OpenMS diff --git a/src/openms/source/FORMAT/FASTAFile.cpp b/src/openms/source/FORMAT/FASTAFile.cpp index 3d0fe861014..cc344888a2c 100644 --- a/src/openms/source/FORMAT/FASTAFile.cpp +++ b/src/openms/source/FORMAT/FASTAFile.cpp @@ -46,172 +46,172 @@ namespace OpenMS { - using namespace std; - typedef seqan::RecordReader > FASTARecordReader; - - FASTAFile::FASTAFile() - : reader_(std::nullptr_t()), // point to nothing - entries_read_(0) - { - } - - FASTAFile::~FASTAFile() - { - // infile_ and outfile_ will close automatically when going out of scope. No need to do it explicitly here. - } - - void FASTAFile::readStart(const String& filename) - { - if (!File::exists(filename)) + using namespace std; + typedef seqan::RecordReader > FASTARecordReader; + + FASTAFile::FASTAFile() + : reader_(std::nullptr_t()), // point to nothing + entries_read_(0) { - throw Exception::FileNotFound(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename); } - if (!File::readable(filename)) + FASTAFile::~FASTAFile() { - throw Exception::FileNotReadable(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename); + // infile_ and outfile_ will close automatically when going out of scope. No need to do it explicitly here. } - if (infile_.is_open()) infile_.close(); // precaution + void FASTAFile::readStart(const String& filename) + { + if (!File::exists(filename)) + { + throw Exception::FileNotFound(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename); + } - infile_.open(filename.c_str(), std::ios::binary | std::ios::in); + if (!File::readable(filename)) + { + throw Exception::FileNotReadable(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename); + } - // Skip the header of PEFF files (http://www.psidev.info/peff) - std::string line; - std::streampos firstline = 0; - while (TextFile::getLine(infile_, line)) - { - if (!line.empty() && line[0] != '#') + if (infile_.is_open()) infile_.close(); // precaution + + infile_.open(filename.c_str(), std::ios::binary | std::ios::in); + + // Skip the header of PEFF files (http://www.psidev.info/peff) + std::string line; + std::streampos firstline = 0; + while (TextFile::getLine(infile_, line)) { - break; + if (!line.empty() && line[0] != '#') + { + break; + } + firstline = infile_.tellg(); } - firstline = infile_.tellg(); - } - infile_.seekg(firstline); - - // automatically deletes old handles - reader_ = std::unique_ptr >(new FASTARecordReader(infile_), - [](void* ptr) - { // lambda with custom cast - delete static_cast(ptr); - }); - - entries_read_ = 0; - } - - bool FASTAFile::readNext(FASTAEntry& protein) - { - if (seqan::atEnd(*static_cast(reader_.get()))) - { - // do NOT close(), since we still might want to seek to certain positions - return false; + infile_.seekg(firstline); + + // automatically deletes old handles + reader_ = std::unique_ptr >(new FASTARecordReader(infile_), + [](void* ptr) + { // lambda with custom cast + delete static_cast(ptr); + }); + + entries_read_ = 0; } - String id, s; - if (readRecord(id, s, *static_cast(reader_.get()), seqan::Fasta()) != 0) + + bool FASTAFile::readNext(FASTAEntry& protein) { - if (entries_read_ == 0) s = "The first entry could not be read!"; - else s = "Only " + String(entries_read_) + " proteins could be read. The record after failed."; - throw Exception::ParseError(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "", "Error while parsing FASTA file! " + s + " Please check the file!"); + if (seqan::atEnd(*static_cast(reader_.get()))) + { + // do NOT close(), since we still might want to seek to certain positions + return false; + } + String id, s; + if (readRecord(id, s, *static_cast(reader_.get()), seqan::Fasta()) != 0) + { + if (entries_read_ == 0) s = "The first entry could not be read!"; + else s = "Only " + String(entries_read_) + " proteins could be read. The record after failed."; + throw Exception::ParseError(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "", "Error while parsing FASTA file! " + s + " Please check the file!"); + } + ++entries_read_; + s.removeWhitespaces(); + protein.sequence = s; // assign here, since 's' might have higher capacity, thus wasting memory (usually 10-15%) + + // handle id + id.trim(); + String::size_type position = id.find_first_of(" \v\t"); + if (position == String::npos) + { + protein.identifier = std::move(id); + protein.description = ""; + } + else + { + protein.identifier = id.substr(0, position); + protein.description = id.suffix(id.size() - position - 1); + } + + return true; } - ++entries_read_; - s.removeWhitespaces(); - protein.sequence = s; // assign here, since 's' might have higher capacity, thus wasting memory (usually 10-15%) - - // handle id - id.trim(); - String::size_type position = id.find_first_of(" \v\t"); - if (position == String::npos) + + std::streampos FASTAFile::position() const { - protein.identifier = std::move(id); - protein.description = ""; + return seqan::position(*static_cast(reader_.get())); } - else + + bool FASTAFile::setPosition(const std::streampos& pos) { - protein.identifier = id.substr(0, position); - protein.description = id.suffix(id.size() - position - 1); + return (seqan::setPosition(*static_cast(reader_.get()), pos) == 0); } - return true; - } - - std::streampos FASTAFile::position() const - { - return seqan::position(*static_cast(reader_.get())); - } - - bool FASTAFile::setPosition(const std::streampos& pos) - { - return (seqan::setPosition(*static_cast(reader_.get()), pos) == 0); - } - - bool FASTAFile::atEnd() const - { - return seqan::atEnd(*static_cast(reader_.get())); - } - - void FASTAFile::load(const String& filename, vector& data) - { - data.clear(); - FASTAEntry p; - FASTAFile f; - f.readStart(filename); - while (f.readNext(p)) + bool FASTAFile::atEnd() const { - data.push_back(std::move(p)); + return seqan::atEnd(*static_cast(reader_.get())); } - return; - } - void FASTAFile::writeStart(const String& filename) - { - if (!FileHandler::hasValidExtension(filename, FileTypes::FASTA)) + void FASTAFile::load(const String& filename, vector& data) { - throw Exception::UnableToCreateFile(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename, "invalid file extension; expected '" + FileTypes::typeToName(FileTypes::FASTA) + "'"); + data.clear(); + FASTAEntry p; + FASTAFile f; + f.readStart(filename); + while (f.readNext(p)) + { + data.push_back(std::move(p)); + } + return; } - outfile_.open(filename.c_str(), ofstream::out); - - if (!outfile_.good()) + void FASTAFile::writeStart(const String& filename) { - throw Exception::UnableToCreateFile(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename); - } - } + if (!FileHandler::hasValidExtension(filename, FileTypes::FASTA)) + { + throw Exception::UnableToCreateFile(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename, "invalid file extension; expected '" + FileTypes::typeToName(FileTypes::FASTA) + "'"); + } - void FASTAFile::writeNext(const FASTAEntry& protein) - { - outfile_ << ">" << protein.identifier << " " << protein.description << "\n"; - const String& tmp(protein.sequence); + outfile_.open(filename.c_str(), ofstream::out); - int chunks( tmp.size()/80 ); // number of complete chunks - Size chunk_pos(0); - while (--chunks >= 0) + if (!outfile_.good()) + { + throw Exception::UnableToCreateFile(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename); + } + } + + void FASTAFile::writeNext(const FASTAEntry& protein) { - outfile_.write(&tmp[chunk_pos], 80); - outfile_ << "\n"; - chunk_pos += 80; + outfile_ << ">" << protein.identifier << " " << protein.description << "\n"; + const String& tmp(protein.sequence); + + int chunks( tmp.size()/80 ); // number of complete chunks + Size chunk_pos(0); + while (--chunks >= 0) + { + outfile_.write(&tmp[chunk_pos], 80); + outfile_ << "\n"; + chunk_pos += 80; + } + + if (tmp.size() > chunk_pos) + { + outfile_.write(&tmp[chunk_pos], tmp.size() - chunk_pos); + outfile_ << "\n"; + } } - if (tmp.size() > chunk_pos) + void FASTAFile::writeEnd() { - outfile_.write(&tmp[chunk_pos], tmp.size() - chunk_pos); - outfile_ << "\n"; + outfile_.close(); } - } - - void FASTAFile::writeEnd() - { - outfile_.close(); - } - - void FASTAFile::store(const String& filename, const vector& data) - { - FASTAFile f; - f.writeStart(filename); - for (vector::const_iterator it = data.begin(); it != data.end(); ++it) + + void FASTAFile::store(const String& filename, const vector& data) { - f.writeNext(*it); + FASTAFile f; + f.writeStart(filename); + for (vector::const_iterator it = data.begin(); it != data.end(); ++it) + { + f.writeNext(*it); + } + f.writeEnd(); // close file } - f.writeEnd(); // close file - } } // namespace OpenMS diff --git a/src/tests/class_tests/openms/data/FASTAFile_test.fasta b/src/tests/class_tests/openms/data/FASTAFile_test.fasta index 8e573a1a6af..aceeecca384 100755 --- a/src/tests/class_tests/openms/data/FASTAFile_test.fasta +++ b/src/tests/class_tests/openms/data/FASTAFile_test.fasta @@ -14,9 +14,14 @@ GEGEN >sp|P31946|1433B_HUMAN 14-3-3 protein beta/alpha OS=Homo sapiens GN=YWHAB PE=1 SV=3 MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY -LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY -YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD -AGEGEN +LKMKGDYFRYLSEVASGDNKQTT VSNS*QQAY5QEAFE!ISKKEMQPTHPIRLGLALNF +SVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQG +DEGDAGEGEN + +# PEFF Description block +# Decoy=false +# DbDescription=extract of neXtProt with manual modifications +# GeneralComment= A selection of protein entries >sp|P00000|0000A_UNKNOWN Artificially modified version of sp|P31946|1433B_HUMAN (ICPL:13C(6))MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL SVAYKNVVGARRSSWRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKY diff --git a/src/tests/class_tests/openms/source/FASTAFile_test.cpp b/src/tests/class_tests/openms/source/FASTAFile_test.cpp index 305f5734f91..45c804edcda 100644 --- a/src/tests/class_tests/openms/source/FASTAFile_test.cpp +++ b/src/tests/class_tests/openms/source/FASTAFile_test.cpp @@ -53,141 +53,141 @@ START_TEST(FASTAFile, "$Id$") ///////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////// -using namespace OpenMS; -using namespace std; - -FASTAFile* ptr = nullptr; -START_SECTION((FASTAFile())) - ptr = new FASTAFile(); - TEST_EQUAL(ptr == nullptr, false) -END_SECTION - -START_SECTION((~FASTAFile())) - delete(ptr); -END_SECTION - -START_SECTION([FASTAFile::FASTAEntry] FASTAEntry()) - FASTAFile::FASTAEntry * ptr_e; - ptr_e = new FASTAFile::FASTAEntry(); - TEST_EQUAL(ptr_e == nullptr, false) -END_SECTION - -START_SECTION([FASTAFile::FASTAEntry] FASTAEntry(String id, String desc, String seq)) - FASTAFile::FASTAEntry entry("ID", "DESC", "DAVLDELNER"); - TEST_EQUAL(entry.identifier, "ID") - TEST_EQUAL(entry.description, "DESC") - TEST_EQUAL(entry.sequence, "DAVLDELNER") -END_SECTION - -START_SECTION([FASTAFile::FASTAEntry] bool operator==(const FASTAEntry &rhs) const) - FASTAFile::FASTAEntry entry1("ID", "DESC", "DAV*LDELNER"); - FASTAFile::FASTAEntry entry2("ID", "DESC", "DAV*LDELNER"); - FASTAFile::FASTAEntry entry3("ID2", "DESC", "DAV*LDELNER"); - TEST_EQUAL(entry1==entry2, true) - TEST_EQUAL(entry1==entry3, false) -END_SECTION - - -START_SECTION((void load(const String& filename, std::vector< FASTAEntry > &data))) - vector data; - FASTAFile file; - - TEST_EXCEPTION(Exception::FileNotFound, file.load("FASTAFile_test_this_file_does_not_exist",data)) - - file.load(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta"),data); - vector< FASTAFile::FASTAEntry >::const_iterator sequences_iterator = data.begin(); - TEST_EQUAL(data.size(), 5) - TEST_EQUAL(sequences_iterator->identifier, String("P68509|1433F_BOVIN")) - TEST_EQUAL(sequences_iterator->description, String("This is the description of the first protein")) - TEST_EQUAL(sequences_iterator->sequence, String("GDREQLLQRARLAEQAERYDDMASAMKAVTEL") + - String("NEPLSNEDRNLLSVAYKNVVGARRSSWRVISSIEQKTMADGNEKKLEKVKAYREKIEKELETVC") + - String("NDVLALLDKFLIKNCNDFQYESKVFYLKMKGDYYRYLAEVASGEKKNSVVEASEAAYKEAFEIS") + - String("KEHMQPTHPIRLGLALNFSVFYYEIQNAPEQACLLAKQAFDDAIAELDTLNEDSYKDSTLIMQL") + - String("LRDNLTLWTSDQQDEEAGEGN")) - sequences_iterator++; - TEST_EQUAL(sequences_iterator->identifier, "Q9CQV8|1433B_MOUSE") - TEST_EQUAL(sequences_iterator->sequence, String("TMDKSELVQKAKLAEQAERYDDMAAAMKAVTE") + - String("QGHELSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICND") + - String("VLELLDKYLILNATQAESKVFYLKMKGDYFRYLSEVASGENKQTTVSNSQQAYQEAFEISKKEMQ") + - String("PTHPIRLGLALNFSVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLT") + - String("LWTSENQGDEGDAGEGEN")) - sequences_iterator++; - TEST_EQUAL(sequences_iterator->identifier, "sp|P31946|1433B_HUMAN") - TEST_EQUAL(sequences_iterator->description, String("14-3-3 protein beta/alpha OS=Homo sapiens GN=YWHAB PE=1 SV=3")) - TEST_EQUAL(sequences_iterator->sequence, String("MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") - + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") - + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") - + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") - + String("AGEGEN")) - - sequences_iterator++; - TEST_EQUAL(sequences_iterator->identifier, "sp|P00000|0000A_UNKNOWN") - TEST_EQUAL(sequences_iterator->description, String("Artificially modified version of sp|P31946|1433B_HUMAN")) - TEST_EQUAL(sequences_iterator->sequence, String("(ICPL:13C(6))MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") - + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") - + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") - + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") - + String("AGEGEN")) - - // test if the modifed sequence is convertable - AASequence aa = AASequence::fromString(sequences_iterator->sequence); - TEST_EQUAL(aa.toUnmodifiedString(), String("MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") - + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") - + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") - + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") - + String("AGEGEN")) - - TEST_EQUAL(aa.isModified(), true) - String expectedModification = ModificationsDB::getInstance()->getModification("ICPL:13C(6)", "", ResidueModification::N_TERM)->getId(); - TEST_EQUAL(aa.getNTerminalModificationName(), expectedModification) - - sequences_iterator++; - TEST_EQUAL(sequences_iterator->identifier, "test") - TEST_EQUAL(sequences_iterator->description, String(" ##0")) - TEST_EQUAL(sequences_iterator->sequence, String("GSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVS") - + String("PAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSVGSMTVDMQEIGSTEMPYEVPTQ") - + String("PNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVSPAVYTFGL") - + String("FVQNASESLTSDDPSDVPTQRTFKSDFQSVAXXSTFDFYQRRLVTLAESPRAPSPGSMTV") - + String("DMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSF") - + String("KEKVSELVSPAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSV")) - -END_SECTION - -START_SECTION((void store(const String& filename, const std::vector< FASTAEntry > &data) const)) - vector data, data2; - String tmp_filename; - NEW_TMP_FILE(tmp_filename); - FASTAFile file; - - file.load(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta"),data); - TEST_EXCEPTION(Exception::UnableToCreateFile, file.store("/bla/bluff/blblb/sdfhsdjf/test.txt",data)) - - file.store(tmp_filename,data); - file.load(tmp_filename,data2); - TEST_EQUAL(data==data2,true); -END_SECTION - -START_SECTION([EXTRA] test_strange_symbols_in_sequence) - // test if * is read correctly (not changed into something weird like 'X') - String tmp_filename; - NEW_TMP_FILE(tmp_filename); - FASTAFile file; - vector data, data2; - FASTAFile::FASTAEntry temp_entry; - temp_entry.identifier = String("P68509|1433F_BOVIN"); - temp_entry.description = String("This is the description of the first protein"); - temp_entry.sequence = String("GDREQLLQRAR*LAEQ*AERYDDMASAMKAVTEL"); - data.push_back(temp_entry); - data.push_back(temp_entry); // twice - - file.store(tmp_filename, data); - file.load(tmp_filename, data2); - - ABORT_IF(data2.size() != 2); - TEST_EQUAL(data2[0] == temp_entry, true); - TEST_EQUAL(data2[1] == temp_entry, true); - -END_SECTION + using namespace OpenMS; + using namespace std; + + FASTAFile* ptr = nullptr; + START_SECTION((FASTAFile())) + ptr = new FASTAFile(); + TEST_EQUAL(ptr == nullptr, false) + END_SECTION + + START_SECTION((~FASTAFile())) + delete(ptr); + END_SECTION + + START_SECTION([FASTAFile::FASTAEntry] FASTAEntry()) + FASTAFile::FASTAEntry * ptr_e; + ptr_e = new FASTAFile::FASTAEntry(); + TEST_EQUAL(ptr_e == nullptr, false) + END_SECTION + + START_SECTION([FASTAFile::FASTAEntry] FASTAEntry(String id, String desc, String seq)) + FASTAFile::FASTAEntry entry("ID", "DESC", "DAVLDELNER"); + TEST_EQUAL(entry.identifier, "ID") + TEST_EQUAL(entry.description, "DESC") + TEST_EQUAL(entry.sequence, "DAVLDELNER") + END_SECTION + + START_SECTION([FASTAFile::FASTAEntry] bool operator==(const FASTAEntry &rhs) const) + FASTAFile::FASTAEntry entry1("ID", "DESC", "DAV*LDELNER"); + FASTAFile::FASTAEntry entry2("ID", "DESC", "DAV*LDELNER"); + FASTAFile::FASTAEntry entry3("ID2", "DESC", "DAV*LDELNER"); + TEST_EQUAL(entry1==entry2, true) + TEST_EQUAL(entry1==entry3, false) + END_SECTION + + + START_SECTION((void load(const String& filename, std::vector< FASTAEntry > &data))) + vector data; + FASTAFile file; + + TEST_EXCEPTION(Exception::FileNotFound, file.load("FASTAFile_test_this_file_does_not_exist",data)) + + file.load(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta"),data); + vector< FASTAFile::FASTAEntry >::const_iterator sequences_iterator = data.begin(); + TEST_EQUAL(data.size(), 5) + TEST_EQUAL(sequences_iterator->identifier, String("P68509|1433F_BOVIN")) + TEST_EQUAL(sequences_iterator->description, String("This is the description of the first protein")) + TEST_EQUAL(sequences_iterator->sequence, String("GDREQLLQRARLAEQAERYDDMASAMKAVTEL") + + String("NEPLSNEDRNLLSVAYKNVVGARRSSWRVISSIEQKTMADGNEKKLEKVKAYREKIEKELETVC") + + String("NDVLALLDKFLIKNCNDFQYESKVFYLKMKGDYYRYLAEVASGEKKNSVVEASEAAYKEAFEIS") + + String("KEHMQPTHPIRLGLALNFSVFYYEIQNAPEQACLLAKQAFDDAIAELDTLNEDSYKDSTLIMQL") + + String("LRDNLTLWTSDQQDEEAGEGN")) + sequences_iterator++; + TEST_EQUAL(sequences_iterator->identifier, "Q9CQV8|1433B_MOUSE") + TEST_EQUAL(sequences_iterator->sequence, String("TMDKSELVQKAKLAEQAERYDDMAAAMKAVTE") + + String("QGHELSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICND") + + String("VLELLDKYLILNATQAESKVFYLKMKGDYFRYLSEVASGENKQTTVSNSQQAYQEAFEISKKEMQ") + + String("PTHPIRLGLALNFSVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLT") + + String("LWTSENQGDEGDAGEGEN")) + sequences_iterator++; + TEST_EQUAL(sequences_iterator->identifier, "sp|P31946|1433B_HUMAN") + TEST_EQUAL(sequences_iterator->description, String("14-3-3 protein beta/alpha OS=Homo sapiens GN=YWHAB PE=1 SV=3")) + TEST_EQUAL(sequences_iterator->sequence, String("MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") + + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") + + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") + + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") + + String("AGEGEN")) + + sequences_iterator++; + TEST_EQUAL(sequences_iterator->identifier, "sp|P00000|0000A_UNKNOWN") + TEST_EQUAL(sequences_iterator->description, String("Artificially modified version of sp|P31946|1433B_HUMAN")) + TEST_EQUAL(sequences_iterator->sequence, String("(ICPL:13C(6))MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") + + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") + + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") + + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") + + String("AGEGEN")) + + // test if the modifed sequence is convertable + AASequence aa = AASequence::fromString(sequences_iterator->sequence); + TEST_EQUAL(aa.toUnmodifiedString(), String("MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") + + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") + + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") + + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") + + String("AGEGEN")) + + TEST_EQUAL(aa.isModified(), true) + String expectedModification = ModificationsDB::getInstance()->getModification("ICPL:13C(6)", "", ResidueModification::N_TERM)->getId(); + TEST_EQUAL(aa.getNTerminalModificationName(), expectedModification) + + sequences_iterator++; + TEST_EQUAL(sequences_iterator->identifier, "test") + TEST_EQUAL(sequences_iterator->description, String(" ##0")) + TEST_EQUAL(sequences_iterator->sequence, String("GSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVS") + + String("PAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSVGSMTVDMQEIGSTEMPYEVPTQ") + + String("PNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVSPAVYTFGL") + + String("FVQNASESLTSDDPSDVPTQRTFKSDFQSVAXXSTFDFYQRRLVTLAESPRAPSPGSMTV") + + String("DMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSF") + + String("KEKVSELVSPAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSV")) + + END_SECTION + + START_SECTION((void store(const String& filename, const std::vector< FASTAEntry > &data) const)) + vector data, data2; + String tmp_filename; + NEW_TMP_FILE(tmp_filename); + FASTAFile file; + + file.load(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta"),data); + TEST_EXCEPTION(Exception::UnableToCreateFile, file.store("/bla/bluff/blblb/sdfhsdjf/test.txt",data)) + + file.store(tmp_filename,data); + file.load(tmp_filename,data2); + TEST_EQUAL(data==data2,true); + END_SECTION + + START_SECTION([EXTRA] test_strange_symbols_in_sequence) + // test if * is read correctly (not changed into something weird like 'X') + String tmp_filename; + NEW_TMP_FILE(tmp_filename); + FASTAFile file; + vector data, data2; + FASTAFile::FASTAEntry temp_entry; + temp_entry.identifier = String("P68509|1433F_BOVIN"); + temp_entry.description = String("This is the description of the first protein"); + temp_entry.sequence = String("GDREQLLQRAR*LAEQ*AERYDDMASAMKAVTEL"); + data.push_back(temp_entry); + data.push_back(temp_entry); // twice + + file.store(tmp_filename, data); + file.load(tmp_filename, data2); + + ABORT_IF(data2.size() != 2); + TEST_EQUAL(data2[0] == temp_entry, true); + TEST_EQUAL(data2[1] == temp_entry, true); + + END_SECTION ///////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////// From 42c320e12395bc0e066454d86fb75d4207ad793f Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Mon, 19 Apr 2021 12:29:24 +0200 Subject: [PATCH 09/53] Fasta-Reader without using Seqan --- src/openms/include/OpenMS/FORMAT/FASTAFile.h | 19 +- src/openms/source/FORMAT/FASTAFile.cpp | 240 +++++++++------- .../openms/source/FASTAFile_test.cpp | 272 +++++++++--------- 3 files changed, 271 insertions(+), 260 deletions(-) diff --git a/src/openms/include/OpenMS/FORMAT/FASTAFile.h b/src/openms/include/OpenMS/FORMAT/FASTAFile.h index 429e0c676f8..6b9406f377d 100644 --- a/src/openms/include/OpenMS/FORMAT/FASTAFile.h +++ b/src/openms/include/OpenMS/FORMAT/FASTAFile.h @@ -47,10 +47,8 @@ namespace OpenMS { /** @brief This class serves for reading in and writing FASTA files - If the protein/gene sequence contains unusual symbols (such as translation end (*)), they will be kept! - You can use aggregate methods load() and store() to read/write a set of protein sequences at the cost of memory. @@ -58,7 +56,6 @@ namespace OpenMS and writeStart(), writeNext(), writeEnd() for more memory efficiency. Reading from one and writing to another FASTA file can be handled by one single FASTAFile instance. - */ class OPENMS_DLLAPI FASTAFile @@ -66,7 +63,6 @@ namespace OpenMS public: /** @brief FASTA entry type (identifier, description and sequence) - The first String corresponds to the identifier that is written after the > in the FASTA file. The part after the first whitespace is stored in description and the text @@ -143,9 +139,10 @@ namespace OpenMS /// Destructor virtual ~FASTAFile(); + bool FASTAFile::readRecordNew(std::string & id, std::string & seq); + /** @brief Prepares a FASTA file given by 'filename' for streamed reading using readNext(). - @exception Exception::FileNotFound is thrown if the file does not exists. @exception Exception::ParseError is thrown if the file does not suit to the standard. */ @@ -153,9 +150,7 @@ namespace OpenMS /** @brief Reads the next FASTA entry from file. - If you want to read all entries in one go, use load(). - @return true if entry was read; false if eof was reached @exception Exception::FileNotFound is thrown if the file does not exists. @exception Exception::ParseError is thrown if the file does not suit to the standard. @@ -173,32 +168,26 @@ namespace OpenMS /** @brief Prepares a FASTA file given by 'filename' for streamed writing using writeNext(). - @exception Exception::UnableToCreateFile is thrown if the process is not able to write to the file (disk full?). */ void writeStart(const String& filename); /** @brief Stores the data given by @p protein. Call writeStart() once before calling writeNext(). - Call writeEnd() when done to close the file! - @exception Exception::UnableToCreateFile is thrown if the process is not able to write the file. */ void writeNext(const FASTAEntry& protein); /** @brief Closes the file (flush). Called implicitly when FASTAFile object does out of scope. - */ void writeEnd(); /** @brief loads a FASTA file given by 'filename' and stores the information in 'data' - This uses more RAM than readStart() and readNext(). - @exception Exception::FileNotFound is thrown if the file does not exists. @exception Exception::ParseError is thrown if the file does not suit to the standard. */ @@ -208,7 +197,6 @@ namespace OpenMS @brief stores the data given by 'data' at the file 'filename' This uses more RAM than writeStart() and writeNext(). - @exception Exception::UnableToCreateFile is thrown if the process is not able to write the file. */ void static store(const String& filename, const std::vector& data); @@ -216,9 +204,8 @@ namespace OpenMS protected: std::fstream infile_; ///< filestream for reading; init using FastaFile::readStart() std::ofstream outfile_; ///< filestream for writing; init using FastaFile::writeStart() - std::unique_ptr > reader_; ///< filestream for reading; init using FastaFile::readStart(); needs to be a pointer, since its not copy-constructable; we use void* here, to avoid pulling in seqan includes Size entries_read_; ///< some internal book-keeping during reading + unsigned fileSize_{}; }; } // namespace OpenMS - diff --git a/src/openms/source/FORMAT/FASTAFile.cpp b/src/openms/source/FORMAT/FASTAFile.cpp index cc344888a2c..dfa4638fccb 100644 --- a/src/openms/source/FORMAT/FASTAFile.cpp +++ b/src/openms/source/FORMAT/FASTAFile.cpp @@ -47,171 +47,193 @@ namespace OpenMS { using namespace std; - typedef seqan::RecordReader > FASTARecordReader; FASTAFile::FASTAFile() - : reader_(std::nullptr_t()), // point to nothing - entries_read_(0) + : entries_read_(0) { } FASTAFile::~FASTAFile() { - // infile_ and outfile_ will close automatically when going out of scope. No need to do it explicitly here. + // infile_ and outfile_ will close automatically when going out of scope. No need to do it explicitly here. + } + + bool FASTAFile::readRecordNew(std::string & id, std::string & seq) + { + std::string line; + if(TextFile::getLine(infile_, line)) + { + id.push_back(line) + } + else return false; + while(TextFile::getLine(infile_, line) && line[0] != ">") + { + for(unsigned i=0; i<=line.length();++i)//removing illegal characters//besser erst nach dem einlesen vor dem assignen zur sequenz? + { + if(line[i]<='A' || line[i]>='Z')//ambigous akzeptieren wir hier + { + line.erase(i,i+1); + } + } + seq.push_back(line); + } + if(seq.empty()) return false; + return true; } void FASTAFile::readStart(const String& filename) { - if (!File::exists(filename)) - { - throw Exception::FileNotFound(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename); - } + if (!File::exists(filename)) + { + throw Exception::FileNotFound(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename); + } - if (!File::readable(filename)) - { - throw Exception::FileNotReadable(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename); - } + if (!File::readable(filename)) + { + throw Exception::FileNotReadable(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename); + } - if (infile_.is_open()) infile_.close(); // precaution + if (infile_.is_open()) infile_.close(); // precaution - infile_.open(filename.c_str(), std::ios::binary | std::ios::in); + infile_.open(filename.c_str(), std::ios::binary | std::ios::in); - // Skip the header of PEFF files (http://www.psidev.info/peff) - std::string line; - std::streampos firstline = 0; - while (TextFile::getLine(infile_, line)) - { - if (!line.empty() && line[0] != '#') + infile_.seekg(0, infile_.end); + fileSize_ = infile_.tellg(); + infile_.seekg(0, infile_.beg); + + // Skip the header of PEFF files (http://www.psidev.info/peff) + std::string line; + std::streampos firstline = 0; + while (TextFile::getLine(infile_, line)) { - break; + if (!line.empty() && line[0] != '#') + { + break; + } + firstline = infile_.tellg(); } - firstline = infile_.tellg(); - } - infile_.seekg(firstline); - - // automatically deletes old handles - reader_ = std::unique_ptr >(new FASTARecordReader(infile_), - [](void* ptr) - { // lambda with custom cast - delete static_cast(ptr); - }); + infile_.seekg(firstline); - entries_read_ = 0; + entries_read_ = 0; } bool FASTAFile::readNext(FASTAEntry& protein) { - if (seqan::atEnd(*static_cast(reader_.get()))) - { - // do NOT close(), since we still might want to seek to certain positions - return false; - } - String id, s; - if (readRecord(id, s, *static_cast(reader_.get()), seqan::Fasta()) != 0) - { - if (entries_read_ == 0) s = "The first entry could not be read!"; - else s = "Only " + String(entries_read_) + " proteins could be read. The record after failed."; - throw Exception::ParseError(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "", "Error while parsing FASTA file! " + s + " Please check the file!"); - } - ++entries_read_; - s.removeWhitespaces(); - protein.sequence = s; // assign here, since 's' might have higher capacity, thus wasting memory (usually 10-15%) - - // handle id - id.trim(); - String::size_type position = id.find_first_of(" \v\t"); - if (position == String::npos) - { - protein.identifier = std::move(id); - protein.description = ""; - } - else - { - protein.identifier = id.substr(0, position); - protein.description = id.suffix(id.size() - position - 1); - } - - return true; + if (infile_.eof()) + { + return false; + } + + String id, s; + if (readRecordNew(id, s) != 0) + { + if (entries_read_ == 0) s = "The first entry could not be read!"; + else s = "Only " + String(entries_read_) + " proteins could be read. The record after failed."; + throw Exception::ParseError(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "", "Error while parsing FASTA file! " + s + " Please check the file!"); + } + ++entries_read_; + + protein.sequence = s; // assign here, since 's' might have higher capacity, thus wasting memory (usually 10-15%) + + // handle id + id.trim(); + String::size_type position = id.find_first_of(" \v\t"); + if (position == String::npos) + { + protein.identifier = std::move(id); + protein.description = ""; + } + else + { + protein.identifier = id.substr(0, position); + protein.description = id.suffix(id.size() - position - 1); + } + return true; } std::streampos FASTAFile::position() const { - return seqan::position(*static_cast(reader_.get())); + return infile_.tellg(); } bool FASTAFile::setPosition(const std::streampos& pos) { - return (seqan::setPosition(*static_cast(reader_.get()), pos) == 0); + if(pos <= fileSize_) + { + infile_.seekg(pos);//relative to the beginning of the stream + return true; + } + return false; } bool FASTAFile::atEnd() const { - return seqan::atEnd(*static_cast(reader_.get())); + return infile_.eof(); } void FASTAFile::load(const String& filename, vector& data) { - data.clear(); - FASTAEntry p; - FASTAFile f; - f.readStart(filename); - while (f.readNext(p)) - { - data.push_back(std::move(p)); - } - return; + data.clear(); + FASTAEntry p; + FASTAFile f; + f.readStart(filename); + while (f.readNext(p)) + { + data.push_back(std::move(p)); + } + return; } void FASTAFile::writeStart(const String& filename) { - if (!FileHandler::hasValidExtension(filename, FileTypes::FASTA)) - { - throw Exception::UnableToCreateFile(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename, "invalid file extension; expected '" + FileTypes::typeToName(FileTypes::FASTA) + "'"); - } + if (!FileHandler::hasValidExtension(filename, FileTypes::FASTA)) + { + throw Exception::UnableToCreateFile(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename, "invalid file extension; expected '" + FileTypes::typeToName(FileTypes::FASTA) + "'"); + } - outfile_.open(filename.c_str(), ofstream::out); + outfile_.open(filename.c_str(), ofstream::out); - if (!outfile_.good()) - { - throw Exception::UnableToCreateFile(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename); - } + if (!outfile_.good()) + { + throw Exception::UnableToCreateFile(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename); + } } void FASTAFile::writeNext(const FASTAEntry& protein) { - outfile_ << ">" << protein.identifier << " " << protein.description << "\n"; - const String& tmp(protein.sequence); - - int chunks( tmp.size()/80 ); // number of complete chunks - Size chunk_pos(0); - while (--chunks >= 0) - { - outfile_.write(&tmp[chunk_pos], 80); - outfile_ << "\n"; - chunk_pos += 80; - } - - if (tmp.size() > chunk_pos) - { - outfile_.write(&tmp[chunk_pos], tmp.size() - chunk_pos); - outfile_ << "\n"; - } + outfile_ << ">" << protein.identifier << " " << protein.description << "\n"; + const String& tmp(protein.sequence); + + int chunks( tmp.size()/80 ); // number of complete chunks + Size chunk_pos(0); + while (--chunks >= 0) + { + outfile_.write(&tmp[chunk_pos], 80); + outfile_ << "\n"; + chunk_pos += 80; + } + + if (tmp.size() > chunk_pos) + { + outfile_.write(&tmp[chunk_pos], tmp.size() - chunk_pos); + outfile_ << "\n"; + } } void FASTAFile::writeEnd() { - outfile_.close(); + outfile_.close(); } void FASTAFile::store(const String& filename, const vector& data) { - FASTAFile f; - f.writeStart(filename); - for (vector::const_iterator it = data.begin(); it != data.end(); ++it) - { - f.writeNext(*it); - } - f.writeEnd(); // close file + FASTAFile f; + f.writeStart(filename); + for (vector::const_iterator it = data.begin(); it != data.end(); ++it) + { + f.writeNext(*it); + } + f.writeEnd(); // close file } } // namespace OpenMS diff --git a/src/tests/class_tests/openms/source/FASTAFile_test.cpp b/src/tests/class_tests/openms/source/FASTAFile_test.cpp index 45c804edcda..ceba7fe68d9 100644 --- a/src/tests/class_tests/openms/source/FASTAFile_test.cpp +++ b/src/tests/class_tests/openms/source/FASTAFile_test.cpp @@ -53,141 +53,143 @@ START_TEST(FASTAFile, "$Id$") ///////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////// - using namespace OpenMS; - using namespace std; - - FASTAFile* ptr = nullptr; - START_SECTION((FASTAFile())) - ptr = new FASTAFile(); - TEST_EQUAL(ptr == nullptr, false) - END_SECTION - - START_SECTION((~FASTAFile())) - delete(ptr); - END_SECTION - - START_SECTION([FASTAFile::FASTAEntry] FASTAEntry()) - FASTAFile::FASTAEntry * ptr_e; - ptr_e = new FASTAFile::FASTAEntry(); - TEST_EQUAL(ptr_e == nullptr, false) - END_SECTION - - START_SECTION([FASTAFile::FASTAEntry] FASTAEntry(String id, String desc, String seq)) - FASTAFile::FASTAEntry entry("ID", "DESC", "DAVLDELNER"); - TEST_EQUAL(entry.identifier, "ID") - TEST_EQUAL(entry.description, "DESC") - TEST_EQUAL(entry.sequence, "DAVLDELNER") - END_SECTION - - START_SECTION([FASTAFile::FASTAEntry] bool operator==(const FASTAEntry &rhs) const) - FASTAFile::FASTAEntry entry1("ID", "DESC", "DAV*LDELNER"); - FASTAFile::FASTAEntry entry2("ID", "DESC", "DAV*LDELNER"); - FASTAFile::FASTAEntry entry3("ID2", "DESC", "DAV*LDELNER"); - TEST_EQUAL(entry1==entry2, true) - TEST_EQUAL(entry1==entry3, false) - END_SECTION - - - START_SECTION((void load(const String& filename, std::vector< FASTAEntry > &data))) - vector data; - FASTAFile file; - - TEST_EXCEPTION(Exception::FileNotFound, file.load("FASTAFile_test_this_file_does_not_exist",data)) - - file.load(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta"),data); - vector< FASTAFile::FASTAEntry >::const_iterator sequences_iterator = data.begin(); - TEST_EQUAL(data.size(), 5) - TEST_EQUAL(sequences_iterator->identifier, String("P68509|1433F_BOVIN")) - TEST_EQUAL(sequences_iterator->description, String("This is the description of the first protein")) - TEST_EQUAL(sequences_iterator->sequence, String("GDREQLLQRARLAEQAERYDDMASAMKAVTEL") + - String("NEPLSNEDRNLLSVAYKNVVGARRSSWRVISSIEQKTMADGNEKKLEKVKAYREKIEKELETVC") + - String("NDVLALLDKFLIKNCNDFQYESKVFYLKMKGDYYRYLAEVASGEKKNSVVEASEAAYKEAFEIS") + - String("KEHMQPTHPIRLGLALNFSVFYYEIQNAPEQACLLAKQAFDDAIAELDTLNEDSYKDSTLIMQL") + - String("LRDNLTLWTSDQQDEEAGEGN")) - sequences_iterator++; - TEST_EQUAL(sequences_iterator->identifier, "Q9CQV8|1433B_MOUSE") - TEST_EQUAL(sequences_iterator->sequence, String("TMDKSELVQKAKLAEQAERYDDMAAAMKAVTE") + - String("QGHELSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICND") + - String("VLELLDKYLILNATQAESKVFYLKMKGDYFRYLSEVASGENKQTTVSNSQQAYQEAFEISKKEMQ") + - String("PTHPIRLGLALNFSVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLT") + - String("LWTSENQGDEGDAGEGEN")) - sequences_iterator++; - TEST_EQUAL(sequences_iterator->identifier, "sp|P31946|1433B_HUMAN") - TEST_EQUAL(sequences_iterator->description, String("14-3-3 protein beta/alpha OS=Homo sapiens GN=YWHAB PE=1 SV=3")) - TEST_EQUAL(sequences_iterator->sequence, String("MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") - + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") - + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") - + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") - + String("AGEGEN")) - - sequences_iterator++; - TEST_EQUAL(sequences_iterator->identifier, "sp|P00000|0000A_UNKNOWN") - TEST_EQUAL(sequences_iterator->description, String("Artificially modified version of sp|P31946|1433B_HUMAN")) - TEST_EQUAL(sequences_iterator->sequence, String("(ICPL:13C(6))MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") - + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") - + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") - + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") - + String("AGEGEN")) - - // test if the modifed sequence is convertable - AASequence aa = AASequence::fromString(sequences_iterator->sequence); - TEST_EQUAL(aa.toUnmodifiedString(), String("MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") - + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") - + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") - + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") - + String("AGEGEN")) - - TEST_EQUAL(aa.isModified(), true) - String expectedModification = ModificationsDB::getInstance()->getModification("ICPL:13C(6)", "", ResidueModification::N_TERM)->getId(); - TEST_EQUAL(aa.getNTerminalModificationName(), expectedModification) - - sequences_iterator++; - TEST_EQUAL(sequences_iterator->identifier, "test") - TEST_EQUAL(sequences_iterator->description, String(" ##0")) - TEST_EQUAL(sequences_iterator->sequence, String("GSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVS") - + String("PAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSVGSMTVDMQEIGSTEMPYEVPTQ") - + String("PNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVSPAVYTFGL") - + String("FVQNASESLTSDDPSDVPTQRTFKSDFQSVAXXSTFDFYQRRLVTLAESPRAPSPGSMTV") - + String("DMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSF") - + String("KEKVSELVSPAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSV")) - - END_SECTION - - START_SECTION((void store(const String& filename, const std::vector< FASTAEntry > &data) const)) - vector data, data2; - String tmp_filename; - NEW_TMP_FILE(tmp_filename); - FASTAFile file; - - file.load(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta"),data); - TEST_EXCEPTION(Exception::UnableToCreateFile, file.store("/bla/bluff/blblb/sdfhsdjf/test.txt",data)) - - file.store(tmp_filename,data); - file.load(tmp_filename,data2); - TEST_EQUAL(data==data2,true); - END_SECTION - - START_SECTION([EXTRA] test_strange_symbols_in_sequence) - // test if * is read correctly (not changed into something weird like 'X') - String tmp_filename; - NEW_TMP_FILE(tmp_filename); - FASTAFile file; - vector data, data2; - FASTAFile::FASTAEntry temp_entry; - temp_entry.identifier = String("P68509|1433F_BOVIN"); - temp_entry.description = String("This is the description of the first protein"); - temp_entry.sequence = String("GDREQLLQRAR*LAEQ*AERYDDMASAMKAVTEL"); - data.push_back(temp_entry); - data.push_back(temp_entry); // twice - - file.store(tmp_filename, data); - file.load(tmp_filename, data2); - - ABORT_IF(data2.size() != 2); - TEST_EQUAL(data2[0] == temp_entry, true); - TEST_EQUAL(data2[1] == temp_entry, true); - - END_SECTION + using namespace OpenMS; + using namespace std; + + FASTAFile* ptr = nullptr; + START_SECTION((FASTAFile())) + ptr = new FASTAFile(); + TEST_EQUAL(ptr == nullptr, false) + END_SECTION + + START_SECTION((~FASTAFile())) + delete(ptr); + END_SECTION + + START_SECTION([FASTAFile::FASTAEntry] FASTAEntry()) + FASTAFile::FASTAEntry * ptr_e; + ptr_e = new FASTAFile::FASTAEntry(); + TEST_EQUAL(ptr_e == nullptr, false) + END_SECTION + + START_SECTION([FASTAFile::FASTAEntry] FASTAEntry(String id, String desc, String seq)) + FASTAFile::FASTAEntry entry("ID", "DESC", "DAVLDELNER"); + TEST_EQUAL(entry.identifier, "ID") + TEST_EQUAL(entry.description, "DESC") + TEST_EQUAL(entry.sequence, "DAVLDELNER") + END_SECTION + + START_SECTION([FASTAFile::FASTAEntry] bool operator==(const FASTAEntry &rhs) const) + FASTAFile::FASTAEntry entry1("ID", "DESC", "DAV*LDELNER"); + FASTAFile::FASTAEntry entry2("ID", "DESC", "DAV*LDELNER"); + FASTAFile::FASTAEntry entry3("ID2", "DESC", "DAV*LDELNER"); + TEST_EQUAL(entry1==entry2, true) + TEST_EQUAL(entry1==entry3, false) + END_SECTION + + + START_SECTION((void load(const String& filename, std::vector< FASTAEntry > &data))) + vector data; + FASTAFile file; + + TEST_EXCEPTION(Exception::FileNotFound, file.load("FASTAFile_test_this_file_does_not_exist",data)) + + file.load(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta"),data); + vector< FASTAFile::FASTAEntry >::const_iterator sequences_iterator = data.begin(); + TEST_EQUAL(data.size(), 5) + TEST_EQUAL(sequences_iterator->identifier, String("P68509|1433F_BOVIN")) + TEST_EQUAL(sequences_iterator->description, String("This is the description of the first protein")) + TEST_EQUAL(sequences_iterator->sequence, String("GDREQLLQRARLAEQAERYDDMASAMKAVTEL") + + String("NEPLSNEDRNLLSVAYKNVVGARRSSWRVISSIEQKTMADGNEKKLEKVKAYREKIEKELETVC") + + String("NDVLALLDKFLIKNCNDFQYESKVFYLKMKGDYYRYLAEVASGEKKNSVVEASEAAYKEAFEIS") + + String("KEHMQPTHPIRLGLALNFSVFYYEIQNAPEQACLLAKQAFDDAIAELDTLNEDSYKDSTLIMQL") + + String("LRDNLTLWTSDQQDEEAGEGN")) + sequences_iterator++; + TEST_EQUAL(sequences_iterator->identifier, "Q9CQV8|1433B_MOUSE") + TEST_EQUAL(sequences_iterator->sequence, String("TMDKSELVQKAKLAEQAERYDDMAAAMKAVTE") + + String("QGHELSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICND") + + String("VLELLDKYLILNATQAESKVFYLKMKGDYFRYLSEVASGENKQTTVSNSQQAYQEAFEISKKEMQ") + + String("PTHPIRLGLALNFSVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLT") + + String("LWTSENQGDEGDAGEGEN")) + sequences_iterator++; + TEST_EQUAL(sequences_iterator->identifier, "sp|P31946|1433B_HUMAN") + TEST_EQUAL(sequences_iterator->description, String("14-3-3 protein beta/alpha OS=Homo sapiens GN=YWHAB PE=1 SV=3")) + TEST_EQUAL(sequences_iterator->sequence, String("MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") + + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") + + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") + + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") + + String("AGEGEN")) + + sequences_iterator++; + TEST_EQUAL(sequences_iterator->identifier, "sp|P00000|0000A_UNKNOWN") + TEST_EQUAL(sequences_iterator->description, String("Artificially modified version of sp|P31946|1433B_HUMAN")) + TEST_EQUAL(sequences_iterator->sequence, String("(ICPL:13C(6))MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") + + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") + + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") + + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") + + String("AGEGEN")) + + // test if the modifed sequence is convertable + AASequence aa = AASequence::fromString(sequences_iterator->sequence); + TEST_EQUAL(aa.toUnmodifiedString(), String("MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") + + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") + + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") + + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") + + String("AGEGEN")) + + TEST_EQUAL(aa.isModified(), true) + String expectedModification = ModificationsDB::getInstance()->getModification("ICPL:13C(6)", "", ResidueModification::N_TERM)->getId(); + TEST_EQUAL(aa.getNTerminalModificationName(), expectedModification) + + sequences_iterator++; + TEST_EQUAL(sequences_iterator->identifier, "test") + TEST_EQUAL(sequences_iterator->description, String(" ##0")) + TEST_EQUAL(sequences_iterator->sequence, String("GSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVS") + + String("PAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSVGSMTVDMQEIGSTEMPYEVPTQ") + + String("PNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVSPAVYTFGL") + + String("FVQNASESLTSDDPSDVPTQRTFKSDFQSVAXXSTFDFYQRRLVTLAESPRAPSPGSMTV") + + String("DMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSF") + + String("KEKVSELVSPAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSV")) + + END_SECTION + + START_SECTION((void store(const String& filename, const std::vector< FASTAEntry > &data) const)) + vector data, data2; + String tmp_filename; + NEW_TMP_FILE(tmp_filename); + FASTAFile file; + + file.load(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta"),data); + TEST_EXCEPTION(Exception::UnableToCreateFile, file.store("/bla/bluff/blblb/sdfhsdjf/test.txt",data)) + + file.store(tmp_filename,data); + file.load(tmp_filename,data2); + TEST_EQUAL(data==data2,true); + END_SECTION + +/* +START_SECTION([EXTRA] test_strange_symbols_in_sequence) + // test if * is read correctly (not changed into something weird like 'X') + String tmp_filename; + NEW_TMP_FILE(tmp_filename); + FASTAFile file; + vector data, data2; + FASTAFile::FASTAEntry temp_entry; + temp_entry.identifier = String("P68509|1433F_BOVIN"); + temp_entry.description = String("This is the description of the first protein"); + temp_entry.sequence = String("GDREQLLQRAR*LAEQ*AERYDDMASAMKAVTEL"); + data.push_back(temp_entry); + data.push_back(temp_entry); // twice + + file.store(tmp_filename, data); + file.load(tmp_filename, data2); + + ABORT_IF(data2.size() != 2); + TEST_EQUAL(data2[0] == temp_entry, true); + TEST_EQUAL(data2[1] == temp_entry, true); + +END_SECTION +*/ ///////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////// From e204e872c9439d592a0f910970ebacf8501772b2 Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Wed, 21 Apr 2021 00:06:51 +0200 Subject: [PATCH 10/53] Fasta-Reader without using Seqan --- src/openms/include/OpenMS/FORMAT/FASTAFile.h | 5 ++- src/openms/source/FORMAT/FASTAFile.cpp | 42 +++++++++---------- .../openms/data/FASTAFile_test.fasta | 16 +++---- .../openms/source/FASTAFile_test.cpp | 32 +++++++++++--- 4 files changed, 58 insertions(+), 37 deletions(-) diff --git a/src/openms/include/OpenMS/FORMAT/FASTAFile.h b/src/openms/include/OpenMS/FORMAT/FASTAFile.h index 6b9406f377d..701583ae079 100644 --- a/src/openms/include/OpenMS/FORMAT/FASTAFile.h +++ b/src/openms/include/OpenMS/FORMAT/FASTAFile.h @@ -139,7 +139,7 @@ namespace OpenMS /// Destructor virtual ~FASTAFile(); - bool FASTAFile::readRecordNew(std::string & id, std::string & seq); + bool readSequence(std::string & seq); /** @brief Prepares a FASTA file given by 'filename' for streamed reading using readNext(). @@ -158,7 +158,7 @@ namespace OpenMS bool readNext(FASTAEntry& protein); /// current stream position - std::streampos position() const; + std::streampos position(); /// is stream at EOF? bool atEnd() const; @@ -206,6 +206,7 @@ namespace OpenMS std::ofstream outfile_; ///< filestream for writing; init using FastaFile::writeStart() Size entries_read_; ///< some internal book-keeping during reading unsigned fileSize_{}; + std:: string nextID_{};///< saving ID of next protein since reading current protein stops AFTER reading next ID }; } // namespace OpenMS diff --git a/src/openms/source/FORMAT/FASTAFile.cpp b/src/openms/source/FORMAT/FASTAFile.cpp index dfa4638fccb..2ed3de652ed 100644 --- a/src/openms/source/FORMAT/FASTAFile.cpp +++ b/src/openms/source/FORMAT/FASTAFile.cpp @@ -58,25 +58,15 @@ namespace OpenMS // infile_ and outfile_ will close automatically when going out of scope. No need to do it explicitly here. } - bool FASTAFile::readRecordNew(std::string & id, std::string & seq) + bool FASTAFile::readSequence(std::string & seq) { - std::string line; - if(TextFile::getLine(infile_, line)) - { - id.push_back(line) - } - else return false; - while(TextFile::getLine(infile_, line) && line[0] != ">") + std::string line; + while(TextFile::getLine(infile_, line) && line[0] != '>') { - for(unsigned i=0; i<=line.length();++i)//removing illegal characters//besser erst nach dem einlesen vor dem assignen zur sequenz? - { - if(line[i]<='A' || line[i]>='Z')//ambigous akzeptieren wir hier - { - line.erase(i,i+1); - } - } - seq.push_back(line); + seq+=line; } + nextID_=line;//because while loop stops AFTER reading first line of next protein + if(seq.empty()) return false; return true; } @@ -103,16 +93,17 @@ namespace OpenMS // Skip the header of PEFF files (http://www.psidev.info/peff) std::string line; - std::streampos firstline = 0; while (TextFile::getLine(infile_, line)) { if (!line.empty() && line[0] != '#') { + if(line[0]=='>') + { + nextID_=line;//because while loop stops AFTER reading first line of first protein + } break; } - firstline = infile_.tellg(); } - infile_.seekg(firstline); entries_read_ = 0; } @@ -125,7 +116,13 @@ namespace OpenMS } String id, s; - if (readRecordNew(id, s) != 0) + if(!nextID_.empty() && nextID_[0]=='>') + { + nextID_.erase(0,1); + id=nextID_; + } + + if (readSequence(s) == false) { if (entries_read_ == 0) s = "The first entry could not be read!"; else s = "Only " + String(entries_read_) + " proteins could be read. The record after failed."; @@ -133,6 +130,7 @@ namespace OpenMS } ++entries_read_; + s.removeWhitespaces(); protein.sequence = s; // assign here, since 's' might have higher capacity, thus wasting memory (usually 10-15%) // handle id @@ -151,7 +149,7 @@ namespace OpenMS return true; } - std::streampos FASTAFile::position() const + std::streampos FASTAFile::position() { return infile_.tellg(); } @@ -160,7 +158,7 @@ namespace OpenMS { if(pos <= fileSize_) { - infile_.seekg(pos);//relative to the beginning of the stream + infile_.seekg(pos); return true; } return false; diff --git a/src/tests/class_tests/openms/data/FASTAFile_test.fasta b/src/tests/class_tests/openms/data/FASTAFile_test.fasta index aceeecca384..1b3c4c188bc 100755 --- a/src/tests/class_tests/openms/data/FASTAFile_test.fasta +++ b/src/tests/class_tests/openms/data/FASTAFile_test.fasta @@ -1,3 +1,7 @@ +# PEFF Description block +# Decoy=false +# DbDescription=extract of neXtProt with manual modifications +# GeneralComment= A selection of protein entries >P68509|1433F_BOVIN This is the description of the first protein GDREQLLQRARLAEQAERYDDMASAMKAVTELNEPLSNEDRNLLSVAYKNVVGARRSSWR VISSIEQKTMADGNEKKLEKVKAYREKIEKELETVCNDVLALLDKFLIKNCNDFQYESKV @@ -14,20 +18,16 @@ GEGEN >sp|P31946|1433B_HUMAN 14-3-3 protein beta/alpha OS=Homo sapiens GN=YWHAB PE=1 SV=3 MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY -LKMKGDYFRYLSEVASGDNKQTT VSNS*QQAY5QEAFE!ISKKEMQPTHPIRLGLALNF -SVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQG -DEGDAGEGEN +LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY +YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD +AGEGEN + -# PEFF Description block -# Decoy=false -# DbDescription=extract of neXtProt with manual modifications -# GeneralComment= A selection of protein entries >sp|P00000|0000A_UNKNOWN Artificially modified version of sp|P31946|1433B_HUMAN (ICPL:13C(6))MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL SVAYKNVVGARRSSWRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKY LIPNATQPESKVFYLKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTH PIRLGLALNFSVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDN LTLWTSENQGDEGDAGEGEN - > test ##0 GSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVSPAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSVGSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVSPAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSVAXXSTFDFYQRRLVTLAESPRAPSPGSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVSPAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSV diff --git a/src/tests/class_tests/openms/source/FASTAFile_test.cpp b/src/tests/class_tests/openms/source/FASTAFile_test.cpp index ceba7fe68d9..dd22b1a2b4c 100644 --- a/src/tests/class_tests/openms/source/FASTAFile_test.cpp +++ b/src/tests/class_tests/openms/source/FASTAFile_test.cpp @@ -127,7 +127,7 @@ START_TEST(FASTAFile, "$Id$") + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") - + String("AGEGEN")) + + String("AGEGEN"))//wie modifikationen drin lassen? // test if the modifed sequence is convertable AASequence aa = AASequence::fromString(sequences_iterator->sequence); @@ -163,11 +163,11 @@ START_TEST(FASTAFile, "$Id$") TEST_EXCEPTION(Exception::UnableToCreateFile, file.store("/bla/bluff/blblb/sdfhsdjf/test.txt",data)) file.store(tmp_filename,data); - file.load(tmp_filename,data2); + file.load(tmp_filename,data2);//fail TEST_EQUAL(data==data2,true); END_SECTION -/* + START_SECTION([EXTRA] test_strange_symbols_in_sequence) // test if * is read correctly (not changed into something weird like 'X') String tmp_filename; @@ -181,15 +181,37 @@ START_SECTION([EXTRA] test_strange_symbols_in_sequence) data.push_back(temp_entry); data.push_back(temp_entry); // twice + file.store(tmp_filename, data); - file.load(tmp_filename, data2); ABORT_IF(data2.size() != 2); TEST_EQUAL(data2[0] == temp_entry, true); TEST_EQUAL(data2[1] == temp_entry, true); END_SECTION -*/ + + +START_SECTION(test_white_spaces) +//test if spaces and tabulators are removed correctly + String tmp_filename; + NEW_TMP_FILE(tmp_filename); + FASTAFile file; + vector data, data2; + FASTAFile::FASTAEntry temp_entry; + temp_entry.identifier = String("P68509|1433F_BOVIN"); + temp_entry.description = String("This is the description of the first protein"); + temp_entry.sequence = String("GDREQLLQRAR LAEQ\tAERYDDMASAMKAVTEL"); + data.push_back(temp_entry); + + + file.store(tmp_filename, data); + file.load(tmp_filename, data2); + + ABORT_IF(data2.size() != 1); + TEST_EQUAL(data2[0].sequence == String("GDREQLLQRARLAEQAERYDDMASAMKAVTEL"), true); + +END_SECTION + ///////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////// From 77e168d98f113f8eda29b8143c1b61341fb492a1 Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Fri, 23 Apr 2021 09:50:19 +0200 Subject: [PATCH 11/53] Fasta-Reader without using Seqan --- src/openms/include/OpenMS/FORMAT/FASTAFile.h | 7 +- src/openms/source/FORMAT/FASTAFile.cpp | 89 ++++++++++++++----- .../openms/source/FASTAFile_test.cpp | 11 ++- 3 files changed, 76 insertions(+), 31 deletions(-) diff --git a/src/openms/include/OpenMS/FORMAT/FASTAFile.h b/src/openms/include/OpenMS/FORMAT/FASTAFile.h index 701583ae079..7cf59fe3123 100644 --- a/src/openms/include/OpenMS/FORMAT/FASTAFile.h +++ b/src/openms/include/OpenMS/FORMAT/FASTAFile.h @@ -139,7 +139,6 @@ namespace OpenMS /// Destructor virtual ~FASTAFile(); - bool readSequence(std::string & seq); /** @brief Prepares a FASTA file given by 'filename' for streamed reading using readNext(). @@ -202,11 +201,13 @@ namespace OpenMS void static store(const String& filename, const std::vector& data); protected: + bool readEntry_(std::string & id, std::string & seq); + bool getLine_(std::istream& is, std::string& t); + std::fstream infile_; ///< filestream for reading; init using FastaFile::readStart() std::ofstream outfile_; ///< filestream for writing; init using FastaFile::writeStart() Size entries_read_; ///< some internal book-keeping during reading - unsigned fileSize_{}; - std:: string nextID_{};///< saving ID of next protein since reading current protein stops AFTER reading next ID + std::streampos fileSize_{}; }; } // namespace OpenMS diff --git a/src/openms/source/FORMAT/FASTAFile.cpp b/src/openms/source/FORMAT/FASTAFile.cpp index 2ed3de652ed..8815ac94105 100644 --- a/src/openms/source/FORMAT/FASTAFile.cpp +++ b/src/openms/source/FORMAT/FASTAFile.cpp @@ -39,10 +39,6 @@ #include -#include -#include -#include -#include namespace OpenMS { @@ -58,14 +54,73 @@ namespace OpenMS // infile_ and outfile_ will close automatically when going out of scope. No need to do it explicitly here. } - bool FASTAFile::readSequence(std::string & seq) + + bool FASTAFile::getLine_(std::istream& is, std::string& t){ + t.clear(); + std::istream::sentry se(is, true); + if (!se) + { // the stream has an error + return false; + } + std::streambuf* sb = is.rdbuf(); + if(entries_read_== 0)// + { + if(sb->sgetc() == '>') return false; + if(sb->sgetc() == '#') + { + is.ignore(numeric_limits::max(),'\n');//skipping the following line without reading + return true; + } + } + for (;;) + { + int c = sb->sbumpc();// get and advance to next char + switch (c) + { + case '\n': + if (sb->sgetc() == '>') // peek current char + { + return false;//reaching the beginning of the next protein-entry + } + return true; + case '\r': + if (sb->sgetc() == '\n') // peek current char + { + sb->sbumpc(); // consume it + } + return true; + case ' ': + break; + case '\t': + break; + case std::streambuf::traits_type::eof(): + is.setstate(std::ios::eofbit); // still allows: while(is == true) + if (t.empty()) + { // only if we just started a new line, we set the is.fail() == true, ie. is == false + is.setstate(std::ios::badbit); + } + return false; + default: + t += (char)c; + } + } + } + + bool FASTAFile::readEntry_(std::string & id, std::string & seq) { std::string line; - while(TextFile::getLine(infile_, line) && line[0] != '>') + if(TextFile::getLine(infile_,line))//using Textfile::getLine to be able to read '>' + { + line.erase(0,1); + id=line; + } + else return false; //infile_ empty + + while(FASTAFile::getLine_(infile_, line)) { seq+=line; } - nextID_=line;//because while loop stops AFTER reading first line of next protein + seq+=line;//after getLine_ returns false still add the line read if(seq.empty()) return false; return true; @@ -93,16 +148,9 @@ namespace OpenMS // Skip the header of PEFF files (http://www.psidev.info/peff) std::string line; - while (TextFile::getLine(infile_, line)) + while (FASTAFile::getLine_(infile_, line)) { - if (!line.empty() && line[0] != '#') - { - if(line[0]=='>') - { - nextID_=line;//because while loop stops AFTER reading first line of first protein - } - break; - } + //skipping PEFF header or anything before the first identifier } entries_read_ = 0; @@ -116,13 +164,8 @@ namespace OpenMS } String id, s; - if(!nextID_.empty() && nextID_[0]=='>') - { - nextID_.erase(0,1); - id=nextID_; - } - if (readSequence(s) == false) + if (readEntry_(id, s) == false) { if (entries_read_ == 0) s = "The first entry could not be read!"; else s = "Only " + String(entries_read_) + " proteins could be read. The record after failed."; @@ -130,9 +173,7 @@ namespace OpenMS } ++entries_read_; - s.removeWhitespaces(); protein.sequence = s; // assign here, since 's' might have higher capacity, thus wasting memory (usually 10-15%) - // handle id id.trim(); String::size_type position = id.find_first_of(" \v\t"); diff --git a/src/tests/class_tests/openms/source/FASTAFile_test.cpp b/src/tests/class_tests/openms/source/FASTAFile_test.cpp index dd22b1a2b4c..d57137a8026 100644 --- a/src/tests/class_tests/openms/source/FASTAFile_test.cpp +++ b/src/tests/class_tests/openms/source/FASTAFile_test.cpp @@ -57,6 +57,7 @@ START_TEST(FASTAFile, "$Id$") using namespace std; FASTAFile* ptr = nullptr; + START_SECTION((FASTAFile())) ptr = new FASTAFile(); TEST_EQUAL(ptr == nullptr, false) @@ -72,6 +73,7 @@ START_TEST(FASTAFile, "$Id$") TEST_EQUAL(ptr_e == nullptr, false) END_SECTION + START_SECTION([FASTAFile::FASTAEntry] FASTAEntry(String id, String desc, String seq)) FASTAFile::FASTAEntry entry("ID", "DESC", "DAVLDELNER"); TEST_EQUAL(entry.identifier, "ID") @@ -127,7 +129,7 @@ START_TEST(FASTAFile, "$Id$") + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") - + String("AGEGEN"))//wie modifikationen drin lassen? + + String("AGEGEN")) // test if the modifed sequence is convertable AASequence aa = AASequence::fromString(sequences_iterator->sequence); @@ -153,6 +155,8 @@ START_TEST(FASTAFile, "$Id$") END_SECTION + + START_SECTION((void store(const String& filename, const std::vector< FASTAEntry > &data) const)) vector data, data2; String tmp_filename; @@ -163,7 +167,7 @@ START_TEST(FASTAFile, "$Id$") TEST_EXCEPTION(Exception::UnableToCreateFile, file.store("/bla/bluff/blblb/sdfhsdjf/test.txt",data)) file.store(tmp_filename,data); - file.load(tmp_filename,data2);//fail + file.load(tmp_filename,data2); TEST_EQUAL(data==data2,true); END_SECTION @@ -183,11 +187,11 @@ START_SECTION([EXTRA] test_strange_symbols_in_sequence) file.store(tmp_filename, data); + file.load(tmp_filename, data2); ABORT_IF(data2.size() != 2); TEST_EQUAL(data2[0] == temp_entry, true); TEST_EQUAL(data2[1] == temp_entry, true); - END_SECTION @@ -209,7 +213,6 @@ START_SECTION(test_white_spaces) ABORT_IF(data2.size() != 1); TEST_EQUAL(data2[0].sequence == String("GDREQLLQRARLAEQAERYDDMASAMKAVTEL"), true); - END_SECTION From 9739aacacf9973b5537d82a8dffc7ef8b6ee5c48 Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Fri, 23 Apr 2021 09:59:45 +0200 Subject: [PATCH 12/53] Fasta-Reader without using Seqan --- src/openms/include/OpenMS/FORMAT/FASTAFile.h | 1 - src/openms/source/FORMAT/FASTAFile.cpp | 1 - src/tests/class_tests/openms/source/FASTAFile_test.cpp | 1 - 3 files changed, 3 deletions(-) diff --git a/src/openms/include/OpenMS/FORMAT/FASTAFile.h b/src/openms/include/OpenMS/FORMAT/FASTAFile.h index 7cf59fe3123..5b5ff172a4d 100644 --- a/src/openms/include/OpenMS/FORMAT/FASTAFile.h +++ b/src/openms/include/OpenMS/FORMAT/FASTAFile.h @@ -139,7 +139,6 @@ namespace OpenMS /// Destructor virtual ~FASTAFile(); - /** @brief Prepares a FASTA file given by 'filename' for streamed reading using readNext(). @exception Exception::FileNotFound is thrown if the file does not exists. diff --git a/src/openms/source/FORMAT/FASTAFile.cpp b/src/openms/source/FORMAT/FASTAFile.cpp index 8815ac94105..1bdda9a9654 100644 --- a/src/openms/source/FORMAT/FASTAFile.cpp +++ b/src/openms/source/FORMAT/FASTAFile.cpp @@ -164,7 +164,6 @@ namespace OpenMS } String id, s; - if (readEntry_(id, s) == false) { if (entries_read_ == 0) s = "The first entry could not be read!"; diff --git a/src/tests/class_tests/openms/source/FASTAFile_test.cpp b/src/tests/class_tests/openms/source/FASTAFile_test.cpp index d57137a8026..c4b85d0d114 100644 --- a/src/tests/class_tests/openms/source/FASTAFile_test.cpp +++ b/src/tests/class_tests/openms/source/FASTAFile_test.cpp @@ -156,7 +156,6 @@ START_TEST(FASTAFile, "$Id$") END_SECTION - START_SECTION((void store(const String& filename, const std::vector< FASTAEntry > &data) const)) vector data, data2; String tmp_filename; From b8d09dd849618b1efc0aafff6eb8d89351c8d886 Mon Sep 17 00:00:00 2001 From: kasrat93 Date: Fri, 23 Apr 2021 13:49:35 +0200 Subject: [PATCH 13/53] test --- .../openms/source/FASTAFile_test.cpp | 316 ++++++++++-------- 1 file changed, 181 insertions(+), 135 deletions(-) diff --git a/src/tests/class_tests/openms/source/FASTAFile_test.cpp b/src/tests/class_tests/openms/source/FASTAFile_test.cpp index 305f5734f91..9a9fe0c82be 100644 --- a/src/tests/class_tests/openms/source/FASTAFile_test.cpp +++ b/src/tests/class_tests/openms/source/FASTAFile_test.cpp @@ -53,141 +53,187 @@ START_TEST(FASTAFile, "$Id$") ///////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////// -using namespace OpenMS; -using namespace std; - -FASTAFile* ptr = nullptr; -START_SECTION((FASTAFile())) - ptr = new FASTAFile(); - TEST_EQUAL(ptr == nullptr, false) -END_SECTION - -START_SECTION((~FASTAFile())) - delete(ptr); -END_SECTION - -START_SECTION([FASTAFile::FASTAEntry] FASTAEntry()) - FASTAFile::FASTAEntry * ptr_e; - ptr_e = new FASTAFile::FASTAEntry(); - TEST_EQUAL(ptr_e == nullptr, false) -END_SECTION - -START_SECTION([FASTAFile::FASTAEntry] FASTAEntry(String id, String desc, String seq)) - FASTAFile::FASTAEntry entry("ID", "DESC", "DAVLDELNER"); - TEST_EQUAL(entry.identifier, "ID") - TEST_EQUAL(entry.description, "DESC") - TEST_EQUAL(entry.sequence, "DAVLDELNER") -END_SECTION - -START_SECTION([FASTAFile::FASTAEntry] bool operator==(const FASTAEntry &rhs) const) - FASTAFile::FASTAEntry entry1("ID", "DESC", "DAV*LDELNER"); - FASTAFile::FASTAEntry entry2("ID", "DESC", "DAV*LDELNER"); - FASTAFile::FASTAEntry entry3("ID2", "DESC", "DAV*LDELNER"); - TEST_EQUAL(entry1==entry2, true) - TEST_EQUAL(entry1==entry3, false) -END_SECTION - - -START_SECTION((void load(const String& filename, std::vector< FASTAEntry > &data))) - vector data; - FASTAFile file; - - TEST_EXCEPTION(Exception::FileNotFound, file.load("FASTAFile_test_this_file_does_not_exist",data)) - - file.load(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta"),data); - vector< FASTAFile::FASTAEntry >::const_iterator sequences_iterator = data.begin(); - TEST_EQUAL(data.size(), 5) - TEST_EQUAL(sequences_iterator->identifier, String("P68509|1433F_BOVIN")) - TEST_EQUAL(sequences_iterator->description, String("This is the description of the first protein")) - TEST_EQUAL(sequences_iterator->sequence, String("GDREQLLQRARLAEQAERYDDMASAMKAVTEL") + - String("NEPLSNEDRNLLSVAYKNVVGARRSSWRVISSIEQKTMADGNEKKLEKVKAYREKIEKELETVC") + - String("NDVLALLDKFLIKNCNDFQYESKVFYLKMKGDYYRYLAEVASGEKKNSVVEASEAAYKEAFEIS") + - String("KEHMQPTHPIRLGLALNFSVFYYEIQNAPEQACLLAKQAFDDAIAELDTLNEDSYKDSTLIMQL") + - String("LRDNLTLWTSDQQDEEAGEGN")) - sequences_iterator++; - TEST_EQUAL(sequences_iterator->identifier, "Q9CQV8|1433B_MOUSE") - TEST_EQUAL(sequences_iterator->sequence, String("TMDKSELVQKAKLAEQAERYDDMAAAMKAVTE") + - String("QGHELSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICND") + - String("VLELLDKYLILNATQAESKVFYLKMKGDYFRYLSEVASGENKQTTVSNSQQAYQEAFEISKKEMQ") + - String("PTHPIRLGLALNFSVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLT") + - String("LWTSENQGDEGDAGEGEN")) - sequences_iterator++; - TEST_EQUAL(sequences_iterator->identifier, "sp|P31946|1433B_HUMAN") - TEST_EQUAL(sequences_iterator->description, String("14-3-3 protein beta/alpha OS=Homo sapiens GN=YWHAB PE=1 SV=3")) - TEST_EQUAL(sequences_iterator->sequence, String("MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") - + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") - + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") - + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") - + String("AGEGEN")) - - sequences_iterator++; - TEST_EQUAL(sequences_iterator->identifier, "sp|P00000|0000A_UNKNOWN") - TEST_EQUAL(sequences_iterator->description, String("Artificially modified version of sp|P31946|1433B_HUMAN")) - TEST_EQUAL(sequences_iterator->sequence, String("(ICPL:13C(6))MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") - + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") - + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") - + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") - + String("AGEGEN")) - - // test if the modifed sequence is convertable - AASequence aa = AASequence::fromString(sequences_iterator->sequence); - TEST_EQUAL(aa.toUnmodifiedString(), String("MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") - + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") - + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") - + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") - + String("AGEGEN")) - - TEST_EQUAL(aa.isModified(), true) - String expectedModification = ModificationsDB::getInstance()->getModification("ICPL:13C(6)", "", ResidueModification::N_TERM)->getId(); - TEST_EQUAL(aa.getNTerminalModificationName(), expectedModification) - - sequences_iterator++; - TEST_EQUAL(sequences_iterator->identifier, "test") - TEST_EQUAL(sequences_iterator->description, String(" ##0")) - TEST_EQUAL(sequences_iterator->sequence, String("GSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVS") - + String("PAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSVGSMTVDMQEIGSTEMPYEVPTQ") - + String("PNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVSPAVYTFGL") - + String("FVQNASESLTSDDPSDVPTQRTFKSDFQSVAXXSTFDFYQRRLVTLAESPRAPSPGSMTV") - + String("DMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSF") - + String("KEKVSELVSPAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSV")) - -END_SECTION - -START_SECTION((void store(const String& filename, const std::vector< FASTAEntry > &data) const)) - vector data, data2; - String tmp_filename; - NEW_TMP_FILE(tmp_filename); - FASTAFile file; - - file.load(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta"),data); - TEST_EXCEPTION(Exception::UnableToCreateFile, file.store("/bla/bluff/blblb/sdfhsdjf/test.txt",data)) - - file.store(tmp_filename,data); - file.load(tmp_filename,data2); - TEST_EQUAL(data==data2,true); -END_SECTION - -START_SECTION([EXTRA] test_strange_symbols_in_sequence) - // test if * is read correctly (not changed into something weird like 'X') - String tmp_filename; - NEW_TMP_FILE(tmp_filename); - FASTAFile file; - vector data, data2; - FASTAFile::FASTAEntry temp_entry; - temp_entry.identifier = String("P68509|1433F_BOVIN"); - temp_entry.description = String("This is the description of the first protein"); - temp_entry.sequence = String("GDREQLLQRAR*LAEQ*AERYDDMASAMKAVTEL"); - data.push_back(temp_entry); - data.push_back(temp_entry); // twice - - file.store(tmp_filename, data); - file.load(tmp_filename, data2); - - ABORT_IF(data2.size() != 2); - TEST_EQUAL(data2[0] == temp_entry, true); - TEST_EQUAL(data2[1] == temp_entry, true); - -END_SECTION + using namespace OpenMS; + using namespace std; + + FASTAFile* ptr = nullptr; + START_SECTION((FASTAFile())) + ptr = new FASTAFile(); + TEST_EQUAL(ptr == nullptr, false) + END_SECTION + + START_SECTION((~FASTAFile())) + delete(ptr); + END_SECTION + + START_SECTION([FASTAFile::FASTAEntry] FASTAEntry()) + FASTAFile::FASTAEntry * ptr_e; + ptr_e = new FASTAFile::FASTAEntry(); + TEST_EQUAL(ptr_e == nullptr, false) + END_SECTION + + START_SECTION([FASTAFile::FASTAEntry] FASTAEntry(String id, String desc, String seq)) + FASTAFile::FASTAEntry entry("ID", "DESC", "DAVLDELNER"); + TEST_EQUAL(entry.identifier, "ID") + TEST_EQUAL(entry.description, "DESC") + TEST_EQUAL(entry.sequence, "DAVLDELNER") + END_SECTION + + START_SECTION([FASTAFile::FASTAEntry] bool operator==(const FASTAEntry &rhs) const) + FASTAFile::FASTAEntry entry1("ID", "DESC", "DAV*LDELNER"); + FASTAFile::FASTAEntry entry2("ID", "DESC", "DAV*LDELNER"); + FASTAFile::FASTAEntry entry3("ID2", "DESC", "DAV*LDELNER"); + TEST_EQUAL(entry1==entry2, true) + TEST_EQUAL(entry1==entry3, false) + END_SECTION + + + START_SECTION((void load(const String& filename, std::vector< FASTAEntry > &data))) + vector data; + FASTAFile file; + + TEST_EXCEPTION(Exception::FileNotFound, file.load("FASTAFile_test_this_file_does_not_exist",data)) + + file.load(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta"),data); + vector< FASTAFile::FASTAEntry >::const_iterator sequences_iterator = data.begin(); + TEST_EQUAL(data.size(), 5) + TEST_EQUAL(sequences_iterator->identifier, String("P68509|1433F_BOVIN")) + TEST_EQUAL(sequences_iterator->description, String("This is the description of the first protein")) + TEST_EQUAL(sequences_iterator->sequence, String("GDREQLLQRARLAEQAERYDDMASAMKAVTEL") + + String("NEPLSNEDRNLLSVAYKNVVGARRSSWRVISSIEQKTMADGNEKKLEKVKAYREKIEKELETVC") + + String("NDVLALLDKFLIKNCNDFQYESKVFYLKMKGDYYRYLAEVASGEKKNSVVEASEAAYKEAFEIS") + + String("KEHMQPTHPIRLGLALNFSVFYYEIQNAPEQACLLAKQAFDDAIAELDTLNEDSYKDSTLIMQL") + + String("LRDNLTLWTSDQQDEEAGEGN")) + sequences_iterator++; + TEST_EQUAL(sequences_iterator->identifier, "Q9CQV8|1433B_MOUSE") + TEST_EQUAL(sequences_iterator->sequence, String("TMDKSELVQKAKLAEQAERYDDMAAAMKAVTE") + + String("QGHELSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICND") + + String("VLELLDKYLILNATQAESKVFYLKMKGDYFRYLSEVASGENKQTTVSNSQQAYQEAFEISKKEMQ") + + String("PTHPIRLGLALNFSVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLT") + + String("LWTSENQGDEGDAGEGEN")) + sequences_iterator++; + TEST_EQUAL(sequences_iterator->identifier, "sp|P31946|1433B_HUMAN") + TEST_EQUAL(sequences_iterator->description, String("14-3-3 protein beta/alpha OS=Homo sapiens GN=YWHAB PE=1 SV=3")) + TEST_EQUAL(sequences_iterator->sequence, String("MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") + + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") + + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") + + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") + + String("AGEGEN")) + + sequences_iterator++; + TEST_EQUAL(sequences_iterator->identifier, "sp|P00000|0000A_UNKNOWN") + TEST_EQUAL(sequences_iterator->description, String("Artificially modified version of sp|P31946|1433B_HUMAN")) + TEST_EQUAL(sequences_iterator->sequence, String("(ICPL:13C(6))MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") + + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") + + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") + + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") + + String("AGEGEN"))//wie modifikationen drin lassen? + + // test if the modifed sequence is convertable + AASequence aa = AASequence::fromString(sequences_iterator->sequence); + TEST_EQUAL(aa.toUnmodifiedString(), String("MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") + + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") + + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") + + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") + + String("AGEGEN")) + + TEST_EQUAL(aa.isModified(), true) + String expectedModification = ModificationsDB::getInstance()->getModification("ICPL:13C(6)", "", ResidueModification::N_TERM)->getId(); + TEST_EQUAL(aa.getNTerminalModificationName(), expectedModification) + + sequences_iterator++; + TEST_EQUAL(sequences_iterator->identifier, "test") + TEST_EQUAL(sequences_iterator->description, String(" ##0")) + TEST_EQUAL(sequences_iterator->sequence, String("GSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVS") + + String("PAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSVGSMTVDMQEIGSTEMPYEVPTQ") + + String("PNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVSPAVYTFGL") + + String("FVQNASESLTSDDPSDVPTQRTFKSDFQSVAXXSTFDFYQRRLVTLAESPRAPSPGSMTV") + + String("DMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSF") + + String("KEKVSELVSPAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSV")) + + END_SECTION + + START_SECTION((void store(const String& filename, const std::vector< FASTAEntry > &data) const)) + vector data, data2; + String tmp_filename; + NEW_TMP_FILE(tmp_filename); + FASTAFile file; + + file.load(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta"),data); + TEST_EXCEPTION(Exception::UnableToCreateFile, file.store("/bla/bluff/blblb/sdfhsdjf/test.txt",data)) + + file.store(tmp_filename,data); + file.load(tmp_filename,data2);//fail + TEST_EQUAL(data==data2,true); + END_SECTION + + + START_SECTION([EXTRA] test_strange_symbols_in_sequence) + // test if * is read correctly (not changed into something weird like 'X') + String tmp_filename; + NEW_TMP_FILE(tmp_filename); + FASTAFile file; + vector data, data2; + FASTAFile::FASTAEntry temp_entry; + temp_entry.identifier = String("P68509|1433F_BOVIN"); + temp_entry.description = String("This is the description of the first protein"); + temp_entry.sequence = String("GDREQLLQRAR*LAEQ*AERYDDMASAMKAVTEL"); + data.push_back(temp_entry); + data.push_back(temp_entry); // twice + + file.store(tmp_filename, data); + file.load(tmp_filename, data2); + + ABORT_IF(data2.size() != 2); + TEST_EQUAL(data2[0] == temp_entry, true); + TEST_EQUAL(data2[1] == temp_entry, true); + + END_SECTION + + START_SECTION([EXTRA] test_white_spaces) // noch eine Sequenz in die Fasta Datei hinzufügen und da checken, ob die Zeichen entfernt werden + // test if ' ' and '\t' are removed correctly + String tmp_filename; + NEW_TMP_FILE(tmp_filename); + FASTAFile file; + vector data, data2; + FASTAFile::FASTAEntry temp_entry; + temp_entry.identifier = String("P68509|1433F_BOVIN"); + temp_entry.description = String("This is the description of the first protein"); + temp_entry.sequence = String("GDREQLLQRAR LAEQ\tAERYDDMASAMKAVTEL"); + data.push_back(temp_entry); + + file.store(tmp_filename, data);//tmp_filename ist dann outfile, den wir gleich in load übergeben + file.load(tmp_filename, data2); + + ABORT_IF(data2.size() != 1); + TEST_EQUAL(data2[0].sequence == string("GDREQLLQRARLAEQAERYDDMASAMKAVTEL"), true); + + END_SECTION + + START_SECTION([EXTRA] test_position) // + // test if setPosition() works correctly + String tmp_filename; + NEW_TMP_FILE(tmp_filename); + FASTAFile file; + vector data; + FASTAFile::FASTAEntry temp_entry; + file.readStart(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta")); + streampos a = file.position(); + + file.readNext(temp_entry); + data.push_back(temp_entry); + streampos b = file.position();// check if a+ + file.setPosition(a); + file.readNext(temp_entry); + data.push_back(temp_entry); + streampos c = file.position(); + + ABORT_IF(data.size() != 2 || data[0].sequence != data[1].sequence); + // die erste Proteinsequenz in FASTAFile_test.fasta + TEST_EQUAL(data[0].sequence == string("GDREQLLQRARLAEQAERYDDMASAMKAVTELNEPLSNEDRNLLSVAYKNVVGARRSSWRVISSIEQKTMADGNEKKLEKVKAYREKIEKELETVCNDVLALLDKFLIKNCNDFQYESKVFYLKMKGDYYRYLAEVASGEKKNSVVEASEAAYKEAFEISKEHMQPTHPIRLGLALNFSVFYYEIQNAPEQACLLAKQAFDDAIAELDTLNEDSYKDSTLIMQLLRDNLTLWTSDQQDEEAGEGN"), true); + + + END_SECTION ///////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////// From 26859707f227db14eedaa991de34aff980b2f8fb Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Tue, 27 Apr 2021 14:50:12 +0200 Subject: [PATCH 14/53] atEnd() --- src/openms/source/FORMAT/FASTAFile.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openms/source/FORMAT/FASTAFile.cpp b/src/openms/source/FORMAT/FASTAFile.cpp index 4c340d9bf9b..411aa59729e 100644 --- a/src/openms/source/FORMAT/FASTAFile.cpp +++ b/src/openms/source/FORMAT/FASTAFile.cpp @@ -203,7 +203,7 @@ namespace OpenMS { return true; } - return infile_.eof(); + return false; } void FASTAFile::load(const String& filename, vector& data) From c9d1d50e66d89e76690c1ba755e2c14014737eb8 Mon Sep 17 00:00:00 2001 From: kasrat93 Date: Tue, 27 Apr 2021 14:56:52 +0200 Subject: [PATCH 15/53] test setPosition() --- .../openms/source/FASTAFile_test.cpp | 28 +++++++++++-------- 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/src/tests/class_tests/openms/source/FASTAFile_test.cpp b/src/tests/class_tests/openms/source/FASTAFile_test.cpp index 9a9fe0c82be..c9b3b8eb741 100644 --- a/src/tests/class_tests/openms/source/FASTAFile_test.cpp +++ b/src/tests/class_tests/openms/source/FASTAFile_test.cpp @@ -215,22 +215,26 @@ START_TEST(FASTAFile, "$Id$") String tmp_filename; NEW_TMP_FILE(tmp_filename); FASTAFile file; - vector data; + //vector data; + vector> data; FASTAFile::FASTAEntry temp_entry; file.readStart(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta")); - streampos a = file.position(); + unsigned i =1; + while(i<5){ //read the first, say, 4 entries file.readNext(temp_entry); - data.push_back(temp_entry); - streampos b = file.position();// check if a+ - file.setPosition(a); - file.readNext(temp_entry); - data.push_back(temp_entry); - streampos c = file.position(); - - ABORT_IF(data.size() != 2 || data[0].sequence != data[1].sequence); - // die erste Proteinsequenz in FASTAFile_test.fasta - TEST_EQUAL(data[0].sequence == string("GDREQLLQRARLAEQAERYDDMASAMKAVTELNEPLSNEDRNLLSVAYKNVVGARRSSWRVISSIEQKTMADGNEKKLEKVKAYREKIEKELETVCNDVLALLDKFLIKNCNDFQYESKVFYLKMKGDYYRYLAEVASGEKKNSVVEASEAAYKEAFEISKEHMQPTHPIRLGLALNFSVFYYEIQNAPEQACLLAKQAFDDAIAELDTLNEDSYKDSTLIMQLLRDNLTLWTSDQQDEEAGEGN"), true); + data.push_back(std::make_pair(file.position(), temp_entry));//remember their FASTAEntry's as well as their position. + i++;} + file.setPosition(data[1].first); //reset the position to the, say, 2nd entry + while(i<8){ //read the next 3 entries + file.readNext(temp_entry); + data.push_back(std::make_pair(file.position(), temp_entry));//and also remember their positions + i++; + }// + + ABORT_IF(data.size() != 7 || data[1].second.sequence != data[4].second.sequence || data[6].first != data[3].first ); + // die dritte Proteinsequenz in FASTAFile_test.fasta + TEST_EQUAL(data[5].second.sequence == string("MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFYLKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGDAGEGEN"), true); END_SECTION From bcf9b788cee02ff5a695c967219717124c3c123c Mon Sep 17 00:00:00 2001 From: kasrat93 Date: Tue, 27 Apr 2021 18:57:25 +0200 Subject: [PATCH 16/53] test.tmp --- FASTAFile_test_159.tmp | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 FASTAFile_test_159.tmp diff --git a/FASTAFile_test_159.tmp b/FASTAFile_test_159.tmp new file mode 100644 index 00000000000..30f3f38e7fb --- /dev/null +++ b/FASTAFile_test_159.tmp @@ -0,0 +1,26 @@ +> +GDREQLLQRARLAEQAERYDDMASAMKAVTELNEPLSNEDRNLLSVAYKNVVGARRSSWRVISSIEQKTMADGNEKKLEK +VKAYREKIEKELETVCNDVLALLDKFLIKNCNDFQYESKVFYLKMKGDYYRYLAEVASGEKKNSVVEASEAAYKEAFEIS +KEHMQPTHPIRLGLALNFSVFYYEIQNAPEQACLLAKQAFDDAIAELDTLNEDSYKDSTLIMQLLRDNLTLWTSDQQDEE +AGEGN +> +TMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTERNEKKQQMG +KEYREKIEAELQDICNDVLELLDKYLILNATQAESKVFYLKMKGDYFRYLSEVASGENKQTTVSNSQQAYQEAFEISKKE +MQPTHPIRLGLALNFSVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGDA +GEGEN +> +MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTERNEKKQQM +GKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFYLKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKK +EMQPTHPIRLGLALNFSVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD +AGEGEN +> +ICPL6KSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTERNEKKQQ +MGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFYLKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISK +KEMQPTHPIRLGLALNFSVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEG +DAGEGEN +> +GSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVSPAVYTF +GLFVQNASESLTSDDPSDVPTQRTFKSDFQSVGSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTR +PSGIFRRPSRIKPEFSFKEKVSELVSPAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSVAXXSTFDFYQRRLVTL +AESPRAPSPGSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSE +LVSPAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSV From 127e66030118e257c2059ca5e3c2c4110b4d0f9b Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Wed, 28 Apr 2021 13:19:49 +0200 Subject: [PATCH 17/53] Fasta Reader --- .../OpenMS/DATASTRUCTURES/FASTAContainer.h | 6 ++- src/openms/include/OpenMS/FORMAT/FASTAFile.h | 13 ++++--- src/openms/source/FORMAT/FASTAFile.cpp | 38 ++++++++++++------- 3 files changed, 35 insertions(+), 22 deletions(-) diff --git a/src/openms/include/OpenMS/DATASTRUCTURES/FASTAContainer.h b/src/openms/include/OpenMS/DATASTRUCTURES/FASTAContainer.h index 0215707486b..86a5b9af185 100644 --- a/src/openms/include/OpenMS/DATASTRUCTURES/FASTAContainer.h +++ b/src/openms/include/OpenMS/DATASTRUCTURES/FASTAContainer.h @@ -101,7 +101,8 @@ class FASTAContainer offsets_(), data_fg_(), data_bg_(), - chunk_offset_(0) + chunk_offset_(0), + filename_(FASTA_file) { f_.readStart(FASTA_file); } @@ -207,11 +208,11 @@ class FASTAContainer /// resets reading of the FASTA file, enables fresh reading of the FASTA from the beginning void reset() { - f_.setPosition(0); offsets_.clear(); data_fg_.clear(); data_bg_.clear(); chunk_offset_ = 0; + f_.readStart(filename_); } @@ -231,6 +232,7 @@ class FASTAContainer std::vector data_fg_; ///< active (foreground) data std::vector data_bg_; ///< prefetched (background) data; will become the next active data size_t chunk_offset_; ///< number of entries before the current chunk + std::string filename_;///< FASTA file name }; /** diff --git a/src/openms/include/OpenMS/FORMAT/FASTAFile.h b/src/openms/include/OpenMS/FORMAT/FASTAFile.h index 88da5413c26..16ee605130c 100644 --- a/src/openms/include/OpenMS/FORMAT/FASTAFile.h +++ b/src/openms/include/OpenMS/FORMAT/FASTAFile.h @@ -38,9 +38,7 @@ #include #include -#include #include -#include #include #include @@ -190,7 +188,7 @@ namespace OpenMS @exception Exception::FileNotFound is thrown if the file does not exists. @exception Exception::ParseError is thrown if the file does not suit to the standard. */ - void static load(const String& filename, std::vector& data); + void load(const String& filename, std::vector& data) const; /** @brief stores the data given by 'data' at the file 'filename' @@ -198,11 +196,14 @@ namespace OpenMS This uses more RAM than writeStart() and writeNext(). @exception Exception::UnableToCreateFile is thrown if the process is not able to write the file. */ - void static store(const String& filename, const std::vector& data); + void store(const String& filename, const std::vector& data) const; protected: - bool readEntry_(std::string & id, std::string & seq);///< reading a protein entry and saving the ID and - ///< sequence into the strings id and seq + + /** + @brief Reads a protein entry from the current file position and returns the ID and sequence + */ + bool readEntry_(std::string& id, std::string& seq); std::fstream infile_; ///< filestream for reading; init using FastaFile::readStart() std::ofstream outfile_; ///< filestream for writing; init using FastaFile::writeStart() Size entries_read_; ///< some internal book-keeping during reading diff --git a/src/openms/source/FORMAT/FASTAFile.cpp b/src/openms/source/FORMAT/FASTAFile.cpp index 411aa59729e..8372f6c9679 100644 --- a/src/openms/source/FORMAT/FASTAFile.cpp +++ b/src/openms/source/FORMAT/FASTAFile.cpp @@ -57,27 +57,31 @@ namespace OpenMS bool FASTAFile::readEntry_(std::string & id, std::string & seq) { std::streambuf* sb = infile_.rdbuf(); - bool condition = true; + bool keep_reading = true; + - while(sb->sgetc() == '#')// Skip the header of PEFF files (http://www.psidev.info/peff) - { - infile_.ignore(numeric_limits::max(),'\n'); - } if(sb->sbumpc() == '>')//not saving '>' { - while(condition)// reading the ID + while(keep_reading)// reading the ID { int c = sb->sbumpc();// get and advance to next char switch (c) { case '\n'://ID finished - condition = false; + keep_reading = false; + break; + case '\r': + if (sb->sgetc() == '\n') // peek current char + { + sb->sbumpc(); // consume it + } break; case std::streambuf::traits_type::eof(): infile_.setstate(std::ios::eofbit); if (id.empty()) { // only if we just started a new line, we set the is.fail() == true, ie. is == false infile_.setstate(std::ios::badbit); + return false; } return true; default: @@ -88,8 +92,8 @@ namespace OpenMS else return false;//was in wrong position for reading ID if(id.empty()==true) return false; - condition = true; - while(condition)//reading the sequence + keep_reading = true; + while(keep_reading)//reading the sequence { int c = sb->sbumpc();// get and advance to next char switch (c) @@ -97,7 +101,7 @@ namespace OpenMS case '\n': if (sb->sgetc() == '>') //reaching the beginning of the next protein-entry { - condition = false; + keep_reading = false; } break; case '\r': @@ -115,14 +119,14 @@ namespace OpenMS if (seq.empty()) { // only if we just started a new line, we set the is.fail() == true, ie. is == false infile_.setstate(std::ios::badbit); + return false; } return true; default: seq += (char)c; } } - if(seq.empty()) return false; - return true; + return !seq.empty(); } void FASTAFile::readStart(const String& filename) @@ -145,6 +149,11 @@ namespace OpenMS fileSize_ = infile_.tellg(); infile_.seekg(0, infile_.beg); + std::streambuf* sb = infile_.rdbuf(); + while(sb->sgetc() == '#')// Skip the header of PEFF files (http://www.psidev.info/peff) + { + infile_.ignore(numeric_limits::max(),'\n'); + } entries_read_ = 0; } @@ -186,6 +195,7 @@ namespace OpenMS return infile_.tellg(); } + bool FASTAFile::setPosition(const std::streampos& pos) { if(pos <= fileSize_) @@ -206,7 +216,7 @@ namespace OpenMS return false; } - void FASTAFile::load(const String& filename, vector& data) + void FASTAFile::load(const String& filename, vector& data) const { data.clear(); FASTAEntry p; @@ -260,7 +270,7 @@ namespace OpenMS outfile_.close(); } - void FASTAFile::store(const String& filename, const vector& data) + void FASTAFile::store(const String& filename, const vector& data) const { FASTAFile f; f.writeStart(filename); From 2288487e17725d3bea27274eb35cc7dd97be69b5 Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Wed, 28 Apr 2021 13:30:55 +0200 Subject: [PATCH 18/53] atEnd() --- src/openms/source/FORMAT/FASTAFile.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/openms/source/FORMAT/FASTAFile.cpp b/src/openms/source/FORMAT/FASTAFile.cpp index 8372f6c9679..64da6da4e4e 100644 --- a/src/openms/source/FORMAT/FASTAFile.cpp +++ b/src/openms/source/FORMAT/FASTAFile.cpp @@ -207,13 +207,9 @@ namespace OpenMS return false; } - bool FASTAFile::atEnd() + bool FASTAFile::atEnd() { - if(infile_.peek() == std::streambuf::traits_type::eof()) // empty file - { - return true; - } - return false; + return (infile_.peek() == std::streambuf::traits_type::eof()); } void FASTAFile::load(const String& filename, vector& data) const From 40626c037c8663b991f9b06715ad791cd554158d Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Thu, 29 Apr 2021 12:35:15 +0200 Subject: [PATCH 19/53] ProgressLogger --- src/openms/source/FORMAT/FASTAFile.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/openms/source/FORMAT/FASTAFile.cpp b/src/openms/source/FORMAT/FASTAFile.cpp index 64da6da4e4e..73826ac92b4 100644 --- a/src/openms/source/FORMAT/FASTAFile.cpp +++ b/src/openms/source/FORMAT/FASTAFile.cpp @@ -214,6 +214,7 @@ namespace OpenMS void FASTAFile::load(const String& filename, vector& data) const { + startProgress(0, 1, "Loading FASTA file"); data.clear(); FASTAEntry p; FASTAFile f; @@ -222,6 +223,7 @@ namespace OpenMS { data.push_back(std::move(p)); } + endProgress(); return; } @@ -268,13 +270,16 @@ namespace OpenMS void FASTAFile::store(const String& filename, const vector& data) const { + startProgress(0, data.size(), "Writing FASTA file"); FASTAFile f; f.writeStart(filename); for (vector::const_iterator it = data.begin(); it != data.end(); ++it) { f.writeNext(*it); + nextProgress(); } f.writeEnd(); // close file + endProgress(); } } // namespace OpenMS From a633f58f9210d5ba4d369b79132ba7d5dd566174 Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Thu, 29 Apr 2021 13:17:39 +0200 Subject: [PATCH 20/53] \r --- src/openms/source/FORMAT/FASTAFile.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/openms/source/FORMAT/FASTAFile.cpp b/src/openms/source/FORMAT/FASTAFile.cpp index 73826ac92b4..d759445b672 100644 --- a/src/openms/source/FORMAT/FASTAFile.cpp +++ b/src/openms/source/FORMAT/FASTAFile.cpp @@ -71,10 +71,6 @@ namespace OpenMS keep_reading = false; break; case '\r': - if (sb->sgetc() == '\n') // peek current char - { - sb->sbumpc(); // consume it - } break; case std::streambuf::traits_type::eof(): infile_.setstate(std::ios::eofbit); @@ -105,10 +101,6 @@ namespace OpenMS } break; case '\r': - if (sb->sgetc() == '\n') // peek current char - { - sb->sbumpc(); // consume it - } break; case ' ': //not saving white spaces break; From b545d8833f635c9dc155a8b4a8d066fcfb0d077a Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Thu, 29 Apr 2021 15:27:38 +0200 Subject: [PATCH 21/53] eof --- src/openms/source/FORMAT/FASTAFile.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/openms/source/FORMAT/FASTAFile.cpp b/src/openms/source/FORMAT/FASTAFile.cpp index d759445b672..004ee3eb1af 100644 --- a/src/openms/source/FORMAT/FASTAFile.cpp +++ b/src/openms/source/FORMAT/FASTAFile.cpp @@ -74,12 +74,7 @@ namespace OpenMS break; case std::streambuf::traits_type::eof(): infile_.setstate(std::ios::eofbit); - if (id.empty()) - { // only if we just started a new line, we set the is.fail() == true, ie. is == false - infile_.setstate(std::ios::badbit); - return false; - } - return true; + return false; default: id += (char)c; } From 662920eea454027b96a7b93f830511efd3875ea6 Mon Sep 17 00:00:00 2001 From: kasrat93 Date: Fri, 30 Apr 2021 11:26:30 +0200 Subject: [PATCH 22/53] test position --- .../openms/source/FASTAFile_test.cpp | 35 ++++++++++++------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/src/tests/class_tests/openms/source/FASTAFile_test.cpp b/src/tests/class_tests/openms/source/FASTAFile_test.cpp index babe51c5d60..ba806415528 100644 --- a/src/tests/class_tests/openms/source/FASTAFile_test.cpp +++ b/src/tests/class_tests/openms/source/FASTAFile_test.cpp @@ -252,26 +252,35 @@ START_SECTION(test_white_spaces) String tmp_filename; NEW_TMP_FILE(tmp_filename); FASTAFile file; - //vector data; - vector> data; + + vector> data1; + vector> data2; FASTAFile::FASTAEntry temp_entry; file.readStart(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta")); - unsigned i =1; - while(i<5){ //read the first, say, 4 entries + + for (int i=0; i<5; i++){ file.readNext(temp_entry); - data.push_back(std::make_pair(file.position(), temp_entry));//remember their FASTAEntry's as well as their position. - i++;} - file.setPosition(data[1].first); //reset the position to the, say, 2nd entry - while(i<8){ //read the next 3 entries + data1.push_back(std::make_pair(file.position(), temp_entry)); + } + file.setPosition(data1[0].first); + for(int i=0; i<3; i++){ file.readNext(temp_entry); - data.push_back(std::make_pair(file.position(), temp_entry));//and also remember their positions - i++; - }// + data2.push_back(std::make_pair(file.position(), temp_entry)); + } + + ABORT_IF(data1.size() != 4 || data2.size() != 3 ); + + for(int i=1; i<=data1.size(); i++) { + TEST_EQUAL(data1[i].second.identifier == data2[i - 1].second.identifier, true); + TEST_EQUAL(data1[i].second.description == data2[i - 1].second.description, true); + TEST_EQUAL(data1[i].second.sequence == data2[i - 1].second.sequence, true); + TEST_EQUAL(data2[i].first == data2[i - 1].first, true); + } + //Brauchen wir die konkrete Stringvergleiche auch? - ABORT_IF(data.size() != 7 || data[1].second.sequence != data[4].second.sequence || data[6].first != data[3].first ); // die dritte Proteinsequenz in FASTAFile_test.fasta - TEST_EQUAL(data[5].second.sequence == string("MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFYLKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGDAGEGEN"), true); + //TEST_EQUAL(data[5].second.sequence == string("MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFYLKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGDAGEGEN"), true); END_SECTION From 5eb9ab5aa03b7e81f98f4d2bf87d88849a20de3e Mon Sep 17 00:00:00 2001 From: kasrat93 Date: Fri, 30 Apr 2021 11:41:26 +0200 Subject: [PATCH 23/53] delete_tmp --- FASTAFile_test_159.tmp | 26 -------------------------- 1 file changed, 26 deletions(-) delete mode 100644 FASTAFile_test_159.tmp diff --git a/FASTAFile_test_159.tmp b/FASTAFile_test_159.tmp deleted file mode 100644 index 30f3f38e7fb..00000000000 --- a/FASTAFile_test_159.tmp +++ /dev/null @@ -1,26 +0,0 @@ -> -GDREQLLQRARLAEQAERYDDMASAMKAVTELNEPLSNEDRNLLSVAYKNVVGARRSSWRVISSIEQKTMADGNEKKLEK -VKAYREKIEKELETVCNDVLALLDKFLIKNCNDFQYESKVFYLKMKGDYYRYLAEVASGEKKNSVVEASEAAYKEAFEIS -KEHMQPTHPIRLGLALNFSVFYYEIQNAPEQACLLAKQAFDDAIAELDTLNEDSYKDSTLIMQLLRDNLTLWTSDQQDEE -AGEGN -> -TMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTERNEKKQQMG -KEYREKIEAELQDICNDVLELLDKYLILNATQAESKVFYLKMKGDYFRYLSEVASGENKQTTVSNSQQAYQEAFEISKKE -MQPTHPIRLGLALNFSVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGDA -GEGEN -> -MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTERNEKKQQM -GKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFYLKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKK -EMQPTHPIRLGLALNFSVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD -AGEGEN -> -ICPL6KSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTERNEKKQQ -MGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFYLKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISK -KEMQPTHPIRLGLALNFSVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEG -DAGEGEN -> -GSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVSPAVYTF -GLFVQNASESLTSDDPSDVPTQRTFKSDFQSVGSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTR -PSGIFRRPSRIKPEFSFKEKVSELVSPAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSVAXXSTFDFYQRRLVTL -AESPRAPSPGSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSE -LVSPAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSV From c59d49198fa3bc6b9a2629d69809cf3f779169ed Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Mon, 3 May 2021 14:02:44 +0200 Subject: [PATCH 24/53] fasta-reader --- src/openms/include/OpenMS/FORMAT/FASTAFile.h | 8 +- src/openms/source/FORMAT/FASTAFile.cpp | 77 ++++++++++++-------- 2 files changed, 53 insertions(+), 32 deletions(-) diff --git a/src/openms/include/OpenMS/FORMAT/FASTAFile.h b/src/openms/include/OpenMS/FORMAT/FASTAFile.h index 16ee605130c..edf9af4881f 100644 --- a/src/openms/include/OpenMS/FORMAT/FASTAFile.h +++ b/src/openms/include/OpenMS/FORMAT/FASTAFile.h @@ -202,12 +202,16 @@ namespace OpenMS /** @brief Reads a protein entry from the current file position and returns the ID and sequence + @return Return true if the protein entry was read and saved successfully, false otherwise */ - bool readEntry_(std::string& id, std::string& seq); + bool readEntry_(std::string& id, std::string& description, std::string& seq); std::fstream infile_; ///< filestream for reading; init using FastaFile::readStart() std::ofstream outfile_; ///< filestream for writing; init using FastaFile::writeStart() - Size entries_read_; ///< some internal book-keeping during reading + Size entries_read_{0}; ///< some internal book-keeping during reading std::streampos fileSize_{};///< total number of characters of filestream + std::string seq_;///< sequence of currently read protein + std::string id_;///< identifier of currently read protein + std::string description_;///< description of currently read protein }; } // namespace OpenMS diff --git a/src/openms/source/FORMAT/FASTAFile.cpp b/src/openms/source/FORMAT/FASTAFile.cpp index 004ee3eb1af..0717f0bf56c 100644 --- a/src/openms/source/FORMAT/FASTAFile.cpp +++ b/src/openms/source/FORMAT/FASTAFile.cpp @@ -45,7 +45,6 @@ namespace OpenMS using namespace std; FASTAFile::FASTAFile() - : entries_read_(0) { } @@ -54,24 +53,34 @@ namespace OpenMS // infile_ and outfile_ will close automatically when going out of scope. No need to do it explicitly here. } - bool FASTAFile::readEntry_(std::string & id, std::string & seq) + bool FASTAFile::readEntry_(std::string& id, std::string& description, std::string& seq) { std::streambuf* sb = infile_.rdbuf(); bool keep_reading = true; + bool description_exists = true; - - if(sb->sbumpc() == '>')//not saving '>' + if (sb->sbumpc() != '>') return false; //was in wrong position for reading ID + else { while(keep_reading)// reading the ID { int c = sb->sbumpc();// get and advance to next char switch (c) { - case '\n'://ID finished + case ' ': + if (!id.empty()) + { + keep_reading = false; //ID finished + } + break; + case '\n': //ID finished and no description available keep_reading = false; + description_exists = false; break; case '\r': break; + case '\t': + break; case std::streambuf::traits_type::eof(): infile_.setstate(std::ios::eofbit); return false; @@ -79,10 +88,28 @@ namespace OpenMS id += (char)c; } } + if (description_exists) keep_reading = true; + while(keep_reading)// reading the description + { + int c = sb->sbumpc();// get and advance to next char + switch (c) + { + case '\n': //description finished + keep_reading = false; + break; + case '\r': + break; + case '\t': + break; + case std::streambuf::traits_type::eof(): + infile_.setstate(std::ios::eofbit); + return false; + default: + description += (char)c; + } + } } - else return false;//was in wrong position for reading ID - if(id.empty()==true) return false; - + if (id.empty() && description.empty()) return false; keep_reading = true; while(keep_reading)//reading the sequence { @@ -150,30 +177,21 @@ namespace OpenMS { return false; } - - String id, s; - if (readEntry_(id, s) == false) + seq_.clear(); + id_.clear(); + description_.clear(); + if (!readEntry_(id_, description_, seq_)) { - if (entries_read_ == 0) s = "The first entry could not be read!"; - else s = "Only " + String(entries_read_) + " proteins could be read. The record after failed."; - throw Exception::ParseError(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "", "Error while parsing FASTA file! " + s + " Please check the file!"); + if (entries_read_ == 0) seq_ = "The first entry could not be read!"; + else seq_ = "Only " + String(entries_read_) + " proteins could be read. The record after failed."; + throw Exception::ParseError(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "", "Error while parsing FASTA file! " + seq_ + " Please check the file!"); } ++entries_read_; - protein.sequence = s; // assign here, since 's' might have higher capacity, thus wasting memory (usually 10-15%) - // handle id - id.trim(); - String::size_type position = id.find_first_of(" \v\t"); - if (position == String::npos) - { - protein.identifier = std::move(id); - protein.description = ""; - } - else - { - protein.identifier = id.substr(0, position); - protein.description = id.suffix(id.size() - position - 1); - } + protein.identifier = id_; + protein.description = description_; + protein.sequence = seq_; + return true; } @@ -185,7 +203,7 @@ namespace OpenMS bool FASTAFile::setPosition(const std::streampos& pos) { - if(pos <= fileSize_) + if (pos <= fileSize_) { infile_.clear();//when end of file is reached, otherwise it gets -1 infile_.seekg(pos); @@ -211,7 +229,6 @@ namespace OpenMS data.push_back(std::move(p)); } endProgress(); - return; } void FASTAFile::writeStart(const String& filename) From 52062e997ee846f16206b5bb3ec4181963c68f4d Mon Sep 17 00:00:00 2001 From: kasrat93 Date: Mon, 3 May 2021 14:18:01 +0200 Subject: [PATCH 25/53] merge-conflict From 7360aa58c7d3182d0038ef2f0cb9cb291584a116 Mon Sep 17 00:00:00 2001 From: kasrat93 Date: Mon, 3 May 2021 15:26:44 +0200 Subject: [PATCH 26/53] testPos() --- .../openms/source/FASTAFile_test.cpp | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/tests/class_tests/openms/source/FASTAFile_test.cpp b/src/tests/class_tests/openms/source/FASTAFile_test.cpp index ba806415528..868c76e2877 100644 --- a/src/tests/class_tests/openms/source/FASTAFile_test.cpp +++ b/src/tests/class_tests/openms/source/FASTAFile_test.cpp @@ -259,7 +259,7 @@ START_SECTION(test_white_spaces) file.readStart(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta")); - for (int i=0; i<5; i++){ + for (int i=0; i<4; i++){ file.readNext(temp_entry); data1.push_back(std::make_pair(file.position(), temp_entry)); } @@ -271,17 +271,12 @@ START_SECTION(test_white_spaces) ABORT_IF(data1.size() != 4 || data2.size() != 3 ); - for(int i=1; i<=data1.size(); i++) { - TEST_EQUAL(data1[i].second.identifier == data2[i - 1].second.identifier, true); - TEST_EQUAL(data1[i].second.description == data2[i - 1].second.description, true); - TEST_EQUAL(data1[i].second.sequence == data2[i - 1].second.sequence, true); - TEST_EQUAL(data2[i].first == data2[i - 1].first, true); + for(Size i=1; i Date: Mon, 3 May 2021 15:53:42 +0200 Subject: [PATCH 27/53] old-test-version --- src/tests/class_tests/openms/source/FASTAFile_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/tests/class_tests/openms/source/FASTAFile_test.cpp b/src/tests/class_tests/openms/source/FASTAFile_test.cpp index babe51c5d60..21ddb2bd3d3 100644 --- a/src/tests/class_tests/openms/source/FASTAFile_test.cpp +++ b/src/tests/class_tests/openms/source/FASTAFile_test.cpp @@ -247,7 +247,7 @@ START_SECTION(test_white_spaces) END_SECTION - START_SECTION([EXTRA] test_position) // + START_SECTION([EXTRA] test_position) // test if setPosition() works correctly String tmp_filename; NEW_TMP_FILE(tmp_filename); From fa81f81e9fc16b801e07aa1c72f76a45f2bf3aac Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Mon, 3 May 2021 16:11:56 +0200 Subject: [PATCH 28/53] fasta-reader --- src/openms/source/FORMAT/FASTAFile.cpp | 57 +++++++++++++------------- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/src/openms/source/FORMAT/FASTAFile.cpp b/src/openms/source/FORMAT/FASTAFile.cpp index 0717f0bf56c..0c13bb1edac 100644 --- a/src/openms/source/FORMAT/FASTAFile.cpp +++ b/src/openms/source/FORMAT/FASTAFile.cpp @@ -29,7 +29,7 @@ // // -------------------------------------------------------------------------- // $Maintainer: Timo Sachsenberg $ -// $Authors: Nico Pfeifer, Chris Bielow $ +// $Authors: Nico Pfeifer, Chris Bielow, Tinatin Kasradze, Nora Wild $ // -------------------------------------------------------------------------- #include @@ -60,34 +60,36 @@ namespace OpenMS bool description_exists = true; if (sb->sbumpc() != '>') return false; //was in wrong position for reading ID - else + while(keep_reading)// reading the ID { - while(keep_reading)// reading the ID + int c = sb->sbumpc();// get and advance to next char + switch (c) { - int c = sb->sbumpc();// get and advance to next char - switch (c) - { - case ' ': - if (!id.empty()) - { - keep_reading = false; //ID finished - } - break; - case '\n': //ID finished and no description available - keep_reading = false; - description_exists = false; - break; - case '\r': - break; - case '\t': - break; - case std::streambuf::traits_type::eof(): - infile_.setstate(std::ios::eofbit); - return false; - default: - id += (char)c; - } + case ' ': + if (!id.empty()) + { + keep_reading = false; //ID finished + } + break; + case '\t': + if (!id.empty()) + { + keep_reading = false; //ID finished + } + break; + case '\n': //ID finished and no description available + keep_reading = false; + description_exists = false; + break; + case '\r': + break; + case std::streambuf::traits_type::eof(): + infile_.setstate(std::ios::eofbit); + return false; + default: + id += (char)c; } + } if (description_exists) keep_reading = true; while(keep_reading)// reading the description { @@ -108,8 +110,7 @@ namespace OpenMS description += (char)c; } } - } - if (id.empty() && description.empty()) return false; + if (id.empty()) return false; keep_reading = true; while(keep_reading)//reading the sequence { From ad3292252da25dd81087c78145ecd9e29e016f33 Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Mon, 3 May 2021 16:15:41 +0200 Subject: [PATCH 29/53] fasta-reader --- src/openms/source/FORMAT/FASTAFile.cpp | 38 +++++++++++++------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/src/openms/source/FORMAT/FASTAFile.cpp b/src/openms/source/FORMAT/FASTAFile.cpp index 0c13bb1edac..a211ef94e89 100644 --- a/src/openms/source/FORMAT/FASTAFile.cpp +++ b/src/openms/source/FORMAT/FASTAFile.cpp @@ -90,27 +90,27 @@ namespace OpenMS id += (char)c; } } - if (description_exists) keep_reading = true; - while(keep_reading)// reading the description + if (id.empty()) return false; + if (description_exists) keep_reading = true; + while(keep_reading)// reading the description + { + int c = sb->sbumpc();// get and advance to next char + switch (c) { - int c = sb->sbumpc();// get and advance to next char - switch (c) - { - case '\n': //description finished - keep_reading = false; - break; - case '\r': - break; - case '\t': - break; - case std::streambuf::traits_type::eof(): - infile_.setstate(std::ios::eofbit); - return false; - default: - description += (char)c; - } + case '\n': //description finished + keep_reading = false; + break; + case '\r': + break; + case '\t': + break; + case std::streambuf::traits_type::eof(): + infile_.setstate(std::ios::eofbit); + return false; + default: + description += (char)c; } - if (id.empty()) return false; + } keep_reading = true; while(keep_reading)//reading the sequence { From fcc4d045faba32ba6a3408820ffac59f86fb6b04 Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Mon, 3 May 2021 16:46:07 +0200 Subject: [PATCH 30/53] fasta-reader --- src/openms/source/FORMAT/FASTAFile.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/src/openms/source/FORMAT/FASTAFile.cpp b/src/openms/source/FORMAT/FASTAFile.cpp index a211ef94e89..f93aec57f2b 100644 --- a/src/openms/source/FORMAT/FASTAFile.cpp +++ b/src/openms/source/FORMAT/FASTAFile.cpp @@ -66,11 +66,6 @@ namespace OpenMS switch (c) { case ' ': - if (!id.empty()) - { - keep_reading = false; //ID finished - } - break; case '\t': if (!id.empty()) { @@ -132,7 +127,7 @@ namespace OpenMS case std::streambuf::traits_type::eof(): infile_.setstate(std::ios::eofbit); if (seq.empty()) - { // only if we just started a new line, we set the is.fail() == true, ie. is == false + { infile_.setstate(std::ios::badbit); return false; } From 82e3769b7d01dc9e04fcd4e58affffd94d9c1a9d Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Tue, 4 May 2021 13:20:36 +0200 Subject: [PATCH 31/53] style changes --- src/openms/source/FORMAT/FASTAFile.cpp | 385 +++++++-------- .../openms/source/FASTAFile_test.cpp | 458 +++++++++--------- 2 files changed, 417 insertions(+), 426 deletions(-) diff --git a/src/openms/source/FORMAT/FASTAFile.cpp b/src/openms/source/FORMAT/FASTAFile.cpp index f93aec57f2b..95e03079380 100644 --- a/src/openms/source/FORMAT/FASTAFile.cpp +++ b/src/openms/source/FORMAT/FASTAFile.cpp @@ -40,246 +40,219 @@ #include -namespace OpenMS -{ - using namespace std; +namespace OpenMS { + using namespace std; - FASTAFile::FASTAFile() + FASTAFile::FASTAFile() { + } + + FASTAFile::~FASTAFile() { + // infile_ and outfile_ will close automatically when going out of scope. No need to do it explicitly here. + } + + bool FASTAFile::readEntry_(std::string &id, std::string &description, std::string &seq) { + std::streambuf *sb = infile_.rdbuf(); + bool keep_reading = true; + bool description_exists = true; + + if (sb->sbumpc() != '>') return false; // was in wrong position for reading ID + while (keep_reading) // reading the ID { + int c = sb->sbumpc(); // get and advance to next char + switch (c) { + case ' ': + case '\t': + if (!id.empty()) { + keep_reading = false; // ID finished + } + break; + case '\n': // ID finished and no description available + keep_reading = false; + description_exists = false; + break; + case '\r': + break; + case std::streambuf::traits_type::eof(): + infile_.setstate(std::ios::eofbit); + return false; + default: + id += (char) c; + } } - - FASTAFile::~FASTAFile() + if (id.empty()) return false; + if (description_exists) keep_reading = true; + while (keep_reading) //reading the description { - // infile_ and outfile_ will close automatically when going out of scope. No need to do it explicitly here. + int c = sb->sbumpc(); // get and advance to next char + switch (c) { + case '\n': // description finished + keep_reading = false; + break; + case '\r': + break; + case '\t': + break; + case std::streambuf::traits_type::eof(): + infile_.setstate(std::ios::eofbit); + return false; + default: + description += (char) c; + } } - - bool FASTAFile::readEntry_(std::string& id, std::string& description, std::string& seq) + keep_reading = true; + while (keep_reading) // reading the sequence { - std::streambuf* sb = infile_.rdbuf(); - bool keep_reading = true; - bool description_exists = true; - - if (sb->sbumpc() != '>') return false; //was in wrong position for reading ID - while(keep_reading)// reading the ID - { - int c = sb->sbumpc();// get and advance to next char - switch (c) - { - case ' ': - case '\t': - if (!id.empty()) - { - keep_reading = false; //ID finished - } - break; - case '\n': //ID finished and no description available - keep_reading = false; - description_exists = false; - break; - case '\r': - break; - case std::streambuf::traits_type::eof(): - infile_.setstate(std::ios::eofbit); - return false; - default: - id += (char)c; - } - } - if (id.empty()) return false; - if (description_exists) keep_reading = true; - while(keep_reading)// reading the description - { - int c = sb->sbumpc();// get and advance to next char - switch (c) - { - case '\n': //description finished - keep_reading = false; - break; - case '\r': - break; - case '\t': - break; - case std::streambuf::traits_type::eof(): - infile_.setstate(std::ios::eofbit); - return false; - default: - description += (char)c; - } - } - keep_reading = true; - while(keep_reading)//reading the sequence - { - int c = sb->sbumpc();// get and advance to next char - switch (c) - { - case '\n': - if (sb->sgetc() == '>') //reaching the beginning of the next protein-entry - { - keep_reading = false; - } - break; - case '\r': - break; - case ' ': //not saving white spaces - break; - case '\t': - break; - case std::streambuf::traits_type::eof(): - infile_.setstate(std::ios::eofbit); - if (seq.empty()) - { - infile_.setstate(std::ios::badbit); - return false; - } - return true; - default: - seq += (char)c; - } - } - return !seq.empty(); + int c = sb->sbumpc(); // get and advance to next char + switch (c) { + case '\n': + if (sb->sgetc() == '>') // reaching the beginning of the next protein-entry + { + keep_reading = false; + } + break; + case '\r': + break; + case ' ': // not saving white spaces + break; + case '\t': + break; + case std::streambuf::traits_type::eof(): + infile_.setstate(std::ios::eofbit); + if (seq.empty()) { + infile_.setstate(std::ios::badbit); + return false; + } + return true; + default: + seq += (char) c; + } } + return !seq.empty(); + } - void FASTAFile::readStart(const String& filename) - { + void FASTAFile::readStart(const String &filename) { - if (!File::exists(filename)) - { - throw Exception::FileNotFound(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename); - } + if (!File::exists(filename)) { + throw Exception::FileNotFound(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename); + } - if (!File::readable(filename)) - { - throw Exception::FileNotReadable(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename); - } + if (!File::readable(filename)) { + throw Exception::FileNotReadable(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename); + } - if (infile_.is_open()) infile_.close(); // precaution + if (infile_.is_open()) infile_.close(); // precaution - infile_.open(filename.c_str(), std::ios::binary | std::ios::in); - infile_.seekg(0, infile_.end); - fileSize_ = infile_.tellg(); - infile_.seekg(0, infile_.beg); + infile_.open(filename.c_str(), std::ios::binary | std::ios::in); + infile_.seekg(0, infile_.end); + fileSize_ = infile_.tellg(); + infile_.seekg(0, infile_.beg); - std::streambuf* sb = infile_.rdbuf(); - while(sb->sgetc() == '#')// Skip the header of PEFF files (http://www.psidev.info/peff) - { - infile_.ignore(numeric_limits::max(),'\n'); - } - entries_read_ = 0; + std::streambuf *sb = infile_.rdbuf(); + while (sb->sgetc() == '#') // Skip the header of PEFF files (http://www.psidev.info/peff) + { + infile_.ignore(numeric_limits::max(), '\n'); } + entries_read_ = 0; + } - bool FASTAFile::readNext(FASTAEntry& protein) - { - if (infile_.eof()) - { - return false; - } - seq_.clear(); - id_.clear(); - description_.clear(); - if (!readEntry_(id_, description_, seq_)) - { - if (entries_read_ == 0) seq_ = "The first entry could not be read!"; - else seq_ = "Only " + String(entries_read_) + " proteins could be read. The record after failed."; - throw Exception::ParseError(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "", "Error while parsing FASTA file! " + seq_ + " Please check the file!"); - } - ++entries_read_; + bool FASTAFile::readNext(FASTAEntry &protein) { + if (infile_.eof()) { + return false; + } + seq_.clear(); + id_.clear(); + description_.clear(); + if (!readEntry_(id_, description_, seq_)) { + if (entries_read_ == 0) seq_ = "The first entry could not be read!"; + else seq_ = "Only " + String(entries_read_) + " proteins could be read. Parsing next record failed."; + throw Exception::ParseError(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "", + "Error while parsing FASTA file! " + seq_ + " Please check the file!"); + } + ++entries_read_; - protein.identifier = id_; - protein.description = description_; - protein.sequence = seq_; + protein.identifier = id_; + protein.description = description_; + protein.sequence = seq_; - return true; - } + return true; + } - std::streampos FASTAFile::position() - { - return infile_.tellg(); - } + std::streampos FASTAFile::position() { + return infile_.tellg(); + } - bool FASTAFile::setPosition(const std::streampos& pos) - { - if (pos <= fileSize_) - { - infile_.clear();//when end of file is reached, otherwise it gets -1 - infile_.seekg(pos); - return true; - } - return false; + bool FASTAFile::setPosition(const std::streampos &pos) { + if (pos <= fileSize_) { + infile_.clear(); // when end of file is reached, otherwise it gets -1 + infile_.seekg(pos); + return true; } + return false; + } - bool FASTAFile::atEnd() - { - return (infile_.peek() == std::streambuf::traits_type::eof()); - } + bool FASTAFile::atEnd() { + return (infile_.peek() == std::streambuf::traits_type::eof()); + } - void FASTAFile::load(const String& filename, vector& data) const - { - startProgress(0, 1, "Loading FASTA file"); - data.clear(); - FASTAEntry p; - FASTAFile f; - f.readStart(filename); - while (f.readNext(p)) - { - data.push_back(std::move(p)); - } - endProgress(); + void FASTAFile::load(const String &filename, vector &data) const { + startProgress(0, 1, "Loading FASTA file"); + data.clear(); + FASTAEntry p; + FASTAFile f; + f.readStart(filename); + while (f.readNext(p)) { + data.push_back(std::move(p)); } + endProgress(); + } - void FASTAFile::writeStart(const String& filename) - { - if (!FileHandler::hasValidExtension(filename, FileTypes::FASTA)) - { - throw Exception::UnableToCreateFile(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename, "invalid file extension; expected '" + FileTypes::typeToName(FileTypes::FASTA) + "'"); - } + void FASTAFile::writeStart(const String &filename) { + if (!FileHandler::hasValidExtension(filename, FileTypes::FASTA)) { + throw Exception::UnableToCreateFile(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename, + "invalid file extension; expected '" + + FileTypes::typeToName(FileTypes::FASTA) + "'"); + } - outfile_.open(filename.c_str(), ofstream::out); + outfile_.open(filename.c_str(), ofstream::out); - if (!outfile_.good()) - { - throw Exception::UnableToCreateFile(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename); - } + if (!outfile_.good()) { + throw Exception::UnableToCreateFile(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename); } + } - void FASTAFile::writeNext(const FASTAEntry& protein) - { - outfile_ << ">" << protein.identifier << " " << protein.description << "\n"; - const String& tmp(protein.sequence); - - int chunks( tmp.size()/80 ); // number of complete chunks - Size chunk_pos(0); - while (--chunks >= 0) - { - outfile_.write(&tmp[chunk_pos], 80); - outfile_ << "\n"; - chunk_pos += 80; - } + void FASTAFile::writeNext(const FASTAEntry &protein) { + outfile_ << ">" << protein.identifier << " " << protein.description << "\n"; + const String &tmp(protein.sequence); - if (tmp.size() > chunk_pos) - { - outfile_.write(&tmp[chunk_pos], tmp.size() - chunk_pos); - outfile_ << "\n"; - } + int chunks(tmp.size() / 80); // number of complete chunks + Size chunk_pos(0); + while (--chunks >= 0) { + outfile_.write(&tmp[chunk_pos], 80); + outfile_ << "\n"; + chunk_pos += 80; } - void FASTAFile::writeEnd() - { - outfile_.close(); + if (tmp.size() > chunk_pos) { + outfile_.write(&tmp[chunk_pos], tmp.size() - chunk_pos); + outfile_ << "\n"; } + } - void FASTAFile::store(const String& filename, const vector& data) const - { - startProgress(0, data.size(), "Writing FASTA file"); - FASTAFile f; - f.writeStart(filename); - for (vector::const_iterator it = data.begin(); it != data.end(); ++it) - { - f.writeNext(*it); - nextProgress(); - } - f.writeEnd(); // close file - endProgress(); + void FASTAFile::writeEnd() { + outfile_.close(); + } + + void FASTAFile::store(const String &filename, const vector &data) const { + startProgress(0, data.size(), "Writing FASTA file"); + FASTAFile f; + f.writeStart(filename); + for (vector::const_iterator it = data.begin(); it != data.end(); ++it) { + f.writeNext(*it); + nextProgress(); } + f.writeEnd(); // close file + endProgress(); + } } // namespace OpenMS diff --git a/src/tests/class_tests/openms/source/FASTAFile_test.cpp b/src/tests/class_tests/openms/source/FASTAFile_test.cpp index ac52ecfd0dc..6ba2cc1f3a6 100644 --- a/src/tests/class_tests/openms/source/FASTAFile_test.cpp +++ b/src/tests/class_tests/openms/source/FASTAFile_test.cpp @@ -53,232 +53,250 @@ START_TEST(FASTAFile, "$Id$") ///////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////// - using namespace OpenMS; - using namespace std; - - FASTAFile* ptr = nullptr; - - START_SECTION((FASTAFile())) - ptr = new FASTAFile(); - TEST_EQUAL(ptr == nullptr, false) - END_SECTION - - START_SECTION((~FASTAFile())) - delete(ptr); - END_SECTION - - START_SECTION([FASTAFile::FASTAEntry] FASTAEntry()) - FASTAFile::FASTAEntry * ptr_e; - ptr_e = new FASTAFile::FASTAEntry(); - TEST_EQUAL(ptr_e == nullptr, false) - END_SECTION - - - START_SECTION([FASTAFile::FASTAEntry] FASTAEntry(String id, String desc, String seq)) - FASTAFile::FASTAEntry entry("ID", "DESC", "DAVLDELNER"); - TEST_EQUAL(entry.identifier, "ID") - TEST_EQUAL(entry.description, "DESC") - TEST_EQUAL(entry.sequence, "DAVLDELNER") - END_SECTION - - START_SECTION([FASTAFile::FASTAEntry] bool operator==(const FASTAEntry &rhs) const) - FASTAFile::FASTAEntry entry1("ID", "DESC", "DAV*LDELNER"); - FASTAFile::FASTAEntry entry2("ID", "DESC", "DAV*LDELNER"); - FASTAFile::FASTAEntry entry3("ID2", "DESC", "DAV*LDELNER"); - TEST_EQUAL(entry1==entry2, true) - TEST_EQUAL(entry1==entry3, false) - END_SECTION - - - START_SECTION((void load(const String& filename, std::vector< FASTAEntry > &data))) - vector data; - FASTAFile file; - - TEST_EXCEPTION(Exception::FileNotFound, file.load("FASTAFile_test_this_file_does_not_exist",data)) - - file.load(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta"),data); - vector< FASTAFile::FASTAEntry >::const_iterator sequences_iterator = data.begin(); - TEST_EQUAL(data.size(), 5) - TEST_EQUAL(sequences_iterator->identifier, String("P68509|1433F_BOVIN")) - TEST_EQUAL(sequences_iterator->description, String("This is the description of the first protein")) - TEST_EQUAL(sequences_iterator->sequence, String("GDREQLLQRARLAEQAERYDDMASAMKAVTEL") + - String("NEPLSNEDRNLLSVAYKNVVGARRSSWRVISSIEQKTMADGNEKKLEKVKAYREKIEKELETVC") + - String("NDVLALLDKFLIKNCNDFQYESKVFYLKMKGDYYRYLAEVASGEKKNSVVEASEAAYKEAFEIS") + - String("KEHMQPTHPIRLGLALNFSVFYYEIQNAPEQACLLAKQAFDDAIAELDTLNEDSYKDSTLIMQL") + - String("LRDNLTLWTSDQQDEEAGEGN")) - sequences_iterator++; - TEST_EQUAL(sequences_iterator->identifier, "Q9CQV8|1433B_MOUSE") - TEST_EQUAL(sequences_iterator->sequence, String("TMDKSELVQKAKLAEQAERYDDMAAAMKAVTE") + - String("QGHELSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICND") + - String("VLELLDKYLILNATQAESKVFYLKMKGDYFRYLSEVASGENKQTTVSNSQQAYQEAFEISKKEMQ") + - String("PTHPIRLGLALNFSVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLT") + - String("LWTSENQGDEGDAGEGEN")) - sequences_iterator++; - TEST_EQUAL(sequences_iterator->identifier, "sp|P31946|1433B_HUMAN") - TEST_EQUAL(sequences_iterator->description, String("14-3-3 protein beta/alpha OS=Homo sapiens GN=YWHAB PE=1 SV=3")) - TEST_EQUAL(sequences_iterator->sequence, String("MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") - + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") - + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") - + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") - + String("AGEGEN")) - - sequences_iterator++; - TEST_EQUAL(sequences_iterator->identifier, "sp|P00000|0000A_UNKNOWN") - TEST_EQUAL(sequences_iterator->description, String("Artificially modified version of sp|P31946|1433B_HUMAN")) - TEST_EQUAL(sequences_iterator->sequence, String("(ICPL:13C(6))MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") - + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") - + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") - + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") - - + String("AGEGEN")) - - - - - // test if the modifed sequence is convertable - AASequence aa = AASequence::fromString(sequences_iterator->sequence); - TEST_EQUAL(aa.toUnmodifiedString(), String("MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") - + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") - + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") - + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") - + String("AGEGEN")) - - TEST_EQUAL(aa.isModified(), true) - String expectedModification = ModificationsDB::getInstance()->getModification("ICPL:13C(6)", "", ResidueModification::N_TERM)->getId(); - TEST_EQUAL(aa.getNTerminalModificationName(), expectedModification) - - sequences_iterator++; - TEST_EQUAL(sequences_iterator->identifier, "test") - TEST_EQUAL(sequences_iterator->description, String(" ##0")) - TEST_EQUAL(sequences_iterator->sequence, String("GSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVS") - + String("PAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSVGSMTVDMQEIGSTEMPYEVPTQ") - + String("PNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVSPAVYTFGL") - + String("FVQNASESLTSDDPSDVPTQRTFKSDFQSVAXXSTFDFYQRRLVTLAESPRAPSPGSMTV") - + String("DMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSF") - + String("KEKVSELVSPAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSV")) - - END_SECTION - - - START_SECTION((void store(const String& filename, const std::vector< FASTAEntry > &data) const)) - vector data, data2; - String tmp_filename; - NEW_TMP_FILE(tmp_filename); - FASTAFile file; - - file.load(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta"),data); - TEST_EXCEPTION(Exception::UnableToCreateFile, file.store("/bla/bluff/blblb/sdfhsdjf/test.txt",data)) - - file.store(tmp_filename,data); - - file.load(tmp_filename,data2); - - file.load(tmp_filename,data2); - - TEST_EQUAL(data==data2,true); - END_SECTION - - - START_SECTION([EXTRA] test_strange_symbols_in_sequence) - // test if * is read correctly (not changed into something weird like 'X') - String tmp_filename; - NEW_TMP_FILE(tmp_filename); - FASTAFile file; - vector data, data2; - FASTAFile::FASTAEntry temp_entry; - temp_entry.identifier = String("P68509|1433F_BOVIN"); - temp_entry.description = String("This is the description of the first protein"); - temp_entry.sequence = String("GDREQLLQRAR*LAEQ*AERYDDMASAMKAVTEL"); - data.push_back(temp_entry); - data.push_back(temp_entry); // twice - - file.store(tmp_filename, data); - file.load(tmp_filename, data2); - - ABORT_IF(data2.size() != 2); - TEST_EQUAL(data2[0] == temp_entry, true); - TEST_EQUAL(data2[1] == temp_entry, true); - - END_SECTION - - - -START_SECTION([EXTRA] test_strange_symbols_in_sequence) - // test if * is read correctly (not changed into something weird like 'X') - String tmp_filename; - NEW_TMP_FILE(tmp_filename); - FASTAFile file; - vector data, data2; - FASTAFile::FASTAEntry temp_entry; - temp_entry.identifier = String("P68509|1433F_BOVIN"); - temp_entry.description = String("This is the description of the first protein"); - temp_entry.sequence = String("GDREQLLQRAR*LAEQ*AERYDDMASAMKAVTEL"); - data.push_back(temp_entry); - data.push_back(temp_entry); // twice + using namespace OpenMS; + using namespace std; + + FASTAFile *ptr = nullptr; + + START_SECTION((FASTAFile())) + ptr = new FASTAFile(); + TEST_EQUAL(ptr == nullptr, false) + END_SECTION + + START_SECTION((~FASTAFile())) + delete (ptr); + END_SECTION + + START_SECTION([FASTAFile::FASTAEntry] FASTAEntry()) + FASTAFile::FASTAEntry *ptr_e; + ptr_e = new FASTAFile::FASTAEntry(); + TEST_EQUAL(ptr_e == nullptr, false) + END_SECTION + + + START_SECTION([FASTAFile::FASTAEntry] FASTAEntry(String id, String desc, String seq)) + FASTAFile::FASTAEntry entry("ID", "DESC", "DAVLDELNER"); + TEST_EQUAL(entry.identifier, "ID") + TEST_EQUAL(entry.description, "DESC") + TEST_EQUAL(entry.sequence, "DAVLDELNER") + END_SECTION + + START_SECTION([FASTAFile::FASTAEntry] bool operator==(const FASTAEntry &rhs) const) + FASTAFile::FASTAEntry entry1("ID", "DESC", "DAV*LDELNER"); + FASTAFile::FASTAEntry entry2("ID", "DESC", "DAV*LDELNER"); + FASTAFile::FASTAEntry entry3("ID2", "DESC", "DAV*LDELNER"); + TEST_EQUAL(entry1 == entry2, true) + TEST_EQUAL(entry1 == entry3, false) + END_SECTION + + + START_SECTION((void + load( + const String &filename, std::vector + &data))) + vector data; + FASTAFile file; + + TEST_EXCEPTION(Exception::FileNotFound, file.load("FASTAFile_test_this_file_does_not_exist", data)) + + file.load(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta"), data); + vector::const_iterator sequences_iterator = data.begin(); + TEST_EQUAL(data.size(), 5) + TEST_EQUAL(sequences_iterator->identifier, String("P68509|1433F_BOVIN")) + TEST_EQUAL(sequences_iterator->description, String("This is the description of the first protein")) + TEST_EQUAL(sequences_iterator->sequence, String("GDREQLLQRARLAEQAERYDDMASAMKAVTEL") + + String( + "NEPLSNEDRNLLSVAYKNVVGARRSSWRVISSIEQKTMADGNEKKLEKVKAYREKIEKELETVC") + + String( + "NDVLALLDKFLIKNCNDFQYESKVFYLKMKGDYYRYLAEVASGEKKNSVVEASEAAYKEAFEIS") + + String( + "KEHMQPTHPIRLGLALNFSVFYYEIQNAPEQACLLAKQAFDDAIAELDTLNEDSYKDSTLIMQL") + + String("LRDNLTLWTSDQQDEEAGEGN")) + sequences_iterator++; + TEST_EQUAL(sequences_iterator->identifier, "Q9CQV8|1433B_MOUSE") + TEST_EQUAL(sequences_iterator->sequence, String("TMDKSELVQKAKLAEQAERYDDMAAAMKAVTE") + + String( + "QGHELSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICND") + + String( + "VLELLDKYLILNATQAESKVFYLKMKGDYFRYLSEVASGENKQTTVSNSQQAYQEAFEISKKEMQ") + + String( + "PTHPIRLGLALNFSVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLT") + + String("LWTSENQGDEGDAGEGEN")) + sequences_iterator++; + TEST_EQUAL(sequences_iterator->identifier, "sp|P31946|1433B_HUMAN") + TEST_EQUAL(sequences_iterator->description, + String("14-3-3 protein beta/alpha OS=Homo sapiens GN=YWHAB PE=1 SV=3")) + TEST_EQUAL(sequences_iterator->sequence, String("MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") + + + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") + + + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") + + + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") + + String("AGEGEN")) + + sequences_iterator++; + TEST_EQUAL(sequences_iterator->identifier, "sp|P00000|0000A_UNKNOWN") + TEST_EQUAL(sequences_iterator->description, String("Artificially modified version of sp|P31946|1433B_HUMAN")) + TEST_EQUAL(sequences_iterator->sequence, + String("(ICPL:13C(6))MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") + + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") + + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") + + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") + + + String("AGEGEN")) + + + + + // test if the modifed sequence is convertable + AASequence aa = AASequence::fromString(sequences_iterator->sequence); + TEST_EQUAL(aa.toUnmodifiedString(), String("MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") + + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") + + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") + + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") + + String("AGEGEN")) + + TEST_EQUAL(aa.isModified(), true) + String expectedModification = ModificationsDB::getInstance()->getModification("ICPL:13C(6)", "", + ResidueModification::N_TERM)->getId(); + TEST_EQUAL(aa.getNTerminalModificationName(), expectedModification) + + sequences_iterator++; + TEST_EQUAL(sequences_iterator->identifier, "test") + TEST_EQUAL(sequences_iterator->description, String(" ##0")) + TEST_EQUAL(sequences_iterator->sequence, + String("GSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVS") + + String("PAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSVGSMTVDMQEIGSTEMPYEVPTQ") + + String("PNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVSPAVYTFGL") + + String("FVQNASESLTSDDPSDVPTQRTFKSDFQSVAXXSTFDFYQRRLVTLAESPRAPSPGSMTV") + + String("DMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSF") + + String("KEKVSELVSPAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSV")) + + END_SECTION + + + START_SECTION((void + store( + const String &filename, + const std::vector &data) const)) + vector data, data2; + String tmp_filename; + NEW_TMP_FILE(tmp_filename); + FASTAFile file; + + file.load(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta"), data); + TEST_EXCEPTION(Exception::UnableToCreateFile, file.store("/bla/bluff/blblb/sdfhsdjf/test.txt", data)) + + file.store(tmp_filename, data); + + file.load(tmp_filename, data2); + + file.load(tmp_filename, data2); + + TEST_EQUAL(data == data2, true); + END_SECTION + + + START_SECTION([EXTRA] test_strange_symbols_in_sequence) + // test if * is read correctly (not changed into something weird like 'X') + String tmp_filename; + NEW_TMP_FILE(tmp_filename); + FASTAFile file; + vector data, data2; + FASTAFile::FASTAEntry temp_entry; + temp_entry.identifier = String("P68509|1433F_BOVIN"); + temp_entry.description = String("This is the description of the first protein"); + temp_entry.sequence = String("GDREQLLQRAR*LAEQ*AERYDDMASAMKAVTEL"); + data.push_back(temp_entry); + data.push_back(temp_entry); // twice + + file.store(tmp_filename, data); + file.load(tmp_filename, data2); + + ABORT_IF(data2.size() != 2); + TEST_EQUAL(data2[0] == temp_entry, true); + TEST_EQUAL(data2[1] == temp_entry, true); + + END_SECTION + + + START_SECTION([EXTRA] test_strange_symbols_in_sequence) + // test if * is read correctly (not changed into something weird like 'X') + String tmp_filename; + NEW_TMP_FILE(tmp_filename); + FASTAFile file; + vector data, data2; + FASTAFile::FASTAEntry temp_entry; + temp_entry.identifier = String("P68509|1433F_BOVIN"); + temp_entry.description = String("This is the description of the first protein"); + temp_entry.sequence = String("GDREQLLQRAR*LAEQ*AERYDDMASAMKAVTEL"); + data.push_back(temp_entry); + data.push_back(temp_entry); // twice - file.store(tmp_filename, data); - file.load(tmp_filename, data2); + file.store(tmp_filename, data); + file.load(tmp_filename, data2); - ABORT_IF(data2.size() != 2); - TEST_EQUAL(data2[0] == temp_entry, true); - TEST_EQUAL(data2[1] == temp_entry, true); -END_SECTION + ABORT_IF(data2.size() != 2); + TEST_EQUAL(data2[0] == temp_entry, true); + TEST_EQUAL(data2[1] == temp_entry, true); + END_SECTION -START_SECTION(test_white_spaces) + START_SECTION(test_white_spaces) //test if spaces and tabulators are removed correctly - String tmp_filename; - NEW_TMP_FILE(tmp_filename); - FASTAFile file; - vector data, data2; - - FASTAFile::FASTAEntry temp_entry; - temp_entry.identifier = String("P68509|1433F_BOVIN"); - temp_entry.description = String("This is the description of the first protein"); - temp_entry.sequence = String("GDREQLLQRAR LAEQ\tAERYDDMASAMKAVTEL"); - data.push_back(temp_entry); - - file.store(tmp_filename, data); - file.load(tmp_filename, data2); - - ABORT_IF(data2.size() != 1); - TEST_EQUAL(data2[0].sequence == string("GDREQLLQRARLAEQAERYDDMASAMKAVTEL"), true); - - END_SECTION - - START_SECTION([EXTRA] test_position) - // test if setPosition() works correctly - String tmp_filename; - NEW_TMP_FILE(tmp_filename); - FASTAFile file; - - vector> data1; - vector> data2; - FASTAFile::FASTAEntry temp_entry; - file.readStart(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta")); - - - for (int i=0; i<4; i++){ - file.readNext(temp_entry); - data1.push_back(std::make_pair(file.position(), temp_entry)); - } - file.setPosition(data1[0].first); - for(int i=0; i<3; i++){ - file.readNext(temp_entry); - data2.push_back(std::make_pair(file.position(), temp_entry)); - } - - ABORT_IF(data1.size() != 4 || data2.size() != 3 ); - - for(Size i=1; i data, data2; + + FASTAFile::FASTAEntry temp_entry; + temp_entry.identifier = String("P68509|1433F_BOVIN"); + temp_entry.description = String("This is the description of the first protein"); + temp_entry.sequence = String("GDREQLLQRAR LAEQ\tAERYDDMASAMKAVTEL"); + data.push_back(temp_entry); + + file.store(tmp_filename, data); + file.load(tmp_filename, data2); + + ABORT_IF(data2.size() != 1); + TEST_EQUAL(data2[0].sequence == string("GDREQLLQRARLAEQAERYDDMASAMKAVTEL"), true); + + END_SECTION + + START_SECTION([EXTRA] test_position) + // test if setPosition() works correctly + String tmp_filename; + NEW_TMP_FILE(tmp_filename); + FASTAFile file; + + vector> data1; + vector> data2; + FASTAFile::FASTAEntry temp_entry; + file.readStart(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta")); + + + for (int i = 0; i < 4; i++) { + file.readNext(temp_entry); + data1.push_back(std::make_pair(file.position(), temp_entry)); + } + file.setPosition(data1[0].first); + for (int i = 0; i < 3; i++) { + file.readNext(temp_entry); + data2.push_back(std::make_pair(file.position(), temp_entry)); + } + + ABORT_IF(data1.size() != 4 || data2.size() != 3); + + for (Size i = 1; i < data1.size(); i++) { + TEST_EQUAL(data1[i].second.identifier, data2[i - 1].second.identifier); + TEST_EQUAL(data1[i].second.description, data2[i - 1].second.description); + TEST_EQUAL(data1[i].second.sequence, data2[i - 1].second.sequence); + TEST_EQUAL(data1[i].first, data2[i - 1].first); + } + + END_SECTION ///////////////////////////////////////////////////////////// From c9893a975afaf2e4ed6d65887f0fb865776a9fe4 Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Wed, 5 May 2021 10:24:32 +0200 Subject: [PATCH 32/53] style changes --- src/openms/source/FORMAT/FASTAFile.cpp | 102 +++++++++++++++++-------- 1 file changed, 69 insertions(+), 33 deletions(-) diff --git a/src/openms/source/FORMAT/FASTAFile.cpp b/src/openms/source/FORMAT/FASTAFile.cpp index 95e03079380..bc39fe3eebd 100644 --- a/src/openms/source/FORMAT/FASTAFile.cpp +++ b/src/openms/source/FORMAT/FASTAFile.cpp @@ -40,17 +40,21 @@ #include -namespace OpenMS { +namespace OpenMS +{ using namespace std; - FASTAFile::FASTAFile() { + FASTAFile::FASTAFile() + { } - FASTAFile::~FASTAFile() { + FASTAFile::~FASTAFile() + { // infile_ and outfile_ will close automatically when going out of scope. No need to do it explicitly here. } - bool FASTAFile::readEntry_(std::string &id, std::string &description, std::string &seq) { + bool FASTAFile::readEntry_(std::string &id, std::string &description, std::string &seq) + { std::streambuf *sb = infile_.rdbuf(); bool keep_reading = true; bool description_exists = true; @@ -59,10 +63,12 @@ namespace OpenMS { while (keep_reading) // reading the ID { int c = sb->sbumpc(); // get and advance to next char - switch (c) { + switch (c) + { case ' ': case '\t': - if (!id.empty()) { + if (!id.empty()) + { keep_reading = false; // ID finished } break; @@ -84,7 +90,8 @@ namespace OpenMS { while (keep_reading) //reading the description { int c = sb->sbumpc(); // get and advance to next char - switch (c) { + switch (c) + { case '\n': // description finished keep_reading = false; break; @@ -103,7 +110,8 @@ namespace OpenMS { while (keep_reading) // reading the sequence { int c = sb->sbumpc(); // get and advance to next char - switch (c) { + switch (c) + { case '\n': if (sb->sgetc() == '>') // reaching the beginning of the next protein-entry { @@ -118,7 +126,8 @@ namespace OpenMS { break; case std::streambuf::traits_type::eof(): infile_.setstate(std::ios::eofbit); - if (seq.empty()) { + if (seq.empty()) + { infile_.setstate(std::ios::badbit); return false; } @@ -130,13 +139,16 @@ namespace OpenMS { return !seq.empty(); } - void FASTAFile::readStart(const String &filename) { + void FASTAFile::readStart(const String &filename) + { - if (!File::exists(filename)) { + if (!File::exists(filename)) + { throw Exception::FileNotFound(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename); } - if (!File::readable(filename)) { + if (!File::readable(filename)) + { throw Exception::FileNotReadable(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename); } @@ -155,16 +167,25 @@ namespace OpenMS { entries_read_ = 0; } - bool FASTAFile::readNext(FASTAEntry &protein) { - if (infile_.eof()) { + bool FASTAFile::readNext(FASTAEntry &protein) + { + if (infile_.eof()) + { return false; } seq_.clear(); id_.clear(); description_.clear(); - if (!readEntry_(id_, description_, seq_)) { - if (entries_read_ == 0) seq_ = "The first entry could not be read!"; - else seq_ = "Only " + String(entries_read_) + " proteins could be read. Parsing next record failed."; + if (!readEntry_(id_, description_, seq_)) + { + if (entries_read_ == 0) + { + seq_ = "The first entry could not be read!"; + } + else + { + seq_ = "Only " + String(entries_read_) + " proteins could be read. Parsing next record failed."; + } throw Exception::ParseError(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "", "Error while parsing FASTA file! " + seq_ + " Please check the file!"); } @@ -177,13 +198,16 @@ namespace OpenMS { return true; } - std::streampos FASTAFile::position() { + std::streampos FASTAFile::position() + { return infile_.tellg(); } - bool FASTAFile::setPosition(const std::streampos &pos) { - if (pos <= fileSize_) { + bool FASTAFile::setPosition(const std::streampos &pos) + { + if (pos <= fileSize_) + { infile_.clear(); // when end of file is reached, otherwise it gets -1 infile_.seekg(pos); return true; @@ -191,24 +215,29 @@ namespace OpenMS { return false; } - bool FASTAFile::atEnd() { + bool FASTAFile::atEnd() + { return (infile_.peek() == std::streambuf::traits_type::eof()); } - void FASTAFile::load(const String &filename, vector &data) const { + void FASTAFile::load(const String &filename, vector &data) const + { startProgress(0, 1, "Loading FASTA file"); data.clear(); FASTAEntry p; FASTAFile f; f.readStart(filename); - while (f.readNext(p)) { + while (f.readNext(p)) + { data.push_back(std::move(p)); } endProgress(); } - void FASTAFile::writeStart(const String &filename) { - if (!FileHandler::hasValidExtension(filename, FileTypes::FASTA)) { + void FASTAFile::writeStart(const String &filename) + { + if (!FileHandler::hasValidExtension(filename, FileTypes::FASTA)) + { throw Exception::UnableToCreateFile(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename, "invalid file extension; expected '" + FileTypes::typeToName(FileTypes::FASTA) + "'"); @@ -216,39 +245,46 @@ namespace OpenMS { outfile_.open(filename.c_str(), ofstream::out); - if (!outfile_.good()) { + if (!outfile_.good()) + { throw Exception::UnableToCreateFile(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename); } } - void FASTAFile::writeNext(const FASTAEntry &protein) { + void FASTAFile::writeNext(const FASTAEntry &protein) + { outfile_ << ">" << protein.identifier << " " << protein.description << "\n"; const String &tmp(protein.sequence); int chunks(tmp.size() / 80); // number of complete chunks Size chunk_pos(0); - while (--chunks >= 0) { + while (--chunks >= 0) + { outfile_.write(&tmp[chunk_pos], 80); outfile_ << "\n"; chunk_pos += 80; } - if (tmp.size() > chunk_pos) { + if (tmp.size() > chunk_pos) + { outfile_.write(&tmp[chunk_pos], tmp.size() - chunk_pos); outfile_ << "\n"; } } - void FASTAFile::writeEnd() { + void FASTAFile::writeEnd() + { outfile_.close(); } - void FASTAFile::store(const String &filename, const vector &data) const { + void FASTAFile::store(const String &filename, const vector &data) const + { startProgress(0, data.size(), "Writing FASTA file"); FASTAFile f; f.writeStart(filename); - for (vector::const_iterator it = data.begin(); it != data.end(); ++it) { - f.writeNext(*it); + for (const FASTAFile::FASTAEntry& it : data) + { + f.writeNext(it); nextProgress(); } f.writeEnd(); // close file From ca23e076a943eb8dc739eb31b517ababc4473741 Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Tue, 11 May 2021 15:04:47 +0200 Subject: [PATCH 33/53] seq-align not yet compiling --- .../ID/ConsensusIDAlgorithmPEPMatrix.h | 121 +----------------- .../ANALYSIS/SEQUENCE/NeedlemanWunsch.h | 82 ++++++++++++ src/openms/includes.cmake | 2 + .../ID/ConsensusIDAlgorithmPEPMatrix.cpp | 64 +++------ .../ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp | 74 +++++++++++ 5 files changed, 177 insertions(+), 166 deletions(-) create mode 100644 src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h create mode 100644 src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.h b/src/openms/include/OpenMS/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.h index 742206e91e7..b9ec4ede3ab 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.h @@ -35,113 +35,7 @@ #pragma once #include -#include - -// Extend SeqAn by a user-define scoring matrix. -namespace seqan -{ - - // We have to create a new specialization of the _ScoringMatrix class - // for amino acids. For this, we first create a new tag. - struct PAM30MS {}; // PAM30MS matrix - struct AdaptedIdentity {}; // identity matrix adapted for I/L, Q/K ambiguity - - // Then, we specialize the class _ScoringMatrix. - template <> - struct ScoringMatrixData_ - { - enum - { - VALUE_SIZE = ValueSize::VALUE, - TAB_SIZE = VALUE_SIZE * VALUE_SIZE - }; - static inline const int* getData() - { - // Rant: I cannot find a primary source for the PAM30MS scoring matrix! - // It seems to have been first published in Huang et al., JBC 2001 - // (http://www.jbc.org/content/276/30/28327), but the paper does not show - // the actual matrix (gah!). - // The matrix here comes from old OpenMS code and also matches this one: - // http://proteomics.fiocruz.br/supplementaryfiles/pepexplorer/BeforeRevision/PFUGridResults/PFUGridSearch/pam30ms.txt - - static const int _data[TAB_SIZE] = - { - // A R N D C Q E G H I L K M F P S T W Y V B Z X * - /* A */ 6, -7, -4, -3, -6, -4, -2, -2, -7, -5, -6, -7, -5, -8, -2, 0, -1,-13, -8, -2, -7, -6, 0,-17, - /* R */ -7, 8, -6,-10, -8, -2, -9, -9, -2, -5, -7, 0, -4, -9, -4, -3, -6, -2,-10, -8, 5, -1, 0,-17, - /* N */ -4, -6, 8, 2,-11, -3, -2, -3, 0, -5, -6, -1, -9, -9, -6, 0, -2, -8, -4, -8, -4, -2, 0,-17, - /* D */ -3,-10, 2, 8,-14, -2, 2, -3, -4, -7,-10, -4,-11,-15, -8, -4, -5,-15,-11, -8, -7, -3, 0,-17, - /* C */ -6, -8,-11,-14, 10,-14,-14, -9, -7, -6,-11,-14,-13,-13, -8, -3, -8,-15, -4, -6,-11,-14, 0,-17, - /* Q */ -4, -2, -3, -2,-14, 8, 1, -7, 1, -8, -7, -3, -4,-13, -3, -5, -5,-13,-12, -7, -3, 4, 0,-17, - /* E */ -2, -9, -2, 2,-14, 1, 8, -4, -5, -5, -7, -4, -7,-14, -5, -4, -6,-17, -8, -6, -7, -2, 0,-17, - /* G */ -2, -9, -3, -3, -9, -7, -4, 6, -9,-11,-11, -7, -8, -9, -6, -2, -6,-15,-14, -5, -8, -7, 0,-17, - /* H */ -7, -2, 0, -4, -7, 1, -5, -9, 9, -9, -8, -6,-10, -6, -4, -6, -7, -7, -3, -6, -4, -3, 0,-17, - /* I */ -5, -5, -5, -7, -6, -8, -5,-11, -9, 8, 5, -6, -1, -2, -8, -7, -2,-14, -6, 2, -6, -7, 0,-17, - /* L */ -6, -7, -6,-10,-11, -7, -7,-11, -8, 5, 5, -7, 0, -3, -8, -8, -5,-10, -7, 0, -7, -7, 0,-17, - /* K */ -7, 0, -1, -4,-14, -3, -4, -7, -6, -6, -7, 7, -2,-14, -6, -4, -3,-12, -9, -9, 5, 4, 0,-17, - /* M */ -5, -4, -9,-11,-13, -4, -7, -8,-10, -1, 0, -2, 11, -4, -8, -5, -4,-13,-11, -1, -3, -3, 0,-17, - /* F */ -8, -9, -9,-15,-13,-13,-14, -9, -6, -2, -3,-14, -4, 9,-10, -6, -9, -4, 2, -8,-12,-14, 0,-17, - /* P */ -2, -4, -6, -8, -8, -3, -5, -6, -4, -8, -8, -6, -8,-10, 8, -2, -4,-14,-13, -6, -5, -5, 0,-17, - /* S */ 0, -3, 0, -4, -3, -5, -4, -2, -6, -7, -8, -4, -5, -6, -2, 6, 0, -5, -7, -6, -4, -5, 0,-17, - /* T */ -1, -6, -2, -5, -8, -5, -6, -6, -7, -2, -5, -3, -4, -9, -4, 0, 7,-13, -6, -3, -5, -4, 0,-17, - /* W */ -13, -2, -8,-15,-15,-13,-17,-15, -7,-14,-10,-12,-13, -4,-14, -5,-13, 13, -5,-15, -7,-13, 0,-17, - /* Y */ -8,-10, -4,-11, -4,-12, -8,-14, -3, -6, -7, -9,-11, 2,-13, -7, -6, -5, 10, -7,-10,-11, 0,-17, - /* V */ -2, -8, -8, -8, -6, -7, -6, -5, -6, 2, 0, -9, -1, -8, -6, -6, -3,-15, -7, 7, -9, -8, 0,-17, - /* B */ -7, 5, -4, -7,-11, -3, -7, -8, -4, -6, -7, 5, -3,-12, -5, -4, -5, -7,-10, -9, 5, 1, 0,-17, - /* Z */ -6, -1, -2, -3,-14, 4, -2, -7, -3, -7, -7, 4, -3,-14, -5, -5, -4,-13,-11, -8, 1, 4, 0,-17, - /* X */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-17, - /* * */ -17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17, 1 - }; - - return _data; - } - }; - - template <> - struct ScoringMatrixData_ - { - enum - { - VALUE_SIZE = ValueSize::VALUE, - TAB_SIZE = VALUE_SIZE * VALUE_SIZE - }; - static inline const int* getData() - { - static const int _data[TAB_SIZE] = - { - // A R N D C Q E G H I L K M F P S T W Y V B Z X * - /* A */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* R */ 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* N */ 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* D */ 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* C */ 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* Q */ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* E */ 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* G */ 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* H */ 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* I */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* L */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* K */ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* M */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* P */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* S */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, -17, - /* T */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, -17, - /* W */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, -17, - /* Y */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, -17, - /* V */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, -17, - /* B */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, -17, - /* Z */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, -17, - /* X */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* * */ -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, 1 - }; - - return _data; - } - }; - -} // namespace seqan - +#include namespace OpenMS { @@ -162,17 +56,8 @@ namespace OpenMS ConsensusIDAlgorithmPEPMatrix(); private: - /// SeqAn similarity scoring - typedef ::seqan::Score > SeqAnScore; - - /// SeqAn amino acid sequence - typedef ::seqan::String< ::seqan::AminoAcid> SeqAnSequence; - - /// Similarity scoring method - SeqAnScore scoring_method_; - /// Alignment data structure - ::seqan::Align alignment_; + NeedlemanWunsch object; /// Not implemented ConsensusIDAlgorithmPEPMatrix(const ConsensusIDAlgorithmPEPMatrix&); @@ -180,8 +65,6 @@ namespace OpenMS /// Not implemented ConsensusIDAlgorithmPEPMatrix& operator=(const ConsensusIDAlgorithmPEPMatrix&); - /// Docu in base class - void updateMembers_() override; /// Sequence similarity based on substitution matrix (ignores PTMs) double getSimilarity_(AASequence seq1, AASequence seq2) override; diff --git a/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h b/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h new file mode 100644 index 00000000000..4ebc91104b5 --- /dev/null +++ b/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h @@ -0,0 +1,82 @@ +#include + +std::vector adaptedIdentity + { + // A R N D C Q E G H I L K M F P S T W Y V B Z X * + /* A */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* R */ 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* N */ 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* D */ 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* C */ 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* Q */ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* E */ 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* G */ 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* H */ 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* I */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* L */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* K */ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* M */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* P */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* S */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, -17, + /* T */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, -17, + /* W */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, -17, + /* Y */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, -17, + /* V */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, -17, + /* B */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, -17, + /* Z */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, -17, + /* X */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* * */ -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, 1 + }; + +std::vector PAM30MS + { + // A R N D C Q E G H I L K M F P S T W Y V B Z X * + /* A */ 6, -7, -4, -3, -6, -4, -2, -2, -7, -5, -6, -7, -5, -8, -2, 0, -1,-13, -8, -2, -7, -6, 0,-17, + /* R */ -7, 8, -6,-10, -8, -2, -9, -9, -2, -5, -7, 0, -4, -9, -4, -3, -6, -2,-10, -8, 5, -1, 0,-17, + /* N */ -4, -6, 8, 2,-11, -3, -2, -3, 0, -5, -6, -1, -9, -9, -6, 0, -2, -8, -4, -8, -4, -2, 0,-17, + /* D */ -3,-10, 2, 8,-14, -2, 2, -3, -4, -7,-10, -4,-11,-15, -8, -4, -5,-15,-11, -8, -7, -3, 0,-17, + /* C */ -6, -8,-11,-14, 10,-14,-14, -9, -7, -6,-11,-14,-13,-13, -8, -3, -8,-15, -4, -6,-11,-14, 0,-17, + /* Q */ -4, -2, -3, -2,-14, 8, 1, -7, 1, -8, -7, -3, -4,-13, -3, -5, -5,-13,-12, -7, -3, 4, 0,-17, + /* E */ -2, -9, -2, 2,-14, 1, 8, -4, -5, -5, -7, -4, -7,-14, -5, -4, -6,-17, -8, -6, -7, -2, 0,-17, + /* G */ -2, -9, -3, -3, -9, -7, -4, 6, -9,-11,-11, -7, -8, -9, -6, -2, -6,-15,-14, -5, -8, -7, 0,-17, + /* H */ -7, -2, 0, -4, -7, 1, -5, -9, 9, -9, -8, -6,-10, -6, -4, -6, -7, -7, -3, -6, -4, -3, 0,-17, + /* I */ -5, -5, -5, -7, -6, -8, -5,-11, -9, 8, 5, -6, -1, -2, -8, -7, -2,-14, -6, 2, -6, -7, 0,-17, + /* L */ -6, -7, -6,-10,-11, -7, -7,-11, -8, 5, 5, -7, 0, -3, -8, -8, -5,-10, -7, 0, -7, -7, 0,-17, + /* K */ -7, 0, -1, -4,-14, -3, -4, -7, -6, -6, -7, 7, -2,-14, -6, -4, -3,-12, -9, -9, 5, 4, 0,-17, + /* M */ -5, -4, -9,-11,-13, -4, -7, -8,-10, -1, 0, -2, 11, -4, -8, -5, -4,-13,-11, -1, -3, -3, 0,-17, + /* F */ -8, -9, -9,-15,-13,-13,-14, -9, -6, -2, -3,-14, -4, 9,-10, -6, -9, -4, 2, -8,-12,-14, 0,-17, + /* P */ -2, -4, -6, -8, -8, -3, -5, -6, -4, -8, -8, -6, -8,-10, 8, -2, -4,-14,-13, -6, -5, -5, 0,-17, + /* S */ 0, -3, 0, -4, -3, -5, -4, -2, -6, -7, -8, -4, -5, -6, -2, 6, 0, -5, -7, -6, -4, -5, 0,-17, + /* T */ -1, -6, -2, -5, -8, -5, -6, -6, -7, -2, -5, -3, -4, -9, -4, 0, 7,-13, -6, -3, -5, -4, 0,-17, + /* W */ -13, -2, -8,-15,-15,-13,-17,-15, -7,-14,-10,-12,-13, -4,-14, -5,-13, 13, -5,-15, -7,-13, 0,-17, + /* Y */ -8,-10, -4,-11, -4,-12, -8,-14, -3, -6, -7, -9,-11, 2,-13, -7, -6, -5, 10, -7,-10,-11, 0,-17, + /* V */ -2, -8, -8, -8, -6, -7, -6, -5, -6, 2, 0, -9, -1, -8, -6, -6, -3,-15, -7, 7, -9, -8, 0,-17, + /* B */ -7, 5, -4, -7,-11, -3, -7, -8, -4, -6, -7, 5, -3,-12, -5, -4, -5, -7,-10, -9, 5, 1, 0,-17, + /* Z */ -6, -1, -2, -3,-14, 4, -2, -7, -3, -7, -7, 4, -3,-14, -5, -5, -4,-13,-11, -8, 1, 4, 0,-17, + /* X */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-17, + /* * */ -17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17, 1 + }; + +namespace OpenMS +{ + class OPENMS_DLLAPI NeedlemanWunsch + { + public: + enum ScoringMatrix + { + PAM30MSMatrix, + identityMatrix + }; + + NeedlemanWunsch(ScoringMatrix matrix, int penalty); + int align_(const String& seq1, const String& seq2); + + private: + int getIndex_(char& a const, char& b const) const; + unsigned seq1len_ = 0; + unsigned seq2len_ = 0; + int gapPenalty_ = 0; + std::vector* matrixPtr_ = nullptr; + }; +} \ No newline at end of file diff --git a/src/openms/includes.cmake b/src/openms/includes.cmake index efc8464a49e..72cb4e5a186 100644 --- a/src/openms/includes.cmake +++ b/src/openms/includes.cmake @@ -21,6 +21,7 @@ include(source/FORMAT/VALIDATORS/sources.cmake) include(source/FORMAT/OPTIONS/sources.cmake) include(source/FORMAT/sources.cmake) include(source/ANALYSIS/QUANTITATION/sources.cmake) +include(source/ANALYSIS/SEQUENCE/sources.cmake) include(source/ANALYSIS/SVM/sources.cmake) include(source/ANALYSIS/MAPMATCHING/sources.cmake) include(source/ANALYSIS/DECHARGING/sources.cmake) @@ -80,6 +81,7 @@ include(include/OpenMS/ANALYSIS/ID/sources.cmake) include(include/OpenMS/ANALYSIS/DENOVO/sources.cmake) include(include/OpenMS/ANALYSIS/MAPMATCHING/sources.cmake) include(include/OpenMS/ANALYSIS/QUANTITATION/sources.cmake) +include(include/OpenMS/ANALYSIS/SEQUENCE/sources.cmake) include(include/OpenMS/ANALYSIS/SVM/sources.cmake) include(include/OpenMS/ANALYSIS/PIP/sources.cmake) include(include/OpenMS/ANALYSIS/MRM/sources.cmake) diff --git a/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp b/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp index 00eab98aee7..e9ba0f7f498 100644 --- a/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp +++ b/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp @@ -33,6 +33,7 @@ // -------------------------------------------------------------------------- #include +#include using namespace std; @@ -40,46 +41,34 @@ namespace OpenMS { ConsensusIDAlgorithmPEPMatrix::ConsensusIDAlgorithmPEPMatrix() { - setName("ConsensusIDAlgorithmPEPMatrix"); // DefaultParamHandler - + setName("ConsensusIDAlgorithmPEPMatrix"); defaults_.setValue("matrix", "identity", "Substitution matrix to use for alignment-based similarity scoring"); - defaults_.setValidStrings("matrix", {"identity","PAM30MS"}); + defaults_.setValidStrings("matrix", {"identity", "PAM30MS"}); defaults_.setValue("penalty", 5, "Alignment gap penalty (the same value is used for gap opening and extension)"); defaults_.setMinInt("penalty", 1); defaultsToParam_(); - ::seqan::resize(rows(alignment_), 2); - } - - - void ConsensusIDAlgorithmPEPMatrix::updateMembers_() - { - ConsensusIDAlgorithmSimilarity::updateMembers_(); - - // alignment scoring using SeqAn/similarity matrices: - std::string matrix = param_.getValue("matrix"); + string matrix = param_.getValue("matrix"); //muss ja enum sein... int penalty = param_.getValue("penalty"); - scoring_method_ = SeqAnScore(-penalty, -penalty); + if (matrix == "identity") { - ::seqan::setDefaultScoreMatrix(scoring_method_, - ::seqan::AdaptedIdentity()); + NeedlemanWunsch::ScoringMatrix enumMatrix = NeedlemanWunsch::identityMatrix; + NeedlemanWunsch object(enumMatrix, penalty); } else if (matrix == "PAM30MS") { - ::seqan::setDefaultScoreMatrix(scoring_method_, ::seqan::PAM30MS()); + NeedlemanWunsch::ScoringMatrix enumMatrix = NeedlemanWunsch::PAM30MSMatrix; + NeedlemanWunsch object(enumMatrix, penalty); } else { String msg = "Matrix '" + matrix + "' is not known! Valid choices are: " - "'identity', 'PAM30MS'."; + "'identity', 'PAM30MS'."; throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, msg); } - - // new parameters may affect the similarity calculation, so clear cache: - similarities_.clear(); } @@ -90,30 +79,13 @@ namespace OpenMS String unmod_seq1 = seq1.toUnmodifiedString(); String unmod_seq2 = seq2.toUnmodifiedString(); if (unmod_seq1 == unmod_seq2) return 1.0; - // order of sequences matters for cache look-up: - if (unmod_seq1 > unmod_seq2) swap(unmod_seq1, unmod_seq2); - seq1 = AASequence::fromString(unmod_seq1); - seq2 = AASequence::fromString(unmod_seq2); - pair seq_pair = make_pair(seq1, seq2); - SimilarityCache::iterator pos = similarities_.find(seq_pair); - if (pos != similarities_.end()) return pos->second; // score found in cache - - // use SeqAn similarity scoring: - SeqAnSequence seqan_seq1 = unmod_seq1.c_str(); - SeqAnSequence seqan_seq2 = unmod_seq2.c_str(); - // seq. 1 against itself: - ::seqan::assignSource(row(alignment_, 0), seqan_seq1); - ::seqan::assignSource(row(alignment_, 1), seqan_seq1); - double score_self1 = globalAlignment(alignment_, scoring_method_, - ::seqan::NeedlemanWunsch()); - // seq. 1 against seq. 2: - ::seqan::assignSource(row(alignment_, 1), seqan_seq2); - double score_sim = globalAlignment(alignment_, scoring_method_, - ::seqan::NeedlemanWunsch()); - // seq. 2 against itself: - ::seqan::assignSource(row(alignment_, 0), seqan_seq2); - double score_self2 = globalAlignment(alignment_, scoring_method_, - ::seqan::NeedlemanWunsch()); + // if (unmod_seq1 > unmod_seq2) swap(unmod_seq1, unmod_seq2); + + + int score_self1 = object.align_(unmod_seq1, unmod_seq1); + int score_sim = object.align_(unmod_seq1, unmod_seq2); + int score_self2 = object.align_(unmod_seq2, unmod_seq2); + if (score_sim < 0) { score_sim = 0; @@ -122,8 +94,6 @@ namespace OpenMS { score_sim /= min(score_self1, score_self2); // normalize } - similarities_[seq_pair] = score_sim; // cache the similarity score - return score_sim; } diff --git a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp new file mode 100644 index 00000000000..7a9de7bd141 --- /dev/null +++ b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp @@ -0,0 +1,74 @@ +#include + +using namespace std; +namespace OpenMS +{ + NeedlemanWunsch(ScoringMatrix matrix, int penalty) +{ + gapPenalty_ = penalty; + switch(matrix) + { + case identity: + matrixPtr_ = &adaptedIdentity; + break; + case PAM30MS: + matrixPtr_ = &PAM30MS; + break; + default: + String msg = "Matrix '" + matrix + "' is not known! Valid choices are: " + "'identity', 'PAM30MS'."; + throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, + msg); + } +} + +int NeedlemanWunsch::getIndex_(char& a const, char& b const) const //noch optimieren +{ +vector> vec = + { + {'A', 0}, {'R', 1}, {'N', 2}, + {'D', 3}, {'C', 4}, {'Q', 5}, + {'E', 6}, {'G', 7}, {'H', 8}, + {'I', 9}, {'L', 10}, {'K', 11}, + {'M', 12}, {'F', 13}, {'P', 14}, + {'S', 15}, {'T', 16}, {'W', 17}, + {'Y', 18}, {'V', 19}, {'B', 20}, + {'Z', 21}, {'X', 22}, {'*', 23} + }; +int x = -1; +int y = -1; +for (int i = 0; i < vec.size(); ++i) +{ +if (vec[i].first == a) +x = vec[i].second; +if (vec[i].first == b) +y = vec[i].second; +} +if (x == -1) +x = 23; +if (y == -1) +y = 23; +return x + y*vec.size(); +} + +int NeedlemanWunsch::align_(const String& seq1, const String& seq2) +{ + seq1len_ = seq1.length(); + seq2len_ = seq2.length(); + + std::vector matrix((seq1len_+1)*(seq2len_+1), 0)//matrix mit 0en initialisieren + for (unsigned i = 1; i <= seq1len_; ++i) //vertikale mit gapkkosten initialisieren + matrix[i*(seq2len_+1)]=i*gapPenalty_; + for (unsigned i =0; i<=seq2len_;++i)//horizontale mit gapkosten initialieren + matrix[i]=i*gapPenalty_; + for (unsigned i=1;i<=seq1len_;++1) + { + for (unsigned j=1;j<=seq2len_;++j) + { + matrix[i*(seq2len_ +1)+j]=max(max((matrix[i*(seq2len_+1)+j-1]+gapPenalty_), (matrix[(i-1)*(seq2len_+1)+j]+gapPenalty)), + (matrix[(i-1)*(seq2len_+1)+j-1])+ *matrixPtr_[getIndex_(seq1[i], seq2[j])]) + } + } + return matrix[(seq1len_+1)*(seq2len_+1)-1]; +} +} \ No newline at end of file From 5502882ac6a20e1e4c46c7091e50e463ca47a11f Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Tue, 11 May 2021 22:28:45 +0200 Subject: [PATCH 34/53] seq-align not yet compiling --- .../ANALYSIS/SEQUENCE/NeedlemanWunsch.h | 29 ++++++++++--------- .../ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp | 8 ++--- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h b/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h index 4ebc91104b5..f01275d71ad 100644 --- a/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h +++ b/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h @@ -61,22 +61,23 @@ std::vector PAM30MS namespace OpenMS { class OPENMS_DLLAPI NeedlemanWunsch - { - public: - enum ScoringMatrix + { + public: + enum ScoringMatrix { - PAM30MSMatrix, - identityMatrix - }; + PAM30MSMatrix, + identityMatrix + }; - NeedlemanWunsch(ScoringMatrix matrix, int penalty); - int align_(const String& seq1, const String& seq2); + NeedlemanWunsch(ScoringMatrix matrix, int penalty); + ~NeedlemanWunsch()=default; + int align_(const String& seq1, const String& seq2); private: - int getIndex_(char& a const, char& b const) const; - unsigned seq1len_ = 0; - unsigned seq2len_ = 0; - int gapPenalty_ = 0; - std::vector* matrixPtr_ = nullptr; - }; + int getIndex_(const char& a, const char& b) const; + unsigned seq1len_ = 0; + unsigned seq2len_ = 0; + int gapPenalty_ = 0; + std::vector* matrixPtr_ = nullptr; + }; } \ No newline at end of file diff --git a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp index 7a9de7bd141..ffd7aed23cb 100644 --- a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp +++ b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp @@ -3,7 +3,7 @@ using namespace std; namespace OpenMS { - NeedlemanWunsch(ScoringMatrix matrix, int penalty) + NeedlemanWunsch::NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix matrix, int penalty) { gapPenalty_ = penalty; switch(matrix) @@ -22,7 +22,7 @@ namespace OpenMS } } -int NeedlemanWunsch::getIndex_(char& a const, char& b const) const //noch optimieren +int NeedlemanWunsch::getIndex_(const char& a, const char& b) const //noch optimieren { vector> vec = { @@ -61,12 +61,12 @@ int NeedlemanWunsch::align_(const String& seq1, const String& seq2) matrix[i*(seq2len_+1)]=i*gapPenalty_; for (unsigned i =0; i<=seq2len_;++i)//horizontale mit gapkosten initialieren matrix[i]=i*gapPenalty_; - for (unsigned i=1;i<=seq1len_;++1) + for (unsigned i=1;i<=seq1len_;++i) { for (unsigned j=1;j<=seq2len_;++j) { matrix[i*(seq2len_ +1)+j]=max(max((matrix[i*(seq2len_+1)+j-1]+gapPenalty_), (matrix[(i-1)*(seq2len_+1)+j]+gapPenalty)), - (matrix[(i-1)*(seq2len_+1)+j-1])+ *matrixPtr_[getIndex_(seq1[i], seq2[j])]) + (matrix[(i-1)*(seq2len_+1)+j-1])+ *matrixPtr_[getIndex_(seq1[i-1], seq2[j-1])]) } } return matrix[(seq1len_+1)*(seq2len_+1)-1]; From 213b6a66952dcb14b4cdfdcfd04dc06e714c6fd1 Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Thu, 13 May 2021 16:28:03 +0200 Subject: [PATCH 35/53] seq-align working but not yet optimized --- .../ID/ConsensusIDAlgorithmPEPMatrix.h | 7 +- .../ANALYSIS/SEQUENCE/NeedlemanWunsch.h | 127 +++++++++--------- .../ID/ConsensusIDAlgorithmPEPMatrix.cpp | 14 +- .../ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp | 36 ++--- .../ConsensusIDAlgorithmPEPMatrix_test.cpp | 10 ++ 5 files changed, 100 insertions(+), 94 deletions(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.h b/src/openms/include/OpenMS/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.h index b9ec4ede3ab..03fc6a9db68 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.h @@ -54,10 +54,11 @@ namespace OpenMS public: /// Default constructor ConsensusIDAlgorithmPEPMatrix(); - + /// Sequence similarity based on substitution matrix (ignores PTMs) + double getSimilarity_(AASequence seq1, AASequence seq2) override; //danach wieder zu private machen (nur zum Testen) private: - NeedlemanWunsch object; + NeedlemanWunsch object= NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix::identityMatrix ,-5); /// Not implemented ConsensusIDAlgorithmPEPMatrix(const ConsensusIDAlgorithmPEPMatrix&); @@ -66,8 +67,6 @@ namespace OpenMS ConsensusIDAlgorithmPEPMatrix& operator=(const ConsensusIDAlgorithmPEPMatrix&); - /// Sequence similarity based on substitution matrix (ignores PTMs) - double getSimilarity_(AASequence seq1, AASequence seq2) override; }; diff --git a/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h b/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h index f01275d71ad..895c359c80f 100644 --- a/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h +++ b/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h @@ -1,68 +1,71 @@ #include +#include +#include -std::vector adaptedIdentity - { - // A R N D C Q E G H I L K M F P S T W Y V B Z X * - /* A */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* R */ 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* N */ 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* D */ 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* C */ 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* Q */ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* E */ 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* G */ 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* H */ 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* I */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* L */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* K */ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* M */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* P */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* S */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, -17, - /* T */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, -17, - /* W */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, -17, - /* Y */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, -17, - /* V */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, -17, - /* B */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, -17, - /* Z */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, -17, - /* X */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* * */ -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, 1 - }; -std::vector PAM30MS - { - // A R N D C Q E G H I L K M F P S T W Y V B Z X * - /* A */ 6, -7, -4, -3, -6, -4, -2, -2, -7, -5, -6, -7, -5, -8, -2, 0, -1,-13, -8, -2, -7, -6, 0,-17, - /* R */ -7, 8, -6,-10, -8, -2, -9, -9, -2, -5, -7, 0, -4, -9, -4, -3, -6, -2,-10, -8, 5, -1, 0,-17, - /* N */ -4, -6, 8, 2,-11, -3, -2, -3, 0, -5, -6, -1, -9, -9, -6, 0, -2, -8, -4, -8, -4, -2, 0,-17, - /* D */ -3,-10, 2, 8,-14, -2, 2, -3, -4, -7,-10, -4,-11,-15, -8, -4, -5,-15,-11, -8, -7, -3, 0,-17, - /* C */ -6, -8,-11,-14, 10,-14,-14, -9, -7, -6,-11,-14,-13,-13, -8, -3, -8,-15, -4, -6,-11,-14, 0,-17, - /* Q */ -4, -2, -3, -2,-14, 8, 1, -7, 1, -8, -7, -3, -4,-13, -3, -5, -5,-13,-12, -7, -3, 4, 0,-17, - /* E */ -2, -9, -2, 2,-14, 1, 8, -4, -5, -5, -7, -4, -7,-14, -5, -4, -6,-17, -8, -6, -7, -2, 0,-17, - /* G */ -2, -9, -3, -3, -9, -7, -4, 6, -9,-11,-11, -7, -8, -9, -6, -2, -6,-15,-14, -5, -8, -7, 0,-17, - /* H */ -7, -2, 0, -4, -7, 1, -5, -9, 9, -9, -8, -6,-10, -6, -4, -6, -7, -7, -3, -6, -4, -3, 0,-17, - /* I */ -5, -5, -5, -7, -6, -8, -5,-11, -9, 8, 5, -6, -1, -2, -8, -7, -2,-14, -6, 2, -6, -7, 0,-17, - /* L */ -6, -7, -6,-10,-11, -7, -7,-11, -8, 5, 5, -7, 0, -3, -8, -8, -5,-10, -7, 0, -7, -7, 0,-17, - /* K */ -7, 0, -1, -4,-14, -3, -4, -7, -6, -6, -7, 7, -2,-14, -6, -4, -3,-12, -9, -9, 5, 4, 0,-17, - /* M */ -5, -4, -9,-11,-13, -4, -7, -8,-10, -1, 0, -2, 11, -4, -8, -5, -4,-13,-11, -1, -3, -3, 0,-17, - /* F */ -8, -9, -9,-15,-13,-13,-14, -9, -6, -2, -3,-14, -4, 9,-10, -6, -9, -4, 2, -8,-12,-14, 0,-17, - /* P */ -2, -4, -6, -8, -8, -3, -5, -6, -4, -8, -8, -6, -8,-10, 8, -2, -4,-14,-13, -6, -5, -5, 0,-17, - /* S */ 0, -3, 0, -4, -3, -5, -4, -2, -6, -7, -8, -4, -5, -6, -2, 6, 0, -5, -7, -6, -4, -5, 0,-17, - /* T */ -1, -6, -2, -5, -8, -5, -6, -6, -7, -2, -5, -3, -4, -9, -4, 0, 7,-13, -6, -3, -5, -4, 0,-17, - /* W */ -13, -2, -8,-15,-15,-13,-17,-15, -7,-14,-10,-12,-13, -4,-14, -5,-13, 13, -5,-15, -7,-13, 0,-17, - /* Y */ -8,-10, -4,-11, -4,-12, -8,-14, -3, -6, -7, -9,-11, 2,-13, -7, -6, -5, 10, -7,-10,-11, 0,-17, - /* V */ -2, -8, -8, -8, -6, -7, -6, -5, -6, 2, 0, -9, -1, -8, -6, -6, -3,-15, -7, 7, -9, -8, 0,-17, - /* B */ -7, 5, -4, -7,-11, -3, -7, -8, -4, -6, -7, 5, -3,-12, -5, -4, -5, -7,-10, -9, 5, 1, 0,-17, - /* Z */ -6, -1, -2, -3,-14, 4, -2, -7, -3, -7, -7, 4, -3,-14, -5, -5, -4,-13,-11, -8, 1, 4, 0,-17, - /* X */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-17, - /* * */ -17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17, 1 - }; - -namespace OpenMS -{ +//namespace OpenMS +//{ class OPENMS_DLLAPI NeedlemanWunsch { - public: + std::vector adaptedIdentity + { + // A R N D C Q E G H I L K M F P S T W Y V B Z X * + /* A */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* R */ 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* N */ 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* D */ 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* C */ 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* Q */ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* E */ 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* G */ 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* H */ 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* I */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* L */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* K */ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* M */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* P */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* S */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, -17, + /* T */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, -17, + /* W */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, -17, + /* Y */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, -17, + /* V */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, -17, + /* B */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, -17, + /* Z */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, -17, + /* X */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* * */ -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, 1 + }; + + std::vector PAM30MS + { + // A R N D C Q E G H I L K M F P S T W Y V B Z X * + /* A */ 6, -7, -4, -3, -6, -4, -2, -2, -7, -5, -6, -7, -5, -8, -2, 0, -1,-13, -8, -2, -7, -6, 0,-17, + /* R */ -7, 8, -6,-10, -8, -2, -9, -9, -2, -5, -7, 0, -4, -9, -4, -3, -6, -2,-10, -8, 5, -1, 0,-17, + /* N */ -4, -6, 8, 2,-11, -3, -2, -3, 0, -5, -6, -1, -9, -9, -6, 0, -2, -8, -4, -8, -4, -2, 0,-17, + /* D */ -3,-10, 2, 8,-14, -2, 2, -3, -4, -7,-10, -4,-11,-15, -8, -4, -5,-15,-11, -8, -7, -3, 0,-17, + /* C */ -6, -8,-11,-14, 10,-14,-14, -9, -7, -6,-11,-14,-13,-13, -8, -3, -8,-15, -4, -6,-11,-14, 0,-17, + /* Q */ -4, -2, -3, -2,-14, 8, 1, -7, 1, -8, -7, -3, -4,-13, -3, -5, -5,-13,-12, -7, -3, 4, 0,-17, + /* E */ -2, -9, -2, 2,-14, 1, 8, -4, -5, -5, -7, -4, -7,-14, -5, -4, -6,-17, -8, -6, -7, -2, 0,-17, + /* G */ -2, -9, -3, -3, -9, -7, -4, 6, -9,-11,-11, -7, -8, -9, -6, -2, -6,-15,-14, -5, -8, -7, 0,-17, + /* H */ -7, -2, 0, -4, -7, 1, -5, -9, 9, -9, -8, -6,-10, -6, -4, -6, -7, -7, -3, -6, -4, -3, 0,-17, + /* I */ -5, -5, -5, -7, -6, -8, -5,-11, -9, 8, 5, -6, -1, -2, -8, -7, -2,-14, -6, 2, -6, -7, 0,-17, + /* L */ -6, -7, -6,-10,-11, -7, -7,-11, -8, 5, 5, -7, 0, -3, -8, -8, -5,-10, -7, 0, -7, -7, 0,-17, + /* K */ -7, 0, -1, -4,-14, -3, -4, -7, -6, -6, -7, 7, -2,-14, -6, -4, -3,-12, -9, -9, 5, 4, 0,-17, + /* M */ -5, -4, -9,-11,-13, -4, -7, -8,-10, -1, 0, -2, 11, -4, -8, -5, -4,-13,-11, -1, -3, -3, 0,-17, + /* F */ -8, -9, -9,-15,-13,-13,-14, -9, -6, -2, -3,-14, -4, 9,-10, -6, -9, -4, 2, -8,-12,-14, 0,-17, + /* P */ -2, -4, -6, -8, -8, -3, -5, -6, -4, -8, -8, -6, -8,-10, 8, -2, -4,-14,-13, -6, -5, -5, 0,-17, + /* S */ 0, -3, 0, -4, -3, -5, -4, -2, -6, -7, -8, -4, -5, -6, -2, 6, 0, -5, -7, -6, -4, -5, 0,-17, + /* T */ -1, -6, -2, -5, -8, -5, -6, -6, -7, -2, -5, -3, -4, -9, -4, 0, 7,-13, -6, -3, -5, -4, 0,-17, + /* W */ -13, -2, -8,-15,-15,-13,-17,-15, -7,-14,-10,-12,-13, -4,-14, -5,-13, 13, -5,-15, -7,-13, 0,-17, + /* Y */ -8,-10, -4,-11, -4,-12, -8,-14, -3, -6, -7, -9,-11, 2,-13, -7, -6, -5, 10, -7,-10,-11, 0,-17, + /* V */ -2, -8, -8, -8, -6, -7, -6, -5, -6, 2, 0, -9, -1, -8, -6, -6, -3,-15, -7, 7, -9, -8, 0,-17, + /* B */ -7, 5, -4, -7,-11, -3, -7, -8, -4, -6, -7, 5, -3,-12, -5, -4, -5, -7,-10, -9, 5, 1, 0,-17, + /* Z */ -6, -1, -2, -3,-14, 4, -2, -7, -3, -7, -7, 4, -3,-14, -5, -5, -4,-13,-11, -8, 1, 4, 0,-17, + /* X */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-17, + /* * */ -17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17, 1 + }; + + public: enum ScoringMatrix { PAM30MSMatrix, @@ -71,7 +74,7 @@ namespace OpenMS NeedlemanWunsch(ScoringMatrix matrix, int penalty); ~NeedlemanWunsch()=default; - int align_(const String& seq1, const String& seq2); + double align_(const std::string& seq1, const std::string& seq2); private: int getIndex_(const char& a, const char& b) const; @@ -80,4 +83,4 @@ namespace OpenMS int gapPenalty_ = 0; std::vector* matrixPtr_ = nullptr; }; -} \ No newline at end of file +//} \ No newline at end of file diff --git a/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp b/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp index e9ba0f7f498..0898840f020 100644 --- a/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp +++ b/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp @@ -33,7 +33,7 @@ // -------------------------------------------------------------------------- #include -#include +#include //wieder rausnehmen using namespace std; @@ -49,16 +49,18 @@ namespace OpenMS defaultsToParam_(); - string matrix = param_.getValue("matrix"); //muss ja enum sein... + string matrix = param_.getValue("matrix"); int penalty = param_.getValue("penalty"); if (matrix == "identity") { + cout<<"ich war hier identity"< unmod_seq2) swap(unmod_seq1, unmod_seq2); - int score_self1 = object.align_(unmod_seq1, unmod_seq1); - int score_sim = object.align_(unmod_seq1, unmod_seq2); - int score_self2 = object.align_(unmod_seq2, unmod_seq2); + double score_self1 = object.align_(unmod_seq1, unmod_seq1); + double score_sim = object.align_(unmod_seq1, unmod_seq2); + double score_self2 = object.align_(unmod_seq2, unmod_seq2); if (score_sim < 0) { @@ -92,7 +94,7 @@ namespace OpenMS } else { - score_sim /= min(score_self1, score_self2); // normalize + score_sim/=min(score_self1, score_self2); // normalize //was ist wenn man durch 0 teilt hier? } return score_sim; } diff --git a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp index ffd7aed23cb..ea3f872f896 100644 --- a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp +++ b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp @@ -1,28 +1,19 @@ #include +#include using namespace std; -namespace OpenMS -{ +//namespace OpenMS +//{ NeedlemanWunsch::NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix matrix, int penalty) { gapPenalty_ = penalty; - switch(matrix) - { - case identity: - matrixPtr_ = &adaptedIdentity; - break; - case PAM30MS: - matrixPtr_ = &PAM30MS; - break; - default: - String msg = "Matrix '" + matrix + "' is not known! Valid choices are: " - "'identity', 'PAM30MS'."; - throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, - msg); - } + if (matrix == identityMatrix) + matrixPtr_ = &adaptedIdentity; + else if (matrix == PAM30MSMatrix) + matrixPtr_ = &PAM30MS; } -int NeedlemanWunsch::getIndex_(const char& a, const char& b) const //noch optimieren +int NeedlemanWunsch::getIndex_(const char& a, const char& b) const //noch optimieren (Tina) { vector> vec = { @@ -51,12 +42,12 @@ y = 23; return x + y*vec.size(); } -int NeedlemanWunsch::align_(const String& seq1, const String& seq2) +double NeedlemanWunsch::align_(const std::string& seq1, const std::string& seq2) { seq1len_ = seq1.length(); seq2len_ = seq2.length(); - std::vector matrix((seq1len_+1)*(seq2len_+1), 0)//matrix mit 0en initialisieren + std::vector matrix((seq1len_+1)*(seq2len_+1), 0);//matrix mit 0en initialisieren for (unsigned i = 1; i <= seq1len_; ++i) //vertikale mit gapkkosten initialisieren matrix[i*(seq2len_+1)]=i*gapPenalty_; for (unsigned i =0; i<=seq2len_;++i)//horizontale mit gapkosten initialieren @@ -65,10 +56,11 @@ int NeedlemanWunsch::align_(const String& seq1, const String& seq2) { for (unsigned j=1;j<=seq2len_;++j) { - matrix[i*(seq2len_ +1)+j]=max(max((matrix[i*(seq2len_+1)+j-1]+gapPenalty_), (matrix[(i-1)*(seq2len_+1)+j]+gapPenalty)), - (matrix[(i-1)*(seq2len_+1)+j-1])+ *matrixPtr_[getIndex_(seq1[i-1], seq2[j-1])]) + matrix[i*(seq2len_ +1)+j]=max(max((matrix[i*(seq2len_+1)+j-1]+gapPenalty_), (matrix[(i-1)*(seq2len_+1)+j]+gapPenalty_)), + (matrix[(i-1)*(seq2len_+1)+j-1])+ (*matrixPtr_)[getIndex_(seq1[i-1], seq2[j-1])]); } } + cout< +#include //wieder rausnehmen using namespace OpenMS; using namespace std; @@ -70,6 +71,15 @@ START_SECTION(void apply(std::vector& ids)) } END_SECTION +START_SECTION(double getSimilarity_(AASequence seq1, + AASequence seq2)) +{ + ConsensusIDAlgorithmPEPMatrix object2= ConsensusIDAlgorithmPEPMatrix();//konstruktor + AASequence seq1=AASequence::fromString("ATLIGQLAIQQ"); + AASequence seq2=AASequence::fromString("ATLIGALDQQQ"); + cout< Date: Fri, 14 May 2021 23:48:54 +0200 Subject: [PATCH 36/53] seq-align working but not yet optimized --- .../ID/ConsensusIDAlgorithmPEPMatrix.h | 6 +- .../ANALYSIS/SEQUENCE/NeedlemanWunsch.h | 101 +++------- .../ID/ConsensusIDAlgorithmPEPMatrix.cpp | 42 +--- .../ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp | 188 +++++++++++++++++- .../ConsensusIDAlgorithmPEPMatrix_test.cpp | 2 +- 5 files changed, 215 insertions(+), 124 deletions(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.h b/src/openms/include/OpenMS/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.h index 03fc6a9db68..d06050f171b 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.h @@ -54,11 +54,13 @@ namespace OpenMS public: /// Default constructor ConsensusIDAlgorithmPEPMatrix(); + /// Sequence similarity based on substitution matrix (ignores PTMs) double getSimilarity_(AASequence seq1, AASequence seq2) override; //danach wieder zu private machen (nur zum Testen) + private: - NeedlemanWunsch object= NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix::identityMatrix ,-5); + NeedlemanWunsch object_ = NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix::identityMatrix , -5); //initialization //bisher wird hier die Matrix gewählt /// Not implemented ConsensusIDAlgorithmPEPMatrix(const ConsensusIDAlgorithmPEPMatrix&); @@ -66,8 +68,6 @@ namespace OpenMS /// Not implemented ConsensusIDAlgorithmPEPMatrix& operator=(const ConsensusIDAlgorithmPEPMatrix&); - - }; } // namespace OpenMS diff --git a/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h b/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h index 895c359c80f..a76681257cc 100644 --- a/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h +++ b/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h @@ -3,84 +3,37 @@ #include -//namespace OpenMS -//{ +namespace OpenMS +{ class OPENMS_DLLAPI NeedlemanWunsch { - std::vector adaptedIdentity - { - // A R N D C Q E G H I L K M F P S T W Y V B Z X * - /* A */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* R */ 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* N */ 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* D */ 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* C */ 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* Q */ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* E */ 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* G */ 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* H */ 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* I */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* L */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* K */ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* M */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* P */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* S */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, -17, - /* T */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, -17, - /* W */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, -17, - /* Y */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, -17, - /* V */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, -17, - /* B */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, -17, - /* Z */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, -17, - /* X */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* * */ -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, 1 - }; - - std::vector PAM30MS - { - // A R N D C Q E G H I L K M F P S T W Y V B Z X * - /* A */ 6, -7, -4, -3, -6, -4, -2, -2, -7, -5, -6, -7, -5, -8, -2, 0, -1,-13, -8, -2, -7, -6, 0,-17, - /* R */ -7, 8, -6,-10, -8, -2, -9, -9, -2, -5, -7, 0, -4, -9, -4, -3, -6, -2,-10, -8, 5, -1, 0,-17, - /* N */ -4, -6, 8, 2,-11, -3, -2, -3, 0, -5, -6, -1, -9, -9, -6, 0, -2, -8, -4, -8, -4, -2, 0,-17, - /* D */ -3,-10, 2, 8,-14, -2, 2, -3, -4, -7,-10, -4,-11,-15, -8, -4, -5,-15,-11, -8, -7, -3, 0,-17, - /* C */ -6, -8,-11,-14, 10,-14,-14, -9, -7, -6,-11,-14,-13,-13, -8, -3, -8,-15, -4, -6,-11,-14, 0,-17, - /* Q */ -4, -2, -3, -2,-14, 8, 1, -7, 1, -8, -7, -3, -4,-13, -3, -5, -5,-13,-12, -7, -3, 4, 0,-17, - /* E */ -2, -9, -2, 2,-14, 1, 8, -4, -5, -5, -7, -4, -7,-14, -5, -4, -6,-17, -8, -6, -7, -2, 0,-17, - /* G */ -2, -9, -3, -3, -9, -7, -4, 6, -9,-11,-11, -7, -8, -9, -6, -2, -6,-15,-14, -5, -8, -7, 0,-17, - /* H */ -7, -2, 0, -4, -7, 1, -5, -9, 9, -9, -8, -6,-10, -6, -4, -6, -7, -7, -3, -6, -4, -3, 0,-17, - /* I */ -5, -5, -5, -7, -6, -8, -5,-11, -9, 8, 5, -6, -1, -2, -8, -7, -2,-14, -6, 2, -6, -7, 0,-17, - /* L */ -6, -7, -6,-10,-11, -7, -7,-11, -8, 5, 5, -7, 0, -3, -8, -8, -5,-10, -7, 0, -7, -7, 0,-17, - /* K */ -7, 0, -1, -4,-14, -3, -4, -7, -6, -6, -7, 7, -2,-14, -6, -4, -3,-12, -9, -9, 5, 4, 0,-17, - /* M */ -5, -4, -9,-11,-13, -4, -7, -8,-10, -1, 0, -2, 11, -4, -8, -5, -4,-13,-11, -1, -3, -3, 0,-17, - /* F */ -8, -9, -9,-15,-13,-13,-14, -9, -6, -2, -3,-14, -4, 9,-10, -6, -9, -4, 2, -8,-12,-14, 0,-17, - /* P */ -2, -4, -6, -8, -8, -3, -5, -6, -4, -8, -8, -6, -8,-10, 8, -2, -4,-14,-13, -6, -5, -5, 0,-17, - /* S */ 0, -3, 0, -4, -3, -5, -4, -2, -6, -7, -8, -4, -5, -6, -2, 6, 0, -5, -7, -6, -4, -5, 0,-17, - /* T */ -1, -6, -2, -5, -8, -5, -6, -6, -7, -2, -5, -3, -4, -9, -4, 0, 7,-13, -6, -3, -5, -4, 0,-17, - /* W */ -13, -2, -8,-15,-15,-13,-17,-15, -7,-14,-10,-12,-13, -4,-14, -5,-13, 13, -5,-15, -7,-13, 0,-17, - /* Y */ -8,-10, -4,-11, -4,-12, -8,-14, -3, -6, -7, -9,-11, 2,-13, -7, -6, -5, 10, -7,-10,-11, 0,-17, - /* V */ -2, -8, -8, -8, -6, -7, -6, -5, -6, 2, 0, -9, -1, -8, -6, -6, -3,-15, -7, 7, -9, -8, 0,-17, - /* B */ -7, 5, -4, -7,-11, -3, -7, -8, -4, -6, -7, 5, -3,-12, -5, -4, -5, -7,-10, -9, 5, 1, 0,-17, - /* Z */ -6, -1, -2, -3,-14, 4, -2, -7, -3, -7, -7, 4, -3,-14, -5, -5, -4,-13,-11, -8, 1, 4, 0,-17, - /* X */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-17, - /* * */ -17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17, 1 - }; public: - enum ScoringMatrix - { - PAM30MSMatrix, - identityMatrix - }; + enum class ScoringMatrix + { + PAM30MSMatrix, + identityMatrix + }; + + NeedlemanWunsch(ScoringMatrix matrix, int penalty); + + ~NeedlemanWunsch()=default; + + double align_(const String& seq1, const String& seq2); + + void setMatrix_(const ScoringMatrix& matrix); + + void setPenalty_(const int& penalty); + + ScoringMatrix getMatrix_() const; - NeedlemanWunsch(ScoringMatrix matrix, int penalty); - ~NeedlemanWunsch()=default; - double align_(const std::string& seq1, const std::string& seq2); + int getPenalty_() const; - private: - int getIndex_(const char& a, const char& b) const; - unsigned seq1len_ = 0; - unsigned seq2len_ = 0; - int gapPenalty_ = 0; - std::vector* matrixPtr_ = nullptr; + private: + int getIndex_(const char& a, const char& b) const; + unsigned seq1len_ = 0; + unsigned seq2len_ = 0; + int gapPenalty_ = 0; + std::vector* matrixPtr_ = nullptr; }; -//} \ No newline at end of file +} \ No newline at end of file diff --git a/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp b/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp index 0898840f020..8dcad121a67 100644 --- a/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp +++ b/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp @@ -33,7 +33,6 @@ // -------------------------------------------------------------------------- #include -#include //wieder rausnehmen using namespace std; @@ -41,36 +40,6 @@ namespace OpenMS { ConsensusIDAlgorithmPEPMatrix::ConsensusIDAlgorithmPEPMatrix() { - setName("ConsensusIDAlgorithmPEPMatrix"); - defaults_.setValue("matrix", "identity", "Substitution matrix to use for alignment-based similarity scoring"); - defaults_.setValidStrings("matrix", {"identity", "PAM30MS"}); - defaults_.setValue("penalty", 5, "Alignment gap penalty (the same value is used for gap opening and extension)"); - defaults_.setMinInt("penalty", 1); - - defaultsToParam_(); - - string matrix = param_.getValue("matrix"); - int penalty = param_.getValue("penalty"); - - if (matrix == "identity") - { - cout<<"ich war hier identity"< unmod_seq2) swap(unmod_seq1, unmod_seq2); - + if (unmod_seq1 < unmod_seq2) swap(unmod_seq1, unmod_seq2); - double score_self1 = object.align_(unmod_seq1, unmod_seq1); - double score_sim = object.align_(unmod_seq1, unmod_seq2); - double score_self2 = object.align_(unmod_seq2, unmod_seq2); + double score_self1 = object_.align_(unmod_seq1, unmod_seq1); + double score_sim = object_.align_(unmod_seq1, unmod_seq2); + double score_self2 = object_.align_(unmod_seq2, unmod_seq2); if (score_sim < 0) { @@ -94,7 +62,7 @@ namespace OpenMS } else { - score_sim/=min(score_self1, score_self2); // normalize //was ist wenn man durch 0 teilt hier? + score_sim /= min(score_self1, score_self2); // normalize } return score_sim; } diff --git a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp index ea3f872f896..16088d8abfa 100644 --- a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp +++ b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp @@ -1,16 +1,152 @@ #include #include +#include //swap using namespace std; -//namespace OpenMS -//{ +namespace OpenMS +{ + +std::vector adaptedIdentity + { + // A R N D C Q E G H I L K M F P S T W Y V B Z X * + /* A */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* R */ 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* N */ 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* D */ 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* C */ 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* Q */ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* E */ 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* G */ 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* H */ 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* I */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* L */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* K */ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* M */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* P */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* S */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, -17, + /* T */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, -17, + /* W */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, -17, + /* Y */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, -17, + /* V */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, -17, + /* B */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, -17, + /* Z */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, -17, + /* X */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, + /* * */ -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, 1 + }; + +std::vector PAM30MS + { + // A R N D C Q E G H I L K M F P S T W Y V B Z X * + /* A */ 6, -7, -4, -3, -6, -4, -2, -2, -7, -5, -6, -7, -5, -8, -2, 0, -1,-13, -8, -2, -7, -6, 0,-17, + /* R */ -7, 8, -6,-10, -8, -2, -9, -9, -2, -5, -7, 0, -4, -9, -4, -3, -6, -2,-10, -8, 5, -1, 0,-17, + /* N */ -4, -6, 8, 2,-11, -3, -2, -3, 0, -5, -6, -1, -9, -9, -6, 0, -2, -8, -4, -8, -4, -2, 0,-17, + /* D */ -3,-10, 2, 8,-14, -2, 2, -3, -4, -7,-10, -4,-11,-15, -8, -4, -5,-15,-11, -8, -7, -3, 0,-17, + /* C */ -6, -8,-11,-14, 10,-14,-14, -9, -7, -6,-11,-14,-13,-13, -8, -3, -8,-15, -4, -6,-11,-14, 0,-17, + /* Q */ -4, -2, -3, -2,-14, 8, 1, -7, 1, -8, -7, -3, -4,-13, -3, -5, -5,-13,-12, -7, -3, 4, 0,-17, + /* E */ -2, -9, -2, 2,-14, 1, 8, -4, -5, -5, -7, -4, -7,-14, -5, -4, -6,-17, -8, -6, -7, -2, 0,-17, + /* G */ -2, -9, -3, -3, -9, -7, -4, 6, -9,-11,-11, -7, -8, -9, -6, -2, -6,-15,-14, -5, -8, -7, 0,-17, + /* H */ -7, -2, 0, -4, -7, 1, -5, -9, 9, -9, -8, -6,-10, -6, -4, -6, -7, -7, -3, -6, -4, -3, 0,-17, + /* I */ -5, -5, -5, -7, -6, -8, -5,-11, -9, 8, 5, -6, -1, -2, -8, -7, -2,-14, -6, 2, -6, -7, 0,-17, + /* L */ -6, -7, -6,-10,-11, -7, -7,-11, -8, 5, 5, -7, 0, -3, -8, -8, -5,-10, -7, 0, -7, -7, 0,-17, + /* K */ -7, 0, -1, -4,-14, -3, -4, -7, -6, -6, -7, 7, -2,-14, -6, -4, -3,-12, -9, -9, 5, 4, 0,-17, + /* M */ -5, -4, -9,-11,-13, -4, -7, -8,-10, -1, 0, -2, 11, -4, -8, -5, -4,-13,-11, -1, -3, -3, 0,-17, + /* F */ -8, -9, -9,-15,-13,-13,-14, -9, -6, -2, -3,-14, -4, 9,-10, -6, -9, -4, 2, -8,-12,-14, 0,-17, + /* P */ -2, -4, -6, -8, -8, -3, -5, -6, -4, -8, -8, -6, -8,-10, 8, -2, -4,-14,-13, -6, -5, -5, 0,-17, + /* S */ 0, -3, 0, -4, -3, -5, -4, -2, -6, -7, -8, -4, -5, -6, -2, 6, 0, -5, -7, -6, -4, -5, 0,-17, + /* T */ -1, -6, -2, -5, -8, -5, -6, -6, -7, -2, -5, -3, -4, -9, -4, 0, 7,-13, -6, -3, -5, -4, 0,-17, + /* W */ -13, -2, -8,-15,-15,-13,-17,-15, -7,-14,-10,-12,-13, -4,-14, -5,-13, 13, -5,-15, -7,-13, 0,-17, + /* Y */ -8,-10, -4,-11, -4,-12, -8,-14, -3, -6, -7, -9,-11, 2,-13, -7, -6, -5, 10, -7,-10,-11, 0,-17, + /* V */ -2, -8, -8, -8, -6, -7, -6, -5, -6, 2, 0, -9, -1, -8, -6, -6, -3,-15, -7, 7, -9, -8, 0,-17, + /* B */ -7, 5, -4, -7,-11, -3, -7, -8, -4, -6, -7, 5, -3,-12, -5, -4, -5, -7,-10, -9, 5, 1, 0,-17, + /* Z */ -6, -1, -2, -3,-14, 4, -2, -7, -3, -7, -7, 4, -3,-14, -5, -5, -4,-13,-11, -8, 1, 4, 0,-17, + /* X */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-17, + /* * */ -17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17, 1 + }; + NeedlemanWunsch::NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix matrix, int penalty) { + /* was muss für Exeption inkludiert werden? + if (penalty >= 0) + { + String msg = "Gap penalty should be negative"; + throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, + msg); + } + */ gapPenalty_ = penalty; - if (matrix == identityMatrix) + + if (matrix == ScoringMatrix::identityMatrix) + { matrixPtr_ = &adaptedIdentity; - else if (matrix == PAM30MSMatrix) - matrixPtr_ = &PAM30MS; + } + + else if (matrix == ScoringMatrix::PAM30MSMatrix) + { + matrixPtr_ = &PAM30MS; + } + /* + else + { + String msg = "Matrix is not known! Valid choices are: " + "'identity', 'PAM30MS'."; + throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, + msg); + } + */ +} + +void NeedlemanWunsch::setMatrix_(const NeedlemanWunsch::ScoringMatrix& matrix) +{ + if (matrix == ScoringMatrix::identityMatrix) + { + matrixPtr_ = &adaptedIdentity; + } + + else if (matrix == ScoringMatrix::PAM30MSMatrix) + { + matrixPtr_ = &PAM30MS; + } +/* + else + { + String msg = "Matrix is not known! Valid choices are: " + "'identity', 'PAM30MS'."; + throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, + msg); + } + */ +} + +void NeedlemanWunsch::setPenalty_(const int& penalty) +{ + /* + if (penalty >= 0) + { + String msg = "Gap penalty should be negative"; + throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, + msg); + } + */ + + gapPenalty_ = penalty; +} + +NeedlemanWunsch::ScoringMatrix NeedlemanWunsch::getMatrix_() const +{ + if (*matrixPtr_ == adaptedIdentity) + { + return ScoringMatrix::identityMatrix; + } + else + { + return ScoringMatrix::PAM30MSMatrix; + } +} + +int NeedlemanWunsch::getPenalty_() const +{ + return gapPenalty_; } int NeedlemanWunsch::getIndex_(const char& a, const char& b) const //noch optimieren (Tina) @@ -41,13 +177,13 @@ if (y == -1) y = 23; return x + y*vec.size(); } - -double NeedlemanWunsch::align_(const std::string& seq1, const std::string& seq2) +/* +double NeedlemanWunsch::align_(const String& seq1, const String& seq2) //vollständige matrix { seq1len_ = seq1.length(); seq2len_ = seq2.length(); - std::vector matrix((seq1len_+1)*(seq2len_+1), 0);//matrix mit 0en initialisieren + vector matrix((seq1len_+1)*(seq2len_+1), 0);//matrix mit 0en initialisieren for (unsigned i = 1; i <= seq1len_; ++i) //vertikale mit gapkkosten initialisieren matrix[i*(seq2len_+1)]=i*gapPenalty_; for (unsigned i =0; i<=seq2len_;++i)//horizontale mit gapkosten initialieren @@ -63,4 +199,38 @@ double NeedlemanWunsch::align_(const std::string& seq1, const std::string& seq2) cout< firstRow{}; + vector secondRow(seq2len_+1,0); + vector* firstRowPtr = &firstRow; + vector* secondRowPtr = &secondRow; + + + for (unsigned i = 0; i <= seq2len_; ++i)//horizontale mit gapkosten initialieren + { + firstRow.push_back(i * gapPenalty_); + } + + for (unsigned i = 1;i <= seq1len_; ++i) //second row berechnen und swappen und clearen + { + (*secondRowPtr)[0] = i * gapPenalty_; //erster wert in der zeile mit gapkosten + for (unsigned j = 1; j <= seq2len_; ++j) //secondRow berechnen + { + (*secondRowPtr)[j] = (max(max(((*secondRowPtr)[j-1] + gapPenalty_), ((*firstRowPtr)[j] + gapPenalty_)), + ((*firstRowPtr)[j-1]) + (*matrixPtr_)[getIndex_(seq1[i-1], seq2[j-1])]));//statt getIndex: [seq1[i-1] - 'A'] [seq2[j-1] - 'A'] und matrix entsprechend aufbauen + cout<<(*matrixPtr_)[getIndex_(seq1[i-1], seq2[j-1])]< Date: Mon, 17 May 2021 12:05:21 +0200 Subject: [PATCH 37/53] NeedlemanWunsch class test --- .../openms/source/NeedlemanWunsch_test.cpp | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 src/tests/class_tests/openms/source/NeedlemanWunsch_test.cpp diff --git a/src/tests/class_tests/openms/source/NeedlemanWunsch_test.cpp b/src/tests/class_tests/openms/source/NeedlemanWunsch_test.cpp new file mode 100644 index 00000000000..8eada46e7f2 --- /dev/null +++ b/src/tests/class_tests/openms/source/NeedlemanWunsch_test.cpp @@ -0,0 +1,68 @@ +#include +#include +#include +#include + +using namespace OpenMS; +using namespace std; + +/////////////////////////// + +START_TEST(NeedlemanWunsch, "$Id$") + +///////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////// + +NeedlemanWunsch* ptr = nullptr; +START_SECTION(NeedlemanWunsch(ScoringMatrix matrix, int penalty)()) +{ + ptr = new NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix::PAM30MSMatrix, -5); + TEST_EQUAL(ptr == nullptr, false) +} +END_SECTION + +START_SECTION(~NeedlemanWunsch()) +{ + delete (ptr); +} +END_SECTION + +String seq1 = "IGGATLIGQLAIQQAHVHL"; +String seq2 = "IGGATLIGALDQVVAQQAHVHL"; + +START_SECTION(double align_(const String& seq1, const String& seq2)) +{ + NeedlemanWunsch object = NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix::identityMatrix, -5); + TEST_EQUAL(object.align_(seq1, seq2), 1); + TEST_EQUAL(object.align_(seq1, seq1), 19); + TEST_EQUAL(object.align_(seq2, seq2), 22); +} +END_SECTION + +START_SECTION(void setMatrix_(const ScoringMatrix& matrix)) +{ + NeedlemanWunsch object = NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix::identityMatrix, -5); + object.setMatrix_(NeedlemanWunsch::ScoringMatrix::PAM30MSMatrix); + TEST_EQUAL(object.align_(seq1, seq2), 93); + TEST_EQUAL(object.align_(seq1, seq1), 131); + TEST_EQUAL(object.align_(seq2, seq2), 151); + //TEST_EQUAL(object.getMatrix_(), NeedlemanWunsch::ScoringMatrix::PAM30MSMatrix); kein == operator definiert für ScoringMatrix? +} +END_SECTION + +START_SECTION(void setPenalty_(const ScoringMatrix& matrix)) +{ + NeedlemanWunsch object = NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix::PAM30MSMatrix, -5); + //object.setPenalty_(5); //exeption abfangen + object.setPenalty_(-1); + TEST_EQUAL(object.align_(seq1, seq2), 113); + TEST_EQUAL(object.getPenalty_(), -1); +} +END_SECTION + + +///////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////// + +END_TEST + From 0547ce9fa82a880af1fed8021e68682a4f4507e2 Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Mon, 17 May 2021 14:19:07 +0200 Subject: [PATCH 38/53] Exceptions --- .../ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp | 23 ++++++++----------- .../openms/source/NeedlemanWunsch_test.cpp | 2 +- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp index 16088d8abfa..7d211be810b 100644 --- a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp +++ b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp @@ -1,5 +1,6 @@ #include #include +#include #include //swap using namespace std; @@ -66,14 +67,13 @@ std::vector PAM30MS NeedlemanWunsch::NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix matrix, int penalty) { - /* was muss für Exeption inkludiert werden? if (penalty >= 0) { String msg = "Gap penalty should be negative"; throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, msg); } - */ + gapPenalty_ = penalty; if (matrix == ScoringMatrix::identityMatrix) @@ -85,15 +85,13 @@ std::vector PAM30MS { matrixPtr_ = &PAM30MS; } - /* else { String msg = "Matrix is not known! Valid choices are: " - "'identity', 'PAM30MS'."; + "'identityMatrix', 'PAM30MSMatrix'."; throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, msg); } - */ } void NeedlemanWunsch::setMatrix_(const NeedlemanWunsch::ScoringMatrix& matrix) @@ -107,28 +105,25 @@ void NeedlemanWunsch::setMatrix_(const NeedlemanWunsch::ScoringMatrix& matrix) { matrixPtr_ = &PAM30MS; } -/* + else { String msg = "Matrix is not known! Valid choices are: " - "'identity', 'PAM30MS'."; + "'identityMatrix', 'PAM30MSMatrix'."; throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, msg); } - */ } void NeedlemanWunsch::setPenalty_(const int& penalty) { - /* + if (penalty >= 0) { String msg = "Gap penalty should be negative"; throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, msg); } - */ - gapPenalty_ = penalty; } @@ -218,18 +213,18 @@ double NeedlemanWunsch::align_(const String& seq1, const String& seq2) //vollst firstRow.push_back(i * gapPenalty_); } - for (unsigned i = 1;i <= seq1len_; ++i) //second row berechnen und swappen und clearen + for (unsigned i = 1;i <= seq1len_; ++i) //second row berechnen und swappen { (*secondRowPtr)[0] = i * gapPenalty_; //erster wert in der zeile mit gapkosten for (unsigned j = 1; j <= seq2len_; ++j) //secondRow berechnen { (*secondRowPtr)[j] = (max(max(((*secondRowPtr)[j-1] + gapPenalty_), ((*firstRowPtr)[j] + gapPenalty_)), ((*firstRowPtr)[j-1]) + (*matrixPtr_)[getIndex_(seq1[i-1], seq2[j-1])]));//statt getIndex: [seq1[i-1] - 'A'] [seq2[j-1] - 'A'] und matrix entsprechend aufbauen - cout<<(*matrixPtr_)[getIndex_(seq1[i-1], seq2[j-1])]< Date: Mon, 17 May 2021 23:05:58 +0200 Subject: [PATCH 39/53] neue Matrizen --- .../ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp | 61 ++++++++++++++++++- 1 file changed, 59 insertions(+), 2 deletions(-) diff --git a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp index 7d211be810b..3b804bba1df 100644 --- a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp +++ b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp @@ -6,8 +6,36 @@ using namespace std; namespace OpenMS { + std::vector adaptedIdentityNew // Name noch zu ändern + { -std::vector adaptedIdentity + // A B C D E F G H I K L M N P Q R S T V W X Y Z + /* A */ {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + /* B */ {0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + /* C */ {0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + /* D */ {0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + /* E */ {0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + /* F */ {0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + /* G */ {0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + /* H */ {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + /* I */ {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + /* K */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0}, + /* L */ {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + /* M */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + /* N */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + /* P */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + /* Q */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0}, + /* R */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0}, + /* S */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}, + /* T */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0}, + /* V */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0}, + /* W */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0}, + /* X */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + /* Y */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0}, + /* Z */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}, + }; + +std::vector> adaptedIdentity { // A R N D C Q E G H I L K M F P S T W Y V B Z X * /* A */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, @@ -35,6 +63,35 @@ std::vector adaptedIdentity /* X */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, /* * */ -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, 1 }; + std::vector> PAM30MSNEW //Name noch zu ändern + { + + // A B C D E F G H I K L M N P Q R S T V W X Y Z + /* A */ {6, -7, -6, -3, -2, -8, -2, -7, -5, -7, -6, -5, -4, -2, -4, -7, 0, -1, -2, -13, 0, -8, -6}, + /* B */ {-7, 5, -11, -7, -7, -12, -8, -4, -6, 5, -7, -3, -4, -5, -3, 5, -4, -5, -9, -7, 0, -10, 1}, + /* C */ {-6, -11, 10, -14, -14, -13, -9, -7, -6, -14, -11, -13, -11, -8, -14, -8, -3, -8, -6, -15, 0, -4, -14}, + /* D */ {-3, -7, -14, 8, 2, -15, -3, -4, -7, -4, -10, -11, 2, -8, -2, -10, -4, -5, -8, -15, 0, -11, -3}, + /* E */ {-2, -7, -14, 2, 8, -14, -4, -5, -5, -4, -7, -7, -2, -5, 1, -9, -4, -6, -6, -17, 0, -8, -2}, + /* F */ {-8, -12, -13, -15, -14, 9, -9, -6, -2, -14, -3, -4, -9, -10, -13, -9, -6, -9, -8, -4, 0, 2, -14}, + /* G */ {-2, -8, -9, -3, -4, -9, 6, -9, -11, -7, -11, -8, -3, -6, -7, -9, -2, -6, -5, -15, 0, -14, -7}, + /* H */ {-7, -4, -7, -4, -5, -6, -9, 9, -9, -6, -8, -10, 0, -4, 1, -2, -6, -7, -6, -7, 0, -3, -3}, + /* I */ {-5, -6, -6, -7, -5, -2, -11, -9, 8, -6, 5, -1, -5, -8, -8, -5, -7, -2, 2, -14, 0, -6, -7}, + /* K */ {-7, 5, -14, -4, -4, -14, -7, -6, -6, 7, -7, -2, -1, -6, -3, 0, -4, -3, -9, -12, 0, -9, 4}, + /* L */ {-6, -7, -11, -10, -7, -3, -11, -8, 5, -7, 5, 0, -6, -8, -7, -7, -8, -5, 0, -10, 0, -7, -7}, + /* M */ {-5, -3, -13, -11, -7, -4, -8, -10, -1, -2, 0, 11, -9, -8, -4, -4, -5, -4, -1, -13, 0, -11, -3}, + /* N */ {-4, -4, -11, 2, -2, -9, -3, 0, -5, -1, -6, -9, 8, -6, -3, -6, 0, -2, -8, -8, 0, -4, -2}, + /* P */ {-2, -5, -8, -8, -5, -10, -6, -4, -8, -6, -8, -8, -6, 8, -3, -4, -2, -4, -6, -14, 0, -13, -5}, + /* Q */ {-4, -3, -14, -2, 1, -13, -7, 1, -8, -3, -7, -4, -3, -3, 8, -2, -5, -5, -7, -13, 0, -12, 4}, + /* R */ {-7, 5, -8, -10, -9, -9, -9, -2, -5, 0, -7, -4, -6, -4, -2, 8, -3, -6, -8, -2, 0, -10, -1}, + /* S */ {0, -4, -3, -4, -4, -6, -2, -6, -7, -4, -8, -5, 0, -2, -5, -3, 6, 0, -6, -5, 0, -7, -5}, + /* T */ {-1, -5, -8, -5, -6, -9, -6, -7, -2, -3, -5, -4, -2, -4, -5, -6, 0, 7, -3, -13, 0, -6, -4}, + /* V */ {-2, -9, -6, -8, -6, -8, -5, -6, 2, -9, 0, -1, -8, -6, -7, -8, -6, -3, 7, -15, 0, -7, -8}, + /* W */ {-13, -7, -15, -15, -17, -4, -15, -7, -14, -12, -10, -13, -8, -14, -13, -2, -5, -13, -15, 13, 0, -5, -13}, + /* X */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, + /* Y */ {-8, -10, -4, -11, -8, 2, -14, -3, -6, -9, -7, -11, -4, -13, -12, -10, -7, -6, -7, -5, 0, 10, -11}, + /* Z */ {-6, 1, -14, -3, -2, -14, -7, -3, -7, 4, -7, -3, -2, -5, 4, -1, -5, -4, -8, -13, 0, -11, 4} + + }; std::vector PAM30MS { @@ -159,7 +216,7 @@ vector> vec = }; int x = -1; int y = -1; -for (int i = 0; i < vec.size(); ++i) +for (Size i = 0; i < vec.size(); ++i) { if (vec[i].first == a) x = vec[i].second; From c4d402a9d9c6721e8bef6150d3d6253108efb9ce Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Tue, 18 May 2021 08:18:22 +0200 Subject: [PATCH 40/53] udpate param_ will follow --- .../ID/ConsensusIDAlgorithmPEPMatrix.h | 7 ++- .../ANALYSIS/SEQUENCE/NeedlemanWunsch.h | 14 ++--- .../ID/ConsensusIDAlgorithmPEPMatrix.cpp | 53 ++++++++++++++++-- .../ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp | 54 +++++++------------ 4 files changed, 80 insertions(+), 48 deletions(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.h b/src/openms/include/OpenMS/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.h index d06050f171b..fd59d4aa5c1 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.h @@ -60,7 +60,7 @@ namespace OpenMS private: - NeedlemanWunsch object_ = NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix::identityMatrix , -5); //initialization //bisher wird hier die Matrix gewählt + NeedlemanWunsch object_ = NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix::PAM30MS, -5); //initialization //bisher wird hier die Matrix gewählt /// Not implemented ConsensusIDAlgorithmPEPMatrix(const ConsensusIDAlgorithmPEPMatrix&); @@ -68,6 +68,11 @@ namespace OpenMS /// Not implemented ConsensusIDAlgorithmPEPMatrix& operator=(const ConsensusIDAlgorithmPEPMatrix&); + /* + void ConsensusIDAlgorithmPEPMatrix::updateParams_(const NeedlemanWunsch::ScoringMatrix& matrix); + + void ConsensusIDAlgorithmPEPMatrix::updateParams_(const int& penalty); +*/ }; } // namespace OpenMS diff --git a/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h b/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h index a76681257cc..00271db0d83 100644 --- a/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h +++ b/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h @@ -11,23 +11,23 @@ namespace OpenMS public: enum class ScoringMatrix { - PAM30MSMatrix, - identityMatrix + PAM30MS, + identity }; NeedlemanWunsch(ScoringMatrix matrix, int penalty); ~NeedlemanWunsch()=default; - double align_(const String& seq1, const String& seq2); + int align(const String& seq1, const String& seq2); - void setMatrix_(const ScoringMatrix& matrix); + void setMatrix(const ScoringMatrix& matrix); - void setPenalty_(const int& penalty); + void setPenalty(const int& penalty); - ScoringMatrix getMatrix_() const; + ScoringMatrix getMatrix() const; - int getPenalty_() const; + int getPenalty() const; private: int getIndex_(const char& a, const char& b) const; diff --git a/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp b/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp index 8dcad121a67..65f95a34cb6 100644 --- a/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp +++ b/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp @@ -40,8 +40,47 @@ namespace OpenMS { ConsensusIDAlgorithmPEPMatrix::ConsensusIDAlgorithmPEPMatrix() { + setName("ConsensusIDAlgorithmPEPMatrix"); // DefaultParamHandler + + defaults_.setValue("matrix", "PAM30MS", "Substitution matrix to use for alognment-based similarity scoring"); + defaults_.setValidStrings("matrix", {"identity","PAM30MS"}); + defaults_.setValue("penalty", -5, "Alignment gap penalty (the same value is used for gap opening and extension)"); + defaults_.setMinInt("penalty", -1); + + defaultsToParam_(); + } +/* + void ConsensusIDAlgorithmPEPMatrix::updateParams_(const NeedlemanWunsch::ScoringMatrix& matrix) + { + if (matrix == NeedlemanWunsch::ScoringMatrix::identity) + { + defaults_.setValue("matrix", "identity", "Substitution matrix to use for alignment-based similarity scoring"); + } + else if (matrix == NeedlemanWunsch::ScoringMatrix::PAM30MS) + { + defaults_.setValue("matrix", "PAM30MS", "Substitution matrix to use for alignment-based similarity scoring"); + } + + //params_ up to date machen und die funktion nach set matrix ud penalty aufrufen + + defaultsToParam_(); + + } + + void ConsensusIDAlgorithmPEPMatrix::updateParams_(const int& penalty) + { + + defaults_.setValue("penalty", penalty, "Alignment gap penalty (the same value is used for gap opening and extension)"); + + //params_ up to date machen und die funktion nach set matrix ud penalty aufrufen + + defaultsToParam_(); + + } + +*/ double ConsensusIDAlgorithmPEPMatrix::getSimilarity_(AASequence seq1, AASequence seq2) @@ -51,10 +90,16 @@ namespace OpenMS String unmod_seq2 = seq2.toUnmodifiedString(); if (unmod_seq1 == unmod_seq2) return 1.0; if (unmod_seq1 < unmod_seq2) swap(unmod_seq1, unmod_seq2); - - double score_self1 = object_.align_(unmod_seq1, unmod_seq1); - double score_sim = object_.align_(unmod_seq1, unmod_seq2); - double score_self2 = object_.align_(unmod_seq2, unmod_seq2); + /* testen ob es schneller mit oder ohne ist + AASequence s1 = AASequence::fromString(unmod_seq1); + AASequence s2 = AASequence::fromString(unmod_seq2); + pair seq_pair = make_pair(s1, s2); + SimilarityCache::iterator pos = similarities_.find(seq_pair); + if (pos != similarities_.end()) return pos->second; // score found in cache + */ + double score_self1 = object_.align(unmod_seq1, unmod_seq1); + double score_sim = object_.align(unmod_seq1, unmod_seq2); + double score_self2 = object_.align(unmod_seq2, unmod_seq2); if (score_sim < 0) { diff --git a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp index 7d211be810b..fe87817b7f8 100644 --- a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp +++ b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp @@ -2,6 +2,7 @@ #include #include #include //swap +#include using namespace std; namespace OpenMS @@ -65,43 +66,21 @@ std::vector PAM30MS /* * */ -17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17, 1 }; - NeedlemanWunsch::NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix matrix, int penalty) +NeedlemanWunsch::NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix matrix, int penalty) { - if (penalty >= 0) - { - String msg = "Gap penalty should be negative"; - throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, - msg); - } - - gapPenalty_ = penalty; + setMatrix(matrix); + setPenalty(penalty); - if (matrix == ScoringMatrix::identityMatrix) - { - matrixPtr_ = &adaptedIdentity; - } - - else if (matrix == ScoringMatrix::PAM30MSMatrix) - { - matrixPtr_ = &PAM30MS; - } - else - { - String msg = "Matrix is not known! Valid choices are: " - "'identityMatrix', 'PAM30MSMatrix'."; - throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, - msg); - } } -void NeedlemanWunsch::setMatrix_(const NeedlemanWunsch::ScoringMatrix& matrix) +void NeedlemanWunsch::setMatrix(const NeedlemanWunsch::ScoringMatrix& matrix) { - if (matrix == ScoringMatrix::identityMatrix) + if (matrix == ScoringMatrix::identity) { matrixPtr_ = &adaptedIdentity; } - else if (matrix == ScoringMatrix::PAM30MSMatrix) + else if (matrix == ScoringMatrix::PAM30MS) { matrixPtr_ = &PAM30MS; } @@ -109,13 +88,15 @@ void NeedlemanWunsch::setMatrix_(const NeedlemanWunsch::ScoringMatrix& matrix) else { String msg = "Matrix is not known! Valid choices are: " - "'identityMatrix', 'PAM30MSMatrix'."; + "'identity', 'PAM30MS'."; throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, msg); } + // ConsensusIDAlgorithmPEPMatrix::updateParams_(matrix) dafür objekt erstellen ... + } -void NeedlemanWunsch::setPenalty_(const int& penalty) +void NeedlemanWunsch::setPenalty(const int& penalty) { if (penalty >= 0) @@ -125,21 +106,22 @@ void NeedlemanWunsch::setPenalty_(const int& penalty) msg); } gapPenalty_ = penalty; + // ConsensusIDAlgorithmPEPMatrix::updateParams_(matrix) dafür objekt erstellen ... } -NeedlemanWunsch::ScoringMatrix NeedlemanWunsch::getMatrix_() const +NeedlemanWunsch::ScoringMatrix NeedlemanWunsch::getMatrix() const { if (*matrixPtr_ == adaptedIdentity) { - return ScoringMatrix::identityMatrix; + return ScoringMatrix::identity; } else { - return ScoringMatrix::PAM30MSMatrix; + return ScoringMatrix::PAM30MS; } } -int NeedlemanWunsch::getPenalty_() const +int NeedlemanWunsch::getPenalty() const { return gapPenalty_; } @@ -173,7 +155,7 @@ y = 23; return x + y*vec.size(); } /* -double NeedlemanWunsch::align_(const String& seq1, const String& seq2) //vollständige matrix +double NeedlemanWunsch::align(const String& seq1, const String& seq2) //vollständige matrix { seq1len_ = seq1.length(); seq2len_ = seq2.length(); @@ -197,7 +179,7 @@ double NeedlemanWunsch::align_(const String& seq1, const String& seq2) //vollst */ //linear space (2 Zeilen) - double NeedlemanWunsch::align_(const String& seq1, const String& seq2) + int NeedlemanWunsch::align(const String& seq1, const String& seq2) { seq1len_ = seq1.length(); seq2len_ = seq2.length(); From f433bc9300e15a54dc3c4f9d8b7c5ed49bde4fb3 Mon Sep 17 00:00:00 2001 From: kasrat93 Date: Tue, 18 May 2021 15:06:22 +0200 Subject: [PATCH 41/53] Vector --- src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp index 3b804bba1df..c73dedcb4b9 100644 --- a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp +++ b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp @@ -32,10 +32,10 @@ namespace OpenMS /* W */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0}, /* X */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, /* Y */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0}, - /* Z */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}, + /* Z */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1} }; -std::vector> adaptedIdentity +std::vector adaptedIdentity { // A R N D C Q E G H I L K M F P S T W Y V B Z X * /* A */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, From eb9bb99ee96bf6e31460eca6a52408a1959cd578 Mon Sep 17 00:00:00 2001 From: kasrat93 Date: Tue, 18 May 2021 15:37:32 +0200 Subject: [PATCH 42/53] include Test & edit Vector --- src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp | 2 +- src/tests/class_tests/openms/executables.cmake | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp index c73dedcb4b9..ffa3fc9f628 100644 --- a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp +++ b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp @@ -6,7 +6,7 @@ using namespace std; namespace OpenMS { - std::vector adaptedIdentityNew // Name noch zu ändern + std::vector> adaptedIdentityNew // Name noch zu ändern { // A B C D E F G H I K L M N P Q R S T V W X Y Z diff --git a/src/tests/class_tests/openms/executables.cmake b/src/tests/class_tests/openms/executables.cmake index 917d10594cb..ad17d06a6f9 100644 --- a/src/tests/class_tests/openms/executables.cmake +++ b/src/tests/class_tests/openms/executables.cmake @@ -528,6 +528,7 @@ set(analysis_executables_list MetaboliteFeatureDeconvolution_test MetaboliteSpectralMatching_test ModifiedPeptideGenerator_test + NeedlemanWunsch_test OfflinePrecursorIonSelection_test PeptideIndexing_test PeptideAndProteinQuant_test From b8c37163a94f1d06f3da9040101769b71f029d20 Mon Sep 17 00:00:00 2001 From: kasrat93 Date: Tue, 18 May 2021 19:16:48 +0200 Subject: [PATCH 43/53] getIndexNEW_ for Matrix[i][j] --- .../OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h | 1 + .../source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp | 12 ++++++++++++ 2 files changed, 13 insertions(+) diff --git a/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h b/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h index a76681257cc..f9a262cea4f 100644 --- a/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h +++ b/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h @@ -30,6 +30,7 @@ namespace OpenMS int getPenalty_() const; private: + int getIndexNEW_(const char& a) const; // falls wir mit Matrix[i][j] Elementen arbeiten int getIndex_(const char& a, const char& b) const; unsigned seq1len_ = 0; unsigned seq2len_ = 0; diff --git a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp index ffa3fc9f628..9852e33cb0e 100644 --- a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp +++ b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp @@ -200,9 +200,21 @@ int NeedlemanWunsch::getPenalty_() const { return gapPenalty_; } +int NeedlemanWunsch::getIndexNEW_(const char&a) const //Falls wir die Matrizen als vector> haben, rufen wir getIndexNEW_ 2x auf (für Matrix[i][j]) +{ + // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 + vector alphabet = { 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y', 'Z' }; + int index = 0; + for(Size i=0; i> vec = { {'A', 0}, {'R', 1}, {'N', 2}, From edb665ee595cb889e45687110e2e583fb5848413 Mon Sep 17 00:00:00 2001 From: kasrat93 Date: Tue, 18 May 2021 19:21:27 +0200 Subject: [PATCH 44/53] add break --- src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp index 9852e33cb0e..82e5ae4f05b 100644 --- a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp +++ b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp @@ -208,7 +208,10 @@ int NeedlemanWunsch::getIndexNEW_(const char&a) const //Falls wir die Matrizen a for(Size i=0; i Date: Wed, 19 May 2021 15:35:24 +0200 Subject: [PATCH 45/53] positive penalty and updatemembers_() --- .../ID/ConsensusIDAlgorithmPEPMatrix.h | 9 +-- .../ANALYSIS/SEQUENCE/NeedlemanWunsch.h | 3 + .../ID/ConsensusIDAlgorithmPEPMatrix.cpp | 56 ++++++++++--------- .../ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp | 31 +++------- .../openms/source/NeedlemanWunsch_test.cpp | 38 ++++++------- 5 files changed, 63 insertions(+), 74 deletions(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.h b/src/openms/include/OpenMS/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.h index fd59d4aa5c1..fedba8ec768 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.h @@ -60,7 +60,7 @@ namespace OpenMS private: - NeedlemanWunsch object_ = NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix::PAM30MS, -5); //initialization //bisher wird hier die Matrix gewählt + NeedlemanWunsch object_ = NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix::PAM30MS, 55); //initialization //bisher wird hier die Matrix gewählt /// Not implemented ConsensusIDAlgorithmPEPMatrix(const ConsensusIDAlgorithmPEPMatrix&); @@ -68,11 +68,8 @@ namespace OpenMS /// Not implemented ConsensusIDAlgorithmPEPMatrix& operator=(const ConsensusIDAlgorithmPEPMatrix&); - /* - void ConsensusIDAlgorithmPEPMatrix::updateParams_(const NeedlemanWunsch::ScoringMatrix& matrix); - - void ConsensusIDAlgorithmPEPMatrix::updateParams_(const int& penalty); -*/ + // Docu in base class + void updateMembers_() override; }; } // namespace OpenMS diff --git a/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h b/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h index 00271db0d83..3ec0a5c6460 100644 --- a/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h +++ b/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h @@ -1,6 +1,8 @@ #include #include #include +#include + namespace OpenMS @@ -29,6 +31,7 @@ namespace OpenMS int getPenalty() const; + private: int getIndex_(const char& a, const char& b) const; unsigned seq1len_ = 0; diff --git a/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp b/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp index 65f95a34cb6..c7d8ade57d4 100644 --- a/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp +++ b/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp @@ -42,46 +42,52 @@ namespace OpenMS { setName("ConsensusIDAlgorithmPEPMatrix"); // DefaultParamHandler - defaults_.setValue("matrix", "PAM30MS", "Substitution matrix to use for alognment-based similarity scoring"); + defaults_.setValue("matrix", "PAM30MS", "Substitution matrix to use for alignment-based similarity scoring"); defaults_.setValidStrings("matrix", {"identity","PAM30MS"}); - defaults_.setValue("penalty", -5, "Alignment gap penalty (the same value is used for gap opening and extension)"); + defaults_.setValue("penalty", 5, "Alignment gap penalty (the same value is used for gap opening and extension)"); defaults_.setMinInt("penalty", -1); defaultsToParam_(); } -/* - void ConsensusIDAlgorithmPEPMatrix::updateParams_(const NeedlemanWunsch::ScoringMatrix& matrix) + + void ConsensusIDAlgorithmPEPMatrix::updateMembers_() { - if (matrix == NeedlemanWunsch::ScoringMatrix::identity) + //ConsenusIDAlgorithmSimilarity::updateMembers_(); // error: has not been declared + + string matrix = param_.getValue("matrix"); + int penalty = param_.getValue("penalty"); + if (matrix == "identity") { - defaults_.setValue("matrix", "identity", "Substitution matrix to use for alignment-based similarity scoring"); + object_.setMatrix(NeedlemanWunsch::ScoringMatrix::identity); } - - else if (matrix == NeedlemanWunsch::ScoringMatrix::PAM30MS) + else if (matrix == "PAM30MS") { - defaults_.setValue("matrix", "PAM30MS", "Substitution matrix to use for alignment-based similarity scoring"); + object_.setMatrix(NeedlemanWunsch::ScoringMatrix::PAM30MS); + } + else + { + String msg = "Matrix is not known! Valid choices are: " + "'identity', 'PAM30MS'."; + throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, + msg); + } + if (penalty > 0) + { + object_.setPenalty(penalty); + } + else + { + String msg = "Gap penalty should be positive"; + throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, + msg); } - //params_ up to date machen und die funktion nach set matrix ud penalty aufrufen - - defaultsToParam_(); - - } - - void ConsensusIDAlgorithmPEPMatrix::updateParams_(const int& penalty) - { - - defaults_.setValue("penalty", penalty, "Alignment gap penalty (the same value is used for gap opening and extension)"); - - //params_ up to date machen und die funktion nach set matrix ud penalty aufrufen - - defaultsToParam_(); + // new parameters may affect the similarity calculation, so clear cache: + similarities_.clear(); } -*/ - double ConsensusIDAlgorithmPEPMatrix::getSimilarity_(AASequence seq1, AASequence seq2) { diff --git a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp index fe87817b7f8..504b74a0bee 100644 --- a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp +++ b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp @@ -2,7 +2,7 @@ #include #include #include //swap -#include + using namespace std; namespace OpenMS @@ -85,28 +85,12 @@ void NeedlemanWunsch::setMatrix(const NeedlemanWunsch::ScoringMatrix& matrix) matrixPtr_ = &PAM30MS; } - else - { - String msg = "Matrix is not known! Valid choices are: " - "'identity', 'PAM30MS'."; - throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, - msg); - } - // ConsensusIDAlgorithmPEPMatrix::updateParams_(matrix) dafür objekt erstellen ... - } void NeedlemanWunsch::setPenalty(const int& penalty) { - if (penalty >= 0) - { - String msg = "Gap penalty should be negative"; - throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, - msg); - } gapPenalty_ = penalty; - // ConsensusIDAlgorithmPEPMatrix::updateParams_(matrix) dafür objekt erstellen ... } NeedlemanWunsch::ScoringMatrix NeedlemanWunsch::getMatrix() const @@ -162,14 +146,14 @@ double NeedlemanWunsch::align(const String& seq1, const String& seq2) //vollstä vector matrix((seq1len_+1)*(seq2len_+1), 0);//matrix mit 0en initialisieren for (unsigned i = 1; i <= seq1len_; ++i) //vertikale mit gapkkosten initialisieren - matrix[i*(seq2len_+1)]=i*gapPenalty_; + matrix[i*(seq2len_+1)]=i*(-gapPenalty_); for (unsigned i =0; i<=seq2len_;++i)//horizontale mit gapkosten initialieren - matrix[i]=i*gapPenalty_; + matrix[i]=i*(-gapPenalty_); for (unsigned i=1;i<=seq1len_;++i) { for (unsigned j=1;j<=seq2len_;++j) { - matrix[i*(seq2len_ +1)+j]=max(max((matrix[i*(seq2len_+1)+j-1]+gapPenalty_), (matrix[(i-1)*(seq2len_+1)+j]+gapPenalty_)), + matrix[i*(seq2len_ +1)+j]=max(max((matrix[i*(seq2len_+1)+j-1]-gapPenalty_), (matrix[(i-1)*(seq2len_+1)+j]-gapPenalty_)), (matrix[(i-1)*(seq2len_+1)+j-1])+ (*matrixPtr_)[getIndex_(seq1[i-1], seq2[j-1])]); } } @@ -192,17 +176,16 @@ double NeedlemanWunsch::align(const String& seq1, const String& seq2) //vollstä for (unsigned i = 0; i <= seq2len_; ++i)//horizontale mit gapkosten initialieren { - firstRow.push_back(i * gapPenalty_); + firstRow.push_back(i * ((-1 * gapPenalty_))); } for (unsigned i = 1;i <= seq1len_; ++i) //second row berechnen und swappen { - (*secondRowPtr)[0] = i * gapPenalty_; //erster wert in der zeile mit gapkosten + (*secondRowPtr)[0] = i * ((-1 * gapPenalty_)); //erster wert in der zeile mit gapkosten for (unsigned j = 1; j <= seq2len_; ++j) //secondRow berechnen { - (*secondRowPtr)[j] = (max(max(((*secondRowPtr)[j-1] + gapPenalty_), ((*firstRowPtr)[j] + gapPenalty_)), + (*secondRowPtr)[j] = (max(max(((*secondRowPtr)[j-1] - gapPenalty_), ((*firstRowPtr)[j] - gapPenalty_)), ((*firstRowPtr)[j-1]) + (*matrixPtr_)[getIndex_(seq1[i-1], seq2[j-1])]));//statt getIndex: [seq1[i-1] - 'A'] [seq2[j-1] - 'A'] und matrix entsprechend aufbauen - //cout<<(*matrixPtr_)[getIndex_(seq1[i-1], seq2[j-1])]< Date: Wed, 19 May 2021 21:36:04 +0200 Subject: [PATCH 46/53] Replace getIndex() && insert final Matrices[26][26] --- .../ANALYSIS/SEQUENCE/NeedlemanWunsch.h | 4 +- .../ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp | 210 +++++------------- 2 files changed, 60 insertions(+), 154 deletions(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h b/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h index 85360927ccd..1a641ed441a 100644 --- a/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h +++ b/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h @@ -33,11 +33,9 @@ namespace OpenMS private: - int getIndexNEW_(const char& a) const; // falls wir mit Matrix[i][j] Elementen arbeiten - int getIndex_(const char& a, const char& b) const; unsigned seq1len_ = 0; unsigned seq2len_ = 0; int gapPenalty_ = 0; - std::vector* matrixPtr_ = nullptr; + int(* matrixPtr_)[26][26] = nullptr; }; } \ No newline at end of file diff --git a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp index 07a676bfee3..52d23363af9 100644 --- a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp +++ b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp @@ -7,121 +7,73 @@ using namespace std; namespace OpenMS { - std::vector> adaptedIdentityNew // Name noch zu ändern + static int adaptedIdentity[26][26] { - // A B C D E F G H I K L M N P Q R S T V W X Y Z - /* A */ {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - /* B */ {0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - /* C */ {0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - /* D */ {0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - /* E */ {0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - /* F */ {0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - /* G */ {0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - /* H */ {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - /* I */ {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - /* K */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0}, - /* L */ {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - /* M */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - /* N */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - /* P */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - /* Q */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0}, - /* R */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0}, - /* S */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0}, - /* T */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0}, - /* V */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0}, - /* W */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0}, - /* X */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - /* Y */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0}, - /* Z */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1} + // A B C D E F G H I J K L M N O P Q R S T U V W X Y Z + /* A */ {1, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, + /* B */ {0, 1, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, + /* C */ {0, 0, 1, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, + /* D */ {0, 0, 0, 1, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, + /* E */ {0, 0, 0, 0, 1, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, + /* F */ {0, 0, 0, 0, 0, 1, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, + /* G */ {0, 0, 0, 0, 0, 0, 1, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, + /* H */ {0, 0, 0, 0, 0, 0, 0, 1, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, + /* I */ {0, 0, 0, 0, 0, 0, 0, 0, 1, INT8_MAX, 0, 1, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, + /* J */ {INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX,}, + /* K */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 1, 0, 0, 0, INT8_MAX, 0, 1, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, + /* L */ {0, 0, 0, 0, 0, 0, 0, 0, 1, INT8_MAX, 0, 1, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, + /* M */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 1, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, + /* N */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 1, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, + /* O */ {INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX,}, + /* P */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 1, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, + /* Q */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 1, 0, 0, 0, INT8_MAX, 0, 1, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, + /* R */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 1, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, + /* S */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 1, 0, INT8_MAX, 0, 0, 0, 0, 0}, + /* T */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 1, INT8_MAX, 0, 0, 0, 0, 0}, + /* U */ {INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX,}, + /* V */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 1, 0, 0, 0, 0}, + /* W */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 1, 0, 0, 0}, + /* X */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, + /* Y */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 1, 0}, + /* Z */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX,0, 0, 0, 0, 1} }; -std::vector adaptedIdentity - { - // A R N D C Q E G H I L K M F P S T W Y V B Z X * - /* A */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* R */ 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* N */ 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* D */ 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* C */ 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* Q */ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* E */ 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* G */ 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* H */ 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* I */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* L */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* K */ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* M */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* P */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* S */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, -17, - /* T */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, -17, - /* W */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, -17, - /* Y */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, -17, - /* V */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, -17, - /* B */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, -17, - /* Z */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, -17, - /* X */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* * */ -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, 1 - }; - std::vector> PAM30MSNEW //Name noch zu ändern + + static int PAM30MS[26][26] { - // A B C D E F G H I K L M N P Q R S T V W X Y Z - /* A */ {6, -7, -6, -3, -2, -8, -2, -7, -5, -7, -6, -5, -4, -2, -4, -7, 0, -1, -2, -13, 0, -8, -6}, - /* B */ {-7, 5, -11, -7, -7, -12, -8, -4, -6, 5, -7, -3, -4, -5, -3, 5, -4, -5, -9, -7, 0, -10, 1}, - /* C */ {-6, -11, 10, -14, -14, -13, -9, -7, -6, -14, -11, -13, -11, -8, -14, -8, -3, -8, -6, -15, 0, -4, -14}, - /* D */ {-3, -7, -14, 8, 2, -15, -3, -4, -7, -4, -10, -11, 2, -8, -2, -10, -4, -5, -8, -15, 0, -11, -3}, - /* E */ {-2, -7, -14, 2, 8, -14, -4, -5, -5, -4, -7, -7, -2, -5, 1, -9, -4, -6, -6, -17, 0, -8, -2}, - /* F */ {-8, -12, -13, -15, -14, 9, -9, -6, -2, -14, -3, -4, -9, -10, -13, -9, -6, -9, -8, -4, 0, 2, -14}, - /* G */ {-2, -8, -9, -3, -4, -9, 6, -9, -11, -7, -11, -8, -3, -6, -7, -9, -2, -6, -5, -15, 0, -14, -7}, - /* H */ {-7, -4, -7, -4, -5, -6, -9, 9, -9, -6, -8, -10, 0, -4, 1, -2, -6, -7, -6, -7, 0, -3, -3}, - /* I */ {-5, -6, -6, -7, -5, -2, -11, -9, 8, -6, 5, -1, -5, -8, -8, -5, -7, -2, 2, -14, 0, -6, -7}, - /* K */ {-7, 5, -14, -4, -4, -14, -7, -6, -6, 7, -7, -2, -1, -6, -3, 0, -4, -3, -9, -12, 0, -9, 4}, - /* L */ {-6, -7, -11, -10, -7, -3, -11, -8, 5, -7, 5, 0, -6, -8, -7, -7, -8, -5, 0, -10, 0, -7, -7}, - /* M */ {-5, -3, -13, -11, -7, -4, -8, -10, -1, -2, 0, 11, -9, -8, -4, -4, -5, -4, -1, -13, 0, -11, -3}, - /* N */ {-4, -4, -11, 2, -2, -9, -3, 0, -5, -1, -6, -9, 8, -6, -3, -6, 0, -2, -8, -8, 0, -4, -2}, - /* P */ {-2, -5, -8, -8, -5, -10, -6, -4, -8, -6, -8, -8, -6, 8, -3, -4, -2, -4, -6, -14, 0, -13, -5}, - /* Q */ {-4, -3, -14, -2, 1, -13, -7, 1, -8, -3, -7, -4, -3, -3, 8, -2, -5, -5, -7, -13, 0, -12, 4}, - /* R */ {-7, 5, -8, -10, -9, -9, -9, -2, -5, 0, -7, -4, -6, -4, -2, 8, -3, -6, -8, -2, 0, -10, -1}, - /* S */ {0, -4, -3, -4, -4, -6, -2, -6, -7, -4, -8, -5, 0, -2, -5, -3, 6, 0, -6, -5, 0, -7, -5}, - /* T */ {-1, -5, -8, -5, -6, -9, -6, -7, -2, -3, -5, -4, -2, -4, -5, -6, 0, 7, -3, -13, 0, -6, -4}, - /* V */ {-2, -9, -6, -8, -6, -8, -5, -6, 2, -9, 0, -1, -8, -6, -7, -8, -6, -3, 7, -15, 0, -7, -8}, - /* W */ {-13, -7, -15, -15, -17, -4, -15, -7, -14, -12, -10, -13, -8, -14, -13, -2, -5, -13, -15, 13, 0, -5, -13}, - /* X */ {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}, - /* Y */ {-8, -10, -4, -11, -8, 2, -14, -3, -6, -9, -7, -11, -4, -13, -12, -10, -7, -6, -7, -5, 0, 10, -11}, - /* Z */ {-6, 1, -14, -3, -2, -14, -7, -3, -7, 4, -7, -3, -2, -5, 4, -1, -5, -4, -8, -13, 0, -11, 4} + // A B C D E F G H I J K L M N O P Q R S T U V W X Y Z + /* A */ {6, -7, -6, -3, -2, -8, -2, -7, -5, INT8_MAX, -7, -6, -5, -4, INT8_MAX, -2, -4, -7, 0, -1, INT8_MAX, -2,-13, 0, -8, -6}, + /* B */ {-7, 5,-11, -7, -7,-12, -8, -4, -6, INT8_MAX, 5, -7, -3, -4, INT8_MAX, -5, -3, 5, -4, -5, INT8_MAX, -9, -7, 0,-10, 1}, + /* C */ {-6,-11, 10,-14,-14,-13, -9, -7, -6, INT8_MAX,-14,-11,-13,-11, INT8_MAX, -8,-14, -8, -3, -8, INT8_MAX, -6,-15, 0, -4,-14}, + /* D */ {-3, -7,-14, 8, 2,-15, -3, -4, -7, INT8_MAX, -4,-10,-11, 2, INT8_MAX, -8, -2,-10, -4, -5, INT8_MAX, -8,-15, 0,-11, -3}, + /* E */ {-2, -7,-14, 2, 8,-14, -4, -5, -5, INT8_MAX, -4, -7, -7, -2, INT8_MAX, -5, 1, -9, -4, -6, INT8_MAX, -6,-17, 0, -8, -2}, + /* F */ {-8,-12,-13,-15,-14, 9, -9, -6, -2, INT8_MAX,-14, -3, -4, -9, INT8_MAX,-10,-13, -9, -6, -9, INT8_MAX, -8, -4, 0, 2,-14}, + /* G */ {-2, -8, -9, -3, -4, -9, 6, -9,-11, INT8_MAX, -7,-11, -8, -3, INT8_MAX, -6, -7, -9, -2, -6, INT8_MAX, -5,-15, 0,-14, -7}, + /* H */ {-7, -4, -7, -4, -5, -6, -9, 9, -9, INT8_MAX, -6, -8,-10, 0, INT8_MAX, -4, 1, -2, -6, -7, INT8_MAX, -6, -7, 0, -3, -3}, + /* I */ {-5, -6, -6, -7, -5, -2,-11, -9, 8, INT8_MAX, -6, 5, -1, -5, INT8_MAX, -8, -8, -5, -7, -2, INT8_MAX, 2,-14, 0, -6, -7}, + /* J */ {INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX,}, + /* K */ {-7, 5,-14, -4, -4,-14, -7, -6, -6, INT8_MAX, 7, -7, -2, -1, INT8_MAX, -6, -3, 0, -4, -3, INT8_MAX, -9,-12, 0, -9, 4}, + /* L */ {-6, -7,-11,-10, -7, -3,-11, -8, 5, INT8_MAX, -7, 5, 0, -6, INT8_MAX, -8, -7, -7, -8, -5, INT8_MAX, 0,-10, 0, -7, -7}, + /* M */ {-5, -3,-13,-11, -7, -4, -8,-10, -1, INT8_MAX, -2, 0, 11, -9, INT8_MAX, -8, -4, -4, -5, -4, INT8_MAX, -1,-13, 0,-11, -3}, + /* N */ {-4, -4,-11, 2, -2, -9, -3, 0, -5, INT8_MAX, -1, -6, -9, 8, INT8_MAX, -6, -3, -6, 0, -2, INT8_MAX, -8, -8, 0, -4, -2}, + /* O */ {INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX,}, + /* P */ {-2, -5, -8, -8, -5,-10, -6, -4, -8, INT8_MAX, -6, -8, -8, -6, INT8_MAX, 8, -3, -4, -2, -4, INT8_MAX, -6,-14, 0,-13, -5}, + /* Q */ {-4, -3,-14, -2, 1,-13, -7, 1, -8, INT8_MAX, -3, -7, -4, -3, INT8_MAX, -3, 8, -2, -5, -5, INT8_MAX, -7,-13, 0,-12, 4}, + /* R */ {-7, 5, -8,-10, -9, -9, -9, -2, -5, INT8_MAX, 0, -7, -4, -6, INT8_MAX, -4, -2, 8, -3, -6, INT8_MAX, -8, -2, 0, 10, -1}, + /* S */ {0, -4, -3, -4, -4, -6, -2, -6, -7, INT8_MAX, -4, -8, -5, 0, INT8_MAX, -2, -5, -3, 6, 0, INT8_MAX, -6, -5, 0, -7, -5}, + /* T */ {-1, -5, -8, -5, -6, -9, -6, -7, -2, INT8_MAX, -3, -5, -4, -2, INT8_MAX, -4, -5, -6, 0, 7, INT8_MAX, -3,-13, 0, -6, -4}, + /* I */ {INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX,}, + /* V */ {-2, -9, -6, -8, -6, -8, -5, -6, 2, INT8_MAX, -9, 0, -1, -8, INT8_MAX, -6, -7, -8, -6, -3, INT8_MAX, 7,-15, 0, -7, -8}, + /* W */ {-13,-7,-15,-15,-17, -4,-15, -7,-14, INT8_MAX,-12,-10,-13, -8, INT8_MAX,-14,-13, -2, -5,-13, INT8_MAX,-15, 13, 0, -5,-13}, + /* X */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, + /* Y */ {-8,-10, -4,-11, -8, 2,-14, -3, -6, INT8_MAX, -9, -7,-11, -4, INT8_MAX,-13,-12,-10, -7, -6, INT8_MAX, -7, -5, 0, 10,-11}, + /* Z */ {-6, 1,-14, -3, -2,-14, -7, -3, -7, INT8_MAX, 4, -7, -3, -2, INT8_MAX, -5, 4, -1, -5, -4, INT8_MAX, -8,-13, 0,-11, 4} }; -std::vector PAM30MS - { - // A R N D C Q E G H I L K M F P S T W Y V B Z X * - /* A */ 6, -7, -4, -3, -6, -4, -2, -2, -7, -5, -6, -7, -5, -8, -2, 0, -1,-13, -8, -2, -7, -6, 0,-17, - /* R */ -7, 8, -6,-10, -8, -2, -9, -9, -2, -5, -7, 0, -4, -9, -4, -3, -6, -2,-10, -8, 5, -1, 0,-17, - /* N */ -4, -6, 8, 2,-11, -3, -2, -3, 0, -5, -6, -1, -9, -9, -6, 0, -2, -8, -4, -8, -4, -2, 0,-17, - /* D */ -3,-10, 2, 8,-14, -2, 2, -3, -4, -7,-10, -4,-11,-15, -8, -4, -5,-15,-11, -8, -7, -3, 0,-17, - /* C */ -6, -8,-11,-14, 10,-14,-14, -9, -7, -6,-11,-14,-13,-13, -8, -3, -8,-15, -4, -6,-11,-14, 0,-17, - /* Q */ -4, -2, -3, -2,-14, 8, 1, -7, 1, -8, -7, -3, -4,-13, -3, -5, -5,-13,-12, -7, -3, 4, 0,-17, - /* E */ -2, -9, -2, 2,-14, 1, 8, -4, -5, -5, -7, -4, -7,-14, -5, -4, -6,-17, -8, -6, -7, -2, 0,-17, - /* G */ -2, -9, -3, -3, -9, -7, -4, 6, -9,-11,-11, -7, -8, -9, -6, -2, -6,-15,-14, -5, -8, -7, 0,-17, - /* H */ -7, -2, 0, -4, -7, 1, -5, -9, 9, -9, -8, -6,-10, -6, -4, -6, -7, -7, -3, -6, -4, -3, 0,-17, - /* I */ -5, -5, -5, -7, -6, -8, -5,-11, -9, 8, 5, -6, -1, -2, -8, -7, -2,-14, -6, 2, -6, -7, 0,-17, - /* L */ -6, -7, -6,-10,-11, -7, -7,-11, -8, 5, 5, -7, 0, -3, -8, -8, -5,-10, -7, 0, -7, -7, 0,-17, - /* K */ -7, 0, -1, -4,-14, -3, -4, -7, -6, -6, -7, 7, -2,-14, -6, -4, -3,-12, -9, -9, 5, 4, 0,-17, - /* M */ -5, -4, -9,-11,-13, -4, -7, -8,-10, -1, 0, -2, 11, -4, -8, -5, -4,-13,-11, -1, -3, -3, 0,-17, - /* F */ -8, -9, -9,-15,-13,-13,-14, -9, -6, -2, -3,-14, -4, 9,-10, -6, -9, -4, 2, -8,-12,-14, 0,-17, - /* P */ -2, -4, -6, -8, -8, -3, -5, -6, -4, -8, -8, -6, -8,-10, 8, -2, -4,-14,-13, -6, -5, -5, 0,-17, - /* S */ 0, -3, 0, -4, -3, -5, -4, -2, -6, -7, -8, -4, -5, -6, -2, 6, 0, -5, -7, -6, -4, -5, 0,-17, - /* T */ -1, -6, -2, -5, -8, -5, -6, -6, -7, -2, -5, -3, -4, -9, -4, 0, 7,-13, -6, -3, -5, -4, 0,-17, - /* W */ -13, -2, -8,-15,-15,-13,-17,-15, -7,-14,-10,-12,-13, -4,-14, -5,-13, 13, -5,-15, -7,-13, 0,-17, - /* Y */ -8,-10, -4,-11, -4,-12, -8,-14, -3, -6, -7, -9,-11, 2,-13, -7, -6, -5, 10, -7,-10,-11, 0,-17, - /* V */ -2, -8, -8, -8, -6, -7, -6, -5, -6, 2, 0, -9, -1, -8, -6, -6, -3,-15, -7, 7, -9, -8, 0,-17, - /* B */ -7, 5, -4, -7,-11, -3, -7, -8, -4, -6, -7, 5, -3,-12, -5, -4, -5, -7,-10, -9, 5, 1, 0,-17, - /* Z */ -6, -1, -2, -3,-14, 4, -2, -7, -3, -7, -7, 4, -3,-14, -5, -5, -4,-13,-11, -8, 1, 4, 0,-17, - /* X */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-17, - /* * */ -17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17, 1 - }; + NeedlemanWunsch::NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix matrix, int penalty) { @@ -166,50 +118,6 @@ int NeedlemanWunsch::getPenalty() const { return gapPenalty_; } -int NeedlemanWunsch::getIndexNEW_(const char&a) const //Falls wir die Matrizen als vector> haben, rufen wir getIndexNEW_ 2x auf (für Matrix[i][j]) -{ - // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 - vector alphabet = { 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'X', 'Y', 'Z' }; - int index = 0; - - for(Size i=0; i> vec = - { - {'A', 0}, {'R', 1}, {'N', 2}, - {'D', 3}, {'C', 4}, {'Q', 5}, - {'E', 6}, {'G', 7}, {'H', 8}, - {'I', 9}, {'L', 10}, {'K', 11}, - {'M', 12}, {'F', 13}, {'P', 14}, - {'S', 15}, {'T', 16}, {'W', 17}, - {'Y', 18}, {'V', 19}, {'B', 20}, - {'Z', 21}, {'X', 22}, {'*', 23} - }; -int x = -1; -int y = -1; -for (Size i = 0; i < vec.size(); ++i) -{ -if (vec[i].first == a) -x = vec[i].second; -if (vec[i].first == b) -y = vec[i].second; -} -if (x == -1) -x = 23; -if (y == -1) -y = 23; -return x + y*vec.size(); -} /* double NeedlemanWunsch::align(const String& seq1, const String& seq2) //vollständige matrix { @@ -257,7 +165,7 @@ double NeedlemanWunsch::align(const String& seq1, const String& seq2) //vollstä for (unsigned j = 1; j <= seq2len_; ++j) //secondRow berechnen { (*secondRowPtr)[j] = (max(max(((*secondRowPtr)[j-1] - gapPenalty_), ((*firstRowPtr)[j] - gapPenalty_)), - ((*firstRowPtr)[j-1]) + (*matrixPtr_)[getIndex_(seq1[i-1], seq2[j-1])]));//statt getIndex: [seq1[i-1] - 'A'] [seq2[j-1] - 'A'] und matrix entsprechend aufbauen + ((*firstRowPtr)[j-1]) + (*matrixPtr_)[seq1[i-1] - 'A'] [seq2[j-1] - 'A']));//[getIndex_(seq1[i-1], seq2[j-1])]));//statt getIndex: [seq1[i-1] - 'A'] [seq2[j-1] - 'A'] und matrix entsprechend aufbauen } swap(firstRowPtr, secondRowPtr); } From 5ab8334328b2a5c5d75d65ea120a7afa22ee470b Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Thu, 20 May 2021 12:18:17 +0200 Subject: [PATCH 47/53] executables.cmake --- src/tests/class_tests/openms/executables.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/src/tests/class_tests/openms/executables.cmake b/src/tests/class_tests/openms/executables.cmake index 917d10594cb..ad17d06a6f9 100644 --- a/src/tests/class_tests/openms/executables.cmake +++ b/src/tests/class_tests/openms/executables.cmake @@ -528,6 +528,7 @@ set(analysis_executables_list MetaboliteFeatureDeconvolution_test MetaboliteSpectralMatching_test ModifiedPeptideGenerator_test + NeedlemanWunsch_test OfflinePrecursorIonSelection_test PeptideIndexing_test PeptideAndProteinQuant_test From 3084b220493cdd852f4cb728dc3510197ae674b2 Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Thu, 20 May 2021 20:52:03 +0200 Subject: [PATCH 48/53] seqalign --- .../ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp | 4 ++-- .../source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp | 12 +++++------- .../source/ConsensusIDAlgorithmPEPMatrix_test.cpp | 9 --------- 3 files changed, 7 insertions(+), 18 deletions(-) diff --git a/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp b/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp index c7d8ade57d4..435c421ea96 100644 --- a/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp +++ b/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp @@ -96,13 +96,13 @@ namespace OpenMS String unmod_seq2 = seq2.toUnmodifiedString(); if (unmod_seq1 == unmod_seq2) return 1.0; if (unmod_seq1 < unmod_seq2) swap(unmod_seq1, unmod_seq2); - /* testen ob es schneller mit oder ohne ist + AASequence s1 = AASequence::fromString(unmod_seq1); AASequence s2 = AASequence::fromString(unmod_seq2); pair seq_pair = make_pair(s1, s2); SimilarityCache::iterator pos = similarities_.find(seq_pair); if (pos != similarities_.end()) return pos->second; // score found in cache - */ + double score_self1 = object_.align(unmod_seq1, unmod_seq1); double score_sim = object_.align(unmod_seq1, unmod_seq2); double score_self2 = object_.align(unmod_seq2, unmod_seq2); diff --git a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp index 52d23363af9..aad4cb3a62f 100644 --- a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp +++ b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp @@ -118,8 +118,8 @@ int NeedlemanWunsch::getPenalty() const { return gapPenalty_; } -/* -double NeedlemanWunsch::align(const String& seq1, const String& seq2) //vollständige matrix + +int NeedlemanWunsch::align(const String& seq1, const String& seq2) //vollständige matrix { seq1len_ = seq1.length(); seq2len_ = seq2.length(); @@ -134,14 +134,13 @@ double NeedlemanWunsch::align(const String& seq1, const String& seq2) //vollstä for (unsigned j=1;j<=seq2len_;++j) { matrix[i*(seq2len_ +1)+j]=max(max((matrix[i*(seq2len_+1)+j-1]-gapPenalty_), (matrix[(i-1)*(seq2len_+1)+j]-gapPenalty_)), - (matrix[(i-1)*(seq2len_+1)+j-1])+ (*matrixPtr_)[getIndex_(seq1[i-1], seq2[j-1])]); + (matrix[(i-1)*(seq2len_+1)+j-1])+ (*matrixPtr_)[seq1[i-1] - 'A'] [seq2[j-1] - 'A']); } } - cout<& ids)) } END_SECTION -START_SECTION(double getSimilarity_(AASequence seq1, - AASequence seq2)) -{ - ConsensusIDAlgorithmPEPMatrix object2= ConsensusIDAlgorithmPEPMatrix(); - AASequence seq1=AASequence::fromString("ATLIGQLAIQQ"); - AASequence seq2=AASequence::fromString("ATLIGALDQQQ"); - cout< Date: Tue, 25 May 2021 16:32:19 +0200 Subject: [PATCH 49/53] set matrix --- .../ANALYSIS/SEQUENCE/NeedlemanWunsch.h | 4 ++ .../ID/ConsensusIDAlgorithmPEPMatrix.cpp | 20 ++---- .../ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp | 69 ++++++++++++++++--- .../openms/source/NeedlemanWunsch_test.cpp | 7 +- 4 files changed, 71 insertions(+), 29 deletions(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h b/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h index 1a641ed441a..b6e91c3d848 100644 --- a/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h +++ b/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h @@ -24,6 +24,7 @@ namespace OpenMS int align(const String& seq1, const String& seq2); void setMatrix(const ScoringMatrix& matrix); + void setMatrix(const std::string& matrix); void setPenalty(const int& penalty); @@ -37,5 +38,8 @@ namespace OpenMS unsigned seq2len_ = 0; int gapPenalty_ = 0; int(* matrixPtr_)[26][26] = nullptr; + std::vector firstRow_{}; + std::vector secondRow_{}; + std::string validMatrices_[2] = {"PAM30MS", "identity"}; }; } \ No newline at end of file diff --git a/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp b/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp index 435c421ea96..3f73bebc76c 100644 --- a/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp +++ b/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp @@ -53,25 +53,13 @@ namespace OpenMS void ConsensusIDAlgorithmPEPMatrix::updateMembers_() { - //ConsenusIDAlgorithmSimilarity::updateMembers_(); // error: has not been declared + ConsensusIDAlgorithmSimilarity::updateMembers_(); string matrix = param_.getValue("matrix"); int penalty = param_.getValue("penalty"); - if (matrix == "identity") - { - object_.setMatrix(NeedlemanWunsch::ScoringMatrix::identity); - } - else if (matrix == "PAM30MS") - { - object_.setMatrix(NeedlemanWunsch::ScoringMatrix::PAM30MS); - } - else - { - String msg = "Matrix is not known! Valid choices are: " - "'identity', 'PAM30MS'."; - throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, - msg); - } + + object_.setMatrix(matrix); + if (penalty > 0) { object_.setPenalty(penalty); diff --git a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp index aad4cb3a62f..cb92cd99009 100644 --- a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp +++ b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp @@ -93,13 +93,28 @@ void NeedlemanWunsch::setMatrix(const NeedlemanWunsch::ScoringMatrix& matrix) { matrixPtr_ = &PAM30MS; } +} +void NeedlemanWunsch::setMatrix(const std::string& matrix) +{ + auto first = &validMatrices_[0]; + auto last = &validMatrices_[2]; + const auto it = std::find(first, last, matrix); + if (it == last) + { + String msg = "Matrix is not known! Valid choices are: " + "'identity', 'PAM30MS'."; + throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, + msg); + } + setMatrix(static_cast(it - first)); } + + void NeedlemanWunsch::setPenalty(const int& penalty) { - - gapPenalty_ = penalty; + gapPenalty_ = penalty; } NeedlemanWunsch::ScoringMatrix NeedlemanWunsch::getMatrix() const @@ -118,7 +133,7 @@ int NeedlemanWunsch::getPenalty() const { return gapPenalty_; } - +/* int NeedlemanWunsch::align(const String& seq1, const String& seq2) //vollständige matrix { seq1len_ = seq1.length(); @@ -139,23 +154,23 @@ int NeedlemanWunsch::align(const String& seq1, const String& seq2) //vollständi } return matrix[(seq1len_+1)*(seq2len_+1)-1]; } - +*/ /* -//linear space (2 Zeilen) +//linear space (2 Zeilen) //seit vectoren member sind: munmap_chunk(): invalid pointer int NeedlemanWunsch::align(const String& seq1, const String& seq2) { seq1len_ = seq1.length(); seq2len_ = seq2.length(); - vector firstRow{}; - vector secondRow(seq2len_+1,0); - vector* firstRowPtr = &firstRow; - vector* secondRowPtr = &secondRow; + firstRow_.resize(seq1len_); + secondRow_.resize(seq2len_+1); + vector* firstRowPtr = &firstRow_; + vector* secondRowPtr = &secondRow_; for (unsigned i = 0; i <= seq2len_; ++i)//horizontale mit gapkosten initialieren { - firstRow.push_back(i * ((-1 * gapPenalty_))); + firstRow_[i] = i * ((-1 * gapPenalty_)); } for (unsigned i = 1;i <= seq1len_; ++i) //second row berechnen und swappen @@ -170,5 +185,37 @@ int NeedlemanWunsch::align(const String& seq1, const String& seq2) //vollständi } return (*firstRowPtr)[seq2len_]; } -*/ + */ + + + int NeedlemanWunsch::align(const String& seq1, const String& seq2) + { + seq1len_ = seq1.length(); + seq2len_ = seq2.length(); + + firstRow_.resize(seq2len_+1); // both rows have the same length + secondRow_.resize(seq2len_+1); + + int* firstRowPtr = &(firstRow_[0]); + int* secondRowPtr = &(secondRow_[0]); + + + for (unsigned i = 0; i <= seq2len_; ++i)//horizontale mit gapkosten initialieren + { + firstRow_[i] = i * (-gapPenalty_); + } + + for (unsigned i = 1;i <= seq1len_; ++i) //second row berechnen und swappen + { + (*secondRowPtr) = i * (-gapPenalty_); //erster wert in der zeile mit gapkosten //second row pointer muss auf die erste stelle zeigen + for (unsigned j = 1; j <= seq2len_; ++j) //secondRow berechnen + { + (*(secondRowPtr+j)) = max(max(((*(secondRowPtr+j-1)) - gapPenalty_), ((*(firstRowPtr+j)) - gapPenalty_)), + ((*(firstRowPtr+j-1)) + (*matrixPtr_)[seq1[i-1] - 'A'] [seq2[j-1] - 'A'])); + } + swap(firstRowPtr, secondRowPtr); + } + return (*(firstRowPtr+seq2len_)); + } + } \ No newline at end of file diff --git a/src/tests/class_tests/openms/source/NeedlemanWunsch_test.cpp b/src/tests/class_tests/openms/source/NeedlemanWunsch_test.cpp index 49d3c4fb543..210fb7799e9 100644 --- a/src/tests/class_tests/openms/source/NeedlemanWunsch_test.cpp +++ b/src/tests/class_tests/openms/source/NeedlemanWunsch_test.cpp @@ -46,14 +46,17 @@ START_SECTION(void setMatrix(const ScoringMatrix& matrix)) TEST_EQUAL(object.align(seq1, seq2), 93); TEST_EQUAL(object.align(seq1, seq1), 131); TEST_EQUAL(object.align(seq2, seq2), 151); - //TEST_EQUAL(object.getMatrix(), NeedlemanWunsch::ScoringMatrix::PAM30MSMatrix); kein == operator definiert für ScoringMatrix? + + object.setMatrix("identity"); + TEST_EQUAL(object.align(seq1, seq2), 1); + TEST_EQUAL(object.align(seq1, seq1), 19); + TEST_EQUAL(object.align(seq2, seq2), 22); } END_SECTION START_SECTION(void setPenalty(const ScoringMatrix& matrix)) { NeedlemanWunsch object = NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix::PAM30MS, 5); - //TEST_EXCEPTION(Exception::IllegalArgument, object.setPenalty(-5)) object.setPenalty(1); TEST_EQUAL(object.align(seq1, seq2), 113); TEST_EQUAL(object.getPenalty(), 1); From c8e6d5fe548f716dc1b5ae01dd7ee657b7be0309 Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Tue, 25 May 2021 22:35:01 +0200 Subject: [PATCH 50/53] committing before changing branches --- .../ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp | 2 +- .../source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp b/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp index 3f73bebc76c..b59e349dfe1 100644 --- a/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp +++ b/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp @@ -43,7 +43,7 @@ namespace OpenMS setName("ConsensusIDAlgorithmPEPMatrix"); // DefaultParamHandler defaults_.setValue("matrix", "PAM30MS", "Substitution matrix to use for alignment-based similarity scoring"); - defaults_.setValidStrings("matrix", {"identity","PAM30MS"}); + //defaults_.setValidStrings("matrix", NeedlemanWunsch::validMatrices_; //hier auf unser member zugreifen defaults_.setValue("penalty", 5, "Alignment gap penalty (the same value is used for gap opening and extension)"); defaults_.setMinInt("penalty", -1); diff --git a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp index cb92cd99009..d84651d21c4 100644 --- a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp +++ b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp @@ -133,8 +133,8 @@ int NeedlemanWunsch::getPenalty() const { return gapPenalty_; } -/* -int NeedlemanWunsch::align(const String& seq1, const String& seq2) //vollständige matrix + +int NeedlemanWunsch::align(const String& seq1, const String& seq2) //vollständige matrix 26.59 CPU { seq1len_ = seq1.length(); seq2len_ = seq2.length(); @@ -154,7 +154,7 @@ int NeedlemanWunsch::align(const String& seq1, const String& seq2) //vollständi } return matrix[(seq1len_+1)*(seq2len_+1)-1]; } -*/ + /* //linear space (2 Zeilen) //seit vectoren member sind: munmap_chunk(): invalid pointer int NeedlemanWunsch::align(const String& seq1, const String& seq2) @@ -187,8 +187,8 @@ int NeedlemanWunsch::align(const String& seq1, const String& seq2) //vollständi } */ - - int NeedlemanWunsch::align(const String& seq1, const String& seq2) +/* + int NeedlemanWunsch::align(const String& seq1, const String& seq2) //25.30 s CPU { seq1len_ = seq1.length(); seq2len_ = seq2.length(); @@ -217,5 +217,5 @@ int NeedlemanWunsch::align(const String& seq1, const String& seq2) //vollständi } return (*(firstRowPtr+seq2len_)); } - +*/ } \ No newline at end of file From e96b2c78bdfac19062e9ac1c889d931885e43fd3 Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Wed, 26 May 2021 18:06:16 +0200 Subject: [PATCH 51/53] =?UTF-8?q?Einr=C3=BCckung=20und=20Doku=20folgt=20no?= =?UTF-8?q?ch?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ID/ConsensusIDAlgorithmPEPMatrix.h | 7 +- .../ANALYSIS/SEQUENCE/NeedlemanWunsch.h | 24 +- .../ID/ConsensusIDAlgorithmPEPMatrix.cpp | 20 +- .../ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp | 261 +++++++----------- .../openms/source/NeedlemanWunsch_test.cpp | 38 +-- 5 files changed, 144 insertions(+), 206 deletions(-) diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.h b/src/openms/include/OpenMS/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.h index fedba8ec768..c0e37633be2 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.h @@ -55,12 +55,10 @@ namespace OpenMS /// Default constructor ConsensusIDAlgorithmPEPMatrix(); - /// Sequence similarity based on substitution matrix (ignores PTMs) - double getSimilarity_(AASequence seq1, AASequence seq2) override; //danach wieder zu private machen (nur zum Testen) private: - NeedlemanWunsch object_ = NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix::PAM30MS, 55); //initialization //bisher wird hier die Matrix gewählt + NeedlemanWunsch alignment_; /// Not implemented ConsensusIDAlgorithmPEPMatrix(const ConsensusIDAlgorithmPEPMatrix&); @@ -68,6 +66,9 @@ namespace OpenMS /// Not implemented ConsensusIDAlgorithmPEPMatrix& operator=(const ConsensusIDAlgorithmPEPMatrix&); + /// Sequence similarity based on substitution matrix (ignores PTMs) + double getSimilarity_(AASequence seq1, AASequence seq2) override; + // Docu in base class void updateMembers_() override; }; diff --git a/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h b/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h index b6e91c3d848..373d06f5db5 100644 --- a/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h +++ b/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h @@ -3,8 +3,6 @@ #include #include - - namespace OpenMS { class OPENMS_DLLAPI NeedlemanWunsch @@ -13,33 +11,35 @@ namespace OpenMS public: enum class ScoringMatrix { + identity, PAM30MS, - identity + SIZE_OF_SCORINGMATRIX }; NeedlemanWunsch(ScoringMatrix matrix, int penalty); + NeedlemanWunsch(); ~NeedlemanWunsch()=default; + static const std::vector NamesOfScoringMatrices; + int align(const String& seq1, const String& seq2); void setMatrix(const ScoringMatrix& matrix); void setMatrix(const std::string& matrix); - void setPenalty(const int& penalty); + void setPenalty(const int penalty); ScoringMatrix getMatrix() const; int getPenalty() const; - private: - unsigned seq1len_ = 0; - unsigned seq2len_ = 0; - int gapPenalty_ = 0; - int(* matrixPtr_)[26][26] = nullptr; - std::vector firstRow_{}; - std::vector secondRow_{}; - std::string validMatrices_[2] = {"PAM30MS", "identity"}; + unsigned seq1_len_ = 0; + unsigned seq2_len_ = 0; + int gap_penalty_ = 0; + int my_matrix_ = 0; + std::vector first_row_{}; + std::vector second_row_{}; }; } \ No newline at end of file diff --git a/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp b/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp index b59e349dfe1..ae599d7a64b 100644 --- a/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp +++ b/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp @@ -43,9 +43,9 @@ namespace OpenMS setName("ConsensusIDAlgorithmPEPMatrix"); // DefaultParamHandler defaults_.setValue("matrix", "PAM30MS", "Substitution matrix to use for alignment-based similarity scoring"); - //defaults_.setValidStrings("matrix", NeedlemanWunsch::validMatrices_; //hier auf unser member zugreifen + defaults_.setValidStrings("matrix", NeedlemanWunsch::NamesOfScoringMatrices); defaults_.setValue("penalty", 5, "Alignment gap penalty (the same value is used for gap opening and extension)"); - defaults_.setMinInt("penalty", -1); + defaults_.setMinInt("penalty", 1); defaultsToParam_(); @@ -58,11 +58,11 @@ namespace OpenMS string matrix = param_.getValue("matrix"); int penalty = param_.getValue("penalty"); - object_.setMatrix(matrix); + alignment_.setMatrix(matrix); if (penalty > 0) { - object_.setPenalty(penalty); + alignment_.setPenalty(penalty); } else { @@ -85,15 +85,7 @@ namespace OpenMS if (unmod_seq1 == unmod_seq2) return 1.0; if (unmod_seq1 < unmod_seq2) swap(unmod_seq1, unmod_seq2); - AASequence s1 = AASequence::fromString(unmod_seq1); - AASequence s2 = AASequence::fromString(unmod_seq2); - pair seq_pair = make_pair(s1, s2); - SimilarityCache::iterator pos = similarities_.find(seq_pair); - if (pos != similarities_.end()) return pos->second; // score found in cache - - double score_self1 = object_.align(unmod_seq1, unmod_seq1); - double score_sim = object_.align(unmod_seq1, unmod_seq2); - double score_self2 = object_.align(unmod_seq2, unmod_seq2); + double score_sim = alignment_.align(unmod_seq1, unmod_seq2); if (score_sim < 0) { @@ -101,6 +93,8 @@ namespace OpenMS } else { + double score_self1 = alignment_.align(unmod_seq1, unmod_seq1); + double score_self2 = alignment_.align(unmod_seq2, unmod_seq2); score_sim /= min(score_self1, score_self2); // normalize } return score_sim; diff --git a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp index d84651d21c4..80493d7797c 100644 --- a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp +++ b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp @@ -2,108 +2,110 @@ #include #include #include //swap +#include using namespace std; namespace OpenMS { - static int adaptedIdentity[26][26] + static int matrices[static_cast(NeedlemanWunsch::ScoringMatrix::SIZE_OF_SCORINGMATRIX)][26][26] { - + //identity + { // A B C D E F G H I J K L M N O P Q R S T U V W X Y Z - /* A */ {1, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, - /* B */ {0, 1, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, - /* C */ {0, 0, 1, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, - /* D */ {0, 0, 0, 1, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, - /* E */ {0, 0, 0, 0, 1, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, - /* F */ {0, 0, 0, 0, 0, 1, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, - /* G */ {0, 0, 0, 0, 0, 0, 1, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, - /* H */ {0, 0, 0, 0, 0, 0, 0, 1, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, - /* I */ {0, 0, 0, 0, 0, 0, 0, 0, 1, INT8_MAX, 0, 1, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, - /* J */ {INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX,}, - /* K */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 1, 0, 0, 0, INT8_MAX, 0, 1, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, - /* L */ {0, 0, 0, 0, 0, 0, 0, 0, 1, INT8_MAX, 0, 1, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, - /* M */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 1, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, - /* N */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 1, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, - /* O */ {INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX,}, - /* P */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 1, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, - /* Q */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 1, 0, 0, 0, INT8_MAX, 0, 1, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, - /* R */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 1, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, - /* S */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 1, 0, INT8_MAX, 0, 0, 0, 0, 0}, - /* T */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 1, INT8_MAX, 0, 0, 0, 0, 0}, - /* U */ {INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX,}, - /* V */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 1, 0, 0, 0, 0}, - /* W */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 1, 0, 0, 0}, - /* X */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, - /* Y */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 1, 0}, - /* Z */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX,0, 0, 0, 0, 1} + /* A */ {1, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* B */ {0, 1, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* C */ {0, 0, 1, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* D */ {0, 0, 0, 1, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* E */ {0, 0, 0, 0, 1, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* F */ {0, 0, 0, 0, 0, 1, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* G */ {0, 0, 0, 0, 0, 0, 1, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* H */ {0, 0, 0, 0, 0, 0, 0, 1, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* I */ {0, 0, 0, 0, 0, 0, 0, 0, 1, INT16_MAX, 0, 1, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* J */ {INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX,}, + /* K */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 1, 0, 0, 0, INT16_MAX, 0, 1, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* L */ {0, 0, 0, 0, 0, 0, 0, 0, 1, INT16_MAX, 0, 1, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* M */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 1, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* N */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 1, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* O */ {INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX,}, + /* P */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 1, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* Q */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 1, 0, 0, 0, INT16_MAX, 0, 1, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* R */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 1, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* S */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 1, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* T */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 1, INT16_MAX, 0, 0, 0, 0, 0}, + /* U */ {INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX,}, + /* V */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 1, 0, 0, 0, 0}, + /* W */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 1, 0, 0, 0}, + /* X */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* Y */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 1, 0}, + /* Z */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX,0, 0, 0, 0, 1} + + }, + + //PAM30MS + { + + // A B C D E F G H I J K L M N O P Q R S T U V W X Y Z + /* A */ {6, -7, -6, -3, -2, -8, -2, -7, -5, INT16_MAX, -7, -6, -5, -4, INT16_MAX, -2, -4, -7, 0, -1, INT16_MAX, -2,-13, 0, -8, -6}, + /* B */ {-7, 5,-11, -7, -7,-12, -8, -4, -6, INT16_MAX, 5, -7, -3, -4, INT16_MAX, -5, -3, 5, -4, -5, INT16_MAX, -9, -7, 0,-10, 1}, + /* C */ {-6,-11, 10,-14,-14,-13, -9, -7, -6, INT16_MAX,-14,-11,-13,-11, INT16_MAX, -8,-14, -8, -3, -8, INT16_MAX, -6,-15, 0, -4,-14}, + /* D */ {-3, -7,-14, 8, 2,-15, -3, -4, -7, INT16_MAX, -4,-10,-11, 2, INT16_MAX, -8, -2,-10, -4, -5, INT16_MAX, -8,-15, 0,-11, -3}, + /* E */ {-2, -7,-14, 2, 8,-14, -4, -5, -5, INT16_MAX, -4, -7, -7, -2, INT16_MAX, -5, 1, -9, -4, -6, INT16_MAX, -6,-17, 0, -8, -2}, + /* F */ {-8,-12,-13,-15,-14, 9, -9, -6, -2, INT16_MAX,-14, -3, -4, -9, INT16_MAX,-10,-13, -9, -6, -9, INT16_MAX, -8, -4, 0, 2,-14}, + /* G */ {-2, -8, -9, -3, -4, -9, 6, -9,-11, INT16_MAX, -7,-11, -8, -3, INT16_MAX, -6, -7, -9, -2, -6, INT16_MAX, -5,-15, 0,-14, -7}, + /* H */ {-7, -4, -7, -4, -5, -6, -9, 9, -9, INT16_MAX, -6, -8,-10, 0, INT16_MAX, -4, 1, -2, -6, -7, INT16_MAX, -6, -7, 0, -3, -3}, + /* I */ {-5, -6, -6, -7, -5, -2,-11, -9, 8, INT16_MAX, -6, 5, -1, -5, INT16_MAX, -8, -8, -5, -7, -2, INT16_MAX, 2,-14, 0, -6, -7}, + /* J */ {INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX,}, + /* K */ {-7, 5,-14, -4, -4,-14, -7, -6, -6, INT16_MAX, 7, -7, -2, -1, INT16_MAX, -6, -3, 0, -4, -3, INT16_MAX, -9,-12, 0, -9, 4}, + /* L */ {-6, -7,-11,-10, -7, -3,-11, -8, 5, INT16_MAX, -7, 5, 0, -6, INT16_MAX, -8, -7, -7, -8, -5, INT16_MAX, 0,-10, 0, -7, -7}, + /* M */ {-5, -3,-13,-11, -7, -4, -8,-10, -1, INT16_MAX, -2, 0, 11, -9, INT16_MAX, -8, -4, -4, -5, -4, INT16_MAX, -1,-13, 0,-11, -3}, + /* N */ {-4, -4,-11, 2, -2, -9, -3, 0, -5, INT16_MAX, -1, -6, -9, 8, INT16_MAX, -6, -3, -6, 0, -2, INT16_MAX, -8, -8, 0, -4, -2}, + /* O */ {INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX,}, + /* P */ {-2, -5, -8, -8, -5,-10, -6, -4, -8, INT16_MAX, -6, -8, -8, -6, INT16_MAX, 8, -3, -4, -2, -4, INT16_MAX, -6,-14, 0,-13, -5}, + /* Q */ {-4, -3,-14, -2, 1,-13, -7, 1, -8, INT16_MAX, -3, -7, -4, -3, INT16_MAX, -3, 8, -2, -5, -5, INT16_MAX, -7,-13, 0,-12, 4}, + /* R */ {-7, 5, -8,-10, -9, -9, -9, -2, -5, INT16_MAX, 0, -7, -4, -6, INT16_MAX, -4, -2, 8, -3, -6, INT16_MAX, -8, -2, 0, 10, -1}, + /* S */ {0, -4, -3, -4, -4, -6, -2, -6, -7, INT16_MAX, -4, -8, -5, 0, INT16_MAX, -2, -5, -3, 6, 0, INT16_MAX, -6, -5, 0, -7, -5}, + /* T */ {-1, -5, -8, -5, -6, -9, -6, -7, -2, INT16_MAX, -3, -5, -4, -2, INT16_MAX, -4, -5, -6, 0, 7, INT16_MAX, -3,-13, 0, -6, -4}, + /* I */ {INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX,}, + /* V */ {-2, -9, -6, -8, -6, -8, -5, -6, 2, INT16_MAX, -9, 0, -1, -8, INT16_MAX, -6, -7, -8, -6, -3, INT16_MAX, 7,-15, 0, -7, -8}, + /* W */ {-13,-7,-15,-15,-17, -4,-15, -7,-14, INT16_MAX,-12,-10,-13, -8, INT16_MAX,-14,-13, -2, -5,-13, INT16_MAX,-15, 13, 0, -5,-13}, + /* X */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* Y */ {-8,-10, -4,-11, -8, 2,-14, -3, -6, INT16_MAX, -9, -7,-11, -4, INT16_MAX,-13,-12,-10, -7, -6, INT16_MAX, -7, -5, 0, 10,-11}, + /* Z */ {-6, 1,-14, -3, -2,-14, -7, -3, -7, INT16_MAX, 4, -7, -3, -2, INT16_MAX, -5, 4, -1, -5, -4, INT16_MAX, -8,-13, 0,-11, 4} + } }; - static int PAM30MS[26][26] - { - - // A B C D E F G H I J K L M N O P Q R S T U V W X Y Z - /* A */ {6, -7, -6, -3, -2, -8, -2, -7, -5, INT8_MAX, -7, -6, -5, -4, INT8_MAX, -2, -4, -7, 0, -1, INT8_MAX, -2,-13, 0, -8, -6}, - /* B */ {-7, 5,-11, -7, -7,-12, -8, -4, -6, INT8_MAX, 5, -7, -3, -4, INT8_MAX, -5, -3, 5, -4, -5, INT8_MAX, -9, -7, 0,-10, 1}, - /* C */ {-6,-11, 10,-14,-14,-13, -9, -7, -6, INT8_MAX,-14,-11,-13,-11, INT8_MAX, -8,-14, -8, -3, -8, INT8_MAX, -6,-15, 0, -4,-14}, - /* D */ {-3, -7,-14, 8, 2,-15, -3, -4, -7, INT8_MAX, -4,-10,-11, 2, INT8_MAX, -8, -2,-10, -4, -5, INT8_MAX, -8,-15, 0,-11, -3}, - /* E */ {-2, -7,-14, 2, 8,-14, -4, -5, -5, INT8_MAX, -4, -7, -7, -2, INT8_MAX, -5, 1, -9, -4, -6, INT8_MAX, -6,-17, 0, -8, -2}, - /* F */ {-8,-12,-13,-15,-14, 9, -9, -6, -2, INT8_MAX,-14, -3, -4, -9, INT8_MAX,-10,-13, -9, -6, -9, INT8_MAX, -8, -4, 0, 2,-14}, - /* G */ {-2, -8, -9, -3, -4, -9, 6, -9,-11, INT8_MAX, -7,-11, -8, -3, INT8_MAX, -6, -7, -9, -2, -6, INT8_MAX, -5,-15, 0,-14, -7}, - /* H */ {-7, -4, -7, -4, -5, -6, -9, 9, -9, INT8_MAX, -6, -8,-10, 0, INT8_MAX, -4, 1, -2, -6, -7, INT8_MAX, -6, -7, 0, -3, -3}, - /* I */ {-5, -6, -6, -7, -5, -2,-11, -9, 8, INT8_MAX, -6, 5, -1, -5, INT8_MAX, -8, -8, -5, -7, -2, INT8_MAX, 2,-14, 0, -6, -7}, - /* J */ {INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX,}, - /* K */ {-7, 5,-14, -4, -4,-14, -7, -6, -6, INT8_MAX, 7, -7, -2, -1, INT8_MAX, -6, -3, 0, -4, -3, INT8_MAX, -9,-12, 0, -9, 4}, - /* L */ {-6, -7,-11,-10, -7, -3,-11, -8, 5, INT8_MAX, -7, 5, 0, -6, INT8_MAX, -8, -7, -7, -8, -5, INT8_MAX, 0,-10, 0, -7, -7}, - /* M */ {-5, -3,-13,-11, -7, -4, -8,-10, -1, INT8_MAX, -2, 0, 11, -9, INT8_MAX, -8, -4, -4, -5, -4, INT8_MAX, -1,-13, 0,-11, -3}, - /* N */ {-4, -4,-11, 2, -2, -9, -3, 0, -5, INT8_MAX, -1, -6, -9, 8, INT8_MAX, -6, -3, -6, 0, -2, INT8_MAX, -8, -8, 0, -4, -2}, - /* O */ {INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX,}, - /* P */ {-2, -5, -8, -8, -5,-10, -6, -4, -8, INT8_MAX, -6, -8, -8, -6, INT8_MAX, 8, -3, -4, -2, -4, INT8_MAX, -6,-14, 0,-13, -5}, - /* Q */ {-4, -3,-14, -2, 1,-13, -7, 1, -8, INT8_MAX, -3, -7, -4, -3, INT8_MAX, -3, 8, -2, -5, -5, INT8_MAX, -7,-13, 0,-12, 4}, - /* R */ {-7, 5, -8,-10, -9, -9, -9, -2, -5, INT8_MAX, 0, -7, -4, -6, INT8_MAX, -4, -2, 8, -3, -6, INT8_MAX, -8, -2, 0, 10, -1}, - /* S */ {0, -4, -3, -4, -4, -6, -2, -6, -7, INT8_MAX, -4, -8, -5, 0, INT8_MAX, -2, -5, -3, 6, 0, INT8_MAX, -6, -5, 0, -7, -5}, - /* T */ {-1, -5, -8, -5, -6, -9, -6, -7, -2, INT8_MAX, -3, -5, -4, -2, INT8_MAX, -4, -5, -6, 0, 7, INT8_MAX, -3,-13, 0, -6, -4}, - /* I */ {INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX,}, - /* V */ {-2, -9, -6, -8, -6, -8, -5, -6, 2, INT8_MAX, -9, 0, -1, -8, INT8_MAX, -6, -7, -8, -6, -3, INT8_MAX, 7,-15, 0, -7, -8}, - /* W */ {-13,-7,-15,-15,-17, -4,-15, -7,-14, INT8_MAX,-12,-10,-13, -8, INT8_MAX,-14,-13, -2, -5,-13, INT8_MAX,-15, 13, 0, -5,-13}, - /* X */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0, INT8_MAX, 0, 0, 0, 0, 0}, - /* Y */ {-8,-10, -4,-11, -8, 2,-14, -3, -6, INT8_MAX, -9, -7,-11, -4, INT8_MAX,-13,-12,-10, -7, -6, INT8_MAX, -7, -5, 0, 10,-11}, - /* Z */ {-6, 1,-14, -3, -2,-14, -7, -3, -7, INT8_MAX, 4, -7, -3, -2, INT8_MAX, -5, 4, -1, -5, -4, INT8_MAX, -8,-13, 0,-11, 4} - - }; - NeedlemanWunsch::NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix matrix, int penalty) { setMatrix(matrix); setPenalty(penalty); +} +NeedlemanWunsch::NeedlemanWunsch() +{ + setMatrix(ScoringMatrix::PAM30MS); + setPenalty(5); } +const vector NeedlemanWunsch::NamesOfScoringMatrices = {"identity", "PAM30MS"}; + void NeedlemanWunsch::setMatrix(const NeedlemanWunsch::ScoringMatrix& matrix) { - if (matrix == ScoringMatrix::identity) - { - matrixPtr_ = &adaptedIdentity; - } - - else if (matrix == ScoringMatrix::PAM30MS) - { - matrixPtr_ = &PAM30MS; - } + my_matrix_ = static_cast(matrix); } void NeedlemanWunsch::setMatrix(const std::string& matrix) { - auto first = &validMatrices_[0]; - auto last = &validMatrices_[2]; + auto first = &NamesOfScoringMatrices[0]; + auto last = &NamesOfScoringMatrices[static_cast(ScoringMatrix::SIZE_OF_SCORINGMATRIX)]; const auto it = std::find(first, last, matrix); if (it == last) { - String msg = "Matrix is not known! Valid choices are: " - "'identity', 'PAM30MS'."; + String msg = "Matrix is not known! Valid choices are: "+ + ListUtils::concatenate(NamesOfScoringMatrices, ", "); throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, msg); } @@ -111,111 +113,50 @@ void NeedlemanWunsch::setMatrix(const std::string& matrix) } - -void NeedlemanWunsch::setPenalty(const int& penalty) +void NeedlemanWunsch::setPenalty(const int penalty) { - gapPenalty_ = penalty; + gap_penalty_ = penalty; } NeedlemanWunsch::ScoringMatrix NeedlemanWunsch::getMatrix() const { - if (*matrixPtr_ == adaptedIdentity) - { - return ScoringMatrix::identity; - } - else - { return ScoringMatrix::PAM30MS; - } + } int NeedlemanWunsch::getPenalty() const { - return gapPenalty_; + return gap_penalty_; } -int NeedlemanWunsch::align(const String& seq1, const String& seq2) //vollständige matrix 26.59 CPU +int NeedlemanWunsch::align(const String& seq1, const String& seq2) { - seq1len_ = seq1.length(); - seq2len_ = seq2.length(); - - vector matrix((seq1len_+1)*(seq2len_+1), 0);//matrix mit 0en initialisieren - for (unsigned i = 1; i <= seq1len_; ++i) //vertikale mit gapkkosten initialisieren - matrix[i*(seq2len_+1)]=i*(-gapPenalty_); - for (unsigned i =0; i<=seq2len_;++i)//horizontale mit gapkosten initialieren - matrix[i]=i*(-gapPenalty_); - for (unsigned i=1;i<=seq1len_;++i) - { - for (unsigned j=1;j<=seq2len_;++j) - { - matrix[i*(seq2len_ +1)+j]=max(max((matrix[i*(seq2len_+1)+j-1]-gapPenalty_), (matrix[(i-1)*(seq2len_+1)+j]-gapPenalty_)), - (matrix[(i-1)*(seq2len_+1)+j-1])+ (*matrixPtr_)[seq1[i-1] - 'A'] [seq2[j-1] - 'A']); - } - } - return matrix[(seq1len_+1)*(seq2len_+1)-1]; -} + seq1_len_ = seq1.length(); + seq2_len_ = seq2.length(); -/* -//linear space (2 Zeilen) //seit vectoren member sind: munmap_chunk(): invalid pointer - int NeedlemanWunsch::align(const String& seq1, const String& seq2) - { - seq1len_ = seq1.length(); - seq2len_ = seq2.length(); - - firstRow_.resize(seq1len_); - secondRow_.resize(seq2len_+1); - vector* firstRowPtr = &firstRow_; - vector* secondRowPtr = &secondRow_; - - - for (unsigned i = 0; i <= seq2len_; ++i)//horizontale mit gapkosten initialieren - { - firstRow_[i] = i * ((-1 * gapPenalty_)); - } - - for (unsigned i = 1;i <= seq1len_; ++i) //second row berechnen und swappen - { - (*secondRowPtr)[0] = i * ((-1 * gapPenalty_)); //erster wert in der zeile mit gapkosten - for (unsigned j = 1; j <= seq2len_; ++j) //secondRow berechnen - { - (*secondRowPtr)[j] = (max(max(((*secondRowPtr)[j-1] - gapPenalty_), ((*firstRowPtr)[j] - gapPenalty_)), - ((*firstRowPtr)[j-1]) + (*matrixPtr_)[seq1[i-1] - 'A'] [seq2[j-1] - 'A']));//[getIndex_(seq1[i-1], seq2[j-1])]));//statt getIndex: [seq1[i-1] - 'A'] [seq2[j-1] - 'A'] und matrix entsprechend aufbauen - } - swap(firstRowPtr, secondRowPtr); - } - return (*firstRowPtr)[seq2len_]; - } - */ - -/* - int NeedlemanWunsch::align(const String& seq1, const String& seq2) //25.30 s CPU - { - seq1len_ = seq1.length(); - seq2len_ = seq2.length(); - - firstRow_.resize(seq2len_+1); // both rows have the same length - secondRow_.resize(seq2len_+1); + first_row_.resize(seq2_len_+1); // both rows have the same length + second_row_.resize(seq2_len_+1); - int* firstRowPtr = &(firstRow_[0]); - int* secondRowPtr = &(secondRow_[0]); + int* firstRowPtr = &(first_row_[0]); + int* secondRowPtr = &(second_row_[0]); + int (*matrix_ptr)[26][26] = &matrices[my_matrix_]; - for (unsigned i = 0; i <= seq2len_; ++i)//horizontale mit gapkosten initialieren - { - firstRow_[i] = i * (-gapPenalty_); - } + for (unsigned i = 0; i <= seq2_len_; ++i) // initialize using gap-penalty + { + first_row_[i] = i * (-gap_penalty_); + } - for (unsigned i = 1;i <= seq1len_; ++i) //second row berechnen und swappen + for (unsigned i = 1;i <= seq1_len_; ++i) + { + (*secondRowPtr) = i * (-gap_penalty_); // the first value in a row + for (unsigned j = 1; j <= seq2_len_; ++j) { - (*secondRowPtr) = i * (-gapPenalty_); //erster wert in der zeile mit gapkosten //second row pointer muss auf die erste stelle zeigen - for (unsigned j = 1; j <= seq2len_; ++j) //secondRow berechnen - { - (*(secondRowPtr+j)) = max(max(((*(secondRowPtr+j-1)) - gapPenalty_), ((*(firstRowPtr+j)) - gapPenalty_)), - ((*(firstRowPtr+j-1)) + (*matrixPtr_)[seq1[i-1] - 'A'] [seq2[j-1] - 'A'])); - } - swap(firstRowPtr, secondRowPtr); + (*(secondRowPtr+j)) = max(max(((*(secondRowPtr+j-1)) - gap_penalty_), ((*(firstRowPtr+j)) - gap_penalty_)), + ((*(firstRowPtr+j-1)) + (*matrix_ptr)[seq1[i-1] - 'A'] [seq2[j-1] - 'A'])); } - return (*(firstRowPtr+seq2len_)); + swap(firstRowPtr, secondRowPtr); } -*/ + return (*(firstRowPtr+seq2_len_)); +} } \ No newline at end of file diff --git a/src/tests/class_tests/openms/source/NeedlemanWunsch_test.cpp b/src/tests/class_tests/openms/source/NeedlemanWunsch_test.cpp index 210fb7799e9..5b492669ed8 100644 --- a/src/tests/class_tests/openms/source/NeedlemanWunsch_test.cpp +++ b/src/tests/class_tests/openms/source/NeedlemanWunsch_test.cpp @@ -32,34 +32,36 @@ String seq2 = "IGGATLIGALDQVVAQQAHVHL"; START_SECTION(double align(const String& seq1, const String& seq2)) { - NeedlemanWunsch object = NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix::identity, 5); - TEST_EQUAL(object.align(seq1, seq2), 1); - TEST_EQUAL(object.align(seq1, seq1), 19); - TEST_EQUAL(object.align(seq2, seq2), 22); + NeedlemanWunsch alignment = NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix::identity, 5); + TEST_EQUAL(alignment.align(seq1, seq2), 1); + TEST_EQUAL(alignment.align(seq1, seq1), 19); + TEST_EQUAL(alignment.align(seq2, seq2), 22); } END_SECTION START_SECTION(void setMatrix(const ScoringMatrix& matrix)) { - NeedlemanWunsch object = NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix::identity, 5); - object.setMatrix(NeedlemanWunsch::ScoringMatrix::PAM30MS); - TEST_EQUAL(object.align(seq1, seq2), 93); - TEST_EQUAL(object.align(seq1, seq1), 131); - TEST_EQUAL(object.align(seq2, seq2), 151); - - object.setMatrix("identity"); - TEST_EQUAL(object.align(seq1, seq2), 1); - TEST_EQUAL(object.align(seq1, seq1), 19); - TEST_EQUAL(object.align(seq2, seq2), 22); + NeedlemanWunsch alignment = NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix::identity, 5); + alignment.setMatrix(NeedlemanWunsch::ScoringMatrix::PAM30MS); + TEST_EQUAL(alignment.align(seq1, seq2), 93); + TEST_EQUAL(alignment.align(seq1, seq1), 131); + TEST_EQUAL(alignment.align(seq2, seq2), 151); + + TEST_EXCEPTION(Exception::IllegalArgument, alignment.setMatrix("Identity")) + + alignment.setMatrix("identity"); + TEST_EQUAL(alignment.align(seq1, seq2), 1); + TEST_EQUAL(alignment.align(seq1, seq1), 19); + TEST_EQUAL(alignment.align(seq2, seq2), 22); } END_SECTION START_SECTION(void setPenalty(const ScoringMatrix& matrix)) { - NeedlemanWunsch object = NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix::PAM30MS, 5); - object.setPenalty(1); - TEST_EQUAL(object.align(seq1, seq2), 113); - TEST_EQUAL(object.getPenalty(), 1); + NeedlemanWunsch alignment = NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix::PAM30MS, 5); + alignment.setPenalty(1); + TEST_EQUAL(alignment.align(seq1, seq2), 113); + TEST_EQUAL(alignment.getPenalty(), 1); } END_SECTION From da20bc32446a895da938321edad3511e7c808c4f Mon Sep 17 00:00:00 2001 From: Nora Wild Date: Wed, 26 May 2021 18:28:53 +0200 Subject: [PATCH 52/53] getMatrix() --- src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp index 80493d7797c..adfbbdd0a9a 100644 --- a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp +++ b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp @@ -120,7 +120,7 @@ void NeedlemanWunsch::setPenalty(const int penalty) NeedlemanWunsch::ScoringMatrix NeedlemanWunsch::getMatrix() const { - return ScoringMatrix::PAM30MS; + return static_cast(my_matrix_); } From 42733fc2799a602ea4bb8fd3d229daa7ab187172 Mon Sep 17 00:00:00 2001 From: kasrat93 Date: Wed, 26 May 2021 20:41:39 +0200 Subject: [PATCH 53/53] =?UTF-8?q?Einr=C3=BCckung?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp | 265 +++++++++--------- 1 file changed, 132 insertions(+), 133 deletions(-) diff --git a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp index adfbbdd0a9a..5d25cf60ae8 100644 --- a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp +++ b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp @@ -8,155 +8,154 @@ using namespace std; namespace OpenMS { - static int matrices[static_cast(NeedlemanWunsch::ScoringMatrix::SIZE_OF_SCORINGMATRIX)][26][26] - { - //identity - { - // A B C D E F G H I J K L M N O P Q R S T U V W X Y Z - /* A */ {1, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, - /* B */ {0, 1, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, - /* C */ {0, 0, 1, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, - /* D */ {0, 0, 0, 1, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, - /* E */ {0, 0, 0, 0, 1, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, - /* F */ {0, 0, 0, 0, 0, 1, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, - /* G */ {0, 0, 0, 0, 0, 0, 1, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, - /* H */ {0, 0, 0, 0, 0, 0, 0, 1, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, - /* I */ {0, 0, 0, 0, 0, 0, 0, 0, 1, INT16_MAX, 0, 1, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, - /* J */ {INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX,}, - /* K */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 1, 0, 0, 0, INT16_MAX, 0, 1, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, - /* L */ {0, 0, 0, 0, 0, 0, 0, 0, 1, INT16_MAX, 0, 1, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, - /* M */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 1, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, - /* N */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 1, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, - /* O */ {INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX,}, - /* P */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 1, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, - /* Q */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 1, 0, 0, 0, INT16_MAX, 0, 1, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, - /* R */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 1, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, - /* S */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 1, 0, INT16_MAX, 0, 0, 0, 0, 0}, - /* T */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 1, INT16_MAX, 0, 0, 0, 0, 0}, - /* U */ {INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX,}, - /* V */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 1, 0, 0, 0, 0}, - /* W */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 1, 0, 0, 0}, - /* X */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, - /* Y */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 1, 0}, - /* Z */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX,0, 0, 0, 0, 1} - - }, - - //PAM30MS - { - - // A B C D E F G H I J K L M N O P Q R S T U V W X Y Z - /* A */ {6, -7, -6, -3, -2, -8, -2, -7, -5, INT16_MAX, -7, -6, -5, -4, INT16_MAX, -2, -4, -7, 0, -1, INT16_MAX, -2,-13, 0, -8, -6}, - /* B */ {-7, 5,-11, -7, -7,-12, -8, -4, -6, INT16_MAX, 5, -7, -3, -4, INT16_MAX, -5, -3, 5, -4, -5, INT16_MAX, -9, -7, 0,-10, 1}, - /* C */ {-6,-11, 10,-14,-14,-13, -9, -7, -6, INT16_MAX,-14,-11,-13,-11, INT16_MAX, -8,-14, -8, -3, -8, INT16_MAX, -6,-15, 0, -4,-14}, - /* D */ {-3, -7,-14, 8, 2,-15, -3, -4, -7, INT16_MAX, -4,-10,-11, 2, INT16_MAX, -8, -2,-10, -4, -5, INT16_MAX, -8,-15, 0,-11, -3}, - /* E */ {-2, -7,-14, 2, 8,-14, -4, -5, -5, INT16_MAX, -4, -7, -7, -2, INT16_MAX, -5, 1, -9, -4, -6, INT16_MAX, -6,-17, 0, -8, -2}, - /* F */ {-8,-12,-13,-15,-14, 9, -9, -6, -2, INT16_MAX,-14, -3, -4, -9, INT16_MAX,-10,-13, -9, -6, -9, INT16_MAX, -8, -4, 0, 2,-14}, - /* G */ {-2, -8, -9, -3, -4, -9, 6, -9,-11, INT16_MAX, -7,-11, -8, -3, INT16_MAX, -6, -7, -9, -2, -6, INT16_MAX, -5,-15, 0,-14, -7}, - /* H */ {-7, -4, -7, -4, -5, -6, -9, 9, -9, INT16_MAX, -6, -8,-10, 0, INT16_MAX, -4, 1, -2, -6, -7, INT16_MAX, -6, -7, 0, -3, -3}, - /* I */ {-5, -6, -6, -7, -5, -2,-11, -9, 8, INT16_MAX, -6, 5, -1, -5, INT16_MAX, -8, -8, -5, -7, -2, INT16_MAX, 2,-14, 0, -6, -7}, - /* J */ {INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX,}, - /* K */ {-7, 5,-14, -4, -4,-14, -7, -6, -6, INT16_MAX, 7, -7, -2, -1, INT16_MAX, -6, -3, 0, -4, -3, INT16_MAX, -9,-12, 0, -9, 4}, - /* L */ {-6, -7,-11,-10, -7, -3,-11, -8, 5, INT16_MAX, -7, 5, 0, -6, INT16_MAX, -8, -7, -7, -8, -5, INT16_MAX, 0,-10, 0, -7, -7}, - /* M */ {-5, -3,-13,-11, -7, -4, -8,-10, -1, INT16_MAX, -2, 0, 11, -9, INT16_MAX, -8, -4, -4, -5, -4, INT16_MAX, -1,-13, 0,-11, -3}, - /* N */ {-4, -4,-11, 2, -2, -9, -3, 0, -5, INT16_MAX, -1, -6, -9, 8, INT16_MAX, -6, -3, -6, 0, -2, INT16_MAX, -8, -8, 0, -4, -2}, - /* O */ {INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX,}, - /* P */ {-2, -5, -8, -8, -5,-10, -6, -4, -8, INT16_MAX, -6, -8, -8, -6, INT16_MAX, 8, -3, -4, -2, -4, INT16_MAX, -6,-14, 0,-13, -5}, - /* Q */ {-4, -3,-14, -2, 1,-13, -7, 1, -8, INT16_MAX, -3, -7, -4, -3, INT16_MAX, -3, 8, -2, -5, -5, INT16_MAX, -7,-13, 0,-12, 4}, - /* R */ {-7, 5, -8,-10, -9, -9, -9, -2, -5, INT16_MAX, 0, -7, -4, -6, INT16_MAX, -4, -2, 8, -3, -6, INT16_MAX, -8, -2, 0, 10, -1}, - /* S */ {0, -4, -3, -4, -4, -6, -2, -6, -7, INT16_MAX, -4, -8, -5, 0, INT16_MAX, -2, -5, -3, 6, 0, INT16_MAX, -6, -5, 0, -7, -5}, - /* T */ {-1, -5, -8, -5, -6, -9, -6, -7, -2, INT16_MAX, -3, -5, -4, -2, INT16_MAX, -4, -5, -6, 0, 7, INT16_MAX, -3,-13, 0, -6, -4}, - /* I */ {INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX,}, - /* V */ {-2, -9, -6, -8, -6, -8, -5, -6, 2, INT16_MAX, -9, 0, -1, -8, INT16_MAX, -6, -7, -8, -6, -3, INT16_MAX, 7,-15, 0, -7, -8}, - /* W */ {-13,-7,-15,-15,-17, -4,-15, -7,-14, INT16_MAX,-12,-10,-13, -8, INT16_MAX,-14,-13, -2, -5,-13, INT16_MAX,-15, 13, 0, -5,-13}, - /* X */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, - /* Y */ {-8,-10, -4,-11, -8, 2,-14, -3, -6, INT16_MAX, -9, -7,-11, -4, INT16_MAX,-13,-12,-10, -7, -6, INT16_MAX, -7, -5, 0, 10,-11}, - /* Z */ {-6, 1,-14, -3, -2,-14, -7, -3, -7, INT16_MAX, 4, -7, -3, -2, INT16_MAX, -5, 4, -1, -5, -4, INT16_MAX, -8,-13, 0,-11, 4} - } - }; - - - - -NeedlemanWunsch::NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix matrix, int penalty) -{ - setMatrix(matrix); - setPenalty(penalty); -} + static int matrices[static_cast(NeedlemanWunsch::ScoringMatrix::SIZE_OF_SCORINGMATRIX)][26][26] + { + //identity + { + // A B C D E F G H I J K L M N O P Q R S T U V W X Y Z + /* A */ {1, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* B */ {0, 1, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* C */ {0, 0, 1, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* D */ {0, 0, 0, 1, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* E */ {0, 0, 0, 0, 1, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* F */ {0, 0, 0, 0, 0, 1, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* G */ {0, 0, 0, 0, 0, 0, 1, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* H */ {0, 0, 0, 0, 0, 0, 0, 1, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* I */ {0, 0, 0, 0, 0, 0, 0, 0, 1, INT16_MAX, 0, 1, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* J */ {INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX,}, + /* K */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 1, 0, 0, 0, INT16_MAX, 0, 1, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* L */ {0, 0, 0, 0, 0, 0, 0, 0, 1, INT16_MAX, 0, 1, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* M */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 1, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* N */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 1, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* O */ {INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX,}, + /* P */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 1, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* Q */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 1, 0, 0, 0, INT16_MAX, 0, 1, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* R */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 1, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* S */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 1, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* T */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 1, INT16_MAX, 0, 0, 0, 0, 0}, + /* U */ {INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX,}, + /* V */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 1, 0, 0, 0, 0}, + /* W */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 1, 0, 0, 0}, + /* X */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* Y */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 1, 0}, + /* Z */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 1} + + }, + + //PAM30MS + { + // A B C D E F G H I J K L M N O P Q R S T U V W X Y Z + /* A */ {6, -7, -6, -3, -2, -8, -2, -7, -5, INT16_MAX, -7, -6, -5, -4, INT16_MAX, -2, -4, -7, 0, -1, INT16_MAX, -2,-13, 0, -8, -6}, + /* B */ {-7, 5,-11, -7, -7,-12, -8, -4, -6, INT16_MAX, 5, -7, -3, -4, INT16_MAX, -5, -3, 5, -4, -5, INT16_MAX, -9, -7, 0,-10, 1}, + /* C */ {-6,-11, 10,-14,-14,-13, -9, -7, -6, INT16_MAX,-14,-11,-13,-11, INT16_MAX, -8,-14, -8, -3, -8, INT16_MAX, -6,-15, 0, -4,-14}, + /* D */ {-3, -7,-14, 8, 2,-15, -3, -4, -7, INT16_MAX, -4,-10,-11, 2, INT16_MAX, -8, -2,-10, -4, -5, INT16_MAX, -8,-15, 0,-11, -3}, + /* E */ {-2, -7,-14, 2, 8,-14, -4, -5, -5, INT16_MAX, -4, -7, -7, -2, INT16_MAX, -5, 1, -9, -4, -6, INT16_MAX, -6,-17, 0, -8, -2}, + /* F */ {-8,-12,-13,-15,-14, 9, -9, -6, -2, INT16_MAX,-14, -3, -4, -9, INT16_MAX,-10,-13, -9, -6, -9, INT16_MAX, -8, -4, 0, 2,-14}, + /* G */ {-2, -8, -9, -3, -4, -9, 6, -9,-11, INT16_MAX, -7,-11, -8, -3, INT16_MAX, -6, -7, -9, -2, -6, INT16_MAX, -5,-15, 0,-14, -7}, + /* H */ {-7, -4, -7, -4, -5, -6, -9, 9, -9, INT16_MAX, -6, -8,-10, 0, INT16_MAX, -4, 1, -2, -6, -7, INT16_MAX, -6, -7, 0, -3, -3}, + /* I */ {-5, -6, -6, -7, -5, -2,-11, -9, 8, INT16_MAX, -6, 5, -1, -5, INT16_MAX, -8, -8, -5, -7, -2, INT16_MAX, 2,-14, 0, -6, -7}, + /* J */ {INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX,}, + /* K */ {-7, 5,-14, -4, -4,-14, -7, -6, -6, INT16_MAX, 7, -7, -2, -1, INT16_MAX, -6, -3, 0, -4, -3, INT16_MAX, -9,-12, 0, -9, 4}, + /* L */ {-6, -7,-11,-10, -7, -3,-11, -8, 5, INT16_MAX, -7, 5, 0, -6, INT16_MAX, -8, -7, -7, -8, -5, INT16_MAX, 0,-10, 0, -7, -7}, + /* M */ {-5, -3,-13,-11, -7, -4, -8,-10, -1, INT16_MAX, -2, 0, 11, -9, INT16_MAX, -8, -4, -4, -5, -4, INT16_MAX, -1,-13, 0,-11, -3}, + /* N */ {-4, -4,-11, 2, -2, -9, -3, 0, -5, INT16_MAX, -1, -6, -9, 8, INT16_MAX, -6, -3, -6, 0, -2, INT16_MAX, -8, -8, 0, -4, -2}, + /* O */ {INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX,}, + /* P */ {-2, -5, -8, -8, -5,-10, -6, -4, -8, INT16_MAX, -6, -8, -8, -6, INT16_MAX, 8, -3, -4, -2, -4, INT16_MAX, -6,-14, 0,-13, -5}, + /* Q */ {-4, -3,-14, -2, 1,-13, -7, 1, -8, INT16_MAX, -3, -7, -4, -3, INT16_MAX, -3, 8, -2, -5, -5, INT16_MAX, -7,-13, 0,-12, 4}, + /* R */ {-7, 5, -8,-10, -9, -9, -9, -2, -5, INT16_MAX, 0, -7, -4, -6, INT16_MAX, -4, -2, 8, -3, -6, INT16_MAX, -8, -2, 0, 10, -1}, + /* S */ {0, -4, -3, -4, -4, -6, -2, -6, -7, INT16_MAX, -4, -8, -5, 0, INT16_MAX, -2, -5, -3, 6, 0, INT16_MAX, -6, -5, 0, -7, -5}, + /* T */ {-1, -5, -8, -5, -6, -9, -6, -7, -2, INT16_MAX, -3, -5, -4, -2, INT16_MAX, -4, -5, -6, 0, 7, INT16_MAX, -3,-13, 0, -6, -4}, + /* I */ {INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX,}, + /* V */ {-2, -9, -6, -8, -6, -8, -5, -6, 2, INT16_MAX, -9, 0, -1, -8, INT16_MAX, -6, -7, -8, -6, -3, INT16_MAX, 7,-15, 0, -7, -8}, + /* W */ {-13,-7,-15,-15,-17, -4,-15, -7,-14, INT16_MAX,-12,-10,-13, -8, INT16_MAX,-14,-13, -2, -5,-13, INT16_MAX,-15, 13, 0, -5,-13}, + /* X */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* Y */ {-8,-10, -4,-11, -8, 2,-14, -3, -6, INT16_MAX, -9, -7,-11, -4, INT16_MAX,-13,-12,-10, -7, -6, INT16_MAX, -7, -5, 0, 10,-11}, + /* Z */ {-6, 1,-14, -3, -2,-14, -7, -3, -7, INT16_MAX, 4, -7, -3, -2, INT16_MAX, -5, 4, -1, -5, -4, INT16_MAX, -8,-13, 0,-11, 4} + } + }; + + + NeedlemanWunsch::NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix matrix, int penalty) + { + setMatrix(matrix); + setPenalty(penalty); + } -NeedlemanWunsch::NeedlemanWunsch() -{ - setMatrix(ScoringMatrix::PAM30MS); - setPenalty(5); -} + NeedlemanWunsch::NeedlemanWunsch() + { + setMatrix(ScoringMatrix::PAM30MS); + setPenalty(5); + } -const vector NeedlemanWunsch::NamesOfScoringMatrices = {"identity", "PAM30MS"}; + const vector NeedlemanWunsch::NamesOfScoringMatrices = {"identity", "PAM30MS"}; -void NeedlemanWunsch::setMatrix(const NeedlemanWunsch::ScoringMatrix& matrix) -{ - my_matrix_ = static_cast(matrix); -} + void NeedlemanWunsch::setMatrix(const NeedlemanWunsch::ScoringMatrix& matrix) + { + my_matrix_ = static_cast(matrix); + } -void NeedlemanWunsch::setMatrix(const std::string& matrix) -{ - auto first = &NamesOfScoringMatrices[0]; - auto last = &NamesOfScoringMatrices[static_cast(ScoringMatrix::SIZE_OF_SCORINGMATRIX)]; - const auto it = std::find(first, last, matrix); - if (it == last) + void NeedlemanWunsch::setMatrix(const std::string& matrix) { - String msg = "Matrix is not known! Valid choices are: "+ - ListUtils::concatenate(NamesOfScoringMatrices, ", "); - throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, + auto first = &NamesOfScoringMatrices[0]; + auto last = &NamesOfScoringMatrices[static_cast(ScoringMatrix::SIZE_OF_SCORINGMATRIX)]; + const auto it = std::find(first, last, matrix); + if (it == last) + { + String msg = "Matrix is not known! Valid choices are: "+ + ListUtils::concatenate(NamesOfScoringMatrices, ", "); + throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, msg); + } + setMatrix(static_cast(it - first)); } - setMatrix(static_cast(it - first)); -} - -void NeedlemanWunsch::setPenalty(const int penalty) -{ - gap_penalty_ = penalty; -} - -NeedlemanWunsch::ScoringMatrix NeedlemanWunsch::getMatrix() const -{ - return static_cast(my_matrix_); - -} -int NeedlemanWunsch::getPenalty() const -{ - return gap_penalty_; -} - -int NeedlemanWunsch::align(const String& seq1, const String& seq2) -{ - seq1_len_ = seq1.length(); - seq2_len_ = seq2.length(); + void NeedlemanWunsch::setPenalty(const int penalty) + { + gap_penalty_ = penalty; + } - first_row_.resize(seq2_len_+1); // both rows have the same length - second_row_.resize(seq2_len_+1); - int* firstRowPtr = &(first_row_[0]); - int* secondRowPtr = &(second_row_[0]); + NeedlemanWunsch::ScoringMatrix NeedlemanWunsch::getMatrix() const + { + return static_cast(my_matrix_); + } - int (*matrix_ptr)[26][26] = &matrices[my_matrix_]; - for (unsigned i = 0; i <= seq2_len_; ++i) // initialize using gap-penalty + int NeedlemanWunsch::getPenalty() const { - first_row_[i] = i * (-gap_penalty_); + return gap_penalty_; } - for (unsigned i = 1;i <= seq1_len_; ++i) + int NeedlemanWunsch::align(const String& seq1, const String& seq2) { - (*secondRowPtr) = i * (-gap_penalty_); // the first value in a row - for (unsigned j = 1; j <= seq2_len_; ++j) - { - (*(secondRowPtr+j)) = max(max(((*(secondRowPtr+j-1)) - gap_penalty_), ((*(firstRowPtr+j)) - gap_penalty_)), - ((*(firstRowPtr+j-1)) + (*matrix_ptr)[seq1[i-1] - 'A'] [seq2[j-1] - 'A'])); - } - swap(firstRowPtr, secondRowPtr); + seq1_len_ = seq1.length(); + seq2_len_ = seq2.length(); + + first_row_.resize(seq2_len_+1); // both rows have the same length + second_row_.resize(seq2_len_+1); + + int* firstRowPtr = &(first_row_[0]); + int* secondRowPtr = &(second_row_[0]); + + int (*matrix_ptr)[26][26] = &matrices[my_matrix_]; + + for (unsigned i = 0; i <= seq2_len_; ++i) // initialize using gap-penalty + { + first_row_[i] = i * (-gap_penalty_); + } + + for (unsigned i = 1;i <= seq1_len_; ++i) + { + (*secondRowPtr) = i * (-gap_penalty_); // the first value in a row + for (unsigned j = 1; j <= seq2_len_; ++j) + { + (*(secondRowPtr+j)) = max(max(((*(secondRowPtr+j-1)) - gap_penalty_), ((*(firstRowPtr+j)) - gap_penalty_)), + ((*(firstRowPtr+j-1)) + (*matrix_ptr)[seq1[i-1] - 'A'] [seq2[j-1] - 'A'])); + } + swap(firstRowPtr, secondRowPtr); + } + return (*(firstRowPtr+seq2_len_)); } - return (*(firstRowPtr+seq2_len_)); -} + } \ No newline at end of file