diff --git a/AUTHORS b/AUTHORS index 716b461465a..1adb77152d1 100644 --- a/AUTHORS +++ b/AUTHORS @@ -1,3 +1,4 @@ + ========================================================================= OpenMS -- Open Source Mass Spectrometry ========================================================================= @@ -86,6 +87,7 @@ the authors tag in the respective file header. - Swenja Wagner - Taraneh Strunk - Timo Sachsenberg + - Tinatin Kasradze - Tom Lukas Lankenau - Tom Waschischeck - Uwe Schmitt @@ -93,3 +95,6 @@ the authors tag in the respective file header. - Volker Mosthaf - Witold Wolski - Xiao Liang + + + diff --git a/src/openms/include/OpenMS/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.h b/src/openms/include/OpenMS/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.h index 742206e91e7..c0e37633be2 100644 --- a/src/openms/include/OpenMS/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.h +++ b/src/openms/include/OpenMS/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.h @@ -35,113 +35,7 @@ #pragma once #include -#include - -// Extend SeqAn by a user-define scoring matrix. -namespace seqan -{ - - // We have to create a new specialization of the _ScoringMatrix class - // for amino acids. For this, we first create a new tag. - struct PAM30MS {}; // PAM30MS matrix - struct AdaptedIdentity {}; // identity matrix adapted for I/L, Q/K ambiguity - - // Then, we specialize the class _ScoringMatrix. - template <> - struct ScoringMatrixData_ - { - enum - { - VALUE_SIZE = ValueSize::VALUE, - TAB_SIZE = VALUE_SIZE * VALUE_SIZE - }; - static inline const int* getData() - { - // Rant: I cannot find a primary source for the PAM30MS scoring matrix! - // It seems to have been first published in Huang et al., JBC 2001 - // (http://www.jbc.org/content/276/30/28327), but the paper does not show - // the actual matrix (gah!). - // The matrix here comes from old OpenMS code and also matches this one: - // http://proteomics.fiocruz.br/supplementaryfiles/pepexplorer/BeforeRevision/PFUGridResults/PFUGridSearch/pam30ms.txt - - static const int _data[TAB_SIZE] = - { - // A R N D C Q E G H I L K M F P S T W Y V B Z X * - /* A */ 6, -7, -4, -3, -6, -4, -2, -2, -7, -5, -6, -7, -5, -8, -2, 0, -1,-13, -8, -2, -7, -6, 0,-17, - /* R */ -7, 8, -6,-10, -8, -2, -9, -9, -2, -5, -7, 0, -4, -9, -4, -3, -6, -2,-10, -8, 5, -1, 0,-17, - /* N */ -4, -6, 8, 2,-11, -3, -2, -3, 0, -5, -6, -1, -9, -9, -6, 0, -2, -8, -4, -8, -4, -2, 0,-17, - /* D */ -3,-10, 2, 8,-14, -2, 2, -3, -4, -7,-10, -4,-11,-15, -8, -4, -5,-15,-11, -8, -7, -3, 0,-17, - /* C */ -6, -8,-11,-14, 10,-14,-14, -9, -7, -6,-11,-14,-13,-13, -8, -3, -8,-15, -4, -6,-11,-14, 0,-17, - /* Q */ -4, -2, -3, -2,-14, 8, 1, -7, 1, -8, -7, -3, -4,-13, -3, -5, -5,-13,-12, -7, -3, 4, 0,-17, - /* E */ -2, -9, -2, 2,-14, 1, 8, -4, -5, -5, -7, -4, -7,-14, -5, -4, -6,-17, -8, -6, -7, -2, 0,-17, - /* G */ -2, -9, -3, -3, -9, -7, -4, 6, -9,-11,-11, -7, -8, -9, -6, -2, -6,-15,-14, -5, -8, -7, 0,-17, - /* H */ -7, -2, 0, -4, -7, 1, -5, -9, 9, -9, -8, -6,-10, -6, -4, -6, -7, -7, -3, -6, -4, -3, 0,-17, - /* I */ -5, -5, -5, -7, -6, -8, -5,-11, -9, 8, 5, -6, -1, -2, -8, -7, -2,-14, -6, 2, -6, -7, 0,-17, - /* L */ -6, -7, -6,-10,-11, -7, -7,-11, -8, 5, 5, -7, 0, -3, -8, -8, -5,-10, -7, 0, -7, -7, 0,-17, - /* K */ -7, 0, -1, -4,-14, -3, -4, -7, -6, -6, -7, 7, -2,-14, -6, -4, -3,-12, -9, -9, 5, 4, 0,-17, - /* M */ -5, -4, -9,-11,-13, -4, -7, -8,-10, -1, 0, -2, 11, -4, -8, -5, -4,-13,-11, -1, -3, -3, 0,-17, - /* F */ -8, -9, -9,-15,-13,-13,-14, -9, -6, -2, -3,-14, -4, 9,-10, -6, -9, -4, 2, -8,-12,-14, 0,-17, - /* P */ -2, -4, -6, -8, -8, -3, -5, -6, -4, -8, -8, -6, -8,-10, 8, -2, -4,-14,-13, -6, -5, -5, 0,-17, - /* S */ 0, -3, 0, -4, -3, -5, -4, -2, -6, -7, -8, -4, -5, -6, -2, 6, 0, -5, -7, -6, -4, -5, 0,-17, - /* T */ -1, -6, -2, -5, -8, -5, -6, -6, -7, -2, -5, -3, -4, -9, -4, 0, 7,-13, -6, -3, -5, -4, 0,-17, - /* W */ -13, -2, -8,-15,-15,-13,-17,-15, -7,-14,-10,-12,-13, -4,-14, -5,-13, 13, -5,-15, -7,-13, 0,-17, - /* Y */ -8,-10, -4,-11, -4,-12, -8,-14, -3, -6, -7, -9,-11, 2,-13, -7, -6, -5, 10, -7,-10,-11, 0,-17, - /* V */ -2, -8, -8, -8, -6, -7, -6, -5, -6, 2, 0, -9, -1, -8, -6, -6, -3,-15, -7, 7, -9, -8, 0,-17, - /* B */ -7, 5, -4, -7,-11, -3, -7, -8, -4, -6, -7, 5, -3,-12, -5, -4, -5, -7,-10, -9, 5, 1, 0,-17, - /* Z */ -6, -1, -2, -3,-14, 4, -2, -7, -3, -7, -7, 4, -3,-14, -5, -5, -4,-13,-11, -8, 1, 4, 0,-17, - /* X */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,-17, - /* * */ -17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17,-17, 1 - }; - - return _data; - } - }; - - template <> - struct ScoringMatrixData_ - { - enum - { - VALUE_SIZE = ValueSize::VALUE, - TAB_SIZE = VALUE_SIZE * VALUE_SIZE - }; - static inline const int* getData() - { - static const int _data[TAB_SIZE] = - { - // A R N D C Q E G H I L K M F P S T W Y V B Z X * - /* A */ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* R */ 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* N */ 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* D */ 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* C */ 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* Q */ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* E */ 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* G */ 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* H */ 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* I */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* L */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* K */ 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* M */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* P */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* S */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, -17, - /* T */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, -17, - /* W */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, -17, - /* Y */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, -17, - /* V */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, -17, - /* B */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, -17, - /* Z */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, -17, - /* X */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -17, - /* * */ -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, -17, 1 - }; - - return _data; - } - }; - -} // namespace seqan - +#include namespace OpenMS { @@ -161,18 +55,10 @@ namespace OpenMS /// Default constructor ConsensusIDAlgorithmPEPMatrix(); - private: - /// SeqAn similarity scoring - typedef ::seqan::Score > SeqAnScore; - - /// SeqAn amino acid sequence - typedef ::seqan::String< ::seqan::AminoAcid> SeqAnSequence; - /// Similarity scoring method - SeqAnScore scoring_method_; + private: - /// Alignment data structure - ::seqan::Align alignment_; + NeedlemanWunsch alignment_; /// Not implemented ConsensusIDAlgorithmPEPMatrix(const ConsensusIDAlgorithmPEPMatrix&); @@ -180,12 +66,11 @@ namespace OpenMS /// Not implemented ConsensusIDAlgorithmPEPMatrix& operator=(const ConsensusIDAlgorithmPEPMatrix&); - /// Docu in base class - void updateMembers_() override; - /// Sequence similarity based on substitution matrix (ignores PTMs) double getSimilarity_(AASequence seq1, AASequence seq2) override; + // Docu in base class + void updateMembers_() override; }; } // namespace OpenMS diff --git a/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h b/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h new file mode 100644 index 00000000000..373d06f5db5 --- /dev/null +++ b/src/openms/include/OpenMS/ANALYSIS/SEQUENCE/NeedlemanWunsch.h @@ -0,0 +1,45 @@ +#include +#include +#include +#include + +namespace OpenMS +{ + class OPENMS_DLLAPI NeedlemanWunsch + { + + public: + enum class ScoringMatrix + { + identity, + PAM30MS, + SIZE_OF_SCORINGMATRIX + }; + + NeedlemanWunsch(ScoringMatrix matrix, int penalty); + NeedlemanWunsch(); + + ~NeedlemanWunsch()=default; + + static const std::vector NamesOfScoringMatrices; + + int align(const String& seq1, const String& seq2); + + void setMatrix(const ScoringMatrix& matrix); + void setMatrix(const std::string& matrix); + + void setPenalty(const int penalty); + + ScoringMatrix getMatrix() const; + + int getPenalty() const; + + private: + unsigned seq1_len_ = 0; + unsigned seq2_len_ = 0; + int gap_penalty_ = 0; + int my_matrix_ = 0; + std::vector first_row_{}; + std::vector second_row_{}; + }; +} \ No newline at end of file diff --git a/src/openms/include/OpenMS/DATASTRUCTURES/FASTAContainer.h b/src/openms/include/OpenMS/DATASTRUCTURES/FASTAContainer.h index 24a980595d9..86a5b9af185 100644 --- a/src/openms/include/OpenMS/DATASTRUCTURES/FASTAContainer.h +++ b/src/openms/include/OpenMS/DATASTRUCTURES/FASTAContainer.h @@ -101,7 +101,8 @@ class FASTAContainer offsets_(), data_fg_(), data_bg_(), - chunk_offset_(0) + chunk_offset_(0), + filename_(FASTA_file) { f_.readStart(FASTA_file); } @@ -199,7 +200,7 @@ class FASTAContainer } /// is the FASTA file empty? - bool empty() const + bool empty() { // trusting the FASTA file can be read... return f_.atEnd() && offsets_.empty(); } @@ -207,11 +208,11 @@ class FASTAContainer /// resets reading of the FASTA file, enables fresh reading of the FASTA from the beginning void reset() { - f_.setPosition(0); offsets_.clear(); data_fg_.clear(); data_bg_.clear(); chunk_offset_ = 0; + f_.readStart(filename_); } @@ -231,6 +232,7 @@ class FASTAContainer std::vector data_fg_; ///< active (foreground) data std::vector data_bg_; ///< prefetched (background) data; will become the next active data size_t chunk_offset_; ///< number of entries before the current chunk + std::string filename_;///< FASTA file name }; /** diff --git a/src/openms/include/OpenMS/FORMAT/FASTAFile.h b/src/openms/include/OpenMS/FORMAT/FASTAFile.h index 46f649d6f4e..edf9af4881f 100644 --- a/src/openms/include/OpenMS/FORMAT/FASTAFile.h +++ b/src/openms/include/OpenMS/FORMAT/FASTAFile.h @@ -38,187 +38,180 @@ #include #include -#include // for std::function #include -#include // for unique_ptr +#include #include namespace OpenMS { - /** - @brief This class serves for reading in and writing FASTA files - - If the protein/gene sequence contains unusual symbols (such as translation end (*)), - they will be kept! - - You can use aggregate methods load() and store() to read/write a - set of protein sequences at the cost of memory. - - Or use single read/write of protein sequences using readStart(), readNext() - and writeStart(), writeNext(), writeEnd() for more memory efficiency. - Reading from one and writing to another FASTA file can be handled by - one single FASTAFile instance. - - */ - - class OPENMS_DLLAPI FASTAFile : public ProgressLogger - { -public: - /** - @brief FASTA entry type (identifier, description and sequence) - - The first String corresponds to the identifier that is - written after the > in the FASTA file. The part after the - first whitespace is stored in description and the text - from the next line until the next > (exclusive) is stored - in sequence. - */ - struct FASTAEntry - { - String identifier; - String description; - String sequence; - - FASTAEntry() : - identifier(), - description(), - sequence() - { - } - - FASTAEntry(String id, String desc, String seq) : - identifier(id), - description(desc), - sequence(seq) - { - } - - FASTAEntry(const FASTAEntry& rhs) - : - identifier(rhs.identifier), - description(rhs.description), - sequence(rhs.sequence) - { - } - - FASTAEntry(FASTAEntry&& rhs) noexcept - : - identifier(::std::move(rhs.identifier)), - description(::std::move(rhs.description)), - sequence(::std::move(rhs.sequence)) - { - } - - FASTAEntry& operator=(const FASTAEntry& rhs) - { - if (*this == rhs) return *this; - identifier = rhs.identifier; - description = rhs.description; - sequence = rhs.sequence; - return *this; - } - - bool operator==(const FASTAEntry& rhs) const - { - return identifier == rhs.identifier - && description == rhs.description - && sequence == rhs.sequence; - } - - bool headerMatches(const FASTAEntry& rhs) const - { - return identifier == rhs.identifier && - description == rhs.description; - } - - bool sequenceMatches(const FASTAEntry& rhs) const - { - return sequence == rhs.sequence; - } - }; - - /// Default constructor - FASTAFile(); - - /// Destructor - virtual ~FASTAFile(); - /** - @brief Prepares a FASTA file given by 'filename' for streamed reading using readNext(). - - @exception Exception::FileNotFound is thrown if the file does not exists. - @exception Exception::ParseError is thrown if the file does not suit to the standard. + @brief This class serves for reading in and writing FASTA files + If the protein/gene sequence contains unusual symbols (such as translation end (*)), + they will be kept! + You can use aggregate methods load() and store() to read/write a + set of protein sequences at the cost of memory. + + Or use single read/write of protein sequences using readStart(), readNext() + and writeStart(), writeNext(), writeEnd() for more memory efficiency. + Reading from one and writing to another FASTA file can be handled by + one single FASTAFile instance. */ - void readStart(const String& filename); - - /** - @brief Reads the next FASTA entry from file. - If you want to read all entries in one go, use load(). - - @return true if entry was read; false if eof was reached - @exception Exception::FileNotFound is thrown if the file does not exists. - @exception Exception::ParseError is thrown if the file does not suit to the standard. - */ - bool readNext(FASTAEntry& protein); - - /// current stream position - std::streampos position() const; - - /// is stream at EOF? - bool atEnd() const; - - /// seek stream to @p pos - bool setPosition(const std::streampos& pos); - - /** - @brief Prepares a FASTA file given by 'filename' for streamed writing using writeNext(). - - @exception Exception::UnableToCreateFile is thrown if the process is not able to write to the file (disk full?). - */ - void writeStart(const String& filename); - - /** - @brief Stores the data given by @p protein. Call writeStart() once before calling writeNext(). - - Call writeEnd() when done to close the file! - - @exception Exception::UnableToCreateFile is thrown if the process is not able to write the file. - */ - void writeNext(const FASTAEntry& protein); - - /** - @brief Closes the file (flush). Called implicitly when FASTAFile object does out of scope. - - */ - void writeEnd(); - - - /** - @brief loads a FASTA file given by 'filename' and stores the information in 'data' - - This uses more RAM than readStart() and readNext(), but supports progress-logging. - - @exception Exception::FileNotFound is thrown if the file does not exists. - @exception Exception::ParseError is thrown if the file does not suit to the standard. - */ - void load(const String& filename, std::vector& data) const; - - /** - @brief stores the data given by 'data' at the file 'filename' - - This uses more RAM than writeStart() and writeNext(). - - @exception Exception::UnableToCreateFile is thrown if the process is not able to write the file. - */ - void store(const String& filename, const std::vector& data) const; - -protected: - std::fstream infile_; ///< filestream for reading; init using FastaFile::readStart() - std::ofstream outfile_; ///< filestream for writing; init using FastaFile::writeStart() - std::unique_ptr > reader_; ///< filestream for reading; init using FastaFile::readStart(); needs to be a pointer, since its not copy-constructable; we use void* here, to avoid pulling in seqan includes - Size entries_read_; ///< some internal book-keeping during reading - }; + class OPENMS_DLLAPI FASTAFile : public ProgressLogger + { + public: + /** + @brief FASTA entry type (identifier, description and sequence) + The first String corresponds to the identifier that is + written after the > in the FASTA file. The part after the + first whitespace is stored in description and the text + from the next line until the next > (exclusive) is stored + in sequence. + */ + struct FASTAEntry + { + String identifier; + String description; + String sequence; + + FASTAEntry() : + identifier(), + description(), + sequence() + { + } + + FASTAEntry(String id, String desc, String seq) : + identifier(id), + description(desc), + sequence(seq) + { + } + + FASTAEntry(const FASTAEntry& rhs) + : + identifier(rhs.identifier), + description(rhs.description), + sequence(rhs.sequence) + { + } + + FASTAEntry(FASTAEntry&& rhs) noexcept + : + identifier(::std::move(rhs.identifier)), + description(::std::move(rhs.description)), + sequence(::std::move(rhs.sequence)) + { + } + + FASTAEntry& operator=(const FASTAEntry& rhs) + { + if (*this == rhs) return *this; + identifier = rhs.identifier; + description = rhs.description; + sequence = rhs.sequence; + return *this; + } + + bool operator==(const FASTAEntry& rhs) const + { + return identifier == rhs.identifier + && description == rhs.description + && sequence == rhs.sequence; + } + + bool headerMatches(const FASTAEntry& rhs) const + { + return identifier == rhs.identifier && + description == rhs.description; + } + + bool sequenceMatches(const FASTAEntry& rhs) const + { + return sequence == rhs.sequence; + } + }; + + /// Default constructor + FASTAFile(); + + /// Destructor + virtual ~FASTAFile(); + + /** + @brief Prepares a FASTA file given by 'filename' for streamed reading using readNext(). + @exception Exception::FileNotFound is thrown if the file does not exists. + @exception Exception::ParseError is thrown if the file does not suit to the standard. + */ + void readStart(const String& filename); + + /** + @brief Reads the next FASTA entry from file. + If you want to read all entries in one go, use load(). + @return true if entry was read; false if eof was reached + @exception Exception::FileNotFound is thrown if the file does not exists. + @exception Exception::ParseError is thrown if the file does not suit to the standard. + */ + bool readNext(FASTAEntry& protein); + + /// current stream position + std::streampos position(); + + /// is stream at EOF? + bool atEnd(); + + /// seek stream to @p pos + bool setPosition(const std::streampos& pos); + + /** + @brief Prepares a FASTA file given by 'filename' for streamed writing using writeNext(). + @exception Exception::UnableToCreateFile is thrown if the process is not able to write to the file (disk full?). + */ + void writeStart(const String& filename); + + /** + @brief Stores the data given by @p protein. Call writeStart() once before calling writeNext(). + Call writeEnd() when done to close the file! + @exception Exception::UnableToCreateFile is thrown if the process is not able to write the file. + */ + void writeNext(const FASTAEntry& protein); + + /** + @brief Closes the file (flush). Called implicitly when FASTAFile object does out of scope. + */ + void writeEnd(); + + + /** + @brief loads a FASTA file given by 'filename' and stores the information in 'data' + This uses more RAM than readStart() and readNext(). + @exception Exception::FileNotFound is thrown if the file does not exists. + @exception Exception::ParseError is thrown if the file does not suit to the standard. + */ + void load(const String& filename, std::vector& data) const; + + /** + @brief stores the data given by 'data' at the file 'filename' + + This uses more RAM than writeStart() and writeNext(). + @exception Exception::UnableToCreateFile is thrown if the process is not able to write the file. + */ + void store(const String& filename, const std::vector& data) const; + + protected: + + /** + @brief Reads a protein entry from the current file position and returns the ID and sequence + @return Return true if the protein entry was read and saved successfully, false otherwise + */ + bool readEntry_(std::string& id, std::string& description, std::string& seq); + std::fstream infile_; ///< filestream for reading; init using FastaFile::readStart() + std::ofstream outfile_; ///< filestream for writing; init using FastaFile::writeStart() + Size entries_read_{0}; ///< some internal book-keeping during reading + std::streampos fileSize_{};///< total number of characters of filestream + std::string seq_;///< sequence of currently read protein + std::string id_;///< identifier of currently read protein + std::string description_;///< description of currently read protein + }; } // namespace OpenMS - diff --git a/src/openms/includes.cmake b/src/openms/includes.cmake index efc8464a49e..72cb4e5a186 100644 --- a/src/openms/includes.cmake +++ b/src/openms/includes.cmake @@ -21,6 +21,7 @@ include(source/FORMAT/VALIDATORS/sources.cmake) include(source/FORMAT/OPTIONS/sources.cmake) include(source/FORMAT/sources.cmake) include(source/ANALYSIS/QUANTITATION/sources.cmake) +include(source/ANALYSIS/SEQUENCE/sources.cmake) include(source/ANALYSIS/SVM/sources.cmake) include(source/ANALYSIS/MAPMATCHING/sources.cmake) include(source/ANALYSIS/DECHARGING/sources.cmake) @@ -80,6 +81,7 @@ include(include/OpenMS/ANALYSIS/ID/sources.cmake) include(include/OpenMS/ANALYSIS/DENOVO/sources.cmake) include(include/OpenMS/ANALYSIS/MAPMATCHING/sources.cmake) include(include/OpenMS/ANALYSIS/QUANTITATION/sources.cmake) +include(include/OpenMS/ANALYSIS/SEQUENCE/sources.cmake) include(include/OpenMS/ANALYSIS/SVM/sources.cmake) include(include/OpenMS/ANALYSIS/PIP/sources.cmake) include(include/OpenMS/ANALYSIS/MRM/sources.cmake) diff --git a/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp b/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp index 00eab98aee7..ae599d7a64b 100644 --- a/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp +++ b/src/openms/source/ANALYSIS/ID/ConsensusIDAlgorithmPEPMatrix.cpp @@ -42,46 +42,39 @@ namespace OpenMS { setName("ConsensusIDAlgorithmPEPMatrix"); // DefaultParamHandler - defaults_.setValue("matrix", "identity", "Substitution matrix to use for alignment-based similarity scoring"); - defaults_.setValidStrings("matrix", {"identity","PAM30MS"}); + defaults_.setValue("matrix", "PAM30MS", "Substitution matrix to use for alignment-based similarity scoring"); + defaults_.setValidStrings("matrix", NeedlemanWunsch::NamesOfScoringMatrices); defaults_.setValue("penalty", 5, "Alignment gap penalty (the same value is used for gap opening and extension)"); defaults_.setMinInt("penalty", 1); defaultsToParam_(); - ::seqan::resize(rows(alignment_), 2); } - void ConsensusIDAlgorithmPEPMatrix::updateMembers_() { ConsensusIDAlgorithmSimilarity::updateMembers_(); - // alignment scoring using SeqAn/similarity matrices: - std::string matrix = param_.getValue("matrix"); + string matrix = param_.getValue("matrix"); int penalty = param_.getValue("penalty"); - scoring_method_ = SeqAnScore(-penalty, -penalty); - if (matrix == "identity") - { - ::seqan::setDefaultScoreMatrix(scoring_method_, - ::seqan::AdaptedIdentity()); - } - else if (matrix == "PAM30MS") + + alignment_.setMatrix(matrix); + + if (penalty > 0) { - ::seqan::setDefaultScoreMatrix(scoring_method_, ::seqan::PAM30MS()); + alignment_.setPenalty(penalty); } else { - String msg = "Matrix '" + matrix + "' is not known! Valid choices are: " - "'identity', 'PAM30MS'."; + String msg = "Gap penalty should be positive"; throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, msg); } // new parameters may affect the similarity calculation, so clear cache: similarities_.clear(); - } + } double ConsensusIDAlgorithmPEPMatrix::getSimilarity_(AASequence seq1, AASequence seq2) @@ -90,40 +83,20 @@ namespace OpenMS String unmod_seq1 = seq1.toUnmodifiedString(); String unmod_seq2 = seq2.toUnmodifiedString(); if (unmod_seq1 == unmod_seq2) return 1.0; - // order of sequences matters for cache look-up: - if (unmod_seq1 > unmod_seq2) swap(unmod_seq1, unmod_seq2); - seq1 = AASequence::fromString(unmod_seq1); - seq2 = AASequence::fromString(unmod_seq2); - pair seq_pair = make_pair(seq1, seq2); - SimilarityCache::iterator pos = similarities_.find(seq_pair); - if (pos != similarities_.end()) return pos->second; // score found in cache - - // use SeqAn similarity scoring: - SeqAnSequence seqan_seq1 = unmod_seq1.c_str(); - SeqAnSequence seqan_seq2 = unmod_seq2.c_str(); - // seq. 1 against itself: - ::seqan::assignSource(row(alignment_, 0), seqan_seq1); - ::seqan::assignSource(row(alignment_, 1), seqan_seq1); - double score_self1 = globalAlignment(alignment_, scoring_method_, - ::seqan::NeedlemanWunsch()); - // seq. 1 against seq. 2: - ::seqan::assignSource(row(alignment_, 1), seqan_seq2); - double score_sim = globalAlignment(alignment_, scoring_method_, - ::seqan::NeedlemanWunsch()); - // seq. 2 against itself: - ::seqan::assignSource(row(alignment_, 0), seqan_seq2); - double score_self2 = globalAlignment(alignment_, scoring_method_, - ::seqan::NeedlemanWunsch()); + if (unmod_seq1 < unmod_seq2) swap(unmod_seq1, unmod_seq2); + + double score_sim = alignment_.align(unmod_seq1, unmod_seq2); + if (score_sim < 0) { score_sim = 0; } else { + double score_self1 = alignment_.align(unmod_seq1, unmod_seq1); + double score_self2 = alignment_.align(unmod_seq2, unmod_seq2); score_sim /= min(score_self1, score_self2); // normalize } - similarities_[seq_pair] = score_sim; // cache the similarity score - return score_sim; } diff --git a/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp new file mode 100644 index 00000000000..5d25cf60ae8 --- /dev/null +++ b/src/openms/source/ANALYSIS/SEQUENCE/NeedlemanWunsch.cpp @@ -0,0 +1,161 @@ +#include +#include +#include +#include //swap +#include + + +using namespace std; +namespace OpenMS +{ + static int matrices[static_cast(NeedlemanWunsch::ScoringMatrix::SIZE_OF_SCORINGMATRIX)][26][26] + { + //identity + { + // A B C D E F G H I J K L M N O P Q R S T U V W X Y Z + /* A */ {1, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* B */ {0, 1, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* C */ {0, 0, 1, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* D */ {0, 0, 0, 1, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* E */ {0, 0, 0, 0, 1, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* F */ {0, 0, 0, 0, 0, 1, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* G */ {0, 0, 0, 0, 0, 0, 1, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* H */ {0, 0, 0, 0, 0, 0, 0, 1, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* I */ {0, 0, 0, 0, 0, 0, 0, 0, 1, INT16_MAX, 0, 1, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* J */ {INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX,}, + /* K */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 1, 0, 0, 0, INT16_MAX, 0, 1, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* L */ {0, 0, 0, 0, 0, 0, 0, 0, 1, INT16_MAX, 0, 1, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* M */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 1, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* N */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 1, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* O */ {INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX,}, + /* P */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 1, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* Q */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 1, 0, 0, 0, INT16_MAX, 0, 1, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* R */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 1, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* S */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 1, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* T */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 1, INT16_MAX, 0, 0, 0, 0, 0}, + /* U */ {INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX,}, + /* V */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 1, 0, 0, 0, 0}, + /* W */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 1, 0, 0, 0}, + /* X */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* Y */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 1, 0}, + /* Z */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 1} + + }, + + //PAM30MS + { + // A B C D E F G H I J K L M N O P Q R S T U V W X Y Z + /* A */ {6, -7, -6, -3, -2, -8, -2, -7, -5, INT16_MAX, -7, -6, -5, -4, INT16_MAX, -2, -4, -7, 0, -1, INT16_MAX, -2,-13, 0, -8, -6}, + /* B */ {-7, 5,-11, -7, -7,-12, -8, -4, -6, INT16_MAX, 5, -7, -3, -4, INT16_MAX, -5, -3, 5, -4, -5, INT16_MAX, -9, -7, 0,-10, 1}, + /* C */ {-6,-11, 10,-14,-14,-13, -9, -7, -6, INT16_MAX,-14,-11,-13,-11, INT16_MAX, -8,-14, -8, -3, -8, INT16_MAX, -6,-15, 0, -4,-14}, + /* D */ {-3, -7,-14, 8, 2,-15, -3, -4, -7, INT16_MAX, -4,-10,-11, 2, INT16_MAX, -8, -2,-10, -4, -5, INT16_MAX, -8,-15, 0,-11, -3}, + /* E */ {-2, -7,-14, 2, 8,-14, -4, -5, -5, INT16_MAX, -4, -7, -7, -2, INT16_MAX, -5, 1, -9, -4, -6, INT16_MAX, -6,-17, 0, -8, -2}, + /* F */ {-8,-12,-13,-15,-14, 9, -9, -6, -2, INT16_MAX,-14, -3, -4, -9, INT16_MAX,-10,-13, -9, -6, -9, INT16_MAX, -8, -4, 0, 2,-14}, + /* G */ {-2, -8, -9, -3, -4, -9, 6, -9,-11, INT16_MAX, -7,-11, -8, -3, INT16_MAX, -6, -7, -9, -2, -6, INT16_MAX, -5,-15, 0,-14, -7}, + /* H */ {-7, -4, -7, -4, -5, -6, -9, 9, -9, INT16_MAX, -6, -8,-10, 0, INT16_MAX, -4, 1, -2, -6, -7, INT16_MAX, -6, -7, 0, -3, -3}, + /* I */ {-5, -6, -6, -7, -5, -2,-11, -9, 8, INT16_MAX, -6, 5, -1, -5, INT16_MAX, -8, -8, -5, -7, -2, INT16_MAX, 2,-14, 0, -6, -7}, + /* J */ {INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX,}, + /* K */ {-7, 5,-14, -4, -4,-14, -7, -6, -6, INT16_MAX, 7, -7, -2, -1, INT16_MAX, -6, -3, 0, -4, -3, INT16_MAX, -9,-12, 0, -9, 4}, + /* L */ {-6, -7,-11,-10, -7, -3,-11, -8, 5, INT16_MAX, -7, 5, 0, -6, INT16_MAX, -8, -7, -7, -8, -5, INT16_MAX, 0,-10, 0, -7, -7}, + /* M */ {-5, -3,-13,-11, -7, -4, -8,-10, -1, INT16_MAX, -2, 0, 11, -9, INT16_MAX, -8, -4, -4, -5, -4, INT16_MAX, -1,-13, 0,-11, -3}, + /* N */ {-4, -4,-11, 2, -2, -9, -3, 0, -5, INT16_MAX, -1, -6, -9, 8, INT16_MAX, -6, -3, -6, 0, -2, INT16_MAX, -8, -8, 0, -4, -2}, + /* O */ {INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX,}, + /* P */ {-2, -5, -8, -8, -5,-10, -6, -4, -8, INT16_MAX, -6, -8, -8, -6, INT16_MAX, 8, -3, -4, -2, -4, INT16_MAX, -6,-14, 0,-13, -5}, + /* Q */ {-4, -3,-14, -2, 1,-13, -7, 1, -8, INT16_MAX, -3, -7, -4, -3, INT16_MAX, -3, 8, -2, -5, -5, INT16_MAX, -7,-13, 0,-12, 4}, + /* R */ {-7, 5, -8,-10, -9, -9, -9, -2, -5, INT16_MAX, 0, -7, -4, -6, INT16_MAX, -4, -2, 8, -3, -6, INT16_MAX, -8, -2, 0, 10, -1}, + /* S */ {0, -4, -3, -4, -4, -6, -2, -6, -7, INT16_MAX, -4, -8, -5, 0, INT16_MAX, -2, -5, -3, 6, 0, INT16_MAX, -6, -5, 0, -7, -5}, + /* T */ {-1, -5, -8, -5, -6, -9, -6, -7, -2, INT16_MAX, -3, -5, -4, -2, INT16_MAX, -4, -5, -6, 0, 7, INT16_MAX, -3,-13, 0, -6, -4}, + /* I */ {INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX,}, + /* V */ {-2, -9, -6, -8, -6, -8, -5, -6, 2, INT16_MAX, -9, 0, -1, -8, INT16_MAX, -6, -7, -8, -6, -3, INT16_MAX, 7,-15, 0, -7, -8}, + /* W */ {-13,-7,-15,-15,-17, -4,-15, -7,-14, INT16_MAX,-12,-10,-13, -8, INT16_MAX,-14,-13, -2, -5,-13, INT16_MAX,-15, 13, 0, -5,-13}, + /* X */ {0, 0, 0, 0, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0, INT16_MAX, 0, 0, 0, 0, 0}, + /* Y */ {-8,-10, -4,-11, -8, 2,-14, -3, -6, INT16_MAX, -9, -7,-11, -4, INT16_MAX,-13,-12,-10, -7, -6, INT16_MAX, -7, -5, 0, 10,-11}, + /* Z */ {-6, 1,-14, -3, -2,-14, -7, -3, -7, INT16_MAX, 4, -7, -3, -2, INT16_MAX, -5, 4, -1, -5, -4, INT16_MAX, -8,-13, 0,-11, 4} + } + }; + + + NeedlemanWunsch::NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix matrix, int penalty) + { + setMatrix(matrix); + setPenalty(penalty); + } + + NeedlemanWunsch::NeedlemanWunsch() + { + setMatrix(ScoringMatrix::PAM30MS); + setPenalty(5); + } + + const vector NeedlemanWunsch::NamesOfScoringMatrices = {"identity", "PAM30MS"}; + + void NeedlemanWunsch::setMatrix(const NeedlemanWunsch::ScoringMatrix& matrix) + { + my_matrix_ = static_cast(matrix); + } + + void NeedlemanWunsch::setMatrix(const std::string& matrix) + { + auto first = &NamesOfScoringMatrices[0]; + auto last = &NamesOfScoringMatrices[static_cast(ScoringMatrix::SIZE_OF_SCORINGMATRIX)]; + const auto it = std::find(first, last, matrix); + if (it == last) + { + String msg = "Matrix is not known! Valid choices are: "+ + ListUtils::concatenate(NamesOfScoringMatrices, ", "); + throw Exception::IllegalArgument(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, + msg); + } + setMatrix(static_cast(it - first)); + } + + + void NeedlemanWunsch::setPenalty(const int penalty) + { + gap_penalty_ = penalty; + } + + + NeedlemanWunsch::ScoringMatrix NeedlemanWunsch::getMatrix() const + { + return static_cast(my_matrix_); + } + + + int NeedlemanWunsch::getPenalty() const + { + return gap_penalty_; + } + + int NeedlemanWunsch::align(const String& seq1, const String& seq2) + { + seq1_len_ = seq1.length(); + seq2_len_ = seq2.length(); + + first_row_.resize(seq2_len_+1); // both rows have the same length + second_row_.resize(seq2_len_+1); + + int* firstRowPtr = &(first_row_[0]); + int* secondRowPtr = &(second_row_[0]); + + int (*matrix_ptr)[26][26] = &matrices[my_matrix_]; + + for (unsigned i = 0; i <= seq2_len_; ++i) // initialize using gap-penalty + { + first_row_[i] = i * (-gap_penalty_); + } + + for (unsigned i = 1;i <= seq1_len_; ++i) + { + (*secondRowPtr) = i * (-gap_penalty_); // the first value in a row + for (unsigned j = 1; j <= seq2_len_; ++j) + { + (*(secondRowPtr+j)) = max(max(((*(secondRowPtr+j-1)) - gap_penalty_), ((*(firstRowPtr+j)) - gap_penalty_)), + ((*(firstRowPtr+j-1)) + (*matrix_ptr)[seq1[i-1] - 'A'] [seq2[j-1] - 'A'])); + } + swap(firstRowPtr, secondRowPtr); + } + return (*(firstRowPtr+seq2_len_)); + } + +} \ No newline at end of file diff --git a/src/openms/source/FORMAT/FASTAFile.cpp b/src/openms/source/FORMAT/FASTAFile.cpp index bd9ecafb240..bc39fe3eebd 100644 --- a/src/openms/source/FORMAT/FASTAFile.cpp +++ b/src/openms/source/FORMAT/FASTAFile.cpp @@ -29,7 +29,7 @@ // // -------------------------------------------------------------------------- // $Maintainer: Timo Sachsenberg $ -// $Authors: Nico Pfeifer, Chris Bielow $ +// $Authors: Nico Pfeifer, Chris Bielow, Tinatin Kasradze, Nora Wild $ // -------------------------------------------------------------------------- #include @@ -39,29 +39,109 @@ #include -#include -#include -#include -#include namespace OpenMS { using namespace std; - typedef seqan::RecordReader > FASTARecordReader; FASTAFile::FASTAFile() - : reader_(std::nullptr_t()), // point to nothing - entries_read_(0) { } - + FASTAFile::~FASTAFile() { // infile_ and outfile_ will close automatically when going out of scope. No need to do it explicitly here. } - void FASTAFile::readStart(const String& filename) + bool FASTAFile::readEntry_(std::string &id, std::string &description, std::string &seq) + { + std::streambuf *sb = infile_.rdbuf(); + bool keep_reading = true; + bool description_exists = true; + + if (sb->sbumpc() != '>') return false; // was in wrong position for reading ID + while (keep_reading) // reading the ID + { + int c = sb->sbumpc(); // get and advance to next char + switch (c) + { + case ' ': + case '\t': + if (!id.empty()) + { + keep_reading = false; // ID finished + } + break; + case '\n': // ID finished and no description available + keep_reading = false; + description_exists = false; + break; + case '\r': + break; + case std::streambuf::traits_type::eof(): + infile_.setstate(std::ios::eofbit); + return false; + default: + id += (char) c; + } + } + if (id.empty()) return false; + if (description_exists) keep_reading = true; + while (keep_reading) //reading the description + { + int c = sb->sbumpc(); // get and advance to next char + switch (c) + { + case '\n': // description finished + keep_reading = false; + break; + case '\r': + break; + case '\t': + break; + case std::streambuf::traits_type::eof(): + infile_.setstate(std::ios::eofbit); + return false; + default: + description += (char) c; + } + } + keep_reading = true; + while (keep_reading) // reading the sequence + { + int c = sb->sbumpc(); // get and advance to next char + switch (c) + { + case '\n': + if (sb->sgetc() == '>') // reaching the beginning of the next protein-entry + { + keep_reading = false; + } + break; + case '\r': + break; + case ' ': // not saving white spaces + break; + case '\t': + break; + case std::streambuf::traits_type::eof(): + infile_.setstate(std::ios::eofbit); + if (seq.empty()) + { + infile_.setstate(std::ios::badbit); + return false; + } + return true; + default: + seq += (char) c; + } + } + return !seq.empty(); + } + + void FASTAFile::readStart(const String &filename) { + if (!File::exists(filename)) { throw Exception::FileNotFound(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename); @@ -75,81 +155,72 @@ namespace OpenMS if (infile_.is_open()) infile_.close(); // precaution infile_.open(filename.c_str(), std::ios::binary | std::ios::in); + infile_.seekg(0, infile_.end); + fileSize_ = infile_.tellg(); + infile_.seekg(0, infile_.beg); - // Skip the header of PEFF files (http://www.psidev.info/peff) - std::string line; - std::streampos firstline = 0; - while (TextFile::getLine(infile_, line)) + std::streambuf *sb = infile_.rdbuf(); + while (sb->sgetc() == '#') // Skip the header of PEFF files (http://www.psidev.info/peff) { - if (!line.empty() && line[0] != '#') - { - break; - } - firstline = infile_.tellg(); + infile_.ignore(numeric_limits::max(), '\n'); } - infile_.seekg(firstline); - - // automatically deletes old handles - reader_ = std::unique_ptr >(new FASTARecordReader(infile_), - [](void* ptr) - { // lambda with custom cast - delete static_cast(ptr); - }); - entries_read_ = 0; } - bool FASTAFile::readNext(FASTAEntry& protein) + bool FASTAFile::readNext(FASTAEntry &protein) { - if (seqan::atEnd(*static_cast(reader_.get()))) - { - // do NOT close(), since we still might want to seek to certain positions + if (infile_.eof()) + { return false; } - String id, s; - if (readRecord(id, s, *static_cast(reader_.get()), seqan::Fasta()) != 0) + seq_.clear(); + id_.clear(); + description_.clear(); + if (!readEntry_(id_, description_, seq_)) { - if (entries_read_ == 0) s = "The first entry could not be read!"; - else s = "Only " + String(entries_read_) + " proteins could be read. The record after failed."; - throw Exception::ParseError(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "", "Error while parsing FASTA file! " + s + " Please check the file!"); + if (entries_read_ == 0) + { + seq_ = "The first entry could not be read!"; + } + else + { + seq_ = "Only " + String(entries_read_) + " proteins could be read. Parsing next record failed."; + } + throw Exception::ParseError(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, "", + "Error while parsing FASTA file! " + seq_ + " Please check the file!"); } ++entries_read_; - s.removeWhitespaces(); - protein.sequence = s; // assign here, since 's' might have higher capacity, thus wasting memory (usually 10-15%) - // handle id - id.trim(); - String::size_type position = id.find_first_of(" \v\t"); - if (position == String::npos) - { - protein.identifier = std::move(id); - protein.description = ""; - } - else - { - protein.identifier = id.substr(0, position); - protein.description = id.suffix(id.size() - position - 1); - } + protein.identifier = id_; + protein.description = description_; + protein.sequence = seq_; return true; } - std::streampos FASTAFile::position() const + std::streampos FASTAFile::position() { - return seqan::position(*static_cast(reader_.get())); + return infile_.tellg(); } - bool FASTAFile::setPosition(const std::streampos& pos) + + bool FASTAFile::setPosition(const std::streampos &pos) { - return (seqan::setPosition(*static_cast(reader_.get()), pos) == 0); + if (pos <= fileSize_) + { + infile_.clear(); // when end of file is reached, otherwise it gets -1 + infile_.seekg(pos); + return true; + } + return false; } - bool FASTAFile::atEnd() const + bool FASTAFile::atEnd() { - return seqan::atEnd(*static_cast(reader_.get())); + return (infile_.peek() == std::streambuf::traits_type::eof()); } - void FASTAFile::load(const String& filename, vector& data) const + void FASTAFile::load(const String &filename, vector &data) const { startProgress(0, 1, "Loading FASTA file"); data.clear(); @@ -161,14 +232,15 @@ namespace OpenMS data.push_back(std::move(p)); } endProgress(); - return; } - void FASTAFile::writeStart(const String& filename) + void FASTAFile::writeStart(const String &filename) { if (!FileHandler::hasValidExtension(filename, FileTypes::FASTA)) { - throw Exception::UnableToCreateFile(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename, "invalid file extension; expected '" + FileTypes::typeToName(FileTypes::FASTA) + "'"); + throw Exception::UnableToCreateFile(__FILE__, __LINE__, OPENMS_PRETTY_FUNCTION, filename, + "invalid file extension; expected '" + + FileTypes::typeToName(FileTypes::FASTA) + "'"); } outfile_.open(filename.c_str(), ofstream::out); @@ -179,12 +251,12 @@ namespace OpenMS } } - void FASTAFile::writeNext(const FASTAEntry& protein) + void FASTAFile::writeNext(const FASTAEntry &protein) { outfile_ << ">" << protein.identifier << " " << protein.description << "\n"; - const String& tmp(protein.sequence); + const String &tmp(protein.sequence); - int chunks( tmp.size()/80 ); // number of complete chunks + int chunks(tmp.size() / 80); // number of complete chunks Size chunk_pos(0); while (--chunks >= 0) { @@ -205,14 +277,14 @@ namespace OpenMS outfile_.close(); } - void FASTAFile::store(const String& filename, const vector& data) const + void FASTAFile::store(const String &filename, const vector &data) const { startProgress(0, data.size(), "Writing FASTA file"); FASTAFile f; f.writeStart(filename); - for (vector::const_iterator it = data.begin(); it != data.end(); ++it) + for (const FASTAFile::FASTAEntry& it : data) { - f.writeNext(*it); + f.writeNext(it); nextProgress(); } f.writeEnd(); // close file diff --git a/src/openms/source/FORMAT/TextFile.cpp b/src/openms/source/FORMAT/TextFile.cpp index c8ddf43d2f9..ca6fad4ecc2 100644 --- a/src/openms/source/FORMAT/TextFile.cpp +++ b/src/openms/source/FORMAT/TextFile.cpp @@ -128,6 +128,7 @@ namespace OpenMS { // the stream has an error return is; } + std::streambuf* sb = is.rdbuf(); for (;;) diff --git a/src/tests/class_tests/openms/data/FASTAFile_test.fasta b/src/tests/class_tests/openms/data/FASTAFile_test.fasta index 8e573a1a6af..1b3c4c188bc 100755 --- a/src/tests/class_tests/openms/data/FASTAFile_test.fasta +++ b/src/tests/class_tests/openms/data/FASTAFile_test.fasta @@ -1,3 +1,7 @@ +# PEFF Description block +# Decoy=false +# DbDescription=extract of neXtProt with manual modifications +# GeneralComment= A selection of protein entries >P68509|1433F_BOVIN This is the description of the first protein GDREQLLQRARLAEQAERYDDMASAMKAVTELNEPLSNEDRNLLSVAYKNVVGARRSSWR VISSIEQKTMADGNEKKLEKVKAYREKIEKELETVCNDVLALLDKFLIKNCNDFQYESKV @@ -17,12 +21,13 @@ WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD AGEGEN + + >sp|P00000|0000A_UNKNOWN Artificially modified version of sp|P31946|1433B_HUMAN (ICPL:13C(6))MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLL SVAYKNVVGARRSSWRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKY LIPNATQPESKVFYLKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTH PIRLGLALNFSVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDN LTLWTSENQGDEGDAGEGEN - > test ##0 GSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVSPAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSVGSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVSPAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSVAXXSTFDFYQRRLVTLAESPRAPSPGSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVSPAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSV diff --git a/src/tests/class_tests/openms/executables.cmake b/src/tests/class_tests/openms/executables.cmake index 917d10594cb..ad17d06a6f9 100644 --- a/src/tests/class_tests/openms/executables.cmake +++ b/src/tests/class_tests/openms/executables.cmake @@ -528,6 +528,7 @@ set(analysis_executables_list MetaboliteFeatureDeconvolution_test MetaboliteSpectralMatching_test ModifiedPeptideGenerator_test + NeedlemanWunsch_test OfflinePrecursorIonSelection_test PeptideIndexing_test PeptideAndProteinQuant_test diff --git a/src/tests/class_tests/openms/source/ConsensusIDAlgorithmPEPMatrix_test.cpp b/src/tests/class_tests/openms/source/ConsensusIDAlgorithmPEPMatrix_test.cpp index 4d0c7f1ab2b..a2559856166 100644 --- a/src/tests/class_tests/openms/source/ConsensusIDAlgorithmPEPMatrix_test.cpp +++ b/src/tests/class_tests/openms/source/ConsensusIDAlgorithmPEPMatrix_test.cpp @@ -38,6 +38,7 @@ /////////////////////////// #include +#include //wieder rausnehmen using namespace OpenMS; using namespace std; diff --git a/src/tests/class_tests/openms/source/FASTAFile_test.cpp b/src/tests/class_tests/openms/source/FASTAFile_test.cpp index 305f5734f91..6ba2cc1f3a6 100644 --- a/src/tests/class_tests/openms/source/FASTAFile_test.cpp +++ b/src/tests/class_tests/openms/source/FASTAFile_test.cpp @@ -53,141 +53,251 @@ START_TEST(FASTAFile, "$Id$") ///////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////// -using namespace OpenMS; -using namespace std; - -FASTAFile* ptr = nullptr; -START_SECTION((FASTAFile())) - ptr = new FASTAFile(); - TEST_EQUAL(ptr == nullptr, false) -END_SECTION - -START_SECTION((~FASTAFile())) - delete(ptr); -END_SECTION - -START_SECTION([FASTAFile::FASTAEntry] FASTAEntry()) - FASTAFile::FASTAEntry * ptr_e; - ptr_e = new FASTAFile::FASTAEntry(); - TEST_EQUAL(ptr_e == nullptr, false) -END_SECTION - -START_SECTION([FASTAFile::FASTAEntry] FASTAEntry(String id, String desc, String seq)) - FASTAFile::FASTAEntry entry("ID", "DESC", "DAVLDELNER"); - TEST_EQUAL(entry.identifier, "ID") - TEST_EQUAL(entry.description, "DESC") - TEST_EQUAL(entry.sequence, "DAVLDELNER") -END_SECTION - -START_SECTION([FASTAFile::FASTAEntry] bool operator==(const FASTAEntry &rhs) const) - FASTAFile::FASTAEntry entry1("ID", "DESC", "DAV*LDELNER"); - FASTAFile::FASTAEntry entry2("ID", "DESC", "DAV*LDELNER"); - FASTAFile::FASTAEntry entry3("ID2", "DESC", "DAV*LDELNER"); - TEST_EQUAL(entry1==entry2, true) - TEST_EQUAL(entry1==entry3, false) -END_SECTION - - -START_SECTION((void load(const String& filename, std::vector< FASTAEntry > &data))) - vector data; - FASTAFile file; - - TEST_EXCEPTION(Exception::FileNotFound, file.load("FASTAFile_test_this_file_does_not_exist",data)) - - file.load(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta"),data); - vector< FASTAFile::FASTAEntry >::const_iterator sequences_iterator = data.begin(); - TEST_EQUAL(data.size(), 5) - TEST_EQUAL(sequences_iterator->identifier, String("P68509|1433F_BOVIN")) - TEST_EQUAL(sequences_iterator->description, String("This is the description of the first protein")) - TEST_EQUAL(sequences_iterator->sequence, String("GDREQLLQRARLAEQAERYDDMASAMKAVTEL") + - String("NEPLSNEDRNLLSVAYKNVVGARRSSWRVISSIEQKTMADGNEKKLEKVKAYREKIEKELETVC") + - String("NDVLALLDKFLIKNCNDFQYESKVFYLKMKGDYYRYLAEVASGEKKNSVVEASEAAYKEAFEIS") + - String("KEHMQPTHPIRLGLALNFSVFYYEIQNAPEQACLLAKQAFDDAIAELDTLNEDSYKDSTLIMQL") + - String("LRDNLTLWTSDQQDEEAGEGN")) - sequences_iterator++; - TEST_EQUAL(sequences_iterator->identifier, "Q9CQV8|1433B_MOUSE") - TEST_EQUAL(sequences_iterator->sequence, String("TMDKSELVQKAKLAEQAERYDDMAAAMKAVTE") + - String("QGHELSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICND") + - String("VLELLDKYLILNATQAESKVFYLKMKGDYFRYLSEVASGENKQTTVSNSQQAYQEAFEISKKEMQ") + - String("PTHPIRLGLALNFSVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLT") + - String("LWTSENQGDEGDAGEGEN")) - sequences_iterator++; - TEST_EQUAL(sequences_iterator->identifier, "sp|P31946|1433B_HUMAN") - TEST_EQUAL(sequences_iterator->description, String("14-3-3 protein beta/alpha OS=Homo sapiens GN=YWHAB PE=1 SV=3")) - TEST_EQUAL(sequences_iterator->sequence, String("MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") - + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") - + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") - + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") - + String("AGEGEN")) - - sequences_iterator++; - TEST_EQUAL(sequences_iterator->identifier, "sp|P00000|0000A_UNKNOWN") - TEST_EQUAL(sequences_iterator->description, String("Artificially modified version of sp|P31946|1433B_HUMAN")) - TEST_EQUAL(sequences_iterator->sequence, String("(ICPL:13C(6))MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") - + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") - + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") - + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") - + String("AGEGEN")) - - // test if the modifed sequence is convertable - AASequence aa = AASequence::fromString(sequences_iterator->sequence); - TEST_EQUAL(aa.toUnmodifiedString(), String("MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") - + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") - + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") - + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") - + String("AGEGEN")) - - TEST_EQUAL(aa.isModified(), true) - String expectedModification = ModificationsDB::getInstance()->getModification("ICPL:13C(6)", "", ResidueModification::N_TERM)->getId(); - TEST_EQUAL(aa.getNTerminalModificationName(), expectedModification) - - sequences_iterator++; - TEST_EQUAL(sequences_iterator->identifier, "test") - TEST_EQUAL(sequences_iterator->description, String(" ##0")) - TEST_EQUAL(sequences_iterator->sequence, String("GSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVS") - + String("PAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSVGSMTVDMQEIGSTEMPYEVPTQ") - + String("PNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVSPAVYTFGL") - + String("FVQNASESLTSDDPSDVPTQRTFKSDFQSVAXXSTFDFYQRRLVTLAESPRAPSPGSMTV") - + String("DMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSF") - + String("KEKVSELVSPAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSV")) - -END_SECTION - -START_SECTION((void store(const String& filename, const std::vector< FASTAEntry > &data) const)) - vector data, data2; - String tmp_filename; - NEW_TMP_FILE(tmp_filename); - FASTAFile file; - - file.load(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta"),data); - TEST_EXCEPTION(Exception::UnableToCreateFile, file.store("/bla/bluff/blblb/sdfhsdjf/test.txt",data)) - - file.store(tmp_filename,data); - file.load(tmp_filename,data2); - TEST_EQUAL(data==data2,true); -END_SECTION - -START_SECTION([EXTRA] test_strange_symbols_in_sequence) - // test if * is read correctly (not changed into something weird like 'X') - String tmp_filename; - NEW_TMP_FILE(tmp_filename); - FASTAFile file; - vector data, data2; - FASTAFile::FASTAEntry temp_entry; - temp_entry.identifier = String("P68509|1433F_BOVIN"); - temp_entry.description = String("This is the description of the first protein"); - temp_entry.sequence = String("GDREQLLQRAR*LAEQ*AERYDDMASAMKAVTEL"); - data.push_back(temp_entry); - data.push_back(temp_entry); // twice - - file.store(tmp_filename, data); - file.load(tmp_filename, data2); - - ABORT_IF(data2.size() != 2); - TEST_EQUAL(data2[0] == temp_entry, true); - TEST_EQUAL(data2[1] == temp_entry, true); - -END_SECTION + using namespace OpenMS; + using namespace std; + + FASTAFile *ptr = nullptr; + + START_SECTION((FASTAFile())) + ptr = new FASTAFile(); + TEST_EQUAL(ptr == nullptr, false) + END_SECTION + + START_SECTION((~FASTAFile())) + delete (ptr); + END_SECTION + + START_SECTION([FASTAFile::FASTAEntry] FASTAEntry()) + FASTAFile::FASTAEntry *ptr_e; + ptr_e = new FASTAFile::FASTAEntry(); + TEST_EQUAL(ptr_e == nullptr, false) + END_SECTION + + + START_SECTION([FASTAFile::FASTAEntry] FASTAEntry(String id, String desc, String seq)) + FASTAFile::FASTAEntry entry("ID", "DESC", "DAVLDELNER"); + TEST_EQUAL(entry.identifier, "ID") + TEST_EQUAL(entry.description, "DESC") + TEST_EQUAL(entry.sequence, "DAVLDELNER") + END_SECTION + + START_SECTION([FASTAFile::FASTAEntry] bool operator==(const FASTAEntry &rhs) const) + FASTAFile::FASTAEntry entry1("ID", "DESC", "DAV*LDELNER"); + FASTAFile::FASTAEntry entry2("ID", "DESC", "DAV*LDELNER"); + FASTAFile::FASTAEntry entry3("ID2", "DESC", "DAV*LDELNER"); + TEST_EQUAL(entry1 == entry2, true) + TEST_EQUAL(entry1 == entry3, false) + END_SECTION + + + START_SECTION((void + load( + const String &filename, std::vector + &data))) + vector data; + FASTAFile file; + + TEST_EXCEPTION(Exception::FileNotFound, file.load("FASTAFile_test_this_file_does_not_exist", data)) + + file.load(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta"), data); + vector::const_iterator sequences_iterator = data.begin(); + TEST_EQUAL(data.size(), 5) + TEST_EQUAL(sequences_iterator->identifier, String("P68509|1433F_BOVIN")) + TEST_EQUAL(sequences_iterator->description, String("This is the description of the first protein")) + TEST_EQUAL(sequences_iterator->sequence, String("GDREQLLQRARLAEQAERYDDMASAMKAVTEL") + + String( + "NEPLSNEDRNLLSVAYKNVVGARRSSWRVISSIEQKTMADGNEKKLEKVKAYREKIEKELETVC") + + String( + "NDVLALLDKFLIKNCNDFQYESKVFYLKMKGDYYRYLAEVASGEKKNSVVEASEAAYKEAFEIS") + + String( + "KEHMQPTHPIRLGLALNFSVFYYEIQNAPEQACLLAKQAFDDAIAELDTLNEDSYKDSTLIMQL") + + String("LRDNLTLWTSDQQDEEAGEGN")) + sequences_iterator++; + TEST_EQUAL(sequences_iterator->identifier, "Q9CQV8|1433B_MOUSE") + TEST_EQUAL(sequences_iterator->sequence, String("TMDKSELVQKAKLAEQAERYDDMAAAMKAVTE") + + String( + "QGHELSNEERNLLSVAYKNVVGARRSSWRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICND") + + String( + "VLELLDKYLILNATQAESKVFYLKMKGDYFRYLSEVASGENKQTTVSNSQQAYQEAFEISKKEMQ") + + String( + "PTHPIRLGLALNFSVFYYEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLT") + + String("LWTSENQGDEGDAGEGEN")) + sequences_iterator++; + TEST_EQUAL(sequences_iterator->identifier, "sp|P31946|1433B_HUMAN") + TEST_EQUAL(sequences_iterator->description, + String("14-3-3 protein beta/alpha OS=Homo sapiens GN=YWHAB PE=1 SV=3")) + TEST_EQUAL(sequences_iterator->sequence, String("MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") + + + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") + + + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") + + + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") + + String("AGEGEN")) + + sequences_iterator++; + TEST_EQUAL(sequences_iterator->identifier, "sp|P00000|0000A_UNKNOWN") + TEST_EQUAL(sequences_iterator->description, String("Artificially modified version of sp|P31946|1433B_HUMAN")) + TEST_EQUAL(sequences_iterator->sequence, + String("(ICPL:13C(6))MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") + + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") + + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") + + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") + + + String("AGEGEN")) + + + + + // test if the modifed sequence is convertable + AASequence aa = AASequence::fromString(sequences_iterator->sequence); + TEST_EQUAL(aa.toUnmodifiedString(), String("MTMDKSELVQKAKLAEQAERYDDMAAAMKAVTEQGHELSNEERNLLSVAYKNVVGARRSS") + + String("WRVISSIEQKTERNEKKQQMGKEYREKIEAELQDICNDVLELLDKYLIPNATQPESKVFY") + + String("LKMKGDYFRYLSEVASGDNKQTTVSNSQQAYQEAFEISKKEMQPTHPIRLGLALNFSVFY") + + String("YEILNSPEKACSLAKTAFDEAIAELDTLNEESYKDSTLIMQLLRDNLTLWTSENQGDEGD") + + String("AGEGEN")) + + TEST_EQUAL(aa.isModified(), true) + String expectedModification = ModificationsDB::getInstance()->getModification("ICPL:13C(6)", "", + ResidueModification::N_TERM)->getId(); + TEST_EQUAL(aa.getNTerminalModificationName(), expectedModification) + + sequences_iterator++; + TEST_EQUAL(sequences_iterator->identifier, "test") + TEST_EQUAL(sequences_iterator->description, String(" ##0")) + TEST_EQUAL(sequences_iterator->sequence, + String("GSMTVDMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVS") + + String("PAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSVGSMTVDMQEIGSTEMPYEVPTQ") + + String("PNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSFKEKVSELVSPAVYTFGL") + + String("FVQNASESLTSDDPSDVPTQRTFKSDFQSVAXXSTFDFYQRRLVTLAESPRAPSPGSMTV") + + String("DMQEIGSTEMPYEVPTQPNATSASAGRGWFDGPSFKVPSVPTRPSGIFRRPSRIKPEFSF") + + String("KEKVSELVSPAVYTFGLFVQNASESLTSDDPSDVPTQRTFKSDFQSV")) + + END_SECTION + + + START_SECTION((void + store( + const String &filename, + const std::vector &data) const)) + vector data, data2; + String tmp_filename; + NEW_TMP_FILE(tmp_filename); + FASTAFile file; + + file.load(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta"), data); + TEST_EXCEPTION(Exception::UnableToCreateFile, file.store("/bla/bluff/blblb/sdfhsdjf/test.txt", data)) + + file.store(tmp_filename, data); + + file.load(tmp_filename, data2); + + file.load(tmp_filename, data2); + + TEST_EQUAL(data == data2, true); + END_SECTION + + + START_SECTION([EXTRA] test_strange_symbols_in_sequence) + // test if * is read correctly (not changed into something weird like 'X') + String tmp_filename; + NEW_TMP_FILE(tmp_filename); + FASTAFile file; + vector data, data2; + FASTAFile::FASTAEntry temp_entry; + temp_entry.identifier = String("P68509|1433F_BOVIN"); + temp_entry.description = String("This is the description of the first protein"); + temp_entry.sequence = String("GDREQLLQRAR*LAEQ*AERYDDMASAMKAVTEL"); + data.push_back(temp_entry); + data.push_back(temp_entry); // twice + + file.store(tmp_filename, data); + file.load(tmp_filename, data2); + + ABORT_IF(data2.size() != 2); + TEST_EQUAL(data2[0] == temp_entry, true); + TEST_EQUAL(data2[1] == temp_entry, true); + + END_SECTION + + + START_SECTION([EXTRA] test_strange_symbols_in_sequence) + // test if * is read correctly (not changed into something weird like 'X') + String tmp_filename; + NEW_TMP_FILE(tmp_filename); + FASTAFile file; + vector data, data2; + FASTAFile::FASTAEntry temp_entry; + temp_entry.identifier = String("P68509|1433F_BOVIN"); + temp_entry.description = String("This is the description of the first protein"); + temp_entry.sequence = String("GDREQLLQRAR*LAEQ*AERYDDMASAMKAVTEL"); + data.push_back(temp_entry); + data.push_back(temp_entry); // twice + + + file.store(tmp_filename, data); + file.load(tmp_filename, data2); + + ABORT_IF(data2.size() != 2); + TEST_EQUAL(data2[0] == temp_entry, true); + TEST_EQUAL(data2[1] == temp_entry, true); + END_SECTION + + + START_SECTION(test_white_spaces) +//test if spaces and tabulators are removed correctly + String tmp_filename; + NEW_TMP_FILE(tmp_filename); + FASTAFile file; + vector data, data2; + + FASTAFile::FASTAEntry temp_entry; + temp_entry.identifier = String("P68509|1433F_BOVIN"); + temp_entry.description = String("This is the description of the first protein"); + temp_entry.sequence = String("GDREQLLQRAR LAEQ\tAERYDDMASAMKAVTEL"); + data.push_back(temp_entry); + + file.store(tmp_filename, data); + file.load(tmp_filename, data2); + + ABORT_IF(data2.size() != 1); + TEST_EQUAL(data2[0].sequence == string("GDREQLLQRARLAEQAERYDDMASAMKAVTEL"), true); + + END_SECTION + + START_SECTION([EXTRA] test_position) + // test if setPosition() works correctly + String tmp_filename; + NEW_TMP_FILE(tmp_filename); + FASTAFile file; + + vector> data1; + vector> data2; + FASTAFile::FASTAEntry temp_entry; + file.readStart(OPENMS_GET_TEST_DATA_PATH("FASTAFile_test.fasta")); + + + for (int i = 0; i < 4; i++) { + file.readNext(temp_entry); + data1.push_back(std::make_pair(file.position(), temp_entry)); + } + file.setPosition(data1[0].first); + for (int i = 0; i < 3; i++) { + file.readNext(temp_entry); + data2.push_back(std::make_pair(file.position(), temp_entry)); + } + + ABORT_IF(data1.size() != 4 || data2.size() != 3); + + for (Size i = 1; i < data1.size(); i++) { + TEST_EQUAL(data1[i].second.identifier, data2[i - 1].second.identifier); + TEST_EQUAL(data1[i].second.description, data2[i - 1].second.description); + TEST_EQUAL(data1[i].second.sequence, data2[i - 1].second.sequence); + TEST_EQUAL(data1[i].first, data2[i - 1].first); + } + + END_SECTION + ///////////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////// diff --git a/src/tests/class_tests/openms/source/NeedlemanWunsch_test.cpp b/src/tests/class_tests/openms/source/NeedlemanWunsch_test.cpp new file mode 100644 index 00000000000..5b492669ed8 --- /dev/null +++ b/src/tests/class_tests/openms/source/NeedlemanWunsch_test.cpp @@ -0,0 +1,73 @@ +#include +#include +#include +#include + +using namespace OpenMS; +using namespace std; + +/////////////////////////// + +START_TEST(NeedlemanWunsch, "$Id$") + +///////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////// + +NeedlemanWunsch* ptr = nullptr; +START_SECTION(NeedlemanWunsch(ScoringMatrix matrix, int penalty)()) +{ + ptr = new NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix::PAM30MS, 5); + TEST_EQUAL(ptr == nullptr, false) +} +END_SECTION + +START_SECTION(~NeedlemanWunsch()) +{ + delete (ptr); +} +END_SECTION + +String seq1 = "IGGATLIGQLAIQQAHVHL"; +String seq2 = "IGGATLIGALDQVVAQQAHVHL"; + +START_SECTION(double align(const String& seq1, const String& seq2)) +{ + NeedlemanWunsch alignment = NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix::identity, 5); + TEST_EQUAL(alignment.align(seq1, seq2), 1); + TEST_EQUAL(alignment.align(seq1, seq1), 19); + TEST_EQUAL(alignment.align(seq2, seq2), 22); +} +END_SECTION + +START_SECTION(void setMatrix(const ScoringMatrix& matrix)) +{ + NeedlemanWunsch alignment = NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix::identity, 5); + alignment.setMatrix(NeedlemanWunsch::ScoringMatrix::PAM30MS); + TEST_EQUAL(alignment.align(seq1, seq2), 93); + TEST_EQUAL(alignment.align(seq1, seq1), 131); + TEST_EQUAL(alignment.align(seq2, seq2), 151); + + TEST_EXCEPTION(Exception::IllegalArgument, alignment.setMatrix("Identity")) + + alignment.setMatrix("identity"); + TEST_EQUAL(alignment.align(seq1, seq2), 1); + TEST_EQUAL(alignment.align(seq1, seq1), 19); + TEST_EQUAL(alignment.align(seq2, seq2), 22); +} +END_SECTION + +START_SECTION(void setPenalty(const ScoringMatrix& matrix)) +{ + NeedlemanWunsch alignment = NeedlemanWunsch(NeedlemanWunsch::ScoringMatrix::PAM30MS, 5); + alignment.setPenalty(1); + TEST_EQUAL(alignment.align(seq1, seq2), 113); + TEST_EQUAL(alignment.getPenalty(), 1); +} +END_SECTION + + +///////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////// + +END_TEST +