From a95e23dfdb302b213b79877f66a721a09782355d Mon Sep 17 00:00:00 2001 From: Chris Bielow Date: Mon, 24 Feb 2025 13:35:47 +0100 Subject: [PATCH 1/2] speed fixes to mzML parsing, in particular avoid double lookup of CV terms --- .../source/FORMAT/HANDLERS/MzMLHandler.cpp | 44 ++++++++++--------- .../source/FORMAT/HANDLERS/XMLHandler.cpp | 31 ++++++------- 2 files changed, 37 insertions(+), 38 deletions(-) diff --git a/src/openms/source/FORMAT/HANDLERS/MzMLHandler.cpp b/src/openms/source/FORMAT/HANDLERS/MzMLHandler.cpp index d8377313848..672185afa22 100644 --- a/src/openms/source/FORMAT/HANDLERS/MzMLHandler.cpp +++ b/src/openms/source/FORMAT/HANDLERS/MzMLHandler.cpp @@ -692,8 +692,10 @@ namespace OpenMS::Internal constexpr XMLCh s_external_spectrum_id[] = { 'e','x','t','e','r','n','a','l','S','p','e','c','t','r','u','m','I','D' , 0}; // constexpr XMLCh s_default_source_file_ref[] = { 'd','e','f','a','u','l','t','S','o','u','r','c','e','F','i','l','e','R','e','f' , 0}; constexpr XMLCh s_scan_settings_ref[] = { 's','c','a','n','S','e','t','t','i','n','g','s','R','e','f' , 0}; - String tag = sm_.convert(qname); - open_tags_.push_back(tag); + + + open_tags_.push_back(sm_.convert(qname)); + const String& tag = open_tags_.back(); // do nothing until a spectrum/chromatogram/spectrumList ends if (skip_spectrum_ || skip_chromatogram_) @@ -701,16 +703,16 @@ namespace OpenMS::Internal return; } - //determine parent tag - String parent_tag; + // determine parent tag + const String* parent_tag = &tag; // set to some valid string if (open_tags_.size() > 1) { - parent_tag = *(open_tags_.end() - 2); + parent_tag = &(*(open_tags_.end() - 2)); } - String parent_parent_tag; + const String* parent_parent_tag = &tag; // set to some valid string if (open_tags_.size() > 2) { - parent_parent_tag = *(open_tags_.end() - 3); + parent_parent_tag = &(*(open_tags_.end() - 3)); } if (tag == "spectrum") @@ -859,21 +861,21 @@ namespace OpenMS::Internal } else if (tag == "cvParam") { - String value = ""; + String value; optionalAttributeAsString_(value, attributes, s_value); - String unit_accession = ""; + String unit_accession; optionalAttributeAsString_(unit_accession, attributes, s_unit_accession); - handleCVParam_(parent_parent_tag, parent_tag, attributeAsString_(attributes, s_accession), attributeAsString_(attributes, s_name), value, unit_accession); + handleCVParam_(*parent_parent_tag, *parent_tag, attributeAsString_(attributes, s_accession), attributeAsString_(attributes, s_name), value, unit_accession); } else if (tag == "userParam") { - String type = ""; + String type; optionalAttributeAsString_(type, attributes, s_type); - String value = ""; + String value; optionalAttributeAsString_(value, attributes, s_value); - String unit_accession = ""; + String unit_accession; optionalAttributeAsString_(unit_accession, attributes, s_unit_accession); - handleUserParam_(parent_parent_tag, parent_tag, attributeAsString_(attributes, s_name), type, value, unit_accession); + handleUserParam_(*parent_parent_tag, *parent_tag, attributeAsString_(attributes, s_name), type, value, unit_accession); } else if (tag == "referenceableParamGroup") { @@ -944,7 +946,7 @@ namespace OpenMS::Internal String ref = attributeAsString_(attributes, s_ref); for (Size i = 0; i < ref_param_[ref].size(); ++i) { - handleCVParam_(parent_parent_tag, parent_tag, ref_param_[ref][i].accession, ref_param_[ref][i].name, ref_param_[ref][i].value, ref_param_[ref][i].unit_accession); + handleCVParam_(*parent_parent_tag, *parent_tag, ref_param_[ref][i].accession, ref_param_[ref][i].name, ref_param_[ref][i].value, ref_param_[ref][i].unit_accession); } } else if (tag == "scan") @@ -5226,7 +5228,7 @@ namespace OpenMS::Internal } Base64::encodeIntegers(data64_to_encode, Base64::BYTEORDER_LITTLEENDIAN, encoded_string, options_.getCompression()); - String data_processing_ref_string = ""; + String data_processing_ref_string ; if (!array.getDataProcessing().empty()) { data_processing_ref_string = String("dataProcessingRef=\"dp_sp_") + s + "_bi_" + m + "\""; @@ -5256,7 +5258,7 @@ namespace OpenMS::Internal for (Size p = 0; p < array.size(); ++p) data_to_encode[p] = array[p]; Base64::encodeStrings(data_to_encode, encoded_string, options_.getCompression()); - String data_processing_ref_string = ""; + String data_processing_ref_string ; if (!array.getDataProcessing().empty()) { data_processing_ref_string = String("dataProcessingRef=\"dp_sp_") + s + "_bi_" + m + "\""; @@ -5426,7 +5428,7 @@ namespace OpenMS::Internal // Try and identify whether we have a CV term for this particular array (otherwise write the array name itself) ControlledVocabulary::CVTerm bi_term = getChildWithName_("MS:1000513", array.getName()); // name: binary data array - String unit_cv_term = ""; + String unit_cv_term ; if (array_metadata.metaValueExists("unit_accession")) { ControlledVocabulary::CVTerm unit = cv_.getTerm(array_metadata.getMetaValue("unit_accession")); @@ -5449,7 +5451,7 @@ namespace OpenMS::Internal np_config = pf_options_.getNumpressConfigurationFloatDataArray(); } - String data_processing_ref_string = ""; + String data_processing_ref_string ; if (!array.getDataProcessing().empty()) { data_processing_ref_string = String("dataProcessingRef=\"dp_sp_") + spec_chrom_idx + "_bi_" + array_idx + "\""; @@ -5598,7 +5600,7 @@ namespace OpenMS::Internal data64_to_encode[p] = array[p]; } Base64::encodeIntegers(data64_to_encode, Base64::BYTEORDER_LITTLEENDIAN, encoded_string, options_.getCompression()); - String data_processing_ref_string = ""; + String data_processing_ref_string ; if (!array.getDataProcessing().empty()) { data_processing_ref_string = String("dataProcessingRef=\"dp_sp_") + c + "_bi_" + m + "\""; @@ -5630,7 +5632,7 @@ namespace OpenMS::Internal data_to_encode[p] = array[p]; } Base64::encodeStrings(data_to_encode, encoded_string, options_.getCompression()); - String data_processing_ref_string = ""; + String data_processing_ref_string ; if (!array.getDataProcessing().empty()) { data_processing_ref_string = String("dataProcessingRef=\"dp_sp_") + c + "_bi_" + m + "\""; diff --git a/src/openms/source/FORMAT/HANDLERS/XMLHandler.cpp b/src/openms/source/FORMAT/HANDLERS/XMLHandler.cpp index 8f4b037e829..6af312971b6 100644 --- a/src/openms/source/FORMAT/HANDLERS/XMLHandler.cpp +++ b/src/openms/source/FORMAT/HANDLERS/XMLHandler.cpp @@ -204,27 +204,14 @@ namespace OpenMS::Internal DataValue cv_value = value; // Abort on unknown terms - if (!cv.exists(accession)) + try { - // in 'sample' several external CVs are used (Brenda, GO, ...). Do not warn then. - if (parent_tag != "sample") - { - warning(LOAD, String("Unknown cvParam '") + accession + "' in tag '" + parent_tag + "'."); - return DataValue::EMPTY; - } - } - else - { - const ControlledVocabulary::CVTerm& term = cv.getTerm(accession); + const ControlledVocabulary::CVTerm& term = cv.getTerm(accession); // throws Exception::InvalidValue if missing // check if term name and parsed name match + if (name != term.name) { - const String parsed_name = String(name).trim(); - const String correct_name = String(term.name).trim(); - if (parsed_name != correct_name) - { - warning(LOAD, String("Name of CV term not correct: '") + term.id + " - " + parsed_name + "' should be '" + correct_name + "'"); - } + warning(LOAD, String("Name of CV term not correct: '") + term.id + " - " + name + "' should be '" + term.name + "'"); } if (term.obsolete) { @@ -322,6 +309,15 @@ namespace OpenMS::Internal return DataValue::EMPTY; } } + catch (const Exception::InvalidValue& /*e*/) + { + // in 'sample' several external CVs are used (Brenda, GO, ...). Do not warn then. + if (parent_tag != "sample") + { + warning(LOAD, String("Unknown cvParam '") + accession + "' in tag '" + parent_tag + "'."); + return DataValue::EMPTY; + } + } if (!unit_accession.empty()) { @@ -437,6 +433,7 @@ namespace OpenMS::Internal // and all bytes except the least significant one will be zero. Thus // we can convert to char directly (only keeping the least // significant byte). + const XMLCh* it = chars; const XMLCh* end = it + length; From 8ee428a66457c92d4effd8fd39f7539bff89b452 Mon Sep 17 00:00:00 2001 From: Chris Bielow Date: Mon, 24 Feb 2025 13:56:16 +0100 Subject: [PATCH 2/2] dangerous commit -- not correct --- src/openms/include/OpenMS/FORMAT/HANDLERS/XMLHandler.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/openms/include/OpenMS/FORMAT/HANDLERS/XMLHandler.h b/src/openms/include/OpenMS/FORMAT/HANDLERS/XMLHandler.h index 9d4f98665ce..d99d9e88e2e 100644 --- a/src/openms/include/OpenMS/FORMAT/HANDLERS/XMLHandler.h +++ b/src/openms/include/OpenMS/FORMAT/HANDLERS/XMLHandler.h @@ -225,7 +225,9 @@ namespace OpenMS // Converts from a wide-character string to a narrow-character string. inline static String toNative_(const XMLCh* str) { - return String(unique_xerces_ptr(xercesc::XMLString::transcode(str)).get()); + String r; + appendASCII(str, xercesc::XMLString::stringLen(str), r); + return r; } // Converts from a wide-character string to a narrow-character string.