diff --git a/src/openms/include/OpenMS/FORMAT/HANDLERS/XMLHandler.h b/src/openms/include/OpenMS/FORMAT/HANDLERS/XMLHandler.h index 9d4f98665ce..0563d765809 100644 --- a/src/openms/include/OpenMS/FORMAT/HANDLERS/XMLHandler.h +++ b/src/openms/include/OpenMS/FORMAT/HANDLERS/XMLHandler.h @@ -11,7 +11,7 @@ #include #include -#include // StringList + #include #include #include @@ -21,6 +21,7 @@ #include #include +#include #include #include @@ -224,8 +225,18 @@ namespace OpenMS // Converts from a wide-character string to a narrow-character string. inline static String toNative_(const XMLCh* str) - { - return String(unique_xerces_ptr(xercesc::XMLString::transcode(str)).get()); + { + String r; + XMLSize_t l = strLength(str); + if(isASCII(str, l)) + { + appendASCII(str,l,r); + } + else + { + r = (unique_xerces_ptr(xercesc::XMLString::transcode(str)).get()); + } + return r; } // Converts from a wide-character string to a narrow-character string. @@ -242,6 +253,9 @@ namespace OpenMS /// Destructor ~StringManager(); + /// Calculates the length (number of XMLCh before the terminating zero) of a zero-terminated XMLCh* string using SIMDe + static int strLength(const XMLCh* input_ptr); + /// Transcode the supplied C string to a xerces string inline static XercesString convert(const char * str) { @@ -283,7 +297,12 @@ namespace OpenMS { return toNative_(str); } + /// Checks whether all @p length chars of the supplied XMLCh* string can be encoded with ASCII (returns false for length 0) + static bool isASCII(const XMLCh * chars, const XMLSize_t length); + /// Compresses eight 16-bit chars read from an XMLCh* into eight 8-bit chars by cutting the upper byte of each + static void compress64 (const XMLCh * input_it, char* output_it); + /** * @brief Transcodes the supplied XMLCh* and appends it to the OpenMS String * diff --git a/src/openms/source/FEATUREFINDER/FeatureFinderAlgorithmPicked.cpp b/src/openms/source/FEATUREFINDER/FeatureFinderAlgorithmPicked.cpp index e819450dfcf..9dbe24853a8 100644 --- a/src/openms/source/FEATUREFINDER/FeatureFinderAlgorithmPicked.cpp +++ 
b/src/openms/source/FEATUREFINDER/FeatureFinderAlgorithmPicked.cpp @@ -1007,7 +1007,7 @@ namespace OpenMS { //store map of abort reasons for failed seeds FeatureMap abort_map; - abort_map.reserve(abort_reasons_.size()); + abort_map.reserve( abort_reasons_.size()); Size counter = 0; for (std::map::iterator it2 = abort_reasons_.begin(); it2 != abort_reasons_.end(); ++it2, ++counter) { diff --git a/src/openms/source/FORMAT/HANDLERS/MzMLHandler.cpp b/src/openms/source/FORMAT/HANDLERS/MzMLHandler.cpp index f643342819c..23b62e46c8a 100644 --- a/src/openms/source/FORMAT/HANDLERS/MzMLHandler.cpp +++ b/src/openms/source/FORMAT/HANDLERS/MzMLHandler.cpp @@ -18,8 +18,11 @@ #include #include + #include +using namespace std::literals; + namespace OpenMS::Internal { @@ -267,7 +270,7 @@ namespace OpenMS::Internal UInt meta_string_array_index = 0; for (Size i = 0; i < input_data.size(); i++) //loop over all binary data arrays { - if (input_data[i].meta.getName() != "m/z array" && input_data[i].meta.getName() != "intensity array") // is meta data array? + if (input_data[i].meta.getName() != "m/z array"sv && input_data[i].meta.getName() != "intensity array"sv) // is meta data array? { if (input_data[i].data_type == MzMLHandlerHelper::BinaryData::DT_FLOAT) { diff --git a/src/openms/source/FORMAT/HANDLERS/XMLHandler.cpp b/src/openms/source/FORMAT/HANDLERS/XMLHandler.cpp index 6af312971b6..6441a048bbc 100644 --- a/src/openms/source/FORMAT/HANDLERS/XMLHandler.cpp +++ b/src/openms/source/FORMAT/HANDLERS/XMLHandler.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -302,7 +303,7 @@ namespace OpenMS::Internal } // no value, although there should be a numerical value else if (term.xref_type != ControlledVocabulary::CVTerm::NONE && term.xref_type != ControlledVocabulary::CVTerm::XSD_STRING && // should be numerical - !cv.isChildOf(accession, "MS:1000513") // here the value type relates to the binary data array, not the 'value=' attribute! 
+ !cv.isChildOf(accession, "MS:1000513") // here the value type relates to the binary data array, not the 'value=' attribute! ) { warning(LOAD, String("The CV term '") + accession + " - " + term.name + "' used in tag '" + parent_tag + "' should have a numerical value. The value is '" + value + "'."); @@ -417,43 +418,145 @@ namespace OpenMS::Internal } } - //******************************************************************************************************************* + int StringManager::strLength(const XMLCh* input_ptr) + { + size_t processed_chars = 0; + XMLCh* pos_ptr = const_cast(input_ptr); + size_t align = ((16 - (size_t)pos_ptr % 16) % 16) / sizeof(XMLCh); // number of chars (not bytes) up to the next 16-byte boundary - StringManager::StringManager() - = default; + // Check every char one by one up to the next 16-byte boundary, so that the 16-byte loads below cannot cross a page boundary + for (size_t i = 0; i < align; ++i) + { + if (*pos_ptr == 0) + { + return processed_chars; + } + ++processed_chars; + ++pos_ptr; + } + + while (true) + { + simde__m128i bits = simde_mm_loadu_si128(reinterpret_cast(pos_ptr)); + simde__m128i zero = simde_mm_setzero_si128(); + simde__m128i cmp_zero = simde_mm_cmpeq_epi16(bits, zero); + uint16_t zero_mask = simde_mm_movemask_epi8(cmp_zero); + + if (zero_mask != 0x0000) + { + int byte_pos_zero = __builtin_ctz(zero_mask); + int char_pos_zero = byte_pos_zero / 2; + pos_ptr += char_pos_zero; + return processed_chars + char_pos_zero; + } + + pos_ptr += 8; + processed_chars += 8; + } + } - StringManager::~StringManager() - = default; + void StringManager::compress64(const XMLCh* inputIt, char* outputIt) + { + simde__m128i bits = simde_mm_loadu_si128(reinterpret_cast(inputIt)); + + // Select every second byte (little-endian lower byte of each UTF-16 character) + const simde__m128i shuffleMask = simde_mm_setr_epi8( + 0, 2, 4, 6, 8, 10, 12, 14, + -1, -1, -1, -1, -1, -1, -1, -1 + ); + + simde__m128i compressed = simde_mm_shuffle_epi8(bits, shuffleMask); + + // Store the lower 64 bits (8 ASCII characters) + simde_mm_storel_epi64(reinterpret_cast(outputIt), compressed); + } - void 
StringManager::appendASCII(const XMLCh * chars, const XMLSize_t length, String & result) + bool StringManager::isASCII(const XMLCh* chars, const XMLSize_t length) + { + if (length == 0) { - // XMLCh are characters in UTF16 (usually stored as 16bit unsigned + return false; + } + + Size quotient = length / 8; + Size remainder = length % 8; + + const XMLCh* inputPtr = chars; + simde__m128i mask = simde_mm_set1_epi16(0xFF80); // every bit above the 7-bit ASCII range (0x00-0x7F) + bool bitmask = true; + + // Process blocks of 8 UTF-16 characters using SIMD + for (Size i = 0; i < quotient; ++i) + { + simde__m128i bits = simde_mm_loadu_si128(reinterpret_cast(inputPtr)); + simde__m128i zero = simde_mm_setzero_si128(); + simde__m128i andOp = simde_mm_and_si128(bits, mask); + simde__m128i cmp = simde_mm_cmpeq_epi16(andOp, zero); + + if (simde_mm_movemask_epi8(cmp) != 0xFFFF) + { + bitmask = false; + break; + } + + inputPtr += 8; + } + + // Check remaining characters individually + for (Size i = 0; i < remainder && bitmask; ++i) + { + if (inputPtr[i] & 0xFF80) + { + bitmask = false; + break; + } + } + + return bitmask; + } + + void StringManager::appendASCII(const XMLCh* chars, const XMLSize_t length, String& result) + { + // XMLCh are characters in UTF16 (usually stored as 16-bit unsigned // short but this is not guaranteed). // We know that the Base64 string here can only contain plain ASCII // and all bytes except the least significant one will be zero. Thus // we can convert to char directly (only keeping the least // significant byte). 
- - const XMLCh* it = chars; - const XMLCh* end = it + length; - - size_t curr_size = result.size(); - result.resize(curr_size + length); - std::string::iterator str_it = result.begin(); - std::advance(str_it, curr_size); - while (it!=end) - { - *str_it = (char)*it; - ++str_it; - ++it; + + Size quotient = length / 8; + Size remainder = length % 8; + + const XMLCh* inputPtr = chars; + + Size currentSize = result.size(); + result.resize(currentSize + length); + char* outputPtr = &result[currentSize]; + + // Copy blocks of 8 characters at a time + for (Size i = 0; i < quotient; ++i) + { + compress64(inputPtr, outputPtr); + inputPtr += 8; + outputPtr += 8; + } + + // Copy any remaining characters individually + for (Size i = 0; i < remainder; ++i) + { + outputPtr[i] = static_cast(inputPtr[i] & 0xFF); } + } - // This is ca. 50 % faster than - // for (size_t i = 0; i < length; i++) - // { - // result[curr_size + i] = (char)chars[i]; - // } + //******************************************************************************************************************* + + StringManager::StringManager() + = default; + + StringManager::~StringManager() + = default; + + - } } // namespace OpenMS // namespace Internal diff --git a/src/tests/class_tests/openms/executables.cmake b/src/tests/class_tests/openms/executables.cmake index 656625cea4c..83b2508d6b0 100644 --- a/src/tests/class_tests/openms/executables.cmake +++ b/src/tests/class_tests/openms/executables.cmake @@ -251,6 +251,7 @@ set(format_executables_list UnimodXMLFile_test XMassFile_test XMLFile_test + XMLHandler_test XMLValidator_test XQuestResultXMLFile_test XTandemInfile_test diff --git a/src/tests/class_tests/openms/source/XMLHandler_test.cpp b/src/tests/class_tests/openms/source/XMLHandler_test.cpp new file mode 100644 index 00000000000..b55c6ad27d7 --- /dev/null +++ b/src/tests/class_tests/openms/source/XMLHandler_test.cpp @@ -0,0 +1,149 @@ + +#include +#include +#include +#include +#include + +#include + +using 
namespace OpenMS::Internal; + + + + + +START_TEST(StringManager, "$Id$") + + +const XMLCh russianHello[] = { + 0x041F, 0x0440, 0x0438, 0x0432, 0x0435, 0x0442, 0x043C, + 0x0438, 0x0440, 0x0000 // "Приветмир" ("Привет мир", Hello World in Russian, without the space); zero-terminated because stringLen() is called on it +}; +XMLSize_t r_length = xercesc::XMLString::stringLen(russianHello); + +const XMLCh ascii[] = { + 0x0048,0x0065,0x006C,0x006C,0x006F,0x002C,0x0057,0x006F, + 0x0072,0x006C,0x0064,0x0021, 0x0000}; +XMLSize_t a_length = xercesc::XMLString::stringLen(ascii); + +const XMLCh mixed[] = { + 0x0048, 0x0065,0x0432, 0x0435, 0x0442, 0x043C, 0x006F, + 0x0072,0x006C,0x0064, 0x0021, 0x0000 }; +XMLSize_t m_length = xercesc::XMLString::stringLen(mixed); + +const XMLCh empty[] = {0}; +XMLSize_t e_length = xercesc::XMLString::stringLen(empty); + +const XMLCh upperBoundary [] = {0x00FF,0x00FF,0x0000}; // 0x00FF fits into one byte but lies outside the 7-bit ASCII range +XMLSize_t u_length = xercesc::XMLString::stringLen(upperBoundary); + +bool isAscii = false; + +START_SECTION(isASCII(const XMLCh * chars, const XMLSize_t length)) + isAscii = StringManager::isASCII(ascii,a_length); + std::cout << "1 \n"; + TEST_TRUE(isAscii) + isAscii = StringManager::isASCII(russianHello,r_length); + std::cout << "2 \n"; + TEST_FALSE(isAscii) + isAscii = StringManager::isASCII(mixed,m_length); + std::cout << "3 \n"; + TEST_FALSE(isAscii) + isAscii = StringManager::isASCII(empty,e_length); + std::cout << "4 \n"; + TEST_FALSE(isAscii) + isAscii = StringManager::isASCII(upperBoundary,u_length); + std::cout << "5 \n"; + TEST_FALSE(isAscii) +END_SECTION + +const XMLCh eight_block_negative[] = {0x0148,0x0165,0x016C,0x016C,0x016F,0x012C,0x0157,0x016F}; + +const XMLCh eight_block[] = {0x0048,0x0065,0x006C,0x006C,0x006F,0x002C,0x0057,0x006F}; + +const XMLCh eight_block_mixed[] ={0x0042,0x0045,0x004C,0x0041,0x0142,0x0145,0x014C,0x0141}; + +const XMLCh eight_block_kadabra[] = { + 0x004B, // K + 0x0041, // A + 0x0044, // D + 0x0041, // A + 0x0042, // B + 0x0052, // R + 0x0041, // A + 0x0021 // ! 
+}; + +START_SECTION(compress64 (const XMLCh* input_it, char* output_it)) + std::string o1_str(8,'\0'); + StringManager::compress64(eight_block,o1_str.data()); + std::string res1_str = "Hello,Wo"; + TEST_STRING_EQUAL(o1_str,res1_str); + + + std::string o2_str(8,'\0'); + StringManager::compress64(eight_block_negative,o2_str.data()); + std::string res2_str = res1_str; + TEST_STRING_EQUAL(o2_str, res2_str); + + + std::string o3_str(8,'\0'); + // char res3 [9] = {0x42,0x45,0x4C,0x41,0x42,0x45,0x4C,0x41}; + // res3[8] = '\0'; + StringManager::compress64(eight_block_mixed,o3_str.data()); + std::string res3_str = {0x42,0x45,0x4C,0x41,0x42,0x45,0x4C,0x41}; + TEST_STRING_EQUAL(o3_str, res3_str); + + std::string o4_str(12,'\0'); + o4_str [0] ='A'; + o4_str [1] ='B'; + o4_str [2] ='R'; + o4_str [3] ='A'; + + StringManager::compress64(eight_block_kadabra,((o4_str.data())+4)); + std::string res4_str = "ABRAKADABRA!"; + TEST_STRING_EQUAL(o4_str, res4_str); + +END_SECTION + +// Tests a number of chars that is not divisible by 8 +OpenMS::String o5_str; +std::string res5_str = "Hello,World!"; + +// Checks how the function handles data that is already stored in the output string +OpenMS::String o6_str = "Gruess Gott und "; +std::string res6_str = "Gruess Gott und Hello,World!"; + +OpenMS::String o7_str; +std::string res7_str = ""; + + +START_SECTION(appendASCII(const XMLCh * chars, const XMLSize_t length, String & result)) + + StringManager::appendASCII(ascii,a_length,o5_str); + TEST_STRING_EQUAL(o5_str, res5_str); + + StringManager::appendASCII(ascii,a_length,o6_str); + TEST_STRING_EQUAL(o6_str, res6_str); + + StringManager::appendASCII(empty,e_length,o7_str); + TEST_STRING_EQUAL(o7_str, res7_str); + std::cout << o7_str.size() << std::endl; + +END_SECTION + +START_SECTION(strLength(const XMLCh* input_ptr)) + int o_length = StringManager::strLength(ascii); + TEST_EQUAL(o_length, a_length); + o_length = StringManager::strLength(empty); + TEST_EQUAL(o_length, 
e_length); + o_length = StringManager::strLength(upperBoundary); + TEST_EQUAL(o_length, u_length); +END_SECTION + +END_TEST + + + +