Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 22 additions & 3 deletions src/openms/include/OpenMS/FORMAT/HANDLERS/XMLHandler.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#include <OpenMS/CONCEPT/Types.h>
#include <OpenMS/CONCEPT/Macros.h>

#include <OpenMS/DATASTRUCTURES/ListUtils.h> // StringList
#include <OpenMS/DATASTRUCTURES/DateTime.h>
#include <OpenMS/DATASTRUCTURES/DataValue.h>
#include <OpenMS/DATASTRUCTURES/ListUtils.h>
Expand All @@ -21,6 +21,7 @@
#include <xercesc/util/XMLString.hpp>

#include <iosfwd>
#include <iostream>
#include <string>
#include <memory>

Expand Down Expand Up @@ -224,8 +225,18 @@ namespace OpenMS

// Converts from a wide-character string to a narrow-character string.
inline static String toNative_(const XMLCh* str)
{
return String(unique_xerces_ptr<char>(xercesc::XMLString::transcode(str)).get());
{
String r;
XMLSize_t l = strLength(str);
if(isASCII(str, l))
{
appendASCII(str,l,r);
}
else
{
r = (unique_xerces_ptr<char>(xercesc::XMLString::transcode(str)).get());
}
return r;
}

// Converts from a wide-character string to a narrow-character string.
Expand All @@ -242,6 +253,9 @@ namespace OpenMS
/// Destructor
~StringManager();

/// Calculates the length of a XMLCh* string using SIMDe
static int strLength(const XMLCh* input_ptr);

/// Transcode the supplied C string to a xerces string
inline static XercesString convert(const char * str)
{
Expand Down Expand Up @@ -283,7 +297,12 @@ namespace OpenMS
{
return toNative_(str);
}
/// Checks whether all characters of the supplied XMLCh* string can be encoded as ASCII
static bool isASCII(const XMLCh * chars, const XMLSize_t length);

/// Compresses eight 8x16bit Chars in XMLCh* to 8x8bit Chars by cutting upper byte
static void compress64 (const XMLCh * input_it, char* output_it);

/**
* @brief Transcodes the supplied XMLCh* and appends it to the OpenMS String
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1007,7 +1007,7 @@ namespace OpenMS
{
//store map of abort reasons for failed seeds
FeatureMap abort_map;
abort_map.reserve(abort_reasons_.size());
abort_map.reserve( abort_reasons_.size());
Size counter = 0;
for (std::map<Seed, String>::iterator it2 = abort_reasons_.begin(); it2 != abort_reasons_.end(); ++it2, ++counter)
{
Expand Down
5 changes: 4 additions & 1 deletion src/openms/source/FORMAT/HANDLERS/MzMLHandler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,11 @@
#include <OpenMS/INTERFACES/IMSDataConsumer.h>
#include <OpenMS/SYSTEM/File.h>


#include <map>

using namespace std::literals;

namespace OpenMS::Internal
{

Expand Down Expand Up @@ -267,7 +270,7 @@ namespace OpenMS::Internal
UInt meta_string_array_index = 0;
for (Size i = 0; i < input_data.size(); i++) //loop over all binary data arrays
{
if (input_data[i].meta.getName() != "m/z array" && input_data[i].meta.getName() != "intensity array") // is meta data array?
if (input_data[i].meta.getName() != "m/z array"sv && input_data[i].meta.getName() != "intensity array"sv) // is meta data array?
{
if (input_data[i].data_type == MzMLHandlerHelper::BinaryData::DT_FLOAT)
{
Expand Down
157 changes: 130 additions & 27 deletions src/openms/source/FORMAT/HANDLERS/XMLHandler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <OpenMS/FORMAT/XMLFile.h>
#include <OpenMS/CONCEPT/LogStream.h>
#include <OpenMS/METADATA/ProteinIdentification.h>
#include <OpenMS/SYSTEM/SIMDe.h>

#include <algorithm>
#include <set>
Expand Down Expand Up @@ -302,7 +303,7 @@ namespace OpenMS::Internal
}
// no value, although there should be a numerical value
else if (term.xref_type != ControlledVocabulary::CVTerm::NONE && term.xref_type != ControlledVocabulary::CVTerm::XSD_STRING && // should be numerical
!cv.isChildOf(accession, "MS:1000513") // here the value type relates to the binary data array, not the 'value=' attribute!
!cv.isChildOf(accession, "MS:1000513") // here the value type relates to the binary data array, not the 'value=' attribute!
)
{
warning(LOAD, String("The CV term '") + accession + " - " + term.name + "' used in tag '" + parent_tag + "' should have a numerical value. The value is '" + value + "'.");
Expand Down Expand Up @@ -417,43 +418,145 @@ namespace OpenMS::Internal
}
}

//*******************************************************************************************************************
int StringManager::strLength(const XMLCh* input_ptr)
{
size_t processed_chars = 0;
XMLCh* pos_ptr = const_cast<XMLCh*>(input_ptr);
size_t align = (size_t)pos_ptr % 16;

StringManager::StringManager()
= default;
// Prevent crossing page boundaries
for (size_t i = 0; i < align; ++i)
{
if (pos_ptr[i] == 0)
{
return processed_chars + i;
}
++processed_chars;
++pos_ptr;
}

while (true)
{
simde__m128i bits = simde_mm_loadu_si128(reinterpret_cast<simde__m128i*>(pos_ptr));
simde__m128i zero = simde_mm_setzero_si128();
simde__m128i cmp_zero = simde_mm_cmpeq_epi16(bits, zero);
uint16_t zero_mask = simde_mm_movemask_epi8(cmp_zero);

if (zero_mask != 0x0000)
{
int byte_pos_zero = __builtin_ctz(zero_mask);
int char_pos_zero = byte_pos_zero / 2;
pos_ptr += char_pos_zero;
return processed_chars + char_pos_zero;
}

pos_ptr += 8;
processed_chars += 8;
}
}

StringManager::~StringManager()
= default;
void StringManager::compress64(const XMLCh* inputIt, char* outputIt)
{
simde__m128i bits = simde_mm_loadu_si128(reinterpret_cast<const simde__m128i*>(inputIt));

// Select every second byte (little-endian lower byte of each UTF-16 character)
const simde__m128i shuffleMask = simde_mm_setr_epi8(
0, 2, 4, 6, 8, 10, 12, 14,
-1, -1, -1, -1, -1, -1, -1, -1
);

simde__m128i compressed = simde_mm_shuffle_epi8(bits, shuffleMask);

// Store the lower 64 bits (8 ASCII characters)
simde_mm_storel_epi64(reinterpret_cast<simde__m128i*>(outputIt), compressed);
}

void StringManager::appendASCII(const XMLCh * chars, const XMLSize_t length, String & result)
bool StringManager::isASCII(const XMLCh* chars, const XMLSize_t length)
{
if (length == 0)
{
// XMLCh are characters in UTF16 (usually stored as 16bit unsigned
return false;
}

Size quotient = length / 8;
Size remainder = length % 8;

const XMLCh* inputPtr = chars;
simde__m128i mask = simde_mm_set1_epi16(0xFF00);
bool bitmask = true;

// Process blocks of 8 UTF-16 characters using SIMD
for (Size i = 0; i < quotient; ++i)
{
simde__m128i bits = simde_mm_loadu_si128(reinterpret_cast<const simde__m128i*>(inputPtr));
simde__m128i zero = simde_mm_setzero_si128();
simde__m128i andOp = simde_mm_and_si128(bits, mask);
simde__m128i cmp = simde_mm_cmpeq_epi16(andOp, zero);

if (simde_mm_movemask_epi8(cmp) != 0xFFFF)
{
bitmask = false;
break;
}

inputPtr += 8;
}

// Check remaining characters individually
for (Size i = 0; i < remainder && bitmask; ++i)
{
if (inputPtr[i] & 0xFF00)
{
bitmask = false;
break;
}
}

return bitmask;
}

void StringManager::appendASCII(const XMLCh* chars, const XMLSize_t length, String& result)
{
// XMLCh are characters in UTF16 (usually stored as 16-bit unsigned
// short but this is not guaranteed).
// We know that the Base64 string here can only contain plain ASCII
// and all bytes except the least significant one will be zero. Thus
// we can convert to char directly (only keeping the least
// significant byte).

const XMLCh* it = chars;
const XMLCh* end = it + length;

size_t curr_size = result.size();
result.resize(curr_size + length);
std::string::iterator str_it = result.begin();
std::advance(str_it, curr_size);
while (it!=end)
{
*str_it = (char)*it;
++str_it;
++it;

Size quotient = length / 8;
Size remainder = length % 8;

const XMLCh* inputPtr = chars;

Size currentSize = result.size();
result.resize(currentSize + length);
char* outputPtr = &result[currentSize];

// Copy blocks of 8 characters at a time
for (Size i = 0; i < quotient; ++i)
{
compress64(inputPtr, outputPtr);
inputPtr += 8;
outputPtr += 8;
}

// Copy any remaining characters individually
for (Size i = 0; i < remainder; ++i)
{
outputPtr[i] = static_cast<char>(inputPtr[i] & 0xFF);
}
}

// This is ca. 50 % faster than
// for (size_t i = 0; i < length; i++)
// {
// result[curr_size + i] = (char)chars[i];
// }
//*******************************************************************************************************************

StringManager::StringManager()
= default;

StringManager::~StringManager()
= default;



}

} // namespace OpenMS // namespace Internal
1 change: 1 addition & 0 deletions src/tests/class_tests/openms/executables.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@ set(format_executables_list
UnimodXMLFile_test
XMassFile_test
XMLFile_test
XMLHandler_test
XMLValidator_test
XQuestResultXMLFile_test
XTandemInfile_test
Expand Down
Loading
Loading