From adaea2136e06ee510eb1c596f975b36bcd3eee9d Mon Sep 17 00:00:00 2001 From: Lydia Rabkin Date: Thu, 19 Jan 2023 14:17:04 +0100 Subject: [PATCH 1/9] Use size_t instead of uint64_t for size parameters --- highwayhash/highwayhash_target.cc | 2 +- highwayhash/highwayhash_test.cc | 2 +- highwayhash/highwayhash_test_target.cc | 6 +++--- highwayhash/highwayhash_test_target.h | 6 +++--- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/highwayhash/highwayhash_target.cc b/highwayhash/highwayhash_target.cc index 74022f6..34d4020 100644 --- a/highwayhash/highwayhash_target.cc +++ b/highwayhash/highwayhash_target.cc @@ -25,7 +25,7 @@ namespace highwayhash { extern "C" { uint64_t HH_ADD_TARGET_SUFFIX(HighwayHash64_)(const HHKey key, const char* bytes, - const uint64_t size) { + const size_t size) { HHStateT state(key); HHResult64 result; HighwayHashT(&state, bytes, size, &result); diff --git a/highwayhash/highwayhash_test.cc b/highwayhash/highwayhash_test.cc index aed9a9e..60e4bae 100644 --- a/highwayhash/highwayhash_test.cc +++ b/highwayhash/highwayhash_test.cc @@ -79,7 +79,7 @@ TargetBits VerifyImplementations(const Result (&known_good)[kMaxSize + 1]) { // For each test input: empty string, 00, 00 01, ... char in[kMaxSize + 1] = {0}; // Fast enough that we don't need a thread pool. - for (uint64_t size = 0; size <= kMaxSize; ++size) { + for (size_t size = 0; size <= kMaxSize; ++size) { in[size] = static_cast(size); #if PRINT_RESULTS Result actual; diff --git a/highwayhash/highwayhash_test_target.cc b/highwayhash/highwayhash_test_target.cc index e999d9f..96af42e 100644 --- a/highwayhash/highwayhash_test_target.cc +++ b/highwayhash/highwayhash_test_target.cc @@ -127,7 +127,7 @@ void HighwayHashTest::operator()(const HHKey& key, template void HighwayHashCatTest::operator()(const HHKey& key, const char* HH_RESTRICT bytes, - const uint64_t size, + const size_t size, const HHResult64* expected, const HHNotify notify) const { TestHighwayHashCat(key, bytes, size, expected, notify); @@ -136,7 +136,7 @@ void HighwayHashCatTest::operator()(const HHKey& key, template void HighwayHashCatTest::operator()(const HHKey& key, const char* HH_RESTRICT bytes, - const uint64_t size, + const size_t size, const HHResult128* expected, const HHNotify notify) const { TestHighwayHashCat(key, bytes, size, expected, notify); @@ -145,7 +145,7 @@ void HighwayHashCatTest::operator()(const HHKey& key, template void HighwayHashCatTest::operator()(const HHKey& key, const char* HH_RESTRICT bytes, - const uint64_t size, + const size_t size, const HHResult256* expected, const HHNotify notify) const { TestHighwayHashCat(key, bytes, size, expected, notify); diff --git a/highwayhash/highwayhash_test_target.h b/highwayhash/highwayhash_test_target.h index 56ae960..93be728 100644 --- a/highwayhash/highwayhash_test_target.h +++ b/highwayhash/highwayhash_test_target.h @@ -54,13 +54,13 @@ struct HighwayHashTest { template struct HighwayHashCatTest { void operator()(const HHKey& key, const char* HH_RESTRICT bytes, - const uint64_t size, const HHResult64* expected, + const size_t size, const HHResult64* expected, const HHNotify notify) const; void operator()(const HHKey& key, const char* HH_RESTRICT bytes, - const uint64_t size, const HHResult128* expected, + const size_t size, const HHResult128* expected, const HHNotify notify) const; void operator()(const HHKey& key, const char* HH_RESTRICT bytes, - const uint64_t size, const HHResult256* expected, + const size_t size, const HHResult256* expected, const HHNotify notify) const; }; From fde1926fa99bd497f6cc8cca5ca4ad5d262cb378 Mon Sep 17 00:00:00 2001 From: Lydia Rabkin Date: Thu, 19 Jan 2023 14:26:04 +0100 Subject: [PATCH 2/9] return uint32_t from Load3::operator() --- highwayhash/hh_avx2.h | 6 +++--- highwayhash/hh_sse41.h | 4 ++-- highwayhash/load3.h | 28 ++++++++++++---------------- 3 files changed, 17 insertions(+), 21 deletions(-) diff --git a/highwayhash/hh_avx2.h b/highwayhash/hh_avx2.h index db44f53..c3dfb59 100644 --- a/highwayhash/hh_avx2.h +++ b/highwayhash/hh_avx2.h @@ -105,12 +105,12 @@ def x(a,b,c): } else { // size_mod32 < 16 const V4x32U int_mask = IntMask<0>()(size); const V4x32U packetL = MaskedLoadInt(bytes, int_mask); - const uint64_t last3 = + const uint32_t last3 = Load3()(Load3::AllowUnordered(), remainder, size_mod4); // Rather than insert into packetL[3], it is faster to initialize // the otherwise empty packetH. - const V4x32U packetH(_mm_cvtsi64_si128(last3)); + const V4x32U packetH(_mm_cvtsi32_si128(last3)); Update(packetH, packetL); } } @@ -255,7 +255,7 @@ def x(a,b,c): static HH_INLINE V4x32U Load0To16(const char* from, const size_t size_mod32, const V4x32U& size) { const char* remainder = from + (size_mod32 & ~3); - const uint64_t last3 = Load3()(Load3Policy(), remainder, size_mod32 & 3); + const uint32_t last3 = Load3()(Load3Policy(), remainder, size_mod32 & 3); const V4x32U int_mask = IntMask()(size); const V4x32U int_lanes = MaskedLoadInt(from, int_mask); return Insert4AboveMask(last3, int_mask, int_lanes); diff --git a/highwayhash/hh_sse41.h b/highwayhash/hh_sse41.h index 6bbed22..ed30f22 100644 --- a/highwayhash/hh_sse41.h +++ b/highwayhash/hh_sse41.h @@ -97,12 +97,12 @@ class HHStateSSE41 { } else { // size_mod32 < 16 const V2x64U packetL = LoadMultipleOfFour(bytes, size_mod32); - const uint64_t last4 = + const uint32_t last4 = Load3()(Load3::AllowUnordered(), remainder, size_mod4); // Rather than insert into packetL[3], it is faster to initialize // the otherwise empty packetH. - const V2x64U packetH(_mm_cvtsi64_si128(last4)); + const V2x64U packetH(_mm_cvtsi32_si128(last4)); Update(packetH, packetL); } } diff --git a/highwayhash/load3.h b/highwayhash/load3.h index 5e258e5..cdd9e50 100644 --- a/highwayhash/load3.h +++ b/highwayhash/load3.h @@ -64,22 +64,22 @@ class Load3 { } // As above, but preceding bytes are removed and upper byte(s) are zero. - HH_INLINE uint64_t operator()(AllowReadBefore, const char* from, + HH_INLINE uint32_t operator()(AllowReadBefore, const char* from, const size_t size_mod4) { // Shift 0..3 valid bytes into LSB as if loaded in little-endian order. // 64-bit type enables 32-bit shift when size_mod4 == 0. uint64_t last3 = operator()(AllowReadBeforeAndReturn(), from, size_mod4); last3 >>= 32 - (size_mod4 * 8); - return last3; + return static_cast(last3); } // The bytes need not be loaded in little-endian order. This particular order // (and the duplication of some bytes depending on "size_mod4") was chosen for // computational convenience and can no longer be changed because it is part // of the HighwayHash length padding definition. - HH_INLINE uint64_t operator()(AllowUnordered, const char* from, + HH_INLINE uint32_t operator()(AllowUnordered, const char* from, const size_t size_mod4) { - uint64_t last3 = 0; + uint32_t last3 = 0; // Not allowed to read any bytes; early-out is faster than reading from a // constant array of zeros. if (size_mod4 == 0) { @@ -93,26 +93,26 @@ class Load3 { const uint64_t idx1 = size_mod4 >> 1; const uint64_t idx2 = size_mod4 - 1; // Store into least significant bytes (avoids one shift). - last3 = U64FromChar(from[idx0]); - last3 += U64FromChar(from[idx1]) << 8; - last3 += U64FromChar(from[idx2]) << 16; + last3 = U32FromChar(from[idx0]); + last3 += U32FromChar(from[idx1]) << 8; + last3 += U32FromChar(from[idx2]) << 16; return last3; } // Must read exactly [0, size) bytes in little-endian order. - HH_INLINE uint64_t operator()(AllowNone, const char* from, + HH_INLINE uint32_t operator()(AllowNone, const char* from, const size_t size_mod4) { // We need to load in little-endian order without accessing anything outside // [from, from + size_mod4). Unrolling is faster than looping backwards. - uint64_t last3 = 0; + uint32_t last3 = 0; if (size_mod4 >= 1) { - last3 += U64FromChar(from[0]); + last3 += U32FromChar(from[0]); } if (size_mod4 >= 2) { - last3 += U64FromChar(from[1]) << 8; + last3 += U32FromChar(from[1]) << 8; } if (size_mod4 == 3) { - last3 += U64FromChar(from[2]) << 16; + last3 += U32FromChar(from[2]) << 16; } return last3; } @@ -122,10 +122,6 @@ class Load3 { return static_cast(static_cast(c)); } - static HH_INLINE uint64_t U64FromChar(const char c) { - return static_cast(static_cast(c)); - } - static HH_INLINE void Copy(const char* HH_RESTRICT from, const size_t size, char* HH_RESTRICT to) { #if HH_MSC_VERSION From 7cb97d69839cab08efda2140857408a3d4154806 Mon Sep 17 00:00:00 2001 From: Lydia Rabkin Date: Thu, 19 Jan 2023 14:36:01 +0100 Subject: [PATCH 3/9] use size_t instead of uint64_t for indexing --- highwayhash/load3.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/highwayhash/load3.h b/highwayhash/load3.h index cdd9e50..9d4d2a0 100644 --- a/highwayhash/load3.h +++ b/highwayhash/load3.h @@ -89,9 +89,9 @@ class Load3 { // These indices are chosen as an easy-to-compute sequence containing the // same elements as [0, size), but repeated and/or reordered. This enables // unconditional loads, which outperform conditional 8 or 16+8 bit loads. - const uint64_t idx0 = 0; - const uint64_t idx1 = size_mod4 >> 1; - const uint64_t idx2 = size_mod4 - 1; + const size_t idx0 = 0; + const size_t idx1 = size_mod4 >> 1; + const size_t idx2 = size_mod4 - 1; // Store into least significant bytes (avoids one shift). last3 = U32FromChar(from[idx0]); last3 += U32FromChar(from[idx1]) << 8; From a1138c8d2fa87203ca86f7c010bcea55bc721200 Mon Sep 17 00:00:00 2001 From: Lydia Rabkin Date: Thu, 19 Jan 2023 14:49:56 +0100 Subject: [PATCH 4/9] Use _mm_cvtsi32_si128 instead of _mm_cvtsi64_si128 --- highwayhash/hh_avx2.h | 2 +- highwayhash/hh_sse41.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/highwayhash/hh_avx2.h b/highwayhash/hh_avx2.h index c3dfb59..4d76adb 100644 --- a/highwayhash/hh_avx2.h +++ b/highwayhash/hh_avx2.h @@ -78,7 +78,7 @@ def x(a,b,c): // size/32. mod32 is sufficient because each Update behaves as if a // counter were injected, because the state is large and mixed thoroughly. const V8x32U size256( - _mm256_broadcastd_epi32(_mm_cvtsi64_si128(size_mod32))); + _mm256_broadcastd_epi32(_mm_cvtsi32_si128(static_cast(size_mod32)))); // Equivalent to storing size_mod32 in packet. v0 += V4x64U(size256); // Boosts the avalanche effect of mod32. diff --git a/highwayhash/hh_sse41.h b/highwayhash/hh_sse41.h index ed30f22..685b436 100644 --- a/highwayhash/hh_sse41.h +++ b/highwayhash/hh_sse41.h @@ -195,8 +195,8 @@ class HHStateSSE41 { const uint64_t count) { // WARNING: the shift count is 64 bits, so we can't reuse vsize_mod32, // which is broadcast into 32-bit lanes. - const __m128i count_left = _mm_cvtsi64_si128(count); - const __m128i count_right = _mm_cvtsi64_si128(32 - count); + const __m128i count_left = _mm_cvtsi32_si128(static_cast(count)); + const __m128i count_right = _mm_cvtsi32_si128(static_cast(32 - count)); const V2x64U shifted_leftL(_mm_sll_epi32(*vL, count_left)); const V2x64U shifted_leftH(_mm_sll_epi32(*vH, count_left)); const V2x64U shifted_rightL(_mm_srl_epi32(*vL, count_right)); @@ -250,7 +250,7 @@ class HHStateSSE41 { const uint32_t* words = reinterpret_cast(bytes); // Mask of 1-bits where the final 4 bytes should be inserted (replacement // for variable shift/insert using broadcast+blend). - V2x64U mask4(_mm_cvtsi64_si128(0xFFFFFFFFULL)); // 'insert' into lane 0 + V2x64U mask4(_mm_cvtsi32_si128(0xFFFFFFFFU)); // 'insert' into lane 0 V2x64U ret(0); if (size & 8) { ret = V2x64U(_mm_loadl_epi64(reinterpret_cast(words))); From 8fb4fabd3251a398bd77919d482d99d1bc2bd4ae Mon Sep 17 00:00:00 2001 From: Lydia Rabkin Date: Thu, 19 Jan 2023 14:50:58 +0100 Subject: [PATCH 5/9] Use size_t instead of uint64_t as count argument to Rotate32By --- highwayhash/hh_portable.h | 2 +- highwayhash/hh_sse41.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/highwayhash/hh_portable.h b/highwayhash/hh_portable.h index 3b1a394..628e1aa 100644 --- a/highwayhash/hh_portable.h +++ b/highwayhash/hh_portable.h @@ -233,7 +233,7 @@ class HHStatePortable { } } - static HH_INLINE void Rotate32By(uint32_t* halves, const uint64_t count) { + static HH_INLINE void Rotate32By(uint32_t* halves, const size_t count) { for (int i = 0; i < 2 * kNumLanes; ++i) { const uint32_t x = halves[i]; halves[i] = (x << count) | (x >> (32 - count)); diff --git a/highwayhash/hh_sse41.h b/highwayhash/hh_sse41.h index 685b436..7f71de8 100644 --- a/highwayhash/hh_sse41.h +++ b/highwayhash/hh_sse41.h @@ -192,7 +192,7 @@ class HHStateSSE41 { // Rotates 32-bit lanes by "count" bits. static HH_INLINE void Rotate32By(V2x64U* HH_RESTRICT vH, V2x64U* HH_RESTRICT vL, - const uint64_t count) { + const size_t count) { // WARNING: the shift count is 64 bits, so we can't reuse vsize_mod32, // which is broadcast into 32-bit lanes. const __m128i count_left = _mm_cvtsi32_si128(static_cast(count)); From 7b6bdf85899527e27fea1c567db79e0277b7d7b2 Mon Sep 17 00:00:00 2001 From: Lydia Rabkin Date: Wed, 21 Jul 2021 14:29:48 +0200 Subject: [PATCH 6/9] Implement V256::V256(uint64_t) on x86 --- highwayhash/vector256.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/highwayhash/vector256.h b/highwayhash/vector256.h index d1ccec4..e10a365 100644 --- a/highwayhash/vector256.h +++ b/highwayhash/vector256.h @@ -287,7 +287,12 @@ class V256 { // Broadcasts i to all lanes. HH_INLINE explicit V256(T i) +#if HH_ARCH_X86 + // _mm_cvtsi64_si128 is not available on x86 + : V256(i,i,i,i) {} +#else : v_(_mm256_broadcastq_epi64(_mm_cvtsi64_si128(i))) {} +#endif // Copy from other vector. HH_INLINE explicit V256(const V256& other) : v_(other.v_) {} From 1a95ecd8ae71a6a3431c38f713c2cd324e7aa1dd Mon Sep 17 00:00:00 2001 From: Lydia Rabkin Date: Thu, 19 Jan 2023 13:52:05 +0100 Subject: [PATCH 7/9] Fix pass by reference --- highwayhash/vector256.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/highwayhash/vector256.h b/highwayhash/vector256.h index e10a365..e960c44 100644 --- a/highwayhash/vector256.h +++ b/highwayhash/vector256.h @@ -541,7 +541,7 @@ HH_INLINE V256 operator&(const V256& left, const V256& right) { } template -HH_INLINE V256 operator|(const V256 left, const V256& right) { +HH_INLINE V256 operator|(const V256& left, const V256& right) { V256 t(left); return t |= right; } From 9d42373d386477fb6ca817aa6328fc183c93e76c Mon Sep 17 00:00:00 2001 From: Lydia Rabkin Date: Wed, 21 Jul 2021 12:15:33 +0200 Subject: [PATCH 8/9] Enable specialized implementations on x32 --- highwayhash/arch_specific.cc | 8 ++++---- highwayhash/arch_specific.h | 16 ++++++++++++++-- highwayhash/highwayhash.h | 2 +- highwayhash/instruction_sets.cc | 4 ++-- highwayhash/instruction_sets.h | 6 +++--- highwayhash/os_specific.cc | 2 +- highwayhash/tsc_timer.h | 8 ++++---- 7 files changed, 29 insertions(+), 17 deletions(-) diff --git a/highwayhash/arch_specific.cc b/highwayhash/arch_specific.cc index 2a05860..314c84f 100644 --- a/highwayhash/arch_specific.cc +++ b/highwayhash/arch_specific.cc @@ -16,7 +16,7 @@ #include -#if HH_ARCH_X64 && !HH_MSC_VERSION +#if HH_ARCH_X86_X64 && !HH_MSC_VERSION #include #endif @@ -53,7 +53,7 @@ const char* TargetName(const TargetBits target_bit) { } } -#if HH_ARCH_X64 +#if HH_ARCH_X86_X64 namespace { @@ -101,12 +101,12 @@ uint32_t ApicId() { return abcd[1] >> 24; // ebx } -#endif // HH_ARCH_X64 +#endif // HH_ARCH_X86_X64 namespace { double DetectNominalClockRate() { -#if HH_ARCH_X64 +#if HH_ARCH_X86_X64 const std::string& brand_string = BrandString(); // Brand strings include the maximum configured frequency. These prefixes are // defined by Intel CPUID documentation. diff --git a/highwayhash/arch_specific.h b/highwayhash/arch_specific.h index 0b8c384..1cc35b5 100644 --- a/highwayhash/arch_specific.h +++ b/highwayhash/arch_specific.h @@ -52,6 +52,18 @@ namespace highwayhash { #define HH_ARCH_X64 0 #endif +#if defined(__i386__) || defined(_M_IX86) +#define HH_ARCH_X86 1 +#else +#define HH_ARCH_X86 0 +#endif + +#if HH_ARCH_X86 || HH_ARCH_X64 +#define HH_ARCH_X86_X64 1 +#else +#define HH_ARCH_X86_X64 0 +#endif + #if defined(__aarch64__) || defined(__arm64__) #define HH_ARCH_AARCH64 1 #else @@ -162,7 +174,7 @@ double NominalClockRate(); // frequency on PPC and NominalClockRate on all other platforms. double InvariantTicksPerSecond(); -#if HH_ARCH_X64 +#if HH_ARCH_X86_X64 // Calls CPUID instruction with eax=level and ecx=count and returns the result // in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd). @@ -172,7 +184,7 @@ void Cpuid(const uint32_t level, const uint32_t count, // Returns the APIC ID of the CPU on which we're currently running. uint32_t ApicId(); -#endif // HH_ARCH_X64 +#endif // HH_ARCH_X86_X64 } // namespace highwayhash diff --git a/highwayhash/highwayhash.h b/highwayhash/highwayhash.h index 3655ce3..a9b18ab 100644 --- a/highwayhash/highwayhash.h +++ b/highwayhash/highwayhash.h @@ -31,7 +31,7 @@ #include "highwayhash/compiler_specific.h" #include "highwayhash/hh_types.h" -#if HH_ARCH_X64 +#if HH_ARCH_X86_X64 #include "highwayhash/iaca.h" #endif diff --git a/highwayhash/instruction_sets.cc b/highwayhash/instruction_sets.cc index a02e1f8..0aae595 100644 --- a/highwayhash/instruction_sets.cc +++ b/highwayhash/instruction_sets.cc @@ -17,7 +17,7 @@ // Currently there are only specialized targets for X64; other architectures // only use HH_TARGET_Portable, in which case Supported() just returns that. -#if HH_ARCH_X64 +#if HH_ARCH_X86_X64 #include @@ -138,4 +138,4 @@ TargetBits InstructionSets::Supported() { } // namespace highwayhash -#endif // HH_ARCH_X64 +#endif // HH_ARCH_X86_X64 diff --git a/highwayhash/instruction_sets.h b/highwayhash/instruction_sets.h index aa7bd6b..756f561 100644 --- a/highwayhash/instruction_sets.h +++ b/highwayhash/instruction_sets.h @@ -34,7 +34,7 @@ class InstructionSets { public: // Returns bit array of HH_TARGET_* supported by the current CPU. // The HH_TARGET_Portable bit is guaranteed to be set. -#if HH_ARCH_X64 +#if HH_ARCH_X86_X64 static TargetBits Supported(); #elif HH_ARCH_PPC static HH_INLINE TargetBits Supported() { @@ -54,7 +54,7 @@ class InstructionSets { // this should only be called infrequently (e.g. hoisting it out of loops). template