Skip to content
This repository was archived by the owner on Apr 22, 2025. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions highwayhash/arch_specific.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

#include <stdint.h>

#if HH_ARCH_X64 && !HH_MSC_VERSION
#if HH_ARCH_X86_X64 && !HH_MSC_VERSION
#include <cpuid.h>
#endif

Expand Down Expand Up @@ -53,7 +53,7 @@ const char* TargetName(const TargetBits target_bit) {
}
}

#if HH_ARCH_X64
#if HH_ARCH_X86_X64

namespace {

Expand Down Expand Up @@ -101,12 +101,12 @@ uint32_t ApicId() {
return abcd[1] >> 24; // ebx
}

#endif // HH_ARCH_X64
#endif // HH_ARCH_X86_X64

namespace {

double DetectNominalClockRate() {
#if HH_ARCH_X64
#if HH_ARCH_X86_X64
const std::string& brand_string = BrandString();
// Brand strings include the maximum configured frequency. These prefixes are
// defined by Intel CPUID documentation.
Expand Down
16 changes: 14 additions & 2 deletions highwayhash/arch_specific.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,18 @@ namespace highwayhash {
#define HH_ARCH_X64 0
#endif

#if defined(__i386__) || defined(_M_IX86)
#define HH_ARCH_X86 1
#else
#define HH_ARCH_X86 0
#endif

#if HH_ARCH_X86 || HH_ARCH_X64
#define HH_ARCH_X86_X64 1
#else
#define HH_ARCH_X86_X64 0
#endif

#if defined(__aarch64__) || defined(__arm64__)
#define HH_ARCH_AARCH64 1
#else
Expand Down Expand Up @@ -162,7 +174,7 @@ double NominalClockRate();
// frequency on PPC and NominalClockRate on all other platforms.
double InvariantTicksPerSecond();

#if HH_ARCH_X64
#if HH_ARCH_X86_X64

// Calls CPUID instruction with eax=level and ecx=count and returns the result
// in abcd array where abcd = {eax, ebx, ecx, edx} (hence the name abcd).
Expand All @@ -172,7 +184,7 @@ void Cpuid(const uint32_t level, const uint32_t count,
// Returns the APIC ID of the CPU on which we're currently running.
uint32_t ApicId();

#endif // HH_ARCH_X64
#endif // HH_ARCH_X86_X64

} // namespace highwayhash

Expand Down
8 changes: 4 additions & 4 deletions highwayhash/hh_avx2.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def x(a,b,c):
// size/32. mod32 is sufficient because each Update behaves as if a
// counter were injected, because the state is large and mixed thoroughly.
const V8x32U size256(
_mm256_broadcastd_epi32(_mm_cvtsi64_si128(size_mod32)));
_mm256_broadcastd_epi32(_mm_cvtsi32_si128(static_cast<uint32_t>(size_mod32))));
// Equivalent to storing size_mod32 in packet.
v0 += V4x64U(size256);
// Boosts the avalanche effect of mod32.
Expand All @@ -105,12 +105,12 @@ def x(a,b,c):
} else { // size_mod32 < 16
const V4x32U int_mask = IntMask<0>()(size);
const V4x32U packetL = MaskedLoadInt(bytes, int_mask);
const uint64_t last3 =
const uint32_t last3 =
Load3()(Load3::AllowUnordered(), remainder, size_mod4);

// Rather than insert into packetL[3], it is faster to initialize
// the otherwise empty packetH.
const V4x32U packetH(_mm_cvtsi64_si128(last3));
const V4x32U packetH(_mm_cvtsi32_si128(last3));
Update(packetH, packetL);
}
}
Expand Down Expand Up @@ -255,7 +255,7 @@ def x(a,b,c):
static HH_INLINE V4x32U Load0To16(const char* from, const size_t size_mod32,
const V4x32U& size) {
const char* remainder = from + (size_mod32 & ~3);
const uint64_t last3 = Load3()(Load3Policy(), remainder, size_mod32 & 3);
const uint32_t last3 = Load3()(Load3Policy(), remainder, size_mod32 & 3);
const V4x32U int_mask = IntMask<kSizeOffset>()(size);
const V4x32U int_lanes = MaskedLoadInt(from, int_mask);
return Insert4AboveMask(last3, int_mask, int_lanes);
Expand Down
2 changes: 1 addition & 1 deletion highwayhash/hh_portable.h
Original file line number Diff line number Diff line change
Expand Up @@ -233,7 +233,7 @@ class HHStatePortable {
}
}

static HH_INLINE void Rotate32By(uint32_t* halves, const uint64_t count) {
static HH_INLINE void Rotate32By(uint32_t* halves, const size_t count) {
for (int i = 0; i < 2 * kNumLanes; ++i) {
const uint32_t x = halves[i];
halves[i] = (x << count) | (x >> (32 - count));
Expand Down
12 changes: 6 additions & 6 deletions highwayhash/hh_sse41.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,12 +97,12 @@ class HHStateSSE41 {
} else { // size_mod32 < 16
const V2x64U packetL = LoadMultipleOfFour(bytes, size_mod32);

const uint64_t last4 =
const uint32_t last4 =
Load3()(Load3::AllowUnordered(), remainder, size_mod4);

// Rather than insert into packetL[3], it is faster to initialize
// the otherwise empty packetH.
const V2x64U packetH(_mm_cvtsi64_si128(last4));
const V2x64U packetH(_mm_cvtsi32_si128(last4));
Update(packetH, packetL);
}
}
Expand Down Expand Up @@ -192,11 +192,11 @@ class HHStateSSE41 {
// Rotates 32-bit lanes by "count" bits.
static HH_INLINE void Rotate32By(V2x64U* HH_RESTRICT vH,
V2x64U* HH_RESTRICT vL,
const uint64_t count) {
const size_t count) {
// WARNING: the shift count is 64 bits, so we can't reuse vsize_mod32,
// which is broadcast into 32-bit lanes.
const __m128i count_left = _mm_cvtsi64_si128(count);
const __m128i count_right = _mm_cvtsi64_si128(32 - count);
const __m128i count_left = _mm_cvtsi32_si128(static_cast<uint32_t>(count));
const __m128i count_right = _mm_cvtsi32_si128(static_cast<uint32_t>(32 - count));
const V2x64U shifted_leftL(_mm_sll_epi32(*vL, count_left));
const V2x64U shifted_leftH(_mm_sll_epi32(*vH, count_left));
const V2x64U shifted_rightL(_mm_srl_epi32(*vL, count_right));
Expand Down Expand Up @@ -250,7 +250,7 @@ class HHStateSSE41 {
const uint32_t* words = reinterpret_cast<const uint32_t*>(bytes);
// Mask of 1-bits where the final 4 bytes should be inserted (replacement
// for variable shift/insert using broadcast+blend).
V2x64U mask4(_mm_cvtsi64_si128(0xFFFFFFFFULL)); // 'insert' into lane 0
V2x64U mask4(_mm_cvtsi32_si128(0xFFFFFFFFU)); // 'insert' into lane 0
V2x64U ret(0);
if (size & 8) {
ret = V2x64U(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(words)));
Expand Down
2 changes: 1 addition & 1 deletion highwayhash/highwayhash.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
#include "highwayhash/compiler_specific.h"
#include "highwayhash/hh_types.h"

#if HH_ARCH_X64
#if HH_ARCH_X86_X64
#include "highwayhash/iaca.h"
#endif

Expand Down
2 changes: 1 addition & 1 deletion highwayhash/highwayhash_target.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ namespace highwayhash {
extern "C" {
uint64_t HH_ADD_TARGET_SUFFIX(HighwayHash64_)(const HHKey key,
const char* bytes,
const uint64_t size) {
const size_t size) {
HHStateT<HH_TARGET> state(key);
HHResult64 result;
HighwayHashT(&state, bytes, size, &result);
Expand Down
2 changes: 1 addition & 1 deletion highwayhash/highwayhash_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ TargetBits VerifyImplementations(const Result (&known_good)[kMaxSize + 1]) {
// For each test input: empty string, 00, 00 01, ...
char in[kMaxSize + 1] = {0};
// Fast enough that we don't need a thread pool.
for (uint64_t size = 0; size <= kMaxSize; ++size) {
for (size_t size = 0; size <= kMaxSize; ++size) {
in[size] = static_cast<char>(size);
#if PRINT_RESULTS
Result actual;
Expand Down
6 changes: 3 additions & 3 deletions highwayhash/highwayhash_test_target.cc
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ void HighwayHashTest<Target>::operator()(const HHKey& key,
template <TargetBits Target>
void HighwayHashCatTest<Target>::operator()(const HHKey& key,
const char* HH_RESTRICT bytes,
const uint64_t size,
const size_t size,
const HHResult64* expected,
const HHNotify notify) const {
TestHighwayHashCat(key, bytes, size, expected, notify);
Expand All @@ -136,7 +136,7 @@ void HighwayHashCatTest<Target>::operator()(const HHKey& key,
template <TargetBits Target>
void HighwayHashCatTest<Target>::operator()(const HHKey& key,
const char* HH_RESTRICT bytes,
const uint64_t size,
const size_t size,
const HHResult128* expected,
const HHNotify notify) const {
TestHighwayHashCat(key, bytes, size, expected, notify);
Expand All @@ -145,7 +145,7 @@ void HighwayHashCatTest<Target>::operator()(const HHKey& key,
template <TargetBits Target>
void HighwayHashCatTest<Target>::operator()(const HHKey& key,
const char* HH_RESTRICT bytes,
const uint64_t size,
const size_t size,
const HHResult256* expected,
const HHNotify notify) const {
TestHighwayHashCat(key, bytes, size, expected, notify);
Expand Down
6 changes: 3 additions & 3 deletions highwayhash/highwayhash_test_target.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,13 @@ struct HighwayHashTest {
template <TargetBits Target>
struct HighwayHashCatTest {
void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
const uint64_t size, const HHResult64* expected,
const size_t size, const HHResult64* expected,
const HHNotify notify) const;
void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
const uint64_t size, const HHResult128* expected,
const size_t size, const HHResult128* expected,
const HHNotify notify) const;
void operator()(const HHKey& key, const char* HH_RESTRICT bytes,
const uint64_t size, const HHResult256* expected,
const size_t size, const HHResult256* expected,
const HHNotify notify) const;
};

Expand Down
4 changes: 2 additions & 2 deletions highwayhash/instruction_sets.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

// Currently there are only specialized targets for X64; other architectures
// only use HH_TARGET_Portable, in which case Supported() just returns that.
#if HH_ARCH_X64
#if HH_ARCH_X86_X64

#include <atomic>

Expand Down Expand Up @@ -138,4 +138,4 @@ TargetBits InstructionSets::Supported() {

} // namespace highwayhash

#endif // HH_ARCH_X64
#endif // HH_ARCH_X86_X64
6 changes: 3 additions & 3 deletions highwayhash/instruction_sets.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class InstructionSets {
public:
// Returns bit array of HH_TARGET_* supported by the current CPU.
// The HH_TARGET_Portable bit is guaranteed to be set.
#if HH_ARCH_X64
#if HH_ARCH_X86_X64
static TargetBits Supported();
#elif HH_ARCH_PPC
static HH_INLINE TargetBits Supported() {
Expand All @@ -54,7 +54,7 @@ class InstructionSets {
// this should only be called infrequently (e.g. hoisting it out of loops).
template <template <TargetBits> class Func, typename... Args>
static HH_INLINE TargetBits Run(Args&&... args) {
#if HH_ARCH_X64
#if HH_ARCH_X86_X64
const TargetBits supported = Supported();
if (supported & HH_TARGET_AVX2) {
Func<HH_TARGET_AVX2>()(std::forward<Args>(args)...);
Expand Down Expand Up @@ -89,7 +89,7 @@ class InstructionSets {
static HH_INLINE TargetBits RunAll(Args&&... args) {
const TargetBits supported = Supported();

#if HH_ARCH_X64
#if HH_ARCH_X86_X64
if (supported & HH_TARGET_AVX2) {
Func<HH_TARGET_AVX2>()(std::forward<Args>(args)...);
}
Expand Down
34 changes: 15 additions & 19 deletions highwayhash/load3.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,22 +64,22 @@ class Load3 {
}

// As above, but preceding bytes are removed and upper byte(s) are zero.
HH_INLINE uint64_t operator()(AllowReadBefore, const char* from,
HH_INLINE uint32_t operator()(AllowReadBefore, const char* from,
const size_t size_mod4) {
// Shift 0..3 valid bytes into LSB as if loaded in little-endian order.
// 64-bit type enables 32-bit shift when size_mod4 == 0.
uint64_t last3 = operator()(AllowReadBeforeAndReturn(), from, size_mod4);
last3 >>= 32 - (size_mod4 * 8);
return last3;
return static_cast<uint32_t>(last3);
}

// The bytes need not be loaded in little-endian order. This particular order
// (and the duplication of some bytes depending on "size_mod4") was chosen for
// computational convenience and can no longer be changed because it is part
// of the HighwayHash length padding definition.
HH_INLINE uint64_t operator()(AllowUnordered, const char* from,
HH_INLINE uint32_t operator()(AllowUnordered, const char* from,
const size_t size_mod4) {
uint64_t last3 = 0;
uint32_t last3 = 0;
// Not allowed to read any bytes; early-out is faster than reading from a
// constant array of zeros.
if (size_mod4 == 0) {
Expand All @@ -89,30 +89,30 @@ class Load3 {
// These indices are chosen as an easy-to-compute sequence containing the
// same elements as [0, size), but repeated and/or reordered. This enables
// unconditional loads, which outperform conditional 8 or 16+8 bit loads.
const uint64_t idx0 = 0;
const uint64_t idx1 = size_mod4 >> 1;
const uint64_t idx2 = size_mod4 - 1;
const size_t idx0 = 0;
const size_t idx1 = size_mod4 >> 1;
const size_t idx2 = size_mod4 - 1;
// Store into least significant bytes (avoids one shift).
last3 = U64FromChar(from[idx0]);
last3 += U64FromChar(from[idx1]) << 8;
last3 += U64FromChar(from[idx2]) << 16;
last3 = U32FromChar(from[idx0]);
last3 += U32FromChar(from[idx1]) << 8;
last3 += U32FromChar(from[idx2]) << 16;
return last3;
}

// Must read exactly [0, size) bytes in little-endian order.
HH_INLINE uint64_t operator()(AllowNone, const char* from,
HH_INLINE uint32_t operator()(AllowNone, const char* from,
const size_t size_mod4) {
// We need to load in little-endian order without accessing anything outside
// [from, from + size_mod4). Unrolling is faster than looping backwards.
uint64_t last3 = 0;
uint32_t last3 = 0;
if (size_mod4 >= 1) {
last3 += U64FromChar(from[0]);
last3 += U32FromChar(from[0]);
}
if (size_mod4 >= 2) {
last3 += U64FromChar(from[1]) << 8;
last3 += U32FromChar(from[1]) << 8;
}
if (size_mod4 == 3) {
last3 += U64FromChar(from[2]) << 16;
last3 += U32FromChar(from[2]) << 16;
}
return last3;
}
Expand All @@ -122,10 +122,6 @@ class Load3 {
return static_cast<uint32_t>(static_cast<unsigned char>(c));
}

static HH_INLINE uint64_t U64FromChar(const char c) {
return static_cast<uint64_t>(static_cast<unsigned char>(c));
}

static HH_INLINE void Copy(const char* HH_RESTRICT from, const size_t size,
char* HH_RESTRICT to) {
#if HH_MSC_VERSION
Expand Down
2 changes: 1 addition & 1 deletion highwayhash/os_specific.cc
Original file line number Diff line number Diff line change
Expand Up @@ -249,7 +249,7 @@ void PinThreadToRandomCPU() {

PinThreadToCPU(cpu);

#if HH_ARCH_X64
#if HH_ARCH_X86_X64
// After setting affinity, we should be running on the desired CPU.
printf("Running on CPU #%d, APIC ID %02x\n", cpu, ApicId());
#else
Expand Down
8 changes: 4 additions & 4 deletions highwayhash/tsc_timer.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ inline uint64_t Start<uint64_t>() {
asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
#elif HH_ARCH_AARCH64
asm volatile("mrs %0, cntvct_el0" : "=r"(t));
#elif HH_ARCH_X64 && HH_MSC_VERSION
#elif HH_ARCH_X86_X64 && HH_MSC_VERSION
_mm_lfence();
HH_COMPILER_FENCE;
t = __rdtsc();
Expand Down Expand Up @@ -128,7 +128,7 @@ inline uint64_t Stop<uint64_t>() {
asm volatile("mfspr %0, %1" : "=r"(t) : "i"(268));
#elif HH_ARCH_AARCH64
asm volatile("mrs %0, cntvct_el0" : "=r"(t));
#elif HH_ARCH_X64 && HH_MSC_VERSION
#elif HH_ARCH_X86_X64 && HH_MSC_VERSION
HH_COMPILER_FENCE;
unsigned aux;
t = __rdtscp(&aux);
Expand Down Expand Up @@ -158,7 +158,7 @@ inline uint64_t Stop<uint64_t>() {
template <>
inline uint32_t Start<uint32_t>() {
uint32_t t;
#if HH_ARCH_X64 && HH_MSC_VERSION
#if HH_ARCH_X86_X64 && HH_MSC_VERSION
_mm_lfence();
HH_COMPILER_FENCE;
t = static_cast<uint32_t>(__rdtsc());
Expand All @@ -182,7 +182,7 @@ inline uint32_t Start<uint32_t>() {
template <>
inline uint32_t Stop<uint32_t>() {
uint32_t t;
#if HH_ARCH_X64 && HH_MSC_VERSION
#if HH_ARCH_X86_X64 && HH_MSC_VERSION
HH_COMPILER_FENCE;
unsigned aux;
t = static_cast<uint32_t>(__rdtscp(&aux));
Expand Down
Loading