From e940b04f13775f91d1288f92785a2addcb9c4428 Mon Sep 17 00:00:00 2001 From: Folkert de Vries Date: Sat, 31 Jan 2026 22:11:50 +0100 Subject: [PATCH] x86: use `intrinsics::simd` for `hadds`/`hsubs` --- crates/core_arch/src/x86/avx2.rs | 36 +++++++++++++++++++++++++------ crates/core_arch/src/x86/ssse3.rs | 22 ++++++++++++------- 2 files changed, 44 insertions(+), 14 deletions(-) diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs index 6a39a0aaf8..83aef753c9 100644 --- a/crates/core_arch/src/x86/avx2.rs +++ b/crates/core_arch/src/x86/avx2.rs @@ -991,7 +991,21 @@ pub const fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vphaddsw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(phaddsw(a.as_i16x16(), b.as_i16x16())) } + let a = a.as_i16x16(); + let b = b.as_i16x16(); + unsafe { + let even: i16x16 = simd_shuffle!( + a, + b, + [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30] + ); + let odd: i16x16 = simd_shuffle!( + a, + b, + [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31] + ); + simd_saturating_add(even, odd).as_m256i() + } } /// Horizontally subtract adjacent pairs of 16-bit integers in `a` and `b`. @@ -1047,7 +1061,21 @@ pub const fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i { #[cfg_attr(test, assert_instr(vphsubsw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i { - unsafe { transmute(phsubsw(a.as_i16x16(), b.as_i16x16())) } + let a = a.as_i16x16(); + let b = b.as_i16x16(); + unsafe { + let even: i16x16 = simd_shuffle!( + a, + b, + [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30] + ); + let odd: i16x16 = simd_shuffle!( + a, + b, + [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31] + ); + simd_saturating_sub(even, odd).as_m256i() + } } /// Returns values from `slice` at offsets determined by `offsets * scale`, @@ -3791,10 +3819,6 @@ pub const fn _mm256_extract_epi16(a: __m256i) -> i32 { #[allow(improper_ctypes)] unsafe extern "C" { - #[link_name = "llvm.x86.avx2.phadd.sw"] - fn phaddsw(a: i16x16, b: i16x16) -> i16x16; - #[link_name = "llvm.x86.avx2.phsub.sw"] - fn phsubsw(a: i16x16, b: i16x16) -> i16x16; #[link_name = "llvm.x86.avx2.pmadd.wd"] fn pmaddwd(a: i16x16, b: i16x16) -> i32x8; #[link_name = "llvm.x86.avx2.pmadd.ub.sw"] diff --git a/crates/core_arch/src/x86/ssse3.rs b/crates/core_arch/src/x86/ssse3.rs index 4426a3274c..1d7a97944a 100644 --- a/crates/core_arch/src/x86/ssse3.rs +++ b/crates/core_arch/src/x86/ssse3.rs @@ -188,7 +188,13 @@ pub const fn _mm_hadd_epi16(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(phaddsw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_hadds_epi16(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(phaddsw128(a.as_i16x8(), b.as_i16x8())) } + let a = a.as_i16x8(); + let b = b.as_i16x8(); + unsafe { + let even: i16x8 = simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]); + let odd: i16x8 = simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]); + simd_saturating_add(even, odd).as_m128i() + } } /// Horizontally adds the adjacent pairs of values contained in 2 packed @@ -240,7 +246,13 @@ pub const fn _mm_hsub_epi16(a: __m128i, b: __m128i) -> __m128i { #[cfg_attr(test, assert_instr(phsubsw))] #[stable(feature = "simd_x86", since = "1.27.0")] pub fn _mm_hsubs_epi16(a: __m128i, b: __m128i) -> __m128i { - unsafe { transmute(phsubsw128(a.as_i16x8(), b.as_i16x8())) } + let a = a.as_i16x8(); + let b = b.as_i16x8(); + unsafe { + let even: i16x8 = simd_shuffle!(a, b, [0, 2, 4, 6, 8, 10, 12, 14]); + let odd: i16x8 = simd_shuffle!(a, b, [1, 3, 5, 7, 9, 11, 13, 15]); + simd_saturating_sub(even, odd).as_m128i() + } } /// Horizontally subtract the adjacent pairs of values contained in 2 @@ -337,12 +349,6 @@ unsafe extern "C" { #[link_name = "llvm.x86.ssse3.pshuf.b.128"] fn pshufb128(a: u8x16, b: u8x16) -> u8x16; - #[link_name = "llvm.x86.ssse3.phadd.sw.128"] - fn phaddsw128(a: i16x8, b: i16x8) -> i16x8; - - #[link_name = "llvm.x86.ssse3.phsub.sw.128"] - fn phsubsw128(a: i16x8, b: i16x8) -> i16x8; - #[link_name = "llvm.x86.ssse3.pmadd.ub.sw.128"] fn pmaddubsw128(a: u8x16, b: i8x16) -> i16x8;