36 changes: 19 additions & 17 deletions crates/core_arch/src/x86/avx2.rs
@@ -1841,20 +1841,14 @@ pub const fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) ->
#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaddwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
// It's a trick used in the Adler-32 algorithm to perform a widening addition.
//
// ```rust
// #[target_feature(enable = "avx2")]
// unsafe fn widening_add(mad: __m256i) -> __m256i {
// _mm256_madd_epi16(mad, _mm256_set1_epi16(1))
// }
// ```
//
// If we implement this using generic vector intrinsics, the optimizer
// will eliminate this pattern, and `vpmaddwd` will no longer be emitted.
// For this reason, we use x86 intrinsics.
unsafe { transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) }
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
unsafe {
let r: i32x16 = simd_mul(simd_cast(a.as_i16x16()), simd_cast(b.as_i16x16()));
let even: i32x8 = simd_shuffle!(r, r, [0, 2, 4, 6, 8, 10, 12, 14]);
let odd: i32x8 = simd_shuffle!(r, r, [1, 3, 5, 7, 9, 11, 13, 15]);
simd_add(even, odd).as_m256i()
}
}

/// Vertically multiplies each unsigned 8-bit integer from `a` with the
@@ -3819,8 +3813,6 @@ pub const fn _mm256_extract_epi16<const INDEX: i32>(a: __m256i) -> i32 {

#[allow(improper_ctypes)]
unsafe extern "C" {
#[link_name = "llvm.x86.avx2.pmadd.wd"]
fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
#[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
fn pmaddubsw(a: u8x32, b: i8x32) -> i16x16;
#[link_name = "llvm.x86.avx2.mpsadbw"]
Expand Down Expand Up @@ -4669,14 +4661,24 @@ mod tests {
}

#[simd_test(enable = "avx2")]
fn test_mm256_madd_epi16() {
const fn test_mm256_madd_epi16() {
let a = _mm256_set1_epi16(2);
let b = _mm256_set1_epi16(4);
let r = _mm256_madd_epi16(a, b);
let e = _mm256_set1_epi32(16);
assert_eq_m256i(r, e);
}

#[target_feature(enable = "avx2")]
#[cfg_attr(test, assert_instr(vpmaddwd))]
unsafe fn test_mm256_madd_epi16_mul_one(mad: __m256i) -> __m256i {
// This is a trick used in the adler32 algorithm to get a widening addition. The
// multiplication by 1 is trivial, but must not be optimized out because then the vpmaddwd
// instruction is no longer selected. The assert_instr verifies that this is the case.
let one_v = _mm256_set1_epi16(1);
_mm256_madd_epi16(mad, one_v)
}

#[simd_test(enable = "avx2")]
const fn test_mm256_inserti128_si256() {
let a = _mm256_setr_epi64x(1, 2, 3, 4);
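For reference only (not part of this PR; the helper name below is made up for illustration), a minimal scalar sketch of the semantics that the new const implementations reproduce: `pmaddwd`/`vpmaddwd` widen adjacent 16-bit lanes, multiply them, and sum each pair into one 32-bit lane, which is exactly what the even/odd `simd_shuffle!` plus `simd_add` computes and what the Adler-32 widening-add trick in the test above relies on.

```rust
/// Scalar model of _mm_madd_epi16 / _mm256_madd_epi16 / _mm512_madd_epi16:
/// each output i32 lane is the sum of two adjacent widened 16-bit products.
fn madd_epi16_scalar(a: &[i16], b: &[i16]) -> Vec<i32> {
    assert_eq!(a.len(), b.len());
    a.chunks_exact(2)
        .zip(b.chunks_exact(2))
        .map(|(x, y)| x[0] as i32 * y[0] as i32 + x[1] as i32 * y[1] as i32)
        .collect()
}

// With a = [2, 2, ...] and b = [4, 4, ...] every lane is 2*4 + 2*4 = 16,
// matching the expectation in test_mm256_madd_epi16 above.
```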
64 changes: 35 additions & 29 deletions crates/core_arch/src/x86/avx512bw.rs
@@ -6321,20 +6321,22 @@ pub const unsafe fn _mm_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask16, a:
#[target_feature(enable = "avx512bw")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmaddwd))]
pub fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i {
// It's a trick used in the Adler-32 algorithm to perform a widening addition.
//
// ```rust
// #[target_feature(enable = "avx512bw")]
// unsafe fn widening_add(mad: __m512i) -> __m512i {
// _mm512_madd_epi16(mad, _mm512_set1_epi16(1))
// }
// ```
//
// If we implement this using generic vector intrinsics, the optimizer
// will eliminate this pattern, and `vpmaddwd` will no longer be emitted.
// For this reason, we use x86 intrinsics.
unsafe { transmute(vpmaddwd(a.as_i16x32(), b.as_i16x32())) }
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i {
unsafe {
let r: i32x32 = simd_mul(simd_cast(a.as_i16x32()), simd_cast(b.as_i16x32()));
let even: i32x16 = simd_shuffle!(
r,
r,
[0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
);
let odd: i32x16 = simd_shuffle!(
r,
r,
[1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31]
);
simd_add(even, odd).as_m512i()
}
}

/// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -6344,7 +6346,8 @@ pub fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i {
#[target_feature(enable = "avx512bw")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmaddwd))]
pub fn _mm512_mask_madd_epi16(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_mask_madd_epi16(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
unsafe {
let madd = _mm512_madd_epi16(a, b).as_i32x16();
transmute(simd_select_bitmask(k, madd, src.as_i32x16()))
@@ -6358,7 +6361,8 @@ pub fn _mm512_mask_madd_epi16(src: __m512i, k: __mmask16, a: __m512i, b: __m512i
#[target_feature(enable = "avx512bw")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmaddwd))]
pub fn _mm512_maskz_madd_epi16(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm512_maskz_madd_epi16(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
unsafe {
let madd = _mm512_madd_epi16(a, b).as_i32x16();
transmute(simd_select_bitmask(k, madd, i32x16::ZERO))
@@ -6372,7 +6376,8 @@ pub fn _mm512_maskz_madd_epi16(k: __mmask16, a: __m512i, b: __m512i) -> __m512i
#[target_feature(enable = "avx512bw,avx512vl")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmaddwd))]
pub fn _mm256_mask_madd_epi16(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_mask_madd_epi16(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
unsafe {
let madd = _mm256_madd_epi16(a, b).as_i32x8();
transmute(simd_select_bitmask(k, madd, src.as_i32x8()))
@@ -6386,7 +6391,8 @@ pub fn _mm256_mask_madd_epi16(src: __m256i, k: __mmask8, a: __m256i, b: __m256i)
#[target_feature(enable = "avx512bw,avx512vl")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmaddwd))]
pub fn _mm256_maskz_madd_epi16(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm256_maskz_madd_epi16(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
unsafe {
let madd = _mm256_madd_epi16(a, b).as_i32x8();
transmute(simd_select_bitmask(k, madd, i32x8::ZERO))
@@ -6400,7 +6406,8 @@ pub fn _mm256_maskz_madd_epi16(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
#[target_feature(enable = "avx512bw,avx512vl")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmaddwd))]
pub fn _mm_mask_madd_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_mask_madd_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
unsafe {
let madd = _mm_madd_epi16(a, b).as_i32x4();
transmute(simd_select_bitmask(k, madd, src.as_i32x4()))
@@ -6414,7 +6421,8 @@ pub fn _mm_mask_madd_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) ->
#[target_feature(enable = "avx512bw,avx512vl")]
#[stable(feature = "stdarch_x86_avx512", since = "1.89")]
#[cfg_attr(test, assert_instr(vpmaddwd))]
pub fn _mm_maskz_madd_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_maskz_madd_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
unsafe {
let madd = _mm_madd_epi16(a, b).as_i32x4();
transmute(simd_select_bitmask(k, madd, i32x4::ZERO))
@@ -12574,8 +12582,6 @@ unsafe extern "C" {
#[link_name = "llvm.x86.avx512.pmul.hr.sw.512"]
fn vpmulhrsw(a: i16x32, b: i16x32) -> i16x32;

#[link_name = "llvm.x86.avx512.pmaddw.d.512"]
fn vpmaddwd(a: i16x32, b: i16x32) -> i32x16;
#[link_name = "llvm.x86.avx512.pmaddubs.w.512"]
fn vpmaddubsw(a: u8x64, b: i8x64) -> i16x32;

@@ -17500,7 +17506,7 @@ mod tests {
}

#[simd_test(enable = "avx512bw")]
fn test_mm512_madd_epi16() {
const fn test_mm512_madd_epi16() {
let a = _mm512_set1_epi16(1);
let b = _mm512_set1_epi16(1);
let r = _mm512_madd_epi16(a, b);
@@ -17509,7 +17515,7 @@ }
}

#[simd_test(enable = "avx512bw")]
fn test_mm512_mask_madd_epi16() {
const fn test_mm512_mask_madd_epi16() {
let a = _mm512_set1_epi16(1);
let b = _mm512_set1_epi16(1);
let r = _mm512_mask_madd_epi16(a, 0, a, b);
@@ -17537,7 +17543,7 @@ }
}

#[simd_test(enable = "avx512bw")]
fn test_mm512_maskz_madd_epi16() {
const fn test_mm512_maskz_madd_epi16() {
let a = _mm512_set1_epi16(1);
let b = _mm512_set1_epi16(1);
let r = _mm512_maskz_madd_epi16(0, a, b);
@@ -17548,7 +17554,7 @@ }
}

#[simd_test(enable = "avx512bw,avx512vl")]
fn test_mm256_mask_madd_epi16() {
const fn test_mm256_mask_madd_epi16() {
let a = _mm256_set1_epi16(1);
let b = _mm256_set1_epi16(1);
let r = _mm256_mask_madd_epi16(a, 0, a, b);
@@ -17568,7 +17574,7 @@ }
}

#[simd_test(enable = "avx512bw,avx512vl")]
fn test_mm256_maskz_madd_epi16() {
const fn test_mm256_maskz_madd_epi16() {
let a = _mm256_set1_epi16(1);
let b = _mm256_set1_epi16(1);
let r = _mm256_maskz_madd_epi16(0, a, b);
@@ -17579,7 +17585,7 @@ }
}

#[simd_test(enable = "avx512bw,avx512vl")]
fn test_mm_mask_madd_epi16() {
const fn test_mm_mask_madd_epi16() {
let a = _mm_set1_epi16(1);
let b = _mm_set1_epi16(1);
let r = _mm_mask_madd_epi16(a, 0, a, b);
@@ -17590,7 +17596,7 @@ }
}

#[simd_test(enable = "avx512bw,avx512vl")]
fn test_mm_maskz_madd_epi16() {
const fn test_mm_maskz_madd_epi16() {
let a = _mm_set1_epi16(1);
let b = _mm_set1_epi16(1);
let r = _mm_maskz_madd_epi16(0, a, b);
26 changes: 9 additions & 17 deletions crates/core_arch/src/x86/sse2.rs
@@ -210,20 +210,14 @@ pub const fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i {
#[target_feature(enable = "sse2")]
#[cfg_attr(test, assert_instr(pmaddwd))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
// It's a trick used in the Adler-32 algorithm to perform a widening addition.
//
// ```rust
// #[target_feature(enable = "sse2")]
// unsafe fn widening_add(mad: __m128i) -> __m128i {
// _mm_madd_epi16(mad, _mm_set1_epi16(1))
// }
// ```
//
// If we implement this using generic vector intrinsics, the optimizer
// will eliminate this pattern, and `pmaddwd` will no longer be emitted.
// For this reason, we use x86 intrinsics.
unsafe { transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) }
#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
pub const fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
unsafe {
let r: i32x8 = simd_mul(simd_cast(a.as_i16x8()), simd_cast(b.as_i16x8()));
let even: i32x4 = simd_shuffle!(r, r, [0, 2, 4, 6]);
let odd: i32x4 = simd_shuffle!(r, r, [1, 3, 5, 7]);
simd_add(even, odd).as_m128i()
}
}

/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
@@ -3193,8 +3187,6 @@ unsafe extern "C" {
fn lfence();
#[link_name = "llvm.x86.sse2.mfence"]
fn mfence();
#[link_name = "llvm.x86.sse2.pmadd.wd"]
fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
#[link_name = "llvm.x86.sse2.psad.bw"]
fn psadbw(a: u8x16, b: u8x16) -> u64x2;
#[link_name = "llvm.x86.sse2.psll.w"]
@@ -3473,7 +3465,7 @@ mod tests {
}

#[simd_test(enable = "sse2")]
fn test_mm_madd_epi16() {
const fn test_mm_madd_epi16() {
let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
let b = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
let r = _mm_madd_epi16(a, b);