diff --git a/crates/core_arch/src/x86/avx2.rs b/crates/core_arch/src/x86/avx2.rs
index 83aef753c9..e9463d4331 100644
--- a/crates/core_arch/src/x86/avx2.rs
+++ b/crates/core_arch/src/x86/avx2.rs
@@ -1841,20 +1841,14 @@ pub const fn _mm256_inserti128_si256(a: __m256i, b: __m128i) ->
 #[target_feature(enable = "avx2")]
 #[cfg_attr(test, assert_instr(vpmaddwd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
-    // It's a trick used in the Adler-32 algorithm to perform a widening addition.
-    //
-    // ```rust
-    // #[target_feature(enable = "avx2")]
-    // unsafe fn widening_add(mad: __m256i) -> __m256i {
-    //     _mm256_madd_epi16(mad, _mm256_set1_epi16(1))
-    // }
-    // ```
-    //
-    // If we implement this using generic vector intrinsics, the optimizer
-    // will eliminate this pattern, and `vpmaddwd` will no longer be emitted.
-    // For this reason, we use x86 intrinsics.
-    unsafe { transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) }
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
+    unsafe {
+        let r: i32x16 = simd_mul(simd_cast(a.as_i16x16()), simd_cast(b.as_i16x16()));
+        let even: i32x8 = simd_shuffle!(r, r, [0, 2, 4, 6, 8, 10, 12, 14]);
+        let odd: i32x8 = simd_shuffle!(r, r, [1, 3, 5, 7, 9, 11, 13, 15]);
+        simd_add(even, odd).as_m256i()
+    }
 }
 
 /// Vertically multiplies each unsigned 8-bit integer from `a` with the
@@ -3819,8 +3813,6 @@ pub const fn _mm256_extract_epi16(a: __m256i) -> i32 {
 
 #[allow(improper_ctypes)]
 unsafe extern "C" {
-    #[link_name = "llvm.x86.avx2.pmadd.wd"]
-    fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
     #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
     fn pmaddubsw(a: u8x32, b: i8x32) -> i16x16;
     #[link_name = "llvm.x86.avx2.mpsadbw"]
@@ -4669,7 +4661,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx2")]
-    fn test_mm256_madd_epi16() {
+    const fn test_mm256_madd_epi16() {
         let a = _mm256_set1_epi16(2);
         let b = _mm256_set1_epi16(4);
         let r = _mm256_madd_epi16(a, b);
@@ -4677,6 +4669,16 @@ mod tests {
         assert_eq_m256i(r, e);
     }
 
+    #[target_feature(enable = "avx2")]
+    #[cfg_attr(test, assert_instr(vpmaddwd))]
+    unsafe fn test_mm256_madd_epi16_mul_one(mad: __m256i) -> __m256i {
+        // This is a trick used in the adler32 algorithm to get a widening addition. The
+        // multiplication by 1 is trivial, but must not be optimized out because then the vpmaddwd
+        // instruction is no longer selected. The assert_instr verifies that this is the case.
+        let one_v = _mm256_set1_epi16(1);
+        _mm256_madd_epi16(mad, one_v)
+    }
+
     #[simd_test(enable = "avx2")]
     const fn test_mm256_inserti128_si256() {
         let a = _mm256_setr_epi64x(1, 2, 3, 4);
diff --git a/crates/core_arch/src/x86/avx512bw.rs b/crates/core_arch/src/x86/avx512bw.rs
index 8e074fdcfa..e2d12cd972 100644
--- a/crates/core_arch/src/x86/avx512bw.rs
+++ b/crates/core_arch/src/x86/avx512bw.rs
@@ -6321,20 +6321,22 @@ pub const unsafe fn _mm_mask_storeu_epi8(mem_addr: *mut i8, mask: __mmask16, a:
 #[target_feature(enable = "avx512bw")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpmaddwd))]
-pub fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i {
-    // It's a trick used in the Adler-32 algorithm to perform a widening addition.
-    //
-    // ```rust
-    // #[target_feature(enable = "avx512bw")]
-    // unsafe fn widening_add(mad: __m512i) -> __m512i {
-    //     _mm512_madd_epi16(mad, _mm512_set1_epi16(1))
-    // }
-    // ```
-    //
-    // If we implement this using generic vector intrinsics, the optimizer
-    // will eliminate this pattern, and `vpmaddwd` will no longer be emitted.
-    // For this reason, we use x86 intrinsics.
-    unsafe { transmute(vpmaddwd(a.as_i16x32(), b.as_i16x32())) }
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i {
+    unsafe {
+        let r: i32x32 = simd_mul(simd_cast(a.as_i16x32()), simd_cast(b.as_i16x32()));
+        let even: i32x16 = simd_shuffle!(
+            r,
+            r,
+            [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
+        );
+        let odd: i32x16 = simd_shuffle!(
+            r,
+            r,
+            [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31]
+        );
+        simd_add(even, odd).as_m512i()
+    }
 }
 
 /// Multiply packed signed 16-bit integers in a and b, producing intermediate signed 32-bit integers. Horizontally add adjacent pairs of intermediate 32-bit integers, and pack the results in dst using writemask k (elements are copied from src when the corresponding mask bit is not set).
@@ -6344,7 +6346,8 @@ pub fn _mm512_madd_epi16(a: __m512i, b: __m512i) -> __m512i {
 #[target_feature(enable = "avx512bw")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpmaddwd))]
-pub fn _mm512_mask_madd_epi16(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm512_mask_madd_epi16(src: __m512i, k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
     unsafe {
         let madd = _mm512_madd_epi16(a, b).as_i32x16();
         transmute(simd_select_bitmask(k, madd, src.as_i32x16()))
@@ -6358,7 +6361,8 @@ pub fn _mm512_mask_madd_epi16(src: __m512i, k: __mmask16, a: __m512i, b: __m512i
 #[target_feature(enable = "avx512bw")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpmaddwd))]
-pub fn _mm512_maskz_madd_epi16(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm512_maskz_madd_epi16(k: __mmask16, a: __m512i, b: __m512i) -> __m512i {
     unsafe {
         let madd = _mm512_madd_epi16(a, b).as_i32x16();
         transmute(simd_select_bitmask(k, madd, i32x16::ZERO))
@@ -6372,7 +6376,8 @@ pub fn _mm512_maskz_madd_epi16(k: __mmask16, a: __m512i, b: __m512i) -> __m512i
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpmaddwd))]
-pub fn _mm256_mask_madd_epi16(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm256_mask_madd_epi16(src: __m256i, k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
     unsafe {
         let madd = _mm256_madd_epi16(a, b).as_i32x8();
         transmute(simd_select_bitmask(k, madd, src.as_i32x8()))
@@ -6386,7 +6391,8 @@ pub fn _mm256_mask_madd_epi16(src: __m256i, k: __mmask8, a: __m256i, b: __m256i)
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpmaddwd))]
-pub fn _mm256_maskz_madd_epi16(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm256_maskz_madd_epi16(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
     unsafe {
         let madd = _mm256_madd_epi16(a, b).as_i32x8();
         transmute(simd_select_bitmask(k, madd, i32x8::ZERO))
@@ -6400,7 +6406,8 @@ pub fn _mm256_maskz_madd_epi16(k: __mmask8, a: __m256i, b: __m256i) -> __m256i {
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpmaddwd))]
-pub fn _mm_mask_madd_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm_mask_madd_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
     unsafe {
         let madd = _mm_madd_epi16(a, b).as_i32x4();
         transmute(simd_select_bitmask(k, madd, src.as_i32x4()))
@@ -6414,7 +6421,8 @@ pub fn _mm_mask_madd_epi16(src: __m128i, k: __mmask8, a: __m128i, b: __m128i) ->
 #[target_feature(enable = "avx512bw,avx512vl")]
 #[stable(feature = "stdarch_x86_avx512", since = "1.89")]
 #[cfg_attr(test, assert_instr(vpmaddwd))]
-pub fn _mm_maskz_madd_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm_maskz_madd_epi16(k: __mmask8, a: __m128i, b: __m128i) -> __m128i {
     unsafe {
         let madd = _mm_madd_epi16(a, b).as_i32x4();
         transmute(simd_select_bitmask(k, madd, i32x4::ZERO))
@@ -12574,8 +12582,6 @@ unsafe extern "C" {
     #[link_name = "llvm.x86.avx512.pmul.hr.sw.512"]
     fn vpmulhrsw(a: i16x32, b: i16x32) -> i16x32;
 
-    #[link_name = "llvm.x86.avx512.pmaddw.d.512"]
-    fn vpmaddwd(a: i16x32, b: i16x32) -> i32x16;
     #[link_name = "llvm.x86.avx512.pmaddubs.w.512"]
     fn vpmaddubsw(a: u8x64, b: i8x64) -> i16x32;
 
@@ -17500,7 +17506,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw")]
-    fn test_mm512_madd_epi16() {
+    const fn test_mm512_madd_epi16() {
         let a = _mm512_set1_epi16(1);
         let b = _mm512_set1_epi16(1);
         let r = _mm512_madd_epi16(a, b);
@@ -17509,7 +17515,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw")]
-    fn test_mm512_mask_madd_epi16() {
+    const fn test_mm512_mask_madd_epi16() {
         let a = _mm512_set1_epi16(1);
         let b = _mm512_set1_epi16(1);
         let r = _mm512_mask_madd_epi16(a, 0, a, b);
@@ -17537,7 +17543,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw")]
-    fn test_mm512_maskz_madd_epi16() {
+    const fn test_mm512_maskz_madd_epi16() {
         let a = _mm512_set1_epi16(1);
         let b = _mm512_set1_epi16(1);
         let r = _mm512_maskz_madd_epi16(0, a, b);
@@ -17548,7 +17554,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    fn test_mm256_mask_madd_epi16() {
+    const fn test_mm256_mask_madd_epi16() {
         let a = _mm256_set1_epi16(1);
         let b = _mm256_set1_epi16(1);
         let r = _mm256_mask_madd_epi16(a, 0, a, b);
@@ -17568,7 +17574,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    fn test_mm256_maskz_madd_epi16() {
+    const fn test_mm256_maskz_madd_epi16() {
         let a = _mm256_set1_epi16(1);
         let b = _mm256_set1_epi16(1);
         let r = _mm256_maskz_madd_epi16(0, a, b);
@@ -17579,7 +17585,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    fn test_mm_mask_madd_epi16() {
+    const fn test_mm_mask_madd_epi16() {
         let a = _mm_set1_epi16(1);
         let b = _mm_set1_epi16(1);
         let r = _mm_mask_madd_epi16(a, 0, a, b);
@@ -17590,7 +17596,7 @@ mod tests {
     }
 
     #[simd_test(enable = "avx512bw,avx512vl")]
-    fn test_mm_maskz_madd_epi16() {
+    const fn test_mm_maskz_madd_epi16() {
         let a = _mm_set1_epi16(1);
         let b = _mm_set1_epi16(1);
         let r = _mm_maskz_madd_epi16(0, a, b);
diff --git a/crates/core_arch/src/x86/sse2.rs b/crates/core_arch/src/x86/sse2.rs
index f339a003df..ecd478511b 100644
--- a/crates/core_arch/src/x86/sse2.rs
+++ b/crates/core_arch/src/x86/sse2.rs
@@ -210,20 +210,14 @@ pub const fn _mm_avg_epu16(a: __m128i, b: __m128i) -> __m128i {
 #[target_feature(enable = "sse2")]
 #[cfg_attr(test, assert_instr(pmaddwd))]
 #[stable(feature = "simd_x86", since = "1.27.0")]
-pub fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
-    // It's a trick used in the Adler-32 algorithm to perform a widening addition.
-    //
-    // ```rust
-    // #[target_feature(enable = "sse2")]
-    // unsafe fn widening_add(mad: __m128i) -> __m128i {
-    //     _mm_madd_epi16(mad, _mm_set1_epi16(1))
-    // }
-    // ```
-    //
-    // If we implement this using generic vector intrinsics, the optimizer
-    // will eliminate this pattern, and `pmaddwd` will no longer be emitted.
-    // For this reason, we use x86 intrinsics.
-    unsafe { transmute(pmaddwd(a.as_i16x8(), b.as_i16x8())) }
+#[rustc_const_unstable(feature = "stdarch_const_x86", issue = "149298")]
+pub const fn _mm_madd_epi16(a: __m128i, b: __m128i) -> __m128i {
+    unsafe {
+        let r: i32x8 = simd_mul(simd_cast(a.as_i16x8()), simd_cast(b.as_i16x8()));
+        let even: i32x4 = simd_shuffle!(r, r, [0, 2, 4, 6]);
+        let odd: i32x4 = simd_shuffle!(r, r, [1, 3, 5, 7]);
+        simd_add(even, odd).as_m128i()
+    }
 }
 
 /// Compares packed 16-bit integers in `a` and `b`, and returns the packed
@@ -3193,8 +3187,6 @@ unsafe extern "C" {
     fn lfence();
     #[link_name = "llvm.x86.sse2.mfence"]
     fn mfence();
-    #[link_name = "llvm.x86.sse2.pmadd.wd"]
-    fn pmaddwd(a: i16x8, b: i16x8) -> i32x4;
     #[link_name = "llvm.x86.sse2.psad.bw"]
     fn psadbw(a: u8x16, b: u8x16) -> u64x2;
     #[link_name = "llvm.x86.sse2.psll.w"]
@@ -3473,7 +3465,7 @@ mod tests {
     }
 
     #[simd_test(enable = "sse2")]
-    fn test_mm_madd_epi16() {
+    const fn test_mm_madd_epi16() {
         let a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
         let b = _mm_setr_epi16(9, 10, 11, 12, 13, 14, 15, 16);
         let r = _mm_madd_epi16(a, b);
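
For reference, this is the widening-addition pattern described in the removed doc comments and exercised by the new `test_mm256_madd_epi16_mul_one` helper, as a minimal standalone sketch. It assumes an `x86_64` target (where SSE2 is part of the baseline) and the stable `std::arch` API; the `widening_add` helper and the `main` driver are illustrative and not part of the patch.

```rust
use std::arch::x86_64::*;

/// Adds adjacent pairs of 16-bit lanes into 32-bit lanes by multiplying with a
/// vector of ones. This is the Adler-32 trick that the `assert_instr(pmaddwd)`
/// / `assert_instr(vpmaddwd)` checks in the patch are meant to keep lowering
/// to the madd instruction.
#[target_feature(enable = "sse2")]
unsafe fn widening_add(mad: __m128i) -> __m128i {
    // Each 32-bit result lane is a[2i] * 1 + a[2i + 1] * 1.
    _mm_madd_epi16(mad, _mm_set1_epi16(1))
}

fn main() {
    // SSE2 is part of the x86_64 baseline, so calling the helper here is sound.
    unsafe {
        let v = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
        let sums = widening_add(v);
        let mut out = [0i32; 4];
        _mm_storeu_si128(out.as_mut_ptr() as *mut __m128i, sums);
        assert_eq!(out, [1 + 2, 3 + 4, 5 + 6, 7 + 8]);
    }
}
```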