From 5a57e692503da61f649b235c20221b87372f6479 Mon Sep 17 00:00:00 2001 From: TheNumbat Date: Tue, 3 Mar 2026 13:33:19 -0500 Subject: [PATCH] fix cvtt range --- simde/x86/avx.h | 38 ++++++++++++++++++++++++++------------ test/x86/avx.c | 40 ++++++++++++++++++++++++++++++++++------ 2 files changed, 60 insertions(+), 18 deletions(-) diff --git a/simde/x86/avx.h b/simde/x86/avx.h index f30a315a6..fa0b140ac 100644 --- a/simde/x86/avx.h +++ b/simde/x86/avx.h @@ -3823,7 +3823,7 @@ simde__m128i simde_mm256_cvttpd_epi32 (simde__m256d a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_cvttpd_epi32(a); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) simde__m256i_private a_; a_.i256 = __lasx_xvftintrz_w_d(a, a); a_.i256 = __lasx_xvpermi_d(a_.i256, 0xd8); @@ -3832,13 +3832,20 @@ simde_mm256_cvttpd_epi32 (simde__m256d a) { simde__m128i_private r_; simde__m256d_private a_ = simde__m256d_to_private(a); - #if defined(simde_math_trunc) + #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + r_.m64[0] = simde_mm_cvttpd_pi32(a_.m128d[0]); + r_.m64[1] = simde_mm_cvttpd_pi32(a_.m128d[1]); + #else SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { - r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_trunc(a_.f64[i])); + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + simde_float64 v = simde_math_trunc(a_.f64[i]); + #if defined(SIMDE_FAST_CONVERSION_RANGE) + r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v); + #else + r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ? + SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN; + #endif } - #else - HEDLEY_UNREACHABLE(); #endif return simde__m128i_from_private(r_); @@ -3854,19 +3861,26 @@ simde__m256i simde_mm256_cvttps_epi32 (simde__m256 a) { #if defined(SIMDE_X86_AVX_NATIVE) return _mm256_cvttps_epi32(a); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) + #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) return __lasx_xvftintrz_w_s(a); #else simde__m256i_private r_; simde__m256_private a_ = simde__m256_to_private(a); - #if defined(simde_math_truncf) + #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) + r_.m128i[0] = simde_mm_cvttps_epi32(a_.m128[0]); + r_.m128i[1] = simde_mm_cvttps_epi32(a_.m128[1]); + #else SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_truncf(a_.f32[i])); + for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { + simde_float32 v = simde_math_truncf(a_.f32[i]); + #if defined(SIMDE_FAST_CONVERSION_RANGE) + r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v); + #else + r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ? + SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN; + #endif } - #else - HEDLEY_UNREACHABLE(); #endif return simde__m256i_from_private(r_); diff --git a/test/x86/avx.c b/test/x86/avx.c index 4da442e24..f3905184f 100644 --- a/test/x86/avx.c +++ b/test/x86/avx.c @@ -7125,9 +7125,9 @@ test_simde_mm256_cvtpd_epi32(SIMDE_MUNIT_TEST_ARGS) { #endif #if !defined(SIMDE_FAST_CONVERSION_RANGE) { simde_mm256_set_pd( - HEDLEY_STATIC_CAST(simde_float64, HEDLEY_STATIC_CAST(int64_t, INT32_MAX) + 1), + HEDLEY_STATIC_CAST(simde_float64, HEDLEY_STATIC_CAST(int64_t, INT32_MAX) + 1), HEDLEY_STATIC_CAST(simde_float64, HEDLEY_STATIC_CAST(int64_t, INT32_MAX) - 100), - HEDLEY_STATIC_CAST(simde_float64, HEDLEY_STATIC_CAST(int64_t, INT32_MIN) - 1), + HEDLEY_STATIC_CAST(simde_float64, HEDLEY_STATIC_CAST(int64_t, INT32_MIN) - 1), HEDLEY_STATIC_CAST(simde_float64, HEDLEY_STATIC_CAST(int64_t, INT32_MIN) + 100)), simde_mm_set_epi32( INT32_MIN, INT32_C(2147483547), INT32_MIN, -INT32_C(2147483548)) }, @@ -7218,9 +7218,9 @@ test_simde_mm256_cvtps_epi32(SIMDE_MUNIT_TEST_ARGS) { #endif #if !defined(SIMDE_FAST_CONVERSION_RANGE) { simde_mm256_set_ps( - HEDLEY_STATIC_CAST(simde_float32, HEDLEY_STATIC_CAST(int64_t, INT32_MAX) + 1), + HEDLEY_STATIC_CAST(simde_float32, HEDLEY_STATIC_CAST(int64_t, INT32_MAX) + 1), HEDLEY_STATIC_CAST(simde_float32, HEDLEY_STATIC_CAST(int64_t, INT32_MAX) - 100), - HEDLEY_STATIC_CAST(simde_float32, HEDLEY_STATIC_CAST(int64_t, INT32_MIN) - 1), + HEDLEY_STATIC_CAST(simde_float32, HEDLEY_STATIC_CAST(int64_t, INT32_MIN) - 1), HEDLEY_STATIC_CAST(simde_float32, HEDLEY_STATIC_CAST(int64_t, INT32_MIN) + 100), 0.f, 0.f, 0.f, 0.f), simde_mm256_set_epi32( @@ -7436,7 +7436,20 @@ test_simde_mm256_cvttpd_epi32(SIMDE_MUNIT_TEST_ARGS) { const struct { simde__m256d a; simde__m128i r; - } test_vec[8] = { + } test_vec[] = { + #if !defined(SIMDE_FAST_NANS) + { simde_mm256_set_pd(SIMDE_MATH_NAN, -SIMDE_MATH_NAN, 0.0, 0.0), + simde_mm_set_epi32( INT32_MIN, INT32_MIN, 0, 0) }, + #endif + #if !defined(SIMDE_FAST_CONVERSION_RANGE) + { simde_mm256_set_pd( + HEDLEY_STATIC_CAST(simde_float64, HEDLEY_STATIC_CAST(int64_t, INT32_MAX) + 1), + HEDLEY_STATIC_CAST(simde_float64, HEDLEY_STATIC_CAST(int64_t, INT32_MAX) - 100), + HEDLEY_STATIC_CAST(simde_float64, HEDLEY_STATIC_CAST(int64_t, INT32_MIN) - 1), + HEDLEY_STATIC_CAST(simde_float64, HEDLEY_STATIC_CAST(int64_t, INT32_MIN) + 100)), + simde_mm_set_epi32( + INT32_MIN, INT32_C(2147483547), INT32_MIN, -INT32_C(2147483548)) }, + #endif { simde_mm256_set_pd(SIMDE_FLOAT64_C( -175.82), SIMDE_FLOAT64_C( -91.19), SIMDE_FLOAT64_C( -855.64), SIMDE_FLOAT64_C(-1000.00)), simde_mm_set_epi32(INT32_C(-175), INT32_C( -91), INT32_C(-855), INT32_C(-1000)) }, @@ -7476,7 +7489,22 @@ test_simde_mm256_cvttps_epi32(SIMDE_MUNIT_TEST_ARGS) { const struct { simde__m256 a; simde__m256i r; - } test_vec[8] = { + } test_vec[] = { + #if !defined(SIMDE_FAST_NANS) + { simde_mm256_set_ps(SIMDE_MATH_NAN, -SIMDE_MATH_NAN, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f), + simde_mm256_set_epi32( INT32_MIN, INT32_MIN, 0, 0, 0, 0, 0, 0) }, + #endif + #if !defined(SIMDE_FAST_CONVERSION_RANGE) + { simde_mm256_set_ps( + HEDLEY_STATIC_CAST(simde_float32, HEDLEY_STATIC_CAST(int64_t, INT32_MAX) + 1), + HEDLEY_STATIC_CAST(simde_float32, HEDLEY_STATIC_CAST(int64_t, INT32_MAX) - 100), + HEDLEY_STATIC_CAST(simde_float32, HEDLEY_STATIC_CAST(int64_t, INT32_MIN) - 1), + HEDLEY_STATIC_CAST(simde_float32, HEDLEY_STATIC_CAST(int64_t, INT32_MIN) + 100), + 0.f, 0.f, 0.f, 0.f), + simde_mm256_set_epi32( + INT32_MIN, INT32_C(2147483520), INT32_MIN, -INT32_C(2147483520), + 0, 0, 0, 0) }, + #endif { simde_mm256_set_ps(SIMDE_FLOAT32_C( -135.75), SIMDE_FLOAT32_C( 534.39), SIMDE_FLOAT32_C( -81.93), SIMDE_FLOAT32_C( -234.94), SIMDE_FLOAT32_C( -390.94), SIMDE_FLOAT32_C( -625.05),