diff --git a/test_conformance/workgroups/test_wg_scan_reduce.cpp b/test_conformance/workgroups/test_wg_scan_reduce.cpp index 1efe78bc6c..d7dd291499 100644 --- a/test_conformance/workgroups/test_wg_scan_reduce.cpp +++ b/test_conformance/workgroups/test_wg_scan_reduce.cpp @@ -183,7 +183,7 @@ template struct Reduce } static int verify(Type *inptr, Type *outptr, size_t n_elems, - size_t max_wg_size, const Type &max_err = 0) + size_t max_wg_size, const Type *const max_err = nullptr) { for (size_t i = 0; i < n_elems; i += max_wg_size) { @@ -197,7 +197,8 @@ template struct Reduce for (size_t j = 0; j < wg_size; j++) { - if (check_result(outptr[i + j], result, max_err) != CL_SUCCESS) + if (check_result(outptr[i + j], result, max_err[0]) + != CL_SUCCESS) { log_info("%s_%s: Error at %zu\n", testName, testOpName, i + j); @@ -209,7 +210,8 @@ template struct Reduce } static void generate_input_values(Type *inptr, size_t n_elems, - size_t max_wg_size, Type &max_err = 0) + size_t max_wg_size, + Type *const max_err = nullptr) { MTdataHolder d(gRandomSeed); if constexpr (std::is_floating_point_v< @@ -246,7 +248,7 @@ template struct Reduce float s = 0.f; for (auto it = ref_vals.begin(); it != ref_vals.end(); ++it) s += std::abs(cl_half_to_float(*it)); - max_err = cl_half_from_float( + max_err[0] = cl_half_from_float( fabs((max_wg_size - 1) * CL_HALF_EPSILON * s), gHalfRoundingMode); } @@ -274,7 +276,7 @@ template struct Reduce // Higham, N. J. (2002). Accuracy and Stability of Numerical // Algorithms (2nd ed.), Chapter 4: Summation, Section 2: // Error Analysis (worst case error summation) - max_err = (max_wg_size - 1) + max_err[0] = (max_wg_size - 1) * (std::is_same_v ? CL_FLT_EPSILON : CL_DBL_EPSILON) * abs_sum; @@ -299,8 +301,30 @@ template struct ScanInclusive static constexpr const char *deviceTypeName = TestTypeInfo::deviceName; static constexpr const char *kernelName = "test_wg_scan_inclusive"; + + static int check_result(const Type &test_value, const Type &reference, + const Type &max_err = 0) + { + if constexpr (std::is_floating_point_v) + { + if (std::abs(reference - test_value) > max_err) return -1; + } + else if constexpr (std::is_same_v) + { + if (std::abs(cl_half_to_float(reference) + - cl_half_to_float(test_value)) + > cl_half_to_float(max_err)) + return -1; + } + else + { + if (reference != test_value) return -1; + } + return CL_SUCCESS; + } + static int verify(Type *inptr, Type *outptr, size_t n_elems, - size_t max_wg_size, const Type &max_err = 0) + size_t max_wg_size, const Type *const max_err = nullptr) { for (size_t i = 0; i < n_elems; i += max_wg_size) { @@ -310,7 +334,8 @@ template struct ScanInclusive for (size_t j = 0; j < wg_size; ++j) { result = C::combine(result, inptr[i + j]); - if (result != outptr[i + j]) + if (check_result(outptr[i + j], result, max_err[j]) + != CL_SUCCESS) { log_info("%s_%s: Error at %zu\n", testName, testOpName, i + j); @@ -322,10 +347,80 @@ template struct ScanInclusive } static void generate_input_values(Type *inptr, size_t n_elems, - size_t max_wg_size, Type &max_err = 0) + size_t max_wg_size, + Type *const max_err = nullptr) { MTdataHolder d(gRandomSeed); - for (size_t i = 0; i < n_elems; i++) inptr[i] = (Type)genrand_int64(d); + if constexpr (std::is_floating_point_v< + Type> || std::is_same_v) + { + std::vector ref_vals(max_wg_size, 0); + if constexpr (std::is_same_v) + { + // to prevent overflow limit range of randomization + float max_range = 99.0; + float min_range = -99.0; + // generate reference values for one work group + for (size_t j = 0; j < max_wg_size; j++) + ref_vals[j] = cl_half_from_float( + get_random_float(min_range, max_range, d), + gHalfRoundingMode); + + // populate reference data across all work groups + for (size_t i = 0; i < (size_t)n_elems; i += max_wg_size) + { + size_t wg_size = std::min(max_wg_size, n_elems - i); + memcpy(&inptr[i], ref_vals.data(), sizeof(Type) * wg_size); + } + + if constexpr (std::is_same_v>) + { + // compute maximal summation error + float s = std::abs(cl_half_to_float(ref_vals[0])); + for (size_t i = 1; i < (size_t)n_elems; i++) + { + s += std::abs(cl_half_to_float(ref_vals[i])); + max_err[i] = cl_half_from_float( + std::abs((max_wg_size - 1) * CL_HALF_EPSILON * s), + gHalfRoundingMode); + } + } + } + else + { + double max_range = 999.0; + double min_range = -999.0; + for (size_t j = 0; j < max_wg_size; j++) + ref_vals[j] = get_random_float(min_range, max_range, d); + + for (size_t i = 0; i < (size_t)n_elems; i += max_wg_size) + { + size_t work_group_size = std::min(max_wg_size, n_elems - i); + memcpy(&inptr[i], ref_vals.data(), + sizeof(Type) * work_group_size); + } + + if constexpr (std::is_same_v>) + { + // compute maximal summation error + Type s = std::abs(ref_vals[0]); + for (size_t i = 1; i < (size_t)n_elems; i++) + { + s += std::abs(ref_vals[i]); + max_err[i] = std::abs((max_wg_size - 1) + * (std::is_same_v + ? CL_FLT_EPSILON + : CL_DBL_EPSILON) + * s); + } + } + } + } + else + { + for (size_t i = 0; i < n_elems; i++) + inptr[i] = (Type)genrand_int64(d); + } } }; @@ -340,7 +435,7 @@ template struct ScanExclusive TestTypeInfo::deviceName; static constexpr const char *kernelName = "test_wg_scan_exclusive"; static int verify(Type *inptr, Type *outptr, size_t n_elems, - size_t max_wg_size, const Type &max_err = 0) + size_t max_wg_size, const Type *const max_err = nullptr) { for (size_t i = 0; i < n_elems; i += max_wg_size) { @@ -362,7 +457,8 @@ template struct ScanExclusive } static void generate_input_values(Type *inptr, size_t n_elems, - size_t max_wg_size, Type &max_err = 0) + size_t max_wg_size, + const Type *max_err = nullptr) { MTdataHolder d(gRandomSeed); for (size_t i = 0; i < n_elems; i++) inptr[i] = (Type)genrand_int64(d); @@ -409,14 +505,14 @@ static int run_test(cl_device_id device, cl_context context, sizeof(T) * n_elems, NULL, &err); test_error(err, "Unable to create destination buffer"); - std::vector input_ptr(n_elems); + std::vector input_vec(n_elems); - T max_err = 0; - TestInfo::generate_input_values(input_ptr.data(), n_elems, wg_size[0], - max_err); + std::vector max_err_vec(n_elems, 0); + TestInfo::generate_input_values(input_vec.data(), n_elems, wg_size[0], + max_err_vec.data()); err = clEnqueueWriteBuffer(queue, src, CL_TRUE, 0, sizeof(T) * n_elems, - input_ptr.data(), 0, NULL, NULL); + input_vec.data(), 0, NULL, NULL); test_error(err, "clWriteBuffer to initialize src buffer failed"); err = clSetKernelArg(kernel, 0, sizeof(src), &src); @@ -437,8 +533,8 @@ static int run_test(cl_device_id device, cl_context context, output_ptr.data(), 0, NULL, NULL); test_error(err, "clEnqueueReadBuffer to read read dst buffer failed"); - if (TestInfo::verify(input_ptr.data(), output_ptr.data(), n_elems, - wg_size[0], max_err)) + if (TestInfo::verify(input_vec.data(), output_ptr.data(), n_elems, + wg_size[0], max_err_vec.data())) { log_error("%s_%s %s verify failed\n", TestInfo::testName, TestInfo::testOpName, TestInfo::deviceTypeName); @@ -450,6 +546,60 @@ static int run_test(cl_device_id device, cl_context context, return TEST_PASS; } +template