diff --git a/tests/cpp/operator/test_act.cu b/tests/cpp/operator/test_act.cu index ca5ccdc4ce..6edc6bd63b 100644 --- a/tests/cpp/operator/test_act.cu +++ b/tests/cpp/operator/test_act.cu @@ -124,6 +124,7 @@ void performTest(const size_t N, const size_t H) { fillUniform(&input); fillUniform(&ograd); setRandomScale(&output); + const float ref_scale = isFp8Type(otype) ? output.scale() : 1.0f; std::unique_ptr ref_output = std::make_unique(N*H); std::unique_ptr ref_igrad = std::make_unique(N*H); @@ -132,7 +133,7 @@ void performTest(const size_t N, const size_t H) { float ref_amax; compute_ref_act_cast(input.rowwise_cpu_dptr(), ref_output.get(), - output.scale(), &ref_amax, N, H); + ref_scale, &ref_amax, N, H); cudaDeviceSynchronize(); auto err = cudaGetLastError(); @@ -179,6 +180,7 @@ void performTestGLU(const size_t N, const size_t H) { fillUniform(&input); fillUniform(&ograd); setRandomScale(&output); + const float ref_scale = isFp8Type(otype) ? output.scale() : 1.0f; std::unique_ptr ref_output = std::make_unique(N * H); std::unique_ptr ref_igrad = std::make_unique(2 * N * H); @@ -187,7 +189,7 @@ void performTestGLU(const size_t N, const size_t H) { float ref_amax; compute_ref_glu_act_cast(input.rowwise_cpu_dptr(), ref_output.get(), - output.scale(), &ref_amax, N, H); + ref_scale, &ref_amax, N, H); cudaDeviceSynchronize(); auto err = cudaGetLastError(); @@ -197,8 +199,8 @@ void performTestGLU(const size_t N, const size_t H) { auto [atol, rtol] = getTolerances(DType::kFloat32); compareResults("amax", output.amax(), ref_amax, atol, rtol); if (output.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING) { - const float ref_scale = 1.f / output.scale(); - compareResults("scale_inv", *output.rowwise_cpu_scale_inv_ptr(), ref_scale, atol, rtol); + const float ref_scale_inv = 1.f / ref_scale; + compareResults("scale_inv", *output.rowwise_cpu_scale_inv_ptr(), ref_scale_inv, atol, rtol); } } auto [atol, rtol] = getTolerances(otype); diff --git a/tests/cpp/operator/test_cast.cu b/tests/cpp/operator/test_cast.cu index 35d9dd2efd..e8f48feef8 100644 --- a/tests/cpp/operator/test_cast.cu +++ b/tests/cpp/operator/test_cast.cu @@ -53,13 +53,14 @@ void performTest(const std::vector& shape) { fillUniform(&input); setRandomScale(&output_c); + const float ref_scale = isFp8Type(otype) ? 
output_c.scale() : 1.0f;
   nvte_quantize(input.data(), output_c.data(), 0);
   float ref_amax;
   compute_ref(input.rowwise_cpu_dptr(), ref_output_c.get(),
-              full_size, &ref_amax, output_c.scale());
+              full_size, &ref_amax, ref_scale);
   cudaDeviceSynchronize();
   auto err = cudaGetLastError();
@@ -67,7 +68,7 @@
   if (isFp8Type(otype)) {
     auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
     compareResults("amax", output_c.amax(), ref_amax, atol_amax, rtol_amax);
-    float ref_scale_inv = 1.f / output_c.scale();
+    float ref_scale_inv = 1.f / ref_scale;
     compareResults("scale_inv", output_c.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
   }
   auto [atol, rtol] = getTolerances(otype);
diff --git a/tests/cpp/operator/test_cast_current_scaling.cu b/tests/cpp/operator/test_cast_current_scaling.cu
index 4dd6cd2d58..7cca0d72e0 100644
--- a/tests/cpp/operator/test_cast_current_scaling.cu
+++ b/tests/cpp/operator/test_cast_current_scaling.cu
@@ -123,6 +123,7 @@ void performTest(const std::vector<size_t>& shape) {
   nvte_compute_amax(input.data(), output_c.data(), 0);
   QuantizationConfigWrapper config;
   nvte_compute_scale_from_amax(output_c.data(), config, 0);
+  // Null out the amax pointer to avoid the atomic amax update in the CUDA cast kernels; current per-tensor scaling has already computed the amax above
   amax_to_check = output_c.amax();
   output_c.set_tensor_amax_nullptr();
@@ -130,7 +131,7 @@ void performTest(const std::vector<size_t>& shape) {
   nvte_quantize(input.data(), output_c.data(), 0);
   float ref_amax;
-  float ref_scale;
+  float ref_scale = 1.0f;
   float ref_scale_inv;
   if (is_out_fp8){
     compute_amax_scale_ref(input.rowwise_cpu_dptr(),
   }
   compute_ref(input.rowwise_cpu_dptr(), ref_output_c.get(),
-              full_size, nullptr, is_out_fp8 ? output_c.scale() : 1.0f );
+              full_size, nullptr, ref_scale);
   cudaDeviceSynchronize();
   auto err = cudaGetLastError();
   ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err);
-  if (isFp8Type(otype)) {
+  if (is_out_fp8) {
     auto [atol_fp32, rtol_fp32] = getTolerances(DType::kFloat32);
     compareResults("amax", amax_to_check, ref_amax, 0.0f, rtol_fp32);
     compareResults("scale", output_c.scale(), ref_scale, 0.0f, rtol_fp32);
diff --git a/tests/cpp/operator/test_cast_dbias.cu b/tests/cpp/operator/test_cast_dbias.cu
index 18f07153c6..b7b5db48c3 100644
--- a/tests/cpp/operator/test_cast_dbias.cu
+++ b/tests/cpp/operator/test_cast_dbias.cu
@@ -74,13 +74,14 @@ void performTest(const std::vector<size_t>& shape) {
   fillUniform(&input);
   setRandomScale(&output_c);
+  const float ref_scale = isFp8Type(otype) ?
output_c.scale() : 1.0f; std::unique_ptr ref_output_c = std::make_unique(N*H); std::unique_ptr ref_output_dbias = std::make_unique(H); CType ref_amax; compute_ref_cast_dbias(input.rowwise_cpu_dptr(), - output_c.scale(), + ref_scale, ref_output_c.get(), &ref_amax, ref_output_dbias.get(), @@ -109,7 +110,7 @@ void performTest(const std::vector& shape) { if (isFp8Type(otype)) { auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); compareResults("amax", output_c.amax(), ref_amax, atol_amax, rtol_amax); - float ref_scale_inv = 1.f / output_c.scale(); + float ref_scale_inv = 1.f / ref_scale; compareResults("scale_inv", output_c.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax); } auto [atol, rtol] = getTolerances(otype); diff --git a/tests/cpp/operator/test_cast_dbias_dgelu.cu b/tests/cpp/operator/test_cast_dbias_dgelu.cu index 8213e5665a..d8b8a20e6f 100644 --- a/tests/cpp/operator/test_cast_dbias_dgelu.cu +++ b/tests/cpp/operator/test_cast_dbias_dgelu.cu @@ -84,6 +84,7 @@ void performTest(const std::vector& shape) { fillUniform(&input); fillUniform(&grad); setRandomScale(&output_c); + const float ref_scale = isFp8Type(otype) ? output_c.scale() : 1.0f; std::unique_ptr ref_output_c = std::make_unique(N*H); std::unique_ptr ref_output_dbias = std::make_unique(H); @@ -91,7 +92,7 @@ void performTest(const std::vector& shape) { CType ref_amax; compute_ref_cast_dbias_dgelu(input.rowwise_cpu_dptr(), grad.rowwise_cpu_dptr(), - output_c.scale(), + ref_scale, ref_output_c.get(), &ref_amax, ref_output_dbias.get(), @@ -123,7 +124,7 @@ void performTest(const std::vector& shape) { if (isFp8Type(otype)) { auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); compareResults("amax", output_c.amax(), ref_amax, atol_amax, rtol_amax); - float ref_scale_inv = 1.f / output_c.scale(); + float ref_scale_inv = 1.f / ref_scale; compareResults("scale_inv", output_c.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax); } diff --git a/tests/cpp/operator/test_cast_float8blockwise.cu b/tests/cpp/operator/test_cast_float8blockwise.cu index 8e9da91d08..d50589ca43 100644 --- a/tests/cpp/operator/test_cast_float8blockwise.cu +++ b/tests/cpp/operator/test_cast_float8blockwise.cu @@ -524,14 +524,12 @@ TEST_P(FusedCastFloat8BlockwiseTestSuite, TestFusedCastFloat8Blockwise) { // GTEST_SKIP(); // } - DACT_FUNC_SWITCH( - Act_type, OP, - TRANSFORMER_ENGINE_TYPE_SWITCH_FP16_FP32_ONLY( - input_type, InputType, - TRANSFORMER_ENGINE_TYPE_SWITCH_FP8_ONLY( - output_type, OutputType, - runTestCase(processing_method, matrix_size, rowwise, colwise, - fill_case, q_opts);););); + TRANSFORMER_ENGINE_TYPE_SWITCH_FP16_FP32_ONLY( + input_type, InputType, + TRANSFORMER_ENGINE_TYPE_SWITCH_FP8_ONLY( + output_type, OutputType, + runTestCase(processing_method, matrix_size, rowwise, colwise, + fill_case, q_opts););); } TEST_P(FusedCastFloat8VectorwiseTestSuite, TestFusedCastFloat8Vectorwise) { @@ -581,14 +579,12 @@ TEST_P(FusedCastFloat8VectorwiseTestSuite, TestFusedCastFloat8Vectorwise) { // GTEST_SKIP(); // } - DACT_FUNC_SWITCH( - Act_type, OP, - TRANSFORMER_ENGINE_TYPE_SWITCH_FP16_FP32_ONLY( - input_type, InputType, - TRANSFORMER_ENGINE_TYPE_SWITCH_FP8_ONLY( - output_type, OutputType, - runTestCaseOneDimensionalBlocks( - processing_method, matrix_size, rowwise, colwise, fill_case, q_opts);););); + TRANSFORMER_ENGINE_TYPE_SWITCH_FP16_FP32_ONLY( + input_type, InputType, + TRANSFORMER_ENGINE_TYPE_SWITCH_FP8_ONLY( + output_type, OutputType, + runTestCaseOneDimensionalBlocks( + processing_method, matrix_size, rowwise, colwise, 
fill_case, q_opts););); } std::string to_string(const ProcessingMethod method) { diff --git a/tests/cpp/operator/test_cast_gated_swiglu.cu b/tests/cpp/operator/test_cast_gated_swiglu.cu index 298b978f2a..5298cc7577 100644 --- a/tests/cpp/operator/test_cast_gated_swiglu.cu +++ b/tests/cpp/operator/test_cast_gated_swiglu.cu @@ -79,6 +79,7 @@ void performTest(const std::vector& shape) { fillUniform(&grad); fillUniform(&input); setRandomScale(&output_c); + const float ref_scale = isFp8Type(otype) ? output_c.scale() : 1.0f; std::unique_ptr ref_output_c = std::make_unique(input_size); @@ -91,7 +92,7 @@ void performTest(const std::vector& shape) { float ref_amax; compute_ref_cast_dgated_swiglu(grad.rowwise_cpu_dptr(), input.rowwise_cpu_dptr(), - output_c.scale(), + ref_scale, ref_output_c.get(), &ref_amax, rows, @@ -100,7 +101,7 @@ void performTest(const std::vector& shape) { if (isFp8Type(otype)) { auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); compareResults("amax", output_c.amax(), ref_amax, atol_amax, rtol_amax); - float ref_scale_inv = 1.f / output_c.scale(); + float ref_scale_inv = 1.f / ref_scale; compareResults("scale_inv", output_c.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax); } diff --git a/tests/cpp/operator/test_cast_nvfp4_transpose.cu b/tests/cpp/operator/test_cast_nvfp4_transpose.cu index 1f37520bc7..a8f58f8598 100644 --- a/tests/cpp/operator/test_cast_nvfp4_transpose.cu +++ b/tests/cpp/operator/test_cast_nvfp4_transpose.cu @@ -4,6 +4,15 @@ * See LICENSE for license information. ************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include + #include #include #include @@ -14,7 +23,6 @@ #include #include "../test_common.h" #include "transformer_engine/transformer_engine.h" -#include using namespace transformer_engine; using namespace test; @@ -309,40 +317,24 @@ void compute_ref(float (*OP)(const float), fp4e2m1x2* output_t, fp8e4m3* scales, fp8e4m3* scales_t, - const float global_amax, + const float* amax, const size_t rows, const size_t cols, const size_t scales_stride, const size_t scales_stride_t, const bool use_fast_math, const bool use_2d_quantization = false, - std::vector *rowwise_amax = nullptr) + const bool row_scaled_nvfp4 = false) { std::vector input_t = create_transpose(input, rows, cols); + NVTE_CHECK(!(use_2d_quantization && row_scaled_nvfp4), + "2D quantization and row-scaling are not supported together."); - if (rowwise_amax != nullptr) { - rowwise_amax->resize(rows, 0.0f); - for (size_t row = 0; row < rows; ++row) { - float row_amax = 0.0f; - for (size_t col = 0; col < cols; ++col) { - row_amax = fmaxf(row_amax, fabsf(static_cast(input[row * cols + col]))); - } - (*rowwise_amax)[row] = row_amax; - quantize_nvfp4(OP, - input + row * cols, - output + row * (cols / 2), - scales + row * scales_stride, - 1, - cols, - scales_stride, - row_amax, - use_fast_math, - use_2d_quantization); - } - } else if (use_2d_quantization) { + // Ref impl for 2D quantization + if (use_2d_quantization) { // Step 1: Compute mathematical 8×8 scaling factors std::vector> math_scales; - compute_2d_mathematical_scales(OP, input, rows, cols, global_amax, math_scales, use_fast_math); + compute_2d_mathematical_scales(OP, input, rows, cols, *amax, math_scales, use_fast_math); constexpr size_t block_size_Y = 16; constexpr size_t block_size_X = 16; @@ -369,17 +361,36 @@ void compute_ref(float (*OP)(const float), // Step 4: Process quantized outputs using the same algorithm as 
quantize_nvfp4_2d // (This part processes the actual FP4 data using the mathematical scaling factors) - quantize_nvfp4_2d(OP, input, output, nullptr, rows, cols, scales_stride, global_amax, + quantize_nvfp4_2d(OP, input, output, nullptr, rows, cols, scales_stride, *amax, use_fast_math); // scales already filled - quantize_nvfp4_2d(OP, input_t.data(), output_t, nullptr, cols, rows, scales_stride_t, global_amax, + quantize_nvfp4_2d(OP, input_t.data(), output_t, nullptr, cols, rows, scales_stride_t, *amax, use_fast_math); // scales_t already filled - } else { - quantize_nvfp4(OP, input, output, scales, rows, cols, scales_stride, global_amax, - use_fast_math, use_2d_quantization); - quantize_nvfp4(OP, input_t.data(), output_t, scales_t, cols, rows, scales_stride_t, global_amax, - use_fast_math, use_2d_quantization); + return; + } + + // Ref impl for row-scaling + if (row_scaled_nvfp4) { + for (size_t row = 0; row < rows; ++row) { + quantize_nvfp4(OP, + input + row * cols, + output + row * (cols / 2), + scales + row * scales_stride, + 1, + cols, + scales_stride, + amax[row], + use_fast_math, + use_2d_quantization); + } + return; } + + // Ref impl for basic NVFP4 + quantize_nvfp4(OP, input, output, scales, rows, cols, scales_stride, *amax, + use_fast_math, use_2d_quantization); + quantize_nvfp4(OP, input_t.data(), output_t, scales_t, cols, rows, scales_stride_t, *amax, + use_fast_math, use_2d_quantization); } void compare_nvfp4_tensors(const std::string& name, @@ -479,48 +490,7 @@ void dump_nvfp4_tensor_data(const std::string& prefix, } } -void print_detailed_tensor_comparison(const std::string& name, - const fp4e2m1 *test_data, const fp4e2m1 *ref_data, - const int rows, const int cols) { - printf("\n=== DETAILED COMPARISON for %s (%d×%d = %d elements) ===\n", - name.c_str(), rows, cols, rows * cols); - - const int total_elements = rows * cols; - const int check_count = 128; - - printf("--- FIRST %d ELEMENTS ---\n", check_count); - printf("Index | Test_Value | Ref_Value | Match\n"); - printf("------|---------------|---------------|-------\n"); - for (int i = 0; i < std::min(check_count, total_elements); ++i) { - double2 test_pair = cvt_fp4x2_to_double2(*reinterpret_cast(&test_data[i/2])); - double2 ref_pair = cvt_fp4x2_to_double2(*reinterpret_cast(&ref_data[i/2])); - - double t = (i % 2 == 0) ? test_pair.x : test_pair.y; - double r = (i % 2 == 0) ? ref_pair.x : ref_pair.y; - bool match = (fabs(t - r) < 1e-6); - - printf("%5d | %13.6f | %13.6f | %s\n", i, t, r, match ? "✓" : "✗"); - } - - if (total_elements > 2 * check_count) { - printf("\n--- LAST %d ELEMENTS ---\n", check_count); - printf("Index | Test_Value | Ref_Value | Match\n"); - printf("------|---------------|---------------|-------\n"); - for (int i = total_elements - check_count; i < total_elements; ++i) { - double2 test_pair = cvt_fp4x2_to_double2(*reinterpret_cast(&test_data[i/2])); - double2 ref_pair = cvt_fp4x2_to_double2(*reinterpret_cast(&ref_data[i/2])); - - double t = (i % 2 == 0) ? test_pair.x : test_pair.y; - double r = (i % 2 == 0) ? ref_pair.x : ref_pair.y; - bool match = (fabs(t - r) < 1e-6); - - printf("%5d | %13.6f | %13.6f | %s\n", i, t, r, match ? 
"✓" : "✗"); - } - } - printf("==================================\n"); -} - -void compareResults_nvfp4(const Tensor &test, +void compareResults_nvfp4(Tensor &test, const void *ref, const void *ref_t, const int rows, const int cols, double atol = 1e-5, double rtol = 1e-8, bool if_on_gpus = true, bool dump_data = false, bool compare_columnwise = true) { @@ -529,10 +499,6 @@ void compareResults_nvfp4(const Tensor &test, const fp4e2m1 *test_data = test.rowwise_cpu_dptr(); const fp4e2m1 *ref_data = reinterpret_cast(ref); - // Print detailed element-by-element comparison - // print_detailed_tensor_comparison("output", test_data, ref_data, rows, cols); - // print_detailed_tensor_comparison("output_t", test_data_t, ref_data_t, cols, rows); - // Optionally dump tensor data to files for detailed analysis if (dump_data) { dump_nvfp4_tensor_data("output", test_data, ref_data, rows, cols); @@ -549,9 +515,10 @@ void compareResults_nvfp4(const Tensor &test, } } -void compare_rowwise_amax(const Tensor &output, const std::vector &ref_amax) { - const std::vector test_amax_data = output.tensor_amax_values(); - ASSERT_EQ(test_amax_data.size(), ref_amax.size()); +void compare_rowwise_amax(Tensor &output, const std::vector &ref_amax) { + ASSERT_EQ(output.rowwise_amax_size(), ref_amax.size()); + const auto *amax_ptr = output.cpu_rowwise_amax_ptr(); + const std::vector test_amax_data(amax_ptr, amax_ptr + ref_amax.size()); for (size_t row = 0; row < ref_amax.size(); ++row) { ASSERT_EQ(test_amax_data[row], ref_amax[row]) << "Row-scaled amax mismatch at row " << row; @@ -568,6 +535,9 @@ void performTest(float (*OP)(const float), DType itype = TypeInfo::dtype; DType otype = DType::kFloat4E2M1; + const bool rowwise = true; + const bool columnwise = !row_scaled_nvfp4; + const size_t rows = first_dimension(shape); const size_t cols = last_dimension(shape); @@ -589,7 +559,7 @@ void performTest(float (*OP)(const float), const size_t scales_stride_t = blocks_X_t; Tensor input("input", shape, itype); - Tensor output("output", shape, otype, true, !row_scaled_nvfp4, NVTE_NVFP4_1D_SCALING); + Tensor output("output", shape, otype, rowwise, columnwise, NVTE_NVFP4_1D_SCALING); std::unique_ptr ref_output = std::make_unique(rows * (cols / 2)); std::unique_ptr ref_output_t = std::make_unique(cols * (rows / 2)); @@ -598,58 +568,65 @@ void performTest(float (*OP)(const float), fillCase(&input, InputsFillCase::uniform); - // Golden value of amax chosen to make the 2nd-stage scaling mantissa zero and avoid rounding issues - const float amax = 448.0f * 6.0f * 8.0f; - std::vector ref_rowwise_amax; - bool use_2d_quantization = false; + // Compute 2nd stage NVFP4 scaling factor + std::vector ref_amax; if (row_scaled_nvfp4) { - output.set_tensor_amax_shape({rows}); - output.set_row_scaled_nvfp4(true); - compute_ref(OP, - input.rowwise_cpu_dptr(), - ref_output.get(), - ref_output_t.get(), - ref_scales.get(), - ref_scales_t.get(), - 0.0f, - rows, - cols, - scales_stride, - scales_stride_t, - use_fast_math, - use_2d_quantization, - &ref_rowwise_amax); + // Compute per-row amaxes + const auto *input_vals = input.rowwise_cpu_dptr(); + for (size_t row = 0; row < rows; ++row){ + float row_amax = 0.0f; + for (size_t col = 0; col < cols; ++col) { + row_amax = fmaxf(row_amax, fabsf(static_cast(input_vals[row * cols + col]))); + } + ref_amax.push_back(row_amax); + } + + // Update tensor + // Note: No need to update amax like standard NVFP4, amaxes + // are computed during quantization. 
+ output.set_row_scaled_nvfp4(row_scaled_nvfp4); } else { - // Set 2nd stage NVFP4 scaling factor - output.set_tensor_amax(amax); - output.set_tensor_amax_columnwise(amax); - compute_ref(OP, - input.rowwise_cpu_dptr(), - ref_output.get(), - ref_output_t.get(), - ref_scales.get(), - ref_scales_t.get(), - amax, - rows, - cols, - scales_stride, - scales_stride_t, - use_fast_math, - use_2d_quantization); + // Golden value of amax chosen to make the 2nd-stage scaling mantissa zero and avoid rounding issues + ref_amax.assign(1, 448.0f * 6.0f * 8.0f); + + // Update tensor + if (rowwise) { + std::copy(ref_amax.begin(), ref_amax.end(), output.cpu_rowwise_amax_ptr()); + } + if (columnwise) { + std::copy(ref_amax.begin(), ref_amax.end(), output.cpu_columnwise_amax_ptr()); + } + output.from_cpu(); } + // Reference implementation + bool use_2d_quantization = false; + compute_ref(OP, + input.rowwise_cpu_dptr(), + ref_output.get(), + ref_output_t.get(), + ref_scales.get(), + ref_scales_t.get(), + ref_amax.data(), + rows, + cols, + scales_stride, + scales_stride_t, + use_fast_math, + use_2d_quantization, + row_scaled_nvfp4); + // Initialize stochastic rounding Tensor rng_state("rng_state", std::vector{2}, DType::kInt64); rng_state.rowwise_cpu_dptr()[0] = 123; // rng_seed rng_state.rowwise_cpu_dptr()[1] = 321; // rng_sequence rng_state.from_cpu(); + // Quantization options QuantizationConfigWrapper quant_config; quant_config.set_use_fast_math(use_fast_math); quant_config.set_stochastic_rounding(false); quant_config.set_rng_state(rng_state.data()); - - // Set 2D quantization based on compile-time flag quant_config.set_nvfp4_2d_quantization(use_2d_quantization); // Call appropriate function based on operation type @@ -696,9 +673,7 @@ void performTest(float (*OP)(const float), scale_mismatches_num); } - if (row_scaled_nvfp4) { - compare_rowwise_amax(output, ref_rowwise_amax); - } + compare_rowwise_amax(output, ref_amax); } std::vector> tensor_dims = { diff --git a/tests/cpp/operator/test_cast_transpose.cu b/tests/cpp/operator/test_cast_transpose.cu index 44c78e4a09..9a5dc959da 100644 --- a/tests/cpp/operator/test_cast_transpose.cu +++ b/tests/cpp/operator/test_cast_transpose.cu @@ -55,13 +55,13 @@ void performTest(const size_t N, const size_t H) { fillUniform(&input); setRandomScale(&output); + const float ref_scale = isFp8Type(otype) ? output.scale() : 1.0f; nvte_quantize(input.data(), output.data(), 0); float ref_amax; compute_ref(input.rowwise_cpu_dptr(), ref_output_c.get(), - ref_output_t.get(), N, H, &ref_amax, - output.scale()); + ref_output_t.get(), N, H, &ref_amax, ref_scale); cudaDeviceSynchronize(); auto err = cudaGetLastError(); @@ -69,7 +69,7 @@ void performTest(const size_t N, const size_t H) { if (isFp8Type(otype)) { auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); compareResults("amax", output.amax(), ref_amax, atol_amax, rtol_amax); - float ref_scale_inv = 1.f / output.scale(); + float ref_scale_inv = 1.f / ref_scale; compareResults("scale_inv", output.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax); } auto [atol, rtol] = getTolerances(otype); diff --git a/tests/cpp/operator/test_cast_transpose_dbias.cu b/tests/cpp/operator/test_cast_transpose_dbias.cu index 5b06b28327..f9303d34f5 100644 --- a/tests/cpp/operator/test_cast_transpose_dbias.cu +++ b/tests/cpp/operator/test_cast_transpose_dbias.cu @@ -73,6 +73,7 @@ void performTest(const size_t N, const size_t H) { fillUniform(&input); setRandomScale(&output); + const float ref_scale = isFp8Type(otype) ? 
output.scale() : 1.0f; std::unique_ptr ref_output_c = std::make_unique(N*H); std::unique_ptr ref_output_t = std::make_unique(N*H); @@ -80,7 +81,7 @@ void performTest(const size_t N, const size_t H) { CType ref_amax; compute_ref_cast_transpose_dbias(input.rowwise_cpu_dptr(), - output.scale(), + ref_scale, ref_output_c.get(), ref_output_t.get(), &ref_amax, @@ -111,7 +112,7 @@ void performTest(const size_t N, const size_t H) { if (isFp8Type(otype)) { auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); compareResults("amax", output.amax(), ref_amax, atol_amax, rtol_amax); - float ref_scale_inv = 1.f / output.scale(); + float ref_scale_inv = 1.f / ref_scale; compareResults("scale_inv", output.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax); } auto [atol, rtol] = getTolerances(otype); diff --git a/tests/cpp/operator/test_cast_transpose_dbias_dgelu.cu b/tests/cpp/operator/test_cast_transpose_dbias_dgelu.cu index 9a4a2fa080..31eafff80f 100644 --- a/tests/cpp/operator/test_cast_transpose_dbias_dgelu.cu +++ b/tests/cpp/operator/test_cast_transpose_dbias_dgelu.cu @@ -86,6 +86,7 @@ void performTest(const size_t N, const size_t H) { fillUniform(&input); fillUniform(&gelu_input); setRandomScale(&output); + const float ref_scale = isFp8Type(otype) ? output.scale() : 1.0f; std::unique_ptr ref_output_c = std::make_unique(N*H); std::unique_ptr ref_output_t = std::make_unique(N*H); @@ -94,7 +95,7 @@ void performTest(const size_t N, const size_t H) { CType ref_amax; compute_ref_cast_transpose_dbias_dgelu(input.rowwise_cpu_dptr(), gelu_input.rowwise_cpu_dptr(), - output.scale(), + ref_scale, ref_output_c.get(), ref_output_t.get(), &ref_amax, @@ -127,7 +128,7 @@ void performTest(const size_t N, const size_t H) { if (isFp8Type(otype)) { auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); compareResults("amax", output.amax(), ref_amax, atol_amax, rtol_amax); - float ref_scale_inv = 1.f / output.scale(); + float ref_scale_inv = 1.f / ref_scale; compareResults("scale_inv", output.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax); } diff --git a/tests/cpp/operator/test_cast_transpose_dgeglu.cu b/tests/cpp/operator/test_cast_transpose_dgeglu.cu index a87c0c5a42..15ecd3ab66 100644 --- a/tests/cpp/operator/test_cast_transpose_dgeglu.cu +++ b/tests/cpp/operator/test_cast_transpose_dgeglu.cu @@ -81,6 +81,7 @@ void performTest(const size_t N, const size_t H) { fillUniform(&grad); fillUniform(&input); setRandomScale(&output); + const float ref_scale = isFp8Type(otype) ? 
output.scale() : 1.0f; std::unique_ptr ref_output_c = std::make_unique(N * H * 2); std::unique_ptr ref_output_t = std::make_unique(N * H * 2); @@ -89,7 +90,7 @@ void performTest(const size_t N, const size_t H) { CType ref_amax; compute_ref_cast_transpose_dgated_gelu(grad.rowwise_cpu_dptr(), input.rowwise_cpu_dptr(), - output.scale(), ref_output_c.get(), ref_output_t.get(), + ref_scale, ref_output_c.get(), ref_output_t.get(), &ref_amax, N, H); cudaDeviceSynchronize(); @@ -99,7 +100,7 @@ void performTest(const size_t N, const size_t H) { if (isFp8Type(otype)) { auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); compareResults("amax", output.amax(), ref_amax, atol_amax, rtol_amax); - float ref_scale_inv = 1.f / output.scale(); + float ref_scale_inv = 1.f / ref_scale; compareResults("scale_inv", output.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax); } diff --git a/tests/cpp/operator/test_dequantize_nvfp4.cu b/tests/cpp/operator/test_dequantize_nvfp4.cu index ec405b1d90..eb9e8bce23 100644 --- a/tests/cpp/operator/test_dequantize_nvfp4.cu +++ b/tests/cpp/operator/test_dequantize_nvfp4.cu @@ -76,7 +76,7 @@ void compute_ref_dequantize_nvfp4(const uint8_t *packed_data, } template -float compute_amax(const test::Tensor &t, size_t rows, size_t cols) { +float compute_amax(test::Tensor &t, size_t rows, size_t cols) { t.to_cpu(); const auto *data = t.rowwise_cpu_dptr(); float amax = 0.0f; @@ -94,50 +94,63 @@ void performTest_dequantize_nvfp4(const size_t rows, const size_t cols, using namespace test; DType otype = TypeInfo::dtype; + // Tensors Tensor input("input", std::vector{rows, cols}, otype); - fillCase(&input, InputsFillCase::uniform); - Tensor quantized("quantized", std::vector{rows, cols}, DType::kFloat4E2M1, true, false, NVTE_NVFP4_1D_SCALING); + Tensor output("output", std::vector{rows, cols}, otype, true, false); + + // Fill input with random data + fillCase(&input, InputsFillCase::uniform); + + // Configure quantized tensor amax + size_t amax_size = 1; if (row_scaled_nvfp4) { - quantized.set_tensor_amax_shape({rows}); - quantized.set_row_scaled_nvfp4(true); + quantized.set_row_scaled_nvfp4(true); + amax_size = rows; } else if (rows > 0 && cols > 0) { - quantized.set_tensor_amax(compute_amax(input, rows, cols)); + quantized.set_amax(compute_amax(input, rows, cols)); } else { - quantized.set_tensor_amax(0.0f); + quantized.set_amax(0.0f); } + // Quantize if (rows > 0 && cols > 0) { nvte_quantize(input.data(), quantized.data(), 0); cudaDeviceSynchronize(); + auto err = cudaGetLastError(); + ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err); } - Tensor output("output", std::vector{rows, cols}, otype, true, false); + // Dequantize nvte_dequantize(quantized.data(), output.data(), 0); cudaDeviceSynchronize(); - auto err = cudaGetLastError(); ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err); - if (rows > 0 && cols > 0) { - quantized.to_cpu(); - const uint8_t *fp4_data = - reinterpret_cast(quantized.rowwise_cpu_dptr()); - const fp8e4m3 *scales = quantized.rowwise_cpu_scale_inv_ptr(); - const std::vector amax_val = quantized.tensor_amax_values(); - const NVTEShape scale_shape = quantized.rowwise_scale_inv_shape(); - const size_t scale_stride = scale_shape.data[scale_shape.ndim - 1]; - - std::unique_ptr ref_output = - std::make_unique(rows * cols); - compute_ref_dequantize_nvfp4( - fp4_data, scales, amax_val, ref_output.get(), - rows, cols, scale_stride); - - auto [atol, rtol] = getTolerances(otype); - compareResults("output_nvfp4", output, ref_output.get(), true, atol, 
rtol); + // Nothing to be done if tensor is empty + if (rows == 0 && cols == 0) { + return; } + + // Dequantize reference implementation + quantized.to_cpu(); + const uint8_t *fp4_data = + reinterpret_cast(quantized.rowwise_cpu_dptr()); + const fp8e4m3 *scales = quantized.rowwise_cpu_scale_inv_ptr(); + const auto *amax = quantized.cpu_rowwise_amax_ptr(); + const std::vector amax_vals(amax, amax + amax_size); + const NVTEShape scale_shape = quantized.rowwise_scale_inv_shape(); + const size_t scale_stride = scale_shape.data[scale_shape.ndim - 1]; + std::unique_ptr ref_output = + std::make_unique(rows * cols); + compute_ref_dequantize_nvfp4( + fp4_data, scales, amax_vals, ref_output.get(), + rows, cols, scale_stride); + + // Compare results from TE and reference impls + auto [atol, rtol] = getTolerances(otype); + compareResults("output_nvfp4", output, ref_output.get(), true, atol, rtol); } // Dequantize NVFP4 with GEMM-swizzled scales and compare against compact path. @@ -153,12 +166,11 @@ void performTest_dequantize_nvfp4_swizzled(const size_t rows, const size_t cols, Tensor quantized_compact("quantized_compact", std::vector{rows, cols}, DType::kFloat4E2M1, true, false, NVTE_NVFP4_1D_SCALING); if (row_scaled_nvfp4) { - quantized_compact.set_tensor_amax_shape({rows}); quantized_compact.set_row_scaled_nvfp4(true); } else if (rows > 0 && cols > 0) { - quantized_compact.set_tensor_amax(compute_amax(input, rows, cols)); + quantized_compact.set_amax(compute_amax(input, rows, cols)); } else { - quantized_compact.set_tensor_amax(0.0f); + quantized_compact.set_amax(0.0f); } if (rows > 0 && cols > 0) { @@ -175,10 +187,9 @@ void performTest_dequantize_nvfp4_swizzled(const size_t rows, const size_t cols, Tensor quantized_swizzled("quantized_swizzled", std::vector{rows, cols}, DType::kFloat4E2M1, true, false, NVTE_NVFP4_1D_SCALING); if (row_scaled_nvfp4) { - quantized_swizzled.set_tensor_amax_shape({rows}); quantized_swizzled.set_row_scaled_nvfp4(true); } else { - quantized_swizzled.set_tensor_amax(0.0f); + quantized_swizzled.set_amax(0.0f); } quantized_swizzled.set_with_gemm_swizzled_scales(true); @@ -186,9 +197,12 @@ void performTest_dequantize_nvfp4_swizzled(const size_t rows, const size_t cols, // since from_cpu() uploads all CPU buffers (including zero-init data). quantized_compact.to_cpu(); if (row_scaled_nvfp4) { - quantized_swizzled.copy_tensor_amax_from(quantized_compact); + const auto *src = quantized_compact.cpu_rowwise_amax_ptr(); + auto *dst = quantized_swizzled.cpu_rowwise_amax_ptr(); + std::copy(src, src + rows, dst); + quantized_swizzled.from_cpu(); } else { - quantized_swizzled.set_tensor_amax(quantized_compact.amax()); + quantized_swizzled.set_amax(quantized_compact.amax()); } // Copy FP4 data after from_cpu() to avoid being overwritten diff --git a/tests/cpp/operator/test_multi_cast_transpose.cu b/tests/cpp/operator/test_multi_cast_transpose.cu index 2bb35c4b89..0271c9dc6b 100644 --- a/tests/cpp/operator/test_multi_cast_transpose.cu +++ b/tests/cpp/operator/test_multi_cast_transpose.cu @@ -97,7 +97,7 @@ void performTest() { std::copy(input.rowwise_cpu_dptr(), input.rowwise_cpu_dptr() + height * width, ref_input_list.back().begin()); - ref_scale_list[tensor_id] = output.scale(); + ref_scale_list[tensor_id] = isFp8Type(otype) ? 
output.scale() : 1.0f; ref_height_list[tensor_id] = height; ref_width_list[tensor_id] = width; } @@ -138,7 +138,7 @@ void performTest() { atol_amax, rtol_amax); compareResults("scale_inv", output_list[tensor_id].rowwise_scale_inv(), - 1.f / output_list[tensor_id].scale(), + 1.f / ref_scale_list[tensor_id], atol_amax, rtol_amax); } auto [atol, rtol] = getTolerances(otype); diff --git a/tests/cpp/operator/test_normalization.cu b/tests/cpp/operator/test_normalization.cu index f737005e26..ea6692dba4 100644 --- a/tests/cpp/operator/test_normalization.cu +++ b/tests/cpp/operator/test_normalization.cu @@ -208,7 +208,7 @@ void performTest(const size_t N, const size_t H, const bool zero_centered_gamma, auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); if (isFp8Type(otype)) { compareResults("amax", z.amax(), ref_amax, atol_amax, rtol_amax); - float ref_scale_inv = 1.f / z.scale(); + float ref_scale_inv = 1.f / ref_scale; compareResults("scale_inv", z.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax); } diff --git a/tests/cpp/operator/test_qdq.cu b/tests/cpp/operator/test_qdq.cu index 4e364fffa4..034280aa9a 100644 --- a/tests/cpp/operator/test_qdq.cu +++ b/tests/cpp/operator/test_qdq.cu @@ -65,12 +65,13 @@ void performTestQ(const size_t N) { fillUniform(&input); setRandomScale(&output); + const float ref_scale = output.scale(); nvte_quantize(input.data(), output.data(), 0); float ref_amax; compute_ref_q(input.rowwise_cpu_dptr(), ref_output.get(), - N, &ref_amax, output.scale()); + N, &ref_amax, ref_scale); cudaDeviceSynchronize(); auto err = cudaGetLastError(); diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu index 96e71f9513..4fd75bb927 100644 --- a/tests/cpp/test_common.cu +++ b/tests/cpp/test_common.cu @@ -8,12 +8,14 @@ #include "test_common.h" #include +#include +#include +#include +#include #include #include +#include #include -#include -#include -#include #include #include @@ -193,33 +195,6 @@ std::pair get_scales(const NVTEShape& shape, return {ret_rowwise, ret_colwise}; } - if (scaling_mode == NVTE_MXFP8_1D_SCALING) { - std::vector shape_vec; - for (size_t i = 0; i < shape.ndim; ++i) { - shape_vec.push_back(shape.data[i]); - } - size_t first_dim = first_dimension(shape_vec); - size_t last_dim = last_dimension(shape_vec); - - scale_inv_meta ret_rowwise, ret_colwise; - - const size_t block_size_X_rowwise = 32; - size_t scale_dim_Y_rowwise = DIVUP_TO_MULTIPLE(first_dim, scale_tensor_alignment_Y_rowwise); - size_t scale_dim_X_rowwise = DIVUP_TO_MULTIPLE(DIVUP(last_dim, block_size_X_rowwise), scale_tensor_alignment_X_rowwise); - ret_rowwise.shape = {scale_dim_Y_rowwise, scale_dim_X_rowwise}; - - const size_t block_size_Y_colwise = 32; - size_t scale_dim_Y_colwise = DIVUP_TO_MULTIPLE(DIVUP(first_dim, block_size_Y_colwise), scale_tensor_alignment_Y_colwise); - size_t scale_dim_X_colwise = DIVUP_TO_MULTIPLE(last_dim, scale_tensor_alignment_X_colwise); - ret_colwise.shape = {scale_dim_Y_colwise, scale_dim_X_colwise}; - - ret_rowwise.type = DType::kFloat8E8M0; - ret_colwise.type = DType::kFloat8E8M0; - ret_rowwise.type_size_bits = typeToNumBits(DType::kFloat8E8M0); - ret_colwise.type_size_bits = typeToNumBits(DType::kFloat8E8M0); - - return {ret_rowwise, ret_colwise}; - } if (scaling_mode == NVTE_BLOCK_SCALING_2D) { std::vector shape_vec; for (size_t i = 0; i < shape.ndim; ++i) { @@ -276,6 +251,30 @@ std::pair get_scales(const NVTEShape& shape, NVTE_ERROR("Invalid scaling mode!"); } +Tensor::Buffer::Buffer(size_t size, DType dtype) + : size_{size}, dtype_{dtype}, 
bytes_{size * typeToNumBits(dtype) / 8} { + if (bytes_ > 0) { + cpu_buffer_.reset(new unsigned char[bytes_]); + std::memset(cpu_buffer_.get(), 0, bytes_); + unsigned char *gpu_buffer = nullptr; + NVTE_CHECK_CUDA(cudaMalloc(&gpu_buffer, bytes_)); + gpu_buffer_.reset(gpu_buffer); + NVTE_CHECK_CUDA(cudaMemset(gpu_buffer_.get(), 0, bytes_)); + } +} + +void Tensor::Buffer::to_cpu() { + if (bytes_ > 0) { + NVTE_CHECK_CUDA(cudaMemcpy(cpu_buffer_.get(), gpu_buffer_.get(), bytes_, cudaMemcpyDeviceToHost)); + } +} + +void Tensor::Buffer::from_cpu() { + if (bytes_ > 0) { + NVTE_CHECK_CUDA(cudaMemcpy(gpu_buffer_.get(), cpu_buffer_.get(), bytes_, cudaMemcpyHostToDevice)); + } +} + Tensor::Tensor(const std::string& name, const NVTEShape &shape, const DType type, const bool rowwise, const bool columnwise, @@ -303,31 +302,13 @@ Tensor::Tensor(const std::string& name, flattened_shape = convertShape(flattened_shape_vec); } - // Allocate and initialize data - void *dptr_rowwise = nullptr, *dptr_columnwise = nullptr; - const size_t total_size = bytes(shape, type); - if (total_size != 0) { - if (rowwise) { - cudaMalloc((void**)&dptr_rowwise, total_size); // NOLINT(*) - cudaMemset(dptr_rowwise, 0, total_size); - cpu_data_rowwise_ = std::make_unique(total_size); - std::fill_n(cpu_data_rowwise_.get(), total_size, 0); - } - if (columnwise) { - cudaMalloc((void**)&dptr_columnwise, total_size); // NOLINT(*) - cudaMemset(dptr_columnwise, 0, total_size); - cpu_data_columnwise_ = std::make_unique(total_size); - std::fill_n(cpu_data_columnwise_.get(), total_size, 0); - } - } - - // Set tensor row-wise data + // Allocate row-wise data if (rowwise) { - const DType rowwise_type = (scaling_mode == NVTE_NVFP4_1D_SCALING) ? DType::kFloat4E2M1 : type; - tensor_.set_rowwise_data(dptr_rowwise, rowwise_type, shape); + data_rowwise_.emplace(product(shape), type); + tensor_.set_rowwise_data(data_rowwise_->gpu_buffer(), type, shape); } - // Set tensor column-wise data + // Allocate column-wise data if (columnwise) { // Determine shape of column-wise data std::vector columnwise_shape_vec; @@ -358,310 +339,224 @@ Tensor::Tensor(const std::string& name, const auto columnwise_shape = nvte_make_shape(columnwise_shape_vec.data(), columnwise_shape_vec.size()); - // Set column-wise data buffer - const DType colwise_type = (scaling_mode == NVTE_NVFP4_1D_SCALING) ? 
DType::kFloat4E2M1 : type; - tensor_.set_columnwise_data(dptr_columnwise, colwise_type, columnwise_shape); + // Allocate buffer + data_columnwise_.emplace(product(columnwise_shape), type); + + // Configure TE tensor + tensor_.set_columnwise_data(data_columnwise_->gpu_buffer(), type, columnwise_shape); } - // Configure scales, amaxes, and other tensor buffers - float *amax = nullptr; - float *amax_columnwise = nullptr; - float *scale = nullptr; - float *rowwise_scale_inv = nullptr; - float *columnwise_scale_inv = nullptr; - if (isFp8Type(type) || isFp4Type(type)) { - if (scaling_mode == NVTE_DELAYED_TENSOR_SCALING) { - cudaMalloc((void**)&amax, sizeof(float)); // NOLINT(*) - cudaMemset(amax, 0, sizeof(float)); - cudaMalloc((void**)&scale, sizeof(float)); // NOLINT(*) - cudaMemset(scale, 0, sizeof(float)); - amax_cpu_data_ = std::make_shared(0); - scale_cpu_data_ = std::make_shared(0); - tensor_.set_amax(amax, DType::kFloat32, std::vector{1}); - tensor_.set_scale(scale, DType::kFloat32, std::vector{1}); - cudaMalloc((void**)&rowwise_scale_inv, sizeof(float)); // NOLINT(*) + // Allocate recipe-specific buffers + switch (scaling_mode) { + case NVTE_DELAYED_TENSOR_SCALING: + if (isFp8Type(type)) { + amax_rowwise_.emplace(1, DType::kFloat32); + scale_.emplace(1, DType::kFloat32); + tensor_.set_amax(amax_rowwise_->gpu_buffer(), DType::kFloat32, std::vector{1}); + tensor_.set_scale(scale_->gpu_buffer(), DType::kFloat32, std::vector{1}); + + // Use same buffer for row-wise and column-wise scale-inverse + auto scale_inv = std::make_shared(1, DType::kFloat32); if (rowwise) { - tensor_.set_rowwise_scale_inv(rowwise_scale_inv, DType::kFloat32, - std::vector{1}); - rowwise_scale_inv_cpu_data_ = std::make_unique(sizeof(float)); - std::fill_n(rowwise_scale_inv_cpu_data_.get(), sizeof(float), 0); + scale_inv_rowwise_ = scale_inv; + tensor_.set_rowwise_scale_inv(scale_inv_rowwise_->gpu_buffer(), DType::kFloat32, std::vector{1}); } if (columnwise) { - tensor_.set_columnwise_scale_inv(rowwise_scale_inv, DType::kFloat32, - std::vector{1}); - columnwise_scale_inv_cpu_data_ = std::make_unique(sizeof(float)); - std::fill_n(columnwise_scale_inv_cpu_data_.get(), sizeof(float), 0); - } - } else { - if (scaling_mode == NVTE_NVFP4_1D_SCALING) { - // Used for NVFP4 second stage scaling - amax_cpu_data_ = std::make_shared(0); - amax_cpu_data_columnwise_ = std::make_shared(0); - cudaMalloc((void**)&amax, sizeof(float)); // NOLINT(*) - cudaMalloc((void**)&amax_columnwise, sizeof(float)); // NOLINT(*) - cudaMemset(amax, 0, sizeof(float)); - cudaMemset(amax_columnwise, 0, sizeof(float)); - tensor_.set_amax(amax, DType::kFloat32, std::vector{1}); - tensor_.set_columnwise_amax(amax_columnwise, DType::kFloat32, std::vector{1}); + scale_inv_columnwise_ = scale_inv; + tensor_.set_columnwise_scale_inv(scale_inv_columnwise_->gpu_buffer(), DType::kFloat32, std::vector{1}); } + } + break; + case NVTE_MXFP8_1D_SCALING: + case NVTE_BLOCK_SCALING_1D: + case NVTE_BLOCK_SCALING_2D: + case NVTE_NVFP4_1D_SCALING: + { + // Block scaling factors auto [rowwise_scale_meta, colwise_scale_meta] = get_scales(flattened_shape, tensor_.scaling_mode()); - auto rowwise_scale_size = rowwise_scale_meta.bytes(); - auto columnwise_scale_size = colwise_scale_meta.bytes(); - auto scale_shape = rowwise_scale_meta.shape; - auto columnwise_scale_shape = colwise_scale_meta.shape; if (rowwise) { - cudaMalloc((void **)&rowwise_scale_inv, rowwise_scale_size); // NOLINT(*) - cudaMemset(rowwise_scale_inv, 0, rowwise_scale_size); - rowwise_scale_inv_cpu_data_ = 
std::make_unique(rowwise_scale_size); - std::fill_n(rowwise_scale_inv_cpu_data_.get(), rowwise_scale_size, 0); - auto scale_dtype = rowwise_scale_meta.type; - tensor_.set_rowwise_scale_inv(rowwise_scale_inv, scale_dtype, scale_shape); + const auto scale_shape = rowwise_scale_meta.shape; + const auto scale_dtype = rowwise_scale_meta.type; + scale_inv_rowwise_ = std::make_shared(product(scale_shape), scale_dtype); + tensor_.set_rowwise_scale_inv(scale_inv_rowwise_->gpu_buffer(), scale_dtype, scale_shape); } if (columnwise) { - cudaMalloc((void**)&columnwise_scale_inv, columnwise_scale_size); // NOLINT(*) - cudaMemset(columnwise_scale_inv, 0, columnwise_scale_size); - columnwise_scale_inv_cpu_data_ = std::make_unique(columnwise_scale_size); - std::fill_n(columnwise_scale_inv_cpu_data_.get(), columnwise_scale_size, 0); - auto scale_dtype = colwise_scale_meta.type; - tensor_.set_columnwise_scale_inv(columnwise_scale_inv, scale_dtype, columnwise_scale_shape); + const auto scale_shape = colwise_scale_meta.shape; + const auto scale_dtype = colwise_scale_meta.type; + scale_inv_columnwise_ = std::make_shared(product(scale_shape), scale_dtype); + tensor_.set_columnwise_scale_inv(scale_inv_columnwise_->gpu_buffer(), scale_dtype, scale_shape); } - } - } -} -void Tensor::to_cpu() const { - const NVTEShape s = tensor_.shape(); - const size_t size = bytes(s, tensor_.dtype()); - if (rowwise_) { - cudaMemcpy(cpu_data_rowwise_.get(), - tensor_.get_rowwise_data().data_ptr, - size, - cudaMemcpyDeviceToHost); - } - if (columnwise_) { - const DType colwise_type = tensor_.dtype(); - - const size_t colwise_size = bytes(s, colwise_type); - cudaMemcpy(cpu_data_columnwise_.get(), - tensor_.get_columnwise_data().data_ptr, - colwise_size, - cudaMemcpyDeviceToHost); - } - if (isFp8Type(dtype()) || isFp4Type(dtype())) { - if (tensor_.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING) { - if (tensor_.amax() != nullptr){ - cudaMemcpy(amax_cpu_data_.get(), - tensor_.amax(), - sizeof(float), - cudaMemcpyDeviceToHost); - } - cudaMemcpy(scale_cpu_data_.get(), - tensor_.scale(), - sizeof(float), - cudaMemcpyDeviceToHost); - } else if (tensor_.scaling_mode() == NVTE_NVFP4_1D_SCALING) { - if (rowwise_ && (tensor_.amax() != nullptr)){ - cudaMemcpy(amax_cpu_data_.get(), - tensor_.amax(), - sizeof(float), - cudaMemcpyDeviceToHost); - } - if (columnwise_ && (tensor_.get_columnwise_amax().data_ptr != nullptr)){ - cudaMemcpy(amax_cpu_data_columnwise_.get(), - tensor_.get_columnwise_amax().data_ptr, - sizeof(float), - cudaMemcpyDeviceToHost); + // NVFP4 uses amax for tensor scaling + if (scaling_mode == NVTE_NVFP4_1D_SCALING) { + if (rowwise) { + amax_rowwise_.emplace(1, DType::kFloat32); + tensor_.set_amax(amax_rowwise_->gpu_buffer(), DType::kFloat32, std::vector{1}); + } + if (columnwise) { + amax_columnwise_.emplace(1, DType::kFloat32); + tensor_.set_columnwise_amax(amax_columnwise_->gpu_buffer(), DType::kFloat32, std::vector{1}); + } } } - auto [rowwise_scale_meta, colwise_scale_meta] = get_scales(s, tensor_.scaling_mode()); - if (rowwise_) { - auto scale_size = rowwise_scale_meta.bytes(); - cudaMemcpy(rowwise_scale_inv_cpu_data_.get(), - tensor_.get_rowwise_scale_inv().data_ptr, - scale_size, - cudaMemcpyDeviceToHost); - } - if (columnwise_) { - auto scale_size = colwise_scale_meta.bytes(); - cudaMemcpy(columnwise_scale_inv_cpu_data_.get(), - tensor_.get_columnwise_scale_inv().data_ptr, - scale_size, - cudaMemcpyDeviceToHost); - } + break; + default: + NVTE_ERROR("Unsupported tensor format (", static_cast(scaling_mode), ")"); } } -void 
Tensor::from_cpu() const { - const NVTEShape s = tensor_.shape(); - const size_t size = bytes(s, tensor_.dtype()); - if (rowwise_) { - cudaMemcpy(tensor_.get_rowwise_data().data_ptr, cpu_data_rowwise_.get(), size, - cudaMemcpyHostToDevice); - } - if (columnwise_) { - cudaMemcpy(tensor_.get_columnwise_data().data_ptr, cpu_data_columnwise_.get(), size, - cudaMemcpyHostToDevice); - } - if (isFp8Type(dtype()) || isFp4Type(dtype())) { - if (tensor_.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING) { - if (tensor_.amax() != nullptr){ - cudaMemcpy(tensor_.amax(), amax_cpu_data_.get(), sizeof(float), cudaMemcpyHostToDevice); - } - cudaMemcpy(tensor_.scale(), scale_cpu_data_.get(), sizeof(float), cudaMemcpyHostToDevice); - } else if (tensor_.scaling_mode() == NVTE_NVFP4_1D_SCALING) { - if (rowwise_ && (tensor_.amax() != nullptr)) { - cudaMemcpy(tensor_.amax(), amax_cpu_data_.get(), sizeof(float), cudaMemcpyHostToDevice); - } - if (columnwise_ && (tensor_.get_columnwise_amax().data_ptr != nullptr)) { - cudaMemcpy(tensor_.get_columnwise_amax().data_ptr, amax_cpu_data_columnwise_.get(), - sizeof(float), cudaMemcpyHostToDevice); - } - } - auto [rowwise_scale_meta, colwise_scale_meta] = get_scales(s, tensor_.scaling_mode()); +void Tensor::set_tensor_amax_nullptr() { + tensor_.set_amax(nullptr, DType::kFloat32, tensor_.defaultShape); +} + +void Tensor::set_with_gemm_swizzled_scales(bool with_gemm_swizzled_scales) { + tensor_.set_with_gemm_swizzled_scales(with_gemm_swizzled_scales); +} + +void Tensor::set_row_scaled_nvfp4(bool row_scaled_nvfp4) { + NVTE_CHECK(tensor_.scaling_mode() == NVTE_NVFP4_1D_SCALING, + "Row-scaled NVFP4 is only supported for NVFP4 tensors."); + tensor_.set_row_scaled_nvfp4(row_scaled_nvfp4); + + // Update amax tensor + if (row_scaled_nvfp4) { + // Row-scaled NVFP4 has amax matching number of rows + NVTE_CHECK(rowwise_, "Row-scaled NVFP4 requires row-wise data."); + NVTE_CHECK(!columnwise_, "Row-scaled NVFP4 does not support column-wise data."); + auto shape = tensor_.shape(); + const size_t rows = product(shape, 0, shape.ndim - 1); + amax_rowwise_.emplace(rows, DType::kFloat32); + tensor_.set_amax(amax_rowwise_->gpu_buffer(), DType::kFloat32, std::vector{rows}); + } else { + // Tensor-scaled NVFP4 has single amax if (rowwise_) { - auto scale_size = rowwise_scale_meta.bytes(); - cudaMemcpy(tensor_.get_rowwise_scale_inv().data_ptr, - rowwise_scale_inv_cpu_data_.get(), scale_size, - cudaMemcpyHostToDevice); + amax_rowwise_.emplace(1, DType::kFloat32); + tensor_.set_amax(amax_rowwise_->gpu_buffer(), DType::kFloat32, std::vector{1}); } if (columnwise_) { - auto scale_size = colwise_scale_meta.bytes(); - cudaMemcpy(tensor_.get_columnwise_scale_inv().data_ptr, - columnwise_scale_inv_cpu_data_.get(), scale_size, - cudaMemcpyHostToDevice); + amax_columnwise_.emplace(1, DType::kFloat32); + tensor_.set_columnwise_amax(amax_columnwise_->gpu_buffer(), DType::kFloat32, std::vector{1}); } } } -void Tensor::set_scale(float scale) { - if (isFp8Type(dtype()) || isFp4Type(dtype())) { - NVTE_CHECK(scale_cpu_data_); - if (tensor_.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING) { - *scale_cpu_data_ = scale; - from_cpu(); - } - } +void Tensor::to_cpu() { + if (data_rowwise_) { data_rowwise_->to_cpu(); } + if (data_columnwise_) { data_columnwise_->to_cpu(); } + if (scale_inv_rowwise_) { scale_inv_rowwise_->to_cpu(); } + if (scale_inv_columnwise_) { scale_inv_columnwise_->to_cpu(); } + if (amax_rowwise_) { amax_rowwise_->to_cpu(); } + if (amax_columnwise_) { amax_columnwise_->to_cpu(); } + if (scale_) { 
scale_->to_cpu(); } } -void Tensor::set_tensor_amax_shape(const std::vector &shape) { - const size_t numel = product(shape); - NVTE_CHECK(tensor_.scaling_mode() == NVTE_NVFP4_1D_SCALING, - "Amax shape override is only supported for NVFP4 test tensors."); - - auto old_amax = tensor_.get_amax(); - if (old_amax.data_ptr != nullptr) { - NVTE_CHECK_CUDA(cudaFree(old_amax.data_ptr)); - } - - float *amax = nullptr; - NVTE_CHECK_CUDA(cudaMalloc(&amax, numel * sizeof(float))); - NVTE_CHECK_CUDA(cudaMemset(amax, 0, numel * sizeof(float))); - tensor_.set_amax(amax, DType::kFloat32, shape); +void Tensor::from_cpu() { + if (data_rowwise_) { data_rowwise_->from_cpu(); } + if (data_columnwise_) { data_columnwise_->from_cpu(); } + if (scale_inv_rowwise_) { scale_inv_rowwise_->from_cpu(); } + if (scale_inv_columnwise_) { scale_inv_columnwise_->from_cpu(); } + if (amax_rowwise_) { amax_rowwise_->from_cpu(); } + if (amax_columnwise_) { amax_columnwise_->from_cpu(); } + if (scale_) { scale_->from_cpu(); } } -std::vector Tensor::tensor_amax_values() const { - const auto amax = tensor_.get_amax(); - NVTE_CHECK(static_cast(amax.dtype) == DType::kFloat32, "Tensor amax must be FP32."); - - const size_t numel = product(amax.shape); - if (numel == 0) { - return {}; - } - NVTE_CHECK(amax.data_ptr != nullptr, "Tensor amax is not allocated."); - - std::vector values(numel); - NVTE_CHECK_CUDA( - cudaMemcpy(values.data(), amax.data_ptr, numel * sizeof(float), cudaMemcpyDeviceToHost)); - return values; +void Tensor::set_amax(float amax) { + NVTE_CHECK(amax_rowwise_); + NVTE_CHECK(amax_rowwise_->size() == 1); + NVTE_CHECK(amax_rowwise_->dtype() == DType::kFloat32); + *amax_rowwise_->cpu_buffer() = amax; + amax_rowwise_->from_cpu(); } -void Tensor::copy_tensor_amax_from(const Tensor &other) { - const auto other_amax = other.tensor_.get_amax(); - NVTE_CHECK(static_cast(other_amax.dtype) == DType::kFloat32, - "Source tensor amax must be FP32."); - - auto my_amax = tensor_.get_amax(); - NVTE_CHECK(static_cast(my_amax.dtype) == DType::kFloat32, - "Destination tensor amax must be FP32."); - NVTE_CHECK(areShapesEqual(my_amax.shape, other_amax.shape), "Amax shape mismatch."); +void Tensor::set_scale(float scale) { + NVTE_CHECK(scale_); + NVTE_CHECK(scale_->size() == 1); + NVTE_CHECK(scale_->dtype() == DType::kFloat32); + *scale_->cpu_buffer() = scale; + scale_->from_cpu(); +} - const size_t numel = product(other_amax.shape); - if (numel == 0) { - return; - } +void Tensor::set_scale_inv(float scale_inv) { + NVTE_CHECK(scale_inv_rowwise_); + NVTE_CHECK(scale_inv_rowwise_->size() == 1); + NVTE_CHECK(scale_inv_rowwise_->dtype() == DType::kFloat32); + *scale_inv_rowwise_->cpu_buffer() = scale_inv; + scale_inv_rowwise_->from_cpu(); +} - NVTE_CHECK(other_amax.data_ptr != nullptr, "Source tensor amax is not allocated."); - NVTE_CHECK(my_amax.data_ptr != nullptr, "Destination tensor amax is not allocated."); - NVTE_CHECK_CUDA(cudaMemcpy(my_amax.data_ptr, other_amax.data_ptr, numel * sizeof(float), - cudaMemcpyDeviceToDevice)); +void Tensor::set_tensor_amax_columnwise(float amax) { + NVTE_CHECK(amax_columnwise_); + NVTE_CHECK(amax_columnwise_->size() == 1); + NVTE_CHECK(amax_columnwise_->dtype() == DType::kFloat32); + *amax_columnwise_->cpu_buffer() = amax; + amax_columnwise_->from_cpu(); } -void Tensor::set_scale_inv(float scale_inv) { - if (isFp8Type(dtype()) || isFp4Type(dtype())) { - if (rowwise_) { - NVTE_CHECK(rowwise_scale_inv_cpu_data_); - } - if (columnwise_) { - NVTE_CHECK(columnwise_scale_inv_cpu_data_); - } +namespace { - auto 
[rowwise_scale_meta, colwise_scale_meta] = get_scales(tensor_.shape(), tensor_.scaling_mode()); - if (rowwise_) { - auto num_scales = product(rowwise_scale_meta.shape); - if (num_scales == 1) { - rowwise_cpu_scale_inv_ptr()[0] = scale_inv; - } else { - std::uniform_int_distribution dis(0, 127); - auto *scale_inv_ptr = rowwise_cpu_scale_inv_ptr(); - for (size_t i = 0; i < num_scales; i++) { - scale_inv_ptr[i] = dis(gen_); - } +void fill_uniform_buffer(void *cpu_data, size_t numel, DType dtype, std::mt19937 &gen) { + switch (dtype) { + case DType::kFloat32: + { + auto *data = static_cast(cpu_data); + std::uniform_real_distribution dis(-2.0, 1.0); + for (size_t i = 0; i < numel; ++i) { + data[i] = dis(gen); } } - if (columnwise_) { - auto num_scales = product(colwise_scale_meta.shape); - if (num_scales == 1) { - columnwise_cpu_scale_inv_ptr()[0] = scale_inv; - } else { - std::uniform_int_distribution dis(0, 127); - auto *scale_inv_ptr = columnwise_cpu_scale_inv_ptr(); - for (size_t i = 0; i < num_scales; i++) { - scale_inv_ptr[i] = dis(gen_); - } + break; + case DType::kFloat8E4M3: + case DType::kFloat8E8M0: + case DType::kByte: + { + auto *data = static_cast(cpu_data); + std::uniform_int_distribution dis(0, 127); + for (size_t i = 0; i < numel; ++i) { + data[i] = dis(gen); } } - from_cpu(); + break; + default: + NVTE_ERROR("Unsupported dtype (", static_cast(dtype), ")."); + } +} + +} // namespace + +void Tensor::fill_uniform_rowwise_scale_inv() { + if (!scale_inv_rowwise_ || scale_inv_rowwise_->size() == 0) { + return; + } + fill_uniform_buffer(scale_inv_rowwise_->cpu_buffer(), scale_inv_rowwise_->size(), + scale_inv_rowwise_->dtype(), gen_); + scale_inv_rowwise_->from_cpu(); +} + +void Tensor::fill_uniform_columnwise_scale_inv() { + if (!scale_inv_columnwise_ || scale_inv_columnwise_->size() == 0) { + return; } + fill_uniform_buffer(scale_inv_columnwise_->cpu_buffer(), scale_inv_columnwise_->size(), + scale_inv_columnwise_->dtype(), gen_); + scale_inv_columnwise_->from_cpu(); } -void Tensor::shareFP8Meta(const Tensor &other) { - if ((isFp8Type(dtype()) && isFp8Type(other.dtype())) - || isFp4Type(dtype()) && isFp4Type(other.dtype())) { - auto new_tensor = TensorWrapper(other.tensor_.scaling_mode()); - auto my_rowwise_data = tensor_.get_rowwise_data(); - new_tensor.set_rowwise_data(my_rowwise_data.data_ptr, static_cast(my_rowwise_data.dtype), - my_rowwise_data.shape); - auto my_columnwise_data = tensor_.get_columnwise_data(); - new_tensor.set_columnwise_data(my_columnwise_data.data_ptr, - static_cast(my_columnwise_data.dtype), - my_columnwise_data.shape); - auto other_amax = other.tensor_.get_amax(); - new_tensor.set_amax(other_amax.data_ptr, static_cast(other_amax.dtype), - other_amax.shape); - auto other_scale = other.tensor_.get_scale(); - new_tensor.set_scale(other_scale.data_ptr, static_cast(other_scale.dtype), - other_scale.shape); - auto other_row_scale_inv = other.tensor_.get_rowwise_scale_inv(); - new_tensor.set_rowwise_scale_inv(other_row_scale_inv.data_ptr, - static_cast(other_row_scale_inv.dtype), - other_row_scale_inv.shape); - auto other_col_scale_inv = other.tensor_.get_columnwise_scale_inv(); - new_tensor.set_columnwise_scale_inv(other_col_scale_inv.data_ptr, - static_cast(other_col_scale_inv.dtype), - other_col_scale_inv.shape); - tensor_ = std::move(new_tensor); - to_cpu(); +void Tensor::fill_uniform_scale() { + if (!scale_ || scale_->size() == 0) { + return; + } + + // Generate random scales on CPU + auto *cpu_data = scale_->cpu_buffer(); + const auto numel = 
scale_->size(); + NVTE_CHECK(scale_->dtype() == DType::kFloat32); + std::uniform_real_distribution dis(-2.0, 1.0); + for (size_t i = 0; i < numel; ++i) { + cpu_data[i] = dis(gen_); } + + // Update GPU tensor + scale_->from_cpu(); } using std::to_string; @@ -689,7 +584,7 @@ std::vector unravel(const size_t i, const NVTEShape &shape) { return ret; } -void compareResults_sequential(const std::string &name, const Tensor &test, +void compareResults_sequential(const std::string &name, Tensor &test, const void *ref, const bool rowwise, double atol, double rtol, bool if_on_gpus, const size_t tolerable_mismatches_limit) { @@ -779,7 +674,7 @@ static size_t getFirstMismatchIdx(const DType data_type, const T* test_data, con return first_mismatch_idx; } -void compareResults_parallel(const std::string &name, const Tensor &test, const void *ref, +void compareResults_parallel(const std::string &name, Tensor &test, const void *ref, const bool rowwise, double atol, double rtol, bool if_on_gpus, const size_t tolerable_mismatches_limit) { if (if_on_gpus) test.to_cpu(); @@ -806,7 +701,7 @@ void compareResults_parallel(const std::string &name, const Tensor &test, const ); } -void compareResults(const std::string &name, const Tensor &test, const void *ref, +void compareResults(const std::string &name, Tensor &test, const void *ref, const bool rowwise, double atol, double rtol, bool if_on_gpus, const size_t tolerable_mismatches_limit) { constexpr bool sequential = false; @@ -992,6 +887,7 @@ void generate_data_uniformly(T* data, const size_t size, std::mt19937* gen) { } void fillUniform(Tensor *t) { + // Generate random row-wise data and column-wise data if (t->rowwise()) { const size_t size = product(t->rowwise_shape()); TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(t->dtype(), T, @@ -1009,8 +905,12 @@ void fillUniform(Tensor *t) { } ); } - std::uniform_real_distribution<> dis(-2.0, 1.0); - t->set_scale_inv(dis(t->gen())); + + // Generate random scales + t->fill_uniform_rowwise_scale_inv(); + t->fill_uniform_columnwise_scale_inv(); + + // Update data on GPU t->from_cpu(); } @@ -1046,7 +946,20 @@ void fillCase_special(Tensor *t) { } }); } - t->set_scale_inv(1.0); + + // Fill scales + if (t->scaling_mode() == NVTE_DELAYED_TENSOR_SCALING) { + if (isFp8Type(t->dtype())) { + // FP8 tensor scale is set to 1 + t->set_scale_inv(1.0); + } + } else { + // Block scales are filled randomly + t->fill_uniform_rowwise_scale_inv(); + t->fill_uniform_columnwise_scale_inv(); + } + + // Update GPU tensor data t->from_cpu(); } @@ -1080,15 +993,12 @@ template void fillCase(Tensor *t, const InputsFillCase fill_case); #endif void setRandomScale(Tensor *t) { - std::uniform_real_distribution<> dis(-2.0, 1.0); - const float scale = dis(t->gen()); - t->set_scale(scale); + t->fill_uniform_scale(); } void setRandomScaleInv(Tensor *t) { - std::uniform_real_distribution<> dis(-2.0, 1.0); - const float scale_inv = dis(t->gen()); - t->set_scale_inv(scale_inv); + t->fill_uniform_rowwise_scale_inv(); + t->fill_uniform_columnwise_scale_inv(); } bool isFp8Type(DType type) { diff --git a/tests/cpp/test_common.h b/tests/cpp/test_common.h index b2a7da89cf..17f36a99dd 100644 --- a/tests/cpp/test_common.h +++ b/tests/cpp/test_common.h @@ -6,10 +6,12 @@ #pragma once -#include -#include #include +#include +#include #include +#include + #include #define FP4_TYPE_SUPPORTED (CUDA_VERSION >= 12080) @@ -27,6 +29,11 @@ namespace test { using namespace transformer_engine; +size_t typeToNumBits(DType type); +size_t product(const NVTEShape &shape); +size_t product(const 
@@ -114,9 +121,30 @@ struct TypeInfo {
   }
 
   constexpr static DType dtype = getType<T>();
-  constexpr static size_t size = BitsNumber<dtype>::num_bits;;
+  constexpr static size_t size = BitsNumber<dtype>::num_bits;
+};
+
+// Deleter for CUDA buffer RAII class
+struct CudaDeleter {
+  void operator()(void* ptr) const { if (ptr != nullptr) cudaFree(ptr); }
 };
 
+// CUDA buffer RAII class
+template <typename T>
+using CudaPtr = std::unique_ptr<T, CudaDeleter>;
+
+// Construct CUDA memory
+template <typename T = void>
+CudaPtr<T> cuda_alloc(size_t bytes) {
+  void* ptr = nullptr;
+  NVTE_CHECK_CUDA(cudaMalloc(&ptr, bytes));
+  return CudaPtr<T>(static_cast<T*>(ptr));
+}
+
+/* Wrapper for Transformer Engine tensor
+ *
+ * Maintains matching GPU and CPU buffers.
+ */
 class Tensor {
  public:
   Tensor(const std::string& name,
@@ -133,7 +161,7 @@ class Tensor {
          const NVTEScalingMode &mode = NVTE_DELAYED_TENSOR_SCALING) :
     Tensor(name, nvte_make_shape(shape.data(), shape.size()), type, rowwise, columnwise, mode) {}
 
-  Tensor() {}
+  Tensor() = default;
 
   Tensor& operator=(const Tensor &other) = delete;
   Tensor(const Tensor &other) = delete;
@@ -141,42 +169,7 @@ class Tensor {
   Tensor(Tensor &&other) = default;
   Tensor& operator=(Tensor &&other) = default;
 
-  ~Tensor() {
-    void *data_ptr = tensor_.dptr();
-    void *scale_inv = tensor_.scale_inv();
-    void *columnwise_data_ptr = tensor_.get_columnwise_data().data_ptr;
-    void *columnwise_scale_inv = tensor_.get_columnwise_scale_inv().data_ptr;
-    void *amax = tensor_.amax();
-    void *columnwise_amax_ptr = tensor_.get_columnwise_amax().data_ptr;
-    void *scale = tensor_.scale();
-    if (columnwise_data_ptr == data_ptr) {
-      columnwise_data_ptr = nullptr;
-    }
-    if (columnwise_scale_inv == scale_inv) {
-      columnwise_scale_inv = nullptr;
-    }
-    if (data_ptr != nullptr) {
-      cudaFree(data_ptr);
-    }
-    if (scale_inv != nullptr) {
-      cudaFree(scale_inv);
-    }
-    if (columnwise_data_ptr != nullptr) {
-      cudaFree(columnwise_data_ptr);
-    }
-    if (columnwise_scale_inv != nullptr) {
-      cudaFree(columnwise_scale_inv);
-    }
-    if (amax != nullptr) {
-      cudaFree(amax);
-    }
-    if (columnwise_amax_ptr != nullptr) {
-      cudaFree(columnwise_amax_ptr);
-    }
-    if (scale != nullptr) {
-      cudaFree(scale);
-    }
-  }
+  ~Tensor() = default;
 
   NVTETensor data() const noexcept {
     return tensor_.data();
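With the hand-written destructor gone, ownership rests entirely on the RAII helpers moved above. A short example of how they compose (the buffer size is arbitrary):

```cpp
// cudaFree runs automatically via CudaDeleter when `buf` goes out of scope.
CudaPtr<float> buf = cuda_alloc<float>(1024 * sizeof(float));
NVTE_CHECK_CUDA(cudaMemset(buf.get(), 0, 1024 * sizeof(float)));
```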
@@ -213,141 +206,176 @@ class Tensor {
   }
 
   template <typename T>
-  T *rowwise_cpu_dptr() const {
-    NVTE_CHECK(TypeInfo<T>::dtype == tensor_.dtype(), "Invalid type!");
+  T *rowwise_cpu_dptr() {
+    NVTE_CHECK(data_rowwise_, "Tensor does not have rowwise data!");
+    NVTE_CHECK(TypeInfo<T>::dtype == data_rowwise_->dtype(), "Invalid type!");
     NVTE_CHECK(rowwise_, "Tensor does not have rowwise data!");
-    return reinterpret_cast<T *>(cpu_data_rowwise_.get());
+    return data_rowwise_->cpu_buffer<T>();
   }
 
   template <typename T>
-  T *columnwise_cpu_dptr() const {
-    NVTE_CHECK(TypeInfo<T>::dtype == tensor_.dtype(), "Invalid type!");
+  T *columnwise_cpu_dptr() {
+    NVTE_CHECK(data_columnwise_, "Tensor does not have columnwise data!");
+    NVTE_CHECK(TypeInfo<T>::dtype == data_columnwise_->dtype(), "Invalid type!");
     NVTE_CHECK(columnwise_, "Tensor does not have columnwise data!");
-    return reinterpret_cast<T *>(cpu_data_columnwise_.get());
+    return data_columnwise_->cpu_buffer<T>();
   }
 
-  float amax() const {
-    if(amax_cpu_data_) {
-      to_cpu();
-      return *amax_cpu_data_;
-    } else {
-      return 0;
-    }
+  float amax() {
+    NVTE_CHECK(amax_rowwise_);
+    NVTE_CHECK(amax_rowwise_->size() == 1);
+    NVTE_CHECK(amax_rowwise_->dtype() == DType::kFloat32);
+    amax_rowwise_->to_cpu();
+    return *amax_rowwise_->cpu_buffer<float>();
   }
 
-  float amax_columnwise() const {
-    if(amax_cpu_data_columnwise_) {
-      to_cpu();
-      return *amax_cpu_data_columnwise_;
-    } else {
-      return 0;
-    }
+  float amax_columnwise() {
+    NVTE_CHECK(amax_columnwise_);
+    NVTE_CHECK(amax_columnwise_->size() == 1);
+    NVTE_CHECK(amax_columnwise_->dtype() == DType::kFloat32);
+    amax_columnwise_->to_cpu();
+    return *amax_columnwise_->cpu_buffer<float>();
   }
 
-  float scale() const {
-    if(scale_cpu_data_) {
-      NVTE_CHECK(tensor_.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING, "Invalid scaling_mode!");
-      to_cpu();
-      return *scale_cpu_data_;
-    } else {
-      return 1;
-    }
+  float scale() {
+    NVTE_CHECK(scale_);
+    NVTE_CHECK(scale_->size() == 1);
+    NVTE_CHECK(scale_->dtype() == DType::kFloat32);
+    scale_->to_cpu();
+    return *scale_->cpu_buffer<float>();
+  }
+
+  float rowwise_scale_inv(){
+    NVTE_CHECK(scale_inv_rowwise_);
+    NVTE_CHECK(scale_inv_rowwise_->size() == 1);
+    NVTE_CHECK(scale_inv_rowwise_->dtype() == DType::kFloat32);
+    scale_inv_rowwise_->to_cpu();
+    return *scale_inv_rowwise_->cpu_buffer<float>();
   }
 
   template <typename T>
   T *rowwise_cpu_scale_inv_ptr(){
-    if (tensor_.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING){
-      NVTE_CHECK(TypeInfo<T>::dtype == DType::kFloat32, "Invalid type!");
-    } else if (tensor_.scaling_mode() == NVTE_BLOCK_SCALING_1D || tensor_.scaling_mode() == NVTE_BLOCK_SCALING_2D) {
-      NVTE_CHECK(TypeInfo<T>::dtype == DType::kFloat32, "Invalid type!");
-    } else if (tensor_.scaling_mode() == NVTE_NVFP4_1D_SCALING) {
-      NVTE_CHECK(TypeInfo<T>::dtype == DType::kFloat8E4M3, "Invalid type!");
-    } else {
-      NVTE_CHECK(TypeInfo<T>::dtype == DType::kByte, "Invalid type!");
-    }
-    to_cpu();
-    return reinterpret_cast<T *>(rowwise_scale_inv_cpu_data_.get());
+    NVTE_CHECK(scale_inv_rowwise_);
+    scale_inv_rowwise_->to_cpu();
+    return scale_inv_rowwise_->cpu_buffer<T>();
   }
 
   template <typename T>
   T *columnwise_cpu_scale_inv_ptr(){
-    if (tensor_.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING){
-      NVTE_CHECK(TypeInfo<T>::dtype == DType::kFloat32, "Invalid type!");
-    } else if (tensor_.scaling_mode() == NVTE_BLOCK_SCALING_1D || tensor_.scaling_mode() == NVTE_BLOCK_SCALING_2D) {
-      NVTE_CHECK(TypeInfo<T>::dtype == DType::kFloat32, "Invalid type!");
-    } else if (tensor_.scaling_mode() == NVTE_NVFP4_1D_SCALING) {
-      NVTE_CHECK(TypeInfo<T>::dtype == DType::kFloat8E4M3, "Invalid type!");
-    } else {
-      NVTE_CHECK(TypeInfo<T>::dtype == DType::kByte, "Invalid type!");
-    }
-    to_cpu();
-    return reinterpret_cast<T *>(columnwise_scale_inv_cpu_data_.get());
+    NVTE_CHECK(scale_inv_columnwise_);
+    scale_inv_columnwise_->to_cpu();
+    return scale_inv_columnwise_->cpu_buffer<T>();
   }
 
-  float rowwise_scale_inv(){
-    if(rowwise_scale_inv_cpu_data_) {
-      float scale_inv = rowwise_cpu_scale_inv_ptr<float>()[0];
-      return scale_inv;
-    } else {
-      return 1;
-    }
-  }
-
-  bool rowwise() const {
-    return rowwise_;
+  template <typename T>
+  T *cpu_rowwise_amax_ptr() {
+    NVTE_CHECK(amax_rowwise_);
+    amax_rowwise_->to_cpu();
+    return amax_rowwise_->cpu_buffer<T>();
   }
 
-  bool columnwise() const {
-    return columnwise_;
+  template <typename T>
+  T *cpu_columnwise_amax_ptr() {
+    NVTE_CHECK(amax_columnwise_);
+    amax_columnwise_->to_cpu();
+    return amax_columnwise_->cpu_buffer<T>();
   }
 
-  void set_tensor_amax(const float amax) {
-    if (amax_cpu_data_) {
-      *amax_cpu_data_ = amax;
-      from_cpu();
-    }
+  size_t rowwise_amax_size() const noexcept {
+    return amax_rowwise_ ? amax_rowwise_->size() : 0;
   }
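A hedged sketch of the new amax accessors in use, assuming a row-scaled NVFP4 tensor (see the header note at the end of this diff) and the gtest macros used elsewhere in these tests:

```cpp
// `t` is a hypothetical row-scaled NVFP4 test::Tensor; the accessors expose
// the whole per-row amax buffer rather than a single scalar.
void check_row_amaxes(test::Tensor &t) {
  const float *amax = t.cpu_rowwise_amax_ptr<float>();
  for (size_t i = 0; i < t.rowwise_amax_size(); ++i) {
    EXPECT_GE(amax[i], 0.f);  // amax values are non-negative by definition
  }
}
```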
-  void set_tensor_amax_columnwise(const float amax) {
-    if (amax_cpu_data_columnwise_) {
-      *amax_cpu_data_columnwise_ = amax;
-      from_cpu();
-    }
+  bool rowwise() const {
+    return rowwise_;
   }
 
-  void set_tensor_amax_nullptr(){
-    tensor_.set_amax(nullptr, DType::kFloat32, tensor_.defaultShape);
+  bool columnwise() const {
+    return columnwise_;
   }
 
-  void set_tensor_amax_shape(const std::vector<size_t> &shape);
-  std::vector<float> tensor_amax_values() const;
-  void copy_tensor_amax_from(const Tensor &other);
+  void set_tensor_amax_nullptr();
 
-  void set_with_gemm_swizzled_scales(bool with_gemm_swizzled_scales){
-    tensor_.set_with_gemm_swizzled_scales(with_gemm_swizzled_scales);
-  }
+  void set_with_gemm_swizzled_scales(bool with_gemm_swizzled_scales);
+  void set_row_scaled_nvfp4(bool row_scaled_nvfp4);
 
-  void set_row_scaled_nvfp4(bool row_scaled_nvfp4) {
-    tensor_.set_row_scaled_nvfp4(row_scaled_nvfp4);
-  }
+  void to_cpu();
+  void from_cpu();
 
-  void to_cpu() const;
-  void from_cpu() const;
+  void set_amax(float amax);
   void set_scale(float scale);
   void set_scale_inv(float scale_inv);
-  void shareFP8Meta(const Tensor &other);
+  void set_tensor_amax_columnwise(float amax);
+
+  void fill_uniform_rowwise_scale_inv();
+  void fill_uniform_columnwise_scale_inv();
+  void fill_uniform_scale();
 
   std::mt19937& gen() { return gen_; }
 
  private:
+
+  /* Manages matching GPU and CPU buffers. */
+  class Buffer {
+   public:
+
+    Buffer(size_t size = 0, DType dtype = DType::kByte);
+    ~Buffer() = default;
+    Buffer(const Buffer&) = delete;
+    Buffer& operator=(const Buffer&) = delete;
+    Buffer(Buffer&&) = default;
+    Buffer& operator=(Buffer&&) = default;
+
+    size_t size() const noexcept { return size_; }
+    DType dtype() const noexcept { return dtype_; }
+
+    // Void pointer accessors
+    void *cpu_buffer() { return cpu_buffer_.get(); }
+    const void *cpu_buffer() const { return cpu_buffer_.get(); }
+    void *gpu_buffer() { return gpu_buffer_.get(); }
+    const void *gpu_buffer() const { return gpu_buffer_.get(); }
+
+    // Templated pointer accessors
+    template <typename T>
+    T *cpu_buffer() {
+      return reinterpret_cast<T *>(cpu_buffer());
+    }
+    template <typename T>
+    const T *cpu_buffer() const {
+      return const_cast<Buffer *>(this)->cpu_buffer<T>();
+    }
+    template <typename T>
+    T *gpu_buffer() {
+      return reinterpret_cast<T *>(gpu_buffer());
+    }
+    template <typename T>
+    const T *gpu_buffer() const {
+      return const_cast<Buffer *>(this)->gpu_buffer<T>();
+    }
+
+    // Memory transfers between CPU and GPU
+    void to_cpu();
+    void from_cpu();
+
+   private:
+    std::unique_ptr<char[]> cpu_buffer_;
+    CudaPtr<void> gpu_buffer_;
+    size_t size_;
+    DType dtype_;
+    size_t bytes_;
+  };
+
+  // Transformer Engine tensor
   TensorWrapper tensor_;
-  std::unique_ptr<char[]> cpu_data_rowwise_;
-  std::unique_ptr<char[]> cpu_data_columnwise_;
-  std::shared_ptr<float> amax_cpu_data_;
-  std::shared_ptr<float> amax_cpu_data_columnwise_;
-  std::shared_ptr<float> scale_cpu_data_;
-  std::unique_ptr<char[]> rowwise_scale_inv_cpu_data_;
-  std::unique_ptr<char[]> columnwise_scale_inv_cpu_data_;
+
+  // Data buffers
+  std::optional<Buffer> data_rowwise_;
+  std::optional<Buffer> data_columnwise_;
+  std::shared_ptr<Buffer> scale_inv_rowwise_;
+  std::shared_ptr<Buffer> scale_inv_columnwise_;
+  std::optional<Buffer> amax_rowwise_;
+  std::optional<Buffer> amax_columnwise_;
+  std::optional<Buffer> scale_;
+
   bool rowwise_;
   bool columnwise_;
   std::string name_;
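The `Buffer::to_cpu`/`from_cpu` definitions live in test_common.cu and are not part of this hunk; a plausible sketch of their shape, assuming plain `cudaMemcpy` transfers over the cached byte count:

```cpp
// Guessed implementations, consistent with the members declared above.
void Tensor::Buffer::to_cpu() {
  if (bytes_ > 0) {
    NVTE_CHECK_CUDA(cudaMemcpy(cpu_buffer_.get(), gpu_buffer_.get(),
                               bytes_, cudaMemcpyDeviceToHost));
  }
}
void Tensor::Buffer::from_cpu() {
  if (bytes_ > 0) {
    NVTE_CHECK_CUDA(cudaMemcpy(gpu_buffer_.get(), cpu_buffer_.get(),
                               bytes_, cudaMemcpyHostToDevice));
  }
}
```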
@@ -497,17 +525,12 @@ inline float dsilu(const float x) { return x * dsigmoid(x) + sigmoid(x); }
 inline float srelu(const float x) { return x > 0 ? x * x : 0; }
 inline float dsrelu(const float x) { return fmaxf(0, 2 * x); }
 
-size_t typeToNumBits(DType type);
-size_t product(const NVTEShape &shape);
-size_t product(const std::vector<size_t> &shape);
-size_t bytes(const NVTEShape& shape, const DType type);
-
 size_t first_dimension(const std::vector<size_t> &shape);
 size_t last_dimension(const std::vector<size_t> &shape);
 
 bool areShapesEqual(const NVTEShape &s1, const NVTEShape &s2);
 
-void compareResults(const std::string &name, const Tensor &test, const void *ref,
+void compareResults(const std::string &name, Tensor &test, const void *ref,
                     bool rowwise, double atol = 1e-5, double rtol = 1e-8, bool if_on_gpus = true,
                     const size_t tolerable_mismatches_limit = 0);
 void compareResults(const std::string &name, const float test, const float ref,
@@ -550,26 +573,14 @@ int32_t getDeviceComputeCapability();
 constexpr int32_t hopperComputeCapability = 90;
 constexpr int32_t blackwellComputeCapability = 100;
 
-// Custom deleters for RAII
-struct CudaDeleter {
-  void operator()(void* p) const { if (p) cudaFree(p); }
-};
+// Custom deleter for RAII
 struct GroupedTensorDeleter {
   void operator()(NVTEGroupedTensor h) const { if (h) nvte_destroy_grouped_tensor(h); }
 };
 
-template <typename T>
-using CudaPtr = std::unique_ptr<T, CudaDeleter>;
+// Grouped tensor RAII class
 using GroupedTensorHandle = std::unique_ptr<std::remove_pointer_t<NVTEGroupedTensor>, GroupedTensorDeleter>;
 
-// Helper to allocate CUDA memory into a CudaPtr
-template <typename T = void>
-CudaPtr<T> cuda_alloc(size_t bytes) {
-  void* ptr = nullptr;
-  NVTE_CHECK_CUDA(cudaMalloc(&ptr, bytes));
-  return CudaPtr<T>(static_cast<T*>(ptr));
-}
-
 // Helper owning GPU buffers that back NVTEGroupedTensor.
 // NVTEGroupedTensor does not own memory; data/offsets/scales
 // must be allocated and freed by the test.
diff --git a/tests/cpp_distributed/test_comm_gemm.cu b/tests/cpp_distributed/test_comm_gemm.cu
index cc0d760a39..45f6664567 100644
--- a/tests/cpp_distributed/test_comm_gemm.cu
+++ b/tests/cpp_distributed/test_comm_gemm.cu
@@ -107,8 +107,10 @@ std::vector<float> CopyMatrix(const std::vector<float>& data, size_t mstart, size_t nsta
 template <typename T>
 test::Tensor Make(size_t m, size_t n, float scale) {
   test::Tensor ret("", std::vector<size_t>{n, m}, TypeInfo<T>::dtype);
-  ret.set_scale(scale);
-  ret.set_scale_inv(1.0 / scale);
+  if (test::isFp8Type(TypeInfo<T>::dtype)) {
+    ret.set_scale(scale);
+    ret.set_scale_inv(1.0 / scale);
+  }
   return ret;
 }
 
@@ -116,8 +118,10 @@ template <typename T>
 test::Tensor MakeFromData(const std::vector<float>& data, size_t mstart, size_t nstart,
                           size_t msize, size_t nsize, size_t ld, float scale) {
   test::Tensor ret("", std::vector<size_t>{nsize, msize}, TypeInfo<T>::dtype);
-  ret.set_scale(scale);
-  ret.set_scale_inv(1.0 / scale);
+  if (test::isFp8Type(TypeInfo<T>::dtype)) {
+    ret.set_scale(scale);
+    ret.set_scale_inv(1.0 / scale);
+  }
   auto local = CopyMatrix(data, mstart, nstart, msize, nsize, ld);
   NVTE_CHECK_CUDA(cudaMemcpy(ret.rowwise_dptr<T>(), local.data(), local.size() * sizeof local[0],
                              cudaMemcpyDefault));
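The FP8 guard matters because, after the refactor above, non-FP8 tensors no longer carry scale/scale-inv buffers and `Tensor::set_scale()` asserts that one exists (`NVTE_CHECK(scale_)`). A hedged illustration, where the sizes and scale value are arbitrary and `fp8e4m3` is the Transformer Engine FP8 type:

```cpp
auto a = Make<float>(64, 64, /*scale=*/2.f);                       // guard skips set_scale
auto b = Make<transformer_engine::fp8e4m3>(64, 64, /*scale=*/2.f); // sets scale and scale_inv
```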
diff --git a/transformer_engine/common/include/transformer_engine/transformer_engine.h b/transformer_engine/common/include/transformer_engine/transformer_engine.h
index e9a6f4f735..488f259150 100644
--- a/transformer_engine/common/include/transformer_engine/transformer_engine.h
+++ b/transformer_engine/common/include/transformer_engine/transformer_engine.h
@@ -72,7 +72,17 @@ enum NVTETensorParam {
   kNVTEColumnwiseScaleInv = 5,     /*!< Scale inverse tensor for decoding Columnwise Data */
   kNVTEColumnwiseAmax = 6,         /*!< Columnwise Amax tensor */
   kNVTEWithGEMMSwizzledScales = 7, /*!< Whether scaling factors are in format expected by GEMM */
-  kNVTERowScaledNVFP4 = 8,         /*!< Whether an NVFP4 tensor uses row scaling */
+  /*! Whether an NVFP4 tensor uses row scaling instead of tensor scaling.
+   *
+   * Column-wise data is not supported with row scaling.
+   *
+   * Row scaling also affects the interpretation of the amax tensor.
+   * With tensor scaling, the amax tensor is a single FP32 value that
+   * must be computed prior to quantization. With row scaling, the
+   * amax tensor's size is the number of tensor rows (flattened to
+   * 2D), and its values are populated during quantization.
+   */
+  kNVTERowScaledNVFP4 = 8,
   kNVTENumTensorParams
 };
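A hedged sketch of how this parameter is driven from C++, based on the `set_row_scaled_nvfp4` wrapper call that appears in the test changes above (the scaling mode and usage are illustrative):

```cpp
// Enable row-scaled NVFP4 on a TensorWrapper. With row scaling, the amax
// buffer must hold one FP32 value per row of the flattened 2D tensor, and
// those values are written during quantization rather than beforehand.
transformer_engine::TensorWrapper t(NVTE_NVFP4_1D_SCALING);
t.set_row_scaled_nvfp4(true);
```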