10 changes: 6 additions & 4 deletions tests/cpp/operator/test_act.cu
@@ -124,6 +124,7 @@ void performTest(const size_t N, const size_t H) {
   fillUniform(&input);
   fillUniform(&ograd);
   setRandomScale(&output);
+  const float ref_scale = isFp8Type(otype) ? output.scale() : 1.0f;

   std::unique_ptr<OType[]> ref_output = std::make_unique<OType[]>(N*H);
   std::unique_ptr<IType[]> ref_igrad = std::make_unique<IType[]>(N*H);
@@ -132,7 +133,7 @@ void performTest(const size_t N, const size_t H) {

   float ref_amax;
   compute_ref_act_cast<ref_act>(input.rowwise_cpu_dptr<IType>(), ref_output.get(),
-                                output.scale(), &ref_amax, N, H);
+                                ref_scale, &ref_amax, N, H);

   cudaDeviceSynchronize();
   auto err = cudaGetLastError();
@@ -179,6 +180,7 @@ void performTestGLU(const size_t N, const size_t H) {
   fillUniform(&input);
   fillUniform(&ograd);
   setRandomScale(&output);
+  const float ref_scale = isFp8Type(otype) ? output.scale() : 1.0f;

   std::unique_ptr<OType[]> ref_output = std::make_unique<OType[]>(N * H);
   std::unique_ptr<IType[]> ref_igrad = std::make_unique<IType[]>(2 * N * H);
@@ -187,7 +189,7 @@ void performTestGLU(const size_t N, const size_t H) {

   float ref_amax;
   compute_ref_glu_act_cast<ref_act>(input.rowwise_cpu_dptr<IType>(), ref_output.get(),
-                                    output.scale(), &ref_amax, N, H);
+                                    ref_scale, &ref_amax, N, H);

   cudaDeviceSynchronize();
   auto err = cudaGetLastError();
@@ -197,8 +199,8 @@
     auto [atol, rtol] = getTolerances(DType::kFloat32);
     compareResults("amax", output.amax(), ref_amax, atol, rtol);
     if (output.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING) {
-      const float ref_scale = 1.f / output.scale();
-      compareResults("scale_inv", *output.rowwise_cpu_scale_inv_ptr<float>(), ref_scale, atol, rtol);
+      const float ref_scale_inv = 1.f / ref_scale;
+      compareResults("scale_inv", *output.rowwise_cpu_scale_inv_ptr<float>(), ref_scale_inv, atol, rtol);
     }
   }
   auto [atol, rtol] = getTolerances(otype);
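The recurring fix in this and the following files: output.scale() is only meaningful when the output tensor is FP8, while setRandomScale() writes a randomized scale regardless of dtype. The CPU reference must therefore fall back to a scale of 1.0 for non-FP8 outputs, and the scale_inv check must be derived from the same guarded value. A minimal host-side sketch of the pattern; the DType, isFp8Type, and tensor types below are illustrative stand-ins for the repo's test utilities, not the real definitions:

#include <cstdio>

// Illustrative stand-ins for the test helpers used in the diffs above.
enum class DType { kFloat32, kBFloat16, kFloat8E4M3, kFloat8E5M2 };

bool isFp8Type(DType t) {
  return t == DType::kFloat8E4M3 || t == DType::kFloat8E5M2;
}

struct FakeTensor {
  DType dtype;
  float scale_value;                      // only meaningful for FP8 outputs
  float scale() const { return scale_value; }
};

int main() {
  // A non-FP8 output may still carry a (stale) randomized scale, as
  // setRandomScale() does in the tests; the guard makes the CPU
  // reference ignore it.
  FakeTensor fp8_out {DType::kFloat8E4M3, 448.0f};
  FakeTensor bf16_out{DType::kBFloat16,   448.0f};

  for (const FakeTensor& out : {fp8_out, bf16_out}) {
    const float ref_scale     = isFp8Type(out.dtype) ? out.scale() : 1.0f;
    const float ref_scale_inv = 1.f / ref_scale;  // mirrors the scale_inv check
    std::printf("ref_scale = %g, ref_scale_inv = %g\n", ref_scale, ref_scale_inv);
  }
  return 0;
}

Before this change the reference paths read output.scale() unconditionally, so runs with non-FP8 output types presumably compared against a randomly scaled reference.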
5 changes: 3 additions & 2 deletions tests/cpp/operator/test_cast.cu
@@ -53,21 +53,22 @@ void performTest(const std::vector<size_t>& shape) {

   fillUniform(&input);
   setRandomScale(&output_c);
+  const float ref_scale = isFp8Type(otype) ? output_c.scale() : 1.0f;

   nvte_quantize(input.data(), output_c.data(), 0);

   float ref_amax;

   compute_ref<InputType, OutputType>(input.rowwise_cpu_dptr<InputType>(), ref_output_c.get(),
-                                     full_size, &ref_amax, output_c.scale());
+                                     full_size, &ref_amax, ref_scale);

   cudaDeviceSynchronize();
   auto err = cudaGetLastError();
   ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err);
   if (isFp8Type(otype)) {
     auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
     compareResults("amax", output_c.amax(), ref_amax, atol_amax, rtol_amax);
-    float ref_scale_inv = 1.f / output_c.scale();
+    float ref_scale_inv = 1.f / ref_scale;
     compareResults("scale_inv", output_c.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
   }
   auto [atol, rtol] = getTolerances(otype);
7 changes: 4 additions & 3 deletions tests/cpp/operator/test_cast_current_scaling.cu
@@ -123,28 +123,29 @@ void performTest(const std::vector<size_t>& shape) {
     nvte_compute_amax(input.data(), output_c.data(), 0);
     QuantizationConfigWrapper config;
     nvte_compute_scale_from_amax(output_c.data(), config, 0);
+
     // avoid atomic amax update in cuda cast kernels because of current per-tensor scaling
     amax_to_check = output_c.amax();
     output_c.set_tensor_amax_nullptr();
   }
   nvte_quantize(input.data(), output_c.data(), 0);

   float ref_amax;
-  float ref_scale;
+  float ref_scale = 1.0;
   float ref_scale_inv;
   if (is_out_fp8){
     compute_amax_scale_ref<InputType, OutputType>(input.rowwise_cpu_dptr<InputType>(),
         full_size, &ref_amax, &ref_scale, &ref_scale_inv, max_fp8, 0.0f);
   }

   compute_ref<InputType, OutputType>(input.rowwise_cpu_dptr<InputType>(), ref_output_c.get(),
-                                     full_size, nullptr, is_out_fp8 ? output_c.scale() : 1.0f );
+                                     full_size, nullptr, ref_scale);

   cudaDeviceSynchronize();

   auto err = cudaGetLastError();
   ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err);
-  if (isFp8Type(otype)) {
+  if (is_out_fp8) {
     auto [atol_fp32, rtol_fp32] = getTolerances(DType::kFloat32);
     compareResults("amax", amax_to_check, ref_amax, 0.0f, rtol_fp32);
     compareResults("scale", output_c.scale(), ref_scale, 0.0f, rtol_fp32);
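This file differs from the others: under current (per-tensor) scaling the reference scale is not randomized but derived from the tensor's amax, so ref_scale is initialized to 1.0 and only overwritten on the FP8 path by compute_amax_scale_ref. A sketch of the usual amax-to-scale rule under the convention scale = fp8_max / amax; the test's compute_amax_scale_ref may handle edge cases (zero amax, clamping) differently, so treat this as a model rather than the reference implementation:

#include <cmath>
#include <cstdio>

// Hypothetical helper modeling current per-tensor scaling:
// pick a scale so the largest magnitude maps to the FP8 max.
void amax_to_scale(float amax, float fp8_max,
                   float* scale, float* scale_inv) {
  // Guard against a zero or non-finite amax: fall back to identity scaling.
  if (!(amax > 0.0f) || !std::isfinite(amax)) {
    *scale = 1.0f;
  } else {
    *scale = fp8_max / amax;
  }
  *scale_inv = 1.0f / *scale;
}

int main() {
  float scale, scale_inv;
  amax_to_scale(/*amax=*/3.5f, /*fp8_max=*/448.0f, &scale, &scale_inv);
  std::printf("scale=%g scale_inv=%g\n", scale, scale_inv);  // 128, 0.0078125
  return 0;
}

With E4M3's maximum representable value of 448, an amax of 3.5 yields scale 128 and scale_inv 1/128, which is what the test's scale and scale_inv comparisons check against.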
5 changes: 3 additions & 2 deletions tests/cpp/operator/test_cast_dbias.cu
@@ -74,13 +74,14 @@ void performTest(const std::vector<size_t>& shape) {

   fillUniform(&input);
   setRandomScale(&output_c);
+  const float ref_scale = isFp8Type(otype) ? output_c.scale() : 1.0f;

   std::unique_ptr<OType[]> ref_output_c = std::make_unique<OType[]>(N*H);
   std::unique_ptr<IType[]> ref_output_dbias = std::make_unique<IType[]>(H);

   CType ref_amax;
   compute_ref_cast_dbias(input.rowwise_cpu_dptr<IType>(),
-                         output_c.scale(),
+                         ref_scale,
                          ref_output_c.get(),
                          &ref_amax,
                          ref_output_dbias.get(),
@@ -109,7 +110,7 @@ void performTest(const std::vector<size_t>& shape) {
   if (isFp8Type(otype)) {
     auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
     compareResults("amax", output_c.amax(), ref_amax, atol_amax, rtol_amax);
-    float ref_scale_inv = 1.f / output_c.scale();
+    float ref_scale_inv = 1.f / ref_scale;
     compareResults("scale_inv", output_c.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
   }
   auto [atol, rtol] = getTolerances(otype);
5 changes: 3 additions & 2 deletions tests/cpp/operator/test_cast_dbias_dgelu.cu
@@ -84,14 +84,15 @@ void performTest(const std::vector<size_t>& shape) {
   fillUniform(&input);
   fillUniform(&grad);
   setRandomScale(&output_c);
+  const float ref_scale = isFp8Type(otype) ? output_c.scale() : 1.0f;

   std::unique_ptr<OType[]> ref_output_c = std::make_unique<OType[]>(N*H);
   std::unique_ptr<IType[]> ref_output_dbias = std::make_unique<IType[]>(H);

   CType ref_amax;
   compute_ref_cast_dbias_dgelu(input.rowwise_cpu_dptr<IType>(),
                                grad.rowwise_cpu_dptr<IType>(),
-                               output_c.scale(),
+                               ref_scale,
                                ref_output_c.get(),
                                &ref_amax,
                                ref_output_dbias.get(),
@@ -123,7 +124,7 @@ void performTest(const std::vector<size_t>& shape) {
   if (isFp8Type(otype)) {
     auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
     compareResults("amax", output_c.amax(), ref_amax, atol_amax, rtol_amax);
-    float ref_scale_inv = 1.f / output_c.scale();
+    float ref_scale_inv = 1.f / ref_scale;
     compareResults("scale_inv", output_c.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
   }

28 changes: 12 additions & 16 deletions tests/cpp/operator/test_cast_float8blockwise.cu
@@ -524,14 +524,12 @@ TEST_P(FusedCastFloat8BlockwiseTestSuite, TestFusedCastFloat8Blockwise) {
   //   GTEST_SKIP();
   // }

-  DACT_FUNC_SWITCH(
-      Act_type, OP,
-      TRANSFORMER_ENGINE_TYPE_SWITCH_FP16_FP32_ONLY(
-          input_type, InputType,
-          TRANSFORMER_ENGINE_TYPE_SWITCH_FP8_ONLY(
-              output_type, OutputType,
-              runTestCase<InputType, OutputType>(processing_method, matrix_size, rowwise, colwise,
-                                                 fill_case, q_opts););););
+  TRANSFORMER_ENGINE_TYPE_SWITCH_FP16_FP32_ONLY(
+      input_type, InputType,
+      TRANSFORMER_ENGINE_TYPE_SWITCH_FP8_ONLY(
+          output_type, OutputType,
+          runTestCase<InputType, OutputType>(processing_method, matrix_size, rowwise, colwise,
+                                             fill_case, q_opts);););
 }

 TEST_P(FusedCastFloat8VectorwiseTestSuite, TestFusedCastFloat8Vectorwise) {
@@ -581,14 +579,12 @@ TEST_P(FusedCastFloat8VectorwiseTestSuite, TestFusedCastFloat8Vectorwise) {
   //   GTEST_SKIP();
   // }

-  DACT_FUNC_SWITCH(
-      Act_type, OP,
-      TRANSFORMER_ENGINE_TYPE_SWITCH_FP16_FP32_ONLY(
-          input_type, InputType,
-          TRANSFORMER_ENGINE_TYPE_SWITCH_FP8_ONLY(
-              output_type, OutputType,
-              runTestCaseOneDimensionalBlocks<InputType, OutputType>(
-                  processing_method, matrix_size, rowwise, colwise, fill_case, q_opts););););
+  TRANSFORMER_ENGINE_TYPE_SWITCH_FP16_FP32_ONLY(
+      input_type, InputType,
+      TRANSFORMER_ENGINE_TYPE_SWITCH_FP8_ONLY(
+          output_type, OutputType,
+          runTestCaseOneDimensionalBlocks<InputType, OutputType>(
+              processing_method, matrix_size, rowwise, colwise, fill_case, q_opts);););
 }

 std::string to_string(const ProcessingMethod method) {
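These two hunks drop the outer DACT_FUNC_SWITCH dispatch level, leaving only the nested input/output dtype switches around the templated test body. As a toy model of how such TYPE_SWITCH macros dispatch a runtime dtype tag to a compile-time type; the real TRANSFORMER_ENGINE_TYPE_SWITCH_* macros cover more types and error paths, and everything below is illustrative only:

#include <cstdint>
#include <cstdio>

enum class DType { kFloat32, kFloat8 };

// Two distinct macros are needed for nesting: the preprocessor will not
// re-expand a macro that appears inside its own replacement list.
#define TOY_INPUT_SWITCH(dtype, T, ...)                                   \
  switch (dtype) {                                                        \
    case DType::kFloat32: { using T = float;        __VA_ARGS__ } break;  \
    case DType::kFloat8:  { using T = std::uint8_t; __VA_ARGS__ } break;  \
  }

#define TOY_OUTPUT_SWITCH(dtype, T, ...)                                  \
  switch (dtype) {                                                        \
    case DType::kFloat32: { using T = float;        __VA_ARGS__ } break;  \
    case DType::kFloat8:  { using T = std::uint8_t; __VA_ARGS__ } break;  \
  }

template <typename In, typename Out>
void runToyCase() {
  std::printf("dispatched: %zu-byte in, %zu-byte out\n", sizeof(In), sizeof(Out));
}

int main() {
  DType in = DType::kFloat32, out = DType::kFloat8;
  // Same shape as the diff: dispatch the input dtype, then the output
  // dtype, then call the templated test body. The comma inside
  // runToyCase<...> is safe because the body is captured as __VA_ARGS__.
  TOY_INPUT_SWITCH(in, InputType,
      TOY_OUTPUT_SWITCH(out, OutputType,
          runToyCase<InputType, OutputType>();))
  return 0;
}

Each switch level multiplies the number of instantiated template bodies, which is why removing the unused activation-function dispatch level shrinks both the source and the compiled test.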
5 changes: 3 additions & 2 deletions tests/cpp/operator/test_cast_gated_swiglu.cu
@@ -79,6 +79,7 @@ void performTest(const std::vector<size_t>& shape) {
   fillUniform(&grad);
   fillUniform(&input);
   setRandomScale(&output_c);
+  const float ref_scale = isFp8Type(otype) ? output_c.scale() : 1.0f;

   std::unique_ptr<OType[]> ref_output_c = std::make_unique<OType[]>(input_size);

@@ -91,7 +92,7 @@
   float ref_amax;
   compute_ref_cast_dgated_swiglu(grad.rowwise_cpu_dptr<IType>(),
                                  input.rowwise_cpu_dptr<IType>(),
-                                 output_c.scale(),
+                                 ref_scale,
                                  ref_output_c.get(),
                                  &ref_amax,
                                  rows,
@@ -100,7 +101,7 @@
   if (isFp8Type(otype)) {
     auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
     compareResults("amax", output_c.amax(), ref_amax, atol_amax, rtol_amax);
-    float ref_scale_inv = 1.f / output_c.scale();
+    float ref_scale_inv = 1.f / ref_scale;
     compareResults("scale_inv", output_c.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
   }
