diff --git a/tests/cpp/operator/test_act.cu b/tests/cpp/operator/test_act.cu index ca5ccdc4ce..6edc6bd63b 100644 --- a/tests/cpp/operator/test_act.cu +++ b/tests/cpp/operator/test_act.cu @@ -124,6 +124,7 @@ void performTest(const size_t N, const size_t H) { fillUniform(&input); fillUniform(&ograd); setRandomScale(&output); + const float ref_scale = isFp8Type(otype) ? output.scale() : 1.0f; std::unique_ptr ref_output = std::make_unique(N*H); std::unique_ptr ref_igrad = std::make_unique(N*H); @@ -132,7 +133,7 @@ void performTest(const size_t N, const size_t H) { float ref_amax; compute_ref_act_cast(input.rowwise_cpu_dptr(), ref_output.get(), - output.scale(), &ref_amax, N, H); + ref_scale, &ref_amax, N, H); cudaDeviceSynchronize(); auto err = cudaGetLastError(); @@ -179,6 +180,7 @@ void performTestGLU(const size_t N, const size_t H) { fillUniform(&input); fillUniform(&ograd); setRandomScale(&output); + const float ref_scale = isFp8Type(otype) ? output.scale() : 1.0f; std::unique_ptr ref_output = std::make_unique(N * H); std::unique_ptr ref_igrad = std::make_unique(2 * N * H); @@ -187,7 +189,7 @@ void performTestGLU(const size_t N, const size_t H) { float ref_amax; compute_ref_glu_act_cast(input.rowwise_cpu_dptr(), ref_output.get(), - output.scale(), &ref_amax, N, H); + ref_scale, &ref_amax, N, H); cudaDeviceSynchronize(); auto err = cudaGetLastError(); @@ -197,8 +199,8 @@ void performTestGLU(const size_t N, const size_t H) { auto [atol, rtol] = getTolerances(DType::kFloat32); compareResults("amax", output.amax(), ref_amax, atol, rtol); if (output.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING) { - const float ref_scale = 1.f / output.scale(); - compareResults("scale_inv", *output.rowwise_cpu_scale_inv_ptr(), ref_scale, atol, rtol); + const float ref_scale_inv = 1.f / ref_scale; + compareResults("scale_inv", *output.rowwise_cpu_scale_inv_ptr(), ref_scale_inv, atol, rtol); } } auto [atol, rtol] = getTolerances(otype); diff --git a/tests/cpp/operator/test_cast.cu b/tests/cpp/operator/test_cast.cu index 35d9dd2efd..e8f48feef8 100644 --- a/tests/cpp/operator/test_cast.cu +++ b/tests/cpp/operator/test_cast.cu @@ -53,13 +53,14 @@ void performTest(const std::vector& shape) { fillUniform(&input); setRandomScale(&output_c); + const float ref_scale = isFp8Type(otype) ? 
output_c.scale() : 1.0f;
   nvte_quantize(input.data(), output_c.data(), 0);
   float ref_amax;
   compute_ref(input.rowwise_cpu_dptr(), ref_output_c.get(),
-              full_size, &ref_amax, output_c.scale());
+              full_size, &ref_amax, ref_scale);
   cudaDeviceSynchronize();
   auto err = cudaGetLastError();
@@ -67,7 +68,7 @@
   if (isFp8Type(otype)) {
     auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32);
     compareResults("amax", output_c.amax(), ref_amax, atol_amax, rtol_amax);
-    float ref_scale_inv = 1.f / output_c.scale();
+    float ref_scale_inv = 1.f / ref_scale;
     compareResults("scale_inv", output_c.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax);
   }
   auto [atol, rtol] = getTolerances(otype);
diff --git a/tests/cpp/operator/test_cast_current_scaling.cu b/tests/cpp/operator/test_cast_current_scaling.cu
index 4dd6cd2d58..7cca0d72e0 100644
--- a/tests/cpp/operator/test_cast_current_scaling.cu
+++ b/tests/cpp/operator/test_cast_current_scaling.cu
@@ -123,6 +123,7 @@ void performTest(const std::vector<size_t>& shape) {
   nvte_compute_amax(input.data(), output_c.data(), 0);
   QuantizationConfigWrapper config;
   nvte_compute_scale_from_amax(output_c.data(), config, 0);
+  // Null out the amax pointer to avoid the atomic amax update in the CUDA cast kernels; current per-tensor scaling has already computed the amax above
   amax_to_check = output_c.amax();
   output_c.set_tensor_amax_nullptr();
@@ -130,7 +131,7 @@ void performTest(const std::vector<size_t>& shape) {
   nvte_quantize(input.data(), output_c.data(), 0);
   float ref_amax;
-  float ref_scale;
+  float ref_scale = 1.0f;
   float ref_scale_inv;
   if (is_out_fp8){
     compute_amax_scale_ref(input.rowwise_cpu_dptr(),
   }
   compute_ref(input.rowwise_cpu_dptr(), ref_output_c.get(),
-              full_size, nullptr, is_out_fp8 ? output_c.scale() : 1.0f );
+              full_size, nullptr, ref_scale);
   cudaDeviceSynchronize();
   auto err = cudaGetLastError();
   ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err);
-  if (isFp8Type(otype)) {
+  if (is_out_fp8) {
     auto [atol_fp32, rtol_fp32] = getTolerances(DType::kFloat32);
     compareResults("amax", amax_to_check, ref_amax, 0.0f, rtol_fp32);
     compareResults("scale", output_c.scale(), ref_scale, 0.0f, rtol_fp32);
diff --git a/tests/cpp/operator/test_cast_dbias.cu b/tests/cpp/operator/test_cast_dbias.cu
index 18f07153c6..b7b5db48c3 100644
--- a/tests/cpp/operator/test_cast_dbias.cu
+++ b/tests/cpp/operator/test_cast_dbias.cu
@@ -74,13 +74,14 @@ void performTest(const std::vector<size_t>& shape) {
   fillUniform(&input);
   setRandomScale(&output_c);
+  const float ref_scale = isFp8Type(otype) ?
output_c.scale() : 1.0f; std::unique_ptr ref_output_c = std::make_unique(N*H); std::unique_ptr ref_output_dbias = std::make_unique(H); CType ref_amax; compute_ref_cast_dbias(input.rowwise_cpu_dptr(), - output_c.scale(), + ref_scale, ref_output_c.get(), &ref_amax, ref_output_dbias.get(), @@ -109,7 +110,7 @@ void performTest(const std::vector& shape) { if (isFp8Type(otype)) { auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); compareResults("amax", output_c.amax(), ref_amax, atol_amax, rtol_amax); - float ref_scale_inv = 1.f / output_c.scale(); + float ref_scale_inv = 1.f / ref_scale; compareResults("scale_inv", output_c.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax); } auto [atol, rtol] = getTolerances(otype); diff --git a/tests/cpp/operator/test_cast_dbias_dgelu.cu b/tests/cpp/operator/test_cast_dbias_dgelu.cu index 8213e5665a..d8b8a20e6f 100644 --- a/tests/cpp/operator/test_cast_dbias_dgelu.cu +++ b/tests/cpp/operator/test_cast_dbias_dgelu.cu @@ -84,6 +84,7 @@ void performTest(const std::vector& shape) { fillUniform(&input); fillUniform(&grad); setRandomScale(&output_c); + const float ref_scale = isFp8Type(otype) ? output_c.scale() : 1.0f; std::unique_ptr ref_output_c = std::make_unique(N*H); std::unique_ptr ref_output_dbias = std::make_unique(H); @@ -91,7 +92,7 @@ void performTest(const std::vector& shape) { CType ref_amax; compute_ref_cast_dbias_dgelu(input.rowwise_cpu_dptr(), grad.rowwise_cpu_dptr(), - output_c.scale(), + ref_scale, ref_output_c.get(), &ref_amax, ref_output_dbias.get(), @@ -123,7 +124,7 @@ void performTest(const std::vector& shape) { if (isFp8Type(otype)) { auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); compareResults("amax", output_c.amax(), ref_amax, atol_amax, rtol_amax); - float ref_scale_inv = 1.f / output_c.scale(); + float ref_scale_inv = 1.f / ref_scale; compareResults("scale_inv", output_c.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax); } diff --git a/tests/cpp/operator/test_cast_float8blockwise.cu b/tests/cpp/operator/test_cast_float8blockwise.cu index 8e9da91d08..d50589ca43 100644 --- a/tests/cpp/operator/test_cast_float8blockwise.cu +++ b/tests/cpp/operator/test_cast_float8blockwise.cu @@ -524,14 +524,12 @@ TEST_P(FusedCastFloat8BlockwiseTestSuite, TestFusedCastFloat8Blockwise) { // GTEST_SKIP(); // } - DACT_FUNC_SWITCH( - Act_type, OP, - TRANSFORMER_ENGINE_TYPE_SWITCH_FP16_FP32_ONLY( - input_type, InputType, - TRANSFORMER_ENGINE_TYPE_SWITCH_FP8_ONLY( - output_type, OutputType, - runTestCase(processing_method, matrix_size, rowwise, colwise, - fill_case, q_opts);););); + TRANSFORMER_ENGINE_TYPE_SWITCH_FP16_FP32_ONLY( + input_type, InputType, + TRANSFORMER_ENGINE_TYPE_SWITCH_FP8_ONLY( + output_type, OutputType, + runTestCase(processing_method, matrix_size, rowwise, colwise, + fill_case, q_opts););); } TEST_P(FusedCastFloat8VectorwiseTestSuite, TestFusedCastFloat8Vectorwise) { @@ -581,14 +579,12 @@ TEST_P(FusedCastFloat8VectorwiseTestSuite, TestFusedCastFloat8Vectorwise) { // GTEST_SKIP(); // } - DACT_FUNC_SWITCH( - Act_type, OP, - TRANSFORMER_ENGINE_TYPE_SWITCH_FP16_FP32_ONLY( - input_type, InputType, - TRANSFORMER_ENGINE_TYPE_SWITCH_FP8_ONLY( - output_type, OutputType, - runTestCaseOneDimensionalBlocks( - processing_method, matrix_size, rowwise, colwise, fill_case, q_opts);););); + TRANSFORMER_ENGINE_TYPE_SWITCH_FP16_FP32_ONLY( + input_type, InputType, + TRANSFORMER_ENGINE_TYPE_SWITCH_FP8_ONLY( + output_type, OutputType, + runTestCaseOneDimensionalBlocks( + processing_method, matrix_size, rowwise, colwise, 
fill_case, q_opts););); } std::string to_string(const ProcessingMethod method) { diff --git a/tests/cpp/operator/test_cast_gated_swiglu.cu b/tests/cpp/operator/test_cast_gated_swiglu.cu index 298b978f2a..5298cc7577 100644 --- a/tests/cpp/operator/test_cast_gated_swiglu.cu +++ b/tests/cpp/operator/test_cast_gated_swiglu.cu @@ -79,6 +79,7 @@ void performTest(const std::vector& shape) { fillUniform(&grad); fillUniform(&input); setRandomScale(&output_c); + const float ref_scale = isFp8Type(otype) ? output_c.scale() : 1.0f; std::unique_ptr ref_output_c = std::make_unique(input_size); @@ -91,7 +92,7 @@ void performTest(const std::vector& shape) { float ref_amax; compute_ref_cast_dgated_swiglu(grad.rowwise_cpu_dptr(), input.rowwise_cpu_dptr(), - output_c.scale(), + ref_scale, ref_output_c.get(), &ref_amax, rows, @@ -100,7 +101,7 @@ void performTest(const std::vector& shape) { if (isFp8Type(otype)) { auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); compareResults("amax", output_c.amax(), ref_amax, atol_amax, rtol_amax); - float ref_scale_inv = 1.f / output_c.scale(); + float ref_scale_inv = 1.f / ref_scale; compareResults("scale_inv", output_c.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax); } diff --git a/tests/cpp/operator/test_cast_nvfp4_transpose.cu b/tests/cpp/operator/test_cast_nvfp4_transpose.cu index 1f37520bc7..a8f58f8598 100644 --- a/tests/cpp/operator/test_cast_nvfp4_transpose.cu +++ b/tests/cpp/operator/test_cast_nvfp4_transpose.cu @@ -4,6 +4,15 @@ * See LICENSE for license information. ************************************************************************/ +#include +#include +#include +#include +#include +#include +#include +#include + #include #include #include @@ -14,7 +23,6 @@ #include #include "../test_common.h" #include "transformer_engine/transformer_engine.h" -#include using namespace transformer_engine; using namespace test; @@ -309,40 +317,24 @@ void compute_ref(float (*OP)(const float), fp4e2m1x2* output_t, fp8e4m3* scales, fp8e4m3* scales_t, - const float global_amax, + const float* amax, const size_t rows, const size_t cols, const size_t scales_stride, const size_t scales_stride_t, const bool use_fast_math, const bool use_2d_quantization = false, - std::vector *rowwise_amax = nullptr) + const bool row_scaled_nvfp4 = false) { std::vector input_t = create_transpose(input, rows, cols); + NVTE_CHECK(!(use_2d_quantization && row_scaled_nvfp4), + "2D quantization and row-scaling are not supported together."); - if (rowwise_amax != nullptr) { - rowwise_amax->resize(rows, 0.0f); - for (size_t row = 0; row < rows; ++row) { - float row_amax = 0.0f; - for (size_t col = 0; col < cols; ++col) { - row_amax = fmaxf(row_amax, fabsf(static_cast(input[row * cols + col]))); - } - (*rowwise_amax)[row] = row_amax; - quantize_nvfp4(OP, - input + row * cols, - output + row * (cols / 2), - scales + row * scales_stride, - 1, - cols, - scales_stride, - row_amax, - use_fast_math, - use_2d_quantization); - } - } else if (use_2d_quantization) { + // Ref impl for 2D quantization + if (use_2d_quantization) { // Step 1: Compute mathematical 8×8 scaling factors std::vector> math_scales; - compute_2d_mathematical_scales(OP, input, rows, cols, global_amax, math_scales, use_fast_math); + compute_2d_mathematical_scales(OP, input, rows, cols, *amax, math_scales, use_fast_math); constexpr size_t block_size_Y = 16; constexpr size_t block_size_X = 16; @@ -369,17 +361,36 @@ void compute_ref(float (*OP)(const float), // Step 4: Process quantized outputs using the same algorithm as 
quantize_nvfp4_2d // (This part processes the actual FP4 data using the mathematical scaling factors) - quantize_nvfp4_2d(OP, input, output, nullptr, rows, cols, scales_stride, global_amax, + quantize_nvfp4_2d(OP, input, output, nullptr, rows, cols, scales_stride, *amax, use_fast_math); // scales already filled - quantize_nvfp4_2d(OP, input_t.data(), output_t, nullptr, cols, rows, scales_stride_t, global_amax, + quantize_nvfp4_2d(OP, input_t.data(), output_t, nullptr, cols, rows, scales_stride_t, *amax, use_fast_math); // scales_t already filled - } else { - quantize_nvfp4(OP, input, output, scales, rows, cols, scales_stride, global_amax, - use_fast_math, use_2d_quantization); - quantize_nvfp4(OP, input_t.data(), output_t, scales_t, cols, rows, scales_stride_t, global_amax, - use_fast_math, use_2d_quantization); + return; + } + + // Ref impl for row-scaling + if (row_scaled_nvfp4) { + for (size_t row = 0; row < rows; ++row) { + quantize_nvfp4(OP, + input + row * cols, + output + row * (cols / 2), + scales + row * scales_stride, + 1, + cols, + scales_stride, + amax[row], + use_fast_math, + use_2d_quantization); + } + return; } + + // Ref impl for basic NVFP4 + quantize_nvfp4(OP, input, output, scales, rows, cols, scales_stride, *amax, + use_fast_math, use_2d_quantization); + quantize_nvfp4(OP, input_t.data(), output_t, scales_t, cols, rows, scales_stride_t, *amax, + use_fast_math, use_2d_quantization); } void compare_nvfp4_tensors(const std::string& name, @@ -479,48 +490,7 @@ void dump_nvfp4_tensor_data(const std::string& prefix, } } -void print_detailed_tensor_comparison(const std::string& name, - const fp4e2m1 *test_data, const fp4e2m1 *ref_data, - const int rows, const int cols) { - printf("\n=== DETAILED COMPARISON for %s (%d×%d = %d elements) ===\n", - name.c_str(), rows, cols, rows * cols); - - const int total_elements = rows * cols; - const int check_count = 128; - - printf("--- FIRST %d ELEMENTS ---\n", check_count); - printf("Index | Test_Value | Ref_Value | Match\n"); - printf("------|---------------|---------------|-------\n"); - for (int i = 0; i < std::min(check_count, total_elements); ++i) { - double2 test_pair = cvt_fp4x2_to_double2(*reinterpret_cast(&test_data[i/2])); - double2 ref_pair = cvt_fp4x2_to_double2(*reinterpret_cast(&ref_data[i/2])); - - double t = (i % 2 == 0) ? test_pair.x : test_pair.y; - double r = (i % 2 == 0) ? ref_pair.x : ref_pair.y; - bool match = (fabs(t - r) < 1e-6); - - printf("%5d | %13.6f | %13.6f | %s\n", i, t, r, match ? "✓" : "✗"); - } - - if (total_elements > 2 * check_count) { - printf("\n--- LAST %d ELEMENTS ---\n", check_count); - printf("Index | Test_Value | Ref_Value | Match\n"); - printf("------|---------------|---------------|-------\n"); - for (int i = total_elements - check_count; i < total_elements; ++i) { - double2 test_pair = cvt_fp4x2_to_double2(*reinterpret_cast(&test_data[i/2])); - double2 ref_pair = cvt_fp4x2_to_double2(*reinterpret_cast(&ref_data[i/2])); - - double t = (i % 2 == 0) ? test_pair.x : test_pair.y; - double r = (i % 2 == 0) ? ref_pair.x : ref_pair.y; - bool match = (fabs(t - r) < 1e-6); - - printf("%5d | %13.6f | %13.6f | %s\n", i, t, r, match ? 
"✓" : "✗"); - } - } - printf("==================================\n"); -} - -void compareResults_nvfp4(const Tensor &test, +void compareResults_nvfp4(Tensor &test, const void *ref, const void *ref_t, const int rows, const int cols, double atol = 1e-5, double rtol = 1e-8, bool if_on_gpus = true, bool dump_data = false, bool compare_columnwise = true) { @@ -529,10 +499,6 @@ void compareResults_nvfp4(const Tensor &test, const fp4e2m1 *test_data = test.rowwise_cpu_dptr(); const fp4e2m1 *ref_data = reinterpret_cast(ref); - // Print detailed element-by-element comparison - // print_detailed_tensor_comparison("output", test_data, ref_data, rows, cols); - // print_detailed_tensor_comparison("output_t", test_data_t, ref_data_t, cols, rows); - // Optionally dump tensor data to files for detailed analysis if (dump_data) { dump_nvfp4_tensor_data("output", test_data, ref_data, rows, cols); @@ -549,9 +515,10 @@ void compareResults_nvfp4(const Tensor &test, } } -void compare_rowwise_amax(const Tensor &output, const std::vector &ref_amax) { - const std::vector test_amax_data = output.tensor_amax_values(); - ASSERT_EQ(test_amax_data.size(), ref_amax.size()); +void compare_rowwise_amax(Tensor &output, const std::vector &ref_amax) { + ASSERT_EQ(output.rowwise_amax_size(), ref_amax.size()); + const auto *amax_ptr = output.cpu_rowwise_amax_ptr(); + const std::vector test_amax_data(amax_ptr, amax_ptr + ref_amax.size()); for (size_t row = 0; row < ref_amax.size(); ++row) { ASSERT_EQ(test_amax_data[row], ref_amax[row]) << "Row-scaled amax mismatch at row " << row; @@ -568,6 +535,9 @@ void performTest(float (*OP)(const float), DType itype = TypeInfo::dtype; DType otype = DType::kFloat4E2M1; + const bool rowwise = true; + const bool columnwise = !row_scaled_nvfp4; + const size_t rows = first_dimension(shape); const size_t cols = last_dimension(shape); @@ -589,7 +559,7 @@ void performTest(float (*OP)(const float), const size_t scales_stride_t = blocks_X_t; Tensor input("input", shape, itype); - Tensor output("output", shape, otype, true, !row_scaled_nvfp4, NVTE_NVFP4_1D_SCALING); + Tensor output("output", shape, otype, rowwise, columnwise, NVTE_NVFP4_1D_SCALING); std::unique_ptr ref_output = std::make_unique(rows * (cols / 2)); std::unique_ptr ref_output_t = std::make_unique(cols * (rows / 2)); @@ -598,58 +568,65 @@ void performTest(float (*OP)(const float), fillCase(&input, InputsFillCase::uniform); - // Golden value of amax chosen to make the 2nd-stage scaling mantissa zero and avoid rounding issues - const float amax = 448.0f * 6.0f * 8.0f; - std::vector ref_rowwise_amax; - bool use_2d_quantization = false; + // Compute 2nd stage NVFP4 scaling factor + std::vector ref_amax; if (row_scaled_nvfp4) { - output.set_tensor_amax_shape({rows}); - output.set_row_scaled_nvfp4(true); - compute_ref(OP, - input.rowwise_cpu_dptr(), - ref_output.get(), - ref_output_t.get(), - ref_scales.get(), - ref_scales_t.get(), - 0.0f, - rows, - cols, - scales_stride, - scales_stride_t, - use_fast_math, - use_2d_quantization, - &ref_rowwise_amax); + // Compute per-row amaxes + const auto *input_vals = input.rowwise_cpu_dptr(); + for (size_t row = 0; row < rows; ++row){ + float row_amax = 0.0f; + for (size_t col = 0; col < cols; ++col) { + row_amax = fmaxf(row_amax, fabsf(static_cast(input_vals[row * cols + col]))); + } + ref_amax.push_back(row_amax); + } + + // Update tensor + // Note: No need to update amax like standard NVFP4, amaxes + // are computed during quantization. 
+ output.set_row_scaled_nvfp4(row_scaled_nvfp4); } else { - // Set 2nd stage NVFP4 scaling factor - output.set_tensor_amax(amax); - output.set_tensor_amax_columnwise(amax); - compute_ref(OP, - input.rowwise_cpu_dptr(), - ref_output.get(), - ref_output_t.get(), - ref_scales.get(), - ref_scales_t.get(), - amax, - rows, - cols, - scales_stride, - scales_stride_t, - use_fast_math, - use_2d_quantization); + // Golden value of amax chosen to make the 2nd-stage scaling mantissa zero and avoid rounding issues + ref_amax.assign(1, 448.0f * 6.0f * 8.0f); + + // Update tensor + if (rowwise) { + std::copy(ref_amax.begin(), ref_amax.end(), output.cpu_rowwise_amax_ptr()); + } + if (columnwise) { + std::copy(ref_amax.begin(), ref_amax.end(), output.cpu_columnwise_amax_ptr()); + } + output.from_cpu(); } + // Reference implementation + bool use_2d_quantization = false; + compute_ref(OP, + input.rowwise_cpu_dptr(), + ref_output.get(), + ref_output_t.get(), + ref_scales.get(), + ref_scales_t.get(), + ref_amax.data(), + rows, + cols, + scales_stride, + scales_stride_t, + use_fast_math, + use_2d_quantization, + row_scaled_nvfp4); + // Initialize stochastic rounding Tensor rng_state("rng_state", std::vector{2}, DType::kInt64); rng_state.rowwise_cpu_dptr()[0] = 123; // rng_seed rng_state.rowwise_cpu_dptr()[1] = 321; // rng_sequence rng_state.from_cpu(); + // Quantization options QuantizationConfigWrapper quant_config; quant_config.set_use_fast_math(use_fast_math); quant_config.set_stochastic_rounding(false); quant_config.set_rng_state(rng_state.data()); - - // Set 2D quantization based on compile-time flag quant_config.set_nvfp4_2d_quantization(use_2d_quantization); // Call appropriate function based on operation type @@ -696,9 +673,7 @@ void performTest(float (*OP)(const float), scale_mismatches_num); } - if (row_scaled_nvfp4) { - compare_rowwise_amax(output, ref_rowwise_amax); - } + compare_rowwise_amax(output, ref_amax); } std::vector> tensor_dims = { diff --git a/tests/cpp/operator/test_cast_transpose.cu b/tests/cpp/operator/test_cast_transpose.cu index 44c78e4a09..9a5dc959da 100644 --- a/tests/cpp/operator/test_cast_transpose.cu +++ b/tests/cpp/operator/test_cast_transpose.cu @@ -55,13 +55,13 @@ void performTest(const size_t N, const size_t H) { fillUniform(&input); setRandomScale(&output); + const float ref_scale = isFp8Type(otype) ? output.scale() : 1.0f; nvte_quantize(input.data(), output.data(), 0); float ref_amax; compute_ref(input.rowwise_cpu_dptr(), ref_output_c.get(), - ref_output_t.get(), N, H, &ref_amax, - output.scale()); + ref_output_t.get(), N, H, &ref_amax, ref_scale); cudaDeviceSynchronize(); auto err = cudaGetLastError(); @@ -69,7 +69,7 @@ void performTest(const size_t N, const size_t H) { if (isFp8Type(otype)) { auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); compareResults("amax", output.amax(), ref_amax, atol_amax, rtol_amax); - float ref_scale_inv = 1.f / output.scale(); + float ref_scale_inv = 1.f / ref_scale; compareResults("scale_inv", output.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax); } auto [atol, rtol] = getTolerances(otype); diff --git a/tests/cpp/operator/test_cast_transpose_dbias.cu b/tests/cpp/operator/test_cast_transpose_dbias.cu index 5b06b28327..f9303d34f5 100644 --- a/tests/cpp/operator/test_cast_transpose_dbias.cu +++ b/tests/cpp/operator/test_cast_transpose_dbias.cu @@ -73,6 +73,7 @@ void performTest(const size_t N, const size_t H) { fillUniform(&input); setRandomScale(&output); + const float ref_scale = isFp8Type(otype) ? 
output.scale() : 1.0f; std::unique_ptr ref_output_c = std::make_unique(N*H); std::unique_ptr ref_output_t = std::make_unique(N*H); @@ -80,7 +81,7 @@ void performTest(const size_t N, const size_t H) { CType ref_amax; compute_ref_cast_transpose_dbias(input.rowwise_cpu_dptr(), - output.scale(), + ref_scale, ref_output_c.get(), ref_output_t.get(), &ref_amax, @@ -111,7 +112,7 @@ void performTest(const size_t N, const size_t H) { if (isFp8Type(otype)) { auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); compareResults("amax", output.amax(), ref_amax, atol_amax, rtol_amax); - float ref_scale_inv = 1.f / output.scale(); + float ref_scale_inv = 1.f / ref_scale; compareResults("scale_inv", output.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax); } auto [atol, rtol] = getTolerances(otype); diff --git a/tests/cpp/operator/test_cast_transpose_dbias_dgelu.cu b/tests/cpp/operator/test_cast_transpose_dbias_dgelu.cu index 9a4a2fa080..31eafff80f 100644 --- a/tests/cpp/operator/test_cast_transpose_dbias_dgelu.cu +++ b/tests/cpp/operator/test_cast_transpose_dbias_dgelu.cu @@ -86,6 +86,7 @@ void performTest(const size_t N, const size_t H) { fillUniform(&input); fillUniform(&gelu_input); setRandomScale(&output); + const float ref_scale = isFp8Type(otype) ? output.scale() : 1.0f; std::unique_ptr ref_output_c = std::make_unique(N*H); std::unique_ptr ref_output_t = std::make_unique(N*H); @@ -94,7 +95,7 @@ void performTest(const size_t N, const size_t H) { CType ref_amax; compute_ref_cast_transpose_dbias_dgelu(input.rowwise_cpu_dptr(), gelu_input.rowwise_cpu_dptr(), - output.scale(), + ref_scale, ref_output_c.get(), ref_output_t.get(), &ref_amax, @@ -127,7 +128,7 @@ void performTest(const size_t N, const size_t H) { if (isFp8Type(otype)) { auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); compareResults("amax", output.amax(), ref_amax, atol_amax, rtol_amax); - float ref_scale_inv = 1.f / output.scale(); + float ref_scale_inv = 1.f / ref_scale; compareResults("scale_inv", output.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax); } diff --git a/tests/cpp/operator/test_cast_transpose_dgeglu.cu b/tests/cpp/operator/test_cast_transpose_dgeglu.cu index a87c0c5a42..15ecd3ab66 100644 --- a/tests/cpp/operator/test_cast_transpose_dgeglu.cu +++ b/tests/cpp/operator/test_cast_transpose_dgeglu.cu @@ -81,6 +81,7 @@ void performTest(const size_t N, const size_t H) { fillUniform(&grad); fillUniform(&input); setRandomScale(&output); + const float ref_scale = isFp8Type(otype) ? 
output.scale() : 1.0f; std::unique_ptr ref_output_c = std::make_unique(N * H * 2); std::unique_ptr ref_output_t = std::make_unique(N * H * 2); @@ -89,7 +90,7 @@ void performTest(const size_t N, const size_t H) { CType ref_amax; compute_ref_cast_transpose_dgated_gelu(grad.rowwise_cpu_dptr(), input.rowwise_cpu_dptr(), - output.scale(), ref_output_c.get(), ref_output_t.get(), + ref_scale, ref_output_c.get(), ref_output_t.get(), &ref_amax, N, H); cudaDeviceSynchronize(); @@ -99,7 +100,7 @@ void performTest(const size_t N, const size_t H) { if (isFp8Type(otype)) { auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); compareResults("amax", output.amax(), ref_amax, atol_amax, rtol_amax); - float ref_scale_inv = 1.f / output.scale(); + float ref_scale_inv = 1.f / ref_scale; compareResults("scale_inv", output.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax); } diff --git a/tests/cpp/operator/test_dequantize_nvfp4.cu b/tests/cpp/operator/test_dequantize_nvfp4.cu index ec405b1d90..eb9e8bce23 100644 --- a/tests/cpp/operator/test_dequantize_nvfp4.cu +++ b/tests/cpp/operator/test_dequantize_nvfp4.cu @@ -76,7 +76,7 @@ void compute_ref_dequantize_nvfp4(const uint8_t *packed_data, } template -float compute_amax(const test::Tensor &t, size_t rows, size_t cols) { +float compute_amax(test::Tensor &t, size_t rows, size_t cols) { t.to_cpu(); const auto *data = t.rowwise_cpu_dptr(); float amax = 0.0f; @@ -94,50 +94,63 @@ void performTest_dequantize_nvfp4(const size_t rows, const size_t cols, using namespace test; DType otype = TypeInfo::dtype; + // Tensors Tensor input("input", std::vector{rows, cols}, otype); - fillCase(&input, InputsFillCase::uniform); - Tensor quantized("quantized", std::vector{rows, cols}, DType::kFloat4E2M1, true, false, NVTE_NVFP4_1D_SCALING); + Tensor output("output", std::vector{rows, cols}, otype, true, false); + + // Fill input with random data + fillCase(&input, InputsFillCase::uniform); + + // Configure quantized tensor amax + size_t amax_size = 1; if (row_scaled_nvfp4) { - quantized.set_tensor_amax_shape({rows}); - quantized.set_row_scaled_nvfp4(true); + quantized.set_row_scaled_nvfp4(true); + amax_size = rows; } else if (rows > 0 && cols > 0) { - quantized.set_tensor_amax(compute_amax(input, rows, cols)); + quantized.set_amax(compute_amax(input, rows, cols)); } else { - quantized.set_tensor_amax(0.0f); + quantized.set_amax(0.0f); } + // Quantize if (rows > 0 && cols > 0) { nvte_quantize(input.data(), quantized.data(), 0); cudaDeviceSynchronize(); + auto err = cudaGetLastError(); + ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err); } - Tensor output("output", std::vector{rows, cols}, otype, true, false); + // Dequantize nvte_dequantize(quantized.data(), output.data(), 0); cudaDeviceSynchronize(); - auto err = cudaGetLastError(); ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err); - if (rows > 0 && cols > 0) { - quantized.to_cpu(); - const uint8_t *fp4_data = - reinterpret_cast(quantized.rowwise_cpu_dptr()); - const fp8e4m3 *scales = quantized.rowwise_cpu_scale_inv_ptr(); - const std::vector amax_val = quantized.tensor_amax_values(); - const NVTEShape scale_shape = quantized.rowwise_scale_inv_shape(); - const size_t scale_stride = scale_shape.data[scale_shape.ndim - 1]; - - std::unique_ptr ref_output = - std::make_unique(rows * cols); - compute_ref_dequantize_nvfp4( - fp4_data, scales, amax_val, ref_output.get(), - rows, cols, scale_stride); - - auto [atol, rtol] = getTolerances(otype); - compareResults("output_nvfp4", output, ref_output.get(), true, atol, 
rtol); + // Nothing to be done if tensor is empty + if (rows == 0 && cols == 0) { + return; } + + // Dequantize reference implementation + quantized.to_cpu(); + const uint8_t *fp4_data = + reinterpret_cast(quantized.rowwise_cpu_dptr()); + const fp8e4m3 *scales = quantized.rowwise_cpu_scale_inv_ptr(); + const auto *amax = quantized.cpu_rowwise_amax_ptr(); + const std::vector amax_vals(amax, amax + amax_size); + const NVTEShape scale_shape = quantized.rowwise_scale_inv_shape(); + const size_t scale_stride = scale_shape.data[scale_shape.ndim - 1]; + std::unique_ptr ref_output = + std::make_unique(rows * cols); + compute_ref_dequantize_nvfp4( + fp4_data, scales, amax_vals, ref_output.get(), + rows, cols, scale_stride); + + // Compare results from TE and reference impls + auto [atol, rtol] = getTolerances(otype); + compareResults("output_nvfp4", output, ref_output.get(), true, atol, rtol); } // Dequantize NVFP4 with GEMM-swizzled scales and compare against compact path. @@ -153,12 +166,11 @@ void performTest_dequantize_nvfp4_swizzled(const size_t rows, const size_t cols, Tensor quantized_compact("quantized_compact", std::vector{rows, cols}, DType::kFloat4E2M1, true, false, NVTE_NVFP4_1D_SCALING); if (row_scaled_nvfp4) { - quantized_compact.set_tensor_amax_shape({rows}); quantized_compact.set_row_scaled_nvfp4(true); } else if (rows > 0 && cols > 0) { - quantized_compact.set_tensor_amax(compute_amax(input, rows, cols)); + quantized_compact.set_amax(compute_amax(input, rows, cols)); } else { - quantized_compact.set_tensor_amax(0.0f); + quantized_compact.set_amax(0.0f); } if (rows > 0 && cols > 0) { @@ -175,10 +187,9 @@ void performTest_dequantize_nvfp4_swizzled(const size_t rows, const size_t cols, Tensor quantized_swizzled("quantized_swizzled", std::vector{rows, cols}, DType::kFloat4E2M1, true, false, NVTE_NVFP4_1D_SCALING); if (row_scaled_nvfp4) { - quantized_swizzled.set_tensor_amax_shape({rows}); quantized_swizzled.set_row_scaled_nvfp4(true); } else { - quantized_swizzled.set_tensor_amax(0.0f); + quantized_swizzled.set_amax(0.0f); } quantized_swizzled.set_with_gemm_swizzled_scales(true); @@ -186,9 +197,12 @@ void performTest_dequantize_nvfp4_swizzled(const size_t rows, const size_t cols, // since from_cpu() uploads all CPU buffers (including zero-init data). quantized_compact.to_cpu(); if (row_scaled_nvfp4) { - quantized_swizzled.copy_tensor_amax_from(quantized_compact); + const auto *src = quantized_compact.cpu_rowwise_amax_ptr(); + auto *dst = quantized_swizzled.cpu_rowwise_amax_ptr(); + std::copy(src, src + rows, dst); + quantized_swizzled.from_cpu(); } else { - quantized_swizzled.set_tensor_amax(quantized_compact.amax()); + quantized_swizzled.set_amax(quantized_compact.amax()); } // Copy FP4 data after from_cpu() to avoid being overwritten diff --git a/tests/cpp/operator/test_multi_cast_transpose.cu b/tests/cpp/operator/test_multi_cast_transpose.cu index 2bb35c4b89..0271c9dc6b 100644 --- a/tests/cpp/operator/test_multi_cast_transpose.cu +++ b/tests/cpp/operator/test_multi_cast_transpose.cu @@ -97,7 +97,7 @@ void performTest() { std::copy(input.rowwise_cpu_dptr(), input.rowwise_cpu_dptr() + height * width, ref_input_list.back().begin()); - ref_scale_list[tensor_id] = output.scale(); + ref_scale_list[tensor_id] = isFp8Type(otype) ? 
output.scale() : 1.0f; ref_height_list[tensor_id] = height; ref_width_list[tensor_id] = width; } @@ -138,7 +138,7 @@ void performTest() { atol_amax, rtol_amax); compareResults("scale_inv", output_list[tensor_id].rowwise_scale_inv(), - 1.f / output_list[tensor_id].scale(), + 1.f / ref_scale_list[tensor_id], atol_amax, rtol_amax); } auto [atol, rtol] = getTolerances(otype); diff --git a/tests/cpp/operator/test_normalization.cu b/tests/cpp/operator/test_normalization.cu index f737005e26..ea6692dba4 100644 --- a/tests/cpp/operator/test_normalization.cu +++ b/tests/cpp/operator/test_normalization.cu @@ -208,7 +208,7 @@ void performTest(const size_t N, const size_t H, const bool zero_centered_gamma, auto [atol_amax, rtol_amax] = getTolerances(DType::kFloat32); if (isFp8Type(otype)) { compareResults("amax", z.amax(), ref_amax, atol_amax, rtol_amax); - float ref_scale_inv = 1.f / z.scale(); + float ref_scale_inv = 1.f / ref_scale; compareResults("scale_inv", z.rowwise_scale_inv(), ref_scale_inv, atol_amax, rtol_amax); } diff --git a/tests/cpp/operator/test_qdq.cu b/tests/cpp/operator/test_qdq.cu index 4e364fffa4..034280aa9a 100644 --- a/tests/cpp/operator/test_qdq.cu +++ b/tests/cpp/operator/test_qdq.cu @@ -65,12 +65,13 @@ void performTestQ(const size_t N) { fillUniform(&input); setRandomScale(&output); + const float ref_scale = output.scale(); nvte_quantize(input.data(), output.data(), 0); float ref_amax; compute_ref_q(input.rowwise_cpu_dptr(), ref_output.get(), - N, &ref_amax, output.scale()); + N, &ref_amax, ref_scale); cudaDeviceSynchronize(); auto err = cudaGetLastError(); diff --git a/tests/cpp/test_common.cu b/tests/cpp/test_common.cu index 96e71f9513..4fd75bb927 100644 --- a/tests/cpp/test_common.cu +++ b/tests/cpp/test_common.cu @@ -8,12 +8,14 @@ #include "test_common.h" #include +#include +#include +#include +#include #include #include +#include #include -#include -#include -#include #include #include @@ -193,33 +195,6 @@ std::pair get_scales(const NVTEShape& shape, return {ret_rowwise, ret_colwise}; } - if (scaling_mode == NVTE_MXFP8_1D_SCALING) { - std::vector shape_vec; - for (size_t i = 0; i < shape.ndim; ++i) { - shape_vec.push_back(shape.data[i]); - } - size_t first_dim = first_dimension(shape_vec); - size_t last_dim = last_dimension(shape_vec); - - scale_inv_meta ret_rowwise, ret_colwise; - - const size_t block_size_X_rowwise = 32; - size_t scale_dim_Y_rowwise = DIVUP_TO_MULTIPLE(first_dim, scale_tensor_alignment_Y_rowwise); - size_t scale_dim_X_rowwise = DIVUP_TO_MULTIPLE(DIVUP(last_dim, block_size_X_rowwise), scale_tensor_alignment_X_rowwise); - ret_rowwise.shape = {scale_dim_Y_rowwise, scale_dim_X_rowwise}; - - const size_t block_size_Y_colwise = 32; - size_t scale_dim_Y_colwise = DIVUP_TO_MULTIPLE(DIVUP(first_dim, block_size_Y_colwise), scale_tensor_alignment_Y_colwise); - size_t scale_dim_X_colwise = DIVUP_TO_MULTIPLE(last_dim, scale_tensor_alignment_X_colwise); - ret_colwise.shape = {scale_dim_Y_colwise, scale_dim_X_colwise}; - - ret_rowwise.type = DType::kFloat8E8M0; - ret_colwise.type = DType::kFloat8E8M0; - ret_rowwise.type_size_bits = typeToNumBits(DType::kFloat8E8M0); - ret_colwise.type_size_bits = typeToNumBits(DType::kFloat8E8M0); - - return {ret_rowwise, ret_colwise}; - } if (scaling_mode == NVTE_BLOCK_SCALING_2D) { std::vector shape_vec; for (size_t i = 0; i < shape.ndim; ++i) { @@ -276,6 +251,30 @@ std::pair get_scales(const NVTEShape& shape, NVTE_ERROR("Invalid scaling mode!"); } +Tensor::Buffer::Buffer(size_t size, DType dtype) + : size_{size}, dtype_{dtype}, 
bytes_{size * typeToNumBits(dtype) / 8} { + if (bytes_ > 0) { + cpu_buffer_.reset(new unsigned char[bytes_]); + std::memset(cpu_buffer_.get(), 0, bytes_); + unsigned char *gpu_buffer = nullptr; + NVTE_CHECK_CUDA(cudaMalloc(&gpu_buffer, bytes_)); + gpu_buffer_.reset(gpu_buffer); + NVTE_CHECK_CUDA(cudaMemset(gpu_buffer_.get(), 0, bytes_)); + } +} + +void Tensor::Buffer::to_cpu() { + if (bytes_ > 0) { + NVTE_CHECK_CUDA(cudaMemcpy(cpu_buffer_.get(), gpu_buffer_.get(), bytes_, cudaMemcpyDeviceToHost)); + } +} + +void Tensor::Buffer::from_cpu() { + if (bytes_ > 0) { + NVTE_CHECK_CUDA(cudaMemcpy(gpu_buffer_.get(), cpu_buffer_.get(), bytes_, cudaMemcpyHostToDevice)); + } +} + Tensor::Tensor(const std::string& name, const NVTEShape &shape, const DType type, const bool rowwise, const bool columnwise, @@ -303,31 +302,13 @@ Tensor::Tensor(const std::string& name, flattened_shape = convertShape(flattened_shape_vec); } - // Allocate and initialize data - void *dptr_rowwise = nullptr, *dptr_columnwise = nullptr; - const size_t total_size = bytes(shape, type); - if (total_size != 0) { - if (rowwise) { - cudaMalloc((void**)&dptr_rowwise, total_size); // NOLINT(*) - cudaMemset(dptr_rowwise, 0, total_size); - cpu_data_rowwise_ = std::make_unique(total_size); - std::fill_n(cpu_data_rowwise_.get(), total_size, 0); - } - if (columnwise) { - cudaMalloc((void**)&dptr_columnwise, total_size); // NOLINT(*) - cudaMemset(dptr_columnwise, 0, total_size); - cpu_data_columnwise_ = std::make_unique(total_size); - std::fill_n(cpu_data_columnwise_.get(), total_size, 0); - } - } - - // Set tensor row-wise data + // Allocate row-wise data if (rowwise) { - const DType rowwise_type = (scaling_mode == NVTE_NVFP4_1D_SCALING) ? DType::kFloat4E2M1 : type; - tensor_.set_rowwise_data(dptr_rowwise, rowwise_type, shape); + data_rowwise_.emplace(product(shape), type); + tensor_.set_rowwise_data(data_rowwise_->gpu_buffer(), type, shape); } - // Set tensor column-wise data + // Allocate column-wise data if (columnwise) { // Determine shape of column-wise data std::vector columnwise_shape_vec; @@ -358,310 +339,224 @@ Tensor::Tensor(const std::string& name, const auto columnwise_shape = nvte_make_shape(columnwise_shape_vec.data(), columnwise_shape_vec.size()); - // Set column-wise data buffer - const DType colwise_type = (scaling_mode == NVTE_NVFP4_1D_SCALING) ? 
DType::kFloat4E2M1 : type; - tensor_.set_columnwise_data(dptr_columnwise, colwise_type, columnwise_shape); + // Allocate buffer + data_columnwise_.emplace(product(columnwise_shape), type); + + // Configure TE tensor + tensor_.set_columnwise_data(data_columnwise_->gpu_buffer(), type, columnwise_shape); } - // Configure scales, amaxes, and other tensor buffers - float *amax = nullptr; - float *amax_columnwise = nullptr; - float *scale = nullptr; - float *rowwise_scale_inv = nullptr; - float *columnwise_scale_inv = nullptr; - if (isFp8Type(type) || isFp4Type(type)) { - if (scaling_mode == NVTE_DELAYED_TENSOR_SCALING) { - cudaMalloc((void**)&amax, sizeof(float)); // NOLINT(*) - cudaMemset(amax, 0, sizeof(float)); - cudaMalloc((void**)&scale, sizeof(float)); // NOLINT(*) - cudaMemset(scale, 0, sizeof(float)); - amax_cpu_data_ = std::make_shared(0); - scale_cpu_data_ = std::make_shared(0); - tensor_.set_amax(amax, DType::kFloat32, std::vector{1}); - tensor_.set_scale(scale, DType::kFloat32, std::vector{1}); - cudaMalloc((void**)&rowwise_scale_inv, sizeof(float)); // NOLINT(*) + // Allocate recipe-specific buffers + switch (scaling_mode) { + case NVTE_DELAYED_TENSOR_SCALING: + if (isFp8Type(type)) { + amax_rowwise_.emplace(1, DType::kFloat32); + scale_.emplace(1, DType::kFloat32); + tensor_.set_amax(amax_rowwise_->gpu_buffer(), DType::kFloat32, std::vector{1}); + tensor_.set_scale(scale_->gpu_buffer(), DType::kFloat32, std::vector{1}); + + // Use same buffer for row-wise and column-wise scale-inverse + auto scale_inv = std::make_shared(1, DType::kFloat32); if (rowwise) { - tensor_.set_rowwise_scale_inv(rowwise_scale_inv, DType::kFloat32, - std::vector{1}); - rowwise_scale_inv_cpu_data_ = std::make_unique(sizeof(float)); - std::fill_n(rowwise_scale_inv_cpu_data_.get(), sizeof(float), 0); + scale_inv_rowwise_ = scale_inv; + tensor_.set_rowwise_scale_inv(scale_inv_rowwise_->gpu_buffer(), DType::kFloat32, std::vector{1}); } if (columnwise) { - tensor_.set_columnwise_scale_inv(rowwise_scale_inv, DType::kFloat32, - std::vector{1}); - columnwise_scale_inv_cpu_data_ = std::make_unique(sizeof(float)); - std::fill_n(columnwise_scale_inv_cpu_data_.get(), sizeof(float), 0); - } - } else { - if (scaling_mode == NVTE_NVFP4_1D_SCALING) { - // Used for NVFP4 second stage scaling - amax_cpu_data_ = std::make_shared(0); - amax_cpu_data_columnwise_ = std::make_shared(0); - cudaMalloc((void**)&amax, sizeof(float)); // NOLINT(*) - cudaMalloc((void**)&amax_columnwise, sizeof(float)); // NOLINT(*) - cudaMemset(amax, 0, sizeof(float)); - cudaMemset(amax_columnwise, 0, sizeof(float)); - tensor_.set_amax(amax, DType::kFloat32, std::vector{1}); - tensor_.set_columnwise_amax(amax_columnwise, DType::kFloat32, std::vector{1}); + scale_inv_columnwise_ = scale_inv; + tensor_.set_columnwise_scale_inv(scale_inv_columnwise_->gpu_buffer(), DType::kFloat32, std::vector{1}); } + } + break; + case NVTE_MXFP8_1D_SCALING: + case NVTE_BLOCK_SCALING_1D: + case NVTE_BLOCK_SCALING_2D: + case NVTE_NVFP4_1D_SCALING: + { + // Block scaling factors auto [rowwise_scale_meta, colwise_scale_meta] = get_scales(flattened_shape, tensor_.scaling_mode()); - auto rowwise_scale_size = rowwise_scale_meta.bytes(); - auto columnwise_scale_size = colwise_scale_meta.bytes(); - auto scale_shape = rowwise_scale_meta.shape; - auto columnwise_scale_shape = colwise_scale_meta.shape; if (rowwise) { - cudaMalloc((void **)&rowwise_scale_inv, rowwise_scale_size); // NOLINT(*) - cudaMemset(rowwise_scale_inv, 0, rowwise_scale_size); - rowwise_scale_inv_cpu_data_ = 
std::make_unique(rowwise_scale_size); - std::fill_n(rowwise_scale_inv_cpu_data_.get(), rowwise_scale_size, 0); - auto scale_dtype = rowwise_scale_meta.type; - tensor_.set_rowwise_scale_inv(rowwise_scale_inv, scale_dtype, scale_shape); + const auto scale_shape = rowwise_scale_meta.shape; + const auto scale_dtype = rowwise_scale_meta.type; + scale_inv_rowwise_ = std::make_shared(product(scale_shape), scale_dtype); + tensor_.set_rowwise_scale_inv(scale_inv_rowwise_->gpu_buffer(), scale_dtype, scale_shape); } if (columnwise) { - cudaMalloc((void**)&columnwise_scale_inv, columnwise_scale_size); // NOLINT(*) - cudaMemset(columnwise_scale_inv, 0, columnwise_scale_size); - columnwise_scale_inv_cpu_data_ = std::make_unique(columnwise_scale_size); - std::fill_n(columnwise_scale_inv_cpu_data_.get(), columnwise_scale_size, 0); - auto scale_dtype = colwise_scale_meta.type; - tensor_.set_columnwise_scale_inv(columnwise_scale_inv, scale_dtype, columnwise_scale_shape); + const auto scale_shape = colwise_scale_meta.shape; + const auto scale_dtype = colwise_scale_meta.type; + scale_inv_columnwise_ = std::make_shared(product(scale_shape), scale_dtype); + tensor_.set_columnwise_scale_inv(scale_inv_columnwise_->gpu_buffer(), scale_dtype, scale_shape); } - } - } -} -void Tensor::to_cpu() const { - const NVTEShape s = tensor_.shape(); - const size_t size = bytes(s, tensor_.dtype()); - if (rowwise_) { - cudaMemcpy(cpu_data_rowwise_.get(), - tensor_.get_rowwise_data().data_ptr, - size, - cudaMemcpyDeviceToHost); - } - if (columnwise_) { - const DType colwise_type = tensor_.dtype(); - - const size_t colwise_size = bytes(s, colwise_type); - cudaMemcpy(cpu_data_columnwise_.get(), - tensor_.get_columnwise_data().data_ptr, - colwise_size, - cudaMemcpyDeviceToHost); - } - if (isFp8Type(dtype()) || isFp4Type(dtype())) { - if (tensor_.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING) { - if (tensor_.amax() != nullptr){ - cudaMemcpy(amax_cpu_data_.get(), - tensor_.amax(), - sizeof(float), - cudaMemcpyDeviceToHost); - } - cudaMemcpy(scale_cpu_data_.get(), - tensor_.scale(), - sizeof(float), - cudaMemcpyDeviceToHost); - } else if (tensor_.scaling_mode() == NVTE_NVFP4_1D_SCALING) { - if (rowwise_ && (tensor_.amax() != nullptr)){ - cudaMemcpy(amax_cpu_data_.get(), - tensor_.amax(), - sizeof(float), - cudaMemcpyDeviceToHost); - } - if (columnwise_ && (tensor_.get_columnwise_amax().data_ptr != nullptr)){ - cudaMemcpy(amax_cpu_data_columnwise_.get(), - tensor_.get_columnwise_amax().data_ptr, - sizeof(float), - cudaMemcpyDeviceToHost); + // NVFP4 uses amax for tensor scaling + if (scaling_mode == NVTE_NVFP4_1D_SCALING) { + if (rowwise) { + amax_rowwise_.emplace(1, DType::kFloat32); + tensor_.set_amax(amax_rowwise_->gpu_buffer(), DType::kFloat32, std::vector{1}); + } + if (columnwise) { + amax_columnwise_.emplace(1, DType::kFloat32); + tensor_.set_columnwise_amax(amax_columnwise_->gpu_buffer(), DType::kFloat32, std::vector{1}); + } } } - auto [rowwise_scale_meta, colwise_scale_meta] = get_scales(s, tensor_.scaling_mode()); - if (rowwise_) { - auto scale_size = rowwise_scale_meta.bytes(); - cudaMemcpy(rowwise_scale_inv_cpu_data_.get(), - tensor_.get_rowwise_scale_inv().data_ptr, - scale_size, - cudaMemcpyDeviceToHost); - } - if (columnwise_) { - auto scale_size = colwise_scale_meta.bytes(); - cudaMemcpy(columnwise_scale_inv_cpu_data_.get(), - tensor_.get_columnwise_scale_inv().data_ptr, - scale_size, - cudaMemcpyDeviceToHost); - } + break; + default: + NVTE_ERROR("Unsupported tensor format (", static_cast(scaling_mode), ")"); } } -void 
Tensor::from_cpu() const { - const NVTEShape s = tensor_.shape(); - const size_t size = bytes(s, tensor_.dtype()); - if (rowwise_) { - cudaMemcpy(tensor_.get_rowwise_data().data_ptr, cpu_data_rowwise_.get(), size, - cudaMemcpyHostToDevice); - } - if (columnwise_) { - cudaMemcpy(tensor_.get_columnwise_data().data_ptr, cpu_data_columnwise_.get(), size, - cudaMemcpyHostToDevice); - } - if (isFp8Type(dtype()) || isFp4Type(dtype())) { - if (tensor_.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING) { - if (tensor_.amax() != nullptr){ - cudaMemcpy(tensor_.amax(), amax_cpu_data_.get(), sizeof(float), cudaMemcpyHostToDevice); - } - cudaMemcpy(tensor_.scale(), scale_cpu_data_.get(), sizeof(float), cudaMemcpyHostToDevice); - } else if (tensor_.scaling_mode() == NVTE_NVFP4_1D_SCALING) { - if (rowwise_ && (tensor_.amax() != nullptr)) { - cudaMemcpy(tensor_.amax(), amax_cpu_data_.get(), sizeof(float), cudaMemcpyHostToDevice); - } - if (columnwise_ && (tensor_.get_columnwise_amax().data_ptr != nullptr)) { - cudaMemcpy(tensor_.get_columnwise_amax().data_ptr, amax_cpu_data_columnwise_.get(), - sizeof(float), cudaMemcpyHostToDevice); - } - } - auto [rowwise_scale_meta, colwise_scale_meta] = get_scales(s, tensor_.scaling_mode()); +void Tensor::set_tensor_amax_nullptr() { + tensor_.set_amax(nullptr, DType::kFloat32, tensor_.defaultShape); +} + +void Tensor::set_with_gemm_swizzled_scales(bool with_gemm_swizzled_scales) { + tensor_.set_with_gemm_swizzled_scales(with_gemm_swizzled_scales); +} + +void Tensor::set_row_scaled_nvfp4(bool row_scaled_nvfp4) { + NVTE_CHECK(tensor_.scaling_mode() == NVTE_NVFP4_1D_SCALING, + "Row-scaled NVFP4 is only supported for NVFP4 tensors."); + tensor_.set_row_scaled_nvfp4(row_scaled_nvfp4); + + // Update amax tensor + if (row_scaled_nvfp4) { + // Row-scaled NVFP4 has amax matching number of rows + NVTE_CHECK(rowwise_, "Row-scaled NVFP4 requires row-wise data."); + NVTE_CHECK(!columnwise_, "Row-scaled NVFP4 does not support column-wise data."); + auto shape = tensor_.shape(); + const size_t rows = product(shape, 0, shape.ndim - 1); + amax_rowwise_.emplace(rows, DType::kFloat32); + tensor_.set_amax(amax_rowwise_->gpu_buffer(), DType::kFloat32, std::vector{rows}); + } else { + // Tensor-scaled NVFP4 has single amax if (rowwise_) { - auto scale_size = rowwise_scale_meta.bytes(); - cudaMemcpy(tensor_.get_rowwise_scale_inv().data_ptr, - rowwise_scale_inv_cpu_data_.get(), scale_size, - cudaMemcpyHostToDevice); + amax_rowwise_.emplace(1, DType::kFloat32); + tensor_.set_amax(amax_rowwise_->gpu_buffer(), DType::kFloat32, std::vector{1}); } if (columnwise_) { - auto scale_size = colwise_scale_meta.bytes(); - cudaMemcpy(tensor_.get_columnwise_scale_inv().data_ptr, - columnwise_scale_inv_cpu_data_.get(), scale_size, - cudaMemcpyHostToDevice); + amax_columnwise_.emplace(1, DType::kFloat32); + tensor_.set_columnwise_amax(amax_columnwise_->gpu_buffer(), DType::kFloat32, std::vector{1}); } } } -void Tensor::set_scale(float scale) { - if (isFp8Type(dtype()) || isFp4Type(dtype())) { - NVTE_CHECK(scale_cpu_data_); - if (tensor_.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING) { - *scale_cpu_data_ = scale; - from_cpu(); - } - } +void Tensor::to_cpu() { + if (data_rowwise_) { data_rowwise_->to_cpu(); } + if (data_columnwise_) { data_columnwise_->to_cpu(); } + if (scale_inv_rowwise_) { scale_inv_rowwise_->to_cpu(); } + if (scale_inv_columnwise_) { scale_inv_columnwise_->to_cpu(); } + if (amax_rowwise_) { amax_rowwise_->to_cpu(); } + if (amax_columnwise_) { amax_columnwise_->to_cpu(); } + if (scale_) { 
scale_->to_cpu(); } } -void Tensor::set_tensor_amax_shape(const std::vector &shape) { - const size_t numel = product(shape); - NVTE_CHECK(tensor_.scaling_mode() == NVTE_NVFP4_1D_SCALING, - "Amax shape override is only supported for NVFP4 test tensors."); - - auto old_amax = tensor_.get_amax(); - if (old_amax.data_ptr != nullptr) { - NVTE_CHECK_CUDA(cudaFree(old_amax.data_ptr)); - } - - float *amax = nullptr; - NVTE_CHECK_CUDA(cudaMalloc(&amax, numel * sizeof(float))); - NVTE_CHECK_CUDA(cudaMemset(amax, 0, numel * sizeof(float))); - tensor_.set_amax(amax, DType::kFloat32, shape); +void Tensor::from_cpu() { + if (data_rowwise_) { data_rowwise_->from_cpu(); } + if (data_columnwise_) { data_columnwise_->from_cpu(); } + if (scale_inv_rowwise_) { scale_inv_rowwise_->from_cpu(); } + if (scale_inv_columnwise_) { scale_inv_columnwise_->from_cpu(); } + if (amax_rowwise_) { amax_rowwise_->from_cpu(); } + if (amax_columnwise_) { amax_columnwise_->from_cpu(); } + if (scale_) { scale_->from_cpu(); } } -std::vector Tensor::tensor_amax_values() const { - const auto amax = tensor_.get_amax(); - NVTE_CHECK(static_cast(amax.dtype) == DType::kFloat32, "Tensor amax must be FP32."); - - const size_t numel = product(amax.shape); - if (numel == 0) { - return {}; - } - NVTE_CHECK(amax.data_ptr != nullptr, "Tensor amax is not allocated."); - - std::vector values(numel); - NVTE_CHECK_CUDA( - cudaMemcpy(values.data(), amax.data_ptr, numel * sizeof(float), cudaMemcpyDeviceToHost)); - return values; +void Tensor::set_amax(float amax) { + NVTE_CHECK(amax_rowwise_); + NVTE_CHECK(amax_rowwise_->size() == 1); + NVTE_CHECK(amax_rowwise_->dtype() == DType::kFloat32); + *amax_rowwise_->cpu_buffer() = amax; + amax_rowwise_->from_cpu(); } -void Tensor::copy_tensor_amax_from(const Tensor &other) { - const auto other_amax = other.tensor_.get_amax(); - NVTE_CHECK(static_cast(other_amax.dtype) == DType::kFloat32, - "Source tensor amax must be FP32."); - - auto my_amax = tensor_.get_amax(); - NVTE_CHECK(static_cast(my_amax.dtype) == DType::kFloat32, - "Destination tensor amax must be FP32."); - NVTE_CHECK(areShapesEqual(my_amax.shape, other_amax.shape), "Amax shape mismatch."); +void Tensor::set_scale(float scale) { + NVTE_CHECK(scale_); + NVTE_CHECK(scale_->size() == 1); + NVTE_CHECK(scale_->dtype() == DType::kFloat32); + *scale_->cpu_buffer() = scale; + scale_->from_cpu(); +} - const size_t numel = product(other_amax.shape); - if (numel == 0) { - return; - } +void Tensor::set_scale_inv(float scale_inv) { + NVTE_CHECK(scale_inv_rowwise_); + NVTE_CHECK(scale_inv_rowwise_->size() == 1); + NVTE_CHECK(scale_inv_rowwise_->dtype() == DType::kFloat32); + *scale_inv_rowwise_->cpu_buffer() = scale_inv; + scale_inv_rowwise_->from_cpu(); +} - NVTE_CHECK(other_amax.data_ptr != nullptr, "Source tensor amax is not allocated."); - NVTE_CHECK(my_amax.data_ptr != nullptr, "Destination tensor amax is not allocated."); - NVTE_CHECK_CUDA(cudaMemcpy(my_amax.data_ptr, other_amax.data_ptr, numel * sizeof(float), - cudaMemcpyDeviceToDevice)); +void Tensor::set_tensor_amax_columnwise(float amax) { + NVTE_CHECK(amax_columnwise_); + NVTE_CHECK(amax_columnwise_->size() == 1); + NVTE_CHECK(amax_columnwise_->dtype() == DType::kFloat32); + *amax_columnwise_->cpu_buffer() = amax; + amax_columnwise_->from_cpu(); } -void Tensor::set_scale_inv(float scale_inv) { - if (isFp8Type(dtype()) || isFp4Type(dtype())) { - if (rowwise_) { - NVTE_CHECK(rowwise_scale_inv_cpu_data_); - } - if (columnwise_) { - NVTE_CHECK(columnwise_scale_inv_cpu_data_); - } +namespace { - auto 
[rowwise_scale_meta, colwise_scale_meta] = get_scales(tensor_.shape(), tensor_.scaling_mode()); - if (rowwise_) { - auto num_scales = product(rowwise_scale_meta.shape); - if (num_scales == 1) { - rowwise_cpu_scale_inv_ptr()[0] = scale_inv; - } else { - std::uniform_int_distribution dis(0, 127); - auto *scale_inv_ptr = rowwise_cpu_scale_inv_ptr(); - for (size_t i = 0; i < num_scales; i++) { - scale_inv_ptr[i] = dis(gen_); - } +void fill_uniform_buffer(void *cpu_data, size_t numel, DType dtype, std::mt19937 &gen) { + switch (dtype) { + case DType::kFloat32: + { + auto *data = static_cast(cpu_data); + std::uniform_real_distribution dis(-2.0, 1.0); + for (size_t i = 0; i < numel; ++i) { + data[i] = dis(gen); } } - if (columnwise_) { - auto num_scales = product(colwise_scale_meta.shape); - if (num_scales == 1) { - columnwise_cpu_scale_inv_ptr()[0] = scale_inv; - } else { - std::uniform_int_distribution dis(0, 127); - auto *scale_inv_ptr = columnwise_cpu_scale_inv_ptr(); - for (size_t i = 0; i < num_scales; i++) { - scale_inv_ptr[i] = dis(gen_); - } + break; + case DType::kFloat8E4M3: + case DType::kFloat8E8M0: + case DType::kByte: + { + auto *data = static_cast(cpu_data); + std::uniform_int_distribution dis(0, 127); + for (size_t i = 0; i < numel; ++i) { + data[i] = dis(gen); } } - from_cpu(); + break; + default: + NVTE_ERROR("Unsupported dtype (", static_cast(dtype), ")."); + } +} + +} // namespace + +void Tensor::fill_uniform_rowwise_scale_inv() { + if (!scale_inv_rowwise_ || scale_inv_rowwise_->size() == 0) { + return; + } + fill_uniform_buffer(scale_inv_rowwise_->cpu_buffer(), scale_inv_rowwise_->size(), + scale_inv_rowwise_->dtype(), gen_); + scale_inv_rowwise_->from_cpu(); +} + +void Tensor::fill_uniform_columnwise_scale_inv() { + if (!scale_inv_columnwise_ || scale_inv_columnwise_->size() == 0) { + return; } + fill_uniform_buffer(scale_inv_columnwise_->cpu_buffer(), scale_inv_columnwise_->size(), + scale_inv_columnwise_->dtype(), gen_); + scale_inv_columnwise_->from_cpu(); } -void Tensor::shareFP8Meta(const Tensor &other) { - if ((isFp8Type(dtype()) && isFp8Type(other.dtype())) - || isFp4Type(dtype()) && isFp4Type(other.dtype())) { - auto new_tensor = TensorWrapper(other.tensor_.scaling_mode()); - auto my_rowwise_data = tensor_.get_rowwise_data(); - new_tensor.set_rowwise_data(my_rowwise_data.data_ptr, static_cast(my_rowwise_data.dtype), - my_rowwise_data.shape); - auto my_columnwise_data = tensor_.get_columnwise_data(); - new_tensor.set_columnwise_data(my_columnwise_data.data_ptr, - static_cast(my_columnwise_data.dtype), - my_columnwise_data.shape); - auto other_amax = other.tensor_.get_amax(); - new_tensor.set_amax(other_amax.data_ptr, static_cast(other_amax.dtype), - other_amax.shape); - auto other_scale = other.tensor_.get_scale(); - new_tensor.set_scale(other_scale.data_ptr, static_cast(other_scale.dtype), - other_scale.shape); - auto other_row_scale_inv = other.tensor_.get_rowwise_scale_inv(); - new_tensor.set_rowwise_scale_inv(other_row_scale_inv.data_ptr, - static_cast(other_row_scale_inv.dtype), - other_row_scale_inv.shape); - auto other_col_scale_inv = other.tensor_.get_columnwise_scale_inv(); - new_tensor.set_columnwise_scale_inv(other_col_scale_inv.data_ptr, - static_cast(other_col_scale_inv.dtype), - other_col_scale_inv.shape); - tensor_ = std::move(new_tensor); - to_cpu(); +void Tensor::fill_uniform_scale() { + if (!scale_ || scale_->size() == 0) { + return; + } + + // Generate random scales on CPU + auto *cpu_data = scale_->cpu_buffer(); + const auto numel = 
scale_->size(); + NVTE_CHECK(scale_->dtype() == DType::kFloat32); + std::uniform_real_distribution dis(-2.0, 1.0); + for (size_t i = 0; i < numel; ++i) { + cpu_data[i] = dis(gen_); } + + // Update GPU tensor + scale_->from_cpu(); } using std::to_string; @@ -689,7 +584,7 @@ std::vector unravel(const size_t i, const NVTEShape &shape) { return ret; } -void compareResults_sequential(const std::string &name, const Tensor &test, +void compareResults_sequential(const std::string &name, Tensor &test, const void *ref, const bool rowwise, double atol, double rtol, bool if_on_gpus, const size_t tolerable_mismatches_limit) { @@ -779,7 +674,7 @@ static size_t getFirstMismatchIdx(const DType data_type, const T* test_data, con return first_mismatch_idx; } -void compareResults_parallel(const std::string &name, const Tensor &test, const void *ref, +void compareResults_parallel(const std::string &name, Tensor &test, const void *ref, const bool rowwise, double atol, double rtol, bool if_on_gpus, const size_t tolerable_mismatches_limit) { if (if_on_gpus) test.to_cpu(); @@ -806,7 +701,7 @@ void compareResults_parallel(const std::string &name, const Tensor &test, const ); } -void compareResults(const std::string &name, const Tensor &test, const void *ref, +void compareResults(const std::string &name, Tensor &test, const void *ref, const bool rowwise, double atol, double rtol, bool if_on_gpus, const size_t tolerable_mismatches_limit) { constexpr bool sequential = false; @@ -992,6 +887,7 @@ void generate_data_uniformly(T* data, const size_t size, std::mt19937* gen) { } void fillUniform(Tensor *t) { + // Generate random row-wise data and column-wise data if (t->rowwise()) { const size_t size = product(t->rowwise_shape()); TRANSFORMER_ENGINE_TYPE_SWITCH_ALL(t->dtype(), T, @@ -1009,8 +905,12 @@ void fillUniform(Tensor *t) { } ); } - std::uniform_real_distribution<> dis(-2.0, 1.0); - t->set_scale_inv(dis(t->gen())); + + // Generate random scales + t->fill_uniform_rowwise_scale_inv(); + t->fill_uniform_columnwise_scale_inv(); + + // Update data on GPU t->from_cpu(); } @@ -1046,7 +946,20 @@ void fillCase_special(Tensor *t) { } }); } - t->set_scale_inv(1.0); + + // Fill scales + if (t->scaling_mode() == NVTE_DELAYED_TENSOR_SCALING) { + if (isFp8Type(t->dtype())) { + // FP8 tensor scale is set to 1 + t->set_scale_inv(1.0); + } + } else { + // Block scales are filled randomly + t->fill_uniform_rowwise_scale_inv(); + t->fill_uniform_columnwise_scale_inv(); + } + + // Update GPU tensor data t->from_cpu(); } @@ -1080,15 +993,12 @@ template void fillCase(Tensor *t, const InputsFillCase fill_case); #endif void setRandomScale(Tensor *t) { - std::uniform_real_distribution<> dis(-2.0, 1.0); - const float scale = dis(t->gen()); - t->set_scale(scale); + t->fill_uniform_scale(); } void setRandomScaleInv(Tensor *t) { - std::uniform_real_distribution<> dis(-2.0, 1.0); - const float scale_inv = dis(t->gen()); - t->set_scale_inv(scale_inv); + t->fill_uniform_rowwise_scale_inv(); + t->fill_uniform_columnwise_scale_inv(); } bool isFp8Type(DType type) { diff --git a/tests/cpp/test_common.h b/tests/cpp/test_common.h index b2a7da89cf..17f36a99dd 100644 --- a/tests/cpp/test_common.h +++ b/tests/cpp/test_common.h @@ -6,10 +6,12 @@ #pragma once -#include -#include #include +#include +#include #include +#include + #include #define FP4_TYPE_SUPPORTED (CUDA_VERSION >= 12080) @@ -27,6 +29,11 @@ namespace test { using namespace transformer_engine; +size_t typeToNumBits(DType type); +size_t product(const NVTEShape &shape); +size_t product(const 
@@ -114,9 +121,30 @@ struct TypeInfo {
   }
 
   constexpr static DType dtype = getType<T>();
-  constexpr static size_t size = BitsNumber<dtype>::num_bits;;
+  constexpr static size_t size = BitsNumber<dtype>::num_bits;
+};
+
+// Deleter for CUDA buffer RAII class
+struct CudaDeleter {
+  void operator()(void* ptr) const { if (ptr != nullptr) cudaFree(ptr); }
 };
 
+// CUDA buffer RAII class
+template <typename T>
+using CudaPtr = std::unique_ptr<T, CudaDeleter>;
+
+// Construct CUDA memory
+template <typename T = void>
+CudaPtr<T> cuda_alloc(size_t bytes) {
+  void* ptr = nullptr;
+  NVTE_CHECK_CUDA(cudaMalloc(&ptr, bytes));
+  return CudaPtr<T>(static_cast<T*>(ptr));
+}
+
+/* Wrapper for Transformer Engine tensor
+ *
+ * Maintains matching GPU and CPU buffers.
+ */
 class Tensor {
  public:
   Tensor(const std::string& name,
@@ -133,7 +161,7 @@ class Tensor {
          const NVTEScalingMode &mode = NVTE_DELAYED_TENSOR_SCALING) :
     Tensor(name, nvte_make_shape(shape.data(), shape.size()), type, rowwise, columnwise, mode) {}
 
-  Tensor() {}
+  Tensor() = default;
 
   Tensor& operator=(const Tensor &other) = delete;
   Tensor(const Tensor &other) = delete;
@@ -141,42 +169,7 @@ class Tensor {
   Tensor(Tensor &&other) = default;
   Tensor& operator=(Tensor &&other) = default;
 
-  ~Tensor() {
-    void *data_ptr = tensor_.dptr();
-    void *scale_inv = tensor_.scale_inv();
-    void *columnwise_data_ptr = tensor_.get_columnwise_data().data_ptr;
-    void *columnwise_scale_inv = tensor_.get_columnwise_scale_inv().data_ptr;
-    void *amax = tensor_.amax();
-    void *columnwise_amax_ptr = tensor_.get_columnwise_amax().data_ptr;
-    void *scale = tensor_.scale();
-    if (columnwise_data_ptr == data_ptr) {
-      columnwise_data_ptr = nullptr;
-    }
-    if (columnwise_scale_inv == scale_inv) {
-      columnwise_scale_inv = nullptr;
-    }
-    if (data_ptr != nullptr) {
-      cudaFree(data_ptr);
-    }
-    if (scale_inv != nullptr) {
-      cudaFree(scale_inv);
-    }
-    if (columnwise_data_ptr != nullptr) {
-      cudaFree(columnwise_data_ptr);
-    }
-    if (columnwise_scale_inv != nullptr) {
-      cudaFree(columnwise_scale_inv);
-    }
-    if (amax != nullptr) {
-      cudaFree(amax);
-    }
-    if (columnwise_amax_ptr != nullptr) {
-      cudaFree(columnwise_amax_ptr);
-    }
-    if (scale != nullptr) {
-      cudaFree(scale);
-    }
-  }
+  ~Tensor() = default;
 
   NVTETensor data() const noexcept {
     return tensor_.data();
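With the hand-written destructor gone, ownership rests entirely on the RAII helpers moved above. A short example of how they compose (the buffer size is arbitrary):

```cpp
// cudaFree runs automatically via CudaDeleter when `buf` goes out of scope.
CudaPtr<float> buf = cuda_alloc<float>(1024 * sizeof(float));
NVTE_CHECK_CUDA(cudaMemset(buf.get(), 0, 1024 * sizeof(float)));
```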
@@ -213,141 +206,176 @@ class Tensor {
   }
 
   template <typename T>
-  T *rowwise_cpu_dptr() const {
-    NVTE_CHECK(TypeInfo<T>::dtype == tensor_.dtype(), "Invalid type!");
+  T *rowwise_cpu_dptr() {
+    NVTE_CHECK(data_rowwise_, "Tensor does not have rowwise data!");
+    NVTE_CHECK(TypeInfo<T>::dtype == data_rowwise_->dtype(), "Invalid type!");
     NVTE_CHECK(rowwise_, "Tensor does not have rowwise data!");
-    return reinterpret_cast<T *>(cpu_data_rowwise_.get());
+    return data_rowwise_->cpu_buffer<T>();
   }
 
   template <typename T>
-  T *columnwise_cpu_dptr() const {
-    NVTE_CHECK(TypeInfo<T>::dtype == tensor_.dtype(), "Invalid type!");
+  T *columnwise_cpu_dptr() {
+    NVTE_CHECK(data_columnwise_, "Tensor does not have columnwise data!");
+    NVTE_CHECK(TypeInfo<T>::dtype == data_columnwise_->dtype(), "Invalid type!");
     NVTE_CHECK(columnwise_, "Tensor does not have columnwise data!");
-    return reinterpret_cast<T *>(cpu_data_columnwise_.get());
+    return data_columnwise_->cpu_buffer<T>();
   }
 
-  float amax() const {
-    if(amax_cpu_data_) {
-      to_cpu();
-      return *amax_cpu_data_;
-    } else {
-      return 0;
-    }
+  float amax() {
+    NVTE_CHECK(amax_rowwise_);
+    NVTE_CHECK(amax_rowwise_->size() == 1);
+    NVTE_CHECK(amax_rowwise_->dtype() == DType::kFloat32);
+    amax_rowwise_->to_cpu();
+    return *amax_rowwise_->cpu_buffer<float>();
   }
 
-  float amax_columnwise() const {
-    if(amax_cpu_data_columnwise_) {
-      to_cpu();
-      return *amax_cpu_data_columnwise_;
-    } else {
-      return 0;
-    }
+  float amax_columnwise() {
+    NVTE_CHECK(amax_columnwise_);
+    NVTE_CHECK(amax_columnwise_->size() == 1);
+    NVTE_CHECK(amax_columnwise_->dtype() == DType::kFloat32);
+    amax_columnwise_->to_cpu();
+    return *amax_columnwise_->cpu_buffer<float>();
   }
 
-  float scale() const {
-    if(scale_cpu_data_) {
-      NVTE_CHECK(tensor_.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING, "Invalid scaling_mode!");
-      to_cpu();
-      return *scale_cpu_data_;
-    } else {
-      return 1;
-    }
+  float scale() {
+    NVTE_CHECK(scale_);
+    NVTE_CHECK(scale_->size() == 1);
+    NVTE_CHECK(scale_->dtype() == DType::kFloat32);
+    scale_->to_cpu();
+    return *scale_->cpu_buffer<float>();
+  }
+
+  float rowwise_scale_inv(){
+    NVTE_CHECK(scale_inv_rowwise_);
+    NVTE_CHECK(scale_inv_rowwise_->size() == 1);
+    NVTE_CHECK(scale_inv_rowwise_->dtype() == DType::kFloat32);
+    scale_inv_rowwise_->to_cpu();
+    return *scale_inv_rowwise_->cpu_buffer<float>();
   }
 
   template <typename T>
   T *rowwise_cpu_scale_inv_ptr(){
-    if (tensor_.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING){
-      NVTE_CHECK(TypeInfo<T>::dtype == DType::kFloat32, "Invalid type!");
-    } else if (tensor_.scaling_mode() == NVTE_BLOCK_SCALING_1D || tensor_.scaling_mode() == NVTE_BLOCK_SCALING_2D) {
-      NVTE_CHECK(TypeInfo<T>::dtype == DType::kFloat32, "Invalid type!");
-    } else if (tensor_.scaling_mode() == NVTE_NVFP4_1D_SCALING) {
-      NVTE_CHECK(TypeInfo<T>::dtype == DType::kFloat8E4M3, "Invalid type!");
-    } else {
-      NVTE_CHECK(TypeInfo<T>::dtype == DType::kByte, "Invalid type!");
-    }
-    to_cpu();
-    return reinterpret_cast<T *>(rowwise_scale_inv_cpu_data_.get());
+    NVTE_CHECK(scale_inv_rowwise_);
+    scale_inv_rowwise_->to_cpu();
+    return scale_inv_rowwise_->cpu_buffer<T>();
   }
 
   template <typename T>
   T *columnwise_cpu_scale_inv_ptr(){
-    if (tensor_.scaling_mode() == NVTE_DELAYED_TENSOR_SCALING){
-      NVTE_CHECK(TypeInfo<T>::dtype == DType::kFloat32, "Invalid type!");
-    } else if (tensor_.scaling_mode() == NVTE_BLOCK_SCALING_1D || tensor_.scaling_mode() == NVTE_BLOCK_SCALING_2D) {
-      NVTE_CHECK(TypeInfo<T>::dtype == DType::kFloat32, "Invalid type!");
-    } else if (tensor_.scaling_mode() == NVTE_NVFP4_1D_SCALING) {
-      NVTE_CHECK(TypeInfo<T>::dtype == DType::kFloat8E4M3, "Invalid type!");
-    } else {
-      NVTE_CHECK(TypeInfo<T>::dtype == DType::kByte, "Invalid type!");
-    }
-    to_cpu();
-    return reinterpret_cast<T *>(columnwise_scale_inv_cpu_data_.get());
+    NVTE_CHECK(scale_inv_columnwise_);
+    scale_inv_columnwise_->to_cpu();
+    return scale_inv_columnwise_->cpu_buffer<T>();
   }
 
-  float rowwise_scale_inv(){
-    if(rowwise_scale_inv_cpu_data_) {
-      float scale_inv = rowwise_cpu_scale_inv_ptr<float>()[0];
-      return scale_inv;
-    } else {
-      return 1;
-    }
-  }
-
-  bool rowwise() const {
-    return rowwise_;
+  template <typename T>
+  T *cpu_rowwise_amax_ptr() {
+    NVTE_CHECK(amax_rowwise_);
+    amax_rowwise_->to_cpu();
+    return amax_rowwise_->cpu_buffer<T>();
   }
 
-  bool columnwise() const {
-    return columnwise_;
+  template <typename T>
+  T *cpu_columnwise_amax_ptr() {
+    NVTE_CHECK(amax_columnwise_);
+    amax_columnwise_->to_cpu();
+    return amax_columnwise_->cpu_buffer<T>();
   }
 
-  void set_tensor_amax(const float amax) {
-    if (amax_cpu_data_) {
-      *amax_cpu_data_ = amax;
-      from_cpu();
-    }
+  size_t rowwise_amax_size() const noexcept {
+    return amax_rowwise_ ? amax_rowwise_->size() : 0;
   }
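A hedged sketch of the new amax accessors in use, assuming a row-scaled NVFP4 tensor (see the header note at the end of this diff) and the gtest macros used elsewhere in these tests:

```cpp
// `t` is a hypothetical row-scaled NVFP4 test::Tensor; the accessors expose
// the whole per-row amax buffer rather than a single scalar.
void check_row_amaxes(test::Tensor &t) {
  const float *amax = t.cpu_rowwise_amax_ptr<float>();
  for (size_t i = 0; i < t.rowwise_amax_size(); ++i) {
    EXPECT_GE(amax[i], 0.f);  // amax values are non-negative by definition
  }
}
```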
-  void set_tensor_amax_columnwise(const float amax) {
-    if (amax_cpu_data_columnwise_) {
-      *amax_cpu_data_columnwise_ = amax;
-      from_cpu();
-    }
+  bool rowwise() const {
+    return rowwise_;
   }
 
-  void set_tensor_amax_nullptr(){
-    tensor_.set_amax(nullptr, DType::kFloat32, tensor_.defaultShape);
+  bool columnwise() const {
+    return columnwise_;
   }
 
-  void set_tensor_amax_shape(const std::vector<size_t> &shape);
-  std::vector<float> tensor_amax_values() const;
-  void copy_tensor_amax_from(const Tensor &other);
+  void set_tensor_amax_nullptr();
 
-  void set_with_gemm_swizzled_scales(bool with_gemm_swizzled_scales){
-    tensor_.set_with_gemm_swizzled_scales(with_gemm_swizzled_scales);
-  }
+  void set_with_gemm_swizzled_scales(bool with_gemm_swizzled_scales);
+  void set_row_scaled_nvfp4(bool row_scaled_nvfp4);
 
-  void set_row_scaled_nvfp4(bool row_scaled_nvfp4) {
-    tensor_.set_row_scaled_nvfp4(row_scaled_nvfp4);
-  }
+  void to_cpu();
+  void from_cpu();
 
-  void to_cpu() const;
-  void from_cpu() const;
+  void set_amax(float amax);
   void set_scale(float scale);
   void set_scale_inv(float scale_inv);
-  void shareFP8Meta(const Tensor &other);
+  void set_tensor_amax_columnwise(float amax);
+
+  void fill_uniform_rowwise_scale_inv();
+  void fill_uniform_columnwise_scale_inv();
+  void fill_uniform_scale();
 
   std::mt19937& gen() { return gen_; }
 
  private:
+
+  /* Manages matching GPU and CPU buffers. */
+  class Buffer {
+   public:
+
+    Buffer(size_t size = 0, DType dtype = DType::kByte);
+    ~Buffer() = default;
+    Buffer(const Buffer&) = delete;
+    Buffer& operator=(const Buffer&) = delete;
+    Buffer(Buffer&&) = default;
+    Buffer& operator=(Buffer&&) = default;
+
+    size_t size() const noexcept { return size_; }
+    DType dtype() const noexcept { return dtype_; }
+
+    // Void pointer accessors
+    void *cpu_buffer() { return cpu_buffer_.get(); }
+    const void *cpu_buffer() const { return cpu_buffer_.get(); }
+    void *gpu_buffer() { return gpu_buffer_.get(); }
+    const void *gpu_buffer() const { return gpu_buffer_.get(); }
+
+    // Templated pointer accessors
+    template <typename T>
+    T *cpu_buffer() {
+      return reinterpret_cast<T *>(cpu_buffer());
+    }
+    template <typename T>
+    const T *cpu_buffer() const {
+      return const_cast<Buffer *>(this)->cpu_buffer<T>();
+    }
+    template <typename T>
+    T *gpu_buffer() {
+      return reinterpret_cast<T *>(gpu_buffer());
+    }
+    template <typename T>
+    const T *gpu_buffer() const {
+      return const_cast<Buffer *>(this)->gpu_buffer<T>();
+    }
+
+    // Memory transfers between CPU and GPU
+    void to_cpu();
+    void from_cpu();
+
+   private:
+    std::unique_ptr<char[]> cpu_buffer_;
+    CudaPtr<void> gpu_buffer_;
+    size_t size_;
+    DType dtype_;
+    size_t bytes_;
+  };
+
+  // Transformer Engine tensor
   TensorWrapper tensor_;
-  std::unique_ptr<char[]> cpu_data_rowwise_;
-  std::unique_ptr<char[]> cpu_data_columnwise_;
-  std::shared_ptr<float> amax_cpu_data_;
-  std::shared_ptr<float> amax_cpu_data_columnwise_;
-  std::shared_ptr<float> scale_cpu_data_;
-  std::unique_ptr<char[]> rowwise_scale_inv_cpu_data_;
-  std::unique_ptr<char[]> columnwise_scale_inv_cpu_data_;
+
+  // Data buffers
+  std::optional<Buffer> data_rowwise_;
+  std::optional<Buffer> data_columnwise_;
+  std::shared_ptr<Buffer> scale_inv_rowwise_;
+  std::shared_ptr<Buffer> scale_inv_columnwise_;
+  std::optional<Buffer> amax_rowwise_;
+  std::optional<Buffer> amax_columnwise_;
+  std::optional<Buffer> scale_;
+
   bool rowwise_;
   bool columnwise_;
   std::string name_;
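The `Buffer::to_cpu`/`from_cpu` definitions live in test_common.cu and are not part of this hunk; a plausible sketch of their shape, assuming plain `cudaMemcpy` transfers over the cached byte count:

```cpp
// Guessed implementations, consistent with the members declared above.
void Tensor::Buffer::to_cpu() {
  if (bytes_ > 0) {
    NVTE_CHECK_CUDA(cudaMemcpy(cpu_buffer_.get(), gpu_buffer_.get(),
                               bytes_, cudaMemcpyDeviceToHost));
  }
}
void Tensor::Buffer::from_cpu() {
  if (bytes_ > 0) {
    NVTE_CHECK_CUDA(cudaMemcpy(gpu_buffer_.get(), cpu_buffer_.get(),
                               bytes_, cudaMemcpyHostToDevice));
  }
}
```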
@@ -497,17 +525,12 @@ inline float dsilu(const float x) { return x * dsigmoid(x) + sigmoid(x); }
 inline float srelu(const float x) { return x > 0 ? x * x : 0; }
 inline float dsrelu(const float x) { return fmaxf(0, 2 * x); }
 
-size_t typeToNumBits(DType type);
-size_t product(const NVTEShape &shape);
-size_t product(const std::vector<size_t> &shape);
-size_t bytes(const NVTEShape& shape, const DType type);
-
 size_t first_dimension(const std::vector<size_t> &shape);
 size_t last_dimension(const std::vector<size_t> &shape);
 
 bool areShapesEqual(const NVTEShape &s1, const NVTEShape &s2);
 
-void compareResults(const std::string &name, const Tensor &test, const void *ref,
+void compareResults(const std::string &name, Tensor &test, const void *ref,
                     bool rowwise, double atol = 1e-5, double rtol = 1e-8, bool if_on_gpus = true,
                     const size_t tolerable_mismatches_limit = 0);
 void compareResults(const std::string &name, const float test, const float ref,
@@ -550,26 +573,14 @@ int32_t getDeviceComputeCapability();
 constexpr int32_t hopperComputeCapability = 90;
 constexpr int32_t blackwellComputeCapability = 100;
 
-// Custom deleters for RAII
-struct CudaDeleter {
-  void operator()(void* p) const { if (p) cudaFree(p); }
-};
+// Custom deleter for RAII
 struct GroupedTensorDeleter {
   void operator()(NVTEGroupedTensor h) const { if (h) nvte_destroy_grouped_tensor(h); }
 };
 
-template <typename T>
-using CudaPtr = std::unique_ptr<T, CudaDeleter>;
+// Grouped tensor RAII class
 using GroupedTensorHandle = std::unique_ptr<std::remove_pointer_t<NVTEGroupedTensor>, GroupedTensorDeleter>;
 
-// Helper to allocate CUDA memory into a CudaPtr
-template <typename T = void>
-CudaPtr<T> cuda_alloc(size_t bytes) {
-  void* ptr = nullptr;
-  NVTE_CHECK_CUDA(cudaMalloc(&ptr, bytes));
-  return CudaPtr<T>(static_cast<T*>(ptr));
-}
-
 // Helper owning GPU buffers that back NVTEGroupedTensor.
 // NVTEGroupedTensor does not own memory; data/offsets/scales
 // must be allocated and freed by the test.
diff --git a/tests/cpp_distributed/test_comm_gemm.cu b/tests/cpp_distributed/test_comm_gemm.cu
index cc0d760a39..45f6664567 100644
--- a/tests/cpp_distributed/test_comm_gemm.cu
+++ b/tests/cpp_distributed/test_comm_gemm.cu
@@ -107,8 +107,10 @@ std::vector<float> CopyMatrix(const std::vector<float>& data, size_t mstart, size_t nsta
 template <typename T>
 test::Tensor Make(size_t m, size_t n, float scale) {
   test::Tensor ret("", std::vector<size_t>{n, m}, TypeInfo<T>::dtype);
-  ret.set_scale(scale);
-  ret.set_scale_inv(1.0 / scale);
+  if (test::isFp8Type(TypeInfo<T>::dtype)) {
+    ret.set_scale(scale);
+    ret.set_scale_inv(1.0 / scale);
+  }
   return ret;
 }
 
@@ -116,8 +118,10 @@ template <typename T>
 test::Tensor MakeFromData(const std::vector<float>& data, size_t mstart, size_t nstart,
                           size_t msize, size_t nsize, size_t ld, float scale) {
   test::Tensor ret("", std::vector<size_t>{nsize, msize}, TypeInfo<T>::dtype);
-  ret.set_scale(scale);
-  ret.set_scale_inv(1.0 / scale);
+  if (test::isFp8Type(TypeInfo<T>::dtype)) {
+    ret.set_scale(scale);
+    ret.set_scale_inv(1.0 / scale);
+  }
   auto local = CopyMatrix(data, mstart, nstart, msize, nsize, ld);
   NVTE_CHECK_CUDA(cudaMemcpy(ret.rowwise_dptr<T>(), local.data(), local.size() * sizeof local[0],
                              cudaMemcpyDefault));
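The FP8 guard matters because, after the refactor above, non-FP8 tensors no longer carry scale/scale-inv buffers and `Tensor::set_scale()` asserts that one exists (`NVTE_CHECK(scale_)`). A hedged illustration, where the sizes and scale value are arbitrary and `fp8e4m3` is the Transformer Engine FP8 type:

```cpp
auto a = Make<float>(64, 64, /*scale=*/2.f);                       // guard skips set_scale
auto b = Make<transformer_engine::fp8e4m3>(64, 64, /*scale=*/2.f); // sets scale and scale_inv
```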
diff --git a/transformer_engine/common/include/transformer_engine/transformer_engine.h b/transformer_engine/common/include/transformer_engine/transformer_engine.h
index e9a6f4f735..488f259150 100644
--- a/transformer_engine/common/include/transformer_engine/transformer_engine.h
+++ b/transformer_engine/common/include/transformer_engine/transformer_engine.h
@@ -72,7 +72,17 @@ enum NVTETensorParam {
   kNVTEColumnwiseScaleInv = 5,     /*!< Scale inverse tensor for decoding Columnwise Data */
   kNVTEColumnwiseAmax = 6,         /*!< Columnwise Amax tensor */
   kNVTEWithGEMMSwizzledScales = 7, /*!< Whether scaling factors are in format expected by GEMM */
-  kNVTERowScaledNVFP4 = 8,         /*!< Whether an NVFP4 tensor uses row scaling */
+  /*! Whether an NVFP4 tensor uses row scaling instead of tensor scaling.
+   *
+   * Column-wise data is not supported with row scaling.
+   *
+   * Row scaling also affects the interpretation of the amax tensor.
+   * With tensor scaling, the amax tensor is a single FP32 value that
+   * must be computed prior to quantization. With row scaling, the
+   * amax tensor's size is the number of tensor rows (flattened to
+   * 2D), and its values are populated during quantization.
+   */
+  kNVTERowScaledNVFP4 = 8,
   kNVTENumTensorParams
 };
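A hedged sketch of how this parameter is driven from C++, based on the `set_row_scaled_nvfp4` wrapper call that appears in the test changes above (the scaling mode and usage are illustrative):

```cpp
// Enable row-scaled NVFP4 on a TensorWrapper. With row scaling, the amax
// buffer must hold one FP32 value per row of the flattened 2D tensor, and
// those values are written during quantization rather than beforehand.
transformer_engine::TensorWrapper t(NVTE_NVFP4_1D_SCALING);
t.set_row_scaled_nvfp4(true);
```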