Changes from all commits (57 commits)
19b6b08
Initial implementation
zianglih May 9, 2026
7b0b2d0
Make 4over6 compile time for dequant
zianglih May 9, 2026
1e5b6ad
Expand 1d fwd+bwd test
zianglih May 9, 2026
99660fc
Refactor
zianglih May 9, 2026
cb2e0a3
Clean up
zianglih May 9, 2026
2c066f9
Clean up
zianglih May 9, 2026
69e8f3a
Add gemm test
zianglih May 9, 2026
009e651
Add more tests and fix offload
zianglih May 9, 2026
3153fc3
Fix offload
zianglih May 9, 2026
e31b758
Clean up arg
zianglih May 9, 2026
fcd526c
Add more test
zianglih May 9, 2026
100c378
Add more tests
zianglih May 10, 2026
1c9f26b
Clean up test
zianglih May 10, 2026
93fe922
Refactor cuh kernel impl
zianglih May 10, 2026
f4e4a4e
Further extract
zianglih May 10, 2026
b3f59ee
Clean up
zianglih May 10, 2026
31decf9
Add recipe_id
zianglih May 10, 2026
2fa6b8c
Fix failing unit tests
zianglih May 10, 2026
7df2db0
Clean up test
zianglih May 10, 2026
ce85be2
Clean up
zianglih May 10, 2026
1b68038
Refactor ref
zianglih May 10, 2026
bb722a3
Update comments and docs
zianglih May 10, 2026
fe18a1e
Drop unnecessary test_sanity workaround
zianglih May 10, 2026
522e93e
Refactor `QuantizerRole`
zianglih May 11, 2026
782b7ee
Allow separate recipe 4over6 config
zianglih May 11, 2026
d9cd12c
Support 2d
zianglih May 12, 2026
708c1ec
Refactor 2d
zianglih May 12, 2026
4d31f18
Clean up anti pattern
zianglih May 12, 2026
dfc15f2
Enforce 4over6 consistency
zianglih May 12, 2026
9453670
Update comments
zianglih May 12, 2026
6d871da
Update docs
zianglih May 12, 2026
f8338e8
Fix test
zianglih May 12, 2026
c9bc921
Drop test_fusible_ops
zianglih May 12, 2026
00ba694
Revert "Drop test_fusible_ops"
zianglih May 12, 2026
3252d4e
Refactor test_fusible_ops
zianglih May 12, 2026
3f33c1d
Refactor ref and extend cpp test
zianglih May 12, 2026
8607e03
Clean up cpp test
zianglih May 12, 2026
d3dbf34
Minor comment
zianglih May 12, 2026
565f33f
Drop doc
zianglih May 12, 2026
54b4da8
Explicit handle conditional smem buffer
zianglih May 12, 2026
fa09200
Further clean up
zianglih May 12, 2026
e57e8be
More templates
zianglih May 12, 2026
a1df319
Simplify cpp
zianglih May 12, 2026
21720da
Drop write back lifting
zianglih May 12, 2026
b1d073a
Add MAE and dedicated fast math env var
zianglih May 12, 2026
0392708
Harden cpp test
zianglih May 12, 2026
0b77a37
Add warning and err fast math coverage
zianglih May 12, 2026
81e579e
Fold test case and clean up cpp test
zianglih May 12, 2026
1e311ef
Initial 448 vs 256 implementation
zianglih May 12, 2026
38a1c4c
Use e4m3 max instead of boolean, more template
zianglih May 12, 2026
3cdd9d9
Add benchmark script and minor optimization
zianglih May 13, 2026
7deba75
Use standalone kernels
zianglih May 13, 2026
93dbf2b
Use cp async
zianglih May 13, 2026
8819d12
Add benchmark script
zianglih May 13, 2026
24e417b
Minor fix after rebase
zianglih May 13, 2026
472e5b8
Naming consistency
zianglih May 13, 2026
83e2308
Remove 4over6 benchmark
zianglih May 13, 2026
24 changes: 24 additions & 0 deletions docs/envvars.rst
@@ -287,6 +287,30 @@ Kernel Configuration
:Default: ``0``
:Description: Enable row-scaled NVFP4 tensors for forward activation quantizers in the ``NVFP4BlockScaling`` recipe. When set to ``1`` (or when ``NVFP4BlockScaling(row_scaled_activation=True)`` is used), rowwise ``amax`` metadata is stored as one FP32 value per tensor row instead of a single scalar.

.. envvar:: NVTE_NVFP4_4OVER6

:Type: ``str`` (``weights``, ``activations``, or ``all``)
:Default: unset
:Description: Enable per-block map-to-4 versus map-to-6 candidate selection for selected NVFP4 quantizers in the ``NVFP4BlockScaling`` recipe. ``weights`` selects weight tensor roles, ``activations`` selects non-weight tensor roles, and ``all`` selects both. The selected block scale is the candidate with lower configured input-domain error, and ties select map-to-6. By default, this mode keeps the standard NVFP4 global E4M3 scale bound of 448. Tensors using 4over6 currently require RHT and stochastic rounding to be disabled; activation and backward scopes therefore require ``NVTE_NVFP4_DISABLE_RHT=1`` and ``NVTE_NVFP4_DISABLE_STOCHASTIC_ROUNDING=1``.
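
   As a rough illustration of the selection rule described above, the following CPU-side sketch compares the two candidates for one block. It is not the Transformer Engine kernel code: the E2M1 rounding helper and the scale formula are simplified assumptions, and quantization of the block scale to E4M3 is omitted.

   #include <algorithm>
   #include <array>
   #include <cmath>
   #include <cstddef>

   // FP4 (E2M1) representable magnitudes.
   constexpr std::array<float, 8> kE2M1Values = {0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f};

   inline float round_to_e2m1(float x) {
     float best = kE2M1Values[0];
     for (float v : kE2M1Values) {
       if (std::fabs(std::fabs(x) - v) < std::fabs(std::fabs(x) - best)) best = v;
     }
     return std::copysign(best, x);
   }

   // Quantize-dequantize one block with its amax mapped to `target_max`
   // (6 = standard map-to-6 candidate, 4 = map-to-4 candidate) and return the
   // mean absolute error in the input domain (MAE; MSE would square the diff).
   inline float candidate_error(const float *block, std::size_t n, float target_max) {
     float amax = 0.0f;
     for (std::size_t i = 0; i < n; ++i) amax = std::max(amax, std::fabs(block[i]));
     if (amax == 0.0f) return 0.0f;
     const float scale = target_max / amax;  // simplified: no E4M3 block-scale rounding
     float err = 0.0f;
     for (std::size_t i = 0; i < n; ++i) {
       const float deq = round_to_e2m1(block[i] * scale) / scale;
       err += std::fabs(block[i] - deq);
     }
     return err / static_cast<float>(n);
   }

   // 4over6 rule: use the map-to-4 candidate only when its error is strictly
   // lower; a tie keeps the standard map-to-6 scale.
   inline bool use_map_to_4(const float *block, std::size_t n) {
     return candidate_error(block, n, 4.0f) < candidate_error(block, n, 6.0f);
   }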

.. envvar:: NVTE_NVFP4_4OVER6_E4M3_USE_256

:Type: ``str`` (``weights``, ``activations``, or ``all``)
:Default: unset
:Description: Select NVFP4 4over6 quantizers that use 256 instead of 448 as the global E4M3 scale bound. ``weights`` selects weight tensor roles, ``activations`` selects non-weight tensor roles, and ``all`` selects both. This option is only meaningful for tensor roles that also enable :envvar:`NVTE_NVFP4_4OVER6`.

.. envvar:: NVTE_NVFP4_4OVER6_ERR_MODE

:Type: ``str`` (``MAE`` or ``MSE``)
:Default: ``MAE``
:Description: Select the input-domain error metric used by NVFP4 4over6 map-to-4 versus map-to-6 candidate selection in the ``NVFP4BlockScaling`` recipe.
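
   For concreteness, a minimal sketch of how the two metrics differ when accumulating the per-element deviation before the candidates are compared; the helper below is illustrative, not the kernel implementation.

   #include <cmath>
   #include <cstddef>

   // Illustrative per-block error for one candidate; `use_mae` mirrors
   // NVTE_NVFP4_4OVER6_ERR_MODE (MAE is the default, MSE squares the deviation).
   inline float block_error(const float *input, const float *dequantized,
                            std::size_t n, bool use_mae) {
     float err = 0.0f;
     for (std::size_t i = 0; i < n; ++i) {
       const float diff = input[i] - dequantized[i];
       err += use_mae ? std::fabs(diff) : diff * diff;
     }
     return err;
   }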

.. envvar:: NVTE_NVFP4_4OVER6_ERR_USE_FAST_MATH

:Type: ``int`` (0 or 1)
:Default: ``0``
:Description: Allow the NVFP4 4over6 candidate error computation to use faster non-strict floating-point expressions. By default, 4over6 error comparison uses strict expressions; ``NVTE_USE_FAST_MATH`` does not control this error-comparison path.

Torch Compilation and Fusion
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

628 changes: 529 additions & 99 deletions tests/cpp/operator/test_cast_nvfp4_transpose.cu

Large diffs are not rendered by default.

73 changes: 59 additions & 14 deletions tests/cpp/operator/test_dequantize_nvfp4.cu
Collaborator:

This test is okay, but it would provide much more confidence if the NVFP4 quantization tests compared against a CPU reference impl.

Contributor Author:

Extended tests/cpp/operator/test_cast_nvfp4_transpose.cu coverage in 3bb42b1.

@@ -46,8 +46,9 @@ void compute_ref_dequantize_nvfp4(const uint8_t *packed_data,
OType *output,
size_t rows,
size_t cols,
size_t scale_stride) {
constexpr float factor_inv = 1.0f / (6.0f * 448.0f);
size_t scale_stride,
int e4m3_max) {
const float factor_inv = 1.0f / (6.0f * static_cast<float>(e4m3_max));
constexpr size_t BLOCK_SIZE = 16;
const size_t Mread = cols / BLOCK_SIZE;
const size_t bytes_per_block = BLOCK_SIZE / 2;
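
The effect of this change is that the reference dequantization factor is no longer hard-coded to 1/(6*448) but derived from the configured E4M3 bound. A sketch of the per-element reconstruction this implies is below; the helper name and argument composition are assumptions, not the test's exact code.

// Illustrative composition implied by factor_inv (assumed, not the test's exact code).
inline float dequant_nvfp4_ref(float fp4_value, float e4m3_block_scale,
                               float amax, int e4m3_max) {
  const float factor_inv = 1.0f / (6.0f * static_cast<float>(e4m3_max));
  // Default bound: 1/(6*448); with NVTE_NVFP4_4OVER6_E4M3_USE_256: 1/(6*256).
  return fp4_value * e4m3_block_scale * amax * factor_inv;
}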
@@ -90,7 +91,9 @@ float compute_amax(test::Tensor &t, size_t rows, size_t cols) {
// against a CPU reference computed from the quantized data.
template <typename OutputType>
void performTest_dequantize_nvfp4(const size_t rows, const size_t cols,
const bool row_scaled_nvfp4) {
const bool row_scaled_nvfp4,
const bool use_4over6,
const int e4m3_max) {
using namespace test;
DType otype = TypeInfo<OutputType>::dtype;

@@ -105,6 +108,10 @@ void performTest_dequantize_nvfp4(const size_t rows, const size_t cols,

// Configure quantized tensor amax
size_t amax_size = 1;
quantized.set_nvfp4_4over6(use_4over6);
quantized.set_nvfp4_e4m3_max((use_4over6 ? e4m3_max : 448));
ASSERT_EQ(quantized.nvfp4_4over6(), use_4over6);
ASSERT_EQ(quantized.nvfp4_e4m3_max(), (use_4over6 ? e4m3_max : 448));
if (row_scaled_nvfp4) {
quantized.set_row_scaled_nvfp4(true);
amax_size = rows;
@@ -116,7 +123,10 @@ void performTest_dequantize_nvfp4(const size_t rows, const size_t cols,

// Quantize
if (rows > 0 && cols > 0) {
nvte_quantize(input.data(), quantized.data(), 0);
QuantizationConfigWrapper quant_config;
quant_config.set_nvfp4_4over6(use_4over6);
quant_config.set_nvfp4_e4m3_max((use_4over6 ? e4m3_max : 448));
nvte_quantize_v2(input.data(), quantized.data(), quant_config, 0);
cudaDeviceSynchronize();
auto err = cudaGetLastError();
ASSERT_EQ(err, cudaSuccess) << cudaGetErrorString(err);
@@ -146,7 +156,7 @@ void performTest_dequantize_nvfp4(const size_t rows, const size_t cols,
std::make_unique<OutputType[]>(rows * cols);
compute_ref_dequantize_nvfp4<OutputType>(
fp4_data, scales, amax_vals, ref_output.get(),
rows, cols, scale_stride);
rows, cols, scale_stride, (use_4over6 ? e4m3_max : 448));

// Compare results from TE and reference impls
auto [atol, rtol] = getTolerances(otype);
@@ -156,7 +166,9 @@ void performTest_dequantize_nvfp4(const size_t rows, const size_t cols,
// Dequantize NVFP4 with GEMM-swizzled scales and compare against compact path.
template <typename OutputType>
void performTest_dequantize_nvfp4_swizzled(const size_t rows, const size_t cols,
const bool row_scaled_nvfp4) {
const bool row_scaled_nvfp4,
const bool use_4over6,
const int e4m3_max) {
using namespace test;
DType otype = TypeInfo<OutputType>::dtype;

@@ -165,6 +177,10 @@ void performTest_dequantize_nvfp4_swizzled(const size_t rows, const size_t cols,

Tensor quantized_compact("quantized_compact", std::vector<size_t>{rows, cols},
DType::kFloat4E2M1, true, false, NVTE_NVFP4_1D_SCALING);
quantized_compact.set_nvfp4_4over6(use_4over6);
quantized_compact.set_nvfp4_e4m3_max((use_4over6 ? e4m3_max : 448));
ASSERT_EQ(quantized_compact.nvfp4_4over6(), use_4over6);
ASSERT_EQ(quantized_compact.nvfp4_e4m3_max(), (use_4over6 ? e4m3_max : 448));
if (row_scaled_nvfp4) {
quantized_compact.set_row_scaled_nvfp4(true);
} else if (rows > 0 && cols > 0) {
@@ -174,7 +190,10 @@ }
}

if (rows > 0 && cols > 0) {
nvte_quantize(input.data(), quantized_compact.data(), 0);
QuantizationConfigWrapper quant_config;
quant_config.set_nvfp4_4over6(use_4over6);
quant_config.set_nvfp4_e4m3_max((use_4over6 ? e4m3_max : 448));
nvte_quantize_v2(input.data(), quantized_compact.data(), quant_config, 0);
cudaDeviceSynchronize();
}

@@ -186,6 +205,10 @@
// Create tensor with same FP4 data but swizzled scales
Tensor quantized_swizzled("quantized_swizzled", std::vector<size_t>{rows, cols},
DType::kFloat4E2M1, true, false, NVTE_NVFP4_1D_SCALING);
quantized_swizzled.set_nvfp4_4over6(use_4over6);
quantized_swizzled.set_nvfp4_e4m3_max((use_4over6 ? e4m3_max : 448));
ASSERT_EQ(quantized_swizzled.nvfp4_4over6(), use_4over6);
ASSERT_EQ(quantized_swizzled.nvfp4_e4m3_max(), (use_4over6 ? e4m3_max : 448));
if (row_scaled_nvfp4) {
quantized_swizzled.set_row_scaled_nvfp4(true);
} else {
@@ -260,7 +283,9 @@ std::vector<std::pair<size_t, size_t>> nvfp4_tensor_dims = {
class DequantizeNVFP4TestSuite : public ::testing::TestWithParam
<std::tuple<std::pair<size_t, size_t>,
transformer_engine::DType,
bool>> {};
bool,
bool,
int>> {};

TEST_P(DequantizeNVFP4TestSuite, TestDequantizeNVFP4)
{
@@ -271,10 +296,12 @@ TEST_P(DequantizeNVFP4TestSuite, TestDequantizeNVFP4)
const auto tensor_size = std::get<0>(GetParam());
const DType output_type = std::get<1>(GetParam());
const bool row_scaled_nvfp4 = std::get<2>(GetParam());
const bool use_4over6 = std::get<3>(GetParam());
const int e4m3_max = use_4over6 ? std::get<4>(GetParam()) : 448;

TRANSFORMER_ENGINE_TYPE_SWITCH_FP16_FP32_ONLY(output_type, OutputType,
performTest_dequantize_nvfp4<OutputType>(
tensor_size.first, tensor_size.second, row_scaled_nvfp4);
tensor_size.first, tensor_size.second, row_scaled_nvfp4, use_4over6, e4m3_max);
);
}

@@ -284,21 +311,30 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Combine(
::testing::ValuesIn(nvfp4_tensor_dims),
::testing::Values(DType::kFloat32, DType::kBFloat16, DType::kFloat16),
::testing::Bool()),
::testing::Bool(),
::testing::Bool(),
::testing::Values(448, 256)),
[](const testing::TestParamInfo<DequantizeNVFP4TestSuite::ParamType>& info)
{
std::string name = std::to_string(std::get<0>(info.param).first) + "X" +
std::to_string(std::get<0>(info.param).second) + "X" +
test::typeName(std::get<1>(info.param)) + "X" +
(std::get<2>(info.param) ? "RowScaled" : "PerTensor");
(std::get<2>(info.param) ? "RowScaled" : "PerTensor") + "X" +
(std::get<3>(info.param) ? "FourOverSix" : "Default") + "X" +
(std::get<3>(info.param)
? (std::get<4>(info.param) == 256 ? "E4M3Max256" : "E4M3Max448")
: (std::get<4>(info.param) == 256 ? "E4M3Max256Ignored"
: "E4M3Max448"));
return name;
}
);

class DequantizeNVFP4SwizzledTestSuite : public ::testing::TestWithParam
<std::tuple<std::pair<size_t, size_t>,
transformer_engine::DType,
bool>> {};
bool,
bool,
int>> {};

TEST_P(DequantizeNVFP4SwizzledTestSuite, TestDequantizeNVFP4Swizzled)
{
@@ -309,10 +345,12 @@ TEST_P(DequantizeNVFP4SwizzledTestSuite, TestDequantizeNVFP4Swizzled)
const auto tensor_size = std::get<0>(GetParam());
const DType output_type = std::get<1>(GetParam());
const bool row_scaled_nvfp4 = std::get<2>(GetParam());
const bool use_4over6 = std::get<3>(GetParam());
const int e4m3_max = use_4over6 ? std::get<4>(GetParam()) : 448;

TRANSFORMER_ENGINE_TYPE_SWITCH_FP16_FP32_ONLY(output_type, OutputType,
performTest_dequantize_nvfp4_swizzled<OutputType>(
tensor_size.first, tensor_size.second, row_scaled_nvfp4);
tensor_size.first, tensor_size.second, row_scaled_nvfp4, use_4over6, e4m3_max);
);
}

@@ -322,13 +360,20 @@ INSTANTIATE_TEST_SUITE_P(
::testing::Combine(
::testing::ValuesIn(nvfp4_tensor_dims),
::testing::Values(DType::kFloat32, DType::kBFloat16, DType::kFloat16),
::testing::Bool()),
::testing::Bool(),
::testing::Bool(),
::testing::Values(448, 256)),
[](const testing::TestParamInfo<DequantizeNVFP4SwizzledTestSuite::ParamType>& info)
{
std::string name = std::to_string(std::get<0>(info.param).first) + "X" +
std::to_string(std::get<0>(info.param).second) + "X" +
test::typeName(std::get<1>(info.param)) + "X" +
(std::get<2>(info.param) ? "RowScaled" : "PerTensor") + "X" +
(std::get<3>(info.param) ? "FourOverSix" : "Default") + "X" +
(std::get<3>(info.param)
? (std::get<4>(info.param) == 256 ? "E4M3Max256" : "E4M3Max448")
: (std::get<4>(info.param) == 256 ? "E4M3Max256Ignored"
: "E4M3Max448")) + "X" +
"Swizzled";
return name;
}
24 changes: 24 additions & 0 deletions tests/cpp/test_common.cu
@@ -440,6 +440,30 @@ void Tensor::set_row_scaled_nvfp4(bool row_scaled_nvfp4) {
}
}

void Tensor::set_nvfp4_4over6(bool nvfp4_4over6) {
NVTE_CHECK(tensor_.scaling_mode() == NVTE_NVFP4_1D_SCALING,
"NVFP4 4over6 is only supported for NVFP4 tensors.");
tensor_.set_nvfp4_4over6(nvfp4_4over6);
}

void Tensor::set_nvfp4_e4m3_max(int nvfp4_e4m3_max) {
NVTE_CHECK(tensor_.scaling_mode() == NVTE_NVFP4_1D_SCALING,
"NVFP4 E4M3 max is only supported for NVFP4 tensors.");
tensor_.set_nvfp4_e4m3_max(nvfp4_e4m3_max);
}

bool Tensor::nvfp4_4over6() const {
NVTE_CHECK(tensor_.scaling_mode() == NVTE_NVFP4_1D_SCALING,
"NVFP4 4over6 is only supported for NVFP4 tensors.");
return tensor_.get_nvfp4_4over6();
}

int Tensor::nvfp4_e4m3_max() const {
NVTE_CHECK(tensor_.scaling_mode() == NVTE_NVFP4_1D_SCALING,
"NVFP4 E4M3 max is only supported for NVFP4 tensors.");
return tensor_.get_nvfp4_e4m3_max();
}

void Tensor::to_cpu() {
if (data_rowwise_) { data_rowwise_->to_cpu(); }
if (data_columnwise_) { data_columnwise_->to_cpu(); }
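
Taken together with the diffs above, a minimal usage fragment for these new accessors might look like the following; it is composed from the test changes in this PR rather than copied verbatim, is not compilable standalone, and assumes an already-filled source tensor named input.

// Usage fragment composed from the test diffs above (illustrative only).
const size_t rows = 128, cols = 128;
Tensor quantized("quantized", std::vector<size_t>{rows, cols},
                 DType::kFloat4E2M1, true, false, NVTE_NVFP4_1D_SCALING);
quantized.set_nvfp4_4over6(true);
quantized.set_nvfp4_e4m3_max(256);  // pairs with NVTE_NVFP4_4OVER6_E4M3_USE_256

QuantizationConfigWrapper quant_config;
quant_config.set_nvfp4_4over6(true);
quant_config.set_nvfp4_e4m3_max(256);
nvte_quantize_v2(input.data(), quantized.data(), quant_config, 0);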
5 changes: 5 additions & 0 deletions tests/cpp/test_common.h
@@ -293,10 +293,15 @@ class Tensor {
return columnwise_;
}

bool nvfp4_4over6() const;
int nvfp4_e4m3_max() const;

void set_tensor_amax_nullptr();

void set_with_gemm_swizzled_scales(bool with_gemm_swizzled_scales);
void set_row_scaled_nvfp4(bool row_scaled_nvfp4);
void set_nvfp4_4over6(bool nvfp4_4over6);
void set_nvfp4_e4m3_max(int nvfp4_e4m3_max);

void to_cpu();
void from_cpu();