From fae3a70b031a53af253ebc942fa1bebae9e537d7 Mon Sep 17 00:00:00 2001 From: Kinjal Patel Date: Thu, 12 Feb 2026 22:35:29 +0000 Subject: [PATCH 1/8] added support for rotate in fp32 Signed-off-by: Kinjal Patel --- modelopt/torch/quantization/config.py | 14 +++++++---- modelopt/torch/quantization/nn/functional.py | 8 +++++-- .../nn/modules/tensor_quantizer.py | 17 ++++++++++--- tests/gpu/torch/quantization/test_hadamard.py | 24 +++++++++++++++---- 4 files changed, 49 insertions(+), 14 deletions(-) diff --git a/modelopt/torch/quantization/config.py b/modelopt/torch/quantization/config.py index 5d95ffe5f..618c22606 100644 --- a/modelopt/torch/quantization/config.py +++ b/modelopt/torch/quantization/config.py @@ -1033,14 +1033,20 @@ def validate_calibrator(cls, v, info: ValidationInfo): assert v in ["max", "histogram"] return v - rotate: bool = ModeloptField( + rotate: bool | dict[str, bool] = ModeloptField( default=False, - title="""If rotate the input before quantization.""", - description=""""If true, the input of the quantizer will be rotated with a hadamard matrix + title="""Configuration for rotating the input before quantization.""", + description="""Can be a boolean or a dictionary with the following keys: + - "enable": Boolean to enable/disable rotation (default: False) + - "rotate_fp32": Boolean to compute rotation in float32 precision (default: False) + + If a boolean is provided, it is treated as the "enable" value with "rotate_fp32" defaulting to False. + + When enabled, the input of the quantizer will be rotated with a hadamard matrix given by scipy.linalg.hadamard, i.e. ``input = input @ scipy.linalg.hadamard(input.shape[-1]) / sqrt(input.shape[-1])``. - This can be used for ratation based PTQ methods, e.g. QuaRot or SpinQuant. + This can be used for rotation based PTQ methods, e.g. QuaRot or SpinQuant. 
See https://arxiv.org/abs/2404.00456 for example.""", ) diff --git a/modelopt/torch/quantization/nn/functional.py b/modelopt/torch/quantization/nn/functional.py index df8bcbbcd..662aea66e 100644 --- a/modelopt/torch/quantization/nn/functional.py +++ b/modelopt/torch/quantization/nn/functional.py @@ -93,7 +93,7 @@ def backward(ctx, grad_outputs): return fast_hadamard_transform.hadamard_transform(grad_outputs) # type: ignore[name-defined] -def normalized_hadamard_transform(inputs): +def normalized_hadamard_transform(inputs, rotate_fp32=False): """Normalized fast hadamard transform.""" global fast_hadamard_transform try: @@ -104,6 +104,10 @@ def normalized_hadamard_transform(inputs): "`pip install git+https://github.com/Dao-AILab/fast-hadamard-transform.git`" ) - return FastHadamardTransform.apply(inputs) / torch.sqrt( + dtype = inputs.dtype + if rotate_fp32: + inputs = inputs.float() + outputs = FastHadamardTransform.apply(inputs) / torch.sqrt( torch.tensor(inputs.shape[-1], dtype=torch.float32) ) + return outputs.to(dtype) if rotate_fp32 else outputs diff --git a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py index 9b401a335..76a5f600d 100644 --- a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py +++ b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py @@ -996,8 +996,14 @@ def forward(self, inputs): inputs = inputs * self.pre_quant_scale # Rotating the input - if self._rotate: - inputs = normalized_hadamard_transform(inputs) + rotate_fp32 = ( + self._rotate.get("rotate_fp32", False) if isinstance(self._rotate, dict) else False + ) + rotate_enable = ( + self._rotate.get("enable", False) if isinstance(self._rotate, dict) else self._rotate + ) + if rotate_enable: + inputs = normalized_hadamard_transform(inputs, rotate_fp32=rotate_fp32) if self._disabled: # if quantizer is disabled, we still need to track the input dtype for saving the model @@ -1109,7 +1115,12 @@ def 
extra_repr(self): if self.pre_quant_scale is not None else "" ) - s += " rotated" if self._rotate else "" + s += ( + " rotated" + if (isinstance(self._rotate, dict) and self._rotate.get("enable", False)) + or self._rotate + else "" + ) s += ( f" calibrator={self._calibrator.__class__.__name__}" if (self._calibrator is not None) diff --git a/tests/gpu/torch/quantization/test_hadamard.py b/tests/gpu/torch/quantization/test_hadamard.py index c768bc87e..07c179026 100644 --- a/tests/gpu/torch/quantization/test_hadamard.py +++ b/tests/gpu/torch/quantization/test_hadamard.py @@ -41,9 +41,17 @@ def test_hadamard_transform(dim): xxt_h = x_h @ x_h.T # The numerical error can be large, especially for 16-bit floats. assert torch.allclose(xxt_h, xxt, atol=0.05) + x_h_fp32 = normalized_hadamard_transform(x, rotate_fp32=True) + xxt_h_fp32 = x_h_fp32 @ x_h_fp32.T + # test the numerical error is smaller when using float32 + assert torch.allclose(xxt_h_fp32, xxt, atol=1e-6) -def test_kv_rotate(): +@pytest.mark.parametrize( + "rotate_fp32", + [True, False], +) +def test_kv_rotate(rotate_fp32): mtq.plugins.register_attention_for_kv_quant(SDPAAttention) model = nn.Sequential(SDPAAttention()) mtq.replace_quant_module(model) @@ -51,27 +59,33 @@ def test_kv_rotate(): set_quantizer_by_cfg(model, {"*": {"enable": False}}) dummy_input = SDPAAttention.get_input(device="cuda") output_ref = model(dummy_input) + if rotate_fp32: + rotate = {"enable": True, "rotate_fp32": True} + atol = 1e-6 + else: + rotate = True + atol = 0.05 with set_quantizer_by_cfg_context( model, { "*[qk]_bmm_quantizer": { - "rotate": True, + "rotate": rotate, }, }, ): output_test = model(dummy_input) - assert torch.allclose(output_ref, output_test, atol=0.05) + assert torch.allclose(output_ref, output_test, atol=atol) # Test the rotation is actually applied by turning on only one of the query, key quantizers with set_quantizer_by_cfg_context( model, { "*k_bmm_quantizer": { - "rotate": True, + "rotate": rotate, }, }, ): 
output_test1 = model(dummy_input) - assert not torch.allclose(output_ref, output_test1, atol=0.05) + assert not torch.allclose(output_ref, output_test1, atol=atol) mtq.unregister(SDPAAttention) From 7f37ed8dc492e6382212601ac7b6d0e561ddc6de Mon Sep 17 00:00:00 2001 From: Kinjal Patel Date: Thu, 12 Feb 2026 22:41:10 +0000 Subject: [PATCH 2/8] updated changelog Signed-off-by: Kinjal Patel --- CHANGELOG.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 744238656..9a0d70916 100755 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -9,6 +9,7 @@ NVIDIA Model Optimizer Changelog (Linux) - User does not need to manually register MOE modules to cover experts calibration coverage in PTQ workflow. - ``hf_ptq.py`` now saves the quantization summary and moe expert token count table to the export directory. - Add sparse attention optimization for transformer models (``modelopt.torch.sparsity.attention_sparsity``). This reduces computational cost by skipping attention computation. Supports calibration for threshold selection on HuggingFace models. See `examples/llm_sparsity/attention_sparsity/README.md `_ for usage. +- Add support for computing the Hadamard rotation applied before quantization (RHT) in float32: the quantizer ``rotate`` option now also accepts a dict with ``enable`` and ``rotate_fp32`` keys. 
0.42 (2026-02-xx) ^^^^^^^^^^^^^^^^^ From 071ca714eb8da7fe902565165968dff12bfa344a Mon Sep 17 00:00:00 2001 From: Kinjal Patel Date: Thu, 12 Feb 2026 22:44:02 +0000 Subject: [PATCH 3/8] minor Signed-off-by: Kinjal Patel --- modelopt/torch/quantization/nn/functional.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelopt/torch/quantization/nn/functional.py b/modelopt/torch/quantization/nn/functional.py index 662aea66e..0beb7c956 100644 --- a/modelopt/torch/quantization/nn/functional.py +++ b/modelopt/torch/quantization/nn/functional.py @@ -106,7 +106,7 @@ def normalized_hadamard_transform(inputs, rotate_fp32=False): dtype = inputs.dtype if rotate_fp32: - inputs = inputs.float() + inputs = inputs.to(torch.float32) outputs = FastHadamardTransform.apply(inputs) / torch.sqrt( torch.tensor(inputs.shape[-1], dtype=torch.float32) ) From 0de9ed007541ecf6c35ba3e755bf9de137827f2d Mon Sep 17 00:00:00 2001 From: Kinjal Patel Date: Thu, 12 Feb 2026 22:57:25 +0000 Subject: [PATCH 4/8] minor Signed-off-by: Kinjal Patel --- .../quantization/nn/modules/tensor_quantizer.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py index 76a5f600d..090c1d967 100644 --- a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py +++ b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py @@ -1115,12 +1115,13 @@ def extra_repr(self): if self.pre_quant_scale is not None else "" ) - s += ( - " rotated" - if (isinstance(self._rotate, dict) and self._rotate.get("enable", False)) - or self._rotate - else "" - ) + if isinstance(self._rotate, dict): + if self._rotate.get("enable", False): + s += " rotated" + if self._rotate.get("rotate_fp32", False): + s += " (fp32)" + elif self._rotate: + s += " rotated" s += ( f" calibrator={self._calibrator.__class__.__name__}" if (self._calibrator is not None) From 
39e5733f314c3f21d6c4e47830be76ca97246add Mon Sep 17 00:00:00 2001 From: Kinjal Patel Date: Fri, 13 Feb 2026 18:50:08 +0000 Subject: [PATCH 5/8] minor Signed-off-by: Kinjal Patel --- .../nn/modules/tensor_quantizer.py | 33 ++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py index 090c1d967..2caec2565 100644 --- a/modelopt/torch/quantization/nn/modules/tensor_quantizer.py +++ b/modelopt/torch/quantization/nn/modules/tensor_quantizer.py @@ -529,6 +529,20 @@ def is_static_block_quant(self): and self._fake_quant ) + @property + def rotate_is_enabled(self): + """Check if rotate is enabled in quant config.""" + return self._rotate.get("enable", False) if isinstance(self._rotate, dict) else self._rotate + + @property + def rotate_is_fp32(self): + """Check if rotation needs to be computed in float32.""" + return ( + self._rotate.get("rotate_fp32", False) + if isinstance(self._rotate, dict) and self.rotate_is_enabled + else False + ) + def disable_calib(self): """Disable calibration.""" self._if_calib = False @@ -996,14 +1010,8 @@ def forward(self, inputs): inputs = inputs * self.pre_quant_scale # Rotating the input - rotate_fp32 = ( - self._rotate.get("rotate_fp32", False) if isinstance(self._rotate, dict) else False - ) - rotate_enable = ( - self._rotate.get("enable", False) if isinstance(self._rotate, dict) else self._rotate - ) - if rotate_enable: - inputs = normalized_hadamard_transform(inputs, rotate_fp32=rotate_fp32) + if self.rotate_is_enabled: + inputs = normalized_hadamard_transform(inputs, rotate_fp32=self.rotate_is_fp32) if self._disabled: # if quantizer is disabled, we still need to track the input dtype for saving the model @@ -1115,13 +1123,8 @@ def extra_repr(self): if self.pre_quant_scale is not None else "" ) - if isinstance(self._rotate, dict): - if self._rotate.get("enable", False): - s += " rotated" - if 
self._rotate.get("rotate_fp32", False): - s += " (fp32)" - elif self._rotate: - s += " rotated" + s += " rotated" if self.rotate_is_enabled else "" + s += " (fp32)" if self.rotate_is_fp32 else "" s += ( f" calibrator={self._calibrator.__class__.__name__}" if (self._calibrator is not None) From f0c76e3bd6edd5dc1b1af64224f7cb3f1f6a245f Mon Sep 17 00:00:00 2001 From: Kinjal Patel Date: Fri, 13 Feb 2026 21:38:58 +0000 Subject: [PATCH 6/8] minor Signed-off-by: Kinjal Patel --- tests/gpu/torch/quantization/test_hadamard.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/gpu/torch/quantization/test_hadamard.py b/tests/gpu/torch/quantization/test_hadamard.py index 07c179026..64dd39e2c 100644 --- a/tests/gpu/torch/quantization/test_hadamard.py +++ b/tests/gpu/torch/quantization/test_hadamard.py @@ -43,8 +43,7 @@ def test_hadamard_transform(dim): assert torch.allclose(xxt_h, xxt, atol=0.05) x_h_fp32 = normalized_hadamard_transform(x, rotate_fp32=True) xxt_h_fp32 = x_h_fp32 @ x_h_fp32.T - # test the numerical error is smaller when using float32 - assert torch.allclose(xxt_h_fp32, xxt, atol=1e-6) + assert torch.allclose(xxt_h_fp32, xxt, atol=0.05) @pytest.mark.parametrize( @@ -61,10 +60,8 @@ def test_kv_rotate(rotate_fp32): output_ref = model(dummy_input) if rotate_fp32: rotate = {"enable": True, "rotate_fp32": True} - atol = 1e-6 else: rotate = True - atol = 0.05 with set_quantizer_by_cfg_context( model, { @@ -74,7 +71,7 @@ def test_kv_rotate(rotate_fp32): }, ): output_test = model(dummy_input) - assert torch.allclose(output_ref, output_test, atol=atol) + assert torch.allclose(output_ref, output_test, atol=0.05) # Test the rotation is actually applied by turning on only one of the query, key quantizers with set_quantizer_by_cfg_context( @@ -86,6 +83,6 @@ def test_kv_rotate(rotate_fp32): }, ): output_test1 = model(dummy_input) - assert not torch.allclose(output_ref, output_test1, atol=atol) + assert not torch.allclose(output_ref, output_test1, 
atol=0.05) mtq.unregister(SDPAAttention) From 126570f928d448f831883a42db89081ddb92e1b6 Mon Sep 17 00:00:00 2001 From: Kinjal Patel Date: Mon, 23 Feb 2026 10:34:57 +0000 Subject: [PATCH 7/8] updated bounds Signed-off-by: Kinjal Patel --- tests/gpu/torch/quantization/test_hadamard.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/gpu/torch/quantization/test_hadamard.py b/tests/gpu/torch/quantization/test_hadamard.py index 64dd39e2c..1ac096eca 100644 --- a/tests/gpu/torch/quantization/test_hadamard.py +++ b/tests/gpu/torch/quantization/test_hadamard.py @@ -43,7 +43,7 @@ def test_hadamard_transform(dim): assert torch.allclose(xxt_h, xxt, atol=0.05) x_h_fp32 = normalized_hadamard_transform(x, rotate_fp32=True) xxt_h_fp32 = x_h_fp32 @ x_h_fp32.T - assert torch.allclose(xxt_h_fp32, xxt, atol=0.05) + assert torch.allclose(xxt_h_fp32, xxt, atol=0.001) @pytest.mark.parametrize( @@ -83,6 +83,6 @@ def test_kv_rotate(rotate_fp32): }, ): output_test1 = model(dummy_input) - assert not torch.allclose(output_ref, output_test1, atol=0.05) + assert not torch.allclose(output_ref, output_test1, atol=0.001) mtq.unregister(SDPAAttention) From e9d3b665da8f6fa4c38095924aeae7e262f7d006 Mon Sep 17 00:00:00 2001 From: Kinjal Patel Date: Mon, 23 Feb 2026 17:01:56 +0000 Subject: [PATCH 8/8] minor Signed-off-by: Kinjal Patel --- tests/gpu/torch/quantization/test_hadamard.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/gpu/torch/quantization/test_hadamard.py b/tests/gpu/torch/quantization/test_hadamard.py index 1ac096eca..64dd39e2c 100644 --- a/tests/gpu/torch/quantization/test_hadamard.py +++ b/tests/gpu/torch/quantization/test_hadamard.py @@ -43,7 +43,7 @@ def test_hadamard_transform(dim): assert torch.allclose(xxt_h, xxt, atol=0.05) x_h_fp32 = normalized_hadamard_transform(x, rotate_fp32=True) xxt_h_fp32 = x_h_fp32 @ x_h_fp32.T - assert torch.allclose(xxt_h_fp32, xxt, atol=0.001) + assert torch.allclose(xxt_h_fp32, xxt, atol=0.05) 
@pytest.mark.parametrize( @@ -83,6 +83,6 @@ def test_kv_rotate(rotate_fp32): }, ): output_test1 = model(dummy_input) - assert not torch.allclose(output_ref, output_test1, atol=0.001) + assert not torch.allclose(output_ref, output_test1, atol=0.05) mtq.unregister(SDPAAttention)