Samsung · stamalakhov · May 14, 2026 · May 14, 2026 · May 14, 2026
diff --git a/tico/quantization/algorithm/fpi_gptq/fpi_gptq.py b/tico/quantization/algorithm/fpi_gptq/fpi_gptq.py
@@ -32,29 +32,7 @@
 )
 
 from tico.quantization.algorithm.gptq.quant import quantize, Quantizer
-
-
-def iterate_GPTQ(scale, zero, maxq, W, Hinv, max_num_of_iters=50):
-
-    cur_weights = W.clone()
-    mults = torch.pow(torch.diag(Hinv), -1)
-    Hinv_U = torch.triu(Hinv, diagonal=1)
-
-    init_weights = W.clone()
-    for _ in range(max_num_of_iters):
-        cur_Q = quantize(cur_weights, scale, zero, maxq)
-
-        d_W = torch.mul((cur_weights - cur_Q), mults)
-        cur_weights = init_weights - torch.matmul(d_W, Hinv_U)
-        del d_W, cur_Q
-        d_W = cur_Q = None
-
-    del init_weights
-    init_weights = None
-
-    cur_Q = quantize(cur_weights, scale, zero, maxq)
-
-    return cur_Q, cur_weights
+from tico.quantization.algorithm.fpi_gptq.util import iterate_GPTQ, quantize
 
 
 class FPI_GPTQ:

diff --git a/tico/quantization/algorithm/fpi_gptq/util.py b/tico/quantization/algorithm/fpi_gptq/util.py
@@ -0,0 +1,56 @@
+# Copyright IST-DASLab. 2025. (commit: 2d65066). GitHub repository.
+# Retrieved from https://github.com/IST-DASLab/gptq. Licensed under the
+# Apache License 2.0.
+
+# Copyright (c) 2026 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# https://github.com/IST-DASLab/gptq/blob/2d65066/quant.py
+
+import torch
+
+
+def quantize(x, scale, zero, maxq):
+    if maxq < 0:
+        return (x > scale / 2).float() * scale + (x < zero / 2).float() * zero
+    q = torch.clamp(torch.round(x / scale) + zero, 0, maxq)
+    return scale * (q - zero)
+
+
+def iterate_GPTQ(scale, zero, maxq, W, Hinv, max_num_of_iters=50):
+
+    cur_weights = W.clone()
+    mults = torch.pow(torch.diag(Hinv), -1)
+    Hinv_U = torch.triu(Hinv, diagonal=1)
+
+    init_weights = W.clone()
+    for _ in range(max_num_of_iters):
+        cur_Q = quantize(cur_weights, scale, zero, maxq)
+
+        d_W = torch.mul((cur_weights - cur_Q), mults)
+        cur_weights = init_weights - torch.matmul(d_W, Hinv_U)
+        del d_W, cur_Q
+        d_W = cur_Q = None
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+
+    del init_weights
+    init_weights = None
+
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+
+    cur_Q = quantize(cur_weights, scale, zero, maxq)
+
+    return cur_Q, cur_weights
diff --git a/tico/quantization/algorithm/gptq/README.md b/tico/quantization/algorithm/gptq/README.md
@@ -52,6 +52,9 @@ applied after _convert()_, the effectiveness of GPTQ may be diminished.
 There are two options :
 1. `mse`- vanilla `mse`. Produce quantization parameters for GPTQ quantizer (`min`\`max`) which minimize mean squared error of quantization. $MSE_{MIN, MAX}(W) = argmin_{min, max}||W-Q_{min, max}(W)||^2$.
 2. `smse` - sensitivity-based `mse`. Use sensitivity of some global feature (e.g. float model logits) to parameters change to minimize global effect of quantization. $SMSE_{MIN, MAX}(W) = argmin_{min, max}|(W-Q_{min, max}(W))^2*Sensitivity(W)|$. So we try to keep `important` parameters unchanged, while quantizing `unimportant` parameters more aggressively.
+3. `smse_for_gptq` - `smse` adjusted for GPTQ. GPTQ modifies the matrix during the quantization process, so the most accurate method would consist in finding a quantizer that yields the smallest quantization error after the GPTQ method has been applied $SMSE\_FOR\_GPTQ_{MIN, MAX}(W) = argmin_{min, max}|(W-Q_{min, max}(W_{GPTQ}))^2*Sensitivity(W)|$. Since this would be quite computationally expensive, we can use an accelerated approximate GPTQ method — FPI_GPTQ $SMSE\_FOR\_GPTQ_{MIN, MAX}(W) = argmin_{min, max}|(W-Q_{min, max}(W_{FPI\_GPTQ}))^2*Sensitivity(W)|$. This is slower than `mse`/`smse` but can provide better accuracy.
+
+
 
 You can turn this feature `on`/`off` by using `mse` parameter of `GPTQConfig`:
 ```

diff --git a/tico/quantization/algorithm/gptq/gptq.py b/tico/quantization/algorithm/gptq/gptq.py
@@ -361,6 +361,8 @@ def fasterquant(
         H = torch.linalg.cholesky(H, upper=True).float()
         Hinv = H
 
+        self.quantizer.update(W, Hinv, perm)
+
         assert isinstance(Hinv, torch.Tensor)
         for i1 in range(0, self.columns, blocksize):
             i2 = min(i1 + blocksize, self.columns)