Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 1 addition & 23 deletions tico/quantization/algorithm/fpi_gptq/fpi_gptq.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,29 +32,7 @@
)

from tico.quantization.algorithm.gptq.quant import quantize, Quantizer


def iterate_GPTQ(scale, zero, maxq, W, Hinv, max_num_of_iters=50):
    """Run a fixed number of error-feedback passes over the weights ``W``.

    Every pass quantizes the current weight estimate, scales the
    quantization error column-wise by the reciprocal of ``diag(Hinv)``,
    multiplies it by the strictly upper-triangular part of ``Hinv`` and
    subtracts the product from the original weights ``W``.

    Args:
        scale: quantization scale forwarded to ``quantize``.
        zero: quantization zero point forwarded to ``quantize``.
        maxq: largest quantized level forwarded to ``quantize``.
        W: weights to process; never modified by this function.
            Presumably 2-D with ``W.shape[-1] == Hinv.shape[0]`` (the
            matmul below requires it) -- confirm against the caller.
        Hinv: square 2-D tensor; only its diagonal and strictly upper
            triangle are read.
        max_num_of_iters: number of passes to run. With ``0`` the result is
            simply ``quantize(W)`` together with a copy of ``W``.

    Returns:
        A tuple ``(cur_Q, cur_weights)``: the quantized final estimate and
        the (unquantized) final estimate itself.
    """
    # One clone is enough: it guarantees the returned estimate never aliases
    # the caller's tensor. W itself is only read below, so a second copy of
    # it would just double the memory held for the whole loop.
    cur_weights = W.clone()
    mults = torch.pow(torch.diag(Hinv), -1)
    Hinv_U = torch.triu(Hinv, diagonal=1)

    for _ in range(max_num_of_iters):
        cur_Q = quantize(cur_weights, scale, zero, maxq)

        d_W = torch.mul((cur_weights - cur_Q), mults)
        cur_weights = W - torch.matmul(d_W, Hinv_U)
        # Drop the temporaries now so they are not still alive while the
        # next pass allocates its own.
        del d_W, cur_Q

    cur_Q = quantize(cur_weights, scale, zero, maxq)

    return cur_Q, cur_weights
from tico.quantization.algorithm.fpi_gptq.util import iterate_GPTQ, quantize


class FPI_GPTQ:
Expand Down
56 changes: 56 additions & 0 deletions tico/quantization/algorithm/fpi_gptq/util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Copyright IST-DASLab. 2025. (commit: 2d65066). GitHub repository.
# Retrieved from https://github.com/IST-DASLab/gptq. Licensed under the
# Apache License 2.0.

# Copyright (c) 2026 Samsung Electronics Co., Ltd. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# https://github.com/IST-DASLab/gptq/blob/2d65066/quant.py

import torch


def quantize(x, scale, zero, maxq):
    """Fake-quantize ``x``: snap it to the quantization grid and map it back.

    Args:
        x: tensor to quantize.
        scale: step size of the grid (or the positive level when
            ``maxq < 0``).
        zero: zero point of the grid (or the negative level when
            ``maxq < 0``).
        maxq: largest integer level; a negative value selects the
            threshold-based branch below.

    Returns:
        A tensor of de-quantized values.
    """
    if maxq < 0:
        # Threshold mode: elements above scale / 2 become ``scale``,
        # elements below zero / 2 become ``zero``, everything else is 0.
        high_part = (x > scale / 2).float() * scale
        low_part = (x < zero / 2).float() * zero
        return high_part + low_part

    # Uniform affine grid: round to an integer level, shift by the zero
    # point, saturate to [0, maxq], then undo the shift and the scaling.
    levels = torch.round(x / scale) + zero
    levels = torch.clamp(levels, 0, maxq)
    return scale * (levels - zero)


def iterate_GPTQ(scale, zero, maxq, W, Hinv, max_num_of_iters=50):
    """Run a fixed number of error-feedback passes over the weights ``W``.

    Every pass quantizes the current weight estimate, scales the
    quantization error column-wise by the reciprocal of ``diag(Hinv)``,
    multiplies it by the strictly upper-triangular part of ``Hinv`` and
    subtracts the product from the original weights ``W``.

    Args:
        scale: quantization scale forwarded to ``quantize``.
        zero: quantization zero point forwarded to ``quantize``.
        maxq: largest quantized level forwarded to ``quantize``.
        W: weights to process; never modified by this function.
            Presumably 2-D with ``W.shape[-1] == Hinv.shape[0]`` (the
            matmul below requires it) -- confirm against the caller.
        Hinv: square 2-D tensor; only its diagonal and strictly upper
            triangle are read.
        max_num_of_iters: number of passes to run. With ``0`` the result is
            simply ``quantize(W)`` together with a copy of ``W``.

    Returns:
        A tuple ``(cur_Q, cur_weights)``: the quantized final estimate and
        the (unquantized) final estimate itself.
    """
    # One clone is enough: it guarantees the returned estimate never aliases
    # the caller's tensor. W itself is only read below, so a second copy of
    # it would just double the memory held for the whole loop.
    cur_weights = W.clone()
    mults = torch.pow(torch.diag(Hinv), -1)
    Hinv_U = torch.triu(Hinv, diagonal=1)

    for _ in range(max_num_of_iters):
        cur_Q = quantize(cur_weights, scale, zero, maxq)

        d_W = torch.mul((cur_weights - cur_Q), mults)
        cur_weights = W - torch.matmul(d_W, Hinv_U)
        # Drop the temporaries before asking the allocator to release its
        # cached blocks; otherwise empty_cache() has nothing to give back.
        del d_W, cur_Q
        # NOTE(review): emptying the cache on every pass trades speed for a
        # lower memory peak; kept as-is because it looks deliberate.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    cur_Q = quantize(cur_weights, scale, zero, maxq)

    return cur_Q, cur_weights
3 changes: 3 additions & 0 deletions tico/quantization/algorithm/gptq/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ applied after _convert()_, the effectiveness of GPTQ may be diminished.
There are three options:
1. `mse` - vanilla `mse`. Produces quantization parameters for the GPTQ quantizer (`min`/`max`) which minimize the mean squared error of quantization. $MSE_{MIN, MAX}(W) = argmin_{min, max}||W-Q_{min, max}(W)||^2$.
2. `smse` - sensitivity-based `mse`. Uses the sensitivity of some global feature (e.g. float model logits) to parameter changes to minimize the global effect of quantization. $SMSE_{MIN, MAX}(W) = argmin_{min, max}|(W-Q_{min, max}(W))^2*Sensitivity(W)|$. So we try to keep `important` parameters unchanged, while quantizing `unimportant` parameters more aggressively.
3. `smse_for_gptq` - `smse` adjusted for GPTQ. GPTQ modifies the weight matrix during the quantization process, so the most accurate approach is to find the quantizer that yields the smallest quantization error after GPTQ has been applied: $SMSE\_FOR\_GPTQ_{MIN, MAX}(W) = argmin_{min, max}|(W-Q_{min, max}(W_{GPTQ}))^2*Sensitivity(W)|$. Since this would be quite computationally expensive, we can instead use an accelerated approximation of GPTQ, FPI_GPTQ: $SMSE\_FOR\_GPTQ_{MIN, MAX}(W) = argmin_{min, max}|(W-Q_{min, max}(W_{FPI\_GPTQ}))^2*Sensitivity(W)|$. This is slower than `mse`/`smse` but can provide better accuracy.



You can turn this feature `on`/`off` by using `mse` parameter of `GPTQConfig`:
```
Expand Down
2 changes: 2 additions & 0 deletions tico/quantization/algorithm/gptq/gptq.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,8 @@ def fasterquant(
H = torch.linalg.cholesky(H, upper=True).float()
Hinv = H

self.quantizer.update(W, Hinv, perm)

assert isinstance(Hinv, torch.Tensor)
for i1 in range(0, self.columns, blocksize):
i2 = min(i1 + blocksize, self.columns)
Expand Down
Loading
Loading