[pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
pre-commit-ci[bot] committed Aug 14, 2024
commit 814cad77b1080490f5c3f559fd47ca137d9f7d0e
22 changes: 11 additions & 11 deletions neural_compressor/torch/algorithms/weight_only/gptq.py
@@ -795,9 +795,7 @@ def tmp(_, inp, out):

gptq_post_block[layer_name] = GPTQ(sub_layers[layer_name], W, self.device)
# gptq_for_this_block[layer_name].quantizer = Quantizer()
- gptq_post_block[layer_name].quantizer.configure(
-     weight_config_this_layer
- )
+ gptq_post_block[layer_name].quantizer.configure(weight_config_this_layer)
# generate the gptq quantizer
handles = [] # register handles which add inputs and outputs to gptq object
for layer_name in sub_layers:
@@ -860,7 +858,7 @@ def tmp(_, inp, out):
# save perm for restoring the weights, but only when static_groups is not enabled.
gptq_config[full_layer_name]["perm"] = gptq_post_block[full_layer_name].perm
gptq_post_block[layer_name].free()

# 2.7.2 lm_head: export to compressed model
for layer_name in sub_layers:
full_layer_name = self.gptq_related_blocks["transformers_post"]["name"]
@@ -874,17 +872,17 @@
gptq_perm = gptq_config[full_layer_name]["perm"]
else:
gptq_perm = None
- if self.use_layer_wise: # pragma: no cover
-     state_dict = torch.load(
-         LWQ_WORKSPACE + f"/{full_layer_name}.pt"
-     )
+ if self.use_layer_wise:  # pragma: no cover
+     state_dict = torch.load(LWQ_WORKSPACE + f"/{full_layer_name}.pt")
Q = state_dict["weight"].data
bias = state_dict["bias"] if "bias" in state_dict.keys() else None
else:
Q = sub_layers[layer_name].weight.data
if weight_config_this_layer["act_order"]:
Q.copy_(Q[:, gptq_perm])
- if is_transformers_imported() and isinstance(sub_layers[layer_name], transformers.Conv1D): # pragma: no cover
+ if is_transformers_imported() and isinstance(
+     sub_layers[layer_name], transformers.Conv1D
+ ): # pragma: no cover
Q = Q.t_().contiguous()
from .utility import quant_weight_w_scale

@@ -903,14 +901,16 @@
if isinstance(sub_layers[layer_name], torch.nn.Linear):
in_features = sub_layers[layer_name].in_features
out_features = sub_layers[layer_name].out_features
- elif is_transformers_imported() and isinstance(sub_layers[layer_name], transformers.Conv1D): # pragma: no cover
+ elif is_transformers_imported() and isinstance(
+     sub_layers[layer_name], transformers.Conv1D
+ ): # pragma: no cover
in_features = sub_layers[layer_name].weight.shape[0]
out_features = sub_layers[layer_name].weight.shape[1]
int_weight = sub_layers[layer_name].weight.t_().contiguous()
scale = scale.t_().contiguous()
zp = zp.t_().contiguous() if zp is not None else zp

- if not self.use_layer_wise: # pragma: no cover
+ if not self.use_layer_wise:  # pragma: no cover
bias = sub_layers[layer_name].bias

new_module = INCWeightOnlyLinear(
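
For context on the layer-wise branch touched above: when use_layer_wise is enabled, the lm_head weights are not read from the module in memory but reloaded from a per-layer checkpoint under LWQ_WORKSPACE, as in torch.load(LWQ_WORKSPACE + f"/{full_layer_name}.pt"). The snippet below is a minimal, self-contained sketch of that save/reload pattern using a temporary directory and plain torch.save/torch.load; only the workspace name and the weight/bias keys come from the diff, while the helper names and everything else are illustrative assumptions, not the library's actual implementation.

import os
import tempfile

import torch

# Hypothetical stand-in for the LWQ_WORKSPACE directory referenced in the diff.
LWQ_WORKSPACE = tempfile.mkdtemp(prefix="lwq_workspace_")


def offload_layer(full_layer_name: str, module: torch.nn.Linear) -> None:
    """Dump a layer's tensors to the workspace so the module can be dropped from memory."""
    state = {"weight": module.weight.detach().clone()}
    if module.bias is not None:
        state["bias"] = module.bias.detach().clone()
    torch.save(state, os.path.join(LWQ_WORKSPACE, f"{full_layer_name}.pt"))


def reload_layer(full_layer_name: str):
    """Mirror of the diff's layer-wise branch: read weight/bias back from the per-layer file."""
    state_dict = torch.load(os.path.join(LWQ_WORKSPACE, f"{full_layer_name}.pt"))
    Q = state_dict["weight"].data
    bias = state_dict["bias"] if "bias" in state_dict else None
    return Q, bias


# Usage: offload a toy lm_head, then reload it by name.
lm_head = torch.nn.Linear(8, 16)
offload_layer("lm_head", lm_head)
Q, bias = reload_layer("lm_head")
assert Q.shape == (16, 8) and bias is not None
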
18 changes: 9 additions & 9 deletions test/3x/torch/quantization/weight_only/test_gptq.py
@@ -195,17 +195,19 @@ def test_layer_wise(self, quant_lm_head=False):
model = load_empty_model("hf-internal-testing/tiny-random-GPTJForCausalLM")

quant_config = GPTQConfig(
-     use_layer_wise=True,
+     use_layer_wise=True,
quant_lm_head=quant_lm_head,
-     model_path="hf-internal-testing/tiny-random-GPTJForCausalLM")
+     model_path="hf-internal-testing/tiny-random-GPTJForCausalLM",
+ )
model = prepare(model, quant_config)
run_fn(model)
model = convert(model)
out = model(self.example_inputs)[0]
- assert (torch.equal(out, q_label)
- ), f"use_layer_wise=True and quant_lm_head={quant_lm_head} output should be same. Please double check."
+ assert torch.equal(
+     out, q_label
+ ), f"use_layer_wise=True and quant_lm_head={quant_lm_head} output should be same. Please double check."
if not quant_lm_head:
- self.test_layer_wise(quant_lm_head=True) # Avoid errors raised by @pytest.mark.parametrize
+ self.test_layer_wise(quant_lm_head=True)  # Avoid errors raised by @pytest.mark.parametrize

def test_true_sequential(self):
# true_sequential=False
@@ -232,7 +234,7 @@ def test_true_sequential(self):
assert (
atol_false < atol_true
), "true_sequential=True doesn't help accuracy, maybe is reasonable, please double check."

def test_quant_lm_head(self):
# quant_lm_head=False
model = copy.deepcopy(self.tiny_gptj)
@@ -258,9 +260,7 @@ def test_quant_lm_head(self):
assert (
atol_false < atol_true
), "quant_lm_head=True doesn't help accuracy, maybe is reasonable, please double check."
- assert (
-     get_woq_linear_num(model, "INCWeightOnlyLinear") == 31
- ), "Incorrect number of INCWeightOnlyLinear modules"
+ assert get_woq_linear_num(model, "INCWeightOnlyLinear") == 31, "Incorrect number of INCWeightOnlyLinear modules"

@pytest.mark.parametrize("dtype", ["nf4", "int4"])
@pytest.mark.parametrize("double_quant_bits", [6])
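
The tightened assertion above checks how many INCWeightOnlyLinear modules the converted model contains. A helper such as get_woq_linear_num is most plausibly a walk over model.named_modules() that matches on the class name; the sketch below shows that pattern against a plain torch.nn model and is an assumption about the test utility, not its actual code.

import torch


def get_woq_linear_num(model: torch.nn.Module, cls_name: str) -> int:
    """Count submodules whose class name matches cls_name (e.g. "INCWeightOnlyLinear")."""
    return sum(1 for _, module in model.named_modules() if type(module).__name__ == cls_name)


# Usage with a stand-in model: two nn.Linear layers and one activation.
toy = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.ReLU(), torch.nn.Linear(4, 2))
assert get_woq_linear_num(toy, "Linear") == 2
assert get_woq_linear_num(toy, "ReLU") == 1
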