merge master

Signed-off-by: sdp <[email protected]>
intel · Kaihui-intel · Jul 16, 2024 · Jun 25, 2024 · Jun 25, 2024 · Jun 25, 2024
commit 483c219eb037cd211adb41da14a88a3ff358691f
diff --git a/neural_compressor/torch/algorithms/weight_only/rtn.py b/neural_compressor/torch/algorithms/weight_only/rtn.py
@@ -19,8 +19,7 @@
 # limitations under the License.
 
 
-import gc
-import os
+import copy
 from collections import OrderedDict
 
 import torch
@@ -76,6 +75,7 @@ def convert(
         use_mse_search=False,
         use_layer_wise=False,
         model_path="",
+        quant_lm_head=False,
         *args,
         **kwargs,
     ):
@@ -104,20 +104,12 @@ def convert(
         device = get_accelerator(kwargs.pop("device", "auto")).current_device_name()
         model_device = get_model_device(model)  # return model on the same device
 
-        # Put model on device explicitly
-        # TODO: refine it later, Put module on device one by one instead of the whole model
-        if not use_layer_wise:
-           model.to(device)
+        # For transformers models: if lm_head is tied to the embedding, deep-copy each tied weight.
+        if quant_lm_head and getattr(getattr(model, "config", None), "tie_word_embeddings", False):
+            for key in model._tied_weights_keys:
+                weight = get_attr(model, key)
+                set_attr(model, key, copy.deepcopy(weight))
 
-        total_time = 0.0
-        total_load_time = 0.0
-        total_save_time = 0.0
-        total_quant_time = 0.0
-        total_quant_int_time = 0.0
-        total_set_module_time = 0.0
-        save_time = 0.0
-        layer_time = 0.0
-        import time
         assert isinstance(model, torch.nn.Module), "only support torch module"
         if is_transformers_imported():
             supported_layers = (torch.nn.Linear, transformers.Conv1D)
@@ -193,8 +185,6 @@ def convert(
                 continue
             logger.debug(f"RTN quantized module:{name, m}")
             logger.debug(log_msg)
-
-
 
             if use_layer_wise:
                 load_module(model, name, model_path, device=device)

diff --git a/neural_compressor/torch/quantization/algorithm_entry.py b/neural_compressor/torch/quantization/algorithm_entry.py
@@ -93,6 +93,7 @@ def rtn_entry(
         {
             "use_layer_wise": quant_config.use_layer_wise,
             "model_path": quant_config.model_path,
+            "quant_lm_head": quant_config.quant_lm_head,
         }
     )
     quantizer = get_quantizer(model, quantizer_cls=RTNQuantizer, quant_config=weight_config)