Merged
Changes from 1 commit (of the 23 commits in this pull request). Commit history:
0658a83  add inc woq and remove itrex dependency (changwangss, Aug 27, 2024)
4955b8a  Update optimum/intel/neural_compressor/modeling_base.py (changwangss, Aug 29, 2024)
7fe5ac5  Update optimum/intel/neural_compressor/modeling_base.py (changwangss, Aug 29, 2024)
1d6797c  Update optimum/intel/neural_compressor/modeling_base.py (changwangss, Aug 29, 2024)
ab178e9  Update optimum/intel/neural_compressor/modeling_base.py (changwangss, Aug 29, 2024)
c078ca2  fix code according comment (changwangss, Aug 29, 2024)
c257101  add logger setting (changwangss, Aug 29, 2024)
d55004b  improve ut (changwangss, Aug 29, 2024)
fcadbac  move woq quantization to quantization.py (changwangss, Sep 5, 2024)
8cf22de  Update examples/neural_compressor/language-modeling/run_clm.py (changwangss, Sep 5, 2024)
a31fc6a  Update examples/neural_compressor/language-modeling/run_clm.py (changwangss, Sep 5, 2024)
3b5f228  remove dependency (changwangss, Sep 5, 2024)
7f8c2a2  Update examples/neural_compressor/language-modeling/run_clm.py (IlyasMoutawwakil, Sep 5, 2024)
6eba7c4  add woq saving and loading ut and logger info (changwangss, Sep 5, 2024)
2683608  Merge branch 'main' into wangchang/inc_woq (changwangss, Sep 5, 2024)
1401c89  set transformers version limit (changwangss, Sep 5, 2024)
bc3b95a  fix installation neural_compressor[pt] (changwangss, Sep 6, 2024)
99f797d  improve ut (changwangss, Sep 6, 2024)
8321a24  refactoring (echarlaix, Sep 6, 2024)
08091bc  Refactor (echarlaix, Sep 6, 2024)
09acbd9  revert (echarlaix, Sep 6, 2024)
28a10d9  fix datasets loading issue (changwangss, Sep 9, 2024)
1ad67f1  fix (echarlaix, Sep 9, 2024)
add woq saving and loading ut and logger info
Signed-off-by: changwangss <[email protected]>
changwangss committed Sep 5, 2024
commit 6eba7c4ef94b8d930a4d17cbf38a1ea1442a047d
6 changes: 6 additions & 0 deletions optimum/intel/neural_compressor/modeling_base.py
@@ -143,6 +143,9 @@ def _from_pretrained(
                "Weight-only quantization model loading provided by intel_extension_for_transformers is deprecated; it is now provided by INC.",
                DeprecationWarning,
            )
+            logger.info(
+                "Loading a weight-only quantized model supports only the same format as GPTQ, e.g. https://huggingface.co/TheBloke/Llama-2-7B-Chat-GPTQ/tree/main."
+            )
            _BaseINCAutoModelClass.ORIG_MODEL = cls.auto_model_class
            model = _BaseINCAutoModelClass.load_low_bit(
                model_id,
@@ -165,6 +168,9 @@ def _from_pretrained(
                "Weight-only quantization provided by intel_extension_for_transformers is deprecated; it is now provided by INC.",
                DeprecationWarning,
            )
+            logger.info(
+                "The quantized model parameters will be saved in the same format as GPTQ; see https://huggingface.co/TheBloke/Llama-2-7B-Chat-GPTQ/tree/main for a sample model."
+            )
            model = weight_only_quantization(
                cls.auto_model_class,
                model_id,
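For downstream users, the effect of this branch is that a checkpoint stored in the GPTQ format can be loaded directly through `INCModelForCausalLM`. A minimal sketch of that flow (the checkpoint id is illustrative, and it assumes an environment with the neural-compressor extra installed):

```python
from transformers import AutoTokenizer
from optimum.intel import INCModelForCausalLM

# Illustrative GPTQ-format checkpoint; any checkpoint in the same format should load the same way.
model_id = "TheBloke/Llama-2-7B-Chat-GPTQ"
model = INCModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)

inputs = tokenizer("Hello, my name is", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```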
2 changes: 1 addition & 1 deletion setup.py
@@ -59,7 +59,7 @@
QUALITY_REQUIRE = ["black~=23.1", "ruff==0.4.4"]

EXTRAS_REQUIRE = {
-    "neural-compressor": ["neural-compressor>3.0", "accelerate", "transformers"],
+    "neural-compressor": ["neural-compressor>3.0", "accelerate", "transformers<4.43"],
    "openvino": ["openvino>=2023.3", "nncf>=2.11.0", "openvino-tokenizers[transformers]"],
    "nncf": ["nncf>=2.11.0"],
    "ipex": ["intel-extension-for-pytorch", "transformers>=4.39.0,<4.44.0"],
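The tightened pin follows the earlier commit "set transformers version limit", which suggests the INC weight-only path had not been validated against transformers 4.43 at the time. A defensive check of the same constraint might look like this sketch (not part of the PR; `packaging` is assumed available):

```python
# Illustrative guard mirroring the new "transformers<4.43" pin; not part of this PR.
import transformers
from packaging import version

if version.parse(transformers.__version__) >= version.parse("4.43"):
    raise RuntimeError(
        f"transformers {transformers.__version__} exceeds the <4.43 pin "
        "declared by the neural-compressor extra."
    )
```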
29 changes: 28 additions & 1 deletion tests/neural_compressor/test_modeling.py
@@ -21,6 +21,7 @@
import torch
from parameterized import parameterized
from transformers import AutoTokenizer, pipeline, set_seed
+from transformers.utils import SAFE_WEIGHTS_NAME

from optimum.exporters import TasksManager
from optimum.intel import ( # noqa
@@ -38,7 +39,7 @@
    INCStableDiffusionPipeline,
    INCTrainer,
)
-from optimum.intel.neural_compressor.utils import _HEAD_TO_AUTOMODELS, WEIGHTS_NAME
+from optimum.intel.neural_compressor.utils import _HEAD_TO_AUTOMODELS, QUANTIZATION_CONFIG_NAME, WEIGHTS_NAME


os.environ["CUDA_VISIBLE_DEVICES"] = ""
@@ -145,6 +146,32 @@ def test_compare_with_and_without_past_key_values(self):
        self.assertEqual(outputs_without_pkv.shape[1], self.GENERATION_LENGTH)
        self.assertTrue(torch.equal(outputs_with_pkv, outputs_without_pkv))

+    def test_saving_loading_inc_woq_model(self):
+        model_name = "TheBlokeAI/Mixtral-tiny-GPTQ"
+        subfolder = "inc"
+        model = INCModelForCausalLM.from_pretrained(model_name, revision="inc", subfolder=subfolder)
+        tokenizer = AutoTokenizer.from_pretrained(model_name, revision="inc")
+        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
+        tokens = tokenizer("This is a sample output", return_tensors="pt")
+
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            model_save_dir = Path(tmp_dir) / subfolder
+            model.save_pretrained(model_save_dir)
+            folder_contents = os.listdir(model_save_dir)
+            self.assertIn(SAFE_WEIGHTS_NAME, folder_contents)
+            self.assertIn(QUANTIZATION_CONFIG_NAME, folder_contents)
+            loaded_model = INCModelForCausalLM.from_pretrained(tmp_dir, subfolder=subfolder)
+
+        with torch.no_grad():
+            outputs = model(**tokens)
+            loaded_outputs = loaded_model(**tokens)
+
+        self.assertTrue("logits" in loaded_outputs)
+        self.assertIsInstance(loaded_outputs.logits, torch.Tensor)
+        self.assertTrue("past_key_values" in loaded_outputs)
+        self.assertIsInstance(loaded_outputs.past_key_values, tuple)
+        self.assertTrue(torch.allclose(outputs.logits, loaded_outputs.logits, atol=1e-5))
+
    def test_saving_loading_inc_model(self):
        model_name = "echarlaix/tiny-random-PhiForCausalLM"
        subfolder = "inc"
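Outside the test suite, the round trip exercised by `test_saving_loading_inc_woq_model` would look roughly like the following sketch (the checkpoint id mirrors the test, and the local save path is illustrative):

```python
import torch
from transformers import AutoTokenizer
from optimum.intel import INCModelForCausalLM

# Checkpoint id taken from the test above; the revision/subfolder layout is specific to it.
model_id = "TheBlokeAI/Mixtral-tiny-GPTQ"
model = INCModelForCausalLM.from_pretrained(model_id, revision="inc", subfolder="inc")
tokenizer = AutoTokenizer.from_pretrained(model_id, revision="inc")

save_dir = "./woq_model"  # illustrative local path
model.save_pretrained(save_dir)  # writes the safetensors weights plus the quantization config checked above
reloaded = INCModelForCausalLM.from_pretrained(save_dir)

inputs = tokenizer("This is a sample output", return_tensors="pt")
with torch.no_grad():
    assert torch.allclose(model(**inputs).logits, reloaded(**inputs).logits, atol=1e-5)
```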