update ut&doc
Signed-off-by: Kaihui-intel <[email protected]>
Kaihui-intel committed Aug 13, 2024
commit 31bc09d234fd6d0e49ed629579558d8dd51db3de
5 changes: 3 additions & 2 deletions docs/source/3x/PT_WeightOnlyQuant.md
@@ -111,9 +111,10 @@ model = convert(model)
| model_path (str) | Model path that is used to load state_dict per layer | |
| use_double_quant (bool) | Enables double quantization | False |
| act_order (bool) | Whether to sort Hessian's diagonal values to rearrange channel-wise quantization order | False |
- | percdamp (float) | Percentage of Hessian's diagonal values' average, which will be added to Hessian's diagonal to increase numerical stability | 0.01. |
+ | percdamp (float) | Percentage of Hessian's diagonal values' average, which will be added to Hessian's diagonal to increase numerical stability | 0.01 |
| block_size (int) | Execute GPTQ quantization per block, block shape = [C_out, block_size] | 128 |
- | static_groups (bool) | Whether to calculate group-wise quantization parameters in advance. This option mitigates act_order's extra computational requirements. | False. |
+ | static_groups (bool) | Whether to calculate group-wise quantization parameters in advance. This option mitigates act_order's extra computational requirements. | False |
+ | true_sequential (bool) | Whether to quantize layers within a transformer block in their original order. This can lead to higher accuracy but a slower overall quantization process. | False |
> **Note:** `model_path` is only used when `use_layer_wise=True`. Full `layer-wise` support is coming soon.

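A minimal sketch tying the table above to the `prepare`/`convert` flow this document describes; `user_model` and `run_calibration` are hypothetical placeholders for your model and calibration loop, and the keyword arguments are the options listed in the table:

``` python
# A minimal sketch, assuming the neural_compressor 3.x PyTorch API used in this PR.
# `user_model` and `run_calibration` are hypothetical placeholders.
from neural_compressor.torch.quantization import GPTQConfig, convert, prepare

quant_config = GPTQConfig(
    act_order=True,  # reorder channels by the Hessian's diagonal values
    percdamp=0.01,  # dampen the Hessian's diagonal for numerical stability
    block_size=128,
    static_groups=False,
    true_sequential=True,  # quantize layers inside each transformer block in their original order
)
model = prepare(user_model, quant_config)
run_calibration(model)  # feed calibration samples through the prepared model
model = convert(model)  # produce the weight-only quantized model
```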
26 changes: 26 additions & 0 deletions test/3x/torch/quantization/weight_only/test_gptq.py
@@ -201,6 +201,32 @@ def test_layer_wise(self):
        out = model(self.example_inputs)[0]
        assert torch.equal(out, q_label), "use_layer_wise=True output should be same. Please double check."

+    def test_true_sequential(self):
+        # true_sequential=False
+        model = copy.deepcopy(self.tiny_gptj)
+        quant_config = GPTQConfig(
+            true_sequential=False,
+        )
+        model = prepare(model, quant_config)
+        run_fn(model)
+        model = convert(model)
+        out = model(self.example_inputs)[0]
+        atol_false = (out - self.label).amax()
+        # true_sequential=True
+        model = copy.deepcopy(self.tiny_gptj)
+        quant_config = GPTQConfig(
+            true_sequential=True,
+        )
+        model = prepare(model, quant_config)
+        run_fn(model)
+        model = convert(model)
+        out = model(self.example_inputs)[0]
+        atol_true = (out - self.label).amax()
+        # Compare atol; this tiny model is an ideal case where quantizing in the
+        # original (true sequential) order is expected to give a smaller max error.
+        assert (
+            atol_false > atol_true
+        ), "true_sequential=True doesn't help accuracy; this may be reasonable, please double check."


    @pytest.mark.parametrize("dtype", ["nf4", "int4"])
    @pytest.mark.parametrize("double_quant_bits", [6])
    @pytest.mark.parametrize("double_quant_group_size", [8, 256])
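To exercise just the new case locally, one possible invocation is sketched below; the file path comes from this diff, while `pytest.main` and the `-k` filter are standard pytest, not INC-specific:

``` python
# A minimal sketch for running only the new true_sequential test.
import pytest

pytest.main(
    [
        "test/3x/torch/quantization/weight_only/test_gptq.py",
        "-k",
        "test_true_sequential",
        "-v",
    ]
)
```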