patch 1

Signed-off-by: xin3he <[email protected]>
intel · xin3he · Jul 9, 2024 · Jul 1, 2024 · Jul 1, 2024 · Jul 3, 2024
commit c97b6d83cfe2750ca39b923657566182ea759485
diff --git a/neural_compressor/adaptor/pytorch.py b/neural_compressor/adaptor/pytorch.py
@@ -3579,6 +3579,16 @@ def quantize(self, tune_cfg, model, dataloader, q_func=None):
             return q_model
 
         self.tune_cfg["fx_sub_module_list"] = self.sub_module_list
+
+        # BF16 fallback: run bf16_convert before the approach-specific preparation below
+        if (
+            len(self.tune_cfg["bf16_ops_list"]) > 0
+            and self.version.release >= Version("1.11.0").release
+            and self.use_bf16
+            and (CpuInfo().bf16 or os.getenv("FORCE_BF16") == "1")
+        ):  # pragma: no cover
+            q_model._model = torch_utils.bf16_convert.Convert(q_model._model, self.tune_cfg)
+
         if self.approach == "quant_aware_training":
             q_model._model.train()
             if self.sub_module_list is None:
@@ -3665,14 +3675,6 @@ def quantize(self, tune_cfg, model, dataloader, q_func=None):
                 self.sub_module_list, q_model._model, prefix="", custom_config=self.prepare_custom_config_dict
             )
 
-        if (
-            len(self.tune_cfg["bf16_ops_list"]) > 0
-            and self.version.release >= Version("1.11.0").release
-            and self.use_bf16
-            and (CpuInfo().bf16 or os.getenv("FORCE_BF16") == "1")
-        ):  # pragma: no cover
-            q_model._model = torch_utils.bf16_convert.Convert(q_model._model, self.tune_cfg)
-
         self.fused_dict = self.get_fused_list(q_model.model)
         q_model.is_quantized = True
         q_model.q_config = copy.deepcopy(self.tune_cfg)