Fix bugs in initial_load_in_hf when enable_weight_tying=true in Qwen3
Add checks for weight tying in state_dict processing
Achazwl authored Oct 29, 2025
commit d57d3a0806fec893dea911315f1781c8871dc10b
10 changes: 10 additions & 0 deletions torchtitan/models/qwen3/model/state_dict_adapter.py
@@ -104,6 +104,9 @@ def to_hf(self, state_dict: dict[str, Any]) -> dict[str, Any]:
else:
    if key not in to_hf_map:
        continue
    # Skip output.weight if weight tying is enabled (HF checkpoint won't have lm_head.weight)
    if self.model_args.enable_weight_tying and key == "output.weight":
@wwwjn (Contributor) commented on Oct 29, 2025:

Checking the 0.6B and 1.7B model weights, they do have separate entries for embed_tokens and lm_head, and I assume these two weights are the same (please correct me if I am wrong), so loading the same weights twice is OK here.

1.7B: https://huggingface.co/Qwen/Qwen3-1.7B/blob/main/model.safetensors.index.json
0.6B: https://huggingface.co/Qwen/Qwen3-0.6B/blob/main/model.safetensors

I see, your change makes sense. Our previous code would fail when loading the 4B model weights: the 4B model doesn't have "lm_head.weight" in its checkpoint files, but our translated hf_state_dict would still have the key lm_head.weight. Did you verify that the updated code is still on par with HF forward? cc @shuhuayu
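
For reference, a minimal sketch (not part of the PR) of how one might confirm this for a given Qwen3 checkpoint is below. It assumes huggingface_hub and safetensors are installed and that the checkpoint is a single model.safetensors file, as with Qwen3-0.6B.

```python
# Sketch: check whether a Qwen3 checkpoint ships a separate lm_head.weight
# and whether it equals model.embed_tokens.weight. Illustrative only.
import torch
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

path = hf_hub_download("Qwen/Qwen3-0.6B", "model.safetensors")
weights = load_file(path)

embed = weights["model.embed_tokens.weight"]
lm_head = weights.get("lm_head.weight")
if lm_head is None:
    print("No lm_head.weight stored; the checkpoint relies on weight tying.")
else:
    print("lm_head.weight equals embed_tokens.weight:", torch.equal(embed, lm_head))
```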

Contributor reply:

Thanks for catching the bug when loading the Qwen3 4B model. I did a forward parity check, and it works well.
[Image: forward parity check results]
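
For readers who cannot see the attached image, a parity check along these lines might look like the sketch below. This is an illustrative reconstruction, not the script actually used; the tolerances and the way each model's forward pass is wrapped are assumptions.

```python
# Sketch of a forward parity check: compare logits from two functions that each
# map input_ids to a [batch, seq, vocab] logits tensor (e.g., a wrapped HF model
# and a wrapped torchtitan model). Illustrative only.
import torch

def forward_parity(hf_forward, titan_forward, input_ids, atol=1e-4, rtol=1e-4):
    with torch.no_grad():
        hf_logits = hf_forward(input_ids)
        titan_logits = titan_forward(input_ids)
    print("max abs diff:", (hf_logits - titan_logits).abs().max().item())
    return torch.allclose(hf_logits, titan_logits, atol=atol, rtol=rtol)

# Example wiring for the HF side (the torchtitan side depends on how that model
# was built and loaded, so it is left to the caller):
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#   tok = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B")
#   hf_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-4B").eval()
#   ids = tok("The capital of France is", return_tensors="pt").input_ids
#   forward_parity(lambda x: hf_model(x).logits, titan_forward, ids)
```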

        continue
    new_key = to_hf_map[key]
    hf_state_dict[new_key] = value

@@ -118,6 +121,13 @@ def from_hf(self, hf_state_dict: dict[str, Any]) -> dict[str, Any]:
state_dict = {}
expert_weights_by_layer = {}  # {layer: {abstract_key: {expert_id: tensor}}}

# If weight tying is enabled and lm_head.weight is not in HF checkpoint,
# copy from embed_tokens.weight
if self.model_args.enable_weight_tying and "lm_head.weight" not in hf_state_dict:
    if "model.embed_tokens.weight" in hf_state_dict:
        hf_state_dict = dict(hf_state_dict)  # Make a copy to avoid modifying original
@wwwjn (Contributor) commented on Oct 29, 2025:

Do you need to make a shallow copy of the dict? Can you elaborate on "avoid modifying the original"?

Author reply:

Without dict(hf_state_dict), the line hf_state_dict["lm_head.weight"] = ... would directly mutate the dictionary object provided by the caller. I'm not sure whether the caller expects the input dictionary to be modified, so I made a copy to avoid any potential side effects.
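
As a small illustration of the semantics in question (not from the PR): dict(hf_state_dict) creates a new top-level mapping, so inserting lm_head.weight into the copy leaves the caller's dict untouched, while the tensor values themselves are still shared rather than duplicated.

```python
# Illustration of dict() shallow-copy semantics (toy values, not real tensors).
original = {"model.embed_tokens.weight": "embedding-tensor"}

copied = dict(original)  # new dict object, same value references
copied["lm_head.weight"] = copied["model.embed_tokens.weight"]

print("lm_head.weight" in original)   # False: caller's dict is not mutated
print("lm_head.weight" in copied)     # True
print(copied["lm_head.weight"] is original["model.embed_tokens.weight"])  # True: value shared, not copied
```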

Author follow-up:

If it's not necessary, I can revert this line.

Contributor reply:

Thanks. The input hf_state_dict will not be used after calling the from_hf() function:

state_dict = self.sd_adapter.from_hf(hf_state_dict)

It should be OK to mutate the dictionary object (hf_state_dict) directly.

Author reply:

> Thanks. The input hf_state_dict will not be used after calling the from_hf() function, so it should be OK to mutate the dictionary object (hf_state_dict) directly.

OK, I've removed the shallow copy.

        hf_state_dict["lm_head.weight"] = hf_state_dict["model.embed_tokens.weight"]

for key, value in hf_state_dict.items():
    if "mlp.experts" in key:
        abstract_key = re.sub(r"(\d+)", "{}", key, count=2)
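
Taken together, the from_hf change amounts to the small pre-processing step sketched below (a standalone illustration of the logic above, not torchtitan code), applied before the usual key-translation loop:

```python
from typing import Any

def fill_tied_lm_head(hf_state_dict: dict[str, Any], enable_weight_tying: bool) -> dict[str, Any]:
    """If weight tying is enabled and the checkpoint omits lm_head.weight,
    reuse model.embed_tokens.weight so downstream key translation finds it."""
    if (
        enable_weight_tying
        and "lm_head.weight" not in hf_state_dict
        and "model.embed_tokens.weight" in hf_state_dict
    ):
        hf_state_dict["lm_head.weight"] = hf_state_dict["model.embed_tokens.weight"]
    return hf_state_dict

# A tied checkpoint such as Qwen3-4B ships only the embedding tensor:
ckpt = {"model.embed_tokens.weight": "embedding-tensor"}
print("lm_head.weight" in fill_tied_lm_head(ckpt, enable_weight_tying=True))  # True
```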