Make state_dict and named_parameters work

Because merging/unmerging removes the registered parameter from the module, the parameter no longer appeared in state_dict and named_parameters. This commit fixes that bug by re-registering the parameter before those methods run. The same bug also exists in the referenced lora-torch library.
huggingface · BenjaminBossan · Jan 8, 2025 · Jan 5, 2024 · Jan 5, 2024 · Jan 5, 2024
commit 173062cd048a979d857c02195424413c414e8a07
diff --git a/src/peft/tuners/lora/layer.py b/src/peft/tuners/lora/layer.py
@@ -832,6 +832,27 @@ def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> torch.Tensor:
         result = (result[0].to(previous_dtype), result[1].to(previous_dtype) if result[1] is not None else result[1])
         return result
 
+    def _restore_weights(self):
+        # Restore the weights as registered parameters on the base layer.
+        # This is necessary because the way that weights are merged/unmerged (which is necessary for forward to work
+        # correctly), the Module "forgets" these attributes. Therefore, we need to call register_parameter explicitly.
+        # We cannot call register_parameter for merging/unmerging because that cuts them off from the autograd graph.
+        # Note that this is hacky, since we need to ensure that _restore_weights is called by each method that needs it.
+
+        # TODO work with separate weights
+        base_layer = self.get_base_layer()
+        weight = base_layer.in_proj_weight.data
+        del base_layer.in_proj_weight
+        base_layer.register_parameter("in_proj_weight", nn.Parameter(weight))
+
+    def state_dict(self, *args, **kwargs):
+        self._restore_weights()  # re-register in_proj_weight first so that it is part of the returned state dict
+        return super().state_dict(*args, **kwargs)
+
+    def named_modules(self, *args, **kwargs):
+        self._restore_weights()  # NOTE(review): torch's named_parameters() presumably iterates via this — confirm
+        return super().named_modules(*args, **kwargs)
+
     def __repr__(self) -> str:
         rep = super().__repr__()
         return "lora." + rep
diff --git a/tests/test_custom_models.py b/tests/test_custom_models.py
@@ -58,6 +58,7 @@
     ("Embedding + transformers Conv1D 3 LoRA", "EmbConv1D", LoraConfig, {"target_modules": ["emb", "conv1d"]}),
     ("Conv2d 1 LoRA", "Conv2d", LoraConfig, {"target_modules": ["conv2d"]}),
     ("Conv2d 2 LoRA", "Conv2d", LoraConfig, {"target_modules": ["conv2d", "lin0"]}),
+    ("MHA 1 LoRA", "MHA", LoraConfig, {"target_modules": ["mha"]}),
     #######
     # IA³ #
     #######
@@ -402,6 +403,21 @@ def forward(self, X):
         return X
 
 
+class ModelMha(nn.Module):  # small test model: multi-head self-attention -> linear -> log-softmax
+    def __init__(self):
+        super().__init__()
+        self.mha = nn.MultiheadAttention(10, 2)  # embed_dim=10, num_heads=2; the "mha" LoRA target module
+        self.lin0 = nn.Linear(10, 2)
+        self.sm = nn.LogSoftmax(dim=-1)
+
+    def forward(self, X):
+        X = X.float()
+        X, _ = self.mha(X, X, X)  # same tensor as query, key and value; the second return value is discarded
+        X = self.lin0(X)
+        X = self.sm(X)
+        return X
+
+
 class MockTransformerWrapper:
     """Mock class to behave like a transformers model.
 
@@ -426,6 +442,9 @@ def from_pretrained(cls, model_id, torch_dtype=None):
         if model_id == "Conv2d":
             return ModelConv2D().to(torch_dtype)
 
+        if model_id == "MHA":
+            return ModelMha().to(torch_dtype)
+
         raise ValueError(f"model_id {model_id} not implemented")
 
 
@@ -543,7 +562,9 @@ def test_only_params_are_updated(self, test_name, model_id, config_cls, config_k
         model_before = copy.deepcopy(model)
 
         model.train()
-        optimizer = torch.optim.SGD(model.parameters(), lr=0.5)
+        # we get exploding gradients with MHA when learning rate is too high
+        lr = 0.5 if "mha" not in model_id.lower() else 1e-3
+        optimizer = torch.optim.SGD(model.parameters(), lr=lr)
 
         # train at least 3 steps for all parameters to be updated (probably this is required because of symmetry
         # breaking of some LoRA layers that are initialized with constants)
@@ -580,7 +601,9 @@ def test_parameters_after_loading_model(self, test_name, model_id, config_cls, c
         )
         model = get_peft_model(model, config)
         model.train()
-        optimizer = torch.optim.SGD(model.parameters(), lr=0.5)
+        # we get exploding gradients with MHA when learning rate is too high
+        lr = 0.5 if "mha" not in model_id.lower() else 1e-3
+        optimizer = torch.optim.SGD(model.parameters(), lr=lr)
 
         # train at least 3 steps for all parameters to be updated (probably this is required because of symmetry
         # breaking of some LoRA layers that are initialized with constants)