Merged
Changes from 1 commit
40 commits
7e1bf9b
allow list
Jan 14, 2024
3b55a1e
update
Jan 14, 2024
345b4d6
save
Jan 14, 2024
f6bae6e
add
Jan 15, 2024
baa7b83
remove print lines
Jan 15, 2024
9024698
style
Jan 15, 2024
27fc796
support multi-image
Jan 17, 2024
67908bf
Merge branch 'main' into multi-ipadapter
yiyixuxu Jan 17, 2024
afd91e3
fix a typo
Jan 17, 2024
7d42455
Merge branch 'multi-ipadapter' of github.com:huggingface/diffusers in…
Jan 17, 2024
1fdcd7b
fix
Jan 17, 2024
45fb582
fix a bug!
yiyixuxu Jan 18, 2024
1a4c6b1
fix
Jan 19, 2024
d924c47
update
Jan 19, 2024
cc2aa1b
fix
Jan 19, 2024
2c86534
Merge branch 'main' into multi-ipadapter
yiyixuxu Jan 19, 2024
0049e44
Apply suggestions from code review
yiyixuxu Jan 24, 2024
4a3df90
ImageProjectionLayers -> image_projection_layers
Jan 24, 2024
ff96407
merge
Jan 24, 2024
193d6e8
fix
Jan 24, 2024
f7f2465
fix-copies
Jan 24, 2024
efa704a
update test
Jan 25, 2024
9abdcf9
add test
Jan 25, 2024
5e47ceb
add prepare_ip_adapter_image_embeds method so pipelines can copy from
Jan 25, 2024
c6670de
update all pipelines support ip-adapter
Jan 25, 2024
711387e
deprecate image_embeds as 3d tensor
Jan 25, 2024
bce309f
corrent num_images_per_prompt behavior
Jan 25, 2024
fae861e
fix batching behavior
Jan 26, 2024
21da205
revert for lcm and sd safe
Jan 26, 2024
accee6b
correct tests
Jan 26, 2024
816578f
update attention processer so backward compatible
Jan 29, 2024
e57103f
revert changes made to ipadapter attention processor to follow the de…
Jan 30, 2024
6cfa34b
Merge branch 'main' into multi-ipadapter
yiyixuxu Jan 30, 2024
1e68c64
add doc
Jan 30, 2024
2cc1561
update doc
Jan 30, 2024
475046e
update docstring
Jan 30, 2024
98fa0c2
add a slow test for multi
Jan 30, 2024
3a52ecb
remove ddim config
Jan 30, 2024
e742cf4
style
Jan 31, 2024
dcdde9c
Merge branch 'main' into multi-ipadapter
yiyixuxu Jan 31, 2024
update
yiyixuxu committed Jan 19, 2024
commit d924c47bb1ffe8c842df5fc20d8bf76218f5bc11
45 changes: 16 additions & 29 deletions src/diffusers/models/attention_processor.py
@@ -720,7 +720,6 @@ def __call__(
attention_mask: Optional[torch.FloatTensor] = None,
temb: Optional[torch.FloatTensor] = None,
scale: float = 1.0,
**kwargs,
) -> torch.Tensor:
residual = hidden_states

@@ -1197,7 +1196,6 @@ def __call__(
attention_mask: Optional[torch.FloatTensor] = None,
temb: Optional[torch.FloatTensor] = None,
scale: float = 1.0,
**kwargs,
) -> torch.FloatTensor:
residual = hidden_states
if attn.spatial_norm is not None:
@@ -2125,12 +2123,15 @@ def __call__(
self,
attn,
hidden_states,
encoder_hidden_states=None,
encoder_hidden_states,
Contributor

I'd keep encoder_hidden_states optional here to avoid a backwards-breaking change.
Collaborator Author
@yiyixuxu Jan 29, 2024

OK. But I don't think encoder_hidden_states was actually an optional argument in the previous code. It followed the same code structure as the default attention processor, so it would run when encoder_hidden_states is None, but that makes no sense: the IP-Adapter attention processor cannot be used for self-attention. So I think the =None in the signature was a mistake. If that's the case, it would not be a breaking change to remove =None, no?

Collaborator Author

Updated to follow the default attention processor more closely.

attention_mask=None,
temb=None,
**kwargs,
):
residual = hidden_states

encoder_hidden_states, ip_hidden_states = encoder_hidden_states
Contributor

Suggested change:

-encoder_hidden_states, ip_hidden_states = encoder_hidden_states
+if isinstance(encoder_hidden_states, tuple):
+    encoder_hidden_states, ip_hidden_states = encoder_hidden_states
+else:
+    end_pos = encoder_hidden_states.shape[1] - self.num_tokens
+    encoder_hidden_states, ip_hidden_states = (
+        encoder_hidden_states[:, :end_pos, :],
+        encoder_hidden_states[:, end_pos:, :],
+    )

Let's try to make the class backwards compatible.

if attn.spatial_norm is not None:
hidden_states = attn.spatial_norm(hidden_states, temb)

@@ -2140,28 +2141,18 @@ def __call__(
batch_size, channel, height, width = hidden_states.shape
hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

batch_size, sequence_length, _ = (
hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
)
batch_size, sequence_length, _ = encoder_hidden_states.shape

attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)

if attn.group_norm is not None:
hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

query = attn.to_q(hidden_states)

if encoder_hidden_states is None:
encoder_hidden_states = hidden_states
elif attn.norm_cross:
if attn.norm_cross:
encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

# split hidden states
end_pos = encoder_hidden_states.shape[1] - sum(self.num_tokens)
encoder_hidden_states, ip_hidden_states = (
encoder_hidden_states[:, :end_pos, :],
encoder_hidden_states[:, end_pos:, :],
)

key = attn.to_k(encoder_hidden_states)
value = attn.to_v(encoder_hidden_states)

@@ -2174,11 +2165,9 @@ def __call__(
hidden_states = attn.batch_to_head_dim(hidden_states)

# for ip-adapter
for num_token, scale, to_k_ip, to_v_ip in zip(self.num_tokens, self.scale, self.to_k_ip, self.to_v_ip):
current_ip_hidden_states, ip_hidden_states = (
ip_hidden_states[:, :num_token, :],
ip_hidden_states[:, num_token:, :],
)
for current_ip_hidden_states, scale, to_k_ip, to_v_ip in zip(
ip_hidden_states, self.scale, self.to_k_ip, self.to_v_ip
):
ip_key = to_k_ip(current_ip_hidden_states)
ip_value = to_v_ip(current_ip_hidden_states)

@@ -2254,13 +2243,15 @@ def __call__(
self,
attn,
hidden_states,
encoder_hidden_states=None,
encoder_hidden_states,
attention_mask=None,
temb=None,
ip_hidden_states=None,
**kwargs,
Contributor

Let's not add **kwargs here unnecessarily.

Contributor

IMO it's not unnecessary, and all processors should use this layout, as it is much more flexible and less error-prone if cross_attention_kwargs are actually used. At the moment it's not possible to use a processor with a custom signature without awkward cross_attention_kwargs dict restructuring in attention.py, a core file, because cross_attention_kwargs has to match every receiving function signature.

Collaborator Author

I agree that we don't need **kwargs here, but I'm slightly in favor of adding it: I think it's more readable and makes it easier to mix and match attention processors.

For example, the scale argument is not used in the IP-Adapter attention calculation at all. However, we have to add it to the signature; otherwise it won't work, because the IP-Adapter attention processor is used together with the default attention processors, which take a scale argument.

Likewise, if we want to add a new argument to the IP-Adapter attention processor, we would have to add that argument to the default attention processor's signature too (e.g. if we are going to support an IP-Adapter attention mask, we would have to add a new ip-attention-mask argument). I think that is more confusing than simply appending **kwargs to the signature.
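A minimal sketch of the constraint discussed in this thread, with hypothetical processor and dispatcher names rather than the actual diffusers classes: the same cross_attention_kwargs dict is forwarded to whichever processor is installed, so a processor that neither declares scale nor accepts **kwargs raises even though it never uses scale.

# Hypothetical sketch: the same kwargs dict is forwarded to every installed
# processor, so each one must accept every key, even keys it never uses.
class DefaultAttnProcessorSketch:
    def __call__(self, hidden_states, scale=1.0):
        return hidden_states * scale  # default attention uses `scale`

class IPAdapterProcessorStrict:
    def __call__(self, hidden_states):
        return hidden_states  # does not declare `scale`

class IPAdapterProcessorKwargs:
    def __call__(self, hidden_states, **kwargs):
        return hidden_states  # silently ignores `scale` and anything else

def run_attention(processor, hidden_states, cross_attention_kwargs):
    # core code forwards the same dict regardless of which processor is set
    return processor(hidden_states, **cross_attention_kwargs)

kwargs = {"scale": 0.8}
run_attention(DefaultAttnProcessorSketch(), 1.0, kwargs)   # works
run_attention(IPAdapterProcessorKwargs(), 1.0, kwargs)     # works, `scale` ignored
# run_attention(IPAdapterProcessorStrict(), 1.0, kwargs)   # TypeError: unexpected keyword argument 'scale'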

Member

I agree with Yiyi here.

Contributor

If you feel strongly here @yiyixuxu, go for it! I still don't think we should add it, though, because adding **kwargs makes it much harder for people to spot errors: if a function has a **kwargs input, it swallows everything.

E.g. if you have:

def forward(input_values, attention_scale=1.0, **kwargs):
    ...

and someone writes forward(input_values, atention_scale=5.0) (note the typo in attention_scale), the code still executes, but with attention_scale left at 1.0, because atention_scale is swallowed by kwargs.

Most programming languages don't allow something like **kwargs, for good reason in my opinion. It also sets a bad precedent and incentivizes people to add kwargs everywhere. And it significantly hurts readability: having a kwargs argument means you can never be sure what can be passed to the function, which makes the code harder to understand.

But again, I do see the upsides you mention above. Another possibility we could start to explore is validation function decorators, which are used quite heavily in huggingface_hub: https://github.com/huggingface/huggingface_hub/blob/c528f78fa7404869f32a2b04eac0f9964a5c6f9e/src/huggingface_hub/utils/_validators.py#L46. We could use such a validator to remove an argument like scale.
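A runnable sketch of the failure mode described above, using the illustrative function and argument names from the comment: the misspelled keyword disappears into **kwargs and the default value is used silently.

def forward(input_values, attention_scale=1.0, **kwargs):
    # **kwargs silently absorbs any keyword argument it does not recognize
    return input_values * attention_scale

# Misspelled keyword: `atention_scale` lands in kwargs, so attention_scale stays 1.0.
print(forward(2.0, atention_scale=5.0))  # prints 2.0, not 10.0

def forward_strict(input_values, attention_scale=1.0):
    return input_values * attention_scale

# Without **kwargs the same typo fails loudly:
# forward_strict(2.0, atention_scale=5.0)  # TypeError: unexpected keyword argument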

Member

Using validation decorators would be very nice to have. I am happy to take a look at it next week.
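A hypothetical sketch of what such a validation decorator could look like. This is not the huggingface_hub implementation linked above, just an illustration of the idea: keyword arguments the wrapped processor does not declare, such as scale, are dropped with a visible warning instead of being silently swallowed.

import functools
import inspect
import warnings

def filter_unexpected_kwargs(fn):
    # Hypothetical validator: drop keyword arguments the wrapped function
    # does not declare, and warn instead of failing or hiding them.
    params = inspect.signature(fn).parameters
    allowed = {
        name for name, p in params.items()
        if p.kind in (p.POSITIONAL_OR_KEYWORD, p.KEYWORD_ONLY)
    }

    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        accepted = {k: v for k, v in kwargs.items() if k in allowed}
        dropped = sorted(set(kwargs) - set(accepted))
        if dropped:
            warnings.warn(f"Ignoring unexpected keyword arguments: {dropped}")
        return fn(*args, **accepted)

    return wrapper

@filter_unexpected_kwargs
def ip_adapter_call(hidden_states, attention_mask=None):
    return hidden_states

# `scale` is not in the signature, so it is dropped (with a warning) rather than raising.
ip_adapter_call(1.0, scale=0.5)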

Collaborator Author

Thanks for the explanation! Makes sense! I will not add **kwargs for now :)

Collaborator

I posted a comment here to complement @patrickvonplaten's comment above. IMO using a decorator to avoid **kwargs is legitimate for some use cases but more debatable in others (just to keep in mind).

):
residual = hidden_states

encoder_hidden_states, ip_hidden_states = encoder_hidden_states

if attn.spatial_norm is not None:
hidden_states = attn.spatial_norm(hidden_states, temb)

@@ -2270,9 +2261,7 @@ def __call__(
batch_size, channel, height, width = hidden_states.shape
hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)

batch_size, sequence_length, _ = (
hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
)
batch_size, sequence_length, _ = encoder_hidden_states.shape

if attention_mask is not None:
attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
@@ -2285,9 +2274,7 @@ def __call__(

query = attn.to_q(hidden_states)

if encoder_hidden_states is None:
encoder_hidden_states = hidden_states
elif attn.norm_cross:
if attn.norm_cross:
encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

# split hidden states
4 changes: 1 addition & 3 deletions src/diffusers/models/unet_2d_condition.py
@@ -1075,9 +1075,7 @@ def forward(
)
image_embeds = added_cond_kwargs.get("image_embeds")
image_embeds = self.encoder_hid_proj(image_embeds)
if cross_attention_kwargs is None:
cross_attention_kwargs = {}
cross_attention_kwargs["ip_hidden_states"] = image_embeds
encoder_hidden_states = (encoder_hidden_states, image_embeds)

# 2. pre-process
sample = self.conv_in(sample)