
Commit 491a933

[I2VGenXL] attention_head_dim in the UNet (huggingface#6872)
* attention_head_dim
* debug
* print more info
* correct num_attention_heads behaviour
* down_block_num_attention_heads -> num_attention_heads.
* correct the image link in doc.
* add: deprecation for num_attention_head
* fix: test argument to use attention_head_dim
* more fixes.
* quality
* address comments.
* remove deprecation.
1 parent aa82df5 commit 491a933

4 files changed: 15 additions & 3 deletions

src/diffusers/models/attention.py
Lines changed: 1 addition & 0 deletions

@@ -158,6 +158,7 @@ def __init__(
        super().__init__()
        self.only_cross_attention = only_cross_attention

+       # We keep these boolean flags for backward-compatibility.
        self.use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
        self.use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
        self.use_ada_layer_norm_single = norm_type == "ada_norm_single"
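
For orientation, here is a minimal standalone sketch (a hypothetical helper, not diffusers code) of how these backward-compatibility flags resolve from `norm_type`:

# Hypothetical helper mirroring the flag logic in the hunk above.
def resolve_norm_flags(norm_type: str, num_embeds_ada_norm=None):
    use_ada_layer_norm_zero = (num_embeds_ada_norm is not None) and norm_type == "ada_norm_zero"
    use_ada_layer_norm = (num_embeds_ada_norm is not None) and norm_type == "ada_norm"
    use_ada_layer_norm_single = norm_type == "ada_norm_single"
    return use_ada_layer_norm, use_ada_layer_norm_zero, use_ada_layer_norm_single

print(resolve_norm_flags("ada_norm", num_embeds_ada_norm=8))  # (True, False, False)
print(resolve_norm_flags("ada_norm_single"))                  # (False, False, True)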

src/diffusers/models/unets/unet_i2vgen_xl.py
Lines changed: 11 additions & 1 deletion

@@ -120,6 +120,7 @@ class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
            If `None`, normalization and activation layers is skipped in post-processing.
        cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features.
+       attention_head_dim (`int`, *optional*, defaults to 64): Attention head dim.
        num_attention_heads (`int`, *optional*): The number of attention heads.
    """

@@ -147,10 +148,19 @@ def __init__(
        layers_per_block: int = 2,
        norm_num_groups: Optional[int] = 32,
        cross_attention_dim: int = 1024,
-       num_attention_heads: Optional[Union[int, Tuple[int]]] = 64,
+       attention_head_dim: Union[int, Tuple[int]] = 64,
+       num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
    ):
        super().__init__()

+       # When we first integrated the UNet into the library, we didn't have `attention_head_dim`. As a consequence
+       # of that, we used `num_attention_heads` for arguments that actually denote attention head dimension. This
+       # is why we ignore `num_attention_heads` and calculate it from `attention_head_dim` below.
+       # This is still an incorrect way of calculating `num_attention_heads` but we need to stick to it
+       # without running proper deprecation cycles for the {down,mid,up} blocks which are a
+       # part of the public API.
+       num_attention_heads = attention_head_dim
+
        # Check inputs
        if len(down_block_types) != len(up_block_types):
            raise ValueError(
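
The comment above is the core of the change: the constructor now treats `attention_head_dim` as the source of truth and ignores whatever `num_attention_heads` a caller passes. A tiny illustrative sketch of that effective behaviour (not the real class):

def effective_num_attention_heads(attention_head_dim, num_attention_heads=None):
    # Mirrors `num_attention_heads = attention_head_dim` in the __init__ above:
    # the caller-supplied num_attention_heads is ignored.
    return attention_head_dim

assert effective_num_attention_heads(64) == 64
assert effective_num_attention_heads(64, num_attention_heads=8) == 64  # caller value is ignored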

src/diffusers/pipelines/i2vgen_xl/pipeline_i2vgen_xl.py
Lines changed: 1 addition & 1 deletion

@@ -46,7 +46,7 @@
        >>> pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16")
        >>> pipeline.enable_model_cpu_offload()

-       >>> image_url = "https://github.com/ali-vilab/i2vgen-xl/blob/main/data/test_images/img_0009.png?raw=true"
+       >>> image_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"
        >>> image = load_image(image_url).convert("RGB")

        >>> prompt = "Papers were floating in the air on a table in the library"
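
Only the example image URL changes here. For context, a hedged sketch of how a docstring example like this typically finishes (argument values and the output path are illustrative, not quoted from the file):

        >>> import torch
        >>> from diffusers.utils import export_to_gif
        >>> generator = torch.manual_seed(0)
        >>> frames = pipeline(prompt=prompt, image=image, generator=generator).frames[0]
        >>> export_to_gif(frames, "i2v.gif")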

tests/pipelines/i2vgen_xl/test_i2vgenxl.py
Lines changed: 2 additions & 1 deletion

@@ -80,7 +80,8 @@ def get_dummy_components(self):
            down_block_types=("CrossAttnDownBlock3D", "DownBlock3D"),
            up_block_types=("UpBlock3D", "CrossAttnUpBlock3D"),
            cross_attention_dim=4,
-           num_attention_heads=4,
+           attention_head_dim=4,
+           num_attention_heads=None,
            norm_num_groups=2,
        )
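
Caller-side migration implied by this test change, sketched with the other constructor arguments elided:

# Before this commit, the head *dimension* was passed under the old keyword:
#     I2VGenXLUNet(..., num_attention_heads=4, ...)
# After it, the same value goes to `attention_head_dim`; `num_attention_heads`
# can be left as None since the constructor ignores it anyway:
#     I2VGenXLUNet(..., attention_head_dim=4, num_attention_heads=None, ...)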
