Allow mixed usage of block masks
fegin committed Apr 2, 2025
commit 4d6393991ef4c9aa9fb8fa5545d23356d44fbcbb
1 change: 1 addition & 0 deletions tests/integration_tests.py
@@ -302,6 +302,7 @@ def build_test_list():
"--parallelism.data_parallel_shard_degree=4",
"--activation_checkpoint.mode='full'",
"--model.use_flex_attn",
"--model.attn_mask_type='block_causal'",
]
],
"FSDP+FLEX_ATTN",
7 changes: 5 additions & 2 deletions torchtitan/config_manager.py
@@ -196,10 +196,13 @@ def __init__(self):
self.parser.add_argument(
"--model.use_flex_attn",
action="store_true",
help="Whether to use Flex Attention.",
help="""
Whether to use Flex Attention.
Mixed usage of SDPA and FlexAttention is not supported yet.
""",
)
self.parser.add_argument(
"--model.attn_bias_type",
"--model.attn_mask_type",
type=str,
default="causal",
choices=["causal", "block_causal"],
131 changes: 63 additions & 68 deletions torchtitan/models/attention.py
@@ -17,32 +17,39 @@
)


BatchBlockMaskType = tuple[Optional[int], BlockMask]


class FlexAttn(torch.nn.Module):
# We register flex_attention-related attributes as class variables as we
# need to amortize the cost of compilation. Enabling per-instance flex_attention
# is not supported.
block_mask: ClassVar[Optional[BlockMask]] = None
flex_attn: ClassVar[Optional[Callable]] = None
attn_bias_type: ClassVar[Optional[str]] = None
compiled_create_block_mask: ClassVar[Optional[Callable]] = None

def __init__(self, attn_bias_type: str) -> None:
# need to amortize the cost of compilation.
flex_attn: ClassVar[Callable] = torch.compile(
flex_attention, mode="max-autotune-no-cudagraphs"
)
compiled_create_block_mask: ClassVar[Callable] = torch.compile(create_block_mask)
used_attn_mask_types: ClassVar[set[str]] = set()
# Maps attention mask type to the created (id(batch), BlockMask).
# This allows us to keep track of the block masks created for each
# new batch and to rebuild them when a new batch arrives. It also
# allows users to create different block masks for different layers.
block_masks: ClassVar[dict[str, BatchBlockMaskType]] = {}

# Instance variables.
attn_mask_type: str

def __init__(self, attn_mask_type: str) -> None:
super().__init__()
if FlexAttn.attn_bias_type is not None:
assert (
FlexAttn.attn_bias_type == attn_bias_type
), "All FlexAttention must have the same configurations."
else:
if attn_bias_type not in ["causal", "block_causal"]:
raise ValueError(f"Unrecognized attn_bias_type {attn_bias_type}.")
FlexAttn.attn_bias_type = attn_bias_type
if attn_mask_type not in ["causal", "block_causal"]:
raise ValueError(f"Unrecognized attn_mask_type {attn_mask_type}.")
self.attn_mask_type = attn_mask_type
FlexAttn.used_attn_mask_types.add(attn_mask_type)

def forward(
self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor
) -> torch.Tensor:
assert FlexAttn.block_mask is not None
assert FlexAttn.flex_attn is not None
return FlexAttn.flex_attn(q, k, v, block_mask=FlexAttn.block_mask)
block_mask = FlexAttn.block_masks[self.attn_mask_type][1]
return FlexAttn.flex_attn(q, k, v, block_mask=block_mask)

@classmethod
def _get_causal_mask_fn(cls) -> Callable:
@@ -66,34 +73,41 @@ def block_causal_mask(b, h, q_idx, kv_idx):

@classmethod
@torch.no_grad()
def init_attention_bias(
def init_attention_mask(
cls, batch: torch.Tensor, eos_id: Optional[int] = None
) -> None:
if cls.block_mask is not None and cls.attn_bias_type == "causal":
# We don't need to create another block mask for causal masking if existed.
return

match cls.attn_bias_type:
case "causal":
mask_fn = cls._get_causal_mask_fn()
case "block_causal":
mask_fn = cls._get_block_causal_mask_fn(batch, eos_id)
case _:
raise RuntimeError(f"Shouldn't reach here. {cls.attn_bias_type}")

seq_len = batch.shape[1]
if cls.compiled_create_block_mask is None:
cls.compiled_create_block_mask = torch.compile(create_block_mask)
cls.block_mask = cls.compiled_create_block_mask(
mask_fn, None, None, seq_len, seq_len
)
cls.flex_attn = torch.compile(flex_attention, mode="max-autotune-no-cudagraphs")


class SDPA(torch.nn.Module):
def __init__(self, attn_bias_type: str) -> None:
for attn_mask_type in cls.used_attn_mask_types:
block_mask = cls.block_masks.get(attn_mask_type, None)
if block_mask is not None:
batch_id = block_mask[0]
if batch_id is None or batch_id == id(batch):
continue

match attn_mask_type:
case "causal":
batch_id = None
mask_fn = cls._get_causal_mask_fn()
case "block_causal":
batch_id = id(batch)
if eos_id is None:
raise RuntimeError(
"eos_id must be provided for block_causal mask."
)
mask_fn = cls._get_block_causal_mask_fn(batch, eos_id)
case _:
raise RuntimeError(f"Shouldn't reach here. {attn_mask_type}")

seq_len = batch.shape[1]
block_mask = cls.compiled_create_block_mask(
mask_fn, None, None, seq_len, seq_len
)
cls.block_masks[attn_mask_type] = (batch_id, block_mask)


class ScaledDotProductAttention(torch.nn.Module):
def __init__(self, attn_mask_type: str) -> None:
super().__init__()
if attn_bias_type != "causal":
if attn_mask_type != "causal":
raise ValueError(
"TorchTitan with SDPA currently only supports causal mask."
)
@@ -103,32 +117,13 @@ def forward(
) -> torch.Tensor:
return F.scaled_dot_product_attention(q, k, v, is_causal=True)

@classmethod
@torch.no_grad()
def init_attention_bias(
cls,
batch: torch.Tensor,
eos_id: Optional[int] = None,
) -> None:
# For SDPA, we don't need to do anything.
return


_selected_attention = None


def build_attention(use_flex_attn: bool, attn_bias_type: str):
global _selected_attention
def build_attention(use_flex_attn: bool, attn_mask_type: str):
if use_flex_attn:
assert _selected_attention is None or _selected_attention == FlexAttn
_selected_attention = FlexAttn
return FlexAttn(attn_bias_type)
return FlexAttn(attn_mask_type)
else:
assert _selected_attention is None or _selected_attention == SDPA
_selected_attention = SDPA
return SDPA(attn_bias_type)
return SDPA(attn_mask_type)


def init_attention_bias(batch: torch.Tensor, eos_id: Optional[int] = None) -> None:
global _selected_attention
_selected_attention.init_attention_bias(batch, eos_id)
def init_attention_mask(batch: torch.Tensor, eos_id: Optional[int] = None) -> None:
FlexAttn.init_attention_mask(batch, eos_id)
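
To make the new per-mask-type caching concrete, below is a minimal, self-contained Python sketch of the same idea. It is not the torchtitan module itself: the names (_block_masks, init_masks, the mask functions) and the exact block_causal document convention are illustrative assumptions. A cached "causal" mask is reused indefinitely, while a "block_causal" mask is rebuilt whenever a new batch, identified by id(batch), arrives.

import torch
from torch.nn.attention.flex_attention import BlockMask, create_block_mask

# mask type -> (id of the batch the mask was built for, BlockMask).
# "causal" is batch-independent (its batch id stays None, so it is built once);
# "block_causal" is rebuilt whenever a new batch shows up.
_block_masks: dict[str, tuple[int | None, BlockMask]] = {}


def _causal_mask_fn():
    def causal(b, h, q_idx, kv_idx):
        return q_idx >= kv_idx
    return causal


def _block_causal_mask_fn(batch: torch.Tensor, eos_id: int):
    # Document ids increase at each EOS token (one common convention), so a
    # token may only attend to earlier positions inside its own document.
    docs = (batch == eos_id).int().cumsum(dim=1)

    def block_causal(b, h, q_idx, kv_idx):
        return (docs[b, q_idx] == docs[b, kv_idx]) & (q_idx >= kv_idx)
    return block_causal


def init_masks(used_types: set[str], batch: torch.Tensor, eos_id: int | None) -> None:
    bsz, seq_len = batch.shape
    for mask_type in used_types:
        cached = _block_masks.get(mask_type)
        if cached is not None and (cached[0] is None or cached[0] == id(batch)):
            continue  # cached mask is still valid for this batch
        if mask_type == "causal":
            batch_id, mask_fn = None, _causal_mask_fn()
        elif mask_type == "block_causal":
            if eos_id is None:
                raise RuntimeError("eos_id must be provided for block_causal mask.")
            batch_id, mask_fn = id(batch), _block_causal_mask_fn(batch, eos_id)
        else:
            raise ValueError(f"Unrecognized mask type {mask_type}.")
        _block_masks[mask_type] = (
            batch_id,
            create_block_mask(mask_fn, bsz, None, seq_len, seq_len, device=batch.device),
        )

Keying the cache by mask type, rather than keeping a single class-wide mask, is what lets layers with different attn_mask_type values coexist in one model.
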
10 changes: 5 additions & 5 deletions torchtitan/models/llama/model.py
@@ -16,7 +16,7 @@

from torchtitan.components.tokenizer import Tokenizer
from torchtitan.config_manager import JobConfig
from torchtitan.models.attention import build_attention, init_attention_bias
from torchtitan.models.attention import build_attention, init_attention_mask
from torchtitan.models.norms import build_norm
from torchtitan.protocols.train_spec import BaseModelArgs, ModelProtocol

@@ -40,15 +40,15 @@ class TransformerModelArgs(BaseModelArgs):
norm_type: str = "rmsnorm"

use_flex_attn: bool = False
attn_bias_type: str = "causal"
attn_mask_type: str = "causal"
eos_id: int = 0
Contributor

Is this necessary? I think the model should not have to know about properties of the tokenizer like this

Contributor Author
@fegin fegin Mar 27, 2025

Why is this an issue? It is configurable and is given by the tokenizer, not defined by the model. Otherwise, how will the attention module separate the tokens?

Contributor

@ebsmothers
I feel it would be one way or another -- we either put eos_id here, or each iteration would take a mask_mod function as model input. It is not clear to me which is cleaner -- I feel the current one is not bad. WDYT?

Contributor Author

I don't have a strong opinion of which approach is better. Both are viable approaches. I just want to emphasize that the model has to know eos_id, whether it is saved as an instance variable or is passed as an argument through forward.

Contributor

@fegin maybe I am misunderstanding something here then. It seems to me that most of the logic for constructing the block mask should be handled by the dataloader. You are already iterating over a bunch of samples that you want to pack, right? Why not just use this opportunity to construct a list of sequence lengths in the pack? Then this can be used to construct a BlockMask for flex without the model needing to know anything about eos_id. @tianyu-l iiuc this is kinda similar to what you're proposing, but passing BlockMask instead of the mask_mod function

Contributor
@tianyu-l tianyu-l Mar 28, 2025

@fegin I also didn't get this:

> I just want to emphasize that the model has to know eos_id, whether it is saved as an instance variable or is passed as an argument through forward.

An alternative seems to be to pass the block_mask_fn (say from the data loader, or a util function) to model forward. Technically, in that case, the model doesn't know eos_id.

Contributor Author
@fegin fegin Mar 31, 2025

@ebsmothers, @tianyu-l

> maybe I am misunderstanding something here then. It seems to me that most of the logic for constructing the block mask should be handled by the dataloader.

This is debatable. If you build the block-causal mask inside the dataloader, you are polluting the dataloader. There are options to use either Flex or Flash attention, and there can be more than just a simple block-causal mask. Why does the dataloader need to know whether the model uses Flex or Flash, or what kind of masking the model uses? It comes down to which component you choose to expose to the other component's details.

We discussed internally whether to couple mask construction with the dataloader, and there was an opinion to decouple the dataloader from the attention implementation: researchers can experiment with different attention masks without changing the dataloader, since knowing EOS_ID is enough. I would prefer to keep that decision, even though it was not made specifically for TorchTitan; the discussion was about how to do CP + Flex for PTD.

Contributor

Just one more quick point: this is our generic document mask re-writer: https://github.com/pytorch-labs/attention-gym/blob/001b36d625aceae8a03f59241113e4797122db1d/attn_gym/masks/document_mask.py#L33. It takes in two things: another mask_mod and a tensor specifying the boundaries. I find this de-coupling pretty attractive, where you decouple the generation of the extra metadata from which mask_mod you want to perform; in this concrete case the choice is "causal". Someone has to own setting up your input tensor in the packed format, and the generation of the extra metadata likely should be colocated IMO.
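
As a rough illustration of the decoupling discussed above, here is a small sketch modeled on (but not copied from) the attention-gym document-mask re-writer linked in the comment; the function name and signature are assumptions. It wraps an arbitrary base mask_mod with precomputed document ids, so whichever component packs the batch owns the metadata while the choice of mask stays separate.

import torch


def with_document_mask(base_mask_mod, document_ids: torch.Tensor):
    # document_ids: [seq_len] tensor mapping each position to its document
    # index, produced by whichever component packs the sequences.
    def doc_mask_mod(b, h, q_idx, kv_idx):
        same_doc = document_ids[q_idx] == document_ids[kv_idx]
        return same_doc & base_mask_mod(b, h, q_idx, kv_idx)
    return doc_mask_mod


def causal(b, h, q_idx, kv_idx):
    return q_idx >= kv_idx


# Example: three packed documents of lengths 3, 2, and 3 in an 8-token sequence.
doc_ids = torch.tensor([0, 0, 0, 1, 1, 2, 2, 2])
block_causal_mask_mod = with_document_mask(causal, doc_ids)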


def update_from_config(self, job_config: JobConfig, tokenizer: Tokenizer) -> None:
self.norm_type = job_config.model.norm_type
self.vocab_size = tokenizer.n_words
self.max_seq_len = job_config.training.seq_len
self.use_flex_attn = job_config.model.use_flex_attn
self.attn_bias_type = job_config.model.attn_bias_type
self.attn_mask_type = job_config.model.attn_mask_type

def get_num_flop_per_token(self, num_params: int, seq_len: int) -> int:
l, h, q, t = (
@@ -196,7 +196,7 @@ def __init__(self, model_args: TransformerModelArgs):
self.wo = nn.Linear(
model_args.n_heads * self.head_dim, model_args.dim, bias=False
)
self.sdpa = build_attention(model_args.use_flex_attn, model_args.attn_bias_type)
self.sdpa = build_attention(model_args.use_flex_attn, model_args.attn_mask_type)

def init_weights(self, init_std: float):
for linear in (self.wq, self.wk, self.wv):
@@ -471,7 +471,7 @@ def forward(self, tokens: torch.Tensor):
torch.Tensor: Output logits after applying the Transformer model.

"""
init_attention_bias(tokens, eos_id=self.eos_id)
init_attention_mask(tokens, eos_id=self.eos_id)

# passthrough for nonexistent layers, allows easy configuration of pipeline parallel stages
h = self.tok_embeddings(tokens) if self.tok_embeddings else tokens
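
Finally, a hypothetical end-to-end usage sketch (not part of this PR; it assumes torchtitan is installed, a CUDA device is available, and relies only on the build_attention and init_attention_mask functions introduced in this diff): two attention modules requesting different FlexAttention mask types, with a single per-batch mask-initialization call shared between them.

import torch
from torchtitan.models.attention import build_attention, init_attention_mask

causal_attn = build_attention(use_flex_attn=True, attn_mask_type="causal")
block_attn = build_attention(use_flex_attn=True, attn_mask_type="block_causal")

# Toy batch of token ids; eos_id=0 marks document boundaries for block_causal.
tokens = torch.randint(1, 100, (2, 128), device="cuda")
init_attention_mask(tokens, eos_id=0)  # builds and caches one BlockMask per used mask type

q = k = v = torch.randn(2, 8, 128, 64, device="cuda")  # [batch, heads, seq_len, head_dim]
out_causal = causal_attn(q, k, v)
out_block = block_attn(q, k, v)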