@@ -1 +1 @@
-transformers==4.57.1
+transformers==5.0.0rc0
@@ -4,7 +4,7 @@ This enables HF transformers models to be trained with `4D parallelism + torch.compile`

## Quick start

-- Requirements `transformers==4.57.1`
+- Requirements `transformers==5.0.0rc0`

- Config: `torchtitan/torchtitan/experiments/transformers_modeling_backend/configs/qwen3.toml`
```diff
@@ -9,6 +9,7 @@

import torch
from torch import nn
+from torch.distributed.tensor import DTensor
from torch.nn import init
from torchtitan.tools.logging import logger
from transformers.configuration_utils import PretrainedConfig
@@ -270,7 +271,10 @@ def _init_weights_patched(self, module):
        module.weight.data.normal_(mean=0.0, std=std)

        if module.padding_idx is not None:
-            module.weight.data[module.padding_idx].zero_()
+            if isinstance(module.weight.data, DTensor):
+                module.weight.data._local_tensor[module.padding_idx].zero_()
Contributor
Sorry, I probably didn't follow what you are doing here.
If the padding is on the "global tensor", we should just do the same thing: `module.weight.data[module.padding_idx].zero_()`.

The code here modifies the local shard, which may or may not be correct depending on whether `padding_idx` is meant to be a local or a global index (see the sketch after this hunk).

+            else:
+                module.weight.data[module.padding_idx].zero_()

    elif (
        isinstance(
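
A minimal sketch (not part of this PR) of the distinction the review comment above raises: indexing a row-sharded DTensor's `_local_tensor` treats `padding_idx` as a per-shard index, whereas the reviewer's suggestion treats it as a global row index. The toy 8x4 table, the gloo/CPU setup, and the file name in the launch command are illustrative assumptions.

```python
# Hypothetical standalone example; launch with e.g.
#   torchrun --nproc_per_node=2 dtensor_padding_sketch.py
import torch
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import Shard, distribute_tensor

dist.init_process_group("gloo")  # gloo so the sketch also runs on CPU
mesh = init_device_mesh("cpu", (dist.get_world_size(),))

# Toy 8-row "embedding table", row i filled with the value i + 1, sharded row-wise:
# with 2 ranks, rank 0 owns global rows 0-3 and rank 1 owns global rows 4-7.
weight = torch.arange(1.0, 9.0).repeat_interleave(4).reshape(8, 4)
dweight = distribute_tensor(weight, mesh, [Shard(0)])

padding_idx = 1  # intended as a *global* row index

# What the diff does: index the local shard. Every rank zeroes row 1 of *its own*
# shard, i.e. global rows 1 and 5 on a 2-rank run.
local_view = dweight.to_local().clone()
local_view[padding_idx].zero_()

# What the reviewer suggests: treat padding_idx as a global index, so exactly one
# row of the table is zeroed (shown here on the materialized full tensor).
full_view = dweight.full_tensor().clone()
full_view[padding_idx].zero_()

if dist.get_rank() == 0:
    print(full_view)  # only global row 1 is zero
dist.destroy_process_group()
```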
@@ -429,10 +433,6 @@ def init_weights(self, *args, **kwargs):
        # This method replicates the behavior of the original PreTrainedModel.init_weights,
        # but with a custom weight initialization function that skips nn.Identity modules (when PP is enabled)

-        if self.model.config.pruned_heads:
-            logger.info("Pruning heads as per model configuration.")
-            self.model.prune_heads(self.model.config.pruned_heads)

        original_init_weights_fn = self.model._init_weights

        def selective_init(module):
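
For reference, a minimal sketch (simplified, not the PR's actual implementation) of the selective-init pattern the last hunk describes: reuse the model's own `_init_weights`, but skip the `nn.Identity` placeholders that pipeline parallelism substitutes for layers a given rank does not own. `make_selective_init` is a hypothetical helper name.

```python
from torch import nn


def make_selective_init(original_init_weights_fn):
    """Wrap an HF-style _init_weights so it ignores nn.Identity stubs left by PP."""

    def selective_init(module):
        if isinstance(module, nn.Identity):
            return  # placeholder for a layer this pipeline rank does not own
        original_init_weights_fn(module)

    return selective_init


# Usage (hypothetical): model.apply(make_selective_init(model._init_weights))
```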