diff --git a/docs/examples/config.rst b/docs/examples/config.rst
index 062ab4b707c..c160532d69e 100644
--- a/docs/examples/config.rst
+++ b/docs/examples/config.rst
@@ -618,7 +618,7 @@ Optim
     optimizer_impl: torch.optim
     lr: 1e-5
     weight_decay: 0.01
-    warmup_steps_ratio: 0.1
+    lr_warmup_steps_ratio: 0.1
     clip_grad: 1.0
     lr_scheduler: cosine
     override_optimizer_config: null
@@ -627,7 +627,7 @@ Optim
 - ``optimizer_impl``: Module path to import optimizer from (e.g., ``"torch.optim"``, ``"torchao.optim"``, ``"bitsandbytes.optim"``).
 - ``optim.lr``: Learning rate for the optimizer.
 - ``optim.weight_decay``: Weight decay for the optimizer.
-- ``optim.warmup_steps_ratio``: Ratio of warmup steps to total training steps.
+- ``optim.lr_warmup_steps_ratio``: Ratio of warmup steps to total training steps.
 - ``optim.clip_grad``: Gradient clipping value.
 - ``optim.lr_scheduler``: Learning rate scheduler type. Options:
diff --git a/verl/trainer/config/sft_trainer.yaml b/verl/trainer/config/sft_trainer.yaml
index bb946be88ab..58c8fe80027 100644
--- a/verl/trainer/config/sft_trainer.yaml
+++ b/verl/trainer/config/sft_trainer.yaml
@@ -1,3 +1,7 @@
+defaults:
+  - optim: fsdp
+  - _self_
+
 data:
   train_batch_size: 256
   micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
@@ -45,7 +49,7 @@ optim:
   lr: 1e-5
   betas: [0.9, 0.95]
   weight_decay: 0.01
-  warmup_steps_ratio: 0.1
+  lr_warmup_steps_ratio: 0.1
   clip_grad: 1.0
   lr_scheduler: cosine
 ulysses_sequence_parallel_size: 1
diff --git a/verl/trainer/fsdp_sft_trainer.py b/verl/trainer/fsdp_sft_trainer.py
index 918f57b27ad..e962b9904c2 100644
--- a/verl/trainer/fsdp_sft_trainer.py
+++ b/verl/trainer/fsdp_sft_trainer.py
@@ -331,7 +331,7 @@ def _build_model_optimizer(self):
             f"{self.config.trainer.total_epochs}, total number of steps {self.total_steps}"
         )

-        num_warmup_steps = int(self.total_steps * self.config.optim.warmup_steps_ratio)
+        num_warmup_steps = int(self.total_steps * self.config.optim.lr_warmup_steps_ratio)

         if not hasattr(self.config.optim, "lr_scheduler") or self.config.optim.lr_scheduler == "cosine":
             self.lr_scheduler = get_cosine_schedule_with_warmup(
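
Note on the fsdp_sft_trainer.py hunk: the rename only changes the config surface; the warmup arithmetic is unchanged. The sketch below mirrors that computation under stated assumptions: `total_steps` is hard-coded here (the trainer derives it from `trainer.total_epochs` and the dataloader length), and the scheduler helper is the identically named one from `transformers` for illustration (verl imports its own variant).

    import torch
    from transformers import get_cosine_schedule_with_warmup

    total_steps = 1000            # hypothetical; the trainer computes this from epochs * steps per epoch
    lr_warmup_steps_ratio = 0.1   # renamed config key (was warmup_steps_ratio)

    model = torch.nn.Linear(8, 8)
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=1e-5, betas=(0.9, 0.95), weight_decay=0.01
    )

    # Same arithmetic as the hunk above: ratio of warmup steps to total training steps.
    num_warmup_steps = int(total_steps * lr_warmup_steps_ratio)  # -> 100

    lr_scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=total_steps,
    )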
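Note on the new `defaults` list in sft_trainer.yaml: Hydra merges the `optim: fsdp` config group first, then `_self_` lets the file's own `optim` keys take precedence. A minimal sketch of inspecting the composed result, assuming the config directory layout shown (the path is illustrative and must be relative to the calling module):

    from hydra import compose, initialize

    with initialize(version_base=None, config_path="verl/trainer/config"):
        cfg = compose(config_name="sft_trainer")
        print(cfg.optim.lr_warmup_steps_ratio)  # 0.1

With the rename, command-line overrides also use the new key, e.g. `optim.lr_warmup_steps_ratio=0.05`.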