Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
merge fsdp2 main and solve conflicts
  • Loading branch information
0x404 committed May 5, 2025
commit e513360f5dc388d0fb6dd28dbc3a4e50d6e35a8e
4 changes: 2 additions & 2 deletions verl/utils/checkpoint/fsdp_checkpoint_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def load_checkpoint(self, local_path: str, hdfs_path: str = None, del_local_afte
# every rank download its own checkpoint
state_dict_cfg = ShardedStateDictConfig(offload_to_cpu=True) if self.save_model else None
optim_cfg = ShardedOptimStateDictConfig(offload_to_cpu=True) if self.save_optimizer else None
with FSDP.state_dict_type(self.model, StateDictType.SHARDED_STATE_DICT, state_dict_config=state_dict_cfg, optim_state_dict_config=optim_cfg):
with get_fsdp_state_ctx(self.model, StateDictType.SHARDED_STATE_DICT, state_dict_cfg, optim_cfg):
if self.save_model:
remote_model_path = os.path.join(local_path, f"model_world_size_{self.world_size}_rank_{self.rank}.pt")
local_model_path = copy_to_local(remote_model_path)
Expand Down Expand Up @@ -146,7 +146,7 @@ def save_checkpoint(self, local_path: str, hdfs_path: str = None, global_step: i
optim_cfg = ShardedOptimStateDictConfig(offload_to_cpu=True) if self.save_optimizer else None
with warnings.catch_warnings():
warnings.simplefilter("ignore")
with FSDP.state_dict_type(self.model, StateDictType.SHARDED_STATE_DICT, state_dict_config=state_dict_cfg, optim_state_dict_config=optim_cfg):
with get_fsdp_state_ctx(self.model, StateDictType.SHARDED_STATE_DICT, state_dict_cfg, optim_cfg):
model_path = os.path.join(local_path, f"model_world_size_{self.world_size}_rank_{self.rank}.pt")
optim_path = os.path.join(local_path, f"optim_world_size_{self.world_size}_rank_{self.rank}.pt")
extra_path = os.path.join(local_path, f"extra_state_world_size_{self.world_size}_rank_{self.rank}.pt")
Expand Down
You are viewing a condensed version of this merge commit. You can view the full changes here.