Commit 1466760

[misc] fix: no need to use world_size to decide whether to use full_tensor in FSDP2 (#1529)
[misc] fix: no need to use world_size to decide whether to use full_tensor() for FSDP2 state_dict() when world_size==1

### Checklist Before Starting

- [x] Search for similar PR(s).

### What does this PR do?

This PR simplifies the parameter-loading logic in `FSDPVLLMShardingManager` by removing an unnecessary `world_size` check when deciding whether to call `full_tensor()` on parameters obtained from an FSDP2 model's `state_dict()`. Under FSDP2, sharded parameters are `DTensor` instances regardless of world size, so the type alone determines whether gathering is needed.

### High-Level Design

The change modifies the `update_params` method. When loading weights into the vLLM model, parameters from the FSDP `state_dict()` that are `DTensor` instances (which they are under FSDP2 even when `world_size == 1`) are converted to full tensors via `param.full_tensor()`. The conversion now happens whenever the parameter is a `DTensor`, without the additional, potentially incorrect, `world_size != 1` check.

### Specific Changes

Skip. See file changes.

### API

No

### Usage Example

No

### Test

No CI changes.

### Additional Info.

- **Issue Number**: No
- **Training**: FSDP
- **Inference**: vLLM

### Checklist Before Submitting

- [x] Read the [Contribute Guide](https://github.com/volcengine/verl?tab=readme-ov-file#contribution-guide).
- [x] Apply [pre-commit checks](https://github.com/volcengine/verl?tab=readme-ov-file#code-linting-and-formatting).
- [ ] Add `[BREAKING]` to the PR title if it breaks any API.
- [ ] Update the documentation about your changes in the [docs](https://github.com/volcengine/verl/tree/main/docs).
- [ ] Add CI test(s) if necessary.
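The core idea of the change — gather a parameter into a full tensor exactly when it is a `DTensor`, with no extra `world_size` condition — can be sketched in isolation. The snippet below is a hypothetical illustration, not verl's actual code: `FakeDTensor` and `materialize` are stand-ins invented for this example so it runs without an initialized process group (a real `torch.distributed.tensor.DTensor.full_tensor()` all-gathers shards across ranks).

```python
# Hypothetical sketch of the type-based dispatch the PR adopts.
# FakeDTensor stands in for torch.distributed.tensor.DTensor.

class FakeDTensor:
    """Stand-in for a sharded tensor; full_tensor() reassembles the full value."""

    def __init__(self, shard):
        self.shard = shard

    def full_tensor(self):
        # A real DTensor would all-gather shards from every rank here;
        # we pretend two ranks each hold an identical shard.
        return self.shard * 2


def materialize(param):
    # The PR's rule: gather if and only if the param is a DTensor.
    # No world_size check — under FSDP2 params are DTensors even
    # when world_size == 1, and plain tensors pass through untouched.
    return param.full_tensor() if isinstance(param, FakeDTensor) else param


params = {"w": FakeDTensor(3), "b": 5}
full = {name: materialize(p) for name, p in params.items()}
print(full)  # {'w': 6, 'b': 5}
```

The previous `world_size != 1 and hasattr(param, "full_tensor")` condition would have skipped the gather in single-rank runs and handed a still-sharded `DTensor` to `load_weights`; dispatching on the type removes that edge case.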
1 parent 11622fc commit 1466760

File tree

1 file changed: +9 −4 lines


verl/workers/sharding_manager/fsdp_vllm.py

Lines changed: 9 additions & 4 deletions
@@ -21,6 +21,12 @@
 from torch.distributed.fsdp.api import FullStateDictConfig, ShardedStateDictConfig, StateDictType
 from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP

+try:
+    # for torch 2.5+
+    from torch.distributed.tensor import DTensor
+except ImportError:
+    from torch.distributed._tensor import DTensor
+
 from verl import DataProto
 from verl.protocol import all_gather_data_proto
 from verl.third_party.vllm import LLM, vllm_version
@@ -52,13 +58,13 @@ def __init__(
         self.inference_engine = inference_engine
         # self.model_runner = inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner if inference_engine else None

-        if 'vllm_v_0_6_3' in str(type(self.inference_engine)) or 'vllm_v_0_5_4' in str(type(self.inference_engine)):
+        if "vllm_v_0_6_3" in str(type(self.inference_engine)) or "vllm_v_0_5_4" in str(type(self.inference_engine)):
             # vLLM <= v0.6.3
             self.model_runner = self.inference_engine.llm_engine.model_executor.worker.model_runner if self.inference_engine else None
         else:
             # vLLM > v0.6.3
             self.model_runner = self.inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner if self.inference_engine else None
-
+
         self.model_config = model_config
         self.device_mesh = device_mesh
         self.offload_param = offload_param
@@ -188,7 +194,6 @@ def postprocess_data(self, data: DataProto) -> DataProto:
     def update_params(self, updated_params):
         model = self.model_runner.model
         patch_vllm_moe_model_weight_loader(model)
-        world_size = torch.distributed.get_world_size()
         device = torch.cuda.current_device()  # used when fsdp2 set cpu_offload_policy
-        loaded_params = model.load_weights(((name, param.to(device, non_blocking=True).full_tensor() if world_size != 1 and hasattr(param, "full_tensor") else param) for name, param in updated_params.items()))
+        loaded_params = model.load_weights(((name, param.to(device, non_blocking=True).full_tensor() if isinstance(param, DTensor) else param) for name, param in updated_params.items()))
         logger.info("vLLM load weights, loaded_params: %d", len(loaded_params))

0 commit comments
