Commit 433544f
[megatron] feat: use mbridge as megatron adaptor (#2064)
### What does this PR do?

MBridge provides a seamless bridge between Hugging Face models and Megatron-Core's optimized implementation for efficient distributed training and inference. It also offers the tools and processes needed to integrate Reinforcement Learning (RL) with Megatron. See https://github.com/ISEEKYAN/mbridge

mbridge is developed and maintained by NVIDIA and provides:

- modeling HF models with Megatron
- loading/saving HF-format weights with no memory overhead
- online export of parameters to the rollout engine via a per-tensor generator
- RL-specific optimizations and friendly APIs on the Megatron side, plus early access to some Megatron features

With mbridge, the direct improvements are:

- a clean interface for Megatron
- no offline dist_ckpt conversion needed
- no offline model merger needed

### Test

Tested with GSM8k on qwen2-7B-instruct.

<img width="486" alt="image" src="https://github.com/user-attachments/assets/dd271e8a-9167-470f-8b0c-dde2bcfe1800" />

### High-Level Design

Adds an option `actor_rollout_ref.actor.megatron.use_mbridge` (default `False`); set it to `True` to enable. When enabled, model instantiation, initial weight loading, checkpoint save/load, and the per-tensor generator are taken over by mbridge.

### Specific Changes

> List the specific changes.

### API

> Demonstrate how the API changes if any.

### Usage Example

Add this line to the script:

```
actor_rollout_ref.actor.megatron.use_mbridge=True \
```

### Checklist Before Submitting

- [ ] Read the [Contribute Guide](https://github.com/volcengine/verl?tab=readme-ov-file#contribution-guide).
- [ ] Apply [pre-commit checks](https://github.com/volcengine/verl?tab=readme-ov-file#code-linting-and-formatting).
- [ ] Add `[BREAKING]` to the PR title `description` if it breaks any API.
- [ ] Update the documentation about your changes in the [docs](https://github.com/volcengine/verl/tree/main/docs).
- [ ] New CI unit test(s) are added to cover the code path.
- [ ] Rely on existing unit tests on CI that covers the code path.
1 parent 0ea96a2 commit 433544f

File tree

11 files changed: +263 / -133 lines

.github/workflows/e2e_ppo_trainer_megatron.yml (1 addition, 1 deletion)

```diff
@@ -320,7 +320,7 @@ jobs:
           ADV_ESTIMATOR=grpo USE_DUMMY_MODEL=True DUMMY_MODEL_CONFIG_PATH=tests/special_e2e/ppo_trainer/expert_parallel/qwen2moe_minimal.json \
           PPO_MAX_TOKEN_LEN=512 FWD_MAX_TOKEN_LEN=512 \
           MAX_PROMPT_LENGTH=256 MAX_RESPONSE_LENGTH=256 \
-          MODEL_ID=Qwen/Qwen1.5-MoE-A2.7B-Chat \
+          MODEL_ID=Qwen/Qwen1.5-MoE-A2.7B-Chat USE_MBRIDGE=True \
           COMMON_PP=2 COMMON_VPP=null COMMON_CP=1 COMMON_TP=4 COMMON_EP=4 COMMON_ETP=1 INFER_TP=8 \
           USE_DIST_CKPT=True ALL_OFFLOAD=True SKIP_SAVE_HF_MODEL=1 bash tests/special_e2e/run_ppo_trainer_megatron.sh
       - name: clean up
```

setup.py (2 additions, 0 deletions)

```diff
@@ -56,6 +56,7 @@
     "torch==2.6.0",
 ]
 TRL_REQUIRES = ["trl<=0.9.6"]
+MCORE_REQUIRES = ["mbridge"]
 
 extras_require = {
     "test": TEST_REQUIRES,
@@ -66,6 +67,7 @@
     "vllm": VLLM_REQUIRES,
     "sglang": SGLANG_REQUIRES,
     "trl": TRL_REQUIRES,
+    "mcore": MCORE_REQUIRES,
 }
 
```

tests/special_e2e/run_ppo_trainer_megatron.sh (2 additions, 0 deletions)

```diff
@@ -102,6 +102,7 @@ CRITIC_PARAM_OFFLOAD=${CRITIC_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
 CRITIC_GRAD_OFFLOAD=${CRITIC_GRAD_OFFLOAD:-$COMMON_GRAD_OFFLOAD}
 CRITIC_OPTIMIZER_OFFLOAD=${CRITIC_OPTIMIZER_OFFLOAD:-$COMMON_OPTIMIZER_OFFLOAD}
 RM_PARAM_OFFLOAD=${RM_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
+USE_MBRIDGE=${USE_MBRIDGE:-False}
 
 LR_WARMUP_STEPS=${LR_WARMUP_STEPS:-null}
 
@@ -182,6 +183,7 @@ for ENGINE in "${ENGINES[@]}"; do
         actor_rollout_ref.ref.megatron.param_offload=${REF_PARAM_OFFLOAD} \
         actor_rollout_ref.ref.megatron.use_dist_checkpointing=${USE_DIST_CKPT} \
         actor_rollout_ref.ref.megatron.dist_checkpointing_path=${DIST_CKPT_PATH} \
+        actor_rollout_ref.ref.megatron.use_mbridge=${USE_MBRIDGE} \
         critic.optim.lr=2e-5 \
         critic.optim.lr_warmup_steps=$LR_WARMUP_STEPS \
         critic.model.path="${MODEL_PATH}" \
```

verl/models/mcore/mbridge.py (new file, 23 additions)

```python
# Copyright 2025 Bytedance Ltd. and/or its affiliates
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

try:
    from mbridge import AutoBridge
    from mbridge.utils.post_creation_callbacks import freeze_moe_router, make_value_model
except ImportError:
    print("mbridge package not found. Please install mbridge with `pip install verl[mcore]` or `pip install mbridge`")
    raise

__all__ = ["AutoBridge", "make_value_model", "freeze_moe_router"]
```
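The guarded import in this new file follows a common optional-dependency pattern: attempt the import, print an actionable install hint, then re-raise so the failure is still loud. A generic, runnable sketch of the same pattern (the helper name `optional_import` is mine, not a verl or mbridge API):

```python
import importlib


def optional_import(module_name: str, install_hint: str):
    """Import a module, printing an install hint before re-raising on failure.

    Mirrors the try/except ImportError guard used in verl/models/mcore/mbridge.py,
    generalized to any optional dependency.
    """
    try:
        return importlib.import_module(module_name)
    except ImportError:
        # Surface the remedy before propagating the original error.
        print(f"{module_name} package not found. {install_hint}")
        raise
```

Re-raising (rather than swallowing the error) keeps the traceback intact while still telling the user which extra to install.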

verl/single_controller/base/megatron/worker.py (10 additions, 0 deletions)

```diff
@@ -47,6 +47,7 @@ def _init_hf_config_and_tf_config(
     override_model_config,
     override_transformer_config,
     trust_remote_code=False,
+    use_mbridge=False,
 ):
     from transformers import AutoConfig
 
@@ -105,6 +106,15 @@ def add_optimization_config_to_tf_config(tf_config):
             setattr(tf_config, k, v)
 
     add_optimization_config_to_tf_config(tf_config)
+    if use_mbridge:
+        from verl.models.mcore.mbridge import AutoBridge
+
+        bridge = AutoBridge.from_config(hf_config)
+        bridge.set_extra_args(**override_transformer_config)
+        tf_config = bridge.config
+        self.bridge = bridge
+    else:
+        self.bridge = None
 
     print(f"TF config: {tf_config}")
     self.hf_config = hf_config
```
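The control flow added to `_init_hf_config_and_tf_config` can be illustrated with a runnable stub: when the flag is on, the bridge derives the transformer config from the HF config (with extra args applied); otherwise the existing config passes through untouched. `_StubBridge` and `init_tf_config` below are hypothetical stand-ins for `mbridge.AutoBridge` and the worker method, not real verl/mbridge APIs:

```python
class _StubBridge:
    """Hypothetical stand-in for mbridge.AutoBridge, just enough to show the flow."""

    def __init__(self, hf_config):
        # The bridge derives a Megatron transformer config from the HF config.
        self.config = dict(hf_config)

    @classmethod
    def from_config(cls, hf_config):
        return cls(hf_config)

    def set_extra_args(self, **kwargs):
        # Extra args play the role of override_transformer_config.
        self.config.update(kwargs)


def init_tf_config(hf_config, tf_config, use_mbridge, override_transformer_config):
    """Sketch of the branch this hunk adds: bridge takes over tf_config when enabled."""
    if use_mbridge:
        bridge = _StubBridge.from_config(hf_config)
        bridge.set_extra_args(**override_transformer_config)
        return bridge.config, bridge
    return tf_config, None


tf_config, bridge = init_tf_config(
    hf_config={"num_layers": 2},
    tf_config={"num_layers": 2},
    use_mbridge=True,
    override_transformer_config={"pipeline_dtype": "bf16"},
)
```

Keeping `self.bridge = None` in the disabled branch means downstream code can use the bridge's presence as the feature flag, which is exactly what the checkpoint manager changes below rely on.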

verl/trainer/config/ppo_megatron_trainer.yaml (5 additions, 1 deletion)

```diff
@@ -102,7 +102,8 @@ actor_rollout_ref:
       dist_checkpointing_path: null
       seed: 42
       override_transformer_config: {} # additional transformer config like: num_layers_in_first(/last)_pipeline_stage
-      profile: # profile the actor model in `update_policy`
+      use_mbridge: False
+      profile: # profile the actor model in `update_policy`
         use_profile: False # open it when you want to profile the actor model
         profile_ranks: null # list, you can specify the ranks to profile
         step_start: -1 # start step in update_policy
@@ -138,6 +139,7 @@ actor_rollout_ref:
       dist_checkpointing_path: null
       seed: ${actor_rollout_ref.actor.megatron.seed}
       override_transformer_config: ${actor_rollout_ref.actor.megatron.override_transformer_config}
+      use_mbridge: ${actor_rollout_ref.actor.megatron.use_mbridge}
       profile:
         use_profile: False
         profile_ranks: null
@@ -311,6 +313,7 @@ critic:
     dist_checkpointing_path: null
     seed: ${actor_rollout_ref.actor.megatron.seed}
     override_transformer_config: ${actor_rollout_ref.actor.megatron.override_transformer_config}
+    use_mbridge: ${actor_rollout_ref.actor.megatron.use_mbridge}
   load_weight: True
   ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
   ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
@@ -355,6 +358,7 @@ reward_model:
     dist_checkpointing_path: null
    seed: ${actor_rollout_ref.actor.megatron.seed}
     override_transformer_config: {}
+    use_mbridge: ${actor_rollout_ref.actor.megatron.use_mbridge}
   model:
     input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical
     path: ~/models/FsfairX-LLaMA3-RM-v0.1
```
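Because the ref, critic, and reward_model entries all interpolate the actor's value, a user only needs a single override to flip the feature everywhere. A minimal override fragment (assuming OmegaConf-style `${...}` interpolation, as the existing entries in this YAML suggest; nesting depth is illustrative):

```yaml
# Enabling mbridge in one place; the other sections pick it up via
# ${actor_rollout_ref.actor.megatron.use_mbridge} interpolation.
actor_rollout_ref:
  actor:
    megatron:
      use_mbridge: True
```

This matches the single-line `actor_rollout_ref.actor.megatron.use_mbridge=True \` override shown in the PR's usage example.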

verl/utils/checkpoint/megatron_checkpoint_manager.py (38 additions, 24 deletions)

```diff
@@ -116,6 +116,8 @@ def __init__(
         optimizer_scheduler,
         use_distributed_optimizer: bool,
         use_checkpoint_opt_param_scheduler: bool = False,
+        use_dist_checkpointing: bool = True,
+        bridge=None,
         **kwargs,
     ):
         super().__init__(
@@ -139,8 +141,10 @@ def __init__(
         self.model_path = self.config.model.path
         self.use_distributed_optimizer = use_distributed_optimizer
         self.use_checkpoint_opt_param_scheduler = use_checkpoint_opt_param_scheduler
-
+        self.bridge = bridge
         self.rank = torch.distributed.get_rank()
+        self.use_dist_checkpointing = use_dist_checkpointing or not self.bridge or self.is_value_model
+        self.use_hf_checkpoint = not self.use_dist_checkpointing
 
         self.weight_saver = get_weight_saver(self.arch)
 
@@ -303,7 +307,7 @@ def load_checkpoint(self, local_path: str, hdfs_path: str = None, del_local_afte
             ckpt_dir=dist_checkpoint_path,
         )
 
-        if self.should_load_model:
+        if self.should_load_model and self.use_dist_checkpointing:
             assert "model" in state_dict or any(
                 f"model{vpp_rank}" in state_dict for vpp_rank in range(len(self.model))
             ), f"Model state dict not found in {state_dict.keys()}. Please check the checkpoint file {local_path}."
@@ -316,6 +320,10 @@
                 mpu.set_virtual_pipeline_model_parallel_rank(vpp_rank)
                 self.model[vpp_rank].load_state_dict(model_state_dict)
             log_with_rank(f"Loaded sharded model checkpoint from {local_path}", rank=self.rank, logger=logger)
+        elif self.should_load_model and self.use_hf_checkpoint:
+            hf_model_path = get_hf_model_checkpoint_path(local_path)
+            self.bridge.load_weights(self.model, hf_model_path)
+            log_with_rank(f"Loaded HF model checkpoint from {hf_model_path} with bridge", rank=self.rank, logger=logger)
 
         if self.should_load_optimizer:
             assert "optimizer" in state_dict, (
@@ -370,29 +378,35 @@ def save_checkpoint(self, local_path: str, hdfs_path: str = None, global_step: i
         local_path = local_mkdir_safe(local_path)
         dist_checkpoint_path = get_dist_checkpoint_path(local_path)
 
-        # Generate state dict for saving
-        state_dict = self.generate_state_dict()
-        log_with_rank(f"Generated state dict for saving: {state_dict.keys()}", rank=self.rank, logger=logger)
-        for vpp_rank, model in enumerate(self.model):
-            if len(self.model) > 1:
-                model_i_keys = state_dict[f"model{vpp_rank}"].keys()
-                log_with_rank(f"Generated state dict for saving: {model_i_keys}", rank=self.rank, logger=logger)
-            else:
-                log_with_rank(
-                    f"Generated state dict for saving: {state_dict['model'].keys()}", rank=self.rank, logger=logger
-                )
-
-        # Start Async save if enabled
-        async_save_request = save_dist_checkpointing(
-            sharded_state_dict=state_dict,
-            ckpt_path=dist_checkpoint_path,
-            async_save=self.checkpoint_config.async_save,
-        )
+        if self.use_dist_checkpointing:
+            # Generate state dict for saving
+            state_dict = self.generate_state_dict()
+            log_with_rank(f"Generated state dict for saving: {state_dict.keys()}", rank=self.rank, logger=logger)
+            for vpp_rank, model in enumerate(self.model):
+                if len(self.model) > 1:
+                    model_i_keys = state_dict[f"model{vpp_rank}"].keys()
+                    log_with_rank(f"Generated state dict for saving: {model_i_keys}", rank=self.rank, logger=logger)
+                else:
+                    log_with_rank(
+                        f"Generated state dict for saving: {state_dict['model'].keys()}", rank=self.rank, logger=logger
+                    )
+            # Start Async save if enabled
+            async_save_request = save_dist_checkpointing(
+                sharded_state_dict=state_dict,
+                ckpt_path=dist_checkpoint_path,
+                async_save=self.checkpoint_config.async_save,
+            )
 
-        # Synchronize all async save requests
-        if not self.checkpoint_config.async_save:
-            assert async_save_request is None, "Async save request should be None when not using async save."
-            torch.distributed.barrier()
+            # Synchronize all async save requests
+            if not self.checkpoint_config.async_save:
+                assert async_save_request is None, "Async save request should be None when not using async save."
+                torch.distributed.barrier()
+        else:
+            assert self.use_hf_checkpoint, "use_hf_checkpoint should be True when not using dist checkpointing"
+            log_with_rank(f"Saving HF model checkpoint to {local_path} with bridge", rank=self.rank, logger=logger)
+            hf_ckpt_path = get_hf_model_checkpoint_path(local_path)
+            self.bridge.save_weights(self.model, hf_ckpt_path)
+            log_with_rank(f"Saved bridge checkpoint to {hf_ckpt_path}", rank=self.rank, logger=logger)
 
         if self.should_save_model:
             # Only rank 0 saves the hf config and tokenizer to huggingface path
```
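The checkpoint-format decision added in `__init__` reduces to a small predicate: HF-format save/load via the bridge is used only when a bridge exists, the model is not a value model, and dist checkpointing was not explicitly requested. A runnable sketch (the function name is mine; the original tests truthiness of `self.bridge`, mirrored here as a `None` check):

```python
def resolve_checkpoint_format(use_dist_checkpointing: bool, bridge, is_value_model: bool) -> str:
    """Mirror the fallback logic in MegatronCheckpointManager.__init__:
    use_dist_checkpointing = use_dist_checkpointing or not bridge or is_value_model,
    use_hf_checkpoint = not use_dist_checkpointing."""
    use_dist = use_dist_checkpointing or bridge is None or is_value_model
    return "dist_ckpt" if use_dist else "hf"
```

Note that dist checkpointing wins whenever any of the three conditions holds, so the bridge-backed HF path is strictly opt-in and value models always fall back to dist checkpoints.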

verl/utils/model.py (13 additions, 0 deletions)

```diff
@@ -443,6 +443,19 @@ def _load_hf_model(config, model_config, is_value_model, local_cache_path):
     return architectures, model, state_dict, is_value_model
 
 
+def get_hf_model_path(config, local_cache_path="~/.cache/verl/rlhf"):
+    local_cache_path = os.path.expanduser(local_cache_path)
+    if config.model.path.startswith("hdfs:"):
+        from verl.utils.fs import copy_to_local
+
+        local_model_path = copy_to_local(
+            src=config.model.path, cache_dir=local_cache_path, use_shm=config.model.get("use_shm", False)
+        )
+    else:
+        local_model_path = config.model.path
+    return local_model_path
+
+
 def load_megatron_model_weights(
     config, model_config, parallel_model, params_dtype, is_value_model=False, local_cache_path="~/.cache/verl/rlhf"
 ):
```
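The new `get_hf_model_path` helper resolves HDFS paths to a local cache directory and passes local paths through unchanged. A simplified, self-contained sketch of that resolution (the cache-path join below merely stands in for verl's `copy_to_local`, which also performs the actual download):

```python
import os


def resolve_model_path(model_path: str, cache_dir: str = "~/.cache/verl/rlhf") -> str:
    """Simplified sketch of get_hf_model_path's path resolution.

    HDFS-backed models are materialized under a local cache directory
    (copy_to_local is stubbed out as a path join here); anything else is
    assumed to already be a local path and is returned as-is.
    """
    if model_path.startswith("hdfs:"):
        cache_dir = os.path.expanduser(cache_dir)
        # Stand-in for copy_to_local(src=model_path, cache_dir=cache_dir, ...)
        return os.path.join(cache_dir, os.path.basename(model_path))
    return model_path
```

Deferring the `copy_to_local` import to the HDFS branch, as the real helper does, keeps the common local-path case free of the filesystem dependency.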
