Commit f1c6971

Authored by HJSang (Hejian Sang) and gemini-code-assist[bot]

[recipe, rollout] feat: enable gpt-oss training for tool agent, add gpt-oss for retool recipe (verl-project#3837)
### What does this PR do?

> Add **concise** overview of what this PR aims to achieve or accomplish. Reference related GitHub issues and PRs that help with the review.

* add the tool response parsing logic for gpt-oss models
* add training recipe for retool

### Checklist Before Starting

- [x] Search for similar PRs. Paste at least one query link here: ...
- [x] Format the PR title as `[{modules}] {type}: {description}` (This will be checked by the CI)
  - `{modules}` include `fsdp`, `megatron`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data`
  - If this PR involves multiple modules, separate them with `,` like `[megatron, fsdp, doc]`
  - `{type}` is in `feat`, `fix`, `refactor`, `chore`, `test`
  - If this PR breaks any API (CLI arguments, config, function signature, etc.), add `[BREAKING]` to the beginning of the title.
  - Example: `[BREAKING][fsdp, megatron] feat: dynamic batching`

### Test

> For changes that can not be tested by CI (e.g., algorithm implementation, new model support), validate by experiment(s) and show results like training curve plots, evaluation results, etc.

### API and Usage Example

> Demonstrate how the API changes if any, and provide usage example(s) if possible.

```python
# Add code snippet or script demonstrating how to use this
```

### Design & Code Changes

> Demonstrate the high-level design if this PR is complex, and list the specific changes.

### Checklist Before Submitting

> [!IMPORTANT]
> Please check all the following items before requesting a review, otherwise the reviewer might deprioritize this PR for review.

- [x] Read the [Contribute Guide](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md).
- [x] Apply [pre-commit checks](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md#code-linting-and-formatting): `pre-commit install && pre-commit run --all-files --show-diff-on-failure --color=always`
- [ ] Add / Update [the documentation](https://github.com/volcengine/verl/tree/main/docs).
- [ ] Add unit or end-to-end test(s) to [the CI workflow](https://github.com/volcengine/verl/tree/main/.github/workflows) to cover all the code. If not feasible, explain why: ...
- [ ] Once your PR is ready for CI, send a message in [the `ci-request` channel](https://verl-project.slack.com/archives/C091TCESWB1) in [the `verl` Slack workspace](https://join.slack.com/t/verl-project/shared_invite/zt-3855yhg8g-CTkqXu~hKojPCmo7k_yXTQ). (If not accessible, please try [the Feishu group (飞书群)](https://applink.larkoffice.com/client/chat/chatter/add_by_link?link_token=772jd4f1-cd91-441e-a820-498c6614126a).)

---------

Co-authored-by: Hejian Sang <hsang@linkedin.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent aa731f3 commit f1c6971

File tree

4 files changed: +190 -23 lines changed

recipe/langgraph_agent/chat_model.py

Lines changed: 5 additions & 18 deletions

```diff
@@ -38,25 +38,12 @@
 from verl.experimental.agent_loop.agent_loop import AgentLoopOutput, AsyncLLMServerManager
 from verl.experimental.agent_loop.tool_parser import ToolParser
+from verl.experimental.agent_loop.utils import add_generation_prompt_for_gpt_oss, format_gpt_oss_tool_response_manually

 logger = logging.getLogger(__file__)
 logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))


-def format_tool_response_manually(tool_message: dict, tool_call_name: str) -> str:
-    """Manually format tool response without using tokenizer template.
-
-    Args:
-        tool_message: Tool message dictionary with 'content' field
-        tool_call_name: Name of the tool that was called
-
-    Returns:
-        Formatted tool response string
-    """
-    content = tool_message["content"]
-    return f"<|start|>functions.{tool_call_name} to=assistant<|channel|>commentary<|message|>{content}<|end|>"
-
-
 class MaxTokenExceededError(Exception):
     """Indicate that history chat messages + tool message exceeds LLM max_tokens."""

@@ -235,14 +222,14 @@ async def _preprocess(self, messages: list[BaseMessage], **kwargs: Any) -> tuple
             actual_tool_name = tool_msg.get("name", "unknown")
             if actual_tool_name == "unknown":
                 logger.error(f"actual_tool_name: {actual_tool_name}")
-            formatted = format_tool_response_manually(tool_msg, actual_tool_name)
+            formatted = format_gpt_oss_tool_response_manually(tool_msg["content"], actual_tool_name)
             tool_response_texts.append(formatted)
-            # need to add generation tokens for gpt-oss manually since add_generation_prompt is True
-            tool_response_texts.append("<|start|>assistant")

         # Tokenize the manually formatted tool responses
         tool_response_text = "".join(tool_response_texts)
-        print(f"tool_response_text: {tool_response_text}")
+        # need to add generation tokens for gpt-oss manually since add_generation_prompt is True
+        tool_response_text = add_generation_prompt_for_gpt_oss(tool_response_text)
+        logger.debug(f"tool_response_text: {tool_response_text}")

         tool_response_ids = await loop.run_in_executor(
             None, lambda: self.tokenizer.encode(tool_response_text, add_special_tokens=False)
```
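The net effect of this change can be checked in isolation: the old code appended `<|start|>assistant` once per tool response, while the fixed code appends it once after all responses are joined. A minimal sketch with two hypothetical tool outputs (`search` and `calc` are made-up names):

```python
# Sketch of the fix above: the generation prompt "<|start|>assistant" must
# appear exactly once, after all tool responses, not once per response.
# Tool names and contents here are hypothetical.
def fmt(content: str, name: str) -> str:
    # Same harmony-format template the diff uses for tool responses
    return f"<|start|>functions.{name} to=assistant<|channel|>commentary<|message|>{content}<|end|>"

parts = [fmt("ok", "search"), fmt("42", "calc")]
text = "".join(parts) + "<|start|>assistant"

# The joined text ends with a single generation prompt
print(text.count("<|start|>assistant"))  # → 1
```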
recipe/retool/run_gpt_oss_ppo.sh

Lines changed: 125 additions & 0 deletions (new file)

```bash
set -x

# ================= data/model/tool =================
HDFS_ROOT=${HDFS_ROOT:-$PWD}
DATA_ROOT=${DATA_ROOT:-$PWD}

dapo_math_17k=$DATA_ROOT/dataset/BytedTsinghua-SIA/DAPO-Math-17k
aime_2024=$DATA_ROOT/dataset/Maxwell-Jia/AIME_2024
aime_2025=$DATA_ROOT/dataset/yentinglin/aime_2025
actor_model_path=lmsys/gpt-oss-20b-bf16
critic_model_path=$actor_model_path

train_files="['$dapo_math_17k']"
test_files="['$aime_2025']"

# tool
tool_config_path=recipe/retool/sandbox_fusion_tool_config.yaml

# wandb
project_name=wuxibin_retool
experiment_name=gpt-oss-20b-bf16_ppo
default_local_dir=$DATA_ROOT/checkpoint/$experiment_name

# ================= algorithm =================
adv_estimator=gae

use_kl_in_reward=False
kl_coef=0.0
use_kl_loss=False
kl_loss_coef=0.0

clip_ratio_low=0.2
clip_ratio_high=0.28

max_turns=8
max_prompt_length=2048
max_response_length=16384
actor_lr=1e-6
critic_lr=2e-6
gae_gamma=1.0
gae_lam=1.0

critic_warmup=20

train_batch_size=512
ppo_mini_batch_size=512
n_resp_per_prompt_val=30

# ================= performance =================
infer_tp=4  # vllm
train_sp=4  # train

offload=True

actor_max_token_len_per_gpu=$(( (max_prompt_length + max_response_length) * 2 ))
critic_max_token_len_per_gpu=$(( (max_prompt_length + max_response_length) * 4 ))


python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=$adv_estimator \
    algorithm.use_kl_in_reward=$use_kl_in_reward \
    algorithm.kl_ctrl.kl_coef=$kl_coef \
    algorithm.gamma=$gae_gamma \
    algorithm.lam=$gae_lam \
    data.train_files="$train_files" \
    data.val_files="$test_files" \
    data.return_raw_chat=True \
    data.train_batch_size=$train_batch_size \
    data.max_prompt_length=$max_prompt_length \
    data.max_response_length=$max_response_length \
    data.filter_overlong_prompts=True \
    +data.apply_chat_template_kwargs.reasoning_effort=medium \
    data.truncation='error' \
    data.custom_cls.path=recipe/retool/retool.py \
    data.custom_cls.name=CustomRLHFDataset \
    custom_reward_function.path=recipe/retool/retool.py \
    custom_reward_function.name=compute_score \
    actor_rollout_ref.model.path=$actor_model_path \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.use_kl_loss=$use_kl_loss \
    actor_rollout_ref.actor.kl_loss_coef=$kl_loss_coef \
    actor_rollout_ref.actor.clip_ratio_low=$clip_ratio_low \
    actor_rollout_ref.actor.clip_ratio_high=$clip_ratio_high \
    actor_rollout_ref.actor.clip_ratio_c=10.0 \
    actor_rollout_ref.actor.optim.lr=$actor_lr \
    actor_rollout_ref.actor.use_dynamic_bsz=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=$ppo_mini_batch_size \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=$actor_max_token_len_per_gpu \
    actor_rollout_ref.actor.ulysses_sequence_parallel_size=$train_sp \
    actor_rollout_ref.actor.fsdp_config.param_offload=$offload \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=$offload \
    actor_rollout_ref.rollout.name=sglang \
    actor_rollout_ref.rollout.mode=async \
    actor_rollout_ref.rollout.tensor_model_parallel_size=$infer_tp \
    actor_rollout_ref.rollout.multi_turn.enable=True \
    actor_rollout_ref.rollout.multi_turn.max_user_turns=$max_turns \
    actor_rollout_ref.rollout.multi_turn.max_assistant_turns=$max_turns \
    actor_rollout_ref.rollout.multi_turn.tool_config_path=$tool_config_path \
    actor_rollout_ref.rollout.multi_turn.format=gpt-oss \
    +actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend=triton \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
    actor_rollout_ref.rollout.val_kwargs.top_p=1.0 \
    actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \
    actor_rollout_ref.rollout.val_kwargs.n=$n_resp_per_prompt_val \
    critic.optim.lr=$critic_lr \
    critic.model.use_remove_padding=True \
    critic.model.path=$critic_model_path \
    critic.model.enable_gradient_checkpointing=True \
    critic.ppo_max_token_len_per_gpu=$critic_max_token_len_per_gpu \
    critic.ulysses_sequence_parallel_size=$train_sp \
    critic.model.fsdp_config.param_offload=$offload \
    critic.model.fsdp_config.optimizer_offload=$offload \
    trainer.critic_warmup=$critic_warmup \
    trainer.logger=['console','wandb'] \
    trainer.project_name=$project_name \
    trainer.experiment_name=$experiment_name \
    trainer.n_gpus_per_node=8 \
    trainer.val_before_train=True \
    trainer.log_val_generations=100 \
    trainer.nnodes=2 \
    trainer.save_freq=30 \
    trainer.default_local_dir=$default_local_dir \
    trainer.test_freq=5 \
    trainer.total_epochs=1 $@
```
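The two `*_max_token_len_per_gpu` values are pure arithmetic over the length settings; with the values above they work out as follows (a quick cross-check, not part of the recipe):

```python
# Cross-check of the token-budget arithmetic in the script above.
max_prompt_length = 2048
max_response_length = 16384

actor_max_token_len_per_gpu = (max_prompt_length + max_response_length) * 2
critic_max_token_len_per_gpu = (max_prompt_length + max_response_length) * 4

print(actor_max_token_len_per_gpu)   # → 36864
print(critic_max_token_len_per_gpu)  # → 73728
```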

verl/experimental/agent_loop/tool_agent_loop.py

Lines changed: 22 additions & 5 deletions

```diff
@@ -22,6 +22,7 @@
 from verl.experimental.agent_loop.agent_loop import AgentLoopBase, AgentLoopOutput, register
 from verl.experimental.agent_loop.tool_parser import FunctionCall, ToolParser
+from verl.experimental.agent_loop.utils import add_generation_prompt_for_gpt_oss, format_gpt_oss_tool_response_manually
 from verl.interactions.base import BaseInteraction
 from verl.interactions.utils.interaction_registry import initialize_interactions_from_config
 from verl.tools.schemas import ToolResponse
@@ -261,8 +262,10 @@ async def _handle_processing_tools_state(self, agent_data: AgentData) -> AgentSt
         new_images_this_turn: list[Any] = []  # Local variable instead of agent_data attribute

         tasks = []
+        tool_call_names = []
         for tool_call in agent_data.tool_calls[: self.max_parallel_calls]:
             tasks.append(self._call_tool(tool_call, agent_data.tools_kwargs))
+            tool_call_names.append(tool_call.name)

         with simple_timer("tool_calls", agent_data.metrics):
             responses = await asyncio.gather(*tasks)
@@ -341,11 +344,25 @@ async def _handle_processing_tools_state(self, agent_data: AgentData) -> AgentSt
             model_inputs = self.processor(text=[raw_tool_response], images=current_images, return_tensors="pt")
             response_ids = model_inputs.pop("input_ids").squeeze(0).tolist()
         else:
-            response_ids = await self.loop.run_in_executor(
-                None,
-                lambda: self.tokenizer.apply_chat_template(add_messages, add_generation_prompt=True, tokenize=True),
-            )
-            response_ids = response_ids[len(self.system_prompt) :]
+            if self.tool_parser == "gpt-oss":
+                logger.info("manually format tool responses for gpt-oss")
+                # Format tool responses manually
+                tool_response_texts = []
+                for i, tool_msg in enumerate(add_messages):
+                    actual_tool_name = tool_call_names[i]
+                    formatted = format_gpt_oss_tool_response_manually(tool_msg["content"], actual_tool_name)
+                    tool_response_texts.append(formatted)
+
+                tool_response_text = add_generation_prompt_for_gpt_oss("".join(tool_response_texts))
+                response_ids = await self.loop.run_in_executor(
+                    None, lambda: self.tokenizer.encode(tool_response_text, add_special_tokens=False)
+                )
+            else:
+                response_ids = await self.loop.run_in_executor(
+                    None,
+                    lambda: self.tokenizer.apply_chat_template(add_messages, add_generation_prompt=True, tokenize=True),
+                )
+                response_ids = response_ids[len(self.system_prompt) :]
         if len(agent_data.response_mask) + len(response_ids) >= self.response_length:
             return AgentState.TERMINATED
         # Update prompt_ids and response_mask
```
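The gpt-oss branch above pairs each tool message with the tool name recorded at dispatch time, formats them manually, and appends a single generation prompt before tokenizing. A self-contained sketch of that pairing (tool names and contents are hypothetical; the helpers are restated here for illustration):

```python
# Sketch of the gpt-oss branch: pair tool messages with recorded call names,
# format each manually, then append a single generation prompt.
def format_gpt_oss_tool_response_manually(tool_response: str, tool_call_name: str) -> str:
    return f"<|start|>functions.{tool_call_name} to=assistant<|channel|>commentary<|message|>{tool_response}<|end|>"

def add_generation_prompt_for_gpt_oss(message_content: str) -> str:
    return message_content + "<|start|>assistant"

# Hypothetical parallel tool calls and their responses
tool_call_names = ["search", "calculator"]
add_messages = [{"role": "tool", "content": "3 results"},
                {"role": "tool", "content": "7"}]

tool_response_texts = [
    format_gpt_oss_tool_response_manually(msg["content"], name)
    for name, msg in zip(tool_call_names, add_messages)
]
tool_response_text = add_generation_prompt_for_gpt_oss("".join(tool_response_texts))

print(tool_response_text.count("<|end|>"))  # → 2 (one per tool response)
```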
verl/experimental/agent_loop/utils.py (new file; path inferred from the import statements in the other changed files)

Lines changed: 38 additions & 0 deletions

```python
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# tokenizer.apply_chat_template is not working properly for the gpt-oss model,
# because the chat template requires tool call messages to parse tool response
# messages, so we need to format the tool response manually.
def format_gpt_oss_tool_response_manually(tool_response: str, tool_call_name: str) -> str:
    """Format tool response for gpt-oss model.

    Args:
        tool_response: Tool response string
        tool_call_name: Name of the tool that was called

    Returns:
        Formatted tool response string
    """
    return f"<|start|>functions.{tool_call_name} to=assistant<|channel|>commentary<|message|>{tool_response}<|end|>"


def add_generation_prompt_for_gpt_oss(message_content: str) -> str:
    """Add generation prompt for gpt-oss model.

    Args:
        message_content: Message content string

    Returns:
        Message content string with generation prompt
    """
    return message_content + "<|start|>assistant"
```
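Composed, the two helpers produce a complete harmony-format tool turn ending in a generation prompt. A usage sketch, assuming a hypothetical tool `code_interpreter` that returned `42` (the helpers are restated so the snippet runs standalone):

```python
# Usage sketch of the helpers defined above; "code_interpreter" and "42"
# are hypothetical example values.
def format_gpt_oss_tool_response_manually(tool_response: str, tool_call_name: str) -> str:
    return f"<|start|>functions.{tool_call_name} to=assistant<|channel|>commentary<|message|>{tool_response}<|end|>"

def add_generation_prompt_for_gpt_oss(message_content: str) -> str:
    return message_content + "<|start|>assistant"

formatted = format_gpt_oss_tool_response_manually("42", "code_interpreter")
prompt = add_generation_prompt_for_gpt_oss(formatted)
print(prompt)
# → <|start|>functions.code_interpreter to=assistant<|channel|>commentary<|message|>42<|end|><|start|>assistant
```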
