Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
fd7a569
[FEAT] Support async multi-turn rollout with simulation feedback
kinza99 May 21, 2025
1c13235
[DOC] Update sglang multi-turn rollout doc
kinza99 May 22, 2025
6c2cf1b
[Update] update user interaction design
kinza99 May 28, 2025
c0176c1
[Update] add testing and fix bugs
kinza99 May 29, 2025
d560cf3
[Fix] fix some problems
kinza99 May 30, 2025
5fbdd11
Fix unit-test and separate examples from previous tool
SwordFaith May 30, 2025
cb4baa7
Fix megatron workers and formatting
SwordFaith May 30, 2025
ed070cc
[Update] merge the latest main version
kinza99 Jun 4, 2025
fbfdcd0
Add training script
SwordFaith Jun 4, 2025
4ea2f1a
Fix assertion
SwordFaith Jun 4, 2025
4b18b69
Fix max_turns
SwordFaith Jun 4, 2025
878b1aa
Fix init interaction missing
SwordFaith Jun 4, 2025
8023dcb
Fix interface
SwordFaith Jun 4, 2025
dc3157e
Lower gpu mem foot print
SwordFaith Jun 4, 2025
3104159
Merge remote-tracking branch 'upstream/main' into multi_turns_with_fe…
SwordFaith Jun 4, 2025
cc31550
Fix init interaction missing issue
SwordFaith Jun 4, 2025
ae9217a
[Fix] fix problem with exceeding max_new_tokens
kinza99 Jun 5, 2025
cbddd2e
Update training data path
SwordFaith Jun 5, 2025
0c4338f
Fix prompt in preprocess interaction
SwordFaith Jun 6, 2025
eec48a9
Add 0.5b train script
SwordFaith Jun 7, 2025
564c832
Fix gsm8k reward in multi-turn scene
SwordFaith Jun 7, 2025
b75af4e
Try fix race condition in sampling params update
SwordFaith Jun 9, 2025
ca47f66
Fix sglang rollout sampling params
SwordFaith Jun 9, 2025
64a8ad7
Merge remote-tracking branch 'upstream/main' into multi_turns_with_fe…
SwordFaith Jun 10, 2025
34378ce
Fix bug and redundant error
SwordFaith Jun 10, 2025
ddb4881
Remove format config
SwordFaith Jun 10, 2025
9592979
Merge remote-tracking branch 'upstream/main' into multi_turns_with_fe…
SwordFaith Jun 17, 2025
92a47ba
Fix interaction config default value bug
SwordFaith Jun 17, 2025
8f45437
Fix default value judge logic
SwordFaith Jun 17, 2025
98adc05
Fix arg issue
SwordFaith Jun 17, 2025
2d43631
Fix format error
SwordFaith Jun 17, 2025
9a86042
Fix other format errors
SwordFaith Jun 17, 2025
5cb7a60
Clean training scripts
SwordFaith Jun 17, 2025
7e83b72
Merge remote-tracking branch 'upstream/main' into multi_turns_with_fe…
SwordFaith Jun 18, 2025
7f426f3
Merge branch 'main' into duhe/multi_turns_with_feedback
SwordFaith Jun 18, 2025
13a9615
Fix sf tool test
SwordFaith Jun 18, 2025
3c4a351
Fix ci error
SwordFaith Jun 18, 2025
ffc9366
Fix aglang tests
SwordFaith Jun 18, 2025
c0a035c
Merge upstream/main into multi_turns_with_feedback
SwordFaith Jun 20, 2025
21110df
Add test and doc for interaction
SwordFaith Jun 20, 2025
fd11f7b
Try fix mcp tool test
SwordFaith Jun 20, 2025
a1a021f
Fix sglang mcp tools test
SwordFaith Jun 20, 2025
df5de70
Fix pre-commit run issue
SwordFaith Jun 20, 2025
2abda2e
Fix doc and test
SwordFaith Jun 20, 2025
432eeff
Merge branch 'main' into duhe/multi_turns_with_feedback
zhaochenyang20 Jun 20, 2025
c5fe07a
Try fix ci issues
SwordFaith Jun 21, 2025
df42678
Fix chat completion arg bug
SwordFaith Jun 21, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
[Update] update user interaction design
  • Loading branch information
kinza99 committed May 29, 2025
commit 6c2cf1b31f590bdcb70ecbe9cf653dae7f860ea4

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
interaction:
- class_name: "verl.interactions.gsm8k_interaction.Gsm8kInteraction"
config: {}
78 changes: 0 additions & 78 deletions verl/feedbacks/base.py

This file was deleted.

File renamed without changes.
63 changes: 63 additions & 0 deletions verl/interactions/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Copyright 2023-2024 SGLang Team
# Copyright 2025 ModelBest Inc. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, Dict, Tuple


class BaseInteraction:
    """Abstract base class for multi-turn rollout interactions.

    Subclasses implement a session-oriented protocol:
    ``start_interaction`` opens a session and returns its ID,
    ``generate_response`` produces the simulated-user turn,
    ``calculate_score`` scores a turn, and ``finalize_interaction``
    releases session state.
    """

    def __init__(self, config: Dict[str, Any]):
        # Raw configuration mapping; subclasses read their own keys from it.
        self.config = config
        # Role name of the simulated agent; generic default.
        self.name: str = config.get("name", "interaction_agent")

    async def start_interaction(self, **kwargs) -> str:
        """Initialize a new interaction session and return its unique ID.

        Simulates: get id + state init. ``**kwargs`` lets subclasses accept
        extra session parameters (e.g. ``instance_id``, ``ground_truth``)
        while remaining signature-compatible with this base method.
        """
        # ...implement the logic to get ID and initialize state...
        interaction_id = "some_unique_id"
        return interaction_id

    async def generate_response(self, messages: Any, **kwargs) -> Tuple[bool, str, float, Dict[str, Any]]:
        """Generate the simulated-user response for the current turn.

        Args:
            messages: The conversation so far (format defined by the caller).
            **kwargs: Subclass-specific extras (e.g. ``instance_id``).

        Returns:
            A tuple of:
            - should_terminate_sequence (bool): True if the interaction
              sequence (rollout) should end.
            - response_content (str): The textual content of the response.
            - current_turn_score (float): The score for this specific turn.
            - additional_data (dict): Any extra information or metadata.
        """
        should_terminate_sequence: bool = False  # if True, end rollout
        response_content: str = "Your current result seems acceptable."
        current_turn_score: float = 0.8
        additional_data: Dict[str, Any] = {}
        return should_terminate_sequence, response_content, current_turn_score, additional_data

    async def calculate_score(self, **kwargs) -> float:
        """Calculate a turn-level score for the interaction.

        Should be invoked at turn level. Defaults to 0.0 (rather than
        raising NotImplementedError) so that interactions which do not
        score have no impact on the aggregated reward.
        """
        # ...implement the logic to calculate turn-level score...
        score = 0.0
        return score

    async def finalize_interaction(self, **kwargs) -> None:
        """Finalize the interaction session and release associated state.

        Simulates: release state.
        """
        # ...implement the logic to release state...
        pass
Original file line number Diff line number Diff line change
Expand Up @@ -20,25 +20,26 @@

from verl.utils.reward_score import gsm8k

from .base import BaseFeedback
from .base import BaseInteraction

logger = logging.getLogger(__name__)
logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))


class Gsm8kFeedback(BaseFeedback):
"""A demo feedback for calculating the reward of gsm8k.
class Gsm8kInteraction(BaseInteraction):
"""A demo interaction for calculating the reward of gsm8k.

- `create`: create a feedback instance for a trajectory.
- `get_feedback`: get the feedback of the user.
- `release`: release the feedback instance.
- `start_interaction`: start an interaction instance for a trajectory.
- `generate_response`: generate the response of the user.
- `calculate_score`: calculate the score of the interaction.
- `finalize_interaction`: finalize the interaction instance.
"""

def __init__(self, config: dict):
super().__init__(config)
self._instance_dict = {}

async def create(self, instance_id: Optional[str] = None, ground_truth: Optional[str] = None, **kwargs) -> str:
async def start_interaction(self, instance_id: Optional[str] = None, ground_truth: Optional[str] = None, **kwargs) -> str:
if instance_id is None:
instance_id = str(uuid4())
self._instance_dict[instance_id] = {
Expand All @@ -48,30 +49,30 @@ async def create(self, instance_id: Optional[str] = None, ground_truth: Optional
}
return instance_id

async def get_feedback(self, instance_id: str, messages: List[Dict[str, Any]], **kwargs) -> Tuple[str, float, dict]:
content = ''
async def generate_response(self, instance_id: str, messages: List[Dict[str, Any]], **kwargs) -> Tuple[str, float, dict]:
content = ""
for i in range(len(messages) - 1, -1, -1):
item = messages[i]
if item.get('role') == 'user':
content = item.get('content')
if item.get("role") == "user":
content = item.get("content")
break

if content.startswith("#### "):
self._instance_dict[instance_id]["response"] = content
else:
self._instance_dict[instance_id]["response"] = "#### " + content

reward = await self.calc_reward(instance_id)
reward = await self.calculate_score(instance_id)
if reward == 1.0:
feedback = "Your response is correct!"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

feedback -> response

go_on = False
should_terminate_sequence = True
else:
feedback = "Your response is incorrect! You need to reflect on your answer and try again."
go_on = True
should_terminate_sequence = False

return f"{feedback=} {reward=}", go_on, {}
return should_terminate_sequence, f"{feedback=}", reward, {}

async def calc_reward(self, instance_id: str, **kwargs) -> float:
async def calculate_score(self, instance_id: str, **kwargs) -> float:
return gsm8k.compute_score(
self._instance_dict[instance_id]["response"],
self._instance_dict[instance_id]["ground_truth"],
Expand All @@ -80,5 +81,5 @@ async def calc_reward(self, instance_id: str, **kwargs) -> float:
score=1.0,
)

async def release(self, instance_id: str, **kwargs) -> None:
async def finalize_interaction(self, instance_id: str, **kwargs) -> None:
del self._instance_dict[instance_id]
6 changes: 3 additions & 3 deletions verl/trainer/config/ppo_trainer.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -140,11 +140,11 @@ actor_rollout_ref:
do_sample: False # default eager for validation
multi_turn:
enable: False # should set rollout.name to sglang_async if True
max_turns: null # null for no limit (default max_length // 3)
max_assistant_turns: null # null for no limit (default max_length // 3)
tool_config_path: null # null for no tool
feedback_config_path: null # null for no feedback
interaction_config_path: null # null for no interaction
format: chatml # chatml, more formats will be supported in the future
user_max_turns: 1
max_user_turns: null

critic:
rollout_n: ${actor_rollout_ref.rollout.n}
Expand Down
9 changes: 5 additions & 4 deletions verl/workers/rollout/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ class AsyncRolloutRequest(BaseModel):
loss_mask: List[int]
prompt_loss_mask: List[int]
response_loss_mask: List[int]
reward_scores: Dict[str, float]
reward_scores: Dict[str, List[float]]
max_response_len: int = 8192
max_model_len: int = 32768
metrics: Dict[str, List[Any]] = {}
Expand All @@ -108,7 +108,7 @@ class AsyncRolloutRequest(BaseModel):
},
"user_prefix_msg": "\n<|im_start|>user",
"user_suffix_msg": "<|im_end|>",
}
},
}

def get_generation_prompt(self, tokenizer: PreTrainedTokenizer) -> list[int]:
Expand All @@ -118,6 +118,7 @@ def get_generation_prompt(self, tokenizer: PreTrainedTokenizer) -> list[int]:
add_generation_prompt=True,
tokenize=True,
)

def add_user_message(
self,
tokenizer: PreTrainedTokenizer,
Expand Down Expand Up @@ -158,7 +159,7 @@ def add_user_message(
raise ValueError(f"Unsupported format: {format}")
assert len(self.input_ids) == len(self.attention_mask) == len(self.position_ids) == len(self.loss_mask), f"""Request {self.request_id} has different length of {len(self.input_ids)=},
{len(self.attention_mask)=}, {len(self.position_ids)=}, {len(self.loss_mask)=}"""

def add_assistant_message(
self,
tokenizer: PreTrainedTokenizer,
Expand Down Expand Up @@ -263,7 +264,7 @@ def update_metrics(self, metrics: Any, tool_id: str) -> None:
def finalize(
self,
tokenizer: PreTrainedTokenizer,
reward_scores: Dict[str, float],
reward_scores: Dict[str, List[float]],
finish_reason_type: FinishReasonTypeEnum = FinishReasonTypeEnum.STOP,
) -> None:
self.state = AsyncRolloutRequestStateEnum.COMPLETED
Expand Down
Loading