support for llama3-11b-vision

verl-project · SchumiDing · Jan 31, 2026 · Jan 31, 2026 · Feb 5, 2026 · Jan 31, 2026
commit 610b14bbc6f9a306d2d303f75a26165a2dc11387
diff --git a/verl/experimental/agent_loop/agent_loop.py b/verl/experimental/agent_loop/agent_loop.py
@@ -682,14 +682,20 @@ def _compute_position_ids(self, input_ids, attention_mask, multi_modal_inputs) -
         image_grid_thw = multi_modal_inputs.get("image_grid_thw")
         video_grid_thw = multi_modal_inputs.get("video_grid_thw")
 
-        # Model's get_rope_index has been dynamically bind to the processor.
-        vision_position_ids, _ = self.processor.get_rope_index(
-            input_ids=input_ids,
-            image_grid_thw=image_grid_thw,
-            video_grid_thw=video_grid_thw,
-            attention_mask=attention_mask,
-        )
-        vision_position_ids = vision_position_ids.transpose(0, 1)  # (3, 1, seq_len) => (1, 3, seq_len)
+
+        vision_position_ids = None
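+        if video_grid_thw is not None: # Some models that use cross attention do not need vision position ids. NOTE(review): image-only inputs may also have video_grid_thw=None and would then skip get_rope_index; confirm whether image_grid_thw should be checked here too.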
+            # The model's get_rope_index has been dynamically bound to the processor.
+            vision_position_ids, _ = self.processor.get_rope_index(
+                input_ids=input_ids,
+                image_grid_thw=image_grid_thw,
+                video_grid_thw=video_grid_thw,
+                attention_mask=attention_mask,
+            )
+            vision_position_ids = vision_position_ids.transpose(0, 1)  # (3, 1, seq_len) => (1, 3, seq_len)
+
+        if vision_position_ids is None:
+            return compute_position_id_with_mask(attention_mask)  # (1, seq_len)
 
         valid_mask = attention_mask[0].bool()
         text_position_ids = torch.ones((1, len(input_ids[0])), dtype=torch.long)

diff --git a/verl/utils/dataset/rl_dataset.py b/verl/utils/dataset/rl_dataset.py
@@ -392,8 +392,16 @@ async def process_vision_info(
         """
         from qwen_vl_utils import process_vision_info
 
-        images, videos = process_vision_info(messages, image_patch_size=image_patch_size, return_video_metadata=True)
-        return images, videos
+        if image_patch_size is not None:
+            images, videos = process_vision_info(messages, image_patch_size=image_patch_size, return_video_metadata=True)
+            return images, videos
+        else: # Some processors neither need nor have an image patch size parameter; in that case this function is called with image_patch_size=None to indicate that no image patch size is needed.
+            try:
+                images, videos = process_vision_info(messages, return_video_metadata=True)
+                return images, videos
+            except Exception as e:# process_vision_info may not accept the return_video_metadata parameter; if the call above raises, retry without it.
+                images, videos = process_vision_info(messages)
+                return images, videos
 
     def split(self, num_splits: int):
         """