Commit 2967543

[megatron] fix: VLMs using fused kernels (verl-project#3849)
### What does this PR do?

> Add **concise** overview of what this PR aims to achieve or accomplish. Reference related GitHub issues and PRs that help with the review.

Currently, we get an error about the unexpected keyword argument `visual_pos_masks`. This is because mbridge also customized the `GPTModel` forward for Qwen3VL to support deepstack: https://github.com/ISEEKYAN/mbridge/blob/ecbdfbdfdc8027004702149d6dc87fbad7417708/mbridge/models/qwen3_vl/gpt_model.py#L84

Since mcore v0.13.0 introduced `_postprocess` and `_preprocess`, and our patch focuses on `_postprocess`, I also cleaned up the function for better maintainability and fixed this extra deepstack argument issue. We can't simply patch `_postprocess`, because we also need to pass the `temperature` argument:

```logs
output = self.forward_backward_batch(
/verl_megatron/verl/workers/actor/megatron_actor.py", line 598, in forward_backward_batch
    losses_reduced = forward_backward_func(
miniconda/envs/qwenvl/lib/python3.10/site-packages/megatron/core/pipeline_parallel/schedules.py", line 500, in forward_backward_no_pipelining
    output_tensor, num_tokens = forward_step(
.......
verl_megatron/verl/models/mcore/model_forward_fused.py", line 136, in fused_forward_qwen2_5_vl
    output_orig: CausalLMOutputForPPO = model(
......
mbridge-main/mbridge/models/qwen3_vl/model.py", line 323, in forward
    output = self.language_model(
envs/qwenvl/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1773, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
/envs/qwenvl/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1784, in _call_impl
    return forward_call(*args, **kwargs)
TypeError: _fused_GPTModel_forward() got an unexpected keyword argument 'visual_pos_masks'
```

In addition, there is a shape mismatch error when calculating `mrope` if we pass `position_ids` in `fused_forward_qwen2_5_vl`. I tried to debug it, but the shape passed there doesn't make sense, and since https://github.com/volcengine/verl/blob/981d781db932ff53a0c584fd501dcd73ce2a8077/verl/models/mcore/model_forward.py#L117 notes that the model will calculate `position_ids`, I followed that code and stopped passing the position ids. This works for both Qwen2.5VL and Qwen3VL without throwing further errors.

### Checklist Before Starting

- [X] Search for similar PRs. Paste at least one query link here: ...
- [X] Format the PR title as `[{modules}] {type}: {description}` (This will be checked by the CI)
  - `{modules}` include `fsdp`, `megatron`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data`
  - If this PR involves multiple modules, separate them with `,` like `[megatron, fsdp, doc]`
  - `{type}` is in `feat`, `fix`, `refactor`, `chore`, `test`
  - If this PR breaks any API (CLI arguments, config, function signature, etc.), add `[BREAKING]` to the beginning of the title.
  - Example: `[BREAKING][fsdp, megatron] feat: dynamic batching`

### Test

> For changes that can not be tested by CI (e.g., algorithm implementation, new model support), validate by experiment(s) and show results like training curve plots, evaluation results, etc.

### API and Usage Example

> Demonstrate how the API changes if any, and provide usage example(s) if possible.

```python
# Add code snippet or script demonstrating how to use this
```

### Design & Code Changes

> Demonstrate the high-level design if this PR is complex, and list the specific changes.

### Checklist Before Submitting

> [!IMPORTANT]
> Please check all the following items before requesting a review, otherwise the reviewer might deprioritize this PR for review.

- [X] Read the [Contribute Guide](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md).
- [X] Apply [pre-commit checks](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md#code-linting-and-formatting): `pre-commit install && pre-commit run --all-files --show-diff-on-failure --color=always`
- [X] Add / Update [the documentation](https://github.com/volcengine/verl/tree/main/docs).
- [X] Add unit or end-to-end test(s) to [the CI workflow](https://github.com/volcengine/verl/tree/main/.github/workflows) to cover all the code. If not feasible, explain why: ...
- [X] Once your PR is ready for CI, send a message in [the `ci-request` channel](https://verl-project.slack.com/archives/C091TCESWB1) in [the `verl` Slack workspace](https://join.slack.com/t/verl-project/shared_invite/zt-3855yhg8g-CTkqXu~hKojPCmo7k_yXTQ). (If not accessible, please try [the Feishu group (飞书群)](https://applink.larkoffice.com/client/chat/chatter/add_by_link?link_token=772jd4f1-cd91-441e-a820-498c6614126a).)

Signed-off-by: Hollow Man <hollowman@opensuse.org>
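The fix described above boils down to two rules: forward only the multimodal tensors that are actually present, so a patched `GPTModel` forward never sees unexpected keys such as the deepstack-specific `visual_pos_masks`, and let vision-language models compute their own `position_ids` for mrope. Below is a minimal sketch of that pattern; the helper names `build_multimodal_kwargs` and `call_model` are illustrative only and are not part of this patch.

```python
# Illustrative sketch of the forwarding pattern used by this PR (hypothetical helper names).
import torch


def build_multimodal_kwargs(multi_modal_inputs: dict, device: torch.device) -> dict:
    """Collect only the multimodal tensors that are present in the batch."""
    kwargs = {}
    for key in ("pixel_values", "image_grid_thw"):
        if key in multi_modal_inputs:
            kwargs[key] = multi_modal_inputs[key].to(device)
    return kwargs


def call_model(model, input_ids, position_ids, packed_seq_params, multi_modal_inputs, vision_model: bool):
    """Forward pass sketch: VLMs compute position_ids internally, text models still receive them."""
    model_kwargs = build_multimodal_kwargs(multi_modal_inputs, input_ids.device)
    return model(
        input_ids=input_ids,
        attention_mask=None,
        position_ids=None if vision_model else position_ids,
        packed_seq_params=packed_seq_params,
        **model_kwargs,
    )
```

Only passing keys that exist keeps the language-model forward signature unchanged for text-only models while still supporting VLM inputs.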
1 parent 81ee664 commit 2967543

File tree

12 files changed: +181 additions, −375 deletions

.github/workflows/.deprecate/e2e_prime.yml

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@ jobs:
       HF_ENDPOINT: "https://hf-mirror.com"
       HF_HUB_ENABLE_HF_TRANSFER: "0" # This is more stable
     container:
-      image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.0-te2.3
+      image: "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2"
       options: --gpus all --shm-size=10g
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

.github/workflows/README.md

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ permissions:
   contents: read
 
 env:
-  IMAGE: "your vemlp image" # e.g. "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.4-vllm0.8.5-mcore0.12.2"
+  IMAGE: "your vemlp image" # e.g. "verl-ci-cn-beijing.cr.volces.com/verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2"
   DYNAMIC_RUNNER_URL: "https://sd10g3clalm04ug7alq90.apigateway-cn-beijing.volceapi.com/runner" # public veFaas api
 
 jobs:

docker/Dockerfile.extention.awsefa

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 # Base Image support aws EFA
 # Build Image with frameworks based on this
-FROM verlai/verl:app-verl0.5-sglang0.4.6.post5-mcore0.12.2
+FROM verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2
 
 # For aws instances with EFA net interface (Sagemaker AI Pod)
 # install EFA driver:

docs/start/multinode.rst

Lines changed: 1 addition & 1 deletion
@@ -334,7 +334,7 @@ Once the fleet is created, define a Ray cluster task, e.g. in ``ray-cluster.dsta
       - PYTHONUNBUFFERED=1
       - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
 
-    image: whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6-mcore0.12.0-te2.2
+    image: verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2
     commands:
       - git clone https://github.com/volcengine/verl
       - cd verl

verl/models/mcore/model_forward.py

Lines changed: 59 additions & 133 deletions
@@ -21,105 +21,42 @@
     postprocess_packed_seqs_no_padding,
     preprocess_packed_seqs,
     preprocess_packed_seqs_no_padding,
-    recover_left_padding,
-    remove_left_padding,
 )
 
 
-def gptmodel_forward(
-    model,
-    input_ids,
-    attention_mask,
-    position_ids,
-    sequence_parallel,
-    value_model=False,
-    pack_seqs=True,
-    logits_processor=None,
-    logits_processor_args: dict = None,
-    **kwargs,
-):
-    """Default forward pass for GPT models with optional sequence packing."""
-    pre_process = unwrap_model(model).pre_process
-    post_process = unwrap_model(model).post_process
-    if pack_seqs:
-        batch_size, seq_len = attention_mask.shape[:2]
-        input_ids_rmpad, packed_seq_params = preprocess_packed_seqs(input_ids, attention_mask, pre_process=pre_process)
-        input_ids_rmpad = input_ids_rmpad.contiguous()
-        output_orig = model(
-            input_ids=input_ids_rmpad,
-            attention_mask=None,
-            position_ids=position_ids,
-            packed_seq_params=packed_seq_params,
-        )
-        if post_process and logits_processor is not None:
-            args = {
-                k: preprocess_packed_seqs(v, attention_mask, pre_process=True)[0]
-                for k, v in logits_processor_args.items()
-            }
-            output_dict = logits_processor(output_orig, **args)
-            output = {
-                k: postprocess_packed_seqs(
-                    v, packed_seq_params, attention_mask, batch_size, seq_len, post_process=post_process
-                )
-                for k, v in output_dict.items()
-            }
-        else:
-            output = postprocess_packed_seqs(
-                output_orig, packed_seq_params, attention_mask, batch_size, seq_len, post_process=post_process
-            )
-    else:
-        batch_size, sequence_length = attention_mask.shape
-        new_input_ids, new_attention_mask, new_position_ids = remove_left_padding(
-            input_ids, attention_mask, position_ids, sequence_parallel, pre_process=pre_process
-        )
-        output = model(input_ids=new_input_ids, attention_mask=new_attention_mask, position_ids=new_position_ids)
-        if post_process:
-            output = logits_processor(output, **logits_processor_args)
-        output = recover_left_padding(
-            output, new_attention_mask, attention_mask, sequence_length, post_process=post_process
-        )
-    if value_model and post_process:
-        output = output[..., 0]
-    return output
+def model_forward_gen(vision_model: bool = False):
+    def model_forward(
+        model,
+        input_ids,
+        attention_mask,
+        position_ids,
+        multi_modal_inputs: dict,
+        logits_processor=None,
+        logits_processor_args: dict = None,
+        value_model=False,
+    ):
+        """Forward pass for models with sequence packing."""
+        pre_process = (
+            unwrap_model(model).pre_process if not vision_model else True
+        )  # vision model always needs pre_process
+        post_process = unwrap_model(model).post_process
 
+        model_kwargs = {}
+        if "pixel_values" in multi_modal_inputs:
+            model_kwargs["pixel_values"] = multi_modal_inputs["pixel_values"].to(input_ids.device)
+        if "image_grid_thw" in multi_modal_inputs:
+            model_kwargs["image_grid_thw"] = multi_modal_inputs["image_grid_thw"].to(input_ids.device)
 
-def gptmodel_forward_qwen2_5_vl(
-    model,
-    input_ids,
-    attention_mask,
-    position_ids,
-    sequence_parallel,
-    value_model=False,
-    pack_seqs=True,
-    multi_modal_inputs=None,
-    logits_processor=None,
-    logits_processor_args: dict = None,
-    **kwargs,
-):
-    from megatron.core import parallel_state as mpu
-
-    assert mpu.get_context_parallel_world_size() == 1, "qwen2_5_vl's context parallel is not accurate yet"
-    pre_process = unwrap_model(model).pre_process
-    post_process = unwrap_model(model).post_process
-    pixel_values = (
-        multi_modal_inputs["pixel_values"].to(input_ids.device) if "pixel_values" in multi_modal_inputs else None
-    )
-    image_grid_thw = (
-        multi_modal_inputs["image_grid_thw"].to(input_ids.device) if "image_grid_thw" in multi_modal_inputs else None
-    )
-    if pack_seqs:
         batch_size, seq_len = attention_mask.shape[:2]
-        input_ids_rmpad, packed_seq_params = preprocess_packed_seqs(input_ids, attention_mask, pre_process=True)
+        input_ids_rmpad, packed_seq_params = preprocess_packed_seqs(input_ids, attention_mask, pre_process=pre_process)
         input_ids_rmpad = input_ids_rmpad.contiguous()
         output_orig = model(
             input_ids=input_ids_rmpad,
             attention_mask=None,
-            position_ids=None,  # model will calculate position_ids
+            position_ids=position_ids if not vision_model else None,  # vision models will calculate position_ids
             packed_seq_params=packed_seq_params,
-            pixel_values=pixel_values,
-            image_grid_thw=image_grid_thw,
+            **model_kwargs,
         )
-
         if post_process and logits_processor is not None:
             args = {
                 k: preprocess_packed_seqs(v, attention_mask, pre_process=True)[0]
@@ -136,66 +73,55 @@ def gptmodel_forward_qwen2_5_vl
             output = postprocess_packed_seqs(
                 output_orig, packed_seq_params, attention_mask, batch_size, seq_len, post_process=post_process
             )
-    else:
-        batch_size, sequence_length = attention_mask.shape
-        new_input_ids, new_attention_mask, new_position_ids = remove_left_padding(
-            input_ids, attention_mask, position_ids, sequence_parallel, pre_process=pre_process
-        )
-        output = model(
-            input_ids=new_input_ids,
-            position_ids=new_position_ids,
-            attention_mask=new_attention_mask,
-            pixel_values=pixel_values,
-            image_grid_thw=image_grid_thw,
-        )
-        output = recover_left_padding(
-            output, new_attention_mask, attention_mask, sequence_length, post_process=post_process
-        )
-    if value_model and post_process:
-        output = output[..., 0]
-    return output
+        if value_model and post_process:
+            output = output[..., 0]
+        return output
+
+    return model_forward
 
 
 def gptmodel_forward_no_padding(
     model,
     input_ids,
-    value_model=False,
-    pack_seqs=True,
+    multi_modal_inputs: dict,
     logits_processor=None,
     logits_processor_args: dict = None,
-    **kwargs,
+    value_model=False,
 ):
     """Default forward pass for GPT models with optional sequence packing."""
     pre_process = unwrap_model(model).pre_process
     post_process = unwrap_model(model).post_process
-    if pack_seqs:
-        batch_size = input_ids.shape[0]
-        input_ids_rmpad, packed_seq_params = preprocess_packed_seqs_no_padding(input_ids, pre_process=pre_process)
-        input_ids_rmpad = input_ids_rmpad.contiguous()
-        output_orig = model(
-            input_ids=input_ids_rmpad,
-            attention_mask=None,
-            position_ids=None,
-            packed_seq_params=packed_seq_params,
-        )
 
-        if post_process and logits_processor is not None:
-            args = {
-                k: preprocess_packed_seqs_no_padding(v, pre_process=True)[0] for k, v in logits_processor_args.items()
-            }
-            output_dict = logits_processor(output_orig, **args)
-            output = {
-                k: postprocess_packed_seqs_no_padding(
-                    v, packed_seq_params, input_ids, batch_size, post_process=post_process
-                )
-                for k, v in output_dict.items()
-            }
-        else:
-            output = postprocess_packed_seqs_no_padding(
-                output_orig, packed_seq_params, input_ids, batch_size, post_process=post_process
+    model_kwargs = {}
+    if "pixel_values" in multi_modal_inputs:
+        model_kwargs["pixel_values"] = multi_modal_inputs["pixel_values"].to(input_ids.device)
+    if "image_grid_thw" in multi_modal_inputs:
+        model_kwargs["image_grid_thw"] = multi_modal_inputs["image_grid_thw"].to(input_ids.device)
+
+    batch_size = input_ids.shape[0]
+    input_ids_rmpad, packed_seq_params = preprocess_packed_seqs_no_padding(input_ids, pre_process=pre_process)
+    input_ids_rmpad = input_ids_rmpad.contiguous()
+    output_orig = model(
+        input_ids=input_ids_rmpad,
+        attention_mask=None,
+        position_ids=None,
+        packed_seq_params=packed_seq_params,
+        **model_kwargs,
+    )
+
+    if post_process and logits_processor is not None:
+        args = {k: preprocess_packed_seqs_no_padding(v, pre_process=True)[0] for k, v in logits_processor_args.items()}
+        output_dict = logits_processor(output_orig, **args)
+        output = {
+            k: postprocess_packed_seqs_no_padding(
+                v, packed_seq_params, input_ids, batch_size, post_process=post_process
             )
+            for k, v in output_dict.items()
+        }
     else:
-        raise NotImplementedError("gptmodel_forward_no_padding only supports packed sequences")
+        output = postprocess_packed_seqs_no_padding(
+            output_orig, packed_seq_params, input_ids, batch_size, post_process=post_process
+        )
 
     if value_model and post_process:
         # output = output[..., 0]
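With this refactor, callers obtain a forward function from the `model_forward_gen` factory instead of choosing between the removed `gptmodel_forward` and `gptmodel_forward_qwen2_5_vl`. A minimal usage sketch follows; `is_vlm`, `actor_module`, and `batch` are hypothetical caller-side names, not identifiers from this diff.

```python
# Hypothetical caller-side sketch; only model_forward_gen comes from this commit.
from verl.models.mcore.model_forward import model_forward_gen

forward_fn = model_forward_gen(vision_model=is_vlm)  # is_vlm: whether the policy is a VLM
output = forward_fn(
    model=actor_module,
    input_ids=batch["input_ids"],
    attention_mask=batch["attention_mask"],
    position_ids=batch["position_ids"],
    multi_modal_inputs=batch.get("multi_modal_inputs", {}),  # empty dict for text-only batches
    logits_processor=None,
    logits_processor_args=None,
    value_model=False,
)
```

For vision models, `forward_fn` ignores the supplied `position_ids` and lets the model compute them, matching the behavior described in the PR description.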
