Merged
Changes shown from 1 commit. Commits in this pull request (23):
8d67b7d  feat: add more robust handling for MM prompt (hhzhang16, Jun 4, 2025)
b65efb5  feat: [WIP] generalize workers (hhzhang16, Jun 4, 2025)
40c2154  Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dep-1… (hhzhang16, Jun 4, 2025)
e13f827  feat: remove cls token (hhzhang16, Jun 4, 2025)
a866d73  Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dep-1… (hhzhang16, Jun 4, 2025)
0adb7e6  Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dep-1… (hhzhang16, Jun 5, 2025)
86c6135  Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dep-1… (hhzhang16, Jun 5, 2025)
19f2158  Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dep-1… (hhzhang16, Jun 6, 2025)
a766509  feat: working multimodal agg for multiple vision models (hhzhang16, Jun 7, 2025)
17aecda  Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dep-1… (hhzhang16, Jun 7, 2025)
496ee57  feat: addressing ci comments (hhzhang16, Jun 9, 2025)
820c7e3  feat: addressing ci comments (hhzhang16, Jun 9, 2025)
bb4f95e  Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dep-1… (hhzhang16, Jun 9, 2025)
027341a  Update examples/multimodal/README.md (hhzhang16, Jun 9, 2025)
0eff4e0  feat: trust remote code when loading autoconfig (hhzhang16, Jun 9, 2025)
d736895  feat: working code for phi3v (hhzhang16, Jun 10, 2025)
36eacb9  docs: add phi3v to multimodal readme (hhzhang16, Jun 10, 2025)
d586343  feat: working for Qwen 2.5 VL (hhzhang16, Jun 11, 2025)
d5025a7  docs: fixing dash issue (hhzhang16, Jun 11, 2025)
1b0efc0  Merge branch 'main' into hannahz/dep-114-generalize-vlm-embedding-ext… (hhzhang16, Jun 11, 2025)
843d586  docs: add readme note about disagg support (hhzhang16, Jun 11, 2025)
d12e86d  Merge branch 'hannahz/dep-114-generalize-vlm-embedding-extraction' of… (hhzhang16, Jun 11, 2025)
073ad67  feat: remove pynvml from this MR (hhzhang16, Jun 11, 2025)
feat: working for Qwen 2.5 VL
hhzhang16 committed Jun 11, 2025
commit d5863438fbac711a7203d2c3694b339d78088975
examples/multimodal/README.md (8 changes: 4 additions & 4 deletions)
@@ -52,7 +52,7 @@ flowchart LR
 cd $DYNAMO_HOME/examples/multimodal
 # Serve a LLaVA 1.5 7B model:
 dynamo serve graphs.agg:Frontend -f ./configs/agg-llava.yaml
-# Serve a Qwen2 VL model:
+# Serve a Qwen2.5-VL model:
 # dynamo serve graphs.agg:Frontend -f ./configs/agg-qwen.yaml
 # Serve a Phi3V model:
 # dynamo serve graphs.agg:Frontend -f ./configs/agg-phi3v.yaml
@@ -89,7 +89,7 @@ curl http://localhost:8000/v1/chat/completions \
 }'
 ```
 
-If serving the example Qwen model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"Qwen/Qwen2-VL-7B-Instruct"`. If serving the example Phi3V model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"microsoft/Phi-3.5-vision-instruct"`.
+If serving the example Qwen model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"Qwen/Qwen2.5-VL-7B-Instruct"`. If serving the example Phi3V model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"microsoft/Phi-3.5-vision-instruct"`.
 
 You should see a response similar to this:
 ```json
@@ -204,7 +204,7 @@ DYNAMO_TAG=$(dynamo build graphs.agg:Frontend | grep "Successfully built" | awk
 export DEPLOYMENT_NAME=multimodal-agg
 # For aggregated serving with LLaVA:
 dynamo deploy $DYNAMO_TAG -n $DEPLOYMENT_NAME -f ./configs/agg-llava.yaml
-# For aggregated serving with Qwen2-VL:
+# For aggregated serving with Qwen2.5-VL:
 # dynamo deploy $DYNAMO_TAG -n $DEPLOYMENT_NAME -f ./configs/agg-qwen.yaml
 # For aggregated serving with Phi3V:
 # dynamo deploy $DYNAMO_TAG -n $DEPLOYMENT_NAME -f ./configs/agg-phi3v.yaml
@@ -249,6 +249,6 @@ curl localhost:8000/v1/chat/completions \
 }'
 ```
 
-If serving the example Qwen model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"Qwen/Qwen2-VL-7B-Instruct"`. If serving the example Phi3V model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"microsoft/Phi-3.5-vision-instruct"`.
+If serving the example Qwen model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"Qwen/Qwen2.5-VL-7B-Instruct"`. If serving the example Phi3V model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"microsoft/Phi-3.5-vision-instruct"`.
 
 For more details on managing deployments, testing, and troubleshooting, please refer to the [Operator Deployment Guide](../../docs/guides/dynamo_deploy/operator_deployment.md).
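To make the model substitution described in the README text above concrete, here is a minimal Python sketch of the same chat-completions request with the Qwen2.5-VL checkpoint in the `model` field. The endpoint and model name come from the README; the message layout, image URL, and `max_tokens` value are illustrative placeholders, so the exact payload should follow the curl body shown in the README.

```python
# Illustrative client call only; mirror the README's curl body for the real payload.
import requests

payload = {
    "model": "Qwen/Qwen2.5-VL-7B-Instruct",  # instead of "llava-hf/llava-1.5-7b-hf"
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is in this image?"},
                # Placeholder image; any reachable URL works here.
                {"type": "image_url", "image_url": {"url": "http://example.com/image.jpg"}},
            ],
        }
    ],
    "max_tokens": 300,
    "stream": False,
}

response = requests.post("http://localhost:8000/v1/chat/completions", json=payload)
print(response.json())
```

Swapping in `microsoft/Phi-3.5-vision-instruct` for the Phi3V example works the same way.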
examples/multimodal/components/decode_worker.py (7 changes: 4 additions & 3 deletions)
@@ -117,7 +117,7 @@ async def async_init(self):
         )
 
         runtime = dynamo_context["runtime"]
-        embeddings_shape, embeddings_dtype = get_vision_embeddings_info(
+        embeddings_shape, self.embeddings_dtype = get_vision_embeddings_info(
             self.engine_args.model, self.engine_args.num_patches
         )
         logger.debug(f"Embeddings shape: {embeddings_shape}")
@@ -139,6 +139,7 @@ async def async_init(self):
             else:
                 self.disaggregated_router = None
         else:
+            EMBEDDINGS_DTYPE = torch.float16
             EMBEDDINGS_DEVICE = "cuda"
 
             enc_comp_ns, enc_comp_name = VllmEncodeWorker.dynamo_address() # type: ignore
@@ -154,7 +155,7 @@
 
             # Create a longer-lived buffer for receiving the image embeddings.
             embeddings = torch.empty(
-                embeddings_shape, dtype=embeddings_dtype, device=EMBEDDINGS_DEVICE
+                embeddings_shape, dtype=EMBEDDINGS_DTYPE, device=EMBEDDINGS_DEVICE
             )
             descriptor = connect.Descriptor(embeddings)
             # Register the descriptor w/ NIXL (this is optional, if not done here the connect subsytem will take care of this automatically).
@@ -290,7 +291,7 @@ async def local_prefill(self, request: vLLMMultimodalRequest) -> tuple:
         )
         # When using disaggregated serving, the encode worker will have provided the key-value cache updates via the encode worker.
         multi_modal_data = construct_mm_data(
-            self.engine_args.model, encode_output, embeddings
+            self.engine_args.model, encode_output, embeddings, self.embeddings_dtype
         )
 
         return prompt_ids, multi_modal_data, remote_prefill_params
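Read end to end, the decode-worker change above (mirrored in the prefill worker below) keeps the vision-embedding dtype reported by `get_vision_embeddings_info` on the worker and passes it to `construct_mm_data`, while the local aggregated path still allocates its receive buffer as float16. A minimal sketch of that flow, assuming illustrative class and attribute names; only the calls visible in the diff are taken as given.

```python
# Condensed sketch of the dtype plumbing introduced by this commit; not the full worker.
import torch

from utils.model import construct_mm_data, get_vision_embeddings_info  # import path assumed


class DecodeWorkerSketch:
    def __init__(self, engine_args):
        self.engine_args = engine_args

    async def async_init(self, remote_prefill: bool):
        # Keep the reported dtype on the worker so later calls can reuse it.
        embeddings_shape, self.embeddings_dtype = get_vision_embeddings_info(
            self.engine_args.model, self.engine_args.num_patches
        )
        if not remote_prefill:
            # Local aggregated path: the receive buffer itself stays float16 on GPU.
            self.embeddings = torch.empty(
                embeddings_shape, dtype=torch.float16, device="cuda"
            )

    async def local_prefill(self, encode_output):
        # construct_mm_data now receives the dtype and casts the buffer as needed.
        return construct_mm_data(
            self.engine_args.model, encode_output, self.embeddings, self.embeddings_dtype
        )
```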
examples/multimodal/components/prefill_worker.py (9 changes: 6 additions & 3 deletions)
@@ -111,12 +111,12 @@ async def async_init(self):
         await self._connector.initialize()
 
         # Create a longer-lived buffer for receiving the image embeddings.
-        embeddings_shape, embeddings_dtype = get_vision_embeddings_info(
+        embeddings_shape, self.embeddings_dtype = get_vision_embeddings_info(
             self.engine_args.model, self.engine_args.num_patches
         )
         embeddings = torch.empty(
             embeddings_shape,
-            dtype=embeddings_dtype,
+            dtype=self.embeddings_dtype,
             device=EMBEDDINGS_DEVICE,
         )
         descriptor = connect.Descriptor(embeddings)
@@ -265,7 +265,10 @@ async def generate(self, request: RemotePrefillRequest):
             prompt=TokensPrompt(
                 prompt_token_ids=prompt_token_ids,
                 multi_modal_data=construct_mm_data(
-                    self.engine_args.model, encode_output, embeddings
+                    self.engine_args.model,
+                    encode_output,
+                    embeddings,
+                    self.embeddings_dtype,
                 ),
             ),
             sampling_params=sampling_params,
examples/multimodal/configs/agg-qwen.yaml (3 changes: 2 additions & 1 deletion)
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 Common:
-  model: Qwen/Qwen2-VL-7B-Instruct
+  model: Qwen/Qwen2.5-VL-7B-Instruct
   block-size: 64
   max-model-len: 4096
 
@@ -29,6 +29,7 @@ VllmDecodeWorker:
   mm-processor-kwargs:
     min_pixels: 784
     max_pixels: 1003520
+    fps: 1
   enable-prefix-caching: true
   image-token-id: 151655
   num-patches: 345
examples/multimodal/utils/model.py (8 changes: 6 additions & 2 deletions)
@@ -68,13 +68,17 @@ def get_vision_embeddings_info(
 
 
 def construct_mm_data(
-    model: str, encode_output: EncodeResponse, image_embeds: torch.Tensor
+    model: str,
+    encode_output: EncodeResponse,
+    image_embeds: torch.Tensor,
+    embeddings_dtype: torch.dtype,
 ) -> Dict[str, torch.Tensor | Dict[str, Any]]:
     """Construct multimodal data for a vLLM request for models that require additional parameters alongside the embeddings"""
+    image_embeds = image_embeds.to(embeddings_dtype)
     if "Qwen2" in model:
         return {
             "image": {
-                "image_embeds": image_embeds.squeeze(0).to(torch.float16),
+                "image_embeds": image_embeds.squeeze(0),
                 "image_grid_thw": torch.tensor(encode_output.image_grid_thw).squeeze(0),
             }
         }
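Putting the plus and minus lines of this hunk together: the helper now takes the embedding dtype as a parameter and casts the buffer once before branching per model family, instead of hard-coding `torch.float16` inside the Qwen branch. A condensed restatement limited to what this hunk shows; the non-Qwen fallback and the untyped `encode_output` parameter are assumptions made to keep the sketch self-contained.

```python
from typing import Any, Dict

import torch


def construct_mm_data_sketch(
    model: str,
    encode_output,  # EncodeResponse; only image_grid_thw is used in this branch
    image_embeds: torch.Tensor,
    embeddings_dtype: torch.dtype,
) -> Dict[str, torch.Tensor | Dict[str, Any]]:
    """Condensed restatement of the updated helper, covering only the Qwen branch shown above."""
    # Cast once up front instead of calling .to(torch.float16) inside the Qwen branch.
    image_embeds = image_embeds.to(embeddings_dtype)
    if "Qwen2" in model:
        return {
            "image": {
                "image_embeds": image_embeds.squeeze(0),
                "image_grid_thw": torch.tensor(encode_output.image_grid_thw).squeeze(0),
            }
        }
    # Fallback for models that take bare embeddings; assumed, not visible in this diff.
    return {"image": image_embeds}
```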