Merged
Changes shown from 1 commit. Commits in this pull request (23):
8d67b7d  feat: add more robust handling for MM prompt (hhzhang16, Jun 4, 2025)
b65efb5  feat: [WIP] generalize workers (hhzhang16, Jun 4, 2025)
40c2154  Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dep-1… (hhzhang16, Jun 4, 2025)
e13f827  feat: remove cls token (hhzhang16, Jun 4, 2025)
a866d73  Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dep-1… (hhzhang16, Jun 4, 2025)
0adb7e6  Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dep-1… (hhzhang16, Jun 5, 2025)
86c6135  Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dep-1… (hhzhang16, Jun 5, 2025)
19f2158  Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dep-1… (hhzhang16, Jun 6, 2025)
a766509  feat: working multimodal agg for multiple vision models (hhzhang16, Jun 7, 2025)
17aecda  Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dep-1… (hhzhang16, Jun 7, 2025)
496ee57  feat: addressing ci comments (hhzhang16, Jun 9, 2025)
820c7e3  feat: addressing ci comments (hhzhang16, Jun 9, 2025)
bb4f95e  Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dep-1… (hhzhang16, Jun 9, 2025)
027341a  Update examples/multimodal/README.md (hhzhang16, Jun 9, 2025)
0eff4e0  feat: trust remote code when loading autoconfig (hhzhang16, Jun 9, 2025)
d736895  feat: working code for phi3v (hhzhang16, Jun 10, 2025)
36eacb9  docs: add phi3v to multimodal readme (hhzhang16, Jun 10, 2025)
d586343  feat: working for Qwen 2.5 VL (hhzhang16, Jun 11, 2025)
d5025a7  docs: fixing dash issue (hhzhang16, Jun 11, 2025)
1b0efc0  Merge branch 'main' into hannahz/dep-114-generalize-vlm-embedding-ext… (hhzhang16, Jun 11, 2025)
843d586  docs: add readme note about disagg support (hhzhang16, Jun 11, 2025)
d12e86d  Merge branch 'hannahz/dep-114-generalize-vlm-embedding-extraction' of… (hhzhang16, Jun 11, 2025)
073ad67  feat: remove pynvml from this MR (hhzhang16, Jun 11, 2025)
feat: working for Qwen 2.5 VL
hhzhang16 committed Jun 11, 2025
commit d5863438fbac711a7203d2c3694b339d78088975
examples/multimodal/README.md (8 changes: 4 additions & 4 deletions)
@@ -52,7 +52,7 @@ flowchart LR
 cd $DYNAMO_HOME/examples/multimodal
 # Serve a LLaVA 1.5 7B model:
 dynamo serve graphs.agg:Frontend -f ./configs/agg-llava.yaml
-# Serve a Qwen2 VL model:
+# Serve a Qwen2.5-VL model:
 # dynamo serve graphs.agg:Frontend -f ./configs/agg-qwen.yaml
 # Serve a Phi3V model:
 # dynamo serve graphs.agg:Frontend -f ./configs/agg-phi3v.yaml
@@ -89,7 +89,7 @@ curl http://localhost:8000/v1/chat/completions \
 }'
 ```
 
-If serving the example Qwen model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"Qwen/Qwen2-VL-7B-Instruct"`. If serving the example Phi3V model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"microsoft/Phi-3.5-vision-instruct"`.
+If serving the example Qwen model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"Qwen/Qwen2.5-VL-7B-Instruct"`. If serving the example Phi3V model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"microsoft/Phi-3.5-vision-instruct"`.
 
 You should see a response similar to this:
 ```json
@@ -204,7 +204,7 @@ DYNAMO_TAG=$(dynamo build graphs.agg:Frontend | grep "Successfully built" | awk
 export DEPLOYMENT_NAME=multimodal-agg
 # For aggregated serving with LLaVA:
 dynamo deploy $DYNAMO_TAG -n $DEPLOYMENT_NAME -f ./configs/agg-llava.yaml
-# For aggregated serving with Qwen2-VL:
+# For aggregated serving with Qwen2.5-VL:
 # dynamo deploy $DYNAMO_TAG -n $DEPLOYMENT_NAME -f ./configs/agg-qwen.yaml
 # For aggregated serving with Phi3V:
 # dynamo deploy $DYNAMO_TAG -n $DEPLOYMENT_NAME -f ./configs/agg-phi3v.yaml
@@ -249,6 +249,6 @@ curl localhost:8000/v1/chat/completions \
 }'
 ```
 
-If serving the example Qwen model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"Qwen/Qwen2-VL-7B-Instruct"`. If serving the example Phi3V model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"microsoft/Phi-3.5-vision-instruct"`.
+If serving the example Qwen model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"Qwen/Qwen2.5-VL-7B-Instruct"`. If serving the example Phi3V model, replace `"llava-hf/llava-1.5-7b-hf"` in the `"model"` field with `"microsoft/Phi-3.5-vision-instruct"`.
 
 For more details on managing deployments, testing, and troubleshooting, please refer to the [Operator Deployment Guide](../../docs/guides/dynamo_deploy/operator_deployment.md).
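To make the model substitution described in the README text above concrete, here is a minimal Python sketch of the same chat-completions request with the Qwen2.5-VL checkpoint in the `model` field. The endpoint and model name come from the README; the message layout, image URL, and `max_tokens` value are illustrative placeholders, so the exact payload should follow the curl body shown in the README.

```python
# Illustrative client call only; mirror the README's curl body for the real payload.
import requests

payload = {
    "model": "Qwen/Qwen2.5-VL-7B-Instruct",  # instead of "llava-hf/llava-1.5-7b-hf"
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What is in this image?"},
                # Placeholder image; any reachable URL works here.
                {"type": "image_url", "image_url": {"url": "http://example.com/image.jpg"}},
            ],
        }
    ],
    "max_tokens": 300,
    "stream": False,
}

response = requests.post("http://localhost:8000/v1/chat/completions", json=payload)
print(response.json())
```

Swapping in `microsoft/Phi-3.5-vision-instruct` for the Phi3V example works the same way.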
examples/multimodal/components/decode_worker.py (7 changes: 4 additions & 3 deletions)
@@ -117,7 +117,7 @@ async def async_init(self):
         )
 
         runtime = dynamo_context["runtime"]
-        embeddings_shape, embeddings_dtype = get_vision_embeddings_info(
+        embeddings_shape, self.embeddings_dtype = get_vision_embeddings_info(
             self.engine_args.model, self.engine_args.num_patches
         )
         logger.debug(f"Embeddings shape: {embeddings_shape}")
@@ -139,6 +139,7 @@ async def async_init(self):
             else:
                 self.disaggregated_router = None
         else:
+            EMBEDDINGS_DTYPE = torch.float16
             EMBEDDINGS_DEVICE = "cuda"
 
             enc_comp_ns, enc_comp_name = VllmEncodeWorker.dynamo_address() # type: ignore
@@ -154,7 +155,7 @@
 
             # Create a longer-lived buffer for receiving the image embeddings.
             embeddings = torch.empty(
-                embeddings_shape, dtype=embeddings_dtype, device=EMBEDDINGS_DEVICE
+                embeddings_shape, dtype=EMBEDDINGS_DTYPE, device=EMBEDDINGS_DEVICE
             )
             descriptor = connect.Descriptor(embeddings)
             # Register the descriptor w/ NIXL (this is optional, if not done here the connect subsytem will take care of this automatically).
@@ -290,7 +291,7 @@ async def local_prefill(self, request: vLLMMultimodalRequest) -> tuple:
         )
         # When using disaggregated serving, the encode worker will have provided the key-value cache updates via the encode worker.
         multi_modal_data = construct_mm_data(
-            self.engine_args.model, encode_output, embeddings
+            self.engine_args.model, encode_output, embeddings, self.embeddings_dtype
         )
 
         return prompt_ids, multi_modal_data, remote_prefill_params
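Read end to end, the decode-worker change above (mirrored in the prefill worker below) keeps the vision-embedding dtype reported by `get_vision_embeddings_info` on the worker and passes it to `construct_mm_data`, while the local aggregated path still allocates its receive buffer as float16. A minimal sketch of that flow, assuming illustrative class and attribute names; only the calls visible in the diff are taken as given.

```python
# Condensed sketch of the dtype plumbing introduced by this commit; not the full worker.
import torch

from utils.model import construct_mm_data, get_vision_embeddings_info  # import path assumed


class DecodeWorkerSketch:
    def __init__(self, engine_args):
        self.engine_args = engine_args

    async def async_init(self, remote_prefill: bool):
        # Keep the reported dtype on the worker so later calls can reuse it.
        embeddings_shape, self.embeddings_dtype = get_vision_embeddings_info(
            self.engine_args.model, self.engine_args.num_patches
        )
        if not remote_prefill:
            # Local aggregated path: the receive buffer itself stays float16 on GPU.
            self.embeddings = torch.empty(
                embeddings_shape, dtype=torch.float16, device="cuda"
            )

    async def local_prefill(self, encode_output):
        # construct_mm_data now receives the dtype and casts the buffer as needed.
        return construct_mm_data(
            self.engine_args.model, encode_output, self.embeddings, self.embeddings_dtype
        )
```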
examples/multimodal/components/prefill_worker.py (9 changes: 6 additions & 3 deletions)
@@ -111,12 +111,12 @@ async def async_init(self):
         await self._connector.initialize()
 
         # Create a longer-lived buffer for receiving the image embeddings.
-        embeddings_shape, embeddings_dtype = get_vision_embeddings_info(
+        embeddings_shape, self.embeddings_dtype = get_vision_embeddings_info(
             self.engine_args.model, self.engine_args.num_patches
         )
         embeddings = torch.empty(
             embeddings_shape,
-            dtype=embeddings_dtype,
+            dtype=self.embeddings_dtype,
             device=EMBEDDINGS_DEVICE,
         )
         descriptor = connect.Descriptor(embeddings)
@@ -265,7 +265,10 @@ async def generate(self, request: RemotePrefillRequest):
             prompt=TokensPrompt(
                 prompt_token_ids=prompt_token_ids,
                 multi_modal_data=construct_mm_data(
-                    self.engine_args.model, encode_output, embeddings
+                    self.engine_args.model,
+                    encode_output,
+                    embeddings,
+                    self.embeddings_dtype,
                 ),
             ),
             sampling_params=sampling_params,
examples/multimodal/configs/agg-qwen.yaml (3 changes: 2 additions & 1 deletion)
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 Common:
-  model: Qwen/Qwen2-VL-7B-Instruct
+  model: Qwen/Qwen2.5-VL-7B-Instruct
   block-size: 64
   max-model-len: 4096
 
@@ -29,6 +29,7 @@ VllmDecodeWorker:
   mm-processor-kwargs:
     min_pixels: 784
     max_pixels: 1003520
+    fps: 1
   enable-prefix-caching: true
   image-token-id: 151655
   num-patches: 345
examples/multimodal/utils/model.py (8 changes: 6 additions & 2 deletions)
@@ -68,13 +68,17 @@ def get_vision_embeddings_info(
 
 
 def construct_mm_data(
-    model: str, encode_output: EncodeResponse, image_embeds: torch.Tensor
+    model: str,
+    encode_output: EncodeResponse,
+    image_embeds: torch.Tensor,
+    embeddings_dtype: torch.dtype,
 ) -> Dict[str, torch.Tensor | Dict[str, Any]]:
     """Construct multimodal data for a vLLM request for models that require additional parameters alongside the embeddings"""
+    image_embeds = image_embeds.to(embeddings_dtype)
     if "Qwen2" in model:
         return {
             "image": {
-                "image_embeds": image_embeds.squeeze(0).to(torch.float16),
+                "image_embeds": image_embeds.squeeze(0),
                 "image_grid_thw": torch.tensor(encode_output.image_grid_thw).squeeze(0),
             }
         }
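Putting the plus and minus lines of this hunk together: the helper now takes the embedding dtype as a parameter and casts the buffer once before branching per model family, instead of hard-coding `torch.float16` inside the Qwen branch. A condensed restatement limited to what this hunk shows; the non-Qwen fallback and the untyped `encode_output` parameter are assumptions made to keep the sketch self-contained.

```python
from typing import Any, Dict

import torch


def construct_mm_data_sketch(
    model: str,
    encode_output,  # EncodeResponse; only image_grid_thw is used in this branch
    image_embeds: torch.Tensor,
    embeddings_dtype: torch.dtype,
) -> Dict[str, torch.Tensor | Dict[str, Any]]:
    """Condensed restatement of the updated helper, covering only the Qwen branch shown above."""
    # Cast once up front instead of calling .to(torch.float16) inside the Qwen branch.
    image_embeds = image_embeds.to(embeddings_dtype)
    if "Qwen2" in model:
        return {
            "image": {
                "image_embeds": image_embeds.squeeze(0),
                "image_grid_thw": torch.tensor(encode_output.image_grid_thw).squeeze(0),
            }
        }
    # Fallback for models that take bare embeddings; assumed, not visible in this diff.
    return {"image": image_embeds}
```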