From 8edc5561d29bc871a52a198e173ad826d7f6b97f Mon Sep 17 00:00:00 2001 From: yechank <161688079+yechank-nvidia@users.noreply.github.com> Date: Mon, 14 Jul 2025 14:53:49 +0900 Subject: [PATCH 1/3] chore: set default device to cpu on Multimodal models 1. Change use_fast=True for Qwen2/2.5-VL models 2. Change the test keywords accordingly Signed-off-by: yechank <161688079+yechank-nvidia@users.noreply.github.com> --- examples/llm-api/quickstart_multimodal.py | 2 +- .../_torch/models/modeling_mistral.py | 2 -- .../_torch/models/modeling_qwen2vl.py | 8 ++--- tensorrt_llm/inputs/utils.py | 10 +++--- tests/integration/defs/test_e2e.py | 35 +++++++------------ 5 files changed, 22 insertions(+), 35 deletions(-) diff --git a/examples/llm-api/quickstart_multimodal.py b/examples/llm-api/quickstart_multimodal.py index 967a8636e1b..c4d40655d3d 100644 --- a/examples/llm-api/quickstart_multimodal.py +++ b/examples/llm-api/quickstart_multimodal.py @@ -138,7 +138,7 @@ def main(): open(os.path.join(llm._hf_model_dir, 'config.json')))['model_type'] assert model_type in ALL_SUPPORTED_MULTIMODAL_MODELS, f"Unsupported model_type: {model_type}" - device = "cuda" + device = "cpu" inputs = default_multimodal_input_loader(tokenizer=llm.tokenizer, model_dir=llm._hf_model_dir, model_type=model_type, diff --git a/tensorrt_llm/_torch/models/modeling_mistral.py b/tensorrt_llm/_torch/models/modeling_mistral.py index a8e07f24d7f..4a96407dca8 100644 --- a/tensorrt_llm/_torch/models/modeling_mistral.py +++ b/tensorrt_llm/_torch/models/modeling_mistral.py @@ -226,7 +226,6 @@ def __init__( self.model_config = model_config self.tokenizer = tokenizer - self._device = "cuda" self._processor = AutoProcessor.from_pretrained(model_path, use_fast=False) @@ -256,7 +255,6 @@ def __call__( if pixel_values is not None: # We have no use for the `attention_mask`. processed.pop("attention_mask") - processed = processed.to(self._device) # NOTE: `processed` is a dict-like object, but not actually a dict. extra_processed_inputs = { "multimodal_data": { diff --git a/tensorrt_llm/_torch/models/modeling_qwen2vl.py b/tensorrt_llm/_torch/models/modeling_qwen2vl.py index 25a2778f8b8..3371bb6fc55 100644 --- a/tensorrt_llm/_torch/models/modeling_qwen2vl.py +++ b/tensorrt_llm/_torch/models/modeling_qwen2vl.py @@ -34,9 +34,7 @@ def __init__(self, trust_remote_code: bool = True): self.model_config = model_config self.tokenizer = tokenizer - # TODO: change to True and also change the according test result - self.use_fast = False - self.device = 'cuda' + self.use_fast = True self.processor = AutoProcessor.from_pretrained( model_path, use_fast=self.use_fast, @@ -226,7 +224,7 @@ def _post_init_(self): self.model_config.num_attention_heads), theta=float(self.model_config.rope_theta), scale_type=RotaryScalingType.mrope) - self.rotary_cos_sin = torch.from_numpy(rotary_cos_sin).to(self.device) + self.rotary_cos_sin = torch.from_numpy(rotary_cos_sin) self.rotary_cos_sin = self.rotary_cos_sin.reshape( self.model_config.max_position_embeddings, int(self.model_config.hidden_size / @@ -344,7 +342,7 @@ def __call__( inputs.get("multi_modal_data", {}), inputs.get("mm_processor_kwargs", {}) processed_inputs = self._preprocess(text_prompt, mm_data, - mm_processor_kwargs).to(self.device) + mm_processor_kwargs) if not mm_data: fused_input_ids = processed_inputs['input_ids'] diff --git a/tensorrt_llm/inputs/utils.py b/tensorrt_llm/inputs/utils.py index a58e6e4b58a..a4bf8570d0a 100644 --- a/tensorrt_llm/inputs/utils.py +++ b/tensorrt_llm/inputs/utils.py @@ -45,7 +45,7 @@ def load_base64_image(parsed_url: str) -> Image.Image: def load_image(image: str, format: str = "pt", - device: str = "cuda") -> Union[Image.Image, torch.Tensor]: + device: str = "cpu") -> Union[Image.Image, torch.Tensor]: assert format in ["pt", "pil"], "format must be either Pytorch or PIL" parsed_url = urlparse(image) @@ -67,7 +67,7 @@ def load_image(image: str, async def async_load_image( image: str, format: str = "pt", - device: str = "cuda") -> Union[Image.Image, torch.Tensor]: + device: str = "cpu") -> Union[Image.Image, torch.Tensor]: assert format in ["pt", "pil"], "format must be either Pytorch or PIL" parsed_url = urlparse(image) @@ -92,7 +92,7 @@ def load_video( video: str, num_frames: int = 10, format: str = "pt", - device: str = "cuda") -> Union[List[Image.Image], List[torch.Tensor]]: + device: str = "cpu") -> Union[List[Image.Image], List[torch.Tensor]]: # Keep this import local to avoid importing cv2 if not needed import cv2 @@ -141,7 +141,7 @@ async def async_load_video( video: str, num_frames: int = 10, format: str = "pt", - device: str = "cuda") -> Union[List[Image.Image], List[torch.Tensor]]: + device: str = "cpu") -> Union[List[Image.Image], List[torch.Tensor]]: assert format in ["pt", "pil"], "format must be either Pytorch or PIL" parsed_url = urlparse(video) @@ -480,7 +480,7 @@ def default_multimodal_input_loader( media: Union[List[str], List[List[str]]], image_data_format: str = "pt", num_frames: int = 8, - device: str = "cuda") -> List[dict[str, Union[str, torch.Tensor]]]: + device: str = "cpu") -> List[dict[str, Union[str, torch.Tensor]]]: def convert_to_conversation_message(prompt: str, media: Union[str, List[str]], diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index 0ac0ec43df4..c8d20f02b23 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -1994,22 +1994,19 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path, }, "llava-v1.6-mistral-7b": { "image": [ + ["ocean", "sky", "large", "waves", "shore", "blue"], [ - "ocean", "cloud", "waves", "white", "shore", "large", - "dramatic", "breaking" + "landscape", "rock", "landamark", "cliff", "surface", + "mountain" ], - ["mountain", "butte", "flat", "top", "sky"], - ["highway", "vehicles", "traffic", "divider", "suburban"], + ["highway", "vehicles", "traffic", "bus", "suburban"], ], }, "qwen2-vl-7b-instruct": { "image": [ - ["ocean", "waves", "shore", "natural", "clouds", "turbulent"], - [ - "mountainous", "landscape", "rock", "peak", "weather", - "steep" - ], - ["traffic", "vehicles", "moderate", "lanes", "road"], + ["ocean", "waves", "atmosphere", "stormy", "clouds", "intense"], + ["trees", "rocks", "road", "sunny", "natural", "greenery"], + ["traffic", "vehicles", "moderate", "lanes", "road", "cars"], ], "video": [ ["city", "night", "lights", "jacket", "wet"], @@ -2018,25 +2015,19 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path, }, "qwen2.5-vl-7b-instruct": { "image": [ - ["dramatic", "moody", "stormy", "turbulent", "wave"], - [ - "large", "dome", "yosemite", "landmark", "rock", "road", - "formation" - ], - ["highway", "traffic", "vehicles", "bus", "police"], + ["dramatic", "moody", "ocean", "stormy", "sky", "clouds"], + ["large", "dome", "yosemite", "landmark", "rock", "road"], + ["highway", "traffic", "vehicles", "bus", "police", "traffic"], ], "video": [ ["woman", "neon", "night", "jacket", "wet"], - ["earth", "rotating", "night", "lights", "cities"], + ["earth", "world", "night", "lights", "cities"], ], }, "mistral-small-3.1-24b-instruct": { "image": [ - [ - "dramatic", "seascape", "cloudy", "turbulent", "waves", - "water" - ], - ["scenic", "rock", "landscape", "snow", "formation"], + ["dramatic", "seascape", "ocean", "turbulent", "waves", "dark"], + ["scenic", "rock", "landscape", "snow", "altitude"], ["highway", "traffic", "directions", "lanes", "Jurong"], ], }, From 3d6fb07ad44323339a188cc83e32d6e70ce83217 Mon Sep 17 00:00:00 2001 From: yechank <161688079+yechank-nvidia@users.noreply.github.com> Date: Mon, 14 Jul 2025 18:09:27 +0900 Subject: [PATCH 2/3] remove gemma3 & mistral and fix llava test Signed-off-by: yechank <161688079+yechank-nvidia@users.noreply.github.com> --- tests/integration/defs/test_e2e.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index c8d20f02b23..e3c61629c36 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -1996,7 +1996,7 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path, "image": [ ["ocean", "sky", "large", "waves", "shore", "blue"], [ - "landscape", "rock", "landamark", "cliff", "surface", + "landscape", "rock", "landmark", "formation", "smooth", "mountain" ], ["highway", "vehicles", "traffic", "bus", "suburban"], From 85255e05a5d4e7e48ea6ab659b3d8a0395d838ce Mon Sep 17 00:00:00 2001 From: yechank <161688079+yechank-nvidia@users.noreply.github.com> Date: Tue, 15 Jul 2025 13:17:32 +0900 Subject: [PATCH 3/3] address gemma3_vl Signed-off-by: yechank <161688079+yechank-nvidia@users.noreply.github.com> --- tests/integration/defs/test_e2e.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index e3c61629c36..9cfd2eed341 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -2035,7 +2035,7 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path, "image": [ ["dramatic", "turbulent", "waves", "ocean", "overcast"], ["half", "dome", "yosemite", "landmark", "rounded"], - ["flowing", "standstill", "vehicles", "road", "Changi"], + ["flowing", "traffic", "vehicles", "road", "Changi"], ], }, }