Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/llm-api/quickstart_multimodal.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def main():
open(os.path.join(llm._hf_model_dir, 'config.json')))['model_type']
assert model_type in ALL_SUPPORTED_MULTIMODAL_MODELS, f"Unsupported model_type: {model_type}"

-device = "cuda"
+device = "cpu"
inputs = default_multimodal_input_loader(tokenizer=llm.tokenizer,
model_dir=llm._hf_model_dir,
model_type=model_type,
Expand Down
2 changes: 0 additions & 2 deletions tensorrt_llm/_torch/models/modeling_mistral.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,6 @@ def __init__(
self.model_config = model_config
self.tokenizer = tokenizer

-self._device = "cuda"
self._processor = AutoProcessor.from_pretrained(model_path,
use_fast=False)

Expand Down Expand Up @@ -256,7 +255,6 @@ def __call__(
if pixel_values is not None:
# We have no use for the `attention_mask`.
processed.pop("attention_mask")
-processed = processed.to(self._device)
# NOTE: `processed` is a dict-like object, but not actually a dict.
extra_processed_inputs = {
"multimodal_data": {
Expand Down
8 changes: 3 additions & 5 deletions tensorrt_llm/_torch/models/modeling_qwen2vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,7 @@ def __init__(self,
trust_remote_code: bool = True):
self.model_config = model_config
self.tokenizer = tokenizer
-# TODO: change to True and also change the according test result
-self.use_fast = False
-self.device = 'cuda'
+self.use_fast = True
self.processor = AutoProcessor.from_pretrained(
model_path,
use_fast=self.use_fast,
Expand Down Expand Up @@ -226,7 +224,7 @@ def _post_init_(self):
self.model_config.num_attention_heads),
theta=float(self.model_config.rope_theta),
scale_type=RotaryScalingType.mrope)
-self.rotary_cos_sin = torch.from_numpy(rotary_cos_sin).to(self.device)
+self.rotary_cos_sin = torch.from_numpy(rotary_cos_sin)
self.rotary_cos_sin = self.rotary_cos_sin.reshape(
self.model_config.max_position_embeddings,
int(self.model_config.hidden_size /
Expand Down Expand Up @@ -344,7 +342,7 @@ def __call__(
inputs.get("multi_modal_data", {}), inputs.get("mm_processor_kwargs", {})

processed_inputs = self._preprocess(text_prompt, mm_data,
-mm_processor_kwargs).to(self.device)
+mm_processor_kwargs)

if not mm_data:
fused_input_ids = processed_inputs['input_ids']
Expand Down
10 changes: 5 additions & 5 deletions tensorrt_llm/inputs/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def load_base64_image(parsed_url: str) -> Image.Image:

def load_image(image: str,
format: str = "pt",
-device: str = "cuda") -> Union[Image.Image, torch.Tensor]:
+device: str = "cpu") -> Union[Image.Image, torch.Tensor]:
assert format in ["pt", "pil"], "format must be either Pytorch or PIL"

parsed_url = urlparse(image)
Expand All @@ -67,7 +67,7 @@ def load_image(image: str,
async def async_load_image(
image: str,
format: str = "pt",
-device: str = "cuda") -> Union[Image.Image, torch.Tensor]:
+device: str = "cpu") -> Union[Image.Image, torch.Tensor]:
assert format in ["pt", "pil"], "format must be either Pytorch or PIL"

parsed_url = urlparse(image)
Expand All @@ -92,7 +92,7 @@ def load_video(
video: str,
num_frames: int = 10,
format: str = "pt",
-device: str = "cuda") -> Union[List[Image.Image], List[torch.Tensor]]:
+device: str = "cpu") -> Union[List[Image.Image], List[torch.Tensor]]:

# Keep this import local to avoid importing cv2 if not needed
import cv2
Expand Down Expand Up @@ -141,7 +141,7 @@ async def async_load_video(
video: str,
num_frames: int = 10,
format: str = "pt",
-device: str = "cuda") -> Union[List[Image.Image], List[torch.Tensor]]:
+device: str = "cpu") -> Union[List[Image.Image], List[torch.Tensor]]:
assert format in ["pt", "pil"], "format must be either Pytorch or PIL"

parsed_url = urlparse(video)
Expand Down Expand Up @@ -480,7 +480,7 @@ def default_multimodal_input_loader(
media: Union[List[str], List[List[str]]],
image_data_format: str = "pt",
num_frames: int = 8,
-device: str = "cuda") -> List[dict[str, Union[str, torch.Tensor]]]:
+device: str = "cpu") -> List[dict[str, Union[str, torch.Tensor]]]:

def convert_to_conversation_message(prompt: str, media: Union[str,
List[str]],
Expand Down
37 changes: 14 additions & 23 deletions tests/integration/defs/test_e2e.py
Original file line number Diff line number Diff line change
Expand Up @@ -1994,22 +1994,19 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
},
"llava-v1.6-mistral-7b": {
"image": [
+["ocean", "sky", "large", "waves", "shore", "blue"],
[
-"ocean", "cloud", "waves", "white", "shore", "large",
-"dramatic", "breaking"
+"landscape", "rock", "landmark", "formation", "smooth",
+"mountain"
],
-["mountain", "butte", "flat", "top", "sky"],
-["highway", "vehicles", "traffic", "divider", "suburban"],
+["highway", "vehicles", "traffic", "bus", "suburban"],
],
},
"qwen2-vl-7b-instruct": {
"image": [
-["ocean", "waves", "shore", "natural", "clouds", "turbulent"],
-[
-"mountainous", "landscape", "rock", "peak", "weather",
-"steep"
-],
-["traffic", "vehicles", "moderate", "lanes", "road"],
+["ocean", "waves", "atmosphere", "stormy", "clouds", "intense"],
+["trees", "rocks", "road", "sunny", "natural", "greenery"],
+["traffic", "vehicles", "moderate", "lanes", "road", "cars"],
],
"video": [
["city", "night", "lights", "jacket", "wet"],
Expand All @@ -2018,33 +2015,27 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
},
"qwen2.5-vl-7b-instruct": {
"image": [
-["dramatic", "moody", "stormy", "turbulent", "wave"],
-[
-"large", "dome", "yosemite", "landmark", "rock", "road",
-"formation"
-],
-["highway", "traffic", "vehicles", "bus", "police"],
+["dramatic", "moody", "ocean", "stormy", "sky", "clouds"],
+["large", "dome", "yosemite", "landmark", "rock", "road"],
+["highway", "traffic", "vehicles", "bus", "police", "traffic"],
],
"video": [
["woman", "neon", "night", "jacket", "wet"],
-["earth", "rotating", "night", "lights", "cities"],
+["earth", "world", "night", "lights", "cities"],
],
},
"mistral-small-3.1-24b-instruct": {
"image": [
-[
-"dramatic", "seascape", "cloudy", "turbulent", "waves",
-"water"
-],
-["scenic", "rock", "landscape", "snow", "formation"],
+["dramatic", "seascape", "ocean", "turbulent", "waves", "dark"],
+["scenic", "rock", "landscape", "snow", "altitude"],
["highway", "traffic", "directions", "lanes", "Jurong"],
],
},
"gemma-3-27b-it": {
"image": [
["dramatic", "turbulent", "waves", "ocean", "overcast"],
["half", "dome", "yosemite", "landmark", "rounded"],
-["flowing", "standstill", "vehicles", "road", "Changi"],
+["flowing", "traffic", "vehicles", "road", "Changi"],
],
},
}
Expand Down