
Commit bea117c

chang-l authored and Wong4j committed
[TRTLLM-7410][feat] Support hashing and KV cache reuse for videos (NVIDIA#7360)
Signed-off-by: Chang Liu (Enterprise Products) <9713593+chang-l@users.noreply.github.com>
Signed-off-by: Chang Liu <9713593+chang-l@users.noreply.github.com>
1 parent 86b034c commit bea117c

File tree

6 files changed: +323 -84 lines changed


tensorrt_llm/_torch/models/modeling_llava_next.py

Lines changed: 3 additions & 14 deletions
@@ -14,8 +14,8 @@
 
 from tensorrt_llm.inputs.multimodal import MultimodalParams
 
-from ...inputs import (ExtraProcessedInputs, InputProcessor,
-                       MultimodalPlaceholderMetadata,
+from ...inputs import (BaseMultimodalInputProcessor, ExtraProcessedInputs,
+                       InputProcessor, MultimodalPlaceholderMetadata,
                        MultimodalPlaceholderPlacement, TextPrompt,
                        register_input_processor)
 from ...llmapi.utils import download_hf_model
@@ -32,7 +32,7 @@
 DISAGG = os.getenv('TLLM_MULTIMODAL_DISAGGREGATED', '0') == '1'
 
 
-class LlavaNextInputProcessor(InputProcessor):
+class LlavaNextInputProcessor(BaseMultimodalInputProcessor, InputProcessor):
 
     def __init__(self,
                  model_path: str,
@@ -56,17 +56,6 @@ def __init__(self,
         self.vocab_size = model_config.vocab_size
         self.config = model_config.vision_config
 
-    def get_num_tokens_per_image(
-        self,
-        *,
-        image_width: int,
-        image_height: int,
-    ) -> int:
-        image_size = (image_height, image_width)
-        num_image_tokens = self.processor._get_num_multimodal_tokens(
-            [image_size])["num_image_tokens"][0]
-        return num_image_tokens
-
     def _postprocess(
         self, input_ids: torch.Tensor, mm_features: Union[torch.Tensor,
                                                           List[torch.Tensor]]
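
LlavaNextInputProcessor now relies on the shared BaseMultimodalInputProcessor mixin instead of the per-model get_num_tokens_per_image override deleted above. The base-class body is not part of this diff; the sketch below only mirrors the removed override, assuming a HuggingFace processor that exposes _get_num_multimodal_tokens, to show what such a shared default could look like.

from typing import Any


def get_num_tokens_per_image_sketch(hf_processor: Any, *, image_width: int,
                                    image_height: int) -> int:
    # Hypothetical restatement of the override removed above: HF multimodal
    # processors report per-image token counts for (height, width) sizes.
    image_size = (image_height, image_width)
    return hf_processor._get_num_multimodal_tokens(
        [image_size])["num_image_tokens"][0]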

tensorrt_llm/_torch/models/modeling_qwen2vl.py

Lines changed: 6 additions & 36 deletions
@@ -7,14 +7,13 @@
 from transformers import (AutoProcessor, AutoTokenizer, PretrainedConfig,
                           PreTrainedModel, Qwen2_5_VLForConditionalGeneration,
                           Qwen2VLForConditionalGeneration)
-from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
 
 from tensorrt_llm.inputs.multimodal import MultimodalParams
 
 from ..._utils import nvtx_range_debug
 from ...functional import RopeEmbeddingUtils, RotaryScalingType
-from ...inputs import (ExtraProcessedInputs, InputProcessor,
-                       MultimodalPlaceholderMetadata,
+from ...inputs import (BaseMultimodalInputProcessor, ExtraProcessedInputs,
+                       InputProcessor, MultimodalPlaceholderMetadata,
                        MultimodalPlaceholderPlacement, TextPrompt,
                        register_input_processor)
 from ...logger import logger
@@ -29,7 +28,7 @@
 DISAGG = os.getenv('TLLM_MULTIMODAL_DISAGGREGATED', '0') == '1'
 
 
-class Qwen2VLInputProcessorBase(InputProcessor):
+class Qwen2VLInputProcessorBase(BaseMultimodalInputProcessor, InputProcessor):
 
     def __init__(self,
                  model_path: str,
@@ -45,6 +44,9 @@ def __init__(self,
             trust_remote_code=trust_remote_code)
 
         self.tllm_multimodal_token_id = self.model_config.vocab_size + 1
+        # temporal patch size for video frames
+        self.temporal_patch_size = getattr(model_config.vision_config,
+                                           'temporal_patch_size', 1)
 
     @classmethod
     def get_rope_index(
@@ -220,38 +222,6 @@ def get_rope_index(
             mrope_position_deltas, device=input_ids.device).unsqueeze(1)
         return position_ids, mrope_position_deltas
 
-    def get_num_tokens_per_image(
-        self,
-        *,
-        image_width: int,
-        image_height: int,
-        num_frames: int = 1,
-        do_resize: bool = True,
-    ):
-        patch_size = self.model_config.vision_config.patch_size
-        merge_size = self.model_config.vision_config.spatial_merge_size
-        temporal_patch_size = self.model_config.vision_config.temporal_patch_size
-        if do_resize:
-            resized_height, resized_width = smart_resize(
-                height=image_height,
-                width=image_width,
-                factor=patch_size * merge_size,
-                min_pixels=self.processor.image_processor.min_pixels,
-                max_pixels=self.processor.image_processor.max_pixels,
-            )
-            image_width, image_height = resized_width, resized_height
-
-        padded_num_frames = num_frames + num_frames % temporal_patch_size
-
-        grid_t = max(padded_num_frames // temporal_patch_size, 1)
-        grid_h = image_height // patch_size
-        grid_w = image_width // patch_size
-
-        num_patches = grid_t * grid_h * grid_w
-        num_vision_tokens = num_patches // (merge_size**2)
-
-        return num_vision_tokens
-
     def _preprocess(self, text: dict[str, any], mm_data: dict[str, any],
                     mm_processor_kwargs: Dict[str, Any]):
         images = mm_data.get("image")
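
The override removed above is the arithmetic that turns image/video geometry into a vision-token count for Qwen2-VL. For reference, here is a standalone restatement of that arithmetic with the smart_resize step omitted; the default patch/merge/temporal sizes below are illustrative Qwen2-VL-style values, not values read from this diff.

def qwen2vl_num_vision_tokens(image_width: int,
                              image_height: int,
                              num_frames: int = 1,
                              *,
                              patch_size: int = 14,
                              merge_size: int = 2,
                              temporal_patch_size: int = 2) -> int:
    # Pad the frame count as the removed code did (num_frames + num_frames %
    # temporal_patch_size), count spatio-temporal patches, then collapse them
    # spatially by merge_size**2.
    padded_num_frames = num_frames + num_frames % temporal_patch_size
    grid_t = max(padded_num_frames // temporal_patch_size, 1)
    grid_h = image_height // patch_size
    grid_w = image_width // patch_size
    return (grid_t * grid_h * grid_w) // (merge_size**2)


# A single 28x28 frame: 2x2 spatial grid, 1 temporal slot -> 4 patches -> 1 token.
assert qwen2vl_num_vision_tokens(28, 28) == 1
# A 4-frame 56x56 video: 4x4 spatial grid, 2 temporal slots -> 32 patches -> 8 tokens.
assert qwen2vl_num_vision_tokens(56, 56, num_frames=4) == 8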

tensorrt_llm/inputs/__init__.py

Lines changed: 3 additions & 2 deletions
@@ -1,7 +1,7 @@
 from .data import PromptInputs, TextPrompt, TokensPrompt, prompt_inputs
 from .multimodal import MultimodalInput
-from .registry import (ExtraProcessedInputs, InputProcessor,
-                       MultimodalPlaceholderMetadata,
+from .registry import (BaseMultimodalInputProcessor, ExtraProcessedInputs,
+                       InputProcessor, MultimodalPlaceholderMetadata,
                        MultimodalPlaceholderPlacement, create_input_processor,
                        create_input_processor_with_hash,
                        register_input_processor)
@@ -27,6 +27,7 @@
     "create_input_processor_with_hash",
     "register_input_processor",
     "ExtraProcessedInputs",
+    "BaseMultimodalInputProcessor",
     "MultimodalPlaceholderMetadata",
     "MultimodalPlaceholderPlacement",
     "ConversationMessage",

tensorrt_llm/inputs/multimodal.py

Lines changed: 58 additions & 27 deletions
@@ -9,6 +9,8 @@
 from blake3 import blake3
 from torchvision.transforms import ToPILImage
 
+from tensorrt_llm.logger import logger
+
 # Default hasher
 default_hasher = blake3
 
@@ -435,13 +437,22 @@ def apply_mm_hashes(mm_data: Dict[str, Any],
     """Apply hashing to multimodal data items."""
 
     def _hash_image(image):
-        # only support single modality w/ PIL.Image.Image for now
         # TODO: possible hash collision w/ this simplified version (vllm/PR/17378)
         hasher = hash_lib()
         if isinstance(image, torch.Tensor):
-            # TODO: Device tensor hashing is an open issue. Limited hashing to CPU for now.
-            image = image.cpu()
-        hasher.update(serialize_item(image))
+            # Ensure tensor is on CPU and contiguous for consistent hashing
+            image = image.detach().cpu().contiguous()
+            hasher.update(serialize_item(image))
+        elif isinstance(image, list):
+            # Hash each frame with a separator to avoid collisions between [A,B] and [AB]
+            for frame in image:
+                hasher.update(b"<frame>")
+                if isinstance(frame, torch.Tensor):
+                    frame = frame.detach().cpu().contiguous()
+                hasher.update(serialize_item(frame))
+        else:
+            hasher.update(serialize_item(image))
+
         return hasher.hexdigest()
 
     mm_items = {
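
The per-frame b"<frame>" separator added above is what keeps a two-frame video [A, B] from hashing identically to a single frame whose bytes are A followed by B. A small illustration with raw bytes, using the same blake3 import this file already has; the helper below is hypothetical and bypasses serialize_item().

from blake3 import blake3


def hash_frames(frames, use_separator):
    # Hypothetical helper: hashes raw frame bytes, optionally with the same
    # b"<frame>" separator used by the new _hash_image above.
    hasher = blake3()
    for frame in frames:
        if use_separator:
            hasher.update(b"<frame>")
        hasher.update(frame)
    return hasher.hexdigest()


a, b = b"\x01\x02", b"\x03\x04"
# Without the separator, the frame lists [A, B] and [A+B] feed the hasher the
# exact same byte stream and collide.
assert hash_frames([a, b], False) == hash_frames([a + b], False)
# With the separator they hash differently, which is what the new code relies on.
assert hash_frames([a, b], True) != hash_frames([a + b], True)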
@@ -483,54 +494,71 @@ def find_mm_token_lengths(mm_data: Dict[str, Any],
     num_mm_tokens = {}
 
     for modality, items in mm_items.items():
-        if modality != "image":
-            #TODO: support other modalities
-            raise ValueError(
-                f"Unsupported modality: {modality}. Only 'image' modality is currently supported for hashing."
-            )
-        if not hasattr(input_processor, "get_num_tokens_per_image"):
-            #TODO: backward compatibility for models that don't yet have get_num_tokens_per_image implemented
-            #TODO: only support qwen2_vl for now
+        if not hasattr(input_processor, f"get_num_tokens_per_{modality}"):
             raise AttributeError(
-                f"Input processor {type(input_processor).__name__} does not have 'get_num_tokens_per_image' method required for multimodal hashing."
+                f"Input processor {type(input_processor).__name__} does not have 'get_num_tokens_per_{modality}' method required for multimodal hashing."
             )
 
         modality_token_lengths = []
         for item in items:
-            if isinstance(item, torch.Tensor):
-                item = ToPILImage()(item)
-            num_tokens = input_processor.get_num_tokens_per_image(
-                image_width=item.width,
-                image_height=item.height,
-            )
-            modality_token_lengths.append(num_tokens)
+            if modality == "image":
+                if isinstance(item, torch.Tensor):
+                    item = ToPILImage()(item)
+                num_tokens = input_processor.get_num_tokens_per_image(
+                    image_width=item.width,
+                    image_height=item.height,
+                )
+                modality_token_lengths.append(num_tokens)
+            elif modality == "video":
+                assert isinstance(item, list), "Video must be a list of frames"
+                if isinstance(item[0], torch.Tensor):
+                    item = [ToPILImage()(frame) for frame in item]
+                num_tokens = input_processor.get_num_tokens_per_video(
+                    video_width=item[0].width,
+                    video_height=item[0].height,
+                    num_frames=len(item),
+                )
+                modality_token_lengths.append(num_tokens)
+            else:
+                # TODO: add audio support if needed
+                raise ValueError(f"Unsupported modality: {modality}")
 
         num_mm_tokens[modality] = modality_token_lengths
 
-    return num_mm_tokens['image']  # flatten all mm instances to a single list
+    return num_mm_tokens  # flatten all mm instances to a single list
 
 
-def find_mm_token_positions(input_ids: Union[torch.Tensor, List[int],
-                                             np.ndarray],
-                            num_mm_tokens: List[int],
-                            vocab_size: int,
-                            mm_token_ids: torch.Tensor = None) -> List[int]:
+def find_mm_token_positions(
+        input_ids: Union[torch.Tensor, List[int], np.ndarray],
+        num_mm_tokens: List[int],
+        vocab_size: Optional[int] = None,
+        mm_token_ids: Optional[torch.Tensor] = None) -> List[int]:
     """Get multimodal token positions using IDs > vocab_size and known lengths.
 
     This function finds multimodal tokens (with IDs > vocab_size) and uses the
     provided lengths in num_mm_tokens to identify where each chunk starts.
     This works even when there are no gaps between different image sequences
     (e.g., when all images use the same token IDs).
+    Note at least one of vocab_size or mm_token_ids must be provided. If mm_token_ids is provided, vocab_size is ignored.
 
     Args:
         input_ids: Token sequence (tensor, list, or numpy array)
         num_mm_tokens: List of lengths for each multimodal token chunk
         vocab_size: Size of the model's vocabulary
-        mm_token_ids (optional): possible token ids for multimodal tokens
+        mm_token_ids: Possible token ids for multimodal tokens
 
     Returns:
         List of starting positions for each multimodal token chunk
     """
+    if mm_token_ids is None and vocab_size is None:
+        raise ValueError(
+            "Provide either mm_token_ids or vocab_size to find multimodal token positions"
+        )
+    if mm_token_ids is not None and vocab_size is not None:
+        logger.warning(
+            "Both mm_token_ids and vocab_size are provided, using mm_token_ids and ignoring vocab_size"
+        )
+
     # Convert input_ids to tensor if needed
     if not isinstance(input_ids, torch.Tensor):
         if isinstance(input_ids, list):
@@ -542,6 +570,9 @@ def find_mm_token_positions(input_ids: Union[torch.Tensor, List[int],
     if mm_token_ids is None:
         mm_mask = input_ids >= vocab_size
     else:
+        if mm_token_ids.ndim != 1:
+            raise ValueError("mm_token_ids must be a 1D tensor")
+        mm_token_ids = torch.unique(mm_token_ids)
         mm_mask = torch.isin(input_ids, mm_token_ids)
 
     # If no multimodal tokens found, return empty list
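
With the relaxed signature above, callers can pass mm_token_ids on its own and omit vocab_size. A synthetic usage sketch, assuming a tensorrt_llm build that contains this commit; the placeholder token id 32001 and the chunk lengths are made-up values.

import torch

from tensorrt_llm.inputs.multimodal import find_mm_token_positions

# Two multimodal chunks of 3 and 2 placeholder tokens (id 32001) packed back to
# back at positions 2..4 and 5..6 of the sequence.
input_ids = torch.tensor([1, 5, 32001, 32001, 32001, 32001, 32001, 7])
starts = find_mm_token_positions(
    input_ids,
    num_mm_tokens=[3, 2],
    mm_token_ids=torch.tensor([32001]),  # vocab_size can now be left out
)
print(starts)  # expected chunk start positions: [2, 5]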
