feat: remove CLS token from vision embedding size

ai-dynamo · hhzhang16 · Jun 11, 2025 · Jun 4, 2025 · Jun 4, 2025 · Jun 4, 2025
commit e13f827af465a422566b336708d87b5f0cb92ff1
@@ -136,7 +136,7 @@ async def async_init(self):
 
             self.embedding_size = get_vision_embedding_size(self.engine_args.model)
         else:
-            EMBEDDINGS_SHAPE = (1, 577, 4096)
+            EMBEDDINGS_SHAPE = (1, 576, 4096)
             EMBEDDINGS_DTYPE = torch.float16
             EMBEDDINGS_DEVICE = "cuda"
 
@@ -345,7 +345,7 @@ async def remote_prefill(self, request: vLLMMultimodalRequest) -> tuple:
         # As a workaround, here we manually insert some placeholder dummy tokens based on the embedding size
         # so that decode worker can pre-allocate the memory with the correct size.
         # The structure of the prompt will be like: "\nUSER: <image> <dummy_tokens>\n<user_prompt>\nASSISTANT:".
-        # Since the "<image>" token is included in the prompt, only need to insert (embedding_size - 1) dummy tokens after the image token.
+        # The "<image>" token is already in the prompt; insert embedding_size dummy tokens immediately after it (embedding_size no longer counts a CLS token).
         IMAGE_TOKEN_ID = 32000
         DUMMY_TOKEN_ID = 0
         # Find the index of the image token in the prompt token ids
@@ -355,7 +355,7 @@ async def remote_prefill(self, request: vLLMMultimodalRequest) -> tuple:
         dummy_token_index = image_token_index + 1
         prompt_ids = (
             request.engine_prompt["prompt_token_ids"][:dummy_token_index]
-            + [DUMMY_TOKEN_ID] * (self.embedding_size - 1)
+            + [DUMMY_TOKEN_ID] * self.embedding_size
             + request.engine_prompt["prompt_token_ids"][dummy_token_index:]
         )
         logger.debug(

@@ -173,7 +173,7 @@ async def encode(self, request: EncodeRequest) -> AsyncIterator[EncodeResponse]:
             with torch.no_grad():
                 embeddings = self.vision_model.get_multimodal_embeddings(**image_embeds)
 
-                logger.info(
+                logger.debug(
                     f"Embeddings: {{ shape: {embeddings.shape}, dtype: {embeddings.dtype}, device: {embeddings.device}, ptr: {embeddings.data_ptr()}, elements: {{ count: {embeddings.numel()}, size: {embeddings.element_size()} }} }}."
                 )
 

@@ -250,7 +250,7 @@ async def generate(self, request: RemotePrefillRequest):
             # TODO: make this more flexible/model-dependent
             IMAGE_TOKEN_ID = 32000
             embedding_size = embeddings.shape[1]
-            padding_size = embedding_size - 1
+            padding_size = embedding_size
             image_token_index = request.prompt_token_ids.index(IMAGE_TOKEN_ID)
             dummy_token_index = image_token_index + 1
             prompt_token_ids = (

@@ -80,5 +80,4 @@ def get_vision_embedding_size(model_id: str) -> int:
 
     num_patches = (h // ph) * (w // pw)
 
-    # 4. Add CLS token (standard in ViT architectures)
-    return num_patches + 1
+    return num_patches