828 changes: 510 additions & 318 deletions README.md

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions all_models/gpt/postprocessing/config.pbtxt
@@ -1,6 +1,7 @@
 name: "postprocessing"
 backend: "python"
 max_batch_size: 1024
+dynamic_batching {}
 input [
   {
     name: "TOKENS_BATCH"
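
The new dynamic_batching {} block, with every setting left at its default, lets Triton fuse requests that arrive close together into a single batch of up to max_batch_size before invoking the Python backend. A client-side sketch of the pattern that benefits: TOKENS_BATCH is taken from this config, while SEQUENCE_LENGTH, the OUTPUT tensor name, the dummy token IDs, and the server URL are assumptions about the surrounding repo.

import numpy as np
import tritonclient.http as httpclient

# Sketch only: several outstanding async requests give the newly enabled
# dynamic batcher a window in which to group them into one execution.
client = httpclient.InferenceServerClient(url="localhost:8000")

pending = []
for _ in range(8):
    tokens = np.array([[[10, 20, 30]]], dtype=np.int32)  # [batch, beam, seq]; dummy IDs
    seq_len = np.array([[3]], dtype=np.int32)            # [batch, beam]
    t = httpclient.InferInput("TOKENS_BATCH", list(tokens.shape), "INT32")
    t.set_data_from_numpy(tokens)
    s = httpclient.InferInput("SEQUENCE_LENGTH", list(seq_len.shape), "INT32")
    s.set_data_from_numpy(seq_len)
    pending.append(client.async_infer("postprocessing", [t, s]))

texts = [p.get_result().as_numpy("OUTPUT") for p in pending]
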
2 changes: 1 addition & 1 deletion all_models/gpt/tensorrt_llm/1/model.py
@@ -160,7 +160,7 @@ def execute(self, requests):
         sampling_config.output_log_probs = inputs['output_log_probs']
         sampling_config.return_dict = True

-        outputs = self.runner.generate(input_ids, sampling_config)
+        outputs = self.runner.generate(input_ids, None, sampling_config)
         output_ids = outputs["output_ids"]

         if self.rank == 0:
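
The extra None tracks a change in TensorRT-LLM's ModelRunner.generate() signature: a new positional parameter now sits between the input IDs and the sampling config (position_ids in releases from around this time, though the diff itself does not name it). A sketch of a keyword-argument variant that would survive further signature growth, assuming both parameters are still accepted by name in this TRT-LLM release:

# Sketch only: assumes ModelRunner.generate() accepts batch_input_ids and
# sampling_config as keyword arguments, so inserted positional parameters
# no longer break the call site.
outputs = self.runner.generate(
    batch_input_ids=input_ids,
    sampling_config=sampling_config,
)
output_ids = outputs["output_ids"]
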
30 changes: 15 additions & 15 deletions all_models/inflight_batcher_llm/ensemble/config.pbtxt
@@ -39,12 +39,6 @@ input [
     dims: [ 1 ]
     optional: true
   },
-  {
-    name: "image_input"
-    data_type: TYPE_FP16
-    dims: [ 3, 224, 224 ]
-    optional: true
-  },
   {
     name: "max_tokens"
     data_type: TYPE_INT32
@@ -164,6 +158,12 @@ input [
     dims: [ -1, -1 ]
     optional: true
   },
+  {
+    name: "prompt_table_extra_id"
+    data_type: TYPE_UINT64
+    dims: [ 1 ]
+    optional: true
+  },
   {
     name: "prompt_vocab_size"
     data_type: TYPE_INT32
@@ -228,10 +228,6 @@ ensemble_scheduling {
         key: "DECODER_QUERY"
         value: "decoder_text_input"
       }
-      input_map {
-        key: "IMAGE"
-        value: "image_input"
-      }
       input_map {
         key: "REQUEST_OUTPUT_LEN"
         value: "max_tokens"
@@ -261,8 +257,8 @@
         value: "pad_id"
       }
       input_map {
-        key: "PROMPT_EMBEDDING_TABLE"
-        value: "prompt_embedding_table"
+        key: "PROMPT_TABLE_EXTRA_ID"
+        value: "prompt_table_extra_id"
       }
       output_map {
         key: "REQUEST_INPUT_LEN"
@@ -305,8 +301,8 @@
         value: "_PREPROCESSOR_PAD_ID"
       }
       output_map {
-        key: "OUT_PROMPT_EMBEDDING_TABLE"
-        value: "out_prompt_embedding_table"
+        key: "OUT_PROMPT_TABLE_EXTRA_IDS"
+        value: "_OUT_PROMPT_TABLE_EXTRA_IDS"
       }
     },
     {
@@ -402,7 +398,7 @@
       }
       input_map {
         key: "prompt_embedding_table"
-        value: "out_prompt_embedding_table"
+        value: "prompt_embedding_table"
       }
       input_map {
         key: "prompt_vocab_size"
@@ -416,6 +412,10 @@
         key: "bad_words_list"
         value: "_BAD_WORDS_IDS"
       }
+      input_map {
+        key: "prompt_table_extra_ids"
+        value: "_OUT_PROMPT_TABLE_EXTRA_IDS"
+      }
       output_map {
         key: "output_ids"
         value: "_TOKENS_BATCH"
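
Taken together, these ensemble changes remove the image_input/IMAGE path and instead carry a per-request prompt_table_extra_id from the client through preprocessing (OUT_PROMPT_TABLE_EXTRA_IDS) into the tensorrt_llm step, while prompt_embedding_table now flows to the runtime directly rather than through the preprocessor. A client sketch exercising the new input: the tensor names come from this config, but the model name "ensemble", the text_input/text_output tensors, the URL, and the sample values are assumptions, and reading the extra ID as a tag that identifies a prompt table's contents (e.g. for KV-cache reuse) is inferred rather than stated in the diff.

import numpy as np
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")

text = np.array([["What is the capital of France?"]], dtype=object)
max_tokens = np.array([[64]], dtype=np.int32)
# Hypothetical ID tying this request to one particular prompt table.
extra_id = np.array([[42]], dtype=np.uint64)

inputs = []
for name, arr, dtype in [("text_input", text, "BYTES"),
                         ("max_tokens", max_tokens, "INT32"),
                         ("prompt_table_extra_id", extra_id, "UINT64")]:
    t = httpclient.InferInput(name, list(arr.shape), dtype)
    t.set_data_from_numpy(arr)
    inputs.append(t)

result = client.infer("ensemble", inputs)
print(result.as_numpy("text_output"))
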
2 changes: 1 addition & 1 deletion all_models/inflight_batcher_llm/postprocessing/1/model.py
@@ -240,7 +240,7 @@ def _postprocessing(self, tokens_batch, sequence_lengths):
                 # Exclude fake ids in multimodal models
                 fake_id_len = 0
                 for i in range(seq_len):
-                    if tokens[i] < self.tokenizer.vocab_size:
+                    if tokens[i] < len(self.tokenizer.vocab):
                         fake_id_len = i
                         break
                 output = self.tokenizer.decode(
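
The bound changes because, for Hugging Face tokenizers, vocab_size counts only the base vocabulary while the vocab mapping also contains tokens added afterwards; with the old bound, an added special token (whose ID is at or above vocab_size) would be misclassified as a multimodal fake ID. A quick demonstration, where gpt2 is just a stand-in model and the .vocab property on fast tokenizers wraps get_vocab():

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
tok.add_tokens(["<extra_0>", "<extra_1>"])

print(tok.vocab_size)        # 50257: base vocabulary only
print(len(tok.get_vocab()))  # 50259: base vocabulary plus added tokens
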
1 change: 1 addition & 0 deletions all_models/inflight_batcher_llm/postprocessing/config.pbtxt
@@ -27,6 +27,7 @@
 name: "postprocessing"
 backend: "python"
 max_batch_size: ${triton_max_batch_size}
+dynamic_batching {}
 input [
   {
     name: "TOKENS_BATCH"
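
Unlike the gpt variant above, max_batch_size here is a ${...} placeholder rather than a literal: the inflight-batcher configs are templates that get instantiated before deployment (the repo provides tools/fill_template.py for this step). A minimal Python stand-in for that substitution, with the path and batch size as example values:

from pathlib import Path
from string import Template

# string.Template understands the same ${name} placeholder syntax as the
# templated config; safe_substitute leaves any other placeholders intact.
cfg = Path("all_models/inflight_batcher_llm/postprocessing/config.pbtxt")
text = Template(cfg.read_text()).safe_substitute(triton_max_batch_size="1024")
cfg.write_text(text)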