Skip to content

Commit f80395e

Browse files
authored
TensorRT-LLM backend v0.13 Update (triton-inference-server#607)
1 parent 9a78477 commit f80395e

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

49 files changed

+2894
-1184
lines changed

README.md

Lines changed: 510 additions & 318 deletions
Large diffs are not rendered by default.

all_models/gpt/postprocessing/config.pbtxt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
name: "postprocessing"
22
backend: "python"
33
max_batch_size: 1024
4+
dynamic_batching {}
45
input [
56
{
67
name: "TOKENS_BATCH"

all_models/gpt/tensorrt_llm/1/model.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -160,7 +160,7 @@ def execute(self, requests):
160160
sampling_config.output_log_probs = inputs['output_log_probs']
161161
sampling_config.return_dict = True
162162

163-
outputs = self.runner.generate(input_ids, sampling_config)
163+
outputs = self.runner.generate(input_ids, None, sampling_config)
164164
output_ids = outputs["output_ids"]
165165

166166
if self.rank == 0:

all_models/inflight_batcher_llm/ensemble/config.pbtxt

Lines changed: 15 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -39,12 +39,6 @@ input [
3939
dims: [ 1 ]
4040
optional: true
4141
},
42-
{
43-
name: "image_input"
44-
data_type: TYPE_FP16
45-
dims: [ 3, 224, 224 ]
46-
optional: true
47-
},
4842
{
4943
name: "max_tokens"
5044
data_type: TYPE_INT32
@@ -164,6 +158,12 @@ input [
164158
dims: [ -1, -1 ]
165159
optional: true
166160
},
161+
{
162+
name: "prompt_table_extra_id"
163+
data_type: TYPE_UINT64
164+
dims: [ 1 ]
165+
optional: true
166+
},
167167
{
168168
name: "prompt_vocab_size"
169169
data_type: TYPE_INT32
@@ -228,10 +228,6 @@ ensemble_scheduling {
228228
key: "DECODER_QUERY"
229229
value: "decoder_text_input"
230230
}
231-
input_map {
232-
key: "IMAGE"
233-
value: "image_input"
234-
}
235231
input_map {
236232
key: "REQUEST_OUTPUT_LEN"
237233
value: "max_tokens"
@@ -261,8 +257,8 @@ ensemble_scheduling {
261257
value: "pad_id"
262258
}
263259
input_map {
264-
key: "PROMPT_EMBEDDING_TABLE"
265-
value: "prompt_embedding_table"
260+
key: "PROMPT_TABLE_EXTRA_ID"
261+
value: "prompt_table_extra_id"
266262
}
267263
output_map {
268264
key: "REQUEST_INPUT_LEN"
@@ -305,8 +301,8 @@ ensemble_scheduling {
305301
value: "_PREPROCESSOR_PAD_ID"
306302
}
307303
output_map {
308-
key: "OUT_PROMPT_EMBEDDING_TABLE"
309-
value: "out_prompt_embedding_table"
304+
key: "OUT_PROMPT_TABLE_EXTRA_IDS"
305+
value: "_OUT_PROMPT_TABLE_EXTRA_IDS"
310306
}
311307
},
312308
{
@@ -402,7 +398,7 @@ ensemble_scheduling {
402398
}
403399
input_map {
404400
key: "prompt_embedding_table"
405-
value: "out_prompt_embedding_table"
401+
value: "prompt_embedding_table"
406402
}
407403
input_map {
408404
key: "prompt_vocab_size"
@@ -416,6 +412,10 @@ ensemble_scheduling {
416412
key: "bad_words_list"
417413
value: "_BAD_WORDS_IDS"
418414
}
415+
input_map {
416+
key: "prompt_table_extra_ids"
417+
value: "_OUT_PROMPT_TABLE_EXTRA_IDS"
418+
}
419419
output_map {
420420
key: "output_ids"
421421
value: "_TOKENS_BATCH"

all_models/inflight_batcher_llm/postprocessing/1/model.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -240,7 +240,7 @@ def _postprocessing(self, tokens_batch, sequence_lengths):
240240
# Exclude fake ids in multimodal models
241241
fake_id_len = 0
242242
for i in range(seq_len):
243-
if tokens[i] < self.tokenizer.vocab_size:
243+
if tokens[i] < len(self.tokenizer.vocab):
244244
fake_id_len = i
245245
break
246246
output = self.tokenizer.decode(

all_models/inflight_batcher_llm/postprocessing/config.pbtxt

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@
2727
name: "postprocessing"
2828
backend: "python"
2929
max_batch_size: ${triton_max_batch_size}
30+
dynamic_batching {}
3031
input [
3132
{
3233
name: "TOKENS_BATCH"

0 commit comments

Comments (0)