Skip to content

Commit 76464e9

Browse files
authored
Update TensorRT-LLM backend (#407)
* Update TensorRT-LLM backend
1 parent 41fe3a6 commit 76464e9

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

62 files changed

+4022
-946
lines changed

.clang-format

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -59,6 +59,7 @@ PenaltyBreakString: 1000
5959
PenaltyExcessCharacter: 1000000
6060
PenaltyReturnTypeOnItsOwnLine: 60
6161
PointerAlignment: Left
62+
QualifierAlignment: Right
6263
ReflowComments: true
6364
SeparateDefinitionBlocks: Always
6465
SortIncludes: CaseSensitive

README.md

Lines changed: 130 additions & 42 deletions
Large diffs are not rendered by default.

all_models/gpt/postprocessing/1/model.py

Lines changed: 5 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33

44
import numpy as np
55
import triton_python_backend_utils as pb_utils
6-
from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer
6+
from transformers import AutoTokenizer
77

88

99
class TritonPythonModel:
@@ -30,21 +30,11 @@ def initialize(self, args):
3030
model_config = json.loads(args['model_config'])
3131
tokenizer_dir = model_config['parameters']['tokenizer_dir'][
3232
'string_value']
33-
tokenizer_type = model_config['parameters']['tokenizer_type'][
34-
'string_value']
3533

36-
if tokenizer_type == 't5':
37-
self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir,
38-
padding_side='left')
39-
elif tokenizer_type == 'auto':
40-
self.tokenizer = AutoTokenizer.from_pretrained(
41-
tokenizer_dir, padding_side='left', trust_remote_code=True)
42-
elif tokenizer_type == 'llama':
43-
self.tokenizer = LlamaTokenizer.from_pretrained(
44-
tokenizer_dir, legacy=False, padding_side='left')
45-
else:
46-
raise AttributeError(
47-
f'Unexpected tokenizer type: {tokenizer_type}')
34+
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
35+
legacy=False,
36+
padding_side="left",
37+
trust_remote_code=True)
4838
self.tokenizer.pad_token = self.tokenizer.eos_token
4939

5040
# Parse model output configs

all_models/gpt/postprocessing/config.pbtxt

Lines changed: 0 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -23,13 +23,6 @@ parameters {
2323
}
2424
}
2525

26-
parameters {
27-
key: "tokenizer_type"
28-
value: {
29-
string_value: "${tokenizer_type}"
30-
}
31-
}
32-
3326
instance_group [
3427
{
3528
count: 1

all_models/gpt/preprocessing/1/model.py

Lines changed: 5 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66
import torch
77
import triton_python_backend_utils as pb_utils
88
from torch.nn.utils.rnn import pad_sequence
9-
from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer
9+
from transformers import AutoTokenizer
1010

1111

1212
class TritonPythonModel:
@@ -33,23 +33,12 @@ def initialize(self, args):
3333
model_config = json.loads(args['model_config'])
3434
tokenizer_dir = model_config['parameters']['tokenizer_dir'][
3535
'string_value']
36-
tokenizer_type = model_config['parameters']['tokenizer_type'][
37-
'string_value']
3836

39-
if tokenizer_type == 't5':
40-
self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir,
41-
padding_side='left')
42-
elif tokenizer_type == 'auto':
43-
self.tokenizer = AutoTokenizer.from_pretrained(
44-
tokenizer_dir, padding_side='left', trust_remote_code=True)
45-
elif tokenizer_type == 'llama':
46-
self.tokenizer = LlamaTokenizer.from_pretrained(
47-
tokenizer_dir, legacy=False, padding_side='left')
48-
else:
49-
raise AttributeError(
50-
f'Unexpected tokenizer type: {tokenizer_type}')
37+
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
38+
padding_side='left',
39+
legacy=False,
40+
trust_remote_code=True)
5141
self.tokenizer.pad_token = self.tokenizer.eos_token
52-
5342
self.pad_id = self.tokenizer.encode(self.tokenizer.pad_token,
5443
add_special_tokens=False)[0]
5544

all_models/gpt/preprocessing/config.pbtxt

Lines changed: 0 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -63,13 +63,6 @@ parameters {
6363
}
6464
}
6565

66-
parameters {
67-
key: "tokenizer_type"
68-
value: {
69-
string_value: "${tokenizer_type}"
70-
}
71-
}
72-
7366
instance_group [
7467
{
7568
count: 1

all_models/gpt/tensorrt_llm/1/model.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -173,8 +173,8 @@ def execute(self, requests):
173173
]
174174

175175
if sampling_config.output_log_probs:
176-
# [max_new_tokens, batch_size, num_beams] -> [batch_size, max_new_tokens, num_beams]
177-
log_probs = self.runner.session.log_probs.transpose(
176+
# [max_seq_len, batch_size, num_beams] -> [batch_size, max_seq_len, num_beams]
177+
log_probs = self.runner.session.log_probs_tiled.transpose(
178178
0, 1).cpu().numpy()
179179
output_tensors.append(
180180
pb_utils.Tensor("log_probs", log_probs))

all_models/inflight_batcher_llm/postprocessing/1/model.py

Lines changed: 52 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,7 @@
2828

2929
import numpy as np
3030
import triton_python_backend_utils as pb_utils
31-
from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer
31+
from transformers import AutoTokenizer
3232

3333

3434
class TritonPythonModel:
@@ -55,26 +55,16 @@ def initialize(self, args):
5555
model_config = json.loads(args['model_config'])
5656
tokenizer_dir = model_config['parameters']['tokenizer_dir'][
5757
'string_value']
58-
tokenizer_type = model_config['parameters']['tokenizer_type'][
59-
'string_value']
6058
self.skip_special_tokens = model_config['parameters'].get(
6159
'skip_special_tokens',
6260
{'string_value': "true"})['string_value'].lower() in [
6361
'true', '1', 't', 'y', 'yes'
6462
]
6563

66-
if tokenizer_type == 't5':
67-
self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir,
68-
padding_side='left')
69-
elif tokenizer_type == 'auto':
70-
self.tokenizer = AutoTokenizer.from_pretrained(
71-
tokenizer_dir, padding_side='left', trust_remote_code=True)
72-
elif tokenizer_type == 'llama':
73-
self.tokenizer = LlamaTokenizer.from_pretrained(
74-
tokenizer_dir, legacy=False, padding_side='left')
75-
else:
76-
raise AttributeError(
77-
f'Unexpected tokenizer type: {tokenizer_type}')
64+
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
65+
legacy=False,
66+
padding_side='left',
67+
trust_remote_code=True)
7868
self.tokenizer.pad_token = self.tokenizer.eos_token
7969

8070
# Parse model output configs
@@ -120,19 +110,19 @@ def execute(self, requests):
120110

121111
# Get cum log probs
122112
cum_log_probs = pb_utils.get_input_tensor_by_name(
123-
request, 'CUM_LOG_PROBS').as_numpy()
113+
request, 'CUM_LOG_PROBS')
124114

125115
# Get output log probs
126116
output_log_probs = pb_utils.get_input_tensor_by_name(
127-
request, 'OUTPUT_LOG_PROBS').as_numpy()
117+
request, 'OUTPUT_LOG_PROBS')
128118

129119
# Get context logits
130120
context_logits = pb_utils.get_input_tensor_by_name(
131-
request, 'CONTEXT_LOGITS').as_numpy()
121+
request, 'CONTEXT_LOGITS')
132122

133123
# Get generation logits
134124
generation_logits = pb_utils.get_input_tensor_by_name(
135-
request, 'GENERATION_LOGITS').as_numpy()
125+
request, 'GENERATION_LOGITS')
136126

137127
# Reshape Input
138128
# tokens_batch = tokens_batch.reshape([-1, tokens_batch.shape[0]])
@@ -147,17 +137,47 @@ def execute(self, requests):
147137
'OUTPUT',
148138
np.array(outputs).astype(self.output_dtype))
149139

150-
out_cum_log_probs = pb_utils.Tensor('OUT_CUM_LOG_PROBS',
151-
cum_log_probs)
152-
153-
out_output_log_probs = pb_utils.Tensor('OUT_OUTPUT_LOG_PROBS',
154-
output_log_probs)
155-
156-
out_context_logits = pb_utils.Tensor('OUT_CONTEXT_LOGITS',
157-
context_logits)
158-
159-
out_generation_logits = pb_utils.Tensor('OUT_GENERATION_LOGITS',
160-
generation_logits)
140+
outputs = []
141+
outputs.append(output_tensor)
142+
143+
if cum_log_probs:
144+
out_cum_log_probs = pb_utils.Tensor('OUT_CUM_LOG_PROBS',
145+
cum_log_probs.as_numpy())
146+
outputs.append(out_cum_log_probs)
147+
else:
148+
out_cum_log_probs = pb_utils.Tensor(
149+
'OUT_CUM_LOG_PROBS', np.array([[0.0]], dtype=np.float32))
150+
outputs.append(out_cum_log_probs)
151+
152+
if output_log_probs:
153+
out_output_log_probs = pb_utils.Tensor(
154+
'OUT_OUTPUT_LOG_PROBS', output_log_probs.as_numpy())
155+
outputs.append(out_output_log_probs)
156+
else:
157+
out_output_log_probs = pb_utils.Tensor(
158+
'OUT_OUTPUT_LOG_PROBS',
159+
np.array([[[0.0]]], dtype=np.float32))
160+
outputs.append(out_output_log_probs)
161+
162+
if context_logits:
163+
out_context_logits = pb_utils.Tensor('OUT_CONTEXT_LOGITS',
164+
context_logits.as_numpy())
165+
outputs.append(out_context_logits)
166+
else:
167+
out_context_logits = pb_utils.Tensor(
168+
'OUT_CONTEXT_LOGITS', np.array([[[0.0]]],
169+
dtype=np.float32))
170+
outputs.append(out_context_logits)
171+
172+
if generation_logits:
173+
out_generation_logits = pb_utils.Tensor(
174+
'OUT_GENERATION_LOGITS', generation_logits.as_numpy())
175+
outputs.append(out_generation_logits)
176+
else:
177+
out_generation_logits = pb_utils.Tensor(
178+
'OUT_GENERATION_LOGITS',
179+
np.array([[[[0.0]]]], dtype=np.float32))
180+
outputs.append(out_generation_logits)
161181

162182
# Create InferenceResponse. You can set an error here in case
163183
# there was a problem with handling this inference request.
@@ -166,10 +186,8 @@ def execute(self, requests):
166186
#
167187
# pb_utils.InferenceResponse(
168188
# output_tensors=..., TritonError("An error occurred"))
169-
inference_response = pb_utils.InferenceResponse(output_tensors=[
170-
output_tensor, out_cum_log_probs, out_output_log_probs,
171-
out_context_logits, out_generation_logits
172-
])
189+
inference_response = pb_utils.InferenceResponse(
190+
output_tensors=outputs)
173191
responses.append(inference_response)
174192

175193
# You should return a list of pb_utils.InferenceResponse. Length

all_models/inflight_batcher_llm/postprocessing/config.pbtxt

Lines changed: 2 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -42,11 +42,13 @@ input [
4242
name: "CUM_LOG_PROBS"
4343
data_type: TYPE_FP32
4444
dims: [ -1 ]
45+
optional: true
4546
},
4647
{
4748
name: "OUTPUT_LOG_PROBS"
4849
data_type: TYPE_FP32
4950
dims: [ -1, -1 ]
51+
optional: true
5052
},
5153
{
5254
name: "CONTEXT_LOGITS"
@@ -96,13 +98,6 @@ parameters {
9698
}
9799
}
98100

99-
parameters {
100-
key: "tokenizer_type"
101-
value: {
102-
string_value: "${tokenizer_type}"
103-
}
104-
}
105-
106101
parameters {
107102
key: "skip_special_tokens"
108103
value: {

all_models/inflight_batcher_llm/preprocessing/1/model.py

Lines changed: 21 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,7 @@
2929

3030
import numpy as np
3131
import triton_python_backend_utils as pb_utils
32-
from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer
32+
from transformers import AutoTokenizer, T5Tokenizer
3333

3434

3535
class TritonPythonModel:
@@ -56,26 +56,18 @@ def initialize(self, args):
5656
model_config = json.loads(args['model_config'])
5757
tokenizer_dir = model_config['parameters']['tokenizer_dir'][
5858
'string_value']
59-
tokenizer_type = model_config['parameters']['tokenizer_type'][
60-
'string_value']
6159
self.add_special_tokens = model_config['parameters'].get(
6260
'add_special_tokens',
6361
{'string_value': "false"})['string_value'].lower() in [
6462
'true', '1', 't', 'y', 'yes'
6563
]
6664

67-
if tokenizer_type == 't5':
68-
self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir,
69-
padding_side='left')
70-
elif tokenizer_type == 'auto':
71-
self.tokenizer = AutoTokenizer.from_pretrained(
72-
tokenizer_dir, padding_side='left', trust_remote_code=True)
73-
elif tokenizer_type == 'llama':
74-
self.tokenizer = LlamaTokenizer.from_pretrained(
75-
tokenizer_dir, legacy=False, padding_side='left')
76-
else:
77-
raise AttributeError(
78-
f'Unexpected tokenizer type: {tokenizer_type}')
65+
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
66+
legacy=False,
67+
padding_side='left',
68+
trust_remote_code=True)
69+
if isinstance(self.tokenizer, T5Tokenizer):
70+
self.tokenizer_bos_id = self.tokenizer.sp_model.bos_id()
7971
self.tokenizer.pad_token = self.tokenizer.eos_token
8072

8173
self.tokenizer_end_id = self.tokenizer.encode(
@@ -234,13 +226,20 @@ def _create_request(self, query):
234226
"""
235227
query : batch string (2D numpy array)
236228
"""
237-
start_ids = [
238-
np.array(
239-
self.tokenizer.encode(
240-
s[0].decode(),
241-
add_special_tokens=self.add_special_tokens)).astype(int)
242-
for s in query
243-
]
229+
if isinstance(self.tokenizer, T5Tokenizer):
230+
start_ids = [
231+
np.array([self.tokenizer_bos_id] + self.tokenizer.encode(
232+
s[0].decode(), add_special_tokens=self.add_special_tokens)
233+
).astype(int) for s in query
234+
]
235+
else:
236+
start_ids = [
237+
np.array(
238+
self.tokenizer.encode(
239+
s[0].decode(),
240+
add_special_tokens=self.add_special_tokens)).astype(
241+
int) for s in query
242+
]
244243
start_lengths = np.array([[len(ids)] for ids in start_ids]).astype(int)
245244

246245
max_len = 0

0 commit comments

Comments
 (0)