1 change: 1 addition & 0 deletions .clang-format
@@ -59,6 +59,7 @@ PenaltyBreakString: 1000
 PenaltyExcessCharacter: 1000000
 PenaltyReturnTypeOnItsOwnLine: 60
 PointerAlignment: Left
+QualifierAlignment: Right
 ReflowComments: true
 SeparateDefinitionBlocks: Always
 SortIncludes: CaseSensitive
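For context: `QualifierAlignment: Right` tells clang-format to place qualifiers such as `const` to the right of the base type they modify, so a declaration like `const int* p` is reformatted as `int const* p` (east-const style).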
172 changes: 130 additions & 42 deletions README.md

Large diffs are not rendered by default.

20 changes: 5 additions & 15 deletions all_models/gpt/postprocessing/1/model.py
@@ -3,7 +3,7 @@
 
 import numpy as np
 import triton_python_backend_utils as pb_utils
-from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer
+from transformers import AutoTokenizer
 
 
 class TritonPythonModel:
@@ -30,21 +30,11 @@ def initialize(self, args):
         model_config = json.loads(args['model_config'])
         tokenizer_dir = model_config['parameters']['tokenizer_dir'][
             'string_value']
-        tokenizer_type = model_config['parameters']['tokenizer_type'][
-            'string_value']
 
-        if tokenizer_type == 't5':
-            self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir,
-                                         padding_side='left')
-        elif tokenizer_type == 'auto':
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                tokenizer_dir, padding_side='left', trust_remote_code=True)
-        elif tokenizer_type == 'llama':
-            self.tokenizer = LlamaTokenizer.from_pretrained(
-                tokenizer_dir, legacy=False, padding_side='left')
-        else:
-            raise AttributeError(
-                f'Unexpected tokenizer type: {tokenizer_type}')
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
+                                                       legacy=False,
+                                                       padding_side="left",
+                                                       trust_remote_code=True)
         self.tokenizer.pad_token = self.tokenizer.eos_token
 
         # Parse model output configs
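The same consolidation is applied in every pre/postprocessing model below: `AutoTokenizer.from_pretrained` inspects the checkpoint's metadata and instantiates the matching tokenizer class itself, so the per-type branching and the `tokenizer_type` parameter become unnecessary. A minimal sketch of the behavior, assuming a local HF checkpoint directory (the `gpt2` id here is only an illustration standing in for `tokenizer_dir`):

```python
# Minimal sketch: AutoTokenizer picks the concrete tokenizer class from the
# checkpoint's config, so GPT, Llama, and T5 directories all load the same way.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2",  # stands in for tokenizer_dir
                                          legacy=False,
                                          padding_side="left",
                                          trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token  # GPT-style vocabs define no pad token
ids = tokenizer.encode("hello world", add_special_tokens=False)
assert tokenizer.decode(ids) == "hello world"
print(type(tokenizer).__name__)  # e.g. GPT2TokenizerFast
```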
7 changes: 0 additions & 7 deletions all_models/gpt/postprocessing/config.pbtxt
@@ -23,13 +23,6 @@ parameters {
   }
 }
 
-parameters {
-  key: "tokenizer_type"
-  value: {
-    string_value: "${tokenizer_type}"
-  }
-}
-
 instance_group [
   {
     count: 1
21 changes: 5 additions & 16 deletions all_models/gpt/preprocessing/1/model.py
@@ -6,7 +6,7 @@
 import torch
 import triton_python_backend_utils as pb_utils
 from torch.nn.utils.rnn import pad_sequence
-from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer
+from transformers import AutoTokenizer
 
 
 class TritonPythonModel:
@@ -33,23 +33,12 @@ def initialize(self, args):
         model_config = json.loads(args['model_config'])
         tokenizer_dir = model_config['parameters']['tokenizer_dir'][
             'string_value']
-        tokenizer_type = model_config['parameters']['tokenizer_type'][
-            'string_value']
 
-        if tokenizer_type == 't5':
-            self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir,
-                                         padding_side='left')
-        elif tokenizer_type == 'auto':
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                tokenizer_dir, padding_side='left', trust_remote_code=True)
-        elif tokenizer_type == 'llama':
-            self.tokenizer = LlamaTokenizer.from_pretrained(
-                tokenizer_dir, legacy=False, padding_side='left')
-        else:
-            raise AttributeError(
-                f'Unexpected tokenizer type: {tokenizer_type}')
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
+                                                       padding_side='left',
+                                                       legacy=False,
+                                                       trust_remote_code=True)
         self.tokenizer.pad_token = self.tokenizer.eos_token
 
         self.pad_id = self.tokenizer.encode(self.tokenizer.pad_token,
                                             add_special_tokens=False)[0]
 
7 changes: 0 additions & 7 deletions all_models/gpt/preprocessing/config.pbtxt
@@ -63,13 +63,6 @@ parameters {
   }
 }
 
-parameters {
-  key: "tokenizer_type"
-  value: {
-    string_value: "${tokenizer_type}"
-  }
-}
-
 instance_group [
   {
     count: 1
4 changes: 2 additions & 2 deletions all_models/gpt/tensorrt_llm/1/model.py
@@ -173,8 +173,8 @@ def execute(self, requests):
             ]
 
             if sampling_config.output_log_probs:
-                # [max_new_tokens, batch_size, num_beams] -> [batch_size, max_new_tokens, num_beams]
-                log_probs = self.runner.session.log_probs.transpose(
+                # [max_seq_len, batch_size, num_beams] -> [batch_size, max_seq_len, num_beams]
+                log_probs = self.runner.session.log_probs_tiled.transpose(
                     0, 1).cpu().numpy()
                 output_tensors.append(
                     pb_utils.Tensor("log_probs", log_probs))
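The comment fix reflects that the runner's buffer covers the full sequence length rather than only the newly generated tokens, and `log_probs_tiled` is stored time-major. A small numpy sketch of the axis swap (shapes invented for illustration):

```python
import numpy as np

# Invented shapes: the runner stores log probs time-major.
max_seq_len, batch_size, num_beams = 8, 2, 4
log_probs_tiled = np.zeros((max_seq_len, batch_size, num_beams), np.float32)

# torch's .transpose(0, 1) swaps the first two axes; numpy equivalent:
log_probs = log_probs_tiled.transpose(1, 0, 2)
assert log_probs.shape == (batch_size, max_seq_len, num_beams)
```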
86 changes: 52 additions & 34 deletions all_models/inflight_batcher_llm/postprocessing/1/model.py
@@ -28,7 +28,7 @@
 
 import numpy as np
 import triton_python_backend_utils as pb_utils
-from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer
+from transformers import AutoTokenizer
 
 
 class TritonPythonModel:
@@ -55,26 +55,16 @@ def initialize(self, args):
         model_config = json.loads(args['model_config'])
         tokenizer_dir = model_config['parameters']['tokenizer_dir'][
             'string_value']
-        tokenizer_type = model_config['parameters']['tokenizer_type'][
-            'string_value']
         self.skip_special_tokens = model_config['parameters'].get(
             'skip_special_tokens',
             {'string_value': "true"})['string_value'].lower() in [
                 'true', '1', 't', 'y', 'yes'
             ]
 
-        if tokenizer_type == 't5':
-            self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir,
-                                         padding_side='left')
-        elif tokenizer_type == 'auto':
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                tokenizer_dir, padding_side='left', trust_remote_code=True)
-        elif tokenizer_type == 'llama':
-            self.tokenizer = LlamaTokenizer.from_pretrained(
-                tokenizer_dir, legacy=False, padding_side='left')
-        else:
-            raise AttributeError(
-                f'Unexpected tokenizer type: {tokenizer_type}')
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
+                                                       legacy=False,
+                                                       padding_side='left',
+                                                       trust_remote_code=True)
         self.tokenizer.pad_token = self.tokenizer.eos_token
 
         # Parse model output configs
@@ -120,19 +110,19 @@ def execute(self, requests):
 
             # Get cum log probs
             cum_log_probs = pb_utils.get_input_tensor_by_name(
-                request, 'CUM_LOG_PROBS').as_numpy()
+                request, 'CUM_LOG_PROBS')
 
             # Get sequence length
             output_log_probs = pb_utils.get_input_tensor_by_name(
-                request, 'OUTPUT_LOG_PROBS').as_numpy()
+                request, 'OUTPUT_LOG_PROBS')
 
             # Get context logits
             context_logits = pb_utils.get_input_tensor_by_name(
-                request, 'CONTEXT_LOGITS').as_numpy()
+                request, 'CONTEXT_LOGITS')
 
             # Get generation logits
             generation_logits = pb_utils.get_input_tensor_by_name(
-                request, 'GENERATION_LOGITS').as_numpy()
+                request, 'GENERATION_LOGITS')
 
             # Reshape Input
             # tokens_batch = tokens_batch.reshape([-1, tokens_batch.shape[0]])
@@ -147,17 +137,47 @@ def execute(self, requests):
                 'OUTPUT',
                 np.array(outputs).astype(self.output_dtype))
 
-            out_cum_log_probs = pb_utils.Tensor('OUT_CUM_LOG_PROBS',
-                                                cum_log_probs)
-
-            out_output_log_probs = pb_utils.Tensor('OUT_OUTPUT_LOG_PROBS',
-                                                   output_log_probs)
-
-            out_context_logits = pb_utils.Tensor('OUT_CONTEXT_LOGITS',
-                                                 context_logits)
-
-            out_generation_logits = pb_utils.Tensor('OUT_GENERATION_LOGITS',
-                                                    generation_logits)
+            outputs = []
+            outputs.append(output_tensor)
+
+            if cum_log_probs:
+                out_cum_log_probs = pb_utils.Tensor('OUT_CUM_LOG_PROBS',
+                                                    cum_log_probs.as_numpy())
+                outputs.append(out_cum_log_probs)
+            else:
+                out_cum_log_probs = pb_utils.Tensor(
+                    'OUT_CUM_LOG_PROBS', np.array([[0.0]], dtype=np.float32))
+                outputs.append(out_cum_log_probs)
+
+            if output_log_probs:
+                out_output_log_probs = pb_utils.Tensor(
+                    'OUT_OUTPUT_LOG_PROBS', output_log_probs.as_numpy())
+                outputs.append(out_output_log_probs)
+            else:
+                out_output_log_probs = pb_utils.Tensor(
+                    'OUT_OUTPUT_LOG_PROBS',
+                    np.array([[[0.0]]], dtype=np.float32))
+                outputs.append(out_output_log_probs)
+
+            if context_logits:
+                out_context_logits = pb_utils.Tensor('OUT_CONTEXT_LOGITS',
+                                                     context_logits.as_numpy())
+                outputs.append(out_context_logits)
+            else:
+                out_context_logits = pb_utils.Tensor(
+                    'OUT_CONTEXT_LOGITS', np.array([[[0.0]]],
+                                                   dtype=np.float32))
+                outputs.append(out_context_logits)
+
+            if generation_logits:
+                out_generation_logits = pb_utils.Tensor(
+                    'OUT_GENERATION_LOGITS', generation_logits.as_numpy())
+                outputs.append(out_generation_logits)
+            else:
+                out_generation_logits = pb_utils.Tensor(
+                    'OUT_GENERATION_LOGITS',
+                    np.array([[[[0.0]]]], dtype=np.float32))
+                outputs.append(out_generation_logits)
 
             # Create InferenceResponse. You can set an error here in case
             # there was a problem with handling this inference request.
@@ -166,10 +186,8 @@ def execute(self, requests):
             #
             # pb_utils.InferenceResponse(
             #     output_tensors=..., TritonError("An error occurred"))
-            inference_response = pb_utils.InferenceResponse(output_tensors=[
-                output_tensor, out_cum_log_probs, out_output_log_probs,
-                out_context_logits, out_generation_logits
-            ])
+            inference_response = pb_utils.InferenceResponse(
+                output_tensors=outputs)
             responses.append(inference_response)
 
             # You should return a list of pb_utils.InferenceResponse. Length
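The four `if`/`else` blocks follow one pattern: an optional input arrives as `None` when the client omits it, and the model substitutes a fixed-shape zero placeholder so the response schema stays stable. A hedged sketch of that pattern factored into a helper (the helper itself is hypothetical, not part of this PR; `pb_utils` is only importable inside Triton's Python backend):

```python
import numpy as np
import triton_python_backend_utils as pb_utils  # available in-backend only

def output_or_placeholder(name, maybe_tensor, placeholder_shape):
    # maybe_tensor is what get_input_tensor_by_name returned: a pb_utils.Tensor
    # when the optional input was supplied, otherwise None.
    if maybe_tensor is not None:
        return pb_utils.Tensor(name, maybe_tensor.as_numpy())
    return pb_utils.Tensor(name, np.zeros(placeholder_shape, dtype=np.float32))

# e.g.: outputs.append(
#     output_or_placeholder('OUT_CUM_LOG_PROBS', cum_log_probs, (1, 1)))
```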
9 changes: 2 additions & 7 deletions all_models/inflight_batcher_llm/postprocessing/config.pbtxt
@@ -42,11 +42,13 @@ input [
     name: "CUM_LOG_PROBS"
     data_type: TYPE_FP32
     dims: [ -1 ]
+    optional: true
   },
   {
     name: "OUTPUT_LOG_PROBS"
     data_type: TYPE_FP32
     dims: [ -1, -1 ]
+    optional: true
   },
   {
     name: "CONTEXT_LOGITS"
@@ -96,13 +98,6 @@ parameters {
   }
 }
 
-parameters {
-  key: "tokenizer_type"
-  value: {
-    string_value: "${tokenizer_type}"
-  }
-}
-
 parameters {
   key: "skip_special_tokens"
   value: {
43 changes: 21 additions & 22 deletions all_models/inflight_batcher_llm/preprocessing/1/model.py
@@ -29,7 +29,7 @@
 
 import numpy as np
 import triton_python_backend_utils as pb_utils
-from transformers import AutoTokenizer, LlamaTokenizer, T5Tokenizer
+from transformers import AutoTokenizer, T5Tokenizer
 
 
 class TritonPythonModel:
@@ -56,26 +56,18 @@ def initialize(self, args):
         model_config = json.loads(args['model_config'])
         tokenizer_dir = model_config['parameters']['tokenizer_dir'][
             'string_value']
-        tokenizer_type = model_config['parameters']['tokenizer_type'][
-            'string_value']
         self.add_special_tokens = model_config['parameters'].get(
             'add_special_tokens',
             {'string_value': "false"})['string_value'].lower() in [
                 'true', '1', 't', 'y', 'yes'
             ]
 
-        if tokenizer_type == 't5':
-            self.tokenizer = T5Tokenizer(vocab_file=tokenizer_dir,
-                                         padding_side='left')
-        elif tokenizer_type == 'auto':
-            self.tokenizer = AutoTokenizer.from_pretrained(
-                tokenizer_dir, padding_side='left', trust_remote_code=True)
-        elif tokenizer_type == 'llama':
-            self.tokenizer = LlamaTokenizer.from_pretrained(
-                tokenizer_dir, legacy=False, padding_side='left')
-        else:
-            raise AttributeError(
-                f'Unexpected tokenizer type: {tokenizer_type}')
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir,
+                                                       legacy=False,
+                                                       padding_side='left',
+                                                       trust_remote_code=True)
+        if isinstance(self.tokenizer, T5Tokenizer):
+            self.tokenizer_bos_id = self.tokenizer.sp_model.bos_id()
         self.tokenizer.pad_token = self.tokenizer.eos_token
 
         self.tokenizer_end_id = self.tokenizer.encode(
@@ -234,13 +226,20 @@ def _create_request(self, query):
         """
             query : batch string (2D numpy array)
         """
-        start_ids = [
-            np.array(
-                self.tokenizer.encode(
-                    s[0].decode(),
-                    add_special_tokens=self.add_special_tokens)).astype(int)
-            for s in query
-        ]
+        if isinstance(self.tokenizer, T5Tokenizer):
+            start_ids = [
+                np.array([self.tokenizer_bos_id] + self.tokenizer.encode(
+                    s[0].decode(), add_special_tokens=self.add_special_tokens)
+                         ).astype(int) for s in query
+            ]
+        else:
+            start_ids = [
+                np.array(
+                    self.tokenizer.encode(
+                        s[0].decode(),
+                        add_special_tokens=self.add_special_tokens)).astype(
+                            int) for s in query
+            ]
         start_lengths = np.array([[len(ids)] for ids in start_ids]).astype(int)
 
         max_len = 0
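The `T5Tokenizer` special case exists because T5's sentencepiece tokenizer does not prepend a BOS id on its own, so the preprocessor reads the id from the underlying sentencepiece model and prepends it manually. A rough sketch, assuming a local T5 checkpoint (`t5-small` is only an example id; the actual BOS value depends on the checkpoint's sentencepiece model):

```python
import numpy as np
from transformers import T5Tokenizer

tok = T5Tokenizer.from_pretrained("t5-small")
bos_id = tok.sp_model.bos_id()  # id defined by the sentencepiece model
ids = tok.encode("translate English to German: hello",
                 add_special_tokens=False)
start_ids = np.array([bos_id] + ids).astype(int)  # BOS prepended by hand
```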
9 changes: 1 addition & 8 deletions all_models/inflight_batcher_llm/preprocessing/config.pbtxt
@@ -125,17 +125,10 @@ parameters {
   }
 }
 
-parameters {
-  key: "tokenizer_type"
-  value: {
-    string_value: "${tokenizer_type}"
-  }
-}
-
 parameters {
   key: "add_special_tokens"
   value: {
-    string_value: "False"
+    string_value: "${add_special_tokens}"
   }
 }
 