12 changes: 12 additions & 0 deletions examples/llm-api/quickstart_advanced.py
@@ -108,6 +108,9 @@ def add_llm_args(parser):
default=False,
action='store_true',
help='Use piecewise CUDA graph to optimize the model')
parser.add_argument('--apply_chat_template',
default=False,
action='store_true')

# Sampling
parser.add_argument("--max_tokens", type=int, default=64)
@@ -273,6 +276,15 @@ def main():
prompts = args.prompt if args.prompt else example_prompts

llm, sampling_params = setup_llm(args)
new_prompts = []
if args.apply_chat_template:
for prompt in prompts:
messages = [{"role": "user", "content": f"{prompt}"}]
new_prompts.append(
llm.tokenizer.apply_chat_template(messages,
tokenize=False,
add_generation_prompt=True))
prompts = new_prompts
outputs = llm.generate(prompts, sampling_params)

for i, output in enumerate(outputs):
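The new `--apply_chat_template` flag wraps each raw prompt in the tokenizer's chat template before generation, so instruction-tuned checkpoints see prompts in the format they were trained on instead of bare text. A minimal sketch of the same call outside the script, assuming a Hugging Face tokenizer that ships a chat template (the model id is illustrative, not from this PR):

```python
from transformers import AutoTokenizer

# Illustrative checkpoint; any model that ships a chat template works.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")

messages = [{"role": "user", "content": "What is the capital of France?"}]

# tokenize=False returns the templated string rather than token ids;
# add_generation_prompt=True appends the assistant header so generation
# starts at the model's reply.
templated = tokenizer.apply_chat_template(messages,
                                          tokenize=False,
                                          add_generation_prompt=True)
print(templated)
```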
3 changes: 3 additions & 0 deletions tensorrt_llm/_torch/attention_backend/interface.py
@@ -342,6 +342,7 @@ def __call__(self, position_ids: torch.Tensor, q: torch.Tensor,
class RopeParams:
dim: int = 0
theta: float = 10000.0
alpha: float = 1.0
scale_type: RotaryScalingType = RotaryScalingType.none
scale: float = 1.0
low_freq_factor: float = 1.0
@@ -384,6 +385,7 @@ def from_config(config) -> "RopeParams":
rope_params.scale_type = RotaryScalingType.none
rope_params.scale = 1.0
if rope_scaling is not None:
rope_params.alpha = rope_scaling.get("alpha", 1.0)
rotary_scaling_type = rope_scaling.get(
"type", None) or rope_scaling.get("rope_type")
rope_params.scale_type = RotaryScalingType.from_string(
@@ -462,6 +464,7 @@ def create_rope_const_params(self, interleave: bool = True):
self.scale_type,
rope_scaling_config={
"factor": self.scale,
"alpha": self.alpha,
"low_freq_factor": self.low_freq_factor,
"high_freq_factor": self.high_freq_factor,
"original_max_position_embeddings":
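`alpha` carries the NTK-style rotary scaling factor that HunYuan checkpoints publish under `rope_scaling.alpha`; the diff only reads it from the config and threads it into `rope_scaling_config`, leaving the consuming kernel code out of this hunk. In the common NTK-by-alpha scheme the factor simply enlarges the rotary base, sketched below under that assumption (the formula and helper names are illustrative, not taken from this PR):

```python
import torch


def ntk_alpha_base(theta: float, dim: int, alpha: float) -> float:
    # Common NTK-by-alpha convention: grow the rotary base so low-frequency
    # components stretch over longer contexts; alpha == 1.0 is a no-op.
    return theta * alpha ** (dim / (dim - 2))


def rope_inv_freq(dim: int, theta: float = 10000.0, alpha: float = 1.0) -> torch.Tensor:
    base = ntk_alpha_base(theta, dim, alpha)
    return 1.0 / base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim)


# alpha=1.0 reproduces vanilla RoPE frequencies; a larger alpha lowers them.
print(rope_inv_freq(128, alpha=1.0)[:4])
print(rope_inv_freq(128, alpha=50.0)[:4])
```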
7 changes: 5 additions & 2 deletions tensorrt_llm/_torch/model_config.py
@@ -151,8 +151,11 @@ def fuse_pos_embd(self):
@property
def enable_flash_mla(self):
if self.attn_backend == 'TRTLLM':
- if hasattr(self.pretrained_config, "kv_lora_rank") and hasattr(
-     self.pretrained_config, "qk_rope_head_dim"):
+ if hasattr(
+     self.pretrained_config, "kv_lora_rank"
+ ) and self.pretrained_config.kv_lora_rank is not None and hasattr(
+     self.pretrained_config, "qk_rope_head_dim"
+ ) and self.pretrained_config.qk_rope_head_dim is not None:
head_dim = self.pretrained_config.kv_lora_rank + self.pretrained_config.qk_rope_head_dim
if head_dim == 576 and torch.cuda.get_device_capability() == (
9, 0):
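The rewritten condition also requires `kv_lora_rank` and `qk_rope_head_dim` to be non-`None`, not merely present: configs that declare the fields with a `None` default would previously pass the `hasattr` check and then fail on the `head_dim` addition. The intent, restated as a standalone sketch (the helper name is illustrative):

```python
import torch


def supports_flash_mla(pretrained_config, attn_backend: str = "TRTLLM") -> bool:
    if attn_backend != "TRTLLM":
        return False
    kv_lora_rank = getattr(pretrained_config, "kv_lora_rank", None)
    qk_rope_head_dim = getattr(pretrained_config, "qk_rope_head_dim", None)
    # Both fields must exist *and* be set; hasattr alone is not enough when a
    # config class declares them but leaves them as None.
    if kv_lora_rank is None or qk_rope_head_dim is None:
        return False
    # This path only enables flash MLA for the DeepSeek-style 512 + 64 = 576
    # head layout on Hopper (SM90).
    return (kv_lora_rank + qk_rope_head_dim == 576
            and torch.cuda.get_device_capability() == (9, 0))
```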
3 changes: 3 additions & 0 deletions tensorrt_llm/_torch/models/__init__.py
@@ -8,6 +8,7 @@
from .modeling_gemma3 import Gemma3ForCausalLM
from .modeling_gemma3vl import Gemma3VLM
from .modeling_gpt_oss import GptOssForCausalLM
from .modeling_hunyuan_moe import HunYuanMoEV1ForCausalLM
from .modeling_hyperclovax import HCXVisionForCausalLM
from .modeling_llama import LlamaForCausalLM
from .modeling_llava_next import LlavaNextModel
@@ -38,6 +39,8 @@
"Gemma3ForCausalLM",
"Gemma3VLM",
"HCXVisionForCausalLM",
"HunYuanMoEV1ForCausalLM",
"Gemma3Model",
"LlamaForCausalLM",
"LlavaNextModel",
"Mistral3VLM",
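Importing `HunYuanMoEV1ForCausalLM` in the package `__init__` and adding it to `__all__` is what makes the architecture visible to the PyTorch backend's model registry, so HunYuan MoE checkpoints can be loaded through the LLM API. A hedged end-to-end sketch tying this to the new chat-template handling (the checkpoint path is a placeholder, not from this PR):

```python
from tensorrt_llm import LLM, SamplingParams

# Placeholder path to a local HunYuan MoE checkpoint; adjust to your setup.
llm = LLM(model="/models/hunyuan-moe-v1")

prompt = llm.tokenizer.apply_chat_template(
    [{"role": "user", "content": "Briefly explain rotary position embeddings."}],
    tokenize=False,
    add_generation_prompt=True)

outputs = llm.generate([prompt], SamplingParams(max_tokens=64))
print(outputs[0].outputs[0].text)
```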