@@ -45,6 +45,8 @@ microsoft/Phi-3.5-mini-instruct:
- accuracy: 31.354
microsoft/Phi-4-mini-instruct:
- accuracy: 32.921
- quant_algo: FP8
accuracy: 32.823
bigcode/starcoder2-7b:
- accuracy: 26.611
- quant_algo: FP8
@@ -132,6 +134,8 @@ meta-llama/Llama-3.1-8B-Instruct:
- accuracy: 33.640
- spec_dec_algo: Eagle
accuracy: 33.640
- extra_acc_spec: logprobs=2
accuracy: 30.522
- quant_algo: FP8
accuracy: 33.841
- quant_algo: FP8
@@ -207,7 +211,8 @@ mistralai/Mistral-7B-Instruct-v0.3:
accuracy: 31.201
mistralai/Mistral-Small-3.1-24B-Instruct-2503:
- accuracy: 29.20
mistralai/Mistral-Nemo-Base-2407:
mistralai/Mistral-Nemo-12b-Base:
- accuracy: 28.906
- quant_algo: FP8
kv_cache_quant_algo: FP8
accuracy: 24.0
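In these reference files, each model name maps to a list of acceptance entries: an accuracy, optionally qualified by keys such as quant_algo, kv_cache_quant_algo, spec_dec_algo, or extra_acc_spec (the logprobs=2 entry added above). The sketch below is a hypothetical illustration of how such entries could be matched to a test configuration; the real selection logic lives in the accuracy harness and may differ.

```python
# Hypothetical illustration only, not the harness's actual code: match a
# test configuration against a model's list of reference entries.
def select_reference(entries, **spec):
    """Return the accuracy of the first entry whose qualifier keys equal spec."""
    for entry in entries:
        qualifiers = {k: v for k, v in entry.items() if k != "accuracy"}
        if qualifiers == spec:
            return entry["accuracy"]
    raise KeyError(f"no reference entry for {spec}")

# Entries mirroring the meta-llama/Llama-3.1-8B-Instruct hunk above.
llama31_refs = [
    {"accuracy": 33.640},
    {"spec_dec_algo": "Eagle", "accuracy": 33.640},
    {"extra_acc_spec": "logprobs=2", "accuracy": 30.522},
    {"quant_algo": "FP8", "accuracy": 33.841},
]
assert select_reference(llama31_refs) == 33.640
assert select_reference(llama31_refs, extra_acc_spec="logprobs=2") == 30.522
```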
6 changes: 6 additions & 0 deletions tests/integration/defs/accuracy/references/mmlu.yaml
@@ -232,6 +232,8 @@ nvidia/Nemotron-H-56B-Base-8K:
accuracy: 83.82
microsoft/Phi-4-mini-instruct:
- accuracy: 68.98
- quant_algo: FP8
accuracy: 68.30
bigcode/starcoder2-7b:
- accuracy: 41.35
- quant_algo: FP8
@@ -275,3 +277,7 @@ GPT-OSS/MXFP4:
accuracy: 75.50
- quant_algo: W4A8_MXFP4_FP8
accuracy: 75.50
mistralai/Mistral-Nemo-12b-Base:
- accuracy: 69.66
- quant_algo: FP8
accuracy: 69.66
87 changes: 81 additions & 6 deletions tests/integration/defs/accuracy/test_llm_api.py
@@ -15,7 +15,9 @@
import pytest

from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig
from tensorrt_llm.llmapi import (EagleDecodingConfig,
ExtendedRuntimePerfKnobConfig, KvCacheConfig,
SamplingParams)
from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm.quantization import QuantAlgo

@@ -76,6 +78,27 @@ def test_guided_decoding_4gpus(self, backend: str):
task = JsonModeEval(self.MODEL_NAME)
task.evaluate(llm)

def test_gather_generation_logits_cuda_graph(self):
"""RCCA: https://nvbugs/5365525"""
extended_runtime_perf_knob_config = ExtendedRuntimePerfKnobConfig(
cuda_graph_mode=True, cuda_graph_cache_size=1)
llm = LLM(
self.MODEL_PATH,
gather_generation_logits=True,
extended_runtime_perf_knob_config=extended_runtime_perf_knob_config)
with llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)

def test_logprobs(self):
sampling_config = SamplingParams(logprobs=2)
llm = LLM(self.MODEL_PATH, gather_generation_logits=True)
with llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm,
sampling_params=sampling_config,
extra_acc_spec="logprobs=2")


class TestLlama3_2_1B(LlmapiAccuracyTestHarness):
MODEL_NAME = "meta-llama/Llama-3.2-1B"
@@ -177,18 +200,49 @@ def test_quant_tp4(self, quant):
task.evaluate(llm)


class TestMistral_Nemo_12B_Base(LlmapiAccuracyTestHarness):
MODEL_NAME = "mistralai/Mistral-Nemo-Base-2407"
class TestMistralNemo12B(LlmapiAccuracyTestHarness):
MODEL_NAME = "mistralai/Mistral-Nemo-12b-Base"
MODEL_PATH = f"{llm_models_root()}/Mistral-Nemo-Base-2407"

@pytest.mark.skip_less_device_memory(80000)
def test_auto_dtype(self):
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)

with LLM(self.MODEL_PATH,
kv_cache_config=kv_cache_config,
max_batch_size=8) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)

def test_auto_dtype_tp2(self):
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)

with LLM(self.MODEL_PATH,
kv_cache_config=kv_cache_config,
tensor_parallel_size=2,
max_batch_size=8) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)

@pytest.mark.skip_less_device_memory(80000)
@skip_pre_ada
def test_fp8(self):
quant_config = QuantConfig(quant_algo=QuantAlgo.FP8,
quant_config = QuantConfig(QuantAlgo.FP8,
kv_cache_quant_algo=QuantAlgo.FP8)
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)

with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
with LLM(self.MODEL_PATH,
quant_config=quant_config,
kv_cache_config=kv_cache_config,
max_batch_size=8) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)


class TestMistral_NeMo_Minitron_8B_Instruct(LlmapiAccuracyTestHarness):
@@ -244,6 +298,27 @@ def test_awq_tp2(self):
task.evaluate(llm)


class TestPhi4MiniInstruct(LlmapiAccuracyTestHarness):
MODEL_NAME = "microsoft/Phi-4-mini-instruct"
MODEL_PATH = f"{llm_models_root()}/Phi-4-mini-instruct"

def test_auto_dtype(self):
with LLM(self.MODEL_PATH) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)

@skip_pre_ada
def test_fp8(self):
quant_config = QuantConfig(QuantAlgo.FP8)
with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)


class TestQwen2_7BInstruct(LlmapiAccuracyTestHarness):
MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
MODEL_PATH = f"{llm_models_root()}/Qwen2-7B-Instruct"
@@ -378,7 +453,7 @@ def test_fp8(self):
@skip_pre_ada
def test_fp8_kvcache(self):
"RCCA: https://nvbugs/5065080"
quant_config = QuantConfig(QuantAlgo.FP8,
quant_config = QuantConfig(quant_algo=QuantAlgo.FP8,
kv_cache_quant_algo=QuantAlgo.FP8)
with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
task = CnnDailymail(self.MODEL_NAME)
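The two new TestLlama3_1_8BInstruct tests above cover gathering generation logits under CUDA-graph execution (nvbugs/5365525) and requesting per-token logprobs. A minimal standalone sketch of the logprobs path follows; the model path is a placeholder, and the generate/output-object usage is the standard LLM-API pattern rather than code taken from this PR.

```python
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import SamplingParams

# Placeholder path; the tests resolve the real one via llm_models_root().
with LLM("/models/Llama-3.1-8B-Instruct",
         gather_generation_logits=True) as llm:
    # Request the top-2 logprobs per generated token, as test_logprobs does.
    params = SamplingParams(max_tokens=16, logprobs=2)
    outputs = llm.generate(["Summarize: TensorRT-LLM is a toolbox for ..."],
                           params)
    completion = outputs[0].outputs[0]
    print(completion.text)
    print(completion.logprobs)  # per-token top-2 log-probabilities
```

Relatedly, the two QuantConfig spellings touched in this file, QuantConfig(QuantAlgo.FP8, ...) and QuantConfig(quant_algo=QuantAlgo.FP8, ...), are equivalent, since quant_algo is QuantConfig's first positional field; those edits are stylistic.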
30 changes: 30 additions & 0 deletions tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -1773,6 +1773,36 @@ def test_auto_dtype_tp8(self):
task.evaluate(llm)


class TestMistralNemo12B(LlmapiAccuracyTestHarness):
MODEL_NAME = "mistralai/Mistral-Nemo-12b-Base"
MODEL_PATH = f"{llm_models_root()}/Mistral-Nemo-Base-2407"

@pytest.mark.skip_less_device_memory(80000)
def test_auto_dtype(self):
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)

with LLM(self.MODEL_PATH,
kv_cache_config=kv_cache_config,
max_batch_size=8) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)

@pytest.mark.skip_less_device(2)
def test_auto_dtype_tp2(self):
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)

with LLM(self.MODEL_PATH,
kv_cache_config=kv_cache_config,
tensor_parallel_size=2,
max_batch_size=8) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)


@pytest.mark.timeout(5400)
@pytest.mark.skip_less_device_memory(80000)
class TestLlama3_3NemotronSuper49Bv1(LlmapiAccuracyTestHarness):
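The PyTorch-backend copy of TestMistralNemo12B exercises the same model with a capped KV cache and batch size. Distilled into a standalone sketch, under the assumption that this file's LLM comes from the top-level tensorrt_llm package (the PyTorch backend), again with a placeholder model path:

```python
from tensorrt_llm import LLM  # assumed PyTorch-backend entry point for this file
from tensorrt_llm.llmapi import KvCacheConfig

# Cap the KV cache at 90% of free GPU memory, as the new tests do.
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)

# Placeholder path; the tests resolve the real one via llm_models_root().
with LLM("/models/Mistral-Nemo-Base-2407",
         kv_cache_config=kv_cache_config,
         tensor_parallel_size=2,  # as in test_auto_dtype_tp2
         max_batch_size=8) as llm:
    outputs = llm.generate(["The theory of relativity says"])
    print(outputs[0].outputs[0].text)
```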
10 changes: 9 additions & 1 deletion tests/integration/test_lists/qa/llm_function_full.txt
@@ -420,6 +420,10 @@ accuracy/test_cli_flow.py::TestQwen2_57B_A14B::test_tp4
accuracy/test_cli_flow.py::TestQwen2_57B_A14B::test_tp2pp2
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar]
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_logprobs
accuracy/test_llm_api.py::TestPhi4MiniInstruct::test_auto_dtype
accuracy/test_llm_api.py::TestPhi4MiniInstruct::test_fp8
accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_auto_dtype
accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_weight_only
accuracy/test_llm_api.py::TestLlama3_1_8B::test_fp8_rowwise
@@ -431,7 +435,9 @@ accuracy/test_llm_api.py::TestQwen2_5_7BInstruct::test_fp8_kvcache
accuracy/test_llm_api.py::TestMistral7B_0_3::test_quant_tp4[int4]
accuracy/test_llm_api.py::TestMistral7B_0_3::test_quant_tp4[int4_awq]
accuracy/test_llm_api.py::TestMistral7B_0_3::test_quant_tp4[int8_awq]
accuracy/test_llm_api.py::TestMistral_Nemo_12B_Base::test_fp8
accuracy/test_llm_api.py::TestMistralNemo12B::test_auto_dtype
accuracy/test_llm_api.py::TestMistralNemo12B::test_auto_dtype_tp2
accuracy/test_llm_api.py::TestMistralNemo12B::test_fp8
accuracy/test_llm_api.py::TestMistral_NeMo_Minitron_8B_Instruct::test_fp8
accuracy/test_llm_api.py::TestMixtral8x7B::test_tp2
accuracy/test_llm_api.py::TestMixtral8x7B::test_smooth_quant_tp2pp2
@@ -576,6 +582,8 @@ accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype
accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype_tp2

test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-]
test_e2e.py::test_llama_e2e[use_py_session-remove_input_padding-]
2 changes: 1 addition & 1 deletion tests/integration/test_lists/test-db/l0_h100.yml
@@ -253,7 +253,7 @@ l0_h100:
- examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle1]
- examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle2] # 5 mins
- accuracy/test_llm_api.py::TestMistral_NeMo_Minitron_8B_Instruct::test_fp8
- accuracy/test_llm_api.py::TestMistral_Nemo_12B_Base::test_fp8
- accuracy/test_llm_api.py::TestMistralNemo12B::test_fp8
- examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] # 7 mins
- examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1]
- examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
4 changes: 3 additions & 1 deletion tests/integration/test_lists/waives.txt
@@ -300,7 +300,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutl
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False] SKIP (https://nvbugs/5457489)
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True] SKIP (https://nvbugs/5457489)
disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5457504)
accuracy/test_llm_api.py::TestMistral_Nemo_12B_Base::test_fp8 SKIP (https://nvbugs/5413197)
accuracy/test_llm_api.py::TestMistralNemo12B::test_fp8 SKIP (https://nvbugs/5413197)
triton_server/test_triton.py::test_gpt_ib_streaming[gpt-ib-streaming] SKIP (https://nvbugs/5371349)
triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning] SKIP (https://nvbugs/5445624)
triton_server/test_triton.py::test_mistral_ib_mm[mistral-ib-mm] SKIP (https://nvbugs/5371343)
@@ -314,3 +314,5 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency]
test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] SKIP (https://nvbugs/5459817)
llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_mtp SKIP (https://nvbugs/5461796)
disaggregated/test_disaggregated.py::test_disaggregated_genbs1[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5459811)
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] SKIP (https://nvbugs/5437384)
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph SKIP (https://nvbugs/5365525)