4 changes: 4 additions & 0 deletions tests/integration/defs/accuracy/references/mmlu.yaml
@@ -20,6 +20,10 @@ meta-llama/Llama-3.1-8B:
accuracy: 64.99
meta-llama/Llama-3.1-8B-Instruct:
- accuracy: 68.17
- spec_dec_algo: EAGLE3
accuracy: 68.20
- spec_dec_algo: NGRAM
accuracy: 68.17
- quant_algo: FP8
accuracy: 67.93
- quant_algo: FP8
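The new reference entries above key the expected MMLU accuracy on spec_dec_algo, alongside the existing quant_algo-keyed entries. As a rough illustration of that data layout only — not the accuracy harness's actual lookup code; the select_reference helper and the hard-coded file path are assumptions — an entry could be selected like this:

import yaml

def select_reference(entries, spec_dec_algo=None, quant_algo=None):
    # Return the expected accuracy whose optional algorithm keys match the request.
    for entry in entries:
        if (entry.get("spec_dec_algo") == spec_dec_algo
                and entry.get("quant_algo") == quant_algo):
            return entry["accuracy"]
    return None

with open("tests/integration/defs/accuracy/references/mmlu.yaml") as f:
    refs = yaml.safe_load(f)

# e.g. the EAGLE3 reference added in this change -> 68.20
print(select_reference(refs["meta-llama/Llama-3.1-8B-Instruct"], spec_dec_algo="EAGLE3"))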
53 changes: 51 additions & 2 deletions tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -16,7 +16,9 @@

from tensorrt_llm._torch import LLM
from tensorrt_llm._torch.pyexecutor.config import MoeLoadBalancerConfig
from tensorrt_llm.llmapi import KvCacheConfig, MTPDecodingConfig, SamplingParams
from tensorrt_llm.llmapi import (EagleDecodingConfig, KvCacheConfig,
MTPDecodingConfig, NGramDecodingConfig,
SamplingParams)
from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm.quantization import QuantAlgo

@@ -196,7 +198,6 @@ def test_fp8_4gpus(self, tp_size, pp_size, fp8kv, attn_backend,
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@pytest.mark.skip(reason="https://nvbugspro.nvidia.com/bug/5292517")
@skip_pre_hopper
def test_fp8_llm_sampler(self):
model_path = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8"
@@ -214,6 +215,54 @@ def test_fp8_llm_sampler(self):
sampling_params=sampling_params,
extra_acc_spec="temperature=0.8,top_p=0.95")

def test_eagle3(self):
pytorch_config = dict(
disable_overlap_scheduler=True,
use_cuda_graph=True,
cuda_graph_batch_sizes=[1],
)
kv_cache_config = KvCacheConfig(enable_block_reuse=False)

eagle_model_dir = f"{llm_models_root()}/EAGLE3-LLaMA3.1-Instruct-8B"
target_model_dir = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct"

draft_len = 4
spec_config = EagleDecodingConfig(
max_draft_len=draft_len, pytorch_eagle_weights_path=eagle_model_dir)

llm = LLM(model=target_model_dir,
**pytorch_config,
kv_cache_config=kv_cache_config,
speculative_config=spec_config,
build_config=None)

with llm:
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)

def test_ngram(self):
pytorch_config = dict(disable_overlap_scheduler=True)

kv_cache_config = KvCacheConfig(enable_block_reuse=False)

draft_len = 4
spec_config = NGramDecodingConfig(
prompt_lookup_num_tokens=draft_len,
max_matching_ngram_size=draft_len,
is_keep_all=True,
is_use_oldest=True,
is_public_pool=True,
)

llm = LLM(model=self.MODEL_PATH,
**pytorch_config,
kv_cache_config=kv_cache_config,
speculative_config=spec_config)

with llm:
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)


class TestLlama3_2_1B(LlmapiAccuracyTestHarness):
MODEL_NAME = "meta-llama/Llama-3.2-1B"
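For readers outside the test harness, here is a minimal sketch of how the two speculative-decoding configs exercised by the new test_eagle3 and test_ngram tests are wired into the PyTorch LLM API. It mirrors the test code above; the model paths are placeholders, and llm.generate is the standard LLM API call rather than anything introduced by this change.

from tensorrt_llm._torch import LLM
from tensorrt_llm.llmapi import (EagleDecodingConfig, KvCacheConfig,
                                 NGramDecodingConfig)

kv_cache_config = KvCacheConfig(enable_block_reuse=False)

# EAGLE3 drafts come from a separate draft-model checkpoint.
eagle_config = EagleDecodingConfig(
    max_draft_len=4,
    pytorch_eagle_weights_path="<path-to-EAGLE3-LLaMA3.1-Instruct-8B>")

# NGram drafts come from prompt lookup, so no extra weights are needed.
ngram_config = NGramDecodingConfig(
    prompt_lookup_num_tokens=4,
    max_matching_ngram_size=4,
    is_keep_all=True,
    is_use_oldest=True,
    is_public_pool=True)

llm = LLM(model="<path-to-Llama-3.1-8B-Instruct>",
          disable_overlap_scheduler=True,
          kv_cache_config=kv_cache_config,
          speculative_config=ngram_config)  # or eagle_config

with llm:
    outputs = llm.generate(["The capital of France is"])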
28 changes: 28 additions & 0 deletions tests/integration/defs/test_e2e.py
@@ -1569,6 +1569,34 @@ def test_ptq_quickstart_advanced_mtp(llm_root, llm_venv, model_name,
_check_mem_usage(running_log, [54.50, 0, 0, 0])


@pytest.mark.parametrize("model_name,model_path", [
("Llama-3.1-8B-Instruct", "llama-3.1-model/Llama-3.1-8B-Instruct"),
])
def test_ptq_quickstart_advanced_ngram(llm_root, llm_venv, model_name,
model_path):
print(f"Testing {model_name}.")
example_root = Path(os.path.join(llm_root, "examples", "pytorch"))
with tempfile.NamedTemporaryFile(mode='w+t',
suffix=f".{model_name}.log",
dir="./",
delete=True,
delete_on_close=True) as running_log:
llm_venv.run_cmd([
str(example_root / "quickstart_advanced.py"),
"--disable_overlap_scheduler",
"--spec_decode_nextn",
"4",
"--max_matching_ngram_size",
"2",
"--spec_decode_algo",
"NGRAM",
"--model_dir",
f"{llm_models_root()}/{model_path}",
],
stdout=running_log)
_check_mem_usage(running_log, [4.60, 0, 0, 0])


@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.skip_less_device(8)
@skip_pre_hopper
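Roughly, the new test_ptq_quickstart_advanced_ngram test drives the command sketched below; this is for manual reproduction only, the model path is a placeholder, and only the flags visible in the diff are included (quickstart_advanced.py may accept or require others).

import subprocess

subprocess.run([
    "python", "examples/pytorch/quickstart_advanced.py",
    "--disable_overlap_scheduler",
    "--spec_decode_algo", "NGRAM",
    "--spec_decode_nextn", "4",
    "--max_matching_ngram_size", "2",
    "--model_dir", "<path-to-llama-3.1-model/Llama-3.1-8B-Instruct>",
], check=True)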
3 changes: 3 additions & 0 deletions tests/integration/test_lists/qa/examples_test_list.txt
@@ -434,6 +434,8 @@ accuracy/test_llm_api.py::TestMixtral8x7B::test_smooth_quant_tp2pp2
accuracy/test_llm_api.py::TestMixtral8x7BInstruct::test_awq_tp2
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_fp8_prequantized_tp4
@@ -502,6 +504,7 @@ test_e2e.py::test_ptp_quickstart_advanced[Llama3.2-11B-BF16-llama-3.2-models/Lla
test_e2e.py::test_ptp_quickstart_advanced[Nemotron4_4B-BF16-nemotron/Minitron-4B-Base]
test_e2e.py::test_ptp_quickstart_advanced[Nemotron-H-8B-Nemotron-H-8B-Base-8K]
test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B-Qwen3/Qwen3-30B-A3B]
test_e2e.py::test_ptq_quickstart_advanced_ngram[Llama-3.1-8B-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct]
test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-70B-BF16-llama-3.1-model/Meta-Llama-3.1-70B]
test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-70B-FP8-llama-3.1-model/Llama-3.1-70B-Instruct-FP8]
test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-405B-FP8-llama-3.1-model/Llama-3.1-405B-Instruct-FP8]
3 changes: 3 additions & 0 deletions tests/integration/test_lists/qa/llm_sanity_test.txt
@@ -122,6 +122,9 @@ accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8
accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_auto_dtype_tp2
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram
accuracy/test_cli_flow.py::TestLlama3_3NemotronSuper49Bv1::test_auto_dtype_tp2
accuracy/test_llm_api_pytorch.py::TestNemotronNano::test_auto_dtype
accuracy/test_cli_flow.py::TestNemotronNano::test_auto_dtype
2 changes: 2 additions & 0 deletions tests/integration/test_lists/waives.txt
@@ -406,6 +406,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5285965)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5285965)
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[latency] SKIP (https://nvbugs/5285965)
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram SKIP (https://nvbugspro.nvidia.com/bug/5324239)
examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoder2-int4-float16] SKIP (https://nvbugs/5289523)
examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoder2-int8-float16] SKIP (https://nvbugs/5289523)
examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen2_vl_7b_instruct-enable_gemm_plugin-enable_weight_only] SKIP (https://nvbugs/5289904)
@@ -454,6 +455,7 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True] SKIP (https://nvbugs/5303573)
test_e2e.py::test_openai_multi_chat_example SKIP (https://nvbugs/5236980)
test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-70B-FP8-llama-3.1-model/Llama-3.1-70B-Instruct-FP8] SKIP (https://nvbugs/5318059)
test_e2e.py::test_ptq_quickstart_advanced_ngram[Llama-3.1-8B-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct] SKIP (https://nvbugspro.nvidia.com/bug/5324239)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5318087)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5318087)
unittest/_torch/auto_deploy/integration/test_ad_build.py SKIP (https://nvbugs/5318103)
2 changes: 1 addition & 1 deletion tests/unittest/_torch/speculative/test_ngram.py
@@ -26,7 +26,7 @@ def test_llama_ngram(use_cuda_graph: bool, attn_backend: str):
models_path = llm_models_root()

pytorch_config = dict(
enable_overlap_scheduler=False,
disable_overlap_scheduler=True,
use_cuda_graph=use_cuda_graph,
# Only create a single CUDA graph to prevent OOM in CI
attn_backend=attn_backend,