diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index e7981413761..9bbe98b2540 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -20,10 +20,6 @@ meta-llama/Llama-3.1-8B:
     accuracy: 64.99
 meta-llama/Llama-3.1-8B-Instruct:
   - accuracy: 68.17
-  - spec_dec_algo: EAGLE3
-    accuracy: 68.20
-  - spec_dec_algo: NGRAM
-    accuracy: 68.17
   - quant_algo: FP8
     accuracy: 67.93
   - quant_algo: FP8
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 1cbb7c96479..19897d1ee19 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -16,9 +16,7 @@
 from tensorrt_llm._torch import LLM
 from tensorrt_llm._torch.pyexecutor.config import MoeLoadBalancerConfig
-from tensorrt_llm.llmapi import (EagleDecodingConfig, KvCacheConfig,
-                                 MTPDecodingConfig, NGramDecodingConfig,
-                                 SamplingParams)
+from tensorrt_llm.llmapi import KvCacheConfig, MTPDecodingConfig, SamplingParams
 from tensorrt_llm.models.modeling_utils import QuantConfig
 from tensorrt_llm.quantization import QuantAlgo
@@ -198,6 +196,7 @@ def test_fp8_4gpus(self, tp_size, pp_size, fp8kv, attn_backend,
         task = GSM8K(self.MODEL_NAME)
         task.evaluate(llm)

+    @pytest.mark.skip(reason="https://nvbugspro.nvidia.com/bug/5292517")
     @skip_pre_hopper
     def test_fp8_llm_sampler(self):
         model_path = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8"
@@ -215,54 +214,6 @@ def test_fp8_llm_sampler(self):
                           sampling_params=sampling_params,
                           extra_acc_spec="temperature=0.8,top_p=0.95")

-    def test_eagle3(self):
-        pytorch_config = dict(
-            disable_overlap_scheduler=True,
-            use_cuda_graph=True,
-            cuda_graph_batch_sizes=[1],
-        )
-        kv_cache_config = KvCacheConfig(enable_block_reuse=False)
-
-        eagle_model_dir = f"{llm_models_root()}/EAGLE3-LLaMA3.1-Instruct-8B"
-        target_model_dir = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct"
-
-        draft_len = 4
-        spec_config = EagleDecodingConfig(
-            max_draft_len=draft_len, pytorch_eagle_weights_path=eagle_model_dir)
-
-        llm = LLM(model=target_model_dir,
-                  **pytorch_config,
-                  kv_cache_config=kv_cache_config,
-                  speculative_config=spec_config,
-                  build_config=None)
-
-        with llm:
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-
-    def test_ngram(self):
-        pytorch_config = dict(disable_overlap_scheduler=True)
-
-        kv_cache_config = KvCacheConfig(enable_block_reuse=False)
-
-        draft_len = 4
-        spec_config = NGramDecodingConfig(
-            prompt_lookup_num_tokens=draft_len,
-            max_matching_ngram_size=draft_len,
-            is_keep_all=True,
-            is_use_oldest=True,
-            is_public_pool=True,
-        )
-
-        llm = LLM(model=self.MODEL_PATH,
-                  **pytorch_config,
-                  kv_cache_config=kv_cache_config,
-                  speculative_config=spec_config)
-
-        with llm:
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-

 class TestLlama3_2_1B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "meta-llama/Llama-3.2-1B"
diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py
index 959226150a8..20b58860a27 100644
--- a/tests/integration/defs/test_e2e.py
+++ b/tests/integration/defs/test_e2e.py
@@ -1650,34 +1650,6 @@ def test_ptq_quickstart_advanced_bs1(llm_root, llm_venv):
     ])


-@pytest.mark.parametrize("model_name,model_path", [
-    ("Llama-3.1-8B-Instruct", "llama-3.1-model/Llama-3.1-8B-Instruct"),
-])
-def test_ptq_quickstart_advanced_ngram(llm_root, llm_venv, model_name,
-                                       model_path):
-    print(f"Testing {model_name}.")
-    example_root = Path(os.path.join(llm_root, "examples", "pytorch"))
-    with tempfile.NamedTemporaryFile(mode='w+t',
-                                     suffix=f".{model_name}.log",
-                                     dir="./",
-                                     delete=True,
-                                     delete_on_close=True) as running_log:
-        llm_venv.run_cmd([
-            str(example_root / "quickstart_advanced.py"),
-            "--disable_overlap_scheduler",
-            "--spec_decode_nextn",
-            "4",
-            "--max_matching_ngram_size",
-            "2",
-            "--spec_decode_algo",
-            "NGRAM",
-            "--model_dir",
-            f"{llm_models_root()}/{model_path}",
-        ],
-                         stdout=running_log)
-        _check_mem_usage(running_log, [4.60, 0, 0, 0])
-
-
 @pytest.mark.skip_less_device_memory(80000)
 @pytest.mark.skip_less_device(8)
 @skip_pre_hopper
diff --git a/tests/integration/test_lists/qa/examples_test_list.txt b/tests/integration/test_lists/qa/examples_test_list.txt
index 6c9c35a9c62..ae8450ef8d3 100644
--- a/tests/integration/test_lists/qa/examples_test_list.txt
+++ b/tests/integration/test_lists/qa/examples_test_list.txt
@@ -434,8 +434,6 @@ accuracy/test_llm_api.py::TestMixtral8x7B::test_smooth_quant_tp2pp2
 accuracy/test_llm_api.py::TestMixtral8x7BInstruct::test_awq_tp2
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
 accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_fp8_prequantized_tp4
@@ -504,7 +502,6 @@ test_e2e.py::test_ptp_quickstart_advanced[Llama3.2-11B-BF16-llama-3.2-models/Lla
 test_e2e.py::test_ptp_quickstart_advanced[Nemotron4_4B-BF16-nemotron/Minitron-4B-Base]
 test_e2e.py::test_ptp_quickstart_advanced[Nemotron-H-8B-Nemotron-H-8B-Base-8K]
 test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B-Qwen3/Qwen3-30B-A3B]
-test_e2e.py::test_ptq_quickstart_advanced_ngram[Llama-3.1-8B-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct]
 test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-70B-BF16-llama-3.1-model/Meta-Llama-3.1-70B]
 test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-70B-FP8-llama-3.1-model/Llama-3.1-70B-Instruct-FP8]
 test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-405B-FP8-llama-3.1-model/Llama-3.1-405B-Instruct-FP8]
diff --git a/tests/integration/test_lists/qa/llm_sanity_test.txt b/tests/integration/test_lists/qa/llm_sanity_test.txt
index 58bc315e362..f58386390ed 100644
--- a/tests/integration/test_lists/qa/llm_sanity_test.txt
+++ b/tests/integration/test_lists/qa/llm_sanity_test.txt
@@ -122,9 +122,6 @@ accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
 accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8
 accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_auto_dtype_tp2
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram
 accuracy/test_cli_flow.py::TestLlama3_3NemotronSuper49Bv1::test_auto_dtype_tp2
 accuracy/test_llm_api_pytorch.py::TestNemotronNano::test_auto_dtype
 accuracy/test_cli_flow.py::TestNemotronNano::test_auto_dtype
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index edba6af385a..0f2b9fe384a 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -393,7 +393,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5285965)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5285965)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[latency] SKIP (https://nvbugs/5285965)
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram SKIP (https://nvbugspro.nvidia.com/bug/5324239)
 examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoder2-int4-float16] SKIP (https://nvbugs/5289523)
 examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoder2-int8-float16] SKIP (https://nvbugs/5289523)
 examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen2_vl_7b_instruct-enable_gemm_plugin-enable_weight_only] SKIP (https://nvbugs/5289904)
@@ -437,7 +436,6 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True] SKIP (https://nvbugs/5303573)
 test_e2e.py::test_openai_multi_chat_example SKIP (https://nvbugs/5236980)
 test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-70B-FP8-llama-3.1-model/Llama-3.1-70B-Instruct-FP8] SKIP (https://nvbugs/5318059)
-test_e2e.py::test_ptq_quickstart_advanced_ngram[Llama-3.1-8B-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct] SKIP (https://nvbugspro.nvidia.com/bug/5324239)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5318087)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5318087)
 unittest/_torch/auto_deploy/integration/test_ad_build.py SKIP (https://nvbugs/5318103)
diff --git a/tests/unittest/_torch/speculative/test_ngram.py b/tests/unittest/_torch/speculative/test_ngram.py
index 7db50084e49..e996725e5f9 100644
--- a/tests/unittest/_torch/speculative/test_ngram.py
+++ b/tests/unittest/_torch/speculative/test_ngram.py
@@ -26,7 +26,7 @@ def test_llama_ngram(use_cuda_graph: bool, attn_backend: str):
     models_path = llm_models_root()
     pytorch_config = dict(
-        disable_overlap_scheduler=True,
+        enable_overlap_scheduler=False,
         use_cuda_graph=use_cuda_graph,
         # Only create a single CUDA graph to prevent OOM in CI
         attn_backend=attn_backend,