diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index c96dc010583..949209fa205 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -261,7 +261,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
         }
 
         if (CloudManager.isNodeOnline(nodeName)) {
-            def dockerArgs = "--gpus ${gpuCount} --cap-add=SYS_ADMIN --ipc=host --security-opt seccomp=unconfined -u root:root -v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${CCACHE_DIR}:rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog"
+            def dockerArgs = "--gpus ${gpuCount} --cap-add=SYS_ADMIN --ipc=host --security-opt seccomp=unconfined -u root:root -v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${CCACHE_DIR}:rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog -e NVIDIA_IMEX_CHANNELS=0"
             slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, false)
             executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
         } else {
@@ -362,6 +362,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
             "--container-image=${container}",
             "--container-workdir=/home/svc_tensorrt/bloom/scripts",
             "--container-mounts=${mounts}",
+            "--container-env=NVIDIA_IMEX_CHANNELS"
         ].join(" ")
 
         def scriptLaunch = "/home/svc_tensorrt/bloom/scripts/${jobUID}/slurm_launch.sh"
@@ -382,6 +383,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
             export perfMode=$perfMode
             export resourcePathNode=$resourcePathNode
             export MODEL_CACHE_DIR=$MODEL_CACHE_DIR
+            export NVIDIA_IMEX_CHANNELS=0
             chmod +x ${scriptRunNode}
             ${srunCmd}
         """.stripIndent()
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 45c67a63112..acc48855da0 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -25,8 +25,7 @@
                                  TorchCompileConfig)
 from tensorrt_llm.quantization import QuantAlgo
 
-from ..conftest import (llm_models_root, parametrize_with_ids,
-                        skip_device_contain_gb200, skip_no_hopper,
+from ..conftest import (llm_models_root, parametrize_with_ids, skip_no_hopper,
                         skip_post_blackwell, skip_pre_ada, skip_pre_blackwell,
                         skip_pre_hopper)
 from .accuracy_core import (GSM8K, MMLU, CnnDailymail, GPQADiamond,
@@ -85,9 +84,7 @@ def test_chunked_prefill(self, attn_backend):
         task.evaluate(llm)
 
     @pytest.mark.skip_less_device_memory(32000)
-    @parametrize_with_ids(
-        "torch_compile",
-        [False, pytest.param(True, marks=skip_device_contain_gb200)])
+    @parametrize_with_ids("torch_compile", [False, True])
     @parametrize_with_ids("attn_backend", ["TRTLLM", "FLASHINFER"])
     def test_bfloat16(self, attn_backend, torch_compile):
         torch_compile_config = TorchCompileConfig(
@@ -103,9 +100,7 @@ def test_bfloat16(self, attn_backend, torch_compile):
         task = GSM8K(self.MODEL_NAME)
         task.evaluate(llm)
 
-    @parametrize_with_ids(
-        "torch_compile",
-        [False, pytest.param(True, marks=skip_device_contain_gb200)])
+    @parametrize_with_ids("torch_compile", [False, True])
     @parametrize_with_ids("attn_backend", ["TRTLLM", "FLASHINFER"])
     @pytest.mark.parametrize("tp_size,pp_size", [(4, 1), (2, 2), (1, 4)],
                              ids=["tp4", "tp2pp2", "pp4"])
@@ -133,9 +128,7 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, attn_backend,
         task.evaluate(llm)
 
     @skip_pre_ada
-    @parametrize_with_ids(
-        "torch_compile",
-        [False, pytest.param(True, marks=skip_device_contain_gb200)])
+    @parametrize_with_ids("torch_compile", [False, True])
     @parametrize_with_ids("attn_backend", ["TRTLLM", "FLASHINFER"])
     @parametrize_with_ids("fp8kv", [False, True])
     def test_fp8(self, fp8kv, attn_backend, torch_compile):
@@ -158,9 +151,7 @@ def test_fp8(self, fp8kv, attn_backend, torch_compile):
         task.evaluate(llm)
 
     @skip_pre_ada
-    @parametrize_with_ids(
-        "torch_compile",
-        [False, pytest.param(True, marks=skip_device_contain_gb200)])
+    @parametrize_with_ids("torch_compile", [False, True])
     @parametrize_with_ids("attn_backend", ["TRTLLM", "FLASHINFER"])
     @parametrize_with_ids("fp8kv", [False, True])
     @pytest.mark.parametrize("tp_size,pp_size", [(4, 1), (2, 2), (1, 4)],
@@ -643,9 +634,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     MODEL_PATH = f"{llm_models_root()}/DeepSeek-V3-Lite/bf16"
 
     @pytest.mark.skip_less_device_memory(60000)
-    @parametrize_with_ids(
-        "torch_compile",
-        [False, pytest.param(True, marks=skip_device_contain_gb200)])
+    @parametrize_with_ids("torch_compile", [False, True])
     @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
                           [(False, False, False), (True, False, False),
                            (False, True, False), (False, False, True),
@@ -679,9 +668,7 @@ def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
         task.evaluate(llm)
 
     @pytest.mark.skip_less_device(4)
-    @parametrize_with_ids(
-        "torch_compile",
-        [False, pytest.param(True, marks=skip_device_contain_gb200)])
+    @parametrize_with_ids("torch_compile", [False, True])
     @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
                           [(False, False, False), (True, False, False),
                            (False, True, False), (False, False, True),
@@ -724,9 +711,7 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
         task.evaluate(llm)
 
     @skip_no_hopper
-    @parametrize_with_ids(
-        "torch_compile",
-        [False, pytest.param(True, marks=skip_device_contain_gb200)])
+    @parametrize_with_ids("torch_compile", [False, True])
     @parametrize_with_ids("fp8kv,attention_dp,cuda_graph,overlap_scheduler",
                           [(False, False, False, False),
                            (True, False, False, False),
@@ -871,9 +856,7 @@ def test_fp8_block_scales_cuda_graph_padding_4gpus(self, mtp_nextn,
 
     @pytest.mark.skip_less_device(4)
     @skip_no_hopper
-    @parametrize_with_ids(
-        "torch_compile",
-        [False, pytest.param(True, marks=skip_device_contain_gb200)])
+    @parametrize_with_ids("torch_compile", [False, True])
     @parametrize_with_ids("fp8kv,attention_dp,cuda_graph,overlap_scheduler",
                           [(False, False, False, False),
                            (True, False, False, False),
@@ -1069,9 +1052,7 @@ def test_nvfp4_4gpus_online_eplb(self, fp8kv):
         task.evaluate(llm)
 
     @skip_pre_blackwell
-    @parametrize_with_ids(
-        "torch_compile",
-        [False, pytest.param(True, marks=skip_device_contain_gb200)])
+    @parametrize_with_ids("torch_compile", [False, True])
     @parametrize_with_ids("fp8kv,attention_dp,cuda_graph,overlap_scheduler",
                           [(False, False, False, False),
                            (True, False, False, False),
@@ -1113,9 +1094,7 @@ def test_nvfp4(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler,
 
     @pytest.mark.skip_less_device(4)
     @skip_pre_blackwell
-    @parametrize_with_ids(
-        "torch_compile",
-        [False, pytest.param(True, marks=skip_device_contain_gb200)])
+    @parametrize_with_ids("torch_compile", [False, True])
     @parametrize_with_ids("fp8kv,attention_dp,cuda_graph,overlap_scheduler",
                           [(False, False, False, False),
                            (True, False, False, False),
@@ -1351,8 +1330,7 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
     def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                               fp8kv, attention_dp, cuda_graph,
                               overlap_scheduler, max_batch_size, moe_backend):
-
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.80)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.70)
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
@@ -1830,7 +1808,7 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
         task.evaluate(llm)
 
     @skip_pre_blackwell
-    @pytest.mark.skip_less_device(8)
+    @pytest.mark.skip_less_mpi_world_size(8)
     @pytest.mark.parametrize(
         "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend",
         [(8, 1, 8, True, True, True, "CUTLASS"),
@@ -1839,6 +1817,11 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
     )
     def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
                    overlap_scheduler, moe_backend):
+        if moe_backend == "TRTLLM":
+            pytest.skip(
+                "TRTLLM moe backend has accuracy issues: https://nvbugspro.nvidia.com/bug/5404726"
+            )
+
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 35dcc590144..4b4c65cfcc7 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -416,7 +416,6 @@ test_e2e.py::test_ptp_quickstart_advanced_8gpus[Nemotron-Ultra-253B-nemotron-nas
 examples/test_multimodal.py::test_llm_multimodal_general[Qwen2-VL-7B-Instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:4] SKIP (https://nvbugs/5385981)
 examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-cnn_dailymail-Qwen2-VL-7B-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False] SKIP (https://nvbugs/5385987)
 examples/test_multimodal.py::test_llm_multimodal_general[Phi-4-multimodal-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5385992)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] SKIP (https://nvbugs/5377914)
 examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387422)
 examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387424)
 test_e2e.py::test_ptp_quickstart SKIP (https://nvbugs/5387762)