diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index fe4434a86ce..41c66a7887d 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -1639,6 +1639,7 @@ def launchTestListCheck(pipeline)
         sh "tar -zxf ${tarName}"
         def llmPath = sh (script: "realpath .", returnStdout: true).trim()
         def llmSrc = "${llmPath}/TensorRT-LLM/src"
+        trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install -r ${llmSrc}/requirements-dev.txt")
         sh "NVIDIA_TRITON_SERVER_VERSION=25.10 LLM_ROOT=${llmSrc} LLM_BACKEND_ROOT=${llmSrc}/triton_backend python3 ${llmSrc}/scripts/check_test_list.py --l0 --qa --waive"
     } catch (InterruptedException e) {
         throw e
@@ -2903,8 +2904,10 @@ def launchTestJobs(pipeline, testFilter)
         "DGX_B200-4_GPUs-PyTorch-2": ["b200-x4", "l0_dgx_b200", 2, 2, 4],
         "DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4", "l0_dgx_b200", 1, 1, 4],
         "DGX_B200-8_GPUs-PyTorch-1": ["b200-x8", "l0_dgx_b200", 1, 1, 8],
-        "DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 1, 4, 1, true],
-        "DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 1, 4],
+        "DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 2, 4, 1, true],
+        "DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-trtllm", "l0_dgx_b200", 2, 2, 4, 1, true],
+        "DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
+        "DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
         // Perf sanity post merge test
         // Disable perf stages due to https://nvbugs/5643646
         // "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "perf_sanity_l0_dgx_b200", 1, 1, 4],
@@ -2933,7 +2936,8 @@ def launchTestJobs(pipeline, testFilter)
     fullSet += SBSATestConfigs.keySet()
 
     SBSASlurmTestConfigs = [
-        "GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
+        "GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 2, 4],
+        "GB200-4_GPUs-PyTorch-2": ["gb200-x4-oci", "l0_gb200_multi_gpus", 2, 2, 4],
         "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
         // Disable GB300 stages due to nodes will be offline temporarily.
         // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
diff --git a/jenkins/scripts/slurm_run.sh b/jenkins/scripts/slurm_run.sh
index 49368b94c07..8f191b3edb2 100755
--- a/jenkins/scripts/slurm_run.sh
+++ b/jenkins/scripts/slurm_run.sh
@@ -29,10 +29,14 @@ set_value_in_command() {
     echo "$result"
 }
 
-# Only the first process will save the job ID
+# Only the first process will save the job ID and set the git config
 if [ $SLURM_PROCID -eq 0 ]; then
     # Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
     echo $SLURM_JOB_ID > $jobWorkspace/slurm_job_id.txt
+    # Update HOME/.gitconfig
+    if ! git config --global --get-all safe.directory | grep -Fxq "*"; then
+        git config --global --add safe.directory "*"
+    fi
 fi
 
 if [ $SLURM_LOCALID -eq 0 ]; then
@@ -47,7 +51,6 @@ if [ $SLURM_LOCALID -eq 0 ]; then
     fi
     cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt
     cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
-    git config --global --add safe.directory "*"
     gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
     hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
     echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
diff --git a/scripts/check_test_list.py b/scripts/check_test_list.py
index c7b5357d25f..c799d433fcb 100755
--- a/scripts/check_test_list.py
+++ b/scripts/check_test_list.py
@@ -23,10 +23,9 @@
 
 
 def install_python_dependencies(llm_src):
-    subprocess.run(
-        f"cd {llm_src} && pip3 install --retries 1 -r requirements-dev.txt",
-        shell=True,
-        check=True)
+    subprocess.run(f"cd {llm_src} && pip3 install -r requirements-dev.txt",
+                   shell=True,
+                   check=True)
     subprocess.run(
         f"pip3 install --force-reinstall --no-deps {llm_src}/../tensorrt_llm-*.whl",
         shell=True,