jenkins/L0_Test.groovy (10 changes: 7 additions & 3 deletions)

@@ -1639,6 +1639,7 @@ def launchTestListCheck(pipeline)
         sh "tar -zxf ${tarName}"
         def llmPath = sh (script: "realpath .", returnStdout: true).trim()
         def llmSrc = "${llmPath}/TensorRT-LLM/src"
+        trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install -r ${llmSrc}/requirements-dev.txt")
         sh "NVIDIA_TRITON_SERVER_VERSION=25.10 LLM_ROOT=${llmSrc} LLM_BACKEND_ROOT=${llmSrc}/triton_backend python3 ${llmSrc}/scripts/check_test_list.py --l0 --qa --waive"
     } catch (InterruptedException e) {
         throw e
@@ -2903,8 +2904,10 @@ def launchTestJobs(pipeline, testFilter)
        "DGX_B200-4_GPUs-PyTorch-2": ["b200-x4", "l0_dgx_b200", 2, 2, 4],
        "DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4", "l0_dgx_b200", 1, 1, 4],
        "DGX_B200-8_GPUs-PyTorch-1": ["b200-x8", "l0_dgx_b200", 1, 1, 8],
-       "DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 1, 4, 1, true],
-       "DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 1, 4],
+       "DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 2, 4, 1, true],
+       "DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-trtllm", "l0_dgx_b200", 2, 2, 4, 1, true],
+       "DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
+       "DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
        // Perf sanity post merge test
        // Disable perf stages due to https://nvbugs/5643646
        // "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "perf_sanity_l0_dgx_b200", 1, 1, 4],
@@ -2933,7 +2936,8 @@ def launchTestJobs(pipeline, testFilter)
    fullSet += SBSATestConfigs.keySet()

    SBSASlurmTestConfigs = [
-       "GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
+       "GB200-4_GPUs-PyTorch-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 2, 4],
+       "GB200-4_GPUs-PyTorch-2": ["gb200-x4-oci", "l0_gb200_multi_gpus", 2, 2, 4],
        "GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
        // Disable GB300 stages due to nodes will be offline temporarily.
        // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
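The stage tuples above appear to follow the pattern [node label, test list, split id, split count, GPU count, ...], so this change reshards each affected test list from one chunk into two parallel stages. Below is a minimal Python sketch of that split-id/split-count partitioning, assuming contiguous ceiling-sized chunks; the field names and the chunking policy are assumptions, not the pipeline's actual Groovy implementation:

```python
# Hypothetical sharding helper mirroring the splitId/splitCount fields in the
# stage tuples; contiguous chunking is an assumption for illustration only.
def shard(tests, split_id, split_count):
    """Return the 1-based split_id-th of split_count contiguous chunks."""
    if not 1 <= split_id <= split_count:
        raise ValueError("split_id must be between 1 and split_count")
    chunk = -(-len(tests) // split_count)  # ceiling division
    return tests[(split_id - 1) * chunk:split_id * chunk]

tests = [f"test_{i}" for i in range(10)]
# The two new Post-Merge shards together still cover the whole list.
assert shard(tests, 1, 2) + shard(tests, 2, 2) == tests
```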
jenkins/scripts/slurm_run.sh (7 changes: 5 additions & 2 deletions)

@@ -29,10 +29,14 @@ set_value_in_command() {
echo "$result"
}

# Only the first process will save the job ID
# Only the first process will save the job ID and set the git config
if [ $SLURM_PROCID -eq 0 ]; then
# Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
echo $SLURM_JOB_ID > $jobWorkspace/slurm_job_id.txt
# Update HOME/.gitconfig
if ! git config --global --get-all safe.directory | grep -Fxq "*"; then
git config --global --add safe.directory "*"
fi
fi

if [ $SLURM_LOCALID -eq 0 ]; then
@@ -47,7 +51,6 @@ if [ $SLURM_LOCALID -eq 0 ]; then
     fi
     cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt
     cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
-    git config --global --add safe.directory "*"
     gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
     hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
     echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
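This shell change moves the `safe.directory` update from every node-local leader (gated on `SLURM_LOCALID`) to the single global leader (`SLURM_PROCID` 0) and makes it idempotent, so ranks sharing one HOME neither race on `.gitconfig` nor append duplicate entries. A rough Python rendering of the same gating pattern, offered as a sketch under those assumptions rather than the script's actual logic:

```python
# Sketch of the rank-0 gating above; the git invocations mirror the shell
# logic, but this is illustrative, not the script TensorRT-LLM actually runs.
import os
import subprocess

def ensure_safe_directory_wildcard():
    """Add safe.directory="*" to ~/.gitconfig once, skipping duplicates."""
    probe = subprocess.run(
        ["git", "config", "--global", "--get-all", "safe.directory"],
        capture_output=True, text=True)
    if "*" not in probe.stdout.splitlines():
        subprocess.run(
            ["git", "config", "--global", "--add", "safe.directory", "*"],
            check=True)

# Only the first SLURM process mutates the shared HOME, so concurrent
# ranks never write the same .gitconfig file at once.
if int(os.environ.get("SLURM_PROCID", "0")) == 0:
    ensure_safe_directory_wildcard()
```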
scripts/check_test_list.py (7 changes: 3 additions & 4 deletions)

@@ -23,10 +23,9 @@


 def install_python_dependencies(llm_src):
-    subprocess.run(
-        f"cd {llm_src} && pip3 install --retries 1 -r requirements-dev.txt",
-        shell=True,
-        check=True)
+    subprocess.run(f"cd {llm_src} && pip3 install -r requirements-dev.txt",
+                   shell=True,
+                   check=True)
     subprocess.run(
         f"pip3 install --force-reinstall --no-deps {llm_src}/../tensorrt_llm-*.whl",
         shell=True,
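Dropping `--retries 1` here is consistent with the Groovy change above: retrying now lives in the caller (`trtllm_utils.llmExecStepWithRetry`) rather than in pip itself. A hedged sketch of what such an exec-with-retry wrapper might look like; the function name, attempt count, and backoff are assumptions, not the utility's actual signature:

```python
# Illustrative retry wrapper; llmExecStepWithRetry's real policy may differ.
import subprocess
import time

def run_with_retry(cmd, attempts=3, delay_s=30):
    """Run a shell command, retrying on non-zero exit up to `attempts` times."""
    for attempt in range(1, attempts + 1):
        try:
            subprocess.run(cmd, shell=True, check=True)
            return
        except subprocess.CalledProcessError:
            if attempt == attempts:
                raise
            time.sleep(delay_s)

run_with_retry("pip3 install -r requirements-dev.txt")
```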