Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions jenkins/L0_Test.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -972,8 +972,13 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
if(nodeCount > 1) {
srunArgs.add("--mpi=pmi2")
}

def exemptionComment = ""
if (cluster.host.contains("oci-nrt") || cluster.host.contains("oci-hsg") || cluster.host.contains("lbd-lax")) {
exemptionComment = """--comment='{"OccupiedIdleGPUsJobReaper":{"exemptIdleTimeMins":"90","reason":"other","description":"Long data and model loading time and disaggregated serving tests"}}'"""
}
def scriptContent = """#!/bin/bash
#SBATCH --output=${outputPath}
#SBATCH ${exemptionComment} --output=${outputPath}
${taskArgs.collect { "#SBATCH $it" }.join('\n')}
#SBATCH ${partition.additionalArgs}
${(partition?.name && partition.name != "unspecified") ? "#SBATCH --partition=${partition.name}" : ""}
Expand Down Expand Up @@ -1049,7 +1054,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
timeout: false,
script: Utils.sshUserCmd(
remote,
scriptExecPathNode
"\"${scriptExecPathNode}\""
),
numRetries: 3
)
Expand Down Expand Up @@ -1525,7 +1530,7 @@ def runLLMDocBuild(pipeline, config)
if (env.alternativeTRT) {
sh "cd ${llmSrc} && sed -i 's#tensorrt~=.*\$#tensorrt#g' requirements.txt && cat requirements.txt"
}
trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmSrc} && pip3 install --retries 1 -r requirements-dev.txt")
trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmSrc} && pip3 install -r requirements-dev.txt")
trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && pip3 install --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl")

// Step 3: build doc
Expand Down Expand Up @@ -2150,7 +2155,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
if (env.alternativeTRT) {
sh "cd ${llmSrc} && sed -i 's#tensorrt~=.*\$#tensorrt#g' requirements.txt && cat requirements.txt"
}
trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmSrc} && pip3 install --retries 1 -r requirements-dev.txt")
trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmSrc} && pip3 install -r requirements-dev.txt")
if (stageName.contains("-Ray-")) {
trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install ray[default]")
}
Expand Down
6 changes: 3 additions & 3 deletions jenkins/scripts/slurm_run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,10 @@ if [ $SLURM_LOCALID -eq 0 ]; then
apt-get install -y libffi-dev
nvidia-smi && nvidia-smi -q && nvidia-smi topo -m
if [[ $pytestCommand == *--run-ray* ]]; then
pip3 install ray[default]
pip3 install --retries 10 ray[default]
fi
cd $llmSrcNode && pip3 install --retries 1 -r requirements-dev.txt
cd $resourcePathNode && pip3 install --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt
cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
git config --global --add safe.directory "*"
gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
Expand Down