diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index bf553b4d9e7..98bc5216035 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -972,8 +972,13 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG if(nodeCount > 1) { srunArgs.add("--mpi=pmi2") } + + def exemptionComment = "" + if (cluster.host.contains("oci-nrt") || cluster.host.contains("oci-hsg") || cluster.host.contains("lbd-lax")) { + exemptionComment = """--comment='{"OccupiedIdleGPUsJobReaper":{"exemptIdleTimeMins":"90","reason":"other","description":"Long data and model loading time and disaggregated serving tests"}}'""" + } def scriptContent = """#!/bin/bash - #SBATCH --output=${outputPath} + #SBATCH ${exemptionComment} --output=${outputPath} ${taskArgs.collect { "#SBATCH $it" }.join('\n')} #SBATCH ${partition.additionalArgs} ${(partition?.name && partition.name != "unspecified") ? "#SBATCH --partition=${partition.name}" : ""} @@ -1049,7 +1054,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG timeout: false, script: Utils.sshUserCmd( remote, - scriptExecPathNode + "\"${scriptExecPathNode}\"" ), numRetries: 3 ) @@ -1525,7 +1530,7 @@ def runLLMDocBuild(pipeline, config) if (env.alternativeTRT) { sh "cd ${llmSrc} && sed -i 's#tensorrt~=.*\$#tensorrt#g' requirements.txt && cat requirements.txt" } - trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmSrc} && pip3 install --retries 1 -r requirements-dev.txt") + trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmSrc} && pip3 install -r requirements-dev.txt") trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && pip3 install --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl") // Step 3: build doc @@ -2150,7 +2155,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO if (env.alternativeTRT) { sh "cd ${llmSrc} && sed -i 's#tensorrt~=.*\$#tensorrt#g' requirements.txt && cat requirements.txt" } - trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmSrc} && pip3 install --retries 1 -r requirements-dev.txt") + trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmSrc} && pip3 install -r requirements-dev.txt") if (stageName.contains("-Ray-")) { trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install ray[default]") } diff --git a/jenkins/scripts/slurm_run.sh b/jenkins/scripts/slurm_run.sh index 70081809cba..6325a62fa6d 100755 --- a/jenkins/scripts/slurm_run.sh +++ b/jenkins/scripts/slurm_run.sh @@ -55,10 +55,10 @@ if [ $SLURM_LOCALID -eq 0 ]; then apt-get install -y libffi-dev nvidia-smi && nvidia-smi -q && nvidia-smi topo -m if [[ $pytestCommand == *--run-ray* ]]; then - pip3 install ray[default] + pip3 install --retries 10 ray[default] fi - cd $llmSrcNode && pip3 install --retries 1 -r requirements-dev.txt - cd $resourcePathNode && pip3 install --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl + cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt + cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl git config --global --add safe.directory "*" gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true) hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"