jenkins/BuildDockerImage.groovy (4 changes: 2 additions & 2 deletions)
@@ -283,7 +283,7 @@ def buildImage(config, imageKeyToTag)
sh "git config --global --add safe.directory '*'"

withCredentials([usernamePassword(credentialsId: "urm-artifactory-creds", usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
sh "docker login urm.nvidia.com -u ${USERNAME} -p ${PASSWORD}"
trtllm_utils.llmExecStepWithRetry(this, script: "docker login urm.nvidia.com -u ${USERNAME} -p ${PASSWORD}")
}

withCredentials([
@@ -294,7 +294,7 @@ def buildImage(config, imageKeyToTag)
),
string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')
]) {
sh "docker login ${DEFAULT_GIT_URL}:5005 -u ${USERNAME} -p ${PASSWORD}"
trtllm_utils.llmExecStepWithRetry(this, script: "docker login ${DEFAULT_GIT_URL}:5005 -u ${USERNAME} -p ${PASSWORD}")
}
}
def containerGenFailure = null
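A note on trtllm_utils.llmExecStepWithRetry: the helper itself is defined elsewhere in the repo, so this diff only shows the call sites. A minimal sketch of what such a retry wrapper typically looks like, assuming a fixed attempt count and delay (names and defaults here are illustrative, not the repo's actual implementation):

def llmExecStepWithRetry(pipeline, Map args, int maxRetries = 3, int sleepSeconds = 30) {
    // Hypothetical sketch; retry count and backoff are assumptions.
    def lastError = null
    for (int attempt = 1; attempt <= maxRetries; attempt++) {
        try {
            pipeline.sh(args.script)
            return
        } catch (Exception e) {
            lastError = e
            pipeline.echo("Attempt ${attempt}/${maxRetries} failed: ${e.getMessage()}")
            if (attempt < maxRetries) {
                pipeline.sleep(time: sleepSeconds, unit: 'SECONDS')
            }
        }
    }
    throw lastError
}

Wrapping the docker login calls this way means a transient registry or network failure no longer fails the whole image build on the first attempt.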
jenkins/L0_Test.groovy (82 changes: 50 additions & 32 deletions)
@@ -105,24 +105,28 @@ REQUIRED_NO_DRIVER_TYPES = ["dgx-h100", "dgx-h200", "gh200"]
ENABLE_NGC_DEVEL_IMAGE_TEST = params.enableNgcDevelImageTest ?: false
ENABLE_NGC_RELEASE_IMAGE_TEST = params.enableNgcReleaseImageTest ?: false

- COMMON_SSH_OPTIONS = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ServerAliveInterval=60 -o ServerAliveCountMax=5"
+ COMMON_SSH_OPTIONS = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o TCPKeepAlive=no -o ServerAliveInterval=30 -o ServerAliveCountMax=20"

def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String stageName){
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
+ def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
def remote = [
- ip : cluster.ip,
- host : cluster.host,
+ ip : randomLoginNode,
+ host : randomLoginNode,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
allowAnyHosts: true,
]

Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")

+ def downloadSucceed = false

pipeline.stage('Submit Test Results') {
sh "mkdir -p ${stageName}"
def resultsFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/results/results.xml"
def downloadResultCmd = "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/"
- def downloadSucceed = sh(script: downloadResultCmd, returnStatus: true) == 0
+ downloadSucceed = sh(script: downloadResultCmd, returnStatus: true) == 0
if (downloadSucceed) {
sh "ls ${stageName}"
echo "Upload test results."
@@ -136,18 +140,20 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
println("No results xml to submit")
}
}

if (downloadSucceed) {
- junit(testResults: "${stageName}/results*.xml")
+ junit(allowEmptyResults: true, testResults: "${stageName}/results*.xml")
}
}
}

//TODO: consolidate slurm related code for both multi nodes and single nodes
def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jobUID, String slurmOutputFile) {
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
+ def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
def remote = [
- ip : cluster.ip,
- host : cluster.host,
+ ip : randomLoginNode,
+ host : randomLoginNode,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
allowAnyHosts: true,
@@ -207,9 +213,10 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo

def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, String slurmJobID) {
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
+ def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
def remote = [
- ip : cluster.ip,
- host : cluster.host,
+ ip : randomLoginNode,
+ host : randomLoginNode,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
allowAnyHosts: true,
@@ -290,13 +297,15 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
def nodeSecret = CloudManager.createNode(nodeName, customWorkspace)

def slurmJobID = null
+ def dockerArgs = null

try {
// Run ssh command to start node in desired cluster via SLURM
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
+ def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
def remote = [
- ip : cluster.ip,
- host : cluster.host,
+ ip : randomLoginNode,
+ host : randomLoginNode,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
allowAnyHosts: true,
@@ -314,6 +323,8 @@

Utils.exec(pipeline, script: "cat ${jenkinsSetupPath}")

Utils.exec(pipeline, script: "echo Sleeping before Slurm job submission; sleep \$((RANDOM % 29 + 1))")

def slurmSubmitOutput = Utils.exec(
pipeline,
timeout: false,
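The new pre-submission sleep adds 1-29 seconds of random jitter so parallel stages do not all hit the Slurm controller at the same instant. The same jitter could be computed on the Groovy side (a sketch; the PR does it in shell instead):

// Equivalent jitter using the Jenkins sleep step rather than shell $RANDOM.
sleep(time: new Random().nextInt(29) + 1, unit: 'SECONDS')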
@@ -353,9 +364,10 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p

stage('Checking if the Node is Online') {
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
+ def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
def remote = [
- ip : cluster.ip,
- host : cluster.host,
+ ip : randomLoginNode,
+ host : randomLoginNode,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
allowAnyHosts: true,
@@ -373,8 +385,6 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
}

if (CloudManager.isNodeOnline(nodeName)) {
- def dockerGPUOption = ""

node(nodeName) {
sh """
env | sort
@@ -393,7 +403,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
// Dynamically set GPU arguments based on environment variables
// https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
// It's intentional to check NV_GPU first.
- dockerGPUOption = sh(script: """
+ dockerArgs = sh(script: """
if [ -n "\$NV_GPU" ]; then
echo "--gpus '\\"device=\$NV_GPU\\"'"
elif [ -n "\$CUDA_VISIBLE_DEVICES" ]; then
@@ -404,7 +414,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
""", returnStdout: true).trim()
}

def dockerArgs = "${dockerGPUOption} " +
dockerArgs = "${dockerArgs} " +
"--cap-add=SYS_ADMIN " +
"--ipc=host " +
"--entrypoint=\"\" " +
Expand All @@ -415,18 +425,17 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
"-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
"--cap-add=SYSLOG"

echo "Final dockerArgs: ${dockerArgs}"

if (partition.clusterName == "dlcluster") {
dockerArgs += " -e NVIDIA_IMEX_CHANNELS=0"
}

- slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
- executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
+ echo "Final dockerArgs: ${dockerArgs}"
} else {
error "The Slurm node does not come online in the waiting period. Terminating the job."
}
}

+ slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
+ executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
} finally {
stage("Clean up SLURM Resources") {
// Workaround to handle the interruption during clean up SLURM resources
@@ -473,9 +482,10 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
passwordVariable: 'PASSWORD'
)
]) {
+ def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
def remote = [
- ip : cluster.ip,
- host : cluster.host,
+ ip : randomLoginNode,
+ host : randomLoginNode,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
allowAnyHosts: true,
@@ -545,7 +555,8 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
def scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
def scriptContent = """#!/bin/bash
- set -o pipefail
+ set -Eeuo pipefail
+ trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
export jobWorkspace=$jobWorkspace
export tarName=$tarName
export llmTarfile=$llmTarfile
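The generated launch script switches from pipefail-only to full strict mode: -E makes the ERR trap fire inside functions and subshells, -e aborts on the first failing command, -u treats unset variables as errors, and -o pipefail fails a pipeline if any stage fails, while the trap prints the file, line, command, and exit code before exiting. Once the Groovy heredoc escaping (\$) is resolved, the header written to slurm_launch.sh looks like this (a sketch of the expanded output, assuming standard bash semantics):

// Groovy sketch: what the heredoc above emits into slurm_launch.sh after escaping.
def strictModeHeader = '''#!/bin/bash
set -Eeuo pipefail
trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR
'''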
@@ -571,6 +582,8 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
}

stage('Run Test') {
Utils.exec(pipeline, script: "echo Sleeping before Slurm job submission; sleep \$((RANDOM % 29 + 1))")

Utils.exec(
pipeline,
timeout: false,
@@ -1940,14 +1953,18 @@ def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
stage('Pull Docker Image') {
docker.image(image).pull()
}
- docker.image(image).inside(dockerArgs) {
- runner()
+ // We submit the Slurm job with SlurmConfig.DEFAULT_TIMEOUT minutes (300) timeout
+ // The timeout here is to avoid the Slurm job being stuck.
+ timeout(time: SlurmConfig.DEFAULT_TIMEOUT, unit: 'MINUTES') {
+ docker.image(image).inside(dockerArgs) {
+ runner()
+ }
}
} catch (Exception e) {
if (e.getMessage()?.contains("Failed to kill container")) {
echo "Known benign error ignored: ${e.getMessage()}"
} else {
- throw e // Re-throw if it's a different IOException
+ throw e // Re-throw if it's a different Exception
}
}
}
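Taken together, the new timeout and the existing catch block form a reusable shape for bounding container work: a hard ceiling so a wedged Slurm job cannot hang the pipeline forever, plus tolerance for the known benign teardown failure. A standalone sketch of that pattern (runBounded is a hypothetical name, not part of this PR):

def runBounded(def pipeline, String image, String dockerArgs, int timeoutMinutes, Closure body) {
    try {
        // Bound the whole container run; a stuck job aborts instead of hanging.
        pipeline.timeout(time: timeoutMinutes, unit: 'MINUTES') {
            pipeline.docker.image(image).inside(dockerArgs) {
                body()
            }
        }
    } catch (Exception e) {
        // Container teardown can race with Slurm reclaiming the node; ignore only that case.
        if (e.getMessage()?.contains("Failed to kill container")) {
            pipeline.echo("Known benign error ignored: ${e.getMessage()}")
        } else {
            throw e
        }
    }
}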
@@ -2128,10 +2145,11 @@ def launchTestJobs(pipeline, testFilter)

multiNodesSBSAConfigs = [
// Each stage test 1 testcase with 8 GPUs and 2 nodes.
"GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 4, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 4, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 4, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-4": ["gb200-multi-node", "l0_gb200_multi_nodes", 4, 4, 8, 2],
// Disable GB200 multi-node testing in L0 pre-merge until the configuration issue is resolved (https://nvbugs/5455140)
// "GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 4, 8, 2],
// "GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 4, 8, 2],
// "GB200-8_GPUs-2_Nodes-PyTorch-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 4, 8, 2],
// "GB200-8_GPUs-2_Nodes-PyTorch-4": ["gb200-multi-node", "l0_gb200_multi_nodes", 4, 4, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 5, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 5, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 5, 8, 2],
jenkins/scripts/slurm_run.sh (24 changes: 20 additions & 4 deletions)
@@ -1,4 +1,9 @@
#!/bin/bash

+ # Set up error handling
+ set -Eeuo pipefail
+ trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR

cd $resourcePathNode
llmSrcNode=$resourcePathNode/TensorRT-LLM/src

@@ -27,21 +32,25 @@ if [ $SLURM_LOCALID -eq 0 ]; then
cd $resourcePathNode && pip3 install --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
git config --global --add safe.directory "*"
gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
echo "HOST_NODE_NAME = $HOST_NODE_NAME ; GPU_UUIDS = =$gpuUuids ; STAGE_NAME = $stageName"
hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
touch install_lock.lock
else
while [ ! -f install_lock.lock ]; do
sleep 5
done
fi
testList="$testList_$splitId"
export CPP_TEST_TIMEOUT_OVERRIDDEN=$pytestTestTimeout
export LLM_ROOT=$llmSrcNode
export LLM_MODELS_ROOT=$MODEL_CACHE_DIR
export UCX_TLS=^gdr_copy

+ # TODO: Move back to tensorrt_llm/llmapi/trtllm-llmapi-launch later
+ llmapiLaunchScript="$llmSrcNode/jenkins/scripts/trtllm-llmapi-launch"
+ chmod +x $llmapiLaunchScript
cd $llmSrcNode/tests/integration/defs
testCmdLines=(
"$llmSrcNode/tensorrt_llm/llmapi/trtllm-llmapi-launch"
"$llmapiLaunchScript"
"pytest"
"-v"
"--timeout-method=thread"
@@ -88,6 +97,13 @@ echo "Library Path:"
echo "$LD_LIBRARY_PATH"
env | sort
fullCmd="${testCmdLines[*]}"
echo "Running: $testCase"
echo "Full Command: $fullCmd"

# Turn off "exit on error" so the following lines always run
set +e
trap - ERR

eval $fullCmd
exitCode=$?
echo "Pytest exit code: $exitCode"
exit $exitCode