diff --git a/jenkins/BuildDockerImage.groovy b/jenkins/BuildDockerImage.groovy
index 180fcfb5f5a..fcd84aa1b84 100644
--- a/jenkins/BuildDockerImage.groovy
+++ b/jenkins/BuildDockerImage.groovy
@@ -283,7 +283,7 @@ def buildImage(config, imageKeyToTag)
 
         sh "git config --global --add safe.directory '*'"
         withCredentials([usernamePassword(credentialsId: "urm-artifactory-creds", usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
-            sh "docker login urm.nvidia.com -u ${USERNAME} -p ${PASSWORD}"
+            trtllm_utils.llmExecStepWithRetry(this, script: "docker login urm.nvidia.com -u ${USERNAME} -p ${PASSWORD}")
         }
 
         withCredentials([
@@ -294,7 +294,7 @@ def buildImage(config, imageKeyToTag)
             ),
             string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')
         ]) {
-            sh "docker login ${DEFAULT_GIT_URL}:5005 -u ${USERNAME} -p ${PASSWORD}"
+            trtllm_utils.llmExecStepWithRetry(this, script: "docker login ${DEFAULT_GIT_URL}:5005 -u ${USERNAME} -p ${PASSWORD}")
         }
     }
     def containerGenFailure = null
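
Note on the hunks above: wrapping `docker login` in `trtllm_utils.llmExecStepWithRetry` makes transient registry or network failures retryable instead of failing the image build outright. As a rough illustration only, a shell analogue of that retry behavior could look like the sketch below; the `retry` helper and its attempt count and delay are assumptions, not the actual Groovy implementation.

    #!/bin/bash
    # Hypothetical retry helper: re-run a command a few times with a fixed delay.
    retry() {
        local attempts=3 delay=30 i
        for ((i = 1; i <= attempts; i++)); do
            "$@" && return 0
            echo "Attempt ${i}/${attempts} failed: $*; retrying in ${delay}s" >&2
            sleep "$delay"
        done
        return 1
    }

    # Mirrors the wrapped pipeline step (USERNAME/PASSWORD come from the environment).
    retry docker login urm.nvidia.com -u "$USERNAME" -p "$PASSWORD"
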
diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index 2e9087eccf5..534d05fa193 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -105,24 +105,28 @@ REQUIRED_NO_DRIVER_TYPES = ["dgx-h100", "dgx-h200", "gh200"]
 ENABLE_NGC_DEVEL_IMAGE_TEST = params.enableNgcDevelImageTest ?: false
 ENABLE_NGC_RELEASE_IMAGE_TEST = params.enableNgcReleaseImageTest ?: false
 
-COMMON_SSH_OPTIONS = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ServerAliveInterval=60 -o ServerAliveCountMax=5"
+COMMON_SSH_OPTIONS = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o TCPKeepAlive=no -o ServerAliveInterval=30 -o ServerAliveCountMax=20"
 
 def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String stageName){
     withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
+        def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
         def remote = [
-            ip           : cluster.ip,
-            host         : cluster.host,
+            ip           : randomLoginNode,
+            host         : randomLoginNode,
             user         : "${pipeline.USERNAME}",
             passwd       : "${pipeline.PASSWORD}",
             allowAnyHosts: true,
         ]
 
         Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
+
+        def downloadSucceed = false
+
         pipeline.stage('Submit Test Results') {
             sh "mkdir -p ${stageName}"
             def resultsFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/results/results.xml"
             def downloadResultCmd = "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/"
-            def downloadSucceed = sh(script: downloadResultCmd, returnStatus: true) == 0
+            downloadSucceed = sh(script: downloadResultCmd, returnStatus: true) == 0
             if (downloadSucceed) {
                 sh "ls ${stageName}"
                 echo "Upload test results."
@@ -136,8 +140,9 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
                 println("No results xml to submit")
             }
         }
+
        if (downloadSucceed) {
-            junit(testResults: "${stageName}/results*.xml")
+            junit(allowEmptyResults: true, testResults: "${stageName}/results*.xml")
        }
    }
}
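
The `uploadResults` changes do three things: pick a random login node instead of always hitting `cluster.host`, switch to application-level SSH keepalives (`ServerAliveInterval=30`, `ServerAliveCountMax=20`, `TCPKeepAlive=no`), and hoist `downloadSucceed` out of the stage closure so `junit` only runs when the download actually succeeded (with `allowEmptyResults: true` as a further guard). A rough shell sketch of the download side, with hypothetical login-node names standing in for whatever `SlurmConfig.getRandomLoginNode` returns:

    #!/bin/bash
    # Hypothetical login-node list; the real list comes from SlurmConfig.
    # nodeName, stageName, and PASSWORD are assumed to come from the pipeline environment.
    login_nodes=(login01.example.com login02.example.com login03.example.com)
    login_node=${login_nodes[RANDOM % ${#login_nodes[@]}]}

    ssh_opts="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null"
    ssh_opts+=" -o TCPKeepAlive=no -o ServerAliveInterval=30 -o ServerAliveCountMax=20"

    # $ssh_opts is intentionally unquoted so the options split into separate words.
    if sshpass -p "$PASSWORD" scp -r -p $ssh_opts \
            "svc_tensorrt@${login_node}:/home/svc_tensorrt/bloom/scripts/${nodeName}/results/results.xml" \
            "${stageName}/"; then
        echo "Results downloaded for ${stageName}"
    else
        echo "No results xml to submit" >&2
    fi
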
@@ -145,9 +150,10 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
 //TODO: consolidate slurm related code for both multi nodes and single nodes
 def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jobUID, String slurmOutputFile) {
     withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
+        def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
         def remote = [
-            ip           : cluster.ip,
-            host         : cluster.host,
+            ip           : randomLoginNode,
+            host         : randomLoginNode,
             user         : "${pipeline.USERNAME}",
             passwd       : "${pipeline.PASSWORD}",
             allowAnyHosts: true,
@@ -207,9 +213,10 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
 
 def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, String slurmJobID) {
     withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
+        def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
         def remote = [
-            ip           : cluster.ip,
-            host         : cluster.host,
+            ip           : randomLoginNode,
+            host         : randomLoginNode,
             user         : "${pipeline.USERNAME}",
             passwd       : "${pipeline.PASSWORD}",
             allowAnyHosts: true,
@@ -290,13 +297,15 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
     def nodeSecret = CloudManager.createNode(nodeName, customWorkspace)
 
     def slurmJobID = null
+    def dockerArgs = null
 
     try {
         // Run ssh command to start node in desired cluster via SLURM
         withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
+            def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
             def remote = [
-                ip           : cluster.ip,
-                host         : cluster.host,
+                ip           : randomLoginNode,
+                host         : randomLoginNode,
                 user         : "${pipeline.USERNAME}",
                 passwd       : "${pipeline.PASSWORD}",
                 allowAnyHosts: true,
@@ -314,6 +323,8 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
 
             Utils.exec(pipeline, script: "cat ${jenkinsSetupPath}")
+
+            Utils.exec(pipeline, script: "echo Sleeping before Slurm job submission; sleep \$((RANDOM % 29 + 1))")
+
             def slurmSubmitOutput = Utils.exec(
                 pipeline,
                 timeout: false,
@@ -353,9 +364,10 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
 
         stage('Checking if the Node is Online') {
             withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
+                def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
                 def remote = [
-                    ip           : cluster.ip,
-                    host         : cluster.host,
+                    ip           : randomLoginNode,
+                    host         : randomLoginNode,
                     user         : "${pipeline.USERNAME}",
                     passwd       : "${pipeline.PASSWORD}",
                     allowAnyHosts: true,
@@ -373,8 +385,6 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
         }
 
         if (CloudManager.isNodeOnline(nodeName)) {
-            def dockerGPUOption = ""
-
             node(nodeName) {
                 sh """
                     env | sort
@@ -393,7 +403,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
                 // Dynamically set GPU arguments based on environment variables
                 // https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
                 // It's intentional to check NV_GPU first.
-                dockerGPUOption = sh(script: """
+                dockerArgs = sh(script: """
                     if [ -n "\$NV_GPU" ]; then
                         echo "--gpus '\\"device=\$NV_GPU\\"'"
                     elif [ -n "\$CUDA_VISIBLE_DEVICES" ]; then
@@ -404,7 +414,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
                 """, returnStdout: true).trim()
             }
 
-            def dockerArgs = "${dockerGPUOption} " +
+            dockerArgs = "${dockerArgs} " +
                 "--cap-add=SYS_ADMIN " +
                 "--ipc=host " +
                 "--entrypoint=\"\" " +
@@ -415,18 +425,17 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
                 "-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
                 "--cap-add=SYSLOG"
 
-            echo "Final dockerArgs: ${dockerArgs}"
-
             if (partition.clusterName == "dlcluster") {
                 dockerArgs += " -e NVIDIA_IMEX_CHANNELS=0"
             }
-
-            slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
-            executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
+            echo "Final dockerArgs: ${dockerArgs}"
         } else {
             error "The Slurm node does not come online in the waiting period. Terminating the job."
         }
     }
+
+        slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
+        executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
     } finally {
         stage("Clean up SLURM Resources") {
             // Workaround to handle the interruption during clean up SLURM resources
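
The hunks above fold the GPU flag straight into `dockerArgs` (dropping the intermediate `dockerGPUOption`) and move the `runInDockerOnNodeMultiStage` / `executeLLMTestOnSlurm` calls out so they run after the node-online check completes. The device-selection logic itself is unchanged: `NV_GPU` takes precedence over `CUDA_VISIBLE_DEVICES`. In standalone shell form it is roughly the following; the final `--gpus all` fallback is an assumption, since the hunk cuts off before the last branch.

    #!/bin/bash
    # Sketch of the GPU-flag selection performed before docker run.
    if [ -n "$NV_GPU" ]; then
        gpu_args="--gpus '\"device=$NV_GPU\"'"
    elif [ -n "$CUDA_VISIBLE_DEVICES" ]; then
        gpu_args="--gpus '\"device=$CUDA_VISIBLE_DEVICES\"'"
    else
        gpu_args="--gpus all"          # assumed fallback, not visible in the hunk
    fi
    echo "$gpu_args"                   # the pipeline captures this via returnStdout
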
@@ -473,9 +482,10 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
             passwordVariable: 'PASSWORD'
         )
     ]) {
+        def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
         def remote = [
-            ip           : cluster.ip,
-            host         : cluster.host,
+            ip           : randomLoginNode,
+            host         : randomLoginNode,
             user         : "${pipeline.USERNAME}",
             passwd       : "${pipeline.PASSWORD}",
             allowAnyHosts: true,
@@ -545,7 +555,8 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
         def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
         def scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
         def scriptContent = """#!/bin/bash
-            set -o pipefail
+            set -Eeuo pipefail
+            trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
             export jobWorkspace=$jobWorkspace
             export tarName=$tarName
             export llmTarfile=$llmTarfile
@@ -571,6 +582,8 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
         }
 
         stage('Run Test') {
+            Utils.exec(pipeline, script: "echo Sleeping before Slurm job submission; sleep \$((RANDOM % 29 + 1))")
+
             Utils.exec(
                 pipeline,
                 timeout: false,
@@ -1940,14 +1953,18 @@ def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
                 stage('Pull Docker Image') {
                     docker.image(image).pull()
                 }
-                docker.image(image).inside(dockerArgs) {
-                    runner()
+                // We submit the Slurm job with a SlurmConfig.DEFAULT_TIMEOUT-minute (300) timeout.
+                // The timeout here is to avoid the Slurm job getting stuck.
+                timeout(time: SlurmConfig.DEFAULT_TIMEOUT, unit: 'MINUTES') {
+                    docker.image(image).inside(dockerArgs) {
+                        runner()
+                    }
                 }
             } catch (Exception e) {
                 if (e.getMessage()?.contains("Failed to kill container")) {
                     echo "Known benign error ignored: ${e.getMessage()}"
                 } else {
-                    throw e // Re-throw if it's a different IOException
+                    throw e // Re-throw if it's a different Exception
                 }
             }
@@ -2128,10 +2145,11 @@ def launchTestJobs(pipeline, testFilter)
 
     multiNodesSBSAConfigs = [
         // Each stage test 1 testcase with 8 GPUs and 2 nodes.
-        "GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 4, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 4, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 4, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-4": ["gb200-multi-node", "l0_gb200_multi_nodes", 4, 4, 8, 2],
+        // Disable GB200 multi-node testing in L0 pre-merge until the configuration issue is resolved (https://nvbugs/5455140)
+        // "GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 4, 8, 2],
+        // "GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 4, 8, 2],
+        // "GB200-8_GPUs-2_Nodes-PyTorch-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 4, 8, 2],
+        // "GB200-8_GPUs-2_Nodes-PyTorch-4": ["gb200-multi-node", "l0_gb200_multi_nodes", 4, 4, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 5, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 5, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 5, 8, 2],
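
Both the generated `slurm_launch.sh` content above and `slurm_run.sh` below now start with bash strict mode plus an ERR trap, so the first failing command aborts the job and the log names the file, line, and command instead of the script silently continuing. A minimal standalone demonstration (`false_step` is a made-up stand-in for any failing step):

    #!/bin/bash
    set -Eeuo pipefail
    trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR

    false_step() { return 7; }   # hypothetical failing step
    echo "before the failure"
    false_step                   # the trap fires here and the script exits with 7
    echo "never reached"
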
diff --git a/jenkins/scripts/slurm_run.sh b/jenkins/scripts/slurm_run.sh
index af171ba8776..a27c536b1e0 100755
--- a/jenkins/scripts/slurm_run.sh
+++ b/jenkins/scripts/slurm_run.sh
@@ -1,4 +1,9 @@
 #!/bin/bash
+
+# Set up error handling
+set -Eeuo pipefail
+trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR
+
 cd $resourcePathNode
 llmSrcNode=$resourcePathNode/TensorRT-LLM/src
 
@@ -27,21 +32,25 @@ if [ $SLURM_LOCALID -eq 0 ]; then
     cd $resourcePathNode && pip3 install --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
     git config --global --add safe.directory "*"
     gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
-    echo "HOST_NODE_NAME = $HOST_NODE_NAME ; GPU_UUIDS = =$gpuUuids ; STAGE_NAME = $stageName"
+    hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
+    echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
     touch install_lock.lock
 else
     while [ ! -f install_lock.lock ]; do
         sleep 5
     done
 fi
-testList="$testList_$splitId"
 export CPP_TEST_TIMEOUT_OVERRIDDEN=$pytestTestTimeout
 export LLM_ROOT=$llmSrcNode
 export LLM_MODELS_ROOT=$MODEL_CACHE_DIR
 export UCX_TLS=^gdr_copy
+
+# TODO: Move back to tensorrt_llm/llmapi/trtllm-llmapi-launch later
+llmapiLaunchScript="$llmSrcNode/jenkins/scripts/trtllm-llmapi-launch"
+chmod +x $llmapiLaunchScript
 cd $llmSrcNode/tests/integration/defs
 testCmdLines=(
-    "$llmSrcNode/tensorrt_llm/llmapi/trtllm-llmapi-launch"
+    "$llmapiLaunchScript"
     "pytest"
     "-v"
     "--timeout-method=thread"
@@ -88,6 +97,13 @@ echo "Library Path:"
 echo "$LD_LIBRARY_PATH"
 env | sort
 fullCmd="${testCmdLines[*]}"
-echo "Running: $testCase"
 echo "Full Command: $fullCmd"
+
+# Turn off "exit on error" so the following lines always run
+set +e
+trap - ERR
+
 eval $fullCmd
+exitCode=$?
+echo "Pytest exit code: $exitCode"
+exit $exitCode
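
Two details in `slurm_run.sh` are worth calling out: the `set +e` / `trap - ERR` pair right before `eval $fullCmd` deliberately switches strict mode back off so the pytest exit code can be captured and re-raised explicitly, and the per-node setup is gated by a rank-0 lock-file barrier. The barrier pattern in isolation looks roughly like this (the setup body is elided; `SLURM_LOCALID` is set by Slurm):

    #!/bin/bash
    # Local rank 0 does the one-time node setup; the other local ranks poll for a marker file.
    if [ "${SLURM_LOCALID:-0}" -eq 0 ]; then
        echo "rank 0: installing the wheel and preparing the node"
        # ... one-time setup elided ...
        touch install_lock.lock              # signal the other local ranks
    else
        while [ ! -f install_lock.lock ]; do # wait until rank 0 finishes
            sleep 5
        done
    fi
    echo "all local ranks continue from here"
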
+ echo "MPI Comm server exit code: $mpi_exit_code" + + # Propagate task exit status + if [ $task_exit_code -ne 0 ]; then + exit $task_exit_code + else + exit $mpi_exit_code + fi + ) 1>&2 & + + # Turn off "exit on error" so the following lines always run + set +e + + # Capture subshell PID + subshell_pid=$! + echo "Subshell PID: $subshell_pid" + + log_stderr "rank${mpi_rank} run mgmn leader node with mpi_world_size: $(mpi_world_size) ..." + log_stderr "rank0 host: $HOSTNAME" + python3 -m tensorrt_llm.llmapi.mgmn_leader_node + mgmn_leader_node_exit_code=$? + echo "MGMN leader node exit code: $mgmn_leader_node_exit_code" + + # Wait for subshell + wait $subshell_pid + # This is subshell's exit code + subshell_exit_code=$? + echo "Subshell exit code: $subshell_exit_code" + + # Propagate subshell exit status + if [ $subshell_exit_code -ne 0 ]; then + exit $subshell_exit_code + else + exit $mgmn_leader_node_exit_code + fi +else + # Turn off "exit on error" so the following lines always run + set +e + + log_stderr "rank${mpi_rank} run mgmn worker node with mpi_world_size: $(mpi_world_size) ..." + python3 -m tensorrt_llm.llmapi.mgmn_worker_node + mgmn_worker_node_exit_code=$? + echo "MGMN worker node exit code: $mgmn_worker_node_exit_code" + + exit $mgmn_worker_node_exit_code +fi