diff --git a/jenkins/BuildDockerImage.groovy b/jenkins/BuildDockerImage.groovy
index 180fcfb5f5a..fcd84aa1b84 100644
--- a/jenkins/BuildDockerImage.groovy
+++ b/jenkins/BuildDockerImage.groovy
@@ -283,7 +283,7 @@ def buildImage(config, imageKeyToTag)
 
         sh "git config --global --add safe.directory '*'"
         withCredentials([usernamePassword(credentialsId: "urm-artifactory-creds", usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
-            sh "docker login urm.nvidia.com -u ${USERNAME} -p ${PASSWORD}"
+            trtllm_utils.llmExecStepWithRetry(this, script: "docker login urm.nvidia.com -u ${USERNAME} -p ${PASSWORD}")
         }
 
         withCredentials([
@@ -294,7 +294,7 @@ def buildImage(config, imageKeyToTag)
             ),
             string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')
         ]) {
-            sh "docker login ${DEFAULT_GIT_URL}:5005 -u ${USERNAME} -p ${PASSWORD}"
+            trtllm_utils.llmExecStepWithRetry(this, script: "docker login ${DEFAULT_GIT_URL}:5005 -u ${USERNAME} -p ${PASSWORD}")
         }
     }
     def containerGenFailure = null
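
Note on the hunks above: wrapping `docker login` in `trtllm_utils.llmExecStepWithRetry` makes transient registry or network failures retryable instead of failing the image build outright. As a rough illustration only, a shell analogue of that retry behavior could look like the sketch below; the `retry` helper and its attempt count and delay are assumptions, not the actual Groovy implementation.

    #!/bin/bash
    # Hypothetical retry helper: re-run a command a few times with a fixed delay.
    retry() {
        local attempts=3 delay=30 i
        for ((i = 1; i <= attempts; i++)); do
            "$@" && return 0
            echo "Attempt ${i}/${attempts} failed: $*; retrying in ${delay}s" >&2
            sleep "$delay"
        done
        return 1
    }

    # Mirrors the wrapped pipeline step (USERNAME/PASSWORD come from the environment).
    retry docker login urm.nvidia.com -u "$USERNAME" -p "$PASSWORD"
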
diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index 2e9087eccf5..534d05fa193 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -105,24 +105,28 @@ REQUIRED_NO_DRIVER_TYPES = ["dgx-h100", "dgx-h200", "gh200"]
 ENABLE_NGC_DEVEL_IMAGE_TEST = params.enableNgcDevelImageTest ?: false
 ENABLE_NGC_RELEASE_IMAGE_TEST = params.enableNgcReleaseImageTest ?: false
 
-COMMON_SSH_OPTIONS = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ServerAliveInterval=60 -o ServerAliveCountMax=5"
+COMMON_SSH_OPTIONS = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o TCPKeepAlive=no -o ServerAliveInterval=30 -o ServerAliveCountMax=20"
 
 def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String stageName){
     withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
+        def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
         def remote = [
-            ip           : cluster.ip,
-            host         : cluster.host,
+            ip           : randomLoginNode,
+            host         : randomLoginNode,
             user         : "${pipeline.USERNAME}",
             passwd       : "${pipeline.PASSWORD}",
             allowAnyHosts: true,
         ]
 
         Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
+
+        def downloadSucceed = false
+
         pipeline.stage('Submit Test Results') {
             sh "mkdir -p ${stageName}"
             def resultsFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/results/results.xml"
             def downloadResultCmd = "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/"
-            def downloadSucceed = sh(script: downloadResultCmd, returnStatus: true) == 0
+            downloadSucceed = sh(script: downloadResultCmd, returnStatus: true) == 0
             if (downloadSucceed) {
                 sh "ls ${stageName}"
                 echo "Upload test results."
@@ -136,8 +140,9 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
                 println("No results xml to submit")
             }
         }
+
        if (downloadSucceed) {
-            junit(testResults: "${stageName}/results*.xml")
+            junit(allowEmptyResults: true, testResults: "${stageName}/results*.xml")
        }
    }
}
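
The `uploadResults` changes do three things: pick a random login node instead of always hitting `cluster.host`, switch to application-level SSH keepalives (`ServerAliveInterval=30`, `ServerAliveCountMax=20`, `TCPKeepAlive=no`), and hoist `downloadSucceed` out of the stage closure so `junit` only runs when the download actually succeeded (with `allowEmptyResults: true` as a further guard). A rough shell sketch of the download side, with hypothetical login-node names standing in for whatever `SlurmConfig.getRandomLoginNode` returns:

    #!/bin/bash
    # Hypothetical login-node list; the real list comes from SlurmConfig.
    # nodeName, stageName, and PASSWORD are assumed to come from the pipeline environment.
    login_nodes=(login01.example.com login02.example.com login03.example.com)
    login_node=${login_nodes[RANDOM % ${#login_nodes[@]}]}

    ssh_opts="-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null"
    ssh_opts+=" -o TCPKeepAlive=no -o ServerAliveInterval=30 -o ServerAliveCountMax=20"

    # $ssh_opts is intentionally unquoted so the options split into separate words.
    if sshpass -p "$PASSWORD" scp -r -p $ssh_opts \
            "svc_tensorrt@${login_node}:/home/svc_tensorrt/bloom/scripts/${nodeName}/results/results.xml" \
            "${stageName}/"; then
        echo "Results downloaded for ${stageName}"
    else
        echo "No results xml to submit" >&2
    fi
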
@@ -145,9 +150,10 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
 //TODO: consolidate slurm related code for both multi nodes and single nodes
 def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jobUID, String slurmOutputFile) {
     withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
+        def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
         def remote = [
-            ip           : cluster.ip,
-            host         : cluster.host,
+            ip           : randomLoginNode,
+            host         : randomLoginNode,
             user         : "${pipeline.USERNAME}",
             passwd       : "${pipeline.PASSWORD}",
             allowAnyHosts: true,
@@ -207,9 +213,10 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
 
 def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, String slurmJobID) {
     withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
+        def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
         def remote = [
-            ip           : cluster.ip,
-            host         : cluster.host,
+            ip           : randomLoginNode,
+            host         : randomLoginNode,
             user         : "${pipeline.USERNAME}",
             passwd       : "${pipeline.PASSWORD}",
             allowAnyHosts: true,
@@ -290,13 +297,15 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
     def nodeSecret = CloudManager.createNode(nodeName, customWorkspace)
 
     def slurmJobID = null
+    def dockerArgs = null
 
     try {
         // Run ssh command to start node in desired cluster via SLURM
         withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
+            def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
             def remote = [
-                ip           : cluster.ip,
-                host         : cluster.host,
+                ip           : randomLoginNode,
+                host         : randomLoginNode,
                 user         : "${pipeline.USERNAME}",
                 passwd       : "${pipeline.PASSWORD}",
                 allowAnyHosts: true,
@@ -314,6 +323,8 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
 
             Utils.exec(pipeline, script: "cat ${jenkinsSetupPath}")
+
+            Utils.exec(pipeline, script: "echo Sleeping before Slurm job submission; sleep \$((RANDOM % 29 + 1))")
+
             def slurmSubmitOutput = Utils.exec(
                 pipeline,
                 timeout: false,
@@ -353,9 +364,10 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
 
         stage('Checking if the Node is Online') {
             withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
+                def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
                 def remote = [
-                    ip           : cluster.ip,
-                    host         : cluster.host,
+                    ip           : randomLoginNode,
+                    host         : randomLoginNode,
                     user         : "${pipeline.USERNAME}",
                     passwd       : "${pipeline.PASSWORD}",
                     allowAnyHosts: true,
@@ -373,8 +385,6 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
         }
 
         if (CloudManager.isNodeOnline(nodeName)) {
-            def dockerGPUOption = ""
-
             node(nodeName) {
                 sh """
                     env | sort
@@ -393,7 +403,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
                 // Dynamically set GPU arguments based on environment variables
                 // https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
                 // It's intentional to check NV_GPU first.
-                dockerGPUOption = sh(script: """
+                dockerArgs = sh(script: """
                     if [ -n "\$NV_GPU" ]; then
                         echo "--gpus '\\"device=\$NV_GPU\\"'"
                     elif [ -n "\$CUDA_VISIBLE_DEVICES" ]; then
@@ -404,7 +414,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
                 """, returnStdout: true).trim()
             }
 
-            def dockerArgs = "${dockerGPUOption} " +
+            dockerArgs = "${dockerArgs} " +
                 "--cap-add=SYS_ADMIN " +
                 "--ipc=host " +
                 "--entrypoint=\"\" " +
@@ -415,18 +425,17 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
                 "-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
                 "--cap-add=SYSLOG"
 
-            echo "Final dockerArgs: ${dockerArgs}"
-
             if (partition.clusterName == "dlcluster") {
                 dockerArgs += " -e NVIDIA_IMEX_CHANNELS=0"
             }
-
-            slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
-            executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
+            echo "Final dockerArgs: ${dockerArgs}"
         } else {
             error "The Slurm node does not come online in the waiting period. Terminating the job."
         }
     }
+
+        slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
+        executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
     } finally {
         stage("Clean up SLURM Resources") {
             // Workaround to handle the interruption during clean up SLURM resources
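
The hunks above fold the GPU flag straight into `dockerArgs` (dropping the intermediate `dockerGPUOption`) and move the `runInDockerOnNodeMultiStage` / `executeLLMTestOnSlurm` calls out so they run after the node-online check completes. The device-selection logic itself is unchanged: `NV_GPU` takes precedence over `CUDA_VISIBLE_DEVICES`. In standalone shell form it is roughly the following; the final `--gpus all` fallback is an assumption, since the hunk cuts off before the last branch.

    #!/bin/bash
    # Sketch of the GPU-flag selection performed before docker run.
    if [ -n "$NV_GPU" ]; then
        gpu_args="--gpus '\"device=$NV_GPU\"'"
    elif [ -n "$CUDA_VISIBLE_DEVICES" ]; then
        gpu_args="--gpus '\"device=$CUDA_VISIBLE_DEVICES\"'"
    else
        gpu_args="--gpus all"          # assumed fallback, not visible in the hunk
    fi
    echo "$gpu_args"                   # the pipeline captures this via returnStdout
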
@@ -473,9 +482,10 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
             passwordVariable: 'PASSWORD'
         )
     ]) {
+        def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
         def remote = [
-            ip           : cluster.ip,
-            host         : cluster.host,
+            ip           : randomLoginNode,
+            host         : randomLoginNode,
             user         : "${pipeline.USERNAME}",
             passwd       : "${pipeline.PASSWORD}",
             allowAnyHosts: true,
@@ -545,7 +555,8 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
         def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
         def scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
         def scriptContent = """#!/bin/bash
-            set -o pipefail
+            set -Eeuo pipefail
+            trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
             export jobWorkspace=$jobWorkspace
             export tarName=$tarName
             export llmTarfile=$llmTarfile
@@ -571,6 +582,8 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
         }
 
         stage('Run Test') {
+            Utils.exec(pipeline, script: "echo Sleeping before Slurm job submission; sleep \$((RANDOM % 29 + 1))")
+
             Utils.exec(
                 pipeline,
                 timeout: false,
@@ -1940,14 +1953,18 @@ def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
                 stage('Pull Docker Image') {
                     docker.image(image).pull()
                 }
-                docker.image(image).inside(dockerArgs) {
-                    runner()
+                // We submit the Slurm job with a SlurmConfig.DEFAULT_TIMEOUT-minute (300) timeout.
+                // The timeout here is to avoid the Slurm job getting stuck.
+                timeout(time: SlurmConfig.DEFAULT_TIMEOUT, unit: 'MINUTES') {
+                    docker.image(image).inside(dockerArgs) {
+                        runner()
+                    }
                 }
             } catch (Exception e) {
                 if (e.getMessage()?.contains("Failed to kill container")) {
                     echo "Known benign error ignored: ${e.getMessage()}"
                 } else {
-                    throw e // Re-throw if it's a different IOException
+                    throw e // Re-throw if it's a different Exception
                 }
             }
@@ -2128,10 +2145,11 @@ def launchTestJobs(pipeline, testFilter)
 
     multiNodesSBSAConfigs = [
         // Each stage test 1 testcase with 8 GPUs and 2 nodes.
-        "GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 4, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 4, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 4, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-4": ["gb200-multi-node", "l0_gb200_multi_nodes", 4, 4, 8, 2],
+        // Disable GB200 multi-node testing in L0 pre-merge until the configuration issue is resolved (https://nvbugs/5455140)
+        // "GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 4, 8, 2],
+        // "GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 4, 8, 2],
+        // "GB200-8_GPUs-2_Nodes-PyTorch-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 4, 8, 2],
+        // "GB200-8_GPUs-2_Nodes-PyTorch-4": ["gb200-multi-node", "l0_gb200_multi_nodes", 4, 4, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 5, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 5, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 5, 8, 2],
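
Both the generated `slurm_launch.sh` content above and `slurm_run.sh` below now start with bash strict mode plus an ERR trap, so the first failing command aborts the job and the log names the file, line, and command instead of the script silently continuing. A minimal standalone demonstration (`false_step` is a made-up stand-in for any failing step):

    #!/bin/bash
    set -Eeuo pipefail
    trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR

    false_step() { return 7; }   # hypothetical failing step
    echo "before the failure"
    false_step                   # the trap fires here and the script exits with 7
    echo "never reached"
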
diff --git a/jenkins/scripts/slurm_run.sh b/jenkins/scripts/slurm_run.sh
index af171ba8776..a27c536b1e0 100755
--- a/jenkins/scripts/slurm_run.sh
+++ b/jenkins/scripts/slurm_run.sh
@@ -1,4 +1,9 @@
 #!/bin/bash
+
+# Set up error handling
+set -Eeuo pipefail
+trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR
+
 cd $resourcePathNode
 llmSrcNode=$resourcePathNode/TensorRT-LLM/src
 
@@ -27,21 +32,25 @@ if [ $SLURM_LOCALID -eq 0 ]; then
     cd $resourcePathNode && pip3 install --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
     git config --global --add safe.directory "*"
     gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
-    echo "HOST_NODE_NAME = $HOST_NODE_NAME ; GPU_UUIDS = =$gpuUuids ; STAGE_NAME = $stageName"
+    hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
+    echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
     touch install_lock.lock
 else
     while [ ! -f install_lock.lock ]; do
         sleep 5
     done
 fi
-testList="$testList_$splitId"
 export CPP_TEST_TIMEOUT_OVERRIDDEN=$pytestTestTimeout
 export LLM_ROOT=$llmSrcNode
 export LLM_MODELS_ROOT=$MODEL_CACHE_DIR
 export UCX_TLS=^gdr_copy
+
+# TODO: Move back to tensorrt_llm/llmapi/trtllm-llmapi-launch later
+llmapiLaunchScript="$llmSrcNode/jenkins/scripts/trtllm-llmapi-launch"
+chmod +x $llmapiLaunchScript
 cd $llmSrcNode/tests/integration/defs
 testCmdLines=(
-    "$llmSrcNode/tensorrt_llm/llmapi/trtllm-llmapi-launch"
+    "$llmapiLaunchScript"
     "pytest"
     "-v"
     "--timeout-method=thread"
@@ -88,6 +97,13 @@ echo "Library Path:"
 echo "$LD_LIBRARY_PATH"
 env | sort
 fullCmd="${testCmdLines[*]}"
-echo "Running: $testCase"
 echo "Full Command: $fullCmd"
+
+# Turn off "exit on error" so the following lines always run
+set +e
+trap - ERR
+
 eval $fullCmd
+exitCode=$?
+echo "Pytest exit code: $exitCode"
+exit $exitCode
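
Two details in `slurm_run.sh` are worth calling out: the `set +e` / `trap - ERR` pair right before `eval $fullCmd` deliberately switches strict mode back off so the pytest exit code can be captured and re-raised explicitly, and the per-node setup is gated by a rank-0 lock-file barrier. The barrier pattern in isolation looks roughly like this (the setup body is elided; `SLURM_LOCALID` is set by Slurm):

    #!/bin/bash
    # Local rank 0 does the one-time node setup; the other local ranks poll for a marker file.
    if [ "${SLURM_LOCALID:-0}" -eq 0 ]; then
        echo "rank 0: installing the wheel and preparing the node"
        # ... one-time setup elided ...
        touch install_lock.lock              # signal the other local ranks
    else
        while [ ! -f install_lock.lock ]; do # wait until rank 0 finishes
            sleep 5
        done
    fi
    echo "all local ranks continue from here"
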
+ echo "MPI Comm server exit code: $mpi_exit_code" + + # Propagate task exit status + if [ $task_exit_code -ne 0 ]; then + exit $task_exit_code + else + exit $mpi_exit_code + fi + ) 1>&2 & + + # Turn off "exit on error" so the following lines always run + set +e + + # Capture subshell PID + subshell_pid=$! + echo "Subshell PID: $subshell_pid" + + log_stderr "rank${mpi_rank} run mgmn leader node with mpi_world_size: $(mpi_world_size) ..." + log_stderr "rank0 host: $HOSTNAME" + python3 -m tensorrt_llm.llmapi.mgmn_leader_node + mgmn_leader_node_exit_code=$? + echo "MGMN leader node exit code: $mgmn_leader_node_exit_code" + + # Wait for subshell + wait $subshell_pid + # This is subshell's exit code + subshell_exit_code=$? + echo "Subshell exit code: $subshell_exit_code" + + # Propagate subshell exit status + if [ $subshell_exit_code -ne 0 ]; then + exit $subshell_exit_code + else + exit $mgmn_leader_node_exit_code + fi +else + # Turn off "exit on error" so the following lines always run + set +e + + log_stderr "rank${mpi_rank} run mgmn worker node with mpi_world_size: $(mpi_world_size) ..." + python3 -m tensorrt_llm.llmapi.mgmn_worker_node + mgmn_worker_node_exit_code=$? + echo "MGMN worker node exit code: $mgmn_worker_node_exit_code" + + exit $mgmn_worker_node_exit_code +fi