Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 63 additions & 10 deletions jenkins/L0_Test.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -122,13 +122,32 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st

Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")

def downloadSucceed = false
def hasTimeoutTest = false
def downloadResultSucceed = false

pipeline.stage('Submit Test Results') {
sh "mkdir -p ${stageName}"
// Download timeout test results
def timeoutTestFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/unfinished_test.txt"
def downloadTimeoutTestSucceed = Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -P ${remote.port} -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${timeoutTestFilePath} ${stageName}/", returnStatus: true, numRetries: 3) == 0
if (downloadTimeoutTestSucceed) {
sh "ls ${stageName}"
def timeoutTestXml = generateTimeoutTestResultXml(stageName, "unfinished_test.txt")
if (timeoutTestXml != null) {
sh """
cat > ${stageName}/results-timeout.xml << 'EOF_TIMEOUT_XML'
${timeoutTestXml}
EOF_TIMEOUT_XML
"""
hasTimeoutTest = true
}
}
// Download normal test results
def resultsFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/results.xml"
downloadSucceed = Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -P ${remote.port} -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/", returnStatus: true, numRetries: 3) == 0
if (downloadSucceed) {
downloadResultSucceed = Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -P ${remote.port} -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/", returnStatus: true, numRetries: 3) == 0

echo "hasTimeoutTest: ${hasTimeoutTest}, downloadResultSucceed: ${downloadResultSucceed}"
if (hasTimeoutTest || downloadResultSucceed) {
sh "ls ${stageName}"
echo "Upload test results."
sh "tar -czvf results-${stageName}.tar.gz ${stageName}/"
Expand All @@ -142,7 +161,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
}
}

if (downloadSucceed) {
if (hasTimeoutTest || downloadResultSucceed) {
junit(allowEmptyResults: true, testResults: "${stageName}/results*.xml")
}
}
Expand All @@ -165,12 +184,12 @@ def runIsolatedTests(preprocessedLists, testCmdLine, llmSrc, stageName) {
!cmd.contains("--test-list=") &&
!cmd.contains("--test-prefix=") &&
!cmd.contains("--csv=") &&
!cmd.contains("--junit-xml")
!cmd.contains("--periodic-junit-xmlpath")
}
isolateTestCmdLine += ["--test-list=${singleTestFile}"]
isolateTestCmdLine += ["--test-prefix=${stageName}"]
isolateTestCmdLine += ["--csv=${WORKSPACE}/${stageName}/report_isolated_${i}.csv"]
isolateTestCmdLine += ["--junit-xml ${WORKSPACE}/${stageName}/results_isolated_${i}.xml"]
isolateTestCmdLine += ["--periodic-junit-xmlpath ${WORKSPACE}/${stageName}/results_isolated_${i}.xml"]
isolateTestCmdLine += ["--cov-append"] // Append coverage data to avoid overwriting previous data

try {
Expand Down Expand Up @@ -774,13 +793,16 @@ def getPytestBaseCommandLine(
"--waives-file=${waivesFilePath}",
"--output-dir=${outputPath}/",
"--csv=${outputPath}/report.csv",
"--junit-xml ${outputPath}/results.xml",
"-o junit_logging=out-err",
"--cov=${llmSrc}/examples/",
"--cov=${llmSrc}/tensorrt_llm/",
"--cov=${trtllmWheelPath}/tensorrt_llm/",
"--cov-report=",
"--cov-config=${coverageConfigFile}",
"--periodic-junit",
"--periodic-junit-xmlpath ${outputPath}/results.xml",
"--periodic-batch-size=1",
"--periodic-save-unfinished-test",
]

if (perfMode) {
Expand Down Expand Up @@ -1258,6 +1280,14 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
sh "mkdir -p ${stageName}"
finallyRunner()
if (stageIsFailed) {
def timeoutTestXml = generateTimeoutTestResultXml(stageName, "unfinished_test.txt")
if (timeoutTestXml != null) {
sh """
cat > ${stageName}/results-timeout.xml << 'EOF_TIMEOUT_XML'
${timeoutTestXml}
EOF_TIMEOUT_XML
"""
}
def stageXml = generateStageFailTestResultXml(stageName, "Stage Failed", "Stage run failed without result", "results*.xml")
if (stageXml != null) {
sh "echo '${stageXml}' > ${stageName}/results-stage.xml"
Expand Down Expand Up @@ -1618,9 +1648,32 @@ def launchTestListCheck(pipeline)
})
}

def generateTimeoutTestResultXml(stageName, testFilePath) {
    // Build a JUnit-style XML report that marks every unfinished test listed in
    // ${stageName}/${testFilePath} (one pytest node ID per line) as an error, so
    // Jenkins' junit step surfaces tests that were running when a timeout/crash hit.
    // Returns the XML document as a String, or null when there is nothing to report.
    if (!fileExists("${stageName}/${testFilePath}")) {
        echo "No ${testFilePath} found in ${stageName}, skipping timeout XML generation"
        return null
    }
    String timeoutTests = sh(script: "cd ${stageName} && cat ${testFilePath}", returnStdout: true).trim()
    echo "timeoutTests: ${timeoutTests}"

    if (timeoutTests == null || timeoutTests == "") {
        return null
    }
    // Escape XML special characters: pytest node IDs frequently contain
    // characters such as &, <, >, and quotes (e.g. in parametrized IDs), which
    // would otherwise produce malformed XML that the junit step rejects.
    def esc = { String s ->
        s.replace("&", "&amp;")
         .replace("<", "&lt;")
         .replace(">", "&gt;")
         .replace("\"", "&quot;")
         .replace("'", "&apos;")
    }
    // Tolerate CRLF line endings and drop blank lines so a trailing newline in
    // the file does not emit a bogus empty testcase.
    def testList = timeoutTests.split("\r?\n").collect { it.trim() }.findAll { it }
    if (testList.isEmpty()) {
        return null
    }
    String xmlContent = """<?xml version="1.0" encoding="UTF-8"?><testsuites>
    <testsuite name="${esc(stageName)}" errors="${testList.size()}" failures="0" skipped="0" tests="${testList.size()}" time="1.00">"""
    testList.each { test ->
        xmlContent += """<testcase name="${esc(test)}" classname="${esc(stageName)}" time="1.0">
    <error message="Test terminated unexpectedly"> Test terminated unexpectedly
    </error></testcase>"""
    }
    xmlContent += "</testsuite></testsuites>"
    return xmlContent
}

def generateStageFailTestResultXml(stageName, subName, failureLog, resultPath) {
String resultFiles = sh(script: "cd ${stageName} && ls -l ${resultPath} | wc -l", returnStdout: true).trim()
echo "${resultFiles}"
echo "resultFiles: ${resultFiles}"
if (resultFiles != "0") {
return null
}
Expand Down Expand Up @@ -1941,14 +1994,14 @@ def rerunFailedTests(stageName, llmSrc, testCmdLine, resultFileName="results.xml
def xmlFile = "${rerunDir}/rerun_results_${times}.xml"
// change the testCmdLine for rerun
def noNeedLine = ["--splitting-algorithm", "--splits", "--group", "--cov"]
def needToChangeLine = ["--test-list", "--csv", "--junit-xml"]
def needToChangeLine = ["--test-list", "--csv", "--periodic-junit-xmlpath"]
def newTestCmdLine = testCmdLine.findAll { cmd ->
!noNeedLine.any { line -> cmd.contains(line) } && !needToChangeLine.any { line -> cmd.contains(line) }
}
newTestCmdLine += [
"--test-list=${currentRerunTestList}",
"--csv=${rerunDir}/rerun_report_${times}.csv",
"--junit-xml ${xmlFile}",
"--periodic-junit-xmlpath ${xmlFile}",
"--reruns ${times - 1}"
]
try {
Expand Down
17 changes: 17 additions & 0 deletions tests/integration/defs/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2147,6 +2147,15 @@ def pytest_addoption(parser):
help="Enable GPU clock locking during tests. "
"By default, GPU clock locking is disabled.",
)
parser.addoption(
"--periodic-save-unfinished-test",
action="store_true",
default=False,
help=
"Save unfinished test name to unfinished_test.txt during test execution (default: False). "
"This helps identify which test was running when a timeout or crash occurs. "
"Only used with --periodic-junit.",
)


@pytest.hookimpl(trylast=True)
Expand Down Expand Up @@ -2256,6 +2265,8 @@ def pytest_configure(config):
if periodic and output_dir:
periodic_interval = config.getoption("--periodic-interval")
periodic_batch_size = config.getoption("--periodic-batch-size")
periodic_save_unfinished_test = config.getoption(
"--periodic-save-unfinished-test", default=False)

# Create output directory early (like --junitxml does) to avoid conflicts with other plugins
# that may need to write to the same directory (e.g., pytest-split)
Expand All @@ -2272,6 +2283,7 @@ def pytest_configure(config):
'info': print_info,
'warning': print_warning
},
save_unfinished_test=periodic_save_unfinished_test,
)

# Configure and register the reporter
Expand All @@ -2283,6 +2295,7 @@ def pytest_configure(config):
f" Interval: {periodic_interval}s ({periodic_interval/60:.1f} min)"
)
print_info(f" Batch size: {periodic_batch_size} tests")
print_info(f" Save unfinished test: {periodic_save_unfinished_test}")
elif periodic and not output_dir:
print_warning(
"Warning: --periodic-junit requires --output-dir to be set. "
Expand Down Expand Up @@ -2344,6 +2357,8 @@ def deselect_by_test_model_suites(test_model_suites, items, test_prefix,
if periodic and output_dir:
periodic_interval = config.getoption("--periodic-interval")
periodic_batch_size = config.getoption("--periodic-batch-size")
periodic_save_unfinished_test = config.getoption(
"--periodic-save-unfinished-test", default=False)

# Create the reporter with logger
xmlpath = os.path.join(output_dir, "results.xml")
Expand All @@ -2355,6 +2370,7 @@ def deselect_by_test_model_suites(test_model_suites, items, test_prefix,
'info': print_info,
'warning': print_warning
},
save_unfinished_test=periodic_save_unfinished_test,
)

# Configure and register the reporter
Expand All @@ -2366,6 +2382,7 @@ def deselect_by_test_model_suites(test_model_suites, items, test_prefix,
f" Interval: {periodic_interval}s ({periodic_interval/60:.1f} min)"
)
print_info(f" Batch size: {periodic_batch_size} tests")
print_info(f" Save unfinished test: {periodic_save_unfinished_test}")
elif periodic and not output_dir:
print_warning(
"Warning: --periodic-junit requires --output-dir to be set. "
Expand Down
35 changes: 35 additions & 0 deletions tests/integration/defs/utils/periodic_junit.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ def __init__(
interval: int = 18000, # Default 5 hours
batch_size: int = 10,
logger=None, # Optional logger (info, warning functions)
save_unfinished_test:
bool = False, # Save unfinished test name in output-dir/unfinished_test.txt if True
):
"""
Initialize periodic reporter.
Expand All @@ -85,11 +87,13 @@ def __init__(
interval: Time interval in seconds between saves (default: 18000 = 5 hours)
batch_size: Number of tests before triggering a save (default: 10)
logger: Optional dictionary with 'info' and 'warning' functions for logging
save_unfinished_test: If True, save unfinished test name in output-dir/unfinished_test.txt
"""
self.xmlpath = os.path.abspath(xmlpath)
self.time_interval = interval
self.batch_size = batch_size
self.logger = logger or {}
self.save_unfinished_test = save_unfinished_test

self.completed_tests = 0
self.last_save_time = time.time()
Expand Down Expand Up @@ -160,11 +164,42 @@ def pytest_runtest_logreport(self, report: TestReport):
# Collect the report for later batch processing (fast)
self.pending_reports.append(report)

output_dir = os.path.dirname(self.xmlpath)
unfinished_test_path = os.path.join(output_dir, "unfinished_test.txt")

# save unfinished test nodeid to output-dir/unfinished_test.txt
if self.save_unfinished_test and report.when == "setup":
try:
# Create directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)
with open(unfinished_test_path, "a", encoding="utf-8") as f:
f.write(report.nodeid + "\n")
except Exception as e:
self._log_warning(
f"Error writing unfinished test {report.nodeid} to {unfinished_test_path}: {e}"
)

# Only increment counter and check for save on teardown phase
if report.when == "teardown":
self.completed_tests += 1
current_time = time.time()

if self.save_unfinished_test:
if os.path.exists(unfinished_test_path):
try:
with open(unfinished_test_path, "r+",
encoding="utf-8") as f:
lines = f.readlines()
f.seek(0)
f.truncate()
for line in lines:
if line.strip() != report.nodeid:
f.write(line)
except Exception as e:
self._log_warning(
f"Error clearing nodeid {report.nodeid} from {unfinished_test_path}: {e}"
)

# Flush if batch threshold reached OR time interval elapsed
should_flush_by_time = (current_time -
self.last_save_time) >= self.time_interval
Expand Down