Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 63 additions & 10 deletions jenkins/L0_Test.groovy
Original file line number Diff line number Diff line change
Expand Up @@ -122,13 +122,32 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st

Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")

def downloadSucceed = false
def hasTimeoutTest = false
def downloadResultSucceed = false

pipeline.stage('Submit Test Results') {
sh "mkdir -p ${stageName}"
// Download timeout test results
def timeoutTestFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/unfinished_test.txt"
def downloadTimeoutTestSucceed = Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -P ${remote.port} -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${timeoutTestFilePath} ${stageName}/", returnStatus: true, numRetries: 3) == 0
if (downloadTimeoutTestSucceed) {
sh "ls ${stageName}"
def timeoutTestXml = generateTimeoutTestResultXml(stageName, "unfinished_test.txt")
if (timeoutTestXml != null) {
sh """
cat > ${stageName}/results-timeout.xml << 'EOF_TIMEOUT_XML'
${timeoutTestXml}
EOF_TIMEOUT_XML
"""
hasTimeoutTest = true
}
}
// Download normal test results
def resultsFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/results.xml"
downloadSucceed = Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -P ${remote.port} -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/", returnStatus: true, numRetries: 3) == 0
if (downloadSucceed) {
downloadResultSucceed = Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -P ${remote.port} -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/", returnStatus: true, numRetries: 3) == 0

echo "hasTimeoutTest: ${hasTimeoutTest}, downloadResultSucceed: ${downloadResultSucceed}"
if (hasTimeoutTest || downloadResultSucceed) {
sh "ls ${stageName}"
echo "Upload test results."
sh "tar -czvf results-${stageName}.tar.gz ${stageName}/"
Expand All @@ -142,7 +161,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
}
}

if (downloadSucceed) {
if (hasTimeoutTest || downloadResultSucceed) {
junit(allowEmptyResults: true, testResults: "${stageName}/results*.xml")
}
}
Expand All @@ -165,12 +184,12 @@ def runIsolatedTests(preprocessedLists, testCmdLine, llmSrc, stageName) {
!cmd.contains("--test-list=") &&
!cmd.contains("--test-prefix=") &&
!cmd.contains("--csv=") &&
!cmd.contains("--junit-xml")
!cmd.contains("--periodic-junit-xmlpath")
}
isolateTestCmdLine += ["--test-list=${singleTestFile}"]
isolateTestCmdLine += ["--test-prefix=${stageName}"]
isolateTestCmdLine += ["--csv=${WORKSPACE}/${stageName}/report_isolated_${i}.csv"]
isolateTestCmdLine += ["--junit-xml ${WORKSPACE}/${stageName}/results_isolated_${i}.xml"]
isolateTestCmdLine += ["--periodic-junit-xmlpath ${WORKSPACE}/${stageName}/results_isolated_${i}.xml"]
isolateTestCmdLine += ["--cov-append"] // Append coverage data to avoid overwriting previous data

try {
Expand Down Expand Up @@ -774,13 +793,16 @@ def getPytestBaseCommandLine(
"--waives-file=${waivesFilePath}",
"--output-dir=${outputPath}/",
"--csv=${outputPath}/report.csv",
"--junit-xml ${outputPath}/results.xml",
"-o junit_logging=out-err",
"--cov=${llmSrc}/examples/",
"--cov=${llmSrc}/tensorrt_llm/",
"--cov=${trtllmWheelPath}/tensorrt_llm/",
"--cov-report=",
"--cov-config=${coverageConfigFile}",
"--periodic-junit",
"--periodic-junit-xmlpath ${outputPath}/results.xml",
"--periodic-batch-size=1",
"--periodic-save-unfinished-test",
]

if (perfMode) {
Expand Down Expand Up @@ -1258,6 +1280,14 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
sh "mkdir -p ${stageName}"
finallyRunner()
if (stageIsFailed) {
def timeoutTestXml = generateTimeoutTestResultXml(stageName, "unfinished_test.txt")
if (timeoutTestXml != null) {
sh """
cat > ${stageName}/results-timeout.xml << 'EOF_TIMEOUT_XML'
${timeoutTestXml}
EOF_TIMEOUT_XML
"""
}
def stageXml = generateStageFailTestResultXml(stageName, "Stage Failed", "Stage run failed without result", "results*.xml")
if (stageXml != null) {
sh "echo '${stageXml}' > ${stageName}/results-stage.xml"
Expand Down Expand Up @@ -1618,9 +1648,32 @@ def launchTestListCheck(pipeline)
})
}

def generateTimeoutTestResultXml(stageName, testFilePath) {
    // Build a JUnit-style XML report that marks every unfinished test listed in
    // ${stageName}/${testFilePath} (one pytest node ID per line) as an error, so
    // Jenkins' junit step surfaces tests that were running when a timeout/crash hit.
    // Returns the XML document as a String, or null when there is nothing to report.
    if (!fileExists("${stageName}/${testFilePath}")) {
        echo "No ${testFilePath} found in ${stageName}, skipping timeout XML generation"
        return null
    }
    String timeoutTests = sh(script: "cd ${stageName} && cat ${testFilePath}", returnStdout: true).trim()
    echo "timeoutTests: ${timeoutTests}"

    if (timeoutTests == null || timeoutTests == "") {
        return null
    }
    // Escape XML special characters: pytest node IDs frequently contain
    // characters such as &, <, >, and quotes (e.g. in parametrized IDs), which
    // would otherwise produce malformed XML that the junit step rejects.
    def esc = { String s ->
        s.replace("&", "&amp;")
         .replace("<", "&lt;")
         .replace(">", "&gt;")
         .replace("\"", "&quot;")
         .replace("'", "&apos;")
    }
    // Tolerate CRLF line endings and drop blank lines so a trailing newline in
    // the file does not emit a bogus empty testcase.
    def testList = timeoutTests.split("\r?\n").collect { it.trim() }.findAll { it }
    if (testList.isEmpty()) {
        return null
    }
    String xmlContent = """<?xml version="1.0" encoding="UTF-8"?><testsuites>
    <testsuite name="${esc(stageName)}" errors="${testList.size()}" failures="0" skipped="0" tests="${testList.size()}" time="1.00">"""
    testList.each { test ->
        xmlContent += """<testcase name="${esc(test)}" classname="${esc(stageName)}" time="1.0">
    <error message="Test terminated unexpectedly"> Test terminated unexpectedly
    </error></testcase>"""
    }
    xmlContent += "</testsuite></testsuites>"
    return xmlContent
}

def generateStageFailTestResultXml(stageName, subName, failureLog, resultPath) {
String resultFiles = sh(script: "cd ${stageName} && ls -l ${resultPath} | wc -l", returnStdout: true).trim()
echo "${resultFiles}"
echo "resultFiles: ${resultFiles}"
if (resultFiles != "0") {
return null
}
Expand Down Expand Up @@ -1941,14 +1994,14 @@ def rerunFailedTests(stageName, llmSrc, testCmdLine, resultFileName="results.xml
def xmlFile = "${rerunDir}/rerun_results_${times}.xml"
// change the testCmdLine for rerun
def noNeedLine = ["--splitting-algorithm", "--splits", "--group", "--cov"]
def needToChangeLine = ["--test-list", "--csv", "--junit-xml"]
def needToChangeLine = ["--test-list", "--csv", "--periodic-junit-xmlpath"]
def newTestCmdLine = testCmdLine.findAll { cmd ->
!noNeedLine.any { line -> cmd.contains(line) } && !needToChangeLine.any { line -> cmd.contains(line) }
}
newTestCmdLine += [
"--test-list=${currentRerunTestList}",
"--csv=${rerunDir}/rerun_report_${times}.csv",
"--junit-xml ${xmlFile}",
"--periodic-junit-xmlpath ${xmlFile}",
"--reruns ${times - 1}"
]
try {
Expand Down
17 changes: 17 additions & 0 deletions tests/integration/defs/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2147,6 +2147,15 @@ def pytest_addoption(parser):
help="Enable GPU clock locking during tests. "
"By default, GPU clock locking is disabled.",
)
parser.addoption(
"--periodic-save-unfinished-test",
action="store_true",
default=False,
help=
"Save unfinished test name to unfinished_test.txt during test execution (default: False). "
"This helps identify which test was running when a timeout or crash occurs. "
"Only used with --periodic-junit.",
)


@pytest.hookimpl(trylast=True)
Expand Down Expand Up @@ -2256,6 +2265,8 @@ def pytest_configure(config):
if periodic and output_dir:
periodic_interval = config.getoption("--periodic-interval")
periodic_batch_size = config.getoption("--periodic-batch-size")
periodic_save_unfinished_test = config.getoption(
"--periodic-save-unfinished-test", default=False)

# Create output directory early (like --junitxml does) to avoid conflicts with other plugins
# that may need to write to the same directory (e.g., pytest-split)
Expand All @@ -2272,6 +2283,7 @@ def pytest_configure(config):
'info': print_info,
'warning': print_warning
},
save_unfinished_test=periodic_save_unfinished_test,
)

# Configure and register the reporter
Expand All @@ -2283,6 +2295,7 @@ def pytest_configure(config):
f" Interval: {periodic_interval}s ({periodic_interval/60:.1f} min)"
)
print_info(f" Batch size: {periodic_batch_size} tests")
print_info(f" Save unfinished test: {periodic_save_unfinished_test}")
elif periodic and not output_dir:
print_warning(
"Warning: --periodic-junit requires --output-dir to be set. "
Expand Down Expand Up @@ -2344,6 +2357,8 @@ def deselect_by_test_model_suites(test_model_suites, items, test_prefix,
if periodic and output_dir:
periodic_interval = config.getoption("--periodic-interval")
periodic_batch_size = config.getoption("--periodic-batch-size")
periodic_save_unfinished_test = config.getoption(
"--periodic-save-unfinished-test", default=False)

# Create the reporter with logger
xmlpath = os.path.join(output_dir, "results.xml")
Expand All @@ -2355,6 +2370,7 @@ def deselect_by_test_model_suites(test_model_suites, items, test_prefix,
'info': print_info,
'warning': print_warning
},
save_unfinished_test=periodic_save_unfinished_test,
)

# Configure and register the reporter
Expand All @@ -2366,6 +2382,7 @@ def deselect_by_test_model_suites(test_model_suites, items, test_prefix,
f" Interval: {periodic_interval}s ({periodic_interval/60:.1f} min)"
)
print_info(f" Batch size: {periodic_batch_size} tests")
print_info(f" Save unfinished test: {periodic_save_unfinished_test}")
elif periodic and not output_dir:
print_warning(
"Warning: --periodic-junit requires --output-dir to be set. "
Expand Down
35 changes: 35 additions & 0 deletions tests/integration/defs/utils/periodic_junit.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@ def __init__(
interval: int = 18000, # Default 5 hours
batch_size: int = 10,
logger=None, # Optional logger (info, warning functions)
save_unfinished_test:
bool = False, # Save unfinished test name in output-dir/unfinished_test.txt if True
):
"""
Initialize periodic reporter.
Expand All @@ -85,11 +87,13 @@ def __init__(
interval: Time interval in seconds between saves (default: 18000 = 5 hours)
batch_size: Number of tests before triggering a save (default: 10)
logger: Optional dictionary with 'info' and 'warning' functions for logging
save_unfinished_test: If True, save unfinished test name in output-dir/unfinished_test.txt
"""
self.xmlpath = os.path.abspath(xmlpath)
self.time_interval = interval
self.batch_size = batch_size
self.logger = logger or {}
self.save_unfinished_test = save_unfinished_test

self.completed_tests = 0
self.last_save_time = time.time()
Expand Down Expand Up @@ -160,11 +164,42 @@ def pytest_runtest_logreport(self, report: TestReport):
# Collect the report for later batch processing (fast)
self.pending_reports.append(report)

output_dir = os.path.dirname(self.xmlpath)
unfinished_test_path = os.path.join(output_dir, "unfinished_test.txt")

# save unfinished test nodeid to output-dir/unfinished_test.txt
if self.save_unfinished_test and report.when == "setup":
try:
# Create directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)
with open(unfinished_test_path, "a", encoding="utf-8") as f:
f.write(report.nodeid + "\n")
except Exception as e:
self._log_warning(
f"Error writing unfinished test {report.nodeid} to {unfinished_test_path}: {e}"
)

# Only increment counter and check for save on teardown phase
if report.when == "teardown":
self.completed_tests += 1
current_time = time.time()

if self.save_unfinished_test:
if os.path.exists(unfinished_test_path):
try:
with open(unfinished_test_path, "r+",
encoding="utf-8") as f:
lines = f.readlines()
f.seek(0)
f.truncate()
for line in lines:
if line.strip() != report.nodeid:
f.write(line)
except Exception as e:
self._log_warning(
f"Error clearing nodeid {report.nodeid} from {unfinished_test_path}: {e}"
)

# Flush if batch threshold reached OR time interval elapsed
should_flush_by_time = (current_time -
self.last_save_time) >= self.time_interval
Expand Down