From b3e8fa29601567a86d62e90b5cffbcd89d5ad6f4 Mon Sep 17 00:00:00 2001
From: Eran Geva <19514940+MrGeva@users.noreply.github.com>
Date: Mon, 11 Aug 2025 08:33:13 +0300
Subject: [PATCH 01/15] [None][test] Test trtllm-bench AD vs, PT BEs on H100
 single gpu (#6487)

Signed-off-by: Eran Geva <19514940+MrGeva@users.noreply.github.com>
Signed-off-by: Gal Hubara Agam <96368689+galagam@users.noreply.github.com>
Co-authored-by: Gal Hubara Agam <96368689+galagam@users.noreply.github.com>
---
 .../integration/test_lists/test-db/l0_a30.yml |   3 +-
 .../test_lists/test-db/l0_b200.yml            |   2 +-
 .../test_lists/test-db/l0_h100.yml            |   1 +
 .../unit/singlegpu/test_ad_trtllm_bench.py    | 566 +++++++++++++++++-
 4 files changed, 549 insertions(+), 23 deletions(-)

diff --git a/tests/integration/test_lists/test-db/l0_a30.yml b/tests/integration/test_lists/test-db/l0_a30.yml
index ce8058136fa..5ec16996e7c 100644
--- a/tests/integration/test_lists/test-db/l0_a30.yml
+++ b/tests/integration/test_lists/test-db/l0_a30.yml
@@ -18,8 +18,7 @@ l0_a30:
   - unittest/_torch/modeling -k "modeling_phi3"
   - unittest/_torch/modeling -k "modeling_qwen"
   - unittest/_torch/modeling -k "modeling_qwen_moe"
-  - unittest/_torch/modeling -k "modeling_exaone4"
-  - unittest/_torch/auto_deploy/unit/singlegpu
+  - unittest/_torch/auto_deploy/unit/singlegpu -k "not test_trtllm_bench_backend_comparison"
   - unittest/_torch/test_beam_search.py
 - condition:
     ranges:
diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml
index 730cd016743..26b4b2a0a88 100644
--- a/tests/integration/test_lists/test-db/l0_b200.yml
+++ b/tests/integration/test_lists/test-db/l0_b200.yml
@@ -70,7 +70,7 @@ l0_b200:
   - unittest/_torch/modeling -k "modeling_mixtral"
   - unittest/_torch/modeling -k "modeling_deepseek"
   - unittest/_torch/modeling -k "modeling_gpt_oss"
-  - unittest/_torch/auto_deploy/unit/singlegpu
+  - unittest/_torch/auto_deploy/unit/singlegpu -k "not test_trtllm_bench_backend_comparison"
   - unittest/_torch/speculative/test_eagle3.py
   - unittest/_torch/speculative/test_kv_cache_reuse.py
   - unittest/_torch/speculative/test_dynamic_spec_decode.py
diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml
index 43ee39de1af..1a8fded524b 100644
--- a/tests/integration/test_lists/test-db/l0_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_h100.yml
@@ -15,6 +15,7 @@ l0_h100:
   tests:
   # ------------- PyTorch tests ---------------
   # Only key models in H100: llama/mixtral/nemotron/deepseek
+  - unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py::test_trtllm_bench_backend_comparison
   - unittest/_torch -k "not (modeling or multi_gpu or auto_deploy)" TIMEOUT (90)
   - unittest/_torch -k "modeling_llama"
   - unittest/_torch/modeling -k "modeling_mixtral"
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py
index 2985e662b27..f5ec68e28d9 100644
--- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py
+++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py
@@ -1,14 +1,231 @@
+import json
+import re
 import subprocess
 import tempfile
 from pathlib import Path
 
+import pytest
 import yaml
 from _model_test_utils import _hf_model_dir_or_hub_id
-from click.testing import CliRunner
 from utils.cpp_paths import llm_root  # noqa: F401
 from utils.llm_data import llm_models_root
 
-from tensorrt_llm.commands.bench import main
+
+def parse_kv_cache_metrics(log_output: str, free_mem_ratio: float = 0.8):
+    """Parse KV cache metrics from the benchmark log output."""
+    metrics = {}
+
+    # Simple patterns based on actual log format
+    patterns = {
+        "current_cache_size": r"Current cache size:\s*(\d+)",
+        "free_mem_pre_mb": r"Free memory before forward pass \(MB\):\s*(\d+)",
+        "free_mem_post_mb": r"Free memory after forward pass \(MB\):\s*(\d+)",
+    }
+
+    # Extract metrics using simple regex patterns
+    for metric_name, pattern in patterns.items():
+        match = re.search(pattern, log_output, re.IGNORECASE)
+        if match:
+            value = int(match.group(1))
+            metrics[metric_name] = value
+            print(f"  ✅ Found {metric_name}: {value}")
+        else:
+            print(f"  ❌ Could not find {metric_name}")
+
+    # Calculate new_cache_size using the same formula as in resize_kv_cache
+    # new_cache_size = free_mem_post * 1024 * 1024 * free_mem_ratio + current_cache_size
+    if "free_mem_post_mb" in metrics and "current_cache_size" in metrics:
+        metrics["new_cache_size"] = int(
+            metrics["free_mem_post_mb"] * 1024 * 1024 * free_mem_ratio
+            + metrics["current_cache_size"]
+        )
+        print(
+            f"  ✅ Calculated new_cache_size: {metrics['new_cache_size']} (using free_mem_ratio={free_mem_ratio})"
+        )
+    else:
+        print("  ❌ Cannot calculate new_cache_size - missing required metrics")
+
+    return metrics
+
+
+def run_benchmark(
+    model_name: str,
+    dataset_path: str,
+    temp_dir: str,
+    backend: str = "_autodeploy",
+    report_json_path: str = None,
+    max_batch_size: int = 32,
+    num_hidden_layers: int = 2,
+    free_mem_ratio: float = 0.1,
+):
+    """Run benchmark and capture KV cache metrics from log output."""
+
+    # Path of the extra LLM API options YAML that the caller writes into temp_dir
+    config_path = f"{temp_dir}/extra_llm_api_options.yaml"
+
+    # Build the command to run the benchmark
+    cmd = [
+        "python",
+        "-m",
+        "tensorrt_llm.commands.bench",
+        "--model",
+        model_name,
+        "throughput",
+        "--backend",
+        backend,
+        "--dataset",
+        str(dataset_path),
+        "--max_batch_size",
+        str(max_batch_size),
+    ]
+
+    # Add report_json argument if path is provided
+    if report_json_path:
+        cmd.extend(["--report_json", report_json_path])
+
+    if backend == "_autodeploy":
+        # Add extra_llm_api_options only for autodeploy backend
+        cmd.extend(["--extra_llm_api_options", config_path])
+
+    # Run benchmark as subprocess to capture ALL output
+    import os
+
+    env = os.environ.copy()
+    if backend == "pytorch":
+        env["TLLM_OVERRIDE_LAYER_NUM"] = str(num_hidden_layers)
+        print(f"📋 Using TLLM_OVERRIDE_LAYER_NUM from env: {env['TLLM_OVERRIDE_LAYER_NUM']}")
+        cmd.extend(["--kv_cache_free_gpu_mem_fraction", str(free_mem_ratio)])
+    print(f"🚀 Running benchmark command ({backend} backend): {' '.join(cmd)}")
+    result = subprocess.run(cmd, capture_output=True, text=True, env=env, timeout=600)
+
+    # Check if the command succeeded
+    assert result.returncode == 0, (
+        f"Benchmark failed with return code {result.returncode}:\n"
+        f"STDOUT:\n{result.stdout}\nSTDERR:\n{result.stderr}"
+    )
+
+    # Combine stdout and stderr for parsing
+    full_log_output = f"{result.stdout}\n{result.stderr}"
+
+    # Parse KV cache metrics from the combined log output (only for autodeploy backend)
+    kv_cache_metrics = {}
+    if backend == "_autodeploy":
+        kv_cache_metrics = parse_kv_cache_metrics(full_log_output, free_mem_ratio)
+        print("📊 KV Cache Metrics parsed from logs:")
+        if kv_cache_metrics:
+            for key, value in kv_cache_metrics.items():
+                if "mb" in key.lower():
+                    print(f"  {key}: {value}MB")
+                else:
+                    print(f"  {key}: {value} bytes")
+        else:
+            print("  ⚠️ No KV cache metrics were parsed successfully")
+    else:
+        print(f"📊 KV Cache Metrics: Skipped for {backend} backend")
+
+    # Return parsed JSON report with KV cache metrics if requested
+    if report_json_path and Path(report_json_path).exists():
+        with open(report_json_path, "r") as f:
+            report_data = json.load(f)
+
+        # Add KV cache metrics to the report (only for autodeploy backend)
+        if backend == "_autodeploy":
+            report_data["kv_cache_metrics"] = kv_cache_metrics
+        report_data["backend"] = backend
+        return report_data
+    return None
+
+
+def compare_backends_performance(
+    autodeploy_tokens_per_sec: float,
+    pytorch_tokens_per_sec: float,
+    relative_tolerance: float = 0.20,
+    absolute_tolerance: float = 10.0,
+):
+    """
+    Compare performance between autodeploy and pytorch backends.
+    Fails if autodeploy is significantly worse than pytorch.
+
+    Args:
+        autodeploy_tokens_per_sec: Performance of autodeploy backend
+        pytorch_tokens_per_sec: Performance of pytorch backend
+        relative_tolerance: Relative tolerance (20% by default for backend comparison)
+        absolute_tolerance: Absolute tolerance (10 tokens/sec by default)
+    """
+    # Calculate performance difference
+    performance_diff = pytorch_tokens_per_sec - autodeploy_tokens_per_sec
+    relative_diff = performance_diff / pytorch_tokens_per_sec if pytorch_tokens_per_sec > 0 else 0
+
+    print("=== BACKEND PERFORMANCE COMPARISON ===")
+    print(f"PyTorch backend: {pytorch_tokens_per_sec:.2f} tokens/sec/user")
+    print(f"Autodeploy backend: {autodeploy_tokens_per_sec:.2f} tokens/sec/user")
+    print(f"Performance difference: {performance_diff:.2f} tokens/sec ({relative_diff:.2%})")
+
+    # If autodeploy is better than or equal to pytorch, always pass
+    if autodeploy_tokens_per_sec >= pytorch_tokens_per_sec:
+        print("✅ Autodeploy backend matches or exceeds PyTorch backend performance")
+        return
+
+    # Autodeploy is slower - check if it's within acceptable tolerance
+    within_relative_tolerance = relative_diff <= relative_tolerance
+    within_absolute_tolerance = performance_diff <= absolute_tolerance
+
+    if within_relative_tolerance or within_absolute_tolerance:
+        print("✅ Autodeploy backend performance within acceptable tolerance")
+        print(
+            f"   Tolerance: {relative_tolerance:.2%} relative OR {absolute_tolerance:.2f} tokens/sec absolute"
+        )
+    else:
+        assert False, (
+            f"Autodeploy backend significantly underperforms compared to PyTorch! "
+            f"Autodeploy: {autodeploy_tokens_per_sec:.2f} tokens/sec/user, "
+            f"PyTorch: {pytorch_tokens_per_sec:.2f} tokens/sec/user, "
+            f"Performance gap: {performance_diff:.2f} tokens/sec ({relative_diff:.2%}), "
+            f"Tolerance: {relative_tolerance:.2%} relative OR {absolute_tolerance:.2f} tokens/sec absolute"
+        )
+
+
+def assert_performance_within_tolerance(
+    actual_tokens_per_sec: float,
+    golden_tokens_per_sec: float,
+    relative_tolerance: float = 0.15,
+    absolute_tolerance: float = 10.0,
+):
+    """
+    Assert that actual performance is within tolerance of golden result.
+    Only fails if performance is WORSE than golden - improvements always pass.
+
+    Args:
+        actual_tokens_per_sec: Measured performance metric
+        golden_tokens_per_sec: Expected performance metric
+        relative_tolerance: Relative tolerance (15% by default)
+        absolute_tolerance: Absolute tolerance (10 tokens/sec by default)
+    """
+    # If actual performance is better than or equal to golden, always pass
+    if actual_tokens_per_sec >= golden_tokens_per_sec:
+        print(
+            f"✅ Performance improvement detected:"
+            f" {actual_tokens_per_sec:.2f} >= {golden_tokens_per_sec:.2f} tokens/sec/user"
+        )
+        return
+
+    # Performance is worse than golden - check if it's within acceptable tolerance
+    performance_drop = golden_tokens_per_sec - actual_tokens_per_sec
+    relative_drop = (
+        performance_drop / golden_tokens_per_sec if golden_tokens_per_sec > 0 else float("inf")
+    )
+
+    # Performance should be within relative tolerance OR absolute tolerance
+    within_relative_tolerance = relative_drop <= relative_tolerance
+    within_absolute_tolerance = performance_drop <= absolute_tolerance
+
+    assert within_relative_tolerance or within_absolute_tolerance, (
+        f"Performance regression detected! "
+        f"Actual: {actual_tokens_per_sec:.2f} tokens/sec/user, "
+        f"Golden: {golden_tokens_per_sec:.2f} tokens/sec/user, "
+        f"Performance drop: {performance_drop:.2f} tokens/sec ({relative_drop:.2%}), "
+        f"Tolerance: {relative_tolerance:.2%} relative OR {absolute_tolerance:.2f} tokens/sec absolute"
+    )
 
 
 def prepare_dataset(root_dir: str, temp_dir: str, model_name: str):
@@ -17,7 +234,7 @@ def prepare_dataset(root_dir: str, temp_dir: str, model_name: str):
     dataset_tool = Path(root_dir, "benchmarks", "cpp", "prepare_dataset.py")
     script_dir = Path(root_dir, "benchmarks", "cpp")
 
-    # Generate a small dataset to run a test.
+    # Generate a small dataset to run a test - matching workload configuration
     command = [
         "python3",
         f"{dataset_tool}",
@@ -37,7 +254,9 @@ def prepare_dataset(root_dir: str, temp_dir: str, model_name: str):
         "10",
     ]
     print(f"Running command: {' '.join(command)}")
-    result = subprocess.run(command, cwd=str(script_dir), capture_output=True, text=True)
+    result = subprocess.run(
+        command, cwd=str(script_dir), capture_output=True, text=True, timeout=300
+    )
     if result.returncode != 0:
         raise RuntimeError(f"Failed to prepare dataset: {result.stderr}")
     # Grab the stdout and write it to a dataset file for passing to suite.
@@ -46,22 +265,324 @@ def prepare_dataset(root_dir: str, temp_dir: str, model_name: str):
     return dataset_path
 
 
-def run_benchmark(model_name: str, dataset_path: str, temp_dir: str):
-    runner = CliRunner()
+def calculate_expected_kv_cache_metrics(free_mem_ratio: float):
+    """Calculate expected KV cache metrics based on actual GPU memory."""
+    try:
+        import torch
 
-    args = [
-        "--model",
-        model_name,
-        "throughput",
-        "--backend",
-        "_autodeploy",
-        "--dataset",
-        dataset_path,
-        "--extra_llm_api_options",
-        f"{temp_dir}/model_kwargs.yaml",
+        if torch.cuda.is_available():
+            # Get total GPU memory in MB
+            _, total_mem_bytes = torch.cuda.mem_get_info(0)
+            total_mem_mb = total_mem_bytes // (1024 * 1024)
+
+            # Estimate expected values based on model size
+            # For TinyLlama-1.1B, model should be 2.2GB
+            estimated_model_size_mb = 2200  # Conservative estimate
+            # TODO: https://github.com/NVIDIA/TensorRT-LLM/issues/6335 check why there is extra consumption
+            extra_consumption_mb = 2500
+            expected_free_mem_range = (
+                total_mem_mb - estimated_model_size_mb - extra_consumption_mb,
+                total_mem_mb - estimated_model_size_mb,
+            )
+
+            # Current cache size is typically small initially (16MB range)
+            expected_current_cache_size = 16777216
+
+            # Free memory values should be in reasonable range
+            expected_free_mem_pre_range = expected_free_mem_range
+            expected_free_mem_post_range = (
+                expected_free_mem_range[0] - 1000,
+                expected_free_mem_range[1] - 500,
+            )
+
+            print("📊 GPU Memory Analysis:")
+            print(f"  Total GPU memory: {total_mem_mb}MB")
+            print(
+                f"  Expected free memory range: {expected_free_mem_range[0]}-{expected_free_mem_range[1]}MB"
+            )
+
+            return {
+                "total_mem_mb": total_mem_mb,
+                "expected_current_cache_size": expected_current_cache_size,
+                "expected_free_mem_pre_range": expected_free_mem_pre_range,
+                "expected_free_mem_post_range": expected_free_mem_post_range,
+                "free_mem_ratio": free_mem_ratio,
+            }
+        else:
+            return None
+    except ImportError:
+        return None
+
+
+def validate_kv_cache_metrics_dynamic(kv_cache_metrics: dict, expected_metrics: dict):
+    """Validate KV cache metrics using dynamic expected values."""
+
+    # Validate current_cache_size (should be relatively stable)
+    current_cache_size = kv_cache_metrics.get("current_cache_size")
+    expected_cache_size = expected_metrics["expected_current_cache_size"]
+    if current_cache_size:
+        cache_diff = abs(current_cache_size - expected_cache_size) / expected_cache_size
+        assert cache_diff <= 0.5, (  # 50% tolerance for cache size
+            f"Current cache size outside expected range: {current_cache_size} vs expected ~{expected_cache_size}"
+        )
+        print(f"  ✅ current_cache_size: {current_cache_size} bytes (within range)")
+
+    # Validate free memory values are in reasonable ranges
+    free_mem_pre = kv_cache_metrics.get("free_mem_pre_mb")
+    free_mem_post = kv_cache_metrics.get("free_mem_post_mb")
+
+    if free_mem_pre:
+        pre_range = expected_metrics["expected_free_mem_pre_range"]
+        assert pre_range[0] <= free_mem_pre <= pre_range[1], (
+            f"Free memory before forward pass outside expected range: "
+            f"{free_mem_pre}MB not in range {pre_range[0]}-{pre_range[1]}MB"
+        )
+        print(f"  ✅ free_mem_pre_mb: {free_mem_pre}MB (within range)")
+
+    if free_mem_post:
+        post_range = expected_metrics["expected_free_mem_post_range"]
+        assert post_range[0] <= free_mem_post <= post_range[1], (
+            f"Free memory after forward pass outside expected range: "
+            f"{free_mem_post}MB not in range {post_range[0]}-{post_range[1]}MB"
+        )
+        print(f"  ✅ free_mem_post_mb: {free_mem_post}MB (within range)")
+
+    # Validate memory consumption (pre should be > post)
+    if free_mem_pre and free_mem_post:
+        memory_consumed = free_mem_pre - free_mem_post
+        assert memory_consumed > 0, (
+            f"Expected memory consumption during forward pass, got {memory_consumed}MB"
+        )
+        assert memory_consumed < 5000, f"Memory consumption too high: {memory_consumed}MB"
+        print(f"  ✅ Memory consumed during forward pass: {memory_consumed}MB (reasonable)")
+
+    # Validate calculated new_cache_size
+    new_cache_size = kv_cache_metrics.get("new_cache_size")
+    if new_cache_size and free_mem_post and current_cache_size:
+        expected_new_cache = int(
+            free_mem_post * 1024 * 1024 * expected_metrics["free_mem_ratio"] + current_cache_size
+        )
+        cache_size_diff = abs(new_cache_size - expected_new_cache) / expected_new_cache
+        assert cache_size_diff <= 0.01, (  # 1% tolerance for calculated value
+            f"Calculated new_cache_size mismatch: {new_cache_size} vs expected {expected_new_cache}"
+        )
+        print(f"  ✅ new_cache_size: {new_cache_size} bytes (calculation correct)")
+
+
+def extract_performance_metric(report_data, report_name="benchmark"):
+    """Extract performance metric from a benchmark report with validation."""
+    assert report_data is not None, f"Failed to capture {report_name} report"
+    assert "performance" in report_data, f"Performance metrics not found in {report_name} report"
+
+    tokens_per_sec = report_data["performance"].get("output_throughput_per_user_tok_s")
+    assert tokens_per_sec is not None, (
+        f"output_throughput_per_user_tok_s not found in {report_name} performance metrics"
+    )
+
+    return tokens_per_sec
+
+
+def validate_and_extract_kv_cache_metrics(report_data, free_mem_ratio, require_metrics=True):
+    """
+    Validate and extract KV cache metrics from report.
+
+    Args:
+        report_data: The benchmark report data
+        free_mem_ratio: Free memory ratio for calculating expected metrics
+        require_metrics: If True, fail with the detailed message; if False, print it, then fail.
+
+    Returns:
+        Tuple of (kv_cache_metrics, expected_metrics); an AssertionError is raised on failure
+    """
+    required_metrics = [
+        "current_cache_size",
+        "free_mem_pre_mb",
+        "free_mem_post_mb",
+        "new_cache_size",
     ]
-    result = runner.invoke(main, args, catch_exceptions=False)
-    assert result.exit_code == 0
+
+    # Extract KV cache metrics
+    kv_cache_metrics = report_data.get("kv_cache_metrics", {})
+
+    if not kv_cache_metrics:
+        message = (
+            "KV cache metrics not found! "
+            "The autodeploy backend must log memory statistics for this test to pass. "
+            f"Expected metrics: {', '.join(required_metrics)}"
+        )
+        if require_metrics:
+            assert False, f"REQUIRED {message}"
+        else:
+            print(f"ℹ️ {message}")
+            assert False, "KV cache metrics are missing"
+
+    # Check for missing metrics
+    missing_metrics = [metric for metric in required_metrics if metric not in kv_cache_metrics]
+
+    if missing_metrics:
+        message = (
+            f"Missing required KV cache metrics: {missing_metrics}. "
+            f"Found metrics: {list(kv_cache_metrics.keys())}. "
+            f"All of {required_metrics} are required for the test to pass."
+        )
+        if require_metrics:
+            assert False, message
+        else:
+            print(f"ℹ️ KV cache validation skipped - {message}")
+            assert False, "KV cache metrics are missing"
+
+    # Calculate expected metrics
+    expected_metrics = calculate_expected_kv_cache_metrics(free_mem_ratio)
+    assert expected_metrics, "Could not determine expected metrics for this GPU"
+
+    return kv_cache_metrics, expected_metrics
+
+
+def print_kv_cache_metrics(kv_cache_metrics):
+    """Print KV cache metrics in a formatted way."""
+    print("=== KV CACHE METRICS (DYNAMIC VALIDATION) ===")
+    for metric_name, actual_value in kv_cache_metrics.items():
+        if "mb" in metric_name.lower():
+            print(f"{metric_name}: {actual_value}MB")
+        else:
+            print(f"{metric_name}: {actual_value} bytes")
+
+
+def trtllm_bench_unified_comparison(
+    llm_root,  # noqa: F811
+    comparison_mode="backend",
+    free_mem_ratio=0.1,
+    num_hidden_layers=2,
+    max_batch_size=32,  # below this value the kv cache resizing is skipped
+    golden_tokens_per_sec=1400,
+    backend_relative_tolerance=0.2,
+    backend_absolute_tolerance=250.0,
+    golden_relative_tolerance=0.1,
+    golden_absolute_tolerance=5.0,
+):
+    """
+    Unified test that compares autodeploy backend performance in two modes:
+    - "backend": compares against pytorch backend performance
+    - "golden": compares against predefined golden performance values
+
+    Args:
+        llm_root: Root directory containing benchmarks/cpp/prepare_dataset.py (pytest fixture)
+        comparison_mode: Either "backend" or "golden" to determine comparison type
+        free_mem_ratio: Ratio of free memory to use for KV cache
+        num_hidden_layers: Number of hidden layers for the model
+        max_batch_size: Maximum batch size for benchmarking
+        golden_tokens_per_sec: Golden performance value in tokens/sec/user
+        backend_relative_tolerance: Relative tolerance for backend comparison
+        backend_absolute_tolerance: Absolute tolerance for backend comparison
+        golden_relative_tolerance: Relative tolerance for golden comparison
+        golden_absolute_tolerance: Absolute tolerance for golden comparison
+    """
+    model_name = _hf_model_dir_or_hub_id(
+        f"{llm_models_root()}/TinyLlama-1.1B-Chat-v1.0", "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
+    )
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        with open(f"{temp_dir}/extra_llm_api_options.yaml", "w") as f:
+            yaml.dump(
+                {
+                    "model_kwargs": {"num_hidden_layers": num_hidden_layers},
+                    "cuda_graph_batch_sizes": [1, 2, 4, 8, 16, 32],
+                    "compile_backend": "torch-opt",
+                    "free_mem_ratio": free_mem_ratio,
+                    "runtime": "trtllm",
+                },
+                f,
+            )
+
+        dataset_path = prepare_dataset(llm_root, temp_dir, model_name)
+
+        # Always run autodeploy backend
+        autodeploy_report_path = f"{temp_dir}/autodeploy_report.json"
+        print("=== RUNNING AUTODEPLOY BACKEND ===")
+        autodeploy_report = run_benchmark(
+            model_name,
+            dataset_path,
+            temp_dir,
+            "_autodeploy",
+            autodeploy_report_path,
+            max_batch_size,
+            num_hidden_layers,
+            free_mem_ratio,
+        )
+
+        # Extract autodeploy performance metrics
+        autodeploy_tokens_per_sec = extract_performance_metric(autodeploy_report, "autodeploy")
+
+        # Validate and extract KV cache metrics (required in both comparison modes)
+        kv_cache_metrics, expected_metrics = validate_and_extract_kv_cache_metrics(
+            autodeploy_report, free_mem_ratio, require_metrics=True
+        )
+
+        if comparison_mode == "backend":
+            # Backend comparison mode: also run pytorch backend
+            pytorch_report_path = f"{temp_dir}/pytorch_report.json"
+            print("=== RUNNING PYTORCH BACKEND ===")
+            pytorch_report = run_benchmark(
+                model_name,
+                dataset_path,
+                temp_dir,
+                "pytorch",
+                pytorch_report_path,
+                max_batch_size,
+                num_hidden_layers,
+                free_mem_ratio,
+            )
+
+            # Extract pytorch performance metrics
+            pytorch_tokens_per_sec = extract_performance_metric(pytorch_report, "pytorch")
+
+            # Compare backend performance
+            compare_backends_performance(
+                autodeploy_tokens_per_sec,
+                pytorch_tokens_per_sec,
+                relative_tolerance=backend_relative_tolerance,
+                absolute_tolerance=backend_absolute_tolerance,
+            )
+
+            # Validate KV cache metrics
+            validate_kv_cache_metrics_dynamic(kv_cache_metrics, expected_metrics)
+            print("✅ KV Cache Metrics validation passed")
+
+            print("=== BACKEND COMPARISON TEST PASSED ===")
+            print(f"Autodeploy: {autodeploy_tokens_per_sec:.2f} tokens/sec/user")
+            print(f"PyTorch: {pytorch_tokens_per_sec:.2f} tokens/sec/user")
+
+        elif comparison_mode == "golden":
+            # Golden comparison mode: compare against golden values
+            print("=== PERFORMANCE METRICS ===")
+            print(f"Measured performance: {autodeploy_tokens_per_sec:.2f} tokens/sec/user")
+            print(f"Golden performance: {golden_tokens_per_sec:.2f} tokens/sec/user")
+
+            # Print KV cache metrics
+            print_kv_cache_metrics(kv_cache_metrics)
+
+            # Performance validation
+            assert_performance_within_tolerance(
+                autodeploy_tokens_per_sec,
+                golden_tokens_per_sec,
+                relative_tolerance=golden_relative_tolerance,
+                absolute_tolerance=golden_absolute_tolerance,
+            )
+
+            # KV cache metrics validation
+            print(
+                f"Validating {len(kv_cache_metrics)} KV cache metrics against GPU-specific ranges..."
+            )
+            validate_kv_cache_metrics_dynamic(kv_cache_metrics, expected_metrics)
+
+            print("=== ALL TESTS PASSED ===")
+            print(f"Performance: ✅ {autodeploy_tokens_per_sec:.2f} tokens/sec/user within bounds")
+            print("KV Cache Metrics: ✅ All metrics within GPU-specific expected ranges")
+
+        else:
+            raise ValueError(
+                f"Invalid comparison_mode: {comparison_mode}. Must be 'backend' or 'golden'"
+            )
 
 
 def test_trtllm_bench(llm_root):  # noqa: F811
@@ -70,15 +591,20 @@ def test_trtllm_bench(llm_root):  # noqa: F811
     )
 
     with tempfile.TemporaryDirectory() as temp_dir:
-        with open(f"{temp_dir}/model_kwargs.yaml", "w") as f:
+        with open(f"{temp_dir}/extra_llm_api_options.yaml", "w") as f:
             yaml.dump(
                 {
                     "model_kwargs": {"num_hidden_layers": 2},
                     "cuda_graph_batch_sizes": [1, 2],
-                    "max_batch_size": 128,
                 },
                 f,
             )
 
         dataset_path = prepare_dataset(llm_root, temp_dir, model_name)
         run_benchmark(model_name, dataset_path, temp_dir)
+
+
+@pytest.mark.no_xdist
+def test_trtllm_bench_backend_comparison(llm_root):  # noqa: F811
+    """Test that compares autodeploy backend performance against pytorch backend."""
+    trtllm_bench_unified_comparison(llm_root, comparison_mode="backend")

From 62d6c98d68b5a83f05f5c0d04d6fbb056fc19806 Mon Sep 17 00:00:00 2001
From: Yiqing Yan <yiqingy@nvidia.com>
Date: Mon, 11 Aug 2025 14:38:05 +0800
Subject: [PATCH 02/15] [TRTLLM-5633][infra] Force set changed file diff to
 empty string for post-merge CI (#6777)

Signed-off-by: Yiqing Yan <yiqingy@nvidia.com>
---
 jenkins/L0_MergeRequest.groovy | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/jenkins/L0_MergeRequest.groovy b/jenkins/L0_MergeRequest.groovy
index 95522b2bf26..d00dd66d534 100644
--- a/jenkins/L0_MergeRequest.groovy
+++ b/jenkins/L0_MergeRequest.groovy
@@ -591,6 +591,12 @@ def getMergeRequestChangedFileList(pipeline, globalVars) {
 }
 
 def getMergeRequestOneFileChanges(pipeline, globalVars, filePath) {
+    def isOfficialPostMergeJob = (env.JOB_NAME ==~ /.*PostMerge.*/)
+    if (env.alternativeTRT || isOfficialPostMergeJob) {
+        pipeline.echo("Force set changed file diff to empty string.")
+        return ""
+    }
+
     def githubPrApiUrl = globalVars[GITHUB_PR_API_URL]
     def diff = ""
 

From 9c358c26e486db89de44ed55e3d210eb198a6556 Mon Sep 17 00:00:00 2001
From: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com>
Date: Mon, 11 Aug 2025 14:39:58 +0800
Subject: [PATCH 03/15] [None][chore] remove closed bugs (#6772)

Signed-off-by: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com>
Co-authored-by: Larry <197874197+LarryXFly@users.noreply.github.com>
---
 tests/integration/test_lists/waives.txt | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 71643434923..fb5964279e6 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -256,7 +256,6 @@ unittest/trt/attention/test_gpt_attention.py -k "partition3" SKIP (https://nvbug
 test_e2e.py::test_ptp_quickstart_multimodal[qwen2-vl-7b-instruct-Qwen2-VL-7B-Instruct-image-False] SKIP (https://nvbugs/5414909)
 unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[pp1-ep1-disable_adp-enable_graph-tp8-trtllm-scout] SKIP (https://nvbugs/5418673)
 unittest/_torch/multi_gpu_modeling/test_llama4.py::test_llama4[pp1-ep4-enable_adp-enable_graph-tp8-trtllm-scout] SKIP (https://nvbugs/5418673)
-examples/test_llama.py::test_llm_api_lookahead_decoding_1gpu[Llama-3.1-8B-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct] SKIP (https://nvbugs/5419066)
 examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5141288)
 examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen2_vl_7b_instruct-enable_gemm_plugin-enable_weight_only] SKIP (https://nvbugs/5419067)
 examples/test_qwen.py::test_llm_qwen_awq_single_gpu_summary[qwen2_vl_7b_instruct-nb:4] SKIP (https://nvbugs/5419068)

From d6ad4a9d5b0e1a2c8211f3688985d125459c9cde Mon Sep 17 00:00:00 2001
From: Emma Qiao <qqiao@nvidia.com>
Date: Mon, 11 Aug 2025 15:16:25 +0800
Subject: [PATCH 04/15] [None][infra] Waive failed tests on main 0811 (#6778)

Signed-off-by: qqiao <qqiao@nvidia.com>
---
 tests/integration/test_lists/waives.txt   | 1 +
 tests/unittest/llmapi/test_llm_pytorch.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index fb5964279e6..026eeeca5c4 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -293,3 +293,4 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] SKIP (https://nvbugs/5445466)
 disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_genpp4[llama-3.1-8b] SKIP (https://nvbugs/5445642)
 examples/test_qwen2audio.py::test_llm_qwen2audio_single_gpu[qwen2_audio_7b_instruct] SKIP (https://nvbugs/5447530)
+examples/test_nemotron_nas.py::test_nemotron_nas_summary_2gpu[DeciLM-7B] SKIP (https://nvbugs/5444636)
diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py
index 541965b588f..e519df1cf2c 100644
--- a/tests/unittest/llmapi/test_llm_pytorch.py
+++ b/tests/unittest/llmapi/test_llm_pytorch.py
@@ -471,6 +471,7 @@ def test_llama_7b_lora_config_overrides_peft_cache_config():
 
 # TODO smor: currently Nemotron-Super-49B-v1 with LoRA memory consumption is overly high
 # https://jirasw.nvidia.com/browse/TRTLLM-5045
+@pytest.mark.skip(reason="https://nvbugs/5448464")
 @skip_gpu_memory_less_than_138gb
 def test_nemotron_nas_lora() -> None:
     lora_config = LoraConfig(lora_dir=[

From 9a8195ef88b7f8279e3608e32f19f2959e68c671 Mon Sep 17 00:00:00 2001
From: Martin Marciniszyn Mehringer
 <11665257+MartinMarciniszyn@users.noreply.github.com>
Date: Mon, 11 Aug 2025 00:18:17 -0700
Subject: [PATCH 05/15] fix: Ensure that Python stub generation works against
 libnvidia-ml stubs (#6188)

Signed-off-by: Martin Marciniszyn Mehringer <11665257+MartinMarciniszyn@users.noreply.github.com>
---
 cpp/tensorrt_llm/nanobind/CMakeLists.txt |   2 +-
 cpp/tensorrt_llm/pybind/CMakeLists.txt   |   2 +-
 docker/Dockerfile.multi                  |   5 +-
 scripts/build_wheel.py                   | 185 +++++++++++++----------
 4 files changed, 113 insertions(+), 81 deletions(-)

diff --git a/cpp/tensorrt_llm/nanobind/CMakeLists.txt b/cpp/tensorrt_llm/nanobind/CMakeLists.txt
index aa5b3cf45da..af657a625e2 100755
--- a/cpp/tensorrt_llm/nanobind/CMakeLists.txt
+++ b/cpp/tensorrt_llm/nanobind/CMakeLists.txt
@@ -52,6 +52,6 @@ if(NOT WIN32)
     ${TRTLLM_NB_MODULE}
     PROPERTIES
       LINK_FLAGS
-      "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' -Wl,-rpath,'${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib/stubs' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
+      "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
   )
 endif()
diff --git a/cpp/tensorrt_llm/pybind/CMakeLists.txt b/cpp/tensorrt_llm/pybind/CMakeLists.txt
index b4809d5135e..bb1d87f9d4b 100755
--- a/cpp/tensorrt_llm/pybind/CMakeLists.txt
+++ b/cpp/tensorrt_llm/pybind/CMakeLists.txt
@@ -53,6 +53,6 @@ if(NOT WIN32)
     ${TRTLLM_PYBIND_MODULE}
     PROPERTIES
       LINK_FLAGS
-      "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' -Wl,-rpath,'${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib/stubs' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
+      "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
   )
 endif()
diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi
index c832481da9f..eeafc8f4a65 100644
--- a/docker/Dockerfile.multi
+++ b/docker/Dockerfile.multi
@@ -71,8 +71,9 @@ RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh
 ENV PYTORCH_CUDA_ALLOC_CONF="garbage_collection_threshold:0.99999"
 
 # Install OpenCV with FFMPEG support
-RUN pip3 uninstall -y opencv && rm -rf /usr/local/lib/python3*/dist-packages/cv2/
-RUN pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir
+RUN pip3 uninstall -y opencv && \
+    rm -rf /usr/local/lib/python3*/dist-packages/cv2/ && \
+    pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir
 
 # WARs against security issues inherited from pytorch:25.06
 # * https://github.com/advisories/GHSA-8qvm-5x2c-j2w7
diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py
index 52abdbcb844..3041e684c96 100755
--- a/scripts/build_wheel.py
+++ b/scripts/build_wheel.py
@@ -27,7 +27,7 @@
 from shutil import copy, copytree, rmtree
 from subprocess import DEVNULL, CalledProcessError, check_output, run
 from textwrap import dedent
-from typing import List
+from typing import Sequence
 
 try:
     from packaging.requirements import Requirement
@@ -120,7 +120,8 @@ def create_venv(project_dir: Path):
     return venv_prefix
 
 
-def setup_venv(project_dir: Path, requirements_file: Path, no_venv: bool):
+def setup_venv(project_dir: Path, requirements_file: Path,
+               no_venv: bool) -> tuple[Path, Path]:
     """Creates/updates a venv and installs requirements.
 
     Args:
@@ -279,6 +280,103 @@ def generate_fmha_cu(project_dir, venv_python):
     os.chdir(project_dir)
 
 
+def create_cuda_stub_links(cuda_stub_dir: str):
+    """
+  Creates symbolic links for CUDA stub libraries in the provided directory.
+
+  Args:
+      cuda_stub_dir (str): Path to the directory containing CUDA stubs.
+  """
+    cuda_stub_path = Path(cuda_stub_dir)
+    if not cuda_stub_path.exists():
+        raise RuntimeError(
+            f"CUDA stub directory '{cuda_stub_dir}' does not exist.")
+
+    shared_objects = ["cuda.so",
+                      "nvidia-ml.so"]  # List of shared object names to process.
+
+    for lib_name in shared_objects:
+        # Define the full paths for the library (.so) and its versioned link (.so.1).
+        so = cuda_stub_path / f"lib{lib_name}"  # e.g., libcuda.so
+        so_versioned = cuda_stub_path / f"lib{lib_name}.1"  # e.g., libcuda.so.1
+
+        # Check if the library exists and the versioned link does not.
+        if so.exists() and not so_versioned.exists():
+            try:
+                # Attempt to create the symbolic link.
+                so_versioned.symlink_to(so)
+            except PermissionError:
+                # Handle permission errors by attempting to use `sudo` to create the link.
+                try:
+                    build_run(f"sudo ln -s {str(so)} {str(so_versioned)}")
+                except CalledProcessError as sudo_error:
+                    print(
+                        f"Failed to create symbolic link even with sudo: {sudo_error}"
+                    )
+
+
+def generate_python_stubs_linux(binding_type: str, venv_python: Path,
+                                deep_ep: bool):
+    is_nanobind = binding_type == "nanobind"
+    package = "nanobind" if is_nanobind else "pybind11-stubgen"
+    build_run(f"\"{venv_python}\" -m pip install {package}")
+
+    env_stub_gen = os.environ.copy()
+    cuda_home_dir = env_stub_gen.get("CUDA_HOME") or env_stub_gen.get(
+        "CUDA_PATH") or "/usr/local/cuda"
+    cuda_stub_dir = f"{cuda_home_dir}/lib64/stubs"
+    ld_library_path = env_stub_gen.get("LD_LIBRARY_PATH")
+    if Path(cuda_stub_dir).exists():
+        # Create symbolic links for the CUDA stubs
+        create_cuda_stub_links(cuda_stub_dir)
+        env_stub_gen[
+            "LD_LIBRARY_PATH"] = f"{ld_library_path}:{cuda_stub_dir}" if ld_library_path else cuda_stub_dir
+    if is_nanobind:
+        build_run(f"\"{venv_python}\" -m nanobind.stubgen -m bindings -O .",
+                  env=env_stub_gen)
+    else:
+        build_run(
+            f"\"{venv_python}\" -m pybind11_stubgen -o . bindings --exit-code",
+            env=env_stub_gen)
+        build_run(
+            f"\"{venv_python}\" -m pybind11_stubgen -o . deep_gemm_cpp_tllm --exit-code",
+            env=env_stub_gen)
+        if deep_ep:
+            build_run(
+                f"\"{venv_python}\" -m pybind11_stubgen -o . deep_ep_cpp_tllm --exit-code",
+                env=env_stub_gen)
+
+
+def generate_python_stubs_windows(binding_type: str, venv_python: Path,
+                                  pkg_dir: Path, lib_dir: Path):
+    if binding_type == "nanobind":
+        print("Windows not yet supported for nanobind stubs")
+        exit(1)
+    else:
+        build_run(f"\"{venv_python}\" -m pip install pybind11-stubgen")
+        stubgen = "stubgen.py"
+        stubgen_contents = """
+                        # Loading torch, trt before bindings is required to avoid import errors on windows.
+                        # isort: off
+                        import torch
+                        import tensorrt as trt
+                        # isort: on
+                        import os
+                        import platform
+
+                        from pybind11_stubgen import main
+
+                        if __name__ == "__main__":
+                            # Load dlls from `libs` directory before launching bindings.
+                            if platform.system() == "Windows":
+                                os.add_dll_directory(r\"{lib_dir}\")
+                            main()
+                        """.format(lib_dir=lib_dir)
+        (pkg_dir / stubgen).write_text(dedent(stubgen_contents))
+        build_run(f"\"{venv_python}\" {stubgen} -o . bindings")
+        (pkg_dir / stubgen).unlink()
+
+
 def main(*,
          build_type: str = "Release",
          generator: str = "",
@@ -286,7 +384,7 @@ def main(*,
          dist_dir: Path = None,
          cuda_architectures: str = None,
          job_count: int = None,
-         extra_cmake_vars: List[str] = list(),
+         extra_cmake_vars: Sequence[str] = tuple(),
          extra_make_targets: str = "",
          trt_root: str = '/usr/local/tensorrt',
          nccl_root: str = None,
@@ -361,7 +459,7 @@ def main(*,
 
     if on_windows:
         # Windows does not support multi-device currently.
-        extra_cmake_vars.extend(["ENABLE_MULTI_DEVICE=0"])
+        extra_cmake_vars = [*extra_cmake_vars, "ENABLE_MULTI_DEVICE=0"]
 
         # The Ninja CMake generator is used for our Windows build
         # (Easier than MSBuild to make compatible with our Docker image)
@@ -703,81 +801,14 @@ def get_binding_lib(subdirectory, name):
                      dirs_exist_ok=True)
 
         if not skip_stubs:
-            with working_directory(project_dir):
-                if binding_type == "nanobind":
-                    build_run(f"\"{venv_python}\" -m pip install nanobind")
-                else:
-                    build_run(
-                        f"\"{venv_python}\" -m pip install pybind11-stubgen")
             with working_directory(pkg_dir):
                 if on_windows:
-                    if binding_type == "nanobind":
-                        print("Windows not yet supported for nanobind stubs")
-                        exit(1)
-                    else:
-                        stubgen = "stubgen.py"
-                        stubgen_contents = """
-                        # Loading torch, trt before bindings is required to avoid import errors on windows.
-                        # isort: off
-                        import torch
-                        import tensorrt as trt
-                        # isort: on
-                        import os
-                        import platform
-
-                        from pybind11_stubgen import main
-
-                        if __name__ == "__main__":
-                            # Load dlls from `libs` directory before launching bindings.
-                            if platform.system() == "Windows":
-                                os.add_dll_directory(r\"{lib_dir}\")
-                            main()
-                        """.format(lib_dir=lib_dir)
-                        (pkg_dir / stubgen).write_text(dedent(stubgen_contents))
-                        build_run(f"\"{venv_python}\" {stubgen} -o . bindings")
-                        (pkg_dir / stubgen).unlink()
-                else:
-                    env_ld = os.environ.copy()
-
-                    new_library_path = "/usr/local/cuda/compat:/usr/local/cuda/compat/lib:/usr/local/cuda/compat/lib.real"
-                    if 'LD_LIBRARY_PATH' in env_ld:
-                        new_library_path += f":{env_ld['LD_LIBRARY_PATH']}"
-
-                    result = build_run("find /usr -name *libnvidia-ml.so*",
-                                       capture_output=True,
-                                       text=True)
-                    assert result.returncode == 0, f"Failed to run find *libnvidia-ml.so*: {result.stderr}"
-
-                    # Build containers only contain stub version of libnvidia-ml.so and not the real version.
-                    # If real version not in system, we need to create symbolic link to stub version to prevent import errors.
-                    if "libnvidia-ml.so.1" not in result.stdout:
-                        if "libnvidia-ml.so" in result.stdout:
-                            line = result.stdout.splitlines()[0]
-                            path = os.path.dirname(line)
-                            new_library_path += f":{path}"
-                            build_run(f"ln -s {line} {path}/libnvidia-ml.so.1")
-                        else:
-                            print(
-                                f"Failed to find libnvidia-ml.so: {result.stderr}",
-                                file=sys.stderr)
-                            exit(1)
-
-                    env_ld["LD_LIBRARY_PATH"] = new_library_path
-                    if binding_type == "nanobind":
-                        build_run(
-                            f"\"{venv_python}\" -m nanobind.stubgen -m bindings -O .",
-                            env=env_ld)
-                    else:
-                        build_run(
-                            f"\"{venv_python}\" -m pybind11_stubgen -o . bindings --exit-code",
-                            env=env_ld)
-                        if deep_ep_cuda_architectures:
-                            build_run(
-                                f"\"{venv_python}\" -m pybind11_stubgen -o . deep_ep_cpp_tllm --exit-code",
-                                env=env_ld)
-                        build_run(
-                            f"\"{venv_python}\" -m pybind11_stubgen -o . deep_gemm_cpp_tllm --exit-code",
-                            env=env_ld)
+                    generate_python_stubs_windows(binding_type, venv_python,
+                                                  pkg_dir, lib_dir)
+                else:  # on linux
+                    generate_python_stubs_linux(
+                        binding_type, venv_python,
+                        bool(deep_ep_cuda_architectures))
 
     if not skip_building_wheel:
         if dist_dir is None:

From 83dbc6c75dd1b107bfbdc5d7af943ef3db78be28 Mon Sep 17 00:00:00 2001
From: bhsueh_NV <11360707+byshiue@users.noreply.github.com>
Date: Mon, 11 Aug 2025 16:14:52 +0800
Subject: [PATCH 06/15] [TRTLLM-5532][feat] store the block of context request
 into kv cache (#6683)

Signed-off-by: bhsueh <11360707+byshiue@users.noreply.github.com>
---
 cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp  | 9 ++++++---
 tensorrt_llm/_torch/pyexecutor/resource_manager.py | 4 ++++
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
index c032c80757c..d5fa982a37a 100644
--- a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
+++ b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
@@ -2043,10 +2043,13 @@ void KVCacheManager::addSequence(
 void KVCacheManager::storeContextBlocks(LlmRequest const& llmRequest)
 {
     auto const requestId = llmRequest.mRequestId;
-    auto& sequence = getSequence(requestId);
-    if (mEnableBlockReuse && !sequence.isCyclic() && !llmRequest.isDummyRequest())
+    if (mSequences.find(requestId) != mSequences.end())
     {
-        mBlockManager.storeContextBlocks(sequence, llmRequest);
+        auto& sequence = getSequence(requestId);
+        if (mEnableBlockReuse && !sequence.isCyclic() && !llmRequest.isDummyRequest())
+        {
+            mBlockManager.storeContextBlocks(sequence, llmRequest);
+        }
     }
 }
 
diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py
index b08c106e7e1..89be7d40e35 100644
--- a/tensorrt_llm/_torch/pyexecutor/resource_manager.py
+++ b/tensorrt_llm/_torch/pyexecutor/resource_manager.py
@@ -492,6 +492,10 @@ def update_resources(self, scheduled_batch: ScheduledRequests):
                 if request.py_rewind_len > 0:
                     self.rewind_kv_cache(request, request.py_rewind_len)
 
+        # For context requests, we store the blocks for reuse.
+        for request in scheduled_batch.context_requests:
+            self.impl.store_context_blocks(request)
+
     def free_resources(self, request: LlmRequest):
         self.impl.remove_sequence(request.py_request_id, request)
 

From a2e9153cb0abba0e5ba4f47404f41cdaf40aa4e4 Mon Sep 17 00:00:00 2001
From: Liao Lanyu <108499334+lancelly@users.noreply.github.com>
Date: Mon, 11 Aug 2025 16:25:41 +0800
Subject: [PATCH 07/15] [None][doc] Add K2 tool calling examples (#6667)

Signed-off-by: Lanyu Liao <lancelly@users.noreply.github.com>
Co-authored-by: Lanyu Liao <lancelly@users.noreply.github.com>
---
 examples/models/core/kimi_k2/README.md        | 127 +++++++++++
 .../kimi_k2/kimi_k2_tool_calling_example.py   | 201 ++++++++++++++++++
 2 files changed, 328 insertions(+)
 create mode 100644 examples/models/core/kimi_k2/README.md
 create mode 100644 examples/models/core/kimi_k2/kimi_k2_tool_calling_example.py

diff --git a/examples/models/core/kimi_k2/README.md b/examples/models/core/kimi_k2/README.md
new file mode 100644
index 00000000000..1dd3e353c5a
--- /dev/null
+++ b/examples/models/core/kimi_k2/README.md
@@ -0,0 +1,127 @@
+# K2 (Kimi-K2-Instruct)
+
+## Overview
+
+Kimi K2 is Moonshot AI's Mixture-of-Experts model with 32 billion activated parameters and 1 trillion total parameters. It achieves state-of-the-art performance in frontier knowledge, math, and coding among non-thinking models. Notably, K2 also excels in agentic capabilities, demonstrating outstanding performance across complex, multi-step tasks.
+
+## Prerequisites for Tool Calling in Kimi-K2
+
+K2 model supports tool calling functionality. The official guide can be found at: [tool_call_guidance](https://huggingface.co/moonshotai/Kimi-K2-Instruct/blob/main/docs/tool_call_guidance.md)
+
+As described in the official guide, a tool calling process in Kimi-K2 includes:
+1. Passing function descriptions to Kimi-K2.
+2. Kimi-K2 decides to make a function call and returns the necessary information for the function call to the user.
+3. The user performs the function call, collects the call results, and passes the function call results to Kimi-K2.
+4. Kimi-K2 continues to generate content based on the function call results until the model believes it has obtained sufficient information to respond to the user.
+
+Tools are the primary way to define callable functions for K2. Each tool requires:
+- A unique name
+- A clear description
+- A JSON schema defining the expected parameters
+
+A possible example of a tool description (you may refer to [Using tools](https://huggingface.co/docs/hugs/guides/function-calling) for more information) is as follows:
+```python
+# Collect the tool descriptions in tools
+tools = [{
+    "type": "function",
+    "function": {
+        "name": "get_weather",
+        "description": "Get weather information. Call this tool when the user needs to get weather information",
+         "parameters": {
+              "type": "object",
+              "required": ["location"],
+              "properties": {
+                  "location": {
+                      "type": "string",
+                      "description": "location name",
+                }
+            }
+        }
+    }
+}]
+```
+
+Kimi currently supports two main approaches for tool calling:
+1. *Use openai.OpenAI to send messages to Kimi-K2 together with tool descriptions.*
+In this mode, the descriptions of the tools are passed as an argument to `client.chat.completions.create`, and the tool-call details can be read directly from the corresponding fields in the response.
+2. *Manually parse the tool-call requests from the outputs generated by Kimi-K2.*
+The tool call requests generated by Kimi-K2 are wrapped by `<|tool_calls_section_begin|>` and `<|tool_calls_section_end|>`, with each tool call wrapped by `<|tool_call_begin|>` and `<|tool_call_end|>`. The tool ID and arguments are separated by `<|tool_call_argument_begin|>`. The format of the tool ID is `functions.{func_name}:{idx}`, from which we can parse the function name.
+
+**Note that TensorRT-LLM does not support the first approach for now. If you deploy K2 with TensorRT-LLM, you need to manually parse the tool-call requests from the outputs.**
+
+The next section is an example that deploys the K2 model using TensorRT-LLM and then manually parses the tool-call results.
+
+## Example: Manually Parsing Tool-Call Requests from Kimi-K2 Outputs
+
+First, launch a server using trtllm-serve:
+
+```bash
+cat > ./extra_llm_api_options.yaml <<EOF
+# define your extra parameters here
+cuda_graph_config:
+  batch_sizes:
+    - 1
+    - 4
+enable_attention_dp: False
+EOF
+
+trtllm-serve  \
+    --model /path_to_model/Kimi-K2-Instruct/ \
+    --backend pytorch \
+    --tp_size 8 \
+    --ep_size 8 \
+    --extra_llm_api_options extra_llm_api_options.yaml
+```
+
+Run the script [kimi_k2_tool_calling_example.py](./kimi_k2_tool_calling_example.py), which performs the following steps:
+
+1. The client provides tool definitions and a user prompt to the LLM server.
+2. Instead of answering the prompt directly, the LLM server responds with a selected tool and corresponding arguments based on the user prompt.
+3. The client calls the selected tool with the arguments and retrieves the results.
+
+For example, you can query "What's the weather like in shanghai today?" with the following command:
+
+```bash
+python kimi_k2_tool_calling_example.py \
+    --model "moonshotai/Kimi-K2-Instruct" \
+    --prompt "What's the weather like in shanghai today?"
+```
+
+The output would look similar to:
+
+```txt
+[The original output from Kimi-K2]: <|tool_calls_section_begin|>
+<|tool_call_begin|>functions.get_weather:0<|tool_call_argument_begin|>{"location": "shanghai"}<|tool_call_end|>
+<|tool_calls_section_end|>user
+
+[The tool-call requests parsed from the output]: [{'id': 'functions.get_weather:0', 'type': 'function', 'function': {'name': 'get_weather', 'arguments': '{"location": "shanghai"}'}}]
+
+[Tool call result]: tool_name=get_weather, tool_result=Cloudy
+```
+
+The tool call works successfully:
+- In `[The original output from Kimi-K2]`, the LLM selects the correct tool `get_weather` and provides the appropriate arguments.
+- In `[The tool-call requests parsed from the output]`, the client parses the LLM response.
+- In `[Tool call result]`, the client executes the tool function and gets the result.
+
+Let's try another query, "What's the weather like in beijing today?", using a predefined system prompt to specify the output format as shown below.
+
+```bash
+python kimi_k2_tool_calling_example.py \
+    --model "moonshotai/Kimi-K2-Instruct" \
+    --prompt "What's the weather like in beijing today?" \
+    --specify_output_format
+```
+
+The output would look like:
+
+```txt
+[The original output from Kimi-K2]: [get_weather(location='beijing')]user
+
+[The tool-call requests parsed from the output]: [{'type': 'function', 'function': {'name': 'get_weather', 'arguments': {'location': 'beijing'}}}]
+
+[Tool call result]: tool_name=get_weather, tool_result=Sunny
+```
+Once again, the tool call works successfully, and this time the original output from Kimi-K2 follows the format specified in the system prompt.
+
+**Note that, without guided decoding or other deterministic tool adapters, K2 sometimes deviates from the specified output format. Because TensorRT-LLM does not support K2 with guided decoding for now, you have to parse the tool calls carefully from the raw model output to ensure they meet the required format.**
diff --git a/examples/models/core/kimi_k2/kimi_k2_tool_calling_example.py b/examples/models/core/kimi_k2/kimi_k2_tool_calling_example.py
new file mode 100644
index 00000000000..28505477041
--- /dev/null
+++ b/examples/models/core/kimi_k2/kimi_k2_tool_calling_example.py
@@ -0,0 +1,201 @@
+#
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import argparse
+import ast
+import json
+import re
+
+from openai import OpenAI
+
+SPECIFY_OUTPUT_FORMAT_PROMPT = """You are an AI assistant with the role name "assistant." \
+Based on the provided API specifications and conversation history from steps 1 to t, \
+generate the API requests that the assistant should call in step t+1. \
+The API requests should be output in the format [api_name(key1='value1', key2='value2', ...)], \
+replacing api_name with the actual API name, key1, key2, etc., with the actual parameter names, \
+and value1, value2, etc., with the actual parameter values. The output should start with a square bracket "[" and end with a square bracket "]".
+If there are multiple API requests, separate them with commas, for example: \
+[api_name(key1='value1', key2='value2', ...), api_name(key1='value1', key2='value2', ...), ...]. \
+Do not include any other explanations, prompts, or API call results in the output.
+If the API parameter description does not specify otherwise, the parameter is optional \
+(parameters mentioned in the user input need to be included in the output; if not mentioned, they do not need to be included).
+If the API parameter description does not specify the required format for the value, use the user's original text for the parameter value. \
+If the API requires no parameters, output the API request directly in the format [api_name()], and do not invent any nonexistent parameter names.
+
+API Specifications:
+{tools}"""
+
+NOT_SPECIFY_OUTPUT_FORMAT_PROMPT = """Important: Only give the tool call requests, \
+do not include any other explanations, prompts, or API call results in the output.
+The tool call requests generated by you are wrapped by \
+<|tool_calls_section_begin|> and <|tool_calls_section_end|>, with each tool call wrapped by <|tool_call_begin|> and <|tool_call_end|>. \
+The tool ID and arguments are separated by <|tool_call_argument_begin|>. The format of the tool ID is functions.func_name:idx, \
+from which we can parse the function name.
+
+API Specifications:
+{tools}"""
+
+
+def get_weather(location: str):
+    if location.lower() == "beijing":
+        return "Sunny"
+    elif location.lower() == "shanghai":
+        return "Cloudy"
+    else:
+        return "Rainy"
+
+
+# Tool name->object mapping for easy calling later
+tool_map = {"get_weather": get_weather}
+
+
+# ref: https://huggingface.co/moonshotai/Kimi-K2-Instruct/blob/main/docs/tool_call_guidance.md
+def extract_tool_call_info(tool_call_rsp: str):
+    """Parse Kimi-K2 tool-call markers in the response into a list of tool-call dicts."""
+    pattern = r"<\|tool_calls_section_begin\|>(.*?)<\|tool_calls_section_end\|>"
+    tool_calls_sections = re.findall(pattern, tool_call_rsp, re.DOTALL)
+    if not tool_calls_sections:
+        # No complete section: no tool calls, or output cut off before the end marker
+        return []
+
+    # Extract multiple tool calls
+    func_call_pattern = r"<\|tool_call_begin\|>\s*(?P<tool_call_id>[\w\.]+:\d+)\s*<\|tool_call_argument_begin\|>\s*(?P<function_arguments>.*?)\s*<\|tool_call_end\|>"
+    tool_calls = []
+    for match in re.findall(func_call_pattern, tool_calls_sections[0],
+                            re.DOTALL):
+        function_id, function_args = match
+        # function_id: functions.get_weather:0
+        function_name = function_id.split('.')[1].split(':')[0]
+        tool_calls.append({
+            "id": function_id,
+            "type": "function",
+            "function": {
+                "name": function_name,
+                "arguments": function_args
+            }
+        })
+    return tool_calls
+
+
+def parse_specified_format_tool_calls(text: str):
+    pattern = re.compile(r'(\w+)\s*\(([^)]*)\)')
+    tool_calls = []
+
+    for m in pattern.finditer(text):
+        api_name, kv_body = m.group(1), m.group(2)
+
+        kv_pattern = re.compile(r'(\w+)\s*=\s*([^,]+)')
+        kwargs = {}
+        for k, v in kv_pattern.findall(kv_body):
+            try:
+                kwargs[k] = ast.literal_eval(v.strip())
+            except Exception:
+                kwargs[k] = v.strip()
+
+        tool_calls.append({
+            "type": "function",
+            "function": {
+                "name": api_name,
+                "arguments": kwargs
+            }
+        })
+
+    return tool_calls
+
+
+def get_tools():
+    # Collect the tool descriptions in tools
+    return [{
+        "type": "function",
+        "function": {
+            "name": "get_weather",
+            "description":
+            "Get weather information. Call this tool when the user needs to get weather information",
+            "parameters": {
+                "type": "object",
+                "required": ["location"],
+                "properties": {
+                    "location": {
+                        "type": "string",
+                        "description": "Location name",
+                    }
+                }
+            }
+        }
+    }]
+
+
+def get_tool_call_requests(args, client):
+    """Query the server with the prompt; return (parsed tool calls, messages sent)."""
+    # Choose the template first so that "{tools}" is filled in for BOTH prompts.
+    template = SPECIFY_OUTPUT_FORMAT_PROMPT if args.specify_output_format else NOT_SPECIFY_OUTPUT_FORMAT_PROMPT
+    system_prompt = template.format(tools=get_tools())
+    messages = [{
+        "role": "system",
+        "content": system_prompt
+    }, {
+        "role": "user",
+        "content": args.prompt
+    }]
+
+    response = client.chat.completions.create(model=args.model,
+                                              messages=messages,
+                                              max_tokens=256,
+                                              temperature=0.0)
+
+    output = response.choices[0].message.content
+    tool_calls = parse_specified_format_tool_calls(
+        output) if args.specify_output_format else extract_tool_call_info(
+            output)
+    print(f"[The original output from Kimi-K2]: {output}\n")
+    print(f"[The tool-call requests parsed from the output]: {tool_calls}\n")
+    return tool_calls, messages
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model",
+                        type=str,
+                        default="moonshotai/Kimi-K2-Instruct")
+    parser.add_argument("--prompt",
+                        type=str,
+                        default="What's the weather like in Shanghai today?")
+    parser.add_argument("--specify_output_format",
+                        action="store_true",
+                        default=False)
+
+    args = parser.parse_args()
+
+    # start trt-llm server before running this script
+    client = OpenAI(
+        api_key="tensorrt_llm",
+        base_url="http://localhost:8000/v1",
+    )
+
+    tool_calls, messages = get_tool_call_requests(args, client)
+
+    for tool_call in tool_calls:
+        tool_name = tool_call['function']['name']
+        if args.specify_output_format:
+            tool_arguments = tool_call['function']['arguments']
+        else:
+            tool_arguments = json.loads(tool_call['function']['arguments'])
+        tool_function = tool_map[tool_name]
+        tool_result = tool_function(**tool_arguments)
+        print(
+            f"[Tool call result]: tool_name={tool_name}, tool_result={tool_result}\n"
+        )

From c9f216fe5f490ac7f7b2a2111484e152ad580775 Mon Sep 17 00:00:00 2001
From: Bo Deng <deemod@nvidia.com>
Date: Tue, 5 Aug 2025 07:57:47 +0000
Subject: [PATCH 08/15] [TRTLLM-6675][Infra] nixl doc and test completion - I

Signed-off-by: Bo Deng <deemod@nvidia.com>
---
 .../serve/scripts/benchmark_serving.py        |   2 +-
 .../defs/disaggregated/test_disaggregated.py  | 202 +++++++++++++++++-
 .../test_lists/test-db/l0_dgx_h100.yml        |   4 +
 3 files changed, 204 insertions(+), 4 deletions(-)

diff --git a/tensorrt_llm/serve/scripts/benchmark_serving.py b/tensorrt_llm/serve/scripts/benchmark_serving.py
index 1aeb87554d9..303688f0017 100644
--- a/tensorrt_llm/serve/scripts/benchmark_serving.py
+++ b/tensorrt_llm/serve/scripts/benchmark_serving.py
@@ -581,7 +581,7 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
     pt_records = convert_to_pytorch_benchmark_format(
         args=args,
         metrics={k: [results[k]]
-                 for k in metrics},
+                 for k in metrics if k in results},
         extra_info={
             k: results[k]
             for k in results if k not in metrics and k not in ignored_metrics
diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py
index 2a961553905..c3819b6e1ad 100644
--- a/tests/integration/defs/disaggregated/test_disaggregated.py
+++ b/tests/integration/defs/disaggregated/test_disaggregated.py
@@ -14,18 +14,23 @@
 # limitations under the License.
 
 import os
+import re
 import subprocess
 
 import pytest
-from defs.conftest import skip_arm, skip_no_hopper
-from defs.trt_test_alternative import check_call, popen
+import yaml
+from defs.conftest import llm_models_root, skip_arm, skip_no_hopper
+from defs.trt_test_alternative import check_call, check_output, popen
 
 from tensorrt_llm.logger import logger
 
 
 def cleanup_output_files():
     """Clean up output files from previous runs."""
-    for file in ['output.json', 'output_streaming.json']:
+    for file in [
+            'output.json', 'output_streaming.json', 'ucx_config.yaml',
+            'nixl_config.yaml'
+    ]:
         try:
             os.remove(file)
         except FileNotFoundError:
@@ -1051,3 +1056,194 @@ def test_disaggregated_deepseek_v3_lite_fp8_tp1_two_mtp(
                            "deepseek_v3_lite_fp8_tp1_two_mtp",
                            env=llm_venv._new_env,
                            cwd=llm_venv.get_working_directory())
+
+
+@pytest.fixture(scope="module")
+def benchmark_root():
+    llm_root = os.getenv("LLM_ROOT")
+    return os.path.join(llm_root, "tensorrt_llm", "serve", "scripts")
+
+
+@pytest.fixture(scope="module")
+def shared_gpt_path():
+    DEFAULT_LLM_MODEL_ROOT = os.path.join("/scratch.trt_llm_data", "llm-models")
+    LLM_MODELS_ROOT = os.environ.get("LLM_MODELS_ROOT", DEFAULT_LLM_MODEL_ROOT)
+    return os.path.join(LLM_MODELS_ROOT, "datasets",
+                        "ShareGPT_V3_unfiltered_cleaned_split.json")
+
+
+@pytest.fixture(scope="function")
+def benchmark_model_root(request):
+    models_root = llm_models_root()
+    if (request.param == "DeepSeek-V3-Lite-fp8"):
+        model_path = os.path.join(models_root, "DeepSeek-V3-Lite", "fp8")
+    elif (request.param == "DeepSeek-V3-Lite-bf16"):
+        model_path = os.path.join(models_root, "DeepSeek-V3-Lite", "bf16")
+    elif request.param == "llama-v3-8b-hf":
+        model_path = os.path.join(models_root, "llama-models-v3", "8B")
+    elif request.param == "llama-3.1-8b-instruct-hf-fp8":
+        model_path = os.path.join(models_root, "llama-3.1-model",
+                                  "Llama-3.1-8B-Instruct-FP8")
+    else:
+        raise ValueError(f"Failed to find the model: {request.param}")
+    return model_path
+
+
+def run_disaggregated_benchmark(example_dir,
+                                config_file,
+                                benchmark_root,
+                                benchmark_model_root,
+                                shared_gpt_path,
+                                env=None,
+                                cwd=None):
+    """Run disaggregated test with given configuration."""
+    run_env = env.copy()
+    run_env["UCX_TLS"] = "^ib"
+    num_rank = 2
+    workers_cmd = [
+        'mpirun', '--allow-run-as-root', '--oversubscribe', '-n',
+        str(num_rank), 'trtllm-serve', 'disaggregated_mpi_worker', '-c',
+        config_file
+    ]
+
+    server_start_timeout = 900
+    server_cmd = [
+        'trtllm-serve', 'disaggregated', '--server_start_timeout',
+        str(server_start_timeout), '-c', config_file
+    ]
+    try:
+        with (  # Start workers
+                open('output_workers.log', 'w') as output_workers,
+                popen(workers_cmd,
+                      stdout=output_workers,
+                      stderr=subprocess.STDOUT,
+                      env=run_env,
+                      cwd=cwd) as workers_proc,
+                # Start server
+                open('output_disagg.log', 'w') as output_disagg,
+                popen(server_cmd,
+                      stdout=output_disagg,
+                      stderr=subprocess.STDOUT,
+                      env=run_env,
+                      cwd=cwd) as server_proc):
+            # Ensure the server has started
+            client_dir = f"{example_dir}/clients"
+            client_cmd = [
+                'python3', f'{client_dir}/disagg_client.py', '-c',
+                f'{example_dir}/disagg_config.yaml', '-p',
+                f'{client_dir}/prompts.json', '--ignore-eos',
+                '--server-start-timeout',
+                str(server_start_timeout)
+            ]
+            # Warm up
+            check_call(client_cmd,
+                       env=env,
+                       poll_procs=[workers_proc, server_proc])
+            # Start Benchmark
+            benchmark_script = os.path.join(benchmark_root,
+                                            "benchmark_serving.py")
+            benchmark_cmd = [
+                'python3',
+                benchmark_script,
+                '--model',
+                benchmark_model_root,
+                '--tokenizer',
+                benchmark_model_root,
+                '--dataset-name',
+                'random',
+                '--dataset-path',
+                shared_gpt_path,
+                '--random-input-len',
+                '256',
+                '--random-output-len',
+                '64',
+                '--random-prefix-len',
+                '0',
+                '--num-prompts',
+                '320',
+                '--max-concurrency',
+                '32',
+                '--host',
+                'localhost',
+                '--port',
+                '8000',
+                '--ignore-eos',
+                '--no-test-input',
+                '--percentile-metrics',
+                'e2el,ttft',
+            ]
+            # warm up
+            check_call(benchmark_cmd, env=env)
+            output = check_output(benchmark_cmd, env=env)
+            e2el_pattern = r"Median E2EL \(ms\):\s*(\d+\.?\d*)"
+            ttft_pattern = r"Median TTFT \(ms\):\s*(\d+\.?\d*)"
+            e2el_match = re.search(e2el_pattern, output)
+            ttft_match = re.search(ttft_pattern, output)
+            if e2el_match and ttft_match:
+                median_e2el = float(e2el_match.group(1))
+                median_ttft = float(ttft_match.group(1))
+                return median_e2el, median_ttft
+            else:
+                raise ValueError("No benchmark result found")
+
+    except Exception:
+        # Print outputs on error
+        logger.error("-------- Workers output --------")
+        with open('output_workers.log', 'r') as f:
+            logger.error(f.read())
+
+        logger.error("-------- Disagg server output --------")
+        with open('output_disagg.log', 'r') as f:
+            logger.error(f.read())
+        raise
+    finally:
+        server_proc.terminate()
+        workers_proc.terminate()
+        server_proc.wait()
+        workers_proc.wait()
+
+
+@pytest.mark.parametrize("benchmark_model_root", [
+    'DeepSeek-V3-Lite-fp8', 'DeepSeek-V3-Lite-bf16', 'llama-v3-8b-hf',
+    'llama-3.1-8b-instruct-hf-fp8'
+],
+                         indirect=True)
+def test_disaggregated_benchmark_on_diff_backends(
+        disaggregated_test_root, disaggregated_example_root, llm_venv,
+        benchmark_model_root, benchmark_root, shared_gpt_path):
+    base_config_path = os.path.join(os.path.dirname(__file__), "test_configs",
+                                    "disagg_config_for_benchmark.yaml")
+    with open(base_config_path, 'r', encoding='utf-8') as f:
+        config = yaml.load(f, Loader=yaml.SafeLoader)
+        config["model"] = benchmark_model_root
+        with open("ucx_config.yaml", 'w', encoding='utf-8') as ucx_config:
+            yaml.dump(config, ucx_config)
+        config["context_servers"]["cache_transceiver_config"][
+            "backend"] = "nixl"
+        config["generation_servers"]["cache_transceiver_config"][
+            "backend"] = "nixl"
+        with open("nixl_config.yaml", 'w', encoding='utf-8') as nixl_config:
+            yaml.dump(config, nixl_config)
+
+    env = llm_venv._new_env.copy()
+    nixl_e2el, nixl_ttft = run_disaggregated_benchmark(
+        disaggregated_example_root,
+        f"{os.path.dirname(__file__)}/nixl_config.yaml",
+        benchmark_root,
+        benchmark_model_root,
+        shared_gpt_path,
+        env=env,
+        cwd=llm_venv.get_working_directory())
+    ucx_e2el, ucx_ttft = run_disaggregated_benchmark(
+        disaggregated_example_root,
+        f"{os.path.dirname(__file__)}/ucx_config.yaml",
+        benchmark_root,
+        benchmark_model_root,
+        shared_gpt_path,
+        env=env,
+        cwd=llm_venv.get_working_directory())
+    print(f"Nixl E2EL: {nixl_e2el} ms, UCX E2EL: {ucx_e2el} ms")
+    print(f"Nixl TTFT: {nixl_ttft} ms, UCX TTFT: {ucx_ttft} ms")
+
+    assert ucx_e2el > 0 and nixl_e2el > 0 and nixl_e2el < 1.05 * ucx_e2el
+    assert ucx_ttft > 0 and nixl_ttft > 0 and nixl_ttft < 1.05 * ucx_ttft
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
index 29d8efac07f..4160efb6529 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
@@ -110,6 +110,10 @@ l0_dgx_h100:
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_mpi[DeepSeek-V3-Lite-fp8]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
+  - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16]
+  - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf]
+  - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8]
+  - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp[DeepSeek-V3-Lite-fp8]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_overlap[DeepSeek-V3-Lite-fp8]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8]

From 0a6143d69eb2ce45d23d74ad45d55b64ddffe120 Mon Sep 17 00:00:00 2001
From: Bo Deng <deemod@nvidia.com>
Date: Tue, 5 Aug 2025 08:18:10 +0000
Subject: [PATCH 09/15] [TRTLLM-6675][Infra] nixl doc and test completion - II

Signed-off-by: Bo Deng <deemod@nvidia.com>
---
 .../accuracy/test_disaggregated_serving.py    | 50 +++++++++++++++++++
 .../test_lists/qa/llm_function_full.txt       |  2 +
 .../test_lists/qa/llm_function_sanity.txt     |  2 +
 .../test_lists/test-db/l0_dgx_b200.yml        |  7 +++
 .../test_lists/test-db/l0_dgx_h100.yml        |  2 +
 5 files changed, 63 insertions(+)

diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py
index e0801302eba..a276f172e7a 100644
--- a/tests/integration/defs/accuracy/test_disaggregated_serving.py
+++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py
@@ -281,6 +281,30 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
     MODEL_PATH = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct"
 
+    def test_nixl_backend(self, backend):
+        ctx_server_config = {"cache_transceiver_config": {"backend": "nixl"}}
+        gen_server_config = {"cache_transceiver_config": {"backend": "nixl"}}
+        disaggregated_server_config = {
+            "hostname": "localhost",
+            "port": 8000,
+            "backend": "pytorch",
+            "context_servers": {
+                "num_instances": 1,
+                "urls": ["localhost:8001"]
+            },
+            "generation_servers": {
+                "num_instances": 1,
+                "urls": ["localhost:8002"]
+            }
+        }
+        with launch_disaggregated_llm(disaggregated_server_config,
+                                      ctx_server_config, gen_server_config,
+                                      self.MODEL_PATH) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
     @pytest.mark.skip_less_device_memory(32000)
     @pytest.mark.parametrize("disable_overlap_scheduler", [False, True])
     def test_auto_dtype(self, disable_overlap_scheduler):
@@ -565,6 +589,32 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     MODEL_NAME = "deepseek-ai/DeepSeek-V3-Lite"
     MODEL_PATH = f"{llm_models_root()}/DeepSeek-V3-Lite/bf16"
 
+    def test_nixl_backend(self, backend):
+        ctx_server_config = {"cache_transceiver_config": {"backend": "nixl"}}
+        gen_server_config = {"cache_transceiver_config": {"backend": "nixl"}}
+        disaggregated_server_config = {
+            "hostname": "localhost",
+            "port": 8000,
+            "backend": "pytorch",
+            "context_servers": {
+                "num_instances": 1,
+                "urls": ["localhost:8001"]
+            },
+            "generation_servers": {
+                "num_instances": 1,
+                "urls": ["localhost:8002"]
+            }
+        }
+        with launch_disaggregated_llm(disaggregated_server_config,
+                                      ctx_server_config,
+                                      gen_server_config,
+                                      self.MODEL_PATH,
+                                      tensor_parallel_size=4) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
     @parametrize_with_ids("overlap_scheduler", [True, False])
     @parametrize_with_ids("mtp_nextn",
                           [0, pytest.param(2, marks=skip_pre_hopper)])
diff --git a/tests/integration/test_lists/qa/llm_function_full.txt b/tests/integration/test_lists/qa/llm_function_full.txt
index 037bb0c3d29..da6bac0e419 100644
--- a/tests/integration/test_lists/qa/llm_function_full.txt
+++ b/tests/integration/test_lists/qa/llm_function_full.txt
@@ -573,6 +573,8 @@ accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope
 accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype
+accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_nixl_backend
+accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
 
 test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-]
 test_e2e.py::test_llama_e2e[use_py_session-remove_input_padding-]
diff --git a/tests/integration/test_lists/qa/llm_function_sanity.txt b/tests/integration/test_lists/qa/llm_function_sanity.txt
index ae7008f815d..488336a77ab 100644
--- a/tests/integration/test_lists/qa/llm_function_sanity.txt
+++ b/tests/integration/test_lists/qa/llm_function_sanity.txt
@@ -110,6 +110,8 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
 accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency]
+accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_nixl_backend
+accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
 disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp[DeepSeek-V3-Lite-fp8]
diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
index 29b83ac0778..7c8c92fa186 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
@@ -79,3 +79,10 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-TRTLLM]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-TRITON]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4]
+  - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
+  - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16]
+  - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf]
+  - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8]
+  - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8]
+  - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_nixl_backend
+  - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
index 4160efb6529..c4bbca390ef 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
@@ -52,6 +52,8 @@ l0_dgx_h100:
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=2-ctx_pp=2]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=2]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=2]
+  - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_nixl_backend
+  - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
   - test_e2e.py::test_ptp_quickstart_advanced_bs1
   - test_e2e.py::test_ptp_quickstart_advanced_deepseek_v3_lite_4gpus_adp_balance[DeepSeek-V3-Lite-FP8-DeepSeek-V3-Lite/fp8]
   - unittest/_torch/modeling/test_modeling_pixtral.py::test_tensor_parallelism

From dc60530f1c3a9389214d6fe275e88a659eaa8c80 Mon Sep 17 00:00:00 2001
From: Bo Deng <deemod@nvidia.com>
Date: Tue, 5 Aug 2025 08:41:25 +0000
Subject: [PATCH 10/15] fix typo

Signed-off-by: Bo Deng <deemod@nvidia.com>
---
 tests/integration/defs/accuracy/test_disaggregated_serving.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py
index a276f172e7a..fd03bbb036f 100644
--- a/tests/integration/defs/accuracy/test_disaggregated_serving.py
+++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py
@@ -281,7 +281,7 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
     MODEL_PATH = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct"
 
-    def test_nixl_backend(self, backend):
+    def test_nixl_backend(self):
         ctx_server_config = {"cache_transceiver_config": {"backend": "nixl"}}
         gen_server_config = {"cache_transceiver_config": {"backend": "nixl"}}
         disaggregated_server_config = {
@@ -589,7 +589,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     MODEL_NAME = "deepseek-ai/DeepSeek-V3-Lite"
     MODEL_PATH = f"{llm_models_root()}/DeepSeek-V3-Lite/bf16"
 
-    def test_nixl_backend(self, backend):
+    def test_nixl_backend(self):
         ctx_server_config = {"cache_transceiver_config": {"backend": "nixl"}}
         gen_server_config = {"cache_transceiver_config": {"backend": "nixl"}}
         disaggregated_server_config = {

From 148e3af7b1d8140535c8d87e6a14af91641f35ce Mon Sep 17 00:00:00 2001
From: Bo Deng <deemod@nvidia.com>
Date: Wed, 6 Aug 2025 00:32:24 +0000
Subject: [PATCH 11/15] add missing config file

Signed-off-by: Bo Deng <deemod@nvidia.com>
---
 .../disagg_config_for_benchmark.yaml          | 29 +++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100644 tests/integration/defs/disaggregated/test_configs/disagg_config_for_benchmark.yaml

diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_for_benchmark.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_for_benchmark.yaml
new file mode 100644
index 00000000000..be2ced4b463
--- /dev/null
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_for_benchmark.yaml
@@ -0,0 +1,29 @@
+model: DeepSeek-V3-Lite/fp8
+hostname: localhost
+port: 8000
+backend: "pytorch"
+context_servers:
+  num_instances: 1
+  max_batch_size: 2
+  max_num_tokens: 384
+  max_seq_len: 320
+  tensor_parallel_size: 1
+  pipeline_parallel_size: 1
+  disable_overlap_scheduler: true
+  cache_transceiver_config:
+    backend: ucx
+    max_tokens_in_buffer: 512
+  urls:
+    - "localhost:8001"
+generation_servers:
+  num_instances: 1
+  tensor_parallel_size: 1
+  pipeline_parallel_size: 1
+  max_batch_size: 2
+  max_num_tokens: 384
+  max_seq_len: 320
+  cache_transceiver_config:
+    backend: ucx
+    max_tokens_in_buffer: 512
+  urls:
+      - "localhost:8002"

From 0610d8cad47ee0bf8ff6acb7f70ef702c9a7742b Mon Sep 17 00:00:00 2001
From: Bo Deng <deemod@nvidia.com>
Date: Wed, 6 Aug 2025 09:58:39 +0000
Subject: [PATCH 12/15] fix accuracy tests

Signed-off-by: Bo Deng <deemod@nvidia.com>
---
 .../accuracy/test_disaggregated_serving.py    | 80 ++++++++++++-------
 .../test_lists/qa/llm_function_full.txt       |  2 +-
 .../test_lists/qa/llm_function_sanity.txt     |  2 +-
 .../test_lists/test-db/l0_dgx_b200.yml        |  2 +-
 .../test_lists/test-db/l0_dgx_h100.yml        |  2 +-
 5 files changed, 54 insertions(+), 34 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py
index fd03bbb036f..08a96916ff3 100644
--- a/tests/integration/defs/accuracy/test_disaggregated_serving.py
+++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py
@@ -281,30 +281,6 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
     MODEL_PATH = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct"
 
-    def test_nixl_backend(self):
-        ctx_server_config = {"cache_transceiver_config": {"backend": "nixl"}}
-        gen_server_config = {"cache_transceiver_config": {"backend": "nixl"}}
-        disaggregated_server_config = {
-            "hostname": "localhost",
-            "port": 8000,
-            "backend": "pytorch",
-            "context_servers": {
-                "num_instances": 1,
-                "urls": ["localhost:8001"]
-            },
-            "generation_servers": {
-                "num_instances": 1,
-                "urls": ["localhost:8002"]
-            }
-        }
-        with launch_disaggregated_llm(disaggregated_server_config,
-                                      ctx_server_config, gen_server_config,
-                                      self.MODEL_PATH) as llm:
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-            task = GSM8K(self.MODEL_NAME)
-            task.evaluate(llm)
-
     @pytest.mark.skip_less_device_memory(32000)
     @pytest.mark.parametrize("disable_overlap_scheduler", [False, True])
     def test_auto_dtype(self, disable_overlap_scheduler):
@@ -590,8 +566,18 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     MODEL_PATH = f"{llm_models_root()}/DeepSeek-V3-Lite/bf16"
 
     def test_nixl_backend(self):
-        ctx_server_config = {"cache_transceiver_config": {"backend": "nixl"}}
-        gen_server_config = {"cache_transceiver_config": {"backend": "nixl"}}
+        ctx_server_config = {
+            "disable_overlap_scheduler": True,
+            "cache_transceiver_config": {
+                "backend": "nixl"
+            }
+        }
+        gen_server_config = {
+            "disable_overlap_scheduler": True,
+            "cache_transceiver_config": {
+                "backend": "nixl"
+            }
+        }
         disaggregated_server_config = {
             "hostname": "localhost",
             "port": 8000,
@@ -606,10 +592,8 @@ def test_nixl_backend(self):
             }
         }
         with launch_disaggregated_llm(disaggregated_server_config,
-                                      ctx_server_config,
-                                      gen_server_config,
-                                      self.MODEL_PATH,
-                                      tensor_parallel_size=4) as llm:
+                                      ctx_server_config, gen_server_config,
+                                      self.MODEL_PATH) as llm:
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
@@ -716,6 +700,42 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen3/Qwen3-8B"
     MODEL_PATH = f"{llm_models_root()}/Qwen3/Qwen3-8B-FP8"
 
+    def test_nixl_backend(self):
+        ctx_server_config = {
+            "disable_overlap_scheduler": True,
+            "cache_transceiver_config": {
+                "backend": "nixl"
+            }
+        }
+        gen_server_config = {
+            "disable_overlap_scheduler": True,
+            "cache_transceiver_config": {
+                "backend": "nixl"
+            }
+        }
+        ctx_server_config["cache_transceiver_config"]
+        ctx_server_config["cache_transceiver_config"]
+        disaggregated_server_config = {
+            "hostname": "localhost",
+            "port": 8000,
+            "backend": "pytorch",
+            "context_servers": {
+                "num_instances": 1,
+                "urls": ["localhost:8001"]
+            },
+            "generation_servers": {
+                "num_instances": 1,
+                "urls": ["localhost:8002"]
+            }
+        }
+        with launch_disaggregated_llm(disaggregated_server_config,
+                                      ctx_server_config, gen_server_config,
+                                      self.MODEL_PATH) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
     @pytest.mark.parametrize("overlap_scheduler", [False, True])
     def test_auto_dtype(self, overlap_scheduler):
         ctx_server_config = {
diff --git a/tests/integration/test_lists/qa/llm_function_full.txt b/tests/integration/test_lists/qa/llm_function_full.txt
index da6bac0e419..4a1f44dbb29 100644
--- a/tests/integration/test_lists/qa/llm_function_full.txt
+++ b/tests/integration/test_lists/qa/llm_function_full.txt
@@ -573,7 +573,7 @@ accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope
 accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype
-accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_nixl_backend
+accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
 
 test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-]
diff --git a/tests/integration/test_lists/qa/llm_function_sanity.txt b/tests/integration/test_lists/qa/llm_function_sanity.txt
index 488336a77ab..42ec8d21a91 100644
--- a/tests/integration/test_lists/qa/llm_function_sanity.txt
+++ b/tests/integration/test_lists/qa/llm_function_sanity.txt
@@ -110,7 +110,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
 accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency]
-accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_nixl_backend
+accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
 disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0]
diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
index 7c8c92fa186..ca23535a199 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
@@ -84,5 +84,5 @@ l0_dgx_b200:
   - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf]
   - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8]
   - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8]
-  - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_nixl_backend
+  - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
   - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
index c4bbca390ef..798353ddc02 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
@@ -52,7 +52,7 @@ l0_dgx_h100:
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=2-ctx_pp=2]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=2]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=2]
-  - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_nixl_backend
+  - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
   - accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
   - test_e2e.py::test_ptp_quickstart_advanced_bs1
   - test_e2e.py::test_ptp_quickstart_advanced_deepseek_v3_lite_4gpus_adp_balance[DeepSeek-V3-Lite-FP8-DeepSeek-V3-Lite/fp8]

From 57c7e4f0ccdf8f095308291a8f4d7ccc44cb4f51 Mon Sep 17 00:00:00 2001
From: Bo Deng <deemod@nvidia.com>
Date: Wed, 6 Aug 2025 20:05:19 -0700
Subject: [PATCH 13/15] fix some tests

Signed-off-by: Bo Deng <deemod@nvidia.com>
---
 .../disagg_config_for_benchmark.yaml          | 29 ---------
 .../defs/disaggregated/test_disaggregated.py  | 64 ++++++++++++++-----
 2 files changed, 49 insertions(+), 44 deletions(-)
 delete mode 100644 tests/integration/defs/disaggregated/test_configs/disagg_config_for_benchmark.yaml

diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_for_benchmark.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_for_benchmark.yaml
deleted file mode 100644
index be2ced4b463..00000000000
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_for_benchmark.yaml
+++ /dev/null
@@ -1,29 +0,0 @@
-model: DeepSeek-V3-Lite/fp8
-hostname: localhost
-port: 8000
-backend: "pytorch"
-context_servers:
-  num_instances: 1
-  max_batch_size: 2
-  max_num_tokens: 384
-  max_seq_len: 320
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  disable_overlap_scheduler: true
-  cache_transceiver_config:
-    backend: ucx
-    max_tokens_in_buffer: 512
-  urls:
-    - "localhost:8001"
-generation_servers:
-  num_instances: 1
-  tensor_parallel_size: 1
-  pipeline_parallel_size: 1
-  max_batch_size: 2
-  max_num_tokens: 384
-  max_seq_len: 320
-  cache_transceiver_config:
-    backend: ucx
-    max_tokens_in_buffer: 512
-  urls:
-      - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py
index c3819b6e1ad..9c95defe0ab 100644
--- a/tests/integration/defs/disaggregated/test_disaggregated.py
+++ b/tests/integration/defs/disaggregated/test_disaggregated.py
@@ -16,6 +16,7 @@
 import os
 import re
 import subprocess
+import tempfile
 
 import pytest
 import yaml
@@ -1203,6 +1204,43 @@ def run_disaggregated_benchmark(example_dir,
         workers_proc.wait()
 
 
+def get_config_for_benchmark(model_root, backend):
+    serve_config = {
+        "model": model_root,
+        "hostname": "localhost",
+        "port": 8000,
+        "backend": "pytorch",
+        "context_servers": {
+            "num_instances": 1,
+            "max_batch_size": 2,
+            "max_num_tokens": 384,
+            "max_seq_len": 320,
+            "tensor_parallel_size": 1,
+            "pipeline_parallel_size": 1,
+            "disable_overlap_scheduler": True,
+            "cache_transceiver_config": {
+                "backend": backend,
+                "max_tokens_in_buffer": 512,
+            },
+            "urls": ["localhost:8001"]
+        },
+        "generation_servers": {
+            "num_instances": 1,
+            "tensor_parallel_size": 1,
+            "pipeline_parallel_size": 1,
+            "max_batch_size": 2,
+            "max_num_tokens": 384,
+            "max_seq_len": 320,
+            "cache_transceiver_config": {
+                "backend": backend,
+                "max_tokens_in_buffer": 512,
+            },
+            "urls": ["localhost:8002"]
+        }
+    }
+    return serve_config
+
+
 @pytest.mark.parametrize("benchmark_model_root", [
     'DeepSeek-V3-Lite-fp8', 'DeepSeek-V3-Lite-bf16', 'llama-v3-8b-hf',
     'llama-3.1-8b-instruct-hf-fp8'
@@ -1211,24 +1249,20 @@ def run_disaggregated_benchmark(example_dir,
 def test_disaggregated_benchmark_on_diff_backends(
         disaggregated_test_root, disaggregated_example_root, llm_venv,
         benchmark_model_root, benchmark_root, shared_gpt_path):
-    base_config_path = os.path.join(os.path.dirname(__file__), "test_configs",
-                                    "disagg_config_for_benchmark.yaml")
-    with open(base_config_path, 'r', encoding='utf-8') as f:
-        config = yaml.load(f, Loader=yaml.SafeLoader)
-        config["model"] = benchmark_model_root
-        with open("ucx_config.yaml", 'w', encoding='utf-8') as ucx_config:
-            yaml.dump(config, ucx_config)
-        config["context_servers"]["cache_transceiver_config"][
-            "backend"] = "nixl"
-        config["generation_servers"]["cache_transceiver_config"][
-            "backend"] = "nixl"
-        with open("nixl_config.yaml", 'w', encoding='utf-8') as nixl_config:
-            yaml.dump(config, nixl_config)
+    nixl_config = get_config_for_benchmark(benchmark_model_root, "nixl")
+    ucx_config = get_config_for_benchmark(benchmark_model_root, "ucx")
+    temp_dir = tempfile.TemporaryDirectory()
+    nixl_config_path = os.path.join(temp_dir.name, "nixl_config.yaml")
+    ucx_config_path = os.path.join(temp_dir.name, "ucx_config.yaml")
+    with open(nixl_config_path, 'w', encoding='utf-8') as f:
+        yaml.dump(nixl_config, f)
+    with open(ucx_config_path, 'w', encoding='utf-8') as f:
+        yaml.dump(ucx_config, f)
 
     env = llm_venv._new_env.copy()
     nixl_e2el, nixl_ttft = run_disaggregated_benchmark(
         disaggregated_example_root,
-        f"{os.path.dirname(__file__)}/nixl_config.yaml",
+        nixl_config_path,
         benchmark_root,
         benchmark_model_root,
         shared_gpt_path,
@@ -1236,7 +1270,7 @@ def test_disaggregated_benchmark_on_diff_backends(
         cwd=llm_venv.get_working_directory())
     ucx_e2el, ucx_ttft = run_disaggregated_benchmark(
         disaggregated_example_root,
-        f"{os.path.dirname(__file__)}/ucx_config.yaml",
+        ucx_config_path,
         benchmark_root,
         benchmark_model_root,
         shared_gpt_path,

From fb8c19da325d974152a9ee0d4c1630c391df063d Mon Sep 17 00:00:00 2001
From: Bo Deng <deemod@nvidia.com>
Date: Wed, 6 Aug 2025 20:11:39 -0700
Subject: [PATCH 14/15] clean codes

Signed-off-by: Bo Deng <deemod@nvidia.com>
---
 tests/integration/defs/disaggregated/test_disaggregated.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py
index 9c95defe0ab..fa150ea26e7 100644
--- a/tests/integration/defs/disaggregated/test_disaggregated.py
+++ b/tests/integration/defs/disaggregated/test_disaggregated.py
@@ -28,10 +28,7 @@
 
 def cleanup_output_files():
     """Clean up output files from previous runs."""
-    for file in [
-            'output.json', 'output_streaming.json', 'ucx_config.yaml',
-            'nixl_config.yaml'
-    ]:
+    for file in ['output.json', 'output_streaming.json']:
         try:
             os.remove(file)
         except FileNotFoundError:

From a87a577ae382e715fed79cf291d190a1cb263356 Mon Sep 17 00:00:00 2001
From: Bo Deng <deemod@nvidia.com>
Date: Sun, 10 Aug 2025 21:45:45 -0700
Subject: [PATCH 15/15] adjust tests

Signed-off-by: Bo Deng <deemod@nvidia.com>
---
 tests/integration/defs/accuracy/test_disaggregated_serving.py | 2 --
 tests/integration/defs/disaggregated/test_disaggregated.py    | 4 ++--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py
index 08a96916ff3..98432a3aab8 100644
--- a/tests/integration/defs/accuracy/test_disaggregated_serving.py
+++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py
@@ -713,8 +713,6 @@ def test_nixl_backend(self):
                 "backend": "nixl"
             }
         }
-        ctx_server_config["cache_transceiver_config"]
-        ctx_server_config["cache_transceiver_config"]
         disaggregated_server_config = {
             "hostname": "localhost",
             "port": 8000,
diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py
index fa150ea26e7..c193a358197 100644
--- a/tests/integration/defs/disaggregated/test_disaggregated.py
+++ b/tests/integration/defs/disaggregated/test_disaggregated.py
@@ -1211,7 +1211,7 @@ def get_config_for_benchmark(model_root, backend):
             "num_instances": 1,
             "max_batch_size": 2,
             "max_num_tokens": 384,
-            "max_seq_len": 320,
+            "max_seq_len": 384,
             "tensor_parallel_size": 1,
             "pipeline_parallel_size": 1,
             "disable_overlap_scheduler": True,
@@ -1227,7 +1227,7 @@ def get_config_for_benchmark(model_root, backend):
             "pipeline_parallel_size": 1,
             "max_batch_size": 2,
             "max_num_tokens": 384,
-            "max_seq_len": 320,
+            "max_seq_len": 384,
             "cache_transceiver_config": {
                 "backend": backend,
                 "max_tokens_in_buffer": 512,