From e88cb92f2490fb7f76dfdd17a144f92ce3f19d16 Mon Sep 17 00:00:00 2001
From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
Date: Mon, 18 Aug 2025 13:47:14 +0800
Subject: [PATCH 01/20] [None] [feat] Support accurate device iter time (#6906)

Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
---
 tensorrt_llm/_torch/pyexecutor/py_executor.py | 44 +++++++++++++++++--
 1 file changed, 40 insertions(+), 4 deletions(-)

diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py
index af39b213042..4573528e314 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -403,6 +403,16 @@ def _profiler(self):
         it = -1
         enabled = False
         start_time = None
+
+        # These events are used to record the time of the previous batch.
+        # We need two sets of start-end events, used in a ping-pong fashion,
+        # so that the timing also works with the overlap scheduler.
+        start_event_1 = None
+        end_event_1 = torch.cuda.Event(enable_timing=True)
+        start_event_2 = None
+        end_event_2 = torch.cuda.Event(enable_timing=True)
+        prev_device_step_time = None
+
         torch_trace_path = os.environ.get(PROFILE_TRACE_ENV_VAR_NAME, None)
         profile_start_stop = os.environ.get(PROFILE_START_STOP_ENV_VAR_NAME,
                                             None)
@@ -425,7 +435,7 @@ def _profiler(self):
                                                     with_modules=True)
 
         def profile_step():
-            nonlocal it, enabled, start_time
+            nonlocal it, enabled, start_time, start_event_1, end_event_1, start_event_2, end_event_2, prev_device_step_time
             if it in self.profile_stop_iters and not self.is_warmup:
                 assert enabled, "Inconsistent CUDA profiling state"
                 if enable_torch_trace:
@@ -438,7 +448,24 @@ def profile_step():
 
             if start_time is not None and self.print_log and self.dist.rank == 0:
                 end_time = time.time()
-
+                if it % 2 == 0:
+                    end_event_1.record()
+                    if start_event_2 is not None:
+                        end_event_2.synchronize()
+                        prev_device_step_time = start_event_2.elapsed_time(
+                            end_event_2)
+                else:
+                    end_event_2.record()
+                    if start_event_1 is not None:
+                        end_event_1.synchronize()
+                        prev_device_step_time = start_event_1.elapsed_time(
+                            end_event_1)
+
+                if prev_device_step_time is None:
+                    prev_device_step_time = "N/A"  # Handle first iteration
+                else:
+                    prev_device_step_time = f"{prev_device_step_time}ms"
+                host_step_time = (end_time - start_time) * 1000  # milliseconds
                 formatted_timestamp = datetime.datetime.now().strftime(
                     "%Y-%m-%d %H:%M:%S")
                 logger.info(
@@ -447,7 +474,8 @@ def profile_step():
                     f"rank = {self.dist.rank}, "
                     f"currank_total_requests = {self.executor_request_queue.num_fetch_requests_cur_rank}/"
                     f"{self.executor_request_queue.num_fetch_requests}, "
-                    f"elapsed_time = {end_time - start_time}s, "
+                    f"host_step_time = {host_step_time}ms, "
+                    f"prev_device_step_time = {prev_device_step_time}, "
                     f"timestamp = {formatted_timestamp}, "
                     f"num_scheduled_requests: {self.num_scheduled_requests}, "
                     f"states = {self.model_engine.iter_states}")
@@ -462,6 +490,14 @@ def profile_step():
                 logger.info(f"Profiling started at iteration {it}.")
                 enabled = True
             start_time = time.time()
+            if it % 2 == 0:
+                if start_event_1 is None:
+                    start_event_1 = torch.cuda.Event(enable_timing=True)
+                start_event_1.record()
+            else:
+                if start_event_2 is None:
+                    start_event_2 = torch.cuda.Event(enable_timing=True)
+                start_event_2.record()
 
         try:
             yield profile_step
@@ -1400,7 +1436,7 @@ def _forward_step(self,
                       new_tensors_device: Optional[SampleStateTensors] = None):
 
         @nvtx_range(
-            f"[Executor] _forward_step {self.model_engine.iter_counter}: {len(scheduled_requests.context_requests)} ctx reqs, {len(scheduled_requests.generation_requests)} gen reqs"
+            f"[Executor] _forward_step {self.model_engine.iter_counter + 1}: {len(scheduled_requests.context_requests)} ctx reqs, {len(scheduled_requests.generation_requests)} gen reqs"
         )
         def forward(scheduled_requests, resource_manager, new_tensors_device,
                     gather_context_logits, cache_indirection_buffer):

From 5ec15b98f0692471693ee1f086a351c6afc25953 Mon Sep 17 00:00:00 2001
From: Shi Xiaowei <39303645+Shixiaowei02@users.noreply.github.com>
Date: Mon, 18 Aug 2025 14:33:23 +0800
Subject: [PATCH 02/20] [TRTLLM-7030][fix] uppercase def value in pd-config
 (#6981)

Signed-off-by: ShiXiaowei02 <39303645+Shixiaowei02@users.noreply.github.com>
---
 .../test_configs/disagg_config_gen_only_bs1.yaml              | 4 ++--
 tests/integration/test_lists/waives.txt                       | 4 ----
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_bs1.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_bs1.yaml
index 4efbc9a9493..19d1eca714f 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_bs1.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_bs1.yaml
@@ -17,7 +17,7 @@ context_servers:
     enable_partial_reuse: False
   disable_overlap_scheduler: True
   cache_transceiver_config:
-    backend: default
+    backend: DEFAULT
   urls:
       - "localhost:8001"
 generation_servers:
@@ -32,6 +32,6 @@ generation_servers:
     free_gpu_memory_fraction: 0.2
     enable_partial_reuse: False
   cache_transceiver_config:
-    backend: default
+    backend: DEFAULT
   urls:
       - "localhost:8002"
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index f681b1b196b..7a75485ec6c 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -315,9 +315,5 @@ triton_server/test_triton_llm.py::test_gpt_speculative_decoding_bls[False-False-
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=1-ctx_pp=4] SKIP (https://nvbugs/5434320)
 accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend SKIP (https://nvbugs/5448437)
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend SKIP (https://nvbugs/5448437)
-disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5459811)
-disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8] SKIP (https://nvbugs/5459811)
-disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5459811)
-disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] SKIP (https://nvbugs/5459811)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] SKIP (https://nvbugs/5445466)
 test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] SKIP (https://nvbugs/5459817)

From 55f4f2d80c0da73bdd828d695d3964390c4a4821 Mon Sep 17 00:00:00 2001
From: ChristinaZ <83400082+ChristinaZ@users.noreply.github.com>
Date: Mon, 18 Aug 2025 15:08:32 +0800
Subject: [PATCH 03/20] [None] [fix] Fix the macro name (#6983)

Signed-off-by: Christina Zhang <83400082+ChristinaZ@users.noreply.github.com>
---
 .../trtllmGenKernels/blockScaleMoe/RoutingRenormalize.cu        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingRenormalize.cu b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingRenormalize.cu
index 601229591e5..f03e02c2e29 100644
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingRenormalize.cu
+++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingRenormalize.cu
@@ -199,7 +199,7 @@ __global__ void __launch_bounds__(NumThreadsSingleBlock) routingIndicesBlockKern
     }
 
 #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
-#if !defined(FDL_PROFILE) || FDL_PROFILE == 0
+#if !defined(PDL_PROFILE) || PDL_PROFILE == 0
     // we can trigger the next kernel at this point
     if constexpr (KernelParams::UsePdl)
     {

From 69ff32f9b1b3c8d7437dabda00d302c74633b4a1 Mon Sep 17 00:00:00 2001
From: Emma Qiao <qqiao@nvidia.com>
Date: Mon, 18 Aug 2025 20:34:52 +0800
Subject: [PATCH 04/20] [None][infra] Waive failed tests on main 0818 (#6992)

Signed-off-by: qqiao <qqiao@nvidia.com>
---
 tests/integration/test_lists/waives.txt | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 7a75485ec6c..2459426b917 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -316,4 +316,7 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen
 accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend SKIP (https://nvbugs/5448437)
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend SKIP (https://nvbugs/5448437)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] SKIP (https://nvbugs/5445466)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] SKIP (https://nvbugs/5445466)
 test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] SKIP (https://nvbugs/5459817)
+llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_mtp SKIP (https://nvbugs/5461796)
+disaggregated/test_disaggregated.py::test_disaggregated_genbs1[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5459811)

From 1ce23545fc337f7b240b4d8a983297224497a2dc Mon Sep 17 00:00:00 2001
From: Yiqing Yan <yiqingy@nvidia.com>
Date: Mon, 18 Aug 2025 21:15:49 +0800
Subject: [PATCH 05/20] [None][chore] Remove duplicate test waives (#6998)

Signed-off-by: Yiqing Yan <yiqingy@nvidia.com>
---
 tests/integration/test_lists/waives.txt | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 2459426b917..49c9a6d0107 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -264,18 +264,14 @@ accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype SKIP (https://nvbu
 accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope SKIP (https://nvbugs/5433543)
 accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype SKIP (https://nvbugs/5433545)
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=2-ctx_pp=4] SKIP (https://nvbugs/5431139)
-accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=4] SKIP (https://nvbugs/5431139)
-accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=4] SKIP (https://nvbugs/5431139)
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] SKIP (https://nvbugs/5434451)
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-27b-it] SKIP (https://nvbugs/5434451)
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-3-1b-it] SKIP (https://nvbugs/5434451)
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] SKIP (https://nvbugs/5437405)
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4 SKIP (https://nvbugs/5440241)
-test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image-False] SKIP (https://nvbugs/5444060)
-test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False] SKIP (https://nvbugs/5444060)
+test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image-False] SKIP (https://nvbugs/5444060,https://nvbugs/5444095)
+test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False] SKIP (https://nvbugs/5444060,https://nvbugs/5444095)
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=True] SKIP (https://nvbugs/5433545)
-test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False] SKIP (https://nvbugs/5444095)
-test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image-False] SKIP (https://nvbugs/5444095)
 examples/test_nemotron_nas.py::test_nemotron_nas_summary_1gpu[DeciLM-7B] SKIP (https://nvbugs/5444636)
 accuracy/test_cli_flow.py::TestLongAlpaca7B::test_multiblock_aggressive SKIP (https://nvbugs/5444627)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2] SKIP (https://nvbugs/5444687)
@@ -294,10 +290,9 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_cutlass-torch_compile=True] SKIP (https://nvbugs/5403818)
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=False] SKIP (https://nvbugs/5403818)
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=True] SKIP (https://nvbugs/5403818)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] SKIP (https://nvbugs/5442827)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] SKIP (https://nvbugs/5442827,https://nvbugs/5445466)
 test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-70B-FP8-llama-3.1-model/Llama-3.1-70B-Instruct-FP8] SKIP (https://nvbugs/5453992)
 accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype SKIP (https://nvbugs/5454875)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] SKIP (https://nvbugs/5445466)
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] SKIP (https://nvbugs/5454898)
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] SKIP (https://nvbugs/5454898)
 examples/test_llm_api_with_mpi.py::test_llm_api_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5434372)

From 425dad01fdf24a65132d62a6ad2736ce46b0000f Mon Sep 17 00:00:00 2001
From: Martin Marciniszyn Mehringer
 <11665257+MartinMarciniszyn@users.noreply.github.com>
Date: Mon, 18 Aug 2025 08:20:51 -0700
Subject: [PATCH 06/20] [None][fix] Clean up linking to CUDA stub libraries in
 build_wheel.py (#6823)

Signed-off-by: Linda-Stadter <57756729+Linda-Stadter@users.noreply.github.com>
Signed-off-by: Martin Marciniszyn Mehringer <11665257+MartinMarciniszyn@users.noreply.github.com>
Co-authored-by: Linda-Stadter <57756729+Linda-Stadter@users.noreply.github.com>
---
 cpp/tensorrt_llm/nanobind/CMakeLists.txt |   3 +-
 cpp/tensorrt_llm/pybind/CMakeLists.txt   |   3 +-
 docker/Dockerfile.multi                  |   5 +-
 scripts/build_wheel.py                   | 223 +++++++++++++++--------
 4 files changed, 153 insertions(+), 81 deletions(-)

diff --git a/cpp/tensorrt_llm/nanobind/CMakeLists.txt b/cpp/tensorrt_llm/nanobind/CMakeLists.txt
index 8367d007971..1ccb50a02b7 100755
--- a/cpp/tensorrt_llm/nanobind/CMakeLists.txt
+++ b/cpp/tensorrt_llm/nanobind/CMakeLists.txt
@@ -43,6 +43,7 @@ target_link_libraries(
          ${Python3_LIBRARIES}
          ${TORCH_LIBRARIES}
          torch_python
+         CUDA::cuda_driver
          ${CUDA_NVML_LIB}
          th_common)
 target_compile_definitions(
@@ -54,6 +55,6 @@ if(NOT WIN32)
     ${TRTLLM_NB_MODULE}
     PROPERTIES
       LINK_FLAGS
-      "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' -Wl,-rpath,'${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib/stubs' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
+      "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
   )
 endif()
diff --git a/cpp/tensorrt_llm/pybind/CMakeLists.txt b/cpp/tensorrt_llm/pybind/CMakeLists.txt
index 9e6e2909395..91b5ebf5482 100755
--- a/cpp/tensorrt_llm/pybind/CMakeLists.txt
+++ b/cpp/tensorrt_llm/pybind/CMakeLists.txt
@@ -44,6 +44,7 @@ target_link_libraries(
          ${Python3_LIBRARIES}
          ${TORCH_LIBRARIES}
          torch_python
+         CUDA::cuda_driver
          ${CUDA_NVML_LIB}
          th_common)
 target_compile_definitions(
@@ -55,6 +56,6 @@ if(NOT WIN32)
     ${TRTLLM_PYBIND_MODULE}
     PROPERTIES
       LINK_FLAGS
-      "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' -Wl,-rpath,'${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib/stubs' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
+      "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
   )
 endif()
diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi
index c832481da9f..eeafc8f4a65 100644
--- a/docker/Dockerfile.multi
+++ b/docker/Dockerfile.multi
@@ -71,8 +71,9 @@ RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh
 ENV PYTORCH_CUDA_ALLOC_CONF="garbage_collection_threshold:0.99999"
 
 # Install OpenCV with FFMPEG support
-RUN pip3 uninstall -y opencv && rm -rf /usr/local/lib/python3*/dist-packages/cv2/
-RUN pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir
+RUN pip3 uninstall -y opencv && \
+    rm -rf /usr/local/lib/python3*/dist-packages/cv2/ && \
+    pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir
 
 # WARs against security issues inherited from pytorch:25.06
 # * https://github.com/advisories/GHSA-8qvm-5x2c-j2w7
diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py
index 2ac3b484835..e40543c78f3 100755
--- a/scripts/build_wheel.py
+++ b/scripts/build_wheel.py
@@ -16,8 +16,10 @@
 
 import os
 import platform
+import re
 import sys
 import sysconfig
+import tempfile
 import warnings
 from argparse import ArgumentParser
 from contextlib import contextmanager
@@ -27,7 +29,7 @@
 from shutil import copy, copytree, rmtree
 from subprocess import DEVNULL, CalledProcessError, check_output, run
 from textwrap import dedent
-from typing import List
+from typing import Sequence
 
 try:
     from packaging.requirements import Requirement
@@ -120,7 +122,8 @@ def create_venv(project_dir: Path):
     return venv_prefix
 
 
-def setup_venv(project_dir: Path, requirements_file: Path, no_venv: bool):
+def setup_venv(project_dir: Path, requirements_file: Path,
+               no_venv: bool) -> tuple[Path, Path]:
     """Creates/updates a venv and installs requirements.
 
     Args:
@@ -279,6 +282,139 @@ def generate_fmha_cu(project_dir, venv_python):
     os.chdir(project_dir)
 
 
+def create_cuda_stub_links(cuda_stub_dir: str, missing_libs: list[str]) -> str:
+    """
+    Creates symbolic links for CUDA stub libraries in a temporary directory.
+
+    Args:
+        cuda_stub_dir (str): Path to the directory containing CUDA stubs.
+        missing_libs: Names ending in a version suffix (``libcuda.so.1``) are
+            linked to the unversioned stub (``libcuda.so``); others are skipped.
+
+    Returns:
+        str: Path to the temporary directory where links were created.
+
+    Raises:
+        RuntimeError: If `cuda_stub_dir` is missing or a link cannot be made.
+    """
+    cuda_stub_path = Path(cuda_stub_dir)
+    if not cuda_stub_path.exists():
+        raise RuntimeError(
+            f"CUDA stub directory '{cuda_stub_dir}' does not exist.")
+
+    temp_dir = tempfile.mkdtemp(prefix="cuda_stub_links_")
+    temp_dir_path = Path(temp_dir)
+
+    # Anchored at the end so that only the trailing version suffix is removed
+    # (`libfoo-1.2.so.3` maps to `libfoo-1.2.so`, not `libfoo-1.so`).
+    version_pattern = re.compile(r'(\.\d+)+$')
+    for missing_lib in missing_libs:
+        if not version_pattern.search(missing_lib):
+            continue
+        so = cuda_stub_path / version_pattern.sub('', missing_lib)
+        if not so.exists():
+            warnings.warn(
+                f"Warning: Source library '{so}' does not exist and was skipped."
+            )
+            continue
+        try:
+            (temp_dir_path / missing_lib).symlink_to(so)
+        except OSError as e:
+            # Do not leave a half-populated temporary directory behind.
+            rmtree(temp_dir)
+            raise RuntimeError(
+                f"Failed to create symbolic link for '{missing_lib}' in temporary directory '{temp_dir}': {e}"
+            ) from e
+
+    return str(temp_dir_path)
+
+
+def check_missing_libs(so_prefix: str) -> list[str]:
+    """Lists the libraries that `ldd` reports as "not found".
+
+    `ldd` is run on `{so_prefix}.cpython*.so`; each name is returned once.
+    """
+    ldd = build_run(f"ldd {so_prefix}.cpython*.so",
+                    capture_output=True,
+                    text=True)
+    # First field is the library name; dict.fromkeys keeps first-seen order.
+    names = (entry.split()[0] for entry in ldd.stdout.splitlines()
+             if "not found" in entry)
+    return list(dict.fromkeys(names))
+
+
+def generate_python_stubs_linux(binding_type: str, venv_python: Path,
+                                deep_ep: bool):
+    """Generates stubs for the binding modules in the current directory."""
+    use_nanobind = binding_type == "nanobind"
+    if use_nanobind:
+        build_run(f"\"{venv_python}\" -m pip install nanobind")
+    build_run(f"\"{venv_python}\" -m pip install pybind11-stubgen")
+
+    stub_env = os.environ.copy()
+    cuda_root = stub_env.get("CUDA_HOME") or stub_env.get(
+        "CUDA_PATH") or "/usr/local/cuda"
+    unresolved = check_missing_libs("bindings")
+    cuda_stub_dir = f"{cuda_root}/lib64/stubs"
+
+    # Expose CUDA stubs under the versioned names the bindings fail to resolve.
+    tmp_links = None
+    if unresolved and Path(cuda_stub_dir).exists():
+        tmp_links = create_cuda_stub_links(cuda_stub_dir, unresolved)
+        paths = [tmp_links, cuda_stub_dir, stub_env.get("LD_LIBRARY_PATH")]
+        stub_env["LD_LIBRARY_PATH"] = ":".join(filter(None, paths))
+
+    try:
+        if use_nanobind:
+            build_run(f"\"{venv_python}\" -m nanobind.stubgen -m bindings -O .",
+                      env=stub_env)
+        else:
+            build_run(
+                f"\"{venv_python}\" -m pybind11_stubgen -o . bindings --exit-code",
+                env=stub_env)
+        build_run(
+            f"\"{venv_python}\" -m pybind11_stubgen -o . deep_gemm_cpp_tllm --exit-code",
+            env=stub_env)
+        if deep_ep:
+            build_run(
+                f"\"{venv_python}\" -m pybind11_stubgen -o . deep_ep_cpp_tllm --exit-code",
+                env=stub_env)
+    finally:
+        # The symlink directory is only needed while the stub generators run.
+        if tmp_links:
+            rmtree(tmp_links)
+
+
+def generate_python_stubs_windows(binding_type: str, venv_python: Path,
+                                  pkg_dir: Path, lib_dir: Path):
+    """Generates Windows stubs for `bindings` via a temporary script."""
+    if binding_type == "nanobind":
+        # sys.exit(str) prints the message to stderr and exits with status 1.
+        sys.exit("Windows not yet supported for nanobind stubs")
+    build_run(f"\"{venv_python}\" -m pip install pybind11-stubgen")
+    stubgen = "stubgen.py"
+    stubgen_contents = """
+                        # Loading torch, trt before bindings is required to avoid import errors on windows.
+                        # isort: off
+                        import torch
+                        import tensorrt as trt
+                        # isort: on
+                        import os
+                        import platform
+
+                        from pybind11_stubgen import main
+
+                        if __name__ == "__main__":
+                            # Load dlls from `libs` directory before launching bindings.
+                            if platform.system() == "Windows":
+                                os.add_dll_directory(r\"{lib_dir}\")
+                            main()
+                        """.format(lib_dir=lib_dir)
+    (pkg_dir / stubgen).write_text(dedent(stubgen_contents))
+    build_run(f"\"{venv_python}\" {stubgen} -o . bindings")
+    (pkg_dir / stubgen).unlink()
+
+
 def main(*,
          build_type: str = "Release",
          generator: str = "",
@@ -286,7 +422,7 @@ def main(*,
          dist_dir: Path = None,
          cuda_architectures: str = None,
          job_count: int = None,
-         extra_cmake_vars: List[str] = list(),
+         extra_cmake_vars: Sequence[str] = tuple(),
          extra_make_targets: str = "",
          trt_root: str = '/usr/local/tensorrt',
          nccl_root: str = None,
@@ -361,7 +497,7 @@ def main(*,
 
     if on_windows:
         # Windows does not support multi-device currently.
-        extra_cmake_vars.extend(["ENABLE_MULTI_DEVICE=0"])
+        extra_cmake_vars = list(extra_cmake_vars) + ["ENABLE_MULTI_DEVICE=0"]
 
         # The Ninja CMake generator is used for our Windows build
         # (Easier than MSBuild to make compatible with our Docker image)
@@ -703,81 +839,14 @@ def get_binding_lib(subdirectory, name):
                      dirs_exist_ok=True)
 
         if not skip_stubs:
-            with working_directory(project_dir):
-                if binding_type == "nanobind":
-                    build_run(f"\"{venv_python}\" -m pip install nanobind")
-                else:
-                    build_run(
-                        f"\"{venv_python}\" -m pip install pybind11-stubgen")
             with working_directory(pkg_dir):
                 if on_windows:
-                    if binding_type == "nanobind":
-                        print("Windows not yet supported for nanobind stubs")
-                        exit(1)
-                    else:
-                        stubgen = "stubgen.py"
-                        stubgen_contents = """
-                        # Loading torch, trt before bindings is required to avoid import errors on windows.
-                        # isort: off
-                        import torch
-                        import tensorrt as trt
-                        # isort: on
-                        import os
-                        import platform
-
-                        from pybind11_stubgen import main
-
-                        if __name__ == "__main__":
-                            # Load dlls from `libs` directory before launching bindings.
-                            if platform.system() == "Windows":
-                                os.add_dll_directory(r\"{lib_dir}\")
-                            main()
-                        """.format(lib_dir=lib_dir)
-                        (pkg_dir / stubgen).write_text(dedent(stubgen_contents))
-                        build_run(f"\"{venv_python}\" {stubgen} -o . bindings")
-                        (pkg_dir / stubgen).unlink()
-                else:
-                    env_ld = os.environ.copy()
-
-                    new_library_path = "/usr/local/cuda/compat:/usr/local/cuda/compat/lib:/usr/local/cuda/compat/lib.real"
-                    if 'LD_LIBRARY_PATH' in env_ld:
-                        new_library_path += f":{env_ld['LD_LIBRARY_PATH']}"
-
-                    result = build_run("find /usr -name *libnvidia-ml.so*",
-                                       capture_output=True,
-                                       text=True)
-                    assert result.returncode == 0, f"Failed to run find *libnvidia-ml.so*: {result.stderr}"
-
-                    # Build containers only contain stub version of libnvidia-ml.so and not the real version.
-                    # If real version not in system, we need to create symbolic link to stub version to prevent import errors.
-                    if "libnvidia-ml.so.1" not in result.stdout:
-                        if "libnvidia-ml.so" in result.stdout:
-                            line = result.stdout.splitlines()[0]
-                            path = os.path.dirname(line)
-                            new_library_path += f":{path}"
-                            build_run(f"ln -s {line} {path}/libnvidia-ml.so.1")
-                        else:
-                            print(
-                                f"Failed to find libnvidia-ml.so: {result.stderr}",
-                                file=sys.stderr)
-                            exit(1)
-
-                    env_ld["LD_LIBRARY_PATH"] = new_library_path
-                    if binding_type == "nanobind":
-                        build_run(
-                            f"\"{venv_python}\" -m nanobind.stubgen -m bindings -O .",
-                            env=env_ld)
-                    else:
-                        build_run(
-                            f"\"{venv_python}\" -m pybind11_stubgen -o . bindings --exit-code",
-                            env=env_ld)
-                        if deep_ep_cuda_architectures:
-                            build_run(
-                                f"\"{venv_python}\" -m pybind11_stubgen -o . deep_ep_cpp_tllm --exit-code",
-                                env=env_ld)
-                        build_run(
-                            f"\"{venv_python}\" -m pybind11_stubgen -o . deep_gemm_cpp_tllm --exit-code",
-                            env=env_ld)
+                    generate_python_stubs_windows(binding_type, venv_python,
+                                                  pkg_dir, lib_dir)
+                else:  # on linux
+                    generate_python_stubs_linux(
+                        binding_type, venv_python,
+                        bool(deep_ep_cuda_architectures))
 
     if not skip_building_wheel:
         if dist_dir is None:

From d1d17dbebab05b11e74ac373ec7c55b3326c16b4 Mon Sep 17 00:00:00 2001
From: Yanchao Lu <yanchaol@nvidia.com>
Date: Tue, 19 Aug 2025 01:35:30 +0800
Subject: [PATCH 07/20] [None][infra] Cherry-pick #6836 from main branch and
 improve SSH connection (#6971) (#7005)

Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
Co-authored-by: Zhanrui Sun <184402041+ZhanruiSunCh@users.noreply.github.com>
---
 jenkins/L0_Test.groovy | 28 +++++++++++++++-------------
 1 file changed, 15 insertions(+), 13 deletions(-)

diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index 3ed53788815..c1025efbc7e 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -99,6 +99,8 @@ MODEL_CACHE_DIR="/scratch.trt_llm_data/llm-models"
 ENABLE_NGC_DEVEL_IMAGE_TEST = params.enableNgcDevelImageTest ?: false
 ENABLE_NGC_RELEASE_IMAGE_TEST = params.enableNgcReleaseImageTest ?: false
 
+COMMON_SSH_OPTIONS = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null"
+
 def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String stageName){
     withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
         def remote = [
@@ -113,7 +115,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
         pipeline.stage('Submit Test Results') {
             sh "mkdir -p ${stageName}"
             def resultsFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/results/results.xml"
-            def downloadResultCmd = "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/"
+            def downloadResultCmd = "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/"
             def downloadSucceed = sh(script: downloadResultCmd, returnStatus: true) == 0
             if (downloadSucceed) {
                 sh "ls ${stageName}"
@@ -239,7 +241,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
 
                 Utils.exec(pipeline, script: "chmod +x ${jenkinsSetupPath}", returnStdout: true)
 
-                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${jenkinsSetupPath} ${remote.user}@${remote.host}:~/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh",)
+                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${jenkinsSetupPath} ${remote.user}@${remote.host}:~/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh",)
 
                 Utils.exec(
                     pipeline,
@@ -327,7 +329,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
 
             stage('Prepare Testing') {
                 // Create Job Workspace folder in Frontend Node
-                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' ssh -oStrictHostKeyChecking=no ${remote.user}@${remote.host} 'mkdir ${jobWorkspace}'",)
+                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' ssh ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host} 'mkdir -p ${jobWorkspace}'",)
 
                 // Download and Unzip Tar File
                 trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && wget -nv ${llmTarfile}")
@@ -336,11 +338,11 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
                 // Upload slurm_run_sh to Frontend node
                 def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"
                 Utils.exec(pipeline, script: "chmod +x ${scriptRunLocalPath}", returnStdout: true)
-                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${scriptRunLocalPath} ${remote.user}@${remote.host}:${scriptRunNode}",)
+                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptRunLocalPath} ${remote.user}@${remote.host}:${scriptRunNode}",)
 
                 // Upload waives.txt to Frontend node
                 def waivesListLocalPath = "${llmSrcLocal}/tests/integration/test_lists/waives.txt"
-                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${waivesListLocalPath} ${remote.user}@${remote.host}:${waivesListPathNode}",)
+                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${waivesListLocalPath} ${remote.user}@${remote.host}:${waivesListPathNode}",)
 
                 // Generate Test List and Upload to Frontend Node
                 def makoArgs = getMakoArgsFromStageName(stageName, true)
@@ -349,7 +351,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
                 // if the line cannot be split by "=", just ignore that line.
                 def makoOptsJson = transformMakoArgsToJson(["Mako options:"] + makoArgs)
                 def testListPath = renderTestDB(testList, llmSrcLocal, stageName, makoOptsJson)
-                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${testListPath} ${remote.user}@${remote.host}:${testListPathNode}",)
+                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${testListPath} ${remote.user}@${remote.host}:${testListPathNode}",)
 
                 // Generate Multi Node Job Launch Script
                 def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#")
@@ -393,7 +395,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
                 """.stripIndent()
                 pipeline.writeFile(file: scriptLaunchDestPath, text: scriptContent)
                 Utils.exec(pipeline, script: "chmod +x ${scriptLaunchDestPath}", returnStdout: true)
-                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${scriptLaunchDestPath} ${remote.user}@${remote.host}:${scriptLaunch}",)
+                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${scriptLaunchDestPath} ${remote.user}@${remote.host}:${scriptLaunch}",)
             }
             stage('Run Test') {
                 def scriptLaunch = "${jobWorkspace}/slurm_launch.sh"
@@ -1089,7 +1091,7 @@ def getSSHConnectionPorts(portConfigFile, stageName)
         usernamePassword(credentialsId: 'tensorrt_llm_infra_debug_vm_01_credentials', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD'),
         string(credentialsId: 'DEBUG_HOST_NAME', variable: 'HOST_NAME')
         ]) {
-        portUsage = sh(script: "ssh -v ${USERNAME}@${HOST_NAME} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null 'netstat -tuln'",returnStdout: true)
+        portUsage = sh(script: "ssh -v ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'netstat -tuln'", returnStdout: true)
     }
     echo "Port Usage: ${portUsage}"
 
@@ -1248,7 +1250,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
     def llmRootConfig = "${LLM_ROOT}${config}"
     sh "mkdir ${llmRootConfig}"
 
-    def llmPath = sh (script: "realpath ${llmRootConfig}",returnStdout: true).trim()
+    def llmPath = sh (script: "realpath ${llmRootConfig}", returnStdout: true).trim()
     def llmSrc = "${llmPath}/TensorRT-LLM/src"
     echoNodeAndGpuInfo(pipeline, stageName)
 
@@ -1362,9 +1364,9 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
                 usernamePassword(credentialsId: 'tensorrt_llm_infra_debug_vm_01_credentials', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD'),
                 string(credentialsId: 'DEBUG_HOST_NAME', variable: 'HOST_NAME')
                 ]) {
-                sh "sshpass -p ${PASSWORD} -v ssh ${USERNAME}@${HOST_NAME} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null 'cat >> ~/.ssh/authorized_keys' < ~/.ssh/id_rsa.pub"
-                sh "ssh -v ${USERNAME}@${HOST_NAME} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null 'echo \"\" > ~/.ssh/known_hosts && cat ~/.ssh/id_rsa.pub' >> ~/.ssh/authorized_keys"
-                sh "ssh -v ${USERNAME}@${HOST_NAME} -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null 'cat ~/.ssh/ports_config.txt' >> ${portConfigFilePath}"
+                sh "sshpass -p ${PASSWORD} -v ssh ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'cat >> ~/.ssh/authorized_keys' < ~/.ssh/id_rsa.pub"
+                sh "ssh -v ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'echo \"\" > ~/.ssh/known_hosts && cat ~/.ssh/id_rsa.pub' >> ~/.ssh/authorized_keys"
+                sh "ssh -v ${USERNAME}@${HOST_NAME} ${COMMON_SSH_OPTIONS} 'cat ~/.ssh/ports_config.txt' >> ${portConfigFilePath}"
 
                 def (int userPort, int monitorPort) = getSSHConnectionPorts(portConfigFilePath, stageName)
                 if (userPort == 0) {
@@ -1373,7 +1375,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
                     return
                 }
 
-                sh "ssh -f -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -L 1111:127.0.0.1:${monitorPort} -R ${monitorPort}:127.0.0.1:1112 -NR ${userPort}:localhost:22 ${USERNAME}@${HOST_NAME}"
+                sh "ssh -f ${COMMON_SSH_OPTIONS} -L 1111:127.0.0.1:${monitorPort} -R ${monitorPort}:127.0.0.1:1112 -NR ${userPort}:localhost:22 ${USERNAME}@${HOST_NAME}"
                 sh "autossh -fNR ${userPort}:localhost:22 ${USERNAME}@${HOST_NAME}"
                 sh "ps aux | grep ssh"
                 try {

From d16af87d037f30f7fb487777a602785c1d60ec87 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20C=C3=A1mpora?=
 <961215+dcampora@users.noreply.github.com>
Date: Tue, 19 Aug 2025 00:10:05 +0200
Subject: [PATCH 08/20] [TRTLLM-7158][feat] Introduce sampler options in trtllm
 bench (#6855)

Signed-off-by: Daniel Campora <961215+dcampora@users.noreply.github.com>
---
 tensorrt_llm/bench/benchmark/low_latency.py   | 25 +++++++++----
 tensorrt_llm/bench/benchmark/throughput.py    | 24 ++++++++++---
 tensorrt_llm/bench/benchmark/utils/general.py | 36 +++++++++++++++++++
 3 files changed, 73 insertions(+), 12 deletions(-)

diff --git a/tensorrt_llm/bench/benchmark/low_latency.py b/tensorrt_llm/bench/benchmark/low_latency.py
index 2ee3e7ea5ce..ad200af9c69 100644
--- a/tensorrt_llm/bench/benchmark/low_latency.py
+++ b/tensorrt_llm/bench/benchmark/low_latency.py
@@ -25,7 +25,7 @@
 from tensorrt_llm.models.modeling_utils import SpeculativeDecodingMode
 
 # isort: off
-from tensorrt_llm.bench.benchmark.utils.general import get_settings_from_engine, get_settings, ALL_SUPPORTED_BACKENDS
+from tensorrt_llm.bench.benchmark.utils.general import get_settings_from_engine, get_settings, update_sampler_args_with_extra_options, ALL_SUPPORTED_BACKENDS
 # isort: on
 from tensorrt_llm.bench.utils.data import (create_dataset_from_stream,
                                            initialize_tokenizer,
@@ -135,6 +135,13 @@
     default=1,
     help="Number of search beams.",
 )
+@optgroup.option("--sampler_options",
+                 type=click.Path(exists=True,
+                                 readable=True,
+                                 path_type=Path,
+                                 resolve_path=True),
+                 default=None,
+                 help="Path to a YAML file that sets sampler options.")
 @optgroup.option(
     "--concurrency",
     type=int,
@@ -326,12 +333,16 @@ def latency_command(
         eos_id = tokenizer.eos_token_id if not ignore_eos else -1
         pad_id = tokenizer.pad_token_id if not ignore_eos else -1
 
-        sampling_params = SamplingParams(
-            end_id=eos_id,
-            pad_id=pad_id,
-            n=beam_width,
-            use_beam_search=beam_width > 1,
-        )
+        sampler_args = {
+            "end_id": eos_id,
+            "pad_id": pad_id,
+            "n": beam_width,
+            "use_beam_search": beam_width > 1
+        }
+        sampler_args = update_sampler_args_with_extra_options(
+            sampler_args, params.pop("sampler_options"))
+        sampling_params = SamplingParams(**sampler_args)
+
         post_proc_params = None  # No detokenization
 
         # Perform warmup if requested.
diff --git a/tensorrt_llm/bench/benchmark/throughput.py b/tensorrt_llm/bench/benchmark/throughput.py
index 18d7980ea4d..57c86ac0f34 100755
--- a/tensorrt_llm/bench/benchmark/throughput.py
+++ b/tensorrt_llm/bench/benchmark/throughput.py
@@ -22,7 +22,8 @@
 from tensorrt_llm import LLM as PyTorchLLM
 from tensorrt_llm._tensorrt_engine import LLM
 from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM
-from tensorrt_llm.bench.benchmark.utils.general import generate_warmup_dataset
+from tensorrt_llm.bench.benchmark.utils.general import (
+    generate_warmup_dataset, update_sampler_args_with_extra_options)
 from tensorrt_llm.bench.dataclasses.configuration import RuntimeConfig
 from tensorrt_llm.bench.dataclasses.general import BenchmarkEnvironment
 from tensorrt_llm.bench.dataclasses.reporting import ReportUtility
@@ -67,6 +68,13 @@
     help=
     "Path to a YAML file that overwrites the parameters specified by trtllm-bench."
 )
+@optgroup.option("--sampler_options",
+                 type=click.Path(exists=True,
+                                 readable=True,
+                                 path_type=Path,
+                                 resolve_path=True),
+                 default=None,
+                 help="Path to a YAML file that sets sampler options.")
 @optgroup.option(
     "--max_batch_size",
     type=int,
@@ -455,10 +463,16 @@ def ignore_trt_only_args(kwargs: dict):
         else:
             llm = LLM(**kwargs)
 
-        sampling_params = SamplingParams(end_id=eos_id,
-                                         pad_id=eos_id,
-                                         n=beam_width,
-                                         use_beam_search=beam_width > 1)
+        sampler_args = {
+            "end_id": eos_id,
+            "pad_id": eos_id,
+            "n": beam_width,
+            "use_beam_search": beam_width > 1
+        }
+        sampler_args = update_sampler_args_with_extra_options(
+            sampler_args, params.pop("sampler_options"))
+        sampling_params = SamplingParams(**sampler_args)
+
         post_proc_params = None  # No detokenization
 
         # Perform warmup if requested.
diff --git a/tensorrt_llm/bench/benchmark/utils/general.py b/tensorrt_llm/bench/benchmark/utils/general.py
index 45a7a32c1ba..ff3cd933ce1 100755
--- a/tensorrt_llm/bench/benchmark/utils/general.py
+++ b/tensorrt_llm/bench/benchmark/utils/general.py
@@ -199,3 +199,39 @@ def generate_warmup_dataset(requests, steps) -> List[InferenceRequest]:
     warm_up_dataset = choices(requests, k=steps)
     shuffle(warm_up_dataset)
     return warm_up_dataset
+
+
+def update_sampler_args_with_extra_options(sampler_args: Dict,
+                                           sampler_options: str) -> Dict:
+    """Merge sampler options loaded from a YAML file into ``sampler_args``.
+
+    Args:
+        sampler_args: Base sampler arguments dictionary.
+        sampler_options: Path to the YAML options file, or None to skip.
+
+    Returns:
+        Dict: Merged sampler arguments; YAML values override the base ones.
+
+    Raises:
+        FileNotFoundError: If the YAML file doesn't exist.
+        yaml.YAMLError: If the YAML file is malformed.
+        TypeError: If the YAML content is not a dictionary.
+    """
+    if sampler_options is None:
+        return sampler_args
+    try:
+        # Read as UTF-8 rather than the platform's locale-dependent default.
+        with open(sampler_options, 'r', encoding='utf-8') as f:
+            sampler_options_dict = yaml.safe_load(f)
+    except FileNotFoundError as e:
+        raise FileNotFoundError(
+            f"Sampler options file not found: {sampler_options}") from e
+    except yaml.YAMLError as e:
+        raise yaml.YAMLError(
+            f"Invalid YAML in sampler options file {sampler_options}: {e}"
+        ) from e
+    if not isinstance(sampler_options_dict, dict):
+        raise TypeError(
+            f"Sampler options file {sampler_options} must contain a dictionary, "
+            f"got {type(sampler_options_dict)}")
+    return sampler_args | sampler_options_dict

From e76e5c640fe97dab94f50f0a02af5e9952d8aca7 Mon Sep 17 00:00:00 2001
From: Leslie Fang <leslief@nvidia.com>
Date: Tue, 19 Aug 2025 07:42:52 +0800
Subject: [PATCH 09/20] [None][infra] Enable accuracy test for mtp and chunked
 prefill (#6314)

Signed-off-by: leslie-fang25 <leslief@nvidia.com>
---
 .../features/feature_combination_matrix.md    |  2 +-
 tests/integration/defs/.test_durations        | 20 ++++++-------
 .../defs/accuracy/test_llm_api_pytorch.py     |  7 ++++-
 .../test_lists/qa/llm_function_full.txt       |  2 +-
 .../test_lists/qa/llm_function_rtx6kd.txt     | 20 ++++++-------
 .../test_lists/qa/llm_function_sanity.txt     |  2 +-
 .../test_lists/test-db/l0_b200.yml            |  5 ++--
 .../test_lists/test-db/l0_gb200.yml           |  3 ++
 .../test_lists/test-db/l0_h100.yml            | 28 +++++++++----------
 9 files changed, 49 insertions(+), 40 deletions(-)

diff --git a/docs/source/torch/features/feature_combination_matrix.md b/docs/source/torch/features/feature_combination_matrix.md
index eee4ca7e155..f39a800fcdb 100644
--- a/docs/source/torch/features/feature_combination_matrix.md
+++ b/docs/source/torch/features/feature_combination_matrix.md
@@ -7,7 +7,7 @@
 | Attention Data Parallelism | Yes               | Yes        | ---                        |                       |                 |          |                           |                           |               |                  |                |                        |                       |                 |
 | Disaggregated Serving      | Yes               | Yes        | Yes                        | ---                   |                 |          |                           |                           |               |                  |                |                        |                       |                 |
 | Chunked Prefill            | Yes               | Yes        | Yes                        | Untested              | ---             |          |                           |                           |               |                  |                |                        |                       |                 |
-| MTP                        | Yes               | Yes        | Yes                        | Yes                   | Untested        | ---      |                           |                           |               |                  |                |                        |                       |                 |
+| MTP                        | Yes               | Yes        | Yes                        | Yes                   | Yes             | ---      |                           |                           |               |                  |                |                        |                       |                 |
 | EAGLE-3(One Model Engine)  | Yes               | Yes        | Yes                        | Yes                   | Yes             | No       | ---                       |                           |               |                  |                |                        |                       |                 |
 | EAGLE-3(Two Model Engine)  | NO                | Yes        | Yes                        | Yes                   | Yes             | No       | No                        | ---                       |               |                  |                |                        |                       |                 |
 | Torch Sampler              | Yes               | Yes        | Yes                        | Yes                   | Yes             | Yes      | Yes                       | Yes                       | ---           |                  |                |                        |                       |                 |
diff --git a/tests/integration/defs/.test_durations b/tests/integration/defs/.test_durations
index a2ca6317b6c..23a7d075d94 100644
--- a/tests/integration/defs/.test_durations
+++ b/tests/integration/defs/.test_durations
@@ -281,11 +281,11 @@
    "disaggregated/test_disaggregated.py::test_disaggregated_overlap[TinyLlama-1.1B-Chat-v1.0]": 98.97588296607137,
    "disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]": 67.9668476767838,
    "test_unittests.py::test_unittests_v2[unittest/_torch/test_attention_mla.py]": 26.32902159006335,
-   "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]": 591.2785023800097,
-   "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]": 306.84709841990843,
-   "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]": 220.57452515885234,
-   "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]": 202.22269394202158,
-   "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False]": 165.08514453098178,
+   "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]": 591.2785023800097,
+   "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]": 306.84709841990843,
+   "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]": 220.57452515885234,
+   "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]": 202.22269394202158,
+   "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]": 165.08514453098178,
    "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]": 252.70569713797886,
    "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False]": 85.24235329206567,
    "test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]": 81.43792725296225,
@@ -305,11 +305,11 @@
    "test_e2e.py::test_llmapi_load_engine_from_build_command[llama-llama-models/llama-7b-hf]": 200.82293555140495,
    "test_unittests.py::test_unittests_v2[unittest/trt/model/test_llama.py]": 1494.1103300452232,
    "test_unittests.py::test_unittests_v2[unittest/trt/attention/test_gpt_attention.py -k \"partition0\"]": 77.31474154582247,
-   "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False]": 295.3527018489549,
-   "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]": 143.84012729604729,
-   "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False]": 107.58471493399702,
-   "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]": 205.7252635700861,
-   "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False]": 113.82226522010751,
+   "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]": 295.3527018489549,
+   "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]": 143.84012729604729,
+   "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]": 107.58471493399702,
+   "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]": 205.7252635700861,
+   "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]": 113.82226522010751,
    "examples/test_llama.py::test_llm_llama_1gpu[llama-3.1-8b-instruct-hf-fp8-enable_fp8-float16-summarization-nb:1]": 853.2910006027669,
    "test_e2e.py::test_openai_chat_example": 876.1966922096908,
    "test_e2e.py::test_trtllm_serve_example": 200.09309104084969,
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 649e826207a..89483fd2620 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -930,6 +930,10 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     MODEL_PATH = f"{llm_models_root()}/DeepSeek-V3-Lite/bf16"
 
     @pytest.mark.skip_less_device_memory(60000)
+    # Chunked Prefill for MLA can only be enabled on SM100
+    @parametrize_with_ids(
+        "enable_chunked_prefill",
+        [False, pytest.param(True, marks=skip_pre_blackwell)])
     @parametrize_with_ids("torch_compile", [False, True])
     @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
                           [(False, False, False), (True, False, False),
@@ -939,7 +943,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     @parametrize_with_ids("mtp_nextn",
                           [0, pytest.param(2, marks=skip_pre_hopper)])
     def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
-                      overlap_scheduler, torch_compile):
+                      overlap_scheduler, torch_compile, enable_chunked_prefill):
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
@@ -955,6 +959,7 @@ def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
             mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
         with LLM(self.MODEL_PATH,
                  kv_cache_config=kv_cache_config,
+                 enable_chunked_prefill=enable_chunked_prefill,
                  **pytorch_config,
                  enable_attention_dp=attention_dp,
                  speculative_config=mtp_config) as llm:
diff --git a/tests/integration/test_lists/qa/llm_function_full.txt b/tests/integration/test_lists/qa/llm_function_full.txt
index ab43ce124f9..1859762fc17 100644
--- a/tests/integration/test_lists/qa/llm_function_full.txt
+++ b/tests/integration/test_lists/qa/llm_function_full.txt
@@ -481,7 +481,7 @@ accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8_chunked_pref
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4_chunked_prefill[tp4ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=disable-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
diff --git a/tests/integration/test_lists/qa/llm_function_rtx6kd.txt b/tests/integration/test_lists/qa/llm_function_rtx6kd.txt
index fbabac6b84f..b3d14c393bb 100644
--- a/tests/integration/test_lists/qa/llm_function_rtx6kd.txt
+++ b/tests/integration/test_lists/qa/llm_function_rtx6kd.txt
@@ -1,16 +1,16 @@
 accuracy/test_cli_flow.py::TestMixtral8x7B::test_fp8_tp2pp2
 accuracy/test_cli_flow.py::TestMixtral8x7B::test_fp8_tp2pp2_manage_weights
 accuracy/test_cli_flow.py::TestMixtral8x7B::test_nvfp4_prequantized
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False]
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False]
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
diff --git a/tests/integration/test_lists/qa/llm_function_sanity.txt b/tests/integration/test_lists/qa/llm_function_sanity.txt
index aeaa1ba573b..1943a015cf6 100644
--- a/tests/integration/test_lists/qa/llm_function_sanity.txt
+++ b/tests/integration/test_lists/qa/llm_function_sanity.txt
@@ -36,7 +36,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp4]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput]
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=disable-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml
index 26b4b2a0a88..cb36129a147 100644
--- a/tests/integration/test_lists/test-db/l0_b200.yml
+++ b/tests/integration/test_lists/test-db/l0_b200.yml
@@ -17,8 +17,9 @@ l0_b200:
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4_streaming[stream_interval_4]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4_streaming[stream_interval_64]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
diff --git a/tests/integration/test_lists/test-db/l0_gb200.yml b/tests/integration/test_lists/test-db/l0_gb200.yml
index 6e1f6219472..ac39fbdc88c 100644
--- a/tests/integration/test_lists/test-db/l0_gb200.yml
+++ b/tests/integration/test_lists/test-db/l0_gb200.yml
@@ -20,6 +20,9 @@ l0_gb200:
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=False-attn_backend=TRTLLM-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=False-attn_backend=FLASHINFER-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=True-attn_backend=TRTLLM-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml
index 481c5f709ff..64f6498d095 100644
--- a/tests/integration/test_lists/test-db/l0_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_h100.yml
@@ -178,20 +178,20 @@ l0_h100:
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8[fp8kv=False-attn_backend=FLASHINFER-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8[fp8kv=True-attn_backend=FLASHINFER-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8[fp8kv=True-attn_backend=FLASHINFER-torch_compile=True]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False]
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True-enable_chunked_prefill=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=True-enable_chunked_prefill=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True-enable_chunked_prefill=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True-enable_chunked_prefill=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=disable-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=disable-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=disable-fp8kv=True-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]

From 97ba0eb879750ee67bd55920ac511bc66e458adc Mon Sep 17 00:00:00 2001
From: Fridah-nv <201670829+Fridah-nv@users.noreply.github.com>
Date: Mon, 18 Aug 2025 17:43:28 -0700
Subject: [PATCH 10/20] [None][autodeploy] Doc: fix link path in trtllm bench
 doc (#7007)

Signed-off-by: Frida Hou <201670829+Fridah-nv@users.noreply.github.com>
---
 .../auto_deploy/advanced/benchmarking_with_trtllm_bench.md      | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/torch/auto_deploy/advanced/benchmarking_with_trtllm_bench.md b/docs/source/torch/auto_deploy/advanced/benchmarking_with_trtllm_bench.md
index 10515500797..6032aacd4fa 100644
--- a/docs/source/torch/auto_deploy/advanced/benchmarking_with_trtllm_bench.md
+++ b/docs/source/torch/auto_deploy/advanced/benchmarking_with_trtllm_bench.md
@@ -4,7 +4,7 @@ AutoDeploy is integrated with the `trtllm-bench` performance benchmarking utilit
 
 ## Getting Started
 
-Before benchmarking with AutoDeploy, review the [TensorRT-LLM benchmarking guide](../../performance/perf-benchmarking.md#running-with-the-pytorch-workflow) to familiarize yourself with the standard trtllm-bench workflow and best practices.
+Before benchmarking with AutoDeploy, review the [TensorRT-LLM benchmarking guide](../../../performance/perf-benchmarking.md#running-with-the-pytorch-workflow) to familiarize yourself with the standard trtllm-bench workflow and best practices.
 
 ## Basic Usage
 

From dabebb2c7a6994821e045bd6713c092a6f26a0d5 Mon Sep 17 00:00:00 2001
From: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com>
Date: Tue, 19 Aug 2025 09:42:35 +0800
Subject: [PATCH 11/20] [https://nvbugs/5371480][fix] Enable test_phi3_small_8k
 (#6938)

Signed-off-by: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com>
---
 tests/unittest/llmapi/test_llm_models.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/unittest/llmapi/test_llm_models.py b/tests/unittest/llmapi/test_llm_models.py
index 737511d2156..4fbc00ddf76 100644
--- a/tests/unittest/llmapi/test_llm_models.py
+++ b/tests/unittest/llmapi/test_llm_models.py
@@ -110,7 +110,6 @@ def test_llm_phi_3_mini_4k():
         sampling_params=phi3_mini_4k_sampling_params)
 
 
-@pytest.mark.skip(reason="https://nvbugs/5371480")
 @force_ampere
 def test_llm_phi_3_small_8k():
     phi_requirement_path = os.path.join(

From 71e28eab36f3221f2bda0bfcf5be2045504150d5 Mon Sep 17 00:00:00 2001
From: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com>
Date: Tue, 19 Aug 2025 09:58:22 +0800
Subject: [PATCH 12/20] [TRTLLM-7014][chore] Add accuracy test for ctx and gen
 workers with different models (#6741)

Signed-off-by: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com>
---
 .../defs/accuracy/references/gsm8k.yaml       |  1 +
 .../accuracy/test_disaggregated_serving.py    | 76 ++++++++++++++-----
 .../test_lists/qa/llm_function_sanity.txt     |  1 +
 .../test_lists/test-db/l0_dgx_b200.yml        |  1 +
 4 files changed, 62 insertions(+), 17 deletions(-)

diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml
index 29458d3bf49..26de82cbc09 100644
--- a/tests/integration/defs/accuracy/references/gsm8k.yaml
+++ b/tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -80,6 +80,7 @@ Qwen3/Qwen3-8B:
     kv_cache_quant_algo: FP8
     accuracy: 87.1114
 Qwen3/Qwen3-30B-A3B:
+  - accuracy: 83.43
   - quant_algo: FP8_BLOCK_SCALES
     accuracy: 84.36
   - quant_algo: FP8
diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py
index 051c5401a06..51a572ce493 100644
--- a/tests/integration/defs/accuracy/test_disaggregated_serving.py
+++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py
@@ -23,7 +23,7 @@
 from tensorrt_llm.llmapi.tokenizer import load_hf_tokenizer
 
 from ..conftest import (get_device_count, llm_models_root, parametrize_with_ids,
-                        skip_pre_hopper)
+                        skip_pre_blackwell, skip_pre_hopper)
 from ..trt_test_alternative import popen
 from .accuracy_core import (GSM8K, MMLU, JsonModeEval,
                             LlmapiAccuracyTestHarness, get_accuracy_task)
@@ -71,7 +71,9 @@ def launch_disaggregated_llm(disaggregated_server_config: Dict[str, Any],
                              ctx_server_config: Dict[str, Any],
                              gen_server_config: Dict[str, Any],
                              model_name: str,
-                             tensor_parallel_size: int = 1):
+                             tensor_parallel_size: int = 1,
+                             ctx_model: str = None,
+                             gen_model: str = None):
     temp_dir = tempfile.TemporaryDirectory()
     disaggregated_serving_config_path = os.path.join(
         temp_dir.name, "disaggregated_serving_config.yaml")
@@ -97,9 +99,19 @@ def launch_disaggregated_llm(disaggregated_server_config: Dict[str, Any],
 
     trtllm_serve_path = "trtllm-serve"
     # Common arguments for both servers
-    common_args = [
+    ctx_model = ctx_model or model_name
+    gen_model = gen_model or model_name
+    ctx_args = [
         trtllm_serve_path,
-        model_name,
+        ctx_model,
+        "--host",
+        "localhost",
+        "--backend",
+        "pytorch",
+    ]
+    gen_args = [
+        trtllm_serve_path,
+        gen_model,
         "--host",
         "localhost",
         "--backend",
@@ -125,11 +137,11 @@ def launch_disaggregated_llm(disaggregated_server_config: Dict[str, Any],
     env_gen["TRTLLM_USE_UCX_KVCACHE"] = "1"
     env_gen["CUDA_VISIBLE_DEVICES"] = ",".join(
         map(str, range(ctx_total_gpus, ctx_total_gpus + gen_total_gpus)))
-    ctx_server_args = common_args + [
+    ctx_server_args = ctx_args + [
         "--port", "8001", "--extra_llm_api_options", ctx_server_config_path,
         f"--tp_size={ctx_tp}", f"--pp_size={ctx_pp}"
     ]
-    gen_server_args = common_args + [
+    gen_server_args = gen_args + [
         "--port", "8002", "--extra_llm_api_options", gen_server_config_path,
         f"--tp_size={gen_tp}", f"--pp_size={gen_pp}"
     ]
@@ -226,17 +238,21 @@ def generate_async(prompt: str,
             disaggregated_server.wait()
 
 
-def run_parallel_test(model_name: str, model_path: str, ctx_pp: int,
-                      ctx_tp: int, gen_pp: int, gen_tp: int,
-                      test_set: LlmapiAccuracyTestHarness):
+def run_parallel_test(model_name: str,
+                      model_path: str,
+                      ctx_pp: int,
+                      ctx_tp: int,
+                      gen_pp: int,
+                      gen_tp: int,
+                      test_sets: List[LlmapiAccuracyTestHarness],
+                      ctx_model: str = None,
+                      gen_model: str = None):
     if ctx_tp * ctx_pp + gen_tp * gen_pp > get_device_count():
         pytest.fail(
             f"Not enough devices for ctx_pp={ctx_pp}+ctx_tp={ctx_tp} and gen_pp={gen_pp}+gen_tp={gen_tp} test"
         )
-
     kv_cache_config = {
         "free_gpu_memory_fraction": 0.5,
-        "enable_block_reuse": False
     }
     ctx_server_config = {
         "pipeline_parallel_size": ctx_pp,
@@ -270,10 +286,14 @@ def run_parallel_test(model_name: str, model_path: str, ctx_pp: int,
         }
     }
     with launch_disaggregated_llm(disaggregated_server_config,
-                                  ctx_server_config, gen_server_config,
-                                  model_path) as llm:
-        task = test_set(model_name)
-        task.evaluate(llm)
+                                  ctx_server_config,
+                                  gen_server_config,
+                                  model_path,
+                                  ctx_model=ctx_model,
+                                  gen_model=gen_model) as llm:
+        for test_set in test_sets:
+            task = test_set(model_name)
+            task.evaluate(llm)
 
 
 @pytest.mark.timeout(3600)
@@ -512,7 +532,7 @@ def test_tp_pp_symmetric(self, tp, pp, testset):
         if tp * pp * 2 > get_device_count():
             pytest.skip(f"Not enough devices for tp={tp}*pp={pp} test")
         return run_parallel_test(self.MODEL_NAME, self.MODEL_PATH, pp, tp, pp,
-                                 tp, get_accuracy_task(testset))
+                                 tp, [get_accuracy_task(testset)])
 
     @parametrize_with_ids("ctx_pp", [2, 4])
     @parametrize_with_ids("gen_tp", [1, 2])
@@ -522,7 +542,7 @@ def test_ctx_pp_gen_tp_asymmetric(self, ctx_pp, gen_tp, testset):
             pytest.skip(
                 f"Not enough devices for ctx_pp={ctx_pp}*gen_tp={gen_tp} test")
         return run_parallel_test(self.MODEL_NAME, self.MODEL_PATH, ctx_pp, 1, 1,
-                                 gen_tp, get_accuracy_task(testset))
+                                 gen_tp, [get_accuracy_task(testset)])
 
 
 @pytest.mark.skip_less_device_memory(140000)
@@ -776,3 +796,25 @@ def test_auto_dtype(self, overlap_scheduler):
             task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
+
+
+@skip_pre_blackwell
+@pytest.mark.timeout(3600)
+class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness):
+    FP4_MODEL = f"{llm_models_root()}/Qwen3/saved_models_Qwen3-30B-A3B_nvfp4_hf"
+    FP8_MODEL = f"{llm_models_root()}/Qwen3/saved_models_Qwen3-30B-A3B_fp8_hf"
+
+    @pytest.mark.skip_less_device(4)
+    @pytest.mark.parametrize("ctx_pp,gen_tp", [(2, 2)], ids=["ctxpp2gentp2"])
+    def test_mixed_ctx_gen_model(self, ctx_pp, gen_tp):
+        ctx_model = self.FP4_MODEL
+        gen_model = self.FP8_MODEL
+        return run_parallel_test("Qwen3/Qwen3-30B-A3B",
+                                 ctx_model,
+                                 ctx_pp=ctx_pp,
+                                 ctx_tp=1,
+                                 gen_pp=1,
+                                 gen_tp=gen_tp,
+                                 test_sets=[GSM8K, MMLU],
+                                 ctx_model=ctx_model,
+                                 gen_model=gen_model)
diff --git a/tests/integration/test_lists/qa/llm_function_sanity.txt b/tests/integration/test_lists/qa/llm_function_sanity.txt
index 1943a015cf6..8dc118d991c 100644
--- a/tests/integration/test_lists/qa/llm_function_sanity.txt
+++ b/tests/integration/test_lists/qa/llm_function_sanity.txt
@@ -25,6 +25,7 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=4]
 accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[False]
 accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[True]
+accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=TRTLLM]
 accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype
diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
index 2c04beb634a..fb3f518a686 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
@@ -70,6 +70,7 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True]
   - accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True]
+  - accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-CUTLASS]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-TRTLLM]

From a15af879ec62541bef8690e1abc4d0aa1135075c Mon Sep 17 00:00:00 2001
From: Yi Zhang <187001205+yizhang-nv@users.noreply.github.com>
Date: Tue, 19 Aug 2025 09:58:44 +0800
Subject: [PATCH 13/20] [None][refactor] Refactor Torch Compile Backend,
 MoeLoadBalancer and warmup Logic (#6615)

Signed-off-by: yizhang-nv <187001205+yizhang-nv@users.noreply.github.com>
Signed-off-by: Yi Zhang <187001205+yizhang-nv@users.noreply.github.com>
---
 tensorrt_llm/_torch/compilation/backend.py    |  10 +-
 .../_torch/compilation/piecewise_optimizer.py |  23 +-
 tensorrt_llm/_torch/compilation/utils.py      |  15 +-
 .../_torch/custom_ops/torch_custom_ops.py     |   9 +
 .../modules/fused_moe/moe_load_balancer.py    |  28 +-
 .../_torch/modules/multi_stream_utils.py      |  33 ++-
 tensorrt_llm/_torch/pyexecutor/_util.py       |   2 +-
 tensorrt_llm/_torch/pyexecutor/config.py      |   1 +
 .../_torch/pyexecutor/cuda_graph_runner.py    |  34 +--
 .../_torch/pyexecutor/model_engine.py         | 250 ++++++++++--------
 tensorrt_llm/_torch/pyexecutor/py_executor.py |  16 +-
 tensorrt_llm/_torch/utils.py                  |  10 +
 tensorrt_llm/llmapi/llm_args.py               |  19 ++
 .../_torch/modules/test_moe_load_balancer.py  |   6 +-
 14 files changed, 271 insertions(+), 185 deletions(-)

diff --git a/tensorrt_llm/_torch/compilation/backend.py b/tensorrt_llm/_torch/compilation/backend.py
index f6e7ae64905..02e2ae8fe50 100644
--- a/tensorrt_llm/_torch/compilation/backend.py
+++ b/tensorrt_llm/_torch/compilation/backend.py
@@ -37,7 +37,7 @@ def __init__(
         enable_inductor=True,
         enable_userbuffers=False,
         enable_piecewise_cuda_graph: bool = False,
-        cuda_graph_batch_sizes: Optional[List[int]] = None,
+        capture_num_tokens: Optional[List[int]] = None,
         max_num_streams: int = 1,
     ) -> None:
         super().__init__()
@@ -48,14 +48,12 @@ def __init__(
         self.custom_passes = Backend.get_custom_pass(enable_userbuffers)
         self.rank = tensorrt_llm.mpi_rank()
         self.enable_inductor = enable_inductor
-        self.cuda_graph_batch_sizes = (cuda_graph_batch_sizes
-                                       if cuda_graph_batch_sizes is not None
-                                       else [])
+        self.capture_num_tokens = capture_num_tokens or []
         self.piecewise_cuda_graph = enable_piecewise_cuda_graph
         self.no_optimization = False
         # We only need to create aux streams.
         self.aux_streams = Backend.Streams(
-            [torch.cuda.Stream() for i in range(max_num_streams - 1)])
+            [torch.cuda.Stream() for _ in range(max_num_streams - 1)])
         self.events = Backend.Events()
         inductor_config.enable_auto_functionalized_v2 = False
 
@@ -125,7 +123,7 @@ def optimize(
                 example_inputs,
                 self.enable_inductor,
                 self.input_num_tokens,
-                self.cuda_graph_batch_sizes,
+                self.capture_num_tokens,
                 self._graph_pool_handle,
                 len(self.aux_streams) + 1,
             )
diff --git a/tensorrt_llm/_torch/compilation/piecewise_optimizer.py b/tensorrt_llm/_torch/compilation/piecewise_optimizer.py
index f7624e6b161..c83644eed24 100644
--- a/tensorrt_llm/_torch/compilation/piecewise_optimizer.py
+++ b/tensorrt_llm/_torch/compilation/piecewise_optimizer.py
@@ -14,8 +14,7 @@
 from ..utils import (get_model_extra_attrs, get_piecewise_cuda_graph_flag,
                      make_weak_ref)
 from .multi_stream.auto_multi_stream import multi_stream_schedule
-from .utils import (get_enable_piecewise_cuda_graph_capture_flag,
-                    is_call_function)
+from .utils import get_capture_piecewise_cuda_graph_flag, is_call_function
 
 
 class PiecewiseInterpreter(Interpreter):
@@ -25,7 +24,7 @@ def __init__(
         module: GraphModule,
         enable_inductor: bool,
         compile_time_num_tokens: Union[int | torch.SymInt],
-        cuda_graph_batch_sizes: list[int],
+        capture_num_tokens: list[int],
         exclude_modules_id: list[int],
         graph_pool_handle: tuple[int, int],
         garbage_collect_values: bool = True,
@@ -37,7 +36,7 @@ def __init__(
         self.fake_mode = detect_fake_mode()
 
         self.compile_time_num_tokens = compile_time_num_tokens
-        self.cuda_graph_batch_sizes = cuda_graph_batch_sizes
+        self.capture_num_tokens = capture_num_tokens
         self.exclude_modules = [f"submod_{i}" for i in exclude_modules_id]
         self.graph_pool_handle = graph_pool_handle
         self.enable_inductor = enable_inductor
@@ -86,7 +85,7 @@ def call_module(self, target, args, kwargs):
                 target,
                 self.compile_time_num_tokens,
                 runtime_num_tokens_idx,
-                self.cuda_graph_batch_sizes,
+                self.capture_num_tokens,
                 self.graph_pool_handle,
                 compile_fx(submod, args) if self.enable_inductor else submod,
                 self.enable_inductor,
@@ -120,7 +119,7 @@ def __init__(
         name: str,
         compile_time_num_tokens: Union[int | torch.SymInt],
         runtime_num_tokens_idx: tuple[int],
-        cuda_graph_batch_sizes: List[int],
+        capture_num_tokens: List[int],
         graph_pool_handle,
         default_callable: Callable,
         enable_inductor: bool,
@@ -139,9 +138,9 @@ def __init__(
 
         self.entries: dict[int, Entry] = {}
 
-        for bs in cuda_graph_batch_sizes:
-            self.entries[bs] = Entry(
-                bs,
+        for num_tokens in capture_num_tokens:
+            self.entries[num_tokens] = Entry(
+                num_tokens,
                 enable_inductor=self.enable_inductor,
                 callable=default_callable,
             )
@@ -167,7 +166,7 @@ def __call__(self, *args):
 
         if entry.cuda_graph is None:
 
-            if not get_enable_piecewise_cuda_graph_capture_flag():
+            if not get_capture_piecewise_cuda_graph_flag():
                 return entry.callable(*args)
 
             if entry.warmup_count < 3:
@@ -228,7 +227,7 @@ def piecewise_optimizer(
     example_inputs: List[torch.Tensor],
     enable_inductor: bool,
     input_num_tokens: Union[int | torch.SymInt],
-    cuda_graph_batch_sizes: Sequence[int],
+    capture_num_tokens: Sequence[int],
     graph_pool_handle: tuple[int, int],
     max_num_streams: int = 1,
 ) -> tuple[GraphModule, int]:
@@ -269,7 +268,7 @@ def piecewise_optimizer(
         gm,
         enable_inductor,
         input_num_tokens,
-        cuda_graph_batch_sizes,
+        capture_num_tokens,
         exclude_modules_id,
         graph_pool_handle,
         max_num_streams=max_num_streams,
diff --git a/tensorrt_llm/_torch/compilation/utils.py b/tensorrt_llm/_torch/compilation/utils.py
index fef3de2a067..0166c455d23 100644
--- a/tensorrt_llm/_torch/compilation/utils.py
+++ b/tensorrt_llm/_torch/compilation/utils.py
@@ -1,3 +1,4 @@
+import contextlib
 from typing import Callable, List, Union
 
 import torch
@@ -33,16 +34,26 @@ def is_call_function(node: Node, target: Union[List[Callable], Callable]):
 _enable_piecewise_cuda_graph_capture = False
 
 
-def set_enable_piecewise_cuda_graph_capture_flag(enable: bool):
+def set_capture_piecewise_cuda_graph_flag(enable: bool):
     global _enable_piecewise_cuda_graph_capture
     _enable_piecewise_cuda_graph_capture = enable
 
 
-def get_enable_piecewise_cuda_graph_capture_flag() -> bool:
+def get_capture_piecewise_cuda_graph_flag() -> bool:
     global _enable_piecewise_cuda_graph_capture
     return _enable_piecewise_cuda_graph_capture
 
 
+@contextlib.contextmanager
+def capture_piecewise_cuda_graph(enable: bool):
+    prev_enable = get_capture_piecewise_cuda_graph_flag()
+    set_capture_piecewise_cuda_graph_flag(enable)
+    try:
+        yield
+    finally:
+        set_capture_piecewise_cuda_graph_flag(prev_enable)
+
+
 def inplace_info():
     inplace_map = {
         torch.ops.trtllm.flashinfer_fused_add_rmsnorm.default: {
diff --git a/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py b/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py
index a323bb4f553..bd946343b09 100644
--- a/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py
+++ b/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py
@@ -8,6 +8,7 @@
 
 from ..autotuner import (AutoTuner, ConstraintSpec, DynamicTensorSpec,
                          OptimizationProfile, TunableRunner, TuningConfig)
+from ..modules.multi_stream_utils import do_multi_stream
 from ..utils import (fp4_scale_infer_shape,
                      get_last_power_of_2_num_tokens_buckets,
                      last_positive_power_of_2)
@@ -925,6 +926,8 @@ def get_stream(stream_id: int):
 
 @torch.library.custom_op("trtllm::set_stream", mutates_args=())
 def set_stream(stream_id: int) -> None:
+    if not do_multi_stream():
+        return
     stream = get_stream(stream_id)
     assert stream is not None
     torch.cuda.set_stream(stream)
@@ -932,18 +935,24 @@ def set_stream(stream_id: int) -> None:
 
 @torch.library.custom_op("trtllm::record_event", mutates_args=())
 def record_event(event_idx: int) -> None:
+    if not do_multi_stream():
+        return
     event = get_event(event_idx)
     event.record()
 
 
 @torch.library.custom_op("trtllm::wait_event", mutates_args=())
 def wait_event(event_idx: int) -> None:
+    if not do_multi_stream():
+        return
     event = get_event(event_idx)
     event.wait()
 
 
 @torch.library.custom_op("trtllm::record_stream", mutates_args=())
 def record_stream(tensor: torch.Tensor, stream_id: int) -> None:
+    if not do_multi_stream():
+        return
     stream = get_stream(stream_id)
     assert stream is not None
     tensor.record_stream(stream)
diff --git a/tensorrt_llm/_torch/modules/fused_moe/moe_load_balancer.py b/tensorrt_llm/_torch/modules/fused_moe/moe_load_balancer.py
index 460625fb891..ff26c87687a 100644
--- a/tensorrt_llm/_torch/modules/fused_moe/moe_load_balancer.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/moe_load_balancer.py
@@ -9,12 +9,12 @@
 
 import tensorrt_llm
 import tensorrt_llm.bindings.internal.runtime as _tbr
-from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import is_graph_capturing
 from tensorrt_llm.logger import logger
 from tensorrt_llm.mapping import Mapping
 
 from ...distributed import AllReduce
 from ...utils import EventType
+from ..multi_stream_utils import do_multi_stream
 
 
 def _tensor_to_weight(t: torch.Tensor) -> _tbr.MoeWeight:
@@ -472,7 +472,7 @@ def start_wait_gpu_stage(self):
         assert self.func_called_count["start_wait_gpu_stage"] == 0
         self.func_called_count["start_wait_gpu_stage"] += 1
         if self.updates_enabled:
-            if is_graph_capturing():
+            if do_multi_stream():
                 self.event_dict[EventType.Main].record()
                 with torch.cuda.stream(self.aux_stream):
                     self.event_dict[EventType.Main].wait()
@@ -491,7 +491,7 @@ def done_wait_gpu_stage(self):
         assert self.func_called_count["done_wait_gpu_stage"] == 0
         self.func_called_count["done_wait_gpu_stage"] += 1
         if self.updates_enabled:
-            if is_graph_capturing():
+            if do_multi_stream():
                 self.event_dict[EventType.MoeBalancer].wait()
 
     def start_set_cpu_stage(self):
@@ -502,7 +502,7 @@ def start_set_cpu_stage(self):
         assert self.func_called_count["start_set_cpu_stage"] == 0
         self.func_called_count["start_set_cpu_stage"] += 1
         if self.updates_enabled:
-            if is_graph_capturing():
+            if do_multi_stream():
                 self.event_dict[EventType.Main].record()
                 with torch.cuda.stream(self.aux_stream):
                     self.event_dict[EventType.Main].wait()
@@ -522,7 +522,7 @@ def done_set_cpu_stage(self):
             self.func_called_count[name] = 0
         self.statistic_flag_tensor = None
         if self.updates_enabled:
-            if is_graph_capturing():
+            if do_multi_stream():
                 self.event_dict[EventType.MoeBalancer].wait()
 
     def update_local_statistic(self, local_raw_expert_ids: torch.Tensor,
@@ -544,7 +544,7 @@ def update_local_statistic(self, local_raw_expert_ids: torch.Tensor,
                     (self.expert_count, ),
                     dtype=torch.int32,
                     device=torch.device('cuda'))
-            if is_graph_capturing():
+            if do_multi_stream():
                 self.event_dict[EventType.Main].record()
                 with torch.cuda.stream(self.aux_stream):
                     self.event_dict[EventType.Main].wait()
@@ -569,7 +569,7 @@ def get_local_statistic_tensor(self) -> Optional[torch.Tensor]:
         assert self.func_called_count["update_local_statistic"] > 0
         self.func_called_count["get_local_statistic_tensor"] += 1
         if self.updates_enabled:
-            if is_graph_capturing():
+            if do_multi_stream():
                 with torch.cuda.stream(self.aux_stream):
                     self.event_dict[EventType.MoeBalancer].record()
                 self.event_dict[EventType.MoeBalancer].wait()
@@ -598,7 +598,7 @@ def _update_statistic():
                 self.single_layer_load_balancer_ptr)
 
         if self.updates_enabled:
-            if is_graph_capturing():
+            if do_multi_stream():
                 self.event_dict[EventType.Main].record()
                 with torch.cuda.stream(self.aux_stream):
                     self.event_dict[EventType.Main].wait()
@@ -636,7 +636,7 @@ def _update_statistic():
         if self.updates_enabled:
             self.update_local_statistic(local_raw_expert_ids, is_first_stage,
                                         is_last_stage)
-            if is_graph_capturing():
+            if do_multi_stream():
                 with torch.cuda.stream(self.aux_stream):
                     _update_statistic()
             else:
@@ -660,7 +660,7 @@ def update_statistic_with_global_ids(self,
         assert self.func_called_count["update_statistic_with_local_ids"] == 0
         self.func_called_count["update_statistic_with_global_ids"] += 1
         if self.updates_enabled:
-            if is_graph_capturing():
+            if do_multi_stream():
                 self.event_dict[EventType.Main].record()
                 with torch.cuda.stream(self.aux_stream):
                     self.event_dict[EventType.Main].wait()
@@ -851,8 +851,8 @@ def set_warm_up_iter_count(self, iter_count: int):
         """
         self.load_balancer_impl.set_warm_up_iter_count(iter_count)
 
-    def set_next_iter_info(self, enable_statistic: Optional[bool],
-                           enable_update_weights: Optional[bool]):
+    def set_iter_info(self, enable_statistic: Optional[bool],
+                      enable_update_weights: Optional[bool]):
         if enable_statistic is not None:
             self.enable_statistic = enable_statistic
         if enable_update_weights is not None:
@@ -998,8 +998,8 @@ def __enter__(self):
         """
         if self.moe_load_balancer is not None and not self.moe_load_balancer.is_static_routing(
         ):
-            self.moe_load_balancer.set_next_iter_info(self.enable_statistic,
-                                                      self.enable_updates)
+            self.moe_load_balancer.set_iter_info(self.enable_statistic,
+                                                 self.enable_updates)
             self.moe_load_balancer.start_iter()
         return self
 
diff --git a/tensorrt_llm/_torch/modules/multi_stream_utils.py b/tensorrt_llm/_torch/modules/multi_stream_utils.py
index e91b7eac245..c7b58c0896b 100644
--- a/tensorrt_llm/_torch/modules/multi_stream_utils.py
+++ b/tensorrt_llm/_torch/modules/multi_stream_utils.py
@@ -1,8 +1,35 @@
+import threading
+from contextlib import contextmanager
 from typing import Any, Callable, Optional
 
 import torch
 
-from ..pyexecutor.cuda_graph_runner import is_graph_capturing
+
+class do_multi_stream_local(threading.local):
+
+    def __init__(self):
+        self.do_multi_stream = False
+
+
+_local = do_multi_stream_local()
+
+
+def set_do_multi_stream(enable: bool):
+    _local.do_multi_stream = enable
+
+
+def do_multi_stream() -> bool:
+    return _local.do_multi_stream
+
+
+@contextmanager
+def with_multi_stream(enable: bool):
+    prev_do_multi_stream = _local.do_multi_stream
+    set_do_multi_stream(enable)
+    try:
+        yield
+    finally:
+        set_do_multi_stream(prev_do_multi_stream)
 
 
 def maybe_execute_in_parallel(
@@ -30,9 +57,9 @@ def maybe_execute_in_parallel(
         tuple[Any, Any]: the return values of fn0() and fn1()
     """
 
-    do_multi_stream = is_graph_capturing() and aux_stream is not None
+    multi_stream = do_multi_stream() and aux_stream is not None
 
-    if do_multi_stream:
+    if multi_stream:
         event0.record()
         result0 = fn0()
 
diff --git a/tensorrt_llm/_torch/pyexecutor/_util.py b/tensorrt_llm/_torch/pyexecutor/_util.py
index ca224f0b2cd..fed6a715374 100644
--- a/tensorrt_llm/_torch/pyexecutor/_util.py
+++ b/tensorrt_llm/_torch/pyexecutor/_util.py
@@ -242,8 +242,8 @@ def estimate_max_tokens(self, py_executor: PyExecutor) -> None:
             torch_used_bytes = torch.cuda.memory_stats(
             )["allocated_bytes.all.current"]
         finally:
-            py_executor.shutdown()
             py_executor.is_warmup = False
+            py_executor.shutdown()
             py_executor.enable_iter_perf_stats = origin_iter_stats
             py_executor.set_gather_responses(False)
 
diff --git a/tensorrt_llm/_torch/pyexecutor/config.py b/tensorrt_llm/_torch/pyexecutor/config.py
index 631f974db26..c656aac8c60 100644
--- a/tensorrt_llm/_torch/pyexecutor/config.py
+++ b/tensorrt_llm/_torch/pyexecutor/config.py
@@ -79,6 +79,7 @@ class PyTorchConfig:
     torch_compile_fullgraph: bool = True
     torch_compile_inductor_enabled: bool = False
     torch_compile_piecewise_cuda_graph: bool = False
+    torch_compile_piecewise_cuda_graph_num_tokens: Optional[List[int]] = None
     # When torch compile is enabled, userbuffers is enabled by default
     torch_compile_enable_userbuffers: bool = True
     torch_compile_max_num_streams: int = 1
diff --git a/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py b/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py
index 50306d66a66..df674a94968 100644
--- a/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py
+++ b/tensorrt_llm/_torch/pyexecutor/cuda_graph_runner.py
@@ -1,28 +1,11 @@
-import threading
 from typing import Any, Callable, Dict, Optional, Tuple
 
 import torch
 
 from ..attention_backend.interface import AttentionMetadata
+from ..modules.multi_stream_utils import with_multi_stream
 from ..speculative.interface import SpecMetadata
-from ..utils import make_weak_ref, set_piecewise_cuda_graph_flag
-
-
-class graph_capturing_local(threading.local):
-
-    def __init__(self):
-        self.is_graph_capturing = False
-
-
-_local = graph_capturing_local()
-
-
-def set_graph_capturing(enable: bool):
-    _local.is_graph_capturing = enable
-
-
-def is_graph_capturing() -> bool:
-    return _local.is_graph_capturing
+from ..utils import make_weak_ref, piecewise_cuda_graph
 
 
 class DecodingCUDAGraphRunner:
@@ -97,14 +80,11 @@ def capture(
         # internal states according to the docs:
         # https://pytorch.org/docs/stable/notes/cuda.html#cuda-graph-semantics
         # This also lets us initialize states in the attn_metadata.
-        set_graph_capturing(True)
-        set_piecewise_cuda_graph_flag(False)
-        for _ in range(2):
-            forward_fn(inputs)
-        with torch.cuda.graph(self._graph, pool=pool):
-            output = forward_fn(inputs)
-        set_graph_capturing(False)
-        set_piecewise_cuda_graph_flag(True)
+        with with_multi_stream(True), piecewise_cuda_graph(False):
+            for _ in range(2):
+                forward_fn(inputs)
+            with torch.cuda.graph(self._graph, pool=pool):
+                output = forward_fn(inputs)
         # Mark weak ref here. The output tensor should be freed properly.
         self._output = make_weak_ref(output)
         return self._graph.pool()
diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
index 22a53c4666f..a34f03edb55 100644
--- a/tensorrt_llm/_torch/pyexecutor/model_engine.py
+++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -40,7 +40,7 @@
 from ..attention_backend.vanilla import VanillaAttentionMetadata
 from ..autotuner import AutoTuner, autotune
 from ..compilation.backend import Backend
-from ..compilation.utils import set_enable_piecewise_cuda_graph_capture_flag
+from ..compilation.utils import capture_piecewise_cuda_graph
 from ..distributed import MPIDist
 from ..distributed.communicator import init_pp_comm
 from ..expert_statistic import ExpertStatistic
@@ -293,8 +293,6 @@ def __init__(
         self.enable_spec_decode = self.is_spec_decode
         self.is_draft_model = is_draft_model
 
-        self.in_warmup = False
-
         self.attn_runtime_features = attn_runtime_features or AttentionRuntimeFeatures(
         )
 
@@ -335,6 +333,15 @@ def __init__(
             pytorch_backend_config.torch_compile_piecewise_cuda_graph
             and not self.enable_attention_dp)
 
+        piecewise_cuda_graph_num_tokens = (
+            pytorch_backend_config.torch_compile_piecewise_cuda_graph_num_tokens
+            or pytorch_backend_config.cuda_graph_batch_sizes or [])
+
+        self._piecewise_cuda_graph_num_tokens = [
+            i for i in piecewise_cuda_graph_num_tokens
+            if i <= self.max_num_tokens
+        ]
+
         try:
             use_ub_for_nccl = (
                 pytorch_backend_config.allreduce_strategy == "NCCL_SYMMETRIC"
@@ -349,8 +356,7 @@ def __init__(
                     enable_userbuffers=use_ub,
                     enable_piecewise_cuda_graph=self.
                     _torch_compile_piecewise_cuda_graph,
-                    cuda_graph_batch_sizes=pytorch_backend_config.
-                    cuda_graph_batch_sizes,
+                    capture_num_tokens=self._piecewise_cuda_graph_num_tokens,
                     max_num_streams=pytorch_backend_config.
                     torch_compile_max_num_streams)
                 if isinstance(self.model, DecoderModelForCausalLM):
@@ -373,6 +379,8 @@ def __init__(
             traceback.print_exception(Exception, e, e.__traceback__)
             raise e
 
+        self.is_warmup = False
+
         self.attn_backend = get_attention_backend(attn_backend)
 
         if self.is_spec_decode:
@@ -478,17 +486,44 @@ def use_mrope(self):
         logger.debug(f"Detected use_mrope: {use_mrope}")
         return use_mrope
 
+    @property
+    def is_warmup(self):
+        return getattr(self, "_is_warmup", False)
+
+    @is_warmup.setter
+    def is_warmup(self, value: bool):
+        self._is_warmup = value
+
+        self.moe_load_balancer_iter_info = (not value, not value)
+
+    @property
+    def moe_load_balancer_iter_info(self):
+        moe_load_balancer: MoeLoadBalancer = getattr(self, 'moe_load_balancer',
+                                                     None)
+        if moe_load_balancer is not None:
+            return moe_load_balancer.enable_statistic, moe_load_balancer.enable_update_weights
+        return False, False
+
+    @moe_load_balancer_iter_info.setter
+    def moe_load_balancer_iter_info(self, value: Tuple[bool, bool]):
+        moe_load_balancer: MoeLoadBalancer = getattr(self, 'moe_load_balancer',
+                                                     None)
+        if moe_load_balancer is not None:
+            moe_load_balancer.set_iter_info(enable_statistic=value[0],
+                                            enable_update_weights=value[1])
+
     @property
     def use_beam_search(self):
         return self.max_beam_width > 1
 
     @contextmanager
     def set_warmup_flag(self):
-        self.in_warmup = True
+        prev_is_warmup = self.is_warmup
+        self.is_warmup = True
         try:
             yield
         finally:
-            self.in_warmup = False
+            self.is_warmup = prev_is_warmup
 
     @staticmethod
     def with_warmup_flag(method):
@@ -669,120 +704,110 @@ def release_batch(result: ScheduledRequests | None):
         if cp_type == CpType.STAR:
             return
 
-        with contextlib.ExitStack() as stack:
-            if self._torch_compile_enabled:
-
-                def disable_optimization(backend: Backend):
-                    # Disable torch.compile optimization and fallback to eager execution
-                    backend.bypass_optimization()
-                    # Disable piecewise CUDA graph capture since the capture run will produce wrong results
-                    set_enable_piecewise_cuda_graph_capture_flag(False)
-
-                stack.callback(disable_optimization,
-                               self._torch_compile_backend)
-
-                self._torch_compile_backend.enable_optimization()
-
-                # Disable cuda graph capture here so that we can properly capture it later
-                with self.no_cuda_graph():
-                    available_tokens = kv_cache_manager.get_num_available_tokens(
-                        self.runtime_draft_len)
-                    warmup_batch_size = [1, self.batch_size // 2]
-                    if self.batch_size < 2:
-                        warmup_batch_size = [1]
-                    for bs in warmup_batch_size:
-                        for num_tokens_per_request in [
-                                1,
-                                min(self.max_num_tokens // max(bs, 1),
-                                    min(available_tokens, self.max_seq_len - 1))
-                        ]:
-                            with release_batch(
-                                    get_torch_compile_warmup_request(
-                                        bs, num_tokens_per_request)) as batch:
-                                if batch is None:
-                                    # No KV cache space!
-                                    continue
-                                logger.info(
-                                    f"Run warmup for batch size={bs}, pure {'context' if num_tokens_per_request > 1 else 'generation'} phase"
-                                )
-                                self.forward(batch,
-                                             new_tensors_device=None,
-                                             resource_manager=resource_manager)
-                                torch.cuda.synchronize()
-
-            if self.pytorch_backend_config.enable_autotuner:
-                with self.no_cuda_graph(), autotune():
-                    result = get_autotune_warmup_request()
-                    with release_batch(result) as batch:
-                        if batch is None:
-                            # No KV cache space!
-                            pass
-                        else:
+        if self._torch_compile_enabled:
+
+            # Disable cuda graph capture here so that we can properly capture it later
+            with self.no_cuda_graph():
+                available_tokens = kv_cache_manager.get_num_available_tokens(
+                    self.runtime_draft_len)
+                warmup_batch_size = [1, self.batch_size // 2]
+                if self.batch_size < 2:
+                    warmup_batch_size = [1]
+                for bs in warmup_batch_size:
+                    for num_tokens_per_request in [
+                            1,
+                            min(self.max_num_tokens // max(bs, 1),
+                                min(available_tokens, self.max_seq_len - 1))
+                    ]:
+                        with release_batch(
+                                get_torch_compile_warmup_request(
+                                    bs, num_tokens_per_request)) as batch:
+                            if batch is None:
+                                # No KV cache space!
+                                continue
+                            logger.info(
+                                f"Run warmup for batch size={bs}, pure {'context' if num_tokens_per_request > 1 else 'generation'} phase"
+                            )
                             self.forward(batch,
                                          new_tensors_device=None,
                                          resource_manager=resource_manager)
                             torch.cuda.synchronize()
 
-                    logger.info(
-                        f"[Autotuner] Cache size after warmup is {len(AutoTuner.get().profiling_cache)}"
-                    )
-
-                AutoTuner.get().print_profiling_cache()
-
-            if not (self._run_cuda_graphs
-                    or self._torch_compile_piecewise_cuda_graph):
-                return
-
-            logger.info(
-                f"Creating CUDA graph instances for {len(self._cuda_graph_batch_sizes)} batch sizes."
-            )
-            # Reverse the order of the cuda graph batch sizes to make smaller batch size graph could reuse larger batch size graph memory
-            cuda_graph_batch_sizes = sorted(self._cuda_graph_batch_sizes,
-                                            reverse=True)
-            # Create CUDA graphs for different draft lengths
-            draft_lengths = [self.max_draft_len]
-            # For non-draft model, we also capture the CUDA graph instance for draft length 0,
-            # so that when we disable spec decode at runtime, we can still run the captured graph.
-            # Note that for one engine mode, we are not able to turn off spec decode at runtime.
-            if (not self.is_draft_model and self.max_draft_len > 0
-                    and not self.spec_config.spec_dec_mode.use_one_engine()
-                    # Assume that speculation is always on if the user didn't give us a max_concurrency
-                    # value. This will save on memory.
-                    and self.spec_config.max_concurrency is not None):
-                draft_lengths.append(0)
-
-            for bs in cuda_graph_batch_sizes:
-                if bs > self.batch_size:
-                    # skip batch size larger than self.batch_size
-                    continue
-
-                for draft_len in draft_lengths:
-                    with release_batch(
-                            get_cuda_graph_warmup_request(bs,
-                                                          draft_len)) as batch:
-                        if batch is None:
-                            # No KV cache space!
-                            return
-                        logger.info(
-                            f"Run generation only CUDA graph warmup for batch size={bs}, draft_len={draft_len}"
-                        )
-                        self.enable_spec_decode = draft_len > 0 or self.is_draft_model
+        if self.pytorch_backend_config.enable_autotuner:
+            with self.no_cuda_graph(), autotune():
+                result = get_autotune_warmup_request()
+                with release_batch(result) as batch:
+                    if batch is None:
+                        # No KV cache space!
+                        pass
+                    else:
                         self.forward(batch,
                                      new_tensors_device=None,
                                      resource_manager=resource_manager)
                         torch.cuda.synchronize()
 
-            if self._torch_compile_piecewise_cuda_graph and self._torch_compile_enabled:
-                for seq_lens in cuda_graph_batch_sizes:
-                    set_enable_piecewise_cuda_graph_capture_flag(True)
+                logger.info(
+                    f"[Autotuner] Cache size after warmup is {len(AutoTuner.get().profiling_cache)}"
+                )
+
+            AutoTuner.get().print_profiling_cache()
+
+        if not (self._run_cuda_graphs
+                or self._torch_compile_piecewise_cuda_graph):
+            return
+
+        logger.info(
+            f"Creating CUDA graph instances for {len(self._cuda_graph_batch_sizes)} batch sizes."
+        )
+        # Sort the CUDA graph batch sizes in descending order so that graphs for smaller batch sizes can reuse the memory of graphs captured for larger batch sizes
+        cuda_graph_batch_sizes = sorted(self._cuda_graph_batch_sizes,
+                                        reverse=True)
+        # Create CUDA graphs for different draft lengths
+        draft_lengths = [self.max_draft_len]
+        # For non-draft model, we also capture the CUDA graph instance for draft length 0,
+        # so that when we disable spec decode at runtime, we can still run the captured graph.
+        # Note that for one engine mode, we are not able to turn off spec decode at runtime.
+        if (not self.is_draft_model and self.max_draft_len > 0
+                and not self.spec_config.spec_dec_mode.use_one_engine()
+                # Assume that speculation is always on if the user didn't give us a max_concurrency
+                # value. This will save on memory.
+                and self.spec_config.max_concurrency is not None):
+            draft_lengths.append(0)
+
+        for bs in cuda_graph_batch_sizes:
+            if bs > self.batch_size:
+                # skip batch size larger than self.batch_size
+                continue
+
+            for draft_len in draft_lengths:
+                with release_batch(get_cuda_graph_warmup_request(
+                        bs, draft_len)) as batch:
+                    if batch is None:
+                        # No KV cache space!
+                        return
+                    logger.info(
+                        f"Run generation only CUDA graph warmup for batch size={bs}, draft_len={draft_len}"
+                    )
+                    self.enable_spec_decode = draft_len > 0 or self.is_draft_model
+                    self.forward(batch,
+                                 new_tensors_device=None,
+                                 resource_manager=resource_manager)
+                    torch.cuda.synchronize()
+
+        if self._torch_compile_piecewise_cuda_graph and self._torch_compile_enabled:
+            piecewise_cuda_graph_num_tokens = sorted(
+                self._piecewise_cuda_graph_num_tokens, reverse=True)
+
+            with capture_piecewise_cuda_graph(True):
+                for num_tokens in piecewise_cuda_graph_num_tokens:
                     with self.no_cuda_graph():
                         with release_batch(
                                 get_torch_compile_warmup_request(
-                                    1, seq_lens)) as batch:
+                                    1, num_tokens)) as batch:
                             logger.info(
-                                f"Run piecewise CUDA graph warmup for seq_lens={seq_lens}"
+                                f"Run piecewise CUDA graph warmup for num tokens={num_tokens}"
                             )
-                            # self.model.mtp_worker.stored_input_ids = []
+
                             for _ in range(3):
                                 self.forward(batch,
                                              new_tensors_device=None,
@@ -793,7 +818,6 @@ def disable_optimization(backend: Backend):
                             torch.cuda.synchronize()
                             gc.collect()
                             torch.cuda.empty_cache()
-                    set_enable_piecewise_cuda_graph_capture_flag(False)
 
         # Set the value back to the original value
         self.enable_spec_decode = self.is_spec_decode
@@ -1541,7 +1565,7 @@ def previous_seq_slots_device():
         # Cache indirection is only used for beam search on generation requests
         if self.use_beam_search and num_generation_requests > 0:
             # CUDA Graph needs to set beam width during warmup (where the graph is captured), to ensure that cache indirection buffer is correctly picked up by the CUDA graph
-            is_cuda_graph_during_warmup = self.in_warmup and attn_metadata.is_cuda_graph
+            is_cuda_graph_during_warmup = self.is_warmup and attn_metadata.is_cuda_graph
             if cache_indirection_buffer is not None:
                 #Copy cache indirection to local buffer with offsets changing:  seq_slots[i] -> i
                 self.cache_indirection_attention[:num_generation_requests].copy_(
@@ -2151,14 +2175,8 @@ def forward(
             spec_resource_manager = None
             spec_metadata = None
 
-        moe_load_balancer = None
-        if hasattr(self, 'moe_load_balancer'):
-            moe_load_balancer = getattr(self, 'moe_load_balancer')
-            if not self.in_warmup:
-                moe_enable_statistic = True
-                moe_enable_update = True
-                moe_load_balancer.set_next_iter_info(moe_enable_statistic,
-                                                     moe_enable_update)
+        moe_load_balancer: MoeLoadBalancer = getattr(self, 'moe_load_balancer',
+                                                     None)
 
         if kv_cache_manager is None:
             inputs, gather_ids = self._prepare_tp_inputs_no_cache(
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py
index 4573528e314..8dbbe39abb6 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -161,7 +161,6 @@ def __init__(self,
         self.profile_start_iters, self.profile_stop_iters = _load_iteration_indexes(
             PROFILE_START_STOP_ENV_VAR_NAME)
         self.gc_nvtx_watcher_handle = _gc_nvtx_watcher()
-        self.is_warmup = False  # During warmup, we don't enable the profiler
 
         # related modules
         self.resource_manager = resource_manager
@@ -220,9 +219,12 @@ def __init__(self,
 
         self.inflight_req_ids = ReqIdsSet()
 
+        # During warmup, we don't enable the profiler
+        self.is_warmup = True
         self.model_engine.warmup(self.resource_manager)
         if self.draft_model_engine is not None:
             self.draft_model_engine.warmup(self.resource_manager)
+        self.is_warmup = False
 
         self.is_shutdown = False
         self.max_batch_size = max_batch_size
@@ -280,6 +282,18 @@ def _event_loop_wrapper(self):
         finally:
             self._executor_loop_cleanup()
 
+    @property
+    def is_warmup(self) -> bool:
+        return getattr(self, "_is_warmup", False)
+
+    @is_warmup.setter
+    def is_warmup(self, value: bool):
+        self._is_warmup = value
+        # Set warmup flag in model engine to trigger torch compile and avoid moe load balancer statistics update
+        self.model_engine.is_warmup = value
+        if self.draft_model_engine is not None:
+            self.draft_model_engine.is_warmup = value
+
     def start_worker(self):
         with self.worker_lock:
             if self.worker_started == False:
diff --git a/tensorrt_llm/_torch/utils.py b/tensorrt_llm/_torch/utils.py
index 932aa7e7e67..4068ad44a60 100644
--- a/tensorrt_llm/_torch/utils.py
+++ b/tensorrt_llm/_torch/utils.py
@@ -265,3 +265,13 @@ def set_piecewise_cuda_graph_flag(enable: bool):
 def get_piecewise_cuda_graph_flag() -> bool:
     global _enable_piecewise_cuda_graph
     return _enable_piecewise_cuda_graph
+
+
+@contextlib.contextmanager
+def piecewise_cuda_graph(enable: bool):
+    prev_enable = get_piecewise_cuda_graph_flag()
+    set_piecewise_cuda_graph_flag(enable)
+    try:
+        yield
+    finally:
+        set_piecewise_cuda_graph_flag(prev_enable)
diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py
index abc41b00356..948c4b16883 100644
--- a/tensorrt_llm/llmapi/llm_args.py
+++ b/tensorrt_llm/llmapi/llm_args.py
@@ -1990,6 +1990,21 @@ class TorchCompileConfig(StrictBaseModel):
         default=False,
         description="Enable piecewise CUDA graph in torch.compile.")
 
+    capture_num_tokens: Optional[List[int]] = Field(
+        default=None,
+        description=
+        "List of num of tokens to capture the piecewise CUDA graph for. If not provided, the number of tokens will be the same as cuda_graph_config.batch_sizes."
+    )
+
+    @field_validator('capture_num_tokens')
+    @classmethod
+    def validate_capture_num_tokens(cls, v):
+        if v is None:
+            return v
+        if any(t <= 0 for t in v):
+            raise ValueError("capture_num_tokens must contain positive ints.")
+        return sorted(set(v), reverse=True)
+
     enable_userbuffers: bool = Field(
         default=True,
         description=
@@ -2368,6 +2383,10 @@ def get_pytorch_backend_config(self) -> "PyTorchConfig":
             enable_piecewise_cuda_graph
             if self.torch_compile_config is not None else TorchCompileConfig.
             model_fields['enable_piecewise_cuda_graph'].default,
+            torch_compile_piecewise_cuda_graph_num_tokens=self.
+            torch_compile_config.capture_num_tokens
+            if self.torch_compile_config is not None else
+            TorchCompileConfig.model_fields['capture_num_tokens'].default,
             torch_compile_enable_userbuffers=self.torch_compile_config.
             enable_userbuffers if self.torch_compile_config is not None else
             TorchCompileConfig.model_fields['enable_userbuffers'].default,
diff --git a/tests/unittest/_torch/modules/test_moe_load_balancer.py b/tests/unittest/_torch/modules/test_moe_load_balancer.py
index 66edbd6d17e..969fb04d030 100644
--- a/tests/unittest/_torch/modules/test_moe_load_balancer.py
+++ b/tests/unittest/_torch/modules/test_moe_load_balancer.py
@@ -269,7 +269,7 @@ def test_moe_load_balancer_lifecycle_methods(self, mock_load_balancer_impl):
         mock_load_balancer_impl.return_value.set_warm_up_iter_count.assert_called_once_with(
             10)
 
-        balancer.set_next_iter_info(True, True)
+        balancer.set_iter_info(True, True)
 
         with MoeLoadBalancerIterContext(balancer):
             mock_load_balancer_impl.return_value.start_iter.assert_called_once_with(
@@ -308,7 +308,7 @@ def test_real_statistic_kernel(self):
         balancer.finalize_model()
 
         # enable statistic, disable weight update
-        balancer.set_next_iter_info(True, False)
+        balancer.set_iter_info(True, False)
 
         # Create sample token data - each token selects 2 experts
         # 4 tokens, each selecting 2 experts
@@ -373,7 +373,7 @@ def test_real_routing_kernel(self):
         balancer.finalize_model()
 
         # enable statistic, disable weight update
-        balancer.set_next_iter_info(True, False)
+        balancer.set_iter_info(True, False)
 
         # Create sample token data - tokens selecting different experts
         token_selected_experts = torch.tensor(

From 06911c0173f98d372d3ec282a6f773b091f08ed2 Mon Sep 17 00:00:00 2001
From: Venky <23023424+venkywonka@users.noreply.github.com>
Date: Mon, 18 Aug 2025 19:11:36 -0700
Subject: [PATCH 14/20] [None] [infra] stricter coderabbit pr title generation
 instructions (#6918)

Signed-off-by: Venky Ganesh <23023424+venkywonka@users.noreply.github.com>
---
 .coderabbit.yaml                 | 2 +-
 .github/pull_request_template.md | 8 ++++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/.coderabbit.yaml b/.coderabbit.yaml
index a7d05d70d51..1644fad2728 100644
--- a/.coderabbit.yaml
+++ b/.coderabbit.yaml
@@ -20,7 +20,7 @@ language: "en-US"
 reviews:
   profile: chill
   auto_title_placeholder: '@coderabbitai title'
-  auto_title_instructions: 'Should follow the format: "[fix/feat/doc/infra/...] \<summary of this PR\>". Keep it concise.'
+  auto_title_instructions: 'Format: "[<category>] <title>". Category must be one of: fix, feat, doc, infra, style, refactor, perf, test, chore, revert. Enclose the category in square brackets. Title should be concise (<= 60 chars). Example: "[feat] Add logit_bias support".'
   commit_status: false
   collapse_walkthrough: true
   assess_linked_issues: true
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index 45f9ebf7f12..4665a9682a3 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -18,6 +18,14 @@ Examples:
   - [https://nvbugs/1234567][fix] Fix some bugs
   - [#1234][doc] Update documentation
   - [None][chore] Minor clean-up
+
+Alternative (faster) way using CodeRabbit AI:
+
+**[JIRA ticket/NVBugs ID/GitHub issue/None] @coderabbitai title**
+
+NOTE: "@coderabbitai title" will be replaced by the text generated by CodeRabbit AI, which includes both the "[type]" and the title.
+For more info, see /.coderabbit.yaml.
+
 -->
 
 ## Description

From 2bb90ba002a6c794b7b5b5ed3c0cc4b09a61dfb4 Mon Sep 17 00:00:00 2001
From: Zhenhuan Chen <chenzhh3671@gmail.com>
Date: Tue, 19 Aug 2025 10:18:04 +0800
Subject: [PATCH 15/20] [TRTLLM-6960][fix] enable scaled_mm tests (#6936)

Signed-off-by: Zhenhuan Chen <chenzhh3671@gmail.com>
---
 tests/unittest/_torch/thop/test_scaled_mm.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/tests/unittest/_torch/thop/test_scaled_mm.py b/tests/unittest/_torch/thop/test_scaled_mm.py
index 31149de7310..40dfa4e0cb4 100644
--- a/tests/unittest/_torch/thop/test_scaled_mm.py
+++ b/tests/unittest/_torch/thop/test_scaled_mm.py
@@ -38,11 +38,6 @@
     [torch.float16, torch.float32, torch.bfloat16],
 )
 def test_fp8_scaled_mm(output_dtype, m, k_n):
-    if getSMVersion() == 90:
-        pytest.skip(
-            "Skip test for sm90 because it's too flaky. https://nvbugspro.nvidia.com/bug/5441734"
-        )
-
     k, n = k_n
     torch.random.manual_seed(0)
     shape_x = (m, k)
@@ -76,7 +71,7 @@ def test_fp8_scaled_mm(output_dtype, m, k_n):
     os.environ["CUBLASLT_WORKSPACE_SIZE"] = old_env
     np.testing.assert_allclose(ref.float().cpu(),
                                output.float().cpu(),
-                               atol=1,
+                               atol=0.01,
                                rtol=0.01)
 
     if getSMVersion() == 90:

From c49f0f1bc81f989f129f40e7fcde2938526c6bcb Mon Sep 17 00:00:00 2001
From: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
Date: Tue, 19 Aug 2025 11:52:24 +0800
Subject: [PATCH 16/20] add llmapi trt flow test case with cuda graph and
 generate logits

Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
---
 tests/integration/defs/accuracy/test_llm_api.py    | 14 +++++++++++++-
 .../test_lists/qa/llm_function_full.txt            |  1 +
 tests/integration/test_lists/waives.txt            |  2 ++
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api.py b/tests/integration/defs/accuracy/test_llm_api.py
index f34bcdb5be4..8f4851472a0 100644
--- a/tests/integration/defs/accuracy/test_llm_api.py
+++ b/tests/integration/defs/accuracy/test_llm_api.py
@@ -15,7 +15,8 @@
 import pytest
 
 from tensorrt_llm._tensorrt_engine import LLM
-from tensorrt_llm.llmapi import EagleDecodingConfig, KvCacheConfig
+from tensorrt_llm.llmapi import (EagleDecodingConfig,
+                                 ExtendedRuntimePerfKnobConfig, KvCacheConfig)
 from tensorrt_llm.models.modeling_utils import QuantConfig
 from tensorrt_llm.quantization import QuantAlgo
 
@@ -76,6 +77,17 @@ def test_guided_decoding_4gpus(self, backend: str):
             task = JsonModeEval(self.MODEL_NAME)
             task.evaluate(llm)
 
+    def test_gather_generation_logits_cuda_graph(self):
+        extended_runtime_perf_knob_config = ExtendedRuntimePerfKnobConfig(
+            cuda_graph_mode=True, cuda_graph_cache_size=1)
+        llm = LLM(
+            self.MODEL_PATH,
+            gather_generation_logits=True,
+            extended_runtime_perf_knob_config=extended_runtime_perf_knob_config)
+        with llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+
 
 class TestLlama3_2_1B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "meta-llama/Llama-3.2-1B"
diff --git a/tests/integration/test_lists/qa/llm_function_full.txt b/tests/integration/test_lists/qa/llm_function_full.txt
index 1859762fc17..29b915c214b 100644
--- a/tests/integration/test_lists/qa/llm_function_full.txt
+++ b/tests/integration/test_lists/qa/llm_function_full.txt
@@ -420,6 +420,7 @@ accuracy/test_cli_flow.py::TestQwen2_57B_A14B::test_tp4
 accuracy/test_cli_flow.py::TestQwen2_57B_A14B::test_tp2pp2
 accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]
 accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar]
+accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph
 accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_auto_dtype
 accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_weight_only
 accuracy/test_llm_api.py::TestLlama3_1_8B::test_fp8_rowwise
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 49c9a6d0107..7c1a5bb8693 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -315,3 +315,5 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency]
 test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] SKIP (https://nvbugs/5459817)
 llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_mtp SKIP (https://nvbugs/5461796)
 disaggregated/test_disaggregated.py::test_disaggregated_genbs1[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5459811)
+accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] SKIP (https://nvbugs/5437384)
+accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph SKIP (https://nvbugs/5365525)

From e395958562770e26e6d2f1c9939e59206047136b Mon Sep 17 00:00:00 2001
From: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
Date: Mon, 21 Jul 2025 16:20:43 +0800
Subject: [PATCH 17/20] add llmapi logprobs test with trt flow

Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
---
 .../defs/accuracy/references/cnn_dailymail.yaml      |  2 ++
 tests/integration/defs/accuracy/test_llm_api.py      | 12 +++++++++++-
 .../integration/test_lists/qa/llm_function_full.txt  |  1 +
 3 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
index 67781cd8d15..fa0d1b32f54 100644
--- a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
+++ b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
@@ -124,6 +124,8 @@ meta-llama/Llama-3.1-8B-Instruct:
   - accuracy: 33.640
   - spec_dec_algo: Eagle
     accuracy: 33.640
+  - extra_acc_spec: logprobs=2
+    accuracy: 30.522
   - quant_algo: FP8
     accuracy: 33.841
   - quant_algo: FP8
diff --git a/tests/integration/defs/accuracy/test_llm_api.py b/tests/integration/defs/accuracy/test_llm_api.py
index 8f4851472a0..24974238b99 100644
--- a/tests/integration/defs/accuracy/test_llm_api.py
+++ b/tests/integration/defs/accuracy/test_llm_api.py
@@ -16,7 +16,8 @@
 
 from tensorrt_llm._tensorrt_engine import LLM
 from tensorrt_llm.llmapi import (EagleDecodingConfig,
-                                 ExtendedRuntimePerfKnobConfig, KvCacheConfig)
+                                 ExtendedRuntimePerfKnobConfig, KvCacheConfig,
+                                 SamplingParams)
 from tensorrt_llm.models.modeling_utils import QuantConfig
 from tensorrt_llm.quantization import QuantAlgo
 
@@ -88,6 +89,15 @@ def test_gather_generation_logits_cuda_graph(self):
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
 
+    def test_logprobs(self):
+        sampling_config = SamplingParams(logprobs=2)
+        llm = LLM(self.MODEL_PATH, gather_generation_logits=True)
+        with llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm,
+                          sampling_params=sampling_config,
+                          extra_acc_spec="logprobs=2")
+
 
 class TestLlama3_2_1B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "meta-llama/Llama-3.2-1B"
diff --git a/tests/integration/test_lists/qa/llm_function_full.txt b/tests/integration/test_lists/qa/llm_function_full.txt
index 29b915c214b..2aab7e288aa 100644
--- a/tests/integration/test_lists/qa/llm_function_full.txt
+++ b/tests/integration/test_lists/qa/llm_function_full.txt
@@ -421,6 +421,7 @@ accuracy/test_cli_flow.py::TestQwen2_57B_A14B::test_tp2pp2
 accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]
 accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar]
 accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph
+accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_logprobs
 accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_auto_dtype
 accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_weight_only
 accuracy/test_llm_api.py::TestLlama3_1_8B::test_fp8_rowwise

From 117bd93214287cb600d31de4a6ce5e4188d23f84 Mon Sep 17 00:00:00 2001
From: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
Date: Tue, 22 Jul 2025 14:21:00 +0800
Subject: [PATCH 18/20] add llmapi trt flow phi-4-mini-instruct acc test

Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
---
 .../accuracy/references/cnn_dailymail.yaml    |  4 ++++
 .../defs/accuracy/references/mmlu.yaml        |  2 ++
 .../integration/defs/accuracy/test_llm_api.py | 20 +++++++++++++++++++
 .../test_lists/qa/llm_function_full.txt       |  2 ++
 4 files changed, 28 insertions(+)

diff --git a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
index fa0d1b32f54..2c2094b6d4a 100644
--- a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
+++ b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
@@ -232,6 +232,10 @@ mistralai/Mixtral-8x22B-v0.1:
     accuracy: 25.519
   - quant_algo: W8A16
     accuracy: 27.427
+microsoft/Phi-4-mini-instruct:
+  - accuracy: 32.958
+  - quant_algo: FP8
+    accuracy: 32.823
 google/gemma-2b:
   - accuracy: 23.194
   - quant_algo: W8A16
diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index 7f2bb55e6f7..b465940f2bb 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -229,6 +229,8 @@ nvidia/Nemotron-H-56B-Base-8K:
     accuracy: 83.82
 microsoft/Phi-4-mini-instruct:
   - accuracy: 68.98
+  - quant_algo: FP8
+    accuracy: 68.30
 # Created a dummy accuracy to track tp_size=2 for phi4-mini model.
 # TODO: update once https://nvbugs/5393849 is fixed.
 microsoft/Phi-4-mini-instruct-tp2:
diff --git a/tests/integration/defs/accuracy/test_llm_api.py b/tests/integration/defs/accuracy/test_llm_api.py
index 24974238b99..fb535b47396 100644
--- a/tests/integration/defs/accuracy/test_llm_api.py
+++ b/tests/integration/defs/accuracy/test_llm_api.py
@@ -266,6 +266,26 @@ def test_awq_tp2(self):
             task.evaluate(llm)
 
 
+class TestPhi4MiniInstruct(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "microsoft/Phi-4-mini-instruct"
+    MODEL_PATH = f"{llm_models_root()}/Phi-4-mini-instruct"
+
+    def test_auto_dtype(self):
+        with LLM(self.MODEL_PATH) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    def test_fp8(self):
+        quant_config = QuantConfig(QuantAlgo.FP8)
+        with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
 class TestQwen2_7BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
     MODEL_PATH = f"{llm_models_root()}/Qwen2-7B-Instruct"
diff --git a/tests/integration/test_lists/qa/llm_function_full.txt b/tests/integration/test_lists/qa/llm_function_full.txt
index 2aab7e288aa..cf3eec8eeb3 100644
--- a/tests/integration/test_lists/qa/llm_function_full.txt
+++ b/tests/integration/test_lists/qa/llm_function_full.txt
@@ -422,6 +422,8 @@ accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar
 accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar]
 accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph
 accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_logprobs
+accuracy/test_llm_api.py::TestPhi4MiniInstruct::test_auto_dtype
+accuracy/test_llm_api.py::TestPhi4MiniInstruct::test_fp8
 accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_auto_dtype
 accuracy/test_llm_api.py::TestQwen2_5_1_5BInstruct::test_weight_only
 accuracy/test_llm_api.py::TestLlama3_1_8B::test_fp8_rowwise

From ead824e59c08a5a9e87be9b3a8190d8b0a04bbb3 Mon Sep 17 00:00:00 2001
From: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
Date: Tue, 22 Jul 2025 16:00:29 +0800
Subject: [PATCH 19/20] add nemo 12b base test cases

Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
---
 .../accuracy/references/cnn_dailymail.yaml    |  9 ++--
 .../defs/accuracy/references/mmlu.yaml        |  4 ++
 .../integration/defs/accuracy/test_llm_api.py | 43 ++++++++++++++++---
 .../defs/accuracy/test_llm_api_pytorch.py     | 30 +++++++++++++
 .../test_lists/qa/llm_function_full.txt       |  6 ++-
 5 files changed, 81 insertions(+), 11 deletions(-)

diff --git a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
index 2c2094b6d4a..93e025b2d8b 100644
--- a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
+++ b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
@@ -45,6 +45,8 @@ microsoft/Phi-3.5-mini-instruct:
   - accuracy: 31.354
 microsoft/Phi-4-mini-instruct:
   - accuracy: 32.921
+  - quant_algo: FP8
+    accuracy: 32.823
 state-spaces/mamba-130m-hf:
   - accuracy: 19.470
 lmsys/vicuna-7b-v1.3:
@@ -201,7 +203,8 @@ mistralai/Mistral-7B-Instruct-v0.3:
     accuracy: 31.201
 mistralai/Mistral-Small-3.1-24B-Instruct-2503:
   - accuracy: 29.20
-mistralai/Mistral-Nemo-Base-2407:
+mistralai/Mistral-Nemo-12b-Base:
+  - accuracy: 28.906
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
     accuracy: 24.0
@@ -232,10 +235,6 @@ mistralai/Mixtral-8x22B-v0.1:
     accuracy: 25.519
   - quant_algo: W8A16
     accuracy: 27.427
-microsoft/Phi-4-mini-instruct:
-  - accuracy: 32.958
-  - quant_algo: FP8
-    accuracy: 32.823
 google/gemma-2b:
   - accuracy: 23.194
   - quant_algo: W8A16
diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index b465940f2bb..b6c38930a15 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -266,3 +266,7 @@ GPT-OSS/MXFP4:
     accuracy: 75.50
   - quant_algo: W4A8_MXFP4_FP8
     accuracy: 75.50
+mistralai/Mistral-Nemo-12b-Base:
+  - accuracy: 69.66
+  - quant_algo: FP8
+    accuracy: 69.66
diff --git a/tests/integration/defs/accuracy/test_llm_api.py b/tests/integration/defs/accuracy/test_llm_api.py
index fb535b47396..602ed91dbf6 100644
--- a/tests/integration/defs/accuracy/test_llm_api.py
+++ b/tests/integration/defs/accuracy/test_llm_api.py
@@ -79,6 +79,7 @@ def test_guided_decoding_4gpus(self, backend: str):
             task.evaluate(llm)
 
     def test_gather_generation_logits_cuda_graph(self):
+        """RCCA: https://nvbugs/5365525"""
         extended_runtime_perf_knob_config = ExtendedRuntimePerfKnobConfig(
             cuda_graph_mode=True, cuda_graph_cache_size=1)
         llm = LLM(
@@ -199,18 +200,49 @@ def test_quant_tp4(self, quant):
             task.evaluate(llm)
 
 
-class TestMistral_Nemo_12B_Base(LlmapiAccuracyTestHarness):
-    MODEL_NAME = "mistralai/Mistral-Nemo-Base-2407"
+class TestMistralNemo12B(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "mistralai/Mistral-Nemo-12b-Base"
     MODEL_PATH = f"{llm_models_root()}/Mistral-Nemo-Base-2407"
 
+    @pytest.mark.skip_less_device_memory(80000)
+    def test_auto_dtype(self):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+
+        with LLM(self.MODEL_PATH,
+                 kv_cache_config=kv_cache_config,
+                 max_batch_size=8) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    def test_auto_dtype_tp2(self):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+
+        with LLM(self.MODEL_PATH,
+                 kv_cache_config=kv_cache_config,
+                 tensor_parallel_size=2,
+                 max_batch_size=8) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @pytest.mark.skip_less_device_memory(80000)
     @skip_pre_ada
     def test_fp8(self):
-        quant_config = QuantConfig(quant_algo=QuantAlgo.FP8,
+        quant_config = QuantConfig(QuantAlgo.FP8,
                                    kv_cache_quant_algo=QuantAlgo.FP8)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
 
-        with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
+        with LLM(self.MODEL_PATH,
+                 quant_config=quant_config,
+                 kv_cache_config=kv_cache_config,
+                 max_batch_size=8) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
 
 
 class TestMistral_NeMo_Minitron_8B_Instruct(LlmapiAccuracyTestHarness):
@@ -277,6 +309,7 @@ def test_auto_dtype(self):
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
 
+    @skip_pre_ada
     def test_fp8(self):
         quant_config = QuantConfig(QuantAlgo.FP8)
         with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
@@ -420,7 +453,7 @@ def test_fp8(self):
     @skip_pre_ada
     def test_fp8_kvcache(self):
         "RCCA: https://nvbugs/5065080"
-        quant_config = QuantConfig(QuantAlgo.FP8,
+        quant_config = QuantConfig(quant_algo=QuantAlgo.FP8,
                                    kv_cache_quant_algo=QuantAlgo.FP8)
         with LLM(self.MODEL_PATH, quant_config=quant_config) as llm:
             task = CnnDailymail(self.MODEL_NAME)
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 89483fd2620..c6db6076f7c 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -1766,6 +1766,36 @@ def test_auto_dtype_tp8(self):
             task.evaluate(llm)
 
 
+class TestMistralNemo12B(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "mistralai/Mistral-Nemo-12b-Base"
+    MODEL_PATH = f"{llm_models_root()}/Mistral-Nemo-Base-2407"
+
+    @pytest.mark.skip_less_device_memory(80000)
+    def test_auto_dtype(self):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+
+        with LLM(self.MODEL_PATH,
+                 kv_cache_config=kv_cache_config,
+                 max_batch_size=8) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @pytest.mark.skip_less_device(2)
+    def test_auto_dtype_tp2(self):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+
+        with LLM(self.MODEL_PATH,
+                 kv_cache_config=kv_cache_config,
+                 tensor_parallel_size=2,
+                 max_batch_size=8) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
 @pytest.mark.timeout(5400)
 @pytest.mark.skip_less_device_memory(80000)
 class TestLlama3_3NemotronSuper49Bv1(LlmapiAccuracyTestHarness):
diff --git a/tests/integration/test_lists/qa/llm_function_full.txt b/tests/integration/test_lists/qa/llm_function_full.txt
index cf3eec8eeb3..8b19cabd605 100644
--- a/tests/integration/test_lists/qa/llm_function_full.txt
+++ b/tests/integration/test_lists/qa/llm_function_full.txt
@@ -435,7 +435,9 @@ accuracy/test_llm_api.py::TestQwen2_5_7BInstruct::test_fp8_kvcache
 accuracy/test_llm_api.py::TestMistral7B_0_3::test_quant_tp4[int4]
 accuracy/test_llm_api.py::TestMistral7B_0_3::test_quant_tp4[int4_awq]
 accuracy/test_llm_api.py::TestMistral7B_0_3::test_quant_tp4[int8_awq]
-accuracy/test_llm_api.py::TestMistral_Nemo_12B_Base::test_fp8
+accuracy/test_llm_api.py::TestMistralNemo12B::test_auto_dtype
+accuracy/test_llm_api.py::TestMistralNemo12B::test_auto_dtype_tp2
+accuracy/test_llm_api.py::TestMistralNemo12B::test_fp8
 accuracy/test_llm_api.py::TestMistral_NeMo_Minitron_8B_Instruct::test_fp8
 accuracy/test_llm_api.py::TestMixtral8x7B::test_tp2
 accuracy/test_llm_api.py::TestMixtral8x7B::test_smooth_quant_tp2pp2
@@ -579,6 +581,8 @@ accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype
 accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
+accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype
+accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype_tp2
 
 test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-]
 test_e2e.py::test_llama_e2e[use_py_session-remove_input_padding-]

From 26a83d6fdb78185305413d5baa2b0ab05c5f73f9 Mon Sep 17 00:00:00 2001
From: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
Date: Mon, 18 Aug 2025 13:32:29 +0800
Subject: [PATCH 20/20] fix invalid test name

Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
---
 tests/integration/test_lists/test-db/l0_h100.yml | 2 +-
 tests/integration/test_lists/waives.txt          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml
index 64f6498d095..a52b515e644 100644
--- a/tests/integration/test_lists/test-db/l0_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_h100.yml
@@ -253,7 +253,7 @@ l0_h100:
   - examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle1]
   - examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle2] # 5 mins
   - accuracy/test_llm_api.py::TestMistral_NeMo_Minitron_8B_Instruct::test_fp8
-  - accuracy/test_llm_api.py::TestMistral_Nemo_12B_Base::test_fp8
+  - accuracy/test_llm_api.py::TestMistralNemo12B::test_fp8
   - examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] # 7 mins
   - examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1]
   - examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 7c1a5bb8693..c4733b9b36a 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -301,7 +301,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutl
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False] SKIP (https://nvbugs/5457489)
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True] SKIP (https://nvbugs/5457489)
 disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5457504)
-accuracy/test_llm_api.py::TestMistral_Nemo_12B_Base::test_fp8 SKIP (https://nvbugs/5413197)
+accuracy/test_llm_api.py::TestMistralNemo12B::test_fp8 SKIP (https://nvbugs/5413197)
 triton_server/test_triton.py::test_gpt_ib_streaming[gpt-ib-streaming] SKIP (https://nvbugs/5371349)
 triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning] SKIP (https://nvbugs/5445624)
 triton_server/test_triton.py::test_mistral_ib_mm[mistral-ib-mm] SKIP (https://nvbugs/5371343)