From f277afdd93365809dbb76ced50be0cd2f9db55c4 Mon Sep 17 00:00:00 2001
From: Daniel Stokes <40156487+djns99@users.noreply.github.com>
Date: Tue, 15 Jul 2025 09:04:15 +1200
Subject: [PATCH 01/88] perf: Enable 128x256 tile shapes for FP4 MOE CUTLASS
 backend (#5986)

Signed-off-by: Daniel Stokes <40156487+djns99@users.noreply.github.com>
---
 .../kernels/cutlass_kernels/cutlass_heuristic.cpp        | 6 ++++--
 .../moe_gemm/launchers/moe_gemm_tma_ws_launcher.inl      | 9 +++++++--
 .../moe_gemm/moe_gemm_template_dispatch_tma_ws.h         | 2 +-
 .../kernels/cutlass_kernels/python/generate_kernels.py   | 2 +-
 4 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp
index 47d95589b77..9e3bbaa32b7 100644
--- a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp
+++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp
@@ -383,8 +383,10 @@ std::vector<CutlassGemmConfig> get_candidate_configs_sm100(CutlassGemmConfig::Ca
                 MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, ClusterShape::ClusterShape_1x1x1});
             candidate_configs.push_back(CutlassGemmConfig{CutlassTileConfigSM100::CtaShape256x128x128B,
                 MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, ClusterShape::ClusterShape_2x1x1});
-            // candidate_configs.push_back(CutlassGemmConfig{CutlassTileConfigSM100::CtaShape128x256x128B,
-            //     MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, ClusterShape::ClusterShape_1x1x1});
+            candidate_configs.push_back(CutlassGemmConfig{CutlassTileConfigSM100::CtaShape128x256x128B,
+                MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, ClusterShape::ClusterShape_1x1x1});
+            candidate_configs.push_back(CutlassGemmConfig{CutlassTileConfigSM100::CtaShape256x256x128B,
+                MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, ClusterShape::ClusterShape_2x1x1});
             candidate_configs.push_back(CutlassGemmConfig{CutlassTileConfigSM100::CtaShape128x256x128B,
                 MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, ClusterShape::ClusterShape_1x2x1});
             candidate_configs.push_back(CutlassGemmConfig{CutlassTileConfigSM100::CtaShape256x64x128B,
diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.inl b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.inl
index fe35f690a9e..d5f0b198fd8 100644
--- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.inl
+++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.inl
@@ -342,11 +342,16 @@ using SafeBF16 = void;
             using EpilogueTileShapeSm100 = decltype(shape_div(TileShape{}, AtomThrShape{}));                                                                                                                                                      \
             using EpilogueTileShape = std::conditional_t<IsBlackwell, EpilogueTileShapeSm100, EpilogueTileShapeSm90>;                                                                                                                             \
             using EpilogueElementC = std::conditional_t<IsSM120, ElementCSafe, ElementC>;                                                                                                                                                         \
+            using EpilogueTensorOp = std::conditional_t<IsBlackwell && IsBlockScaled,                                                                                                                                                             \
+                cutlass::arch::OpClassBlockScaledTensorOp, cutlass::arch::OpClassTensorOp>;                                                                                                                                                       \
+            using EpilogueSubTile                                                                                                                                                                                                                 \
+                = std::conditional_t<Arch::kMinComputeCapability == 100 && IsFP4 && CTA_N_ == 256, /* SM100 Exactly */                                                                                                                            \
+                    cute::Shape<cute::_128, cute::_64>, cutlass::epilogue::collective::EpilogueTileAuto>;                                                                                                                                         \
             /* Epilogue For Default Finalize */                                                                                                                                                                                                   \
             using CollectiveEpilogueDefault = typename cutlass::epilogue::collective::CollectiveBuilder</**/                                                                                                                                      \
-                Arch, cutlass::arch::OpClassTensorOp,                                                   /**/                                                                                                                                      \
+                Arch, EpilogueTensorOp,                                                                 /**/                                                                                                                                      \
                 EpilogueTileShape, ClusterShape,                                                        /**/                                                                                                                                      \
-                cutlass::epilogue::collective::EpilogueTileAuto,                                        /**/                                                                                                                                      \
+                EpilogueSubTile,                                                                        /**/                                                                                                                                      \
                 ElementAccumulator, ElementAccumulator,                                                 /**/                                                                                                                                      \
                 EpilogueElementC, LayoutC*, AlignmentC,                                                 /**/                                                                                                                                      \
                 ElementD, LayoutD*, AlignmentD,                                                         /**/                                                                                                                                      \
diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws.h
index 57a7da59960..d9df31513f3 100644
--- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws.h
+++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws.h
@@ -159,7 +159,7 @@ constexpr bool are_tile_shapes_supported_sm100()
         // {
         //     return false;
         // }
-        if ((TileN != 64 && TileN != 128) || TileM != 128)
+        if ((TileN != 64 && TileN != 128 && TileN != 256) || TileM != 128)
         {
             return false;
         }
diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/python/generate_kernels.py b/cpp/tensorrt_llm/kernels/cutlass_kernels/python/generate_kernels.py
index c3c6f47904b..838120136c0 100644
--- a/cpp/tensorrt_llm/kernels/cutlass_kernels/python/generate_kernels.py
+++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/python/generate_kernels.py
@@ -359,7 +359,7 @@ def is_gemm_op_valid_sm100(op):
         # TODO 128x256x256 FP4 compiles but crashes
         # if tile_n % 64 != 0 or tile_n < 128:
         #     return False
-        if tile_n not in [64, 128] or tile_m != 128:
+        if tile_n not in [64, 128, 256] or tile_m != 128:
             return False
 
     # Shapes for fp8 small N shapes

From f225f5cd2e21508e23654ae9fdda45a14ad70e3f Mon Sep 17 00:00:00 2001
From: ixlmar <206748156+ixlmar@users.noreply.github.com>
Date: Tue, 15 Jul 2025 00:49:42 +0200
Subject: [PATCH 02/88] [nvbugs-5318143] fix: restrict PyTorch memory usage to
 avoid OOMs (#5964)

Signed-off-by: ixlmar <206748156+ixlmar@users.noreply.github.com>
---
 docker/Dockerfile.multi                       |  3 ++
 tensorrt_llm/_torch/pyexecutor/_util.py       | 44 +++++++++++++++++++
 tensorrt_llm/_torch/pyexecutor/config.py      |  5 +++
 .../_torch/pyexecutor/py_executor_creator.py  |  6 ++-
 tests/integration/test_lists/waives.txt       |  2 -
 5 files changed, 56 insertions(+), 4 deletions(-)

diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi
index a8ee5da002e..da67d0a4994 100644
--- a/docker/Dockerfile.multi
+++ b/docker/Dockerfile.multi
@@ -66,6 +66,9 @@ RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_mpi4py.sh && rm install_mpi4py.s
 ARG TORCH_INSTALL_TYPE="skip"
 COPY docker/common/install_pytorch.sh install_pytorch.sh
 RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh
+#
+# NB: PyTorch requires this to be < 1.0
+ENV PYTORCH_CUDA_ALLOC_CONF="garbage_collection_threshold:0.99999"
 
 # Install OpenCV with FFMPEG support
 RUN pip3 uninstall -y opencv && rm -rf /usr/local/lib/python3*/dist-packages/cv2/
diff --git a/tensorrt_llm/_torch/pyexecutor/_util.py b/tensorrt_llm/_torch/pyexecutor/_util.py
index 6e969a8d1de..979bc83f218 100644
--- a/tensorrt_llm/_torch/pyexecutor/_util.py
+++ b/tensorrt_llm/_torch/pyexecutor/_util.py
@@ -1,3 +1,4 @@
+import os
 import random
 from collections.abc import Iterable
 from typing import Dict, List, Optional
@@ -18,6 +19,7 @@
 
 from ..model_config import ModelConfig
 from ..speculative import get_spec_decoder
+from .config import PyTorchConfig
 from .config_utils import is_mla, is_nemotron_hybrid
 from .kv_cache_transceiver import AttentionTypeCpp, create_kv_cache_transceiver
 from .llm_request import ExecutorResponse
@@ -718,3 +720,45 @@ def _try_infer_num_experts(model_config: ModelConfig) -> int:
         return 1
 
     return num_experts
+
+
+def _adjust_torch_mem_fraction(pytorch_backend_config: PyTorchConfig):
+    # FIXME: PyTorch only uses the garbage_collection_threshold setting
+    #        if a memory fraction is set, cf.
+    #   https://github.com/pytorch/pytorch/blob/cd995bfb2aac8891465809be3ce29543bd524287/c10/cuda/CUDACachingAllocator.cpp#L1357
+    logger.debug("Setting PyTorch memory fraction to 1.0")
+    torch.cuda.set_per_process_memory_fraction(1.0)
+
+    # FIXME: As soon as
+    #     torch.cuda._set_allocator_settings (added in PyTorch 2.8.0-rc1)
+    #   or a similar API is available, the warning below should be removed
+    #   and the allocator GC threshold be set via the new API instead.
+    torch_allocator_config = os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")
+    torch_mem_threshold_advised = (
+        torch.cuda.get_allocator_backend() == "native"
+        and "expandable_segments:True" not in torch_allocator_config)
+    torch_mem_threshold_set = "garbage_collection_threshold:" in torch_allocator_config
+    if torch_mem_threshold_advised and not torch_mem_threshold_set:
+        logger.warning(
+            "It is recommended to incl. 'garbage_collection_threshold:0.???' or 'backend:cudaMallocAsync'"
+            " or 'expandable_segments:True' in PYTORCH_CUDA_ALLOC_CONF.")
+
+    # NOTE: Even if a memory threshold was not set (cf. warning above), setting a memory
+    #       fraction < 1.0 is beneficial, because
+    #         https://github.com/pytorch/pytorch/blob/5228986c395dc79f90d2a2b991deea1eef188260/c10/cuda/CUDACachingAllocator.cpp#L2719
+    #       and
+    #         https://github.com/pytorch/pytorch/blob/5228986c395dc79f90d2a2b991deea1eef188260/c10/cuda/CUDACachingAllocator.cpp#L1240
+    #       lead PyTorch to release all unused memory before hitting the set fraction. This
+    #       still mitigates OOM, although at a higher performance impact, because it
+    #       effectively resets the allocator cache.
+    if not pytorch_backend_config._limit_torch_cuda_mem_fraction:
+        return
+    mem_reserved = torch.cuda.memory_reserved()
+    mem_free, mem_total = torch.cuda.mem_get_info()
+    safety_margin = 32 * 1024**2
+    mem_torch_max = mem_free + mem_reserved - safety_margin
+    mem_torch_fraction = mem_torch_max / mem_total
+    logger.info(
+        f"Setting PyTorch memory fraction to {mem_torch_fraction} ({mem_torch_max / 1024**3} GiB)"
+    )
+    torch.cuda.set_per_process_memory_fraction(mem_torch_fraction)
diff --git a/tensorrt_llm/_torch/pyexecutor/config.py b/tensorrt_llm/_torch/pyexecutor/config.py
index 19278089677..b1935a51234 100644
--- a/tensorrt_llm/_torch/pyexecutor/config.py
+++ b/tensorrt_llm/_torch/pyexecutor/config.py
@@ -92,6 +92,11 @@ class PyTorchConfig:
 
     force_dynamic_quantization: bool = False
 
+    # If true, adjust PyTorch CUDA memory fraction to correspond to the
+    # total GPU memory minus the statically allocated engine memory.
+    # If false, set the PyTorch CUDA memory fraction to 1.0.
+    _limit_torch_cuda_mem_fraction: bool = True
+
 
 EXETENDED_EXECUTOR_CONFIG_FIELDS = [
     'backend',
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
index a72f6a58b12..b6893d69e26 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
@@ -20,8 +20,8 @@
 from ..attention_backend.interface import AttentionRuntimeFeatures
 from ..distributed import MPIDist
 from ..speculative import get_spec_drafter, get_spec_resource_manager
-from ._util import (KvCacheCreator, create_py_executor_instance,
-                    instantiate_sampler, is_mla)
+from ._util import (KvCacheCreator, _adjust_torch_mem_fraction,
+                    create_py_executor_instance, instantiate_sampler, is_mla)
 from .config import PyTorchConfig
 from .config_utils import is_mla
 from .model_engine import PyTorchModelEngine
@@ -432,5 +432,7 @@ def create_py_executor(
                 garbage_collection_gen0_threshold,
             )
 
+    _adjust_torch_mem_fraction(executor_config.pytorch_backend_config)
+
     py_executor.start_worker()
     return py_executor
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index da014ed54de..291e549c648 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -372,8 +372,6 @@ perf/test_perf.py::test_perf[mamba_130m-bench-float16-input_output_len:128,128]
 perf/test_perf.py::test_perf[bert_large-bench-float16-maxbs:32-input_len:128+512] SKIP (https://nvbugspro.nvidia.com/bug/5295411)
 perf/test_perf.py::test_perf[roberta_base-bench-float16-maxbs:32-input_len:128+512] SKIP (https://nvbugspro.nvidia.com/bug/5295411)
 test_e2e.py::test_openai_multi_chat_example SKIP (https://nvbugs/5236980)
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp2pp2-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5318143)
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp2pp2-attn_backend=TRTLLM-torch_compile=True] SKIP (https://nvbugs/5318143)
 disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5328160)
 stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-pytorch-stress-test] SKIP (https://nvbugs/5328495)
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=True] SKIP (https://nvbugs/5322354)

From 2320f12321fef0e508b25de0b5d4ee3f46e6b374 Mon Sep 17 00:00:00 2001
From: Yechan Kim <161688079+yechank-nvidia@users.noreply.github.com>
Date: Tue, 15 Jul 2025 10:26:51 +0900
Subject: [PATCH 03/88] doc: update EXAONE 4.0 news (#6034)

Signed-off-by: yechank <161688079+yechank-nvidia@users.noreply.github.com>
---
 README.md                             |  1 +
 examples/models/core/exaone/README.md | 10 +++++-----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 7c25d94c1e6..99b00e26195 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,7 @@ TensorRT-LLM
 ✨ [➡️ link](./docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md)
 
 ## Latest News
+* [07/15] 🌟 TensorRT-LLM delivers Day-0 support for LG AI Research's latest model, EXAONE 4.0 [➡️ link](https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B)
 * [06/17] Join NVIDIA and DeepInfra for a developer meetup on June 26 ✨ [➡️ link](https://events.nvidia.com/scaletheunscalablenextgenai)
 * [05/22] Blackwell Breaks the 1,000 TPS/User Barrier With Meta’s Llama 4 Maverick
 ✨ [➡️ link](https://developer.nvidia.com/blog/blackwell-breaks-the-1000-tps-user-barrier-with-metas-llama-4-maverick/)
diff --git a/examples/models/core/exaone/README.md b/examples/models/core/exaone/README.md
index cf5be149ddf..51c17e14c02 100644
--- a/examples/models/core/exaone/README.md
+++ b/examples/models/core/exaone/README.md
@@ -52,16 +52,13 @@ git clone https://huggingface.co/LGAI-EXAONE/EXAONE-Deep-2.4B $HF_MODEL_DIR
 
 ### EXAONE-4.0
 
-Download he HuggingFace checkpoints of EXAONE-4.0 model. Here, we only use the `TODO: replace with REAL name, EXAONE-4.0` model for the example. From EXAONE-4.0 model, we support EXAONE models only on PyTorch flow.
+Download he HuggingFace checkpoints of EXAONE-4.0 model. Here, we only use the `EXAONE-4.0-32B` model for the example. From EXAONE-4.0 model, we support only on PyTorch flow.
 
 ```bash
 export HF_MODEL_DIR=hf_models/exaone4
-git clone ... $HF_MODEL_DIR (TODO Change ... to real HF directory)
+git clone https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B $HF_MODEL_DIR
 ```
 
-## Usage
-The next section describe how to convert the weights from the [HuggingFace (HF) Transformers](https://github.com/huggingface/transformers) format to the TensorRT-LLM format. We will use llama's [convert_checkpoint.py](../llama/convert_checkpoint.py) for EXAONE model and then we build the model with `trtllm-build`.
-
 ### Pytorch flow
 
 To quickly run EXAONE-4.0 models, you can use [examples/llm-api/quickstart_advanced.py](../../../llm-api/quickstart_advanced.py):
@@ -116,6 +113,9 @@ Temporarily switching to `DynamicCache` when creating PTQ models could help addr
 For models with sliding window attention, DynamicCache is less memory-efficient than HybridCache because it retains the entire key-value cache. However, this does not break the model's attention logic, as the cache implementation is separated from the attention computation itself. This trade-off is acceptable for the PTQ process, which is a one-time procedure. Our tests confirm that this workaround does not degrade accuracy on MMLU or GSM8K benchmarks with the default ModelOpt settings.
 
 ### TRT flow
+
+The next section describe how to convert the weights from the [HuggingFace (HF) Transformers](https://github.com/huggingface/transformers) format to the TensorRT-LLM format. We will use llama's [convert_checkpoint.py](../llama/convert_checkpoint.py) for EXAONE model and then we build the model with `trtllm-build`.
+
 ### Convert checkpoint and build TensorRT engine(s)
 
 ```bash

From 2ea407799370baf4b2089290a3f1e52b6e4be422 Mon Sep 17 00:00:00 2001
From: Rashid Kaleem <4079439+arekay@users.noreply.github.com>
Date: Mon, 14 Jul 2025 20:29:19 -0500
Subject: [PATCH 04/88] [Model load] Fix llama min-latency model load (#5883)

Signed-off-by: Rashid Kaleem <4079439+arekay@users.noreply.github.com>
---
 tensorrt_llm/_torch/models/modeling_llama.py  |  3 ++
 .../models/modeling_llama_min_latency.py      |  3 ++
 tensorrt_llm/_torch/models/modeling_utils.py  | 30 +++++++++++++++++--
 3 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/tensorrt_llm/_torch/models/modeling_llama.py b/tensorrt_llm/_torch/models/modeling_llama.py
index fc3febe8384..1c17eeb5a8e 100644
--- a/tensorrt_llm/_torch/models/modeling_llama.py
+++ b/tensorrt_llm/_torch/models/modeling_llama.py
@@ -624,6 +624,7 @@ def __init__(self, model_config: ModelConfig[LlamaConfig]):
         self.num_hidden_layers = config.num_hidden_layers
         self.aux_stream = torch.cuda.Stream()
         self.mapping = model_config.mapping
+        self.preload_weight_modules = []
 
         if self.model_config.mapping.enable_attention_dp:
             self.embed_tokens = Embedding(
@@ -646,6 +647,7 @@ def __init__(self, model_config: ModelConfig[LlamaConfig]):
         if model_config.enable_min_latency:
             from .modeling_llama_min_latency import Llama4MinLatencyDecoderLayer
             DecoderLayerClass = Llama4MinLatencyDecoderLayer
+            self.preload_weight_modules = ["gate_up_proj"]
 
         self.layers = nn.ModuleList([
             DecoderLayerClass(
@@ -878,6 +880,7 @@ def __init__(
         model_config.pretrained_config = model_config.pretrained_config.text_config
         model_config.pretrained_config.architectures = architectures
         super().__init__(Llama4Model(model_config), model_config)
+        self.preload_weight_modules = self.model.preload_weight_modules
 
     def forward(
         self,
diff --git a/tensorrt_llm/_torch/models/modeling_llama_min_latency.py b/tensorrt_llm/_torch/models/modeling_llama_min_latency.py
index 88a78cfb136..72a5b4843fb 100644
--- a/tensorrt_llm/_torch/models/modeling_llama_min_latency.py
+++ b/tensorrt_llm/_torch/models/modeling_llama_min_latency.py
@@ -98,6 +98,9 @@ def load_weights(self, weights: List[Dict]):
         # After loading weights, calculate the combined scale (input_scale * weight_scale) for special kernels and
         # trtllm-gen kernels.
         if self.has_fp8_qdq:
+            if self.weight_scale.device != self.input_scale.device:
+                self.weight_scale = torch.nn.Parameter(
+                    self.weight_scale.to(self.input_scale.device))
             self.combined_scale = self.input_scale * self.weight_scale
 
             # If this is gate_up_proj + swiglu and trtllm-gen kernels will be used, we need to reorder the weights
diff --git a/tensorrt_llm/_torch/models/modeling_utils.py b/tensorrt_llm/_torch/models/modeling_utils.py
index a8ce31bf2ce..1dac009f5c1 100755
--- a/tensorrt_llm/_torch/models/modeling_utils.py
+++ b/tensorrt_llm/_torch/models/modeling_utils.py
@@ -525,7 +525,11 @@ def forward(
         )
 
     def load_weights(self, weights: Dict, skip_modules: List[str] = []):
-        _load_weights_impl(self, weights, skip_modules)
+        preload_weight_modules = getattr(self, "preload_weight_modules", None)
+        _load_weights_impl(self,
+                           weights,
+                           skip_modules,
+                           preload_weight_modules=preload_weight_modules)
 
     def infer_max_seq_len(self) -> int:
         # Modified from tensorrt_llm/builder.py _init_max_seq_len
@@ -675,7 +679,10 @@ def run_concurrently(func,
 def _load_weights_impl(model: Union[nn.Module, DecoderModelForCausalLM],
                        weights: Dict,
                        skip_modules: List[str] = [],
-                       params_map: Optional[Dict[str, str]] = None):
+                       params_map: Optional[Dict[str, str]] = None,
+                       preload_weight_modules: Optional[List[str]] = None):
+    # TODO: remove preload_weight_modules - it is a workaround for min-latency llama4 model loading where
+    # we need some order in the module loading. Once this is resolved, we can remove this workaround.
     if not hasattr(model, 'model_config') or not isinstance(
             model.model_config, ModelConfig):
         raise ValueError("model must have a model_config attribute")
@@ -756,7 +763,24 @@ def load_single_module(name, module):
                                  desc="Loading weights"):
             load_single_module(name, module)
     else:
+        all_modules = dict(model.named_modules())
+        serial_load_modules = []
+        if preload_weight_modules is not None:
+            for module in preload_weight_modules:
+                serial_load_modules.extend([
+                    name for name in all_modules.keys() if name.endswith(module)
+                ])
+            logger.info(f"Serial load modules: {serial_load_modules}")
+            pbar = tqdm(serial_load_modules, desc="Loading weights serially")
+            for module in serial_load_modules:
+                # logger.info(f"Loading weights for {module} in serial")
+                load_single_module(module, all_modules[module])
+                pbar.update(1)
+                del all_modules[module]
+            pbar.close()
+
         pbar = tqdm(list(model.named_modules()),
                     desc="Loading weights concurrently")
-        args_list = [(name, module) for name, module in model.named_modules()]
+        args_list = [(name, module) for name, module in model.named_modules()
+                     if name not in serial_load_modules]
         run_concurrently(load_single_module, args_list, pbar=pbar)

From dd2491f47d2022f5f013e467ce62026ef13bcab2 Mon Sep 17 00:00:00 2001
From: Daniel Stokes <40156487+djns99@users.noreply.github.com>
Date: Tue, 15 Jul 2025 13:40:42 +1200
Subject: [PATCH 05/88] fix: Fix MOE benchmark to rotate buffers to prevent L2
 cache reuse (#4135)

Signed-off-by: Daniel Stokes <40156487+djns99@users.noreply.github.com>
---
 .../mixtureOfExpertsBackendBenchmarkFixture.h | 166 +++++++++++++-----
 ...ixtureOfExpertsBackendBenchmarkLauncher.cu |   2 +-
 2 files changed, 122 insertions(+), 46 deletions(-)

diff --git a/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h b/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h
index 3cd87acf251..0790b842d45 100644
--- a/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h
+++ b/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h
@@ -76,6 +76,7 @@ namespace
 // Abstract class for routing config
 struct RoutingConfig
 {
+    virtual void start(){};
     virtual void setRouting(int* selected_experts, int64_t num_experts, int64_t k, int64_t num_tokens) = 0;
     virtual std::string getName() = 0;
     virtual bool isDeterministic() const = 0;
@@ -143,6 +144,11 @@ struct RandomDistributionRoutingConfig : public RoutingConfig
             "Cannot create random routing distribution. Number of experts does not match the number of weights");
     }
 
+    void start()
+    {
+        twister.seed(0xD5);
+    }
+
     std::string getName() override
     {
         return name;
@@ -208,6 +214,11 @@ struct UniformRoutingConfig : public RoutingConfig
 {
     std::mt19937_64 twister{0xD5};
 
+    void start()
+    {
+        twister.seed(0xD5);
+    }
+
     std::string getName() override
     {
         return "uniform";
@@ -522,14 +533,32 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
 
     ActivationType mActType = ActivationType::Relu;
 
-    QuantParams mQuantParams{};
+    constexpr static int64_t NUM_BUFFERS = 32;
+
+    std::array<QuantParams, NUM_BUFFERS> mQuantParams{};
     bool mUseLora = false;
     bool mUsePrequantScale = false;
     int mGroupSize = -1;
-    LoraParams mLoraParams{};
+    std::array<LoraParams, NUM_BUFFERS> mLoraParams{};
 
     std::optional<tensorrt_llm::cutlass_extensions::CutlassGemmConfig> mSelectedConfig = std::nullopt;
 
+    int64_t mBufferIndex = 0;
+    size_t mWorkspaceSize = 0;
+    size_t mExpertWeight1Size = 0;
+    size_t mExpertWeight2Size = 0;
+    size_t mExpertBias1Size = 0;
+    size_t mExpertBias2Size = 0;
+    size_t mInputTensorSize = 0;
+    size_t mFinalOutputSize = 0;
+    size_t mSourceToExpandedMapSize = 0;
+    size_t mScaleProbsSize = 0;
+    size_t mSelectedExpertsSize = 0;
+    size_t mExpertFP4WeightSf1Size = 0;
+    size_t mExpertFP4WeightSf2Size = 0;
+    size_t mExpertIntScale1Size = 0;
+    size_t mExpertIntScale2Size = 0;
+
     template <class T>
     T* allocBuffer(size_t size)
     {
@@ -558,30 +587,39 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
         mGatedMultiplier = mIsGated ? 2 : 1;
         auto const gated_inter = mInterSize * mGatedMultiplier;
 
-        size_t workspace_size
-            = mMoERunner.getWorkspaceSize(mTotalTokens, mHiddenSize, mInterSize, mNumExperts, mK, mActType, {},
-                mUseLora, /*use_deepseek_fp8_block_scale=*/false, /*min_latency_mode=*/false, mUsePrequantScale);
+        mWorkspaceSize = mMoERunner.getWorkspaceSize(mTotalTokens, mHiddenSize, mInterSize, mNumExperts, mK, mActType,
+            {}, mUseLora, /*use_deepseek_fp8_block_scale=*/false, /*min_latency_mode=*/false, mUsePrequantScale);
 
-        mWorkspace = allocBuffer<char>(workspace_size);
+        mWorkspace = allocBuffer<char>(mWorkspaceSize * NUM_BUFFERS);
         size_t const expert_matrix_size = mNumExperts * mHiddenSize * mInterSize;
 
-        mExpertWeight1 = allocBuffer<WeightStorage>(expert_matrix_size * mGatedMultiplier / WEIGHT_ELEM_PER_BYTE);
-        mExpertWeight2 = allocBuffer<WeightStorage>(expert_matrix_size / WEIGHT_ELEM_PER_BYTE);
+        mExpertWeight1Size = expert_matrix_size * mGatedMultiplier / WEIGHT_ELEM_PER_BYTE;
+        mExpertWeight2Size = expert_matrix_size / WEIGHT_ELEM_PER_BYTE;
+        mExpertWeight1 = allocBuffer<WeightStorage>(mExpertWeight1Size * NUM_BUFFERS);
+        mExpertWeight2 = allocBuffer<WeightStorage>(mExpertWeight2Size * NUM_BUFFERS);
 
         mExpertBias1 = nullptr;
         mExpertBias2 = nullptr;
         if (mUseBias)
         {
-            mExpertBias1 = allocBuffer<DataType>(mNumExperts * gated_inter);
-            mExpertBias2 = allocBuffer<DataType>(mNumExperts * mHiddenSize);
+            mExpertBias1Size = mNumExperts * gated_inter;
+            mExpertBias2Size = mNumExperts * mHiddenSize;
+            mExpertBias1 = allocBuffer<DataType>(mExpertBias1Size * NUM_BUFFERS);
+            mExpertBias2 = allocBuffer<DataType>(mExpertBias2Size * NUM_BUFFERS);
         }
 
         if constexpr (INT_QUANT)
         {
-            mExpertIntScale1 = allocBuffer<DataType>(mNumExperts * gated_inter);
-            mExpertIntScale2 = allocBuffer<DataType>(mNumExperts * mHiddenSize);
+            mExpertIntScale1Size = mNumExperts * gated_inter;
+            mExpertIntScale2Size = mNumExperts * mHiddenSize;
+            mExpertIntScale1 = allocBuffer<DataType>(mExpertIntScale1Size * NUM_BUFFERS);
+            mExpertIntScale2 = allocBuffer<DataType>(mExpertIntScale2Size * NUM_BUFFERS);
 
-            mQuantParams = QuantParams::Int(mExpertIntScale1, mExpertIntScale2);
+            for (int i = 0; i < NUM_BUFFERS; i++)
+            {
+                mQuantParams[i] = QuantParams::Int(
+                    mExpertIntScale1 + mExpertIntScale1Size * i, mExpertIntScale2 + mExpertIntScale2Size * i);
+            }
         }
         else if constexpr (FP8)
         {
@@ -589,39 +627,57 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
             mExpertFP8Scale2 = allocBuffer<float>(1);
             mExpertFP8Scale3 = allocBuffer<float>(mNumExperts);
 
-            mQuantParams = QuantParams::FP8(mExpertFP8Scale1, mExpertFP8Scale2, mExpertFP8Scale3);
+            for (int i = 0; i < NUM_BUFFERS; i++)
+            {
+                mQuantParams[i] = QuantParams::FP8(mExpertFP8Scale1, mExpertFP8Scale2, mExpertFP8Scale3);
+            }
         }
         else if constexpr (ANY_FP4)
         {
             mExpertFP4ActScale1 = allocBuffer<float>(1);
-            mExpertFP4WeightSf1 = allocBuffer<ElementSF>(num_experts * gated_inter * mHiddenSize / FP4_VECTOR_SIZE);
+            mExpertFP4WeightSf1Size = num_experts * gated_inter * mHiddenSize / FP4_VECTOR_SIZE;
+            mExpertFP4WeightSf1 = allocBuffer<ElementSF>(mExpertFP4WeightSf1Size * NUM_BUFFERS);
             mExpertFP4GlobalScale1 = allocBuffer<float>(num_experts);
 
             mExpertFP4ActScale2 = allocBuffer<float>(1);
-            mExpertFP4WeightSf2 = allocBuffer<ElementSF>(num_experts * mInterSize * mHiddenSize / FP4_VECTOR_SIZE);
+            mExpertFP4WeightSf2Size = num_experts * mInterSize * mHiddenSize / FP4_VECTOR_SIZE;
+            mExpertFP4WeightSf2 = allocBuffer<ElementSF>(mExpertFP4WeightSf2Size * NUM_BUFFERS);
             mExpertFP4GlobalScale2 = allocBuffer<float>(num_experts);
 
             auto func = NVFP4 ? QuantParams::FP4 : QuantParams::FP8MXFP4;
-            mQuantParams = func(mExpertFP4ActScale1, mExpertFP4WeightSf1, mExpertFP4GlobalScale1, mExpertFP4ActScale2,
-                mExpertFP4WeightSf2, mExpertFP4GlobalScale2, false, false);
+            for (int i = 0; i < NUM_BUFFERS; i++)
+            {
+                mQuantParams[i] = func(mExpertFP4ActScale1, mExpertFP4WeightSf1 + mExpertFP4WeightSf1Size * i,
+                    mExpertFP4GlobalScale1, mExpertFP4ActScale2, mExpertFP4WeightSf2 + mExpertFP4WeightSf2Size * i,
+                    mExpertFP4GlobalScale2, false, false);
+            }
         }
 
-        mSelectedExperts = allocBuffer<int>(mTotalTokens * mK);
-        mScaleProbs = allocBuffer<float>(mTotalTokens * mK);
-        mInputTensor = allocBuffer<DataType>(mTotalTokens * mHiddenSize);
-        mFinalOutput = allocBuffer<OutputType>(mTotalTokens * mHiddenSize);
+        mSelectedExpertsSize = mTotalTokens * mK;
+        mSelectedExperts = allocBuffer<int>(mSelectedExpertsSize * NUM_BUFFERS);
+        mScaleProbsSize = mTotalTokens * mK;
+        mScaleProbs = allocBuffer<float>(mScaleProbsSize * NUM_BUFFERS);
+        mInputTensorSize = mTotalTokens * mHiddenSize;
+        mInputTensor = allocBuffer<DataType>(mInputTensorSize * NUM_BUFFERS);
+        mFinalOutputSize = mTotalTokens * mHiddenSize;
+        mFinalOutput = allocBuffer<OutputType>(mFinalOutputSize * NUM_BUFFERS);
 
-        mSourceToExpandedMap = allocBuffer<int>(mTotalTokens * mK);
+        mSourceToExpandedMapSize = mTotalTokens * mK;
+        mSourceToExpandedMap = allocBuffer<int>(mSourceToExpandedMapSize * NUM_BUFFERS);
 
         mRoutingConfigIndex = routing_config;
         auto tactic = routingConfigCache.at(routing_config);
-        tactic->setRouting(mSelectedExperts, mNumExperts, mK, mTotalTokens);
+        tactic->start();
+        for (int i = 0; i < NUM_BUFFERS; i++)
+        {
+            tactic->setRouting(mSelectedExperts + mSelectedExpertsSize * i, mNumExperts, mK, mTotalTokens);
+        }
 
         check_cuda_error(cudaStreamSynchronize(streamPtr->get()));
     }
 
-    cudaGraph_t mGraph{};
-    cudaGraphExec_t mGraphInstance{};
+    std::array<cudaGraph_t, NUM_BUFFERS> mGraph{};
+    std::array<cudaGraphExec_t, NUM_BUFFERS> mGraphInstance{};
 
     void createGraph(MOEParallelismConfig parallelism_config)
     {
@@ -630,11 +686,15 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
 
         NVTX3_SCOPED_RANGE(BuildGraph);
 
-        check_cuda_error(cudaGraphCreate(&mGraph, 0));
-        check_cuda_error(cudaStreamBeginCapture(streamPtr->get(), cudaStreamCaptureModeThreadLocal));
-        runMoEPermute(parallelism_config);
-        check_cuda_error(cudaStreamEndCapture(streamPtr->get(), &mGraph));
-        check_cuda_error(cudaGraphInstantiate(&mGraphInstance, mGraph, nullptr, nullptr, 0));
+        for (int i = 0; i < NUM_BUFFERS; i++)
+        {
+            mBufferIndex = i;
+            check_cuda_error(cudaGraphCreate(&mGraph[i], 0));
+            check_cuda_error(cudaStreamBeginCapture(streamPtr->get(), cudaStreamCaptureModeThreadLocal));
+            runMoEPermute(parallelism_config);
+            check_cuda_error(cudaStreamEndCapture(streamPtr->get(), &mGraph[i]));
+            check_cuda_error(cudaGraphInstantiate(&mGraphInstance[i], mGraph[i], nullptr, nullptr, 0));
+        }
     }
 
     void destroyGraph()
@@ -644,16 +704,20 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
 
         NVTX3_SCOPED_RANGE(DestroyGraph);
 
-        check_cuda_error(cudaGraphExecDestroy(mGraphInstance));
-        check_cuda_error(cudaGraphDestroy(mGraph));
+        for (int i = 0; i < NUM_BUFFERS; i++)
+        {
+            check_cuda_error(cudaGraphExecDestroy(mGraphInstance[i]));
+            check_cuda_error(cudaGraphDestroy(mGraph[i]));
+        }
     }
 
     float benchmarkLoop(MOEParallelismConfig parallelism_config)
     {
+        mBufferIndex = (mBufferIndex + 1) % NUM_BUFFERS;
         auto tactic = routingConfigCache.at(mRoutingConfigIndex);
         if (!tactic->isDeterministic())
         {
-            tactic->setRouting(mSelectedExperts, mNumExperts, mK, mTotalTokens);
+            tactic->setRouting(mSelectedExperts + mSelectedExpertsSize * mBufferIndex, mNumExperts, mK, mTotalTokens);
         }
 
         {
@@ -661,7 +725,7 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
             check_cuda_error(cudaEventRecord(mStartEvent, streamPtr->get()));
             if (useCudaGraph)
             {
-                cudaGraphLaunch(mGraphInstance, streamPtr->get());
+                cudaGraphLaunch(mGraphInstance[mBufferIndex], streamPtr->get());
             }
             else
             {
@@ -802,17 +866,29 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
         auto stream = streamPtr->get();
         MoeMinLatencyParams min_latency_params;
 #ifdef USING_OSS_CUTLASS_MOE_GEMM
-        mMoERunner.runMoe(mInputTensor, nullptr, mSelectedExperts, mUseFinalScale ? mScaleProbs : nullptr,
-            mExpertWeight1, mExpertBias1, mActType, mExpertWeight2, mExpertBias2, mQuantParams, mTotalTokens,
-            mHiddenSize, mInterSize, mNumExperts, mK, mWorkspace, mFinalOutput, mSourceToExpandedMap,
-            parallelism_config, /*enable_alltoall=*/false, mUseLora, mLoraParams,
-            /*use_deepseek_fp8_block_scale=*/false, /*min_latency_mode=*/false, min_latency_params, stream);
+        mMoERunner.runMoe(mInputTensor + mInputTensorSize * mBufferIndex, nullptr,
+            mSelectedExperts + mSelectedExpertsSize * mBufferIndex,
+            mUseFinalScale ? mScaleProbs + mScaleProbsSize * mBufferIndex : nullptr,
+            mExpertWeight1 + mExpertWeight1Size * mBufferIndex, mExpertBias1 + mExpertBias1Size * mBufferIndex,
+            mActType, mExpertWeight2 + mExpertWeight2Size * mBufferIndex,
+            mExpertBias2 + mExpertBias2Size * mBufferIndex, mQuantParams[mBufferIndex], mTotalTokens, mHiddenSize,
+            mInterSize, mNumExperts, mK, mWorkspace + mWorkspaceSize * mBufferIndex,
+            mFinalOutput + mFinalOutputSize * mBufferIndex,
+            mSourceToExpandedMap + mSourceToExpandedMapSize * mBufferIndex, parallelism_config,
+            /*enable_alltoall=*/false, mUseLora, mLoraParams[mBufferIndex],
+            /*use_fp8_block_scaling=*/false, /*min_latency_mode=*/false, min_latency_params, stream);
 #else
-        mMoERunner.runMoe(mInputTensor, nullptr, mSelectedExperts, mUseFinalScale ? mScaleProbs : nullptr,
-            mExpertWeight1, mExpertBias1, mActType, mExpertWeight2, mExpertBias2, mQuantParams, mTotalTokens,
-            mHiddenSize, mInterSize, mNumExperts, mK, mWorkspace, mFinalOutput, mSourceToExpandedMap,
-            parallelism_config, mUseLora, mLoraParams, /*use_deepseek_fp8_block_scale=*/false,
-            /*min_latency_mode=*/false, min_latency_params, stream);
+        mMoERunner.runMoe(mInputTensor + mInputTensorSize * mBufferIndex, nullptr,
+            mSelectedExperts + mSelectedExpertsSize * mBufferIndex,
+            mUseFinalScale ? mScaleProbs + mScaleProbsSize * mBufferIndex : nullptr,
+            mExpertWeight1 + mExpertWeight1Size * mBufferIndex, mExpertBias1 + mExpertBias1Size * mBufferIndex,
+            mActType, mExpertWeight2 + mExpertWeight2Size * mBufferIndex,
+            mExpertBias2 + mExpertBias2Size * mBufferIndex, mQuantParams[mBufferIndex], mTotalTokens, mHiddenSize,
+            mInterSize, mNumExperts, mK, mWorkspace + mWorkspaceSize * mBufferIndex,
+            mFinalOutput + mFinalOutputSize * mBufferIndex,
+            mSourceToExpandedMap + mSourceToExpandedMapSize * mBufferIndex, parallelism_config, mUseLora,
+            mLoraParams[mBufferIndex],
+            /*use_fp8_block_scaling=*/false, /*min_latency_mode=*/false, min_latency_params, stream);
 #endif
     }
 
diff --git a/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkLauncher.cu b/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkLauncher.cu
index b71509fdbc1..663759e3ff7 100644
--- a/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkLauncher.cu
+++ b/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkLauncher.cu
@@ -623,7 +623,7 @@ void help()
            "    \"dtypes\": [string, ...], (optional)\n"
            "    \"routing_name\": string, (optional)\n"
            "    \"selected_experts\": [int, ...], or string, (optional, length is a multiple of k)\n"
-           "    \"expert_distribtuion\": [float, ...], or string, (optional, length is num_experts)\n"
+           "    \"expert_distribution\": [float, ...], or string, (optional, length is num_experts)\n"
            "  },\n"
            "  ...\n"
            "]\n"

From 24dfd4cd0bf49e3917b8ac69c381391a989aec6c Mon Sep 17 00:00:00 2001
From: jiahanc <173873397+jiahanc@users.noreply.github.com>
Date: Mon, 14 Jul 2025 19:37:26 -0700
Subject: [PATCH 06/88] Doc: Update llama-3.3-70B guide (#6028)

Signed-off-by: jiahanc <173873397+jiahanc@users.noreply.github.com>
---
 examples/models/core/llama/README.md | 52 ++++++++++++++++++++++++++++
 1 file changed, 52 insertions(+)

diff --git a/examples/models/core/llama/README.md b/examples/models/core/llama/README.md
index cdf660035c2..18f950ac4b4 100644
--- a/examples/models/core/llama/README.md
+++ b/examples/models/core/llama/README.md
@@ -37,6 +37,10 @@ This document shows how to build and run a LLaMA model in TensorRT-LLM on both s
     - [Convert Checkpoint to TensorRT-LLM Unified Checkpoint](#convert-checkpoint-to-tensorrt-llm-unified-checkpoint)
     - [Build Engine](#build-engine)
     - [Run Inference](#run-inference)
+  - [Run LLaMa-3.3 70B Model on PyTorch Backend](#run-llama-33-70b-model-on-pytorch-backend)
+    - [Prepare TensorRT-LLM extra configs](#prepare-tensorrt-llm-extra-configs)
+    - [Launch trtllm-serve OpenAI-compatible API server](#launch-trtllm-serve-openai-compatible-api-server)
+    - [Run performance benchmarks](#run-performance-benchmarks)
 
 ## Overview
 
@@ -1542,3 +1546,51 @@ bash -c 'python ./examples/mmlu.py --test_trt_llm \
                                    --kv_cache_free_gpu_memory_fraction 0.999 \
                                    --max_tokens_in_paged_kv_cache 65064'
 ```
+
+## Run LLaMa-3.3 70B Model on PyTorch Backend
+This section provides the steps to run LLaMa-3.3 70B model FP8 precision on PyTorch backend by launching TensorRT-LLM server and run performance benchmarks.
+
+
+### Prepare TensorRT-LLM extra configs
+```bash
+cat >./extra-llm-api-config.yml <<EOF
+stream_interval: 2
+cuda_graph_config:
+  max_batch_size: 1024
+  padding_enabled: true
+EOF
+```
+Explanation:
+- `stream_interval`: The iteration interval to create responses under the streaming mode.
+- `cuda_graph_config`: CUDA Graph config.
+  - `max_batch_size`: Max CUDA graph batch size to capture.
+  - `padding_enabled`: Whether to enable CUDA graph padding.
+
+
+### Launch trtllm-serve OpenAI-compatible API server
+TensorRT-LLM supports nvidia TensorRT Model Optimizer quantized FP8 checkpoint
+``` bash
+trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 \
+    --backend pytorch \
+    --tp_size 8 \
+    --max_batch_size 1024 \
+    --trust_remote_code \
+    --num_postprocess_workers 2 \
+    --extra_llm_api_options ./extra-llm-api-config.yml
+```
+
+### Run performance benchmarks
+TensorRT-LLM provides a benchmark tool to benchmark `trtllm-serve`.
+
+Prepare a new terminal and run `benchmark_serving`.
+```bash
+python -m tensorrt_llm.serve.scripts.benchmark_serving \
+        --model nvidia/Llama-3.3-70B-Instruct-FP8 \
+        --dataset-name random \
+        --ignore-eos \
+        --num-prompts 8192 \
+        --random-input-len 1024 \
+        --random-output-len 2048 \
+        --random-ids \
+        --max-concurrency 1024 \
+```

From 01b2def5efe64bd162500f24fd3d40c5c8fcf143 Mon Sep 17 00:00:00 2001
From: Zhanrui Sun <184402041+ZhanruiSunCh@users.noreply.github.com>
Date: Tue, 15 Jul 2025 11:06:03 +0800
Subject: [PATCH 07/88] infra: [TRTLLM-6331] Support show all stage name list
 when stage name check failed (#5946)

Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com>
Signed-off-by: Zhanrui Sun <184402041+ZhanruiSunCh@users.noreply.github.com>
---
 jenkins/L0_Test.groovy | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index 7dfac2415b5..26c52689766 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -1642,7 +1642,8 @@ def checkStageNameSet(stageNames, jobKeys, paramName) {
     echo "Validate stage names for the passed GitLab bot params [${paramName}]."
     invalidStageName = stageNames.findAll { !(it in jobKeys) }
     if (invalidStageName) {
-        throw new Exception("Cannot find the stage names [${invalidStageName}] from the passed params [${paramName}].")
+        def sortedJobKeys = jobKeys.sort()
+        throw new Exception("Cannot find the stage names [${invalidStageName}] from the passed params [${paramName}]. Available stage names (${sortedJobKeys.size()} total):\n${sortedJobKeys.collect { "    ${it}" }.join('\n')}")
     }
 }
 

From 6b35afaf1bc3983e3cf871bf76c31e89a3470866 Mon Sep 17 00:00:00 2001
From: Yiqing Yan <yiqingy@nvidia.com>
Date: Tue, 15 Jul 2025 11:27:21 +0800
Subject: [PATCH 08/88] [Infra][TRTLLM-6013] - Fix stage name in single stage
 test rerun report (#5672)

Signed-off-by: Yiqing Yan <yiqingy@nvidia.com>
Co-authored-by: Yanchao Lu <yanchaol@nvidia.com>
---
 jenkins/L0_MergeRequest.groovy                    |  2 +-
 jenkins/L0_Test.groovy                            | 11 +++++++----
 {tests/integration/defs => jenkins}/test_rerun.py |  0
 3 files changed, 8 insertions(+), 5 deletions(-)
 rename {tests/integration/defs => jenkins}/test_rerun.py (100%)

diff --git a/jenkins/L0_MergeRequest.groovy b/jenkins/L0_MergeRequest.groovy
index 6773d1c3ef5..ecfdac3a8dc 100644
--- a/jenkins/L0_MergeRequest.groovy
+++ b/jenkins/L0_MergeRequest.groovy
@@ -771,7 +771,7 @@ def collectTestResults(pipeline, testFilter)
             trtllm_utils.llmExecStepWithRetry(pipeline, script: "apk add py3-pip")
             trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 config set global.break-system-packages true")
             sh """
-                python3 llm/tests/integration/defs/test_rerun.py \
+                python3 llm/jenkins/test_rerun.py \
                 generate_rerun_report \
                 --output-file=rerun/rerun_report.xml \
                 --input-files=${inputfiles}
diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index 26c52689766..548846612f7 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -1111,7 +1111,7 @@ def rerunFailedTests(stageName, llmSrc, testCmdLine) {
     // Generate rerun test lists
     def failSignaturesList = trtllm_utils.getFailSignaturesList().join(",")
     sh """
-        python3 ${llmSrc}/tests/integration/defs/test_rerun.py \
+        python3 ${llmSrc}/jenkins/test_rerun.py \
         generate_rerun_tests_list \
         --output-dir=${WORKSPACE}/${stageName}/ \
         --input-file=${WORKSPACE}/${stageName}/results.xml \
@@ -1184,12 +1184,15 @@ def rerunFailedTests(stageName, llmSrc, testCmdLine) {
         }
     }
 
-    // generate rerun report
+    // Specify the stage name correctly
+    sh "cd ${WORKSPACE}/${stageName} && sed -i 's/testsuite name=\"pytest\"/testsuite name=\"${stageName}\"/g' *.xml || true"
+
+    // Generate rerun report
     inputFiles = ["${WORKSPACE}/${stageName}/results.xml",
                   "${WORKSPACE}/${stageName}/rerun_results_1.xml",
                   "${WORKSPACE}/${stageName}/rerun_results_2.xml"]
     sh """
-        python3 ${llmSrc}/tests/integration/defs/test_rerun.py \
+        python3 ${llmSrc}/jenkins/test_rerun.py \
         generate_rerun_report \
         --output-file=${WORKSPACE}/${stageName}/rerun_results.xml \
         --input-files=${inputFiles.join(",")}
@@ -1197,7 +1200,7 @@ def rerunFailedTests(stageName, llmSrc, testCmdLine) {
 
     // Update original results xml file with rerun results xml files for junit
     sh """
-        python3 ${llmSrc}/tests/integration/defs/test_rerun.py \
+        python3 ${llmSrc}/jenkins/test_rerun.py \
         merge_junit_xmls \
         --output-file=${WORKSPACE}/${stageName}/results.xml \
         --input-files=${inputFiles.join(",")} \
diff --git a/tests/integration/defs/test_rerun.py b/jenkins/test_rerun.py
similarity index 100%
rename from tests/integration/defs/test_rerun.py
rename to jenkins/test_rerun.py

From e499f6c44ab32674ae99b4ff31b01a9cb41c6765 Mon Sep 17 00:00:00 2001
From: Lucas Liebenwein <11156568+lucaslie@users.noreply.github.com>
Date: Tue, 15 Jul 2025 01:31:35 -0400
Subject: [PATCH 09/88] [Fix] check for ImportError or ModuleNotFoundError for
 deep_ep_utils (#6026)

Signed-off-by: Lucas Liebenwein <11156568+lucaslie@users.noreply.github.com>
---
 tensorrt_llm/_torch/modules/fused_moe/deep_ep_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorrt_llm/_torch/modules/fused_moe/deep_ep_utils.py b/tensorrt_llm/_torch/modules/fused_moe/deep_ep_utils.py
index 178d4d35849..62146d9295f 100644
--- a/tensorrt_llm/_torch/modules/fused_moe/deep_ep_utils.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/deep_ep_utils.py
@@ -12,7 +12,7 @@
 try:
     from tensorrt_llm.deep_ep import Buffer
     deep_ep_installed = True
-except ModuleNotFoundError:
+except ImportError:
     deep_ep_installed = False
 
 

From d811843a08ae6fcd61b5c1f53d8fd21865862800 Mon Sep 17 00:00:00 2001
From: Zhanrui Sun <184402041+ZhanruiSunCh@users.noreply.github.com>
Date: Tue, 15 Jul 2025 14:39:31 +0800
Subject: [PATCH 10/88] =?UTF-8?q?infra:=20[TRTLLM-6313]=20Fix=20the=20pack?=
 =?UTF-8?q?age=20sanity=20stage=20'Host=20Node=20Name'=20in=E2=80=A6=20(#5?=
 =?UTF-8?q?945)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com>
---
 jenkins/L0_Test.groovy | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index 548846612f7..7dec81f7fde 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -2019,6 +2019,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
                     pipInstallSanitySpec = createKubernetesPodConfig(values[5], gpu_type, k8s_arch)
                     trtllm_utils.launchKubernetesPod(pipeline, pipInstallSanitySpec, "trt-llm", {
                         echo "###### Prerequisites Start ######"
+                        echoNodeAndGpuInfo(pipeline, toStageName(values[1], key))
                         // Clean up the pip constraint file from the base NGC PyTorch image.
                         if (values[5] == DLFW_IMAGE) {
                             trtllm_utils.llmExecStepWithRetry(pipeline, script: "[ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true")
@@ -2064,7 +2065,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
                         }
                         withEnv(libEnv) {
                             sh "env | sort"
-                            runLLMTestlistOnPlatform(pipeline, gpu_type, "l0_sanity_check", config, false, "${values[1]}-${key}-sanity-check" , 1, 1, true, null)
+                            runLLMTestlistOnPlatform(pipeline, gpu_type, "l0_sanity_check", config, false, toStageName(values[1], key), 1, 1, true, null)
                         }
                     })
                 }

From 4e4d18826fbd476bb05a09307bc0abe350b5a235 Mon Sep 17 00:00:00 2001
From: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com>
Date: Tue, 15 Jul 2025 14:50:03 +0800
Subject: [PATCH 11/88] =?UTF-8?q?chore:=20[Breaking=20Change]=20Rename=20c?=
 =?UTF-8?q?uda=5Fgraph=5Fconfig=20padding=5Fenabled=20fie=E2=80=A6=20(#600?=
 =?UTF-8?q?3)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com>
---
 ...practice_on_DeepSeek-R1_in_TensorRT-LLM.md |   7 +-
 ..._R1_MTP_Implementation_and_Optimization.md |   6 +-
 ..._R1_Throughput_on_NVIDIA_Blackwell_GPUs.md |   2 +-
 ...ling_Expert_Parallelism_in_TensorRT-LLM.md |   3 +-
 docs/source/performance/perf-overview.md      |   2 +-
 docs/source/scripts/disaggregated/gen_yaml.py |   6 +-
 examples/llm-api/llm_runtime.py               |   2 +-
 examples/llm-api/quickstart_advanced.py       |   6 +-
 examples/models/core/deepseek_v3/README.md    |  13 ++-
 examples/models/core/llama/README.md          |   4 +-
 examples/models/core/llama4/README.md         |   8 +-
 examples/models/core/qwen/README.md           |   4 +-
 examples/wide_ep/ep_load_balancer/README.md   |  26 +++--
 examples/wide_ep/slurm_scripts/gen_yaml.py    |   2 +-
 .../_torch/pyexecutor/model_engine.py         |   6 +-
 tensorrt_llm/bench/benchmark/utils/general.py |   2 +-
 tensorrt_llm/llmapi/__init__.py               |   3 +-
 tensorrt_llm/llmapi/llm_args.py               | 103 ++++++++++--------
 .../defs/accuracy/test_llm_api_pytorch.py     |  45 ++++----
 ..._lite_attention_dp_overlap_cuda_graph.yaml |   2 +-
 ...2_deepseek_v3_lite_overlap_cuda_graph.yaml |   2 +-
 .../disagg_config_cuda_graph_padding.yaml     |   2 +-
 .../defs/perf/pytorch_model_config.py         |   6 +-
 .../defs/stress_test/stress_test.py           |   2 +-
 .../_torch/modeling/test_modeling_deepseek.py |   4 +-
 .../multi_gpu_modeling/test_deepseek.py       |   6 +-
 .../_torch/test_pytorch_model_engine.py       |   5 +-
 .../api_stability/references/llm.yaml         |  10 +-
 tests/unittest/llmapi/test_llm_args.py        |   6 +-
 29 files changed, 156 insertions(+), 139 deletions(-)

diff --git a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md
index f17caefc445..98c72e700d6 100644
--- a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md
+++ b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md
@@ -138,7 +138,8 @@ YOUR_DATA_PATH=<your dataset file following the format>
 
 cat >./extra-llm-api-config.yml<<EOF
 cuda_graph_config: {}
-moe_backend: TRTLLM
+moe_config:
+  backend: TRTLLM
 speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
@@ -196,7 +197,7 @@ We are seeing meaningful speedup using FP8 KV cache, thus refreshing the numbers
 ```bash
 cat >./extra-llm-api-config.yml <<EOF
 cuda_graph_config:
-  padding_enabled: true
+  enable_padding: true
   batch_sizes:
   - 896
   - 512
@@ -263,7 +264,7 @@ YOUR_DATA_PATH=./dataset.txt
 
 cat >./extra-llm-api-config.yml <<EOF
 cuda_graph_config:
-  padding_enabled: true
+  enable_padding: true
   batch_sizes:
   - 1
   - 2
diff --git a/docs/source/blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.md b/docs/source/blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.md
index b2749c3fe1b..9093aa663a3 100644
--- a/docs/source/blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.md
+++ b/docs/source/blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.md
@@ -124,7 +124,8 @@ YOUR_DATA_PATH=<your dataset file following the format>
 
 cat >./extra-llm-api-config.yml<<EOF
 cuda_graph_config: {}
-moe_backend: TRTLLM
+moe_config:
+  backend: TRTLLM
 speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
@@ -179,7 +180,8 @@ YOUR_DATA_PATH=<your dataset file following the format>
 
 cat >./extra-llm-api-config.yml<<EOF
 cuda_graph_config: {}
-moe_backend: TRTLLM
+moe_config:
+  backend: TRTLLM
 speculative_config:
     decoding_type: MTP
     num_nextn_predict_layers: 3
diff --git a/docs/source/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md b/docs/source/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md
index ea9373dad67..a5bb524e661 100644
--- a/docs/source/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md
+++ b/docs/source/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md
@@ -157,7 +157,7 @@ These optimizations target the overall execution flow, scheduling, and resource
 
     There is a feature called CUDA Graph padding in TensorRT-LLM, which is a good trade-off between the number of CUDA Graphs and the CUDA Graph hit ratio; it tries to pad a batch to the nearest one with a captured CUDA Graph. Normally you should enable the CUDA Graph padding feature to increase the CUDA Graph hit rate, but the padding itself has some overhead due to wasted tokens computation.
 
-    Users can opt-out the CUDA Graph padding feature to see the perf benefits, by setting the `cuda_graph_config:\n  padding_enabled: False`, see API here [Pytorch backend config](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/pyexecutor/config.py#L41)
+    Users can opt-out the CUDA Graph padding feature to see the perf benefits, by setting the `cuda_graph_config:\n  enable_padding: False`, see API here [Pytorch backend config](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/pyexecutor/config.py#L41)
 
 * Overlap Scheduler:
 
diff --git a/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md b/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md
index 53fdaf44881..d6fbd8128f9 100644
--- a/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md
+++ b/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md
@@ -623,7 +623,8 @@ Run 36-way expert parallelism inference with the EPLB configuration incorporated
 cat > ./extra_llm_api_options_eplb.yaml <<EOF
 enable_attention_dp: true
 cuda_graph_config: {}
-moe_load_balancer: ./moe_load_balancer.yaml
+moe_config:
+  load_balancer: ./moe_load_balancer.yaml
 EOF
 
 trtllm-llmapi-launch \
diff --git a/docs/source/performance/perf-overview.md b/docs/source/performance/perf-overview.md
index 05c4918db52..3f55a4e1095 100644
--- a/docs/source/performance/perf-overview.md
+++ b/docs/source/performance/perf-overview.md
@@ -201,7 +201,7 @@ trtllm-bench --model $model_name throughput --dataset $dataset_file --backend py
 `llm_options.yml`
 ```yaml
 cuda_graph_config:
-  padding_enabled: true
+  enable_padding: true
   batch_sizes:
     - 1
     - 2
diff --git a/docs/source/scripts/disaggregated/gen_yaml.py b/docs/source/scripts/disaggregated/gen_yaml.py
index 4cd8288fee9..1d198a9766d 100644
--- a/docs/source/scripts/disaggregated/gen_yaml.py
+++ b/docs/source/scripts/disaggregated/gen_yaml.py
@@ -190,12 +190,14 @@ def gen_config_file(config_path: str,
             'max_seq_len': 8576,
             'free_gpu_memory_fraction': gen_gpu_memory_fraction,
             'cuda_graph_config': {
-                'padding_enabled': True,
+                'enable_padding': True,
                 'batch_sizes': gen_cuda_graph_batch_sizes,
             },
             'print_iter_log': True,
             'kv_cache_dtype': 'fp8',
-            'moe_backend': 'TRTLLM',
+            'moe_config': {
+                'backend': 'TRTLLM',
+            },
             'cache_transceiver_config': {
                 'max_num_tokens': 8320,
             },
diff --git a/examples/llm-api/llm_runtime.py b/examples/llm-api/llm_runtime.py
index 5a6252400b1..deebdd68eb8 100644
--- a/examples/llm-api/llm_runtime.py
+++ b/examples/llm-api/llm_runtime.py
@@ -21,7 +21,7 @@ def example_cuda_graph_config():
 
     cuda_graph_config = CudaGraphConfig(
         batch_sizes=[1, 2, 4],
-        padding_enabled=True,
+        enable_padding=True,
     )
 
     llm = LLM(
diff --git a/examples/llm-api/quickstart_advanced.py b/examples/llm-api/quickstart_advanced.py
index 352a23893ca..4abb501b6d9 100644
--- a/examples/llm-api/quickstart_advanced.py
+++ b/examples/llm-api/quickstart_advanced.py
@@ -2,7 +2,7 @@
 
 from tensorrt_llm import LLM, SamplingParams
 from tensorrt_llm.llmapi import (CudaGraphConfig, DraftTargetDecodingConfig,
-                                 EagleDecodingConfig, KvCacheConfig,
+                                 EagleDecodingConfig, KvCacheConfig, MoeConfig,
                                  MTPDecodingConfig, NGramDecodingConfig,
                                  TorchCompileConfig)
 
@@ -188,7 +188,7 @@ def setup_llm(args):
 
     cuda_graph_config = CudaGraphConfig(
         batch_sizes=args.cuda_graph_batch_sizes,
-        padding_enabled=args.cuda_graph_padding_enabled,
+        enable_padding=args.cuda_graph_padding_enabled,
     ) if args.use_cuda_graph else None
     llm = LLM(
         model=args.model_dir,
@@ -207,7 +207,7 @@ def setup_llm(args):
             enable_piecewise_cuda_graph= \
                 args.use_piecewise_cuda_graph)
         if args.use_torch_compile else None,
-        moe_backend=args.moe_backend,
+        moe_config=MoeConfig(backend=args.moe_backend),
         enable_trtllm_sampler=args.enable_trtllm_sampler,
         max_seq_len=args.max_seq_len,
         max_batch_size=args.max_batch_size,
diff --git a/examples/models/core/deepseek_v3/README.md b/examples/models/core/deepseek_v3/README.md
index d39d00d1342..fa4561066dc 100644
--- a/examples/models/core/deepseek_v3/README.md
+++ b/examples/models/core/deepseek_v3/README.md
@@ -142,7 +142,7 @@ python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \
 
 cat <<EOF > /tmp/extra-llm-api-config.yml
 cuda_graph_config:
-  padding_enabled: true
+  enable_padding: true
   batch_sizes: [1, 4, 8, 12]
 EOF
 
@@ -169,9 +169,10 @@ python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \
 
 cat <<EOF > /tmp/extra-llm-api-config.yml
 cuda_graph_config:
-  padding_enabled: true
+  enable_padding: true
   batch_sizes: [1, 2]
-moe_max_num_tokens: 16384
+moe_config:
+  max_num_tokens: 16384
 EOF
 
 trtllm-bench -m deepseek-ai/DeepSeek-R1 --model_path ${DS_R1_NVFP4_MODEL_PATH} throughput \
@@ -237,7 +238,7 @@ To serve the model using `trtllm-serve`:
 ```bash
 cat >./extra-llm-api-config.yml <<EOF
 cuda_graph_config:
-  padding_enabled: true
+  enable_padding: true
   batch_sizes:
     - 1
     - 2
@@ -316,7 +317,7 @@ export TRTLLM_USE_UCX_KVCACHE=1
 
 cat >./gen-extra-llm-api-config.yml <<EOF
 cuda_graph_config:
-  padding_enabled: true
+  enable_padding: true
   batch_sizes:
     - 1
     - 2
@@ -539,7 +540,7 @@ python3 /path/to/TensorRT-LLM/benchmarks/cpp/prepare_dataset.py \
 
 cat >/path/to/TensorRT-LLM/extra-llm-api-config.yml <<EOF
 cuda_graph_config:
-  padding_enabled: true
+  enable_padding: true
   batch_sizes:
     - 1
     - 2
diff --git a/examples/models/core/llama/README.md b/examples/models/core/llama/README.md
index 18f950ac4b4..ce67674e845 100644
--- a/examples/models/core/llama/README.md
+++ b/examples/models/core/llama/README.md
@@ -1557,14 +1557,14 @@ cat >./extra-llm-api-config.yml <<EOF
 stream_interval: 2
 cuda_graph_config:
   max_batch_size: 1024
-  padding_enabled: true
+  enable_padding: true
 EOF
 ```
 Explanation:
 - `stream_interval`: The iteration interval to create responses under the streaming mode.
 - `cuda_graph_config`: CUDA Graph config.
   - `max_batch_size`: Max CUDA graph batch size to capture.
-  - `padding_enabled`: Whether to enable CUDA graph padding.
+  - `enable_padding`: Whether to enable CUDA graph padding.
 
 
 ### Launch trtllm-serve OpenAI-compatible API server
diff --git a/examples/models/core/llama4/README.md b/examples/models/core/llama4/README.md
index d7714110def..7e1644d5d94 100644
--- a/examples/models/core/llama4/README.md
+++ b/examples/models/core/llama4/README.md
@@ -29,7 +29,7 @@ enable_attention_dp: true
 stream_interval: 2
 cuda_graph_config:
   max_batch_size: 512
-  padding_enabled: true
+  enable_padding: true
 EOF
 ```
 Explanation:
@@ -37,7 +37,7 @@ Explanation:
 - `stream_interval`: The iteration interval to create responses under the streaming mode.
 - `cuda_graph_config`: CUDA Graph config.
   - `max_batch_size`: Max CUDA graph batch size to capture.
-  - `padding_enabled`: Whether to enable CUDA graph padding.
+  - `enable_padding`: Whether to enable CUDA graph padding.
 
 
 #### 2. Launch trtllm-serve OpenAI-compatible API server
@@ -81,7 +81,7 @@ enable_min_latency: true
 stream_interval: 2
 cuda_graph_config:
   max_batch_size: 8
-  padding_enabled: true
+  enable_padding: true
 EOF
 ```
 Explanation:
@@ -90,7 +90,7 @@ Explanation:
 - `stream_interval`: The iteration interval to create responses under the streaming mode.
 - `cuda_graph_config`: CUDA Graph config.
   - `max_batch_size`: Max CUDA graph batch size to capture.
-  - `padding_enabled`: Whether to enable CUDA graph padding.
+  - `enable_padding`: Whether to enable CUDA graph padding.
 
 
 #### 2. Launch trtllm-serve OpenAI-compatible API server
diff --git a/examples/models/core/qwen/README.md b/examples/models/core/qwen/README.md
index 0de4f0f8e19..83e0eab5284 100644
--- a/examples/models/core/qwen/README.md
+++ b/examples/models/core/qwen/README.md
@@ -745,7 +745,7 @@ To serve the model using `trtllm-serve`:
 ```bash
 cat >./extra-llm-api-config.yml <<EOF
 cuda_graph_config:
-  padding_enabled: true
+  enable_padding: true
   batch_sizes:
   - 1
   - 2
@@ -821,7 +821,7 @@ export TRTLLM_USE_UCX_KVCACHE=1
 
 cat >./gen-extra-llm-api-config.yml <<EOF
 cuda_graph_config:
-  padding_enabled: true
+  enable_padding: true
   batch_sizes:
     - 1
     - 2
diff --git a/examples/wide_ep/ep_load_balancer/README.md b/examples/wide_ep/ep_load_balancer/README.md
index 7417a196a1a..454d8681d9f 100644
--- a/examples/wide_ep/ep_load_balancer/README.md
+++ b/examples/wide_ep/ep_load_balancer/README.md
@@ -28,7 +28,9 @@ cat > ./extra_llm_api_options.yaml <<EOF
 enable_attention_dp: true
 cuda_graph_config: {}
 moe_backend: WideEP
-moe_max_num_tokens: 8192
+moe_config:
+    backend: WideEP
+    max_num_tokens: 8192
 EOF
 
 trtllm-llmapi-launch \
@@ -117,9 +119,10 @@ Run 36-way expert parallelism inference with the EPLB configuration incorporated
 cat > ./extra_llm_api_options_eplb.yaml <<EOF
 enable_attention_dp: true
 cuda_graph_config: {}
-moe_backend: WideEP
-moe_max_num_tokens: 9216
-moe_load_balancer: ./moe_load_balancer.yaml
+moe_config:
+    backend: WideEP
+    max_num_tokens: 9216
+    load_balancer: ./moe_load_balancer.yaml
 EOF
 
 trtllm-llmapi-launch \
@@ -183,9 +186,10 @@ Run 36-way expert parallelism inference with the EPLB configuration incorporated
 cat > ./extra_llm_api_options_eplb.yaml <<EOF
 enable_attention_dp: true
 cuda_graph_config: {}
-moe_backend: WideEP
-moe_max_num_tokens: 9216
-moe_load_balancer: ./moe_load_balancer.yaml
+moe_config:
+    backend: WideEP
+    max_num_tokens: 9216
+    load_balancer: ./moe_load_balancer.yaml
 EOF
 
 trtllm-llmapi-launch \
@@ -204,9 +208,9 @@ trtllm-bench --model ${MODEL_NAME} \
 
 > **Note:** Similar to offline EP Load Balancer, you can enable expert ID counting to verify the effectiveness of EPLB, but remember to disable it when running inference for benchmarking or production purposes.
 
-> **Explanation on moe_max_num_tokens:** For Large Scale EP, there can be extreme conditions that all ranks send tokens to a single rank since they all want that expert.
+> **Explanation on max_num_tokens of moe_config:** For Large Scale EP, there can be extreme conditions that all ranks send tokens to a single rank since they all want that expert.
 In that case, that rank will have too many tokens to compute. In order not to make the hot rank OOM, there is one strategy that chunk the tokens if there are too much.
-`moe_max_num_tokens` is the parameter that controls the max chunk size. However, this may have performance penalty if there is enough since batch size is smaller.
+`max_num_tokens` of moe_config is the parameter that controls the max chunk size. However, this may have performance penalty if there is enough since batch size is smaller.
 So by default, it is set to some value that all tokens can complete in one wave. However, if EP size is large, we may need to trade off that in order not to OOM or got other runtime errors due to lack of memory.
-One good point is that if memory is OK, we can set `moe_max_num_tokens` to `max_batch_size * ep_size` to make all generation requests can be processed in one chunk.
-For example, if `ep_size` is 36 and `max_batch_size` is 256, we may set `moe_max_num_tokens` to 9216.
+One good point is that if memory is OK, we can set `max_num_tokens` to `max_batch_size * ep_size` to make all generation requests can be processed in one chunk.
+For example, if `ep_size` is 36 and `max_batch_size` is 256, we may set `max_num_tokens` to 9216.
diff --git a/examples/wide_ep/slurm_scripts/gen_yaml.py b/examples/wide_ep/slurm_scripts/gen_yaml.py
index fd1de76b98b..121f614d870 100644
--- a/examples/wide_ep/slurm_scripts/gen_yaml.py
+++ b/examples/wide_ep/slurm_scripts/gen_yaml.py
@@ -196,7 +196,7 @@ def gen_config_file(config_path: str,
             'max_seq_len': 2176,
             'free_gpu_memory_fraction': gen_gpu_memory_fraction,
             'cuda_graph_config': {
-                'padding_enabled': True,
+                'enable_padding': True,
                 'batch_sizes': gen_cuda_graph_batch_sizes,
             },
             'print_iter_log': True,
diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
index 204094646a1..42a0c001076 100644
--- a/tensorrt_llm/_torch/pyexecutor/model_engine.py
+++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -309,7 +309,7 @@ def get_rank_model_storage(model):
 def _filter_cuda_graph_batch_sizes(cuda_graph_batch_sizes: list[int],
                                    max_batch_size: int, max_num_tokens: int,
                                    max_draft_len: int,
-                                   padding_enabled: bool) -> list[int]:
+                                   enable_padding: bool) -> list[int]:
     # This is the largest possible batch size for a pure decoding batch.
     max_cuda_graph_bs = min(max_batch_size,
                             int(max_num_tokens / (1 + max_draft_len)))
@@ -326,8 +326,8 @@ def _filter_cuda_graph_batch_sizes(cuda_graph_batch_sizes: list[int],
             # is that if the user is OK padding to a batch size B, they should also
             # be OK with padding to some size B' < B since the performance will generally
             # just be better in the smaller case.
-            if padding_enabled and (i == 0
-                                    or result[i - 1] != max_cuda_graph_bs):
+            if enable_padding and (i == 0
+                                   or result[i - 1] != max_cuda_graph_bs):
                 logger.warning(
                     "CUDA graph padding is enabled, but one of the given CUDA graph "
                     f"batch sizes ({bs}) is larger than the executor's max batch size "
diff --git a/tensorrt_llm/bench/benchmark/utils/general.py b/tensorrt_llm/bench/benchmark/utils/general.py
index 0db7d7b21b4..153e262276f 100755
--- a/tensorrt_llm/bench/benchmark/utils/general.py
+++ b/tensorrt_llm/bench/benchmark/utils/general.py
@@ -152,7 +152,7 @@ def get_settings(params: dict, dataset_metadata: DatasetMetadata, model: str,
             pass
 
     cuda_graph_config = {
-        "padding_enabled": True,
+        "enable_padding": True,
         "max_batch_size": max_batch_size
     }
 
diff --git a/tensorrt_llm/llmapi/__init__.py b/tensorrt_llm/llmapi/__init__.py
index a912b1c80c8..24f7ad00e75 100644
--- a/tensorrt_llm/llmapi/__init__.py
+++ b/tensorrt_llm/llmapi/__init__.py
@@ -9,7 +9,7 @@
                        CudaGraphConfig, DraftTargetDecodingConfig,
                        DynamicBatchConfig, EagleDecodingConfig,
                        ExtendedRuntimePerfKnobConfig, KvCacheConfig, LlmArgs,
-                       LookaheadDecodingConfig, MedusaDecodingConfig,
+                       LookaheadDecodingConfig, MedusaDecodingConfig, MoeConfig,
                        MTPDecodingConfig, NGramDecodingConfig, SchedulerConfig,
                        TorchCompileConfig, TorchLlmArgs, TrtLlmArgs,
                        UserProvidedDecodingConfig)
@@ -27,6 +27,7 @@
     'KvCacheConfig',
     'KvCacheRetentionConfig',
     'CudaGraphConfig',
+    'MoeConfig',
     'LookaheadDecodingConfig',
     'MedusaDecodingConfig',
     'EagleDecodingConfig',
diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py
index 4ca266d53ed..1b385b6e8fc 100644
--- a/tensorrt_llm/llmapi/llm_args.py
+++ b/tensorrt_llm/llmapi/llm_args.py
@@ -72,7 +72,7 @@ class CudaGraphConfig(BaseModel):
     max_batch_size: int = Field(
         default=0, description="Maximum batch size for CUDA graphs.")
 
-    padding_enabled: bool = Field(
+    enable_padding: bool = Field(
         default=False,
         description=
         "If true, batches are rounded up to the nearest cuda_graph_batch_size. This is usually a net win for performance."
@@ -88,6 +88,30 @@ def validate_cuda_graph_max_batch_size(cls, v):
         return v
 
 
+class MoeConfig(BaseModel):
+    """
+    Configuration for MoE.
+    """
+    backend: Literal["CUTLASS", "CUTEDSL", "WIDEEP", "TRTLLM",
+                     "VANILLA"] = Field(default='CUTLASS',
+                                        description="MoE backend to use.")
+
+    max_num_tokens: Optional[int] = Field(
+        default=None,
+        description=
+        "If set, at most max_num_tokens tokens will be sent to torch.ops.trtllm.fused_moe at the same time. If the number of tokens exceeds max_num_tokens, the input tensors will be split into chunks and a for loop will be used."
+    )
+
+    load_balancer: Optional[Union[object, str]] = Field(
+        default=None,
+        description="Configuration for MoE load balancing.",
+        json_schema_extra={"type": "Union[MoeLoadBalancerConfig, str]"})
+
+    @classmethod
+    def from_dict(cls, data: dict):
+        return cls(**data)
+
+
 @dataclass
 class _ParallelConfig:
     ''' The model distribution configs for LLM.  '''
@@ -1768,26 +1792,12 @@ class TorchLlmArgs(BaseLlmArgs):
     disable_overlap_scheduler: bool = Field(
         default=False, description="Disable the overlap scheduler.")
 
-    moe_max_num_tokens: Optional[int] = Field(
-        default=None,
-        description=
-        "If set, at most moe_max_num_tokens tokens will be sent to torch.ops.trtllm.fused_moe at the same time. If the number of tokens exceeds moe_max_num_tokens, the input tensors will be split into chunks and a for loop will be used."
-    )
-
-    moe_load_balancer: Optional[Union[object, str]] = Field(
-        default=None,
-        description="Configuration for MoE load balancing.",
-        json_schema_extra={
-            "type":
-            "Union[tensorrt_llm._torch.model_config.MoeLoadBalancerConfig, str, None]"
-        })
+    moe_config: MoeConfig = Field(default_factory=MoeConfig,
+                                  description="MoE config.")
 
     attn_backend: str = Field(default='TRTLLM',
                               description="Attention backend to use.")
 
-    moe_backend: str = Field(default='CUTLASS',
-                             description="MoE backend to use.")
-
     enable_mixed_sampler: bool = Field(
         default=False,
         description=
@@ -1889,25 +1899,6 @@ def extra_resource_managers(self) -> Dict[str, object]:
     def extra_resource_managers(self, value: Dict[str, object]) -> None:
         self._extra_resource_managers = value
 
-    @model_validator(mode="after")
-    def validate_moe_load_balancer(self):
-        from .._torch.model_config import MoeLoadBalancerConfig
-        if isinstance(self.moe_load_balancer, str):
-            if not os.path.exists(self.moe_load_balancer):
-                raise FileNotFoundError(
-                    f"MoE load balancer config file not found: {self.moe_load_balancer}"
-                )
-            try:
-                with open(self.moe_load_balancer) as f:
-                    moe_load_balancer_config = yaml.safe_load(f)
-                self.moe_load_balancer = MoeLoadBalancerConfig(
-                    **moe_load_balancer_config)
-            except Exception as e:
-                raise ValueError(
-                    f"Failed to load MoE load balancer config file: {self.moe_load_balancer}"
-                ) from e
-        return self
-
     @model_validator(mode="after")
     def validate_stream_interval(self):
         if self.stream_interval <= 0:
@@ -1917,17 +1908,17 @@ def validate_stream_interval(self):
 
     @staticmethod
     def _generate_cuda_graph_batch_sizes(max_batch_size: int,
-                                         padding_enabled: bool) -> List[int]:
+                                         enable_padding: bool) -> List[int]:
         """Generate a list of batch sizes for CUDA graphs.
 
         Args:
             max_batch_size: Maximum batch size to generate up to
-            padding_enabled: Whether padding is enabled, which affects the batch size distribution
+            enable_padding: Whether padding is enabled, which affects the batch size distribution
 
         Returns:
             List of batch sizes to create CUDA graphs for
         """
-        if padding_enabled:
+        if enable_padding:
             batch_sizes = [1, 2, 4] + [i * 8 for i in range(1, 17)]
         else:
             batch_sizes = list(range(1, 32)) + [32, 64, 128]
@@ -1947,6 +1938,25 @@ def _generate_cuda_graph_batch_sizes(max_batch_size: int,
 
         return batch_sizes
 
+    @model_validator(mode="after")
+    def validate_load_balancer(self) -> 'TorchLlmArgs':
+        from .._torch import MoeLoadBalancerConfig
+        if isinstance(self.moe_config.load_balancer, str):
+            if not os.path.exists(self.moe_config.load_balancer):
+                raise FileNotFoundError(
+                    f"MoE load balancer config file not found: {self.moe_config.load_balancer}"
+                )
+            try:
+                with open(self.moe_config.load_balancer) as f:
+                    moe_load_balancer_config = yaml.safe_load(f)
+                self.moe_config.load_balancer = MoeLoadBalancerConfig(
+                    **moe_load_balancer_config)
+            except Exception as e:
+                raise ValueError(
+                    f"Failed to load MoE load balancer config file: {self.load_balancer}"
+                ) from e
+        return self
+
     @model_validator(mode='after')
     def validate_cuda_graph_config(self) -> 'TorchLlmArgs':
         """Validate CUDA graph configuration.
@@ -1965,7 +1975,7 @@ def validate_cuda_graph_config(self) -> 'TorchLlmArgs':
             config.batch_sizes = sorted(config.batch_sizes)
             if config.max_batch_size != 0:
                 if config.batch_sizes != self._generate_cuda_graph_batch_sizes(
-                        config.max_batch_size, config.padding_enabled):
+                        config.max_batch_size, config.enable_padding):
                     raise ValueError(
                         "Please don't set both cuda_graph_config.batch_sizes "
                         "and cuda_graph_config.max_batch_size.\n"
@@ -1977,7 +1987,7 @@ def validate_cuda_graph_config(self) -> 'TorchLlmArgs':
         else:
             max_batch_size = config.max_batch_size or 128
             generated_sizes = self._generate_cuda_graph_batch_sizes(
-                max_batch_size, config.padding_enabled)
+                max_batch_size, config.enable_padding)
             config.batch_sizes = generated_sizes
             config.max_batch_size = max_batch_size
 
@@ -1996,14 +2006,14 @@ def get_pytorch_backend_config(self) -> "PyTorchConfig":
             cuda_graph_max_batch_size=self.cuda_graph_config.max_batch_size
             if self.cuda_graph_config else
             CudaGraphConfig.model_fields['max_batch_size'].default,
-            cuda_graph_padding_enabled=self.cuda_graph_config.padding_enabled
+            cuda_graph_padding_enabled=self.cuda_graph_config.enable_padding
             if self.cuda_graph_config else
-            CudaGraphConfig.model_fields['padding_enabled'].default,
+            CudaGraphConfig.model_fields['enable_padding'].default,
             disable_overlap_scheduler=self.disable_overlap_scheduler,
-            moe_max_num_tokens=self.moe_max_num_tokens,
-            moe_load_balancer=self.moe_load_balancer,
+            moe_max_num_tokens=self.moe_config.max_num_tokens,
+            moe_load_balancer=self.moe_config.load_balancer,
             attn_backend=self.attn_backend,
-            moe_backend=self.moe_backend,
+            moe_backend=self.moe_config.backend,
             enable_mixed_sampler=self.enable_mixed_sampler,
             enable_trtllm_sampler=self.enable_trtllm_sampler,
             kv_cache_dtype=self.kv_cache_dtype,
@@ -2046,6 +2056,7 @@ def update_llm_args_with_extra_dict(
         "enable_build_cache": BuildCacheConfig,
         "speculative_config": DecodingBaseConfig,
         "lora_config": LoraConfig,
+        "moe_config": MoeConfig,
     }
     for field_name, field_type in field_mapping.items():
         if field_name in llm_args_dict:
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index f0b8bd50b5c..eb4cadc985d 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -19,7 +19,7 @@
 from tensorrt_llm import LLM
 from tensorrt_llm._torch.pyexecutor.config import MoeLoadBalancerConfig
 from tensorrt_llm.llmapi import (CudaGraphConfig, EagleDecodingConfig,
-                                 KvCacheConfig, MTPDecodingConfig,
+                                 KvCacheConfig, MoeConfig, MTPDecodingConfig,
                                  NGramDecodingConfig, SamplingParams,
                                  TorchCompileConfig)
 from tensorrt_llm.models.modeling_utils import QuantConfig
@@ -97,7 +97,7 @@ def test_bfloat16(self, attn_backend, torch_compile):
             enable_fullgraph=True) if torch_compile else None
         pytorch_config = dict(
             torch_compile_config=torch_compile_config,
-            cuda_graph_config=CudaGraphConfig(padding_enabled=torch_compile,
+            cuda_graph_config=CudaGraphConfig(enable_padding=torch_compile,
                                               batch_sizes=[4]),
             attn_backend=attn_backend,
             disable_overlap_scheduler=torch_compile,
@@ -123,7 +123,7 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, attn_backend,
             enable_fullgraph=True) if torch_compile else None
         pytorch_config = dict(
             torch_compile_config=torch_compile_config,
-            cuda_graph_config=CudaGraphConfig(padding_enabled=torch_compile,
+            cuda_graph_config=CudaGraphConfig(enable_padding=torch_compile,
                                               batch_sizes=[4]),
             attn_backend=attn_backend,
             disable_overlap_scheduler=torch_compile,
@@ -147,7 +147,7 @@ def test_fp8(self, fp8kv, attn_backend, torch_compile):
             enable_fullgraph=True) if torch_compile else None
         pytorch_config = dict(
             torch_compile_config=torch_compile_config,
-            cuda_graph_config=CudaGraphConfig(padding_enabled=torch_compile,
+            cuda_graph_config=CudaGraphConfig(enable_padding=torch_compile,
                                               batch_sizes=[4]),
             attn_backend=attn_backend,
             disable_overlap_scheduler=torch_compile,
@@ -185,7 +185,7 @@ def test_fp8_4gpus(self, tp_size, pp_size, fp8kv, attn_backend,
             enable_fullgraph=True) if torch_compile else None
         pytorch_config = dict(
             torch_compile_config=torch_compile_config,
-            cuda_graph_config=CudaGraphConfig(padding_enabled=torch_compile,
+            cuda_graph_config=CudaGraphConfig(enable_padding=torch_compile,
                                               batch_sizes=[4]),
             attn_backend=attn_backend,
             disable_overlap_scheduler=torch_compile,
@@ -719,7 +719,7 @@ def test_cute_dsl_fp8_block_scales(
             disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph,
             torch_compile_config=torch_compile_config,
-            moe_backend="CUTEDSL",
+            moe_config=MoeConfig(backend="CUTEDSL"),
         )
 
         quant_config = QuantConfig()
@@ -759,7 +759,7 @@ def test_fp8_block_scales_cuda_graph_padding(self, mtp_nextn):
             disable_overlap_scheduler=False,
             cuda_graph_config=CudaGraphConfig(
                 max_batch_size=512,
-                padding_enabled=True,
+                enable_padding=True,
             ),
         )
         with LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
@@ -782,7 +782,7 @@ def test_fp8_block_scales_cuda_graph_padding_4gpus(self, mtp_nextn,
             mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
         pytorch_config = dict(
             disable_overlap_scheduler=False,
-            cuda_graph_config=CudaGraphConfig(padding_enabled=True),
+            cuda_graph_config=CudaGraphConfig(enable_padding=True),
         )
         quant_config = QuantConfig()
         quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
@@ -899,7 +899,7 @@ def test_cute_dsl_fp8_block_scales_4gpus(
             disable_overlap_scheduler=not overlap_scheduler,
             use_cuda_graph=cuda_graph,
             torch_compile_config=torch_compile_config,
-            moe_backend="CUTEDSL",
+            moe_config=MoeConfig(backend="CUTEDSL"),
         )
 
         quant_config = QuantConfig()
@@ -948,8 +948,9 @@ def test_fp8_block_scales_4gpus_static_eplb(self):
             initial_global_assignments=initial_global_assignments,
             layer_updates_per_iter=0)
         pytorch_backend_options = dict(cuda_graph_config=CudaGraphConfig(),
-                                       moe_backend="WIDEEP",
-                                       moe_load_balancer=eplb_config)
+                                       moe_config=MoeConfig(
+                                           backend="WIDEEP",
+                                           load_balancer=eplb_config))
         with LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
                  tensor_parallel_size=4,
                  moe_expert_parallel_size=4,
@@ -968,8 +969,8 @@ def test_bfloat16_4gpus_online_eplb(self, mtp_nextn):
         eplb_config = MoeLoadBalancerConfig(num_slots=num_slots,
                                             layer_updates_per_iter=2)
         pytorch_config = dict(cuda_graph_config=CudaGraphConfig(),
-                              moe_backend="WIDEEP",
-                              moe_load_balancer=eplb_config)
+                              moe_config=MoeConfig(backend="WIDEEP",
+                                                   load_balancer=eplb_config))
         mtp_config = None
         if mtp_nextn > 0:
             mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
@@ -992,8 +993,9 @@ def test_nvfp4_4gpus_online_eplb(self, fp8kv):
         eplb_config = MoeLoadBalancerConfig(num_slots=num_slots,
                                             layer_updates_per_iter=2)
         pytorch_backend_options = dict(cuda_graph_config=CudaGraphConfig(),
-                                       moe_backend="WIDEEP",
-                                       moe_load_balancer=eplb_config)
+                                       moe_config=MoeConfig(
+                                           backend="WIDEEP",
+                                           load_balancer=eplb_config))
         quant_config = QuantConfig()
         quant_config.quant_algo = QuantAlgo.NVFP4
         if fp8kv:
@@ -1035,8 +1037,7 @@ def test_nvfp4(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler,
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
             torch_compile_config=torch_compile_config,
-            moe_backend=moe_backend,
-        )
+            moe_config=MoeConfig(backend=moe_backend))
         mtp_config = None
         if mtp_nextn > 0:
             mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
@@ -1095,7 +1096,7 @@ def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
             torch_compile_config=torch_compile_config,
-            moe_backend=moe_backend,
+            moe_config=MoeConfig(backend=moe_backend),
         )
 
         mtp_config = None
@@ -1331,7 +1332,7 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
-            moe_backend=moe_backend)
+            moe_config=MoeConfig(backend=moe_backend))
 
         quant_config = QuantConfig()
         quant_config.quant_algo = QuantAlgo.NVFP4
@@ -1726,7 +1727,7 @@ def test_nvfp4(
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
-            moe_backend=moe_backend,
+            moe_config=MoeConfig(backend=moe_backend),
         )
 
         with LLM(
@@ -1808,7 +1809,7 @@ def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
-            moe_backend=moe_backend)
+            moe_config=MoeConfig(backend=moe_backend))
 
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
         with LLM(
@@ -1854,7 +1855,7 @@ class TestKanana_Instruct(LlmapiAccuracyTestHarness):
     def test_auto_dtype(self):
         "RCCA: https://nvbugspro.nvidia.com/bug/5310520"
         pytorch_config = dict(cuda_graph_config=CudaGraphConfig(
-            padding_enabled=True, max_batch_size=384))
+            enable_padding=True, max_batch_size=384))
         with LLM(self.MODEL_PATH, **pytorch_config,
                  enable_attention_dp=True) as llm:
             task = MMLU(self.MODEL_NAME)
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml
index 6135aefa0a7..1171fb4f102 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml
@@ -17,7 +17,7 @@ generation_servers:
   pipeline_parallel_size: 1
   enable_attention_dp: true
   cuda_graph_config:
-    padding_enabled: False
+    enable_padding: False
   disable_overlap_scheduler: False
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml
index e4880434eb0..18acc70f9ac 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml
@@ -15,7 +15,7 @@ generation_servers:
   tensor_parallel_size: 2
   pipeline_parallel_size: 1
   cuda_graph_config:
-    padding_enabled: False
+    enable_padding: False
   disable_overlap_scheduler: False
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml
index 8f1ff654b38..7009df9fd0f 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml
@@ -28,7 +28,7 @@ generation_servers:
     free_gpu_memory_fraction: 0.2
     enable_partial_reuse: False
   cuda_graph_config:
-    padding_enabled: True
+    enable_padding: True
     batch_sizes: [1,4,8,16,24,32]
   disable_overlap_scheduler: True
   urls:
diff --git a/tests/integration/defs/perf/pytorch_model_config.py b/tests/integration/defs/perf/pytorch_model_config.py
index ea256eee7bb..d2ef5f8b536 100644
--- a/tests/integration/defs/perf/pytorch_model_config.py
+++ b/tests/integration/defs/perf/pytorch_model_config.py
@@ -30,7 +30,7 @@ def get_model_yaml_config(model_label: str,
     base_config = {
         'print_iter_log': True,
         'cuda_graph_config': {
-            'padding_enabled': True,
+            'enable_padding': True,
         },
     }
     if 'kv_cache_dtype' in model_label:
@@ -66,7 +66,7 @@ def get_model_yaml_config(model_label: str,
             'config': {
                 'enable_attention_dp': True,
                 'cuda_graph_config': {
-                    'padding_enabled': True,
+                    'enable_padding': True,
                     'batch_sizes': [1, 2, 4, 8, 16, 32, 64, 128, 256, 384]
                 }
             }
@@ -89,7 +89,7 @@ def get_model_yaml_config(model_label: str,
             'config': {
                 'print_iter_log': True,
                 'cuda_graph_config': {
-                    'padding_enabled': True,
+                    'enable_padding': True,
                     'batch_sizes': [1, 512, 1024, 2048]
                 }
             }
diff --git a/tests/integration/defs/stress_test/stress_test.py b/tests/integration/defs/stress_test/stress_test.py
index bfa6abd0177..f0f85fe51e3 100644
--- a/tests/integration/defs/stress_test/stress_test.py
+++ b/tests/integration/defs/stress_test/stress_test.py
@@ -519,7 +519,7 @@ def stress_test(config,
         if config.backend == "pytorch":
             extra_llm_options.update({
                 "cuda_graph_config": {
-                    "padding_enabled": True,
+                    "enable_padding": True,
                     "batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128, 256, 384],
                 },
                 "print_iter_log": True,
diff --git a/tests/unittest/_torch/modeling/test_modeling_deepseek.py b/tests/unittest/_torch/modeling/test_modeling_deepseek.py
index 660e09393f5..e5cf9680bbf 100644
--- a/tests/unittest/_torch/modeling/test_modeling_deepseek.py
+++ b/tests/unittest/_torch/modeling/test_modeling_deepseek.py
@@ -8,7 +8,7 @@
 from utils.util import getSMVersion
 
 from tensorrt_llm import LLM, SamplingParams
-from tensorrt_llm.llmapi import KvCacheConfig, MTPDecodingConfig
+from tensorrt_llm.llmapi import KvCacheConfig, MoeConfig, MTPDecodingConfig
 from tensorrt_llm.llmapi.utils import get_total_gpu_memory
 
 
@@ -71,7 +71,7 @@ def test_deepseek_trtllmgen(model_name):
         kv_cache_dtype="auto",
         attn_backend="TRTLLM",
         load_format="dummy",
-        moe_backend="TRTLLM",
+        moe_config=MoeConfig(backend="TRTLLM"),
     )
 
     model_dir = str(llm_models_root() / Path(f"DeepSeek-R1/{model_name}"))
diff --git a/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py b/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py
index eccdaaec988..5d2a8b71374 100644
--- a/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py
+++ b/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py
@@ -8,7 +8,7 @@
 from utils.util import getSMVersion
 
 from tensorrt_llm import LLM, SamplingParams
-from tensorrt_llm.llmapi import KvCacheConfig
+from tensorrt_llm.llmapi import KvCacheConfig, MoeConfig
 from tensorrt_llm.llmapi.utils import get_total_gpu_memory
 
 
@@ -65,9 +65,8 @@ def test_deepseek_streaming(model_name, backend, quant, tp_size):
         disable_overlap_scheduler=True,
         kv_cache_dtype="auto",
         attn_backend=backend,
-        moe_max_num_tokens=moe_max_num_tokens,
     )
-
+    moe_config = MoeConfig(max_num_tokens=moe_max_num_tokens)
     model_dir = str(llm_models_root() / model_name / model_path[quant])
 
     assert Path(model_dir).exists()
@@ -76,6 +75,7 @@ def test_deepseek_streaming(model_name, backend, quant, tp_size):
               tensor_parallel_size=tp_size,
               enable_chunked_prefill=False,
               **pytorch_config,
+              moe_config=moe_config,
               moe_expert_parallel_size=-1,
               moe_tensor_parallel_size=-1,
               enable_attention_dp=enable_attention_dp,
diff --git a/tests/unittest/_torch/test_pytorch_model_engine.py b/tests/unittest/_torch/test_pytorch_model_engine.py
index 5a7c43bb5e7..30c05a67aa0 100644
--- a/tests/unittest/_torch/test_pytorch_model_engine.py
+++ b/tests/unittest/_torch/test_pytorch_model_engine.py
@@ -307,8 +307,7 @@ def test_cuda_graph_enable(self):
             "CUDA graphs should be disabled when cuda_graph_config=None")
 
         # Test 4: Custom CudaGraphConfig with specific settings
-        custom_config = CudaGraphConfig(max_batch_size=256,
-                                        padding_enabled=True)
+        custom_config = CudaGraphConfig(max_batch_size=256, enable_padding=True)
         llm_args_custom = LlmArgs.from_kwargs(model="dummy_model",
                                               cuda_graph_config=custom_config)
         pytorch_config_custom = llm_args_custom.get_pytorch_backend_config()
@@ -317,7 +316,7 @@ def test_cuda_graph_enable(self):
         self.assertEqual(pytorch_config_custom.cuda_graph_max_batch_size, 256,
                          "Custom max_batch_size should be respected")
         self.assertTrue(pytorch_config_custom.cuda_graph_padding_enabled,
-                        "Custom padding_enabled should be respected")
+                        "Custom enable_padding should be respected")
 
 
 if __name__ == "__main__":
diff --git a/tests/unittest/api_stability/references/llm.yaml b/tests/unittest/api_stability/references/llm.yaml
index e0d3bf5216c..132bdee5804 100644
--- a/tests/unittest/api_stability/references/llm.yaml
+++ b/tests/unittest/api_stability/references/llm.yaml
@@ -69,18 +69,12 @@ methods:
       disable_overlap_scheduler:
         annotation: bool
         default: False
-      moe_max_num_tokens:
-        annotation: Optional[int]
-        default: null
-      moe_load_balancer:
-        annotation: Union[tensorrt_llm._torch.MoeLoadBalancerConfig, str, None]
+      moe_config:
+        annotation: tensorrt_llm.llmapi.llm_args.MoeConfig
         default: null
       attn_backend:
         annotation: str
         default: TRTLLM
-      moe_backend:
-        annotation: str
-        default: CUTLASS
       enable_mixed_sampler:
         annotation: bool
         default: False
diff --git a/tests/unittest/llmapi/test_llm_args.py b/tests/unittest/llmapi/test_llm_args.py
index b2eb9e8d8cd..0c2aaf20a13 100644
--- a/tests/unittest/llmapi/test_llm_args.py
+++ b/tests/unittest/llmapi/test_llm_args.py
@@ -272,7 +272,7 @@ def test_cuda_graph_batch_sizes_case_0_1(self):
             cuda_graph_config=CudaGraphConfig(
                 batch_sizes=CudaGraphConfig._generate_cuda_graph_batch_sizes(
                     128, True),
-                padding_enabled=True,
+                enable_padding=True,
                 max_batch_size=128))
         assert args.cuda_graph_config.batch_sizes == CudaGraphConfig._generate_cuda_graph_batch_sizes(
             128, True)
@@ -282,14 +282,14 @@ def test_cuda_graph_batch_sizes_case_1(self):
         # set cuda_graph_batch_sizes only
         args = TorchLlmArgs(model=llama_model_path,
                             cuda_graph_config=CudaGraphConfig(
-                                batch_sizes=[1, 2, 4], padding_enabled=True))
+                                batch_sizes=[1, 2, 4], enable_padding=True))
         assert args.cuda_graph_config.batch_sizes == [1, 2, 4]
 
     def test_cuda_graph_batch_sizes_case_2(self):
         # set cuda_graph_config.max_batch_size only
         args = TorchLlmArgs(model=llama_model_path,
                             cuda_graph_config=CudaGraphConfig(
-                                max_batch_size=128, padding_enabled=True))
+                                max_batch_size=128, enable_padding=True))
         assert args.cuda_graph_config.batch_sizes == CudaGraphConfig._generate_cuda_graph_batch_sizes(
             128, True)
         assert args.cuda_graph_config.max_batch_size == 128

From 2504aa552efdd85dc486cf23e78611b01f1dc8db Mon Sep 17 00:00:00 2001
From: ruodil <200874449+ruodil@users.noreply.github.com>
Date: Tue, 15 Jul 2025 15:53:15 +0800
Subject: [PATCH 12/88] test: add recursive updating pytorch config and change
 MOE backend format in perf test (#6046)

Signed-off-by: ruodil <200874449+ruodil@users.noreply.github.com>
---
 .../integration/defs/perf/pytorch_model_config.py | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/tests/integration/defs/perf/pytorch_model_config.py b/tests/integration/defs/perf/pytorch_model_config.py
index d2ef5f8b536..40b2b9f9682 100644
--- a/tests/integration/defs/perf/pytorch_model_config.py
+++ b/tests/integration/defs/perf/pytorch_model_config.py
@@ -18,6 +18,15 @@
 """
 
 
+def recursive_update(d, u):
+    for k, v in u.items():
+        if isinstance(v, dict) and isinstance(d.get(k), dict):
+            recursive_update(d[k], v)
+        else:
+            d[k] = v
+    return d
+
+
 def get_model_yaml_config(model_label: str,
                           lora_dirs: list[str] = None) -> dict:
     """
@@ -130,7 +139,9 @@ def get_model_yaml_config(model_label: str,
             ],
             'config': {
                 'enable_attention_dp': False,
-                'moe_backend': 'TRTLLM'
+                'moe_config': {
+                    'backend': 'TRTLLM'
+                }
             }
         }
     ]
@@ -142,7 +153,7 @@ def get_model_yaml_config(model_label: str,
             patterns = [patterns]
         for pattern in patterns:
             if pattern in model_label.lower():
-                base_config.update(pattern_config['config'])
+                recursive_update(base_config, pattern_config['config'])
                 break  # Stop checking other patterns for this config once we find a match
 
     # lora-specific change for pytorch

From 2a147c4d01d0b73ee73c193c0d709f8d4c29f462 Mon Sep 17 00:00:00 2001
From: ruodil <200874449+ruodil@users.noreply.github.com>
Date: Tue, 15 Jul 2025 15:53:59 +0800
Subject: [PATCH 13/88] test: add llama_v3.3_70b_cases in perf test (#6035)

Signed-off-by: ruodil <200874449+ruodil@users.noreply.github.com>
---
 .../defs/perf/pytorch_model_config.py         | 20 +++++++++++++++++++
 tests/integration/defs/perf/test_perf.py      |  1 +
 .../qa/trt_llm_release_perf_sanity_test.yml   |  2 ++
 .../qa/trt_llm_release_perf_test.yml          |  7 +++++++
 4 files changed, 30 insertions(+)

diff --git a/tests/integration/defs/perf/pytorch_model_config.py b/tests/integration/defs/perf/pytorch_model_config.py
index 40b2b9f9682..4c0ef184093 100644
--- a/tests/integration/defs/perf/pytorch_model_config.py
+++ b/tests/integration/defs/perf/pytorch_model_config.py
@@ -143,6 +143,26 @@ def get_model_yaml_config(model_label: str,
                     'backend': 'TRTLLM'
                 }
             }
+        },
+        # Llama-v3.3 models with fp8 quantization
+        {
+            'patterns': [
+                'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:500,2000-gpus:4',
+                'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:1000,1000-gpus:4',
+                'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:2000,500-gpus:4',
+                'llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-maxbs:512-maxnt:2048-input_output_len:128,128-gpus:4',
+                'llama_v3.3_70b_instruct_fp8-bench-pytorch-bfloat16-maxbs:512-maxnt:2048-input_output_len:512,32-gpus:4',
+            ],
+            'config': {
+                'use_cuda_graph':
+                True,
+                'cuda_graph_padding_enabled':
+                True,
+                'cuda_graph_batch_sizes': [
+                    1, 2, 4, 8, 16, 32, 64, 128, 256, 384, 512, 1024, 2048,
+                    4096, 8192
+                ]
+            }
         }
     ]
 
diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py
index eb8af3d593e..759ff9273f8 100644
--- a/tests/integration/defs/perf/test_perf.py
+++ b/tests/integration/defs/perf/test_perf.py
@@ -54,6 +54,7 @@
     "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8",
     "llama_v3.3_70b_instruct_fp4":
     "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4",
+    "llama_v3.3_70b_instruct": "llama-3.3-models/Llama-3.3-70B-Instruct",
     "llama_v3.1_405b_instruct_fp4":
     "modelopt-hf-model-hub/Llama-3.1-405B-Instruct-fp4",
     "llama_v3.1_70b_instruct": "llama-3.1-model/Meta-Llama-3.1-70B-Instruct",
diff --git a/tests/integration/test_lists/qa/trt_llm_release_perf_sanity_test.yml b/tests/integration/test_lists/qa/trt_llm_release_perf_sanity_test.yml
index f5013e1b5b1..e7369bac1cd 100644
--- a/tests/integration/test_lists/qa/trt_llm_release_perf_sanity_test.yml
+++ b/tests/integration/test_lists/qa/trt_llm_release_perf_sanity_test.yml
@@ -202,6 +202,8 @@ trt_llm_release_perf_sanity_test:
   - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:2000,200-reqs:10-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-bfloat16-maxbs:1-input_output_len:200,2000-reqs:10-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.1_70b-bench-pytorch-bfloat16-maxbs:1-input_output_len:200,2000-reqs:10-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b-bench-pytorch-bfloat16-input_output_len:500,2000-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b-bench-pytorch-bfloat16-input_output_len:2000,500-gpus:8]
   - perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:1-input_output_len:128,128-reqs:10-gpus:8]
 
 
diff --git a/tests/integration/test_lists/qa/trt_llm_release_perf_test.yml b/tests/integration/test_lists/qa/trt_llm_release_perf_test.yml
index 6c9f6bcb261..1b3b539fd3e 100644
--- a/tests/integration/test_lists/qa/trt_llm_release_perf_test.yml
+++ b/tests/integration/test_lists/qa/trt_llm_release_perf_test.yml
@@ -295,6 +295,11 @@ trt_llm_release_perf_test:
   - perf/test_perf.py::test_perf[qwen_14b_chat-cppmanager-ootb_except_mha-float16-input_output_len:128,128+512,32-gpus:4]
   - perf/test_perf.py::test_perf[starcoder_15.5b-cppmanager-exe-plugin_ifb-float16-maxbs:1-input_output_len:512,200-reqs:10-gpus:4]
   - perf/test_perf.py::test_perf[starcoder_15.5b-cppmanager-ootb_except_mha-float16-maxbs:1-input_output_len:512,200-reqs:10-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:500,2000-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:1000,1000-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:4]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-gpus:4]
 
 # FP8 specific tests
 - condition:
@@ -357,6 +362,8 @@ trt_llm_release_perf_test:
   - perf/test_perf.py::test_perf[llama_v3.1_70b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:2000,200-reqs:64-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-input_output_len:128,128-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct-bench-bfloat16-maxbs:16-maxnt:5000-input_output_len:5000,500-reqs:64-con:250-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b-bench-pytorch-bfloat16-input_output_len:500,2000-gpus:8]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b-bench-pytorch-bfloat16-input_output_len:2000,500-gpus:8]
   - perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:128,128-reqs:80-gpus:8]
   - perf/test_perf.py::test_perf[gpt_20b-bench-float16-maxbs:8-input_output_len:512,32-reqs:80-gpus:8]
 

From 9e871ca582e7867686398e3a9377f2e029944020 Mon Sep 17 00:00:00 2001
From: Yiteng Niu <6831097+niukuo@users.noreply.github.com>
Date: Tue, 15 Jul 2025 17:18:38 +0800
Subject: [PATCH 14/88] [infra] add more log on reuse-uploading (#6036)

Signed-off-by: Yiteng Niu <6831097+niukuo@users.noreply.github.com>
Co-authored-by: Yanchao Lu <yanchaol@nvidia.com>
---
 jenkins/Build.groovy | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/jenkins/Build.groovy b/jenkins/Build.groovy
index 51cd760425a..81193d2ddd5 100644
--- a/jenkins/Build.groovy
+++ b/jenkins/Build.groovy
@@ -306,18 +306,19 @@ def uploadArtifacts(artifacts, prefix = UPLOAD_PATH, retryTimes = 2, serverId =
     for (it in artifacts) {
         def uploadpath = it.key
         def filepath = it.value
-        echo "uploading ${filepath} as ${uploadpath}"
-        trtllm_utils.llmRetry(retryTimes, "uploadArtifacts", {
-            rtUpload (
-                serverId: serverId,
-                spec: """{
+        def spec = """{
                     "files": [
                         {
                         "pattern": "${filepath}",
                         "target": "${prefix}/${uploadpath}"
                         }
                     ]
-                }""",
+                }"""
+        echo "Uploading ${filepath} as ${uploadpath}. Spec: ${spec}"
+        trtllm_utils.llmRetry(retryTimes, "uploadArtifacts", {
+            rtUpload (
+                serverId: serverId,
+                spec: spec,
             )
         })
     }

From ab1c54709d42452bfe194517745094f36dbc751b Mon Sep 17 00:00:00 2001
From: Jaedeok Kim <110799725+jaedeok-nvidia@users.noreply.github.com>
Date: Tue, 15 Jul 2025 18:41:54 +0900
Subject: [PATCH 15/88] fix: adjust window sizes of VSWA at torch backend
 (#5880)

Signed-off-by: Jaedeok Kim <jaedeokk@nvidia.com>
---
 .../_torch/pyexecutor/resource_manager.py     | 132 ++++++++++++++++--
 tensorrt_llm/_utils.py                        |  16 +++
 .../unittest/_torch/test_resource_manager.py  | 116 ++++++++++++++-
 3 files changed, 251 insertions(+), 13 deletions(-)

diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py
index e52096727c6..ffa8ce4bdae 100644
--- a/tensorrt_llm/_torch/pyexecutor/resource_manager.py
+++ b/tensorrt_llm/_torch/pyexecutor/resource_manager.py
@@ -1,8 +1,9 @@
+import copy
 import enum
 import math
 from abc import ABC, abstractmethod
 from collections import OrderedDict, defaultdict
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Set, Tuple, Union
 
 import torch
 
@@ -11,7 +12,7 @@
 from tensorrt_llm.bindings.BuildInfo import ENABLE_MULTI_DEVICE
 from tensorrt_llm.sampling_params import SamplingParams
 
-from ..._utils import nvtx_range
+from ..._utils import binding_dtype_size, nvtx_range
 from ...logger import logger
 from ...mapping import Mapping
 from .llm_request import LlmRequest, LlmRequestState, SamplingConfig
@@ -437,14 +438,10 @@ def calculate_max_num_blocks(self,
         cache_size_per_token = kv_factor * sum(
             self.num_kv_heads_per_layer) * head_dim
 
-        if dtype == DataType.FP8:
-            kv_cache_dtype_bytes = 1
-        elif dtype in (DataType.HALF, DataType.BF16):
-            kv_cache_dtype_bytes = 2
-        elif dtype == DataType.FLOAT:
-            kv_cache_dtype_bytes = 4
-        else:
+        if dtype not in (DataType.FP8, DataType.HALF, DataType.BF16,
+                         DataType.FLOAT):
             raise ValueError(f'Cannot support {dtype} KV cache.')
+        kv_cache_dtype_bytes = binding_dtype_size(dtype)
 
         cache_size_bytes_per_token = cache_size_per_token * kv_cache_dtype_bytes
         free_mem, total_mem = torch.cuda.mem_get_info()
@@ -603,6 +600,102 @@ def _get_window_size_to_layers(self) -> dict[int, list[int]]:
             window_size_to_layers_map[window_size].append(local_layer_idx)
         return window_size_to_layers_map
 
+    @staticmethod
+    def adjust_window_sizes_for_vswa(
+        window_size_to_layers: Dict[int, List[int]],
+        kv_cache_config: KvCacheConfigCpp,
+        model_config: ModelConfig,
+        pool_memory_bytes: int,
+        kv_factor: int,
+        dtype: DataType,
+        is_cross_attention: bool = False,
+    ) -> Dict[int, List[int]]:
+
+        assert is_cross_attention is False, 'Cross attention is not supported'
+
+        max_tokens_from_config = kv_cache_config.max_tokens
+
+        def calculate_cache_size_per_token(layers: Set[int]) -> int:
+            # Same as BaseKVCacheManager::calculateCacheSizePerTokenForSingleWindowSize
+            total_kv_heads = sum(model_config.num_kv_heads_per_layer[i]
+                                 for i in layers)
+            return total_kv_heads * kv_factor * model_config.head_size
+
+        # Calculate the required memory bytes per sequence.
+        required_mem_bytes_per_seq = 0
+        for window_size in sorted(window_size_to_layers):
+            layers = window_size_to_layers[window_size]
+            cache_size_per_token = calculate_cache_size_per_token(layers)
+            cache_size_bytes_per_token = cache_size_per_token * binding_dtype_size(
+                dtype)
+            required_mem_bytes_per_seq += window_size * cache_size_bytes_per_token
+        logger.debug(
+            f'Required memory per sequence: {required_mem_bytes_per_seq} bytes')
+
+        if required_mem_bytes_per_seq < pool_memory_bytes:
+            # No need to adjust the window sizes.
+            return copy.deepcopy(window_size_to_layers)
+
+        logger.debug(
+            f'Adjusting the window sizes {list(window_size_to_layers)} to fit '
+            f'the memory {pool_memory_bytes} bytes.')
+        adjusted_window_size_to_layers = {}
+
+        remaining_mem_bytes = pool_memory_bytes
+        remaining_layers = set(i for layers in window_size_to_layers.values()
+                               for i in layers)
+
+        accum_max_tokens = 0
+        prev_window_size = 0
+
+        for window_size in sorted(window_size_to_layers):
+            layers = window_size_to_layers[window_size]
+            if remaining_mem_bytes > 0 and remaining_layers:
+                # Calculate cache size per token for remaining layers only
+                cache_size_per_token = calculate_cache_size_per_token(
+                    remaining_layers)
+                cache_size_bytes_per_token = cache_size_per_token * binding_dtype_size(
+                    dtype)
+                logger.debug(
+                    f'Cache size per token for {len(remaining_layers)} layers: '
+                    f'{cache_size_bytes_per_token} bytes')
+                # Calculate max tokens that can fit in this window with remaining memory.
+                max_tokens_in_window = min(
+                    remaining_mem_bytes // cache_size_bytes_per_token,
+                    window_size - prev_window_size)
+                remaining_mem_bytes -= max_tokens_in_window * cache_size_bytes_per_token
+                accum_max_tokens += max_tokens_in_window
+                logger.debug(f'Remaining memory: {remaining_mem_bytes} bytes')
+                logger.debug(
+                    f'Max token of window {window_size}: {accum_max_tokens}')
+
+                if accum_max_tokens < window_size:
+                    logger.debug(
+                        f'Max tokens ({accum_max_tokens}) cannot fill the current window ({window_size}). '
+                        f'The larger windows will have the same max tokens.')
+                    remaining_mem_bytes = 0
+
+                # Clamp the sequence length if provided explicitly.
+                if max_tokens_from_config is not None:
+                    accum_max_tokens = min(max_tokens_from_config,
+                                           accum_max_tokens)
+                    # If max tokens from config is reached, stop allocating
+                    # more memory. Since the maximum number of tokens is
+                    # already reached, for the remaining windows maxTokens
+                    # will be set by the current value of accumMaxTokens.
+                    if accum_max_tokens == max_tokens_from_config:
+                        remaining_mem_bytes = 0
+
+            if accum_max_tokens not in adjusted_window_size_to_layers:
+                adjusted_window_size_to_layers[accum_max_tokens] = layers.copy()
+            else:
+                adjusted_window_size_to_layers[accum_max_tokens].extend(layers)
+
+            remaining_layers -= set(layers)
+            prev_window_size = window_size
+
+        return adjusted_window_size_to_layers
+
     def calculate_max_num_blocks_from_cpp(
             self,
             kv_cache_config: KvCacheConfigCpp,
@@ -622,6 +715,9 @@ def calculate_max_num_blocks_from_cpp(
             A dict of (max_attention_window, (blocks_in_primary_pool, blocks_in_secondary_pool)).
         """
 
+        # VSWA on Torch backend has not supported the cross attention.
+        is_cross_attention = False
+
         # Construct WorldConfig from self.mapping
         world_config_cpp = WorldConfig(
             tensor_parallelism=self.mapping.tp_size,
@@ -636,12 +732,26 @@ def calculate_max_num_blocks_from_cpp(
         primary_pool_memory_bytes = free_mem
         secondary_pool_memory_bytes = 0
         logger.debug(
-            f"primary_pool_memory_bytes is set to {primary_pool_memory_bytes/1024**3}GB, \nsecondary_pool_memory_bytes is set to {secondary_pool_memory_bytes/1024**3}GB"
+            f"primary_pool_memory_bytes is set to {primary_pool_memory_bytes/1024**3}GB, \n"
+            f"secondary_pool_memory_bytes is set to {secondary_pool_memory_bytes/1024**3}GB"
+        )
+
+        # Adjust the window sizes to fit the memory if even a single sequence
+        # cannot fit in the memory.
+        window_size_to_layers = self.adjust_window_sizes_for_vswa(
+            window_size_to_layers=window_size_to_layers,
+            model_config=model_config,
+            kv_cache_config=kv_cache_config,
+            pool_memory_bytes=primary_pool_memory_bytes,
+            kv_factor=self.kv_factor,
+            dtype=self.dtype,
+            is_cross_attention=is_cross_attention,
         )
 
         blocks_per_window = KVCacheManagerCpp.calculate_max_num_blocks(
             config=kv_cache_config,
-            is_cross_attention=False,  #TODO: support cross attention
+            # TODO: support cross attention
+            is_cross_attention=is_cross_attention,
             dtype=self.dtype,
             model_config=model_config,
             world_config=world_config_cpp,
diff --git a/tensorrt_llm/_utils.py b/tensorrt_llm/_utils.py
index 9c3197e7c93..87144cb85c4 100644
--- a/tensorrt_llm/_utils.py
+++ b/tensorrt_llm/_utils.py
@@ -180,6 +180,22 @@ def str_dtype_to_torch(dtype):
     fp8=DataType.FP8,
 )
 
+_binding_dtype_size = {
+    DataType.INT64: 8,
+    DataType.FLOAT: 4,
+    DataType.INT32: 4,
+    DataType.BF16: 2,
+    DataType.HALF: 2,
+    DataType.BOOL: 1,
+    DataType.FP8: 1,
+    DataType.INT8: 1,
+    DataType.UINT8: 1,
+}
+
+
+def binding_dtype_size(dtype: DataType):
+    return _binding_dtype_size[dtype]
+
 
 def str_dtype_to_binding(dtype):
     ret = _str_to_binding_dtype_dict.get(dtype)
diff --git a/tests/unittest/_torch/test_resource_manager.py b/tests/unittest/_torch/test_resource_manager.py
index 0632834e4e6..da1dae84ba1 100644
--- a/tests/unittest/_torch/test_resource_manager.py
+++ b/tests/unittest/_torch/test_resource_manager.py
@@ -10,13 +10,15 @@
 
 import tensorrt_llm
 import tensorrt_llm.bindings
-from tensorrt_llm._torch.pyexecutor.resource_manager import (PeftCacheConfig,
+from tensorrt_llm._torch.pyexecutor.resource_manager import (KVCacheManager,
+                                                             PeftCacheConfig,
                                                              PeftCacheManager)
 from tensorrt_llm.bindings import ModelConfig as ModelConfigCpp
 from tensorrt_llm.bindings import executor as tllm
 from tensorrt_llm.bindings.internal.batch_manager import \
     PeftTaskNotCachedException
 
+DataType = tensorrt_llm.bindings.DataType
 LoraModule = tensorrt_llm.bindings.LoraModule
 LoraModuleType = tensorrt_llm.bindings.LoraModuleType
 current_dir = pathlib.Path(__file__).parent.resolve()
@@ -66,7 +68,15 @@ def __init__(self):
             self.num_rnn_layers = 0
             self.num_attention_heads = 1
             self.hidden_size = 16
-            self.data_type = tensorrt_llm.bindings.DataType.HALF
+            self.data_type = DataType.HALF
+
+        @property
+        def num_kv_heads_per_layer(self):
+            return [self.num_attention_heads] * self.num_attention_layers
+
+        @property
+        def head_size(self):
+            return self.hidden_size // self.num_attention_heads
 
     class MockPeftCacheManagerConfig:
         """
@@ -416,3 +426,105 @@ def test_put_get(self):
             self.assertEqual(entry.layer_id, expected_values[i][5])
             self.assertEqual(entry.adapter_size, expected_values[i][6])
             self.assertEqual(entry.num_slots, expected_values[i][7])
+
+    def test_adjust_window_sizes_for_vswa(self):
+        window_size_to_layers = {
+            100: [0, 1, 2, 3],
+            200: [4, 5, 6],
+            7000: [7, 8],
+        }
+
+        model_config = self.MockModelConfig()
+        model_config.num_attention_heads = 2
+        model_config.hidden_size = 2
+        model_config.data_type = DataType.HALF
+
+        total_layers = [
+            i for layers in window_size_to_layers.values() for i in layers
+        ]
+
+        model_config.num_hidden_layers = len(total_layers)
+        model_config.num_attention_layers = len(total_layers)
+
+        kv_factor = 2
+        cache_bytes_per_token_per_layer = 8
+
+        # Define test cases:
+        #    (memory_bytes, expected_window_sizes, max_tokens, description)
+        #    If max_tokens is None, then it will use the default value of KvCacheConfig.
+        test_cases = [
+            (
+                # Case 1: Limited memory - windows get clamped
+                cache_bytes_per_token_per_layer * (100 * 9 + 30 * 5) + 4,
+                {
+                    100: [0, 1, 2, 3],
+                    130: [4, 5, 6, 7, 8],
+                },
+                None,
+                "limited_memory_clamped_windows"),
+            (
+                # Case 2: Less limited memory - the largest window get clamped
+                cache_bytes_per_token_per_layer *
+                (100 * 9 + 100 * 5 + 817 * 2) + 4,
+                {
+                    100: [0, 1, 2, 3],
+                    200: [4, 5, 6],
+                    1017: [7, 8],
+                },
+                None,
+                "less_limited_memory_clamped_windows"),
+            (
+                # Case 3: Sufficient memory - no clamping needed
+                cache_bytes_per_token_per_layer *
+                (100 * 4 + 200 * 3 + 7000 * 2) + 9402,
+                {
+                    100: [0, 1, 2, 3],
+                    200: [4, 5, 6],
+                    7000: [7, 8],
+                },
+                None,
+                "sufficient_memory_no_clamping"),
+            (
+                # Case 4: Very limited memory - all windows get small values
+                cache_bytes_per_token_per_layer * (51 * 9) + 1,
+                {
+                    51: [0, 1, 2, 3, 4, 5, 6, 7, 8],
+                },
+                None,
+                "very_limited_memory_all_clamped"),
+            (
+                # Case 5: Less limited memory but max_tokens is given.
+                # memory is enough for 1017 tokens, it will be clamped by max_tokens=134.
+                cache_bytes_per_token_per_layer *
+                (100 * 9 + 100 * 5 + 817 * 2) + 4,
+                {
+                    100: [0, 1, 2, 3],
+                    134: [4, 5, 6, 7, 8],
+                },
+                134,
+                "less_limited_memory_but_clamped_by_max_tokens"),
+        ]
+
+        for memory_bytes, expected_window_sizes, max_tokens, description in test_cases:
+            with self.subTest(case=description, memory_bytes=memory_bytes):
+                kv_cache_config = tllm.KvCacheConfig(max_tokens=max_tokens)
+                adjusted = KVCacheManager.adjust_window_sizes_for_vswa(
+                    window_size_to_layers=window_size_to_layers,
+                    model_config=model_config,
+                    kv_cache_config=kv_cache_config,
+                    pool_memory_bytes=memory_bytes,
+                    kv_factor=kv_factor,
+                    dtype=model_config.data_type,
+                    is_cross_attention=False,
+                )
+
+                self.assertEqual(
+                    adjusted, expected_window_sizes,
+                    f"Test case '{description}' failed.\n"
+                    f"Memory bytes: {memory_bytes}\n"
+                    f"Actual: {adjusted}\n"
+                    f"Expected: {expected_window_sizes}")
+
+
+if __name__ == "__main__":
+    unittest.main()

From 9ebc3ab9c421c64e951daab535c27f4e7d99ce68 Mon Sep 17 00:00:00 2001
From: MinaHuai <121143971+MinaHuai@users.noreply.github.com>
Date: Tue, 15 Jul 2025 22:01:35 +0800
Subject: [PATCH 16/88] [nvbugs/5385972][nvbugs/5387423][Fix] Minor fix for
 llava_next/llava_onevision (#5998)

Signed-off-by: Mina Huai <121143971+MinaHuai@users.noreply.github.com>
---
 tensorrt_llm/runtime/multimodal_model_runner.py | 4 ++--
 tensorrt_llm/tools/multimodal_builder.py        | 4 ++--
 tests/integration/test_lists/waives.txt         | 4 ----
 3 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/tensorrt_llm/runtime/multimodal_model_runner.py b/tensorrt_llm/runtime/multimodal_model_runner.py
index 9d6be2bddad..bb3a5480fcb 100644
--- a/tensorrt_llm/runtime/multimodal_model_runner.py
+++ b/tensorrt_llm/runtime/multimodal_model_runner.py
@@ -2647,7 +2647,7 @@ def setup_inputs(self, input_text, raw_image, raw_audio=None):
                 )
                 image = None
         elif self.model_type in ['llava_onevision']:
-            pre_prompt = "<|im_start|>user "
+            pre_prompt = "<|im_start|>user " + "<video>" if self.args.video_path is not None else "<image>"
             if input_text is None:
                 input_text = "Question: which city is this? Answer:" if self.args.video_path is None else "Why is this video funny?"
             post_prompt = f"\n{input_text}<|im_end|><|im_start|>assistant\n"
@@ -2658,7 +2658,7 @@ def setup_inputs(self, input_text, raw_image, raw_audio=None):
                                        text=prompt,
                                        return_tensors="pt")
             else:
-                image = self.processor(videos=raw_image,
+                image = self.processor(videos=list(raw_image),
                                        text=prompt,
                                        return_tensors="pt")
 
diff --git a/tensorrt_llm/tools/multimodal_builder.py b/tensorrt_llm/tools/multimodal_builder.py
index d4a8e8287b5..c8a10fe1b6d 100644
--- a/tensorrt_llm/tools/multimodal_builder.py
+++ b/tensorrt_llm/tools/multimodal_builder.py
@@ -596,12 +596,12 @@ def forward(self, pixel_values):
         args.output_dir,
         args.max_batch_size)
     if args.model_type == "llava_next":
-        image_newline = model.image_newline.data
+        image_newline = model.model.image_newline.data
         tensor_img_newline = {"image_newline": image_newline}
         save_file(tensor_img_newline,
                   os.path.join(args.output_dir, "image_newlines.safetensors"))
     if args.model_type == "llava_onevision":
-        image_newline = model.image_newline.data
+        image_newline = model.model.image_newline.data
         tensor_img_newline = {"image_newline": image_newline}
         save_file(tensor_img_newline,
                   os.path.join(args.output_dir, "image_newlines.safetensors"))
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 291e549c648..0039dca742f 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -432,15 +432,11 @@ examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-fl
 accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8 SKIP (https://nvbugs/5380101)
 test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-405B-FP8-llama-3.1-model/Llama-3.1-405B-Instruct-FP8] SKIP (https://nvbugs/5380570)
 test_e2e.py::test_ptp_quickstart_advanced_8gpus[Nemotron-Ultra-253B-nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1] SKIP (https://nvbugs/5380570)
-triton_server/test_triton.py::test_llava_onevision[llava_onevision] SKIP (https://nvbugs/5385972)
 examples/test_multimodal.py::test_llm_multimodal_general[Qwen2-VL-7B-Instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:4] SKIP (https://nvbugs/5385981)
-examples/test_multimodal.py::test_llm_multimodal_general[llava-v1.6-mistral-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5385972)
 examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-cnn_dailymail-Qwen2-VL-7B-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False] SKIP (https://nvbugs/5385987)
 examples/test_multimodal.py::test_llm_multimodal_general[Phi-4-multimodal-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5385992)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] SKIP (https://nvbugs/5377914)
 test_e2e.py::test_ptp_scaffolding[DeepSeek-R1-Distill-Qwen-7B-DeepSeek-R1/DeepSeek-R1-Distill-Qwen-7B] SKIP (https://nvbugs/5387375)
-examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-video-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387423)
-examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387423)
 examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387422)
 examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387424)
 test_e2e.py::test_ptp_quickstart SKIP (https://nvbugs/5387762)

From 4a26bd65008c40cd5071638d532eb3f909f1278b Mon Sep 17 00:00:00 2001
From: Tailing Yuan <yuantailing@gmail.com>
Date: Tue, 15 Jul 2025 22:14:01 +0800
Subject: [PATCH 17/88] Fix: pad DeepEP fp4 recv tensors if empty (#6048)

Signed-off-by: Tailing Yuan <yuantailing@gmail.com>
---
 .../modules/fused_moe/fused_moe_wide_ep.py    | 56 +++++++++++++------
 1 file changed, 40 insertions(+), 16 deletions(-)

diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py
index 9290aae3029..33682b1146e 100755
--- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py
@@ -428,6 +428,8 @@ def forward_chunk(
                 if not self.use_postquant_alltoall:
                     x, recv_topk_idx, token_final_scales, num_recv_tokens_per_expert_list, deep_ep_handle = \
                         self.deep_ep_buffer.dispatch(x, token_selected_slots.to(torch.int64), token_final_scales, self.num_slots)
+                    padded, x, _, recv_topk_idx, token_final_scales = self.pad_empty_recv_tensors(
+                        x, None, recv_topk_idx, token_final_scales)
             elif self.alltoall_method_type == AlltoallMethodType.DeepEPLowLatency:
                 if not self.use_postquant_alltoall:
                     deep_ep_topk_idx = token_selected_slots.to(torch.int64)
@@ -559,6 +561,8 @@ def forward_chunk(
                     x_sf = x_sf.view(torch.float32)
                 (x, x_sf), recv_topk_idx, token_final_scales, num_recv_tokens_per_expert_list, deep_ep_handle = \
                     self.deep_ep_buffer.dispatch((x, x_sf), token_selected_slots.to(torch.int64), token_final_scales, self.num_slots)
+                padded, x, x_sf, recv_topk_idx, token_final_scales = self.pad_empty_recv_tensors(
+                    x, x_sf, recv_topk_idx, token_final_scales)
                 if x_sf is not None:
                     x_sf = x_sf.view(x_sf_dtype)
                     if self.has_nvfp4:
@@ -644,20 +648,6 @@ def forward_chunk(
                 mask = token_selected_slots == -1
                 token_selected_slots += self.expert_size_per_partition * self.mapping.moe_ep_rank
                 token_selected_slots[mask] = self.num_slots
-                num_recv_token_is_zero = x.shape[0] == 0
-                if x.shape[0] == 0:
-                    x = torch.zeros((1, x.shape[1]),
-                                    dtype=x.dtype,
-                                    device=x.device)
-                    token_selected_slots = torch.full(
-                        (1, token_selected_slots.shape[1]),
-                        self.num_slots,
-                        dtype=token_selected_slots.dtype,
-                        device=token_selected_slots.device)
-                    token_final_scales = torch.ones(
-                        (1, token_final_scales.shape[1]),
-                        dtype=token_final_scales.dtype,
-                        device=token_final_scales.device)
 
         final_hidden_states = torch.ops.trtllm.fused_moe(
             x,
@@ -698,8 +688,8 @@ def forward_chunk(
                 final_hidden_states = self.alltoall_combine(
                     final_hidden_states, alltoall_info, token_count)
             elif self.alltoall_method_type == AlltoallMethodType.DeepEP:
-                if num_recv_token_is_zero:
-                    final_hidden_states = final_hidden_states[:0]
+                final_hidden_states = self.unpad_tensors(
+                    padded, final_hidden_states)
                 final_hidden_states = self.deep_ep_buffer.combine(
                     final_hidden_states, deep_ep_handle)
             elif self.alltoall_method_type == AlltoallMethodType.DeepEPLowLatency:
@@ -972,6 +962,40 @@ def alltoall_combine(self, final_hidden_states: torch.Tensor,
 
         return final_hidden_states
 
+    def pad_empty_recv_tensors(
+        self, x: torch.Tensor, x_sf: Optional[torch.Tensor],
+        recv_topk_idx: torch.Tensor, token_final_scales: torch.Tensor
+    ) -> Tuple[bool, torch.Tensor, Optional[torch.Tensor], torch.Tensor,
+               torch.Tensor]:
+        """
+        Pad the output of DeepEP `dispatch` if the output length is zero.
+        We can remove the adapter if both `fused_moe` op and `swizzle_sf`
+        accept zero-length inputs.
+        """
+        if x.shape[0] == 0:
+            padded = True
+            x = torch.zeros((1, x.shape[1]), dtype=x.dtype, device=x.device)
+            if x_sf is not None:
+                x_sf = torch.zeros((1, x_sf.shape[1]),
+                                   dtype=x_sf.dtype,
+                                   device=x_sf.device)
+            recv_topk_idx = torch.full((1, recv_topk_idx.shape[1]),
+                                       self.num_slots,
+                                       dtype=recv_topk_idx.dtype,
+                                       device=recv_topk_idx.device)
+            token_final_scales = torch.ones((1, token_final_scales.shape[1]),
+                                            dtype=token_final_scales.dtype,
+                                            device=token_final_scales.device)
+        else:
+            padded = False
+        return padded, x, x_sf, recv_topk_idx, token_final_scales
+
+    def unpad_tensors(self, padded: bool,
+                      final_hidden_states: torch.Tensor) -> torch.Tensor:
+        if padded:
+            final_hidden_states = final_hidden_states[:0]
+        return final_hidden_states
+
     def register_parameter_weight_slot_fn(self, weight_name: str,
                                           local_slot_id: int):
         assert hasattr(

From e761231c0bdd40908f312fc2f6f6cd2dca5370ce Mon Sep 17 00:00:00 2001
From: Jinyang Yuan <154768711+jinyangyuan-nvidia@users.noreply.github.com>
Date: Tue, 15 Jul 2025 23:25:32 +0800
Subject: [PATCH 18/88] [fix] Move NCCL group in all-gather and reduce-scatter
 OPs outside the outer loop (#6053)

Signed-off-by: Jinyang Yuan <154768711+jinyangyuan-nvidia@users.noreply.github.com>
---
 cpp/tensorrt_llm/thop/allgatherOp.cpp     | 91 ++++++++++------------
 cpp/tensorrt_llm/thop/reducescatterOp.cpp | 92 ++++++++++-------------
 2 files changed, 82 insertions(+), 101 deletions(-)

diff --git a/cpp/tensorrt_llm/thop/allgatherOp.cpp b/cpp/tensorrt_llm/thop/allgatherOp.cpp
index 5a1a759d6a4..81310003de7 100644
--- a/cpp/tensorrt_llm/thop/allgatherOp.cpp
+++ b/cpp/tensorrt_llm/thop/allgatherOp.cpp
@@ -55,70 +55,61 @@ class AllgatherOp
         return 0;
     }
 
-    torch::Tensor run(torch::Tensor input, torch::optional<torch::List<int64_t>> sizes)
+    std::vector<torch::Tensor> run_list(torch::TensorList input_list, torch::optional<torch::List<int64_t>> sizes)
     {
         TLLM_CHECK_WITH_INFO(mNcclComm.get() != nullptr, "mNcclComm should be initialized before used");
-        auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
-        auto type = tensorrt_llm::runtime::TorchUtils::dataType(input.scalar_type());
-        std::vector<int64_t> outputShape = input.sizes().vec();
-        if (sizes.has_value())
-        {
-            outputShape[0] = std::accumulate(sizes.value().begin(), sizes.value().end(), 0, std::plus<>{});
-        }
-        else
-        {
-            outputShape[0] *= mGroup.size();
-        }
-        auto output = torch::empty(outputShape, input.options());
         bool use_nccl_allgather = !sizes.has_value()
             || std::all_of(sizes.value().begin(), sizes.value().end(),
                 [&sizes](int64_t size) { return size == sizes.value()[0]; });
-        if (use_nccl_allgather)
-        {
-            NCCLCHECK_THROW(ncclAllGather(input.data_ptr(), output.mutable_data_ptr(), input.numel(),
-                (*getDtypeMap())[type], *mNcclComm, stream));
-        }
-        else
-        {
-            size_t numel_base = std::accumulate(outputShape.cbegin() + 1, outputShape.cend(), 1, std::multiplies<>{});
-            int64_t split_offset = 0;
-            ncclGroupStart();
-            for (int root = 0; root < static_cast<int>(mGroup.size()); ++root)
-            {
-                auto split_size = sizes.value()[root];
-                NCCLCHECK_THROW(ncclBroadcast(input.data_ptr(),
-                    output.index({torch::indexing::Slice(split_offset, torch::indexing::None)}).mutable_data_ptr(),
-                    numel_base * split_size, (*getDtypeMap())[type], root, *mNcclComm, stream));
-                split_offset += split_size;
-            }
-            ncclGroupEnd();
-        }
-        return output;
-    }
-
-    std::vector<torch::Tensor> run_list(torch::TensorList input_list, torch::optional<torch::List<int64_t>> sizes)
-    {
+        int64_t sum_sizes
+            = sizes.has_value() ? std::accumulate(sizes.value().begin(), sizes.value().end(), 0, std::plus<>{}) : 0;
         std::vector<torch::Tensor> output_list;
         output_list.reserve(input_list.size());
-        bool use_nccl_allgather = !sizes.has_value()
-            || std::all_of(sizes.value().begin(), sizes.value().end(),
-                [&sizes](int64_t size) { return size == sizes.value()[0]; });
-        if (use_nccl_allgather)
-        {
-            ncclGroupStart();
-        }
+        ncclGroupStart();
         for (auto const& input : input_list)
         {
-            auto output = run(input, sizes);
+            auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
+            auto type = tensorrt_llm::runtime::TorchUtils::dataType(input.scalar_type());
+            std::vector<int64_t> outputShape = input.sizes().vec();
+            if (sizes.has_value())
+            {
+                outputShape[0] = sum_sizes;
+            }
+            else
+            {
+                outputShape[0] *= mGroup.size();
+            }
+            auto output = torch::empty(outputShape, input.options());
+            if (use_nccl_allgather)
+            {
+                ncclAllGather(input.data_ptr(), output.mutable_data_ptr(), input.numel(), (*getDtypeMap())[type],
+                    *mNcclComm, stream);
+            }
+            else
+            {
+                size_t numel_base
+                    = std::accumulate(outputShape.cbegin() + 1, outputShape.cend(), 1, std::multiplies<>{});
+                int64_t split_offset = 0;
+                for (int root = 0; root < static_cast<int>(mGroup.size()); ++root)
+                {
+                    auto split_size = sizes.value()[root];
+                    ncclBroadcast(input.data_ptr(),
+                        output.index({torch::indexing::Slice(split_offset, torch::indexing::None)}).mutable_data_ptr(),
+                        numel_base * split_size, (*getDtypeMap())[type], root, *mNcclComm, stream);
+                    split_offset += split_size;
+                }
+            }
             output_list.push_back(output);
         }
-        if (use_nccl_allgather)
-        {
-            ncclGroupEnd();
-        }
+        NCCLCHECK_THROW(ncclGroupEnd());
         return output_list;
     }
 
+    torch::Tensor run(torch::Tensor input, torch::optional<torch::List<int64_t>> sizes)
+    {
+        return run_list({input}, sizes)[0];
+    }
+
 private:
     std::set<int> mGroup;
     std::shared_ptr<ncclComm_t> mNcclComm;
diff --git a/cpp/tensorrt_llm/thop/reducescatterOp.cpp b/cpp/tensorrt_llm/thop/reducescatterOp.cpp
index 05535089d26..4f7157e381c 100644
--- a/cpp/tensorrt_llm/thop/reducescatterOp.cpp
+++ b/cpp/tensorrt_llm/thop/reducescatterOp.cpp
@@ -55,16 +55,16 @@ class ReducescatterOp
         return 0;
     }
 
-    torch::Tensor run(torch::Tensor const& input, torch::optional<torch::List<int64_t>> sizes)
+    std::vector<torch::Tensor> run_list(torch::TensorList input_list, torch::optional<torch::List<int64_t>> sizes)
     {
         TLLM_CHECK_WITH_INFO(mNcclComm.get() != nullptr, "mNcclComm should be initialized before used");
-        auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
-        auto type = tensorrt_llm::runtime::TorchUtils::dataType(input.scalar_type());
-        std::vector<int64_t> outputShape = input.sizes().vec();
+        bool use_nccl_reducescatter = !sizes.has_value()
+            || std::all_of(sizes.value().begin(), sizes.value().end(),
+                [&sizes](int64_t size) { return size == sizes.value()[0]; });
+        int groupRank = 0;
         if (sizes.has_value())
         {
             auto rank = COMM_SESSION.getRank();
-            int groupRank = 0;
             for (auto const& currentRank : mGroup)
             {
                 if (rank == currentRank)
@@ -72,62 +72,52 @@ class ReducescatterOp
                 ++groupRank;
             }
             TLLM_CHECK(static_cast<size_t>(groupRank) < mGroup.size());
-            outputShape[0] = sizes.value()[groupRank];
-        }
-        else
-        {
-            outputShape[0] = outputShape[0] / mGroup.size();
         }
-        auto output = torch::empty(outputShape, input.options());
-        bool use_nccl_reducescatter = !sizes.has_value()
-            || std::all_of(sizes.value().begin(), sizes.value().end(),
-                [&sizes](int64_t size) { return size == sizes.value()[0]; });
-        if (use_nccl_reducescatter)
-        {
-            NCCLCHECK_THROW(ncclReduceScatter(input.data_ptr(), output.mutable_data_ptr(), output.numel(),
-                (*getDtypeMap())[type], ncclSum, *mNcclComm, stream));
-        }
-        else
+        std::vector<torch::Tensor> output_list;
+        output_list.reserve(input_list.size());
+        ncclGroupStart();
+        for (auto const& input : input_list)
         {
-            size_t numel_base = std::accumulate(outputShape.cbegin() + 1, outputShape.cend(), 1, std::multiplies<>{});
-            int64_t split_offset = 0;
-            ncclGroupStart();
-            for (int root = 0; root < static_cast<int>(mGroup.size()); ++root)
+            auto stream = at::cuda::getCurrentCUDAStream(input.get_device());
+            auto type = tensorrt_llm::runtime::TorchUtils::dataType(input.scalar_type());
+            std::vector<int64_t> outputShape = input.sizes().vec();
+            if (sizes.has_value())
             {
-                auto split_size = sizes.value()[root];
-                NCCLCHECK_THROW(
+                outputShape[0] = sizes.value()[groupRank];
+            }
+            else
+            {
+                outputShape[0] = outputShape[0] / mGroup.size();
+            }
+            auto output = torch::empty(outputShape, input.options());
+            if (use_nccl_reducescatter)
+            {
+                ncclReduceScatter(input.data_ptr(), output.mutable_data_ptr(), output.numel(), (*getDtypeMap())[type],
+                    ncclSum, *mNcclComm, stream);
+            }
+            else
+            {
+                size_t numel_base
+                    = std::accumulate(outputShape.cbegin() + 1, outputShape.cend(), 1, std::multiplies<>{});
+                int64_t split_offset = 0;
+                for (int root = 0; root < static_cast<int>(mGroup.size()); ++root)
+                {
+                    auto split_size = sizes.value()[root];
                     ncclReduce(input.index({torch::indexing::Slice(split_offset, torch::indexing::None)}).data_ptr(),
                         output.mutable_data_ptr(), numel_base * split_size, (*getDtypeMap())[type], ncclSum, root,
-                        *mNcclComm, stream));
-                split_offset += split_size;
+                        *mNcclComm, stream);
+                    split_offset += split_size;
+                }
             }
-            ncclGroupEnd();
+            output_list.push_back(output);
         }
-        return output;
+        NCCLCHECK_THROW(ncclGroupEnd());
+        return output_list;
     }
 
-    std::vector<torch::Tensor> run_list(
-        torch::TensorList input_list, torch::optional<torch::List<int64_t>> sizes) noexcept
+    torch::Tensor run(torch::Tensor const& input, torch::optional<torch::List<int64_t>> sizes)
     {
-        std::vector<torch::Tensor> output_list;
-        output_list.reserve(input_list.size());
-        bool use_nccl_reducescatter = !sizes.has_value()
-            || std::all_of(sizes.value().begin(), sizes.value().end(),
-                [&sizes](int64_t size) { return size == sizes.value()[0]; });
-        if (use_nccl_reducescatter)
-        {
-            ncclGroupStart();
-        }
-        for (auto const& input : input_list)
-        {
-            auto output = run(input, sizes);
-            output_list.push_back(output);
-        }
-        if (use_nccl_reducescatter)
-        {
-            ncclGroupEnd();
-        }
-        return output_list;
+        return run_list({input}, sizes)[0];
     }
 
 private:

From 0523f77b363e52897467a82f97caa97412f2efa8 Mon Sep 17 00:00:00 2001
From: "Xiaodong (Vincent) Huang" <vincenth@nvidia.com>
Date: Tue, 15 Jul 2025 08:34:21 -0700
Subject: [PATCH 19/88] =?UTF-8?q?support=20TRTLLM=5FDEEP=5FEP=5FTOKEN=5FLI?=
 =?UTF-8?q?MIT=20to=20allow=20run=20deep-ep=20on=20memory-con=E2=80=A6=20(?=
 =?UTF-8?q?#5684)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Vincent Huang <vincenth@nvidia.com>
---
 examples/llm-api/quickstart_advanced.py       |  2 +-
 .../_torch/models/modeling_deepseekv3.py      |  7 +-
 .../modules/fused_moe/fused_moe_wide_ep.py    | 84 +++++++++++++------
 3 files changed, 63 insertions(+), 30 deletions(-)

diff --git a/examples/llm-api/quickstart_advanced.py b/examples/llm-api/quickstart_advanced.py
index 4abb501b6d9..9065bd2f00f 100644
--- a/examples/llm-api/quickstart_advanced.py
+++ b/examples/llm-api/quickstart_advanced.py
@@ -50,7 +50,7 @@ def add_llm_args(parser):
     parser.add_argument('--moe_backend',
                         type=str,
                         default='CUTLASS',
-                        choices=['CUTLASS', 'TRTLLM', 'VANILLA'])
+                        choices=['CUTLASS', 'TRTLLM', 'VANILLA', 'WIDEEP'])
     parser.add_argument('--enable_attention_dp',
                         default=False,
                         action='store_true')
diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
index 1834e7b1476..b92cef4dc54 100644
--- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py
+++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
@@ -62,8 +62,7 @@
 from ..modules.rms_norm import RMSNorm
 from ..peft.lora.layer import LoraLayer
 from ..speculative import MTPEagleWorker, MTPSpecMetadata, MTPWorker
-from ..utils import (AuxStreamType, EventType, Fp4QuantizedTensor,
-                     disable_fp4_allgather)
+from ..utils import AuxStreamType, EventType, Fp4QuantizedTensor
 from .modeling_utils import (DecoderModel, DecoderModelForCausalLM,
                              EagerFusionConfig, filter_weights,
                              register_auto_model)
@@ -514,9 +513,7 @@ def compute_routed_output(self, hidden_states, hidden_states_fp4,
         if self.use_dp and self.mapping.tp_size > 1:
             # FP4 all_gather moves this bf16 allgather in to after topk and fp4 quantization
             # to reduce allreduce BW
-            if (disable_fp4_allgather()
-                    and not self.experts.enable_alltoall) or isinstance(
-                        self.experts, TRTLLMGenFusedMoE):
+            if isinstance(self.experts, TRTLLMGenFusedMoE):
                 hidden_states = allgather(hidden_states,
                                           self.mapping,
                                           dim=0,
diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py
index 33682b1146e..1d46d0712ff 100755
--- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py
@@ -192,8 +192,12 @@ def __init__(
                     model_config.mapping)
                 self.deep_ep_buffer.reserve(hidden_size, dtype)
             elif self.alltoall_method_type == AlltoallMethodType.DeepEPLowLatency:
-                self.deep_ep_max_num_tokens = min(model_config.max_num_tokens,
-                                                  self.moe_max_num_tokens)
+                self.deep_ep_max_num_tokens = int(
+                    os.environ.get(
+                        "TRTLLM_DEEP_EP_TOKEN_LIMIT",
+                        str(
+                            min(model_config.max_num_tokens,
+                                self.moe_max_num_tokens))))
                 self.deep_ep_buffer = buffer_pool.get_low_latency_buffer(
                     model_config.mapping)
                 self.deep_ep_buffer.reserve(self.deep_ep_max_num_tokens,
@@ -274,6 +278,25 @@ def enable_alltoall(self):
         """
         return self.alltoall_method_type != AlltoallMethodType.NotEnabled
 
+    def calculate_num_chunks(self, all_rank_num_tokens: List[int]) -> int:
+        num_rows = sum(all_rank_num_tokens)
+        return (num_rows + self.moe_max_num_tokens -
+                1) // self.moe_max_num_tokens
+
+    def can_use_alltoall(self, input, all_rank_num_tokens):
+        # Disable alltoall when chunking is used
+        if self.calculate_num_chunks(all_rank_num_tokens) > 1:
+            return False
+
+        num_tokens = input.shape[0]
+
+        # For DeepEPLowLatency, check if tokens exceed the threshold
+        if (self.alltoall_method_type == AlltoallMethodType.DeepEPLowLatency
+                and num_tokens > self.deep_ep_max_num_tokens):
+            return False
+
+        return self.enable_alltoall
+
     def _get_quant_method(self):
         if self.quant_config is not None and self.quant_config.layer_quant_mode.has_any_quant(
                 exclude_kv_cache=True):
@@ -316,11 +339,12 @@ def dummy_allreduce(self):
     def reducescatter_or_allreduce(
         self,
         inputs,
+        use_all_to_all: bool,
         all_rank_num_tokens: Optional[List[int]] = None,
         use_dp_padding: Optional[bool] = None,
     ):
         outputs = inputs
-        if not self.enable_alltoall:
+        if not use_all_to_all:
             if self.enable_dummy_allreduce:
                 self.dummy_allreduce()
             outputs = reducescatter(
@@ -334,6 +358,7 @@ def forward_chunk(
             self,
             x: Union[torch.Tensor, Fp4QuantizedTensor],
             router_logits: torch.Tensor,
+            use_all_to_all: bool,
             output_dtype: Optional[torch.dtype] = None,
             all_rank_num_tokens: Optional[List[int]] = None,
             all_rank_max_num_tokens: Optional[int] = None,
@@ -382,7 +407,7 @@ def forward_chunk(
         ) and is_first_call:
             self.layer_load_balancer.maybe_cudagraph_done_wait()
 
-        use_allgather = not self.enable_alltoall
+        use_allgather = not use_all_to_all
 
         loadbalancer_local_statistic_info = None
         gathered_loadbalancer_local_statistic_info = None
@@ -391,7 +416,7 @@ def forward_chunk(
             token_selected_slots = token_selected_experts
         else:
             if not self.layer_load_balancer.is_static_routing(
-            ) and self.enable_alltoall:
+            ) and use_all_to_all:
                 self.layer_load_balancer.local_statistic(
                     token_selected_experts,
                     is_first_stage=is_first_call,
@@ -400,7 +425,7 @@ def forward_chunk(
                 token_selected_experts, self.use_dp)
             if not self.layer_load_balancer.is_static_routing():
                 # split into two part to get possible overlap with load balancer routing
-                if self.enable_alltoall:
+                if use_all_to_all:
                     if is_last_call:
                         loadbalancer_local_statistic_info = self.layer_load_balancer.get_local_statistic_tensor(
                         )
@@ -412,7 +437,9 @@ def forward_chunk(
         ExpertStatistic.set_layer(self.layer_idx)
         ExpertStatistic.maybe_add_info(self.num_slots, token_selected_slots)
 
-        if self.enable_alltoall:
+        # If alltoall is disabled, we need also disable use_postquant_alltoall
+        use_postquant_alltoall = self.use_postquant_alltoall and use_all_to_all
+        if use_all_to_all:
             if self.alltoall_method_type == AlltoallMethodType.MNNVL:
                 if self.enable_dummy_allreduce:
                     self.dummy_allreduce()
@@ -423,15 +450,16 @@ def forward_chunk(
                                                          x,
                                                          token_selected_slots,
                                                          token_final_scales,
+                                                         use_postquant_alltoall,
                                                          loadbalancer_local_statistic_info)
             elif self.alltoall_method_type == AlltoallMethodType.DeepEP:
-                if not self.use_postquant_alltoall:
+                if not use_postquant_alltoall:
                     x, recv_topk_idx, token_final_scales, num_recv_tokens_per_expert_list, deep_ep_handle = \
                         self.deep_ep_buffer.dispatch(x, token_selected_slots.to(torch.int64), token_final_scales, self.num_slots)
                     padded, x, _, recv_topk_idx, token_final_scales = self.pad_empty_recv_tensors(
                         x, None, recv_topk_idx, token_final_scales)
             elif self.alltoall_method_type == AlltoallMethodType.DeepEPLowLatency:
-                if not self.use_postquant_alltoall:
+                if not use_postquant_alltoall:
                     deep_ep_topk_idx = token_selected_slots.to(torch.int64)
                     deep_ep_topk_weights = token_final_scales
                     x, recv_expert_count, deep_ep_handle = \
@@ -471,7 +499,7 @@ def forward_chunk(
                 x, _ = torch.ops.tensorrt_llm.static_quantize_e4m3_per_tensor(
                     x, self.fc31_input_dequant)
             elif self.has_nvfp4:
-                if use_allgather or self.use_postquant_alltoall:
+                if use_allgather or use_postquant_alltoall:
                     if isinstance(x, Fp4QuantizedTensor):
                         if use_allgather:
                             assert not x.is_sf_swizzled, "Fp4QuantizedTensor should not be swizzled before allgather"
@@ -527,7 +555,7 @@ def forward_chunk(
 
         if self.layer_load_balancer and not self.layer_load_balancer.is_static_routing(
         ):
-            if self.enable_alltoall:
+            if use_all_to_all:
                 if is_last_call:
                     gathered_loadbalancer_local_statistic_info = gathered_loadbalancer_local_statistic_info.view(
                         (self.mapping.moe_ep_size, self.num_experts))
@@ -547,7 +575,7 @@ def forward_chunk(
         cluster_rank = self.cluster_rank
         quant_scales = self.quant_scales
 
-        if self.use_postquant_alltoall:
+        if use_postquant_alltoall:
             if x_sf is not None and self.has_nvfp4:
                 assert not x_is_sf_swizzled, "Fp4 scaling factor should not be swizzled before Alltoall"
             if self.alltoall_method_type == AlltoallMethodType.MNNVL:
@@ -640,7 +668,7 @@ def forward_chunk(
                     f"Not available alltoall method type: {self.alltoall_method_type!r}"
                 )
 
-        if self.enable_alltoall:
+        if use_all_to_all:
             # Adapter between `torch.ops.trtllm.fused_moe` and DeepEP
             # TODO: remove the adapter by changing APIs
             if self.alltoall_method_type == AlltoallMethodType.DeepEP:
@@ -666,7 +694,7 @@ def forward_chunk(
             ep_rank=ep_rank,
             cluster_size=cluster_size,
             cluster_rank=cluster_rank,
-            enable_alltoall=self.enable_alltoall,
+            enable_alltoall=use_all_to_all,
             use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale,
             use_w4a8_group_scaling=use_w4a8_group_scaling,
             min_latency_mode=False,
@@ -681,7 +709,7 @@ def forward_chunk(
         # Otherwise, the output should be unpacked as a single tensor.
         final_hidden_states = final_hidden_states[0]
 
-        if self.enable_alltoall:
+        if use_all_to_all:
             if self.alltoall_method_type == AlltoallMethodType.MNNVL:
                 if self.enable_dummy_allreduce:
                     self.dummy_allreduce()
@@ -737,11 +765,10 @@ def forward(
     ) -> torch.Tensor:
         assert all_rank_num_tokens is not None
         assert use_dp_padding is not None
-        num_rows = sum(all_rank_num_tokens)
 
         # in case of num_rows is larger than max_chunk_size, we need to split the input into multiple chunks
-        num_chunks = (num_rows + self.moe_max_num_tokens -
-                      1) // self.moe_max_num_tokens
+        num_chunks = self.calculate_num_chunks(all_rank_num_tokens)
+        use_all_to_all = self.can_use_alltoall(x, all_rank_num_tokens)
 
         if use_dp_padding:
             all_rank_num_tokens_padded = [all_rank_max_num_tokens
@@ -754,6 +781,7 @@ def forward(
             outputs = self.forward_chunk(
                 x,
                 router_logits,
+                use_all_to_all,
                 output_dtype,
                 all_rank_num_tokens=all_rank_num_tokens_padded,
                 all_rank_max_num_tokens=all_rank_max_num_tokens,
@@ -761,6 +789,7 @@ def forward(
                 repeating_info=(is_first_call, is_last_call))
             outputs = self.reducescatter_or_allreduce(
                 outputs,
+                use_all_to_all,
                 all_rank_num_tokens=all_rank_num_tokens_padded,
                 use_dp_padding=use_dp_padding)
         else:
@@ -782,7 +811,7 @@ def split_chunk(split_token_num: int, split_num_chunks: int):
             all_rank_max_num_tokens_list = split_chunk(all_rank_max_num_tokens,
                                                        num_chunks)
             chunk_size_list = all_rank_chunk_size_list[self.rank]
-            if self.enable_alltoall:
+            if use_all_to_all:
                 all_rank_num_tokens_list = [[
                     1 if val == 0 else val for val in val_list
                 ] for val_list in all_rank_num_tokens_list]
@@ -794,7 +823,7 @@ def split_chunk(split_token_num: int, split_num_chunks: int):
             x_list = x.split(chunk_size_list)
             router_logits_list = router_logits.split(chunk_size_list)
 
-            if not self.enable_alltoall:
+            if not use_all_to_all:
                 self.event_dict[EventType.Main].record()
                 with torch.cuda.stream(self.aux_stream):
                     self.event_dict[EventType.Main].wait()
@@ -805,12 +834,13 @@ def split_chunk(split_token_num: int, split_num_chunks: int):
                     zip(x_list, router_logits_list)):
                 is_first_call = idx_chunk == 0 and self.repeat_idx == 0
                 is_last_call = idx_chunk == num_chunks - 1 and self.repeat_idx == self.repeat_count - 1
-                if not self.enable_alltoall:
+                if not use_all_to_all:
                     if idx_chunk % 2 == 0:
                         with torch.cuda.stream(self.aux_stream):
                             outputs = self.forward_chunk(
                                 x,
                                 router_logits,
+                                use_all_to_all,
                                 all_rank_num_tokens=all_rank_num_tokens_list[
                                     idx_chunk],
                                 all_rank_max_num_tokens=
@@ -820,6 +850,7 @@ def split_chunk(split_token_num: int, split_num_chunks: int):
                         if idx_chunk > 0:
                             outputs_list[-1] = self.reducescatter_or_allreduce(
                                 outputs_list[-1],
+                                use_all_to_all,
                                 all_rank_num_tokens=all_rank_num_tokens_list[
                                     idx_chunk - 1],
                                 use_dp_padding=use_dp_padding)
@@ -827,6 +858,7 @@ def split_chunk(split_token_num: int, split_num_chunks: int):
                         outputs = self.forward_chunk(
                             x,
                             router_logits,
+                            use_all_to_all,
                             all_rank_num_tokens=all_rank_num_tokens_list[
                                 idx_chunk],
                             all_rank_max_num_tokens=all_rank_max_num_tokens_list[
@@ -836,6 +868,7 @@ def split_chunk(split_token_num: int, split_num_chunks: int):
                         with torch.cuda.stream(self.aux_stream):
                             outputs_list[-1] = self.reducescatter_or_allreduce(
                                 outputs_list[-1],
+                                use_all_to_all,
                                 all_rank_num_tokens=all_rank_num_tokens_list[
                                     idx_chunk - 1],
                                 use_dp_padding=use_dp_padding)
@@ -843,22 +876,25 @@ def split_chunk(split_token_num: int, split_num_chunks: int):
                     outputs = self.forward_chunk(
                         x,
                         router_logits,
+                        use_all_to_all,
                         all_rank_num_tokens=all_rank_num_tokens_list[idx_chunk],
                         all_rank_max_num_tokens=all_rank_max_num_tokens_list[
                             idx_chunk],
                         repeating_info=(is_first_call, is_last_call))
 
                 outputs_list.append(outputs)
-            if not self.enable_alltoall:
+            if not use_all_to_all:
                 if num_chunks % 2 == 0:
                     outputs_list[-1] = self.reducescatter_or_allreduce(
                         outputs_list[-1],
+                        use_all_to_all,
                         all_rank_num_tokens=all_rank_num_tokens_list[-1],
                         use_dp_padding=use_dp_padding)
                 else:
                     with torch.cuda.stream(self.aux_stream):
                         outputs_list[-1] = self.reducescatter_or_allreduce(
                             outputs_list[-1],
+                            use_all_to_all,
                             all_rank_num_tokens=all_rank_num_tokens_list[-1],
                             use_dp_padding=use_dp_padding)
                 with torch.cuda.stream(self.aux_stream):
@@ -873,7 +909,7 @@ def split_chunk(split_token_num: int, split_num_chunks: int):
     def alltoall_prepare_maybe_dispatch(
             self, all_rank_max_num_tokens: int, x: torch.Tensor,
             token_selected_slots: torch.Tensor,
-            token_final_scales: torch.Tensor,
+            token_final_scales: torch.Tensor, use_postquant_alltoall: bool,
             local_statistic_tensor: Optional[torch.Tensor]):
         top_k = self.routing_method.experts_per_token
 
@@ -919,7 +955,7 @@ def alltoall_prepare_maybe_dispatch(
                 gathered_token_final_scales, all_rank_max_num_tokens,
                 self.num_slots, top_k, self.ep_rank, self.ep_size)
 
-        if not self.use_postquant_alltoall:
+        if not use_postquant_alltoall:
             assert not isinstance(
                 x, Fp4QuantizedTensor
             ), "pre-quant alltoall doesn't support fp4 tensor"

From 7a1af1c7388b5ec6e690d9dd12b44bb85615aba1 Mon Sep 17 00:00:00 2001
From: Fanrong Li <23290157+lfr-0531@users.noreply.github.com>
Date: Wed, 16 Jul 2025 00:33:12 +0800
Subject: [PATCH 20/88] Cherry-pick
 https://github.com/NVIDIA/TensorRT-LLM/pull/5947 (#5989)

Signed-off-by: Fanrong Li <23290157+lfr-0531@users.noreply.github.com>
---
 tensorrt_llm/_torch/pyexecutor/model_engine.py       | 10 ++++++++++
 tests/integration/test_lists/test-db/l0_dgx_b200.yml |  1 +
 2 files changed, 11 insertions(+)

diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
index 42a0c001076..635787a0324 100644
--- a/tensorrt_llm/_torch/pyexecutor/model_engine.py
+++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -1463,6 +1463,16 @@ def previous_seq_slots_device():
                                 previous_batch_len * self.max_beam_width].copy_(
                                     new_tokens.flatten(), non_blocking=True)
 
+        if (not self._disable_overlap_scheduler
+                and next_draft_tokens_device is None
+                and len(extend_requests) > 0):
+            # During warmup, for those generation requests, we don't have previous tensors,
+            # so we need to set the previous_pos_id_offsets and previous_kv_lens_offsets to zeros
+            # to skip the value changes in _preprocess_inputs. Otherwise, there will be illegal memory access
+            # when writing key/values to the KV cache.
+            self.previous_pos_id_offsets_cuda *= 0
+            self.previous_kv_lens_offsets_cuda *= 0
+
         position_ids = torch.tensor(position_ids,
                                     dtype=torch.int,
                                     pin_memory=True)
diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
index 5521ce0d5a1..8a70e6efefc 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
@@ -23,6 +23,7 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=True-attn_backend=TRTLLM-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]

From 9214ac662accd351563e0344c384ab4d8e3b6d71 Mon Sep 17 00:00:00 2001
From: brb-nv <169953907+brb-nv@users.noreply.github.com>
Date: Tue, 15 Jul 2025 11:37:56 -0700
Subject: [PATCH 21/88] test: Add regression tests for Gemma3 VLM (#6033)

Signed-off-by: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com>
---
 .../accuracy/references/cnn_dailymail.yaml    |  2 ++
 .../defs/accuracy/references/gsm8k.yaml       |  2 ++
 .../defs/accuracy/references/mmlu.yaml        |  2 ++
 .../defs/accuracy/test_llm_api_pytorch.py     | 24 +++++++++++++++++++
 .../test_lists/test-db/l0_h100.yml            |  2 ++
 5 files changed, 32 insertions(+)

diff --git a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
index 3aa2f742763..c85be741469 100644
--- a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
+++ b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
@@ -1,5 +1,7 @@
 google/gemma-3-1b-it:
   - accuracy: 22.988
+google/gemma-3-27b-it:
+  - accuracy: 28.90
 gpt2:
   - accuracy: 18.408
   - quant_algo: W8A16
diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml
index ba0ffff4ffc..5a30e0c8e7a 100644
--- a/tests/integration/defs/accuracy/references/gsm8k.yaml
+++ b/tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -108,6 +108,8 @@ speakleash/Bielik-11B-v2.2-Instruct:
     accuracy: 40.41
 google/gemma-3-1b-it:
   - accuracy: 25.52 # score getting from lm-eval with HF implementation
+google/gemma-3-27b-it:
+  - accuracy: 91.66
 mistralai/Ministral-8B-Instruct-2410:
   - accuracy: 79.25
   - quant_algo: FP8
diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index 6430d14b007..75774693e6f 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -92,6 +92,8 @@ mistralai/Mixtral-8x22B-v0.1:
     accuracy: 77.63
 google/gemma-2-9b-it:
   - accuracy: 73.05
+google/gemma-3-27b-it:
+  - accuracy: 77.80
 Qwen/Qwen2-0.5B-Instruct:
   - accuracy: 45.30
   - quant_algo: FP8
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index eb4cadc985d..b0942231808 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -471,6 +471,30 @@ def test_fp8(self):
             pytest.skip("FP8 pre-quantized Ministral-8B model not available")
 
 
+class TestGemma3_27BInstruct(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "google/gemma-3-27b-it"
+    MODEL_PATH = f"{llm_models_root()}/gemma/gemma-3-27b-it/"
+
+    def test_auto_dtype(self):
+        # Disabling kv cache reuse as a WAR to deal with gaps in kernel support for Gemma3's non-inclusive sliding window size.
+        kv_cache_config = KvCacheConfig(
+            enable_block_reuse=False,
+            enable_partial_reuse=False,
+        )
+        # We use FlashInfer as the attention backend for Gemma3 VLM to support custom mask for images.
+        # So, testing with it here.
+        with LLM(self.MODEL_PATH,
+                 kv_cache_config=kv_cache_config,
+                 attn_backend="FLASHINFER",
+                 cuda_graph_config=None) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
 class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "google/gemma-3-1b-it"
     MODEL_PATH = f"{llm_models_root()}/gemma/gemma-3-1b-it/"
diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml
index 5a8c8e9e840..3e9f0d3995b 100644
--- a/tests/integration/test_lists/test-db/l0_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_h100.yml
@@ -186,8 +186,10 @@ l0_h100:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=eagle-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=vanilla-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True]
+  - accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[llguidance]
+  - test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it-image-True]
 - condition:
     ranges:
       system_gpu_count:

From edab7532dd40f1baae13c9ca73696d5386ba232a Mon Sep 17 00:00:00 2001
From: danielafrimi <45691845+danielafrimi@users.noreply.github.com>
Date: Tue, 15 Jul 2025 23:13:49 +0300
Subject: [PATCH 22/88] feat/add latency support for trtllm bench (#3730)

Signed-off-by: Ubuntu <dafrimi@nvidia.com>
Signed-off-by: Daniel Afrimi <danielafrimi8@gmail.com>
Signed-off-by: Frank <3429989+FrankD412@users.noreply.github.com>
Co-authored-by: Daniel Afrimi <dafrimi@nvidia.com>
Co-authored-by: Frank <3429989+FrankD412@users.noreply.github.com>
---
 tensorrt_llm/bench/benchmark/low_latency.py   | 177 ++++++++++++++----
 tensorrt_llm/bench/benchmark/throughput.py    |   9 +-
 tensorrt_llm/bench/benchmark/utils/general.py |   2 +
 tests/integration/defs/test_e2e.py            |   8 +-
 4 files changed, 153 insertions(+), 43 deletions(-)

diff --git a/tensorrt_llm/bench/benchmark/low_latency.py b/tensorrt_llm/bench/benchmark/low_latency.py
index 490ac62f4f5..cacb7a2ada4 100644
--- a/tensorrt_llm/bench/benchmark/low_latency.py
+++ b/tensorrt_llm/bench/benchmark/low_latency.py
@@ -9,11 +9,14 @@
 import yaml
 from click_option_group import (MutuallyExclusiveOptionGroup, OptionGroup,
                                 optgroup)
+from huggingface_hub import snapshot_download
 
+from tensorrt_llm import LLM as PyTorchLLM
 from tensorrt_llm._tensorrt_engine import LLM
 from tensorrt_llm.bench.benchmark.utils.asynchronous import async_benchmark
 from tensorrt_llm.bench.benchmark.utils.general import generate_warmup_dataset
 from tensorrt_llm.bench.benchmark.utils.processes import IterationWriter
+from tensorrt_llm.bench.build.build import get_model_config
 from tensorrt_llm.bench.dataclasses.configuration import RuntimeConfig
 from tensorrt_llm.bench.dataclasses.general import BenchmarkEnvironment
 from tensorrt_llm.bench.dataclasses.reporting import ReportUtility
@@ -21,10 +24,11 @@
 from tensorrt_llm.models.modeling_utils import SpeculativeDecodingMode
 
 # isort: off
-from tensorrt_llm.bench.benchmark.utils.general import get_settings_from_engine
+from tensorrt_llm.bench.benchmark.utils.general import get_settings_from_engine, get_settings, ALL_SUPPORTED_BACKENDS
 # isort: on
 from tensorrt_llm.bench.utils.data import (create_dataset_from_stream,
-                                           initialize_tokenizer)
+                                           initialize_tokenizer,
+                                           update_metadata_for_multimodal)
 from tensorrt_llm.logger import logger
 from tensorrt_llm.sampling_params import SamplingParams
 
@@ -38,15 +42,25 @@
                     readable=True,
                     path_type=Path,
                     resolve_path=True),
-    required=True,
+    default=None,
     help="Path to a serialized TRT-LLM engine.",
 )
+@optgroup.option("--backend",
+                 type=click.Choice(ALL_SUPPORTED_BACKENDS),
+                 default="pytorch",
+                 help="The backend to use when running benchmarking.")
 @optgroup.option(
     "--kv_cache_free_gpu_mem_fraction",
     type=float,
     default=.90,
     help="The percentage of memory to use for KV Cache after model load.",
 )
+@optgroup.option(
+    "--max_seq_len",
+    type=int,
+    default=None,
+    help="Maximum sequence length.",
+)
 @optgroup.group(
     "Engine Input Configuration",
     help="Input configuration for driving the engine.",
@@ -60,6 +74,20 @@
     default=None,
     help="Pass in a dataset file for parsing instead of stdin.",
 )
+@optgroup.option(
+    "--modality",
+    type=click.Choice(["image", "video"]),
+    default=None,
+    help="Modality of the multimodal requests.",
+)
+@optgroup.option(
+    "--max_input_len",
+    type=int,
+    default=4096,
+    help=
+    "Maximum input sequence length to use for multimodal models. This is used only when --modality "
+    "is specified since the actual number of vision tokens is unknown before the model is run.",
+)
 @optgroup.option(
     "--num_requests",
     type=int,
@@ -73,6 +101,24 @@
     default=2,
     help="Number of requests warm up benchmark.",
 )
+@optgroup.option(
+    "--tp",
+    type=int,
+    default=1,
+    help="tensor parallelism size",
+)
+@optgroup.option(
+    "--pp",
+    type=int,
+    default=1,
+    help="pipeline parallelism size",
+)
+@optgroup.option(
+    "--ep",
+    type=int,
+    default=None,
+    help="expert parallelism size",
+)
 @optgroup.group("Request Load Control Options",
                 cls=MutuallyExclusiveOptionGroup,
                 help="Limits how requests are loaded.")
@@ -142,11 +188,11 @@ def latency_command(
     concurrency: int = params.pop("concurrency")
     beam_width: int = params.pop("beam_width")
     warmup: int = params.get("warmup")
-    # Engine configuration parsing
-    exec_settings, build_cfg = get_settings_from_engine(engine_dir)
-    exec_settings["model"] = model
-    engine_tokens = exec_settings["settings_config"]["max_num_tokens"]
-    engine_max_seq_len = build_cfg["max_seq_len"]
+    modality: str = params.pop("modality")
+    max_input_len: int = params.pop("max_input_len")
+    max_seq_len: int = params.pop("max_seq_len")
+    backend: str = params.get("backend")
+    model_type = get_model_config(model, checkpoint_path).model_type
 
     # Runtime Options
     kv_cache_percent = params.pop("kv_cache_free_gpu_mem_fraction")
@@ -157,6 +203,60 @@ def latency_command(
     iteration_log: Path = params.pop("iteration_log")
     iteration_writer = IterationWriter(iteration_log)
 
+    # Initialize the HF tokenizer for the specified model.
+    tokenizer = initialize_tokenizer(checkpoint_path)
+
+    # Dataset Loading and Preparation
+    with open(dataset_path, "r") as dataset:
+        metadata, requests = create_dataset_from_stream(
+            tokenizer,
+            dataset,
+            num_requests=num_requests,
+            model_dir=checkpoint_path,
+            model_type=model_type,
+            modality=modality,
+            max_input_seq_len_for_multimodal=max_input_len)
+
+        metadata.dataset_path = dataset_path
+
+    if modality is None:
+        # Log dataset info
+        # NOTE: This table is only accurate for non-multimodal models.
+        #       The accurate table for multimodal models will be logged after the benchmark is done.
+        logger.info(metadata.get_summary_for_print())
+
+    # Engine configuration parsing for PyTorch backend
+    kwargs = {}
+    if backend and backend.lower() in ALL_SUPPORTED_BACKENDS and backend.lower(
+    ) != "tensorrt":
+        if bench_env.checkpoint_path is None:
+            snapshot_download(model)
+
+        exec_settings = get_settings(params, metadata, bench_env.model,
+                                     bench_env.checkpoint_path)
+        kwargs_max_sql = max_seq_len or metadata.max_sequence_length
+        logger.info(f"Setting PyTorch max sequence length to {kwargs_max_sql}")
+        kwargs["max_seq_len"] = kwargs_max_sql
+    elif backend.lower() == "tensorrt":
+        assert max_seq_len is None, (
+            "max_seq_len is not a runtime parameter for C++ backend")
+        exec_settings, build_cfg = get_settings_from_engine(engine_dir)
+        engine_max_seq_len = build_cfg["max_seq_len"]
+
+        if metadata.max_sequence_length > engine_max_seq_len:
+            raise RuntimeError(
+                f"Engine supports a max sequence of {engine_max_seq_len}. Provided "
+                "dataset contains a maximum sequence of "
+                f"{metadata.max_sequence_length}. Please rebuild a new engine to"
+                "support this dataset.")
+    else:
+        raise RuntimeError(
+            f"Invalid backend: {backend}, please use one of the following: "
+            f"{ALL_SUPPORTED_BACKENDS}")
+
+    exec_settings["model"] = model
+    engine_tokens = exec_settings["settings_config"]["max_num_tokens"]
+
     # Update configuration with runtime options
     exec_settings["settings_config"]["kv_cache_percent"] = kv_cache_percent
     exec_settings["settings_config"]["max_batch_size"] = 1
@@ -187,32 +287,25 @@ def latency_command(
     # Construct the runtime configuration dataclass.
     runtime_config = RuntimeConfig(**exec_settings)
 
-    # Initialize the HF tokenizer for the specified model.
-    ignore_eos = True if runtime_config.decoding_config.decoding_mode == SpeculativeDecodingMode.NONE else False
-    tokenizer = initialize_tokenizer(checkpoint_path)
-    eos_id = tokenizer.eos_token_id if not ignore_eos else -1
-    pad_id = tokenizer.pad_token_id if not ignore_eos else -1
+    llm = None
+    kwargs = kwargs | runtime_config.get_llm_args()
+    kwargs['backend'] = backend
 
-    # Dataset Loading and Preparation
-    with open(dataset_path, "r") as dataset:
-        metadata, requests = create_dataset_from_stream(
-            tokenizer, dataset, num_requests=num_requests)
-        metadata.dataset_path = dataset_path
+    try:
+        logger.info("Setting up latency benchmark.")
 
-    if metadata.max_sequence_length > engine_max_seq_len:
-        raise RuntimeError(
-            f"Engine supports a max sequence of {engine_max_seq_len}. Provided "
-            "dataset contains a maximum sequence of "
-            f"{metadata.max_sequence_length}. Please rebuild a new engine to"
-            "support this dataset.")
+        if "pytorch_backend_config" in kwargs and iteration_log is not None:
+            kwargs["pytorch_backend_config"].enable_iter_perf_stats = True
 
-    logger.info(metadata.get_summary_for_print())
-    logger.info("Running experimental latency benchmark.")
+        if runtime_config.backend == 'pytorch':
+            llm = PyTorchLLM(**kwargs)
+        else:
+            llm = LLM(**kwargs)
 
-    llm = None
-    kwargs = runtime_config.get_llm_args()
+        ignore_eos = True if runtime_config.decoding_config.decoding_mode == SpeculativeDecodingMode.NONE else False
+        eos_id = tokenizer.eos_token_id if not ignore_eos else -1
+        pad_id = tokenizer.pad_token_id if not ignore_eos else -1
 
-    try:
         sampling_params = SamplingParams(
             end_id=eos_id,
             pad_id=pad_id,
@@ -220,7 +313,6 @@ def latency_command(
             use_beam_search=beam_width > 1,
         )
         post_proc_params = None  # No detokenization
-        llm = LLM(**kwargs)
 
         # Perform warmup if requested.
         if warmup > 0:
@@ -228,8 +320,13 @@ def latency_command(
             warmup_dataset = generate_warmup_dataset(requests, warmup)
             logger.info("Running warmup.")
             asyncio.run(
-                async_benchmark(llm, sampling_params, post_proc_params,
-                                warmup_dataset, False, concurrency))
+                async_benchmark(llm,
+                                sampling_params,
+                                post_proc_params,
+                                warmup_dataset,
+                                False,
+                                concurrency,
+                                modality=modality))
             # WAR: IterationResult is a singleton tied to the executor.
             # Since the benchmark calls asyncio.run() multiple times (e.g., during warmup),
             # we must reset it to ensure it attaches to the correct event loop.
@@ -238,11 +335,21 @@ def latency_command(
 
         with iteration_writer.capture():
             statistics = asyncio.run(
-                async_benchmark(llm, sampling_params, post_proc_params,
-                                requests, True, concurrency,
-                                iteration_writer.full_address))
+                async_benchmark(llm,
+                                sampling_params,
+                                post_proc_params,
+                                requests,
+                                True,
+                                concurrency,
+                                iteration_writer.full_address,
+                                modality=modality))
 
         logger.info(f"Benchmark done. Reporting results...")
+
+        if modality is not None:
+            # For multimodal models, we need to update the metadata with the correct input lengths
+            metadata = update_metadata_for_multimodal(metadata, statistics)
+
         report_utility = ReportUtility(statistics, metadata, runtime_config,
                                        logger, kwargs, True)
         if report_json:
diff --git a/tensorrt_llm/bench/benchmark/throughput.py b/tensorrt_llm/bench/benchmark/throughput.py
index 6c5279e2b36..6fdd41847bb 100755
--- a/tensorrt_llm/bench/benchmark/throughput.py
+++ b/tensorrt_llm/bench/benchmark/throughput.py
@@ -15,7 +15,7 @@
 
 # isort: off
 from tensorrt_llm.bench.benchmark.utils.general import (
-    get_settings_from_engine, get_settings)
+    get_settings_from_engine, get_settings, ALL_SUPPORTED_BACKENDS)
 # isort: on
 from tensorrt_llm import LLM as PyTorchLLM
 from tensorrt_llm._tensorrt_engine import LLM
@@ -45,7 +45,7 @@
     help="Path to a serialized TRT-LLM engine.",
 )
 @optgroup.option("--backend",
-                 type=click.Choice(["pytorch", "tensorrt", "_autodeploy"]),
+                 type=click.Choice(ALL_SUPPORTED_BACKENDS),
                  default="pytorch",
                  help="The backend to use when running benchmarking.")
 @optgroup.option(
@@ -305,10 +305,11 @@ def throughput_command(
         logger.info(metadata.get_summary_for_print())
 
     # Engine configuration parsing
-    if backend and backend.lower() in ["pytorch", "_autodeploy"]:
+    if backend and backend.lower() in ALL_SUPPORTED_BACKENDS and backend.lower(
+    ) != "tensorrt":
         # If we're dealing with a model name, perform a snapshot download to
         # make sure we have a local copy of the model.
-        if checkpoint_path is None:
+        if bench_env.checkpoint_path is None:
             snapshot_download(model)
 
         exec_settings = get_settings(params, metadata, bench_env.model,
diff --git a/tensorrt_llm/bench/benchmark/utils/general.py b/tensorrt_llm/bench/benchmark/utils/general.py
index 153e262276f..0073ea1d44f 100755
--- a/tensorrt_llm/bench/benchmark/utils/general.py
+++ b/tensorrt_llm/bench/benchmark/utils/general.py
@@ -22,6 +22,8 @@
     QuantAlgo.NVFP4.value: "fp8",
 }
 
+ALL_SUPPORTED_BACKENDS = ["pytorch", "_autodeploy", "tensorrt"]
+
 
 def get_settings_from_engine(
     engine_path: Path
diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py
index 7a3becba969..d7bd8c0f2d5 100644
--- a/tests/integration/defs/test_e2e.py
+++ b/tests/integration/defs/test_e2e.py
@@ -872,8 +872,8 @@ def test_trtllm_bench_latency_sanity(llm_root, llm_venv, engine_dir,
                                                                 streaming=True)
 
     benchmark_cmd = \
-        f"trtllm-bench --model {model_path} latency --engine_dir {engine_path} " \
-        f"--dataset {dataset_path}"
+        f"trtllm-bench --model {model_name} --model_path {model_path} latency " \
+        f"--engine_dir {engine_path} --dataset {dataset_path} --backend tensorrt"
     check_call(benchmark_cmd, shell=True)
 
 
@@ -916,8 +916,8 @@ def test_trtllm_bench_request_rate_and_concurrency(llm_root, llm_venv,
                                                                 streaming=False)
 
     benchmark_cmd = \
-        f"trtllm-bench --model {model_path} throughput --engine_dir {engine_path} " \
-        f"--dataset {dataset_path}"
+        f"trtllm-bench --model {model_name} --model_path {model_path} throughput " \
+        f"--engine_dir {engine_path} --dataset {dataset_path} --backend tensorrt"
 
     if request_rate:
         benchmark_cmd += " --request_rate 100"

From 6a47cac9814e637796e3856b6eead5c545d34db2 Mon Sep 17 00:00:00 2001
From: Aurelien Chartier <2567591+achartier@users.noreply.github.com>
Date: Tue, 15 Jul 2025 17:52:43 -0700
Subject: [PATCH 23/88] feat: Add support for Triton request cancellation
 (#5898)

Signed-off-by: Aurelien Chartier <2567591+achartier@users.noreply.github.com>
---
 .../defs/triton_server/test_triton_llm.py     |  18 ++
 .../all_models/llmapi/tensorrt_llm/1/model.py | 143 ++++++++-
 .../llmapi/tensorrt_llm/config.pbtxt          |   6 +
 triton_backend/tools/llmapi_client.py         | 275 ++++++++++++++++++
 4 files changed, 436 insertions(+), 6 deletions(-)
 create mode 100755 triton_backend/tools/llmapi_client.py

diff --git a/tests/integration/defs/triton_server/test_triton_llm.py b/tests/integration/defs/triton_server/test_triton_llm.py
index 5bdaa7731ff..fdf36756d35 100644
--- a/tests/integration/defs/triton_server/test_triton_llm.py
+++ b/tests/integration/defs/triton_server/test_triton_llm.py
@@ -2,6 +2,7 @@
 import sys
 
 import pytest
+import torch
 import yaml
 
 sys.path.append(os.path.join(os.environ["LLM_ROOT"], "triton_backend"))
@@ -3633,6 +3634,9 @@ def test_llmapi_backend(E2E_MODEL_NAME, DECOUPLED_MODE, TRITON_MAX_BATCH_SIZE,
                         llm_backend_dataset_root):
     llm_backend_repo_root = os.environ["LLM_BACKEND_ROOT"]
 
+    if torch.cuda.device_count() < int(TENSOR_PARALLEL_SIZE):
+        pytest.skip("Skipping. Not enough GPUs.")
+
     # Prepare model repo
     new_model_repo = os.path.join(llm_backend_repo_root, "triton_repo")
     prepare_llmapi_model_repo(llm_backend_repo_root, new_model_repo)
@@ -3708,6 +3712,20 @@ def test_llmapi_backend(E2E_MODEL_NAME, DECOUPLED_MODE, TRITON_MAX_BATCH_SIZE,
             print_info("DEBUG:: run_cmd: python3 " + " ".join(run_cmd))
             venv_check_call(llm_backend_venv, run_cmd)
 
+            # Test request cancellation with stop request
+            run_cmd = [
+                f"{llm_backend_repo_root}/tools/llmapi_client.py",
+                "--request-output-len=200", '--stop-after-ms=25'
+            ]
+
+            output = venv_check_output(llm_backend_venv, run_cmd)
+            assert 'Request is cancelled' in output
+
+            # Test request cancellation with  request cancel
+            run_cmd += ['--stop-via-request-cancel']
+            output = venv_check_output(llm_backend_venv, run_cmd)
+            assert 'Request is cancelled' in output
+
 
 @pytest.mark.parametrize("E2E_MODEL_NAME", ["ensemble", "tensorrt_llm_bls"])
 @pytest.mark.parametrize("ACCUMULATE_TOKEN", ["False"])
diff --git a/triton_backend/all_models/llmapi/tensorrt_llm/1/model.py b/triton_backend/all_models/llmapi/tensorrt_llm/1/model.py
index 7f2f00a31c5..2594fd47760 100755
--- a/triton_backend/all_models/llmapi/tensorrt_llm/1/model.py
+++ b/triton_backend/all_models/llmapi/tensorrt_llm/1/model.py
@@ -29,8 +29,12 @@
 import json
 import os
 import queue
+import sys
 import threading
 from contextlib import asynccontextmanager
+from dataclasses import dataclass
+from random import randint
+from typing import Any
 
 import numpy as np
 import pandas as pd
@@ -44,9 +48,18 @@
 
 from tensorrt_llm import LLM, SamplingParams
 from tensorrt_llm._utils import global_mpi_rank, global_mpi_size
+from tensorrt_llm.llmapi.llm import RequestOutput
 from tensorrt_llm.llmapi.llm_utils import update_llm_args_with_extra_dict
 
 
+@dataclass
+class RequestData:
+    triton_req_id: int
+    triton_user_id: str
+    triton_request: Any
+    response_iterator: RequestOutput
+
+
 def get_model_config(filename, include_keys=None, exclude_keys=None):
     engine_args_filepath = os.path.join(pb_utils.get_model_dir(), filename)
     engine_config = None
@@ -73,6 +86,22 @@ def get_model_config(filename, include_keys=None, exclude_keys=None):
     return engine_config
 
 
+def get_input_scalar_by_name(request,
+                             name,
+                             expected_batch_size=1,
+                             batch_index=0):
+    tensor = pb_utils.get_input_tensor_by_name(request, name)
+    if tensor is None:
+        return None
+    tensor = tensor.as_numpy()
+
+    if tensor.size != expected_batch_size:
+        raise pb_utils.TritonModelException(
+            f"Expected a scalar tensor for tensor {name}")
+
+    return tensor.item(batch_index)
+
+
 class TritonPythonModel:
 
     @classmethod
@@ -138,6 +167,13 @@ def initialize(self, args):
                 f"[trtllm] rank{global_mpi_rank()} is starting trtllm engine with args: {self.llm_engine_args}"
             )
 
+            triton_config = get_model_config(
+                os.environ.get('LLM_CONFIG_PATH', 'model.yaml'),
+                include_keys=["triton_config"])["triton_config"]
+            self.cancellation_check_period_ms = int(
+                triton_config["cancellation_check_period_ms"]
+            ) if "cancellation_check_period_ms" in triton_config else 100
+
             if global_mpi_size() > 1:
                 mpi_session = MpiCommSession(comm=COMM_WORLD,
                                              n_workers=COMM_WORLD.Get_size())
@@ -146,11 +182,21 @@ def initialize(self, args):
             # Starting the TRT-LLM engine with LLM API and its event thread running the AsyncIO event loop.
             self._init_engine()
 
+            self.running = False
+
             # Starting the response thread. It allows TRT-LLM to keep making progress while
             # response sender(s) are sending responses to server frontend.
             self._response_queue = queue.Queue()
             self._response_thread = threading.Thread(target=self._response_loop)
             self._response_thread.start()
+
+            self.req_id_to_request_data = {}
+            self.triton_user_id_to_req_ids = {}
+            self.lock = threading.Lock()
+            self.cancellation_thread = threading.Thread(
+                target=self.cancellation_loop)
+            self.running = True
+            self.cancellation_thread.start()
         else:
             self.logger.log_info(
                 f"[trtllm] rank{global_mpi_rank()} is waiting for the leader node..."
@@ -268,6 +314,50 @@ def _response_loop(self):
                 if response_flag == pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL:
                     self._ongoing_request_count -= 1
 
+    def cancellation_loop(self):
+        """Checks if any pending requests have been cancelled."""
+        while self.running:
+            import time
+            time.sleep(self.cancellation_check_period_ms / 1000.0)
+            with self.lock:
+                cancelled_req_ids = []
+                for req_id, request_data in self.req_id_to_request_data.items():
+                    if request_data.triton_request.is_cancelled():
+                        request_data.response_iterator.abort()
+
+                        response_sender = request_data.triton_request.get_response_sender(
+                        )
+                        response_sender.send(
+                            pb_utils.InferenceResponse(
+                                error=pb_utils.TritonError(
+                                    "Request cancelled by client",
+                                    pb_utils.TritonError.CANCELLED)),
+                            flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
+                        cancelled_req_ids.append(req_id)
+                for req_id in cancelled_req_ids:
+                    del self.req_id_to_request_data[req_id]
+
+    def handle_stop_request(self, triton_user_id, response_sender):
+        if triton_user_id is None or triton_user_id == "":
+            response_sender.send(
+                pb_utils.InferenceResponse(error=pb_utils.TritonError(
+                    "A request id must be provided for request cancellation")),
+                flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
+            return
+
+        with self.lock:
+            if triton_user_id in self.triton_user_id_to_req_ids:
+                req_ids = self.triton_user_id_to_req_ids[triton_user_id]
+                for req_id in req_ids:
+                    request_data = self.req_id_to_request_data[req_id]
+                    request_data.response_iterator.abort()
+                    del self.req_id_to_request_data[req_id]
+
+        response_sender.send(
+            pb_utils.InferenceResponse(error=pb_utils.TritonError(
+                "Request cancelled by client", pb_utils.TritonError.CANCELLED)),
+            flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
+
     def execute(self, requests):
         """
         Function is called by Triton server when a new request is received.
@@ -295,6 +385,13 @@ async def _execute_single_request(self, request):
         Execute a single inference request asynchronously.
         """
         response_sender = request.get_response_sender()
+        triton_user_id = request.request_id()
+
+        stop = get_input_scalar_by_name(request, 'stop')
+        if stop:
+            self.handle_stop_request(triton_user_id, response_sender)
+            return
+
         response_state = {
             "response_sender": response_sender,
             "is_cancelled": False,
@@ -303,6 +400,10 @@ async def _execute_single_request(self, request):
         }
         self._ongoing_request_count += 1
         decrement_ongoing_request_count = True
+
+        # Unique request id used to identify each triton request
+        triton_req_id = str(randint(0, sys.maxsize))
+
         try:
             # TODO: [JIRA-4496] Implement when request contains batched prompts
             (prompt, sampling_params, streaming,
@@ -314,8 +415,20 @@ async def _execute_single_request(self, request):
             response_iterator = self._llm_engine.generate_async(
                 prompt, SamplingParams(**sampling_params), streaming)
 
+            with self.lock:
+                self.req_id_to_request_data[triton_req_id] = RequestData(
+                    triton_req_id=triton_req_id,
+                    triton_user_id=request.request_id(),
+                    triton_request=request,
+                    response_iterator=response_iterator,
+                )
+                if triton_user_id is not None and triton_user_id != "" and triton_user_id:
+                    self.triton_user_id_to_req_ids[triton_user_id] = set()
+                    # TODO: [JIRA-4496] Add all batched request ids to the set
+                    self.triton_user_id_to_req_ids[triton_user_id].add(
+                        triton_req_id)
+
             async for request_output in response_iterator:
-                # TODO: [JIRA-4040] Add request cancellation check here
                 # Send each response if streaming.
                 if streaming:
                     response = self._create_response(
@@ -332,11 +445,19 @@ async def _execute_single_request(self, request):
 
             # Send the last response which contains all the outputs if not streaming.
             if not streaming:
-                response_sender.send(
-                    self._create_response(request_output=request_output,
-                                          output_config=output_config),
-                    flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL,
-                )
+                # If the request was cancelled, we don't need to send the last response
+                with self.lock:
+                    was_cancelled = triton_req_id not in self.req_id_to_request_data
+                    if not was_cancelled:
+                        # Remove the request from the request data map so the cancellation loop stops querying
+                        # is_cancelled() on the request
+                        del self.req_id_to_request_data[triton_req_id]
+
+                if not was_cancelled:
+                    response_sender.send(
+                        self._create_response(request_output=request_output,
+                                              output_config=output_config),
+                        flags=pb_utils.TRITONSERVER_RESPONSE_COMPLETE_FINAL)
 
         except Exception as e:
             self.logger.log_error(f"[trtllm] Error generating request: {e}")
@@ -352,6 +473,11 @@ async def _execute_single_request(self, request):
         finally:
             if decrement_ongoing_request_count:
                 self._ongoing_request_count -= 1
+            with self.lock:
+                if triton_req_id in self.req_id_to_request_data:
+                    del self.req_id_to_request_data[triton_req_id]
+                if triton_user_id is not None and triton_user_id != "" and triton_user_id in self.triton_user_id_to_req_ids:
+                    del self.triton_user_id_to_req_ids[triton_user_id]
 
     def _convert_request(self, request):
         """Helper function to convert the request into a prompt for LLM.generate_async
@@ -553,6 +679,11 @@ def finalize(self):
             self._response_thread.join()
             self._response_thread = None
 
+        if self.cancellation_thread is not None:
+            self.running = False
+            self.cancellation_thread.join()
+            self.cancellation_thread = None
+
         # When using parallel tensors, the stub process may not shutdown due to
         # unreleased references, so manually run the garbage collector once.
         self.logger.log_info(
diff --git a/triton_backend/all_models/llmapi/tensorrt_llm/config.pbtxt b/triton_backend/all_models/llmapi/tensorrt_llm/config.pbtxt
index f4b178b238b..7ec628be9aa 100644
--- a/triton_backend/all_models/llmapi/tensorrt_llm/config.pbtxt
+++ b/triton_backend/all_models/llmapi/tensorrt_llm/config.pbtxt
@@ -142,6 +142,12 @@ input [
     data_type: TYPE_BOOL
     dims: [1]
     optional: true
+  },
+  {
+    name: "stop"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    optional: true
   }
 ]
 ###################################################################
diff --git a/triton_backend/tools/llmapi_client.py b/triton_backend/tools/llmapi_client.py
new file mode 100755
index 00000000000..bd63254faaf
--- /dev/null
+++ b/triton_backend/tools/llmapi_client.py
@@ -0,0 +1,275 @@
+#!/usr/bin/env python
+# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import argparse
+import queue
+import sys
+import time
+from functools import partial
+
+import numpy as np
+import tritonclient.grpc as grpcclient
+from tritonclient.utils import InferenceServerException, np_to_triton_dtype
+
+
+def prepare_tensor(name, input):
+    t = grpcclient.InferInput(name, input.shape,
+                              np_to_triton_dtype(input.dtype))
+    t.set_data_from_numpy(input)
+    return t
+
+
+def _prepare_inputs(prompt, output_len):
+    inputs = [
+        prepare_tensor("text_input", prompt),
+        prepare_tensor("sampling_param_max_tokens",
+                       np.array([output_len], dtype=np.int32)),
+    ]
+    return inputs
+
+
+def prepare_stop_signals():
+
+    inputs = [
+        grpcclient.InferInput('text_input', [1], "BYTES"),
+        grpcclient.InferInput('stop', [1], "BOOL"),
+    ]
+
+    inputs[0].set_data_from_numpy(np.empty([1], dtype=np.bytes_))
+    inputs[1].set_data_from_numpy(np.array([True], dtype='bool'))
+
+    return inputs
+
+
+class UserData:
+
+    def __init__(self):
+        self._completed_requests = queue.Queue()
+
+
+def callback(user_data, result, error):
+    if error:
+        user_data._completed_requests.put(error)
+    else:
+        user_data._completed_requests.put(result)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+        required=False,
+        default=False,
+        help="Enable verbose output",
+    )
+    parser.add_argument(
+        "-u",
+        "--url",
+        type=str,
+        required=False,
+        default="localhost:8001",
+        help="Inference server URL. Default is localhost:8001.",
+    )
+    parser.add_argument('--text',
+                        type=str,
+                        required=False,
+                        default='Born in north-east France, Soyer trained as a',
+                        help='Input text')
+
+    parser.add_argument(
+        "-s",
+        "--ssl",
+        action="store_true",
+        required=False,
+        default=False,
+        help="Enable SSL encrypted channel to the server",
+    )
+    parser.add_argument(
+        "-t",
+        "--stream-timeout",
+        type=float,
+        required=False,
+        default=None,
+        help="Stream timeout in seconds. Default is None.",
+    )
+    parser.add_argument(
+        "-C",
+        "--grpc-compression-algorithm",
+        type=str,
+        required=False,
+        default=None,
+        help=
+        "The compression algorithm to be used when sending request to server. Default is None.",
+    )
+    parser.add_argument(
+        "-S",
+        "--streaming",
+        action="store_true",
+        required=False,
+        default=False,
+        help="Enable streaming mode. Default is False.",
+    )
+    parser.add_argument(
+        "-r",
+        "--root-certificates",
+        type=str,
+        required=False,
+        default=None,
+        help="File holding PEM-encoded root certificates. Default is None.",
+    )
+    parser.add_argument(
+        "-p",
+        "--private-key",
+        type=str,
+        required=False,
+        default=None,
+        help="File holding PEM-encoded private key. Default is None.",
+    )
+    parser.add_argument(
+        "-x",
+        "--certificate-chain",
+        type=str,
+        required=False,
+        default=None,
+        help="File holding PEM-encoded certificate chain. Default is None.",
+    )
+    parser.add_argument(
+        "--request-output-len",
+        type=int,
+        required=False,
+        default=16,
+        help="Request output length",
+    )
+    parser.add_argument(
+        '--stop-after-ms',
+        type=int,
+        required=False,
+        default=0,
+        help='Early stop the generation after a few milliseconds')
+    parser.add_argument(
+        "--stop-via-request-cancel",
+        action="store_true",
+        required=False,
+        default=False,
+        help="Early stop use request cancellation instead of stop request")
+
+    parser.add_argument('--request-id',
+                        type=str,
+                        default='1',
+                        required=False,
+                        help='The request_id for the request')
+    parser.add_argument(
+        "--return-perf-metrics",
+        action="store_true",
+        required=False,
+        default=False,
+        help="Return per-request perf metrics",
+    )
+    parser.add_argument('--model-name',
+                        type=str,
+                        required=False,
+                        default='tensorrt_llm',
+                        help='Specify model name')
+
+    FLAGS = parser.parse_args()
+
+    input_data = np.array([FLAGS.text], dtype=object)
+
+    output_len = FLAGS.request_output_len
+
+    inputs = _prepare_inputs(input_data, output_len)
+
+    stop_inputs = None
+    if FLAGS.stop_after_ms > 0 and not FLAGS.stop_via_request_cancel:
+        stop_inputs = prepare_stop_signals()
+
+    request_id = FLAGS.request_id
+    user_data = UserData()
+    with grpcclient.InferenceServerClient(
+            url=FLAGS.url,
+            verbose=FLAGS.verbose,
+            ssl=FLAGS.ssl,
+            root_certificates=FLAGS.root_certificates,
+            private_key=FLAGS.private_key,
+            certificate_chain=FLAGS.certificate_chain,
+    ) as triton_client:
+        try:
+            # Send request
+            infer_future = triton_client.async_infer(
+                FLAGS.model_name,
+                inputs,
+                outputs=None,
+                request_id=request_id,
+                callback=partial(callback, user_data),
+                parameters={'Streaming': FLAGS.streaming})
+
+            expected_responses = 1
+
+            if FLAGS.stop_after_ms > 0:
+
+                time.sleep(FLAGS.stop_after_ms / 1000.0)
+
+                if FLAGS.stop_via_request_cancel:
+                    infer_future.cancel()
+                else:
+                    triton_client.async_infer(
+                        FLAGS.model_name,
+                        stop_inputs,
+                        request_id=request_id,
+                        callback=partial(callback, user_data),
+                        parameters={'Streaming': FLAGS.streaming})
+
+            processed_count = 0
+            while processed_count < expected_responses:
+                try:
+                    result = user_data._completed_requests.get()
+                    print("Got completed request", flush=True)
+                except Exception:
+                    break
+
+                if type(result) == InferenceServerException:
+                    if result.status() == "StatusCode.CANCELLED":
+                        print("Request is cancelled")
+                    else:
+                        print("Received an error from server:")
+                        print(result)
+                        raise result
+                else:
+                    print(
+                        f'Output text: {result.as_numpy("text_output")[0].decode("utf-8")}'
+                    )
+
+                processed_count += 1
+
+        except Exception as e:
+            err = "Encountered error: " + str(e)
+            print(err)
+            sys.exit(err)
+
+        sys.exit(0)

From 665b4469b3b5f6a3ce5c5aa7eeb37826dd52c0bf Mon Sep 17 00:00:00 2001
From: Iman Tabrizian <10105175+Tabrizian@users.noreply.github.com>
Date: Tue, 15 Jul 2025 20:17:22 -0700
Subject: [PATCH 24/88] [fix] Fix Triton build (#6076)

Signed-off-by: Iman Tabrizian <10105175+tabrizian@users.noreply.github.com>
---
 jenkins/Build.groovy | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/jenkins/Build.groovy b/jenkins/Build.groovy
index 81193d2ddd5..bb8fd7816ce 100644
--- a/jenkins/Build.groovy
+++ b/jenkins/Build.groovy
@@ -429,7 +429,8 @@ def runLLMBuild(pipeline, buildFlags, tarName, is_linux_x86_64)
 
     // Build tritonserver artifacts
     def llmPath = sh (script: "realpath ${LLM_ROOT}",returnStdout: true).trim()
-    sh "cd ${LLM_ROOT}/triton_backend/inflight_batcher_llm && mkdir build && cd build && cmake .. -DTRTLLM_DIR=${llmPath} -DUSE_CXX11_ABI=ON && make -j${BUILD_JOBS} install"
+    // TODO: Remove after the cmake version is upgraded to 3.31.8
+    sh "cd ${LLM_ROOT}/triton_backend/inflight_batcher_llm && mkdir build && cd build && cmake .. -DTRTLLM_DIR=${llmPath} -DTRITON_COMMON_REPO_TAG=r25.05 -DTRITON_CORE_REPO_TAG=r25.05 -DTRITON_THIRD_PARTY_REPO_TAG=r25.05 -DTRITON_BACKEND_REPO_TAG=r25.05 -DUSE_CXX11_ABI=ON && make -j${BUILD_JOBS} install"
 
     // Step 3: packaging wheels into tarfile
     sh "cp ${LLM_ROOT}/build/tensorrt_llm-*.whl TensorRT-LLM/"

From 8679a058a36a593e8aa8003513c17ede9ed7190a Mon Sep 17 00:00:00 2001
From: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com>
Date: Wed, 16 Jul 2025 11:39:41 +0800
Subject: [PATCH 25/88] fix: Unable to load phi4-model with tp_size>1 (#5962)

Signed-off-by: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com>
---
 tensorrt_llm/models/phi3/convert.py           |  5 ++--
 tensorrt_llm/models/phi3/split_weights.py     | 29 ++++++++++++-------
 .../defs/accuracy/references/mmlu.yaml        |  4 +++
 .../defs/accuracy/test_cli_flow.py            |  7 +++++
 .../test_lists/qa/examples_test_list.txt      |  1 +
 5 files changed, 33 insertions(+), 13 deletions(-)

diff --git a/tensorrt_llm/models/phi3/convert.py b/tensorrt_llm/models/phi3/convert.py
index ddd654dadd4..1f11408b694 100644
--- a/tensorrt_llm/models/phi3/convert.py
+++ b/tensorrt_llm/models/phi3/convert.py
@@ -126,8 +126,9 @@ def get_moe_weight(key, suffix):
                 if "qkv." in key:
                     weights[key] = shuffle_qkv_weights(weights[key], config)
 
-    if config.architecture in ['Phi3SmallForCausalLM', "PhiMoEForCausalLM"
-                               ] and config.mapping.has_tp():
+    if config.architecture in [
+            'Phi3SmallForCausalLM', "PhiMoEForCausalLM", "Phi3ForCausalLM"
+    ] and config.mapping.has_tp():
         weights = split_weights_tp(config, weights, torch_dtype)
 
     return weights
diff --git a/tensorrt_llm/models/phi3/split_weights.py b/tensorrt_llm/models/phi3/split_weights.py
index 62a8891230c..bca33ba5511 100644
--- a/tensorrt_llm/models/phi3/split_weights.py
+++ b/tensorrt_llm/models/phi3/split_weights.py
@@ -145,19 +145,23 @@ def get_quant_weight(weight, prefix, bias):
                 -1, hidden_size)
             split_weight = torch.cat(
                 [split(x, tp_size, rank) for x in [q, k, v]], dim=0)
-
-            qkv_bias = qkv_bias.reshape(num_q_per_kv + 2, -1)
-            q = qkv_bias[:num_q_per_kv, :].reshape(-1)
-            k = qkv_bias[num_q_per_kv:num_q_per_kv + 1, :].reshape(-1)
-            v = qkv_bias[num_q_per_kv + 1:num_q_per_kv + 2, :].reshape(-1)
-            split_bias = torch.cat([split(x, tp_size, rank) for x in [q, k, v]],
-                                   dim=0)
+            if qkv_bias is not None:
+                qkv_bias = qkv_bias.reshape(num_q_per_kv + 2, -1)
+                q = qkv_bias[:num_q_per_kv, :].reshape(-1)
+                k = qkv_bias[num_q_per_kv:num_q_per_kv + 1, :].reshape(-1)
+                v = qkv_bias[num_q_per_kv + 1:num_q_per_kv + 2, :].reshape(-1)
+                split_bias = torch.cat(
+                    [split(x, tp_size, rank) for x in [q, k, v]], dim=0)
+            else:
+                split_bias = None
         else:
             split_weight = split_qkv_tp(qkv_weight, num_heads, hidden_size,
                                         tp_size, rank)
-            split_bias = split_qkv_bias_tp(qkv_bias, num_heads, hidden_size,
-                                           tp_size, rank)
-
+            if qkv_bias is not None:
+                split_bias = split_qkv_bias_tp(qkv_bias, num_heads, hidden_size,
+                                               tp_size, rank)
+            else:
+                split_bias = None
         weights.update(get_quant_weight(split_weight, prefix, split_bias))
 
         prefix = layer_prefix + 'attention.dense'
@@ -171,7 +175,10 @@ def get_quant_weight(weight, prefix, bias):
             mlp_fc_weight, mlp_fc_bias = get_weight_and_bias(
                 weights, prefix, dtype)
             split_v = split_matrix_tp(mlp_fc_weight, tp_size, rank, dim=0)
-            bias = split_matrix_tp(mlp_fc_bias, tp_size, rank, dim=0)
+            if mlp_fc_bias is not None:
+                bias = split_matrix_tp(mlp_fc_bias, tp_size, rank, dim=0)
+            else:
+                bias = None
             weights.update(get_quant_weight(split_v, prefix, bias))
         else:
             mlp_fc_weight = get_weight(weights, prefix, dtype)
diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index 75774693e6f..7beba282671 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -178,6 +178,10 @@ nvidia/Nemotron-H-8B-Base-8K:
     accuracy: 69.180
 microsoft/Phi-4-mini-instruct:
   - accuracy: 68.98
+# Created a dummy accuracy to track tp_size=2 for phi4-mini model.
+# TODO: update once https://nvbugs/5393849 is fixed.
+microsoft/Phi-4-mini-instruct-tp2:
+  - accuracy: 0.0
 nvidia/Llama-3_1-Nemotron-Ultra-253B-v1:
   - accuracy: 83.70
   - quant_algo: FP8
diff --git a/tests/integration/defs/accuracy/test_cli_flow.py b/tests/integration/defs/accuracy/test_cli_flow.py
index cd0090d4652..a5ab844dfbc 100644
--- a/tests/integration/defs/accuracy/test_cli_flow.py
+++ b/tests/integration/defs/accuracy/test_cli_flow.py
@@ -367,6 +367,13 @@ class TestPhi4MiniInstruct(CliFlowAccuracyTestHarness):
     def test_auto_dtype(self):
         self.run(tasks=[MMLU(self.MODEL_NAME)], dtype='auto')
 
+    @pytest.mark.skip_less_device(2)
+    def test_tp2(self):
+        # Created a dummy accuracy to track tp_size=2 for phi4-mini model.
+        # TODO: update once https://nvbugs/5393849 is fixed.
+        MODEL_NAME = "microsoft/Phi-4-mini-instruct-tp2"
+        self.run(tasks=[MMLU(MODEL_NAME)], tp_size=2)
+
 
 # Long sequence length test:
 # Model FP16 7B + 32K tokens in KV cache = 14 * 1024 MB + 32K * 0.5 MB = 30720 MB + scratch memory
diff --git a/tests/integration/test_lists/qa/examples_test_list.txt b/tests/integration/test_lists/qa/examples_test_list.txt
index 70917793a90..774a5b84ecd 100644
--- a/tests/integration/test_lists/qa/examples_test_list.txt
+++ b/tests/integration/test_lists/qa/examples_test_list.txt
@@ -305,6 +305,7 @@ accuracy/test_cli_flow.py::TestPhi3Small8kInstruct::test_auto_dtype
 accuracy/test_cli_flow.py::TestPhi3Small128kInstruct::test_auto_dtype
 accuracy/test_cli_flow.py::TestPhi3_5MiniInstruct::test_auto_dtype
 accuracy/test_cli_flow.py::TestPhi4MiniInstruct::test_auto_dtype
+accuracy/test_cli_flow.py::TestPhi4MiniInstruct::test_tp2
 accuracy/test_cli_flow.py::TestLongAlpaca7B::test_auto_dtype
 accuracy/test_cli_flow.py::TestLongAlpaca7B::test_multiblock_aggressive
 accuracy/test_cli_flow.py::TestMamba130M::test_auto_dtype

From e51c541617a7c552027eab0cfaf3c315e5105c7d Mon Sep 17 00:00:00 2001
From: Yiqing Yan <yiqingy@nvidia.com>
Date: Wed, 16 Jul 2025 13:02:23 +0800
Subject: [PATCH 26/88] chore: Bump version to 1.0.0rc4 (#6086)

Signed-off-by: Yiqing Yan <yiqingy@nvidia.com>
---
 README.md                | 2 +-
 examples/constraints.txt | 2 +-
 tensorrt_llm/version.py  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 99b00e26195..ce6fcc9cc88 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@ TensorRT-LLM
 [![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
 [![cuda](https://img.shields.io/badge/cuda-12.9.0-green)](https://developer.nvidia.com/cuda-downloads)
 [![trt](https://img.shields.io/badge/TRT-10.11.0-green)](https://developer.nvidia.com/tensorrt)
-[![version](https://img.shields.io/badge/release-1.0.0rc3-green)](./tensorrt_llm/version.py)
+[![version](https://img.shields.io/badge/release-1.0.0rc4-green)](./tensorrt_llm/version.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)
 
 [Architecture](./docs/source/torch/arch_overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Performance](./docs/source/performance/perf-overview.md)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](./docs/source/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Roadmap](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap)
diff --git a/examples/constraints.txt b/examples/constraints.txt
index bec7f2270a3..ff505acd0cc 100644
--- a/examples/constraints.txt
+++ b/examples/constraints.txt
@@ -1,3 +1,3 @@
-tensorrt_llm==1.0.0rc3
+tensorrt_llm==1.0.0rc4
 evaluate~=0.4.1
 rouge_score~=0.1.2
diff --git a/tensorrt_llm/version.py b/tensorrt_llm/version.py
index ebc5c92c9c1..63def6d5fee 100644
--- a/tensorrt_llm/version.py
+++ b/tensorrt_llm/version.py
@@ -12,4 +12,4 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-__version__ = "1.0.0rc3"
+__version__ = "1.0.0rc4"

From 509dc7c831dd35df8dbdbf173dea0efb67e806b8 Mon Sep 17 00:00:00 2001
From: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com>
Date: Wed, 16 Jul 2025 13:10:48 +0800
Subject: [PATCH 27/88] chroe: upgrade modelopt to 0.33 (#6058)

Signed-off-by: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com>
---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 88bf8552f7b..a10401ecafd 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -25,7 +25,7 @@ tensorrt~=10.11.0
 # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-05.html#rel-25-05 uses 2.8.0a0.
 torch>=2.7.1,<=2.8.0a0
 torchvision
-nvidia-modelopt[torch]~=0.31.0
+nvidia-modelopt[torch]~=0.33.0
 nvidia-nccl-cu12
 nvidia-cuda-nvrtc-cu12
 transformers==4.53.1

From 385af53a4de97748cf656289f6f623601aca2ee1 Mon Sep 17 00:00:00 2001
From: Zheng Duan <200704041+zhengd-nv@users.noreply.github.com>
Date: Wed, 16 Jul 2025 13:52:13 +0800
Subject: [PATCH 28/88] [nvbug/5347489][nvbug/5388036] increase timeout in
 disagg worker test (#6041)

Signed-off-by: zhengd-nv <200704041+zhengd-nv@users.noreply.github.com>
---
 .../defs/disaggregated/test_workers.py        | 25 +++++++++++--------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/tests/integration/defs/disaggregated/test_workers.py b/tests/integration/defs/disaggregated/test_workers.py
index 6731270cf64..1916af70917 100644
--- a/tests/integration/defs/disaggregated/test_workers.py
+++ b/tests/integration/defs/disaggregated/test_workers.py
@@ -64,21 +64,26 @@ def run_disaggregated_workers(
     return workers_proc, ctx_servers, gen_servers
 
 
+DEFAULT_TIMEOUT_SERVER_START = 900
+DEFAULT_TIMEOUT_REQUEST = 180
+
+
 class BasicWorkerTester:
 
     def __init__(self,
                  ctx_servers: List[str],
                  gen_servers: List[str],
-                 req_timeout_secs: int = 180,
-                 server_start_timeout_secs: int = 180):
+                 req_timeout_secs: int = DEFAULT_TIMEOUT_REQUEST,
+                 server_start_timeout_secs: int = DEFAULT_TIMEOUT_SERVER_START):
         self.ctx_servers = ctx_servers
         self.gen_servers = gen_servers
         self.req_timeout_secs = req_timeout_secs
         self.server_start_timeout_secs = server_start_timeout_secs
 
     async def new_session(self):
-        session = aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(
-            total=self.req_timeout_secs))
+        session = aiohttp.ClientSession(
+            connector=aiohttp.TCPConnector(force_close=True),
+            timeout=aiohttp.ClientTimeout(total=self.req_timeout_secs))
         await OpenAIDisaggServer.wait_for_all_servers_ready(
             session, self.ctx_servers, self.gen_servers,
             self.server_start_timeout_secs)
@@ -146,8 +151,8 @@ class ConditionalWorkerTester(BasicWorkerTester):
     def __init__(self,
                  ctx_servers: List[str],
                  gen_servers: List[str],
-                 req_timeout_secs: int = 180,
-                 server_start_timeout_secs: int = 180,
+                 req_timeout_secs: int = DEFAULT_TIMEOUT_REQUEST,
+                 server_start_timeout_secs: int = DEFAULT_TIMEOUT_SERVER_START,
                  model_name: str = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"):
         super().__init__(ctx_servers, gen_servers, req_timeout_secs,
                          server_start_timeout_secs)
@@ -199,8 +204,8 @@ class KvCacheEventWorkerTester(BasicWorkerTester):
     def __init__(self,
                  ctx_servers: List[str],
                  gen_servers: List[str],
-                 req_timeout_secs: int = 180,
-                 server_start_timeout_secs: int = 240,
+                 req_timeout_secs: int = DEFAULT_TIMEOUT_REQUEST,
+                 server_start_timeout_secs: int = DEFAULT_TIMEOUT_SERVER_START,
                  model_name: str = "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                  model_path: Optional[str] = None):
         super().__init__(ctx_servers, gen_servers, req_timeout_secs,
@@ -316,8 +321,8 @@ class KvCacheAwareRouterTester(BasicWorkerTester):
     def __init__(self,
                  ctx_servers: List[str],
                  gen_servers: List[str],
-                 req_timeout_secs: int = 180,
-                 server_start_timeout_secs: int = 180,
+                 req_timeout_secs: int = DEFAULT_TIMEOUT_REQUEST,
+                 server_start_timeout_secs: int = DEFAULT_TIMEOUT_SERVER_START,
                  model_name: str = "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                  tokens_per_block: int = 32):
         super().__init__(ctx_servers, gen_servers, req_timeout_secs,

From 38db4bc7fbefa368b0bcc6205514f59537a19cce Mon Sep 17 00:00:00 2001
From: Zheng Duan <200704041+zhengd-nv@users.noreply.github.com>
Date: Wed, 16 Jul 2025 13:52:44 +0800
Subject: [PATCH 29/88] feat: use session abstraction in data transceiver and
 cache formatter (#5611)

Signed-off-by: zhengd-nv <200704041+zhengd-nv@users.noreply.github.com>
---
 .../batch_manager/cacheFormatter.cpp          | 88 +++++++++---------
 .../batch_manager/cacheFormatter.h            | 56 +++---------
 .../batch_manager/dataTransceiver.cpp         |  4 +-
 .../batch_manager/dataTransceiver.h           | 89 ++++++++++++++++++-
 .../batch_manager/dataTransceiverImpl.cpp     | 77 +++++++---------
 .../batch_manager/dataTransceiverImpl.h       |  8 +-
 .../batch_manager/mlaCacheFormatter.cpp       | 66 +++++++-------
 .../batch_manager/mlaCacheFormatter.h         | 13 +--
 .../batch_manager/cacheTransceiverTest.cpp    | 12 +--
 9 files changed, 229 insertions(+), 184 deletions(-)

diff --git a/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp b/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp
index fa6c3cfc4ed..848360b23da 100644
--- a/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp
+++ b/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp
@@ -34,6 +34,7 @@
 #include <cstddef>
 #include <cstdint>
 #include <future>
+#include <numeric>
 
 namespace tensorrt_llm::batch_manager::kv_cache_manager
 {
@@ -116,41 +117,45 @@ void checkAlternateWindow(BaseKVCacheManager* cacheManager, BaseCacheFormatter::
     }
 }
 
-std::vector<executor::kv_cache::Connection const*> CacheFormatter::pickRecvConnections(
-    std::vector<executor::kv_cache::Connection const*> const& connections, CacheState const& selfConfig,
-    SizeType32 selfIdx, CacheState const& destConfig) const
+std::vector<size_t> CacheFormatter::pickRecvConnections(
+    size_t numConnections, CacheState const& selfConfig, SizeType32 selfIdx, CacheState const& destConfig) const
 {
     auto targetInfo = executor::kv_cache::targetIRanks(destConfig, selfConfig, selfIdx);
     if (targetInfo.mPeerDupHeadFactor <= 1)
     {
-        return connections;
+        std::vector<size_t> ret(numConnections);
+        std::iota(ret.begin(), ret.end(), 0);
+        return ret;
     }
-    TLLM_CHECK(connections.size() == targetInfo.mIRanks.size());
+    TLLM_CHECK(numConnections == targetInfo.mIRanks.size());
 
-    std::vector<executor::kv_cache::Connection const*> ret;
+    std::vector<size_t> ret;
     for (int i = 0; i < targetInfo.mDomainTPSize; i++)
     {
         if (i % targetInfo.mPeerDupHeadFactor == 0)
         {
             for (int j = 0; j < targetInfo.mDomainPPSize; j++)
             {
-                ret.push_back(connections.at((i * targetInfo.mDomainPPSize) + j));
+                ret.push_back((i * targetInfo.mDomainPPSize) + j);
             }
         }
     }
     return ret;
 }
 
-void CacheFormatter::formatOutput(LlmRequest const& llmRequest,
-    std::vector<executor::kv_cache::Connection const*> const& connections, CacheState const& selfConfig,
-    SizeType32 selfIdx, CacheState const& destConfig, runtime::BufferManager const& bufferManager)
+void CacheFormatter::format(TransferSession& session)
 {
-    NVTX3_SCOPED_RANGE(formatOutput);
+    NVTX3_SCOPED_RANGE(CacheFormatter_format);
+    auto const& llmRequest = session.getLlmRequest();
     TLLM_LOG_DEBUG(
         mpi::MpiComm::world().getRank(), "Start sending KV cache for request ID: %ld.", llmRequest.mRequestId);
 
     TLLM_CHECK_WITH_INFO(llmRequest.mSamplingConfig.beamWidth == 1, "Currently, only beam width 1 is supported.");
-    TLLM_CHECK(!connections.empty());
+    auto const& connections = session.getConnections();
+    auto const& selfConfig = session.getSelfState().getCacheState().value();
+    auto const& destConfig = session.getOtherState().getCacheState().value();
+    auto const selfIdx = session.getSelfState().getCommState().value().getSelfIdx();
+    auto& bufferManager = session.getBufferManager();
     if (!needSendCache(selfConfig, destConfig, selfIdx))
     {
         return;
@@ -186,10 +191,10 @@ void CacheFormatter::formatOutput(LlmRequest const& llmRequest,
                     TLLM_LOG_DEBUG("Block %p of pool %d shape = %s", it->data(), poolIdx,
                         runtime::ITensor::toString(it->getShape()).c_str());
                 }
-                for (auto const& connection : connections)
+                for (size_t i = 0; i < connections.size(); i++)
                 {
                     TLLM_LOG_DEBUG("Send layer %d(%d-%d)", layerIdx, poolIdx, layerIdxInPool);
-                    TransferHelper::sendBuffer(*connection, *layer, llmRequest.mRequestId);
+                    session.send(i, layer->data(), layer->getSizeInBytes());
                 }
             }
         }
@@ -247,13 +252,13 @@ void CacheFormatter::formatOutput(LlmRequest const& llmRequest,
             TLLM_CHECK(connections.size() == 1);
 
             TLLM_CUDA_CHECK(cudaSetDevice(deviceId));
-            for (auto const& connection : connections)
+            for (size_t i = 0; i < connections.size(); i++)
             {
                 for (auto const& [window, blocks] : inputKvCacheBlocks)
                 {
                     for (auto const& block : blocks)
                     {
-                        TransferHelper::sendBuffer(*connection, *block, llmRequest.mRequestId);
+                        session.send(i, block->data(), block->getSizeInBytes());
                     }
                 }
             }
@@ -308,9 +313,8 @@ void CacheFormatter::formatOutput(LlmRequest const& llmRequest,
             if (bufferIdx < bufferCoverTargetNum)
             {
 
-                size = (*outputSplitCaches[bufferIdx]).getSizeInBytes();
-                TransferHelper::sendBuffer(
-                    *connections[processIdx], *outputSplitCaches[bufferIdx], llmRequest.mRequestId);
+                size = outputSplitCaches[bufferIdx]->getSizeInBytes();
+                session.send(processIdx, outputSplitCaches[bufferIdx]->data(), size);
             }
             else if (bufferCoverTargetNum > 0)
             {
@@ -318,9 +322,8 @@ void CacheFormatter::formatOutput(LlmRequest const& llmRequest,
                 auto sendBufferIdx = bufferIdx % bufferCoverTargetNum;
                 bufferManager.copy(*outputSplitCaches[processIdx], *outputSplitCaches.at(sendBufferIdx));
                 bufferManager.getStream().synchronize();
-                size = (*outputSplitCaches.at(sendBufferIdx)).getSizeInBytes();
-                TransferHelper::sendBuffer(
-                    *connections[processIdx], *outputSplitCaches.at(sendBufferIdx), llmRequest.mRequestId);
+                size = outputSplitCaches.at(sendBufferIdx)->getSizeInBytes();
+                session.send(processIdx, outputSplitCaches.at(sendBufferIdx)->data(), size);
             }
             else
             {
@@ -341,7 +344,7 @@ void CacheFormatter::formatOutput(LlmRequest const& llmRequest,
                     auto copyTargetSlice = runtime::ITensor::slice(preAllocSendBuffer, 0, sendSize);
                     bufferManager.copy(*copySlice, *copyTargetSlice);
                     bufferManager.getStream().synchronize();
-                    TransferHelper::sendBuffer(*connections[processIdx], *copyTargetSlice, llmRequest.mRequestId);
+                    session.send(processIdx, copyTargetSlice->data(), sendSize);
                     remainSendSize -= sendSize;
                 }
             }
@@ -401,18 +404,21 @@ void CacheFormatter::formatOutput(LlmRequest const& llmRequest,
         mpi::MpiComm::world().getRank(), "End the sending of KV cache for the request ID:%ld ", llmRequest.mRequestId);
 }
 
-void CacheFormatter::formatInput(LlmRequest const& llmRequest,
-    std::vector<executor::kv_cache::Connection const*> const& connections, CacheState const& selfConfig,
-    SizeType32 selfIdx, CacheState const& destConfig, runtime::BufferManager const& bufferManager)
+void CacheFormatter::unformat(TransferSession& session)
 {
-    NVTX3_SCOPED_RANGE(formatInput);
+    NVTX3_SCOPED_RANGE(CacheFormatter_unformat);
+    auto const& llmRequest = session.getLlmRequest();
     TLLM_LOG_DEBUG(mpi::MpiComm::world().getRank(),
         "Start receiving KV cache for request ID: %ld, context request ID: %ld.", llmRequest.mRequestId,
         llmRequest.getContextPhaseParams().value().getReqId());
-    TLLM_CHECK(!connections.empty());
+    auto const& connections = session.getConnections();
+    auto const& selfConfig = session.getSelfState().getCacheState().value();
+    auto const& destConfig = session.getOtherState().getCacheState().value();
+    auto const selfIdx = session.getSelfState().getCommState().value().getSelfIdx();
+    auto& bufferManager = session.getBufferManager();
     auto blockRange = getBlockRangeForReceiving(mCacheManager, llmRequest);
 
-    auto pickUpConnections = pickRecvConnections(connections, selfConfig, selfIdx, destConfig);
+    auto pickUpConnections = pickRecvConnections(connections.size(), selfConfig, selfIdx, destConfig);
 
     TLLM_LOG_DEBUG("pickUpConnections size: %d connections size: %d", pickUpConnections.size(), connections.size());
     std::vector<runtime::ITensor::SharedPtr> recvBufferTmps;
@@ -453,7 +459,6 @@ void CacheFormatter::formatInput(LlmRequest const& llmRequest,
     {
         NVTX3_SCOPED_RANGE(formatInputRecvBuffer);
 
-        auto reqId = llmRequest.getContextPhaseParams().value().getReqId();
         auto dataType = mCacheManager->getPrimaryPool(0)->getDataType();
         bool layerWise = common::getEnvDisaggLayerwise() && numPools == 1;
         if (layerWise)
@@ -494,14 +499,14 @@ void CacheFormatter::formatInput(LlmRequest const& llmRequest,
                         TLLM_LOG_DEBUG("Buffer %d of pool %d shape = %s", idx, poolIdx,
                             runtime::ITensor::toString(recvBufferTmps[idx]->getShape()).c_str());
                     }
-                    for (auto const& connection : pickUpConnections)
+                    for (size_t i = 0; i < pickUpConnections.size(); i++)
                     {
                         TLLM_LOG_DEBUG("Receive layer %d(%d-%d)", layerIdx, poolIdx, layerIdxInPool);
                         // Buffer dim: [numLayersInPool * layerVolume]
                         auto layer
                             = runtime::ITensor::slice(recvBufferTmps[idx], layerIdxInPool * layerVolume, layerVolume);
                         llmRequest.updateKvCacheSize((*layer).getSizeInBytes());
-                        TransferHelper::recvBuffer(*connection, *layer, reqId);
+                        session.recv(pickUpConnections[i], layer->data(), layer->getSizeInBytes());
                         idx++;
                     }
                 }
@@ -528,14 +533,14 @@ void CacheFormatter::formatInput(LlmRequest const& llmRequest,
                 TLLM_CHECK(pickUpConnections.size() == 1);
 
                 TLLM_CUDA_CHECK(cudaSetDevice(deviceId));
-                for (auto const& connection : pickUpConnections)
+                for (size_t i = 0; i < pickUpConnections.size(); i++)
                 {
                     for (auto const& [window, blocks] : outputBuffersPerWindow)
                     {
                         for (auto const& block : blocks)
                         {
                             llmRequest.updateKvCacheSize((*block).getSizeInBytes());
-                            TransferHelper::recvBuffer(*connection, *block, reqId);
+                            session.recv(pickUpConnections[i], block->data(), block->getSizeInBytes());
                         }
                     }
                 }
@@ -586,7 +591,7 @@ void CacheFormatter::formatInput(LlmRequest const& llmRequest,
                 else
                 {
                     auto* agentConnnecion
-                        = dynamic_cast<executor::kv_cache::AgentConnection const*>(pickUpConnections[0]);
+                        = dynamic_cast<executor::kv_cache::AgentConnection const*>(connections[pickUpConnections[0]]);
                     if (agentConnnecion != nullptr)
                     {
                         cacheBufferId = agentConnnecion->getCacheBufferId();
@@ -639,8 +644,8 @@ void CacheFormatter::formatInput(LlmRequest const& llmRequest,
                         size_t blockIdx = idx % (blockNum);
                         size_t recvBufferIdx = blockIdx * pickUpConnections.size() + commIdx;
                         llmRequest.updateKvCacheSize((*recvSplitCaches[recvBufferIdx]).getSizeInBytes());
-                        TransferHelper::recvBuffer(
-                            *pickUpConnections[processIdx], *recvSplitCaches.at(recvBufferIdx), reqId);
+                        auto& buffer = recvSplitCaches.at(recvBufferIdx);
+                        session.recv(pickUpConnections[processIdx], buffer->data(), buffer->getSizeInBytes());
                         idx++;
                     }
                 }
@@ -649,15 +654,16 @@ void CacheFormatter::formatInput(LlmRequest const& llmRequest,
                     if (processIdx >= remainNoCoverTargetNum)
                     {
                         llmRequest.updateKvCacheSize((*recvSplitCaches.at(processIdx)).getSizeInBytes());
-                        TransferHelper::recvBuffer(*pickUpConnections[processIdx], *recvSplitCaches[processIdx], reqId);
+                        auto& buffer = recvSplitCaches[processIdx];
+                        session.recv(pickUpConnections[processIdx], buffer->data(), buffer->getSizeInBytes());
                     }
                     else if (bufferCoverTargetNum > 0)
                     {
                         auto recvBufferIdx = processIdx % bufferCoverTargetNum
                             + remainNoCoverTargetNum; // caches.at(recvBufferIdx) is allocated by cudaMalloc
                         llmRequest.updateKvCacheSize((*recvSplitCaches.at(recvBufferIdx)).getSizeInBytes());
-                        TransferHelper::recvBuffer(
-                            *pickUpConnections[processIdx], *recvSplitCaches.at(recvBufferIdx), reqId);
+                        auto& buffer = recvSplitCaches.at(recvBufferIdx);
+                        session.recv(pickUpConnections[processIdx], buffer->data(), buffer->getSizeInBytes());
                         bufferManager.copy(*recvSplitCaches.at(recvBufferIdx), *recvSplitCaches[processIdx]);
                         bufferManager.getStream().synchronize();
                     }
@@ -674,7 +680,7 @@ void CacheFormatter::formatInput(LlmRequest const& llmRequest,
                             auto copySlice = runtime::ITensor::slice(
                                 recvSplitCaches[processIdx], targetBufferSize - remainRecvSize, recvSize);
                             llmRequest.updateKvCacheSize((*recvSlice).getSizeInBytes());
-                            TransferHelper::recvBuffer(*pickUpConnections[processIdx], *recvSlice, reqId);
+                            session.recv(pickUpConnections[processIdx], recvSlice->data(), recvSlice->getSizeInBytes());
                             bufferManager.copy(*recvSlice, *copySlice);
                             bufferManager.getStream().synchronize();
                             remainRecvSize -= recvSize;
diff --git a/cpp/tensorrt_llm/batch_manager/cacheFormatter.h b/cpp/tensorrt_llm/batch_manager/cacheFormatter.h
index a30a3b0a188..36f6f57d169 100644
--- a/cpp/tensorrt_llm/batch_manager/cacheFormatter.h
+++ b/cpp/tensorrt_llm/batch_manager/cacheFormatter.h
@@ -26,36 +26,14 @@
 #include "tensorrt_llm/executor/cache_transmission/cacheSplitConcat.h"
 #include "tensorrt_llm/executor/dataTransceiverState.h"
 #include "tensorrt_llm/runtime/bufferManager.h"
-#include "tensorrt_llm/runtime/iTensor.h"
+#include "tensorrt_llm/runtime/utils/mpiUtils.h"
 #include <NvInferRuntimeBase.h>
-#include <condition_variable>
 #include <cstddef>
 #include <cstdint>
-#include <iterator>
 
 namespace tensorrt_llm::batch_manager::kv_cache_manager
 {
 
-class TransferHelper
-{
-public:
-    static void sendBuffer(
-        executor::kv_cache::Connection const& connection, runtime::IBuffer const& buf, uint64_t requestId)
-    {
-        int const tag = ((requestId & 0xFFF) << 8) | (kDATA_TAG & 0xFF);
-        connection.send(executor::kv_cache::DataContext{tag}, buf.data(), buf.getSizeInBytes());
-    }
-
-    static void recvBuffer(executor::kv_cache::Connection const& connection, runtime::IBuffer& buf, uint64_t requestId)
-    {
-        int const tag = ((requestId & 0xFFF) << 8) | (kDATA_TAG & 0xFF);
-        connection.recv(executor::kv_cache::DataContext{tag}, buf.data(), buf.getSizeInBytes());
-    }
-
-private:
-    static constexpr int32_t kDATA_TAG{43};
-};
-
 BlockRange getBlockRangeForSending(BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest);
 
 BlockRange getBlockRangeForReceiving(BaseKVCacheManager* cacheManager, LlmRequest const& llmRequest);
@@ -67,15 +45,13 @@ class BaseCacheFormatter
     using SizeType32 = tensorrt_llm::runtime::SizeType32;
     using CacheState = executor::kv_cache::CacheState;
 
-    virtual void formatOutput(LlmRequest const& llmRequest,
-        std::vector<executor::kv_cache::Connection const*> const& connections, CacheState const& selfConfig,
-        SizeType32 selfIdx, CacheState const& destConfig, runtime::BufferManager const& bufferManager)
-        = 0;
+    /// @brief Format the cache data into bytes for sending.
+    /// @param session The transfer session.
+    virtual void format(TransferSession& session) = 0;
 
-    virtual void formatInput(LlmRequest const& llmRequest,
-        std::vector<executor::kv_cache::Connection const*> const& connections, CacheState const& selfConfig,
-        SizeType32 selfIdx, CacheState const& destConfig, runtime::BufferManager const& bufferManager)
-        = 0;
+    /// @brief Unformat the cache data from received bytes.
+    /// @param session The transfer session.
+    virtual void unformat(TransferSession& session) = 0;
 
     /// @brief Determine whether the sender is applicable to the source and target.
     /// @param selfConfig Source data arrangement.
@@ -94,9 +70,8 @@ class BaseCacheFormatter
 
     [[nodiscard]] virtual BaseKVCacheManager* getCacheManager() const noexcept = 0;
 
-    [[nodiscard]] virtual std::vector<executor::kv_cache::Connection const*> pickRecvConnections(
-        std::vector<executor::kv_cache::Connection const*> const& connections, CacheState const& selfConfig,
-        SizeType32 selfIdx, CacheState const& destConfig) const
+    [[nodiscard]] virtual std::vector<size_t> pickRecvConnections(
+        size_t numConnections, CacheState const& selfConfig, SizeType32 selfIdx, CacheState const& destConfig) const
         = 0;
 
     /// @brief Destructor.
@@ -116,13 +91,9 @@ class CacheFormatter final : public BaseCacheFormatter
         TLLM_CHECK(mCacheTransBufferManager);
     }
 
-    void formatOutput(LlmRequest const& llmRequest,
-        std::vector<executor::kv_cache::Connection const*> const& connections, CacheState const& selfConfig,
-        SizeType32 selfIdx, CacheState const& destConfig, runtime::BufferManager const& bufferManager) override;
+    void format(TransferSession& session) override;
 
-    void formatInput(LlmRequest const& llmRequest,
-        std::vector<executor::kv_cache::Connection const*> const& connections, CacheState const& selfConfig,
-        SizeType32 selfIdx, CacheState const& destConfig, runtime::BufferManager const& bufferManager) override;
+    void unformat(TransferSession& session) override;
 
     [[nodiscard]] bool inquireSupport(CacheState const& selfConfig, CacheState const& destConfig) const override;
 
@@ -138,9 +109,8 @@ class CacheFormatter final : public BaseCacheFormatter
     }
 
     static bool needSendCache(CacheState const& selfConfig, CacheState const& destConfig, runtime::SizeType32 selfIdx);
-    std::vector<executor::kv_cache::Connection const*> pickRecvConnections(
-        std::vector<executor::kv_cache::Connection const*> const& connections, CacheState const& selfConfig,
-        SizeType32 selfIdx, CacheState const& destConfig) const override;
+    std::vector<size_t> pickRecvConnections(size_t numConnections, CacheState const& selfConfig, SizeType32 selfIdx,
+        CacheState const& destConfig) const override;
 
 private:
     BaseKVCacheManager* mCacheManager;
diff --git a/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp b/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp
index 88f1380e0b3..a4617c0d53d 100644
--- a/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp
+++ b/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp
@@ -363,8 +363,8 @@ class DataRequester::Impl
             llmRequest.getContextPhaseParams().value().getReqId());
         llmRequest.setKvCacheTransferStart(std::chrono::steady_clock::now());
         TLLM_CUDA_CHECK(cudaSetDevice(mDeviceId));
-        mReceiver->sendRequestInfo(llmRequest);
-        mReceiver->receiveSync(llmRequest);
+        auto session = mReceiver->sendRequestInfo(llmRequest);
+        mReceiver->receiveSync(session);
         llmRequest.setKvCacheTransferEnd(std::chrono::steady_clock::now());
 
         TLLM_LOG_DEBUG(mpi::MpiComm::world().getRank(),
diff --git a/cpp/tensorrt_llm/batch_manager/dataTransceiver.h b/cpp/tensorrt_llm/batch_manager/dataTransceiver.h
index 638893f1a97..e92c6112de2 100644
--- a/cpp/tensorrt_llm/batch_manager/dataTransceiver.h
+++ b/cpp/tensorrt_llm/batch_manager/dataTransceiver.h
@@ -34,6 +34,11 @@
 namespace tensorrt_llm::batch_manager
 {
 
+// TODO: unify the following class into a namespace like tensorrt_llm::transmission
+using DataContext = tensorrt_llm::executor::kv_cache::DataContext;
+using Connection = tensorrt_llm::executor::kv_cache::Connection;
+using ConnectionManager = tensorrt_llm::executor::kv_cache::ConnectionManager;
+
 // Used to store the information that needs to be sent to the context executor to ensure the generation
 // executor smoothly receives the data.
 class RequestInfo
@@ -89,6 +94,84 @@ class RequestInfo
     executor::DataTransceiverState mTransState;
 };
 
+class TransferSession
+{
+public:
+    TransferSession(std::vector<Connection const*> connections, DataContext dataContext,
+        executor::DataTransceiverState const& selfState, executor::DataTransceiverState otherState,
+        runtime::BufferManager const& bufferManager, LlmRequest const* llmRequest = nullptr)
+        : mConnections(std::move(connections))
+        , mDataContext(dataContext)
+        , mSelfState(&selfState)
+        , mOtherState(std::move(otherState))
+        , mBufferManager(&bufferManager)
+        , mRequest(llmRequest)
+    {
+        TLLM_CHECK(!mConnections.empty());
+    }
+
+    [[nodiscard]] std::vector<Connection const*> const& getConnections() const
+    {
+        return mConnections;
+    }
+
+    // should be called only during the initialization of the TransferSession
+    void setConnection(size_t idx, Connection const* conn)
+    {
+        mConnections.at(idx) = conn;
+    }
+
+    [[nodiscard]] DataContext const& getDataContext() const
+    {
+        return mDataContext;
+    }
+
+    [[nodiscard]] executor::DataTransceiverState const& getSelfState() const
+    {
+        return *mSelfState;
+    }
+
+    [[nodiscard]] executor::DataTransceiverState const& getOtherState() const
+    {
+        return mOtherState;
+    }
+
+    [[nodiscard]] runtime::BufferManager const& getBufferManager() const
+    {
+        return *mBufferManager;
+    }
+
+    void send(size_t idx, void const* data, size_t size)
+    {
+        mConnections.at(idx)->send(mDataContext, data, size);
+    }
+
+    void recv(size_t idx, void* data, size_t size)
+    {
+        mConnections.at(idx)->recv(mDataContext, data, size);
+    }
+
+    [[nodiscard]] LlmRequest const& getLlmRequest() const
+    {
+        TLLM_CHECK(mRequest != nullptr);
+        return *mRequest;
+    }
+
+    // in DataSender, the LlmRequest is not available until the sendSync is called
+    void setLlmRequest(LlmRequest const& llmRequest)
+    {
+        mRequest = &llmRequest;
+    }
+
+private:
+    std::vector<Connection const*> mConnections;
+    DataContext mDataContext;
+    executor::DataTransceiverState const* mSelfState; // stored in DataRequester/DataResponder
+    executor::DataTransceiverState mOtherState;
+    runtime::BufferManager const* mBufferManager;
+    LlmRequest const* mRequest;
+};
+
 // Operators required for data transmission in specific communication protocols.
 class DataSender
 {
@@ -123,11 +206,11 @@ class DataReceiver
 public:
     /// @brief Send the request information.
     /// @param llmRequest The request object to which the information belongs.
-    virtual void sendRequestInfo(LlmRequest const& llmRequest) = 0;
+    virtual TransferSession sendRequestInfo(LlmRequest const& llmRequest) = 0;
 
     /// @brief Synchronously receive data.
-    /// @param llmRequest The request object to which the data belongs.
-    virtual void receiveSync(LlmRequest const& llmRequest) = 0;
+    /// @param session The transfer session.
+    virtual void receiveSync(TransferSession& session) = 0;
 
     /// @brief Destructor.
     virtual ~DataReceiver() = default;
diff --git a/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.cpp b/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.cpp
index dddd491d504..e8adabed7f2 100644
--- a/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.cpp
+++ b/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.cpp
@@ -24,6 +24,12 @@
 namespace tensorrt_llm::batch_manager
 {
 
+static int32_t tagFromRequestId(LlmRequest::RequestIdType requestId)
+{
+    constexpr int32_t kDATA_TAG{43};
+    return ((requestId & 0xFFF) << 8) | (kDATA_TAG & 0xFF);
+}
+
 DataSenderImpl::DataSenderImpl(executor::kv_cache::ConnectionManager* manager,
     executor::kv_cache::CacheState selfCacheState, SizeType32 selfIndex, std::unique_ptr<BaseCacheFormatter> formatter)
     : mManager{manager}
@@ -75,33 +81,25 @@ DataSenderImpl::DataSenderImpl(executor::kv_cache::ConnectionManager* manager,
             peerRelativeRanks.begin(), peerRelativeRanks.end(), info.getTransState().getCommState()->getSelfIdx()));
     {
         std::unique_lock<std::mutex> lk(mMtxForMap);
-        auto it = mRequestToComms.find(requestId);
-        if (it == mRequestToComms.end())
+        auto it = mRequestToSession.find(requestId);
+        if (it == mRequestToSession.end())
         {
-            int recvExpectCount = peerRelativeRanks.size();
-            {
-                it = mRequestToComms.emplace(requestId, RequestMapInfo{}).first;
-                it->second.resize(recvExpectCount);
-            }
+            auto session = TransferSession(std::vector<Connection const*>(peerRelativeRanks.size(), nullptr),
+                DataContext{tagFromRequestId(requestId)}, mSelfState, info.getTransState(), mBufferManager);
+            it = mRequestToSession.emplace(requestId, std::move(session)).first;
         }
-        it->second[peerIdx] = {connection, info.getTransState()};
+        it->second.setConnection(peerIdx, connection);
     }
     return info;
 }
 
 void DataSenderImpl::sendSync(LlmRequest const& llmRequest)
 {
-    std::vector<executor::kv_cache::Connection const*> connections;
-    auto it = mRequestToComms.find(llmRequest.mRequestId);
-    TLLM_CHECK(it != mRequestToComms.end());
-    auto const& reqToComm = it->second;
-    for (auto&& [connection, dataTransceiverState] : reqToComm)
-    {
-        connections.emplace_back(connection);
-    }
-    auto&& dataTransceiverState = reqToComm.at(0).second;
-    mFormatter->formatOutput(llmRequest, std::move(connections), mSelfState.getCacheState().value(),
-        mSelfState.getCommState().value().getSelfIdx(), dataTransceiverState.getCacheState().value(), mBufferManager);
+    auto it = mRequestToSession.find(llmRequest.mRequestId);
+    TLLM_CHECK(it != mRequestToSession.end());
+    auto& session = it->second;
+    session.setLlmRequest(llmRequest);
+    mFormatter->format(session);
 }
 
 [[nodiscard]] executor::kv_cache::CommState const& DataSenderImpl::getCommState() const
@@ -116,17 +114,17 @@ void DataSenderImpl::setCommState(executor::kv_cache::CommState commState)
 
 [[nodiscard]] size_t DataSenderImpl::getCounterpartsCount(LlmRequest::RequestIdType requestId) const
 {
-    auto it = mRequestToComms.find(requestId);
-    TLLM_CHECK(it != mRequestToComms.end());
-    return it->second.size();
+    auto it = mRequestToSession.find(requestId);
+    TLLM_CHECK(it != mRequestToSession.end());
+    return it->second.getConnections().size();
 }
 
 void DataSenderImpl::release(LlmRequest::RequestIdType requestId)
 {
-    auto it = mRequestToComms.find(requestId);
-    TLLM_CHECK(it != mRequestToComms.end());
+    auto it = mRequestToSession.find(requestId);
+    TLLM_CHECK(it != mRequestToSession.end());
     std::unique_lock<std::mutex> lk(mMtxForMap);
-    mRequestToComms.erase(it);
+    mRequestToSession.erase(it);
 }
 
 DataReceiverImpl::DataReceiverImpl(executor::kv_cache::ConnectionManager* manager,
@@ -140,7 +138,7 @@ DataReceiverImpl::DataReceiverImpl(executor::kv_cache::ConnectionManager* manage
     TLLM_CHECK(mFormatter);
 }
 
-void DataReceiverImpl::sendRequestInfo(LlmRequest const& llmRequest)
+TransferSession DataReceiverImpl::sendRequestInfo(LlmRequest const& llmRequest)
 {
     uint64_t requestId = llmRequest.getContextPhaseParams().value().getReqId();
     auto const& contextState = llmRequest.getDataTransceiverState();
@@ -179,17 +177,17 @@ void DataReceiverImpl::sendRequestInfo(LlmRequest const& llmRequest)
         auto const* connection = connections.at(index);
         counterPartConnections.emplace_back(connection);
     }
-    auto pickUpConnections = mFormatter->pickRecvConnections(counterPartConnections, mSelfState.getCacheState().value(),
+    auto pickUpIdx = mFormatter->pickRecvConnections(counterParts.size(), mSelfState.getCacheState().value(),
         mSelfState.getCommState().value().getSelfIdx(), destCacheState);
-    for (auto connection : counterPartConnections)
+    for (size_t i = 0; i < counterPartConnections.size(); i++)
     {
+        auto const* connection = counterPartConnections[i];
         // if Manager is agentConnectionManager, then send request info to agent
         auto* agentConnectionManager = dynamic_cast<executor::kv_cache::AgentConnectionManager*>(mManager);
         if (agentConnectionManager != nullptr)
         {
             // TODO: index -> validConnectionIdx conversion
-            auto valideConnectionIdx
-                = std::find(pickUpConnections.begin(), pickUpConnections.end(), connection) - pickUpConnections.begin();
+            auto valideConnectionIdx = std::find(pickUpIdx.begin(), pickUpIdx.end(), i) - pickUpIdx.begin();
             auto* agentConnection = dynamic_cast<executor::kv_cache::AgentConnection const*>(connection);
             TLLM_CHECK(agentConnection != nullptr);
             TLLM_CHECK(cacheBufferId.has_value());
@@ -201,23 +199,14 @@ void DataReceiverImpl::sendRequestInfo(LlmRequest const& llmRequest)
             sendRequestInfo(connection, requestInfo);
         }
     }
+    auto const& resource = getReceiveCacheResource(llmRequest);
+    return TransferSession(std::move(counterPartConnections), DataContext{tagFromRequestId(requestId)}, mSelfState,
+        contextState, resource->mBufferManager, &llmRequest);
 }
 
-void DataReceiverImpl::receiveSync(LlmRequest const& llmRequest)
+void DataReceiverImpl::receiveSync(TransferSession& session)
 {
-    auto const& contextState = llmRequest.getDataTransceiverState();
-    auto const& commState = contextState.getCommState().value();
-    auto const& destCacheState = contextState.getCacheState().value();
-    std::vector<tensorrt_llm::executor::kv_cache::Connection const*> connections;
-    for (auto index : mFormatter->getCounterparts(
-             mSelfState.getCacheState().value(), mSelfState.getCommState().value().getSelfIdx(), destCacheState))
-    {
-        auto const* connection = mManager->getConnections(commState).at(index);
-        connections.emplace_back(connection);
-    }
-    auto const& resource = getReceiveCacheResource(llmRequest);
-    mFormatter->formatInput(llmRequest, std::move(connections), mSelfState.getCacheState().value(),
-        mSelfState.getCommState().value().getSelfIdx(), destCacheState, resource->mBufferManager);
+    mFormatter->unformat(session);
 }
 
 void DataReceiverImpl::sendRequestInfo(executor::kv_cache::Connection const* connection, RequestInfo const& info)
diff --git a/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.h b/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.h
index 8f328bfaede..fa8d2728329 100644
--- a/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.h
+++ b/cpp/tensorrt_llm/batch_manager/dataTransceiverImpl.h
@@ -44,8 +44,6 @@ class DataSenderImpl : public DataSender, public TransceiverTag
 {
 public:
     using SizeType32 = tensorrt_llm::runtime::SizeType32;
-    using RequestMapInfo
-        = std::vector<std::pair<executor::kv_cache::Connection const*, executor::DataTransceiverState>>;
 
     DataSenderImpl(executor::kv_cache::ConnectionManager* manager, executor::kv_cache::CacheState selfCacheState,
         SizeType32 selfIndex, std::unique_ptr<BaseCacheFormatter> formatter);
@@ -64,7 +62,7 @@ class DataSenderImpl : public DataSender, public TransceiverTag
 
 private:
     executor::kv_cache::ConnectionManager* mManager;
-    std::map<LlmRequest::RequestIdType, RequestMapInfo> mRequestToComms;
+    std::map<LlmRequest::RequestIdType, TransferSession> mRequestToSession;
     executor::DataTransceiverState mSelfState;
     std::unique_ptr<BaseCacheFormatter> mFormatter;
     std::mutex mMtxForMap;
@@ -79,9 +77,9 @@ class DataReceiverImpl : public DataReceiver, public TransceiverTag
     DataReceiverImpl(executor::kv_cache::ConnectionManager* manager, executor::kv_cache::CacheState selfCacheState,
         SizeType32 selfIndex, std::unique_ptr<BaseCacheFormatter> formatter);
 
-    void sendRequestInfo(LlmRequest const& llmRequest) override;
+    TransferSession sendRequestInfo(LlmRequest const& llmRequest) override;
 
-    void receiveSync(LlmRequest const& llmRequest) override;
+    void receiveSync(TransferSession& session) override;
 
 private:
     struct ReceiveCacheResource
diff --git a/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp b/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp
index cff94e85544..8d7be6594fd 100644
--- a/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp
+++ b/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp
@@ -38,20 +38,17 @@ namespace tensorrt_llm::batch_manager::kv_cache_manager
 {
 
 // some context rank in connection
-std::vector<executor::kv_cache::Connection const*> MLACacheFormatter::pickRecvConnections(
-    std::vector<executor::kv_cache::Connection const*> const& connections, CacheState const& selfConfig,
-    SizeType32 selfIdx, CacheState const& destConfig) const
+std::vector<size_t> MLACacheFormatter::pickRecvConnections(
+    size_t numConnections, CacheState const& selfConfig, SizeType32 selfIdx, CacheState const& destConfig) const
 {
 
-    TLLM_CHECK(!connections.empty());
-
     auto targetInfo = executor::kv_cache::targetIRanks(destConfig, selfConfig, selfIdx);
-    TLLM_CHECK(targetInfo.mIRanks.size() == connections.size());
-    std::vector<executor::kv_cache::Connection const*> ret;
+    TLLM_CHECK(numConnections == targetInfo.mIRanks.size());
+    std::vector<size_t> ret;
     // targetInfo , mRanks [tpranks, dpranks]
     for (int i = 0; i < targetInfo.mDomainPPSize; i++)
     {
-        ret.push_back(connections.at(i));
+        ret.push_back(i);
     }
     return ret;
 }
@@ -87,13 +84,17 @@ bool MLACacheFormatter::needSendCache(
     return selfTpRank % (selfTPNum / destTPNum) == 0;
 }
 
-void MLACacheFormatter::formatOutput(LlmRequest const& llmRequest,
-    std::vector<executor::kv_cache::Connection const*> const& connections, CacheState const& selfConfig,
-    SizeType32 selfIdx, CacheState const& destConfig, runtime::BufferManager const& bufferManager)
+void MLACacheFormatter::format(TransferSession& session)
 {
-    NVTX3_SCOPED_RANGE(formatOutput);
+    NVTX3_SCOPED_RANGE(MLACacheFormatter_format);
+    auto const& llmRequest = session.getLlmRequest();
     TLLM_LOG_DEBUG(
         mpi::MpiComm::world().getRank(), "Start sending KV cache for request ID: %ld.", llmRequest.mRequestId);
+    auto const& selfConfig = session.getSelfState().getCacheState().value();
+    auto const& destConfig = session.getOtherState().getCacheState().value();
+    auto const selfIdx = session.getSelfState().getCommState().value().getSelfIdx();
+    auto const& connections = session.getConnections();
+    auto& bufferManager = session.getBufferManager();
     TLLM_CHECK_WITH_INFO(llmRequest.mSamplingConfig.beamWidth == 1, "Currently only supports beam width 1.");
     TLLM_CHECK(!connections.empty());
     // diff start
@@ -103,7 +104,6 @@ void MLACacheFormatter::formatOutput(LlmRequest const& llmRequest,
     }
 
     // diff end
-    auto reqId = llmRequest.mRequestId;
 
     auto const numPools = mCacheManager->getBlockManager().getNumPools();
     auto blockRange = getBlockRangeForSending(mCacheManager, llmRequest);
@@ -130,11 +130,11 @@ void MLACacheFormatter::formatOutput(LlmRequest const& llmRequest,
         NVTX3_SCOPED_RANGE(sendBufferFun);
 
         TLLM_CUDA_CHECK(cudaSetDevice(deviceId));
-        for (auto const& connection : connections)
+        for (size_t i = 0; i < connections.size(); i++)
         {
             for (auto const& block : inputKvCacheBlocks)
             {
-                TransferHelper::sendBuffer(*connection, *block, reqId);
+                session.send(i, block->data(), block->getSizeInBytes());
             }
         }
 
@@ -193,7 +193,7 @@ void MLACacheFormatter::formatOutput(LlmRequest const& llmRequest,
         if (cacheIdx < bufferCoverTargetNum)
         {
             size = outputSplitCaches.at(cacheIdx)->getSizeInBytes();
-            TransferHelper::sendBuffer(*connections.at(processIdx), *outputSplitCaches.at(cacheIdx), reqId);
+            session.send(processIdx, outputSplitCaches.at(cacheIdx)->data(), size);
         }
         else if (bufferCoverTargetNum > 0)
         {
@@ -202,7 +202,7 @@ void MLACacheFormatter::formatOutput(LlmRequest const& llmRequest,
             size = outputSplitCaches.at(sendBufferIdx)->getSizeInBytes();
             bufferManager.copy(*outputSplitCaches.at(cacheIdx), *outputSplitCaches.at(sendBufferIdx));
             bufferManager.getStream().synchronize();
-            TransferHelper::sendBuffer(*connections.at(processIdx), *outputSplitCaches.at(sendBufferIdx), reqId);
+            session.send(processIdx, outputSplitCaches.at(sendBufferIdx)->data(), size);
         }
         else
         {
@@ -220,7 +220,7 @@ void MLACacheFormatter::formatOutput(LlmRequest const& llmRequest,
                 auto copyTargetSlice = runtime::ITensor::slice(preAllocSendBuffer, 0, sendSize);
                 bufferManager.copy(*copySlice, *copyTargetSlice);
                 bufferManager.getStream().synchronize();
-                TransferHelper::sendBuffer(*connections.at(processIdx), *copyTargetSlice, reqId);
+                session.send(processIdx, copyTargetSlice->data(), sendSize);
 
                 remainSendSize -= sendSize;
             }
@@ -277,19 +277,21 @@ void MLACacheFormatter::formatOutput(LlmRequest const& llmRequest,
         mpi::MpiComm::world().getRank(), "End the sending of KV cache for the request ID: %ld.", llmRequest.mRequestId);
 }
 
-void MLACacheFormatter::formatInput(LlmRequest const& llmRequest,
-    std::vector<executor::kv_cache::Connection const*> const& connections, CacheState const& selfConfig,
-    SizeType32 selfIdx, CacheState const& destConfig, runtime::BufferManager const& bufferManager)
+void MLACacheFormatter::unformat(TransferSession& session)
 {
-    NVTX3_SCOPED_RANGE(formatInput);
+    NVTX3_SCOPED_RANGE(MLACacheFormatter_unformat);
+    auto const& llmRequest = session.getLlmRequest();
     TLLM_CHECK_WITH_INFO(llmRequest.mSamplingConfig.beamWidth == 1, "Currently only supports beam width 1.");
     TLLM_LOG_DEBUG(mpi::MpiComm::world().getRank(),
         "Start receiving KV cache for request ID: %ld, context request ID: %ld.", llmRequest.mRequestId,
         llmRequest.getContextPhaseParams().value().getReqId());
-    auto reqId = llmRequest.getContextPhaseParams().value().getReqId();
-    TLLM_CHECK(!connections.empty());
+    auto const& selfConfig = session.getSelfState().getCacheState().value();
+    auto const& destConfig = session.getOtherState().getCacheState().value();
+    auto const selfIdx = session.getSelfState().getCommState().value().getSelfIdx();
+    auto const& connections = session.getConnections();
+    auto& bufferManager = session.getBufferManager();
     // diff start
-    auto pickUpConnections = pickRecvConnections(connections, selfConfig, selfIdx, destConfig);
+    auto pickUpConnections = pickRecvConnections(connections.size(), selfConfig, selfIdx, destConfig);
     // diff end
     auto blockRange = getBlockRangeForReceiving(mCacheManager, llmRequest);
     std::vector<runtime::ITensor::SharedPtr> recvBufferTmps;
@@ -319,11 +321,11 @@ void MLACacheFormatter::formatInput(LlmRequest const& llmRequest,
         NVTX3_SCOPED_RANGE(recvBufferFun);
         TLLM_CUDA_CHECK(cudaSetDevice(deviceId));
         TLLM_CHECK(pickUpConnections.size() == 1);
-        for (auto const& connection : pickUpConnections)
+        for (size_t i = 0; i < pickUpConnections.size(); i++)
         {
             for (auto const& block : outputBuffers)
             {
-                TransferHelper::recvBuffer(*connection, *block, reqId);
+                session.recv(pickUpConnections[i], block->data(), block->getSizeInBytes());
             }
         }
         TLLM_LOG_DEBUG(mpi::MpiComm::world().getRank(),
@@ -375,15 +377,15 @@ void MLACacheFormatter::formatInput(LlmRequest const& llmRequest,
 
             if (processIdx >= remainNoCoverTargetNum)
             {
-
-                TransferHelper::recvBuffer(*pickUpConnections.at(processIdx), *recvSplitCaches.at(processIdx), reqId);
+                auto& buffer = recvSplitCaches.at(processIdx);
+                session.recv(pickUpConnections.at(processIdx), buffer->data(), buffer->getSizeInBytes());
             }
             else if (bufferCoverTargetNum > 0)
             {
                 auto recvBufferIdx = processIdx % bufferCoverTargetNum
                     + remainNoCoverTargetNum; // caches.at(recvBufferIdx) is allocated by cudaMalloc
-                TransferHelper::recvBuffer(
-                    *pickUpConnections.at(processIdx), *recvSplitCaches.at(recvBufferIdx), reqId);
+                auto& buffer = recvSplitCaches.at(recvBufferIdx);
+                session.recv(pickUpConnections.at(processIdx), buffer->data(), buffer->getSizeInBytes());
                 bufferManager.copy(*recvSplitCaches.at(recvBufferIdx), *recvSplitCaches.at(processIdx));
                 bufferManager.getStream().synchronize();
             }
@@ -399,7 +401,7 @@ void MLACacheFormatter::formatInput(LlmRequest const& llmRequest,
                     auto recvSlice = runtime::ITensor::slice(preAllocRecvBuffer, 0, recvSize);
                     auto copySlice = runtime::ITensor::slice(
                         recvSplitCaches.at(processIdx), targetBufferSize - remainRecvSize, recvSize);
-                    TransferHelper::recvBuffer(*pickUpConnections.at(processIdx), *recvSlice, reqId);
+                    session.recv(pickUpConnections.at(processIdx), recvSlice->data(), recvSlice->getSizeInBytes());
                     bufferManager.copy(*recvSlice, *copySlice);
                     bufferManager.getStream().synchronize();
                     remainRecvSize -= recvSize;
diff --git a/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.h b/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.h
index 9d977d73387..c96e000e612 100644
--- a/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.h
+++ b/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.h
@@ -35,13 +35,9 @@ class MLACacheFormatter final : public BaseCacheFormatter
         TLLM_CHECK(mCacheTransBufferManager);
     }
 
-    void formatOutput(LlmRequest const& llmRequest,
-        std::vector<executor::kv_cache::Connection const*> const& connections, CacheState const& selfConfig,
-        SizeType32 selfIdx, CacheState const& destConfig, runtime::BufferManager const& bufferManager) override;
+    void format(TransferSession& session) override;
 
-    void formatInput(LlmRequest const& llmRequest,
-        std::vector<executor::kv_cache::Connection const*> const& connections, CacheState const& selfConfig,
-        SizeType32 selfIdx, CacheState const& destConfig, runtime::BufferManager const& bufferManager) override;
+    void unformat(TransferSession& session) override;
 
     [[nodiscard]] bool inquireSupport(CacheState const& selfConfig, CacheState const& destConfig) const override;
 
@@ -57,9 +53,8 @@ class MLACacheFormatter final : public BaseCacheFormatter
     }
 
     static bool needSendCache(CacheState const& selfConfig, CacheState const& destConfig, runtime::SizeType32 selfIdx);
-    std::vector<executor::kv_cache::Connection const*> pickRecvConnections(
-        std::vector<executor::kv_cache::Connection const*> const& connections, CacheState const& selfConfig,
-        SizeType32 selfIdx, CacheState const& destConfig) const override;
+    std::vector<size_t> pickRecvConnections(size_t numConnections, CacheState const& selfConfig, SizeType32 selfIdx,
+        CacheState const& destConfig) const override;
 
 private:
     BaseKVCacheManager* mCacheManager;
diff --git a/cpp/tests/batch_manager/cacheTransceiverTest.cpp b/cpp/tests/batch_manager/cacheTransceiverTest.cpp
index 5c3bbcb1278..99c40f810f6 100644
--- a/cpp/tests/batch_manager/cacheTransceiverTest.cpp
+++ b/cpp/tests/batch_manager/cacheTransceiverTest.cpp
@@ -186,8 +186,8 @@ texec::kv_cache::CommState MockDataSender::mState;
 class MockDataReceiver : public DataReceiver
 {
 public:
-    MOCK_METHOD(void, sendRequestInfo, (LlmRequest const&), (override));
-    MOCK_METHOD(void, receiveSync, (LlmRequest const&), (override));
+    MOCK_METHOD(TransferSession, sendRequestInfo, (LlmRequest const&), (override));
+    MOCK_METHOD(void, receiveSync, (TransferSession&), (override));
 };
 
 class MockTransceiverTest : public ::testing::Test // NOLINT(cppcoreguidelines-pro-type-member-init)
@@ -237,12 +237,14 @@ TEST_F(MockTransceiverTest, MpiRequesterBasic)
         GTEST_SKIP() << "mpirun with procs<=2 is required to run this test.";
     }
     auto receiver = std::make_unique<MockDataReceiver>();
-    EXPECT_CALL(*receiver, sendRequestInfo).WillOnce(Return());
+    auto state = std::make_unique<texec::DataTransceiverState>();
+    state->setCommState(texec::kv_cache::CommState{std::vector<int>{0}});
+    EXPECT_CALL(*receiver, sendRequestInfo)
+        .WillOnce(Return(TransferSession({nullptr}, DataContext{0}, *state, *state,
+            tensorrt_llm::runtime::BufferManager{std::make_shared<tr::CudaStream>()}, nullptr)));
     EXPECT_CALL(*receiver, receiveSync).WillOnce(Return());
     DataRequester requester{std::move(receiver)};
     auto request = makeLlmRequest(0);
-    auto state = std::make_unique<texec::DataTransceiverState>();
-    state->setCommState(texec::kv_cache::CommState{std::vector<int>{0}});
     auto stats = texec::ContextPhaseParams({}, 0, state.release(), std::nullopt);
     request->setContextPhaseParams(std::move(stats));
     auto future = requester.requestAndReceiveAsync(*request);

From ec3ebae43e6796d34c9549efc1f28860a243cf6b Mon Sep 17 00:00:00 2001
From: Bo Deng <deemod@nvidia.com>
Date: Wed, 16 Jul 2025 13:54:42 +0800
Subject: [PATCH 30/88] [TRTLLM-6471] Infra: Upgrade NIXL to 0.3.1 (#5991)

Signed-off-by: Rabia Loulou <174243936+rabial-nv@users.noreply.github.com>
Signed-off-by: Shixiaowei02 <39303645+Shixiaowei02@users.noreply.github.com>
Signed-off-by: Bo Deng <deemod@nvidia.com>
Co-authored-by: Rabia Loulou <174243936+rabial-nv@users.noreply.github.com>
Co-authored-by: Shixiaowei02 <39303645+Shixiaowei02@users.noreply.github.com>
---
 .../unit_tests/executor/transferAgentTest.cpp | 62 ++++++++++---------
 docker/Dockerfile.multi                       |  4 ++
 docker/common/install_nixl.sh                 | 57 +++++++----------
 docker/common/install_ucx.sh                  | 30 +++++++++
 jenkins/current_image_tags.properties         |  8 +--
 5 files changed, 91 insertions(+), 70 deletions(-)
 create mode 100644 docker/common/install_ucx.sh

diff --git a/cpp/tests/unit_tests/executor/transferAgentTest.cpp b/cpp/tests/unit_tests/executor/transferAgentTest.cpp
index 38f57204534..e58c32796e2 100644
--- a/cpp/tests/unit_tests/executor/transferAgentTest.cpp
+++ b/cpp/tests/unit_tests/executor/transferAgentTest.cpp
@@ -260,42 +260,43 @@ TEST_F(TransferAgentTest, SyncMessage)
     auto status = nixlAgent0->submitTransferRequests(writeReq);
     status->wait();
 
+    const size_t MAX_QUERY_TIMES = std::numeric_limits<size_t>::max();
     auto notif = nixlAgent1->getNotifiedSyncMessages();
+    for (size_t i = 0; i < MAX_QUERY_TIMES && notif.size() == 0; i++)
+    {
+        notif = nixlAgent1->getNotifiedSyncMessages();
+    }
     TLLM_CHECK(notif.size() == 1);
     TLLM_CHECK(notif[agent0].size() == 1);
     TLLM_CHECK(notif[agent0][0] == syncMessage);
+
     TLLM_CHECK(memory0 == memory1);
 
     std::string syncMessage2 = "two_agent_sync_message";
     nixlAgent0->notifySyncMessage(agent1, syncMessage2);
-    while (true)
+    auto notif2 = nixlAgent1->getNotifiedSyncMessages();
+    for (size_t i = 0; i < MAX_QUERY_TIMES && notif2.size() == 0; i++)
     {
-        auto notif2 = nixlAgent1->getNotifiedSyncMessages();
-        if (notif2.size() > 0)
-        {
-            TLLM_CHECK(notif2.size() == 1);
-            TLLM_CHECK(notif2[agent0].size() == 1);
-            TLLM_CHECK(notif2[agent0][0] == syncMessage2);
-            break;
-        }
+        notif2 = nixlAgent1->getNotifiedSyncMessages();
     }
+    TLLM_CHECK(notif2.size() == 1);
+    TLLM_CHECK(notif2[agent0].size() == 1);
+    TLLM_CHECK(notif2[agent0][0] == syncMessage2);
 
     // nixlAgent1->loadRemoteAgent(agent0);
     auto connectionInfo2 = nixlAgent0->getConnectionInfo();
     nixlAgent1->connectRemoteAgent(agent0, connectionInfo2);
     std::string syncMessage3 = "three_agent_sync_message";
     nixlAgent1->notifySyncMessage(agent0, syncMessage3);
-    while (true)
+    auto notif3 = nixlAgent0->getNotifiedSyncMessages();
+    for (size_t i = 0; i < MAX_QUERY_TIMES && notif3.size() == 0; i++)
     {
-        auto notif3 = nixlAgent0->getNotifiedSyncMessages();
-        if (notif3.size() > 0)
-        {
-            TLLM_CHECK(notif3.size() == 1);
-            TLLM_CHECK(notif3[agent1].size() == 1);
-            TLLM_CHECK(notif3[agent1][0] == syncMessage3);
-            break;
-        }
+        notif3 = nixlAgent0->getNotifiedSyncMessages();
     }
+    TLLM_CHECK(notif3.size() == 1);
+    TLLM_CHECK(notif3[agent1].size() == 1);
+    TLLM_CHECK(notif3[agent1][0] == syncMessage3);
+
     bool checked2 = false;
     do
     {
@@ -308,6 +309,10 @@ TEST_F(TransferAgentTest, SyncMessage)
     auto status1 = nixlAgent1->submitTransferRequests(writeReq1);
     status1->wait();
     auto notif4 = nixlAgent0->getNotifiedSyncMessages();
+    for (size_t i = 0; i < MAX_QUERY_TIMES && notif4.size() == 0; i++)
+    {
+        notif4 = nixlAgent0->getNotifiedSyncMessages();
+    }
     TLLM_CHECK(notif4.size() == 1);
     TLLM_CHECK(notif4[agent1].size() == 1);
     TLLM_CHECK(notif4[agent1][0] == syncMessage4);
@@ -322,20 +327,17 @@ TEST_F(TransferAgentTest, SyncMessage)
     Serialization::serialize(state, ss);
     std::string serializedState = ss.str();
     nixlAgent0->notifySyncMessage(agent1, serializedState);
-    while (true)
+    auto notif5 = nixlAgent1->getNotifiedSyncMessages();
+    for (size_t i = 0; i < MAX_QUERY_TIMES && notif5.size() == 0; i++)
     {
-        auto notif5 = nixlAgent1->getNotifiedSyncMessages();
-        if (notif5.size() > 0)
-        {
-            TLLM_CHECK(notif5.size() == 1);
-            TLLM_CHECK(notif5[agent0].size() == 1);
-            TLLM_CHECK(notif5[agent0][0] == serializedState);
-            std::stringstream ss2(notif5[agent0][0]);
-            auto state2 = Serialization::deserializeCommState(ss2);
-            TLLM_CHECK(state2 == state);
-            break;
-        }
+        notif5 = nixlAgent1->getNotifiedSyncMessages();
     }
+    TLLM_CHECK(notif5.size() == 1);
+    TLLM_CHECK(notif5[agent0].size() == 1);
+    TLLM_CHECK(notif5[agent0][0] == serializedState);
+    std::stringstream ss2(notif5[agent0][0]);
+    auto state2 = Serialization::deserializeCommState(ss2);
+    TLLM_CHECK(state2 == state);
 
     nixlAgent0->invalidateRemoteAgent(agent1);
     nixlAgent1->invalidateRemoteAgent(agent0);
diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi
index da67d0a4994..19b58c24939 100644
--- a/docker/Dockerfile.multi
+++ b/docker/Dockerfile.multi
@@ -99,6 +99,10 @@ COPY --from=triton /opt/tritonserver/caches /opt/tritonserver/caches
 COPY docker/common/install_triton.sh install_triton.sh
 RUN bash ./install_triton.sh && rm install_triton.sh
 
+# Install UCX first
+COPY docker/common/install_ucx.sh install_ucx.sh
+RUN bash ./install_ucx.sh && rm install_ucx.sh
+
 # Install NIXL
 COPY docker/common/install_nixl.sh install_nixl.sh
 RUN bash ./install_nixl.sh && rm install_nixl.sh
diff --git a/docker/common/install_nixl.sh b/docker/common/install_nixl.sh
index 23943449513..18ee554f693 100644
--- a/docker/common/install_nixl.sh
+++ b/docker/common/install_nixl.sh
@@ -2,56 +2,41 @@
 set -ex
 
 GITHUB_URL="https://github.com"
-
-UCX_VERSION="v1.18.1"
 UCX_INSTALL_PATH="/usr/local/ucx/"
 CUDA_PATH="/usr/local/cuda"
-
-NIXL_VERSION="0.2.0"
-
-UCX_REPO="https://github.com/openucx/ucx.git"
+NIXL_VERSION="0.3.1"
 NIXL_REPO="https://github.com/ai-dynamo/nixl.git"
 
-
-if [ ! -d ${UCX_INSTALL_PATH} ]; then
-  git clone --depth 1 -b ${UCX_VERSION} ${UCX_REPO}
-  cd ucx
-  ./autogen.sh
-  ./contrib/configure-release       \
-    --prefix=${UCX_INSTALL_PATH}    \
-    --enable-shared                 \
-    --disable-static                \
-    --disable-doxygen-doc           \
-    --enable-optimizations          \
-    --enable-cma                    \
-    --enable-devel-headers          \
-    --with-cuda=${CUDA_PATH}        \
-    --with-verbs                    \
-    --with-dm                       \
-    --enable-mt
-  make install -j$(nproc)
-  cd ..
-  rm -rf ucx  # Remove UCX source to save space
-  echo "export LD_LIBRARY_PATH=${UCX_INSTALL_PATH}/lib:\$LD_LIBRARY_PATH" >> "${ENV}"
-fi
-
 ARCH_NAME="x86_64-linux-gnu"
+GDS_PATH="$CUDA_PATH/targets/x86_64-linux"
 if [ "$(uname -m)" != "amd64" ] && [ "$(uname -m)" != "x86_64" ]; then
   ARCH_NAME="aarch64-linux-gnu"
-  EXTRA_NIXL_ARGS="-Ddisable_gds_backend=true"
-fi
-
-if [ $ARCH_NAME != "x86_64-linux-gnu" ]; then
-  echo "The NIXL backend is temporarily unavailable on the aarch64 platform. Exiting script."
-  exit 0
+  GDS_PATH="$CUDA_PATH/targets/sbsa-linux"
 fi
 
 pip3 install --no-cache-dir meson ninja pybind11
 git clone --depth 1 -b ${NIXL_VERSION} ${NIXL_REPO}
 cd nixl
-meson setup builddir -Ducx_path=${UCX_INSTALL_PATH} -Dstatic_plugins=UCX ${EXTRA_NIXL_ARGS}
+
+cuda_path=$(find / -name "libcuda.so.1" 2>/dev/null | head -n1)
+if [[ -z "$cuda_path" ]]; then
+    echo "libcuda.so.1 not found "
+    exit 1
+fi
+
+ln -sf $cuda_path $CUDA_PATH/lib64/libcuda.so.1
+
+meson setup builddir \
+    -Ducx_path=$UCX_INSTALL_PATH \
+    -Dcudapath_lib="$CUDA_PATH/lib64" \
+    -Dcudapath_inc="$CUDA_PATH/include" \
+    -Dgds_path="$GDS_PATH" \
+    -Dinstall_headers=true \
+    -Dstatic_plugins=UCX
+
 cd builddir && ninja install
 cd ../..
 rm -rf nixl*  # Remove NIXL source tree to save space
+rm  $CUDA_PATH/lib64/libcuda.so.1
 
 echo "export LD_LIBRARY_PATH=/opt/nvidia/nvda_nixl/lib/${ARCH_NAME}:/opt/nvidia/nvda_nixl/lib64:\$LD_LIBRARY_PATH" >> "${ENV}"
diff --git a/docker/common/install_ucx.sh b/docker/common/install_ucx.sh
new file mode 100644
index 00000000000..22f444d9746
--- /dev/null
+++ b/docker/common/install_ucx.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+set -ex
+
+GITHUB_URL="https://github.com"
+UCX_VERSION="v1.18.1"
+UCX_INSTALL_PATH="/usr/local/ucx/"
+CUDA_PATH="/usr/local/cuda"
+UCX_REPO="https://github.com/openucx/ucx.git"
+
+if [ ! -d ${UCX_INSTALL_PATH} ]; then
+  git clone --depth 1 -b ${UCX_VERSION} ${UCX_REPO}
+  cd ucx
+  ./autogen.sh
+  ./contrib/configure-release       \
+    --prefix=${UCX_INSTALL_PATH}    \
+    --enable-shared                 \
+    --disable-static                \
+    --disable-doxygen-doc           \
+    --enable-optimizations          \
+    --enable-cma                    \
+    --enable-devel-headers          \
+    --with-cuda=${CUDA_PATH}        \
+    --with-verbs                    \
+    --with-dm                       \
+    --enable-mt
+  make install -j$(nproc)
+  cd ..
+  rm -rf ucx  # Remove UCX source to save space
+  echo "export LD_LIBRARY_PATH=${UCX_INSTALL_PATH}/lib:\$LD_LIBRARY_PATH" >> "${ENV}"
+fi
diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties
index 11552a7f131..5836d212c5e 100644
--- a/jenkins/current_image_tags.properties
+++ b/jenkins/current_image_tags.properties
@@ -8,7 +8,7 @@
 # NB: Although string interpolation is supported, redundant substrings are
 #     kept in the variables below for interoperability with
 #     scripts/rename_docker_images.py
-LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.05-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202507101530-5434
-LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.05-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202507101530-5434
-LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202507101530-5434
-LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202507101530-5434
+LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.05-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202507150652-9504
+LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.05-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202507150652-9504
+LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202507150652-9504
+LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202507150652-9504

From f5f31beee1ee1d04bb8f9251095df45c09370b7a Mon Sep 17 00:00:00 2001
From: peaceh-nv <103117813+peaceh-nv@users.noreply.github.com>
Date: Wed, 16 Jul 2025 15:51:45 +0800
Subject: [PATCH 31/88] feat: Add deepseek-lite tests for RTX pro 6000 (#5903)

Signed-off-by: peaceh <103117813+peaceh-nv@users.noreply.github.com>
---
 .../defs/accuracy/test_llm_api_pytorch.py     |  4 +-
 .../test_lists/test-db/l0_rtx_pro_6000.yml    | 51 +++++++++++++++++++
 2 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index b0942231808..b3ff22e1aca 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -15,6 +15,7 @@
 import os
 
 import pytest
+from defs.conftest import get_sm_version
 
 from tensorrt_llm import LLM
 from tensorrt_llm._torch.pyexecutor.config import MoeLoadBalancerConfig
@@ -1109,7 +1110,8 @@ def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
             pytest.skip("https://nvbugs/5252313")
         if torch_compile and pp_size > 1:
             pytest.skip("PP with torch.compile is not supported yet.")
-
+        if moe_backend == "TRTLLM" and get_sm_version() == 120:
+            pytest.skip("MOE TRTLLM backend does not support SM version 120")
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
         # Picewise Cuda Graph cannot be enabled for nvfp4 attention dp.
         torch_compile_config = TorchCompileConfig(
diff --git a/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml b/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml
index ccc12c1df6a..32a03fd591c 100644
--- a/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml
+++ b/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml
@@ -31,3 +31,54 @@ l0_rtx_pro_6000:
   - test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Super-49B-v1-FP8-nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-FP8]
   - test_e2e.py::test_ptp_quickstart_advanced[Mixtral-8x7B-NVFP4-nvfp4-quantized/Mixtral-8x7B-Instruct-v0.1]
   - test_e2e.py::test_ptp_quickstart_advanced[Mixtral-8x7B-FP8-Mixtral-8x7B-Instruct-v0.1-fp8]
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 4
+        lte: 4
+    wildcards:
+      gpu:
+      - '*6000*'
+      linux_distribution_name: ubuntu*
+    terms:
+      stage: post_merge
+      backend: pytorch
+  tests:
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[pp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=True]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=0-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-ep4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-tp2pp2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backend=CUTLASS-mtp_nextn=2-pp4-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2]

From 763012a88ad21a76fd6c8a9bf036122af0a053be Mon Sep 17 00:00:00 2001
From: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
Date: Wed, 16 Jul 2025 16:04:08 +0800
Subject: [PATCH 32/88] [nvbug/5359218][tests] add test llm api test case on
 lookahead with chunked prefill (#6051)

Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
---
 tests/integration/defs/examples/test_llama.py | 42 +++++++++++++++++++
 .../test_lists/qa/examples_test_list.txt      |  1 +
 2 files changed, 43 insertions(+)

diff --git a/tests/integration/defs/examples/test_llama.py b/tests/integration/defs/examples/test_llama.py
index 3d4c545e1dc..2751b24d5c7 100644
--- a/tests/integration/defs/examples/test_llama.py
+++ b/tests/integration/defs/examples/test_llama.py
@@ -4057,3 +4057,45 @@ def test_llm_llama_lookahead_single_gpu_summary(llama_example_root,
                                        lookahead_config='[7, 7, 7]')
 
     venv_check_call(llm_venv, summary_cmd)
+
+
+@pytest.mark.parametrize("model_name,model_path", [
+    ("Llama-3.1-8B-Instruct", "llama-3.1-model/Llama-3.1-8B-Instruct"),
+])
+def test_llm_api_lookahead_decoding_1gpu(model_name, model_path):
+    """
+    RCCA: https://nvbugs/5359218
+    """
+    from defs.conftest import llm_models_root
+
+    from tensorrt_llm.llmapi import (LLM, BuildConfig, KvCacheConfig,
+                                     LookaheadDecodingConfig, SamplingParams)
+    build_config = BuildConfig(max_batch_size=128,
+                               max_input_len=2048,
+                               max_seq_len=32768,
+                               max_num_tokens=8192,
+                               max_draft_len=111)
+    build_config.plugin_config.use_paged_context_fmha = True
+    build_config.plugin_config.multiple_profiles = True
+
+    lookahead_config = LookaheadDecodingConfig(max_window_size=8,
+                                               max_ngram_size=3,
+                                               max_verification_set_size=3)
+
+    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+    llm = LLM(model=f"{llm_models_root()}/{model_path}",
+              kv_cache_config=kv_cache_config,
+              build_config=build_config,
+              speculative_config=lookahead_config,
+              enable_chunked_prefill=True)
+
+    prompt = """Write a C++ program to find the nth Fibonacci number using
+recursion. Now we define a sequence of numbers in which each number is the
+sum of the three preceding ones. The first three numbers are 0, -1, -1.
+Write a program to find the nth number.""" * 200  # around 13k tokens
+
+    sampling_params = SamplingParams(lookahead_config=lookahead_config)
+
+    output = llm.generate(prompt, sampling_params=sampling_params)
+
+    assert output is not None, "No output generated from LLM"
diff --git a/tests/integration/test_lists/qa/examples_test_list.txt b/tests/integration/test_lists/qa/examples_test_list.txt
index 774a5b84ecd..167f99d5ea8 100644
--- a/tests/integration/test_lists/qa/examples_test_list.txt
+++ b/tests/integration/test_lists/qa/examples_test_list.txt
@@ -124,6 +124,7 @@ examples/test_llama.py::test_llm_llama_long_alpaca_8gpu_summary[pg64317-tp8pp1-n
 examples/test_llama.py::test_llm_llama_lookahead_single_gpu_summary[llama-3.1-8b]
 examples/test_llama.py::test_llm_llama_lookahead_xqa_fp8_1gpu[llama-3.1-8b]
 examples/test_llama.py::test_llm_llama_lookahead_xqa_fp8_1gpu[llama-3.2-1b]
+examples/test_llama.py::test_llm_api_lookahead_decoding_1gpu[Llama-3.1-8B-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct]
 examples/test_llama.py::test_llm_llama_v1_2gpu_summary[llama-7b-nb:4-enable_auto_parallel]
 examples/test_llama.py::test_llm_llama_v1_4gpu_paged_kv_cache[llama-3.1-8b]
 examples/test_llama.py::test_llm_llama_v1_multiple_lora_1gpu[luotuo_japan-llama-7b-lora_fp16-base_fp16]

From 7568deb2f1a7338ee08197b20a2ac79d6d8808f1 Mon Sep 17 00:00:00 2001
From: Yan Chunwei <328693+Superjomn@users.noreply.github.com>
Date: Wed, 16 Jul 2025 16:05:38 +0800
Subject: [PATCH 33/88] [nvbug/5387226] chore: add propogation for
 trust_remote_code to AutoConfig (#6001)

Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>
---
 tensorrt_llm/llmapi/llm_args.py               |  9 ++++---
 tensorrt_llm/llmapi/llm_utils.py              | 26 +++----------------
 tensorrt_llm/models/automodel.py              |  3 ++-
 tests/microbenchmarks/build_time_benchmark.py |  7 +++--
 tests/unittest/llmapi/test_llm.py             |  4 ++-
 tests/unittest/llmapi/test_llm_utils.py       |  2 +-
 6 files changed, 20 insertions(+), 31 deletions(-)

diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py
index 1b385b6e8fc..f2d15a5d3fd 100644
--- a/tensorrt_llm/llmapi/llm_args.py
+++ b/tensorrt_llm/llmapi/llm_args.py
@@ -1286,7 +1286,8 @@ def validate_model_format_misc(self):
                 'pytorch', '_autodeploy'
         ]:
             # Load parallel_config from the engine.
-            model_format = get_model_format(self.model)
+            model_format = get_model_format(
+                self.model, trust_remote_code=self.trust_remote_code)
 
             if model_format is _ModelFormatKind.TLLM_ENGINE:
                 if self.build_config is not None:
@@ -2083,7 +2084,8 @@ def update_llm_args_with_extra_options(llm_args: Dict,
     return llm_args
 
 
-def get_model_format(model_dir: str) -> _ModelFormatKind:
+def get_model_format(model_dir: str,
+                     trust_remote_code: bool = False) -> _ModelFormatKind:
     ''' Get the format of the model.  '''
     if not (Path(model_dir) / 'config.json').exists():
         raise ValueError(
@@ -2102,7 +2104,8 @@ def get_model_format(model_dir: str) -> _ModelFormatKind:
             PretrainedConfig.from_checkpoint(model_dir)
         else:
             model_format = _ModelFormatKind.HF
-            AutoConfig.from_hugging_face(model_dir)
+            AutoConfig.from_hugging_face(model_dir,
+                                         trust_remote_code=trust_remote_code)
     except Exception as e:
         raise ValueError(
             f"Inferred model format {model_format}, but failed to load config.json: {e}"
diff --git a/tensorrt_llm/llmapi/llm_utils.py b/tensorrt_llm/llmapi/llm_utils.py
index cf2bdb26c14..5ccb62be121 100644
--- a/tensorrt_llm/llmapi/llm_utils.py
+++ b/tensorrt_llm/llmapi/llm_utils.py
@@ -5,7 +5,6 @@
 import tempfile
 import time
 import weakref
-from argparse import Namespace
 from dataclasses import asdict, dataclass, field
 from pathlib import Path
 from typing import Callable, List, Optional, Tuple, Union
@@ -35,7 +34,7 @@
                        LookaheadDecodingConfig, MedusaDecodingConfig,
                        MTPDecodingConfig, NGramDecodingConfig,
                        UserProvidedDecodingConfig, _ModelFormatKind,
-                       _ModelWrapper, _ParallelConfig, get_model_format,
+                       _ModelWrapper, _ParallelConfig,
                        update_llm_args_with_extra_dict,
                        update_llm_args_with_extra_options)
 from .mpi_session import MPINodeState, MpiSession
@@ -315,11 +314,6 @@ def save(
             if tokenizer is not None:
                 tokenizer.save_pretrained(engine_dir)
 
-    @staticmethod
-    def get_model_format(model_dir: str) -> _ModelFormatKind:
-        ''' Get the format of the model.  '''
-        return get_model_format(model_dir)
-
     def _download_hf_model(self):
         ''' Download HF model from third-party model hub like www.modelscope.cn or huggingface.  '''
         model_dir = None
@@ -566,21 +560,6 @@ def _load_engine_buffer(self):
         # Load engine buffer from disk
         self._engine = Engine.from_dir(self._model_dir)
 
-    @staticmethod
-    def load_extra_build_configs_from_engine(
-            model_dir: str) -> Optional[Namespace]:
-        ''' Load the extra build configs from the engine directory, return None if model isn't an engine. '''
-        if ModelLoader.get_model_format(
-                model_dir) is not _ModelFormatKind.TLLM_ENGINE:
-            return None
-
-        with open(Path(model_dir) / "config.json", "r") as f:
-            engine_config = json.load(f)
-
-        build_config = engine_config['build_config']
-        build_config.pop("plugin_config")
-        return Namespace(**build_config)
-
     @staticmethod
     def load_hf_tokenizer(
             model_dir,
@@ -740,7 +719,8 @@ def get_pretrained_config(self) -> PretrainedConfig:
             self._hf_model_dir,
             mapping=self.llm_args.parallel_config.to_mapping(),
             quant_config=self.llm_args.quant_config,
-            dtype=self.llm_args.dtype)
+            dtype=self.llm_args.dtype,
+            trust_remote_code=self.llm_args.trust_remote_code)
 
     def _build_model(self) -> Path:
         model_format = self.llm_args.model_format
diff --git a/tensorrt_llm/models/automodel.py b/tensorrt_llm/models/automodel.py
index c210e58e445..463ae334cad 100644
--- a/tensorrt_llm/models/automodel.py
+++ b/tensorrt_llm/models/automodel.py
@@ -16,9 +16,10 @@ def from_hugging_face(hf_model_or_dir,
                           quant_config: Optional[QuantConfig] = None,
                           **kwargs):
         import transformers
+        trust_remote_code = kwargs.get('trust_remote_code', False)
 
         hf_config = transformers.AutoConfig.from_pretrained(
-            hf_model_or_dir, trust_remote_code=True)
+            hf_model_or_dir, trust_remote_code=trust_remote_code)
 
         if hasattr(hf_config,
                    'architectures') and hf_config.architectures is not None:
diff --git a/tests/microbenchmarks/build_time_benchmark.py b/tests/microbenchmarks/build_time_benchmark.py
index 3313cae1cda..133be635335 100644
--- a/tests/microbenchmarks/build_time_benchmark.py
+++ b/tests/microbenchmarks/build_time_benchmark.py
@@ -221,8 +221,11 @@ def build_from_hf(args,
         quant_output_dir.cleanup()
 
     else:  # fake weights
-        trtllm_config = AutoConfig.from_hugging_face(hf_model_dir, dtype,
-                                                     mapping, quant_config)
+        trtllm_config = AutoConfig.from_hugging_face(hf_model_dir,
+                                                     dtype,
+                                                     mapping,
+                                                     quant_config,
+                                                     trust_remote_code=True)
         trtllm_model = AutoModelForCausalLM.get_trtllm_model_class(
             hf_model_dir)(trtllm_config)
 
diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py
index 77fc911b865..ef644849f25 100644
--- a/tests/unittest/llmapi/test_llm.py
+++ b/tests/unittest/llmapi/test_llm.py
@@ -267,7 +267,9 @@ def test_llm_with_dummy_weights(model_format):
         hf_config = transformers.AutoConfig.from_pretrained(llama_model_path)
         hf_config.save_pretrained(dummy_dir.name)
     else:
-        config = AutoConfig.from_hugging_face(llama_model_path, dtype='float16')
+        config = AutoConfig.from_hugging_face(llama_model_path,
+                                              dtype='float16',
+                                              trust_remote_code=True)
         config.to_json_file(os.path.join(dummy_dir.name, 'config.json'))
     tokenizer = transformers.AutoTokenizer.from_pretrained(llama_model_path)
     tokenizer.save_pretrained(dummy_dir.name)
diff --git a/tests/unittest/llmapi/test_llm_utils.py b/tests/unittest/llmapi/test_llm_utils.py
index 7caa16d7001..5155e158e62 100644
--- a/tests/unittest/llmapi/test_llm_utils.py
+++ b/tests/unittest/llmapi/test_llm_utils.py
@@ -46,7 +46,7 @@ def test_CachedModelLoader():
     engine_dir, _ = model_loader()
     assert engine_dir
     assert engine_dir.exists() and engine_dir.is_dir()
-    model_format = ModelLoader.get_model_format(engine_dir)
+    model_format = get_model_format(engine_dir, trust_remote_code=True)
     assert model_format is _ModelFormatKind.TLLM_ENGINE
 
 

From dda91b51176e3a0591fcbd37858fd19a92c1c230 Mon Sep 17 00:00:00 2001
From: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
Date: Wed, 16 Jul 2025 16:14:25 +0800
Subject: [PATCH 34/88] tests: add QA test cases  (#5959)

Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com>
---
 .../accuracy/references/cnn_dailymail.yaml    |  6 ++++
 .../defs/accuracy/test_llm_api_pytorch.py     | 30 ++++++++++++++++++-
 .../test_lists/qa/examples_test_list.txt      |  4 +++
 .../test_lists/qa/llm_sanity_test.txt         |  7 ++++-
 4 files changed, 45 insertions(+), 2 deletions(-)

diff --git a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
index c85be741469..ddd6589a439 100644
--- a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
+++ b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
@@ -125,6 +125,12 @@ meta-llama/Llama-3.1-8B-Instruct:
   - dtype: float16
     spec_dec_algo: Medusa
     accuracy: 33.663
+  - quant_algo: FP8
+    extra_acc_spec: temperature=0.8,top_p=0.95
+    accuracy: 28.631
+  - quant_algo: FP8
+    extra_acc_spec: beam_width=4
+    accuracy: 31.391
 meta-llama/Llama-3.2-1B:
   - accuracy: 27.427
   - quant_algo: W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index b3ff22e1aca..1d0d0649898 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -218,11 +218,39 @@ def test_fp8_llm_sampler(self):
                 top_p=0.95,
             )
 
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm,
+                          sampling_params=sampling_params,
+                          extra_acc_spec="temperature=0.8,top_p=0.95")
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm,
                           sampling_params=sampling_params,
                           extra_acc_spec="temperature=0.8,top_p=0.95")
 
+    def test_fp8_beam_search(self):
+        model_path = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8"
+        pytorch_config = dict(disable_overlap_scheduler=True)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        max_beam_width = 4
+        sampling_params = SamplingParams(n=max_beam_width,
+                                         best_of=max_beam_width,
+                                         use_beam_search=True)
+
+        llm = LLM(model=model_path,
+                  **pytorch_config,
+                  kv_cache_config=kv_cache_config,
+                  max_beam_width=max_beam_width,
+                  max_batch_size=16,
+                  max_seq_len=1024,
+                  enable_trtllm_sampler=True,
+                  build_config=None)
+
+        with llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm,
+                          sampling_params=sampling_params,
+                          extra_acc_spec="beam_width=4")
+
     def test_eagle3(self):
         pytorch_config = dict(
             disable_overlap_scheduler=True,
@@ -449,7 +477,7 @@ class TestMinistral8BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "mistralai/Ministral-8B-Instruct-2410"
     MODEL_PATH = f"{llm_models_root()}/Ministral-8B-Instruct-2410"
 
-    def test_auto_dtype_gsm8k(self):
+    def test_auto_dtype(self):
         with LLM(self.MODEL_PATH) as llm:
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
diff --git a/tests/integration/test_lists/qa/examples_test_list.txt b/tests/integration/test_lists/qa/examples_test_list.txt
index 167f99d5ea8..3dcfcbac093 100644
--- a/tests/integration/test_lists/qa/examples_test_list.txt
+++ b/tests/integration/test_lists/qa/examples_test_list.txt
@@ -436,6 +436,7 @@ accuracy/test_llm_api.py::TestMixtral8x7B::test_smooth_quant_tp2pp2
 accuracy/test_llm_api.py::TestMixtral8x7BInstruct::test_awq_tp2
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]
@@ -483,6 +484,8 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_tr
 accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8
+accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype
+accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8
 
 test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-]
 test_e2e.py::test_llama_e2e[use_py_session-remove_input_padding-]
@@ -585,6 +588,7 @@ disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_att
 disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp[DeepSeek-V3-Lite-fp8]
 disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
+disaggregated/test_disaggregated.py::test_disaggregated_trtllm_sampler[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_workers.py::test_workers_conditional_disaggregation[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0]
diff --git a/tests/integration/test_lists/qa/llm_sanity_test.txt b/tests/integration/test_lists/qa/llm_sanity_test.txt
index 91a71d1c23f..19bf09b8b5e 100644
--- a/tests/integration/test_lists/qa/llm_sanity_test.txt
+++ b/tests/integration/test_lists/qa/llm_sanity_test.txt
@@ -4,6 +4,8 @@ accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype
 accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[True]
 accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8
+accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype
+accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp4]
@@ -18,6 +20,7 @@ accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[llguidance]
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar]
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[llguidance]
@@ -35,7 +38,7 @@ accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-cuda_graph=False]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
-accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype_gsm8k
+accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8
 accuracy/test_llm_api_pytorch.py::TestMinitron4BBaseInstruct::test_fp8_prequantized
 accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype
@@ -59,6 +62,8 @@ disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_att
 disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
 disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8[DeepSeek-V3-Lite-fp8]
 disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0]
+disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
+disaggregated/test_disaggregated.py::test_disaggregated_trtllm_sampler[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun_trt_backend[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]

From 10349b54dfd3884a2ad1176042e0f8fa9e6666ef Mon Sep 17 00:00:00 2001
From: Martin Marciniszyn Mehringer
 <11665257+MartinMarciniszyn@users.noreply.github.com>
Date: Wed, 16 Jul 2025 10:35:27 +0200
Subject: [PATCH 35/88] fix: Add $HOME/.local/bin to PATH when running docker
 in local user mode (#6062)

Signed-off-by: Martin Marciniszyn Mehringer <11665257+MartinMarciniszyn@users.noreply.github.com>
---
 docker/Dockerfile.user | 2 ++
 docker/Makefile        | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/docker/Dockerfile.user b/docker/Dockerfile.user
index f6f912ddd3a..3d1ff9e52fd 100644
--- a/docker/Dockerfile.user
+++ b/docker/Dockerfile.user
@@ -21,3 +21,5 @@ RUN (getent group ${GROUP_ID} || groupadd --gid ${GROUP_ID} ${GROUP_NAME}) && \
     rm -rf /var/lib/apt/lists/*
 
 USER ${USER_NAME}
+
+ENV PATH="/home/${USER_NAME}/.local/bin:$PATH"
diff --git a/docker/Makefile b/docker/Makefile
index 71d3b6511d5..926c8cea1aa 100644
--- a/docker/Makefile
+++ b/docker/Makefile
@@ -152,11 +152,11 @@ endif
     		$(if $(and $(filter 1,$(LOCAL_USER)),$(shell [ -w "$(USER_CACHE_DIR)" ] && echo 1)),--volume $(USER_CACHE_DIR):/home/$(USER_NAME)/.cache:rw) \
     		--env "CCACHE_DIR=$(CCACHE_DIR)" \
     		--env "CCACHE_BASEDIR=$(CODE_DIR)" \
-			--env "CONAN_HOME=$(CONAN_DIR)" \
+    		--env "CONAN_HOME=$(CONAN_DIR)" \
     		--workdir $(WORK_DIR) \
     		--hostname $(shell hostname)-$* \
     		--name $(CONTAINER_NAME)-$*-$(USER_NAME) \
-			--tmpfs /tmp:exec \
+    		--tmpfs /tmp:exec \
     		$(IMAGE_WITH_TAG)$(IMAGE_TAG_SUFFIX) $(RUN_CMD)
 
 devel_%: STAGE = devel

From a02606a9e2ba0a3639f7a07cb873992d1f99368f Mon Sep 17 00:00:00 2001
From: Yan Chunwei <328693+Superjomn@users.noreply.github.com>
Date: Wed, 16 Jul 2025 16:42:59 +0800
Subject: [PATCH 36/88] [TRTLLM-5530][BREAKING CHANGE] refactor: unify
 KvCacheConfig in LLM class for pytorch backend (#5752)

Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>
---
 examples/llm-api/quickstart_advanced.py       |   2 +-
 tensorrt_llm/bench/benchmark/utils/general.py |  12 +-
 .../bench/dataclasses/configuration.py        |   1 -
 tensorrt_llm/bench/dataclasses/reporting.py   |  14 +-
 tensorrt_llm/llmapi/llm_args.py               |  65 ++++++--
 tensorrt_llm/llmapi/llm_utils.py              |   3 +
 .../defs/accuracy/test_llm_api_pytorch.py     | 144 +++---------------
 .../test_disaggregated_single_gpu.py          |   6 +-
 .../defs/perf/pytorch_model_config.py         |   7 +
 .../_torch/modeling/test_modeling_deepseek.py |   4 +-
 .../multi_gpu_modeling/test_deepseek.py       |   1 -
 .../references_committed/llm.yaml             |   7 -
 12 files changed, 108 insertions(+), 158 deletions(-)

diff --git a/examples/llm-api/quickstart_advanced.py b/examples/llm-api/quickstart_advanced.py
index 9065bd2f00f..90d527562a1 100644
--- a/examples/llm-api/quickstart_advanced.py
+++ b/examples/llm-api/quickstart_advanced.py
@@ -149,6 +149,7 @@ def setup_llm(args):
     kv_cache_config = KvCacheConfig(
         enable_block_reuse=not args.disable_kv_cache_reuse,
         free_gpu_memory_fraction=args.kv_cache_fraction,
+        dtype=args.kv_cache_dtype,
     )
 
     spec_decode_algo = args.spec_decode_algo.upper(
@@ -194,7 +195,6 @@ def setup_llm(args):
         model=args.model_dir,
         backend='pytorch',
         disable_overlap_scheduler=args.disable_overlap_scheduler,
-        kv_cache_dtype=args.kv_cache_dtype,
         kv_cache_config=kv_cache_config,
         attn_backend=args.attention_backend,
         cuda_graph_config=cuda_graph_config,
diff --git a/tensorrt_llm/bench/benchmark/utils/general.py b/tensorrt_llm/bench/benchmark/utils/general.py
index 0073ea1d44f..bc72b5e1467 100755
--- a/tensorrt_llm/bench/benchmark/utils/general.py
+++ b/tensorrt_llm/bench/benchmark/utils/general.py
@@ -88,12 +88,14 @@ def get_settings(params: dict, dataset_metadata: DatasetMetadata, model: str,
     enable_chunked_prefill = params.get("enable_chunked_prefill", False)
 
     kv_cache_dtype = "auto"
+    kv_cache_config = {}
     if extra_llm_api_options:
         with open(extra_llm_api_options, 'r') as f:
             llm_args_dict = yaml.safe_load(f)
-
-        if "kv_cache_dtype" in llm_args_dict:
-            kv_cache_dtype = llm_args_dict["kv_cache_dtype"]
+            kv_cache_config = llm_args_dict.get("kv_cache_config", {
+                "dtype": "auto",
+            })
+            kv_cache_dtype = kv_cache_config.get("dtype", "auto")
 
         enable_chunked_prefill = llm_args_dict.get("enable_chunked_prefill",
                                                    enable_chunked_prefill)
@@ -158,9 +160,11 @@ def get_settings(params: dict, dataset_metadata: DatasetMetadata, model: str,
         "max_batch_size": max_batch_size
     }
 
+    kv_cache_config["dtype"] = kv_cache_dtype
+
     pyt_options = {
         "cuda_graph_config": cuda_graph_config,
-        "kv_cache_dtype": kv_cache_dtype,
+        "kv_cache_config": kv_cache_config,
     }
 
     backend = params.get("backend", "pytorch")
diff --git a/tensorrt_llm/bench/dataclasses/configuration.py b/tensorrt_llm/bench/dataclasses/configuration.py
index 0d352fa068c..77f80632088 100755
--- a/tensorrt_llm/bench/dataclasses/configuration.py
+++ b/tensorrt_llm/bench/dataclasses/configuration.py
@@ -112,7 +112,6 @@ def get_pytorch_perf_config(self) -> PyTorchConfig:
     def get_autodeploy_perf_config(self) -> Dict:
         AutoDeployPerfConfig = dict
         ad_config = AutoDeployPerfConfig()
-        ad_config["kv_cache_dtype"] = "auto"
         ad_config["attn_backend"] = "flashinfer"
         return ad_config
 
diff --git a/tensorrt_llm/bench/dataclasses/reporting.py b/tensorrt_llm/bench/dataclasses/reporting.py
index d994000d6d0..476ff50ec2d 100755
--- a/tensorrt_llm/bench/dataclasses/reporting.py
+++ b/tensorrt_llm/bench/dataclasses/reporting.py
@@ -11,6 +11,7 @@
 from tensorrt_llm.bench.dataclasses.statistics import (BenchmarkStatistics,
                                                        PercentileStats,
                                                        RequestRecord)
+from tensorrt_llm.llmapi import KvCacheConfig
 from tensorrt_llm.logger import Logger
 from tensorrt_llm.models.modeling_utils import SpeculativeDecodingMode
 
@@ -275,8 +276,17 @@ def get_statistics_dict(self) -> Dict[str, Any]:
             model = self.rt_cfg.model_path or self.rt_cfg.model
             model_config = ModelConfig.from_pretrained(model,
                                                        trust_remote_code=True)
-            validate_and_set_kv_cache_quant(model_config,
-                                            self.kwargs["kv_cache_dtype"])
+            kv_cache_config = self.kwargs.get("kv_cache_config",
+                                              KvCacheConfig())
+            if isinstance(kv_cache_config, KvCacheConfig):
+                kv_cache_dtype = kv_cache_config.dtype
+            elif isinstance(kv_cache_config, dict):
+                kv_cache_dtype = kv_cache_config.get("dtype", "auto")
+            else:
+                raise ValueError(
+                    f"Invalid kv_cache_config type: {type(kv_cache_config)}.")
+
+            validate_and_set_kv_cache_quant(model_config, kv_cache_dtype)
 
             stats_dict["engine"] |= {
                 "backend":
diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py
index f2d15a5d3fd..745aa739a86 100644
--- a/tensorrt_llm/llmapi/llm_args.py
+++ b/tensorrt_llm/llmapi/llm_args.py
@@ -821,6 +821,10 @@ class KvCacheConfig(BaseModel, PybindMirror):
     use_uvm: bool = Field(default=False,
                           description="Whether to use UVM for the KV cache.")
 
+    # This is a pure python field, not a pybind field. It is only for the Pytorch backend.
+    dtype: str = Field(default="auto",
+                       description="The data type to use for the KV cache.")
+
     def _to_pybind(self):
         return _KvCacheConfig(
             enable_block_reuse=self.enable_block_reuse,
@@ -1024,10 +1028,6 @@ class BaseLlmArgs(BaseModel):
     lora_config: Optional[LoraConfig] = Field(
         default=None, description="LoRA configuration for the model.")
 
-    # Quantization and calibration configurations
-    quant_config: Optional[QuantConfig] = Field(
-        default=None, description="Quantization config.", validate_default=True)
-
     # Several options from ExecutorConfig, expanded here for less hierarchy
     kv_cache_config: KvCacheConfig = Field(default_factory=KvCacheConfig,
                                            description="KV cache config.")
@@ -1208,13 +1208,6 @@ def validate_dtype(cls, v, info):
                 raise RuntimeError("Pre SM 80 GPUs do not support bfloat16")
         return v
 
-    @field_validator("quant_config", mode='before')
-    @classmethod
-    def validate_quant_config(cls, v, info):
-        if v is None:
-            v = QuantConfig()
-        return v
-
     @field_validator("gpus_per_node", mode='before')
     @classmethod
     def validate_gpus_per_node(cls, v, info):
@@ -1657,6 +1650,10 @@ class TrtLlmArgs(BaseLlmArgs):
     calib_config: Optional[CalibConfig] = Field(
         default=None, description="Calibration config.", validate_default=True)
 
+    # Quantization and calibration configurations
+    quant_config: Optional[QuantConfig] = Field(
+        default=None, description="Quantization config.", validate_default=True)
+
     embedding_parallel_mode: str = Field(
         default='SHARDING_ALONG_VOCAB',
         description="The embedding parallel mode.")
@@ -1694,6 +1691,13 @@ def init_calib_config(cls, v):
             return CalibConfig()
         return v
 
+    @field_validator("quant_config", mode='before')
+    @classmethod
+    def validate_quant_config(cls, v, info):
+        if v is None:
+            v = QuantConfig()
+        return v
+
     @model_validator(mode="after")
     def setup_embedding_parallel_mode(self):
         if self.embedding_parallel_mode == 'NONE':
@@ -1738,6 +1742,11 @@ def validate_enable_build_cache(self):
                 f"Invalid build_cache_config: {self.enable_build_cache}")
         return self
 
+    @model_validator(mode="after")
+    def validate_kv_cache_dtype(self):
+        assert self.kv_cache_config.dtype == "auto", "KvCacheConfig.dtype is not supported by the TensorRT backend."
+        return self
+
 
 class LoadFormat(Enum):
     AUTO = 0
@@ -1811,9 +1820,6 @@ class TorchLlmArgs(BaseLlmArgs):
         "If true, will use the TRTLLM sampler instead of the PyTorch sampler. The TRTLLM sampler has a wide coverage of sampling strategies."
     )
 
-    kv_cache_dtype: str = Field(default="auto",
-                                description="Data type for KV cache.")
-
     enable_iter_perf_stats: bool = Field(
         default=False, description="Enable iteration performance statistics.")
 
@@ -1867,6 +1873,19 @@ class TorchLlmArgs(BaseLlmArgs):
                 'MNNVL']] = Field(default='AUTO',
                                   description="Allreduce strategy to use.")
 
+    # PrivateVars
+    _quant_config: Optional[QuantConfig] = PrivateAttr(default=None)
+
+    @property
+    def quant_config(self) -> QuantConfig:
+        if self._quant_config is None:
+            self._quant_config = QuantConfig()
+        return self._quant_config
+
+    @quant_config.setter
+    def quant_config(self, value: QuantConfig):
+        self._quant_config = value
+
     # TODO: remove backend later
     @field_validator('backend', mode='before')
     def init_backend(cls, v):
@@ -1994,6 +2013,22 @@ def validate_cuda_graph_config(self) -> 'TorchLlmArgs':
 
         return self
 
+    @model_validator(mode='after')
+    def sync_quant_config_with_kv_cache_config_dtype(self) -> 'TorchLlmArgs':
+        if self.kv_cache_config is None:
+            return self
+
+        assert self.quant_config is not None
+        if self.kv_cache_config.dtype == "auto":
+            return self
+        elif self.kv_cache_config.dtype == 'fp8':
+            self.quant_config.kv_cache_quant_algo = QuantAlgo.FP8
+        else:
+            logger.warning(
+                f"Cannot sync quant_config.kv_cache_quant_algo with kv_cache_config.dtype of {self.kv_cache_config.dtype}, "
+                "please update the validator")
+        return self
+
     # TODO: Remove this after the PyTorch backend is fully migrated to TorchLlmArgs from ExecutorConfig
     def get_pytorch_backend_config(self) -> "PyTorchConfig":
         from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
@@ -2017,7 +2052,7 @@ def get_pytorch_backend_config(self) -> "PyTorchConfig":
             moe_backend=self.moe_config.backend,
             enable_mixed_sampler=self.enable_mixed_sampler,
             enable_trtllm_sampler=self.enable_trtllm_sampler,
-            kv_cache_dtype=self.kv_cache_dtype,
+            kv_cache_dtype=self.kv_cache_config.dtype,
             enable_iter_perf_stats=self.enable_iter_perf_stats,
             enable_iter_req_stats=self.enable_iter_req_stats,
             print_iter_log=self.print_iter_log,
diff --git a/tensorrt_llm/llmapi/llm_utils.py b/tensorrt_llm/llmapi/llm_utils.py
index 5ccb62be121..31f853f3705 100644
--- a/tensorrt_llm/llmapi/llm_utils.py
+++ b/tensorrt_llm/llmapi/llm_utils.py
@@ -401,6 +401,9 @@ def _update_from_hf_quant_config(self) -> bool:
                 logger.info(f"Setting {key}={value} from HF quant config.")
                 setattr(quant_config, key, value)
 
+            # Update the quant_config in llm_args for pytorch
+            self.llm_args.quant_config = quant_config
+
             return True
 
         hf_config_path = f"{self._model_dir}/config.json"
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 1d0d0649898..701461b19d1 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -23,7 +23,6 @@
                                  KvCacheConfig, MoeConfig, MTPDecodingConfig,
                                  NGramDecodingConfig, SamplingParams,
                                  TorchCompileConfig)
-from tensorrt_llm.models.modeling_utils import QuantConfig
 from tensorrt_llm.quantization import QuantAlgo
 
 from ..conftest import (llm_models_root, parametrize_with_ids,
@@ -51,7 +50,6 @@ def test_nvfp4(self):
         model_path = f"{llm_models_root()}/nvfp4-quantized/Meta-Llama-3.1-8B"
         with LLM(model_path) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
-            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)
@@ -67,7 +65,6 @@ def test_nvfp4_streaming(self, stream_interval):
         with LLM(f"{llm_models_root()}/nvfp4-quantized/Meta-Llama-3.1-8B",
                  stream_interval=stream_interval) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
-            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
             assert llm.args.stream_interval == stream_interval
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm, streaming=True)
@@ -143,7 +140,6 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, attn_backend,
     @parametrize_with_ids("attn_backend", ["TRTLLM", "FLASHINFER"])
     @parametrize_with_ids("fp8kv", [False, True])
     def test_fp8(self, fp8kv, attn_backend, torch_compile):
-        quant_config = QuantConfig(QuantAlgo.FP8)
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True) if torch_compile else None
         pytorch_config = dict(
@@ -154,15 +150,11 @@ def test_fp8(self, fp8kv, attn_backend, torch_compile):
             disable_overlap_scheduler=torch_compile,
         )
         if fp8kv:
-            quant_config.kv_cache_quant_algo = QuantAlgo.FP8
-            pytorch_config["kv_cache_dtype"] = "fp8"
+            pytorch_config["kv_cache_config"] = KvCacheConfig(dtype="fp8")
         with LLM(
                 f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8",
-                quant_config=quant_config,
                 **pytorch_config) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
-            if fp8kv:
-                assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
@@ -181,7 +173,6 @@ def test_fp8_4gpus(self, tp_size, pp_size, fp8kv, attn_backend,
                 "Pipeline parallel with torch.compile is not supported yet.\n"
                 "Issue: Unfusing flashinfer_fused_add_rmsnorm causes outputs to be "
                 "discarded at graph breaks.")
-        quant_config = QuantConfig(QuantAlgo.FP8)
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True) if torch_compile else None
         pytorch_config = dict(
@@ -192,17 +183,13 @@ def test_fp8_4gpus(self, tp_size, pp_size, fp8kv, attn_backend,
             disable_overlap_scheduler=torch_compile,
         )
         if fp8kv:
-            quant_config.kv_cache_quant_algo = QuantAlgo.FP8
-            pytorch_config["kv_cache_dtype"] = "fp8"
+            pytorch_config["kv_cache_config"] = KvCacheConfig(dtype="fp8")
         with LLM(
                 f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8",
                 tensor_parallel_size=tp_size,
                 pipeline_parallel_size=pp_size,
-                quant_config=quant_config,
                 **pytorch_config) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
-            if fp8kv:
-                assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
@@ -336,7 +323,6 @@ def test_fp8_prequantized(self):
         model_path = f"{llm_models_root()}/llama-3.2-models/Llama-3.2-1B-FP8"
         with LLM(model_path) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
-            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
 
@@ -358,7 +344,6 @@ def test_fp8_prequantized(self):
         model_path = f"{llm_models_root()}/llama-3.2-models/Llama-3.2-3B-Instruct-FP8"
         with LLM(model_path) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
-            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)
@@ -401,7 +386,6 @@ def test_nvfp4_tp4(self):
         model_path = f"{llm_models_root()}/modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4"
         with LLM(model_path, tensor_parallel_size=4) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
-            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
@@ -583,7 +567,6 @@ def test_fp8_tp2(self):
         model_path = f"{llm_models_root()}/modelopt-hf-model-hub/Mixtral-8x7B-Instruct-v0.1-fp8"
         with LLM(model_path, tensor_parallel_size=2) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
-            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)
@@ -595,7 +578,6 @@ def test_nvfp4_tp2(self):
         model_path = f"{llm_models_root()}/modelopt-hf-model-hub/Mixtral-8x7B-Instruct-v0.1-fp4"
         with LLM(model_path, tensor_parallel_size=2) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
-            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)
@@ -716,11 +698,8 @@ def test_fp8_block_scales(self, mtp, fp8kv, attention_dp, cuda_graph,
             torch_compile_config=torch_compile_config,
         )
 
-        quant_config = QuantConfig()
-        quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
         if fp8kv:
-            quant_config.kv_cache_quant_algo = QuantAlgo.FP8
-            pytorch_config["kv_cache_dtype"] = "fp8"
+            kv_cache_config.dtype = "fp8"
 
         mtp_config = None
         mtp_nextn = 2
@@ -733,13 +712,10 @@ def test_fp8_block_scales(self, mtp, fp8kv, attention_dp, cuda_graph,
         with LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
                  kv_cache_config=kv_cache_config,
                  **pytorch_config,
-                 quant_config=quant_config,
                  enable_attention_dp=attention_dp,
                  speculative_config=mtp_config) as llm:
 
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
-            if fp8kv:
-                assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
 
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
@@ -775,11 +751,8 @@ def test_cute_dsl_fp8_block_scales(
             moe_config=MoeConfig(backend="CUTEDSL"),
         )
 
-        quant_config = QuantConfig()
-        quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
         if fp8kv:
-            quant_config.kv_cache_quant_algo = QuantAlgo.FP8
-            pytorch_config["kv_cache_dtype"] = "fp8"
+            kv_cache_config.dtype = "fp8"
 
         mtp_config = None
         if mtp_nextn > 0:
@@ -789,14 +762,11 @@ def test_cute_dsl_fp8_block_scales(
                 f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
                 kv_cache_config=kv_cache_config,
                 **pytorch_config,
-                quant_config=quant_config,
                 enable_attention_dp=attention_dp,
                 speculative_config=mtp_config,
         ) as llm:
 
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
-            if fp8kv:
-                assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
 
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
@@ -837,14 +807,11 @@ def test_fp8_block_scales_cuda_graph_padding_4gpus(self, mtp_nextn,
             disable_overlap_scheduler=False,
             cuda_graph_config=CudaGraphConfig(enable_padding=True),
         )
-        quant_config = QuantConfig()
-        quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
 
         with LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
                  tensor_parallel_size=4,
                  kv_cache_config=kv_cache_config,
                  **pytorch_config,
-                 quant_config=quant_config,
                  enable_attention_dp=attention_dp,
                  speculative_config=mtp_config) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
@@ -886,11 +853,8 @@ def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
             torch_compile_config=torch_compile_config,
         )
 
-        quant_config = QuantConfig()
-        quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
         if fp8kv:
-            quant_config.kv_cache_quant_algo = QuantAlgo.FP8
-            pytorch_config["kv_cache_dtype"] = "fp8"
+            kv_cache_config.dtype = "fp8"
 
         mtp_config = None
         if mtp_nextn > 0:
@@ -902,13 +866,10 @@ def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                  moe_expert_parallel_size=ep_size,
                  kv_cache_config=kv_cache_config,
                  **pytorch_config,
-                 quant_config=quant_config,
                  enable_attention_dp=attention_dp,
                  speculative_config=mtp_config) as llm:
 
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
-            if fp8kv:
-                assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
 
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
@@ -955,11 +916,8 @@ def test_cute_dsl_fp8_block_scales_4gpus(
             moe_config=MoeConfig(backend="CUTEDSL"),
         )
 
-        quant_config = QuantConfig()
-        quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
         if fp8kv:
-            quant_config.kv_cache_quant_algo = QuantAlgo.FP8
-            pytorch_config["kv_cache_dtype"] = "fp8"
+            kv_cache_config.dtype = "fp8"
 
         mtp_config = None
         if mtp_nextn > 0:
@@ -972,13 +930,10 @@ def test_cute_dsl_fp8_block_scales_4gpus(
                 moe_expert_parallel_size=ep_size,
                 kv_cache_config=kv_cache_config,
                 **pytorch_config,
-                quant_config=quant_config,
                 enable_attention_dp=attention_dp,
                 speculative_config=mtp_config,
         ) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
-            if fp8kv:
-                assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
 
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
@@ -1045,23 +1000,20 @@ def test_nvfp4_4gpus_online_eplb(self, fp8kv):
         num_slots = 80
         eplb_config = MoeLoadBalancerConfig(num_slots=num_slots,
                                             layer_updates_per_iter=2)
-        pytorch_backend_options = dict(cuda_graph_config=CudaGraphConfig(),
-                                       moe_config=MoeConfig(
-                                           backend="WIDEEP",
-                                           load_balancer=eplb_config))
-        quant_config = QuantConfig()
-        quant_config.quant_algo = QuantAlgo.NVFP4
+        pytorch_config = dict(cuda_graph_config=CudaGraphConfig(),
+                              moe_config=MoeConfig(backend="WIDEEP",
+                                                   load_balancer=eplb_config))
         if fp8kv:
-            quant_config.kv_cache_quant_algo = QuantAlgo.FP8
-            pytorch_backend_options["kv_cache_dtype"] = "fp8"
+            kv_cache_config.dtype = "fp8"
 
-        with LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only",
-                 tensor_parallel_size=4,
-                 moe_expert_parallel_size=4,
-                 kv_cache_config=kv_cache_config,
-                 **pytorch_backend_options,
-                 enable_attention_dp=True,
-                 quant_config=quant_config) as llm:
+        with LLM(
+                f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only",
+                tensor_parallel_size=4,
+                moe_expert_parallel_size=4,
+                kv_cache_config=kv_cache_config,
+                **pytorch_config,
+                enable_attention_dp=True,
+        ) as llm:
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
@@ -1095,21 +1047,15 @@ def test_nvfp4(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler,
         if mtp_nextn > 0:
             mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
 
-        quant_config = QuantConfig()
-        quant_config.quant_algo = QuantAlgo.NVFP4
         if fp8kv:
-            quant_config.kv_cache_quant_algo = QuantAlgo.FP8
-            pytorch_config["kv_cache_dtype"] = "fp8"
+            kv_cache_config.dtype = "fp8"
 
         with LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only_mtp",
                  kv_cache_config=kv_cache_config,
                  **pytorch_config,
-                 quant_config=quant_config,
                  enable_attention_dp=attention_dp,
                  speculative_config=mtp_config) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
-            if fp8kv:
-                assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
 
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
@@ -1157,11 +1103,8 @@ def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
         if mtp_nextn > 0:
             mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
 
-        quant_config = QuantConfig()
-        quant_config.quant_algo = QuantAlgo.NVFP4
         if fp8kv:
-            quant_config.kv_cache_quant_algo = QuantAlgo.FP8
-            pytorch_config["kv_cache_dtype"] = "fp8"
+            kv_cache_config.dtype = "fp8"
 
         with LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only_mtp",
                  tensor_parallel_size=tp_size,
@@ -1169,12 +1112,9 @@ def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
                  moe_expert_parallel_size=ep_size,
                  kv_cache_config=kv_cache_config,
                  **pytorch_config,
-                 quant_config=quant_config,
                  enable_attention_dp=attention_dp,
                  speculative_config=mtp_config) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
-            if fp8kv:
-                assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
 
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
@@ -1215,21 +1155,13 @@ def test_no_kv_cache_reuse(self, quant_dtype, mtp_nextn, fp8kv,
 
         if quant_dtype == "none":
             assert not fp8kv
-            quant_config = None
         else:
-            quant_config = QuantConfig()
-            if quant_dtype == "fp8":
-                quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
-            elif quant_dtype == "nvfp4":
-                quant_config.quant_algo = QuantAlgo.NVFP4
             if fp8kv:
-                quant_config.kv_cache_quant_algo = QuantAlgo.FP8
-                pytorch_config["kv_cache_dtype"] = "fp8"
+                kv_cache_config.dtype = "fp8"
 
         with LLM(model_path,
                  kv_cache_config=kv_cache_config,
                  **pytorch_config,
-                 quant_config=quant_config,
                  enable_attention_dp=attention_dp,
                  speculative_config=mtp_config) as llm:
             if quant_dtype == "fp8":
@@ -1237,8 +1169,6 @@ def test_no_kv_cache_reuse(self, quant_dtype, mtp_nextn, fp8kv,
             elif quant_dtype == "nvfp4":
                 assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
 
-            if fp8kv:
-                assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
@@ -1275,23 +1205,15 @@ def test_chunked_prefill(self, quant_dtype, kv_cache_reuse, fp8kv,
 
         if quant_dtype == "none":
             assert not fp8kv
-            quant_config = None
         else:
-            quant_config = QuantConfig()
-            if quant_dtype == "fp8":
-                quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
-            elif quant_dtype == "nvfp4":
-                quant_config.quant_algo = QuantAlgo.NVFP4
             if fp8kv:
-                quant_config.kv_cache_quant_algo = QuantAlgo.FP8
-                pytorch_config["kv_cache_dtype"] = "fp8"
+                kv_cache_config.dtype = "fp8"
 
         with LLM(model_path,
                  kv_cache_config=kv_cache_config,
                  enable_chunked_prefill=True,
                  max_num_tokens=512,
                  **pytorch_config,
-                 quant_config=quant_config,
                  enable_attention_dp=True,
                  speculative_config=mtp_config) as llm:
 
@@ -1300,9 +1222,6 @@ def test_chunked_prefill(self, quant_dtype, kv_cache_reuse, fp8kv,
             elif quant_dtype == "nvfp4":
                 assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
 
-            if fp8kv:
-                assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
-
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
@@ -1388,11 +1307,8 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
             moe_config=MoeConfig(backend=moe_backend))
 
-        quant_config = QuantConfig()
-        quant_config.quant_algo = QuantAlgo.NVFP4
         if fp8kv:
-            quant_config.kv_cache_quant_algo = QuantAlgo.FP8
-            pytorch_config["kv_cache_dtype"] = "fp8"
+            kv_cache_config.dtype = "fp8"
 
         mtp_config = None
         if mtp_nextn > 0:
@@ -1404,14 +1320,11 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                  moe_expert_parallel_size=ep_size,
                  kv_cache_config=kv_cache_config,
                  **pytorch_config,
-                 quant_config=quant_config,
                  enable_attention_dp=attention_dp,
                  speculative_config=mtp_config) as llm:
 
             assert llm.args.moe_backend == moe_backend
             assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
-            if fp8kv:
-                assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
 
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
@@ -1438,11 +1351,8 @@ def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
         )
 
-        quant_config = QuantConfig()
-        quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
         if fp8kv:
-            quant_config.kv_cache_quant_algo = QuantAlgo.FP8
-            pytorch_config["kv_cache_dtype"] = "fp8"
+            kv_cache_config.dtype = "fp8"
 
         mtp_config = None
         if mtp_nextn > 0:
@@ -1454,12 +1364,9 @@ def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                  moe_expert_parallel_size=ep_size,
                  kv_cache_config=kv_cache_config,
                  **pytorch_config,
-                 quant_config=quant_config,
                  enable_attention_dp=attention_dp,
                  speculative_config=mtp_config) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
-            if fp8kv:
-                assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
 
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
@@ -1549,7 +1456,6 @@ def test_fp8_prequantized(self):
         model_path = f"{llm_models_root()}/Llama-3.1-Nemotron-Nano-8B-v1-FP8"
         with LLM(model_path) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
-            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
@@ -1601,7 +1507,6 @@ def test_fp8_prequantized(self, cuda_graph, tp_size, pp_size, ep_size):
                  kv_cache_config=KvCacheConfig(
                      free_gpu_memory_fraction=0.85)) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
-            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
@@ -1634,7 +1539,6 @@ def test_reasoning_fp8_prequantized(self):
                  kv_cache_config=kv_cache_config,
                  max_batch_size=256) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
-            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
diff --git a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py
index 0a392a575a4..540313cfdff 100644
--- a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py
+++ b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py
@@ -110,14 +110,12 @@ def verify_disaggregated(model, generation_overlap, enable_cuda_graph, prompt,
     worker_pytorch_configs.append(
         dict(
             disable_overlap_scheduler=True,
-            kv_cache_dtype="auto",
             cuda_graph_config=CudaGraphConfig() if enable_cuda_graph else None))
 
     # Generation worker
     worker_pytorch_configs.append(
         dict(
             disable_overlap_scheduler=not generation_overlap,
-            kv_cache_dtype="auto",
             cuda_graph_config=CudaGraphConfig() if enable_cuda_graph else None))
 
     kv_cache_configs = [KvCacheConfig(max_tokens=2048 * 8) for _ in range(2)]
@@ -233,18 +231,16 @@ def test_disaggregated_llama_context_capacity(model, enable_cuda_graph,
     worker_pytorch_configs.append(
         dict(
             disable_overlap_scheduler=True,
-            kv_cache_dtype="auto",
             cuda_graph_config=CudaGraphConfig() if enable_cuda_graph else None))
 
     # Generation worker
     worker_pytorch_configs.append(
         dict(
             disable_overlap_scheduler=not generation_overlap,
-            kv_cache_dtype="auto",
             cuda_graph_config=CudaGraphConfig() if enable_cuda_graph else None))
 
     kv_cache_configs = [
-        KvCacheConfig(max_tokens=128, enable_block_reuse=False)
+        KvCacheConfig(max_tokens=128, enable_block_reuse=False, dtype="auto")
         for _ in range(2)
     ]
     model_names = [model_path(model) for _ in range(2)]
diff --git a/tests/integration/defs/perf/pytorch_model_config.py b/tests/integration/defs/perf/pytorch_model_config.py
index 4c0ef184093..23ccd0f1841 100644
--- a/tests/integration/defs/perf/pytorch_model_config.py
+++ b/tests/integration/defs/perf/pytorch_model_config.py
@@ -17,6 +17,8 @@
 Model pytorch yaml config for trtllm-bench perf tests
 """
 
+from tensorrt_llm.llmapi import KvCacheConfig
+
 
 def recursive_update(d, u):
     for k, v in u.items():
@@ -186,4 +188,9 @@ def get_model_yaml_config(model_label: str,
         }
         base_config.update(lora_config)
 
+    kv_cache_config = base_config.get('kv_cache_config', KvCacheConfig())
+    if 'kv_cache_dtype' in base_config:
+        kv_cache_config.dtype = base_config.pop('kv_cache_dtype', 'auto')
+        base_config.update({'kv_cache_config': kv_cache_config})
+
     return base_config
diff --git a/tests/unittest/_torch/modeling/test_modeling_deepseek.py b/tests/unittest/_torch/modeling/test_modeling_deepseek.py
index e5cf9680bbf..ad242f6b28c 100644
--- a/tests/unittest/_torch/modeling/test_modeling_deepseek.py
+++ b/tests/unittest/_torch/modeling/test_modeling_deepseek.py
@@ -68,7 +68,6 @@ def test_deepseek_trtllmgen(model_name):
 
     pytorch_config = dict(
         disable_overlap_scheduler=True,
-        kv_cache_dtype="auto",
         attn_backend="TRTLLM",
         load_format="dummy",
         moe_config=MoeConfig(backend="TRTLLM"),
@@ -89,7 +88,8 @@ def test_deepseek_trtllmgen(model_name):
               moe_tensor_parallel_size=-1,
               enable_attention_dp=False,
               speculative_config=spec_config,
-              kv_cache_config=KvCacheConfig(enable_block_reuse=False,
+              kv_cache_config=KvCacheConfig(dtype="auto",
+                                            enable_block_reuse=False,
                                             free_gpu_memory_fraction=0.4))
 
     sampling_params = SamplingParams(max_tokens=20)
diff --git a/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py b/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py
index 5d2a8b71374..5a38f0d0788 100644
--- a/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py
+++ b/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py
@@ -63,7 +63,6 @@ def test_deepseek_streaming(model_name, backend, quant, tp_size):
 
     pytorch_config = dict(
         disable_overlap_scheduler=True,
-        kv_cache_dtype="auto",
         attn_backend=backend,
     )
     moe_config = MoeConfig(max_num_tokens=moe_max_num_tokens)
diff --git a/tests/unittest/api_stability/references_committed/llm.yaml b/tests/unittest/api_stability/references_committed/llm.yaml
index 4201b190a2f..66fbdabfc5d 100644
--- a/tests/unittest/api_stability/references_committed/llm.yaml
+++ b/tests/unittest/api_stability/references_committed/llm.yaml
@@ -57,13 +57,6 @@ methods:
       guided_decoding_backend:
         annotation: Optional[Literal["xgrammar", "llguidance"]]
         default: null
-      # Quantization and calibration
-      quant_config:
-        annotation: Optional[tensorrt_llm.models.modeling_utils.QuantConfig]
-        default: null
-      calib_config:
-        annotation: Optional[tensorrt_llm.llmapi.llm_utils.CalibConfig]
-        default: null
       # Speculative decoding
       speculative_config:
         annotation: Union[tensorrt_llm.llmapi.llm_args.DraftTargetDecodingConfig, tensorrt_llm.llmapi.llm_args.EagleDecodingConfig,tensorrt_llm.llmapi.llm_args.LookaheadDecodingConfig, tensorrt_llm.llmapi.llm_args.MedusaDecodingConfig, tensorrt_llm.llmapi.llm_args.MTPDecodingConfig, tensorrt_llm.llmapi.llm_args.NGramDecodingConfig, tensorrt_llm.llmapi.llm_args.UserProvidedDecodingConfig, NoneType]

From 0552a029432ecdacb538414f02390bd0198f9704 Mon Sep 17 00:00:00 2001
From: Tomer Shmilovich <81696642+tshmilnvidia@users.noreply.github.com>
Date: Wed, 16 Jul 2025 12:33:17 +0300
Subject: [PATCH 37/88] BlockManager copy constructor fix (#5982)

Signed-off-by: Tomer Shmilovich <tshmilovich@nvidia.com>
---
 cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h   | 3 +++
 cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp | 4 ++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
index b863c327674..caac72744f3 100644
--- a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
+++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
@@ -854,6 +854,9 @@ class BlockManager
         std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enableHashKey = false,
         bool enablePartialReuse = true, bool copyOnPartialReuse = true);
 
+    BlockManager(BlockManager const&) = delete;
+    BlockManager& operator=(BlockManager const&) = delete;
+
     //! \brief Calculate the proportional share each window size receives of the total memory pool
     //! \details Example:       (uniqueWindowSizeToLayers={1024: [1], 4096: [0, 4, 5], 8192: [2, 3]})
     //!          Would Return:  {1024: 0.0345, 4096: 0.4138, 8192: 0.5517} [sums to 1.0].
diff --git a/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp b/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp
index db502614141..08ab45145d5 100644
--- a/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp
+++ b/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp
@@ -2115,7 +2115,7 @@ TEST_F(KVCacheManagerTest, KVCacheManagerLeafBlockWithDependentTest)
     EXPECT_THAT(cacheBlockIds0, ::testing::ElementsAreArray({0, 1, 2}));
 
     // Lower priority of middle block to prevent offloading
-    auto const blockManager = kvCacheManager.getBlockManager();
+    auto const& blockManager = kvCacheManager.getBlockManager();
     auto middleBlock = blockManager.getBlockById(cacheBlockIds0[1], maxAttentionWindow);
     middleBlock->setPriority(0);
 
@@ -2215,7 +2215,7 @@ TEST_P(KVCacheManagerTest, DISABLED_KVCacheManagerAllocationTest)
             std::nullopt, nvinfer1::DataType::kHALF, sinkTokenLength, stream, std::nullopt, enableBlockReuse,
             onboardBlocks);
 
-    auto const blockManager = kvCacheManager.getBlockManager();
+    auto const& blockManager = kvCacheManager.getBlockManager();
     auto const& bufferManager = blockManager.getBufferManager(theOnlyWindowSize(kvCacheManager));
     auto const memoryPoolUsedBefore = bufferManager.memoryPoolUsed();
     kvCacheManager.allocatePools(useUvm);

From 8ef8e73002b3d971cd357ac270ec8e53b016d1c0 Mon Sep 17 00:00:00 2001
From: qsang-nv <200703406+qsang-nv@users.noreply.github.com>
Date: Wed, 16 Jul 2025 17:50:43 +0800
Subject: [PATCH 38/88] update spec_dec (#6079)

Signed-off-by: Qidi Sang <200703406+qsang-nv@users.noreply.github.com>
---
 cpp/kernels/xqa/gen_cubins.py | 18 +++++++++---------
 cpp/kernels/xqa/mha.cu        |  2 +-
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/cpp/kernels/xqa/gen_cubins.py b/cpp/kernels/xqa/gen_cubins.py
index a8f0ecece3f..b520779dab3 100755
--- a/cpp/kernels/xqa/gen_cubins.py
+++ b/cpp/kernels/xqa/gen_cubins.py
@@ -117,12 +117,12 @@
 };
 """
 
-is_medusa = False
+is_spec_dec = False
 
 
 def generate_cubin_meta_info_line(arch: int, compile_macros: List[CompileMacro],
                                   function_name: str, cubin_size: int,
-                                  is_last: bool, is_medusa: bool):
+                                  is_last: bool, is_spec_dec: bool):
     data_type_str = None
     kv_data_type_str = None
     head_dim = None
@@ -160,7 +160,7 @@ def generate_cubin_meta_info_line(arch: int, compile_macros: List[CompileMacro],
             assert (tokens_per_page % 2 == 0)
             paged_kv_cache = 'true' if tokens_per_page > 0 else 'false'
 
-    use_medusa = 'true' if is_medusa else 'false'
+    use_medusa = 'true' if is_spec_dec else 'false'
     assert data_type_str is not None
     assert kv_data_type_str is not None
     assert head_dim is not None
@@ -376,7 +376,7 @@ def generate_compile_arch_macro_list(compile_macro_options: list):
                     option_macro_names, option_short_names, option_combination)
             ]
             if arch in (90, ) and option_combination[
-                    3] == 2 and option_combination[2] == 1 and not is_medusa:
+                    3] == 2 and option_combination[2] == 1 and not is_spec_dec:
                 input_file_name = "mha_sm90.cu"
             else:
                 input_file_name = "mha.cu"
@@ -387,7 +387,7 @@ def generate_compile_arch_macro_list(compile_macro_options: list):
 
 def generate_header_file_contents(
         all_arch_macros: List[CompileArchMacrosAndFile],
-        name_size_list: List[Tuple[str, int]], is_medusa: bool):
+        name_size_list: List[Tuple[str, int]], is_spec_dec: bool):
     cubin_data_array = []
     cubin_length_array = []
     meta_line_array = []
@@ -406,7 +406,7 @@ def generate_header_file_contents(
             generate_cubin_meta_info_line(arch, macros, function_name,
                                           cubin_size,
                                           i == len(all_arch_macros) - 1,
-                                          is_medusa))
+                                          is_spec_dec))
     cubin_data = ''.join(cubin_data_array)
     cubin_length = ''.join(cubin_length_array)
     meta_struct = ''.join([
@@ -422,8 +422,8 @@ def generate_header_file_contents(
         shutil.rmtree(cubin_dir)
     os.mkdir(cubin_dir)
 
-    if len(sys.argv) > 1 and sys.argv[1] == 'medusa':
-        is_medusa = True
+    if len(sys.argv) > 1 and sys.argv[1] == 'spec_dec':
+        is_spec_dec = True
         nvcc_flags = '-std=c++17 -O3 -cubin -DGENERATE_CUBIN=1 -DNDEBUG -DSPEC_DEC --use_fast_math -Xptxas=-v --allow-unsupported-compiler --expt-relaxed-constexpr -t 0'
         arch_options = [80, 86, 89, 90]
         config_list = [[
@@ -444,7 +444,7 @@ def generate_header_file_contents(
     with multiprocessing.Pool(processes=thread_count) as pool:
         name_size_list = pool.map(run_cubin_gen, arch_macro_lists)
     header_file_contents = generate_header_file_contents(
-        arch_macro_lists, name_size_list, is_medusa)
+        arch_macro_lists, name_size_list, is_spec_dec)
 
     with open(cubin_dir + build_func_name_prefix + '_cubin.h', "w") as f:
         f.write("".join(
diff --git a/cpp/kernels/xqa/mha.cu b/cpp/kernels/xqa/mha.cu
index 77a3ca12ee8..1acdb9852f8 100644
--- a/cpp/kernels/xqa/mha.cu
+++ b/cpp/kernels/xqa/mha.cu
@@ -1609,7 +1609,7 @@ CUBIN_EXPORT __global__
 
     uint32_t const nbSeqIters = useKVCache ? divUp(cacheSeqLen, ctaTile.x) : 0;
 #if SPEC_DEC
-    uint32_t const nbSeqItersWithoutMask = (cacheSeqLen - qSeqLen) / ctaTile.x;
+    uint32_t const nbSeqItersWithoutMask = (cacheSeqLen - actualQSeqLen) / ctaTile.x;
 #endif
 
     uint32_t const seqStrideIters = nbSubSeqPerSeq;

From fc2347eaf57bd8f9eace9392be9811dc82e34f13 Mon Sep 17 00:00:00 2001
From: Bo Li <22713281+bobboli@users.noreply.github.com>
Date: Wed, 16 Jul 2025 17:54:36 +0800
Subject: [PATCH 39/88] chore: Cleanup disable_fp4_allgather. (#6006)

Signed-off-by: Bo Li <22713281+bobboli@users.noreply.github.com>
---
 tensorrt_llm/_torch/models/modeling_deepseekv3.py         | 2 --
 tensorrt_llm/_torch/models/modeling_qwen3_moe.py          | 7 +------
 .../_torch/modules/fused_moe/fused_moe_cutlass.py         | 6 ++----
 tensorrt_llm/_torch/utils.py                              | 8 --------
 4 files changed, 3 insertions(+), 20 deletions(-)

diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
index b92cef4dc54..ac9b85f0162 100644
--- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py
+++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
@@ -511,8 +511,6 @@ def compute_routed_output(self, hidden_states, hidden_states_fp4,
         # max-throughput
         use_dp_padding = False
         if self.use_dp and self.mapping.tp_size > 1:
-            # FP4 all_gather moves this bf16 allgather in to after topk and fp4 quantization
-            # to reduce allreduce BW
             if isinstance(self.experts, TRTLLMGenFusedMoE):
                 hidden_states = allgather(hidden_states,
                                           self.mapping,
diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_moe.py b/tensorrt_llm/_torch/models/modeling_qwen3_moe.py
index d01bce4ded6..5877f3daf5a 100644
--- a/tensorrt_llm/_torch/models/modeling_qwen3_moe.py
+++ b/tensorrt_llm/_torch/models/modeling_qwen3_moe.py
@@ -20,7 +20,6 @@
 from ..modules.linear import TensorParallelMode
 from ..modules.rms_norm import RMSNorm
 from ..speculative import SpecMetadata
-from ..utils import disable_fp4_allgather
 from .modeling_qwen3 import Qwen3Attention
 from .modeling_speculative import SpecDecOneEngineForCausalLM
 from .modeling_utils import (DecoderModel, EagerFusionConfig,
@@ -133,11 +132,7 @@ def forward(
             assert not self.enable_attention_dp
 
         if self.enable_attention_dp and self.mapping.tp_size > 1:
-            # FP4 all_gather moves this bf16 allgather in to after topk and fp4 quantization
-            # to reduce allreduce BW
-            if (disable_fp4_allgather()
-                    and not self.experts.enable_alltoall) or isinstance(
-                        self.experts, TRTLLMGenFusedMoE):
+            if isinstance(self.experts, TRTLLMGenFusedMoE):
                 hidden_states = allgather(hidden_states,
                                           self.mapping,
                                           dim=0,
diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py
index c0a2c4fbecb..c42d6da2674 100755
--- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py
@@ -4,8 +4,7 @@
 
 from ...distributed import allgather, reducescatter
 from ...model_config import ModelConfig
-from ...utils import (EventType, Fp4QuantizedTensor, ceil_div,
-                      disable_fp4_allgather, swizzle_sf)
+from ...utils import EventType, Fp4QuantizedTensor, ceil_div, swizzle_sf
 from .interface import MoE
 from .quantization import (DeepSeekFP8BlockScalesFusedMoEMethod,
                            FP8QDQFusedMoEMethod, MoEWeightLoadingMode,
@@ -220,8 +219,7 @@ def forward_chunk(
             # TODO: remove this once we have correct fusedmoe kernel ready
             token_final_scales = None
 
-        use_allgather = self.use_dp and self.parallel_size > 1 and not disable_fp4_allgather(
-        )
+        use_allgather = self.use_dp and self.parallel_size > 1
 
         # quantize inputs
         use_deepseek_fp8_block_scale = False
diff --git a/tensorrt_llm/_torch/utils.py b/tensorrt_llm/_torch/utils.py
index f687e9d9f55..59cbb214f8b 100644
--- a/tensorrt_llm/_torch/utils.py
+++ b/tensorrt_llm/_torch/utils.py
@@ -1,5 +1,4 @@
 import contextlib
-import os
 import threading
 from dataclasses import dataclass
 from enum import Enum
@@ -100,13 +99,6 @@ def shape(self):
         return self.fp4_tensor.shape
 
 
-_disable_fp4_allgather = os.getenv("TLLM_DISABLE_FP4_ALLGATHER", "0") == "1"
-
-
-def disable_fp4_allgather():
-    return _disable_fp4_allgather
-
-
 def compute_swizzled_sf_shape(row: int, col: int):
     padded_row = pad_up(row, 128)
     padded_col = pad_up(col, 4)

From e42f5a9581e14a054cc1ce17582e674b253b1689 Mon Sep 17 00:00:00 2001
From: Zhanrui Sun <184402041+ZhanruiSunCh@users.noreply.github.com>
Date: Wed, 16 Jul 2025 18:04:04 +0800
Subject: [PATCH 40/88] infra: [TRTLLM-5879] Spilt single GPU test and multi
 GPU test into 2 pipelines (#5199)

Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com>
Signed-off-by: Zhanrui Sun <184402041+ZhanruiSunCh@users.noreply.github.com>
Co-authored-by: Yanchao Lu <yanchaol@nvidia.com>
---
 jenkins/L0_MergeRequest.groovy | 224 ++++++++++++++++-----------------
 jenkins/L0_Test.groovy         |  36 ++++--
 2 files changed, 132 insertions(+), 128 deletions(-)

diff --git a/jenkins/L0_MergeRequest.groovy b/jenkins/L0_MergeRequest.groovy
index ecfdac3a8dc..7dd12cf78a4 100644
--- a/jenkins/L0_MergeRequest.groovy
+++ b/jenkins/L0_MergeRequest.groovy
@@ -878,6 +878,45 @@ def triggerJob(jobName, parameters, jenkinsUrl = "", credentials = "")
     return status
 }
 
+def launchJob(jobName, reuseBuild, enableFailFast, globalVars, platform="x86_64", additionalParameters = [:]) {
+    def parameters = getCommonParameters()
+    String globalVarsJson = writeJSON returnText: true, json: globalVars
+    parameters += [
+        'enableFailFast': enableFailFast,
+        'globalVars': globalVarsJson,
+    ] + additionalParameters
+
+    if (env.alternativeTRT && platform == "x86_64") {
+        parameters += [
+            'alternativeTRT': env.alternativeTRT,
+        ]
+    }
+
+    if (env.alternativeTrtSBSA && platform == "SBSA") {
+        parameters += [
+            'alternativeTRT': env.alternativeTrtSBSA,
+        ]
+    }
+
+    if (env.testPhase2StageName) {
+        parameters += [
+            'testPhase2StageName': env.testPhase2StageName,
+        ]
+    }
+
+    if (reuseBuild) {
+        parameters['reuseArtifactPath'] = "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${reuseBuild}"
+    }
+
+    echo "Trigger ${jobName} job, params: ${parameters}"
+
+    def status = triggerJob(jobName, parameters)
+    if (status != "SUCCESS") {
+        error "Downstream job did not succeed"
+    }
+    return status
+}
+
 def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
 {
     stages = [
@@ -889,78 +928,88 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
         "x86_64-linux": {
             script {
                 stage("Build") {
-                    def parameters = getCommonParameters()
-                    String globalVarsJson = writeJSON returnText: true, json: globalVars
-                    parameters += [
-                        'enableFailFast': enableFailFast,
+                    def additionalParameters = [
                         'dockerImage': globalVars["LLM_DOCKER_IMAGE"],
                         'wheelDockerImagePy310': globalVars["LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE"],
                         'wheelDockerImagePy312': globalVars["LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE"],
-                        'globalVars': globalVarsJson,
                     ]
-
-                    if (env.alternativeTRT) {
-                        parameters += [
-                            'alternativeTRT': env.alternativeTRT,
-                        ]
-                    }
-
-                    if (reuseBuild) {
-                        parameters['reuseArtifactPath'] = "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${reuseBuild}"
-                    }
-
-                    echo "trigger x86_64 build job, params: ${parameters}"
-
-                    def status = triggerJob("/LLM/helpers/Build-x86_64", parameters)
-                    if (status != "SUCCESS") {
-                        error "Downstream job did not succeed"
-                    }
-
-                }
-                def testStageName = "[Test-x86_64] Run"
-                if (env.localJobCredentials) {
-                    testStageName = "[Test-x86_64] Remote Run"
+                    launchJob("/LLM/helpers/Build-x86_64", reuseBuild, enableFailFast, globalVars, "x86_64", additionalParameters)
                 }
+                def testStageName = "[Test-x86_64-Single-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}"
+                def singleGpuTestFailed = false
                 stage(testStageName) {
                     if (X86_TEST_CHOICE == STAGE_CHOICE_SKIP) {
                         echo "x86_64 test job is skipped due to Jenkins configuration"
                         return
                     }
                     try {
-                        parameters = getCommonParameters()
                         String testFilterJson = writeJSON returnText: true, json: testFilter
-                        String globalVarsJson = writeJSON returnText: true, json: globalVars
-                        parameters += [
-                            'enableFailFast': enableFailFast,
+                        def additionalParameters = [
                             'testFilter': testFilterJson,
                             'dockerImage': globalVars["LLM_DOCKER_IMAGE"],
                             'wheelDockerImagePy310': globalVars["LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE"],
                             'wheelDockerImagePy312': globalVars["LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE"],
-                            'globalVars': globalVarsJson,
                         ]
 
-                        if (env.alternativeTRT) {
-                            parameters += [
-                                'alternativeTRT': env.alternativeTRT,
-                            ]
+                        launchJob("L0_Test-x86_64-Single-GPU", false, enableFailFast, globalVars, "x86_64", additionalParameters)
+                    } catch (InterruptedException e) {
+                        throw e
+                    } catch (Exception e) {
+                        if (X86_TEST_CHOICE == STAGE_CHOICE_IGNORE) {
+                            catchError(
+                                buildResult: 'SUCCESS',
+                                stageResult: 'FAILURE') {
+                                error "x86_64 test failed but ignored due to Jenkins configuration"
+                            }
+                        } else {
+                            catchError(
+                                buildResult: 'FAILURE',
+                                stageResult: 'FAILURE') {
+                                error "x86_64 single-GPU test failed"
+                            }
+                            singleGpuTestFailed = true
                         }
+                    }
+                }
 
-                        if (env.testPhase2StageName) {
-                            parameters += [
-                                'testPhase2StageName': env.testPhase2StageName,
-                            ]
+                def requireMultiGpuTesting = currentBuild.description?.contains("Require Multi-GPU Testing") ?: false
+                echo "requireMultiGpuTesting: ${requireMultiGpuTesting}"
+                if (!requireMultiGpuTesting) {
+                    return
+                }
+
+                if (singleGpuTestFailed) {
+                    if (env.JOB_NAME ==~ /.*PostMerge.*/) {
+                        echo "In the official post-merge pipeline, single-GPU test failed, whereas multi-GPU test is still kept running."
+                    } else {
+                        stage("[Test-x86_64-Multi-GPU] Blocked") {
+                            catchError(
+                                buildResult: 'FAILURE',
+                                stageResult: 'FAILURE') {
+                                error "This pipeline requires running multi-GPU test, but single-GPU test has failed."
+                            }
                         }
+                        return
+                    }
+                }
 
-                        echo "trigger x86_64 test job, params: ${parameters}"
+                testStageName = "[Test-x86_64-Multi-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}"
+                stage(testStageName) {
+                    if (X86_TEST_CHOICE == STAGE_CHOICE_SKIP) {
+                        echo "x86_64 test job is skipped due to Jenkins configuration"
+                        return
+                    }
+                    try {
+                        def testFilterJson = writeJSON returnText: true, json: testFilter
+                        def additionalParameters = [
+                            'testFilter': testFilterJson,
+                            'dockerImage': globalVars["LLM_DOCKER_IMAGE"],
+                            'wheelDockerImagePy310': globalVars["LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE"],
+                            'wheelDockerImagePy312': globalVars["LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE"],
+                        ]
 
-                        def status = triggerJob(
-                            "L0_Test-x86_64",
-                            parameters,
-                        )
+                        launchJob("L0_Test-x86_64-Multi-GPU", false, enableFailFast, globalVars, "x86_64", additionalParameters)
 
-                        if (status != "SUCCESS") {
-                            error "Downstream job did not succeed"
-                        }
                     } catch (InterruptedException e) {
                         throw e
                     } catch (Exception e) {
@@ -991,38 +1040,11 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
                     return
                 }
 
-                def stageName = "Build"
-                stage(stageName) {
-                    def parameters = getCommonParameters()
-                    String globalVarsJson = writeJSON returnText: true, json: globalVars
-                    parameters += [
-                        'enableFailFast': enableFailFast,
+                stage("Build") {
+                    def additionalParameters = [
                         "dockerImage": globalVars["LLM_SBSA_DOCKER_IMAGE"],
-                        'globalVars': globalVarsJson,
                     ]
-
-                    if (env.alternativeTrtSBSA) {
-                        parameters += [
-                            "alternativeTRT": env.alternativeTrtSBSA,
-                        ]
-                    }
-
-                    if (reuseBuild) {
-                        parameters['reuseArtifactPath'] = "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${reuseBuild}"
-                    }
-
-                    echo "trigger SBSA build job, params: ${parameters}"
-
-                    def status = triggerJob(
-                        "/LLM/helpers/Build-SBSA",
-                        parameters,
-                        jenkinsUrl,
-                        credentials,
-                    )
-
-                    if (status != "SUCCESS") {
-                        error "Downstream job did not succeed"
-                    }
+                    launchJob("/LLM/helpers/Build-SBSA", reuseBuild, enableFailFast, globalVars, "SBSA", additionalParameters)
                 }
                 stage(testStageName) {
                     if (SBSA_TEST_CHOICE == STAGE_CHOICE_SKIP) {
@@ -1030,40 +1052,14 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
                         return
                     }
                     try {
-                        def parameters = getCommonParameters()
-                        String testFilterJson = writeJSON returnText: true, json: testFilter
-                        String globalVarsJson = writeJSON returnText: true, json: globalVars
-                        parameters += [
-                            'enableFailFast': enableFailFast,
+                        def testFilterJson = writeJSON returnText: true, json: testFilter
+                        def additionalParameters = [
                             'testFilter': testFilterJson,
                             "dockerImage": globalVars["LLM_SBSA_DOCKER_IMAGE"],
-                            'globalVars': globalVarsJson,
                         ]
 
-                        if (env.alternativeTrtSBSA) {
-                            parameters += [
-                                "alternativeTRT": env.alternativeTrtSBSA,
-                            ]
-                        }
-
-                        if (env.testPhase2StageName) {
-                            parameters += [
-                                'testPhase2StageName': env.testPhase2StageName,
-                            ]
-                        }
-
-                        echo "trigger SBSA test job, params: ${parameters}"
+                        launchJob("L0_Test-SBSA", false, enableFailFast, globalVars, "SBSA", additionalParameters)
 
-                        def status = triggerJob(
-                            "L0_Test-SBSA",
-                            parameters,
-                            jenkinsUrl,
-                            credentials,
-                        )
-
-                        if (status != "SUCCESS") {
-                            error "Downstream job did not succeed"
-                        }
                     } catch (InterruptedException e) {
                         throw e
                     } catch (Exception e) {
@@ -1085,31 +1081,23 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
         "Build-Docker-Images": {
             script {
                 stage("[Build-Docker-Images] Remote Run") {
-                    def parameters = getCommonParameters()
-                    String globalVarsJson = writeJSON returnText: true, json: globalVars
                     def branch = env.gitlabBranch ? env.gitlabBranch : "main"
                     if (globalVars[GITHUB_PR_API_URL]) {
                         branch = "github-pr-" + globalVars[GITHUB_PR_API_URL].split('/').last()
                     }
 
-                    parameters += [
-                        'enableFailFast': enableFailFast,
+                    def additionalParameters = [
                         'branch': branch,
                         'action': "push",
                         'triggerType': env.JOB_NAME ==~ /.*PostMerge.*/ ? "post-merge" : "pre-merge",
-                        'globalVars': globalVarsJson,
                     ]
 
-                    echo "trigger BuildDockerImages job, params: ${parameters}"
-
-                    def status = triggerJob("/LLM/helpers/BuildDockerImages", parameters)
-                    if (status != "SUCCESS") {
-                        error "Downstream job did not succeed"
-                    }
+                    launchJob("/LLM/helpers/BuildDockerImages", false, enableFailFast, globalVars, "x86_64", additionalParameters)
                 }
             }
         }
     ]
+
     if (env.JOB_NAME ==~ /.*PostMerge.*/) {
         stages += dockerBuildJob
     }
diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index 7dec81f7fde..12d2a3c6dbe 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -2274,7 +2274,7 @@ pipeline {
             when {
                 expression {
                     // Only run the test list validation when necessary
-                    env.targetArch == X86_64_TRIPLE && testFilter[ONLY_DOCS_FILE_CHANGED] == false
+                    env.targetArch == X86_64_TRIPLE && testFilter[ONLY_DOCS_FILE_CHANGED] == false && !(env.JOB_NAME ==~ /.*Multi-GPU.*/)
                 }
             }
             steps
@@ -2299,17 +2299,33 @@ pipeline {
                         dgxJobs = parallelJobs.findAll{dgxSigns.any{sign -> it.key.contains(sign)}}
                     }
 
-                    if (singleGpuJobs.size() > 0) {
-                        singleGpuJobs.failFast = params.enableFailFast
-                        parallel singleGpuJobs
-                    } else {
-                        echo "Skip single-GPU testing. No test to run."
-                    }
-
-                    if (dgxJobs.size() > 0) {
-                        stage(testPhase2StageName) {
+                    if (env.JOB_NAME ==~ /.*Single-GPU.*/) {
+                        echo "Only run single-GPU tests."
+                        if (dgxJobs.size() > 0) {
+                            if (globalVars[ACTION_INFO]['parents'].size() > 0) {
+                                // We add a special marker to the parent job's description.
+                                // This will be used to decide whether to run multi-GPU test stage.
+                                def parentJob = globalVars[ACTION_INFO]['parents'][-2]
+                                trtllm_utils.appendBuildDescription(this, parentJob['name'], parentJob['build_number'], "====Require Multi-GPU Testing====<br/>")
+                            } else {
+                                echo "No parent job found to add the special marker for executing multi-GPU test stage."
+                            }
+                        } else {
+                            echo "Skip multi-GPU testing. No test to run."
+                        }
+                        if (singleGpuJobs.size() > 0) {
+                            singleGpuJobs.failFast = params.enableFailFast
+                            parallel singleGpuJobs
+                        } else {
+                            echo "Skip single-GPU testing. No test to run."
+                        }
+                    } else if (env.JOB_NAME ==~ /.*Multi-GPU.*/) {
+                        echo "Only run multi-GPU tests."
+                        if (dgxJobs.size() > 0) {
                             dgxJobs.failFast = params.enableFailFast
                             parallel dgxJobs
+                        } else {
+                            error "Skip multi-GPU testing. No test to run."
                         }
                     }
                 }

From e30d7bec387c620eef93559b61185c224f1d3a8a Mon Sep 17 00:00:00 2001
From: Emma Qiao <qqiao@nvidia.com>
Date: Wed, 16 Jul 2025 22:41:18 +0800
Subject: [PATCH 41/88] [Infra] - Waive failed cases in post-merge on main 
 (#6096)

Signed-off-by: qqiao <qqiao@nvidia.com>
---
 tests/integration/test_lists/waives.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 0039dca742f..5380afccf86 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -440,3 +440,7 @@ test_e2e.py::test_ptp_scaffolding[DeepSeek-R1-Distill-Qwen-7B-DeepSeek-R1/DeepSe
 examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387422)
 examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387424)
 test_e2e.py::test_ptp_quickstart SKIP (https://nvbugs/5387762)
+triton_server/test_triton_llm.py::test_llava_onevision[test_basic-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-max_utilization---1-1-1-False-tensorrt_llm_bls] SKIP (https://nvbugs/5396437)
+triton_server/test_triton_llm.py::test_llava_onevision[test_video-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls] SKIP (https://nvbugs/5396437)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] SKIP (https://nvbugs/5397036)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] SKIP (https://nvbugs/5397036)

From 301b78bb77c48ea4fe53dd0d02390efb8f5e336e Mon Sep 17 00:00:00 2001
From: Iman Tabrizian <10105175+Tabrizian@users.noreply.github.com>
Date: Wed, 16 Jul 2025 08:39:29 -0700
Subject: [PATCH 42/88] Add documentation for eagle3+disagg+dynamo (#6072)

Signed-off-by: Iman Tabrizian <10105175+tabrizian@users.noreply.github.com>
---
 docs/source/advanced/speculative-decoding.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/source/advanced/speculative-decoding.md b/docs/source/advanced/speculative-decoding.md
index a601d9dd24a..919662a5fbe 100644
--- a/docs/source/advanced/speculative-decoding.md
+++ b/docs/source/advanced/speculative-decoding.md
@@ -10,6 +10,7 @@
     - [Limitations](#limitations)
 - [ReDrafter](#redrafter)
 - [EAGLE](#eagle)
+    - [Disaggregated Serving](#disaggregated-serving)
 - [Lookahead decoding](#lookahead-decoding)
 
 ## About Speculative Sampling
@@ -169,6 +170,10 @@ The EAGLE approach enhances the single-model Medusa method by predicting and ver
 
 Similarly to ReDrafter, TensorRT-LLM implements the EAGLE model such that logits prediction, draft tokens acceptance and draft token generation are performed inside of the TensorRT engine. EAGLE-1 and EAGLE-2 are both supported, while EAGLE-2 is currently in the experimental stage. Please, visit the [EAGLE README](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/eagle/README.md) for information about building and running the model.
 
+### Disaggregated Serving
+
+[Disaggregated Serving](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/disaggregated-service.md) with EAGLE3 using the two model approach is supported in the Pytorch backend. Please refer to the following [Dynamo example](https://github.com/ai-dynamo/dynamo/blob/main/examples/tensorrt_llm/llama4_plus_eagle.md) on how to run EAGLE3 with Disaggregated Serving for Llama 4 Maverick.
+
 ## Lookahead Decoding
 
 Lookahead decoding algorithm operates through two parallel computation branches within the same model: a lookahead branch that generates n-grams using a fixed-sized 2D window, and a verification branch that validates promising n-gram candidates. This approach eliminates the necessity for additional model training or fine-tuning and can be enabled for any autoregressive model. Refer to the [Lookahead decoding README](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/lookahead/README.md) for information about building and running the model.

From 9354114f6827eb6654fb515175cfbcec73acc99b Mon Sep 17 00:00:00 2001
From: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com>
Date: Thu, 17 Jul 2025 00:41:45 +0800
Subject: [PATCH 43/88] fix: Update trtllm args issues with extra nested config
 (#5996)

Signed-off-by: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com>
---
 tensorrt_llm/builder.py                | 100 +++++++++++++++++--------
 tensorrt_llm/llmapi/llm_args.py        |   3 +-
 tests/unittest/llmapi/test_llm_args.py |  60 +++++++++++++++
 3 files changed, 131 insertions(+), 32 deletions(-)

diff --git a/tensorrt_llm/builder.py b/tensorrt_llm/builder.py
index 5b41abf7e8e..e2dc543ac42 100644
--- a/tensorrt_llm/builder.py
+++ b/tensorrt_llm/builder.py
@@ -13,12 +13,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import copy
+import dataclasses
 import json
 import math
 import os
 import shutil
 import time
 from dataclasses import dataclass, field
+from functools import cache
 from pathlib import Path
 from typing import Dict, Optional, Union
 
@@ -557,53 +559,89 @@ def override_attri(attr_name, value):
         else:
             override_attri('paged_state', False)
 
+    @classmethod
+    @cache
+    def get_build_config_defaults(cls):
+        return {
+            field.name: field.default
+            for field in dataclasses.fields(cls)
+            if field.default is not dataclasses.MISSING
+        }
+
     @classmethod
     def from_dict(cls, config, plugin_config=None):
         config = copy.deepcopy(
             config
         )  # it just does not make sense to change the input arg `config`
-        max_input_len = config.pop('max_input_len')
-        max_seq_len = config.pop('max_seq_len')
-        max_batch_size = config.pop('max_batch_size')
-        max_beam_width = config.pop('max_beam_width')
-        max_num_tokens = config.pop('max_num_tokens')
-        opt_num_tokens = config.pop('opt_num_tokens')
-        opt_batch_size = config.pop('opt_batch_size', 8)
+
+        defaults = cls.get_build_config_defaults()
+        max_input_len = config.pop('max_input_len',
+                                   defaults.get('max_input_len'))
+        max_seq_len = config.pop('max_seq_len', defaults.get('max_seq_len'))
+        max_batch_size = config.pop('max_batch_size',
+                                    defaults.get('max_batch_size'))
+        max_beam_width = config.pop('max_beam_width',
+                                    defaults.get('max_beam_width'))
+        max_num_tokens = config.pop('max_num_tokens',
+                                    defaults.get('max_num_tokens'))
+        opt_num_tokens = config.pop('opt_num_tokens',
+                                    defaults.get('opt_num_tokens'))
+        opt_batch_size = config.pop('opt_batch_size',
+                                    defaults.get('opt_batch_size'))
         max_prompt_embedding_table_size = config.pop(
-            'max_prompt_embedding_table_size', 0)
-
-        kv_cache_type = KVCacheType(
-            config.pop('kv_cache_type')) if 'plugin_config' in config else None
-        gather_context_logits = config.pop('gather_context_logits', False)
-        gather_generation_logits = config.pop('gather_generation_logits', False)
-        strongly_typed = config.pop('strongly_typed', True)
-        force_num_profiles = config.pop('force_num_profiles', None)
-        weight_sparsity = config.pop('weight_sparsity', False)
+            'max_prompt_embedding_table_size',
+            defaults.get('max_prompt_embedding_table_size'))
+
+        if "kv_cache_type" in config and config["kv_cache_type"] is not None:
+            kv_cache_type = KVCacheType(config.pop('kv_cache_type'))
+        else:
+            kv_cache_type = None
+        gather_context_logits = config.pop(
+            'gather_context_logits', defaults.get('gather_context_logits'))
+        gather_generation_logits = config.pop(
+            'gather_generation_logits',
+            defaults.get('gather_generation_logits'))
+        strongly_typed = config.pop('strongly_typed',
+                                    defaults.get('strongly_typed'))
+        force_num_profiles = config.pop('force_num_profiles',
+                                        defaults.get('force_num_profiles'))
+        weight_sparsity = config.pop('weight_sparsity',
+                                     defaults.get('weight_sparsity'))
         profiling_verbosity = config.pop('profiling_verbosity',
-                                         'layer_names_only')
-        enable_debug_output = config.pop('enable_debug_output', False)
-        max_draft_len = config.pop('max_draft_len', 0)
-        speculative_decoding_mode = config.pop('speculative_decoding_mode',
-                                               SpeculativeDecodingMode.NONE)
-        use_refit = config.pop('use_refit', False)
-        input_timing_cache = config.pop('input_timing_cache', None)
-        output_timing_cache = config.pop('output_timing_cache', None)
+                                         defaults.get('profiling_verbosity'))
+        enable_debug_output = config.pop('enable_debug_output',
+                                         defaults.get('enable_debug_output'))
+        max_draft_len = config.pop('max_draft_len',
+                                   defaults.get('max_draft_len'))
+        speculative_decoding_mode = config.pop(
+            'speculative_decoding_mode',
+            defaults.get('speculative_decoding_mode'))
+        use_refit = config.pop('use_refit', defaults.get('use_refit'))
+        input_timing_cache = config.pop('input_timing_cache',
+                                        defaults.get('input_timing_cache'))
+        output_timing_cache = config.pop('output_timing_cache',
+                                         defaults.get('output_timing_cache'))
         lora_config = LoraConfig.from_dict(config.get('lora_config', {}))
         auto_parallel_config = AutoParallelConfig.from_dict(
             config.get('auto_parallel_config', {}))
-        max_encoder_input_len = config.pop('max_encoder_input_len', 1024)
-        weight_streaming = config.pop('weight_streaming', False)
-        use_strip_plan = config.pop('use_strip_plan', False)
+        max_encoder_input_len = config.pop(
+            'max_encoder_input_len', defaults.get('max_encoder_input_len'))
+        weight_streaming = config.pop('weight_streaming',
+                                      defaults.get('weight_streaming'))
+        use_strip_plan = config.pop('use_strip_plan',
+                                    defaults.get('use_strip_plan'))
 
         if plugin_config is None:
             plugin_config = PluginConfig()
         if "plugin_config" in config.keys():
             plugin_config.update_from_dict(config["plugin_config"])
 
-        dry_run = config.pop('dry_run', False)
-        visualize_network = config.pop('visualize_network', None)
-        monitor_memory = config.pop('monitor_memory', False)
-        use_mrope = config.pop('use_mrope', False)
+        dry_run = config.pop('dry_run', defaults.get('dry_run'))
+        visualize_network = config.pop('visualize_network',
+                                       defaults.get('visualize_network'))
+        monitor_memory = config.pop('monitor_memory',
+                                    defaults.get('monitor_memory'))
+        use_mrope = config.pop('use_mrope', defaults.get('use_mrope'))
 
         return cls(
             max_input_len=max_input_len,
diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py
index 745aa739a86..a08982022d2 100644
--- a/tensorrt_llm/llmapi/llm_args.py
+++ b/tensorrt_llm/llmapi/llm_args.py
@@ -2096,7 +2096,8 @@ def update_llm_args_with_extra_dict(
     }
     for field_name, field_type in field_mapping.items():
         if field_name in llm_args_dict:
-            if field_name == "speculative_config":
+            # Some fields need to be converted manually.
+            if field_name in ["speculative_config", "build_config"]:
                 llm_args_dict[field_name] = field_type.from_dict(
                     llm_args_dict[field_name])
             else:
diff --git a/tests/unittest/llmapi/test_llm_args.py b/tests/unittest/llmapi/test_llm_args.py
index 0c2aaf20a13..c1bfdcc4001 100644
--- a/tests/unittest/llmapi/test_llm_args.py
+++ b/tests/unittest/llmapi/test_llm_args.py
@@ -6,9 +6,14 @@
 
 import tensorrt_llm.bindings.executor as tle
 from tensorrt_llm import LLM as TorchLLM
+from tensorrt_llm import AutoParallelConfig
 from tensorrt_llm._tensorrt_engine import LLM
+from tensorrt_llm.builder import LoraConfig
+from tensorrt_llm.llmapi import (BuildConfig, CapacitySchedulerPolicy,
+                                 SchedulerConfig)
 from tensorrt_llm.llmapi.llm_args import *
 from tensorrt_llm.llmapi.utils import print_traceback_on_error
+from tensorrt_llm.plugin import PluginConfig
 
 from .test_llm import llama_model_path
 
@@ -252,6 +257,61 @@ def test_PeftCacheConfig_declaration():
     assert pybind_config.lora_prefetch_dir == "."
 
 
+def test_update_llm_args_with_extra_dict_with_nested_dict():
+    llm_api_args_dict = {
+        "model":
+        "dummy-model",
+        "build_config":
+        None,  # Will override later.
+        "extended_runtime_perf_knob_config":
+        ExtendedRuntimePerfKnobConfig(multi_block_mode=True),
+        "kv_cache_config":
+        KvCacheConfig(enable_block_reuse=False),
+        "peft_cache_config":
+        PeftCacheConfig(num_host_module_layer=0),
+        "scheduler_config":
+        SchedulerConfig(capacity_scheduler_policy=CapacitySchedulerPolicy.
+                        GUARANTEED_NO_EVICT)
+    }
+    plugin_config_dict = {
+        "_dtype": 'float16',
+        "nccl_plugin": None,
+    }
+    plugin_config = PluginConfig.from_dict(plugin_config_dict)
+    build_config = BuildConfig(max_input_len=1024,
+                               lora_config=LoraConfig(lora_ckpt_source='hf'),
+                               auto_parallel_config=AutoParallelConfig(
+                                   world_size=1,
+                                   same_buffer_io={},
+                                   debug_outputs=[]),
+                               plugin_config=plugin_config)
+    extra_llm_args_dict = {
+        "build_config": build_config.to_dict(),
+    }
+
+    llm_api_args_dict = update_llm_args_with_extra_dict(llm_api_args_dict,
+                                                        extra_llm_args_dict,
+                                                        "build_config")
+    initialized_llm_args = TrtLlmArgs(**llm_api_args_dict)
+
+    def check_nested_dict_equality(dict1, dict2, path=""):
+        if not isinstance(dict1, dict) or not isinstance(dict2, dict):
+            if dict1 != dict2:
+                raise ValueError(f"Mismatch at {path}: {dict1} != {dict2}")
+            return True
+        if dict1.keys() != dict2.keys():
+            raise ValueError(f"Different keys at {path}:")
+        for key in dict1:
+            new_path = f"{path}.{key}" if path else key
+            if not check_nested_dict_equality(dict1[key], dict2[key], new_path):
+                raise ValueError(f"Mismatch at {path}: {dict1} != {dict2}")
+        return True
+
+    build_config_dict1 = build_config.to_dict()
+    build_config_dict2 = initialized_llm_args.build_config.to_dict()
+    check_nested_dict_equality(build_config_dict1, build_config_dict2)
+
+
 class TestTorchLlmArgsCudaGraphSettings:
 
     def test_cuda_graph_batch_sizes_case_0(self):

From e0836f9ca90507d1cd2a1c70e63396f0689529ed Mon Sep 17 00:00:00 2001
From: shaharmor98 <17088876+shaharmor98@users.noreply.github.com>
Date: Wed, 16 Jul 2025 19:50:30 +0300
Subject: [PATCH 44/88] [TRTLLM-5493] Add core infrastructure to enable loading
 of custom checkpoint formats (#5372)

Signed-off-by: Shahar Mor <17088876+shaharmor98@users.noreply.github.com>
---
 tensorrt_llm/_torch/__init__.py               |   3 +-
 .../_torch/models/checkpoints/__init__.py     |  18 ++
 .../_torch/models/checkpoints/auto_mapper.py  |  17 ++
 .../checkpoints/base_checkpoint_loader.py     |  87 +++++++++
 .../models/checkpoints/base_config_loader.py  |  13 ++
 .../models/checkpoints/base_weight_loader.py  |  20 +++
 .../models/checkpoints/base_weight_mapper.py  | 165 ++++++++++++++++++
 .../_torch/models/checkpoints/hf/__init__.py  |   0
 .../checkpoints/hf/checkpoint_loader.py       |  75 ++++++++
 .../models/checkpoints/hf/config_loader.py    |  11 ++
 .../checkpoints/hf/gemma3_weight_mapper.py    |  34 ++++
 .../checkpoints/hf/llama4_weight_mapper.py    |  22 +++
 .../checkpoints/hf/mixtral_weight_mapper.py   |  26 +++
 .../hf/nemotron_h_weight_mapper.py            |  99 +++++++++++
 .../checkpoints/hf/qwen2_moe_weight_mapper.py |  26 +++
 .../checkpoints/hf/qwen3_moe_weight_mapper.py |  39 +++++
 .../models/checkpoints/hf/weight_loader.py    | 123 +++++++++++++
 .../models/checkpoints/hf/weight_mapper.py    | 101 +++++++++++
 tensorrt_llm/_torch/models/modeling_gemma3.py |  64 +------
 tensorrt_llm/_torch/models/modeling_llama.py  |  14 +-
 .../_torch/models/modeling_mixtral.py         |  28 +--
 .../_torch/models/modeling_nemotron_h.py      |  95 +---------
 .../_torch/models/modeling_qwen3_moe.py       |  74 +-------
 .../_torch/models/modeling_qwen_moe.py        |  60 +------
 .../_torch/models/modeling_speculative.py     |  28 ++-
 tensorrt_llm/_torch/models/modeling_utils.py  | 153 +++++++++++++++-
 tensorrt_llm/_torch/pyexecutor/_util.py       |   1 -
 tensorrt_llm/_torch/pyexecutor/config.py      |  34 +++-
 .../_torch/pyexecutor/model_engine.py         | 139 ++++-----------
 .../_torch/pyexecutor/py_executor_creator.py  |   2 +
 tensorrt_llm/executor/worker.py               |   6 +
 tensorrt_llm/llmapi/llm.py                    |   6 +-
 tensorrt_llm/llmapi/llm_args.py               |  28 +++
 .../_torch/modeling/test_modeling_gemma3.py   |   6 +-
 .../test_modeling_llama_min_latency.py        |   7 +-
 .../_torch/modeling/test_modeling_mixtral.py  |   6 +-
 .../_torch/modeling/test_modeling_qwen_moe.py |   6 +-
 .../_torch/test_pytorch_model_engine.py       |   1 +
 .../api_stability/references/llm.yaml         |   6 +
 39 files changed, 1202 insertions(+), 441 deletions(-)
 create mode 100644 tensorrt_llm/_torch/models/checkpoints/__init__.py
 create mode 100644 tensorrt_llm/_torch/models/checkpoints/auto_mapper.py
 create mode 100644 tensorrt_llm/_torch/models/checkpoints/base_checkpoint_loader.py
 create mode 100644 tensorrt_llm/_torch/models/checkpoints/base_config_loader.py
 create mode 100644 tensorrt_llm/_torch/models/checkpoints/base_weight_loader.py
 create mode 100644 tensorrt_llm/_torch/models/checkpoints/base_weight_mapper.py
 create mode 100644 tensorrt_llm/_torch/models/checkpoints/hf/__init__.py
 create mode 100644 tensorrt_llm/_torch/models/checkpoints/hf/checkpoint_loader.py
 create mode 100644 tensorrt_llm/_torch/models/checkpoints/hf/config_loader.py
 create mode 100644 tensorrt_llm/_torch/models/checkpoints/hf/gemma3_weight_mapper.py
 create mode 100644 tensorrt_llm/_torch/models/checkpoints/hf/llama4_weight_mapper.py
 create mode 100644 tensorrt_llm/_torch/models/checkpoints/hf/mixtral_weight_mapper.py
 create mode 100644 tensorrt_llm/_torch/models/checkpoints/hf/nemotron_h_weight_mapper.py
 create mode 100644 tensorrt_llm/_torch/models/checkpoints/hf/qwen2_moe_weight_mapper.py
 create mode 100644 tensorrt_llm/_torch/models/checkpoints/hf/qwen3_moe_weight_mapper.py
 create mode 100644 tensorrt_llm/_torch/models/checkpoints/hf/weight_loader.py
 create mode 100644 tensorrt_llm/_torch/models/checkpoints/hf/weight_mapper.py

diff --git a/tensorrt_llm/_torch/__init__.py b/tensorrt_llm/_torch/__init__.py
index 7d2de6d643c..23257d91504 100644
--- a/tensorrt_llm/_torch/__init__.py
+++ b/tensorrt_llm/_torch/__init__.py
@@ -1,4 +1,5 @@
 from .llm import LLM
 from .model_config import MoeLoadBalancerConfig
+from .models.checkpoints.base_checkpoint_loader import BaseCheckpointLoader
 
-__all__ = ["LLM", "MoeLoadBalancerConfig"]
+__all__ = ["LLM", "MoeLoadBalancerConfig", "BaseCheckpointLoader"]
diff --git a/tensorrt_llm/_torch/models/checkpoints/__init__.py b/tensorrt_llm/_torch/models/checkpoints/__init__.py
new file mode 100644
index 00000000000..58789f36458
--- /dev/null
+++ b/tensorrt_llm/_torch/models/checkpoints/__init__.py
@@ -0,0 +1,18 @@
+from .base_checkpoint_loader import BaseCheckpointLoader
+from .hf.checkpoint_loader import HfCheckpointLoader
+from .hf.config_loader import HfConfigLoader
+from .hf.gemma3_weight_mapper import Gemma3HfWeightMapper
+from .hf.llama4_weight_mapper import Llama4HfWeightMapper
+from .hf.mixtral_weight_mapper import MixtralHfWeightMapper
+from .hf.nemotron_h_weight_mapper import NemotronHHfWeightMapper
+from .hf.qwen2_moe_weight_mapper import Qwen2MoeHfWeightMapper
+from .hf.qwen3_moe_weight_mapper import Qwen3MoeHfWeightMapper
+from .hf.weight_loader import HfWeightLoader
+from .hf.weight_mapper import HfWeightMapper
+
+__all__ = [
+    "HfConfigLoader", "HfWeightLoader", "HfWeightMapper",
+    "BaseCheckpointLoader", "HfCheckpointLoader", "NemotronHHfWeightMapper",
+    "Gemma3HfWeightMapper", "MixtralHfWeightMapper", "Llama4HfWeightMapper",
+    "Qwen2MoeHfWeightMapper", "Qwen3MoeHfWeightMapper"
+]
diff --git a/tensorrt_llm/_torch/models/checkpoints/auto_mapper.py b/tensorrt_llm/_torch/models/checkpoints/auto_mapper.py
new file mode 100644
index 00000000000..8bb5bb92a8a
--- /dev/null
+++ b/tensorrt_llm/_torch/models/checkpoints/auto_mapper.py
@@ -0,0 +1,17 @@
+from typing import Optional
+
+from tensorrt_llm._torch.models.modeling_utils import MODEL_CLASS_MAPPER_MAPPING
+
+
+class AutoCheckpointMapper():
+
+    @staticmethod
+    def get(format: str, name: Optional[str] = None) -> "BaseWeightMapper":
+        if name is not None:
+            try:
+                return MODEL_CLASS_MAPPER_MAPPING[f'{name}_{format}']()
+            except KeyError:  # no mapper for this model architecture, resort to default
+                # TODO smor- a potential bug here, if the class isn't added to __init__, it will return the default mapper
+                return MODEL_CLASS_MAPPER_MAPPING[format]()
+        else:
+            return MODEL_CLASS_MAPPER_MAPPING[format]()
diff --git a/tensorrt_llm/_torch/models/checkpoints/base_checkpoint_loader.py b/tensorrt_llm/_torch/models/checkpoints/base_checkpoint_loader.py
new file mode 100644
index 00000000000..c1bfec0144a
--- /dev/null
+++ b/tensorrt_llm/_torch/models/checkpoints/base_checkpoint_loader.py
@@ -0,0 +1,87 @@
+from abc import ABC, abstractmethod
+from typing import Any
+
+from torch import nn
+
+from tensorrt_llm._torch.model_config import ModelConfig
+from tensorrt_llm._torch.models.checkpoints.auto_mapper import \
+    AutoCheckpointMapper
+from tensorrt_llm._torch.models.checkpoints.base_config_loader import \
+    BaseConfigLoader
+from tensorrt_llm._torch.models.checkpoints.base_weight_loader import \
+    BaseWeightLoader
+from tensorrt_llm._torch.models.checkpoints.base_weight_mapper import \
+    BaseWeightMapper
+from tensorrt_llm._torch.models.modeling_utils import \
+    CHECKPOINT_LOADER_FORMAT_DEFAULT_MAPPING
+
+
+class BaseCheckpointLoader(ABC):
+
+    @abstractmethod
+    def get_default_weight_loader(self) -> BaseWeightLoader:
+        raise NotImplementedError
+
+    @abstractmethod
+    def get_default_config_loader(self) -> BaseConfigLoader:
+        raise NotImplementedError
+
+    @abstractmethod
+    def cleanup(self) -> None:
+        raise NotImplementedError
+
+    @property
+    @abstractmethod
+    def weight_loader(self) -> BaseWeightLoader:
+        ...
+
+    @property
+    @abstractmethod
+    def weight_mapper(self) -> BaseWeightMapper:
+        ...
+
+    @property
+    @abstractmethod
+    def config_loader(self) -> BaseConfigLoader:
+        ...
+
+    @property
+    @abstractmethod
+    def checkpoint_format(self) -> str:
+        ...
+
+    def load_config(self, checkpoint_dir: str, **kwargs) -> ModelConfig:
+        return self.config_loader.load(checkpoint_dir, **kwargs)
+
+    def load_weights(self, checkpoint_dir: str, **kwargs) -> dict[str, Any]:
+        return self.weight_loader.load_weights(checkpoint_dir, **kwargs)
+
+    @classmethod
+    def get(cls, checkpoint_format: str, **kwargs) -> "BaseCheckpointLoader":
+        try:
+            return CHECKPOINT_LOADER_FORMAT_DEFAULT_MAPPING[checkpoint_format](
+                **kwargs)
+        except KeyError:
+            raise ValueError(
+                f"Checkpoint loader for format {checkpoint_format} not found, "
+                f"available formats are: {CHECKPOINT_LOADER_FORMAT_DEFAULT_MAPPING.keys()}"
+            )
+
+    def get_initilized_weight_mapper(self, model: nn.Module,
+                                     config: ModelConfig) -> BaseWeightMapper:
+        weight_mapper = None
+        if self.weight_mapper is not None:
+            self.weight_mapper.init_model_and_config(model, config)
+            return self.weight_mapper
+        else:
+            # The name of the registered mapper should be the model architecture
+            if config.pretrained_config and config.pretrained_config.architectures:
+                model_arch = config.pretrained_config.architectures[0]
+            else:
+                raise ValueError(
+                    "Cannot determine model architecture from config")
+            weight_mapper = AutoCheckpointMapper.get(self.checkpoint_format,
+                                                     model_arch)
+            weight_mapper.init_model_and_config(model, config)
+            self.weight_mapper = weight_mapper
+            return weight_mapper
diff --git a/tensorrt_llm/_torch/models/checkpoints/base_config_loader.py b/tensorrt_llm/_torch/models/checkpoints/base_config_loader.py
new file mode 100644
index 00000000000..4af1dadaddc
--- /dev/null
+++ b/tensorrt_llm/_torch/models/checkpoints/base_config_loader.py
@@ -0,0 +1,13 @@
+from abc import ABC, abstractmethod
+
+from tensorrt_llm._torch.model_config import ModelConfig
+
+
+class BaseConfigLoader(ABC):
+
+    @abstractmethod
+    def load(self, checkpoint_dir: str, **kwargs) -> ModelConfig:
+        pass
+
+    def cleanup(self) -> None:
+        pass
diff --git a/tensorrt_llm/_torch/models/checkpoints/base_weight_loader.py b/tensorrt_llm/_torch/models/checkpoints/base_weight_loader.py
new file mode 100644
index 00000000000..c6c88d16bdc
--- /dev/null
+++ b/tensorrt_llm/_torch/models/checkpoints/base_weight_loader.py
@@ -0,0 +1,20 @@
+from abc import ABC, abstractmethod
+from typing import Any
+
+
+class BaseWeightLoader(ABC):
+
+    @abstractmethod
+    def load_weights(self, checkpoint_dir: str) -> dict[str, Any]:
+        """
+        Loads weights from a checkpoint directory.
+
+        Args:
+            checkpoint_dir: A path to the checkpoint directory.
+
+        Returns:
+            A dictionary where keys are tensor names and values are the tensors.
+        """
+
+    def cleanup(self) -> None:
+        pass
diff --git a/tensorrt_llm/_torch/models/checkpoints/base_weight_mapper.py b/tensorrt_llm/_torch/models/checkpoints/base_weight_mapper.py
new file mode 100644
index 00000000000..708493eb532
--- /dev/null
+++ b/tensorrt_llm/_torch/models/checkpoints/base_weight_mapper.py
@@ -0,0 +1,165 @@
+from abc import ABC, abstractmethod
+from typing import Callable, List, Union
+
+from torch import nn
+
+from tensorrt_llm._torch.model_config import ModelConfig, TConfig
+from tensorrt_llm._torch.models.modeling_utils import DecoderModelForCausalLM
+
+
+class BaseWeightMapper(ABC):
+
+    def __init__(self):
+        self._callbacks: list[Callable] = []
+        self._mapping: dict = {}
+        self._skip_modules = []
+        self._model: Union[nn.Module, DecoderModelForCausalLM] | None = None
+        self._config: TConfig | None = None
+
+    def init_model_and_config(self, model: Union[nn.Module,
+                                                 DecoderModelForCausalLM],
+                              config: TConfig):
+        self._model = model
+        self._config = config
+
+        if not hasattr(model, 'model_config') or not isinstance(
+                model.model_config, ModelConfig):
+            raise ValueError("model must have a model_config attribute")
+        if not hasattr(model, 'config'):
+            raise ValueError("model must have a config attribute")
+
+        self._tp_size = 1 if model.model_config.mapping.enable_attention_dp else model.model_config.mapping.tp_size
+        self._num_kv_heads = model.config.num_key_value_heads if hasattr(
+            model.config, 'num_key_value_heads'
+        ) and model.config.num_key_value_heads is not None else model.config.num_attention_heads
+
+        self.map_weights()
+
+    def cleanup(self) -> None:
+        self._model = None
+        self._config = None
+
+    @abstractmethod
+    def map_weights(self) -> None:
+        """
+        Maps weights from TRT-LLM to a source state dictionary (e.g., Hugging Face)
+        """
+
+    @abstractmethod
+    def apply_callbacks(self, module: nn.Module, module_name: str,
+                        module_names_breakdown: list[str],
+                        weights: dict) -> list[dict]:
+        """
+        Applies a series of transformation functions to an internal representation
+        of weights or to guide the mapping process. The exact behavior might depend
+        on the implementation (e.g., storing callbacks to be applied later).
+
+        Args:
+            module: The module to apply the callbacks to
+            module_name: The specific module name (e.g., 'qkv_proj', 'gate_up_proj')
+            module_names_breakdown: List of module path components for building full paths
+            weights: The weights dictionary to process
+        """
+
+    def rename_by_params_map(self, params_map: dict[str, str],
+                             weights: dict) -> dict:
+        """
+        Rename weight keys according to regex pattern matching.
+
+        Args:
+            pattern_mapping: A dictionary mapping regex patterns to replacement strings. The key is HF name pattern, and the value is corresponding TRT-LLM name pattern.
+                The patterns will be used to match keys in the weights dict and replace
+                them according to the replacement string, which can use regex backreferences.
+                Example:
+                HF name: vision_model.encoder.layers.1.self_attn.out_proj.{weight,bias}
+                TRT-LLM name: vision_model.encoder.layers.1.self_attn.o_proj.{weight,bias}
+                Then the pattern_mapping could be:
+                pattern_mapping = {
+                    r'(.*?)out_proj(.*)': r'\1o_proj\2'
+                }
+            weights: A dictionary of weights
+
+        Returns:
+            A dictionary of weights with renamed keys
+        """
+        import re
+
+        # Create a new dictionary to store the renamed weights
+        renamed_weights = {}
+
+        # Keep track of keys that have been matched by a pattern
+        matched_keys = set()
+
+        # Process each key in the weights dictionary
+        for key in list(weights.keys()):
+            # Check each pattern for a match
+            for pattern, replacement in params_map.items():
+                if re.match(pattern, key):
+                    # Create the new key by applying the regex replacement
+                    new_key = re.sub(pattern, replacement, key)
+                    # Store the weight with the new key
+                    renamed_weights[new_key] = weights[key]
+                    matched_keys.add(key)
+                    break
+
+            # If the key wasn't matched by any pattern, keep it as is
+            if key not in matched_keys:
+                renamed_weights[key] = weights[key]
+
+        return renamed_weights
+
+    def preprocess_weights(self, weights: dict) -> dict:
+        """
+        Preprocess weights before starting the loading process.
+        """
+        ...
+
+    def handle_manual_copy(self, module_name: str, module_weights: dict, n: str,
+                           p: nn.Parameter) -> None:
+        p.data.copy_(module_weights[n][:])
+
+    def does_require_special_handling(self, module_name: str) -> bool:
+        return module_name in self.mapping
+
+    def is_special_instance_module(self, module: nn.Module) -> bool:
+        return False
+
+    def handle_special_instance_module(self, module: nn.Module,
+                                       module_name: str,
+                                       module_weights: dict) -> None:
+        raise NotImplementedError()
+
+    @property
+    def skip_modules(self) -> List[str]:
+        return self._skip_modules
+
+    def add_skip_modules(self, value: List[str]) -> None:
+        self._skip_modules.extend(value)
+
+    def should_skip_module(self, module_name: str) -> bool:
+        return any(skip_module in module_name
+                   for skip_module in self._skip_modules)
+
+    def filter_weights(self, prefix: str, weights: dict) -> dict:
+        result = {}
+        for k, v in weights.items():
+            if k.startswith(prefix):
+                new_k = k[len(prefix) + 1:]
+                result[new_k] = v
+        return result
+
+    @property
+    def mapping(self) -> dict:
+        return self._mapping
+
+    @property
+    def config(self) -> TConfig:
+        if self._config is None:
+            raise RuntimeError("Weight mapper is not initialized")
+        return self._config
+
+    @property
+    def model(self) -> Union[nn.Module, DecoderModelForCausalLM]:
+        if self._model is None:
+            raise RuntimeError("Weight mapper is not initialized")
+        return self._model
diff --git a/tensorrt_llm/_torch/models/checkpoints/hf/__init__.py b/tensorrt_llm/_torch/models/checkpoints/hf/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tensorrt_llm/_torch/models/checkpoints/hf/checkpoint_loader.py b/tensorrt_llm/_torch/models/checkpoints/hf/checkpoint_loader.py
new file mode 100644
index 00000000000..d00a2411474
--- /dev/null
+++ b/tensorrt_llm/_torch/models/checkpoints/hf/checkpoint_loader.py
@@ -0,0 +1,75 @@
+from typing import Optional
+
+from tensorrt_llm._torch.models.checkpoints.base_checkpoint_loader import \
+    BaseCheckpointLoader
+from tensorrt_llm._torch.models.checkpoints.base_config_loader import \
+    BaseConfigLoader
+from tensorrt_llm._torch.models.checkpoints.base_weight_loader import \
+    BaseWeightLoader
+from tensorrt_llm._torch.models.checkpoints.base_weight_mapper import \
+    BaseWeightMapper
+from tensorrt_llm._torch.models.checkpoints.hf.config_loader import \
+    HfConfigLoader
+from tensorrt_llm._torch.models.checkpoints.hf.weight_loader import \
+    HfWeightLoader
+from tensorrt_llm._torch.models.modeling_utils import register_checkpoint_loader
+
+
+@register_checkpoint_loader("HF")
+class HfCheckpointLoader(BaseCheckpointLoader):
+
+    def __init__(self,
+                 *,
+                 weight_loader: Optional[BaseWeightLoader] = None,
+                 weight_mapper: Optional[BaseWeightMapper] = None,
+                 config_loader: Optional[BaseConfigLoader] = None):
+        if weight_loader is None:
+            self._weight_loader = self.get_default_weight_loader()
+        else:
+            self._weight_loader = weight_loader
+        if config_loader is None:
+            self._config_loader = self.get_default_config_loader()
+        else:
+            self._config_loader = config_loader
+        self._weight_mapper = weight_mapper
+        self._checkpoint_format = "HF"
+
+    def cleanup(self) -> None:
+        # Clean up weight mapper first as it may hold model references
+        if self._weight_mapper is not None:
+            self._weight_mapper.cleanup()
+            self._weight_mapper = None
+
+        if self._weight_loader is not None:
+            self._weight_loader.cleanup()
+            self._weight_loader = None
+
+        if self._config_loader is not None:
+            self._config_loader.cleanup()
+            self._config_loader = None
+
+    def get_default_weight_loader(self) -> HfWeightLoader:
+        return HfWeightLoader()
+
+    def get_default_config_loader(self) -> HfConfigLoader:
+        return HfConfigLoader()
+
+    @property
+    def weight_loader(self) -> BaseWeightLoader:
+        return self._weight_loader
+
+    @property
+    def weight_mapper(self) -> Optional[BaseWeightMapper]:
+        return self._weight_mapper
+
+    @weight_mapper.setter
+    def weight_mapper(self, value: BaseWeightMapper) -> None:
+        self._weight_mapper = value
+
+    @property
+    def config_loader(self) -> Optional[BaseConfigLoader]:
+        return self._config_loader
+
+    @property
+    def checkpoint_format(self) -> str:
+        return self._checkpoint_format
diff --git a/tensorrt_llm/_torch/models/checkpoints/hf/config_loader.py b/tensorrt_llm/_torch/models/checkpoints/hf/config_loader.py
new file mode 100644
index 00000000000..a82824db473
--- /dev/null
+++ b/tensorrt_llm/_torch/models/checkpoints/hf/config_loader.py
@@ -0,0 +1,11 @@
+from tensorrt_llm._torch.model_config import ModelConfig
+from tensorrt_llm._torch.models.checkpoints.base_config_loader import \
+    BaseConfigLoader
+from tensorrt_llm._torch.models.modeling_utils import register_config_loader
+
+
+@register_config_loader("HF")
+class HfConfigLoader(BaseConfigLoader):
+
+    def load(self, checkpoint_dir: str, **kwargs) -> ModelConfig:
+        return ModelConfig.from_pretrained(checkpoint_dir, **kwargs)
diff --git a/tensorrt_llm/_torch/models/checkpoints/hf/gemma3_weight_mapper.py b/tensorrt_llm/_torch/models/checkpoints/hf/gemma3_weight_mapper.py
new file mode 100644
index 00000000000..3f35f2d9016
--- /dev/null
+++ b/tensorrt_llm/_torch/models/checkpoints/hf/gemma3_weight_mapper.py
@@ -0,0 +1,34 @@
+from torch import nn
+
+from tensorrt_llm._torch.models.checkpoints.hf.weight_mapper import \
+    HfWeightMapper
+from tensorrt_llm._torch.models.modeling_utils import register_mapper
+
+
+@register_mapper("HF", "Gemma3ForCausalLM")
+class Gemma3HfWeightMapper(HfWeightMapper):
+
+    def should_skip_module(self, module_name: str) -> bool:
+        if self.model.config.tie_word_embeddings and module_name.startswith(
+                "lm_head"):
+            return True
+
+        # Skip loading weights for embedding and lm_head if LoRA is enabled and has custom values
+        if hasattr(self.model, "model") and hasattr(
+                self.model.model, 'has_custom_embed_tokens'
+        ) and self.model.model.has_custom_embed_tokens and module_name == "model.embed_tokens":
+            return True
+        if hasattr(
+                self.model, 'has_custom_lm_head'
+        ) and self.model.has_custom_lm_head and module_name == "lm_head":
+            return True
+
+        return any(skip_module in module_name
+                   for skip_module in self._skip_modules)
+
+    def handle_manual_copy(self, module_name: str, module_weights: dict, n: str,
+                           p: nn.Parameter) -> None:
+        if 'norm' in module_name:
+            p.data.copy_(module_weights[n][:] + 1)
+        else:
+            super().handle_manual_copy(module_name, module_weights, n, p)
diff --git a/tensorrt_llm/_torch/models/checkpoints/hf/llama4_weight_mapper.py b/tensorrt_llm/_torch/models/checkpoints/hf/llama4_weight_mapper.py
new file mode 100644
index 00000000000..4889e02ccaf
--- /dev/null
+++ b/tensorrt_llm/_torch/models/checkpoints/hf/llama4_weight_mapper.py
@@ -0,0 +1,22 @@
+from tensorrt_llm._torch.models.checkpoints.hf.weight_mapper import \
+    HfWeightMapper
+from tensorrt_llm._torch.models.modeling_utils import register_mapper
+
+
+@register_mapper("HF", "Llama4ForConditionalGeneration")
+class Llama4HfWeightMapper(HfWeightMapper):
+    """
+    Weight mapper for Llama4ForConditionalGeneration that handles the
+    'language_model.' prefix removal from weight keys.
+    """
+
+    def filter_weights(self, prefix: str, weights: dict) -> dict:
+        transformed_weights = {}
+        for key, value in weights.items():
+            if key.startswith("language_model."):
+                new_key = key[len("language_model."):]
+                transformed_weights[new_key] = value
+            else:
+                transformed_weights[key] = value
+
+        return super().filter_weights(prefix, transformed_weights)
diff --git a/tensorrt_llm/_torch/models/checkpoints/hf/mixtral_weight_mapper.py b/tensorrt_llm/_torch/models/checkpoints/hf/mixtral_weight_mapper.py
new file mode 100644
index 00000000000..6bdd31383ae
--- /dev/null
+++ b/tensorrt_llm/_torch/models/checkpoints/hf/mixtral_weight_mapper.py
@@ -0,0 +1,26 @@
+from torch import nn
+
+from tensorrt_llm._torch.models.checkpoints.base_weight_mapper import \
+    BaseWeightMapper
+from tensorrt_llm._torch.models.modeling_utils import register_mapper
+
+
+@register_mapper("HF", "MixtralForCausalLM")
+class MixtralHfWeightMapper(BaseWeightMapper):
+
+    def map_weights(self) -> None:
+        self.mapping.update({
+            'qkv_proj': ['q_proj', 'k_proj', 'v_proj'],
+        })
+
+    def apply_callbacks(self, module: nn.Module, module_name: str,
+                        module_names_breakdown: list[str],
+                        weights: dict) -> list[dict]:
+        module_weights = []
+
+        for new_name in self.mapping[module_name]:
+            fw = self.filter_weights(
+                '.'.join(module_names_breakdown + [new_name]), weights)
+            module_weights.append(fw)
+
+        return module_weights
diff --git a/tensorrt_llm/_torch/models/checkpoints/hf/nemotron_h_weight_mapper.py b/tensorrt_llm/_torch/models/checkpoints/hf/nemotron_h_weight_mapper.py
new file mode 100644
index 00000000000..1b2ef8cf81b
--- /dev/null
+++ b/tensorrt_llm/_torch/models/checkpoints/hf/nemotron_h_weight_mapper.py
@@ -0,0 +1,99 @@
+import torch
+
+from tensorrt_llm._torch.models.checkpoints.hf.weight_mapper import \
+    HfWeightMapper
+from tensorrt_llm._torch.models.modeling_nemotron_h import split
+from tensorrt_llm._torch.models.modeling_utils import register_mapper
+
+
+@register_mapper("HF", "NemotronHForCausalLM")
+class NemotronHHfWeightMapper(HfWeightMapper):
+
+    def preprocess_weights(self, weights: dict) -> dict:
+        config = self.config.pretrained_config
+        tp_size = self.config.mapping.tp_size
+        tp_rank = self.config.mapping.tp_rank
+        d_inner = config.hidden_size * config.expand
+        n_groups = config.n_groups
+        d_state = config.ssm_state_size
+        nheads = d_inner // config.mamba_head_dim
+
+        new_weights = {}
+        for name, _ in weights.items():
+            key = name
+
+            # change backbone root name to model
+            if "backbone" in key:
+                key = key.replace("backbone", "model")
+
+            # change embedding layer to embed_token
+            if "embeddings" in key:
+                key = key.replace("embeddings", "embed_tokens")
+
+            if "A_log" in key:
+                key = key.replace("A_log", "A")
+
+            if "_scale" in key and weights[name].dim() == 0:
+                new_weights[key] = weights[name]
+            elif "A" in key:
+                w = split(weights[name], tp_size, tp_rank)
+                w = w.to(torch.float32)
+                w = -torch.exp(w)
+                new_weights[key] = w
+            elif "D" in key:
+                w = split(weights[name], tp_size, tp_rank)
+                w = w.to(torch.float32)
+                new_weights[key] = w
+            elif "dt_bias" in key:
+                w = split(weights[name], tp_size, tp_rank)
+                w = w.to(torch.float32)
+                new_weights[key] = w
+            elif "mixer.in_proj" in key:
+                w = weights[name]
+                in_proj_z, in_proj_x, in_proj_b, in_proj_c, in_proj_dt = torch.split(
+                    w, [
+                        d_inner, d_inner, n_groups * d_state,
+                        n_groups * d_state, nheads
+                    ],
+                    dim=0)
+
+                w = []
+                for rank in range(tp_size):
+                    in_proj_z_rank = split(in_proj_z, tp_size, rank)
+                    in_proj_x_rank = split(in_proj_x, tp_size, rank)
+                    in_proj_b_rank = split(in_proj_b, tp_size, rank)
+                    in_proj_c_rank = split(in_proj_c, tp_size, rank)
+                    in_proj_dt_rank = split(in_proj_dt, tp_size, rank)
+                    y = torch.concat([
+                        in_proj_z_rank, in_proj_x_rank, in_proj_b_rank,
+                        in_proj_c_rank, in_proj_dt_rank
+                    ])
+                    w.append(y)
+
+                w = torch.concat(w).contiguous()
+                new_weights[key] = w
+            elif "conv1d" in key:
+                w = weights[name]
+                # removing dim(1) because we are using Linear to store conv1d weights
+                if "weight" in key:
+                    w = w.squeeze(1)
+
+                conv_x, conv_b, conv_c = torch.split(
+                    w, [d_inner, n_groups * d_state, n_groups * d_state], dim=0)
+
+                w = []
+                for rank in range(tp_size):
+                    conv_x_rank = split(conv_x, tp_size, rank)
+                    conv_b_rank = split(conv_b, tp_size, rank)
+                    conv_c_rank = split(conv_c, tp_size, rank)
+                    y = torch.concat([conv_x_rank, conv_b_rank, conv_c_rank])
+                    w.append(y)
+                w = torch.concat(w).contiguous()
+                new_weights[key] = w
+            elif "mixer.norm.weight" in key:
+                w = split(weights[name], tp_size, tp_rank)
+                new_weights[key] = w
+            else:
+                new_weights[key] = weights[name]
+
+        return new_weights
diff --git a/tensorrt_llm/_torch/models/checkpoints/hf/qwen2_moe_weight_mapper.py b/tensorrt_llm/_torch/models/checkpoints/hf/qwen2_moe_weight_mapper.py
new file mode 100644
index 00000000000..41b40042c76
--- /dev/null
+++ b/tensorrt_llm/_torch/models/checkpoints/hf/qwen2_moe_weight_mapper.py
@@ -0,0 +1,26 @@
+from torch import nn
+
+from tensorrt_llm._torch.models.checkpoints.hf.weight_mapper import \
+    HfWeightMapper
+from tensorrt_llm._torch.models.modeling_utils import register_mapper
+from tensorrt_llm._torch.modules.fused_moe.interface import MoE
+
+
+@register_mapper("HF", "Qwen2MoeForCausalLM")
+class Qwen2MoeHfWeightMapper(HfWeightMapper):
+
+    def is_special_instance_module(self, module: nn.Module) -> bool:
+        return isinstance(module, MoE)
+
+    def handle_special_instance_module(self, module: nn.Module,
+                                       module_name: str,
+                                       module_weights: dict) -> None:
+        if isinstance(module, MoE):
+            updated_module_weights = {}
+            for weight_name, weight_value in module_weights.items():
+                new_weight_name = weight_name.replace(
+                    "gate_proj", "w1").replace("up_proj",
+                                               "w3").replace("down_proj", "w2")
+                updated_module_weights[new_weight_name] = weight_value
+            del module_weights
+            module.load_weights(weights=[updated_module_weights])
diff --git a/tensorrt_llm/_torch/models/checkpoints/hf/qwen3_moe_weight_mapper.py b/tensorrt_llm/_torch/models/checkpoints/hf/qwen3_moe_weight_mapper.py
new file mode 100644
index 00000000000..1d0763dc5a9
--- /dev/null
+++ b/tensorrt_llm/_torch/models/checkpoints/hf/qwen3_moe_weight_mapper.py
@@ -0,0 +1,39 @@
+from torch import nn
+
+from tensorrt_llm._torch.models.checkpoints.hf.qwen2_moe_weight_mapper import \
+    Qwen2MoeHfWeightMapper
+from tensorrt_llm._torch.models.modeling_utils import register_mapper
+
+
+@register_mapper("HF", "Qwen3MoeForCausalLM")
+class Qwen3MoeHfWeightMapper(Qwen2MoeHfWeightMapper):
+
+    def should_skip_module(self, module_name: str) -> bool:
+        if module_name.startswith("draft_model"):
+            return True
+        return super().should_skip_module(module_name)
+
+    def _duplicate_kv_weights(self, module: nn.Module, new_name: str,
+                              weights: dict):
+        tensors_to_duplicate = ["weight", "bias"]
+        if module.quant_config.quant_mode.has_nvfp4():
+            tensors_to_duplicate.append("weight_scale")
+        if module.quant_config.quant_mode.has_fp8_block_scales():
+            tensors_to_duplicate.append("weight_scale_inv")
+
+        if new_name in ['k_proj', 'v_proj']:
+            num_kv_heads_list = [self._num_kv_heads
+                                 ] * len(weights) if isinstance(
+                                     self._num_kv_heads,
+                                     int) else self._num_kv_heads
+            processed_weights = {
+                k:
+                self._duplicate_kv(weight=v[:],
+                                   num_kv_heads=num_kv_heads_list[i],
+                                   tensor_parallel_size=self._tp_size)
+                if k in tensors_to_duplicate else v
+                for i, (k, v) in enumerate(weights.items())
+            }
+            return processed_weights
+
+        return weights
diff --git a/tensorrt_llm/_torch/models/checkpoints/hf/weight_loader.py b/tensorrt_llm/_torch/models/checkpoints/hf/weight_loader.py
new file mode 100644
index 00000000000..1277e25b42f
--- /dev/null
+++ b/tensorrt_llm/_torch/models/checkpoints/hf/weight_loader.py
@@ -0,0 +1,123 @@
+import glob
+import multiprocessing
+import os
+from typing import Any, List
+
+import psutil
+import safetensors
+import torch
+import tqdm
+
+from tensorrt_llm._torch.models.checkpoints.base_weight_loader import \
+    BaseWeightLoader
+from tensorrt_llm._torch.models.modeling_utils import (
+    register_checkpoint_weight_loader, run_concurrently)
+from tensorrt_llm._utils import local_mpi_rank, local_mpi_size
+from tensorrt_llm.logger import logger
+
+
+@register_checkpoint_weight_loader("HF")
+class HfWeightLoader(BaseWeightLoader):
+    """
+    Loads weights from SafeTensors/bin/pth files.
+    """
+
+    def load_weights(self, checkpoint_dir: str) -> dict[str, Any]:
+        weight_files = glob.glob(f"{checkpoint_dir}/*.safetensors")
+        if weight_files:
+            # Prefetch the weight files to CPU memory if the size is less than 90% of the available memory.
+            # This is a heuristic to avoid prefetching files that are too large and causing file cache thrashing.
+            prefetch_size = sum(os.path.getsize(file) for file in weight_files)
+            # If the layer number is overridden, it indicates that only a subset of layers are loaded.
+            # Prefetching all layers is unnecessary.
+            num_layers = int(os.environ.get("TLLM_OVERRIDE_LAYER_NUM", "0"))
+            enable_prefetch = prefetch_size < psutil.virtual_memory(
+            ).available * 0.9 and num_layers == 0
+            if enable_prefetch:
+                logger.info(
+                    f"Prefetching {prefetch_size / (1024**3):.2f}GB checkpoint files."
+                )
+                self.prefetch_files(weight_files)
+
+            return self._load_weights_in_parallel(
+                weight_files, self._load_safetensors_file,
+                "Loading safetensors weights in parallel")
+
+        weight_files = glob.glob(f"{checkpoint_dir}/*.bin")
+        if not weight_files:
+            weight_files = glob.glob(f"{checkpoint_dir}/*.pth")
+
+        if weight_files:
+            return self._load_weights_in_parallel(
+                weight_files, self._load_bin_or_path_file,
+                "Loading bin weights in parallel")
+
+        raise RuntimeError(f"No weight files found in {checkpoint_dir}.")
+
+    def _load_weights_in_parallel(self, weight_files: List[str], load_func,
+                                  description: str) -> dict[str, Any]:
+        """
+        Load weight files in parallel using the specified loading function.
+
+        Args:
+            weight_files: List of weight file paths
+            load_func: Function to load individual weight files
+            description: Description for the progress bar
+
+        Returns:
+            Dictionary containing all loaded weights
+        """
+        weights = {}
+        pbar = tqdm.tqdm(total=len(weight_files), desc=description)
+
+        # Note that the function is called with a tuple of arguments, hence we need to wrap the arguments in a tuple via [(w,) for w in weight_files]
+        # specifically the comma right after the w is important to make it a tuple.
+        run_concurrently(load_func, [(w, ) for w in weight_files],
+                         reduce_func=weights.update,
+                         pbar=pbar)
+
+        return weights
+
+    @staticmethod
+    def _load_safetensors_file(file):
+        return safetensors.torch.load_file(file)
+
+    @staticmethod
+    def _load_bin_or_path_file(file):
+        try:
+            part_weights = torch.load(file,
+                                      weights_only=True,
+                                      map_location='cpu',
+                                      mmap=True)
+        except Exception:
+            logger.warning(
+                f"Failed to load {file} with mmap=True, fallback to mmap=False")
+            part_weights = torch.load(file,
+                                      weights_only=True,
+                                      map_location='cpu',
+                                      mmap=False)
+        finally:
+            return part_weights
+
+    def _prefetch_one_file(self, file_name):
+        if os.path.exists(file_name):
+            logger.info(f"Prefetching {file_name} to memory...")
+            with open(file_name, 'rb') as f:
+                f.read()
+            logger.info(f"Finished prefetching {file_name}.")
+
+    def prefetch_files(self, file_names: List[str]):
+        """
+        Prefetch safetensors files to memory so that the weight loading will be much faster.
+        When multiple ranks run in parallel, each rank will prefetch some files.
+        """
+        # Find out the files to prefetch for the current rank.
+        # Each rank loads files with indices local_rank, local_rank + local_mpi_size, local_rank + 2*local_mpi_size, etc.
+        local_file_names = file_names[local_mpi_rank()::local_mpi_size()]
+        if len(local_file_names) == 0:
+            return
+
+        max_processes = min(multiprocessing.cpu_count() * 2, 16,
+                            len(local_file_names))
+        with multiprocessing.Pool(processes=max_processes) as pool:
+            pool.map(self._prefetch_one_file, local_file_names)
diff --git a/tensorrt_llm/_torch/models/checkpoints/hf/weight_mapper.py b/tensorrt_llm/_torch/models/checkpoints/hf/weight_mapper.py
new file mode 100644
index 00000000000..7df6faa613a
--- /dev/null
+++ b/tensorrt_llm/_torch/models/checkpoints/hf/weight_mapper.py
@@ -0,0 +1,101 @@
+import torch
+from torch import nn
+
+from tensorrt_llm._torch.models.modeling_utils import register_mapper
+
+from ..base_weight_mapper import BaseWeightMapper
+
+
+@register_mapper("HF")
+class HfWeightMapper(BaseWeightMapper):
+
+    def __init__(self):
+        super().__init__()
+        self._callbacks = [
+            self._duplicate_kv_weights,
+        ]
+
+    def map_weights(self) -> None:
+        self.mapping.update({
+            'qkv_proj': ['q_proj', 'k_proj', 'v_proj'],
+            'gate_up_proj': ['gate_proj', 'up_proj']
+        })
+
+    def apply_callbacks(self, module: nn.Module, module_name: str,
+                        module_names_breakdown: list[str],
+                        weights: dict) -> list[dict]:
+        module_weights = []
+
+        for new_name in self._mapping[module_name]:
+            fw = self.filter_weights(
+                '.'.join(module_names_breakdown + [new_name]), weights)
+            for callback in self._callbacks:
+                fw = callback(module, new_name, fw)
+            module_weights.append(fw)
+
+        return module_weights
+
+    def should_skip_module(self, module_name: str) -> bool:
+        if self.model.config.tie_word_embeddings and module_name.startswith(
+                "lm_head"):
+            return True
+
+        # Skip loading weights for embedding and lm_head if LoRA is enabled and has custom values
+        if hasattr(self.model, "model") and hasattr(
+                self.model.model, 'has_custom_embed_tokens'
+        ) and self.model.model.has_custom_embed_tokens and module_name == "model.embed_tokens":
+            return True
+        if hasattr(
+                self.model, 'has_custom_lm_head'
+        ) and self.model.has_custom_lm_head and module_name == "lm_head":
+            return True
+
+        # WAR: better solution is that llama has its own load_weights function.
+        if module_name.split('.')[-1] == 'next_layer_layernorm':
+            return True
+
+        return super().should_skip_module(module_name)
+
+    def _duplicate_kv_weights(self, module: nn.Module, new_name: str,
+                              weights: dict):
+        if new_name in ['k_proj', 'v_proj']:
+            num_kv_heads_list = [self._num_kv_heads
+                                 ] * len(weights) if isinstance(
+                                     self._num_kv_heads,
+                                     int) else self._num_kv_heads
+            processed_weights = {
+                k:
+                self._duplicate_kv(weight=v[:],
+                                   num_kv_heads=num_kv_heads_list[i],
+                                   tensor_parallel_size=self._tp_size)
+                if k in ["weight", "bias"] else v
+                for i, (k, v) in enumerate(weights.items())
+            }
+            return processed_weights
+
+        return weights
+
+    def _duplicate_kv(self, weight: torch.Tensor, num_kv_heads: int,
+                      tensor_parallel_size: int):
+
+        if num_kv_heads >= tensor_parallel_size:
+            assert num_kv_heads % tensor_parallel_size == 0
+            return weight
+
+        assert tensor_parallel_size % num_kv_heads == 0
+        reps = tensor_parallel_size // num_kv_heads
+
+        # bias
+        if weight.ndim == 1:
+            return weight.repeat_interleave(reps)
+
+        # weight and scale
+        assert weight.shape[0] % num_kv_heads == 0
+        size_per_kv_head = weight.shape[0] // num_kv_heads
+        weight = weight.reshape(num_kv_heads, size_per_kv_head,
+                                -1)[:,
+                                    None, :, :].expand(num_kv_heads, reps,
+                                                       size_per_kv_head,
+                                                       weight.shape[1])
+        return weight.reshape(num_kv_heads * reps * size_per_kv_head,
+                              -1).clone().detach()
diff --git a/tensorrt_llm/_torch/models/modeling_gemma3.py b/tensorrt_llm/_torch/models/modeling_gemma3.py
index 460af83ddbf..db672279646 100644
--- a/tensorrt_llm/_torch/models/modeling_gemma3.py
+++ b/tensorrt_llm/_torch/models/modeling_gemma3.py
@@ -3,10 +3,11 @@
 
 import torch
 from torch import nn
-from tqdm import tqdm
 from transformers import Gemma3TextConfig
 from transformers.activations import ACT2FN
 
+from tensorrt_llm._torch.models.checkpoints.base_weight_mapper import \
+    BaseWeightMapper
 from tensorrt_llm.functional import PositionEmbeddingType, RotaryScalingType
 from tensorrt_llm.mapping import Mapping
 
@@ -23,7 +24,6 @@
 from ..modules.multi_stream_utils import maybe_execute_in_parallel
 from ..modules.rms_norm import RMSNorm
 from .modeling_utils import (DecoderModel, DecoderModelForCausalLM,
-                             duplicate_kv_weight, filter_weights,
                              register_auto_model)
 
 
@@ -486,61 +486,5 @@ def forward(
             return_context_logits,
         )
 
-    # This is a modified version of the load_weights function in modeling_utils.py with the
-    # minor change for Gemma3 RMSNorm.
-    def load_weights(self, weights: Dict):
-        tp_size = self.model_config.mapping.tp_size
-        num_kv_heads = self.config.num_key_value_heads
-
-        params_map = {
-            'qkv_proj': ['q_proj', 'k_proj', 'v_proj'],
-            'gate_up_proj': ['gate_proj', 'up_proj']
-        }
-
-        for name, module in tqdm(list(self.named_modules()),
-                                 desc="Loading weights"):
-            if len(module._parameters) > 0:
-                # skip load weights if tie word embeddings is enabled and layer is lm_head
-                if self.config.tie_word_embeddings and name.startswith(
-                        "lm_head"):
-                    continue
-
-                # Skip loading weights for embedding and lm_head if LoRA is enabled.
-                if hasattr(
-                        self.model_config, 'lora_config'
-                ) and self.model_config.lora_config is not None and len(
-                        self.model_config.lora_config.lora_dir) == 1 and (
-                            name == "model.embed_tokens" or name == "lm_head"):
-                    continue
-
-                names = name.split('.')
-                if names[-1] in params_map:
-                    module_weights = []
-                    for new_name in params_map[names[-1]]:
-                        fw = filter_weights('.'.join(names[:-1] + [new_name]),
-                                            weights)
-                        if new_name in ['k_proj', 'v_proj']:
-                            fw = {
-                                k:
-                                duplicate_kv_weight(
-                                    weight=v[:],
-                                    num_kv_heads=num_kv_heads,
-                                    tensor_parallel_size=tp_size)
-                                if k in ["weight", "bias"] else v
-                                for k, v in fw.items()
-                            }
-
-                        module_weights.append(fw)
-                    module.load_weights(weights=module_weights)
-                else:
-                    module_weights = filter_weights(name, weights)
-                    if hasattr(module, 'load_weights'):
-                        module.load_weights(weights=[module_weights])
-                    else:
-                        for n, p in module._parameters.items():
-                            if p is not None:
-                                # Gemma3 RMSNorm uses +1 just like LayerNorm-1P.
-                                if 'norm' in names[-1]:
-                                    p.data.copy_(module_weights[n][:] + 1)
-                                else:
-                                    p.data.copy_(module_weights[n][:])
+    def load_weights(self, weights: Dict, weight_mapper: BaseWeightMapper):
+        super().load_weights(weights, weight_mapper)
diff --git a/tensorrt_llm/_torch/models/modeling_llama.py b/tensorrt_llm/_torch/models/modeling_llama.py
index 1c17eeb5a8e..f4ea1cc3e75 100644
--- a/tensorrt_llm/_torch/models/modeling_llama.py
+++ b/tensorrt_llm/_torch/models/modeling_llama.py
@@ -11,6 +11,8 @@
 
 from tensorrt_llm._torch.distributed import (AllReduce, AllReduceFusionOp,
                                              AllReduceParams, MoEAllReduce)
+from tensorrt_llm._torch.models.checkpoints.base_weight_mapper import \
+    BaseWeightMapper
 from tensorrt_llm.functional import PositionEmbeddingType
 from tensorrt_llm.logger import logger
 from tensorrt_llm.lora_manager import HfLoraLoader
@@ -917,16 +919,8 @@ def infer_max_seq_len(self):
 
         return super().infer_max_seq_len()
 
-    def load_weights(self, weights: Dict):
-        new_weights = {}
-        for key, tensor in weights.items():
-            if key.startswith("language_model."):
-                new_key = key[len("language_model."):]
-                new_weights[new_key] = tensor
-            else:
-                new_weights[key] = tensor
-
-        super().load_weights(new_weights)
+    def load_weights(self, weights: Dict, weight_mapper: BaseWeightMapper):
+        super().load_weights(weights, weight_mapper)
 
         for idx, layer in enumerate(
                 self.model.layers[:self.config.num_hidden_layers]):
diff --git a/tensorrt_llm/_torch/models/modeling_mixtral.py b/tensorrt_llm/_torch/models/modeling_mixtral.py
index bb1459b4ab1..3878252dbc3 100644
--- a/tensorrt_llm/_torch/models/modeling_mixtral.py
+++ b/tensorrt_llm/_torch/models/modeling_mixtral.py
@@ -1,4 +1,4 @@
-from typing import Dict, Optional
+from typing import Optional
 
 import torch
 from torch import nn
@@ -16,7 +16,7 @@
 from ..modules.linear import Linear
 from ..modules.rms_norm import RMSNorm
 from .modeling_utils import (DecoderModel, DecoderModelForCausalLM,
-                             filter_weights, register_auto_model)
+                             register_auto_model)
 
 
 class MixtralMoE(nn.Module):
@@ -215,27 +215,3 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig]):
                          config=model_config,
                          hidden_size=model_config.pretrained_config.hidden_size,
                          vocab_size=model_config.pretrained_config.vocab_size)
-
-    def load_weights(self, weights: Dict):
-
-        params_map = {
-            'qkv_proj': ['q_proj', 'k_proj', 'v_proj'],
-        }
-
-        for name, module in self.named_modules():
-            if len(module._parameters) > 0:
-                names = name.split('.')
-                if names[-1] in params_map:
-                    module_weights = []
-                    for new_name in params_map[names[-1]]:
-                        module_weights.append(
-                            filter_weights('.'.join(names[:-1] + [new_name]),
-                                           weights))
-                    module.load_weights(weights=module_weights)
-                else:
-                    module_weights = filter_weights(name, weights)
-                    if hasattr(module, 'load_weights'):
-                        module.load_weights(weights=[module_weights])
-                    else:
-                        for n, p in module.named_parameters():
-                            p.data.copy_(module_weights[n][:])
diff --git a/tensorrt_llm/_torch/models/modeling_nemotron_h.py b/tensorrt_llm/_torch/models/modeling_nemotron_h.py
index 0e9f9a03361..e19a8dc6ea9 100644
--- a/tensorrt_llm/_torch/models/modeling_nemotron_h.py
+++ b/tensorrt_llm/_torch/models/modeling_nemotron_h.py
@@ -13,13 +13,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from typing import Dict, Optional
+from typing import Optional
 
 import torch
 from torch import nn
 from torch.nn import functional as F
 from transformers import AutoConfig, PretrainedConfig
 
+from tensorrt_llm._torch.models.checkpoints.base_weight_mapper import \
+    BaseWeightMapper
 from tensorrt_llm._torch.modules.mamba.mamba2_metadata import Mamba2Metadata
 
 from ..attention_backend import AttentionMetadata
@@ -254,94 +256,9 @@ def __init__(
             vocab_size=model_config.pretrained_config.vocab_size,
         )
 
-    def load_weights(self, weights: Dict):
-        config = self.model_config.pretrained_config
-        tp_size = self.model_config.mapping.tp_size
-        tp_rank = self.model_config.mapping.tp_rank
-        d_inner = config.hidden_size * config.expand
-        n_groups = config.n_groups
-        d_state = config.ssm_state_size
-        nheads = d_inner // config.mamba_head_dim
-
-        new_weights = {}
-        for name, params in weights.items():
-            key = name
-
-            # change backbone root name to model
-            if "backbone" in key:
-                key = key.replace("backbone", "model")
-
-            # change embedding layer to embed_token
-            if "embeddings" in key:
-                key = key.replace("embeddings", "embed_tokens")
-
-            if "A_log" in key:
-                key = key.replace("A_log", "A")
-
-            if "_scale" in key and weights[name].dim() == 0:
-                new_weights[key] = weights[name]
-            elif "A" in key:
-                w = split(weights[name], tp_size, tp_rank)
-                w = w.to(torch.float32)
-                w = -torch.exp(w)
-                new_weights[key] = w
-            elif "D" in key:
-                w = split(weights[name], tp_size, tp_rank)
-                w = w.to(torch.float32)
-                new_weights[key] = w
-            elif "dt_bias" in key:
-                w = split(weights[name], tp_size, tp_rank)
-                w = w.to(torch.float32)
-                new_weights[key] = w
-            elif "mixer.in_proj" in key:
-                w = weights[name]
-                in_proj_z, in_proj_x, in_proj_b, in_proj_c, in_proj_dt = torch.split(
-                    w, [
-                        d_inner, d_inner, n_groups * d_state,
-                        n_groups * d_state, nheads
-                    ],
-                    dim=0)
-
-                w = []
-                for rank in range(tp_size):
-                    in_proj_z_rank = split(in_proj_z, tp_size, rank)
-                    in_proj_x_rank = split(in_proj_x, tp_size, rank)
-                    in_proj_b_rank = split(in_proj_b, tp_size, rank)
-                    in_proj_c_rank = split(in_proj_c, tp_size, rank)
-                    in_proj_dt_rank = split(in_proj_dt, tp_size, rank)
-                    y = torch.concat([
-                        in_proj_z_rank, in_proj_x_rank, in_proj_b_rank,
-                        in_proj_c_rank, in_proj_dt_rank
-                    ])
-                    w.append(y)
-
-                w = torch.concat(w).contiguous()
-                new_weights[key] = w
-            elif "conv1d" in key:
-                w = weights[name]
-                # removing dim(1) because we are using Linear to store conv1d weights
-                if "weight" in key:
-                    w = w.squeeze(1)
-
-                conv_x, conv_b, conv_c = torch.split(
-                    w, [d_inner, n_groups * d_state, n_groups * d_state], dim=0)
-
-                w = []
-                for rank in range(tp_size):
-                    conv_x_rank = split(conv_x, tp_size, rank)
-                    conv_b_rank = split(conv_b, tp_size, rank)
-                    conv_c_rank = split(conv_c, tp_size, rank)
-                    y = torch.concat([conv_x_rank, conv_b_rank, conv_c_rank])
-                    w.append(y)
-                w = torch.concat(w).contiguous()
-                new_weights[key] = w
-            elif "mixer.norm.weight" in key:
-                w = split(weights[name], tp_size, tp_rank)
-                new_weights[key] = w
-            else:
-                new_weights[key] = weights[name]
-
-        super().load_weights(new_weights)
+    def load_weights(self, weights: dict, weight_mapper: BaseWeightMapper):
+        new_weights = weight_mapper.preprocess_weights(weights)
+        super().load_weights(new_weights, weight_mapper)
 
 
 AutoConfig.register(NemotronHConfig.model_type, NemotronHConfig)
diff --git a/tensorrt_llm/_torch/models/modeling_qwen3_moe.py b/tensorrt_llm/_torch/models/modeling_qwen3_moe.py
index 5877f3daf5a..81bdf650443 100644
--- a/tensorrt_llm/_torch/models/modeling_qwen3_moe.py
+++ b/tensorrt_llm/_torch/models/modeling_qwen3_moe.py
@@ -3,16 +3,18 @@
 
 import torch
 from torch import nn
-from tqdm import tqdm
 from transformers import Qwen3MoeConfig
 
+from tensorrt_llm._torch.models.checkpoints.base_weight_mapper import \
+    BaseWeightMapper
+
 from ..attention_backend import AttentionMetadata
 from ..distributed import (AllReduce, AllReduceFusionOp, AllReduceParams,
                            MoEAllReduce, MoEAllReduceParams, allgather)
 from ..model_config import ModelConfig
 from ..modules.decoder_layer import DecoderLayer
 from ..modules.embedding import Embedding
-from ..modules.fused_moe import (BaseMoeRoutingMethod, CutlassFusedMoE, MoE,
+from ..modules.fused_moe import (BaseMoeRoutingMethod, CutlassFusedMoE,
                                  RenormalizeMoeRoutingMethod,
                                  RenormalizeNaiveMoeRoutingMethod,
                                  RoutingMethodType, TRTLLMGenFusedMoE,
@@ -22,9 +24,7 @@
 from ..speculative import SpecMetadata
 from .modeling_qwen3 import Qwen3Attention
 from .modeling_speculative import SpecDecOneEngineForCausalLM
-from .modeling_utils import (DecoderModel, EagerFusionConfig,
-                             duplicate_kv_weight, filter_weights,
-                             register_auto_model)
+from .modeling_utils import DecoderModel, EagerFusionConfig, register_auto_model
 
 
 class Qwen3Gate(nn.Module):
@@ -389,67 +389,9 @@ def __init__(
             model_config,
         )
 
-    def load_weights(self, weights: Dict):
-        tp_size = self.model_config.mapping.tp_size
-        enable_attention_dp = self.model_config.mapping.enable_attention_dp
-
-        num_kv_heads = self.config.num_key_value_heads
-
-        params_map = {
-            "qkv_proj": ["q_proj", "k_proj", "v_proj"],
-            "gate_up_proj": ["gate_proj", "up_proj"]
-        }
-        for name, module in tqdm(list(self.named_modules()),
-                                 desc="Loading weights"):
-            if len(module._parameters) > 0:
-                # skip load weights if tie word embeddings is enabled and layer is lm_head
-                if self.config.tie_word_embeddings and name.startswith(
-                        "lm_head") or name.startswith("draft_model"):
-                    continue
-
-                names = name.split(".")
-                if names[-1] in params_map:
-                    module_weights = []
-                    for new_name in params_map[names[-1]]:
-                        fw = filter_weights(".".join(names[:-1] + [new_name]),
-                                            weights)
-                        tensors_need_duplication = ["weight", "bias"]
-                        if module.quant_config.quant_mode.has_nvfp4():
-                            tensors_need_duplication.append("weight_scale")
-                        if module.quant_config.quant_mode.has_fp8_block_scales(
-                        ):
-                            tensors_need_duplication.append("weight_scale_inv")
-                        if new_name in ["k_proj", "v_proj"]:
-                            fw = {
-                                k: (duplicate_kv_weight(
-                                    weight=v[:],
-                                    num_kv_heads=num_kv_heads,
-                                    tensor_parallel_size=tp_size
-                                    if not enable_attention_dp else 1)
-                                    if k in tensors_need_duplication else v)
-                                for k, v in fw.items()
-                            }
-                        module_weights.append(fw)
-                    module.load_weights(weights=module_weights)
-                else:
-                    module_weights = filter_weights(name, weights)
-                    if isinstance(module, MoE):
-                        updated_module_weights = {}
-                        for weight_name, weight_value in module_weights.items():
-                            new_weight_name = (weight_name.replace(
-                                "gate_proj",
-                                "w1").replace("up_proj",
-                                              "w3").replace("down_proj", "w2"))
-                            updated_module_weights[
-                                new_weight_name] = weight_value
-                        del module_weights
-                        module.load_weights(weights=[updated_module_weights])
-                    elif hasattr(module, "load_weights"):
-                        module.load_weights(weights=[module_weights])
-                    else:
-                        for n, p in module._parameters.items():
-                            if p is not None:
-                                p.data.copy_(module_weights[n][:])
+    def load_weights(self, weights: dict, weight_mapper: BaseWeightMapper):
+        super().load_weights(weights, weight_mapper)
+
         for idx, layer in enumerate(
                 self.model.layers[:self.config.num_hidden_layers]):
             if idx == self.config.num_hidden_layers - 1:
diff --git a/tensorrt_llm/_torch/models/modeling_qwen_moe.py b/tensorrt_llm/_torch/models/modeling_qwen_moe.py
index 0ee6d1ac55d..2bbf9b80d54 100644
--- a/tensorrt_llm/_torch/models/modeling_qwen_moe.py
+++ b/tensorrt_llm/_torch/models/modeling_qwen_moe.py
@@ -1,9 +1,8 @@
-from typing import Dict, Optional
+from typing import Optional
 
 import torch
 import torch.nn.functional as F
 from torch import nn
-from tqdm import tqdm
 from transformers import Qwen2MoeConfig
 
 from tensorrt_llm.functional import PositionEmbeddingType
@@ -14,12 +13,11 @@
 from ..modules.attention import Attention
 from ..modules.decoder_layer import DecoderLayer
 from ..modules.embedding import Embedding
-from ..modules.fused_moe import DefaultMoeRoutingMethod, MoE, create_moe
+from ..modules.fused_moe import DefaultMoeRoutingMethod, create_moe
 from ..modules.gated_mlp import GatedMLP
 from ..modules.linear import Linear, TensorParallelMode
 from ..modules.rms_norm import RMSNorm
 from .modeling_utils import (DecoderModel, DecoderModelForCausalLM,
-                             duplicate_kv_weight, filter_weights,
                              register_auto_model)
 
 
@@ -255,57 +253,3 @@ def __init__(
                          config=model_config,
                          hidden_size=model_config.pretrained_config.hidden_size,
                          vocab_size=model_config.pretrained_config.vocab_size)
-
-    def load_weights(self, weights: Dict):
-        tp_size = self.model_config.mapping.tp_size
-        num_kv_heads = self.config.num_key_value_heads
-
-        params_map = {
-            'qkv_proj': ['q_proj', 'k_proj', 'v_proj'],
-            'gate_up_proj': ['gate_proj', 'up_proj']
-        }
-        for name, module in tqdm(list(self.named_modules()),
-                                 desc="Loading weights"):
-            if len(module._parameters) > 0:
-                # skip load weights if tie word embeddings is enabled and layer is lm_head
-                if self.config.tie_word_embeddings and name.startswith(
-                        "lm_head"):
-                    continue
-
-                names = name.split('.')
-                if names[-1] in params_map:
-                    module_weights = []
-                    for new_name in params_map[names[-1]]:
-                        fw = filter_weights('.'.join(names[:-1] + [new_name]),
-                                            weights)
-                        if new_name in ['k_proj', 'v_proj']:
-                            fw = {
-                                k:
-                                duplicate_kv_weight(
-                                    weight=v[:],
-                                    num_kv_heads=num_kv_heads,
-                                    tensor_parallel_size=tp_size)
-                                if k in ["weight", "bias"] else v
-                                for k, v in fw.items()
-                            }
-                        module_weights.append(fw)
-                    module.load_weights(weights=module_weights)
-                else:
-                    module_weights = filter_weights(name, weights)
-                    if isinstance(module, MoE):
-                        updated_module_weights = {}
-                        for weight_name, weight_value in module_weights.items():
-                            new_weight_name = weight_name.replace(
-                                "gate_proj",
-                                "w1").replace("up_proj",
-                                              "w3").replace("down_proj", "w2")
-                            updated_module_weights[
-                                new_weight_name] = weight_value
-                        del module_weights
-                        module.load_weights(weights=[updated_module_weights])
-                    elif hasattr(module, 'load_weights'):
-                        module.load_weights(weights=[module_weights])
-                    else:
-                        for n, p in module._parameters.items():
-                            if p is not None:
-                                p.data.copy_(module_weights[n][:])
diff --git a/tensorrt_llm/_torch/models/modeling_speculative.py b/tensorrt_llm/_torch/models/modeling_speculative.py
index c6f05219775..ee178b48f14 100644
--- a/tensorrt_llm/_torch/models/modeling_speculative.py
+++ b/tensorrt_llm/_torch/models/modeling_speculative.py
@@ -4,6 +4,8 @@
 from torch import nn
 from transformers import LlamaConfig
 
+from tensorrt_llm._torch.models.checkpoints.base_weight_mapper import \
+    BaseWeightMapper
 from tensorrt_llm.functional import PositionEmbeddingType
 
 from ..attention_backend import AttentionMetadata
@@ -264,7 +266,7 @@ def forward(
             return_context_logits,
         )
 
-    def load_weights(self, weights: Dict):
+    def load_weights(self, weights: Dict, weight_mapper: BaseWeightMapper):
         new_weights = {}
         for k, v in weights.items():
             if 'lm_head' not in k:
@@ -274,9 +276,12 @@ def load_weights(self, weights: Dict):
                 new_k = k
             new_weights[new_k] = v
         if self.load_lm_head_from_target:
-            super().load_weights(new_weights, skip_modules=['lm_head'])
+            super().load_weights(weights=new_weights,
+                                 weight_mapper=weight_mapper,
+                                 skip_modules=['lm_head'])
         else:
-            super().load_weights(new_weights)
+            super().load_weights(weights=new_weights,
+                                 weight_mapper=weight_mapper)
 
     def load_weights_from_target_model(self,
                                        target_model: torch.nn.Module) -> None:
@@ -401,9 +406,16 @@ def forward(
 
         return logits
 
-    def load_weights(self, weights: Dict):
-        super().load_weights(weights, skip_modules=["draft_model"])
-
-    def load_draft_weights(self, weights: Dict):
-        self.draft_model.load_weights(weights)
+    def load_weights(self,
+                     weights: Dict,
+                     weight_mapper: Optional[BaseWeightMapper] = None):
+        super().load_weights(weights=weights,
+                             weight_mapper=weight_mapper,
+                             skip_modules=["draft_model"])
+
+    def load_draft_weights(self,
+                           weights: Dict,
+                           weight_mapper: Optional[BaseWeightMapper] = None):
+        self.draft_model.load_weights(weights=weights,
+                                      weight_mapper=weight_mapper)
         self.draft_model.load_weights_from_target_model(self)
diff --git a/tensorrt_llm/_torch/models/modeling_utils.py b/tensorrt_llm/_torch/models/modeling_utils.py
index 1dac009f5c1..c751bdcbb01 100755
--- a/tensorrt_llm/_torch/models/modeling_utils.py
+++ b/tensorrt_llm/_torch/models/modeling_utils.py
@@ -524,12 +524,25 @@ def forward(
             return_context_logits,
         )
 
-    def load_weights(self, weights: Dict, skip_modules: List[str] = []):
+    def load_weights(self,
+                     weights: Dict,
+                     weight_mapper: Optional["BaseWeightMapper"] = None,
+                     skip_modules: List[str] = []):
+        # TODO smor- this solution is a temporary solution to load weights while we are still using
+        # the old checkpoint format loading process. Once checkpoint format is unified
+        # this method will be removed.
         preload_weight_modules = getattr(self, "preload_weight_modules", None)
-        _load_weights_impl(self,
-                           weights,
-                           skip_modules,
-                           preload_weight_modules=preload_weight_modules)
+        if weight_mapper is None:
+            _load_weights_impl(self,
+                               weights,
+                               skip_modules,
+                               preload_weight_modules=preload_weight_modules)
+        else:
+            _load_weights_impl_v2(self,
+                                  weights,
+                                  weight_mapper,
+                                  skip_modules,
+                                  preload_weight_modules=preload_weight_modules)
 
     def infer_max_seq_len(self) -> int:
         # Modified from tensorrt_llm/builder.py _init_max_seq_len
@@ -558,6 +571,10 @@ def infer_max_seq_len(self) -> int:
 
 
 MODEL_CLASS_MAPPING = {}
+MODEL_CLASS_MAPPER_MAPPING = {}
+MODEL_CLASS_CHECKPOINT_WEIGHT_LOADER_DEFAULT_MAPPING = {}
+MODEL_CLASS_CONFIG_LOADER_DEFAULT_MAPPING = {}
+CHECKPOINT_LOADER_FORMAT_DEFAULT_MAPPING = {}
 
 
 def register_auto_model(name: str):
@@ -569,6 +586,59 @@ def decorator(cls):
     return decorator
 
 
+def register_mapper(format: str, name: Optional[str] = None):
+
+    def decorator(cls):
+        if name is not None:
+            # set cls for model name and format pair
+            MODEL_CLASS_MAPPER_MAPPING[f'{name}_{format}'] = cls
+        else:
+            # resort to the default per format
+            MODEL_CLASS_MAPPER_MAPPING[format] = cls
+        return cls
+
+    return decorator
+
+
+def register_checkpoint_weight_loader(name: str):
+
+    def decorator(cls):
+        MODEL_CLASS_CHECKPOINT_WEIGHT_LOADER_DEFAULT_MAPPING[name] = cls
+        return cls
+
+    return decorator
+
+
+def register_checkpoint_loader(name: str):
+
+    def decorator(cls):
+        CHECKPOINT_LOADER_FORMAT_DEFAULT_MAPPING[name] = cls
+        return cls
+
+    return decorator
+
+
+def register_config_loader(name: str):
+
+    def decorator(cls):
+        MODEL_CLASS_CONFIG_LOADER_DEFAULT_MAPPING[name] = cls
+        return cls
+
+    return decorator
+
+
+def get_checkpoint_weight_loader(name: str) -> Type["BaseWeightLoader"]:
+    if name not in MODEL_CLASS_CHECKPOINT_WEIGHT_LOADER_DEFAULT_MAPPING:
+        raise ValueError(f"Default checkpoint weight loader {name} not found.")
+    return MODEL_CLASS_CHECKPOINT_WEIGHT_LOADER_DEFAULT_MAPPING[name]
+
+
+def get_config_loader(name: str) -> Type["BaseConfigLoader"]:
+    if name not in MODEL_CLASS_CONFIG_LOADER_DEFAULT_MAPPING:
+        raise ValueError(f"Default config loader {name} not found.")
+    return MODEL_CLASS_CONFIG_LOADER_DEFAULT_MAPPING[name]
+
+
 def get_model_architecture(
         model_config: TConfig) -> Tuple[Type[nn.Module], str]:
     cls = None
@@ -587,7 +657,6 @@ def get_model_architecture(
 def rename_weights_with_regex(pattern_mapping: Dict[str, str], weights: Dict):
     """
     Rename weight keys according to regex pattern matching.
-
     Args:
         pattern_mapping: A dictionary mapping regex patterns to replacement strings. The key is HF name pattern, and the value is corresponding TRT-LLM name pattern.
             The patterns will be used to match keys in the weights dict and replace
@@ -600,7 +669,6 @@ def rename_weights_with_regex(pattern_mapping: Dict[str, str], weights: Dict):
                 r'(.*?)out_proj(.*)': r'\1o_proj\2'
             }
         weights: A dictionary of weights
-
     Returns:
         A dictionary of weights with renamed keys
     """
@@ -683,6 +751,9 @@ def _load_weights_impl(model: Union[nn.Module, DecoderModelForCausalLM],
                        preload_weight_modules: Optional[List[str]] = None):
     # TODO: remove preload_weight_modules - it is a workaround for min-latency llama4 model loading where
     # we need some order in the module loading. Once this is resolved, we can remove this workaround.
+    # TODO smor- this method is here as a temporary solution to load weights.
+    # Once checkpoint format is unified, this method will be removed.
+
     if not hasattr(model, 'model_config') or not isinstance(
             model.model_config, ModelConfig):
         raise ValueError("model must have a model_config attribute")
@@ -784,3 +855,71 @@ def load_single_module(name, module):
         args_list = [(name, module) for name, module in model.named_modules()
                      if name not in serial_load_modules]
         run_concurrently(load_single_module, args_list, pbar=pbar)
+
+
+def _load_weights_impl_v2(model: Union[nn.Module, DecoderModelForCausalLM],
+                          weights: Dict,
+                          weight_mapper: "BaseWeightMapper",
+                          skip_modules: List[str] = [],
+                          params_map: Optional[Dict[str, str]] = None,
+                          preload_weight_modules: Optional[List[str]] = None):
+    # TODO: remove preload_weight_modules - it is a workaround for min-latency llama4 model loading where
+    # we need some order in the module loading. Once this is resolved, we can remove this workaround.
+    weight_mapper.add_skip_modules(skip_modules)
+    if params_map is not None:
+        weights = weight_mapper.rename_by_params_map(params_map, weights)
+        logger.info(f"Renamed weights with params_map: {params_map}")
+
+    def load_single_module(name, module):
+        if len(module._parameters) > 0:
+            if weight_mapper.should_skip_module(name):
+                return
+
+            names = name.split('.')
+            module_names_breakdown, module_name = names[:-1], names[-1]
+
+            if weight_mapper.does_require_special_handling(module_name):
+                module_weights = weight_mapper.apply_callbacks(
+                    module, module_name, module_names_breakdown, weights)
+                module.load_weights(weights=module_weights)
+            else:
+                module_weights = weight_mapper.filter_weights(name, weights)
+                if weight_mapper.is_special_instance_module(module):
+                    weight_mapper.handle_special_instance_module(
+                        module, module_name, module_weights)
+
+                elif hasattr(module, 'load_weights'):
+                    module.load_weights(weights=[module_weights])
+                else:
+                    for n, p in module._parameters.items():
+                        if p is not None:
+                            weight_mapper.handle_manual_copy(
+                                module_name, module_weights, n, p)
+
+    if os.environ.get("TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL",
+                      False) in ["True", "true", "1", "yes", "y"]:
+        for name, module in tqdm(list(model.named_modules()),
+                                 desc="Loading weights"):
+            load_single_module(name, module)
+    else:
+        all_modules = dict(model.named_modules())
+        serial_load_modules = []
+        if preload_weight_modules is not None:
+            for module in preload_weight_modules:
+                serial_load_modules.extend([
+                    name for name in all_modules.keys() if name.endswith(module)
+                ])
+            logger.info(f"Serial load modules: {serial_load_modules}")
+            pbar = tqdm(serial_load_modules, desc="Loading weights serially")
+            for module in serial_load_modules:
+                # logger.info(f"Loading weights for {module} in serial")
+                load_single_module(module, all_modules[module])
+                pbar.update(1)
+                del all_modules[module]
+            pbar.close()
+
+        pbar = tqdm(list(model.named_modules()),
+                    desc="Loading weights concurrently")
+        args_list = [(name, module) for name, module in model.named_modules()
+                     if name not in serial_load_modules]
+        run_concurrently(load_single_module, args_list, pbar=pbar)
diff --git a/tensorrt_llm/_torch/pyexecutor/_util.py b/tensorrt_llm/_torch/pyexecutor/_util.py
index 979bc83f218..88e046eb056 100644
--- a/tensorrt_llm/_torch/pyexecutor/_util.py
+++ b/tensorrt_llm/_torch/pyexecutor/_util.py
@@ -472,7 +472,6 @@ def create_py_executor_instance(
         num_lora_modules = model_engine.model.model_config.pretrained_config.num_hidden_layers * \
             len(lora_config.lora_target_modules + lora_config.missing_qkv_modules)
 
-        # TODO smor- need to figure out how to set these values
         executor_config.peft_cache_config = trtllm.PeftCacheConfig(
             num_device_module_layer=max_lora_rank * num_lora_modules *
             lora_config.max_loras,
diff --git a/tensorrt_llm/_torch/pyexecutor/config.py b/tensorrt_llm/_torch/pyexecutor/config.py
index b1935a51234..181f2b0bdc0 100644
--- a/tensorrt_llm/_torch/pyexecutor/config.py
+++ b/tensorrt_llm/_torch/pyexecutor/config.py
@@ -1,6 +1,8 @@
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Union
 
+from tensorrt_llm._torch.models.checkpoints.base_checkpoint_loader import \
+    BaseCheckpointLoader
 from tensorrt_llm.bindings.executor import ExecutorConfig
 
 from ...builder import BuildConfig
@@ -117,7 +119,9 @@ def update_executor_config(
         speculative_config: Optional["DecodingBaseConfig"] = None,
         hf_model_dir: Optional[str] = None,
         max_input_len: Optional[int] = None,
-        max_seq_len: Optional[int] = None):
+        max_seq_len: Optional[int] = None,
+        checkpoint_format: Optional[str] = None,
+        checkpoint_loader: Optional[BaseCheckpointLoader] = None):
     if backend is None:
         return
 
@@ -145,3 +149,31 @@ def update_executor_config(
 
     if max_seq_len is not None:
         executor_config.max_seq_len = max_seq_len
+
+    executor_config.checkpoint_loader = _construct_checkpoint_loader(
+        backend, checkpoint_loader, checkpoint_format)
+
+
+def _construct_checkpoint_loader(
+        backend: str, checkpoint_loader: Optional[BaseCheckpointLoader],
+        checkpoint_format: Optional[str]) -> Optional[BaseCheckpointLoader]:
+    if backend == "_autodeploy":
+        return None
+
+    from tensorrt_llm._torch.models.checkpoints.base_checkpoint_loader import \
+        BaseCheckpointLoader
+    from tensorrt_llm._torch.models.modeling_utils import (
+        get_checkpoint_weight_loader, get_config_loader)
+
+    if checkpoint_loader is None:
+        checkpoint_weight_loader = get_checkpoint_weight_loader(
+            checkpoint_format)()
+        config_loader = get_config_loader(checkpoint_format)()
+
+        checkpoint_loader = BaseCheckpointLoader.get(
+            checkpoint_format=checkpoint_format,
+            weight_loader=checkpoint_weight_loader,
+            weight_mapper=None,
+            config_loader=config_loader)
+
+    return checkpoint_loader
diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
index 635787a0324..5333b940ebc 100644
--- a/tensorrt_llm/_torch/pyexecutor/model_engine.py
+++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -2,28 +2,24 @@
 import contextlib
 import functools
 import gc
-import glob
 import inspect
 import math
-import multiprocessing
 import os
 import traceback
 import weakref
 from abc import ABC, abstractmethod
 from contextlib import contextmanager
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, Optional, Tuple
 
-import psutil
-import safetensors
 import torch
 import torch._dynamo.config
-import tqdm
 
 import tensorrt_llm.bindings.internal.userbuffers as ub
+from tensorrt_llm._torch.models.checkpoints.base_checkpoint_loader import \
+    BaseCheckpointLoader
 from tensorrt_llm._torch.pyexecutor.sampler import SampleStateTensors
 from tensorrt_llm._torch.speculative.mtp import SampleStateTensorsMTP
-from tensorrt_llm._utils import (is_trace_enabled, local_mpi_rank,
-                                 local_mpi_size, nvtx_range, release_gc,
+from tensorrt_llm._utils import (is_trace_enabled, nvtx_range, release_gc,
                                  torch_dtype_to_str, trace_func)
 from tensorrt_llm.bindings.executor import GuidedDecodingConfig
 from tensorrt_llm.inputs.multimodal import MultimodalParams
@@ -48,7 +44,7 @@
 from ..model_config import ModelConfig, MoeLoadBalancerConfig
 from ..models import AutoModelForCausalLM
 from ..models.modeling_utils import (DecoderModelForCausalLM, MetaInitMode,
-                                     run_concurrently, timing)
+                                     timing)
 from ..modules.fused_moe.moe_load_balancer import (
     MoeLoadBalancer, MoeLoadBalancerIterContext, maybe_create_moe_load_balancer)
 from ..speculative import SpecMetadata, get_spec_metadata
@@ -140,99 +136,6 @@ def validate_and_set_kv_cache_quant(model_config: ModelConfig,
     model_config.quant_config.kv_cache_quant_algo = mapped_pyt_quant
 
 
-def _prefetch_one_file(file_name):
-    if os.path.exists(file_name):
-        logger.info(f"Prefetching {file_name} to memory...")
-        with open(file_name, 'rb') as f:
-            f.read()
-        logger.info(f"Finished prefetching {file_name}.")
-
-
-def prefetch_files(file_names: List[str]):
-    """
-    Prefetch safetensors files to memory so that the weight loading will be much faster.
-    When multiple ranks run in parallel, each rank will prefetch some files.
-    """
-
-    # Find out the files to prefetch for the current rank.
-    # Each rank loads files with indices local_rank, local_rank + local_mpi_size, local_rank + 2*local_mpi_size, etc.
-    local_file_names = file_names[local_mpi_rank()::local_mpi_size()]
-    if len(local_file_names) == 0:
-        return
-
-    max_processes = min(multiprocessing.cpu_count() * 2, 16,
-                        len(local_file_names))
-    with multiprocessing.Pool(processes=max_processes) as pool:
-        pool.map(_prefetch_one_file, local_file_names)
-
-
-def load_weights(checkpoint_dir: str):
-    weights = {}
-    weight_files = glob.glob(f"{checkpoint_dir}/*.safetensors")
-    if weight_files:
-        # Prefetch the weight files to CPU memory if the size is less than 90% of the available memory.
-        # This is a heuristic to avoid prefetching files that are too large and causing file cache thrashing.
-        prefetch_size = sum(os.path.getsize(file) for file in weight_files)
-        # If the layer number is overridden, it indicates that only a subset of layers are loaded.
-        # Prefetching all layers is unnecessary.
-        num_layers = int(os.environ.get("TLLM_OVERRIDE_LAYER_NUM", "0"))
-        enable_prefetch = prefetch_size < psutil.virtual_memory(
-        ).available * 0.9 and num_layers == 0
-        if enable_prefetch:
-            logger.info(
-                f"Prefetching {prefetch_size / (1024**3):.2f}GB checkpoint files."
-            )
-            prefetch_files(weight_files)
-
-        def load_safetensors_file(file):
-            return safetensors.torch.load_file(file)
-
-        pbar = tqdm.tqdm(total=len(weight_files),
-                         desc="Loading safetensors weights in parallel")
-
-        # Note that the function is called with a tuple of arguments, hence we need to wrap the arguments in a tuple via [(w,) for w in weight_files]
-        # specifically the comma right after the w is important to make it a tuple.
-        run_concurrently(load_safetensors_file, [(w, ) for w in weight_files],
-                         reduce_func=weights.update,
-                         pbar=pbar)
-
-        return weights
-
-    weight_files = glob.glob(f"{checkpoint_dir}/*.bin")
-    if not weight_files:
-        weight_files = glob.glob(f"{checkpoint_dir}/*.pth")
-
-    if weight_files:
-
-        def load_bin_or_path_file(file):
-            try:
-                part_weights = torch.load(file,
-                                          weights_only=True,
-                                          map_location='cpu',
-                                          mmap=True)
-            except Exception:
-                logger.warning(
-                    f"Failed to load {file} with mmap=True, fallback to mmap=False"
-                )
-                part_weights = torch.load(file,
-                                          weights_only=True,
-                                          map_location='cpu',
-                                          mmap=False)
-            finally:
-                return part_weights
-
-        pbar = tqdm.tqdm(total=len(weight_files),
-                         desc="Loading bin weights in parallel")
-        # Note that the function is called with a tuple of arguments, hence we need to wrap the arguments in a tuple via [(w,) for w in weight_files]
-        # specifically the comma right after the w is important to make it a tuple.
-        run_concurrently(load_bin_or_path_file, [(w, ) for w in weight_files],
-                         reduce_func=weights.update,
-                         pbar=pbar)
-        return weights
-
-    raise RuntimeError(f"No weight files found in {checkpoint_dir}.")
-
-
 def initialize_dummy_weights(
     model: torch.nn.Module,
     low: float = -1e-3,
@@ -346,6 +249,7 @@ def __init__(
         *,
         model_path: str,
         pytorch_backend_config: PyTorchConfig,
+        checkpoint_loader: BaseCheckpointLoader,
         batch_size: int = 8,
         max_beam_width: int = 1,
         max_num_tokens: int = 8192,
@@ -384,6 +288,7 @@ def __init__(
         self.model = self._load_model(
             model_path,
             mapping=self.mapping,
+            checkpoint_loader=checkpoint_loader,
             attn_backend=attn_backend,
             moe_backend=pytorch_backend_config.moe_backend,
             load_format=pytorch_backend_config.load_format,
@@ -1023,13 +928,15 @@ def __del__(self) -> None:
 
     def _load_model(self,
                     checkpoint_dir: str,
+                    checkpoint_loader: BaseCheckpointLoader,
                     load_format: LoadFormat,
                     max_num_tokens: int,
                     moe_max_num_tokens: Optional[int] = None,
                     moe_load_balancer: Optional[MoeLoadBalancerConfig] = None,
                     lora_config: Optional[LoraConfig] = None,
                     **kwargs):
-        config = ModelConfig.from_pretrained(
+
+        config = checkpoint_loader.load_config(
             checkpoint_dir,
             trust_remote_code=True,
             enable_min_latency=self.pytorch_backend_config.enable_min_latency,
@@ -1082,20 +989,24 @@ def init_meta_tensor(t: torch.Tensor):
             logger.info(
                 f"Use {rank_model_storage / (1024**3):.2f} GB for model weights."
             )
-
             if load_format == LoadFormat.AUTO:
                 if hasattr(model, 'llm_checkpoint_dir'):
-                    weights = load_weights(model.llm_checkpoint_dir)
+                    weights = checkpoint_loader.load_weights(
+                        model.llm_checkpoint_dir)
                 else:
-                    weights = load_weights(checkpoint_dir)
+                    weights = checkpoint_loader.load_weights(checkpoint_dir)
 
-                model.load_weights(weights)
+                weight_mapper = checkpoint_loader.get_initilized_weight_mapper(
+                    model, config)
+                self._call_load_weights(model.load_weights, weights,
+                                        weight_mapper)
 
                 if self.spec_config is not None and self.spec_config.spec_dec_mode.need_load_draft_weights(
                 ):
-                    weights = load_weights(
+                    weights = checkpoint_loader.load_weights(
                         self.spec_config.speculative_model_dir)
-                    model.load_draft_weights(weights)
+                    self._call_load_weights(model.load_draft_weights, weights,
+                                            weight_mapper)
 
             elif load_format == LoadFormat.DUMMY:
                 initialize_dummy_weights(model)
@@ -1114,6 +1025,16 @@ def init_meta_tensor(t: torch.Tensor):
             torch.cuda.current_stream().synchronize()
         return model
 
+    def _call_load_weights(self, load_method, weights, weight_mapper):
+        # TODO smor- this is a temporary solution to load weights.
+        # Once checkpoint format is unified, this method will be removed.
+        from inspect import getfullargspec
+        args = getfullargspec(load_method).args
+        if "weight_mapper" in args:
+            load_method(weights, weight_mapper=weight_mapper)
+        else:
+            load_method(weights)
+
     def _init_max_seq_len(self):
         inferred_max_seq_len = self.model.infer_max_seq_len()
         if self.max_seq_len is None:
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
index b6893d69e26..b99037d8a04 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
@@ -239,6 +239,7 @@ def create_py_executor(
             spec_config=spec_config,
             guided_decoding_config=executor_config.guided_decoding_config,
             lora_config=lora_config,
+            checkpoint_loader=executor_config.checkpoint_loader,
         )
 
     if has_draft_model_engine:
@@ -262,6 +263,7 @@ def create_py_executor(
                 attn_runtime_features=attn_runtime_features,
                 dist=dist,
                 spec_config=draft_spec_config,
+                checkpoint_loader=executor_config.checkpoint_loader,
                 is_draft_model=True,
             )
             draft_model_engine.kv_cache_manager_key = ResourceManagerType.DRAFT_KV_CACHE_MANAGER
diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py
index da90fc8fe93..a82d0d71e5f 100644
--- a/tensorrt_llm/executor/worker.py
+++ b/tensorrt_llm/executor/worker.py
@@ -567,6 +567,12 @@ def shutdown(self):
             self.engine.shutdown()
             self.engine = None
 
+            if hasattr(
+                    self._executor_config, "checkpoint_loader"
+            ) and self._executor_config.checkpoint_loader is not None:
+                self._executor_config.checkpoint_loader.cleanup()
+                self._executor_config.checkpoint_loader = None
+
         # Check if there are any errors from the threads before shutdown.
         self._handle_background_error()
 
diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py
index d5d0c935002..1afe97d3ce4 100644
--- a/tensorrt_llm/llmapi/llm.py
+++ b/tensorrt_llm/llmapi/llm.py
@@ -970,7 +970,11 @@ def _build_model(self):
             speculative_config=self.args.speculative_config,
             hf_model_dir=self._hf_model_dir,
             max_input_len=self.args.max_input_len,
-            max_seq_len=max_seq_len)
+            max_seq_len=max_seq_len,
+            checkpoint_format=None if self.args.backend == "_autodeploy" else
+            self.args.checkpoint_format,
+            checkpoint_loader=None if self.args.backend == "_autodeploy" else
+            self.args.checkpoint_loader)
 
         # TODO: revisit gather_context_logits
         return_logits = self.args.gather_generation_logits
diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py
index a08982022d2..76fbaf473b1 100644
--- a/tensorrt_llm/llmapi/llm_args.py
+++ b/tensorrt_llm/llmapi/llm_args.py
@@ -1872,6 +1872,18 @@ class TorchLlmArgs(BaseLlmArgs):
                 'LOWPRECISION',
                 'MNNVL']] = Field(default='AUTO',
                                   description="Allreduce strategy to use.")
+    checkpoint_loader: Optional[object] = Field(
+        default=None,
+        description="The checkpoint loader to use for this LLM instance.",
+        json_schema_extra={
+            "type": "Optional[tensorrt_llm._torch.BaseCheckpointLoader]"
+        },
+    )
+
+    checkpoint_format: Optional[str] = Field(
+        default=None,
+        description="The format of the provided checkpoint.",
+    )
 
     # PrivateVars
     _quant_config: Optional[QuantConfig] = PrivateAttr(default=None)
@@ -1926,6 +1938,22 @@ def validate_stream_interval(self):
                 f"stream_interval must be positive, got {self.stream_interval}")
         return self
 
+    @model_validator(mode="after")
+    def validate_checkpoint_format(self):
+        if self.checkpoint_format is not None and self.checkpoint_loader is not None:
+            logger.warning(
+                "checkpoint_format and checkpoint_loader are both provided, "
+                "checkpoint_loader will be ignored.")
+            self.checkpoint_loader = None
+
+        if self.checkpoint_format is None and self.checkpoint_loader is None:
+            logger.info(
+                "neither checkpoint_format nor checkpoint_loader were provided, "
+                "checkpoint_format will be set to HF.")
+            self.checkpoint_format = "HF"
+
+        return self
+
     @staticmethod
     def _generate_cuda_graph_batch_sizes(max_batch_size: int,
                                          enable_padding: bool) -> List[int]:
diff --git a/tests/unittest/_torch/modeling/test_modeling_gemma3.py b/tests/unittest/_torch/modeling/test_modeling_gemma3.py
index d7e27de24b5..36eb7feb242 100644
--- a/tests/unittest/_torch/modeling/test_modeling_gemma3.py
+++ b/tests/unittest/_torch/modeling/test_modeling_gemma3.py
@@ -14,6 +14,8 @@
 from tensorrt_llm._torch.attention_backend.utils import get_attention_backend
 from tensorrt_llm._torch.metadata import KVCacheParams
 from tensorrt_llm._torch.model_config import ModelConfig
+from tensorrt_llm._torch.models.checkpoints.hf.gemma3_weight_mapper import \
+    Gemma3HfWeightMapper
 from tensorrt_llm._torch.models.modeling_gemma3 import Gemma3ForCausalLM
 from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
 from tensorrt_llm.bindings.executor import KvCacheConfig
@@ -268,7 +270,9 @@ def test_gemma3_allclose_to_hf(self, scenario: Scenario) -> None:
         model_config = ModelConfig(pretrained_config=gemma3_config,
                                    attn_backend=backend)
         gemma3 = Gemma3ForCausalLM(model_config).to(dtype).to(device)
-        gemma3.load_weights(hf_gemma3.state_dict())
+        weight_mapper = Gemma3HfWeightMapper()
+        weight_mapper.init_model_and_config(gemma3, model_config)
+        gemma3.load_weights(hf_gemma3.state_dict(), weight_mapper)
 
         kv_cache_manager = self.get_kv_cache_manager(
             dtype=dtype,
diff --git a/tests/unittest/_torch/modeling/test_modeling_llama_min_latency.py b/tests/unittest/_torch/modeling/test_modeling_llama_min_latency.py
index 54319371a87..04ca1cd62f3 100644
--- a/tests/unittest/_torch/modeling/test_modeling_llama_min_latency.py
+++ b/tests/unittest/_torch/modeling/test_modeling_llama_min_latency.py
@@ -14,6 +14,8 @@
 from tensorrt_llm._torch.attention_backend.utils import get_attention_backend
 from tensorrt_llm._torch.metadata import KVCacheParams
 from tensorrt_llm._torch.model_config import ModelConfig
+from tensorrt_llm._torch.models.checkpoints.hf.llama4_weight_mapper import \
+    Llama4HfWeightMapper
 from tensorrt_llm._torch.models.modeling_llama import \
     Llama4ForConditionalGeneration
 from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
@@ -284,7 +286,10 @@ def test_llama_allclose_to_hf(self, scenario: AllCloseScenario) -> None:
             model_config.pytorch_backend_config = PyTorchConfig(
                 enable_min_latency=enable_min_latency)
             llama = Llama4ForConditionalGeneration(model_config)
-            llama.load_weights(hf_llama.state_dict())
+            weight_mapper = Llama4HfWeightMapper()
+            weight_mapper.init_model_and_config(llama, model_config)
+            llama.load_weights(hf_llama.state_dict(),
+                               weight_mapper=weight_mapper)
 
         num_blocks = 1
         tokens_per_block = 128
diff --git a/tests/unittest/_torch/modeling/test_modeling_mixtral.py b/tests/unittest/_torch/modeling/test_modeling_mixtral.py
index edbcf1efd2a..3b9e6896e32 100644
--- a/tests/unittest/_torch/modeling/test_modeling_mixtral.py
+++ b/tests/unittest/_torch/modeling/test_modeling_mixtral.py
@@ -12,6 +12,8 @@
 from tensorrt_llm._torch.attention_backend.utils import get_attention_backend
 from tensorrt_llm._torch.metadata import KVCacheParams
 from tensorrt_llm._torch.model_config import ModelConfig
+from tensorrt_llm._torch.models.checkpoints.hf.mixtral_weight_mapper import \
+    MixtralHfWeightMapper
 from tensorrt_llm._torch.models.modeling_mixtral import MixtralForCausalLM
 from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import \
     DecodingCUDAGraphRunner
@@ -206,7 +208,9 @@ def test_mixtral_allclose_to_hf(self, scenario: Scenario):
             model_config = ModelConfig(pretrained_config=mixtral_config,
                                        attn_backend=backend)
             mixtral = MixtralForCausalLM(model_config)
-            mixtral.load_weights(hf_mixtral.state_dict())
+            weight_mapper = MixtralHfWeightMapper()
+            weight_mapper.init_model_and_config(mixtral, mixtral_config)
+            mixtral.load_weights(hf_mixtral.state_dict(), weight_mapper)
 
         num_blocks = 1
         tokens_per_block = 128
diff --git a/tests/unittest/_torch/modeling/test_modeling_qwen_moe.py b/tests/unittest/_torch/modeling/test_modeling_qwen_moe.py
index 53c7d66ec02..608a17fe1b5 100644
--- a/tests/unittest/_torch/modeling/test_modeling_qwen_moe.py
+++ b/tests/unittest/_torch/modeling/test_modeling_qwen_moe.py
@@ -12,6 +12,8 @@
 from tensorrt_llm._torch.attention_backend.utils import get_attention_backend
 from tensorrt_llm._torch.metadata import KVCacheParams
 from tensorrt_llm._torch.model_config import ModelConfig
+from tensorrt_llm._torch.models.checkpoints.hf.qwen2_moe_weight_mapper import \
+    Qwen2MoeHfWeightMapper
 from tensorrt_llm._torch.models.modeling_qwen_moe import Qwen2MoeForCausalLM
 from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import \
     DecodingCUDAGraphRunner
@@ -210,7 +212,9 @@ def test_qwen_moe_allclose_to_hf(self, scenario: Scenario):
         model_config = ModelConfig(pretrained_config=qwen_moe_config,
                                    attn_backend=backend)
         qwen_moe = Qwen2MoeForCausalLM(model_config).to(device)
-        qwen_moe.load_weights(hf_qwen_moe.state_dict())
+        weight_mapper = Qwen2MoeHfWeightMapper()
+        weight_mapper.init_model_and_config(qwen_moe, qwen_moe_config)
+        qwen_moe.load_weights(hf_qwen_moe.state_dict(), weight_mapper)
 
         num_blocks = 1
         tokens_per_block = 128
diff --git a/tests/unittest/_torch/test_pytorch_model_engine.py b/tests/unittest/_torch/test_pytorch_model_engine.py
index 30c05a67aa0..4cfec14c750 100644
--- a/tests/unittest/_torch/test_pytorch_model_engine.py
+++ b/tests/unittest/_torch/test_pytorch_model_engine.py
@@ -69,6 +69,7 @@ def __init__(self,
                           rank=tensorrt_llm.mpi_rank())
         super().__init__(model_path="",
                          pytorch_backend_config=pytorch_backend_config,
+                         checkpoint_loader=None,
                          batch_size=batch_size,
                          max_seq_len=max_seq_len,
                          mapping=mapping)
diff --git a/tests/unittest/api_stability/references/llm.yaml b/tests/unittest/api_stability/references/llm.yaml
index 132bdee5804..7e4867df50f 100644
--- a/tests/unittest/api_stability/references/llm.yaml
+++ b/tests/unittest/api_stability/references/llm.yaml
@@ -66,6 +66,12 @@ methods:
       cuda_graph_config:
         annotation: Optional[tensorrt_llm.llmapi.llm_args.CudaGraphConfig]
         default: null
+      checkpoint_loader:
+        annotation: Optional[tensorrt_llm._torch.BaseCheckpointLoader]
+        default: null
+      checkpoint_format:
+        annotation: Optional[str]
+        default: null
       disable_overlap_scheduler:
         annotation: bool
         default: False

From fa34cb723457c77292109a9e788681263dc69adf Mon Sep 17 00:00:00 2001
From: Mike Iovine <miovine@nvidia.com>
Date: Wed, 16 Jul 2025 15:45:46 -0400
Subject: [PATCH 45/88] [refactor] Clean up drafter/resource manager creation
 logic (#5805)

Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com>
---
 .../_torch/pyexecutor/py_executor_creator.py    | 13 +++++++------
 tensorrt_llm/_torch/speculative/drafter.py      |  8 --------
 tensorrt_llm/_torch/speculative/ngram.py        |  2 +-
 tensorrt_llm/_torch/speculative/utils.py        | 17 +++++++----------
 tensorrt_llm/llmapi/llm_args.py                 |  5 +++--
 5 files changed, 18 insertions(+), 27 deletions(-)

diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
index b99037d8a04..09976cb512e 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
@@ -360,18 +360,19 @@ def create_py_executor(
                 if estimating_kv_cache else _ExecutorCreationStage.KV_CACHE):
             kv_cache_creator.build_managers(resources)
 
-    # Drafter for speculative decoding
-    with mem_monitor.observe_creation_stage(_ExecutorCreationStage.DRAFTER):
-        drafter = get_spec_drafter(model_engine)
-
     # Resource managers for speculative decoding
+    # For user-specified drafters, use extra_resource_managers in PyTorchBackend config
+    # to provide a resource manager if required.
     spec_resource_manager = get_spec_resource_manager(model_engine,
-                                                      draft_model_engine,
-                                                      drafter)
+                                                      draft_model_engine)
     if spec_resource_manager is not None:
         resources[
             ResourceManagerType.SPEC_RESOURCE_MANAGER] = spec_resource_manager
 
+    # Drafter for speculative decoding
+    with mem_monitor.observe_creation_stage(_ExecutorCreationStage.DRAFTER):
+        drafter = get_spec_drafter(model_engine, spec_resource_manager)
+
     with mem_monitor.observe_creation_stage(
             _ExecutorCreationStage.INIT_EXTRA_RESOURCES
             if estimating_kv_cache else _ExecutorCreationStage.EXTRA_RESOURCES):
diff --git a/tensorrt_llm/_torch/speculative/drafter.py b/tensorrt_llm/_torch/speculative/drafter.py
index d0f5a44d778..d99c5dd92d8 100644
--- a/tensorrt_llm/_torch/speculative/drafter.py
+++ b/tensorrt_llm/_torch/speculative/drafter.py
@@ -1,18 +1,10 @@
 from abc import ABC, abstractmethod
-from typing import Optional
 
-from ..pyexecutor.resource_manager import BaseResourceManager
 from ..pyexecutor.scheduler import ScheduledRequests
 
 
 class Drafter(ABC):
 
-    def __init__(
-        self,
-        spec_resource_manager: Optional[BaseResourceManager] = None,
-    ):
-        self.spec_resource_manager = spec_resource_manager
-
     @abstractmethod
     def prepare_draft_tokens(
         self,
diff --git a/tensorrt_llm/_torch/speculative/ngram.py b/tensorrt_llm/_torch/speculative/ngram.py
index 1d015a58b9b..57f3045e664 100644
--- a/tensorrt_llm/_torch/speculative/ngram.py
+++ b/tensorrt_llm/_torch/speculative/ngram.py
@@ -167,8 +167,8 @@ def __init__(
         ngram_pool_manager: NGramPoolManager = None,
     ):
         assert ngram_pool_manager is not None, "NGram needs a resource manager to maintain the pool."
-        super().__init__(spec_resource_manager=ngram_pool_manager)
         self.max_draft_len = spec_config.max_draft_len
+        self.spec_resource_manager = ngram_pool_manager
 
     def prepare_draft_tokens(
         self,
diff --git a/tensorrt_llm/_torch/speculative/utils.py b/tensorrt_llm/_torch/speculative/utils.py
index 882dfdf924d..667d1a14b0e 100644
--- a/tensorrt_llm/_torch/speculative/utils.py
+++ b/tensorrt_llm/_torch/speculative/utils.py
@@ -55,9 +55,7 @@ def get_spec_metadata(spec_config,
     return None
 
 
-def get_spec_resource_manager(model_engine,
-                              draft_model_engine=None,
-                              drafter=None):
+def get_spec_resource_manager(model_engine, draft_model_engine=None):
     spec_config = model_engine.spec_config
     if spec_config is None:
         return None
@@ -93,9 +91,10 @@ def get_spec_resource_manager(model_engine,
             max_seq_len,
             max_num_tokens,
         )
-    if spec_dec_mode.is_ngram() or spec_dec_mode.is_user_provided():
-        assert drafter is not None, "Drafter is required for ngram or user provided speculative decoding."
-        return drafter.spec_resource_manager
+    if spec_dec_mode.is_ngram():
+        return NGramPoolManager(spec_config, max_num_requests)
+    if spec_dec_mode.is_user_provided():
+        return spec_config.resource_manager
     return None
 
 
@@ -113,14 +112,12 @@ def get_spec_decoder(sampler_args: TorchSampler.Args,
         f"Unsupported speculative decoding mode: {spec_config.spec_dec_mode}")
 
 
-def get_spec_drafter(model_engine):
+def get_spec_drafter(model_engine, spec_resource_manager):
     spec_config = model_engine.spec_config
-    max_num_requests = model_engine.batch_size
     if spec_config is None:
         return None
     if spec_config.spec_dec_mode.is_ngram():
-        return NGramDrafter(spec_config,
-                            NGramPoolManager(spec_config, max_num_requests))
+        return NGramDrafter(spec_config, spec_resource_manager)
     if spec_config.spec_dec_mode.is_user_provided():
         return spec_config.drafter
     return None
diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py
index 76fbaf473b1..111d779ef39 100644
--- a/tensorrt_llm/llmapi/llm_args.py
+++ b/tensorrt_llm/llmapi/llm_args.py
@@ -354,8 +354,9 @@ def get_draft_model_prompt(self,
 
 
 class UserProvidedDecodingConfig(DecodingBaseConfig):
-    # Type should be Drafter, but it leads to circular import
-    drafter: object
+    # Cannot use real type annotations due to circular imports
+    drafter: object  # Type is Drafter
+    resource_manager: object = None  # Type is Optional[ResourceManager]
 
     @classmethod
     def from_dict(cls, data: dict):

From e09e409dfb678fd1f1f91abe79ed7db734882170 Mon Sep 17 00:00:00 2001
From: qixiang-99 <203170375+qixiang-99@users.noreply.github.com>
Date: Wed, 16 Jul 2025 14:41:31 -0700
Subject: [PATCH 46/88] Fix: Enhance ModelConfig for kv cache size calculations
 (#5868)

Signed-off-by: qixiang-99 <203170375+qixiang-99@users.noreply.github.com>
---
 .../batch_manager/kvCacheManager.h            |  8 ++++++
 .../pybind/batch_manager/kvCacheManager.cpp   |  3 ++-
 tensorrt_llm/_torch/model_config.py           | 19 ++++++++++++--
 .../_torch/pyexecutor/resource_manager.py     | 25 +++++++++++++++----
 .../defs/accuracy/test_llm_api_pytorch.py     | 10 +++++++-
 .../test_lists/test-db/l0_h100.yml            |  2 ++
 6 files changed, 58 insertions(+), 9 deletions(-)

diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
index caac72744f3..d0daf9e4350 100644
--- a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
+++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h
@@ -180,6 +180,8 @@ struct KvCacheStats
     SizeType32 missedBlocks;
     // Measuring the KV Cache reuse rate. cacheHitRate = reusedBlocks / (reusedBlocks + missedBlocks).
     float cacheHitRate;
+    // Number of free blocks for every configured attention-window size.
+    std::map<SizeType32, SizeType32> numFreeBlocksPerWindowSize;
 };
 
 // Basic building block of a paged KV cache - a single
@@ -1457,6 +1459,11 @@ class KVCacheManager : public BaseKVCacheManager
         return mBlockManager.getNumMissedBlocks();
     }
 
+    [[nodiscard]] std::map<SizeType32, SizeType32> getNumFreeBlocksPerWindowSize() const
+    {
+        return mBlockManager.getNumFreeBlocksPerWindowSize();
+    }
+
     [[nodiscard]] KvCacheStats getKvCacheStats() const override
     {
         KvCacheStats kvCacheStats;
@@ -1471,6 +1478,7 @@ class KVCacheManager : public BaseKVCacheManager
         kvCacheStats.cacheHitRate = kvCacheStats.reusedBlocks == 0 ? 0
                                                                    : static_cast<float>(kvCacheStats.reusedBlocks)
                 / static_cast<float>(kvCacheStats.reusedBlocks + kvCacheStats.missedBlocks);
+        kvCacheStats.numFreeBlocksPerWindowSize = getNumFreeBlocksPerWindowSize();
         return kvCacheStats;
     }
 
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp
index a75db66eaee..e31269d1fd9 100644
--- a/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp
+++ b/cpp/tensorrt_llm/pybind/batch_manager/kvCacheManager.cpp
@@ -298,7 +298,8 @@ void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(py::module_& m)
         .def_readwrite("alloc_new_blocks", &tbk::KvCacheStats::allocNewBlocks)
         .def_readwrite("reused_blocks", &tbk::KvCacheStats::reusedBlocks)
         .def_readwrite("missed_blocks", &tbk::KvCacheStats::missedBlocks)
-        .def_readwrite("cache_hit_rate", &tbk::KvCacheStats::cacheHitRate);
+        .def_readwrite("cache_hit_rate", &tbk::KvCacheStats::cacheHitRate)
+        .def_readwrite("num_free_blocks_per_window_size", &tbk::KvCacheStats::numFreeBlocksPerWindowSize);
 
     py::class_<tbk::TempAttentionWindowInputs>(m, "TempAttentionWindowInputs")
         .def(py::init<>())
diff --git a/tensorrt_llm/_torch/model_config.py b/tensorrt_llm/_torch/model_config.py
index 830cd5bda6a..671564baadc 100644
--- a/tensorrt_llm/_torch/model_config.py
+++ b/tensorrt_llm/_torch/model_config.py
@@ -305,6 +305,8 @@ def get_bindings_model_config(self,
             hidden_size=hidden_size,
             data_type=torch_dtype_to_binding(
                 self.pretrained_config.torch_dtype))
+
+        # For kv cache size calculation: set tokens_per_block
         if tokens_per_block is None:
             logger.warning(
                 f"tokens_per_block is not set, using default value {model_config_cpp.tokens_per_block}"
@@ -312,6 +314,12 @@ def get_bindings_model_config(self,
         else:
             model_config_cpp.tokens_per_block = tokens_per_block
 
+        # For kv cache size calculation: set num_kv_heads
+        num_kv_heads = getattr(
+            self.pretrained_config, "num_key_value_heads",
+            num_heads) // (self.mapping.tp_size * self.mapping.cp_size)
+        model_config_cpp.set_num_kv_heads(num_kv_heads)
+
         mlp_hidden_size = None
         if self.pretrained_config.intermediate_size is not None:
             mlp_hidden_size = self.pretrained_config.intermediate_size // self.mapping.tp_size
@@ -333,9 +341,16 @@ def get_bindings_model_config(self,
                 f"Failed to infer mlp hidden size for model: {self.pretrained_config.model_type}"
             )
 
-        if "head_size" in self.pretrained_config:
-            head_size = self.pretrained_config.head_size
+        # For kv cache size calculation: set size_per_head
+        head_dim_names = ["head_size", "head_dim"]
+        for head_dim_name in head_dim_names:
+            if head_dim_name in self.pretrained_config:
+                head_size = getattr(self.pretrained_config, head_dim_name)
+                break
         else:
+            logger.warning(
+                f"head_size/head_dim is not set, using default value {hidden_size // num_heads}"
+            )
             head_size = hidden_size // num_heads
 
         model_config_cpp.mlp_hidden_size = mlp_hidden_size
diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py
index ffa8ce4bdae..c5a9f264b01 100644
--- a/tensorrt_llm/_torch/pyexecutor/resource_manager.py
+++ b/tensorrt_llm/_torch/pyexecutor/resource_manager.py
@@ -193,10 +193,10 @@ def __init__(
                              else 0)
 
         # Determine if this is VSWA (Variable Sliding Window Attention)
-        is_vswa = len(self.max_attention_window_vec) > 1
+        self.is_vswa = len(self.max_attention_window_vec) > 1
 
         # Calculate blocks per window using appropriate method
-        if is_vswa:
+        if self.is_vswa:
             # VSWA case: use C++ implementation for variable window sizes
             # model config check
             if model_config is None:
@@ -523,14 +523,29 @@ def get_batch_cache_indices(
         return result
 
     def get_num_free_blocks(self) -> int:
-        return self.impl.get_kv_cache_stats().free_num_blocks
+        if self.is_vswa:
+            logger.info(
+                f"For VSWA case, we return the minimum of the number of free blocks for each window size: {self.impl.get_kv_cache_stats().num_free_blocks_per_window_size}"
+            )
+            return min(self.impl.get_kv_cache_stats().
+                       num_free_blocks_per_window_size.values())
+        else:
+            return self.impl.get_kv_cache_stats().free_num_blocks
 
     def get_num_kv_blocks(self, num_tokens: int) -> int:
         return (num_tokens + self.tokens_per_block - 1) // self.tokens_per_block
 
     def get_num_available_tokens(self, max_num_draft_tokens: int = 0) -> int:
-        return (self.get_num_free_blocks() * self.tokens_per_block -
-                self.num_extra_kv_tokens - max_num_draft_tokens)
+        if self.max_attention_window_vec and len(
+                self.max_attention_window_vec) > 1:
+            # VSWA case, the available tokens should the the minimum of the available tokens for each window size
+            min_free_blocks = min(self.impl.get_kv_cache_stats().
+                                  num_free_blocks_per_window_size.values())
+            res = min_free_blocks * self.tokens_per_block - self.num_extra_kv_tokens - max_num_draft_tokens
+        else:
+            res = (self.get_num_free_blocks() * self.tokens_per_block -
+                   self.num_extra_kv_tokens - max_num_draft_tokens)
+        return res
 
     def get_buffers(self, layer_idx: int) -> Optional[torch.Tensor]:
         layer_offset = self.layer_offsets[layer_idx]
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 701461b19d1..5088f901c50 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -70,7 +70,6 @@ def test_nvfp4_streaming(self, stream_interval):
             task.evaluate(llm, streaming=True)
 
 
-@skip_post_blackwell  # TODO: remove this skip after this nvbug is fixed: https://nvbugspro.nvidia.com/bug/5295470
 class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
     MODEL_PATH = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct"
@@ -537,6 +536,15 @@ def test_auto_dtype_vswa(self):
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
+    @pytest.mark.skip(
+        reason=
+        "remove this skip after the kernel support mentioned in this nvbug is fixed: https://nvbugspro.nvidia.com/bug/5338620"
+    )
+    def test_auto_dtype_chunked_prefill(self):
+        # NOTE: Test with VSWA kv cache config.
+        self.kv_cache_config.max_attention_window = [
+            512, 512, 512, 512, 512, 32768
+        ]  # Gemma3 1B attention window size pattern
         # chunked prefill case or more features
         extra_llm_config = dict(
             enable_chunked_prefill=True,
diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml
index 3e9f0d3995b..ca678f13ef5 100644
--- a/tests/integration/test_lists/test-db/l0_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_h100.yml
@@ -24,6 +24,8 @@ l0_h100:
   - unittest/disaggregated/test_router.py
   - unittest/disaggregated/test_remoteDictionary.py
   - accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
+  - accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa
+  - accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_chunked_prefill
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16[attn_backend=TRTLLM-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16[attn_backend=TRTLLM-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=TRTLLM] TIMEOUT (90)

From 2d2b8bae32b1d65f44873d762eb44fbc38e1336d Mon Sep 17 00:00:00 2001
From: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com>
Date: Thu, 17 Jul 2025 06:30:58 +0800
Subject: [PATCH 47/88] feat: TRTLLM-5574 Add phi-4-multimodal pytorch-backend
 support (#5644)

Signed-off-by: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com>
---
 examples/llm-api/quickstart_advanced.py       |   6 +-
 examples/llm-api/quickstart_multimodal.py     | 108 ++++--
 requirements.txt                              |   1 +
 .../_torch/attention_backend/interface.py     |  21 +-
 tensorrt_llm/_torch/models/__init__.py        |   4 +
 .../models/modeling_multimodal_utils.py       |   1 +
 tensorrt_llm/_torch/models/modeling_phi3.py   | 249 +++++++++++++
 tensorrt_llm/_torch/models/modeling_phi4mm.py | 286 ++++++++++++++
 tensorrt_llm/functional.py                    |  42 +++
 tensorrt_llm/inputs/__init__.py               |   6 +-
 tensorrt_llm/inputs/utils.py                  |  82 +++-
 tensorrt_llm/serve/chat_utils.py              |  29 +-
 .../defs/accuracy/references/gsm8k.yaml       |   2 +
 .../defs/accuracy/references/mmlu.yaml        |   2 +
 .../defs/accuracy/test_llm_api_pytorch.py     |  13 +
 tests/integration/defs/test_e2e.py            | 121 +++++-
 .../test_lists/qa/examples_test_list.txt      |   4 +
 .../integration/test_lists/test-db/l0_a30.yml |   1 +
 .../test_lists/test-db/l0_l40s.yml            |   3 +
 .../_torch/modeling/test_modeling_phi3.py     | 352 ++++++++++++++++++
 20 files changed, 1277 insertions(+), 56 deletions(-)
 create mode 100644 tensorrt_llm/_torch/models/modeling_phi3.py
 create mode 100644 tensorrt_llm/_torch/models/modeling_phi4mm.py
 create mode 100644 tests/unittest/_torch/modeling/test_modeling_phi3.py

diff --git a/examples/llm-api/quickstart_advanced.py b/examples/llm-api/quickstart_advanced.py
index 90d527562a1..1bd6e0793e2 100644
--- a/examples/llm-api/quickstart_advanced.py
+++ b/examples/llm-api/quickstart_advanced.py
@@ -145,7 +145,7 @@ def parse_arguments():
     return args
 
 
-def setup_llm(args):
+def setup_llm(args, **kwargs):
     kv_cache_config = KvCacheConfig(
         enable_block_reuse=not args.disable_kv_cache_reuse,
         free_gpu_memory_fraction=args.kv_cache_fraction,
@@ -222,7 +222,9 @@ def setup_llm(args):
         speculative_config=spec_config,
         trust_remote_code=args.trust_remote_code,
         gather_generation_logits=args.return_generation_logits,
-        max_beam_width=args.max_beam_width)
+        max_beam_width=args.max_beam_width,
+        **kwargs,
+    )
 
     sampling_params = SamplingParams(
         max_tokens=args.max_tokens,
diff --git a/examples/llm-api/quickstart_multimodal.py b/examples/llm-api/quickstart_multimodal.py
index 3859580fcde..967a8636e1b 100644
--- a/examples/llm-api/quickstart_multimodal.py
+++ b/examples/llm-api/quickstart_multimodal.py
@@ -7,24 +7,56 @@
 from tensorrt_llm.inputs import (ALL_SUPPORTED_MULTIMODAL_MODELS,
                                  default_multimodal_input_loader)
 
-example_images = [
-    "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/seashore.png",
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png",
-    "https://huggingface.co/datasets/Sayali9141/traffic_signal_images/resolve/main/61.jpg",
-]
-example_image_prompts = [
-    "Describe the natural environment in the image.",
-    "Describe the object and the weather condition in the image.",
-    "Describe the traffic condition on the road in the image.",
-]
-example_videos = [
-    "https://huggingface.co/datasets/Efficient-Large-Model/VILA-inference-demos/resolve/main/OAI-sora-tokyo-walk.mp4",
-    "https://huggingface.co/datasets/Efficient-Large-Model/VILA-inference-demos/resolve/main/world.mp4",
-]
-example_video_prompts = [
-    "Tell me what you see in the video briefly.",
-    "Describe the scene in the video briefly.",
-]
+example_medias_and_prompts = {
+    "image": {
+        "media": [
+            "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/seashore.png",
+            "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png",
+            "https://huggingface.co/datasets/Sayali9141/traffic_signal_images/resolve/main/61.jpg",
+        ],
+        "prompt": [
+            "Describe the natural environment in the image.",
+            "Describe the object and the weather condition in the image.",
+            "Describe the traffic condition on the road in the image.",
+        ]
+    },
+    "video": {
+        "media": [
+            "https://huggingface.co/datasets/Efficient-Large-Model/VILA-inference-demos/resolve/main/OAI-sora-tokyo-walk.mp4",
+            "https://huggingface.co/datasets/Efficient-Large-Model/VILA-inference-demos/resolve/main/world.mp4",
+        ],
+        "prompt": [
+            "Tell me what you see in the video briefly.",
+            "Describe the scene in the video briefly.",
+        ]
+    },
+    "audio": {
+        "media": [
+            "https://huggingface.co/microsoft/Phi-4-multimodal-instruct/resolve/main/examples/what_is_the_traffic_sign_in_the_image.wav",
+            "https://huggingface.co/microsoft/Phi-4-multimodal-instruct/resolve/main/examples/what_is_shown_in_this_image.wav",
+        ],
+        "prompt": [
+            "Transcribe the audio clip into text, please don't add other text.",
+            "Transcribe the audio clip into text, please don't add other text.",
+        ]
+    },
+    "image_audio": {
+        "media": [
+            [
+                "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png",
+                "https://huggingface.co/microsoft/Phi-4-multimodal-instruct/resolve/main/examples/what_is_shown_in_this_image.wav"
+            ],
+            [
+                "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png",
+                "https://huggingface.co/microsoft/Phi-4-multimodal-instruct/resolve/main/examples/what_is_shown_in_this_image.wav"
+            ],
+        ],
+        "prompt": [
+            "Describe the scene in the image briefly.",
+            "",
+        ]
+    }
+}
 
 
 def add_multimodal_args(parser):
@@ -34,7 +66,7 @@ def add_multimodal_args(parser):
                         help="Model type.")
     parser.add_argument("--modality",
                         type=str,
-                        choices=["image", "video"],
+                        choices=["image", "video", "audio", "image_audio"],
                         default="image",
                         help="Media type.")
     parser.add_argument("--media",
@@ -53,11 +85,24 @@ def add_multimodal_args(parser):
     return parser
 
 
+def add_lora_args(parser):
+    parser.add_argument("--load_lora",
+                        default=False,
+                        action='store_true',
+                        help="Whether to load the LoRA model.")
+    parser.add_argument("--auto_model_name",
+                        type=str,
+                        default=None,
+                        help="The auto model name in TRTLLM repo.")
+    return parser
+
+
 def parse_arguments():
     parser = argparse.ArgumentParser(
         description="Multimodal models with the PyTorch workflow.")
     parser = add_llm_args(parser)
     parser = add_multimodal_args(parser)
+    parser = add_lora_args(parser)
     args = parser.parse_args()
 
     args.disable_kv_cache_reuse = True  # kv cache reuse does not work for multimodal, force overwrite
@@ -71,11 +116,19 @@ def main():
     args = parse_arguments()
     # set prompts and media to example prompts and images if they are not provided
     if args.prompt is None:
-        args.prompt = example_image_prompts if args.modality == "image" else example_video_prompts
+        args.prompt = example_medias_and_prompts[args.modality]["prompt"]
     if args.media is None:
-        args.media = example_images if args.modality == "image" else example_videos
+        args.media = example_medias_and_prompts[args.modality]["media"]
+
+    lora_config = None
+    if args.load_lora:
+        assert args.auto_model_name is not None, "Please provide the auto model name to load LoRA config."
+        import importlib
+        models_module = importlib.import_module('tensorrt_llm._torch.models')
+        model_class = getattr(models_module, args.auto_model_name)
+        lora_config = model_class.lora_config(args.model_dir)
 
-    llm, sampling_params = setup_llm(args)
+    llm, sampling_params = setup_llm(args, lora_config=lora_config)
 
     image_format = args.image_format
     if args.model_type is not None:
@@ -96,7 +149,16 @@ def main():
                                              num_frames=args.num_frames,
                                              device=device)
 
-    outputs = llm.generate(inputs, sampling_params)
+    lora_request = None
+    if args.load_lora:
+        lora_request = model_class.lora_request(len(inputs), args.modality,
+                                                llm._hf_model_dir)
+
+    outputs = llm.generate(
+        inputs,
+        sampling_params,
+        lora_request=lora_request,
+    )
 
     for i, output in enumerate(outputs):
         prompt = args.prompt[i]
diff --git a/requirements.txt b/requirements.txt
index a10401ecafd..c0e94b2a3d0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -59,3 +59,4 @@ ninja
 etcd3
 blake3
 llguidance==0.7.29
+soundfile
diff --git a/tensorrt_llm/_torch/attention_backend/interface.py b/tensorrt_llm/_torch/attention_backend/interface.py
index 3a3410e8a9c..d505626ca99 100644
--- a/tensorrt_llm/_torch/attention_backend/interface.py
+++ b/tensorrt_llm/_torch/attention_backend/interface.py
@@ -351,6 +351,8 @@ class RopeParams:
     beta_slow: int = 1
     mscale: float = 1.0
     mscale_all_dim: float = 0.0
+    short_factor: Optional[Tuple[float]] = None
+    long_factor: Optional[Tuple[float]] = None
 
     @staticmethod
     def from_config(config) -> "RopeParams":
@@ -386,12 +388,18 @@ def from_config(config) -> "RopeParams":
                 "low_freq_factor", 1.0)
             rope_params.high_freq_factor = rope_scaling.get(
                 "high_freq_factor", 4.0)
-            rope_params.original_max_positions = rope_scaling.get(
-                "original_max_position_embeddings", 1024)
+            rope_params.original_max_positions = getattr(
+                config,
+                "original_max_position_embeddings", None) or rope_scaling.get(
+                    "original_max_position_embeddings", None) or 1024
             rope_params.beta_fast = rope_scaling.get("beta_fast", 32)
             rope_params.beta_slow = rope_scaling.get("beta_slow", 1)
             rope_params.mscale = rope_scaling.get("mscale", 1.0)
             rope_params.mscale_all_dim = rope_scaling.get("mscale_all_dim", 0.0)
+            if "short_factor" in rope_scaling:
+                rope_params.short_factor = tuple(rope_scaling["short_factor"])
+            if "long_factor" in rope_scaling:
+                rope_params.long_factor = tuple(rope_scaling["long_factor"])
         # Workaround for DeepSeek V3 Lite since its rope_scaling is null in config.json.
         elif config.model_type == "deepseek_v3":
             rope_params.scale_type = RotaryScalingType.yarn
@@ -428,7 +436,14 @@ def create_rope_const_params(self, interleave: bool = True):
                 self.mscale_all_dim,
             )
         elif self.scale_type == RotaryScalingType.longrope:
-            raise NotImplementedError("Long RoPE is not supported.")
+            rope_inv_freq, rope_cos_sin = RopeEmbeddingUtils.create_sinusoidal_positions_long_rope_for_attention_plugin(
+                num_pos=self.max_positions,
+                dim=self.dim,
+                theta=self.theta,
+                original_max_pos=self.original_max_positions,
+                short_factor=self.short_factor,
+                long_factor=self.long_factor,
+            )
         else:
             rope_inv_freq, rope_cos_sin = RopeEmbeddingUtils.create_sinusoidal_positions_for_attention_plugin(
                 self.max_positions,
diff --git a/tensorrt_llm/_torch/models/__init__.py b/tensorrt_llm/_torch/models/__init__.py
index cc75dffb5c0..c5acbef804a 100644
--- a/tensorrt_llm/_torch/models/__init__.py
+++ b/tensorrt_llm/_torch/models/__init__.py
@@ -15,6 +15,8 @@
 from .modeling_nemotron import NemotronForCausalLM
 from .modeling_nemotron_h import NemotronHForCausalLM
 from .modeling_nemotron_nas import NemotronNASForCausalLM
+from .modeling_phi3 import Phi3ForCausalLM
+from .modeling_phi4mm import Phi4MMForCausalLM
 from .modeling_qwen import (Qwen2ForCausalLM, Qwen2ForProcessRewardModel,
                             Qwen2ForRewardModel)
 from .modeling_qwen2vl import Qwen2_5_VLModel, Qwen2VLModel
@@ -42,6 +44,8 @@
     "NemotronForCausalLM",
     "NemotronHForCausalLM",
     "NemotronNASForCausalLM",
+    "Phi3ForCausalLM",
+    "Phi4MMForCausalLM",
     "Qwen2ForCausalLM",
     "Qwen2ForProcessRewardModel",
     "Qwen2ForRewardModel",
diff --git a/tensorrt_llm/_torch/models/modeling_multimodal_utils.py b/tensorrt_llm/_torch/models/modeling_multimodal_utils.py
index b23f5e733d2..1dc86cdd1d2 100644
--- a/tensorrt_llm/_torch/models/modeling_multimodal_utils.py
+++ b/tensorrt_llm/_torch/models/modeling_multimodal_utils.py
@@ -64,6 +64,7 @@ def fuse_input_embeds(
         mm_token_mask = input_ids >= vocab_size
         text_token_mask = input_ids < vocab_size
     else:
+        mm_token_ids = mm_token_ids.to(input_ids.device)
         mm_token_mask = torch.isin(input_ids, mm_token_ids)
         text_token_mask = ~mm_token_mask
     text_token_indices = torch.where(text_token_mask)[0]
diff --git a/tensorrt_llm/_torch/models/modeling_phi3.py b/tensorrt_llm/_torch/models/modeling_phi3.py
new file mode 100644
index 00000000000..5e4221dd713
--- /dev/null
+++ b/tensorrt_llm/_torch/models/modeling_phi3.py
@@ -0,0 +1,249 @@
+from typing import Optional, Tuple
+
+import torch
+from torch import nn
+from tqdm import tqdm
+from transformers import Phi3Config
+
+from tensorrt_llm._torch.attention_backend import AttentionMetadata
+from tensorrt_llm._torch.attention_backend.interface import (
+    PositionalEmbeddingParams, RopeParams)
+from tensorrt_llm._torch.model_config import ModelConfig
+from tensorrt_llm._torch.models.modeling_utils import (DecoderModel,
+                                                       DecoderModelForCausalLM,
+                                                       register_auto_model)
+from tensorrt_llm._torch.modules.attention import Attention
+from tensorrt_llm._torch.modules.decoder_layer import DecoderLayer
+from tensorrt_llm._torch.modules.embedding import Embedding
+from tensorrt_llm._torch.modules.gated_mlp import GatedMLP
+from tensorrt_llm._torch.modules.linear import TensorParallelMode
+from tensorrt_llm._torch.modules.rms_norm import RMSNorm
+from tensorrt_llm.functional import PositionEmbeddingType
+
+
+class Phi3Attention(Attention):
+
+    def __init__(
+        self,
+        model_config: ModelConfig[Phi3Config],
+        layer_idx: Optional[int] = None,
+    ):
+        config = model_config.pretrained_config
+
+        rope_params = RopeParams.from_config(config)
+        super().__init__(
+            hidden_size=config.hidden_size,
+            num_attention_heads=config.num_attention_heads,
+            num_key_value_heads=config.num_key_value_heads,
+            max_position_embeddings=config.max_position_embeddings,
+            bias=config.attention_bias,
+            pos_embd_params=PositionalEmbeddingParams(
+                type=PositionEmbeddingType.rope_gpt_neox,
+                rope=rope_params,
+            ),
+            layer_idx=layer_idx,
+            dtype=config.torch_dtype,
+            config=model_config,
+        )
+
+
+class Phi3DecoderLayer(DecoderLayer):
+
+    def __init__(
+        self,
+        model_config: ModelConfig[Phi3Config],
+        layer_idx: int,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        super().__init__()
+        config = model_config.pretrained_config
+        self.layer_idx = layer_idx
+
+        self.self_attn = Phi3Attention(model_config, layer_idx=layer_idx)
+
+        self.mlp = GatedMLP(
+            hidden_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+            bias=False,
+            dtype=config.torch_dtype,
+            config=model_config,
+        )
+
+        self.input_layernorm = RMSNorm(
+            hidden_size=config.hidden_size,
+            eps=config.rms_norm_eps,
+            dtype=config.torch_dtype,
+        )
+        self.post_attention_layernorm = RMSNorm(
+            hidden_size=config.hidden_size,
+            eps=config.rms_norm_eps,
+            dtype=config.torch_dtype,
+        )
+
+    def forward(
+        self,
+        position_ids: torch.LongTensor,
+        hidden_states: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+        residual: Optional[torch.Tensor],
+        lora_params=None,
+        **kwargs,
+    ) -> torch.Tensor:
+        if residual is None:
+            residual = hidden_states
+            hidden_states = self.input_layernorm(hidden_states)
+        else:
+            hidden_states, residual = self.input_layernorm(
+                hidden_states, residual)
+
+        # Self Attention
+        hidden_states = self.self_attn(
+            position_ids=None,
+            hidden_states=hidden_states,
+            attn_metadata=attn_metadata,
+            lora_params=lora_params,
+            **kwargs,
+        )
+
+        # Fully connected
+        hidden_states, residual = self.post_attention_layernorm(
+            hidden_states, residual)
+        hidden_states = self.mlp(hidden_states, **kwargs)
+        return hidden_states, residual
+
+
+class Phi3Model(DecoderModel):
+
+    def __init__(self, model_config: ModelConfig[Phi3Config]):
+        super().__init__(model_config)
+        config = self.model_config.pretrained_config
+        self.padding_idx = config.pad_token_id
+
+        self.embed_tokens = Embedding(
+            config.vocab_size,
+            config.hidden_size,
+            dtype=config.torch_dtype,
+            mapping=model_config.mapping,
+            tensor_parallel_mode=TensorParallelMode.COLUMN,
+            gather_output=True,
+        )
+        self.layers = nn.ModuleList([
+            Phi3DecoderLayer(
+                model_config,
+                layer_idx,
+            ) for layer_idx in range(config.num_hidden_layers)
+        ])
+        self.norm = RMSNorm(
+            hidden_size=config.hidden_size,
+            eps=config.rms_norm_eps,
+            dtype=config.torch_dtype,
+        )
+
+    def forward(
+        self,
+        attn_metadata: AttentionMetadata,
+        input_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        lora_params=None,
+        **kwargs,
+    ) -> torch.Tensor:
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError(
+                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
+            )
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        hidden_states = inputs_embeds
+
+        residual = None
+        for decoder_layer in self.layers:
+            hidden_states, residual = decoder_layer(
+                hidden_states=hidden_states,
+                position_ids=position_ids,
+                residual=residual,
+                attn_metadata=attn_metadata,
+                lora_params=lora_params,
+            )
+
+        hidden_states, _ = self.norm(hidden_states, residual)
+        return hidden_states
+
+
+@register_auto_model("Phi3ForCausalLM")
+class Phi3ForCausalLM(DecoderModelForCausalLM[Phi3Model, Phi3Config]):
+
+    def __init__(
+        self,
+        model_config: ModelConfig[Phi3Config],
+    ):
+        super().__init__(Phi3Model(model_config),
+                         config=model_config,
+                         hidden_size=model_config.pretrained_config.hidden_size,
+                         vocab_size=model_config.pretrained_config.vocab_size)
+
+    def load_weights(self, weights: dict):
+        self.model_config.mapping.tp_size
+        hidden_size = self.config.hidden_size
+        num_heads = self.config.num_attention_heads
+        num_kv_heads = self.config.num_key_value_heads
+        head_dim = hidden_size // num_heads
+
+        def filter_weights(prefix: str, weights: dict):
+            result = {}
+            for k, v in weights.items():
+                if k.startswith(prefix):
+                    new_k = k[len(prefix) + 1:]
+                    result[new_k] = v
+            return result
+
+        for name, module in tqdm(list(self.named_modules()),
+                                 desc="Loading weights"):
+            if len(module._parameters) > 0:
+                # skip load weights if tie word embeddings is enabled and layer is lm_head
+                if self.config.tie_word_embeddings and name.startswith(
+                        'lm_head'):
+                    continue
+
+                module_weights = filter_weights(name, weights)
+                if hasattr(module, 'load_weights'):
+                    if "self_attn.qkv_proj" in name:
+                        # The weights need to be split correctly before sharding to support tp_size >1.
+                        qkv_weight = module_weights['weight'][:]
+                        q_weight = qkv_weight[:hidden_size, :]
+                        k_weight = qkv_weight[hidden_size:hidden_size +
+                                              num_kv_heads * head_dim, :]
+                        v_weight = qkv_weight[hidden_size +
+                                              num_kv_heads * head_dim:, :]
+                        module.load_weights(weights=[
+                            {
+                                'weight': q_weight
+                            },
+                            {
+                                'weight': k_weight
+                            },
+                            {
+                                'weight': v_weight
+                            },
+                        ])
+                    elif "mlp.gate_up_proj" in name:
+                        # The weights need to be split correctly before sharding to support tp_size >1.
+                        intermediate_size = self.config.intermediate_size
+                        gate_up_weight = module_weights['weight'][:]
+                        gate_weight = gate_up_weight[:intermediate_size, :]
+                        up_weight = gate_up_weight[intermediate_size:, :]
+                        module.load_weights(weights=[
+                            {
+                                'weight': gate_weight
+                            },
+                            {
+                                'weight': up_weight
+                            },
+                        ])
+                    else:
+                        module.load_weights(weights=[module_weights])
+                else:
+                    for n, p in module._parameters.items():
+                        if p is not None:
+                            p.data.copy_(module_weights[n][:])
diff --git a/tensorrt_llm/_torch/models/modeling_phi4mm.py b/tensorrt_llm/_torch/models/modeling_phi4mm.py
new file mode 100644
index 00000000000..8c8982f6e0b
--- /dev/null
+++ b/tensorrt_llm/_torch/models/modeling_phi4mm.py
@@ -0,0 +1,286 @@
+# Plan for phi4-mm model support.
+# (done) step 1: support legacy inference pipeline for phi4-mm model.
+# (todo) step 2: refactor the inference pipeline to use AGGREGATE mode (https://github.com/NVIDIA/TensorRT-LLM/pull/5522).
+
+import copy
+from typing import List, Optional, Tuple
+
+import torch
+import transformers
+from PIL import Image
+
+from ...executor.request import LoRARequest
+from ...inputs import (ExtraProcessedInputs, InputProcessor, TextPrompt,
+                       register_input_processor)
+from ...logger import logger
+from ...lora_manager import LoraConfig
+from ...sampling_params import SamplingParams
+from ..attention_backend import AttentionMetadata
+from ..model_config import ModelConfig
+from .modeling_auto import AutoModelForCausalLM
+from .modeling_multimodal_utils import fuse_input_embeds
+from .modeling_utils import register_auto_model
+
+# Special tokens
+_IMAGE_SPECIAL_TOKEN_ID = 200010  # '<|endoftext10|>'
+_AUDIO_SPECIAL_TOKEN_ID = 200011  # '<|endoftext11|>'
+
+
+# Create a PreTrainedModel class for transformers=4.53.1 upgrade.
+# Core idea is to provide `prepare_inputs_for_generation` method from `GenerationMixin`.
+class NewPreTrainedModel(transformers.modeling_utils.PreTrainedModel,
+                         transformers.generation.GenerationMixin):
+    pass
+
+
+class Phi4MMInputProcessor(InputProcessor):
+
+    def __init__(self,
+                 model_path: str,
+                 model_config: transformers.PretrainedConfig,
+                 tokenizer: transformers.AutoTokenizer,
+                 trust_remote_code: bool = True):
+        assert trust_remote_code, "trust_remote_code must be True for Phi4MM"
+
+        self.model_config = model_config
+        self.device = 'cuda'
+
+        self.tokenizer = tokenizer
+        self.use_fast = True
+        if self.tokenizer is None:
+            self.tokenizer = transformers.AutoTokenizer.from_pretrained(
+                model_path,
+                trust_remote_code=trust_remote_code,
+                use_fast=self.use_fast)
+
+        self.processor = transformers.AutoProcessor.from_pretrained(
+            model_path,
+            trust_remote_code=trust_remote_code,
+            use_fast=self.use_fast)
+
+        # Build pure-pytorch model architecture for multimodal encoder.
+        # Model weights are also loaded here.
+        OldPreTrainedModel = transformers.modeling_utils.PreTrainedModel
+        transformers.modeling_utils.PreTrainedModel = NewPreTrainedModel
+        # TODO: Make separate Phi4VisionEncoder and Phi4AudioEncoder, and move them to LLM-side.
+        ref_phi4mm_model = transformers.AutoModelForCausalLM.from_pretrained(
+            model_path,
+            trust_remote_code=True,
+            # Flash_attn_2 only supports bf16 or fp16 and set in HF config.
+            torch_dtype='auto',
+            _attn_implementation='flash_attention_2',
+        ).eval()
+        transformers.modeling_utils.PreTrainedModel = OldPreTrainedModel
+        self.phi4mm_modal_encoder = ref_phi4mm_model.model.embed_tokens_extend.to(
+            self.device)
+        # Required by Phi4MMImageAudioEmbedding.
+        # See link: https://huggingface.co/microsoft/Phi-4-multimodal-instruct/blob/main/modeling_phi4mm.py#L701
+        self.phi4mm_wte = ref_phi4mm_model.model.embed_tokens.to(self.device)
+
+    @torch.inference_mode()
+    def __call__(
+        self, inputs: TextPrompt, sampling_params: SamplingParams
+    ) -> Tuple[List[int], Optional[ExtraProcessedInputs]]:
+        text_prompt, mm_data, mm_processor_kwargs = inputs.get("prompt"), \
+                        inputs.get("multi_modal_data", {}), inputs.get("mm_processor_kwargs", {})
+        images = mm_data.get("image", None)
+        audios = mm_data.get("audio", None)
+
+        if images is not None:
+            if isinstance(images[0], torch.Tensor):
+                # Convert normalized tensors (0-1) to PIL images (0-255).
+                images = [
+                    Image.fromarray((image.permute(1, 2, 0) * 255).to(
+                        torch.uint8).cpu().numpy()) for image in images
+                ]
+
+        # Preprocessing for multimodal data.
+        inputs = self.processor(text=[text_prompt],
+                                images=images,
+                                audios=audios,
+                                return_tensors='pt').to(self.device)
+
+        # Set audio_projection_mode according to the modality.
+        # Ref: https://huggingface.co/microsoft/Phi-4-multimodal-instruct/blob/main/modeling_phi4mm.py#L2103
+        if images is not None:
+            audio_projection_mode = 'vision'
+        elif audios is not None:
+            audio_projection_mode = 'speech'
+        else:
+            audio_projection_mode = 'speech'
+
+        # Processing with Phi4MMImageAudioEmbedding.
+        mm_features = self.phi4mm_modal_encoder(
+            input_ids=inputs['input_ids'],
+            input_embeds=None,
+            input_image_embeds=inputs['input_image_embeds'],
+            input_audio_embeds=inputs['input_audio_embeds'],
+            image_sizes=inputs['image_sizes'],
+            image_attention_mask=inputs['image_attention_mask'],
+            audio_embed_sizes=inputs['audio_embed_sizes'],
+            audio_attention_mask=inputs['audio_attention_mask'],
+            audio_projection_mode=audio_projection_mode,
+            wte=self.phi4mm_wte,
+        )
+
+        # Postprocessing to get multimodal-only embeddings.
+        image_token_mask = inputs['input_ids'] == _IMAGE_SPECIAL_TOKEN_ID
+        audio_token_mask = inputs['input_ids'] == _AUDIO_SPECIAL_TOKEN_ID
+        mm_token_mask = image_token_mask | audio_token_mask
+        mm_features = mm_features[mm_token_mask]
+
+        multimodal_data = {}
+        multimodal_data["multimodal_embedding"] = mm_features
+
+        return inputs['input_ids'][0].to(torch.int32).tolist(), {
+            "multimodal_data": multimodal_data,
+        }
+
+
+@register_auto_model("Phi4MMForCausalLM")
+@register_input_processor(Phi4MMInputProcessor, model_type="phi4mm")
+class Phi4MMForCausalLM(transformers.PreTrainedModel):
+
+    _supports_flash_attn_2 = True
+    MM_TOKEN_IDS = torch.tensor(
+        [_IMAGE_SPECIAL_TOKEN_ID, _AUDIO_SPECIAL_TOKEN_ID])
+
+    def __init__(self, model_config: ModelConfig):
+
+        config = model_config.pretrained_config
+        super().__init__(config)
+
+        self.model_config = model_config
+        if hasattr(self, "llm"):
+            return
+
+        # We use Phi3ForCausalLM as the language model.
+        llm_model_config = copy.deepcopy(model_config)
+        llm_model_config.pretrained_config.architectures = ["Phi3ForCausalLM"]
+        # Only build the language model architecture without loading weights.
+        self.llm = AutoModelForCausalLM.from_config(llm_model_config)
+
+        self.vocab_size = config.vocab_size
+        self.model_dtype = getattr(config, "torch_dtype", torch.float16)
+        logger.info(f"{self.dtype=} {self.model_dtype=}")
+        self.post_config()
+        self.is_loaded = True
+
+    def load_weights(self, weights):
+        # Filter out non-language model weights.
+        weights = {
+            k: v
+            for k, v in weights.items()
+            if not k.startswith('model.embed_tokens_extend')
+        }
+        # Filter out LoRA weights.
+        # LoRA weights will be loaded by LoraManager.
+        weights = {k: v for k, v in weights.items() if '.lora_' not in k}
+        # Rename base layer weights.
+        updated_weights = {}
+        for k in weights.keys():
+            if 'base_layer.weight' in k:
+                new_k = k.replace('base_layer.weight', 'weight')
+                updated_weights[new_k] = weights[k]
+            else:
+                updated_weights[k] = weights[k]
+        weights = updated_weights
+
+        self.llm.load_weights(weights)
+
+    def infer_max_seq_len(self) -> int:
+        return self.llm.infer_max_seq_len()
+
+    def post_config(self):
+        # use llm.config as config for pytorch model engine
+        self.config = self.llm.config
+        self.model_config.pretrained_config = self.llm.config
+
+    @torch.inference_mode()
+    def forward(
+        self,
+        attn_metadata: AttentionMetadata,
+        input_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        input_embeds: Optional[torch.Tensor] = None,
+        return_context_logits: bool = False,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        VLM forward logic with inflight batching support.
+        """
+        num_context_requests, num_generation_requests = attn_metadata.num_contexts, attn_metadata.num_generations
+        logger.debug(
+            f"num_context_requests: {num_context_requests}, num_generation_requests: {num_generation_requests}"
+        )
+
+        multimodal_params = kwargs.get("multimodal_params", [])
+        mm_embedding = [
+            multimodal_param.multimodal_data["multimodal_embedding"]
+            for multimodal_param in multimodal_params
+        ]
+        input_ids, input_embeds = fuse_input_embeds(
+            self.llm.model.embed_tokens,
+            input_ids,
+            mm_embedding,
+            mm_token_ids=self.MM_TOKEN_IDS,
+        )
+
+        output_prob = self.llm.forward(
+            attn_metadata=attn_metadata,
+            input_ids=input_ids,
+            position_ids=position_ids,
+            inputs_embeds=input_embeds,
+            return_context_logits=return_context_logits,
+            lora_params=kwargs.get("lora_params", None),
+        )
+
+        logger.debug(f'output shape: {output_prob.shape}')
+        return output_prob
+
+    @staticmethod
+    def lora_config(model_dir: str):
+        _lora_config = LoraConfig(
+            lora_dir=[
+                f"{model_dir}/vision-lora",
+                f"{model_dir}/speech-lora",
+            ],
+            lora_target_modules=[
+                "attn_qkv",
+                "attn_dense",
+                "mlp_h_to_4h",
+                "mlp_4h_to_h",
+            ],
+            trtllm_modules_to_hf_modules={
+                "attn_qkv": "qkv_proj",
+                "attn_dense": "o_proj",
+                "mlp_h_to_4h": "gate_up_proj",
+                "mlp_4h_to_h": "down_proj",
+            },
+            max_lora_rank=320,  # Max rank for Phi4MM.
+        )
+        return _lora_config
+
+    @staticmethod
+    def lora_request(num_requests: int, modality: str, base_model_dir: str):
+        # Prepare LoRA requests for different modalities.
+        # Ref: https://huggingface.co/microsoft/Phi-4-multimodal-instruct/blob/main/modeling_phi4mm.py#L2103
+        lora_request = None
+        if modality == "image" or modality == "image_audio":
+            lora_request = [
+                LoRARequest(
+                    lora_name=f"vision-lora-{i}",
+                    lora_int_id=i,
+                    lora_path=f"{base_model_dir}/vision-lora",
+                ) for i in range(num_requests)
+            ]
+        elif modality == "audio":
+            lora_request = [
+                LoRARequest(
+                    lora_name=f"speech-lora-{i}",
+                    lora_int_id=i,
+                    lora_path=f"{base_model_dir}/speech-lora",
+                ) for i in range(num_requests)
+            ]
+
+        return lora_request
diff --git a/tensorrt_llm/functional.py b/tensorrt_llm/functional.py
index 52c96d40f59..0532b995c59 100755
--- a/tensorrt_llm/functional.py
+++ b/tensorrt_llm/functional.py
@@ -4834,6 +4834,48 @@ def _compute_sinusoidal_positions(scale_factors, is_short,
                     True), _compute_sinusoidal_positions(
                         scaling_long_factors, False, True), short_mscale
 
+    @staticmethod
+    def create_sinusoidal_positions_long_rope_for_attention_plugin(
+            num_pos: int,
+            dim: int,
+            theta: float,
+            original_max_pos: int,
+            short_factor: List[float],
+            long_factor: List[float],
+            dtype=np.float32):
+        short_factor = np.array(short_factor, dtype=np.float32)
+        long_factor = np.array(long_factor, dtype=np.float32)
+
+        inv_freq = 1.0 / (theta**(np.arange(0, dim, 2, dtype=np.float32) / dim))
+
+        # Short part
+        inv_freq_short = inv_freq / short_factor
+        t_short = np.arange(np.min([num_pos, original_max_pos]),
+                            dtype=np.float32)
+        freqs_short = np.einsum("i,j->ij", t_short, inv_freq_short)
+
+        # Long part
+        inv_freq_long = inv_freq / long_factor
+        t_long = np.arange(np.max([0, num_pos - original_max_pos]),
+                           dtype=np.float32) + original_max_pos
+        freqs_long = np.einsum("i,j->ij", t_long, inv_freq_long)
+
+        freqs = np.concatenate([freqs_short, freqs_long], axis=0)
+        sinusoid_inp = freqs.astype(np.float32)[..., np.newaxis]
+
+        # Apply scaling
+        scale = num_pos / original_max_pos
+        scaling_factor = np.sqrt(1.0 + np.log(scale) / np.log(original_max_pos))
+
+        # fuse cos/sin into float2 (cos, sin).
+        concat = np.concatenate(
+            (np.cos(sinusoid_inp) * scaling_factor,
+             np.sin(sinusoid_inp) * scaling_factor),
+            axis=-1,
+        )
+
+        return None, concat.reshape(1, -1).astype(dtype)
+
     @staticmethod
     def create_fake_weight(dim: int, dtype=np.half):
         return np.random.rand(dim).astype(dtype)
diff --git a/tensorrt_llm/inputs/__init__.py b/tensorrt_llm/inputs/__init__.py
index 34b719885d5..a20978cab43 100644
--- a/tensorrt_llm/inputs/__init__.py
+++ b/tensorrt_llm/inputs/__init__.py
@@ -5,8 +5,9 @@
                        register_input_processor)
 from .utils import (ALL_SUPPORTED_MULTIMODAL_MODELS, ConversationMessage,
                     MultimodalData, MultimodalDataTracker,
-                    add_multimodal_placeholders, async_load_image,
-                    async_load_video, default_multimodal_input_loader,
+                    add_multimodal_placeholders, async_load_audio,
+                    async_load_image, async_load_video,
+                    default_multimodal_input_loader,
                     encode_base64_content_from_url, load_image, load_video)
 
 __all__ = [
@@ -24,6 +25,7 @@
     "MultimodalDataTracker",
     "MultimodalData",
     "MultimodalInput",
+    "async_load_audio",
     "async_load_image",
     "async_load_video",
     "add_multimodal_placeholders",
diff --git a/tensorrt_llm/inputs/utils.py b/tensorrt_llm/inputs/utils.py
index e6a939864ed..a58e6e4b58a 100644
--- a/tensorrt_llm/inputs/utils.py
+++ b/tensorrt_llm/inputs/utils.py
@@ -5,12 +5,13 @@
 from collections import defaultdict
 from io import BytesIO
 from pathlib import Path
-from typing import Any, Coroutine, Dict, List, Optional, TypedDict, Union
+from typing import Any, Coroutine, Dict, List, Optional, Tuple, TypedDict, Union
 from urllib.parse import urlparse
 
 import aiohttp
 import numpy as np
 import requests
+import soundfile
 import torch
 from PIL import Image
 from torchvision.transforms import ToTensor
@@ -159,6 +160,35 @@ async def async_load_video(
     return load_video(video_path, num_frames, format, device)
 
 
+def load_audio(
+    audio: str,
+    format: str = "pt",
+    device: str = "cuda",
+) -> Tuple[np.ndarray, int]:
+    parsed_url = urlparse(audio)
+    if parsed_url.scheme in ["http", "https"]:
+        audio = requests.get(audio, stream=True, timeout=10)
+        audio = BytesIO(audio.content)
+
+    audio = soundfile.read(audio)
+    return audio
+
+
+async def async_load_audio(
+    audio: str,
+    format: str = "pt",
+    device: str = "cuda",
+) -> Tuple[np.ndarray, int]:
+    parsed_url = urlparse(audio)
+    if parsed_url.scheme in ["http", "https"]:
+        async with aiohttp.ClientSession() as session:
+            async with session.get(audio) as response:
+                audio = BytesIO(await response.content.read())
+
+    audio = soundfile.read(audio)
+    return audio
+
+
 # Copied from https://github.com/vllm-project/vllm/blob/main/examples/online_serving/openai_chat_completion_client_for_multimodal.py#L38
 def encode_base64_content_from_url(content_url: str) -> str:
     """Encode a content retrieved from a remote url to base64 format."""
@@ -186,19 +216,24 @@ def encode_base64_content_from_url(content_url: str) -> str:
 SUPPORTED_LLAVA_VIDEO_MODEL_GROUP = ["llava_llama"]
 SUPPORTED_MISTRAL_IMAGE_MODEL_GROUP = ["mistral3"]
 SUPPORTED_HYPERCLOVAX_MODEL_GROUP = ["hyperclovax_vlm"]
+SUPPORTED_PHI_MODEL_GROUP = ["phi4mm"]
 
 ALL_SUPPORTED_IMAGE_MODELS = SUPPORTED_QWEN_MODEL_GROUP \
     + SUPPORTED_LLAMA_MODEL_GROUP \
     + SUPPORTED_LLAVA_IMAGE_MODEL_GROUP \
     + SUPPORTED_HYPERCLOVAX_MODEL_GROUP \
     + SUPPORTED_GEMMA_MODEL_GROUP \
-    + SUPPORTED_MISTRAL_IMAGE_MODEL_GROUP
+    + SUPPORTED_MISTRAL_IMAGE_MODEL_GROUP \
+    + SUPPORTED_PHI_MODEL_GROUP
 
 ALL_SUPPORTED_VIDEO_MODELS = SUPPORTED_QWEN_MODEL_GROUP \
     + SUPPORTED_LLAVA_VIDEO_MODEL_GROUP
 
+ALL_SUPPORTED_AUDIO_MODELS = SUPPORTED_PHI_MODEL_GROUP
+
 ALL_SUPPORTED_MULTIMODAL_MODELS = list(set(ALL_SUPPORTED_IMAGE_MODELS) \
-    | set(ALL_SUPPORTED_VIDEO_MODELS))
+    | set(ALL_SUPPORTED_VIDEO_MODELS) \
+    | set(ALL_SUPPORTED_AUDIO_MODELS))
 
 HF_CHAT_TEMPLATE_EXCEPTIONS = ["llava_llama"]
 PLACEHOLDER_EXCEPTIONS = ["llava_next"]
@@ -223,6 +258,7 @@ class MultimodalPlaceholderPlacement(enum.Enum):
     # Ref: https://github.com/mistralai/mistral-common/blob/039465db2bdc0486df36365c9bdb428188482a18/
     #      src/mistral_common/tokens/tokenizers/base.py#L326
     "mistral3": MultimodalPlaceholderPlacement.AFTER_TEXT,
+    "phi4mm": MultimodalPlaceholderPlacement.BEFORE_TEXT,
 }
 assert len(PLACEHOLDER_PLACEMENT_MAP) == len(ALL_SUPPORTED_MULTIMODAL_MODELS)
 
@@ -235,7 +271,7 @@ def retrieve_multimodal_placeholder(model_type: str, modality: str,
         Args:
             model_type: The type of the multimodal model.
             modality: The modality of the data.
-            current_count: The number of multimodal data already added. Currently not used.
+            current_count: The number of multimodal data already added.
 
     """
 
@@ -257,6 +293,8 @@ def retrieve_multimodal_placeholder(model_type: str, modality: str,
             # Ref: https://github.com/mistralai/mistral-common/blob/26a6bb3a07ee0b78a3808f2797f23e1d28514b93/
             # src/mistral_common/tokens/tokenizers/base.py#L60
             return "[IMG]"
+        elif model_type in SUPPORTED_PHI_MODEL_GROUP:
+            return f"<|image_{current_count}|>"
         raise TypeError(
             f"For image modality, only {ALL_SUPPORTED_IMAGE_MODELS} are supported but got {model_type}"
         )
@@ -268,6 +306,9 @@ def retrieve_multimodal_placeholder(model_type: str, modality: str,
         raise TypeError(
             f"For video modality, only {ALL_SUPPORTED_VIDEO_MODELS} are supported but got {model_type}"
         )
+    elif modality == "audio":
+        if model_type in SUPPORTED_PHI_MODEL_GROUP:
+            return f"<|audio_{current_count}|>"
     raise TypeError(f"Unknown modality: {modality}")
 
 
@@ -343,7 +384,10 @@ def add_multimodal_placeholders(model_type: str, text_prompt: str,
         case MultimodalPlaceholderPlacement.AFTER_TEXT:
             parts.append(text_prompt)
             parts.extend(placeholders)
-    return "\n".join(parts)
+    if model_type == "phi4mm":
+        return "".join(parts)
+    else:
+        return "\n".join(parts)
 
 
 def resolve_hf_chat_template(
@@ -458,6 +502,34 @@ def convert_to_conversation_message(prompt: str, media: Union[str,
                                                format=image_data_format,
                                                device=device)) for i in media
             ]
+        elif modality == "audio":
+            mm_data = [
+                MultimodalData(modality=modality,
+                               data=load_audio(i, device=device)) for i in media
+            ]
+        elif modality == "image_audio":
+            # Use different load_xxx functions to match the modality.
+            mm_data = []
+            for m in media:
+                data = None
+                _modal = None
+                if _modal is None:
+                    try:
+                        data = load_image(m,
+                                          format=image_data_format,
+                                          device=device)
+                        _modal = "image"
+                    except Exception:
+                        pass
+                if _modal is None:
+                    try:
+                        data = load_audio(m, device=device)
+                        _modal = "audio"
+                    except Exception:
+                        pass
+                if _modal is None:
+                    raise ValueError(f"Unknown matching modality: {modality}")
+                mm_data.append(MultimodalData(modality=_modal, data=data))
         else:
             raise ValueError(f"Unknown modality: {modality}")
         return ConversationMessage(role="user", content=prompt, media=mm_data)
diff --git a/tensorrt_llm/serve/chat_utils.py b/tensorrt_llm/serve/chat_utils.py
index fd56bfa161b..ec67d469bc0 100644
--- a/tensorrt_llm/serve/chat_utils.py
+++ b/tensorrt_llm/serve/chat_utils.py
@@ -2,7 +2,8 @@
 from typing import (Any, Callable, Coroutine, Dict, Iterable, List, Literal,
                     Optional, Tuple, TypeAlias, TypedDict, Union, cast)
 
-from openai.types.chat import ChatCompletionContentPartImageParam
+from openai.types.chat import (ChatCompletionContentPartImageParam,
+                               ChatCompletionContentPartInputAudioParam)
 from openai.types.chat import \
     ChatCompletionContentPartParam as OpenAIChatCompletionContentPartParam
 from openai.types.chat import (ChatCompletionContentPartTextParam,
@@ -12,8 +13,8 @@
 
 from tensorrt_llm.inputs import (ConversationMessage, MultimodalData,
                                  MultimodalDataTracker,
-                                 add_multimodal_placeholders, async_load_image,
-                                 async_load_video)
+                                 add_multimodal_placeholders, async_load_audio,
+                                 async_load_image, async_load_video)
 from tensorrt_llm.logger import logger
 
 
@@ -33,12 +34,16 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False):
     OpenAIChatCompletionContentPartParam, ChatCompletionContentPartVideoParam,
     str]
 
-VALID_MESSAGE_CONTENT_MM_PART_TYPES = ["text", "image_url", "video_url"]
+# TODO: Add "input_audio" to support byte_encoded audio input.
+VALID_MESSAGE_CONTENT_MM_PART_TYPES = [
+    "text", "image_url", "video_url", "audio_url"
+]
 
 # Parser Functions
 _TextParser = partial(cast, ChatCompletionContentPartTextParam)
 _ImageParser = partial(cast, ChatCompletionContentPartImageParam)
 _VideoParser = partial(cast, ChatCompletionContentPartVideoParam)
+_AudioParser = partial(cast, ChatCompletionContentPartInputAudioParam)
 
 MM_PARSER_MAP: dict[str, Callable[[ChatCompletionContentPartParam], Union[
     str, dict[str, str]]]] = {
@@ -48,6 +53,8 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False):
         lambda part: _ImageParser(part).get("image_url", {}).get("url", None),
         "video_url":
         lambda part: _VideoParser(part).get("video_url", {}).get("url", None),
+        "audio_url":
+        lambda part: _AudioParser(part).get("audio_url", {}).get("url", None),
     }
 
 
@@ -74,7 +81,7 @@ def parse_chat_message_content_part(
 
     part_type, content = _parse_chat_message_content_mm_part(part)
 
-    # if part_type is text/image_url/video_url but content is None, log a warning and skip
+    # if part_type is text/image_url/video_url/audio_url but content is None, log a warning and skip
     if part_type in VALID_MESSAGE_CONTENT_MM_PART_TYPES and content is None:
         logger.warning(
             "Skipping multimodal part '%s' (type: '%s') with empty / unparsable content.",
@@ -108,6 +115,18 @@ async def load_video_async():
 
         return MultimodalData(modality="video", data=load_video_async())
 
+    if part_type == "audio_url":
+        str_content = cast(str, content)
+
+        async def load_audio_async():
+            try:
+                return await async_load_audio(str_content)
+            except Exception as e:
+                logger.error(f"Failed to load audio: {str(e)}")
+                return None
+
+        return MultimodalData(modality="audio", data=load_audio_async())
+
     raise NotImplementedError(f"Unknown part type: {part_type}")
 
 
diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml
index 5a30e0c8e7a..04780b28230 100644
--- a/tests/integration/defs/accuracy/references/gsm8k.yaml
+++ b/tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -115,3 +115,5 @@ mistralai/Ministral-8B-Instruct-2410:
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
     accuracy: 78.35
+microsoft/Phi-4-multimodal-instruct:
+  - accuracy: 81.19
diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index 7beba282671..5f9b73e73de 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -199,3 +199,5 @@ mistralai/Ministral-8B-Instruct-2410:
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
     accuracy: 65.96
+microsoft/Phi-4-multimodal-instruct:
+  - accuracy: 69.69
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 5088f901c50..5ddfa18a7b6 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -1847,3 +1847,16 @@ def test_fp8(self):
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
+
+
+class TestPhi4MM(LlmapiAccuracyTestHarness):
+    # phi4-mm can also support text input.
+    MODEL_NAME = "microsoft/Phi-4-multimodal-instruct"
+    MODEL_PATH = f"{llm_models_root()}/multimodals/Phi-4-multimodal-instruct"
+
+    def test_auto_dtype(self):
+        with LLM(self.MODEL_PATH) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py
index d7bd8c0f2d5..1e8098330f4 100644
--- a/tests/integration/defs/test_e2e.py
+++ b/tests/integration/defs/test_e2e.py
@@ -1544,7 +1544,25 @@ def test_build_time_benchmark_sanity(llm_root, llm_venv):
     ])
 
 
-### Pivot-To-Python examples
+### PyTorch examples
+
+
+def parse_output(text):
+    results = []
+    text_lists = re.split(r"\[\d+\] Prompt:", text)
+    for item in text_lists:
+        item = item.replace(os.linesep, "")
+        while True:
+            match = re.search(r"(Generated text: \'(.*?)\')", item,
+                              re.MULTILINE)
+            if match is None:
+                break
+            _, end = match.span(1)
+            results.append(match.group(2))
+            item = item[end:]
+    return results
+
+
 def test_ptp_quickstart(llm_root, llm_venv):
     example_root = Path(os.path.join(llm_root, "examples", "llm-api"))
 
@@ -2101,21 +2119,6 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
 
     output = llm_venv.run_cmd(cmd, caller=check_output)
 
-    def parse_output(text):
-        results = []
-        text_lists = re.split(r"\[\d+\] Prompt:", text)
-        for item in text_lists:
-            item = item.replace(os.linesep, "")
-            while True:
-                match = re.search(r"(Generated text: \'(.*?)\')", item,
-                                  re.MULTILINE)
-                if match is None:
-                    break
-                _, end = match.span(1)
-                results.append(match.group(2))
-                item = item[end:]
-        return results
-
     match_ratio = 4.0 / 5
     if model_name == "qwen2-vl-7b-instruct" and modality == "image":
         match_ratio = 4.0 / 6
@@ -2182,6 +2185,92 @@ def parse_output(text):
             _check_mem_usage(running_log, [peak, 0, 0, 0])
 
 
+@pytest.mark.parametrize("modality", ["image", "audio", "image_audio"])
+def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, modality):
+    model_name = "Phi-4-multimodal-instruct"
+    model_path = "multimodals/Phi-4-multimodal-instruct"
+
+    example_root = Path(os.path.join(llm_root, "examples", "llm-api"))
+    test_data_root = Path(
+        os.path.join(llm_models_root(), "multimodals", "test_data"))
+    audio_data_root = Path(
+        os.path.join(llm_models_root(), "multimodals",
+                     "Phi-4-multimodal-instruct", "examples"))
+    print(f"Accuracy test {model_name} {modality} mode with example inputs.")
+    accuracy_inputs = {
+        "image": {
+            "prompt": [
+                "Describe the object and the weather condition in the image.",
+                "Describe the traffic condition on the road in the image.",
+            ],
+            "media": [
+                str(test_data_root / "inpaint.png"),
+                str(test_data_root / "61.jpg"),
+            ],
+        },
+        "audio": {
+            "prompt": [
+                "Transcribe the audio clip into text, please don't add other text.",
+                "Transcribe the audio clip into text, please don't add other text.",
+            ],
+            "media": [
+                str(audio_data_root /
+                    "what_is_the_traffic_sign_in_the_image.wav"),
+                str(audio_data_root / "what_is_shown_in_this_image.wav"),
+            ],
+        },
+        "image_audio": {
+            "prompt": [
+                "",
+            ],
+            "media": [
+                str(test_data_root / "inpaint.png"),
+                str(audio_data_root / "what_is_shown_in_this_image.wav"),
+            ],
+        }
+    }
+    expected_keywords = {
+        "image": [
+            ["clear", "sunny", "sky", "image", "object"],
+            ["road", "car", "lane", "strip", "bus"],
+        ],
+        "audio": [
+            ["what", "is", "the", "traffic", "sign", "in", "image"],
+            ["what", "is", "shown", "in", "this", "image"],
+        ],
+        "image_audio": [
+            ["Half", "Dome", "Park", "natural", "image"],
+        ],
+    }
+
+    cmd = [
+        str(example_root / "quickstart_multimodal.py"),
+        "--model_dir",
+        f"{llm_models_root()}/{model_path}",
+        "--modality",
+        modality,
+        "--prompt",
+        *accuracy_inputs[modality]["prompt"],
+        "--media",
+        *accuracy_inputs[modality]["media"],
+        "--load_lora",
+        "--auto_model_name",
+        "Phi4MMForCausalLM",
+    ]
+    output = llm_venv.run_cmd(cmd, caller=check_output)
+
+    match_ratio = 0.6
+    for prompt_output, prompt_keywords in zip(parse_output(output),
+                                              expected_keywords[modality]):
+        matches = [
+            keyword in prompt_output.lower() for keyword in prompt_keywords
+        ]
+        obs_match_ratio = 1. * sum(matches) / len(matches)
+        assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}"
+
+    print("All answers are correct!")
+
+
 @pytest.mark.parametrize("model_name,model_path", [
     ("BertForSequenceClassification", "bert/bert-base-uncased-yelp-polarity"),
 ])
diff --git a/tests/integration/test_lists/qa/examples_test_list.txt b/tests/integration/test_lists/qa/examples_test_list.txt
index 3dcfcbac093..3b14bdca990 100644
--- a/tests/integration/test_lists/qa/examples_test_list.txt
+++ b/tests/integration/test_lists/qa/examples_test_list.txt
@@ -486,6 +486,7 @@ accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8
 accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8
+accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype
 
 test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-]
 test_e2e.py::test_llama_e2e[use_py_session-remove_input_padding-]
@@ -528,6 +529,9 @@ test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistr
 test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False]
 test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it-image-False]
 test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it-image-True]
+test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[audio]
+test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[image]
+test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[image_audio]
 test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
 test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
 test_e2e.py::test_ptp_star_attention_example[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B]
diff --git a/tests/integration/test_lists/test-db/l0_a30.yml b/tests/integration/test_lists/test-db/l0_a30.yml
index 7d68c95486f..0044a853c07 100644
--- a/tests/integration/test_lists/test-db/l0_a30.yml
+++ b/tests/integration/test_lists/test-db/l0_a30.yml
@@ -15,6 +15,7 @@ l0_a30:
   tests:
   # ------------- PyTorch tests ---------------
   - unittest/_torch/modeling -k "modeling_nemotron_nas"
+  - unittest/_torch/modeling -k "modeling_phi3"
   - unittest/_torch/modeling -k "modeling_qwen"
   - unittest/_torch/modeling -k "modeling_qwen_moe"
   - unittest/_torch/auto_deploy/unit/singlegpu
diff --git a/tests/integration/test_lists/test-db/l0_l40s.yml b/tests/integration/test_lists/test-db/l0_l40s.yml
index d7dda796764..1ea66b87726 100644
--- a/tests/integration/test_lists/test-db/l0_l40s.yml
+++ b/tests/integration/test_lists/test-db/l0_l40s.yml
@@ -28,6 +28,9 @@ l0_l40s:
   - test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-image-True]
   - test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-video-False]
   - test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-video-True]
+  - test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[audio]
+  - test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[image]
+  - test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[image_audio]
   - test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
   - test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
 - condition:
diff --git a/tests/unittest/_torch/modeling/test_modeling_phi3.py b/tests/unittest/_torch/modeling/test_modeling_phi3.py
new file mode 100644
index 00000000000..4a277c01ba1
--- /dev/null
+++ b/tests/unittest/_torch/modeling/test_modeling_phi3.py
@@ -0,0 +1,352 @@
+import unittest
+from copy import deepcopy
+from dataclasses import dataclass
+from typing import Any
+
+import torch
+from transformers import Phi3Config
+from transformers import Phi3ForCausalLM as HFPhi3ForCausalLM
+from utils.util import default_dtype
+
+import tensorrt_llm
+from tensorrt_llm._torch.attention_backend.utils import get_attention_backend
+from tensorrt_llm._torch.metadata import KVCacheParams
+from tensorrt_llm._torch.model_config import ModelConfig
+from tensorrt_llm._torch.models.modeling_phi3 import Phi3ForCausalLM
+from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import \
+    DecodingCUDAGraphRunner
+from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
+from tensorrt_llm.bindings.executor import KvCacheConfig
+from tensorrt_llm.mapping import Mapping
+
+PHI3_MINI_4K_CONFIG = {
+    "_name_or_path": "Phi-3-mini-4k-instruct",
+    "architectures": ["Phi3ForCausalLM"],
+    "attention_dropout": 0.0,
+    "auto_map": {
+        "AutoConfig": "configuration_phi3.Phi3Config",
+        "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
+    },
+    "bos_token_id": 1,
+    "embd_pdrop": 0.0,
+    "eos_token_id": 32000,
+    "hidden_act": "silu",
+    "hidden_size": 3072,
+    "initializer_range": 0.02,
+    "intermediate_size": 8192,
+    "max_position_embeddings": 4096,
+    "model_type": "phi3",
+    "num_attention_heads": 32,
+    "num_hidden_layers": 32,
+    "num_key_value_heads": 32,
+    "original_max_position_embeddings": 4096,
+    "pad_token_id": 32000,
+    "resid_pdrop": 0.0,
+    "rms_norm_eps": 1e-05,
+    "rope_scaling": None,
+    "rope_theta": 10000.0,
+    "sliding_window": 2047,
+    "tie_word_embeddings": False,
+    "torch_dtype": "bfloat16",
+    "transformers_version": "4.40.2",
+    "use_cache": True,
+    "attention_bias": False,
+    "vocab_size": 32064
+}
+
+
+@dataclass(repr=False)
+class Scenario:
+    backend: str
+    use_cuda_graph: bool = False
+
+    def __repr__(self) -> str:
+        return f"backend:{self.backend.lower()}-use_cuda_graph:{self.use_cuda_graph}"
+
+
+def reduce_phi3_config(mem_for_full_model: int,
+                       config_dict: dict[str, Any],
+                       default_num_layers: int = 32):
+    _, total_mem = torch.cuda.mem_get_info()
+    # scale model down if gpu memory is low
+    if total_mem < mem_for_full_model:
+        model_fraction = total_mem / mem_for_full_model
+        num_layers = int(config_dict["num_hidden_layers"] * model_fraction)
+        num_layers = min(num_layers, default_num_layers)
+        config_dict["num_hidden_layers"] = num_layers
+
+
+class TestPhi3(unittest.TestCase):
+
+    def test_phi3_sanity(self):
+        config_dict = deepcopy(PHI3_MINI_4K_CONFIG)
+        # 8B * sizeof(float16) plus some extra for activations
+        mem_for_full_model = (2 + 1) * 8 * 2**(30)
+        reduce_phi3_config(mem_for_full_model, config_dict)
+        if config_dict["num_hidden_layers"] <= 0:
+            self.skipTest("Insufficient memory for a single Phi3 layer")
+        phi3_config = Phi3Config.from_dict(config_dict)
+        dtype = phi3_config.torch_dtype
+        device = torch.device('cuda')
+
+        with torch.device(device), default_dtype(dtype):
+            model_config = ModelConfig(pretrained_config=phi3_config)
+            phi3 = Phi3ForCausalLM(model_config).to(device)
+
+        input_ids = torch.tensor([100, 200, 300, 100, 200, 100, 400, 500],
+                                 dtype=torch.int,
+                                 device=device)
+
+        context_sequence_lengths = [3, 2, 1]
+        sequence_lengths = context_sequence_lengths + [1, 1]
+        past_seen_tokens = [0, 0, 0, 62, 75]
+        request_ids = list(range(len(sequence_lengths)))
+        token_nums = (torch.tensor(past_seen_tokens) +
+                      torch.tensor(sequence_lengths)).tolist()
+        prompt_lens = token_nums[:3] + past_seen_tokens[3:]
+
+        num_blocks = 100
+        tokens_per_block = 128
+        head_dim = phi3.config.hidden_size // phi3.config.num_attention_heads
+        num_layers = phi3.config.num_hidden_layers
+        num_kv_heads = phi3.config.num_key_value_heads
+        max_seq_len = num_blocks * tokens_per_block
+        batch_size = len(context_sequence_lengths) + 2
+
+        if dtype == torch.half:
+            kv_cache_dtype = tensorrt_llm.bindings.DataType.HALF
+        elif dtype == torch.bfloat16:
+            kv_cache_dtype = tensorrt_llm.bindings.DataType.BF16
+        else:
+            raise ValueError("Invalid dtype")
+
+        mapping = Mapping(world_size=1, tp_size=1, rank=0)
+        kv_cache_config = KvCacheConfig(max_tokens=num_blocks *
+                                        tokens_per_block)
+        kv_cache_manager = KVCacheManager(
+            kv_cache_config,
+            tensorrt_llm.bindings.internal.batch_manager.CacheType.SELF,
+            num_layers=num_layers,
+            num_kv_heads=num_kv_heads,
+            head_dim=head_dim,
+            tokens_per_block=tokens_per_block,
+            max_seq_len=max_seq_len,
+            max_batch_size=batch_size,
+            mapping=mapping,
+            dtype=kv_cache_dtype,
+        )
+        kv_cache_manager.add_dummy_requests(request_ids, token_nums)
+
+        metadata_cls = get_attention_backend(model_config.attn_backend).Metadata
+        attn_metadata = metadata_cls(
+            seq_lens=torch.tensor(sequence_lengths, dtype=torch.int),
+            num_contexts=len(context_sequence_lengths),
+            kv_cache_params=KVCacheParams(
+                use_cache=True,
+                num_cached_tokens_per_seq=past_seen_tokens,
+            ),
+            kv_cache_manager=kv_cache_manager,
+            request_ids=request_ids,
+            prompt_lens=prompt_lens,
+            max_num_requests=len(context_sequence_lengths) + 2,
+            max_num_tokens=8192,
+        )
+
+        position_ids = []
+        for i, tokens in enumerate(past_seen_tokens):
+            seq_len = context_sequence_lengths[i] if i < len(
+                context_sequence_lengths) else 1
+            position_id = torch.arange(tokens,
+                                       tokens + seq_len,
+                                       device=input_ids.device)
+            position_ids.append(position_id)
+
+        position_ids = torch.cat(position_ids).unsqueeze(0)
+
+        with torch.inference_mode():
+            attn_metadata.prepare()
+            logits = phi3.forward(input_ids=input_ids,
+                                  position_ids=position_ids,
+                                  attn_metadata=attn_metadata)
+
+        self.assertEqual(len(past_seen_tokens), logits.shape[0])
+
+        with torch.inference_mode():
+            attn_metadata.prepare()
+            logits = phi3.forward(input_ids=input_ids,
+                                  position_ids=position_ids,
+                                  attn_metadata=attn_metadata,
+                                  return_context_logits=True)
+        self.assertEqual(input_ids.shape, logits.shape[:-1])
+
+        kv_cache_manager.shutdown()
+
+    @torch.no_grad()
+    def test_phi3_allclose_to_hf(self) -> None:
+        """
+        Compare output to HF
+        """
+        scenario = Scenario(backend="TRTLLM")
+        backend = scenario.backend
+        metadata_cls = get_attention_backend(backend).Metadata
+
+        torch.random.manual_seed(0)
+        config_dict = deepcopy(PHI3_MINI_4K_CONFIG)
+        # 8B * sizeof(float16) plus some extra for activations
+        # times 2, since we'll need 2 of these
+        mem_for_full_model = (2 + 1) * 8 * 2**(30) * 4
+        reduce_phi3_config(mem_for_full_model, config_dict)
+        if config_dict["num_hidden_layers"] <= 0:
+            self.skipTest("Insufficient memory for a single Phi3 layer")
+        phi3_config = Phi3Config.from_dict(config_dict)
+        dtype = phi3_config.torch_dtype
+        device = torch.device('cuda')
+
+        with torch.device(device), default_dtype(dtype):
+            hf_phi3 = HFPhi3ForCausalLM(phi3_config).eval()
+
+            model_config = ModelConfig(pretrained_config=phi3_config,
+                                       attn_backend=backend)
+
+            phi3 = Phi3ForCausalLM(model_config).to(dtype).to(device)
+            phi3.load_weights(hf_phi3.state_dict())
+
+        num_blocks = 1
+        tokens_per_block = 128
+        head_dim = phi3.config.hidden_size // phi3.config.num_attention_heads
+        num_layers = phi3.config.num_hidden_layers
+        num_kv_heads = phi3.config.num_key_value_heads
+        max_seq_len = num_blocks * tokens_per_block
+        batch_size = 1
+
+        if dtype == torch.half:
+            kv_cache_dtype = tensorrt_llm.bindings.DataType.HALF
+        elif dtype == torch.bfloat16:
+            kv_cache_dtype = tensorrt_llm.bindings.DataType.BF16
+        else:
+            raise ValueError("Invalid dtype")
+
+        mapping = Mapping(world_size=1, tp_size=1, rank=0)
+        kv_cache_config = KvCacheConfig(max_tokens=num_blocks *
+                                        tokens_per_block)
+        kv_cache_manager = KVCacheManager(
+            kv_cache_config,
+            tensorrt_llm.bindings.internal.batch_manager.CacheType.SELF,
+            num_layers=num_layers,
+            num_kv_heads=num_kv_heads,
+            head_dim=head_dim,
+            tokens_per_block=tokens_per_block,
+            max_seq_len=max_seq_len,
+            max_batch_size=batch_size,
+            mapping=mapping,
+            dtype=kv_cache_dtype,
+        )
+
+        # context
+        input_ids = torch.tensor([100, 200, 300, 100, 200, 100, 400, 500],
+                                 dtype=torch.int,
+                                 device=device)
+
+        num_cached_tokens_per_seq = [0]
+        request_ids = [1]
+        token_nums = [input_ids.size(-1)]
+        prompt_lens = [input_ids.size(-1)]
+        kv_cache_manager.add_dummy_requests(request_ids, token_nums)
+
+        attn_metadata = metadata_cls(
+            seq_lens=torch.tensor([input_ids.size(-1)], dtype=torch.int),
+            num_contexts=1,
+            kv_cache_params=KVCacheParams(
+                use_cache=True,
+                num_cached_tokens_per_seq=num_cached_tokens_per_seq,
+            ),
+            max_num_requests=1,
+            max_num_tokens=8192,
+            kv_cache_manager=kv_cache_manager,
+            request_ids=request_ids,
+            prompt_lens=prompt_lens,
+        )
+
+        # Note: no CUDA graphs for prefill, the graph runner is built for
+        # decoding only.
+        position_ids = [torch.arange(0, input_ids.size(-1))]
+        position_ids = torch.cat(position_ids).unsqueeze(0).cuda()
+        with torch.inference_mode():
+            attn_metadata.prepare()
+            logits = phi3.forward(input_ids=input_ids,
+                                  position_ids=position_ids,
+                                  attn_metadata=attn_metadata)
+            ref = hf_phi3.forward(input_ids=input_ids.unsqueeze(0),
+                                  position_ids=position_ids,
+                                  use_cache=True)
+
+        torch.testing.assert_close(logits,
+                                   ref.logits[:, -1].float(),
+                                   atol=0.4,
+                                   rtol=0.4)
+
+        # gen
+        gen_input_ids = torch.tensor([600], dtype=torch.int, device=device)
+
+        num_cached_tokens_per_seq = [input_ids.size(-1)]
+
+        attn_metadata = metadata_cls(
+            seq_lens=torch.tensor([gen_input_ids.size(-1)], dtype=torch.int),
+            num_contexts=0,
+            kv_cache_params=KVCacheParams(
+                use_cache=True,
+                num_cached_tokens_per_seq=num_cached_tokens_per_seq,
+            ),
+            max_num_requests=1,
+            max_num_tokens=8192,
+            kv_cache_manager=kv_cache_manager,
+            request_ids=request_ids,
+            prompt_lens=prompt_lens,
+        )
+
+        gen_position_ids = [
+            torch.arange(input_ids.size(-1),
+                         input_ids.size(-1) + gen_input_ids.size(-1))
+        ]
+        gen_position_ids = torch.cat(gen_position_ids).unsqueeze(0).cuda()
+
+        def run_forward(input_ids, position_ids, attn_metadata):
+            attn_metadata.prepare()
+            if not scenario.use_cuda_graph:
+                return phi3.forward(input_ids=input_ids,
+                                    position_ids=position_ids,
+                                    attn_metadata=attn_metadata)
+            else:
+                graph_runner = DecodingCUDAGraphRunner(
+                    attn_metadata.max_num_requests, "cuda", attn_metadata)
+                graph_runner.capture(lambda inputs: phi3.forward(**inputs))
+
+                for _ in range(2):
+                    # Run it twice. This helps us catch problems if buffers are accidentally reallocated
+                    # in prepare().
+                    attn_metadata.prepare()
+                    logits = graph_runner.run({
+                        "input_ids": input_ids,
+                        "position_ids": position_ids,
+                        "attn_metadata": attn_metadata,
+                    })
+                return logits
+
+        if scenario.use_cuda_graph:
+            attn_metadata = attn_metadata.create_cuda_graph_metadata(1)
+
+        with torch.inference_mode():
+            logits = run_forward(input_ids=gen_input_ids,
+                                 position_ids=gen_position_ids,
+                                 attn_metadata=attn_metadata)
+            ref = hf_phi3.forward(input_ids=gen_input_ids.unsqueeze(0),
+                                  position_ids=gen_position_ids,
+                                  past_key_values=ref.past_key_values,
+                                  use_cache=True)
+
+        torch.testing.assert_close(logits,
+                                   ref.logits[:, -1].float(),
+                                   atol=0.4,
+                                   rtol=0.4)
+
+        kv_cache_manager.shutdown()

From 28385f6571fde019672517ab179adb104af70034 Mon Sep 17 00:00:00 2001
From: Frank <3429989+FrankD412@users.noreply.github.com>
Date: Wed, 16 Jul 2025 18:15:06 -0700
Subject: [PATCH 48/88] [TRTLLM-6070] docs: Add initial documentation for
 trtllm-bench CLI. (#5734)

Signed-off-by: Frank Di Natale <3429989+FrankD412@users.noreply.github.com>
Signed-off-by: Frank <3429989+FrankD412@users.noreply.github.com>
Co-authored-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
---
 docs/source/commands/trtllm-bench.rst | 164 ++++++++++++++++++++++++++
 docs/source/index.rst                 |   1 +
 2 files changed, 165 insertions(+)
 create mode 100644 docs/source/commands/trtllm-bench.rst

diff --git a/docs/source/commands/trtllm-bench.rst b/docs/source/commands/trtllm-bench.rst
new file mode 100644
index 00000000000..7f03c8dfc66
--- /dev/null
+++ b/docs/source/commands/trtllm-bench.rst
@@ -0,0 +1,164 @@
+trtllm-bench
+===========================
+
+trtllm-bench is a comprehensive benchmarking tool for TensorRT-LLM engines. It provides three main subcommands for different benchmarking scenarios:
+
+**Common Options for All Commands:**
+
+**Usage:**
+
+.. click:: tensorrt_llm.commands.bench:main
+   :prog: trtllm-bench
+   :nested: full
+   :commands: throughput, latency, build
+
+
+
+prepare_dataset.py
+===========================
+
+trtllm-bench is designed to work with the `prepare_dataset.py <https://github.com/NVIDIA/TensorRT-LLM/blob/main/benchmarks/cpp/prepare_dataset.py>`_ script, which generates benchmark datasets in the required format. The prepare_dataset script supports:
+
+**Dataset Types:**
+
+- Real datasets from various sources
+- Synthetic datasets with normal or uniform token distributions
+- LoRA task-specific datasets
+
+**Key Features:**
+
+- Tokenizer integration for proper text preprocessing
+- Configurable random seeds for reproducible results
+- Support for LoRA adapters and task IDs
+- Output in JSON format compatible with trtllm-bench
+
+.. important::
+   The ``--stdout`` flag is **required** when using prepare_dataset.py with trtllm-bench to ensure proper data streaming format.
+
+**Usage:**
+
+prepare_dataset
+-------------------
+
+.. code-block:: bash
+
+    python prepare_dataset.py [OPTIONS]
+
+**Options**
+
+----
+
+.. list-table::
+   :widths: 20 80
+   :header-rows: 1
+
+   * - Option
+     - Description
+   * - ``--tokenizer``
+     - Tokenizer directory or HuggingFace model name (required)
+   * - ``--output``
+     - Output JSON filename (default: preprocessed_dataset.json)
+   * - ``--stdout``
+     - Print output to stdout with JSON dataset entry on each line (**required for trtllm-bench**)
+   * - ``--random-seed``
+     - Random seed for token generation (default: 420)
+   * - ``--task-id``
+     - LoRA task ID (default: -1)
+   * - ``--rand-task-id``
+     - Random LoRA task range (two integers)
+   * - ``--lora-dir``
+     - Directory containing LoRA adapters
+   * - ``--log-level``
+     - Logging level: info or debug (default: info)
+
+dataset
+-------------------
+
+Process real datasets from various sources.
+
+.. code-block:: bash
+
+    python prepare_dataset.py dataset [OPTIONS]
+
+**Options**
+
+----
+
+.. list-table::
+   :widths: 20 80
+   :header-rows: 1
+
+   * - Option
+     - Description
+   * - ``--input``
+     - Input dataset file or directory (required)
+   * - ``--max-input-length``
+     - Maximum input sequence length (default: 2048)
+   * - ``--max-output-length``
+     - Maximum output sequence length (default: 512)
+   * - ``--num-samples``
+     - Number of samples to process (default: all)
+   * - ``--format``
+     - Input format: json, jsonl, csv, or txt (default: auto-detect)
+
+
+token_norm_dist
+-------------------
+
+Generate synthetic datasets with normal token distribution.
+
+.. code-block:: bash
+
+    python prepare_dataset.py token_norm_dist [OPTIONS]
+
+**Options**
+
+----
+
+.. list-table::
+   :widths: 20 80
+   :header-rows: 1
+
+   * - Option
+     - Description
+   * - ``--num-requests``
+     - Number of requests to be generated (required)
+   * - ``--input-mean``
+     - Normal distribution mean for input tokens (required)
+   * - ``--input-stdev``
+     - Normal distribution standard deviation for input tokens (required)
+   * - ``--output-mean``
+     - Normal distribution mean for output tokens (required)
+   * - ``--output-stdev``
+     - Normal distribution standard deviation for output tokens (required)
+
+
+token_unif_dist
+-------------------
+
+Generate synthetic datasets with uniform token distribution
+
+.. code-block:: bash
+
+    python prepare_dataset.py token_unif_dist [OPTIONS]
+
+**Options**
+
+----
+
+.. list-table::
+   :widths: 20 80
+   :header-rows: 1
+
+   * - Option
+     - Description
+   * - ``--num-requests``
+     - Number of requests to be generated (required)
+   * - ``--input-min``
+     - Uniform distribution minimum for input tokens (required)
+   * - ``--input-max``
+     - Uniform distribution maximum for input tokens (required)
+   * - ``--output-min``
+     - Uniform distribution minimum for output tokens (required)
+   * - ``--output-max``
+     - Uniform distribution maximum for output tokens (required)
diff --git a/docs/source/index.rst b/docs/source/index.rst
index b63ec95a676..50b9c122678 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -77,6 +77,7 @@ Welcome to TensorRT-LLM's Documentation!
    :caption: Command-Line Reference
    :hidden:
 
+   commands/trtllm-bench
    commands/trtllm-build
    commands/trtllm-serve
 

From fe070a0168eee04c4aee01fc365434703a27a7e6 Mon Sep 17 00:00:00 2001
From: chenfeiz0326 <chenfeiz@nvidia.com>
Date: Thu, 17 Jul 2025 09:41:18 +0800
Subject: [PATCH 49/88] test: Update Llama4 Scout FP4 & FP8 accuracy tests
 (#5901)

Signed-off-by: Chenfei Zhang <chenfeiz@nvidia.com>
---
 .../_torch/modules/fused_moe/quantization.py  | 22 ++++++---
 .../defs/accuracy/references/gsm8k.yaml       |  5 ++
 .../defs/accuracy/references/mmlu.yaml        |  5 ++
 .../defs/accuracy/test_llm_api_pytorch.py     | 49 ++++++++++++++++++-
 .../test_lists/qa/examples_test_list.txt      |  4 ++
 .../test_lists/test-db/l0_dgx_b200.yml        |  2 +
 6 files changed, 77 insertions(+), 10 deletions(-)

diff --git a/tensorrt_llm/_torch/modules/fused_moe/quantization.py b/tensorrt_llm/_torch/modules/fused_moe/quantization.py
index 2a28f019033..f957712e3e5 100644
--- a/tensorrt_llm/_torch/modules/fused_moe/quantization.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/quantization.py
@@ -1096,10 +1096,13 @@ def load_expert_w3_w1_weight_scale_nvfp4(
 
         orig_shape = dst_w3_w1_weight_scale.shape
 
-        dst_w3_w1_weight_scale.copy_(
-            torch.ops.trtllm.nvfp4_block_scale_interleave(
-                dst_w3_w1_weight_scale.view(float4_sf_dtype)).view(
-                    self.block_scales_dtype).reshape(orig_shape))
+        dst_w3_w1_weight_scale_interleaved = torch.ops.trtllm.nvfp4_block_scale_interleave(
+            dst_w3_w1_weight_scale.view(float4_sf_dtype)).view(
+                self.block_scales_dtype).reshape(orig_shape)
+
+        torch.cuda.synchronize()
+
+        dst_w3_w1_weight_scale.copy_(dst_w3_w1_weight_scale_interleaved)
 
     def load_expert_w2_weight_scale_nvfp4(self, module: torch.nn.Module,
                                           w2_weight_scale: torch.Tensor,
@@ -1113,10 +1116,13 @@ def load_expert_w2_weight_scale_nvfp4(self, module: torch.nn.Module,
 
         orig_shape = dst_w2_weight_scale.shape
 
-        dst_w2_weight_scale.copy_(
-            torch.ops.trtllm.nvfp4_block_scale_interleave(
-                dst_w2_weight_scale.view(float4_sf_dtype)).view(
-                    self.block_scales_dtype).reshape(orig_shape))
+        dst_w2_weight_scale_interleaved = torch.ops.trtllm.nvfp4_block_scale_interleave(
+            dst_w2_weight_scale.view(float4_sf_dtype)).view(
+                self.block_scales_dtype).reshape(orig_shape)
+
+        torch.cuda.synchronize()
+
+        dst_w2_weight_scale.copy_(dst_w2_weight_scale_interleaved)
 
 
 class NVFP4TRTLLMGenFusedMoEMethod(NVFP4FusedMoEMethod):
diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml
index 04780b28230..41dce7f1837 100644
--- a/tests/integration/defs/accuracy/references/gsm8k.yaml
+++ b/tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -18,6 +18,11 @@ meta-llama/Llama-4-Maverick-17B-128E-Instruct:
   - accuracy: 92.20
 meta-llama/Llama-4-Scout-17B-16E-Instruct:
   - accuracy: 89.70
+  - quant_algo: NVFP4
+    kv_cache_quant_algo: FP8
+    accuracy: 79.62
+  - quant_algo: FP8
+    accuracy: 80.37
 deepseek-ai/DeepSeek-V3-Lite:
   - accuracy: 64.74
   - quant_algo: NVFP4
diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index 5f9b73e73de..bb3d30dd079 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -68,6 +68,11 @@ meta-llama/Llama-4-Maverick-17B-128E-Instruct:
   - accuracy: 86.40
 meta-llama/Llama-4-Scout-17B-16E-Instruct:
   - accuracy: 80.00
+  - quant_algo: NVFP4
+    kv_cache_quant_algo: FP8
+    accuracy: 88.63
+  - quant_algo: FP8
+    accuracy: 89.46
 mistralai/Mistral-7B-v0.1:
   - accuracy: 66
 mistralai/Mistral-7B-Instruct-v0.3:
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 5ddfa18a7b6..d34a60604bf 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -422,7 +422,6 @@ def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
 
 class TestLlama4ScoutInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
-    MODEL_PATH = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct"
 
     @skip_pre_hopper
     @pytest.mark.skip_less_mpi_world_size(8)
@@ -431,8 +430,9 @@ class TestLlama4ScoutInstruct(LlmapiAccuracyTestHarness):
                                                          (8, 1, 8)],
                              ids=["tp8", "tp8ep4", "tp8ep8"])
     def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct"
         with LLM(
-                self.MODEL_PATH,
+                model_path,
                 tensor_parallel_size=tp_size,
                 # Keep this low to avoid warmup OOM in CI
                 max_seq_len=8192,
@@ -445,6 +445,51 @@ def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
+    @skip_pre_hopper
+    @pytest.mark.skip_less_mpi_world_size(8)
+    @parametrize_with_ids("cuda_graph", [True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 8), (4, 1, 1)],
+                             ids=["tp8ep8", "tp4"])
+    def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size):
+        model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8"
+        with LLM(
+                model_path,
+                tensor_parallel_size=tp_size,
+                # Keep this low to avoid warmup OOM in CI
+                max_seq_len=8192,
+                pipeline_parallel_size=pp_size,
+                moe_expert_parallel_size=ep_size,
+                cuda_graph_config=CudaGraphConfig()
+                if cuda_graph else None) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_blackwell
+    @pytest.mark.skip_less_mpi_world_size(8)
+    @parametrize_with_ids("cuda_graph", [True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 8), (4, 1, 1)],
+                             ids=["tp8ep8", "tp4"])
+    def test_fp4(self, cuda_graph, tp_size, pp_size, ep_size):
+        model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4"
+        with LLM(
+                model_path,
+                tensor_parallel_size=tp_size,
+                # Keep this low to avoid warmup OOM in CI
+                max_seq_len=8192,
+                pipeline_parallel_size=pp_size,
+                moe_expert_parallel_size=ep_size,
+                cuda_graph_config=CudaGraphConfig()
+                if cuda_graph else None) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
 
 class TestMistral7B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "mistralai/Mistral-7B-v0.1"
diff --git a/tests/integration/test_lists/qa/examples_test_list.txt b/tests/integration/test_lists/qa/examples_test_list.txt
index 3b14bdca990..0cf65a29aed 100644
--- a/tests/integration/test_lists/qa/examples_test_list.txt
+++ b/tests/integration/test_lists/qa/examples_test_list.txt
@@ -455,6 +455,10 @@ accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-cuda_graph=False]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp8ep8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp8ep8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
index 8a70e6efefc..8b3b0cac36b 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
@@ -62,3 +62,5 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_cutlass]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_trtllm]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass]
+  - accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True]
+  - accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True]

From 6e1aee6fd68a39715ee069cca2a5adf442a9eedf Mon Sep 17 00:00:00 2001
From: Shiyu Li <shili@nvidia.com>
Date: Wed, 16 Jul 2025 19:49:51 -0700
Subject: [PATCH 50/88] [fix] Performance Optimization for MNNVL TwoShot Kernel
 (#5934)

Signed-off-by: Shiyu Li <shili@nvidia.com>
Co-authored-by: Zongfei Jing <20381269+zongfeijing@users.noreply.github.com>
---
 .../mnnvlTwoShotAllreduceKernels.cu           | 148 +++++++++++-------
 .../runtime/mcastDeviceMemory.cpp             |  33 ++--
 cpp/tensorrt_llm/runtime/mcastDeviceMemory.h  |  16 +-
 .../_torch/models/modeling_deepseekv3.py      |  12 +-
 4 files changed, 130 insertions(+), 79 deletions(-)

diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlTwoShotAllreduceKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlTwoShotAllreduceKernels.cu
index f2e87e39dda..6f85317ae77 100644
--- a/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlTwoShotAllreduceKernels.cu
+++ b/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlTwoShotAllreduceKernels.cu
@@ -61,6 +61,31 @@ inline __device__ __nv_bfloat16 fromFloat<__nv_bfloat16>(float val)
     return __float2bfloat16(val);
 }
 
+__device__ float4 loadfloat4(void const* ptr)
+{
+
+    float return_value[4];
+
+    asm volatile("ld.volatile.global.v4.f32 {%0, %1, %2, %3}, [%4];\n"
+                 : "=f"(return_value[0]), "=f"(return_value[1]), "=f"(return_value[2]), "=f"(return_value[3])
+                 : "l"(ptr));
+
+    return *(float4*) return_value;
+}
+
+__device__ __inline__ float2 loadfloat2(void const* ptr)
+{
+
+    float return_value[2];
+
+    asm volatile("ld.volatile.global.v2.f32 {%0, %1}, [%2];\n"
+                 : "=f"(return_value[0]), "=f"(return_value[1])
+                 : "l"(ptr)
+                 : "memory");
+
+    return *(float2*) return_value;
+}
+
 template <int WORLD_SIZE, typename T>
 __global__ void twoshot_allreduce_kernel(T* output_ptr, T* shard_ptr, T** input_ptrs, T* mcast_ptr, int num_tokens,
     int buffer_M, int token_dim, int rank, uint32_t* buffer_flags, bool wait_for_results)
@@ -74,20 +99,13 @@ __global__ void twoshot_allreduce_kernel(T* output_ptr, T* shard_ptr, T** input_
     cudaGridDependencySynchronize();
 #endif
 
+    // [input_ptr, clear_ptr, buffer_size, access_counter]
+    uint4 flag = reinterpret_cast<uint4*>(buffer_flags)[0];
+    // Each buffer is M * N and we have 2 buffers in each group, one for reduce-scatter and one for allgather
+    uint32_t buffer_group_size = flag.z << 1;
+    uint32_t input_offset = flag.x * buffer_group_size;
+    uint32_t clear_offset = flag.y * buffer_group_size;
     uint32_t* offset_access_ptr = &buffer_flags[3];
-    // Buffer size is M * N, and we need two buffers for reduce-scatter and allgather
-    uint32_t buffer_size = (buffer_flags[2] << 1);
-    uint32_t input_offset = buffer_flags[0] * buffer_size;
-    uint32_t clear_offset = buffer_flags[1] * buffer_size;
-
-    if (wait_for_results)
-    {
-        __syncthreads();
-        if (threadIdx.x == 0)
-        {
-            atomicAdd(offset_access_ptr, 1);
-        }
-    }
 
     if (elt < token_dim)
     {
@@ -101,17 +119,16 @@ __global__ void twoshot_allreduce_kernel(T* output_ptr, T* shard_ptr, T** input_
 
         // Reduce and broadcast
 
-        int global_token = token * WORLD_SIZE + rank;
-        if (global_token < num_tokens)
+        if ((token % WORLD_SIZE) == rank)
         {
-
+            int local_token = token / WORLD_SIZE;
             float accum = 0.f;
 
             T values[WORLD_SIZE];
 
             for (int r = 0; r < WORLD_SIZE; r++)
             {
-                input_ptrs[rank][clear_offset + token * token_dim * WORLD_SIZE + r * token_dim + elt]
+                input_ptrs[rank][clear_offset + local_token * token_dim * WORLD_SIZE + r * token_dim + elt]
                     = fromFloat<T>(-0.f);
             }
 
@@ -121,7 +138,7 @@ __global__ void twoshot_allreduce_kernel(T* output_ptr, T* shard_ptr, T** input_
                 for (int r = 0; r < WORLD_SIZE; r++)
                 {
                     T volatile* lamport_ptr = (T volatile*) &input_ptrs[rank][input_offset
-                        + token * token_dim * WORLD_SIZE + r * token_dim + elt];
+                        + local_token * token_dim * WORLD_SIZE + r * token_dim + elt];
                     values[r] = *lamport_ptr;
                     valid &= !isNegZero(values[r]);
                 }
@@ -132,7 +149,7 @@ __global__ void twoshot_allreduce_kernel(T* output_ptr, T* shard_ptr, T** input_
             {
                 accum += toFloat<T>(values[r]);
             }
-            mcast_ptr[input_offset + buffer_M * token_dim + global_token * token_dim + elt] = fromFloat<T>(accum);
+            mcast_ptr[input_offset + buffer_M * token_dim + token * token_dim + elt] = fromFloat<T>(accum);
         }
     }
 
@@ -145,23 +162,50 @@ __global__ void twoshot_allreduce_kernel(T* output_ptr, T* shard_ptr, T** input_
     // Optionally wait for results if the next layer isn't doing the Lamport check
     if (wait_for_results)
     {
-        T volatile* lamport_ptr
-            = (T volatile*) &input_ptrs[rank][input_offset + buffer_M * token_dim + token * token_dim + elt];
-        T val = *lamport_ptr;
-        while (isNegZero(val))
-            val = *lamport_ptr;
-
-        // Copy if requested
-        if (output_ptr)
-            output_ptr[token * token_dim + elt] = val;
-        if (threadIdx.x == 0 && blockIdx.x == 0 && blockIdx.y == 0)
+        // Update the atomic counter to indicate the block has read the offsets
+        __syncthreads();
+
+        if (threadIdx.x == 0)
+        {
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000))
+            asm volatile("red.async.release.global.gpu.add.u32 [%0], %1;" ::"l"(offset_access_ptr), "r"(1) : "memory");
+#elif (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+            asm volatile("red.global.gpu.add.u32 [%0], %1;" ::"l"(offset_access_ptr), "r"(1) : "memory");
+#else
+            atomicAdd(offset_access_ptr, 1);
+#endif
+        }
+        // Only use a set of CTAs for lamport sync, reargange the grid
+        constexpr int ELTS_PER_LOAD = sizeof(float2) / sizeof(T);
+        // blockDim.x / ELTS_PER_LOAD should be at least the size of a warp (32)
+        if (threadIdx.x < (blockDim.x / ELTS_PER_LOAD))
+        {
+            uint64_t current_pos = blockIdx.x * token_dim + blockIdx.y * blockDim.x + threadIdx.x * ELTS_PER_LOAD;
+
+            void* lamport_ptr = (void*) &input_ptrs[rank][input_offset + buffer_M * token_dim + current_pos];
+            // We have 2 assumptions here:
+            // 1. The write is atomic in 8B granularity -> Each buffer in the buffer group should be aligned to 8B
+            // 2. The num_token * token_dim is divisible by ELTS_PER_LOAD (4 for BF16 and 2 for FP32)
+            float2 val = loadfloat2(lamport_ptr);
+            while (isNegZero(*(T*) &val))
+            {
+                val = loadfloat2(lamport_ptr);
+            }
+            if (output_ptr)
+            {
+                *((float2*) &output_ptr[current_pos]) = val;
+            }
+        }
+
+        // Update the buffer flags
+        if (threadIdx.x == 0 && blockIdx.x == gridDim.x - 1 && blockIdx.y == 0)
         {
             // Make sure all blocks have finished reading the offsets, 2-D grid
             while (*reinterpret_cast<uint32_t volatile*>(offset_access_ptr) < gridDim.x * gridDim.y)
             {
             }
-            buffer_flags[0] = (buffer_flags[0] + 1) % 3;
-            buffer_flags[1] = (buffer_flags[1] + 1) % 3;
+            buffer_flags[0] = (flag.x + 1) % 3;
+            buffer_flags[1] = (flag.y + 1) % 3;
             *(offset_access_ptr) = 0;
         }
     }
@@ -251,18 +295,6 @@ __device__ void copy_f4_ldg(T_IN* dst, T_IN const* src)
     *dst4 = *src4;
 }
 
-__device__ float4 loadfloat4(void const* ptr)
-{
-
-    float return_value[4];
-
-    asm volatile("ld.volatile.global.v4.f32 {%0, %1, %2, %3}, [%4];\n"
-                 : "=f"(return_value[0]), "=f"(return_value[1]), "=f"(return_value[2]), "=f"(return_value[3])
-                 : "l"(ptr));
-
-    return *(float4*) return_value;
-}
-
 template <typename T>
 inline __device__ T add(T a, T b)
 {
@@ -322,19 +354,14 @@ __global__ void __launch_bounds__(128, 1)
     int offsets[NUM_INPUTS][DIM / (1 * ELTS_PER_THREAD * NUM_THREADS)];
 
     uint32_t* offset_access_ptr = &buffer_flags[3];
+    uint4 flag = reinterpret_cast<uint4*>(buffer_flags)[0];
     // Buffer size is M * N, and we need two buffers for reduce-scatter and allgather
-    uint32_t buffer_size = buffer_flags[2];
-    uint32_t buffer_offset = buffer_flags[0] * (buffer_size << 1);
+    uint32_t buffer_size = flag.z;
+    uint32_t buffer_offset = flag.x * (buffer_size << 1);
     T_IN const* input = &buffer_input[buffer_offset + buffer_size];
 
     cudaTriggerProgrammaticLaunchCompletion();
 
-    __syncthreads();
-    if (threadIdx.x == 0)
-    {
-        atomicAdd(offset_access_ptr, 1);
-    }
-
     for (int i = 0; i < NUM_INPUTS; i++)
     {
         for (int j = 0; j < DIM / (1 * ELTS_PER_THREAD * NUM_THREADS); j++)
@@ -361,7 +388,17 @@ __global__ void __launch_bounds__(128, 1)
     }
 
     __pipeline_commit();
-
+    __syncthreads();
+    if (threadIdx.x == 0)
+    {
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 1000))
+        asm volatile("red.async.release.global.gpu.add.u32 [%0], %1;" ::"l"(offset_access_ptr), "r"(1) : "memory");
+#elif (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+        asm volatile("red.global.gpu.add.u32 [%0], %1;" ::"l"(offset_access_ptr), "r"(1) : "memory");
+#else
+        atomicAdd(offset_access_ptr, 1);
+#endif
+    }
     // Load all inputs
     bool valid = false;
 
@@ -494,14 +531,13 @@ __global__ void __launch_bounds__(128, 1)
     if (threadIdx.x == 0 && blockIdx.x == 0 && blockIdx.y == 0)
     {
         // Make sure all blocks have finished accessing the buffer
-        while (*reinterpret_cast<uint32_t volatile*>(offset_access_ptr) != gridDim.x * gridDim.y)
+        while (*reinterpret_cast<uint32_t volatile*>(offset_access_ptr) < gridDim.x * gridDim.y)
         {
         }
-        buffer_flags[0] = (buffer_flags[0] + 1) % 3;
-        buffer_flags[1] = (buffer_flags[1] + 1) % 3;
+        buffer_flags[0] = (flag.x + 1) % 3;
+        buffer_flags[1] = (flag.y + 1) % 3;
         *(offset_access_ptr) = 0;
     }
-    __syncthreads();
 #endif
 }
 
diff --git a/cpp/tensorrt_llm/runtime/mcastDeviceMemory.cpp b/cpp/tensorrt_llm/runtime/mcastDeviceMemory.cpp
index b45d903367e..950215e7542 100644
--- a/cpp/tensorrt_llm/runtime/mcastDeviceMemory.cpp
+++ b/cpp/tensorrt_llm/runtime/mcastDeviceMemory.cpp
@@ -50,7 +50,7 @@ McastDeviceMemory::McastDeviceMemory(
     , mMcHandle(0)
 {
 
-    cudaSetDevice(mDeviceIdx);
+    TLLM_CUDA_CHECK(cudaSetDevice(mDeviceIdx));
     // Check if the device support multicasting
     int multicast_supported{0};
     TLLM_CU_CHECK(cuDeviceGetAttribute(&multicast_supported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, mDeviceIdx));
@@ -82,34 +82,41 @@ McastDeviceMemory::McastDeviceMemory(
     {
         allocNvlsMcastMem(mSignalPadOffset + kSIGNAL_PAD_SIZE);
     }
-    mSignalPadsDev.resize(mGroupSize);
+    // Initialize signal pads
+    mSignalPads.resize(mGroupSize);
     for (size_t i = 0; i < mGroupSize; i++)
     {
-        mSignalPadsDev[i] = mUcPtrs[i] + mSignalPadOffset;
+        mSignalPads[i] = mUcPtrs[i] + mSignalPadOffset;
         if (i == mGroupRank)
         {
-            cuMemsetD8(mSignalPadsDev[i], 0, kSIGNAL_PAD_SIZE);
+            cuMemsetD8(mSignalPads[i], 0, kSIGNAL_PAD_SIZE);
         }
     }
+    // Copy host array of pointers to device array
+    TLLM_CUDA_CHECK(cudaMalloc(&mSignalPadsDev, mGroupSize * sizeof(CUdeviceptr)));
+    TLLM_CUDA_CHECK(cudaMalloc(&mUcPtrsDev, mGroupSize * sizeof(CUdeviceptr)));
+    TLLM_CUDA_CHECK(
+        cudaMemcpy(mSignalPadsDev, mSignalPads.data(), mGroupSize * sizeof(CUdeviceptr), cudaMemcpyHostToDevice));
+    TLLM_CUDA_CHECK(cudaMemcpy(mUcPtrsDev, mUcPtrs.data(), mGroupSize * sizeof(CUdeviceptr), cudaMemcpyHostToDevice));
 }
 
 McastDeviceMemory::~McastDeviceMemory()
 {
     tensorrt_llm::common::unregisterMcastDevMemBuffer(this);
+    TLLM_CUDA_CHECK(cudaFree(mSignalPadsDev));
+    TLLM_CUDA_CHECK(cudaFree(mUcPtrsDev));
+
     if (mIsMNNvlink)
     {
         for (uint32_t rank = 0; rank < mGroupSize; rank++)
         {
-            if (rank == mGroupRank)
-            {
-                cuMemRelease(mUcHandles[rank]);
-            }
-            else
-            {
-                mUcHandles[rank] = 0;
-            }
+            TLLM_CU_CHECK(cuMemUnmap(mUcPtrs[rank], mAllocationSize));
+            // We need to release the handle on each rank
+            TLLM_CU_CHECK(cuMemRelease(mUcHandles[rank]));
         }
-        cuMemRelease(mMcHandle);
+        TLLM_CU_CHECK(cuMemUnmap(mMcPtr, mAllocationSize));
+        TLLM_CU_CHECK(cuMemAddressFree(mMcPtr, mAllocationSize));
+        TLLM_CU_CHECK(cuMemRelease(mMcHandle));
     }
     else
     {
diff --git a/cpp/tensorrt_llm/runtime/mcastDeviceMemory.h b/cpp/tensorrt_llm/runtime/mcastDeviceMemory.h
index 339c62c310c..4afcc05223d 100644
--- a/cpp/tensorrt_llm/runtime/mcastDeviceMemory.h
+++ b/cpp/tensorrt_llm/runtime/mcastDeviceMemory.h
@@ -44,16 +44,18 @@ class McastDeviceMemory
 
     McastDeviceMemory(size_t bufSize, uint32_t groupSize, uint32_t groupRank, int deviceIdx, bool mnNvlink);
 
+    // We don't register the pointer in these two functions since we don't expect any python-level code would call
+    // to obtain the raw pointers.
     //! Get the raw array of signal pad pointers to all ranks (including self)
     void** getSignalPadPtrsDev()
     {
-        return reinterpret_cast<void**>(mSignalPadsDev.data());
+        return mSignalPadsDev;
     }
 
     //! Get the raw array of unicast pointers to all ranks (including self)
     void** getBufferPtrsDev()
     {
-        return reinterpret_cast<void**>(mUcPtrs.data());
+        return mUcPtrsDev;
     }
 
     //! Get the raw unicast pointer to a given rank
@@ -93,11 +95,17 @@ class McastDeviceMemory
     size_t mAllocationSize;
 
     CUdeviceptr mMcPtr;
-    std::vector<CUdeviceptr> mUcPtrs;
-    std::vector<CUdeviceptr> mSignalPadsDev;
     CUmemGenericAllocationHandle mMcHandle;
     std::vector<CUmemGenericAllocationHandle> mUcHandles;
 
+    // Host array of pointers
+    std::vector<CUdeviceptr> mUcPtrs;
+    std::vector<CUdeviceptr> mSignalPads;
+
+    // Device array of pointers
+    void** mUcPtrsDev;
+    void** mSignalPadsDev;
+
     // For intra-node mcast
     tensorrt_llm::runtime::IpcNvlsHandle* mNvlsHandle;
 
diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
index ac9b85f0162..62be770010b 100644
--- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py
+++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
@@ -798,12 +798,12 @@ def _run_MoE(hidden_states, hidden_states_fp4, do_finalize):
             hidden_states, residual = self.post_attention_layernorm(
                 hidden_states, residual)
 
-        # Note: this fusion pattern is only supported for TRTLLM-nvfp4 backend now
-        do_finalize = not (hidden_states.shape[0]
-                           <= self.moe_allreduce.max_token
-                           and self.fusion_config.POST_MOE_FUSION
-                           and self.model_config.moe_backend == 'TRTLLM'
-                           and self.mlp.experts.has_nvfp4)
+        # Note: this fusion pattern is only supported for single-node TRTLLM-nvfp4 backend now
+        do_finalize = self.mapping.is_multi_node() or (
+            not (hidden_states.shape[0] <= self.moe_allreduce.max_token
+                 and self.fusion_config.POST_MOE_FUSION
+                 and self.model_config.moe_backend == "TRTLLM"
+                 and self.mlp.experts.has_nvfp4))
 
         hidden_states = _run_MoE(hidden_states,
                                  hidden_states_fp4=None,

From 4c364b9a731da91db85a29b1f98c76e5386c24e0 Mon Sep 17 00:00:00 2001
From: Zhanrui Sun <184402041+ZhanruiSunCh@users.noreply.github.com>
Date: Thu, 17 Jul 2025 11:56:03 +0800
Subject: [PATCH 51/88] infra: fix SBSA test stage (#6113)

Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com>
---
 jenkins/L0_Test.groovy | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index 12d2a3c6dbe..941c3efb228 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -2327,6 +2327,20 @@ pipeline {
                         } else {
                             error "Skip multi-GPU testing. No test to run."
                         }
+                    } else {
+                        if (singleGpuJobs.size() > 0) {
+                            singleGpuJobs.failFast = params.enableFailFast
+                            parallel singleGpuJobs
+                        } else {
+                            echo "Skip single-GPU testing. No test to run."
+                        }
+
+                        if (dgxJobs.size() > 0) {
+                            stage(testPhase2StageName) {
+                                dgxJobs.failFast = params.enableFailFast
+                                parallel dgxJobs
+                            }
+                        }
                     }
                 }
             }

From 7e033c392e7104ad3d3214b5bcc6e662b2b4fe38 Mon Sep 17 00:00:00 2001
From: ChristinaZ <83400082+ChristinaZ@users.noreply.github.com>
Date: Thu, 17 Jul 2025 12:38:29 +0800
Subject: [PATCH 52/88] Feat: Add vectorized loading for finalize kernel in MoE
 Trtllm backend (#5919)

Signed-off-by: Christina Zhang <83400082+ChristinaZ@users.noreply.github.com>
---
 .../blockScaleMoe/DevKernel.cu                | 110 +++++++++++++++++-
 1 file changed, 106 insertions(+), 4 deletions(-)

diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/DevKernel.cu b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/DevKernel.cu
index ac0684ff18e..ad5cd15fdda 100644
--- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/DevKernel.cu
+++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/DevKernel.cu
@@ -16,10 +16,21 @@
 
 #include "DevKernel.h"
 
+#include "cutlass/array.h"
+#include "cutlass/numeric_conversion.h"
+#include <cub/cub.cuh>
 #include <cutlass/cutlass.h>
 #include <cutlass/numeric_types.h>
 
-#include <cub/cub.cuh>
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Helper function for array conversion
+template <class T, class U>
+__host__ __device__ constexpr static U arrayConvert(T const& input)
+{
+    cutlass::NumericArrayConverter<typename U::Element, typename T::Element, U::kElements> converter;
+    return converter(input);
+}
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -518,6 +529,83 @@ __global__ void finalizeKernel(KernelParams params)
     }
 }
 
+constexpr static int FINALIZE_THREADS_PER_BLOCK = 256;
+
+__device__ float4 vectorizedLoadPtx(float4 const* ptr)
+{
+    float4 ret;
+    asm volatile("ld.global.v4.f32 {%0, %1, %2, %3}, [%4];"
+                 : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w)
+                 : "l"(ptr));
+    return ret;
+}
+
+// Final kernel to unpermute and scale
+// This kernel unpermutes the original data, does the k-way reduction and performs the final skip connection.
+
+template <typename KernelParams>
+__global__ void finalizeKernelVecLoad(KernelParams params)
+{
+    using Type = typename KernelParams::Type;
+    using TypeExpW = typename KernelParams::TypeExpW;
+
+    int const hiddenDimBits = params.hiddenDim * cutlass::sizeof_bits<Type>::value;
+    assert(hiddenDimBits % 128 == 0);
+
+    // Load 128-bits per thread, according to the smallest data type we read/write
+    constexpr int64_t FINALIZE_ELEM_PER_THREAD = 128 / cutlass::sizeof_bits<Type>::value;
+    using InputElem = cutlass::Array<Type, FINALIZE_ELEM_PER_THREAD>;
+    using OutputElem = cutlass::Array<Type, FINALIZE_ELEM_PER_THREAD>;
+    using ComputeElem = cutlass::Array<float, FINALIZE_ELEM_PER_THREAD>;
+
+    int64_t const tokenIdx = blockIdx.x;
+    int64_t const startOffset = threadIdx.x;
+    int64_t const stride = FINALIZE_THREADS_PER_BLOCK;
+    int64_t const numElemsInCol = params.hiddenDim / FINALIZE_ELEM_PER_THREAD;
+
+    auto const offset = tokenIdx * params.hiddenDim;
+    Type* outputPtr = params.outPtr + offset;
+    auto* outElemPtr = reinterpret_cast<OutputElem*>(outputPtr);
+    auto const* inElemPtr = reinterpret_cast<InputElem const*>(params.inPtr);
+
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    // wait on primary kernel when using PDL
+    if constexpr (KernelParams::UsePdl)
+    {
+        cudaGridDependencySynchronize();
+    }
+#endif
+
+    for (int elemIndex = startOffset; elemIndex < numElemsInCol; elemIndex += stride)
+    {
+        ComputeElem threadOutput;
+        threadOutput.fill(0);
+        for (int k = 0; k < params.topK; ++k)
+        {
+            int const expandedIdx = tokenIdx * params.topK + k;
+            int const permutedIdx = params.expandedIdxToPermutedIdx[expandedIdx];
+            if (permutedIdx == -1)
+            {
+                continue;
+            }
+
+            float const scale
+                = (params.expertWeightsPtr != nullptr) ? static_cast<float>(params.expertWeightsPtr[expandedIdx]) : 1.f;
+
+            auto const* inputPermutedPtr = inElemPtr + permutedIdx * numElemsInCol;
+
+            float4 input = vectorizedLoadPtx(reinterpret_cast<float4 const*>(&inputPermutedPtr[elemIndex]));
+            InputElem inputPermutedElem = *reinterpret_cast<InputElem const*>(&input);
+            ComputeElem expertResult = arrayConvert<InputElem, ComputeElem>(inputPermutedElem);
+
+            threadOutput = threadOutput + scale * expertResult;
+        }
+
+        OutputElem outputElem = arrayConvert<ComputeElem, OutputElem>(threadOutput);
+        outElemPtr[elemIndex] = outputElem;
+    }
+}
+
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
 template <typename KernelParams>
@@ -552,7 +640,9 @@ __global__ void finalizeDeepSeekKernel(KernelParams params)
                 int const expandedIdx = tokenIdx * params.topK + k;
                 int const permutedIdx = params.expandedIdxToPermutedIdx[expandedIdx];
                 if (permutedIdx == -1)
+                {
                     continue;
+                }
                 int const totalNumPaddedTokens = params.totalNumPaddedTokens[0];
                 int const scaleIdx = permutedIdx + totalNumPaddedTokens * (hiddenIdx / 128);
                 float const blockScale = params.inDqSfsPtr ? params.inDqSfsPtr[scaleIdx] : 1;
@@ -591,7 +681,6 @@ __global__ void finalizeDeepSeekKernel(KernelParams params)
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
-
 void run(Data const& data, void* stream)
 {
     if (data.mUseDeepSeekFp8)
@@ -610,9 +699,22 @@ void run(Data const& data, void* stream)
         int const numBlocksX = (data.hiddenDim - 1 + numThreads) / numThreads;
         // Capped at rather arbitrary 8192 to avoid gridDim exceeding 65535 specified by CUDA.
         int const numBlocksY = std::min(8192, data.numTokens);
-        dim3 numBlocks(numBlocksX, numBlocksY);
 
-        LAUNCH_EXPW(data, finalizeKernel, numBlocks, numThreads, 0, stream);
+        if (numBlocksX * numBlocksY < 1184)
+        {
+            // The number 1184 comes from 148 * 8, where 148 is the number of SMs (Streaming Multiprocessors) in the
+            // Blackwell architecture,
+            // and the value 8 means that each Streaming Multiprocessor (SM) can hold up to 8 blocks for this kernel.
+            // This limitation is intended to ensure that when the number of waves is greater than 1, we choose to use
+            // the kernel with vectorized loading.
+            dim3 numBlocks(numBlocksX, numBlocksY);
+            LAUNCH_EXPW(data, finalizeKernel, numBlocks, numThreads, 0, stream);
+        }
+        else
+        {
+            LAUNCH_EXPW(data, finalizeKernelVecLoad, /*numBlocks=*/data.numTokens,
+                /*numThreads=*/FINALIZE_THREADS_PER_BLOCK, 0, stream);
+        }
     }
 }
 

From d4d21a106e8176bf20e627ee432cca5ef920c325 Mon Sep 17 00:00:00 2001
From: Iman Tabrizian <10105175+Tabrizian@users.noreply.github.com>
Date: Wed, 16 Jul 2025 21:58:18 -0700
Subject: [PATCH 53/88] [fix] Release slots with spec decode + disagg (#5975)
 (#6032)

Signed-off-by: Iman Tabrizian <itabrizian@nvidia.com>
Signed-off-by: Iman Tabrizian <10105175+Tabrizian@users.noreply.github.com>
Signed-off-by: Iman Tabrizian <10105175+tabrizian@users.noreply.github.com>
---
 tensorrt_llm/_torch/pyexecutor/py_executor.py |  16 ++-
 .../test_disaggregated_single_gpu.py          | 105 ++++++++++++++++++
 .../test_lists/test-db/l0_h100.yml            |   1 +
 3 files changed, 118 insertions(+), 4 deletions(-)

diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py
index dc7b79c265c..c8518c83a81 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -896,6 +896,10 @@ def _executor_loop_pp(self):
 
     def _executor_loop(self):
         torch.cuda.set_device(self.device_id)
+        is_ngram = hasattr(
+            self.model_engine, "spec_config"
+        ) and self.model_engine.spec_config is not None and self.model_engine.spec_config.spec_dec_mode.is_ngram(
+        )
         with self._profiler() as profile_step:
             sample_state = None
             iter_start_time = time.time()
@@ -918,8 +922,7 @@ def _executor_loop(self):
 
                 self._pad_attention_dp_dummy_request()
 
-                if self.draft_model_engine is not None or hasattr(
-                        self, 'drafter') and self.drafter is not None:
+                if self.draft_model_engine is not None or is_ngram or self.drafter is not None:
                     self._prepare_draft_requests(self.active_requests)
 
                 scheduled_batch, fitting_disagg_gen_init_requests, num_fitting_reqs = self._schedule(
@@ -1652,8 +1655,13 @@ def _send_disagg_ctx_cache(self, scheduled_ctx_requests):
             if req.is_context_only_request and (req.is_context_finished or
                                                 req.is_finished_due_to_length):
                 self.kv_cache_transceiver.respond_and_send_async(req)
-                self.resource_manager.resource_managers[
-                    ResourceManagerType.SEQ_SLOT_MANAGER].free_resources(req)
+                for resource_mgr_type in (
+                        ResourceManagerType.SEQ_SLOT_MANAGER,
+                        ResourceManagerType.SPEC_RESOURCE_MANAGER):
+                    if resource_mgr_type in self.resource_manager.resource_managers and self.resource_manager.resource_managers[
+                            resource_mgr_type] is not None:
+                        self.resource_manager.resource_managers[
+                            resource_mgr_type].free_resources(req)
 
         self.kv_cache_transceiver.check_context_transfer_status(0)
 
diff --git a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py
index 540313cfdff..e0ab570ec5c 100644
--- a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py
+++ b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py
@@ -12,6 +12,7 @@
 from tensorrt_llm import LLM, DisaggregatedParams, SamplingParams
 from tensorrt_llm._utils import set_mpi_comm
 from tensorrt_llm.llmapi import CudaGraphConfig, KvCacheConfig, MpiCommSession
+from tensorrt_llm.llmapi.llm_args import EagleDecodingConfig
 
 cloudpickle.register_pickle_by_value(sys.modules[__name__])
 MPI.pickle.__init__(
@@ -33,6 +34,11 @@ def model_path(model_name):
     elif 'TinyLlama-1.1B-Chat-v1.0' in model_name:
         return os.path.join(llm_models_root, 'llama-models-v2',
                             'TinyLlama-1.1B-Chat-v1.0')
+    elif 'Llama-3.1-8B-Instruct' in model_name:
+        return os.path.join(llm_models_root, 'llama-3.1-model',
+                            'Llama-3.1-8B-Instruct/')
+    elif 'EAGLE3-LLaMA3.1-Instruct-8B' in model_name:
+        return os.path.join(llm_models_root, 'EAGLE3-LLaMA3.1-Instruct-8B')
     else:
         raise ValueError(f"Unknown model: {model_name}")
 
@@ -313,5 +319,104 @@ def test_disaggregated_llama_context_capacity(model, enable_cuda_graph,
             print("All workers terminated.")
 
 
+@pytest.mark.parametrize("model", ["Llama-3.1-8B-Instruct"])
+@pytest.mark.parametrize("spec_dec_model_path", ["EAGLE3-LLaMA3.1-Instruct-8B"])
+@pytest.mark.parametrize("generation_overlap", [False])
+def test_disaggregated_spec_dec_batch_slot_limit(model, spec_dec_model_path,
+                                                 generation_overlap):
+    # Test whether the batch slots are properly released when using speculative decoding
+    # with disaggregated serving.
+    spec_dec_config = EagleDecodingConfig(
+        speculative_model_dir=model_path(spec_dec_model_path),
+        eagle3_one_model=False,
+        max_draft_len=3)
+
+    worker_pytorch_configs = []
+
+    # Context worker
+    worker_pytorch_configs.append(
+        dict(disable_overlap_scheduler=True,
+             speculative_config=spec_dec_config,
+             max_batch_size=1))
+
+    # Generation worker
+    worker_pytorch_configs.append(
+        dict(disable_overlap_scheduler=not generation_overlap,
+             speculative_config=spec_dec_config,
+             max_batch_size=1))
+
+    kv_cache_configs = [
+        KvCacheConfig(max_tokens=128, enable_block_reuse=False)
+        for _ in range(2)
+    ]
+    model_names = [model_path(model) for _ in range(2)]
+    ranks = [0, 1]
+    worker_args = list(
+        zip(kv_cache_configs, worker_pytorch_configs, model_names, ranks))
+
+    port_name = MPI.Open_port()
+    MPI.Publish_name('my_port', port_name)
+
+    prompt = "What is the capital of Germany?"
+
+    with MPIPoolExecutor(max_workers=2, env={"TRTLLM_USE_MPI_KVCACHE":
+                                             "1"}) as executor:
+        futures = []
+        try:
+            for worker_arg in worker_args:
+                future = executor.submit(worker_entry_point, *worker_arg)
+                futures.append(future)
+        except Exception as e:
+            print(f"Error in worker {worker_arg}: {e}")
+            raise e
+
+        try:
+            print("Launched all the workers.")
+            intercomm = MPI.COMM_SELF.Accept(port_name)
+
+            for _ in range(2):
+                intercomm.recv(tag=MPI_READY)
+                print("Received ready signal.")
+            max_tokens = 25
+
+            requests = []
+            for _ in range(10):
+                requests.append(
+                    (prompt, SamplingParams(max_tokens=1, ignore_eos=True),
+                     DisaggregatedParams(request_type="context_only")))
+
+            intercomm.send(requests, dest=0, tag=MPI_REQUEST)
+
+            for _ in range(len(requests)):
+                output = intercomm.recv(source=0, tag=MPI_RESULT)
+                assert output[0].disaggregated_params is not None
+                assert output[
+                    0].disaggregated_params.request_type == "context_only"
+                assert len(output[0].token_ids) == 1
+
+                generation_request_disagg_params = output[
+                    0].disaggregated_params
+                generation_request_disagg_params.request_type = "generation_only"
+                requests = []
+                requests.append((prompt,
+                                 SamplingParams(max_tokens=max_tokens,
+                                                ignore_eos=True),
+                                 generation_request_disagg_params))
+
+                intercomm.send(requests, dest=1, tag=MPI_REQUEST)
+                output = intercomm.recv(source=1, tag=MPI_RESULT)
+
+        finally:
+            # Send termination requests
+            intercomm.send(None, dest=0, tag=MPI_REQUEST)
+            intercomm.send(None, dest=1, tag=MPI_REQUEST)
+            print("Sent termination requests to the workers.")
+
+            # Wait for all futures to complete
+            for future in futures:
+                future.result()
+            print("All workers terminated.")
+
+
 if __name__ == "__main__":
     pytest.main()
diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml
index ca678f13ef5..66ce79bb239 100644
--- a/tests/integration/test_lists/test-db/l0_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_h100.yml
@@ -68,6 +68,7 @@ l0_h100:
   - disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_llama_context_capacity[False-False-DeepSeek-V3-Lite-fp8/fp8]
+  - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_spec_dec_batch_slot_limit[False-EAGLE3-LLaMA3.1-Instruct-8B-Llama-3.1-8B-Instruct]
   - test_e2e.py::test_trtllm_bench_iteration_log[PyTorch-streaming-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
   - test_e2e.py::test_trtllm_bench_iteration_log[PyTorch-non-streaming-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
   - test_e2e.py::test_trtllm_bench_request_rate_and_concurrency[enable_concurrency-]

From 48daa18de3fee3eabad6f42efeb8afd9ad9a2fe7 Mon Sep 17 00:00:00 2001
From: Yanchao Lu <yanchaol@nvidia.com>
Date: Thu, 17 Jul 2025 14:29:57 +0800
Subject: [PATCH 54/88] [None][infra] Set up the initial config for CodeRabbit
 (#6128)

Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
---
 .coderabbit.yaml                 | 22 ++++++++++++++++++++++
 .github/pull_request_template.md | 17 ++++++++++++-----
 2 files changed, 34 insertions(+), 5 deletions(-)
 create mode 100644 .coderabbit.yaml

diff --git a/.coderabbit.yaml b/.coderabbit.yaml
new file mode 100644
index 00000000000..d72700a755d
--- /dev/null
+++ b/.coderabbit.yaml
@@ -0,0 +1,22 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json
+language: "en-US"
+reviews:
+  auto_review:
+    drafts: true
+    base_branches: ["main", "release/.+"]
+  commit_status: false
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index b407b662409..f4bb9f33c48 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -1,17 +1,24 @@
+@coderabbitai summary
 
-# PR title
-
-Please write the PR title by following template:
+<!--
+Please write the PR title by following this template:
 
-[JIRA ticket link/nvbug link/github issue link][fix/feat/doc/infra/...] \<summary of this PR\>
+[JIRA ticket/NVBugs ID/GitHub issue][fix/feat/doc/infra/...] \<summary of this PR\>
 
-For example, assume I have a PR hope to support a new feature about cache manager of Jira TRTLLM-1000 ticket, it would be like
+For example, assume I have a PR to support a new feature about cache manager for JIRA ticket TRTLLM-1000, it would be like:
 
 [TRTLLM-1000][feat] Support a new feature about cache manager
 
+Or I have a PR to fix a Llama3 accuracy issue:
+
+[https://nvbugs/1234567][fix] Fix Llama3 accuracy issue
+-->
+
 ## Description
 
+<!--
 Please explain the issue and the solution in short.
+-->
 
 ## Test Coverage
 

From e821c68611e682535ae0ed2765758514ffade571 Mon Sep 17 00:00:00 2001
From: QI JUN <22017000+QiJune@users.noreply.github.com>
Date: Thu, 17 Jul 2025 14:48:23 +0800
Subject: [PATCH 55/88] CI: update multi gpu test trigger file list (#6131)

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>
---
 jenkins/L0_MergeRequest.groovy | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/jenkins/L0_MergeRequest.groovy b/jenkins/L0_MergeRequest.groovy
index 7dd12cf78a4..65cda403276 100644
--- a/jenkins/L0_MergeRequest.groovy
+++ b/jenkins/L0_MergeRequest.groovy
@@ -565,6 +565,8 @@ def getMultiGpuFileChanged(pipeline, testFilter, globalVars)
         "cpp/tensorrt_llm/runtime/ncclCommunicator.cpp",
         "cpp/tensorrt_llm/kernels/communicationKernels/",
         "cpp/tensorrt_llm/thop/allreduceOp.cpp",
+        "cpp/tensorrt_llm/thop/allgatherOp.cpp",
+        "cpp/tensorrt_llm/thop/reducescatterOp.cpp",
         "cpp/tensorrt_llm/kernels/customAllReduceKernels.h",
         "cpp/tensorrt_llm/kernels/customAllReduceKernels.cu",
         "cpp/tensorrt_llm/kernels/gptKernels.h",

From 8c1c9ef7aa5c1b96183b552b01de6c04ac339c65 Mon Sep 17 00:00:00 2001
From: Zhenhuan Chen <chenzhh3671@gmail.com>
Date: Thu, 17 Jul 2025 15:04:54 +0800
Subject: [PATCH 56/88] fix: convert venv_prefix to str before comparison with
 base_prefix (#6121)

Signed-off-by: Zhenhuan Chen <chenzhh3671@gmail.com>
---
 scripts/build_wheel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py
index 06a22d93ff8..2724b8489b9 100755
--- a/scripts/build_wheel.py
+++ b/scripts/build_wheel.py
@@ -144,7 +144,7 @@ def setup_venv(project_dir: Path, requirements_file: Path, no_venv: bool):
         # Ensure PyPI PyTorch is not installed in the venv
         purelib_dir = Path(scheme["purelib"])
         pytorch_package_dir = purelib_dir / "torch"
-        if venv_prefix != sys.base_prefix and pytorch_package_dir.exists():
+        if str(venv_prefix) != sys.base_prefix and pytorch_package_dir.exists():
             warnings.warn(
                 f"Using the NVIDIA PyTorch container with PyPI distributed PyTorch may lead to compatibility issues.\n"
                 f"If you encounter any problems, please delete the environment at `{venv_prefix}` so that "

From 1cc49494fe85127236945320f12b4ebee33245d3 Mon Sep 17 00:00:00 2001
From: Emma Qiao <qqiao@nvidia.com>
Date: Thu, 17 Jul 2025 16:53:15 +0800
Subject: [PATCH 57/88] [Infra] - Add wiave list for pytest when using slurm
 (#6130)

Signed-off-by: qqiao <qqiao@nvidia.com>
---
 jenkins/L0_Test.groovy       | 6 ++++++
 jenkins/scripts/slurm_run.sh | 1 +
 2 files changed, 7 insertions(+)

diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index 941c3efb228..6f6ae7c1186 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -309,6 +309,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
             def llmSrcLocal = "${llmPath}/TensorRT-LLM/src"
             def scriptRunNode = "${jobWorkspace}/slurm_run.sh"
             def testListPathNode = "${jobWorkspace}/${testList}.txt"
+            def waivesListPathNode = "${jobWorkspace}/waives.txt"
             def isAarch64 = config.contains("aarch64")
             def pytestTestTimeout = "7200"
 
@@ -325,6 +326,10 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
                 Utils.exec(pipeline, script: "chmod +x ${scriptRunLocalPath}", returnStdout: true)
                 Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${scriptRunLocalPath} ${remote.user}@${remote.host}:${scriptRunNode}",)
 
+                // Upload waives.txt to Frontend node
+                def waivesListLocalPath = "${llmSrcLocal}/tests/integration/test_lists/waives.txt"
+                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${waivesListLocalPath} ${remote.user}@${remote.host}:${waivesListPathNode}",)
+
                 // Generate Test List and Upload to Frontend Node
                 def makoArgs = getMakoArgsFromStageName(stageName, true)
                 // TODO: currently the options will only be processed if the first
@@ -362,6 +367,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
                     export stageName=$stageName
                     export testList=$testList
                     export testListPathNode=$testListPathNode
+                    export waivesListPathNode=$waivesListPathNode
                     export pytestTestTimeout=$pytestTestTimeout
                     export splits=$splits
                     export splitId=$splitId
diff --git a/jenkins/scripts/slurm_run.sh b/jenkins/scripts/slurm_run.sh
index 9c055d8cd34..4b6337fca5d 100755
--- a/jenkins/scripts/slurm_run.sh
+++ b/jenkins/scripts/slurm_run.sh
@@ -45,6 +45,7 @@ testCmdLines=(
     "-v"
     "--timeout=$pytestTestTimeout"
     "--test-list=$testListPathNode"
+    "--waives-file=$waivesListPathNode"
     "--rootdir $llmSrcNode/tests/integration/defs"
     "--test-prefix=$stageName"
     "--splits $splits"

From 44c70c88f98cfa1aafbeb83f00426ddcfd77904b Mon Sep 17 00:00:00 2001
From: Chuang Zhu <111838961+chuangz0@users.noreply.github.com>
Date: Thu, 17 Jul 2025 17:42:07 +0800
Subject: [PATCH 58/88] chore:[BREAKING CHANGE] use cacheTransceiverConfig as
 knobs for disagg service (#5234)

Signed-off-by: Chuang Zhu <111838961+chuangz0@users.noreply.github.com>
---
 benchmarks/cpp/disaggServerBenchmark.cpp      |   2 +
 .../batch_manager/cacheTransceiver.h          |  19 +--
 cpp/include/tensorrt_llm/executor/executor.h  |  19 ++-
 .../batch_manager/cacheTransBuffer.cpp        |  37 +++--
 .../batch_manager/cacheTransBuffer.h          |   4 +-
 .../batch_manager/cacheTransceiver.cpp        | 150 +++++++++---------
 .../batch_manager/kvCacheManager.cpp          |   9 +-
 .../trtGptModelInflightBatching.cpp           |  38 ++++-
 .../executor/cacheTransceiverConfig.cpp       |  26 ++-
 cpp/tensorrt_llm/executor/serialization.cpp   |  11 +-
 .../pybind/batch_manager/cacheTransceiver.cpp |  17 +-
 .../pybind/executor/executorConfig.cpp        |  39 ++++-
 cpp/tests/executor/disaggExecutorTest.cpp     |   6 +
 .../batch_manager/cacheTransBufferTest.cpp    |  21 ++-
 .../executor/serializeUtilsTest.cpp           |  10 +-
 docs/source/advanced/disaggregated-service.md |  56 ++-----
 docs/source/scripts/disaggregated/gen_yaml.py |   6 +-
 examples/disaggregated/README.md              |  25 ++-
 examples/disaggregated/disagg_config.yaml     |   4 +
 .../_torch/pyexecutor/kv_cache_transceiver.py |  50 +++---
 tensorrt_llm/_torch/pyexecutor/py_executor.py |   2 +
 tensorrt_llm/commands/serve.py                |   1 -
 tensorrt_llm/executor/worker.py               |   4 +
 tensorrt_llm/llmapi/llm_args.py               |  12 +-
 .../accuracy/test_disaggregated_serving.py    |  32 +++-
 .../disagg_config_cache_aware_balance.yaml    |   4 +
 ...onfig_cache_aware_balance_deepseek_v3.yaml |   4 +
 .../disagg_config_cache_reuse.yaml            |   4 +
 ...disagg_config_cache_reuse_deepseek_v3.yaml |   4 +
 .../disagg_config_conditional.yaml            |   4 +
 ...disagg_config_conditional_deepseek_v3.yaml |   4 +
 ...config_ctxtp1_gentp1_deepseek_v3_lite.yaml |   4 +
 ...txtp1_gentp1_deepseek_v3_lite_one_mtp.yaml |   4 +
 ..._v3_lite_one_mtp_attention_dp_overlap.yaml |   4 +
 ...txtp1_gentp1_deepseek_v3_lite_two_mtp.yaml |   4 +
 .../disagg_config_ctxtp2_gentp1.yaml          |   4 +
 ...sagg_config_ctxtp2_gentp1_trt_backend.yaml |   4 +
 ...config_ctxtp2_gentp2_deepseek_v3_lite.yaml |   4 +
 ..._gentp2_deepseek_v3_lite_attention_dp.yaml |   4 +
 ...tp2_deepseek_v3_lite_attention_dp_one.yaml |   4 +
 ...deepseek_v3_lite_attention_dp_one_mtp.yaml |   5 +
 ...deepseek_v3_lite_attention_dp_overlap.yaml |   4 +
 ..._lite_attention_dp_overlap_cuda_graph.yaml |   4 +
 ...ig_ctxtp2_gentp2_deepseek_v3_lite_mpi.yaml |  22 +++
 ...g_ctxtp2_gentp2_deepseek_v3_lite_nixl.yaml |  22 +++
 ...2_deepseek_v3_lite_overlap_cuda_graph.yaml |   4 +
 ...ig_ctxtp2_gentp2_deepseek_v3_lite_ucx.yaml |  22 +++
 .../disagg_config_cuda_graph_padding.yaml     |   4 +
 .../test_configs/disagg_config_gen_only.yaml  |   2 +
 .../disagg_config_gen_only_trt_backend.yaml   |   2 +
 .../disagg_config_load_balance.yaml           |   4 +
 .../test_configs/disagg_config_mixed.yaml     |   4 +
 .../test_configs/disagg_config_ngram.yaml     |   4 +
 .../test_configs/disagg_config_overlap.yaml   |   4 +
 .../disagg_config_trt_backend.yaml            |   4 +
 .../disagg_config_trtllm_sampler.yaml         |   4 +
 .../defs/disaggregated/test_disaggregated.py  |  36 +++--
 .../disaggregated/test_disaggregated_etcd.py  |   6 +-
 .../test_disaggregated_single_gpu.py          |  33 ++--
 .../test_lists/qa/examples_test_list.txt      |   2 +-
 .../test_lists/qa/llm_sanity_test.txt         |   2 +-
 .../test_lists/test-db/l0_dgx_h100.yml        |   2 +-
 tests/integration/test_lists/waives.txt       |   3 -
 .../bindings/test_executor_bindings.py        |   6 +-
 64 files changed, 600 insertions(+), 265 deletions(-)
 create mode 100644 tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_mpi.yaml
 create mode 100644 tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_nixl.yaml
 create mode 100644 tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_ucx.yaml

diff --git a/benchmarks/cpp/disaggServerBenchmark.cpp b/benchmarks/cpp/disaggServerBenchmark.cpp
index d0b5fb8c864..ab009802757 100644
--- a/benchmarks/cpp/disaggServerBenchmark.cpp
+++ b/benchmarks/cpp/disaggServerBenchmark.cpp
@@ -636,6 +636,8 @@ class DisaggExecutorServer
                                                                                 : texec::DecodingMode::Auto(),
                     benchmarkParams.executorLookaheadConfig, benchmarkParams.medusaChoices));
             executorConfig.setExtendedRuntimePerfKnobConfig(extendedRuntimePerfKnobConfig);
+            executorConfig.setCacheTransceiverConfig(
+                texec::CacheTransceiverConfig(texec::CacheTransceiverConfig::BackendType::DEFAULT));
             constexpr int maxIterationsForRequestStats = 1000;
             if (mEnableCollectKvCacheTransferTime)
             {
diff --git a/cpp/include/tensorrt_llm/batch_manager/cacheTransceiver.h b/cpp/include/tensorrt_llm/batch_manager/cacheTransceiver.h
index 6f9c2f82dd6..c39fee6f940 100644
--- a/cpp/include/tensorrt_llm/batch_manager/cacheTransceiver.h
+++ b/cpp/include/tensorrt_llm/batch_manager/cacheTransceiver.h
@@ -70,28 +70,20 @@ class BaseCacheTransceiver
 class CacheTransceiver : public BaseCacheTransceiver
 {
 public:
-    enum class CommType : std::uint8_t
-    {
-        UNKNOWN = 0,
-        MPI = 1,
-        UCX = 2,
-        NIXL = 3
-    };
-
-    CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheManager, CommType commType,
+    CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheManager,
         executor::kv_cache::CacheState::ModelConfig const& cacheStateModelCfg, runtime::WorldConfig const& worldConfig,
         nvinfer1::DataType dataType,
         executor::kv_cache::CacheState::AttentionType attentionType
         = executor::kv_cache::CacheState::AttentionType::kDEFAULT,
         std::optional<executor::CacheTransceiverConfig> cacheTransceiverConfig = std::nullopt);
 
-    CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheManager, CommType commType,
-        std::vector<SizeType32> numKvHeadsPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
-        runtime::WorldConfig const& worldConfig, nvinfer1::DataType dataType,
+    CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheManager, std::vector<SizeType32> numKvHeadsPerLayer,
+        SizeType32 sizePerHead, SizeType32 tokensPerBlock, runtime::WorldConfig const& worldConfig,
+        nvinfer1::DataType dataType,
         executor::kv_cache::CacheState::AttentionType attentionType
         = executor::kv_cache::CacheState::AttentionType::kDEFAULT,
         std::optional<executor::CacheTransceiverConfig> cacheTransceiverConfig = std::nullopt)
-        : CacheTransceiver(cacheManager, commType,
+        : CacheTransceiver(cacheManager,
             executor::kv_cache::CacheState::ModelConfig{numKvHeadsPerLayer, sizePerHead, tokensPerBlock}, worldConfig,
             dataType, attentionType, cacheTransceiverConfig)
     {
@@ -118,7 +110,6 @@ class CacheTransceiver : public BaseCacheTransceiver
 
     void setContextState(LlmRequest* llmRequest);
 
-    CommType mCommType;
     std::unique_ptr<DataResponder> mDataResponder;
     std::unique_ptr<DataRequester> mDataRequester;
     std::vector<std::pair<LlmRequest*, std::future<void>>> mResponderFutures;
diff --git a/cpp/include/tensorrt_llm/executor/executor.h b/cpp/include/tensorrt_llm/executor/executor.h
index 1cd651cd07c..bba3c31a014 100644
--- a/cpp/include/tensorrt_llm/executor/executor.h
+++ b/cpp/include/tensorrt_llm/executor/executor.h
@@ -1430,18 +1430,29 @@ class LogitsPostProcessorConfig
 class CacheTransceiverConfig
 {
 public:
-    explicit CacheTransceiverConfig(std::optional<size_t> maxNumTokens = std::nullopt);
+    enum class BackendType : std::uint8_t
+    {
+        DEFAULT = 0,
+        MPI = 1,
+        UCX = 2,
+        NIXL = 3
+    };
+    explicit CacheTransceiverConfig(
+        std::optional<BackendType> backendType = std::nullopt, std::optional<size_t> maxNumTokens = std::nullopt);
 
     bool operator==(CacheTransceiverConfig const& other) const;
+    void setBackendType(std::optional<BackendType> backendType);
+    void setMaxTokensInBuffer(std::optional<size_t> maxTokensInBuffer);
 
-    [[nodiscard]] std::optional<size_t> getMaxNumTokens() const;
-    void setMaxNumTokens(size_t maxNumTokens);
+    [[nodiscard]] std::optional<size_t> getMaxTokensInBuffer() const;
+    [[nodiscard]] std::optional<BackendType> getBackendType() const;
 
 private:
+    std::optional<BackendType> mBackendType;
     /// @brief The maximum number of tokens that the CacheTransceiver's pre-allocated buffer can hold. If the number of
     /// kvCache tokens to be transferred for a single request is greater than this value, the performance of the cache
     /// transfer may be degraded.
-    std::optional<size_t> mMaxNumTokens;
+    std::optional<size_t> mMaxTokensInBuffer;
 };
 
 /// @brief Configuration class for the model executor
diff --git a/cpp/tensorrt_llm/batch_manager/cacheTransBuffer.cpp b/cpp/tensorrt_llm/batch_manager/cacheTransBuffer.cpp
index 51b06feaf71..1a3aed54f41 100644
--- a/cpp/tensorrt_llm/batch_manager/cacheTransBuffer.cpp
+++ b/cpp/tensorrt_llm/batch_manager/cacheTransBuffer.cpp
@@ -210,7 +210,7 @@ CacheTransBufferManager::CacheTransBufferManager(
         {
             auto poolIdx = mCacheManager->getBlockManager().getLayerPoolIdx(layerId);
             auto windowSize = static_cast<size_t>(mCacheManager->getBlockManager().getPoolWindowSize(poolIdx));
-            auto validTokenNum = windowSize < maxNumTokens.value() ? windowSize : maxNumTokens.value();
+            auto validTokenNum = (windowSize < maxNumTokens.value() ? windowSize : maxNumTokens.value());
             bufferSizeFromMaxNumToken += validTokenNum * kvCacheByteSizePerTokenPerLayer;
         }
     }
@@ -230,26 +230,37 @@ CacheTransBufferManager::CacheTransBufferManager(
     TLLM_LOG_INFO(
         "CacheTransBufferManager: mMaxNumTokens:%ld, mRecvBufferCount:%ld, "
         "mSendBufferCount:%ld,mTransferBufferSize:%ld, mPreAllocBufferSize:%ld,mOnlyUseDynamicBuffer:%d "
-        "mUseFabricMemory:%d",
+        "mUseFabricMemory:%d mDataType:%d",
         maxNumTokens.has_value() ? maxNumTokens.value() : 0, mRecvBufferCount, mSendBufferCount, mTransferBufferSize,
-        mPreAllocBufferSize, mOnlyUseDynamicBuffer, mUseFabricMemory);
-    bool to_allocate = common::getEnvUseMPIKvCache() || common::getEnvUseUCXKvCache() || common::getEnvUseNixlKvCache();
+        mPreAllocBufferSize, mOnlyUseDynamicBuffer, mUseFabricMemory, mDataType);
 
-    TLLM_CHECK_WITH_INFO(to_allocate, "CacheTransBufferManager: to_allocate is false");
     allocateBuffer();
 }
 
-size_t CacheTransBufferManager::preAllocBufferSize(std::optional<size_t> maxNumTokens)
+size_t CacheTransBufferManager::preAllocBufferSize(
+    std::map<SizeType32, SizeType32> const& cacheSizeBytesPerTokenPerWindow,
+    std::optional<executor::CacheTransceiverConfig> const& cacheTransceiverConfig)
 {
-    bool to_allocate = common::getEnvUseMPIKvCache() || common::getEnvUseUCXKvCache() || common::getEnvUseNixlKvCache();
-    if (!to_allocate)
+    if (!cacheTransceiverConfig.has_value())
     {
         return 0;
     }
+    if (!cacheTransceiverConfig->getBackendType().has_value())
+    {
+        return 0;
+    }
+    auto maxNumTokens = cacheTransceiverConfig->getMaxTokensInBuffer();
     size_t TransferBufferSize = common::getEnvMemSizeForKVCacheTransferBuffer();
     if (maxNumTokens.has_value())
     {
-        TransferBufferSize = maxNumTokens.value();
+        TransferBufferSize = 0;
+        for (auto const& [windowSize, cacheSizeBytesPerToken] : cacheSizeBytesPerTokenPerWindow)
+        {
+            auto validTokenNum
+                = (static_cast<size_t>(windowSize) < maxNumTokens.value() ? static_cast<size_t>(windowSize)
+                                                                          : maxNumTokens.value());
+            TransferBufferSize += validTokenNum * cacheSizeBytesPerToken;
+        }
     }
     bool useFabricMemory = FabricMemory::supportFbaricMemory()
         && (!(common::getEnvKVCacheTransferUseSyncBuffer() || common::getEnvKVCacheTransferUseAsyncBuffer()));
@@ -329,6 +340,14 @@ std::tuple<std::vector<runtime::ITensor::SharedPtr>, size_t, bool> CacheTransBuf
     size_t bufferCoverTargetNum = std::min(
         static_cast<size_t>(targetNum), mTransferBufferSize / (targetBufferEleSize * common::getDTypeSize(mDataType)));
     TLLM_LOG_DEBUG("getOrAllocateBuffers bufferCoverTargetNum:%d", bufferCoverTargetNum);
+    if (bufferCoverTargetNum < static_cast<size_t>(targetNum))
+    {
+        TLLM_LOG_WARNING(
+            "CacheTransceiver getOrAllocateBuffers: bufferCoverTargetNum:%d < targetNum:%d, may use dynamic buffer, "
+            "it's better to increase MaxTokensInBuffer in cacheTransceiverConfig, otherwise, the performance may "
+            "be degraded",
+            bufferCoverTargetNum, targetNum);
+    }
     if (bufferId.has_value())
     {
         TLLM_CHECK(static_cast<size_t>(bufferId.value()) < concurrenceResource.mBuffers.size());
diff --git a/cpp/tensorrt_llm/batch_manager/cacheTransBuffer.h b/cpp/tensorrt_llm/batch_manager/cacheTransBuffer.h
index d534e2b4ac6..e7b050388fe 100644
--- a/cpp/tensorrt_llm/batch_manager/cacheTransBuffer.h
+++ b/cpp/tensorrt_llm/batch_manager/cacheTransBuffer.h
@@ -18,6 +18,7 @@
 #pragma once
 
 #include "tensorrt_llm/batch_manager/kvCacheManager.h"
+#include "tensorrt_llm/executor/executor.h"
 #include "tensorrt_llm/runtime/bufferManager.h"
 #include "tensorrt_llm/runtime/iTensor.h"
 #include <atomic>
@@ -59,7 +60,8 @@ class CacheTransBufferManager
     CacheTransBufferManager(
         KVCacheManager::BaseKVCacheManager* cacheManager, std::optional<size_t> maxNumTokens = std::nullopt);
 
-    static size_t preAllocBufferSize(std::optional<size_t> maxNumTokens = std::nullopt);
+    static size_t preAllocBufferSize(std::map<SizeType32, SizeType32> const& cacheSizeBytesPerTokenPerWindow,
+        std::optional<executor::CacheTransceiverConfig> const& cacheTransceiverConfig = std::nullopt);
 
     std::optional<int> assignBufferIndexForSend();
     void freeBufferIndexForSend(std::optional<int> bufferId);
diff --git a/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp b/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp
index 3dd85b7dd4f..599a89cef03 100644
--- a/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp
+++ b/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp
@@ -62,41 +62,49 @@ std::unique_ptr<BaseCacheTransceiver> CacheTransceiverFactory::createCacheTransc
     runtime::WorldConfig const& worldConfig, executor::kv_cache::CacheState::AttentionType attentionType,
     std::optional<executor::CacheTransceiverConfig> cacheTransceiverConfig)
 {
-
-    std::optional<CacheTransceiver::CommType> commType;
-    if (common::getEnvUseUCXKvCache())
-    {
-        commType = CacheTransceiver::CommType::UCX;
-        TLLM_LOG_INFO("Enable UCX KV cache transport.");
-    }
-    else if (common::getEnvUseNixlKvCache())
+    if (!cacheTransceiverConfig.has_value() || !cacheTransceiverConfig.value().getBackendType().has_value())
     {
-        commType = CacheTransceiver::CommType::NIXL;
-        TLLM_LOG_INFO("Enable NIXL KV cache transport.");
+        TLLM_LOG_INFO("CacheTransceiver is disabled.");
+        return nullptr;
     }
-    else if (common::getEnvUseMPIKvCache())
+    auto backendType = cacheTransceiverConfig.value().getBackendType();
+    if (backendType.value() == executor::CacheTransceiverConfig::BackendType::DEFAULT)
     {
-        commType = CacheTransceiver::CommType::MPI;
-        TLLM_LOG_INFO("Enable MPI KV cache transport.");
+        if (common::getEnvUseUCXKvCache())
+        {
+            backendType = executor::CacheTransceiverConfig::BackendType::UCX;
+            TLLM_LOG_INFO("Enable UCX KV cache transport.");
+        }
+        else if (common::getEnvUseNixlKvCache())
+        {
+            backendType = executor::CacheTransceiverConfig::BackendType::NIXL;
+            TLLM_LOG_INFO("Enable NIXL KV cache transport.");
+        }
+        else if (common::getEnvUseMPIKvCache())
+        {
+            backendType = executor::CacheTransceiverConfig::BackendType::MPI;
+            TLLM_LOG_INFO("Enable MPI KV cache transport.");
+            TLLM_LOG_WARNING("MPI KV cache transport is deprecated, please use UCX or NIXL instead.");
+        }
+        else
+        {
+            backendType = executor::CacheTransceiverConfig::BackendType::UCX;
+        }
     }
+    cacheTransceiverConfig.value().setBackendType(backendType);
 
-    if (commType)
-    {
-        executor::kv_cache::CacheState::ModelConfig cacheStateCfg{
-            modelConfig.getNumKvHeadsPerLayer(), modelConfig.getSizePerHead(), modelConfig.getTokensPerBlock()};
+    executor::kv_cache::CacheState::ModelConfig cacheStateCfg{
+        modelConfig.getNumKvHeadsPerLayer(), modelConfig.getSizePerHead(), modelConfig.getTokensPerBlock()};
 
-        return std::make_unique<CacheTransceiver>(cacheManager, commType.value(), cacheStateCfg, worldConfig,
-            modelConfig.getKvDataType(), attentionType, cacheTransceiverConfig);
-    }
-    return nullptr;
+    return std::make_unique<CacheTransceiver>(
+        cacheManager, cacheStateCfg, worldConfig, modelConfig.getKvDataType(), attentionType, cacheTransceiverConfig);
 }
 
-CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheManager, CommType commType,
+CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheManager,
     executor::kv_cache::CacheState::ModelConfig const& cacheStateModelCfg, runtime::WorldConfig const& worldConfig,
     nvinfer1::DataType dataType, executor::kv_cache::CacheState::AttentionType attentionType,
     std::optional<executor::CacheTransceiverConfig> cacheTransceiverConfig)
-    : mCommType{commType}
-    , mMpiGroupComm(std::addressof(tensorrt_llm::mpi::MpiComm::session()))
+    : mMpiGroupComm(std::addressof(tensorrt_llm::mpi::MpiComm::session()))
     , mCacheTransceiverConfig{cacheTransceiverConfig}
 {
     using tensorrt_llm::batch_manager::kv_cache_manager::CacheFormatter;
@@ -138,59 +146,59 @@ CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheMa
         }
     }
     bool isMLA = attentionType == executor::kv_cache::CacheState::AttentionType::kMLA;
-    if (mCommType == CommType::MPI || mCommType == CommType::UCX || mCommType == CommType::NIXL)
-    {
-        std::optional<size_t> maxNumTokens = std::nullopt;
-        if (mCacheTransceiverConfig.has_value())
-        {
-            maxNumTokens = mCacheTransceiverConfig.value().getMaxNumTokens();
-        }
-        mCacheTransBufferManager
-            = std::make_unique<kv_cache_manager::CacheTransBufferManager>(cacheManager, maxNumTokens);
-        if (mCommType == CommType::UCX)
-        {
-            std::lock_guard<std::mutex> lock(mDllMutex);
-            mWrapperLibHandle = dllOpen(UCX_WRAPPER_LIB_NAME);
-            TLLM_CHECK_WITH_INFO(mWrapperLibHandle != nullptr, "UCX wrapper library is not open correctly.");
-            auto load_sym = [](void* handle, char const* name)
-            {
-                void* ret = dllGetSym(handle, name);
-                TLLM_CHECK_WITH_INFO(ret != nullptr,
-                    "Unable to load UCX wrapper library symbol, possible cause is that TensorRT-LLM library is not "
-                    "built with UCX support, please rebuild in UCX-enabled environment.");
-                return ret;
-            };
-            std::unique_ptr<tensorrt_llm::executor::kv_cache::ConnectionManager> (*makeUcxConnectionManager)();
-            *(void**) (&makeUcxConnectionManager) = load_sym(mWrapperLibHandle, "makeUcxConnectionManager");
-            mManager = makeUcxConnectionManager();
-            TLLM_LOG_INFO("UCX Connection Manager created");
-        }
-        else if (mCommType == CommType::NIXL)
-        {
-            mManager = std::make_unique<tensorrt_llm::executor::kv_cache::AgentConnectionManager>(
-                mCacheTransBufferManager.get());
-            TLLM_LOG_INFO("NIXL Connection Manager created");
-        }
-        else
-        {
-            mMpiWorldComm = std::addressof(tensorrt_llm::mpi::MpiComm::world());
-            mManager = std::make_unique<executor::kv_cache::MpiConnectionManager>(mMpiWorldComm);
-            TLLM_LOG_INFO("MPI Connection Manager created");
-        }
+    TLLM_CHECK_WITH_INFO(mCacheTransceiverConfig.has_value(), "CacheTransceiverConfig is not set.");
+    auto backendType = mCacheTransceiverConfig.value().getBackendType();
+    TLLM_CHECK_WITH_INFO(
+        backendType.has_value() && (backendType.value() != executor::CacheTransceiverConfig::BackendType::DEFAULT),
+        " CacheTransceiverConfig::BackendType is not set.");
 
-        using tensorrt_llm::batch_manager::kv_cache_manager::MLACacheFormatter;
-        auto makeFormatter = [cacheManager, isMLA, this]()
-        { return createCacheFormatter(cacheManager, mCacheTransBufferManager.get(), isMLA); };
+    std::optional<size_t> maxNumTokens = mCacheTransceiverConfig.value().getMaxTokensInBuffer();
 
-        mDataResponder = std::make_unique<DataResponder>(
-            std::make_unique<DataSenderImpl>(mManager.get(), *mCacheState, worldConfig.getRank(), makeFormatter()));
-        mDataRequester = std::make_unique<DataRequester>(
-            std::make_unique<DataReceiverImpl>(mManager.get(), *mCacheState, worldConfig.getRank(), makeFormatter()));
+    mCacheTransBufferManager = std::make_unique<kv_cache_manager::CacheTransBufferManager>(cacheManager, maxNumTokens);
+    if (backendType.value() == executor::CacheTransceiverConfig::BackendType::UCX)
+    {
+        std::lock_guard<std::mutex> lock(mDllMutex);
+        mWrapperLibHandle = dllOpen(UCX_WRAPPER_LIB_NAME);
+        TLLM_CHECK_WITH_INFO(mWrapperLibHandle != nullptr, "UCX wrapper library is not open correctly.");
+        auto load_sym = [](void* handle, char const* name)
+        {
+            void* ret = dllGetSym(handle, name);
+            TLLM_CHECK_WITH_INFO(ret != nullptr,
+                "Unable to load UCX wrapper library symbol, possible cause is that TensorRT-LLM library is not "
+                "built with UCX support, please rebuild in UCX-enabled environment.");
+            return ret;
+        };
+        std::unique_ptr<tensorrt_llm::executor::kv_cache::ConnectionManager> (*makeUcxConnectionManager)();
+        *(void**) (&makeUcxConnectionManager) = load_sym(mWrapperLibHandle, "makeUcxConnectionManager");
+        mManager = makeUcxConnectionManager();
+        TLLM_LOG_INFO("UCX Connection Manager created");
+    }
+    else if (backendType.value() == executor::CacheTransceiverConfig::BackendType::NIXL)
+    {
+        mManager = std::make_unique<tensorrt_llm::executor::kv_cache::AgentConnectionManager>(
+            mCacheTransBufferManager.get());
+        TLLM_LOG_INFO("NIXL Connection Manager created");
+    }
+    else if (backendType.value() == executor::CacheTransceiverConfig::BackendType::MPI)
+    {
+        mMpiWorldComm = std::addressof(tensorrt_llm::mpi::MpiComm::world());
+        mManager = std::make_unique<executor::kv_cache::MpiConnectionManager>(mMpiWorldComm);
+        TLLM_LOG_INFO("MPI Connection Manager created");
     }
     else
     {
-        TLLM_THROW("Unsupported communication type.");
+        TLLM_THROW("Unsupported cache transceiver backend type ");
     }
+
+    using tensorrt_llm::batch_manager::kv_cache_manager::MLACacheFormatter;
+    auto makeFormatter = [cacheManager, isMLA, this]()
+    { return createCacheFormatter(cacheManager, mCacheTransBufferManager.get(), isMLA); };
+
+    mDataResponder = std::make_unique<DataResponder>(
+        std::make_unique<DataSenderImpl>(mManager.get(), *mCacheState, worldConfig.getRank(), makeFormatter()));
+    mDataRequester = std::make_unique<DataRequester>(
+        std::make_unique<DataReceiverImpl>(mManager.get(), *mCacheState, worldConfig.getRank(), makeFormatter()));
+
     initializeCommState();
 }
 
diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
index 540dee9148b..ba3b2a94ede 100644
--- a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
+++ b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp
@@ -2235,13 +2235,8 @@ BlocksPerWindow BaseKVCacheManager::calculateMaxNumBlocks(executor::KvCacheConfi
         cacheSizeBytesPerTokenPerWindow[windowSize] = cacheSizeBytesPerToken;
     }
 
-    auto const extraCostMemoryBytes = extraCostMemory
-        * std::accumulate(cacheSizeBytesPerTokenPerWindow.cbegin(), cacheSizeBytesPerTokenPerWindow.cend(),
-            SizeType32{0}, [](SizeType32 acc, auto const cost) { return acc + cost.second; });
-
-    TLLM_LOG_DEBUG(
-        "extraCostMemoryBytes [all windows] [Gib]: %0.2f", extraCostMemoryBytes / static_cast<double>(1 << 30));
-
+    TLLM_LOG_DEBUG("extraCostMemory [Gib]: %0.2f", extraCostMemory / static_cast<double>(1 << 30));
+    allottedPrimaryMemBytes = allottedPrimaryMemBytes - extraCostMemory;
     auto const tokensPerBlock = modelConfig.getTokensPerBlock();
     auto const calculatePrimaryBlocks
         = [&](SizeType32 windowSize, float windowSizeShare, SizeType32 cacheSizeBytesPerToken)
diff --git a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
index 1bc80ac2156..b36f0856fd5 100644
--- a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
+++ b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
@@ -264,10 +264,35 @@ TrtGptModelInflightBatching::TrtGptModelInflightBatching(std::shared_ptr<nvinfer
     }
     if (mModelConfig.isTransformerBased() && modelConfig.isKVCacheEnabled())
     {
+
+        auto calculateCacheSizePerToken
+            = [](ModelConfig const& modelConfig, WorldConfig const& worldConfig,
+                  std::vector<SizeType32> const& maxAttentionWindowVec, bool isCrossAttention, SizeType32 kvFactor)
+        {
+            auto [numKvHeadsPerLayerBegin, numKvHeadsPerLayerEnd] = modelConfig.getNumKvHeadsPerLayerLocalRange(
+                worldConfig.getPipelineParallelism(), worldConfig.getPipelineParallelRank(), isCrossAttention);
+            auto numKvHeadsPerLayer = std::vector<SizeType32>(numKvHeadsPerLayerBegin, numKvHeadsPerLayerEnd);
+            auto windowSizeLayers
+                = BaseKVCacheManager::groupLayersByWindowSize(maxAttentionWindowVec, modelConfig.getNbLayers());
+            std::map<SizeType32, SizeType32> cacheSizeBytesPerTokenPerWindow;
+            for (auto const& [windowSize, managedLayers] : windowSizeLayers)
+            {
+                auto const cacheSizePerToken = BaseKVCacheManager::calculateCacheSizePerTokenForSingleWindowSize(
+                    modelConfig, managedLayers, isCrossAttention, kvFactor);
+                auto const cacheSizeBytesPerToken
+                    = cacheSizePerToken * BufferDataType(modelConfig.getKvDataType()).getSize();
+                cacheSizeBytesPerTokenPerWindow[windowSize] = cacheSizeBytesPerToken;
+            }
+
+            return cacheSizeBytesPerTokenPerWindow;
+        };
         auto cacheTransceiverConfig
             = executorConfig.getCacheTransceiverConfig().value_or(executor::CacheTransceiverConfig());
-        auto cacheTransPreAllocaSize
-            = kv_cache_manager::CacheTransBufferManager::preAllocBufferSize(cacheTransceiverConfig.getMaxNumTokens());
+
+        auto const cacheSizeBytesPerTokenPerWindow = calculateCacheSizePerToken(
+            mModelConfig, mWorldConfig, getMaxAttentionWindowVec(), mModelConfig.useCrossAttention(), 2);
+        auto cacheTransPreAllocaSize = kv_cache_manager::CacheTransBufferManager::preAllocBufferSize(
+            cacheSizeBytesPerTokenPerWindow, cacheTransceiverConfig);
 
         auto const [freePrimaryMemBytes, freeSecondaryMemBytes]
             = BaseKVCacheManager::calculateFreeMemBytes(mRuntime->getBufferManager(), kvCacheConfig);
@@ -879,8 +904,9 @@ void TrtGptModelInflightBatching::forwardSync()
             {
                 // TODO: skip if sending layer-wise
                 {
-                    TLLM_CHECK_WITH_INFO(
-                        mCacheTransceiver, "Disaggregated serving is not enabled, please check the configuration.");
+                    TLLM_CHECK_WITH_INFO(mCacheTransceiver,
+                        "Disaggregated serving is not enabled, please check the configuration of "
+                        "cacheTransceiverConfig.");
                     mCacheTransceiver->respondAndSendAsync(llmReq.get());
                 }
                 mSeqSlotManager->freeSequenceSlot(llmReq->mRequestId);
@@ -1780,8 +1806,8 @@ void TrtGptModelInflightBatching::executeStep(
         bufferCast<void*>(*mBuffers[bufferId]->transformerBuffers->contextProgressHost)[0] = progress.get();
         if (progress)
         {
-            TLLM_CHECK_WITH_INFO(
-                mCacheTransceiver, "Disaggregated serving is not enabled, please check the configuration.");
+            TLLM_CHECK_WITH_INFO(mCacheTransceiver,
+                "Disaggregated serving is not enabled, please check the configuration of cacheTransceiverConfig.");
             mCacheTransceiver->respondAndSendLayerWise(layerWiseRequests, progress);
         }
     }
diff --git a/cpp/tensorrt_llm/executor/cacheTransceiverConfig.cpp b/cpp/tensorrt_llm/executor/cacheTransceiverConfig.cpp
index 1f392ef0583..6919d213642 100644
--- a/cpp/tensorrt_llm/executor/cacheTransceiverConfig.cpp
+++ b/cpp/tensorrt_llm/executor/cacheTransceiverConfig.cpp
@@ -21,24 +21,36 @@
 namespace tensorrt_llm::executor
 {
 
-CacheTransceiverConfig::CacheTransceiverConfig(std::optional<size_t> maxNumTokens)
-    : mMaxNumTokens(maxNumTokens)
+CacheTransceiverConfig::CacheTransceiverConfig(
+    std::optional<BackendType> backendType, std::optional<size_t> maxNumTokens)
+    : mBackendType(backendType)
+    , mMaxTokensInBuffer(maxNumTokens)
 {
 }
 
 bool CacheTransceiverConfig::operator==(CacheTransceiverConfig const& other) const
 {
-    return mMaxNumTokens == other.mMaxNumTokens;
+    return mMaxTokensInBuffer == other.mMaxTokensInBuffer && mBackendType == other.mBackendType;
 }
 
-std::optional<size_t> CacheTransceiverConfig::getMaxNumTokens() const
+void CacheTransceiverConfig::setBackendType(std::optional<BackendType> backendType)
 {
-    return mMaxNumTokens;
+    mBackendType = backendType;
 }
 
-void CacheTransceiverConfig::setMaxNumTokens(size_t maxNumTokens)
+void CacheTransceiverConfig::setMaxTokensInBuffer(std::optional<size_t> maxTokensInBuffer)
 {
-    mMaxNumTokens = maxNumTokens;
+    mMaxTokensInBuffer = maxTokensInBuffer;
+}
+
+std::optional<CacheTransceiverConfig::BackendType> CacheTransceiverConfig::getBackendType() const
+{
+    return mBackendType;
+}
+
+std::optional<size_t> CacheTransceiverConfig::getMaxTokensInBuffer() const
+{
+    return mMaxTokensInBuffer;
 }
 
 } // namespace tensorrt_llm::executor
diff --git a/cpp/tensorrt_llm/executor/serialization.cpp b/cpp/tensorrt_llm/executor/serialization.cpp
index 2ea6c26dc73..65718f0405d 100644
--- a/cpp/tensorrt_llm/executor/serialization.cpp
+++ b/cpp/tensorrt_llm/executor/serialization.cpp
@@ -1258,19 +1258,22 @@ size_t Serialization::serializedSize(SchedulerConfig const& schedulerConfig)
 // CacheTransceiverConfig
 CacheTransceiverConfig Serialization::deserializeCacheTransceiverConfig(std::istream& is)
 {
-    auto maxNumTokens = su::deserialize<std::optional<size_t>>(is);
-    return CacheTransceiverConfig{maxNumTokens};
+    auto backendType = su::deserialize<std::optional<CacheTransceiverConfig::BackendType>>(is);
+    auto maxTokensInBuffer = su::deserialize<std::optional<size_t>>(is);
+    return CacheTransceiverConfig{backendType, maxTokensInBuffer};
 }
 
 void Serialization::serialize(CacheTransceiverConfig const& cacheTransceiverConfig, std::ostream& os)
 {
-    su::serialize(cacheTransceiverConfig.getMaxNumTokens(), os);
+    su::serialize(cacheTransceiverConfig.getBackendType(), os);
+    su::serialize(cacheTransceiverConfig.getMaxTokensInBuffer(), os);
 }
 
 size_t Serialization::serializedSize(CacheTransceiverConfig const& cacheTransceiverConfig)
 {
     size_t totalSize = 0;
-    totalSize += su::serializedSize(cacheTransceiverConfig.getMaxNumTokens());
+    totalSize += su::serializedSize(cacheTransceiverConfig.getBackendType());
+    totalSize += su::serializedSize(cacheTransceiverConfig.getMaxTokensInBuffer());
     return totalSize;
 }
 
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/cacheTransceiver.cpp b/cpp/tensorrt_llm/pybind/batch_manager/cacheTransceiver.cpp
index 87b0a26a79e..d92336e6bdf 100644
--- a/cpp/tensorrt_llm/pybind/batch_manager/cacheTransceiver.cpp
+++ b/cpp/tensorrt_llm/pybind/batch_manager/cacheTransceiver.cpp
@@ -22,6 +22,7 @@
 #include <ATen/ATen.h>
 #include <pybind11/functional.h>
 #include <pybind11/operators.h>
+#include <pybind11/pytypes.h>
 #include <pybind11/stl.h>
 #include <pybind11/stl_bind.h>
 #include <torch/extension.h>
@@ -80,21 +81,15 @@ void tb::CacheTransceiverBindings::initBindings(py::module_& m)
         .def("check_gen_transfer_status", &BaseCacheTransceiver::checkGenTransferStatus)
         .def("check_gen_transfer_complete", &BaseCacheTransceiver::checkGenTransferComplete);
 
-    py::enum_<tb::CacheTransceiver::CommType>(m, "CommType")
-        .value("UNKNOWN", tb::CacheTransceiver::CommType::UNKNOWN)
-        .value("MPI", tb::CacheTransceiver::CommType::MPI)
-        .value("UCX", tb::CacheTransceiver::CommType::UCX)
-        .value("NIXL", tb::CacheTransceiver::CommType::NIXL);
-
     py::enum_<executor::kv_cache::CacheState::AttentionType>(m, "AttentionType")
         .value("DEFAULT", executor::kv_cache::CacheState::AttentionType::kDEFAULT)
         .value("MLA", executor::kv_cache::CacheState::AttentionType::kMLA);
 
     py::classh<tb::CacheTransceiver, tb::BaseCacheTransceiver>(m, "CacheTransceiver")
-        .def(py::init<tb::kv_cache_manager::BaseKVCacheManager*, tb::CacheTransceiver::CommType,
-                 std::vector<SizeType32>, SizeType32, SizeType32, runtime::WorldConfig, nvinfer1::DataType,
-                 executor::kv_cache::CacheState::AttentionType, std::optional<executor::CacheTransceiverConfig>>(),
-            py::arg("cache_manager"), py::arg("comm_type"), py::arg("num_kv_heads_per_layer"), py::arg("size_per_head"),
+        .def(py::init<tb::kv_cache_manager::BaseKVCacheManager*, std::vector<SizeType32>, SizeType32, SizeType32,
+                 runtime::WorldConfig, nvinfer1::DataType, executor::kv_cache::CacheState::AttentionType,
+                 std::optional<executor::CacheTransceiverConfig>>(),
+            py::arg("cache_manager"), py::arg("num_kv_heads_per_layer"), py::arg("size_per_head"),
             py::arg("tokens_per_block"), py::arg("world_config"), py::arg("dtype"), py::arg("attention_type"),
             py::arg("cache_transceiver_config") = std::nullopt);
 
@@ -102,5 +97,5 @@ void tb::CacheTransceiverBindings::initBindings(py::module_& m)
         .def(py::init<tb::kv_cache_manager::BaseKVCacheManager*, std::optional<size_t>>(), py::arg("cache_manager"),
             py::arg("max_num_tokens") = std::nullopt)
         .def_static("pre_alloc_buffer_size", &tb::kv_cache_manager::CacheTransBufferManager::preAllocBufferSize,
-            py::arg("max_num_tokens") = std::nullopt);
+            py::arg("cache_size_bytes_per_token_per_window"), py::arg("cache_transceiver_config") = py::none());
 }
diff --git a/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp b/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp
index 71a0b4af724..bc0d997e337 100644
--- a/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp
+++ b/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp
@@ -407,21 +407,46 @@ void initConfigBindings(pybind11::module_& m)
             "stop_token_ids", &tle::GuidedDecodingConfig::getStopTokenIds, &tle::GuidedDecodingConfig::setStopTokenIds)
         .def(py::pickle(guidedDecodingConfigGetstate, guidedDecodingConfigSetstate));
 
-    auto cacheTransceiverConfigGetstate
-        = [](tle::CacheTransceiverConfig const& self) { return py::make_tuple(self.getMaxNumTokens()); };
+    auto cacheTransceiverConfigGetstate = [](tle::CacheTransceiverConfig const& self)
+    { return py::make_tuple(self.getBackendType(), self.getMaxTokensInBuffer()); };
     auto cacheTransceiverConfigSetstate = [](py::tuple const& state)
     {
-        if (state.size() != 1)
+        if (state.size() != 2)
         {
             throw std::runtime_error("Invalid CacheTransceiverConfig state!");
         }
-        return tle::CacheTransceiverConfig(state[0].cast<std::optional<size_t>>());
+        return tle::CacheTransceiverConfig(
+            state[0].cast<tle::CacheTransceiverConfig::BackendType>(), state[1].cast<std::optional<size_t>>());
     };
 
+    py::enum_<tle::CacheTransceiverConfig::BackendType>(m, "CacheTransceiverBackendType")
+        .value("DEFAULT", tle::CacheTransceiverConfig::BackendType::DEFAULT)
+        .value("MPI", tle::CacheTransceiverConfig::BackendType::MPI)
+        .value("UCX", tle::CacheTransceiverConfig::BackendType::UCX)
+        .value("NIXL", tle::CacheTransceiverConfig::BackendType::NIXL)
+        .def(py::init(
+            [](std::string const& str)
+            {
+                if (str == "DEFAULT" || str == "default")
+                    return tle::CacheTransceiverConfig::BackendType::DEFAULT;
+                if (str == "MPI" || str == "mpi")
+                    return tle::CacheTransceiverConfig::BackendType::MPI;
+                if (str == "UCX" || str == "ucx")
+                    return tle::CacheTransceiverConfig::BackendType::UCX;
+                if (str == "NIXL" || str == "nixl")
+                    return tle::CacheTransceiverConfig::BackendType::NIXL;
+                throw std::runtime_error("Invalid backend type: " + str);
+            }));
+
+    py::implicitly_convertible<std::string, tle::CacheTransceiverConfig::BackendType>();
+
     py::class_<tle::CacheTransceiverConfig>(m, "CacheTransceiverConfig")
-        .def(py::init<std::optional<size_t>>(), py::arg("max_num_tokens") = py::none())
-        .def_property("max_num_tokens", &tle::CacheTransceiverConfig::getMaxNumTokens,
-            &tle::CacheTransceiverConfig::setMaxNumTokens)
+        .def(py::init<std::optional<tle::CacheTransceiverConfig::BackendType>, std::optional<size_t>>(),
+            py::arg("backend") = std::nullopt, py::arg("max_tokens_in_buffer") = std::nullopt)
+        .def_property(
+            "backend", &tle::CacheTransceiverConfig::getBackendType, &tle::CacheTransceiverConfig::setBackendType)
+        .def_property("max_tokens_in_buffer", &tle::CacheTransceiverConfig::getMaxTokensInBuffer,
+            &tle::CacheTransceiverConfig::setMaxTokensInBuffer)
         .def(py::pickle(cacheTransceiverConfigGetstate, cacheTransceiverConfigSetstate));
 
     auto executorConfigGetState = [](py::object const& self)
diff --git a/cpp/tests/executor/disaggExecutorTest.cpp b/cpp/tests/executor/disaggExecutorTest.cpp
index 49c8c00f048..75ab6dccb44 100644
--- a/cpp/tests/executor/disaggExecutorTest.cpp
+++ b/cpp/tests/executor/disaggExecutorTest.cpp
@@ -662,6 +662,8 @@ TEST_P(DisaggParamsTest, DisaggTokenComparison)
     KvCacheConfig kvCacheConfig{true, std::nullopt, std::nullopt, std::nullopt, freeGpuMemoryFraction};
     executorConfig.setKvCacheConfig(kvCacheConfig);
     executorConfig.setRequestStatsMaxIterations(1000);
+    executorConfig.setCacheTransceiverConfig(
+        texec::CacheTransceiverConfig(texec::CacheTransceiverConfig::BackendType::DEFAULT));
     auto manager = tr::BufferManager(std::make_shared<tr::CudaStream>());
     auto const& givenInput = tr::utils::loadNpy(manager, inputPath.string(), tr::MemoryType::kCPU);
     auto [givenInputLengths, nbGivenInputs, maxInputLength] = getGivenInputLengths(*givenInput, modelIds.padId);
@@ -894,6 +896,8 @@ TEST_P(DisaggOrchestratorParamsTest, DisaggTokenComparison)
             spawnProcess ? std::nullopt : std::optional<std::vector<SizeType32>>(participantIdsEachInstance.at(in)),
             orchestratorConfig};
         executorConfig.setParallelConfig(parallelConfig);
+        executorConfig.setCacheTransceiverConfig(
+            texec::CacheTransceiverConfig(texec::CacheTransceiverConfig::BackendType::DEFAULT));
         if (in < contextNum)
         {
             ctxExecutorConfigs.push_back(executorConfig);
@@ -994,6 +998,8 @@ TEST_P(ConditionalDisaggParamsTest, DisaggTokenComparison)
     KvCacheConfig kvCacheConfig{true, std::nullopt, std::nullopt, std::nullopt, freeGpuMemoryFraction};
     executorConfig.setKvCacheConfig(kvCacheConfig);
     executorConfig.setRequestStatsMaxIterations(1000);
+    executorConfig.setCacheTransceiverConfig(
+        texec::CacheTransceiverConfig(CacheTransceiverConfig::BackendType::DEFAULT));
     auto manager = tr::BufferManager(std::make_shared<tr::CudaStream>());
     auto const& givenInput = tr::utils::loadNpy(manager, inputPath.string(), tr::MemoryType::kCPU);
     auto [givenInputLengths, nbGivenInputs, maxInputLength] = getGivenInputLengths(*givenInput, modelIds.padId);
diff --git a/cpp/tests/unit_tests/batch_manager/cacheTransBufferTest.cpp b/cpp/tests/unit_tests/batch_manager/cacheTransBufferTest.cpp
index 996b7b97237..27e1590e6a2 100644
--- a/cpp/tests/unit_tests/batch_manager/cacheTransBufferTest.cpp
+++ b/cpp/tests/unit_tests/batch_manager/cacheTransBufferTest.cpp
@@ -18,6 +18,7 @@
 #include "tensorrt_llm/batch_manager/cacheTransBuffer.h"
 #include "tensorrt_llm/batch_manager/kvCacheManager.h"
 #include "tensorrt_llm/common/envUtils.h"
+#include "tensorrt_llm/executor/executor.h"
 #include "tensorrt_llm/runtime/bufferManager.h"
 #include "tensorrt_llm/runtime/iTensor.h"
 #include <gtest/gtest.h>
@@ -110,8 +111,13 @@ TEST_F(CacheTransBufferTest, TestPreAllocBufferSize)
         size_t sendBufferCount = tensorrt_llm::common::getEnvParallelCacheSend()
             ? tensorrt_llm::common::getEnvKVCacheSendMaxConcurrenceNum()
             : 1;
-        size_t bufferSizeBytes = CacheTransBufferManager::preAllocBufferSize(maxNumTokens)
-            * kvCacheSizePerToken(4, 2, 64, CacheType::kSELFKONLY);
+        size_t cacheSizeBytesPerToken = kvCacheSizePerToken(4, 2, 64, CacheType::kSELFKONLY);
+        std::map<SizeType32, SizeType32> cacheSizeBytesPerTokenPerWindow{
+            {maxBlocksPerSeq * tokensPerBlock, cacheSizeBytesPerToken}};
+        tensorrt_llm::executor::CacheTransceiverConfig cacheTransceiverConfig{
+            tensorrt_llm::executor::CacheTransceiverConfig::BackendType::UCX, maxNumTokens};
+        size_t bufferSizeBytes
+            = CacheTransBufferManager::preAllocBufferSize(cacheSizeBytesPerTokenPerWindow, cacheTransceiverConfig);
         auto bufferId = mTransBufferManager->assignBufferIndexForSend();
         EXPECT_TRUE(bufferId.has_value());
         EXPECT_EQ(bufferId.value(), 0);
@@ -149,15 +155,18 @@ TEST_F(CacheTransBufferTest, TestPreAllocBufferSize2)
         size_t sendBufferCount = tensorrt_llm::common::getEnvParallelCacheSend()
             ? tensorrt_llm::common::getEnvKVCacheSendMaxConcurrenceNum()
             : 1;
-        size_t bufferSizeBytes = CacheTransBufferManager::preAllocBufferSize(maxNumTokens)
-            * kvCacheSizePerToken(4, 2, 64, CacheType::kSELF);
+        size_t cacheSizeBytesPerToken = kvCacheSizePerToken(4, 2, 64, CacheType::kSELF);
+        tensorrt_llm::executor::CacheTransceiverConfig cacheTransceiverConfig{
+            tensorrt_llm::executor::CacheTransceiverConfig::BackendType::UCX, maxNumTokens};
+        std::map<SizeType32, SizeType32> cacheSizeBytesPerTokenPerWindow{
+            {maxBlocksPerSeq * tokensPerBlock, cacheSizeBytesPerToken}};
+        size_t bufferSizeBytes
+            = CacheTransBufferManager::preAllocBufferSize(cacheSizeBytesPerTokenPerWindow, cacheTransceiverConfig);
         auto bufferId = mTransBufferManager->assignBufferIndexForSend();
         EXPECT_TRUE(bufferId.has_value());
         EXPECT_EQ(bufferId.value(), 0);
         EXPECT_EQ(bufferSizeBytes,
             mTransBufferManager->getSendBuffer(bufferId)->getSizeInBytes() * (recvbufferCount + sendBufferCount));
-        TLLM_LOG_INFO("bufferSizeBytes: %ld , getSizeINBytes: %ld", bufferSizeBytes,
-            mTransBufferManager->getSendBuffer(bufferId)->getSizeInBytes() * (recvbufferCount + sendBufferCount));
         mTransBufferManager->freeBufferIndexForSend(bufferId);
         exit(testing::Test::HasFailure() ? 1 : 0);
     }
diff --git a/cpp/tests/unit_tests/executor/serializeUtilsTest.cpp b/cpp/tests/unit_tests/executor/serializeUtilsTest.cpp
index d29cf0350ca..18f7e6f5379 100644
--- a/cpp/tests/unit_tests/executor/serializeUtilsTest.cpp
+++ b/cpp/tests/unit_tests/executor/serializeUtilsTest.cpp
@@ -785,8 +785,8 @@ TEST(SerializeUtilsTest, ExecutorConfig)
         texec::SpeculativeDecodingConfig(true),
         texec::GuidedDecodingConfig(
             texec::GuidedDecodingConfig::GuidedDecodingBackend::kXGRAMMAR, std::initializer_list<std::string>{"eos"}),
-        std::vector{tensorrt_llm::executor::AdditionalModelOutput{"output_name"}}, texec::CacheTransceiverConfig(1024),
-        true, true, true);
+        std::vector{tensorrt_llm::executor::AdditionalModelOutput{"output_name"}},
+        texec::CacheTransceiverConfig(std::nullopt, 1024), true, true, true);
     auto executorConfig2 = serializeDeserialize(executorConfig);
 
     EXPECT_EQ(executorConfig.getMaxBeamWidth(), executorConfig2.getMaxBeamWidth());
@@ -862,7 +862,9 @@ TEST(SerializeUtilsTest, MethodReturnType)
 
 TEST(SerializeUtilsTest, CacheTransceiverConfig)
 {
-    texec::CacheTransceiverConfig cacheTransceiverConfig(1024);
+    texec::CacheTransceiverConfig cacheTransceiverConfig(
+        tensorrt_llm::executor::CacheTransceiverConfig::BackendType::UCX, 1024);
     auto cacheTransceiverConfig2 = serializeDeserialize(cacheTransceiverConfig);
-    EXPECT_EQ(cacheTransceiverConfig.getMaxNumTokens(), cacheTransceiverConfig2.getMaxNumTokens());
+    EXPECT_EQ(cacheTransceiverConfig.getBackendType(), cacheTransceiverConfig2.getBackendType());
+    EXPECT_EQ(cacheTransceiverConfig.getMaxTokensInBuffer(), cacheTransceiverConfig2.getMaxTokensInBuffer());
 }
diff --git a/docs/source/advanced/disaggregated-service.md b/docs/source/advanced/disaggregated-service.md
index 757b1da81f4..426d327c18b 100644
--- a/docs/source/advanced/disaggregated-service.md
+++ b/docs/source/advanced/disaggregated-service.md
@@ -16,8 +16,6 @@ An [architectural and performance overview](../../../docs/source/blogs/tech_blog
 
 TRT-LLM uses some environment variables to control the behavior of disaggregated service.
 
-* `TRTLLM_USE_UCX_KVCACHE`: Specifies whether to use UCX for KV cache transfer. The default value is `0`. This must be enabled when using a disaggregated service.
-
 * `TRTLLM_PARALLEL_CACHE_SEND`: If set to `1`, contextExecutor will attempt to send KV cache for multiple requests in parallel. The default value is `0`.
 
 * `TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP`: If set to `1`, generationExecutor will not overlap KV cache transfer with model inference. The default value is `0`.
@@ -66,55 +64,19 @@ A. Yes, it's recommended that different executor use different GPUs . We support
 
 *Q. How to handle error `Disaggregated serving is not enabled, please check the configuration?`*
 
-A. Please set the environment variables
-```
-export TRTLLM_USE_UCX_KVCACHE=1
-```
+A. please set `backendType` of `CacheTransceiverConfig`.
+```cpp
+ExecutorConfig executorConfig{...};
 
-*Q. Why do some profiling tools show that TRT-LLM's KV cache transfer does not utilize NVLink even on devices equipped with NVLink?*
+executorConfig.setCacheTransceiverConfig(texec::CacheTransceiverConfig(BackendType::DEFAULT));
+```
 
-A. Please check version of `UCX` with `ucx_info -v`.
-If the version of UCX <=1.17, set the environment variables `UCX_RNDV_FRAG_MEM_TYPE=cuda` and `UCX_MEMTYPE_CACHE=n` to enable NVLink. For BlackWell architecture GPUs, UCX version >=1.19 is required to enable NVLink.
-If the version of UCX >=1.18, there are several ways to enable NVLink:
-1. Set the environment variables `TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=0B`,`UCX_CUDA_COPY_ASYNC_MEM_TYPE=cuda`, `UCX_CUDA_COPY_DMABUF=no`, `UCX_MEMTYPE_CACHE=n` and `UCX_RNDV_PIPELINE_ERROR_HANDLING=y`.
-2. Set the environment variables `TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=$Size`, `UCX_MEMTYPE_CACHE=n` and `UCX_RNDV_PIPELINE_ERROR_HANDLING=y`. $Size represents the size of the buffer for KV cache transfer, which is recommended to be larger than the size of the KV cache for the longest request.
+When the environment variable `TRTLLM_USE_MPI_KVCACHE=1` is set, TRT-LLM will transfer the KV cache using `CUDA-aware MPI`. All executor processes involved must share the same MPI world communicator. Consequently, with `TRTLLM_USE_MPI_KVCACHE=1`, TRT-LLM only supports launching multiple executors via `MPI`. Additionally, the `CommunicationMode` for the executors must be set to `kLEADER` or `kORCHESTRATOR` with `SpawnProcesses=false` for the `disaggregated-service`. These restrictions do not apply when `TRTLLM_USE_UCX_KVCACHE=1` is set.
 
 *Q. Does TRT-LLM support using GPU direct RDMA for inter-node KV Cache transfer?*
 
-A. Yes, TRT-LLM supports using GPU direct RDMA for inter-node KV cache transfer, but it is not enabled by default. There are several ways to enable GPU direct RDMA:
-1. Set the environment variables `TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=0B`,`UCX_RNDV_FRAG_MEM_TYPE=cuda`, `UCX_MEMTYPE_CACHE=n` and `UCX_RNDV_PIPELINE_ERROR_HANDLING=y`.
-2. Set the environment variables `TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=$Size`, `UCX_MEMTYPE_CACHE=n` and `UCX_RNDV_PIPELINE_ERROR_HANDLING=y`, $Size represents the size of the buffer for KV cache transfer, which is recommended to be larger than the size of the KV cache for the longest request.
-
-*Q. Are there any guidelines for performance tuning of KV cache transfer?*
-
-A. Depending on the user's use case, certain sets of environment variables can help avoid poor KV cache transfer performance.
-
-Environment Variable Set A
-
-```
-export TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=0B
-export UCX_RNDV_FRAG_MEM_TYPES=cuda
-export UCX_MEMTYPE_CACHE=n
-export UCX_RNDV_PIPELINE_ERROR_HANDLING=y
-```
-This set allows KV cache transfers to utilize NVLink within nodes and GDRDMA between nodes.
-
-Environment Variable Set B
-
-```
-export TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=0B
-export UCX_CUDA_COPY_ASYNC_MEM_TYPE=cuda
-export UCX_CUDA_COPY_DMABUF=no
-export UCX_MEMTYPE_CACHE=n
-export UCX_RNDV_PIPELINE_ERROR_HANDLING=y
-```
-Set B may provide slightly better performance on a single node compared to Set A. However, when transferring KV cache across multiple nodes, it may cause program instability.
+A. Yes, TRT-LLM supports using GPU direct RDMA for inter-node KV cache transfer.
 
-Environment Variable Set C
+*Q. What causes the substantial bandwidth fluctuations in kvCache transfers, especially during the first few requests following service initialization?*
 
-```
-export TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=$Size
-export UCX_MEMTYPE_CACHE=n
-export UCX_RNDV_PIPELINE_ERROR_HANDLING=y
-```
-Set C can achieve better performance than Sets A and B, both within and between nodes. However, if the KV cache size exceeds the specified $Size, performance may degrade.
+A. The communication for kvCache transfer between executors are established dynamically. The connection establishment process incurs significant overhead, which explains the apparently lower kvCache transfer bandwidth observed during the initial requests after service startup. This lower bandwidth reflects the inclusion of connection establishment overhead. When conducting benchmarks, it is recommended to perform a warm-up phase to ensure accurate performance measurements.
diff --git a/docs/source/scripts/disaggregated/gen_yaml.py b/docs/source/scripts/disaggregated/gen_yaml.py
index 1d198a9766d..859a07310ab 100644
--- a/docs/source/scripts/disaggregated/gen_yaml.py
+++ b/docs/source/scripts/disaggregated/gen_yaml.py
@@ -176,7 +176,8 @@ def gen_config_file(config_path: str,
             'disable_overlap_scheduler': True,
             'kv_cache_dtype': 'fp8',
             'cache_transceiver_config': {
-                'max_num_tokens': 8320,
+                'backend': 'default',
+                'max_tokens_in_buffer': 8320,
             },
         },
         'generation_servers': {
@@ -199,7 +200,8 @@ def gen_config_file(config_path: str,
                 'backend': 'TRTLLM',
             },
             'cache_transceiver_config': {
-                'max_num_tokens': 8320,
+                'backend': 'default',
+                'max_tokens_in_buffer': 8320,
             },
         }
     }
diff --git a/examples/disaggregated/README.md b/examples/disaggregated/README.md
index 120706dd01a..13abb8c73d6 100644
--- a/examples/disaggregated/README.md
+++ b/examples/disaggregated/README.md
@@ -4,14 +4,25 @@ To run TRT-LLM in disaggregated mode, you must first launch context (prefill) an
 
 ## Launching context and generation servers using multiple independent `trtllm-serve` commands
 
+We use the `cache_transceiver_config` configuration to set up disaggregated serving, which includes the following parameters:
+
+```
+cache_transceiver_config:
+  backend: <str>
+  max_tokens_in_buffer: <int>
+```
+
+`backend` specifies the communication backend for transferring the kvCache, valid options include `DEFAULT`,`UCX`, `NIXL`, and `MPI`, the default backend is UCX.
+
+`max_tokens_in_buffer` defines the buffer size for kvCache transfers, it is recommended to set this value greater than or equal to the maximum ISL (Input Sequence Length) of all requests for optimal performance.
+
 You can use multiple `trtllm-serve` commands to launch the context and generation servers that will be used
 for disaggregated serving. For example, you could launch two context servers and one generation servers as follows:
 
 ```
-echo -e "disable_overlap_scheduler: True\ncache_transceiver_config:\n  max_num_tokens: 2048" > context_extra-llm-api-config.yml
-echo -e "cache_transceiver_config:\n  max_num_tokens: 2048" > gen_extra-llm-api-config.yml
+echo -e "disable_overlap_scheduler: True\ncache_transceiver_config:\n  backend: UCX\n  max_tokens_in_buffer: 2048" > context_extra-llm-api-config.yml
+echo -e "cache_transceiver_config:\n  backend: UCX\n  max_tokens_in_buffer: 2048" > gen_extra-llm-api-config.yml
 
-export TRTLLM_USE_UCX_KVCACHE=1
 #Context servers
 CUDA_VISIBLE_DEVICES=0 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8001 --backend pytorch --extra_llm_api_options ./context_extra-llm-api-config.yml &> log_ctx_0 &
 CUDA_VISIBLE_DEVICES=1 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8002 --backend pytorch --extra_llm_api_options ./context_extra-llm-api-config.yml &> log_ctx_1 &
@@ -128,6 +139,8 @@ context_servers:
   pipeline_parallel_size: 1
   kv_cache_config:
     free_gpu_memory_fraction: 0.9
+  cache_transceiver_config:
+    backend: UCX
   urls:
       - "localhost:8001"
       - "localhost:8002"
@@ -135,6 +148,8 @@ generation_servers:
   num_instances: 1
   tensor_parallel_size: 1
   pipeline_parallel_size: 1
+  cache_transceiver_config:
+    backend: UCX
   urls:
       - "localhost:8003"
 ```
@@ -143,3 +158,7 @@ Once the context and generation servers are launched, you can again launch the d
 ```
 trtllm-serve disaggregated -c disagg_config.yaml
 ```
+
+## Know Issues
+
+The MPI communication backend for kvCache transfer has been deprecated and may not be supported in the future. When using the MPI backend, the environment variable `TRTLLM_USE_MPI_KVCACHE=1` should be set to avoid conflicts between mpi4py and kvCache transfer.
diff --git a/examples/disaggregated/disagg_config.yaml b/examples/disaggregated/disagg_config.yaml
index 6d5314f235c..ae72c1b074e 100644
--- a/examples/disaggregated/disagg_config.yaml
+++ b/examples/disaggregated/disagg_config.yaml
@@ -10,11 +10,15 @@ context_servers:
   pipeline_parallel_size: 1
   kv_cache_config:
     free_gpu_memory_fraction: 0.2
+  cache_transceiver_config:
+    backend: "default"
   urls:
       - "localhost:8001"
 generation_servers:
   num_instances: 1
   tensor_parallel_size: 1
   pipeline_parallel_size: 1
+  cache_transceiver_config:
+    backend: "default"
   urls:
       - "localhost:8002"
diff --git a/tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py b/tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py
index a7db4910b78..37a82df323b 100644
--- a/tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py
+++ b/tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py
@@ -2,6 +2,7 @@
 from os import getenv
 
 import tensorrt_llm
+from tensorrt_llm import logger
 from tensorrt_llm.bindings import WorldConfig
 from tensorrt_llm.bindings.executor import CacheTransceiverConfig
 from tensorrt_llm.mapping import Mapping
@@ -10,9 +11,9 @@
 from .resource_manager import KVCacheManager
 
 CacheTransceiverCpp = tensorrt_llm.bindings.internal.batch_manager.CacheTransceiver
-CommTypeCpp = tensorrt_llm.bindings.internal.batch_manager.CommType
 AttentionTypeCpp = tensorrt_llm.bindings.internal.batch_manager.AttentionType
 CacheTransBufferManagerCpp = tensorrt_llm.bindings.internal.batch_manager.CacheTransBufferManager
+BackendTypeCpp = tensorrt_llm.bindings.executor.CacheTransceiverBackendType
 
 
 def mapping_to_world_config(mapping: Mapping) -> WorldConfig:
@@ -30,21 +31,27 @@ def create_kv_cache_transceiver(
         mapping: Mapping, kv_cache_manager: KVCacheManager,
         attention_type: AttentionTypeCpp,
         cache_transceiver_config: CacheTransceiverConfig):
-
-    comm_type = None
-    if getenv("TRTLLM_USE_UCX_KVCACHE"):
-        comm_type = CommTypeCpp.UCX
-    elif getenv("TRTLLM_USE_NIXL_KVCACHE"):
-        comm_type = CommTypeCpp.NIXL
-    elif getenv("TRTLLM_USE_MPI_KVCACHE"):
-        comm_type = CommTypeCpp.MPI
-
-    cache_transceiver = None
-    if comm_type is not None:
-        cache_transceiver = BindKvCacheTransceiver(mapping, comm_type,
-                                                   kv_cache_manager,
-                                                   attention_type,
-                                                   cache_transceiver_config)
+    if cache_transceiver_config is None or (cache_transceiver_config.backend
+                                            is None):
+        logger.info("cache_transceiver is disabled")
+        return None
+    if (cache_transceiver_config.backend == BackendTypeCpp.DEFAULT):
+
+        backend_type = BackendTypeCpp.UCX
+        if getenv("TRTLLM_USE_UCX_KVCACHE"):
+            backend_type = BackendTypeCpp.UCX
+        elif getenv("TRTLLM_USE_NIXL_KVCACHE"):
+            backend_type = BackendTypeCpp.NIXL
+        elif getenv("TRTLLM_USE_MPI_KVCACHE"):
+            backend_type = BackendTypeCpp.MPI
+        cache_transceiver_config.backend = backend_type
+
+    if (cache_transceiver_config.backend == BackendTypeCpp.MPI):
+        logger.warning(
+            "MPI CacheTransceiver is deprecated, UCX or NIXL is recommended")
+    cache_transceiver = BindKvCacheTransceiver(mapping, kv_cache_manager,
+                                               attention_type,
+                                               cache_transceiver_config)
 
     return cache_transceiver
 
@@ -78,8 +85,7 @@ def check_gen_transfer_complete(self):
 
 class BindKvCacheTransceiver(KvCacheTransceiver):
 
-    def __init__(self, mapping: Mapping, comm_type: CommTypeCpp,
-                 kv_cache_manager: KVCacheManager,
+    def __init__(self, mapping: Mapping, kv_cache_manager: KVCacheManager,
                  attention_type: AttentionTypeCpp,
                  cache_transceiver_config: CacheTransceiverConfig):
         world_config = mapping_to_world_config(mapping)
@@ -88,7 +94,7 @@ def __init__(self, mapping: Mapping, comm_type: CommTypeCpp,
         tokens_per_block = kv_cache_manager.tokens_per_block
         dtype = kv_cache_manager.dtype
 
-        self.impl = CacheTransceiverCpp(kv_cache_manager.impl, comm_type,
+        self.impl = CacheTransceiverCpp(kv_cache_manager.impl,
                                         num_kv_heads_per_layer, head_dim,
                                         tokens_per_block, world_config, dtype,
                                         attention_type,
@@ -120,7 +126,7 @@ def __init__(self, kv_cache_manager: KVCacheManager, max_num_tokens: int):
                                                max_num_tokens)
 
     @staticmethod
-    def pre_alloc_buffer_size(max_num_tokens: int,
-                              kv_cache_size_per_token: int):
+    def pre_alloc_buffer_size(kv_cache_size_per_token: int,
+                              cache_transceiver_config: CacheTransceiverConfig):
         return CacheTransBufferManagerCpp.pre_alloc_buffer_size(
-            max_num_tokens) * kv_cache_size_per_token
+            kv_cache_size_per_token, cache_transceiver_config)
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py
index c8518c83a81..74c754651d1 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -1346,6 +1346,8 @@ def _fetch_new_requests(self) -> List[RequestQueueItem]:
 
             # In disaggregated serving, we might get either context request or
             # generation request. In IFB, we only get context request from request queue
+            # In IFB, we only get context request from request queue
+
             if self.kv_cache_transceiver:
                 for req_item in new_requests_cur_rank:
                     if req_item.request.request_type == RequestType.REQUEST_TYPE_CONTEXT_ONLY:
diff --git a/tensorrt_llm/commands/serve.py b/tensorrt_llm/commands/serve.py
index ddbcba2a115..35357e658a8 100644
--- a/tensorrt_llm/commands/serve.py
+++ b/tensorrt_llm/commands/serve.py
@@ -429,7 +429,6 @@ def disaggregated_mpi_worker(config_file: Optional[str], log_level: str):
         disagg_cfg.server_configs)
 
     logger.set_level(log_level)
-    os.environ['TRTLLM_USE_MPI_KVCACHE'] = "1"
     set_mpi_comm(sub_comm)
     logger.info(
         f"mpi_session is provided for LLM instance. Global MPI rank: {global_mpi_rank()}, sub-comm MPI rank: {mpi_rank()}"
diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py
index a82d0d71e5f..68fa336db89 100644
--- a/tensorrt_llm/executor/worker.py
+++ b/tensorrt_llm/executor/worker.py
@@ -406,6 +406,10 @@ def _enqueue_request(self, request: GenerationRequest) -> int:
         context_phase_params = None
         request_type = tllm.RequestType.REQUEST_TYPE_CONTEXT_AND_GENERATION
         if request.disaggregated_params is not None:
+            assert (
+                not self._is_pytorch_backend
+                or self.engine.kv_cache_transceiver is not None
+            ), "kv_cache_transceiver is disabled, please set 'cache_transceiver_config: backend:<backend_type>` in config file for disaggregated serving"
             request_type = request.disaggregated_params.get_request_type()
             if request_type == tllm.RequestType.REQUEST_TYPE_GENERATION_ONLY:
                 context_phase_params = request.disaggregated_params.get_context_phase_params(
diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py
index 111d779ef39..27fff5ef13e 100644
--- a/tensorrt_llm/llmapi/llm_args.py
+++ b/tensorrt_llm/llmapi/llm_args.py
@@ -879,12 +879,20 @@ class CacheTransceiverConfig(BaseModel, PybindMirror):
     """
     Configuration for the cache transceiver.
     """
-    max_num_tokens: Optional[int] = Field(
+
+    backend: Optional[Literal["default", "ucx", "nixl", "mpi"]] = Field(
+        default=None,
+        description=
+        "The communication backend type to use for the cache transceiver.")
+
+    max_tokens_in_buffer: Optional[int] = Field(
         default=None,
         description="The max number of tokens the transfer buffer can fit.")
 
     def _to_pybind(self):
-        return _CacheTransceiverConfig(max_num_tokens=self.max_num_tokens)
+        return _CacheTransceiverConfig(
+            backend=self.backend,
+            max_tokens_in_buffer=self.max_tokens_in_buffer)
 
 
 @dataclass
diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py
index 67915d0728f..fee38e723e6 100644
--- a/tests/integration/defs/accuracy/test_disaggregated_serving.py
+++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py
@@ -195,6 +195,8 @@ def test_auto_dtype(self, disable_overlap_scheduler):
         gen_server_config = {
             "disable_overlap_scheduler": disable_overlap_scheduler
         }
+        ctx_server_config["cache_transceiver_config"] = {"backend": "default"}
+        gen_server_config["cache_transceiver_config"] = {"backend": "default"}
         disaggregated_server_config = {
             "hostname": "localhost",
             "port": 8000,
@@ -232,11 +234,17 @@ def test_ngram(self):
         ctx_server_config = {
             "disable_overlap_scheduler": True,
             "kv_cache_config": kv_cache_config,
+            "cache_transceiver_config": {
+                "backend": "default"
+            }
         }
         gen_server_config = {
             "disable_overlap_scheduler": True,
             "speculative_config": speculative_decoding_config,
             "kv_cache_config": kv_cache_config,
+            "cache_transceiver_config": {
+                "backend": "default"
+            }
         }
         disaggregated_server_config = {
             "hostname": "localhost",
@@ -274,13 +282,19 @@ def test_eagle3(self, overlap_scheduler):
             "disable_overlap_scheduler": True,
             "speculative_config": speculative_decoding_config,
             "kv_cache_config": kv_cache_config,
-            "max_num_tokens": 13393 * 2
+            "max_num_tokens": 13393 * 2,
+            "cache_transceiver_config": {
+                "backend": "default"
+            }
         }
         gen_server_config = {
             "disable_overlap_scheduler": not overlap_scheduler,
             "speculative_config": speculative_decoding_config,
             "kv_cache_config": kv_cache_config,
-            "max_num_tokens": 13393 * 2
+            "max_num_tokens": 13393 * 2,
+            "cache_transceiver_config": {
+                "backend": "default"
+            }
         }
         disaggregated_server_config = {
             "hostname": "localhost",
@@ -312,6 +326,8 @@ class TestLlama4ScoutInstruct(LlmapiAccuracyTestHarness):
     def test_auto_dtype(self, overlap_scheduler):
         ctx_server_config = {"disable_overlap_scheduler": True}
         gen_server_config = {"disable_overlap_scheduler": overlap_scheduler}
+        ctx_server_config["cache_transceiver_config"] = {"backend": "default"}
+        gen_server_config["cache_transceiver_config"] = {"backend": "default"}
         disaggregated_server_config = {
             "hostname": "localhost",
             "port": 8000,
@@ -347,6 +363,8 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     def test_auto_dtype(self, overlap_scheduler, mtp_nextn):
         ctx_server_config = {"disable_overlap_scheduler": True}
         gen_server_config = {"disable_overlap_scheduler": not overlap_scheduler}
+        ctx_server_config["cache_transceiver_config"] = {"backend": "default"}
+        gen_server_config["cache_transceiver_config"] = {"backend": "default"}
         if mtp_nextn > 0:
             ctx_server_config["speculative_config"] = {
                 "decoding_type": "MTP",
@@ -389,11 +407,17 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness):
     def test_auto_dtype(self, overlap_scheduler):
         ctx_server_config = {
             "disable_overlap_scheduler": True,
-            "cuda_graph_config": None
+            "cuda_graph_config": None,
+            "cache_transceiver_config": {
+                "backend": "default"
+            }
         }
         gen_server_config = {
             "disable_overlap_scheduler": overlap_scheduler,
-            "cuda_graph_config": None
+            "cuda_graph_config": None,
+            "cache_transceiver_config": {
+                "backend": "default"
+            }
         }
         ctx_server_config["kv_cache_config"] = {
             "max_attention_window": [512, 512, 512, 512, 512, 32768],
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml
index cb776b0f258..6db8a0f1a93 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance.yaml
@@ -20,6 +20,8 @@ context_servers:
     enable_partial_reuse: False
     event_buffer_max_size: 1024
     free_gpu_memory_fraction: 0.1
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8001"
       - "localhost:8002"
@@ -32,6 +34,8 @@ generation_servers:
   max_seq_len: 4096
   tensor_parallel_size: 1
   pipeline_parallel_size: 1
+  cache_transceiver_config:
+    backend: default
   kv_cache_config:
     enable_block_reuse: True
     enable_partial_reuse: False
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance_deepseek_v3.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance_deepseek_v3.yaml
index edb7d62ba00..cc275b98c7c 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance_deepseek_v3.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_aware_balance_deepseek_v3.yaml
@@ -16,6 +16,8 @@ context_servers:
     enable_partial_reuse: True
     event_buffer_max_size: 1024
     free_gpu_memory_fraction: 0.1
+  cache_transceiver_config:
+    backend: "default"
   urls:
       - "localhost:8001"
       - "localhost:8002"
@@ -30,6 +32,8 @@ generation_servers:
     enable_partial_reuse: True
     event_buffer_max_size: 1024
     free_gpu_memory_fraction: 0.1
+  cache_transceiver_config:
+    backend: "default"
   urls:
       - "localhost:8003"
       - "localhost:8004"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml
index 30662441dbd..86da31c42bf 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse.yaml
@@ -14,6 +14,8 @@ context_servers:
     enable_block_reuse: True
     enable_partial_reuse: True
     event_buffer_max_size: 1024
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8001"
 generation_servers:
@@ -27,5 +29,7 @@ generation_servers:
     enable_partial_reuse: True
     event_buffer_max_size: 1024
     free_gpu_memory_fraction: 0.05
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse_deepseek_v3.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse_deepseek_v3.yaml
index 4bcca2967bb..e76a253c1ae 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse_deepseek_v3.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_cache_reuse_deepseek_v3.yaml
@@ -14,6 +14,8 @@ context_servers:
     enable_block_reuse: True
     enable_partial_reuse: True
     event_buffer_max_size: 1024
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8001"
 generation_servers:
@@ -27,5 +29,7 @@ generation_servers:
     enable_partial_reuse: True
     event_buffer_max_size: 1024
     free_gpu_memory_fraction: 0.05
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional.yaml
index daf3c286d7c..2292fe22aaf 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional.yaml
@@ -17,6 +17,8 @@ context_servers:
     enable_partial_reuse: True
     event_buffer_max_size: 1024
     free_gpu_memory_fraction: 0.15
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8001"
 generation_servers:
@@ -30,5 +32,7 @@ generation_servers:
     enable_partial_reuse: True
     event_buffer_max_size: 1024
     free_gpu_memory_fraction: 0.15
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional_deepseek_v3.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional_deepseek_v3.yaml
index 59e713ad91a..345a958fa5e 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional_deepseek_v3.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_conditional_deepseek_v3.yaml
@@ -17,6 +17,8 @@ context_servers:
     enable_partial_reuse: True
     event_buffer_max_size: 1024
     free_gpu_memory_fraction: 0.15
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8001"
 generation_servers:
@@ -30,5 +32,7 @@ generation_servers:
     enable_partial_reuse: True
     event_buffer_max_size: 1024
     free_gpu_memory_fraction: 0.15
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml
index d62a9c42cd9..1f63caed57f 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite.yaml
@@ -9,11 +9,15 @@ context_servers:
   num_instances: 1
   tensor_parallel_size: 1
   pipeline_parallel_size: 1
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8001"
 generation_servers:
   num_instances: 1
   tensor_parallel_size: 1
   pipeline_parallel_size: 1
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml
index 4286a58eef8..97c03fbbcb1 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp.yaml
@@ -13,6 +13,8 @@ context_servers:
   tensor_parallel_size: 1
   pipeline_parallel_size: 1
   enable_attention_dp: true
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8001"
 generation_servers:
@@ -20,5 +22,7 @@ generation_servers:
   tensor_parallel_size: 1
   pipeline_parallel_size: 1
   enable_attention_dp: false
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml
index cf65a53f4ff..25612d4a784 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_one_mtp_attention_dp_overlap.yaml
@@ -13,6 +13,8 @@ context_servers:
   pipeline_parallel_size: 1
   enable_attention_dp: true
   disable_overlap_scheduler: True
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8001"
 generation_servers:
@@ -21,5 +23,7 @@ generation_servers:
   pipeline_parallel_size: 1
   enable_attention_dp: true
   disable_overlap_scheduler: False
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_two_mtp.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_two_mtp.yaml
index eeac6135487..facc4603306 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_two_mtp.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp1_gentp1_deepseek_v3_lite_two_mtp.yaml
@@ -13,6 +13,8 @@ context_servers:
   tensor_parallel_size: 1
   pipeline_parallel_size: 1
   enable_attention_dp: true
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8001"
 generation_servers:
@@ -22,3 +24,5 @@ generation_servers:
   enable_attention_dp: false
   urls:
       - "localhost:8002"
+  cache_transceiver_config:
+    backend: default
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml
index e4ee818e782..729bdf2cf99 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1.yaml
@@ -9,12 +9,16 @@ context_servers:
   num_instances: 1
   tensor_parallel_size: 2
   pipeline_parallel_size: 1
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8001"
 generation_servers:
   num_instances: 2
   tensor_parallel_size: 1
   pipeline_parallel_size: 1
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8002"
       - "localhost:8003"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1_trt_backend.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1_trt_backend.yaml
index 2e64638bafe..bde3132f8a1 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1_trt_backend.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp1_trt_backend.yaml
@@ -6,12 +6,16 @@ context_servers:
   num_instances: 1
   tensor_parallel_size: 2
   pipeline_parallel_size: 1
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8001"
 generation_servers:
   num_instances: 2
   tensor_parallel_size: 1
   pipeline_parallel_size: 1
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8002"
       - "localhost:8003"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml
index 5c560cb77aa..1bc20842867 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml
@@ -9,11 +9,15 @@ context_servers:
   num_instances: 1
   tensor_parallel_size: 2
   pipeline_parallel_size: 1
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8001"
 generation_servers:
   num_instances: 1
   tensor_parallel_size: 2
   pipeline_parallel_size: 1
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml
index 94ac965b19a..28d4c3556e2 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp.yaml
@@ -10,6 +10,8 @@ context_servers:
   tensor_parallel_size: 2
   pipeline_parallel_size: 1
   enable_attention_dp: True
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8001"
 generation_servers:
@@ -17,5 +19,7 @@ generation_servers:
   tensor_parallel_size: 2
   pipeline_parallel_size: 1
   enable_attention_dp: True
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml
index 0cb3ef15351..0d05bef459e 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one.yaml
@@ -10,6 +10,8 @@ context_servers:
   tensor_parallel_size: 2
   pipeline_parallel_size: 1
   enable_attention_dp: true
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8001"
 generation_servers:
@@ -17,5 +19,7 @@ generation_servers:
   tensor_parallel_size: 2
   pipeline_parallel_size: 1
   enable_attention_dp: false
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml
index 8403a61fd6d..fa771b9e30f 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_one_mtp.yaml
@@ -13,6 +13,8 @@ context_servers:
   tensor_parallel_size: 2
   pipeline_parallel_size: 1
   enable_attention_dp: true
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8001"
 generation_servers:
@@ -20,5 +22,8 @@ generation_servers:
   tensor_parallel_size: 2
   pipeline_parallel_size: 1
   enable_attention_dp: false
+  cache_transceiver_config:
+    backend: default
+
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml
index c893c8fff83..9398f7ddd26 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap.yaml
@@ -10,6 +10,8 @@ context_servers:
   pipeline_parallel_size: 1
   enable_attention_dp: True
   disable_overlap_scheduler: True
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8001"
 generation_servers:
@@ -18,5 +20,7 @@ generation_servers:
   pipeline_parallel_size: 1
   enable_attention_dp: True
   disable_overlap_scheduler: False
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml
index 1171fb4f102..f8c04735eb3 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_attention_dp_overlap_cuda_graph.yaml
@@ -9,6 +9,8 @@ context_servers:
   pipeline_parallel_size: 1
   enable_attention_dp: true
   disable_overlap_scheduler: True
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8001"
 generation_servers:
@@ -19,5 +21,7 @@ generation_servers:
   cuda_graph_config:
     enable_padding: False
   disable_overlap_scheduler: False
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_mpi.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_mpi.yaml
new file mode 100644
index 00000000000..912178b7f62
--- /dev/null
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_mpi.yaml
@@ -0,0 +1,22 @@
+hostname: localhost
+port: 8000
+model: DeepSeek-V3-Lite/fp8
+free_gpu_memory_fraction: 0.25
+backend: "pytorch"
+disable_overlap_scheduler: True
+context_servers:
+  num_instances: 1
+  tensor_parallel_size: 2
+  pipeline_parallel_size: 1
+  cache_transceiver_config:
+    backend: "mpi"
+  urls:
+      - "localhost:8001"
+generation_servers:
+  num_instances: 1
+  tensor_parallel_size: 2
+  pipeline_parallel_size: 1
+  cache_transceiver_config:
+    backend: "mpi"
+  urls:
+      - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_nixl.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_nixl.yaml
new file mode 100644
index 00000000000..e4fd09a1ce1
--- /dev/null
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_nixl.yaml
@@ -0,0 +1,22 @@
+hostname: localhost
+port: 8000
+model: DeepSeek-V3-Lite/fp8
+free_gpu_memory_fraction: 0.25
+backend: "pytorch"
+disable_overlap_scheduler: True
+context_servers:
+  num_instances: 1
+  tensor_parallel_size: 2
+  pipeline_parallel_size: 1
+  cache_transceiver_config:
+    backend: "nixl"
+  urls:
+      - "localhost:8001"
+generation_servers:
+  num_instances: 1
+  tensor_parallel_size: 2
+  pipeline_parallel_size: 1
+  cache_transceiver_config:
+    backend: "nixl"
+  urls:
+      - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml
index 18acc70f9ac..9ace31717ec 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_overlap_cuda_graph.yaml
@@ -8,6 +8,8 @@ context_servers:
   tensor_parallel_size: 2
   pipeline_parallel_size: 1
   disable_overlap_scheduler: True
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8001"
 generation_servers:
@@ -17,5 +19,7 @@ generation_servers:
   cuda_graph_config:
     enable_padding: False
   disable_overlap_scheduler: False
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_ucx.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_ucx.yaml
new file mode 100644
index 00000000000..b21637529bf
--- /dev/null
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_ucx.yaml
@@ -0,0 +1,22 @@
+hostname: localhost
+port: 8000
+model: DeepSeek-V3-Lite/fp8
+free_gpu_memory_fraction: 0.25
+backend: "pytorch"
+disable_overlap_scheduler: True
+context_servers:
+  num_instances: 1
+  tensor_parallel_size: 2
+  pipeline_parallel_size: 1
+  cache_transceiver_config:
+    backend: "ucx"
+  urls:
+      - "localhost:8001"
+generation_servers:
+  num_instances: 1
+  tensor_parallel_size: 2
+  pipeline_parallel_size: 1
+  cache_transceiver_config:
+    backend: "ucx"
+  urls:
+      - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml
index 7009df9fd0f..8b992d210cc 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_cuda_graph_padding.yaml
@@ -15,6 +15,8 @@ context_servers:
   cuda_graph_config:
     batch_sizes: [1,3000]
   disable_overlap_scheduler: True
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8001"
 generation_servers:
@@ -31,5 +33,7 @@ generation_servers:
     enable_padding: True
     batch_sizes: [1,4,8,16,24,32]
   disable_overlap_scheduler: True
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only.yaml
index 6777ca485d3..f42ea826c05 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only.yaml
@@ -13,6 +13,8 @@ generation_servers:
     free_gpu_memory_fraction: 0.2
     enable_block_reuse: False
     enable_partial_reuse: False
+  cache_transceiver_config:
+    backend: default
   print_iter_log: True
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_trt_backend.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_trt_backend.yaml
index a0b31eb419c..386a8fba01f 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_trt_backend.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_gen_only_trt_backend.yaml
@@ -11,6 +11,8 @@ generation_servers:
     free_gpu_memory_fraction: 0.2
     enable_block_reuse: False
     enable_partial_reuse: False
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8002"
       - "localhost:8003"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml
index fd42b7fdc0e..f0766a9c6d2 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_load_balance.yaml
@@ -18,6 +18,8 @@ context_servers:
     free_gpu_memory_fraction: 0.15
     enable_partial_reuse: False
   disable_overlap_scheduler: True
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8001"
       - "localhost:8002"
@@ -35,6 +37,8 @@ generation_servers:
     free_gpu_memory_fraction: 0.15
     enable_partial_reuse: False
   disable_overlap_scheduler: False
+  cache_transceiver_config:
+    backend: "default"
   urls:
       - "localhost:8003"
       - "localhost:8004"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml
index e3d8cdb60b9..31e429c440e 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_mixed.yaml
@@ -9,12 +9,16 @@ context_servers:
   num_instances: 1
   tensor_parallel_size: 1
   pipeline_parallel_size: 1
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8001"
 generation_servers:
   num_instances: 2
   tensor_parallel_size: 1
   pipeline_parallel_size: 1
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8001"
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_ngram.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_ngram.yaml
index 667262df4a3..2f779f598ac 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_ngram.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_ngram.yaml
@@ -8,12 +8,16 @@ context_servers:
   num_instances: 1
   tensor_parallel_size: 1
   pipeline_parallel_size: 1
+  cache_transceiver_config:
+    backend: "default"
   urls:
     - "localhost:8001"
 generation_servers:
   num_instances: 1
   tensor_parallel_size: 1
   pipeline_parallel_size: 1
+  cache_transceiver_config:
+    backend: "default"
   urls:
     - "localhost:8002"
   speculative_config:
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml
index ea6719cb55d..5cdafaed341 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_overlap.yaml
@@ -15,6 +15,8 @@ context_servers:
     free_gpu_memory_fraction: 0.2
     enable_partial_reuse: False
   disable_overlap_scheduler: True
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8001"
 generation_servers:
@@ -28,5 +30,7 @@ generation_servers:
     free_gpu_memory_fraction: 0.2
     enable_partial_reuse: False
   disable_overlap_scheduler: False
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_trt_backend.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_trt_backend.yaml
index 9b018dfcd98..fa57d987de4 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_trt_backend.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_trt_backend.yaml
@@ -8,11 +8,15 @@ context_servers:
   pipeline_parallel_size: 1
   kv_cache_config:
     free_gpu_memory_fraction: 0.2
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8001"
 generation_servers:
   num_instances: 1
   tensor_parallel_size: 1
   pipeline_parallel_size: 1
+  cache_transceiver_config:
+    backend: default
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_configs/disagg_config_trtllm_sampler.yaml b/tests/integration/defs/disaggregated/test_configs/disagg_config_trtllm_sampler.yaml
index 7e4f0ddec00..b7ecb48b306 100644
--- a/tests/integration/defs/disaggregated/test_configs/disagg_config_trtllm_sampler.yaml
+++ b/tests/integration/defs/disaggregated/test_configs/disagg_config_trtllm_sampler.yaml
@@ -15,6 +15,8 @@ context_servers:
   kv_cache_config:
     free_gpu_memory_fraction: 0.2
     enable_partial_reuse: False
+  cache_transceiver_config:
+    backend: "default"
   disable_overlap_scheduler: True
   urls:
       - "localhost:8001"
@@ -29,6 +31,8 @@ generation_servers:
   kv_cache_config:
     free_gpu_memory_fraction: 0.2
     enable_partial_reuse: False
+  cache_transceiver_config:
+    backend: "default"
   disable_overlap_scheduler: False
   urls:
       - "localhost:8002"
diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py
index 8648f59d357..251df5bc9dc 100644
--- a/tests/integration/defs/disaggregated/test_disaggregated.py
+++ b/tests/integration/defs/disaggregated/test_disaggregated.py
@@ -59,9 +59,17 @@ def get_test_config(test_desc, example_dir, test_root):
         "conditional": (2,
                         f"{test_configs_root}/disagg_config_conditional.yaml"),
         "ngram": (2, f"{test_configs_root}/disagg_config_ngram.yaml"),
-        "deepseek_v3_lite_fp8":
+        "deepseek_v3_lite_fp8_mpi":
         (4,
-         f"{test_configs_root}/disagg_config_ctxtp2_gentp2_deepseek_v3_lite.yaml"
+         f"{test_configs_root}/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_mpi.yaml"
+         ),
+        "deepseek_v3_lite_fp8_ucx":
+        (4,
+         f"{test_configs_root}/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_ucx.yaml"
+         ),
+        "deepseek_v3_lite_fp8_nixl":
+        (4,
+         f"{test_configs_root}/disagg_config_ctxtp2_gentp2_deepseek_v3_lite_nixl.yaml"
          ),
         "deepseek_v3_lite_fp8_tp1":
         (2,
@@ -129,6 +137,8 @@ def run_disaggregated_test(example_dir,
                            cwd=None):
     """Run disaggregated test with given configuration."""
     cleanup_output_files()
+    run_env = env.copy()
+    run_env["UCX_TLS"] = "^ib"
 
     num_ranks, config_file = get_test_config(test_desc, example_dir,
                                              os.path.dirname(__file__))
@@ -151,14 +161,14 @@ def run_disaggregated_test(example_dir,
                 popen(workers_cmd,
                       stdout=output_workers,
                       stderr=subprocess.STDOUT,
-                      env=env,
+                      env=run_env,
                       cwd=cwd) as workers_proc,
                 # Start server
                 open('output_disagg.log', 'w') as output_disagg,
                 popen(server_cmd,
                       stdout=output_disagg,
                       stderr=subprocess.STDOUT,
-                      env=env,
+                      env=run_env,
                       cwd=cwd) as server_proc):
             client_dir = f"{example_dir}/clients"
             for _ in range(num_iters):
@@ -525,9 +535,10 @@ def test_disaggregated_ngram(disaggregated_test_root, llm_venv,
 @pytest.mark.skip_less_device(4)
 @pytest.mark.parametrize("deepseek_v3_model_root", ['DeepSeek-V3-Lite-fp8'],
                          indirect=True)
-def test_disaggregated_deepseek_v3_lite_fp8(disaggregated_test_root,
-                                            disaggregated_example_root,
-                                            llm_venv, deepseek_v3_model_root):
+def test_disaggregated_deepseek_v3_lite_fp8_mpi(disaggregated_test_root,
+                                                disaggregated_example_root,
+                                                llm_venv,
+                                                deepseek_v3_model_root):
     src_dst_dict = {
         deepseek_v3_model_root:
         f"{llm_venv.get_working_directory()}/DeepSeek-V3-Lite/fp8",
@@ -536,10 +547,11 @@ def test_disaggregated_deepseek_v3_lite_fp8(disaggregated_test_root,
         if not os.path.islink(dst):
             os.makedirs(os.path.dirname(dst), exist_ok=True)
             os.symlink(src, dst, target_is_directory=True)
-
+    env = llm_venv._new_env.copy()
+    env["TRTLLM_USE_MPI_KVCACHE"] = "1"
     run_disaggregated_test(disaggregated_example_root,
-                           "deepseek_v3_lite_fp8",
-                           env=llm_venv._new_env,
+                           "deepseek_v3_lite_fp8_mpi",
+                           env=env,
                            cwd=llm_venv.get_working_directory())
 
 
@@ -607,7 +619,7 @@ def test_disaggregated_deepseek_v3_lite_fp8_ucx(disaggregated_test_root,
     env["TRTLLM_USE_UCX_KVCACHE"] = "1"
     env["UCX_TLS"] = "^ib"
     run_disaggregated_test(disaggregated_example_root,
-                           "deepseek_v3_lite_fp8",
+                           "deepseek_v3_lite_fp8_ucx",
                            env=env,
                            cwd=llm_venv.get_working_directory())
 
@@ -633,7 +645,7 @@ def test_disaggregated_deepseek_v3_lite_fp8_nixl(disaggregated_test_root,
     env["TRTLLM_USE_NIXL_KVCACHE"] = "1"
     env["UCX_TLS"] = "^ib"
     run_disaggregated_test(disaggregated_example_root,
-                           "deepseek_v3_lite_fp8",
+                           "deepseek_v3_lite_fp8_nixl",
                            env=env,
                            cwd=llm_venv.get_working_directory())
 
diff --git a/tests/integration/defs/disaggregated/test_disaggregated_etcd.py b/tests/integration/defs/disaggregated/test_disaggregated_etcd.py
index 5d200d82e73..7521ecde42f 100644
--- a/tests/integration/defs/disaggregated/test_disaggregated_etcd.py
+++ b/tests/integration/defs/disaggregated/test_disaggregated_etcd.py
@@ -244,14 +244,16 @@ def create_config_files(config):
     context_config_content = """pytorch_backend_config:
   disable_overlap_scheduler: True
 cache_transceiver_config:
-  max_num_tokens: 2048"""
+  backend: "default"
+  max_tokens_in_buffer: 2048"""
 
     with open(CONTEXT_CONFIG_FILE, 'w') as file:
         file.write(context_config_content)
 
     # Create generation config file
     generation_config_content = """cache_transceiver_config:
-  max_num_tokens: 2048"""
+  backend: "default"
+  max_tokens_in_buffer: 2048"""
 
     with open(GENERATION_CONFIG_FILE, 'w') as file:
         file.write(generation_config_content)
diff --git a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py
index e0ab570ec5c..1e1859f5aa6 100644
--- a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py
+++ b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py
@@ -11,7 +11,8 @@
 
 from tensorrt_llm import LLM, DisaggregatedParams, SamplingParams
 from tensorrt_llm._utils import set_mpi_comm
-from tensorrt_llm.llmapi import CudaGraphConfig, KvCacheConfig, MpiCommSession
+from tensorrt_llm.llmapi import (CacheTransceiverConfig, CudaGraphConfig,
+                                 KvCacheConfig, MpiCommSession)
 from tensorrt_llm.llmapi.llm_args import EagleDecodingConfig
 
 cloudpickle.register_pickle_by_value(sys.modules[__name__])
@@ -43,7 +44,8 @@ def model_path(model_name):
         raise ValueError(f"Unknown model: {model_name}")
 
 
-async def run_worker(kv_cache_config, pytorch_config, model_name, rank):
+async def run_worker(kv_cache_config, cache_transceiver_config, pytorch_config,
+                     model_name, rank):
     assert isinstance(pytorch_config, dict)
     print(f"Running worker {rank}")
     port_name = MPI.Lookup_name('my_port')
@@ -59,7 +61,8 @@ async def run_worker(kv_cache_config, pytorch_config, model_name, rank):
                   enable_chunked_prefill=False,
                   **pytorch_config,
                   _mpi_session=mpi_session,
-                  kv_cache_config=kv_cache_config)
+                  kv_cache_config=kv_cache_config,
+                  cache_transceiver_config=cache_transceiver_config)
         print(f"LLM created")
     except Exception as e:
         print(f"Error creating LLM: {e}")
@@ -103,9 +106,11 @@ def send_requests_to_worker(requests, worker_rank, intercomm):
     return responses
 
 
-def worker_entry_point(kv_cache_config, pytorch_config, model_name, rank):
+def worker_entry_point(kv_cache_config, cache_transceiver_config,
+                       pytorch_config, model_name, rank):
     return asyncio.run(
-        run_worker(kv_cache_config, pytorch_config, model_name, rank))
+        run_worker(kv_cache_config, cache_transceiver_config, pytorch_config,
+                   model_name, rank))
 
 
 def verify_disaggregated(model, generation_overlap, enable_cuda_graph, prompt,
@@ -125,16 +130,19 @@ def verify_disaggregated(model, generation_overlap, enable_cuda_graph, prompt,
             cuda_graph_config=CudaGraphConfig() if enable_cuda_graph else None))
 
     kv_cache_configs = [KvCacheConfig(max_tokens=2048 * 8) for _ in range(2)]
+    cache_transceiver_configs = [
+        CacheTransceiverConfig(backend="default") for _ in range(2)
+    ]
     model_names = [model_path(model) for _ in range(2)]
     ranks = [0, 1]
     worker_args = list(
-        zip(kv_cache_configs, worker_pytorch_configs, model_names, ranks))
+        zip(kv_cache_configs, cache_transceiver_configs, worker_pytorch_configs,
+            model_names, ranks))
 
     port_name = MPI.Open_port()
     MPI.Publish_name('my_port', port_name)
 
-    with MPIPoolExecutor(max_workers=2, env={"TRTLLM_USE_MPI_KVCACHE":
-                                             "1"}) as executor:
+    with MPIPoolExecutor(max_workers=2, env={"UCX_TLS": "^ib"}) as executor:
         futures = []
         try:
             for worker_arg in worker_args:
@@ -249,18 +257,21 @@ def test_disaggregated_llama_context_capacity(model, enable_cuda_graph,
         KvCacheConfig(max_tokens=128, enable_block_reuse=False, dtype="auto")
         for _ in range(2)
     ]
+    cache_transceiver_configs = [
+        CacheTransceiverConfig(backend="default") for _ in range(2)
+    ]
     model_names = [model_path(model) for _ in range(2)]
     ranks = [0, 1]
     worker_args = list(
-        zip(kv_cache_configs, worker_pytorch_configs, model_names, ranks))
+        zip(kv_cache_configs, cache_transceiver_configs, worker_pytorch_configs,
+            model_names, ranks))
 
     port_name = MPI.Open_port()
     MPI.Publish_name('my_port', port_name)
 
     prompt = "European Union is a political and economic union of 27 countries. The European Union is headquartered in Brussels, Belgium. The first president of the European Union was Jean-Claude Juncker. The current president is Ursula von der Leyen. The European Union is a major economic and political entity."
 
-    with MPIPoolExecutor(max_workers=2, env={"TRTLLM_USE_MPI_KVCACHE":
-                                             "1"}) as executor:
+    with MPIPoolExecutor(max_workers=2, env={"UCX_TLS": "^ib"}) as executor:
         futures = []
         try:
             for worker_arg in worker_args:
diff --git a/tests/integration/test_lists/qa/examples_test_list.txt b/tests/integration/test_lists/qa/examples_test_list.txt
index 0cf65a29aed..0b7a3d7384a 100644
--- a/tests/integration/test_lists/qa/examples_test_list.txt
+++ b/tests/integration/test_lists/qa/examples_test_list.txt
@@ -589,7 +589,7 @@ disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[T
 disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun_trt_backend[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0]
-disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8[DeepSeek-V3-Lite-fp8]
+disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_mpi[DeepSeek-V3-Lite-fp8]
 disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
 disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp[DeepSeek-V3-Lite-fp8]
 disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8]
diff --git a/tests/integration/test_lists/qa/llm_sanity_test.txt b/tests/integration/test_lists/qa/llm_sanity_test.txt
index 19bf09b8b5e..5630dd47312 100644
--- a/tests/integration/test_lists/qa/llm_sanity_test.txt
+++ b/tests/integration/test_lists/qa/llm_sanity_test.txt
@@ -60,7 +60,7 @@ disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_att
 disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8]
 disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp[DeepSeek-V3-Lite-fp8]
 disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
-disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8[DeepSeek-V3-Lite-fp8]
+disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_mpi[DeepSeek-V3-Lite-fp8]
 disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_trtllm_sampler[TinyLlama-1.1B-Chat-v1.0]
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
index 1599b73a44b..e5a6b700786 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
@@ -89,7 +89,7 @@ l0_dgx_h100:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding_4gpus[attention_dp=True-mtp_nextn=0]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding_4gpus[attention_dp=True-mtp_nextn=2]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus_static_eplb
-  - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8[DeepSeek-V3-Lite-fp8]
+  - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_mpi[DeepSeek-V3-Lite-fp8]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp[DeepSeek-V3-Lite-fp8]
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 5380afccf86..e9f4ed4401e 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -417,9 +417,6 @@ test_e2e.py::test_trtllm_bench_llmapi_launch[trt_backend-llama-v3-llama3-8b] SKI
 examples/test_granite.py::test_granite_bf16_lora[granite-3.0-1b-a400m-instruct] SKIP (https://nvbugs/5374145)
 examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5373451)
 examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5360086)
-disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5373962)
-disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5373962)
-disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5373962)
 stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-GUARANTEED_NO_EVICT-pytorch-stress-test] SKIP (https://nvbugs/5375646)
 examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-2-9b-it] SKIP (https://nvbugs/5376087)
 full:GH200/disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8] SKIP (https://nvbugs/5375966)
diff --git a/tests/unittest/bindings/test_executor_bindings.py b/tests/unittest/bindings/test_executor_bindings.py
index 5d9460ffef0..935c4c9bfc3 100644
--- a/tests/unittest/bindings/test_executor_bindings.py
+++ b/tests/unittest/bindings/test_executor_bindings.py
@@ -2463,9 +2463,11 @@ def test_guided_decoding_config_pickle():
 
 
 def test_cache_transceiver_config_pickle():
-    config = trtllm.CacheTransceiverConfig(max_num_tokens=1024)
+    config = trtllm.CacheTransceiverConfig(backend="UCX",
+                                           max_tokens_in_buffer=1024)
     config_copy = pickle.loads(pickle.dumps(config))
-    assert config_copy.max_num_tokens == config.max_num_tokens
+    assert config_copy.backend == config.backend
+    assert config_copy.max_tokens_in_buffer == config.max_tokens_in_buffer
 
 
 def test_executor_config_pickle():

From 21efb500684cde92dbe2f31d39cc8e069b2d57ca Mon Sep 17 00:00:00 2001
From: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
Date: Thu, 17 Jul 2025 17:46:10 +0800
Subject: [PATCH 59/88] [TRTLLM-6406] feat: Enable guided decoding with overlap
 scheduler (#6000)

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
---
 cpp/tensorrt_llm/thop/logitsBitmaskOp.cpp     |  4 ++--
 .../features/feature_combination_matrix.md    |  2 +-
 examples/llm-api/llm_guided_decoding.py       |  9 +++----
 tensorrt_llm/_torch/pyexecutor/_util.py       | 11 +++------
 .../_torch/pyexecutor/guided_decoder.py       | 14 +++++------
 .../_torch/pyexecutor/model_engine.py         | 22 -----------------
 tensorrt_llm/_torch/pyexecutor/py_executor.py | 24 +++++++++++++++++--
 .../_torch/pyexecutor/py_executor_creator.py  | 15 +++++++++++-
 .../defs/accuracy/test_llm_api_pytorch.py     |  2 --
 .../apps/_test_openai_chat_structural_tag.py  |  5 +---
 10 files changed, 53 insertions(+), 55 deletions(-)

diff --git a/cpp/tensorrt_llm/thop/logitsBitmaskOp.cpp b/cpp/tensorrt_llm/thop/logitsBitmaskOp.cpp
index 11b24e7a989..ad4588a6ce5 100644
--- a/cpp/tensorrt_llm/thop/logitsBitmaskOp.cpp
+++ b/cpp/tensorrt_llm/thop/logitsBitmaskOp.cpp
@@ -54,8 +54,8 @@ void logitsBitmask(std::vector<torch::Tensor> const& logits, std::vector<torch::
         bitmaskPtrsHost[i] = reinterpret_cast<uint64_t>(bitmask[i].data_ptr());
     }
 
-    auto logitsPtrs = logitsPtrsHost.to(torch::kCUDA);
-    auto bitmaskPtrs = bitmaskPtrsHost.to(torch::kCUDA);
+    auto logitsPtrs = logitsPtrsHost.to(torch::kCUDA, /*non_blocking=*/true);
+    auto bitmaskPtrs = bitmaskPtrsHost.to(torch::kCUDA, /*non_blocking=*/true);
 
     auto stream = at::cuda::getCurrentCUDAStream(logits[0].get_device()).stream();
 
diff --git a/docs/source/torch/features/feature_combination_matrix.md b/docs/source/torch/features/feature_combination_matrix.md
index 8f8d5defe80..f62c1d33aa4 100644
--- a/docs/source/torch/features/feature_combination_matrix.md
+++ b/docs/source/torch/features/feature_combination_matrix.md
@@ -15,4 +15,4 @@
 | KV Cache Reuse             | Yes               | Yes        | Yes                        | Untested              | Untested        | Untested | Yes                       | No                        | Yes           | Yes              | ---            |                        |                       |                 |
 | Slide Window Attention     | Yes               | Yes        | Yes                        | Untested              | Untested        | Untested | Untested                  | Untested                  | Yes           | Yes              | WIP            | ---                    |                       |                 |
 | Logits Post Processor      | No                | Yes        | Yes                        | No                    | Untested        | No       | No                        | No                        | Yes           | Yes              | Yes            | Yes                    | ---                   |                 |
-| Guided Decoding            | No                | Yes        | Yes                        | Untested              | Yes             | No       | No                        | No                        | Yes           | Yes              | Yes            | Yes                    | Yes                   | ---             |
+| Guided Decoding            | Yes               | Yes        | Yes                        | No                    | Yes             | No       | No                        | No                        | Yes           | Yes              | Yes            | Yes                    | Yes                   | ---             |
diff --git a/examples/llm-api/llm_guided_decoding.py b/examples/llm-api/llm_guided_decoding.py
index a5e0f89244d..e5df98e5da3 100644
--- a/examples/llm-api/llm_guided_decoding.py
+++ b/examples/llm-api/llm_guided_decoding.py
@@ -7,12 +7,9 @@
 
 def main():
 
-    # Specify the guided decoding backend; xgrammar is supported currently.
-    llm = LLM(
-        model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-        guided_decoding_backend='xgrammar',
-        disable_overlap_scheduler=True  # Not supported by xgrammar mode
-    )
+    # Specify the guided decoding backend; xgrammar and llguidance are supported currently.
+    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
+              guided_decoding_backend='xgrammar')
 
     # An example from json-mode-eval
     schema = '{"title": "WirelessAccessPoint", "type": "object", "properties": {"ssid": {"title": "SSID", "type": "string"}, "securityProtocol": {"title": "SecurityProtocol", "type": "string"}, "bandwidth": {"title": "Bandwidth", "type": "string"}}, "required": ["ssid", "securityProtocol", "bandwidth"]}'
diff --git a/tensorrt_llm/_torch/pyexecutor/_util.py b/tensorrt_llm/_torch/pyexecutor/_util.py
index 88e046eb056..29f1c5d3ac8 100644
--- a/tensorrt_llm/_torch/pyexecutor/_util.py
+++ b/tensorrt_llm/_torch/pyexecutor/_util.py
@@ -21,6 +21,7 @@
 from ..speculative import get_spec_decoder
 from .config import PyTorchConfig
 from .config_utils import is_mla, is_nemotron_hybrid
+from .guided_decoder import GuidedDecoder
 from .kv_cache_transceiver import AttentionTypeCpp, create_kv_cache_transceiver
 from .llm_request import ExecutorResponse
 from .model_engine import PyTorchModelEngine
@@ -414,19 +415,12 @@ def create_py_executor_instance(
         start_worker,
         sampler,
         drafter,
+        guided_decoder: Optional[GuidedDecoder] = None,
         lora_config: Optional[LoraConfig] = None,
         garbage_collection_gen0_threshold: Optional[int] = None) -> PyExecutor:
     kv_cache_manager = resources.get(ResourceManagerType.KV_CACHE_MANAGER, None)
 
     spec_config = model_engine.spec_config
-    if mapping.is_last_pp_rank(
-    ) and executor_config.guided_decoding_config is not None:
-        if spec_config is not None:
-            raise ValueError(
-                "Guided decoding is not supported with speculative decoding.")
-        if not pytorch_backend_config.disable_overlap_scheduler:
-            raise ValueError(
-                "Guided decoding is not supported with overlap scheduler.")
 
     logger.info(
         f"max_seq_len={executor_config.max_seq_len}, max_num_requests={executor_config.max_batch_size}, max_num_tokens={executor_config.max_num_tokens}, max_batch_size={executor_config.max_batch_size}"
@@ -543,6 +537,7 @@ def create_py_executor_instance(
         if spec_config is not None else 0,
         kv_cache_transceiver=kv_cache_transceiver,
         draft_model_engine=draft_model_engine,
+        guided_decoder=guided_decoder,
         start_worker=start_worker,
         garbage_collection_gen0_threshold=garbage_collection_gen0_threshold)
 
diff --git a/tensorrt_llm/_torch/pyexecutor/guided_decoder.py b/tensorrt_llm/_torch/pyexecutor/guided_decoder.py
index 756c177a6ea..f1b21339b9a 100644
--- a/tensorrt_llm/_torch/pyexecutor/guided_decoder.py
+++ b/tensorrt_llm/_torch/pyexecutor/guided_decoder.py
@@ -3,11 +3,11 @@
 
 import torch
 
+from ..._utils import nvtx_range
 from ...bindings.executor import GuidedDecodingConfig
 from .grammar_matcher import (GrammarMatcher, GrammarMatcherFactory,
                               LLGuidanceMatcherFactory, XGrammarMatcherFactory)
 from .scheduler import ScheduledRequests
-from .seq_slot_manager import SeqSlotManager
 
 
 class GuidedDecoder:
@@ -49,12 +49,12 @@ def __init__(self, guided_decoding_config: GuidedDecodingConfig,
     def bitmask_size(self) -> int:
         return math.ceil(self.vocab_size_padded / 32)
 
-    def build(self, scheduled_requests: ScheduledRequests,
-              resource_manager: SeqSlotManager) -> None:
+    @nvtx_range("GuidedDecoder.build")
+    def build(self, scheduled_requests: ScheduledRequests) -> None:
         for llm_req in scheduled_requests.all_requests():
             if llm_req.guided_decoding_params is None:
                 continue
-            slot = resource_manager.slot_manager.get_slot(llm_req.request_id)
+            slot = llm_req.py_seq_slot
             if llm_req.is_context_init_state and llm_req.context_current_position == llm_req.prepopulated_prompt_len:
                 self.grammar_matchers[
                     slot] = self.grammar_matcher_factory.create(
@@ -75,8 +75,9 @@ def build(self, scheduled_requests: ScheduledRequests,
                 self.bitmask[slot].copy_(self.bitmask_host[slot],
                                          non_blocking=True)
 
+    @nvtx_range("GuidedDecoder.execute")
     def execute(self, scheduled_requests: ScheduledRequests,
-                logits: torch.Tensor, resource_manager: SeqSlotManager) -> None:
+                logits: torch.Tensor) -> None:
         assert logits.size(0) == len(scheduled_requests.context_requests) + len(
             scheduled_requests.generation_requests)
         torch.cuda.current_stream().wait_stream(self._stream)
@@ -88,8 +89,7 @@ def execute(self, scheduled_requests: ScheduledRequests,
             if llm_req.is_context_init_state and not llm_req.is_last_context_chunk:
                 continue
             batched_logits.append(logits[i])
-            slot = resource_manager.slot_manager.get_slot(llm_req.request_id)
-            batched_bitmask.append(self.bitmask[slot])
+            batched_bitmask.append(self.bitmask[llm_req.py_seq_slot])
 
         if len(batched_logits) > 0:
             torch.ops.trtllm.logits_bitmask(batched_logits, batched_bitmask)
diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
index 5333b940ebc..998da7ed70c 100644
--- a/tensorrt_llm/_torch/pyexecutor/model_engine.py
+++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -21,7 +21,6 @@
 from tensorrt_llm._torch.speculative.mtp import SampleStateTensorsMTP
 from tensorrt_llm._utils import (is_trace_enabled, nvtx_range, release_gc,
                                  torch_dtype_to_str, trace_func)
-from tensorrt_llm.bindings.executor import GuidedDecodingConfig
 from tensorrt_llm.inputs.multimodal import MultimodalParams
 from tensorrt_llm.logger import logger
 from tensorrt_llm.lora_manager import LoraConfig, LoraModelConfig
@@ -53,7 +52,6 @@
 from .config import LoadFormat, PyTorchConfig
 from .config_utils import is_mla
 from .cuda_graph_runner import DecodingCUDAGraphRunner
-from .guided_decoder import GuidedDecoder
 from .layerwise_nvtx_marker import LayerwiseNvtxMarker
 from .resource_manager import (BaseResourceManager, KVCacheManager,
                                ResourceManager, ResourceManagerType)
@@ -258,7 +256,6 @@ def __init__(
         attn_runtime_features: Optional[AttentionRuntimeFeatures] = None,
         dist: Optional[MPIDist] = None,
         spec_config: Optional["DecodingBaseConfig"] = None,
-        guided_decoding_config: Optional[GuidedDecodingConfig] = None,
         lora_config: Optional[LoraConfig] = None,
         is_draft_model: bool = False,
     ):
@@ -313,13 +310,6 @@ def __init__(
         self.dtype = self.model.config.torch_dtype
         self._init_model_capacity()
 
-        self.guided_decoder: Optional[GuidedDecoder] = None
-        if self.mapping.is_last_pp_rank(
-        ) and guided_decoding_config is not None:
-            self.guided_decoder = GuidedDecoder(guided_decoding_config,
-                                                self.batch_size,
-                                                self.model.vocab_size_padded)
-
         self._torch_compile_backend = None
 
         try:
@@ -2091,18 +2081,6 @@ def capture_forward_fn(inputs: Dict[str, Any]):
                     with MoeLoadBalancerIterContext(moe_load_balancer):
                         outputs = maybe_graph.run(inputs)
 
-            # Note: To overlap the CPU and GPU computation as much as possible,
-            # guided_decoder.build should be called immediately after the launch of the single step;
-            # while guided_decoder.execute should be called right before the samplings.
-            # We can insert other CPU computation between them in the future.
-            if self.mapping.is_last_pp_rank(
-            ) and self.guided_decoder is not None:
-                seq_slot_manager = resource_manager.get_resource_manager(
-                    ResourceManagerType.SEQ_SLOT_MANAGER)
-                self.guided_decoder.build(scheduled_requests, seq_slot_manager)
-                self.guided_decoder.execute(scheduled_requests,
-                                            outputs['logits'], seq_slot_manager)
-
             self._execute_logit_post_processors(scheduled_requests, outputs)
 
             return outputs
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py
index 74c754651d1..c402480b7d9 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -31,6 +31,7 @@
 
 from ..distributed import Distributed
 from ..speculative.drafter import Drafter
+from .guided_decoder import GuidedDecoder
 from .kv_cache_transceiver import KvCacheTransceiver
 from .llm_request import (ExecutorRequest, LlmRequest, LlmRequestState,
                           LlmResponse, executor_request_to_llm_request)
@@ -204,6 +205,7 @@ def __init__(self,
                  max_draft_len: int = 0,
                  kv_cache_transceiver: Optional[KvCacheTransceiver] = None,
                  draft_model_engine: Optional[ModelEngine] = None,
+                 guided_decoder: Optional[GuidedDecoder] = None,
                  garbage_collection_gen0_threshold: Optional[int] = None,
                  start_worker: bool = True):
         super(PyExecutor, self).__init__()
@@ -225,6 +227,7 @@ def __init__(self,
         self.enable_attention_dp = model_engine.enable_attention_dp
         self.sampler = sampler
         self.drafter = drafter
+        self.guided_decoder = guided_decoder
         self.dist = dist
         self.disable_overlap_scheduler = disable_overlap_scheduler
 
@@ -801,6 +804,12 @@ def _executor_loop_pp(self):
                             if self._need_return_logits(scheduled_batch):
                                 logits_host = batch_outputs["logits"].to(
                                     "cpu", non_blocking=True)
+
+                            if self.guided_decoder is not None:
+                                self.guided_decoder.build(scheduled_batch)
+                                self.guided_decoder.execute(
+                                    scheduled_batch, batch_outputs['logits'])
+
                             sample_state = self._sample_async(
                                 scheduled_batch, batch_outputs)
                             sample_state.host.logits = logits_host
@@ -978,6 +987,11 @@ def _executor_loop(self):
 
                     batch_outputs = self._forward_step(scheduled_batch)
 
+                    if self.guided_decoder is not None:
+                        self.guided_decoder.build(scheduled_batch)
+                        self.guided_decoder.execute(scheduled_batch,
+                                                    batch_outputs['logits'])
+
                     sample_state = self._sample_async(scheduled_batch,
                                                       batch_outputs)
 
@@ -1126,6 +1140,14 @@ def _executor_loop_overlap(self):
                     batch_outputs = self._forward_step(scheduled_batch,
                                                        previous_tensors_device)
 
+                    if self.previous_batch is not None:
+                        self._update_requests(self.previous_batch.sample_state)
+
+                    if self.guided_decoder is not None:
+                        self.guided_decoder.build(scheduled_batch)
+                        self.guided_decoder.execute(scheduled_batch,
+                                                    batch_outputs['logits'])
+
                     sample_state = self._sample_async(scheduled_batch,
                                                       batch_outputs)
                     assert sample_state is not None, "Sampling failed"
@@ -1159,8 +1181,6 @@ def _executor_loop_overlap(self):
                     self._terminate_ctx_finished_requests()
 
     def _process_previous_batch(self):
-        self._update_requests(self.previous_batch.sample_state)
-
         if self.kv_cache_transceiver and self.previous_batch.ctx_transmission_reqs:
             for req in self.previous_batch.ctx_transmission_reqs:
                 req.state = LlmRequestState.DISAGG_CONTEXT_TRANS_IN_PROGRESS
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
index 09976cb512e..b9eccc90601 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
@@ -24,6 +24,7 @@
                     create_py_executor_instance, instantiate_sampler, is_mla)
 from .config import PyTorchConfig
 from .config_utils import is_mla
+from .guided_decoder import GuidedDecoder
 from .model_engine import PyTorchModelEngine
 from .py_executor import PyExecutor
 
@@ -237,7 +238,6 @@ def create_py_executor(
             attn_runtime_features=attn_runtime_features,
             dist=dist,
             spec_config=spec_config,
-            guided_decoding_config=executor_config.guided_decoding_config,
             lora_config=lora_config,
             checkpoint_loader=executor_config.checkpoint_loader,
         )
@@ -344,6 +344,17 @@ def create_py_executor(
         sampler = instantiate_sampler(model_engine, executor_config,
                                       pytorch_backend_config, mapping)
 
+    guided_decoder: Optional[GuidedDecoder] = None
+    if executor_config.guided_decoding_config is not None:
+        if spec_config is not None:
+            raise ValueError(
+                "Guided decoding is not supported with speculative decoding.")
+        if mapping.is_last_pp_rank():
+            guided_decoder = GuidedDecoder(
+                executor_config.guided_decoding_config,
+                executor_config.max_batch_size,
+                model_engine.model.vocab_size_padded)
+
     resources = {}
     estimating_kv_cache = False
     kv_cache_creator = None
@@ -388,6 +399,7 @@ def create_py_executor(
             start_worker=False,
             sampler=sampler,
             drafter=drafter,
+            guided_decoder=guided_decoder,
             lora_config=lora_config,
             garbage_collection_gen0_threshold=garbage_collection_gen0_threshold,
         )
@@ -430,6 +442,7 @@ def create_py_executor(
                 start_worker=False,
                 sampler=sampler,
                 drafter=drafter,
+                guided_decoder=guided_decoder,
                 lora_config=lora_config,
                 garbage_collection_gen0_threshold=
                 garbage_collection_gen0_threshold,
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index d34a60604bf..8c5b75e65fb 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -287,7 +287,6 @@ def test_guided_decoding(self, backend: str, mocker):
         mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"})
         llm = LLM(self.MODEL_PATH,
                   guided_decoding_backend=backend,
-                  disable_overlap_scheduler=True,
                   cuda_graph_config=CudaGraphConfig())
         with llm:
             task = JsonModeEval(self.MODEL_NAME)
@@ -300,7 +299,6 @@ def test_guided_decoding_4gpus(self, backend: str, mocker):
         mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"})
         with LLM(self.MODEL_PATH,
                  guided_decoding_backend=backend,
-                 disable_overlap_scheduler=True,
                  cuda_graph_config=CudaGraphConfig(),
                  tensor_parallel_size=2,
                  pipeline_parallel_size=2) as llm:
diff --git a/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py b/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
index aeb46a8a0b0..edf6243c912 100644
--- a/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
+++ b/tests/unittest/llmapi/apps/_test_openai_chat_structural_tag.py
@@ -23,10 +23,7 @@ def temp_extra_llm_api_options_file(request):
     temp_dir = tempfile.gettempdir()
     temp_file_path = os.path.join(temp_dir, "extra_llm_api_options.yaml")
     try:
-        extra_llm_api_options_dict = {
-            "guided_decoding_backend": "xgrammar",
-            "disable_overlap_scheduler": True,
-        }
+        extra_llm_api_options_dict = {"guided_decoding_backend": "xgrammar"}
 
         with open(temp_file_path, 'w') as f:
             yaml.dump(extra_llm_api_options_dict, f)

From de60ae47e3ec29c0637878888fe23843e37f5c22 Mon Sep 17 00:00:00 2001
From: Erin <14718778+hchings@users.noreply.github.com>
Date: Thu, 17 Jul 2025 02:59:51 -0700
Subject: [PATCH 60/88] chores: unwaive a few tests for v1.0 (#6107)

Signed-off-by: Erin Ho <14718778+hchings@users.noreply.github.com>
---
 tests/integration/defs/llmapi/test_llm_examples.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/integration/defs/llmapi/test_llm_examples.py b/tests/integration/defs/llmapi/test_llm_examples.py
index 7b31a8648e1..c9775d416dc 100644
--- a/tests/integration/defs/llmapi/test_llm_examples.py
+++ b/tests/integration/defs/llmapi/test_llm_examples.py
@@ -137,7 +137,6 @@ def test_llmapi_quickstart_atexit(llm_root, engine_dir, llm_venv):
     llm_venv.run_cmd([str(script_path)])
 
 
-@pytest.mark.skip(reason="https://nvbugs/5375671")
 @pytest.mark.skip_less_device_memory(80000)
 def test_llmapi_speculative_decoding_mtp(llm_root, engine_dir, llm_venv):
     _run_llmapi_example(llm_root, engine_dir, llm_venv,
@@ -145,7 +144,6 @@ def test_llmapi_speculative_decoding_mtp(llm_root, engine_dir, llm_venv):
                         f"{llm_models_root()}/DeepSeek-V3-Lite/bf16")
 
 
-@pytest.mark.skip(reason="https://nvbugs/5375671")
 @pytest.mark.skip_less_device_memory(80000)
 def test_llmapi_speculative_decoding_eagle3(llm_root, engine_dir, llm_venv):
     _run_llmapi_example(llm_root, engine_dir, llm_venv,

From 9b45499caa217e756bc6d2b9a89e524b63bce00f Mon Sep 17 00:00:00 2001
From: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com>
Date: Thu, 17 Jul 2025 18:05:45 +0800
Subject: [PATCH 61/88] test: update max_beam_width to 1 due to torchsampler
 changes. (#6101)

Signed-off-by: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com>
---
 tests/unittest/llmapi/test_llm_args.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/unittest/llmapi/test_llm_args.py b/tests/unittest/llmapi/test_llm_args.py
index c1bfdcc4001..801a2bf12a9 100644
--- a/tests/unittest/llmapi/test_llm_args.py
+++ b/tests/unittest/llmapi/test_llm_args.py
@@ -372,18 +372,18 @@ class TestTorchLlmArgs:
     def test_runtime_sizes(self):
         llm = TorchLLM(
             llama_model_path,
-            max_beam_width=4,
+            max_beam_width=1,
             max_num_tokens=256,
             max_seq_len=128,
             max_batch_size=8,
         )
 
-        assert llm.args.max_beam_width == 4
+        assert llm.args.max_beam_width == 1
         assert llm.args.max_num_tokens == 256
         assert llm.args.max_seq_len == 128
         assert llm.args.max_batch_size == 8
 
-        assert llm._executor_config.max_beam_width == 4
+        assert llm._executor_config.max_beam_width == 1
         assert llm._executor_config.max_num_tokens == 256
         assert llm._executor_config.max_seq_len == 128
         assert llm._executor_config.max_batch_size == 8

From a7184869001d28ca70a738e9862ea91cb147da8c Mon Sep 17 00:00:00 2001
From: Yi Zhang <187001205+yizhang-nv@users.noreply.github.com>
Date: Thu, 17 Jul 2025 18:24:49 +0800
Subject: [PATCH 62/88] fix: Fix DeepSeek R1 CI (#6129)

Signed-off-by: Yi Zhang <187001205+yizhang-nv@users.noreply.github.com>
---
 tests/integration/defs/accuracy/test_llm_api_pytorch.py | 4 ++--
 tests/integration/test_lists/waives.txt                 | 2 --
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 8c5b75e65fb..4e12889fa98 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -1352,7 +1352,7 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                               attention_dp, cuda_graph, overlap_scheduler,
                               max_batch_size, moe_backend):
 
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.85)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.80)
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
@@ -1374,7 +1374,7 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                  enable_attention_dp=attention_dp,
                  speculative_config=mtp_config) as llm:
 
-            assert llm.args.moe_backend == moe_backend
+            assert llm.args.moe_config.backend == moe_backend
             assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
 
             task = MMLU(self.MODEL_NAME)
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index e9f4ed4401e..cd453839d9a 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -439,5 +439,3 @@ examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float
 test_e2e.py::test_ptp_quickstart SKIP (https://nvbugs/5387762)
 triton_server/test_triton_llm.py::test_llava_onevision[test_basic-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-max_utilization---1-1-1-False-tensorrt_llm_bls] SKIP (https://nvbugs/5396437)
 triton_server/test_triton_llm.py::test_llava_onevision[test_video-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls] SKIP (https://nvbugs/5396437)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] SKIP (https://nvbugs/5397036)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] SKIP (https://nvbugs/5397036)

From 9518e14f69e408ce74f4128522ab5cbf516bb7f1 Mon Sep 17 00:00:00 2001
From: Stanley Sun <190317771+StanleySun639@users.noreply.github.com>
Date: Thu, 17 Jul 2025 18:55:04 +0800
Subject: [PATCH 63/88] test: fix PytestUnknownMarkWarning: Unknown
 pytest.mark.timeout (#6115)

Signed-off-by: Stanley Sun <190317771+StanleySun639@users.noreply.github.com>
---
 tests/integration/defs/pytest.ini | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/integration/defs/pytest.ini b/tests/integration/defs/pytest.ini
index 24b270884c0..69629dce95c 100644
--- a/tests/integration/defs/pytest.ini
+++ b/tests/integration/defs/pytest.ini
@@ -12,3 +12,4 @@ markers =
     skip_less_host_memory: skip when less host memory detected than the requested
     support_fp8: skip when fp8 is not supported on the device
     skip_device_not_contain: skip when the device does not contain the specified keyword
+    timeout: set test timeout in seconds

From 58d22a72f1f2b893b8b937a01c3d827efb4815e6 Mon Sep 17 00:00:00 2001
From: Ziyi Xiong <219238287+ziyixiong-nv@users.noreply.github.com>
Date: Thu, 17 Jul 2025 21:15:01 +0800
Subject: [PATCH 64/88] [TRTLLM-6352][feat] Migrate EAGLE3 and draft/target
 speculation to Drafter (#6007)

Signed-off-by: ziyixiong-nv <fxiong@nvidia.com>
---
 tensorrt_llm/_torch/pyexecutor/py_executor.py | 198 +---------
 .../_torch/pyexecutor/py_executor_creator.py  |   3 +-
 tensorrt_llm/_torch/speculative/drafter.py    |   7 +
 .../_torch/speculative/model_drafter.py       | 353 ++++++++++++++++++
 tensorrt_llm/_torch/speculative/ngram.py      |   7 +-
 tensorrt_llm/_torch/speculative/utils.py      |  20 +-
 6 files changed, 388 insertions(+), 200 deletions(-)
 create mode 100644 tensorrt_llm/_torch/speculative/model_drafter.py

diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py
index c402480b7d9..6826cda6114 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -11,7 +11,7 @@
 import weakref
 from collections import deque, namedtuple
 from contextlib import contextmanager
-from typing import Dict, List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Union
 
 import torch
 
@@ -308,7 +308,7 @@ def __init__(self,
         if is_trace_enabled("TLLM_TRACE_EXECUTOR_LOOP"):
             self.event_loop = trace_func(self.event_loop)
 
-        if self.draft_model_engine is not None:
+        if self.drafter is not None:
             if self.event_loop.__name__ != self._executor_loop.__name__:
                 raise NotImplementedError(
                     "Drafting is not supported for selected executor loop. "
@@ -905,10 +905,6 @@ def _executor_loop_pp(self):
 
     def _executor_loop(self):
         torch.cuda.set_device(self.device_id)
-        is_ngram = hasattr(
-            self.model_engine, "spec_config"
-        ) and self.model_engine.spec_config is not None and self.model_engine.spec_config.spec_dec_mode.is_ngram(
-        )
         with self._profiler() as profile_step:
             sample_state = None
             iter_start_time = time.time()
@@ -931,7 +927,7 @@ def _executor_loop(self):
 
                 self._pad_attention_dp_dummy_request()
 
-                if self.draft_model_engine is not None or is_ngram or self.drafter is not None:
+                if self.drafter is not None:
                     self._prepare_draft_requests(self.active_requests)
 
                 scheduled_batch, fitting_disagg_gen_init_requests, num_fitting_reqs = self._schedule(
@@ -971,11 +967,9 @@ def _executor_loop(self):
                             scheduled_batch)
 
                     self.resource_manager.prepare_resources(scheduled_batch)
-                    if self.draft_model_engine is not None:
-                        self._prepare_draft_tokens(scheduled_batch)
-
                     if self.drafter is not None:
-                        self.drafter.prepare_draft_tokens(scheduled_batch)
+                        self.drafter.prepare_draft_tokens(
+                            scheduled_batch, self.resource_manager)
 
                     if self.kv_cache_transceiver:
                         # For generation requests which have completed KV cache transfer
@@ -1798,188 +1792,6 @@ def _update_requests(self, sample_state: SampleState):
             logger.error(f"Encountered an error in sampling: {error_msg}")
             self._handle_errors(error_msg)
 
-    @nvtx_range("_prepare_draft_batch")
-    def _prepare_draft_batch(
-        self, scheduled_requests: ScheduledRequests
-    ) -> Tuple[ScheduledRequests, Dict[int, LlmRequest]]:
-        """
-        Prepares a batch for the draft model engine. Draft tokens are only produced
-        for generation requests.
-
-        The requests are prepared as follows:
-        1. The first time the draft engine sees a request, it's a context request.
-        2. Otherwise, if draft tokens were accepted on the last target model decoding
-        step, it's a chunked context request (we process all the accepted tokens together).
-        3. Otherwise, it's a generation request.
-        """
-        try:
-            draft_batch = ScheduledRequests()
-
-            for request in scheduled_requests.generation_requests:
-                if request.py_draft_pages_allocated == 0:
-                    # No space for draft tokens.
-                    continue
-
-                # Stop drafting when we hit the max seqlen. We still need dummy draft
-                # tokens attached to the requests to make sure everything works properly
-                # with CUDA graph. These dummy tokens are already added by
-                # _prepare_draft_requests to make the KV cache/scheduler aware of the fact
-                # that we want to do spec decoding, so no need to do anything else here.
-                # This makes the perf for this case suboptimal, but that's OK - this is
-                # a corner case for weird models like the llama 3.1 8b EAGLE3 implementation.
-                if request.max_beam_num_tokens - 1 >= self.draft_model_engine.max_seq_len:
-                    continue
-
-                num_draft_tokens = len(
-                    request.py_last_draft_tokens
-                ) if request.py_last_draft_tokens is not None else 0
-                request.py_draft_tokens = []
-
-                num_accepted_tokens = request.py_num_accepted_draft_tokens
-                num_rejected_tokens = num_draft_tokens - num_accepted_tokens
-                assert num_rejected_tokens >= 0
-
-                spec_config = self.model_engine.spec_config
-                beam_idx = 0
-                input_tokens = spec_config.get_draft_model_prompt(
-                    request.get_tokens()[beam_idx])
-
-                def create_new_request(input_tokens):
-                    return LlmRequest(
-                        request_id=request.py_request_id,
-                        max_new_tokens=request.py_max_new_tokens,
-                        input_tokens=input_tokens,
-                        sampling_config=request.sampling_config,
-                        return_perf_metrics=request.return_perf_metrics,
-                        is_streaming=False,
-                        is_draft=True)
-
-                if request.max_beam_num_tokens - 1 == request.py_prompt_len:
-                    # This is the first time the draft model is seeing this request.
-                    # Prepare a context request. We discard the first token and take
-                    # the newly decoded one - this is the convention for EAGLE 2 and 3.
-                    new_request = create_new_request(input_tokens)
-                    draft_batch.context_requests.append(new_request)
-                elif num_accepted_tokens == 0:
-                    new_request = create_new_request(input_tokens[:-1])
-                    # Explicitly add the last token so get_last_tokens() returns
-                    # the right value
-                    new_request.add_new_token(input_tokens[-1], beam_idx)
-                    new_request.state = LlmRequestState.GENERATION_IN_PROGRESS
-                    draft_batch.generation_requests.append(new_request)
-                else:
-                    new_request = create_new_request(input_tokens)
-                    new_request.context_chunk_size = num_accepted_tokens + 1
-                    new_request.context_current_position = len(
-                        input_tokens) - num_accepted_tokens - 1
-                    new_request.context_chunk_size = num_accepted_tokens + 1
-                    new_request.context_current_position = len(
-                        input_tokens) - num_accepted_tokens - 1
-
-                    draft_batch.context_requests.append(new_request)
-
-                new_request.py_stop_words_list = request.py_stop_words_list
-
-            return draft_batch
-
-        except Exception as e:
-            traceback.print_exc()
-            error_msg = str(e)
-            logger.error(f"Encountered an error in decode: {error_msg}")
-            self._handle_errors(error_msg)
-
-    @nvtx_range("_prepare_draft_tokens")
-    def _prepare_draft_tokens(self, scheduled_requests: ScheduledRequests):
-        if not self.draft_model_engine:
-            raise ValueError("Draft model engine is not set")
-
-        try:
-            draft_batch = self._prepare_draft_batch(scheduled_requests)
-
-            if draft_batch.batch_size == 0:
-                return
-            self.draft_seq_slot_manager.prepare_resources(draft_batch)
-
-            req_id_to_old_request = {
-                req.py_request_id: req
-                for req in scheduled_requests.all_requests()
-            }
-
-            # Disable cuda graph for the 1st draft model forward
-            if self.model_engine.spec_config.spec_dec_mode.needs_kv_cache_recompute(
-            ):
-                with self.draft_model_engine.no_cuda_graph():
-                    outputs = self.draft_model_engine.forward(
-                        draft_batch, self.resource_manager)
-            else:
-                outputs = self.draft_model_engine.forward(
-                    draft_batch, self.resource_manager)
-            if hasattr(self.draft_model_engine.model.model, 'd2t'):
-                outputs['d2t'] = self.draft_model_engine.model.model.d2t.data
-
-            sample_state = self._sample_async(draft_batch, outputs)
-            previous_batch = sample_state
-
-            self._update_request_states(draft_batch)
-
-            def _process_decoded_tokens(draft_batch):
-                new_requests = []
-                for req in draft_batch.all_requests():
-                    target_model_req = req_id_to_old_request[req.py_request_id]
-                    target_model_req.py_draft_tokens.append(
-                        req.get_last_tokens(0))
-                    if req.state != LlmRequestState.GENERATION_COMPLETE and len(
-                            target_model_req.py_draft_tokens
-                    ) < target_model_req.py_draft_pages_allocated:
-                        new_requests.append(req)
-                    else:
-                        self.draft_seq_slot_manager.free_resources(req)
-
-                return new_requests
-
-            # The TRTLLM attention kernels cannot handle generation requests with
-            # different seqlens. No issues with flashinfer, should we look into removing
-            # this? Just needs proper kernel support.
-            def _pad_to_max_draft_tokens():
-                for req in scheduled_requests.generation_requests:
-                    max_draft_len = self.max_draft_len
-                    num_draft_tokens = len(req.py_draft_tokens)
-                    req.py_draft_tokens.extend(
-                        0 for _ in range(max_draft_len - num_draft_tokens))
-
-            draft_batch.generation_requests = draft_batch.context_requests + draft_batch.generation_requests
-            draft_batch.context_requests = []
-
-            for i in range(self.max_draft_len - 1):
-                if len(draft_batch.generation_requests) == 0:
-                    break
-
-                outputs = self.draft_model_engine.forward(
-                    draft_batch,
-                    self.resource_manager,
-                    new_tensors_device=previous_batch.device)
-
-                if hasattr(self.draft_model_engine.model.model, 'd2t'):
-                    outputs[
-                        'd2t'] = self.draft_model_engine.model.model.d2t.data
-                sample_state = self._sample_async(draft_batch, outputs)
-                self._update_request_states(draft_batch)
-                self._update_requests(previous_batch)
-                new_requests = _process_decoded_tokens(
-                    previous_batch.scheduled_requests)
-                draft_batch.generation_requests = new_requests
-                previous_batch = sample_state
-            self._update_requests(previous_batch)
-            new_requests = _process_decoded_tokens(
-                previous_batch.scheduled_requests)
-            _pad_to_max_draft_tokens()
-
-        except Exception as e:
-            traceback.print_exc()
-            error_msg = str(e)
-            logger.error(f"Encountered an error in decode: {error_msg}")
-            self._handle_errors(error_msg)
-
     def _handle_errors(self, error_msg: Optional[str] = None):
         error_responses = {}
         error_msg = error_msg or "error"
diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
index b9eccc90601..446b647618d 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py
@@ -382,7 +382,8 @@ def create_py_executor(
 
     # Drafter for speculative decoding
     with mem_monitor.observe_creation_stage(_ExecutorCreationStage.DRAFTER):
-        drafter = get_spec_drafter(model_engine, spec_resource_manager)
+        drafter = get_spec_drafter(model_engine, draft_model_engine, sampler,
+                                   spec_resource_manager)
 
     with mem_monitor.observe_creation_stage(
             _ExecutorCreationStage.INIT_EXTRA_RESOURCES
diff --git a/tensorrt_llm/_torch/speculative/drafter.py b/tensorrt_llm/_torch/speculative/drafter.py
index d99c5dd92d8..e08044cbb4f 100644
--- a/tensorrt_llm/_torch/speculative/drafter.py
+++ b/tensorrt_llm/_torch/speculative/drafter.py
@@ -1,16 +1,23 @@
 from abc import ABC, abstractmethod
+from typing import Optional
 
+from ..pyexecutor.resource_manager import ResourceManager
 from ..pyexecutor.scheduler import ScheduledRequests
 
 
 class Drafter(ABC):
+    """Abstract base class for all drafter implementations."""
 
     @abstractmethod
     def prepare_draft_tokens(
         self,
         scheduled_requests: ScheduledRequests,
+        resource_manager: Optional[ResourceManager] = None,
     ) -> None:
         """
         Prepare the drafter tokens for the forward computation this step.
+
+        Args:
+            scheduled_requests: The scheduled requests for this iteration
         """
         raise NotImplementedError
diff --git a/tensorrt_llm/_torch/speculative/model_drafter.py b/tensorrt_llm/_torch/speculative/model_drafter.py
new file mode 100644
index 00000000000..ac195ccf515
--- /dev/null
+++ b/tensorrt_llm/_torch/speculative/model_drafter.py
@@ -0,0 +1,353 @@
+from __future__ import annotations
+
+import traceback
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+
+from tensorrt_llm._utils import nvtx_range
+from tensorrt_llm.logger import logger
+
+from ..pyexecutor.llm_request import LlmRequest, LlmRequestState, SamplingConfig
+from ..pyexecutor.resource_manager import BaseResourceManager, ResourceManager
+from ..pyexecutor.sampler import Sampler, SampleState
+from ..pyexecutor.scheduler import ScheduledRequests
+from ..pyexecutor.seq_slot_manager import SeqSlotManager
+from .drafter import Drafter
+
+if TYPE_CHECKING:
+    from ..pyexecutor.model_engine import ModelEngine
+
+
+class ModelDrafter(Drafter):
+    """Model-based drafter that uses a draft model to generate draft tokens."""
+
+    def __init__(
+        self,
+        spec_config: "DecodingBaseConfig",
+        draft_model_engine: "ModelEngine",
+        max_draft_tokens: int,
+        draft_seq_slot_manager: SeqSlotManager,
+        sampler: Sampler,
+        spec_resource_manager: Optional[BaseResourceManager] = None,
+    ):
+        # Validate required parameters
+        if draft_model_engine is None:
+            raise ValueError("draft_model_engine cannot be None")
+        if max_draft_tokens < 0:
+            raise ValueError(f"max_draft_tokens must be >= 0")
+
+        # Model and resource management
+        self.draft_model_engine = draft_model_engine
+        self.draft_seq_slot_manager = draft_seq_slot_manager
+        self.spec_resource_manager = spec_resource_manager
+
+        # Configuration
+        self.spec_config = spec_config
+        self.max_draft_tokens = max_draft_tokens
+
+        # Sampling
+        self.sampler = sampler
+
+    def _create_draft_request(self, request_id: int, max_new_tokens: int,
+                              input_tokens: Optional[List],
+                              sampling_config: SamplingConfig,
+                              return_perf_metrics: bool) -> LlmRequest:
+        """Create a draft request with common parameters."""
+        return LlmRequest(request_id=request_id,
+                          max_new_tokens=max_new_tokens,
+                          input_tokens=input_tokens,
+                          sampling_config=sampling_config,
+                          return_perf_metrics=return_perf_metrics,
+                          is_streaming=False,
+                          is_draft=True)
+
+    def _initialize_draft_tokens(self, request: LlmRequest) -> Tuple[int, int]:
+        """Initialize draft token tracking for a request."""
+        num_draft_tokens = len(
+            request.py_last_draft_tokens
+        ) if request.py_last_draft_tokens is not None else 0
+        request.py_draft_tokens = []
+
+        num_accepted_tokens = request.py_num_accepted_draft_tokens
+        num_rejected_tokens = num_draft_tokens - num_accepted_tokens
+        assert num_rejected_tokens >= 0
+
+        return num_draft_tokens, num_accepted_tokens
+
+    def _create_context_request(self, request: LlmRequest,
+                                input_tokens: Any) -> LlmRequest:
+        """Create a context request for first-time drafting."""
+        return self._create_draft_request(request.py_request_id,
+                                          request.py_max_new_tokens,
+                                          input_tokens, request.sampling_config,
+                                          request.return_perf_metrics)
+
+    def _create_generation_request(self, request: LlmRequest,
+                                   input_tokens: Any) -> LlmRequest:
+        """Create a generation request when no tokens were accepted."""
+        new_request = self._create_draft_request(request.py_request_id,
+                                                 request.py_max_new_tokens,
+                                                 input_tokens[:-1],
+                                                 request.sampling_config,
+                                                 request.return_perf_metrics)
+        # Explicitly add the last token so get_last_tokens() returns the right value
+        new_request.add_new_token(input_tokens[-1], 0)
+        new_request.state = LlmRequestState.GENERATION_IN_PROGRESS
+        return new_request
+
+    def _create_chunked_context_request(self, request: LlmRequest,
+                                        input_tokens: Any,
+                                        num_accepted_tokens: int) -> LlmRequest:
+        """Create a chunked context request when some tokens were accepted."""
+        new_request = self._create_draft_request(request.py_request_id,
+                                                 request.py_max_new_tokens,
+                                                 input_tokens,
+                                                 request.sampling_config,
+                                                 request.return_perf_metrics)
+        new_request.context_chunk_size = num_accepted_tokens + 1
+        new_request.context_current_position = len(
+            input_tokens) - num_accepted_tokens - 1
+        return new_request
+
+    def _create_draft_request_for_request(
+            self, request: LlmRequest) -> Optional[LlmRequest]:
+        """Create a draft request based on the original request state."""
+        num_draft_tokens, num_accepted_tokens = self._initialize_draft_tokens(
+            request)
+        input_tokens = self.spec_config.get_draft_model_prompt(
+            request.get_tokens()[0])
+
+        # First time seeing this request - context request
+        if request.max_beam_num_tokens - 1 == request.py_prompt_len:
+            # This is the first time the draft model is seeing this request.
+            # Prepare a context request. We discard the first token and take
+            # the newly decoded one - this is the convention for EAGLE 2 and 3.
+            assert num_draft_tokens == 0
+            return self._create_context_request(request, input_tokens)
+
+        # No tokens accepted - generation request
+        elif num_accepted_tokens == 0:
+            return self._create_generation_request(request, input_tokens)
+
+        # Tokens accepted - chunked context request
+        else:
+            return self._create_chunked_context_request(request, input_tokens,
+                                                        num_accepted_tokens)
+
+    def _add_to_draft_batch(self, draft_batch: ScheduledRequests,
+                            draft_request: LlmRequest,
+                            original_request: LlmRequest) -> None:
+        """Add the draft request to the appropriate batch list."""
+        # Copy additional properties
+        draft_request.py_stop_words_list = original_request.py_stop_words_list
+
+        # Add to appropriate batch based on request type
+        if draft_request.state == LlmRequestState.GENERATION_IN_PROGRESS:
+            draft_batch.generation_requests.append(draft_request)
+        else:
+            draft_batch.context_requests.append(draft_request)
+
+    @nvtx_range("_prepare_draft_batch")
+    def _prepare_draft_batch(
+            self, scheduled_requests: ScheduledRequests) -> ScheduledRequests:
+        """
+        Prepares a batch for the draft model engine. Draft tokens are only produced
+        for generation requests.
+
+        The requests are prepared as follows:
+        1. The first time the draft engine sees a request, it's a context request.
+        2. Otherwise, if draft tokens were accepted on the last target model decoding
+        step, it's a chunked context request (we process all the accepted tokens together).
+        3. Otherwise, it's a generation request.
+
+        Args:
+            scheduled_requests: The scheduled requests to prepare draft batch for
+
+        Returns:
+            ScheduledRequests: The prepared draft batch
+        """
+        try:
+            draft_batch = ScheduledRequests()
+
+            for request in scheduled_requests.generation_requests:
+                if request.py_draft_pages_allocated == 0:
+                    # No space for draft tokens
+                    continue
+
+                # Stop drafting when we hit the max seqlen. We still need dummy draft
+                # tokens attached to the requests to make sure everything works properly
+                # with CUDA graph. These dummy tokens are already added by
+                # _prepare_draft_requests to make the KV cache/scheduler aware of the fact
+                # that we want to do spec decoding, so no need to do anything else here.
+                # This makes the perf for this case suboptimal, but that's OK - this is
+                # a corner case for weird models like the llama 3.1 8b EAGLE3 implementation.
+                if request.max_beam_num_tokens - 1 >= self.draft_model_engine.max_seq_len:
+                    continue
+
+                draft_request = self._create_draft_request_for_request(request)
+                if draft_request is not None:
+                    self._add_to_draft_batch(draft_batch, draft_request,
+                                             request)
+
+            return draft_batch
+
+        except Exception as e:
+            logger.error(f"Error in _prepare_draft_batch: {str(e)}")
+            traceback.print_exc()
+            raise e
+
+    def _should_disable_cuda_graph(
+            self, previous_batch: Optional[SampleState]) -> bool:
+        """Check if CUDA graph should be disabled for the current forward pass."""
+        if previous_batch is not None:
+            return False
+        return self.spec_config.spec_dec_mode.needs_kv_cache_recompute()
+
+    def _forward_draft_model(
+            self,
+            draft_batch: ScheduledRequests,
+            resource_manager: ResourceManager,
+            previous_batch: Optional[SampleState] = None) -> Dict[str, Any]:
+        """Forward pass through the draft model."""
+        if self._should_disable_cuda_graph(previous_batch):
+            with self.draft_model_engine.no_cuda_graph():
+                outputs = self.draft_model_engine.forward(
+                    draft_batch, resource_manager)
+        else:
+            new_tensors_device = previous_batch.device if previous_batch else None
+            outputs = self.draft_model_engine.forward(
+                draft_batch,
+                resource_manager,
+                new_tensors_device=new_tensors_device)
+
+        # Handle d2t data if available
+        if hasattr(self.draft_model_engine.model.model, 'd2t'):
+            outputs['d2t'] = self.draft_model_engine.model.model.d2t.data
+
+        return outputs
+
+    def _sample_async(self, draft_batch: ScheduledRequests,
+                      outputs: Dict[str, Any]) -> Optional[SampleState]:
+        """Sample tokens from draft model outputs."""
+        try:
+            if self.sampler is not None:
+                return self.sampler.sample_async(draft_batch, outputs)
+            return None
+        except Exception as e:
+            logger.error(f"Error in sampling: {str(e)}")
+            return None
+
+    def _update_request_states(self,
+                               scheduled_requests: ScheduledRequests) -> None:
+        """Update request states after processing."""
+        for request in scheduled_requests.context_requests:
+            if request.state != LlmRequestState.GENERATION_COMPLETE:
+                request.move_to_next_context_chunk()
+            if request.context_remaining_length == 0:
+                request.state = LlmRequestState.GENERATION_IN_PROGRESS
+
+    def _update_requests(self, sample_state: SampleState) -> None:
+        """Update requests with sample state."""
+        if self.sampler is not None:
+            self.sampler.update_requests(sample_state)
+
+    def _process_decoded_tokens(
+            self, draft_batch: ScheduledRequests,
+            req_id_to_old_request: Dict[int, LlmRequest]) -> List[LlmRequest]:
+        """Process decoded tokens and determine which requests to continue processing."""
+        new_requests = []
+        for req in draft_batch.all_requests():
+            target_model_req = req_id_to_old_request[req.py_request_id]
+            target_model_req.py_draft_tokens.append(req.get_last_tokens(0))
+            if req.state != LlmRequestState.GENERATION_COMPLETE and len(
+                    target_model_req.py_draft_tokens
+            ) < target_model_req.py_draft_pages_allocated:
+                new_requests.append(req)
+            else:
+                self.draft_seq_slot_manager.free_resources(req)
+
+        return new_requests
+
+    def _pad_to_max_draft_tokens(self,
+                                 scheduled_requests: ScheduledRequests) -> None:
+        """Pad draft tokens to maximum length for all generation requests."""
+        for req in scheduled_requests.generation_requests:
+            max_draft_tokens = self.max_draft_tokens
+            num_draft_tokens = len(req.py_draft_tokens)
+            req.py_draft_tokens.extend(
+                0 for _ in range(max_draft_tokens - num_draft_tokens))
+
+    @nvtx_range("prepare_draft_tokens")
+    def prepare_draft_tokens(
+        self,
+        scheduled_requests: ScheduledRequests,
+        resource_manager: Optional[ResourceManager] = None,
+    ) -> None:
+        """
+        Prepare draft tokens for the scheduled requests.
+
+        Args:
+            scheduled_requests: The scheduled requests for this iteration
+            resource_manager: The resource manager for this iteration
+        """
+        if not self.draft_model_engine:
+            raise ValueError("Draft model engine is not set")
+
+        if resource_manager is None:
+            raise ValueError("Resource manager is required")
+
+        try:
+            draft_batch = self._prepare_draft_batch(scheduled_requests)
+
+            if draft_batch.batch_size == 0:
+                return
+
+            self.draft_seq_slot_manager.prepare_resources(draft_batch)
+
+            req_id_to_old_request = {
+                req.py_request_id: req
+                for req in scheduled_requests.all_requests()
+            }
+
+            # Initial forward pass
+            outputs = self._forward_draft_model(draft_batch, resource_manager)
+            sample_state = self._sample_async(draft_batch, outputs)
+            previous_batch = sample_state
+
+            self._update_request_states(draft_batch)
+
+            # Convert context requests to generation requests
+            draft_batch.generation_requests = draft_batch.context_requests + draft_batch.generation_requests
+            draft_batch.context_requests = []
+
+            # Generate remaining draft tokens iteratively
+            for i in range(self.max_draft_tokens - 1):
+                if len(draft_batch.generation_requests) == 0:
+                    break
+
+                outputs = self._forward_draft_model(draft_batch,
+                                                    resource_manager,
+                                                    previous_batch)
+                sample_state = self._sample_async(draft_batch, outputs)
+                self._update_request_states(draft_batch)
+                if previous_batch is not None:
+                    self._update_requests(previous_batch)
+                    new_requests = self._process_decoded_tokens(
+                        previous_batch.scheduled_requests,
+                        req_id_to_old_request)
+                else:
+                    new_requests = []
+                draft_batch.generation_requests = new_requests
+                previous_batch = sample_state
+
+            # Final cleanup
+            if previous_batch is not None:
+                self._update_requests(previous_batch)
+                self._process_decoded_tokens(previous_batch.scheduled_requests,
+                                             req_id_to_old_request)
+            self._pad_to_max_draft_tokens(scheduled_requests)
+
+        except Exception as e:
+            traceback.print_exc()
+            error_msg = str(e)
+            logger.error(f"Encountered an error in decode: {error_msg}")
+            raise e
diff --git a/tensorrt_llm/_torch/speculative/ngram.py b/tensorrt_llm/_torch/speculative/ngram.py
index 57f3045e664..9113900ef94 100644
--- a/tensorrt_llm/_torch/speculative/ngram.py
+++ b/tensorrt_llm/_torch/speculative/ngram.py
@@ -5,7 +5,7 @@
 from tensorrt_llm.logger import logger
 
 from ..pyexecutor.llm_request import *
-from ..pyexecutor.resource_manager import BaseResourceManager
+from ..pyexecutor.resource_manager import BaseResourceManager, ResourceManager
 from ..pyexecutor.scheduler import ScheduledRequests
 from .drafter import Drafter
 
@@ -59,10 +59,10 @@ def __init__(self, spec_config: "NGramDecodingConfig",
         self.start_index = {}
 
     def get_max_resource_count(self) -> int:
-        raise self.max_num_requests
+        return self.max_num_requests
 
     def get_needed_resource_to_completion(self, request: LlmRequest) -> int:
-        raise 0
+        return 0
 
     def prepare_resources(self, scheduled_batch: ScheduledRequests):
         pass
@@ -173,6 +173,7 @@ def __init__(
     def prepare_draft_tokens(
         self,
         scheduled_requests: ScheduledRequests,
+        resource_manager: Optional[ResourceManager] = None,
     ) -> None:
         # Sort by request_id when py_batch_idx is None as a fallback.
         # This happens in the disagg case: for a set of new requests, we draft
diff --git a/tensorrt_llm/_torch/speculative/utils.py b/tensorrt_llm/_torch/speculative/utils.py
index 667d1a14b0e..2519584274f 100644
--- a/tensorrt_llm/_torch/speculative/utils.py
+++ b/tensorrt_llm/_torch/speculative/utils.py
@@ -1,9 +1,11 @@
 from tensorrt_llm._torch.pyexecutor.sampler import TorchSampler
 from tensorrt_llm._torch.speculative.interface import SpecMetadata
 
+from ..pyexecutor.seq_slot_manager import SeqSlotManager
 from .eagle3 import (Eagle3OneModelSampler, Eagle3OneModelSpecMetadata,
                      Eagle3OneModelWorker, Eagle3ResourceManager,
                      Eagle3SpecMetadata)
+from .model_drafter import ModelDrafter
 from .mtp import (MTPEagleWorker, MTPHiddenStatesManager, MTPSampler,
                   MTPSpecMetadata, MTPWorker)
 from .ngram import NGramDrafter, NGramPoolManager
@@ -112,14 +114,26 @@ def get_spec_decoder(sampler_args: TorchSampler.Args,
         f"Unsupported speculative decoding mode: {spec_config.spec_dec_mode}")
 
 
-def get_spec_drafter(model_engine, spec_resource_manager):
+def get_spec_drafter(model_engine, draft_model_engine, sampler,
+                     spec_resource_manager):
     spec_config = model_engine.spec_config
     if spec_config is None:
         return None
-    if spec_config.spec_dec_mode.is_ngram():
-        return NGramDrafter(spec_config, spec_resource_manager)
+
     if spec_config.spec_dec_mode.is_user_provided():
         return spec_config.drafter
+
+    max_num_requests = model_engine.batch_size
+    if spec_config.spec_dec_mode.is_draft_target(
+    ) or spec_config.spec_dec_mode.is_eagle3():
+        return ModelDrafter(spec_config, draft_model_engine,
+                            spec_config.max_draft_len,
+                            SeqSlotManager(max_num_requests), sampler,
+                            spec_resource_manager)
+
+    if spec_config.spec_dec_mode.is_ngram():
+        return NGramDrafter(spec_config, spec_resource_manager)
+
     return None
 
 

From 5bff317abf528b03a8ab3ee8d05857addb221af8 Mon Sep 17 00:00:00 2001
From: Linda <57756729+Linda-Stadter@users.noreply.github.com>
Date: Thu, 17 Jul 2025 16:42:52 +0200
Subject: [PATCH 65/88] feat: nanobind bindings (#5961)

Signed-off-by: Linda-Stadter <57756729+Linda-Stadter@users.noreply.github.com>
---
 cpp/CMakeLists.txt                            |   4 +-
 .../batch_manager/runtimeBuffers.h            |   2 +-
 .../batch_manager/runtimeBuffers.cpp          |   2 +-
 cpp/tensorrt_llm/nanobind/CMakeLists.txt      |  37 +-
 .../nanobind/batch_manager/algorithms.cpp     | 178 ++++
 .../nanobind/batch_manager/algorithms.h       |  29 +
 .../nanobind/batch_manager/bindings.cpp       | 525 ++++++++++
 .../nanobind/batch_manager/bindings.h         |  28 +
 .../nanobind/batch_manager/buffers.cpp        | 108 ++
 .../nanobind/batch_manager/buffers.h          |  29 +
 .../batch_manager/cacheTransceiver.cpp        | 110 +++
 .../nanobind/batch_manager/cacheTransceiver.h |  29 +
 .../nanobind/batch_manager/kvCacheManager.cpp | 478 +++++++++
 .../nanobind/batch_manager/kvCacheManager.h   |  39 +
 .../nanobind/batch_manager/llmRequest.cpp     | 131 +++
 .../nanobind/batch_manager/llmRequest.h       | 160 +++
 cpp/tensorrt_llm/nanobind/bindings.cpp        | 471 ++++++++-
 cpp/tensorrt_llm/nanobind/common/bindTypes.h  | 100 ++
 .../nanobind/common/customCasters.h           | 345 +++++++
 .../nanobind/executor/bindings.cpp            | 263 +++++
 cpp/tensorrt_llm/nanobind/executor/bindings.h |  29 +
 .../nanobind/executor/executor.cpp            | 241 +++++
 cpp/tensorrt_llm/nanobind/executor/executor.h | 129 +++
 .../nanobind/executor/executorConfig.cpp      | 616 ++++++++++++
 .../nanobind/executor/executorConfig.h        |  30 +
 .../nanobind/executor/request.cpp             | 935 ++++++++++++++++++
 cpp/tensorrt_llm/nanobind/executor/request.h  |  29 +
 .../nanobind/runtime/bindings.cpp             | 388 ++++++++
 cpp/tensorrt_llm/nanobind/runtime/bindings.h  |  30 +
 .../nanobind/runtime/moeBindings.cpp          | 124 +++
 .../nanobind/runtime/moeBindings.h            |  29 +
 .../nanobind/testing/modelSpecBinding.cpp     |  87 ++
 .../nanobind/testing/modelSpecBinding.h       |  29 +
 .../nanobind/userbuffers/bindings.cpp         |  47 +
 .../nanobind/userbuffers/bindings.h           |  30 +
 cpp/tensorrt_llm/pybind/bindings.cpp          |   2 +-
 cpp/tensorrt_llm/pybind/executor/bindings.cpp |  12 +-
 .../pybind/executor/executorConfig.cpp        |   2 +-
 examples/models/core/llama/summarize_long.py  |   2 +-
 examples/models/core/qwen2audio/run.py        |   3 +-
 examples/models/core/qwenvl/run.py            |   3 +-
 jenkins/Build.groovy                          |  18 +
 jenkins/L0_Test.groovy                        |   8 +
 tensorrt_llm/builder.py                       |   2 +-
 tensorrt_llm/commands/build.py                |  19 +-
 tensorrt_llm/runtime/model_runner.py          |   2 +-
 .../integration/test_lists/test-db/l0_a10.yml |  15 +
 tests/unittest/bindings/test_bindings_ut.py   |   7 +
 .../bindings/test_executor_bindings.py        |  17 +-
 49 files changed, 5932 insertions(+), 21 deletions(-)
 create mode 100644 cpp/tensorrt_llm/nanobind/batch_manager/algorithms.cpp
 create mode 100644 cpp/tensorrt_llm/nanobind/batch_manager/algorithms.h
 create mode 100644 cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
 create mode 100644 cpp/tensorrt_llm/nanobind/batch_manager/bindings.h
 create mode 100644 cpp/tensorrt_llm/nanobind/batch_manager/buffers.cpp
 create mode 100644 cpp/tensorrt_llm/nanobind/batch_manager/buffers.h
 create mode 100644 cpp/tensorrt_llm/nanobind/batch_manager/cacheTransceiver.cpp
 create mode 100644 cpp/tensorrt_llm/nanobind/batch_manager/cacheTransceiver.h
 create mode 100644 cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp
 create mode 100644 cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.h
 create mode 100644 cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.cpp
 create mode 100644 cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.h
 create mode 100644 cpp/tensorrt_llm/nanobind/common/bindTypes.h
 create mode 100644 cpp/tensorrt_llm/nanobind/common/customCasters.h
 create mode 100644 cpp/tensorrt_llm/nanobind/executor/bindings.cpp
 create mode 100644 cpp/tensorrt_llm/nanobind/executor/bindings.h
 create mode 100644 cpp/tensorrt_llm/nanobind/executor/executor.cpp
 create mode 100644 cpp/tensorrt_llm/nanobind/executor/executor.h
 create mode 100644 cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp
 create mode 100644 cpp/tensorrt_llm/nanobind/executor/executorConfig.h
 create mode 100644 cpp/tensorrt_llm/nanobind/executor/request.cpp
 create mode 100644 cpp/tensorrt_llm/nanobind/executor/request.h
 create mode 100644 cpp/tensorrt_llm/nanobind/runtime/bindings.cpp
 create mode 100644 cpp/tensorrt_llm/nanobind/runtime/bindings.h
 create mode 100644 cpp/tensorrt_llm/nanobind/runtime/moeBindings.cpp
 create mode 100644 cpp/tensorrt_llm/nanobind/runtime/moeBindings.h
 create mode 100644 cpp/tensorrt_llm/nanobind/testing/modelSpecBinding.cpp
 create mode 100644 cpp/tensorrt_llm/nanobind/testing/modelSpecBinding.h
 create mode 100644 cpp/tensorrt_llm/nanobind/userbuffers/bindings.cpp
 create mode 100644 cpp/tensorrt_llm/nanobind/userbuffers/bindings.h

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index a76b3e21558..d9e8c206f46 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -198,7 +198,7 @@ set(TRT_LIB TensorRT::NvInfer)
 get_filename_component(TRT_LLM_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR} PATH)
 
 set(3RDPARTY_DIR ${TRT_LLM_ROOT_DIR}/3rdparty)
-if(BINDING_TYPE STREQUAL "pybind")
+if(BINDING_TYPE STREQUAL "pybind" OR BUILD_DEEP_EP)
   add_subdirectory(${3RDPARTY_DIR}/pybind11
                    ${CMAKE_CURRENT_BINARY_DIR}/pybind11)
 endif()
@@ -217,7 +217,7 @@ include_directories(
   ${3RDPARTY_DIR}/cutlass/tools/util/include
   ${3RDPARTY_DIR}/NVTX/include
   ${3RDPARTY_DIR}/json/include)
-if(BINDING_TYPE STREQUAL "pybind")
+if(BINDING_TYPE STREQUAL "pybind" OR BUILD_DEEP_EP)
   include_directories(${3RDPARTY_DIR}/pybind11/include)
 endif()
 if(BINDING_TYPE STREQUAL "nanobind")
diff --git a/cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h b/cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h
index 13bde6d07a5..fa43d084b27 100644
--- a/cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h
+++ b/cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h
@@ -168,7 +168,7 @@ class RuntimeBuffers
 
 public:
     //! Additional buffers depending on model type
-    std::unique_ptr<TransformerBuffers> transformerBuffers;
+    std::shared_ptr<TransformerBuffers> transformerBuffers;
     std::unique_ptr<RnnStateBuffers> rnnStateBuffers;
 
     //! Encoder-Decoder
diff --git a/cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp b/cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp
index 691fb9c7efd..e8b71d065f3 100644
--- a/cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp
+++ b/cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp
@@ -84,7 +84,7 @@ void RuntimeBuffers::create(SizeType32 maxBatchSize, SizeType32 maxBeamWidth,
 
     if (modelConfig.isTransformerBased())
     {
-        transformerBuffers = std::make_unique<TransformerBuffers>(maxBatchSize, maxBeamWidth, maxAttentionWindowVec,
+        transformerBuffers = std::make_shared<TransformerBuffers>(maxBatchSize, maxBeamWidth, maxAttentionWindowVec,
             maxAttentionWindow, sinkTokenLen, runtime, modelConfig, worldConfig);
     }
     if (modelConfig.isRnnBased())
diff --git a/cpp/tensorrt_llm/nanobind/CMakeLists.txt b/cpp/tensorrt_llm/nanobind/CMakeLists.txt
index d2e7eac20c2..3d570f024d7 100755
--- a/cpp/tensorrt_llm/nanobind/CMakeLists.txt
+++ b/cpp/tensorrt_llm/nanobind/CMakeLists.txt
@@ -3,7 +3,23 @@ set(TRTLLM_NB_MODULE
     ${TRTLLM_NB_MODULE}
     PARENT_SCOPE)
 
-set(SRCS ../runtime/ipcNvlsMemory.cu bindings.cpp)
+set(SRCS
+    batch_manager/algorithms.cpp
+    batch_manager/bindings.cpp
+    batch_manager/buffers.cpp
+    batch_manager/cacheTransceiver.cpp
+    batch_manager/kvCacheManager.cpp
+    batch_manager/llmRequest.cpp
+    executor/bindings.cpp
+    executor/executor.cpp
+    executor/executorConfig.cpp
+    executor/request.cpp
+    runtime/bindings.cpp
+    testing/modelSpecBinding.cpp
+    runtime/moeBindings.cpp
+    userbuffers/bindings.cpp
+    ../runtime/ipcNvlsMemory.cu
+    bindings.cpp)
 
 include_directories(${PROJECT_SOURCE_DIR}/include)
 
@@ -14,20 +30,29 @@ set_property(TARGET ${TRTLLM_NB_MODULE} PROPERTY POSITION_INDEPENDENT_CODE ON)
 target_link_directories(${TRTLLM_NB_MODULE} PUBLIC
                         "${TORCH_INSTALL_PREFIX}/lib")
 
+if(ENABLE_NVSHMEM)
+  target_link_libraries(${TRTLLM_NB_MODULE} PUBLIC nvshmem::nvshmem_host
+                                                   nvshmem::nvshmem_device)
+endif()
+
 target_link_libraries(
   ${TRTLLM_NB_MODULE}
-  PUBLIC ${SHARED_TARGET} ${UNDEFINED_FLAG} ${NO_AS_NEEDED_FLAG}
-         ${Python3_LIBRARIES} ${TORCH_LIBRARIES} torch_python)
-
+  PUBLIC ${SHARED_TARGET}
+         ${UNDEFINED_FLAG}
+         ${NO_AS_NEEDED_FLAG}
+         ${Python3_LIBRARIES}
+         ${TORCH_LIBRARIES}
+         torch_python
+         ${CUDA_NVML_LIB})
 target_compile_definitions(
   ${TRTLLM_NB_MODULE} PUBLIC TRTLLM_NB_MODULE=${TRTLLM_NB_MODULE}
-                             NB_DETAILED_ERROR_MESSAGES=1)
+                             PYBIND11_DETAILED_ERROR_MESSAGES=1)
 
 if(NOT WIN32)
   set_target_properties(
     ${TRTLLM_NB_MODULE}
     PROPERTIES
       LINK_FLAGS
-      "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
+      "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' -Wl,-rpath,'${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib/stubs' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
   )
 endif()
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/algorithms.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/algorithms.cpp
new file mode 100644
index 00000000000..637401555e8
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/batch_manager/algorithms.cpp
@@ -0,0 +1,178 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "algorithms.h"
+#include "tensorrt_llm/batch_manager/allocateKvCache.h"
+#include "tensorrt_llm/batch_manager/assignReqSeqSlots.h"
+#include "tensorrt_llm/batch_manager/capacityScheduler.h"
+#include "tensorrt_llm/batch_manager/createNewDecoderRequests.h"
+#include "tensorrt_llm/batch_manager/handleContextLogits.h"
+#include "tensorrt_llm/batch_manager/handleGenerationLogits.h"
+#include "tensorrt_llm/batch_manager/kvCacheManager.h"
+#include "tensorrt_llm/batch_manager/llmRequest.h"
+#include "tensorrt_llm/batch_manager/logitsPostProcessor.h"
+#include "tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h"
+#include "tensorrt_llm/batch_manager/medusaBuffers.h"
+#include "tensorrt_llm/batch_manager/microBatchScheduler.h"
+#include "tensorrt_llm/batch_manager/pauseRequests.h"
+#include "tensorrt_llm/batch_manager/peftCacheManager.h"
+#include "tensorrt_llm/batch_manager/runtimeBuffers.h"
+#include "tensorrt_llm/batch_manager/updateDecoderBuffers.h"
+#include "tensorrt_llm/nanobind/common/customCasters.h"
+#include "tensorrt_llm/runtime/decoderState.h"
+#include "tensorrt_llm/runtime/torch.h"
+#include "tensorrt_llm/runtime/torchView.h"
+
+#include <ATen/core/TensorBody.h>
+#include <nanobind/nanobind.h>
+#include <nanobind/stl/list.h>
+#include <nanobind/stl/shared_ptr.h>
+#include <nanobind/stl/tuple.h>
+#include <nanobind/stl/vector.h>
+#include <torch/extension.h>
+
+#include <optional>
+
+namespace nb = nanobind;
+
+namespace tr = tensorrt_llm::runtime;
+using namespace tensorrt_llm::batch_manager;
+
+void tensorrt_llm::nanobind::batch_manager::algorithms::initBindings(nb::module_& m)
+{
+    nb::class_<CapacityScheduler>(m, CapacityScheduler::name)
+        .def(nb::init<SizeType32, executor::CapacitySchedulerPolicy, bool, bool, LlmRequestState, LlmRequestState>(),
+            nb::arg("max_num_requests"), nb::arg("capacity_scheduler_policy"), nb::arg("has_kv_cache_manager"),
+            nb::arg("two_step_lookahead") = false, nb::arg("no_schedule_until_state") = LlmRequestState::kCONTEXT_INIT,
+            nb::arg("no_schedule_after_state") = LlmRequestState::kGENERATION_COMPLETE)
+        .def("__call__", &CapacityScheduler::operator(), nb::arg("active_requests"),
+            nb::arg("kv_cache_manager") = nullptr, nb::arg("peft_cache_manager") = nullptr,
+            nb::arg("cross_kv_cache_manager") = nullptr)
+        .def("name", [](CapacityScheduler const&) { return CapacityScheduler::name; });
+
+    nb::class_<MicroBatchScheduler>(m, MicroBatchScheduler::name)
+        .def(nb::init<std::optional<batch_scheduler::ContextChunkingConfig>, std::optional<SizeType32>, LlmRequestState,
+                 LlmRequestState>(),
+            nb::arg("ctx_chunk_config") = std::nullopt, nb::arg("max_context_length") = std::nullopt,
+            nb::arg("no_schedule_until_state") = LlmRequestState::kCONTEXT_INIT,
+            nb::arg("no_schedule_after_state") = LlmRequestState::kGENERATION_COMPLETE)
+        .def("__call__", &MicroBatchScheduler::operator(), nb::arg("active_requests"), nb::arg("inflight_req_ids"),
+            nb::arg("max_batch_size_runtime"), nb::arg("max_num_tokens_runtime"))
+        .def("name", [](MicroBatchScheduler const&) { return MicroBatchScheduler::name; });
+
+    nb::class_<PauseRequests>(m, PauseRequests::name)
+        .def(nb::init<SizeType32>(), nb::arg("max_input_len"))
+        .def("__call__", &PauseRequests::operator(), nb::arg("requests_to_pause"), nb::arg("inflight_req_ids"),
+            nb::arg("req_ids_to_pause"), nb::arg("pause_flagged"), nb::arg("seq_slot_manager"),
+            nb::arg("kv_cache_manager") = std::nullopt, nb::arg("cross_kv_cache_manager") = std::nullopt,
+            nb::arg("peft_cache_manager") = std::nullopt)
+        .def("name", [](PauseRequests const&) { return PauseRequests::name; });
+
+    nb::class_<AssignReqSeqSlots>(m, AssignReqSeqSlots::name)
+        .def(nb::init<>())
+        .def("__call__", &AssignReqSeqSlots::operator(), nb::arg("seq_slot_manager"), nb::arg("context_requests"),
+            nb::arg("generation_requests"))
+        .def("name", [](AssignReqSeqSlots const&) { return AssignReqSeqSlots::name; });
+
+    nb::class_<AllocateKvCache>(m, AllocateKvCache::name)
+        .def(nb::init<>())
+        .def("__call__", &AllocateKvCache::operator(), nb::arg("kv_cache_manager"), nb::arg("context_requests"),
+            nb::arg("generation_requests"), nb::arg("model_config"), nb::arg("cross_kv_cache_manager") = std::nullopt)
+        .def("name", [](AllocateKvCache const&) { return AllocateKvCache::name; });
+
+    nb::class_<HandleContextLogits>(m, HandleContextLogits::name)
+        .def(nb::init<>())
+        .def(
+            "__call__",
+            [](HandleContextLogits const& self, DecoderInputBuffers& inputBuffers, RequestVector const& contextRequests,
+                at::Tensor const& logits, std::vector<tr::SizeType32> const& numContextLogitsVec,
+                tr::ModelConfig const& modelConfig, tr::BufferManager const& manager,
+                OptionalRef<MedusaBuffers> medusaBuffers = std::nullopt)
+            {
+                return self(inputBuffers, contextRequests, tr::TorchView::of(logits), numContextLogitsVec, modelConfig,
+                    manager, medusaBuffers);
+            },
+            nb::arg("decoder_input_buffers"), nb::arg("context_requests"), nb::arg("logits"),
+            nb::arg("num_context_logits"), nb::arg("model_config"), nb::arg("buffer_manager"),
+            nb::arg("medusa_buffers") = std::nullopt)
+        .def("name", [](HandleContextLogits const&) { return HandleContextLogits::name; });
+
+    nb::class_<HandleGenerationLogits>(m, HandleGenerationLogits::name)
+        .def(nb::init<>())
+        .def(
+            "__call__",
+            [](HandleGenerationLogits const& self, DecoderInputBuffers& inputBuffers,
+                RequestVector const& generationRequests, at::Tensor const& logits, tr::SizeType32 logitsIndex,
+                tr::ModelConfig const& modelConfig, tr::BufferManager const& manager,
+                OptionalRef<RuntimeBuffers> genRuntimeBuffers = std::nullopt,
+                OptionalRef<MedusaBuffers> medusaBuffers = std::nullopt)
+            {
+                self(inputBuffers, generationRequests, tr::TorchView::of(logits), logitsIndex, modelConfig, manager,
+                    genRuntimeBuffers, medusaBuffers);
+            },
+            nb::arg("decoder_input_buffers"), nb::arg("generation_requests"), nb::arg("logits"),
+            nb::arg("logits_index"), nb::arg("model_config"), nb::arg("buffer_manager"),
+            nb::arg("gen_runtime_buffers") = std::nullopt, nb::arg("medusa_buffers") = std::nullopt)
+        .def("name", [](HandleGenerationLogits const&) { return HandleGenerationLogits::name; });
+
+    nb::class_<MakeDecodingBatchInputOutput>(m, MakeDecodingBatchInputOutput::name)
+        .def(nb::init<>())
+        .def("__call__", &MakeDecodingBatchInputOutput::operator(), nb::arg("context_requests"),
+            nb::arg("generation_requests"), nb::arg("decoder_input_buffers"), nb::arg("decoder_state"),
+            nb::arg("model_config"), nb::arg("max_num_sequences"), nb::arg("fused_runtime_buffers") = std::nullopt)
+        .def("name", [](MakeDecodingBatchInputOutput const&) { return MakeDecodingBatchInputOutput::name; });
+
+    nb::class_<LogitsPostProcessor>(m, LogitsPostProcessor::name)
+        .def(nb::init<>())
+        .def("__call__", &LogitsPostProcessor::operator(), nb::arg("context_requests"), nb::arg("generation_requests"),
+            nb::arg("replicate_logits_post_processor"), nb::arg("decoder_buffers"), nb::arg("world_config"),
+            nb::arg("runtime"), nb::arg("logits_post_processor_batched") = std::nullopt)
+        .def("name", [](LogitsPostProcessor const&) { return LogitsPostProcessor::name; });
+
+    nb::class_<CreateNewDecoderRequests>(m, CreateNewDecoderRequests::name)
+        .def(nb::init<bool, bool, bool>(), nb::arg("speculative_decoding_fast_logits"),
+            nb::arg("is_leader_in_orch_mode"), nb::arg("is_normalize_log_probs"))
+        .def(
+            "__call__",
+            [](CreateNewDecoderRequests& self, tr::ModelConfig const& modelConfig, tr::WorldConfig const& worldConfig,
+                executor::DecodingConfig const& decodingConfig, RequestVector const& contextRequests,
+                tr::BufferManager const& bufferManager, nvinfer1::DataType logitsType,
+                DecoderInputBuffers& inputBuffers, runtime::decoder::DecoderState& decoderState,
+                tensorrt_llm::runtime::CudaStream const& runtimeStream,
+                tensorrt_llm::runtime::CudaStream const& decoderStream, SizeType32 maxSequenceLength,
+                SizeType32 beamWidth, OptionalRef<MedusaBuffers const> medusaBuffers = std::nullopt)
+            {
+                auto [batchSlots, samplingConfigs, lookaheadPrompt, lookaheadAlgoConfigs] = self(modelConfig,
+                    worldConfig, decodingConfig, contextRequests, bufferManager, logitsType, inputBuffers, decoderState,
+                    runtimeStream, decoderStream, maxSequenceLength, beamWidth, medusaBuffers);
+
+                return std::tuple{runtime::Torch::tensor(batchSlots), std::move(samplingConfigs),
+                    std::move(lookaheadPrompt), std::move(lookaheadAlgoConfigs)};
+            },
+            nb::arg("model_config"), nb::arg("world_config"), nb::arg("decoding_config"), nb::arg("context_requests"),
+            nb::arg("buffer_manager"), nb::arg("logits_type"), nb::arg("decoder_input_buffers"),
+            nb::arg("decoder_state"), nb::arg("runtime_stream"), nb::arg("decoder_stream"),
+            nb::arg("max_sequence_length"), nb::arg("beam_width"), nb::arg("medusa_buffers") = std::nullopt)
+        .def("name", [](CreateNewDecoderRequests const&) { return CreateNewDecoderRequests::name; });
+
+    nb::class_<UpdateDecoderBuffers>(m, UpdateDecoderBuffers::name)
+        .def(nb::init<>())
+        .def("__call__", &UpdateDecoderBuffers::operator(), nb::arg("model_config"), nb::arg("decoder_output_buffers"),
+            nb::arg("copy_buffer_manager"), nb::arg("decoder_state"), nb::arg("return_log_probs"),
+            nb::arg("decoder_finish_event"))
+        .def("name", [](UpdateDecoderBuffers const&) { return UpdateDecoderBuffers::name; });
+}
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/algorithms.h b/cpp/tensorrt_llm/nanobind/batch_manager/algorithms.h
new file mode 100644
index 00000000000..cac81d73f27
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/batch_manager/algorithms.h
@@ -0,0 +1,29 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <nanobind/nanobind.h>
+
+namespace nb = nanobind;
+
+namespace tensorrt_llm::nanobind::batch_manager::algorithms
+{
+
+void initBindings(nb::module_& m);
+
+}
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
new file mode 100644
index 00000000000..d44a957aad9
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
@@ -0,0 +1,525 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "bindings.h"
+#include "tensorrt_llm/nanobind/common/customCasters.h"
+
+#include "tensorrt_llm/batch_manager/common.h"
+#include "tensorrt_llm/batch_manager/decoderBuffers.h"
+#include "tensorrt_llm/batch_manager/medusaBuffers.h"
+#include "tensorrt_llm/batch_manager/microBatchScheduler.h"
+#include "tensorrt_llm/batch_manager/peftCacheManager.h"
+#include "tensorrt_llm/batch_manager/rnnStateManager.h"
+#include "tensorrt_llm/batch_manager/runtimeBuffers.h"
+#include "tensorrt_llm/batch_manager/sequenceSlotManager.h"
+#include "tensorrt_llm/nanobind/common/bindTypes.h"
+#include "tensorrt_llm/runtime/gptDecoderBatched.h"
+#include "tensorrt_llm/runtime/runtimeKernels.h"
+#include "tensorrt_llm/runtime/torch.h"
+#include "tensorrt_llm/runtime/torchView.h"
+
+#include <ATen/ATen.h>
+#include <nanobind/nanobind.h>
+#include <nanobind/stl/optional.h>
+#include <nanobind/stl/shared_ptr.h>
+#include <nanobind/stl/tuple.h>
+#include <nanobind/stl/unique_ptr.h>
+#include <nanobind/stl/vector.h>
+#include <torch/extension.h>
+#include <tuple>
+
+namespace nb = nanobind;
+namespace tb = tensorrt_llm::batch_manager;
+namespace tle = tensorrt_llm::executor;
+namespace tr = tensorrt_llm::runtime;
+
+using namespace tensorrt_llm::runtime;
+
+namespace tensorrt_llm::nanobind::batch_manager
+{
+
+void initBindings(nb::module_& m)
+{
+    using GenLlmReq = tb::GenericLlmRequest<runtime::ITensor::SharedPtr>;
+
+    // Create and register exceptions in module scope
+    nb::exception<tb::PeftTaskNotCachedException>(m, "PeftTaskNotCachedException");
+    nb::exception<tr::LoraCacheFullException>(m, "LoraCacheFullException");
+
+    // Register with no captures
+    nb::register_exception_translator(
+        [](std::exception_ptr const& p, void*)
+        {
+            try
+            {
+                if (p)
+                    std::rethrow_exception(p);
+            }
+            catch (const tb::PeftTaskNotCachedException& e)
+            {
+                PyErr_SetString(nb::type<tb::PeftTaskNotCachedException>().ptr(), e.what());
+            }
+            catch (const tr::LoraCacheFullException& e)
+            {
+                PyErr_SetString(nb::type<tr::LoraCacheFullException>().ptr(), e.what());
+            }
+        });
+
+    PybindUtils::bindSet<tb::ReqIdsSet>(m, "ReqIdsSet");
+
+    nb::enum_<tb::LlmRequestType>(m, "LlmRequestType")
+        .value("LLMREQUEST_TYPE_CONTEXT_AND_GENERATION", tb::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION)
+        .value("LLMREQUEST_TYPE_CONTEXT_ONLY", tb::LLMREQUEST_TYPE_CONTEXT_ONLY)
+        .value("LLMREQUEST_TYPE_GENERATION_ONLY", tb::LLMREQUEST_TYPE_GENERATION_ONLY)
+        .export_values();
+
+    nb::class_<tb::batch_scheduler::ContextChunkingConfig>(m, "ContextChunkingConfig")
+        .def(nb::init<tle::ContextChunkingPolicy, tensorrt_llm::runtime::SizeType32>(), nb::arg("chunking_policy"),
+            nb::arg("chunk_unit_size"))
+        .def_rw("chunking_policy", &tb::batch_scheduler::ContextChunkingConfig::chunkingPolicy)
+        .def_rw("chunk_unit_size", &tb::batch_scheduler::ContextChunkingConfig::chunkUnitSize);
+
+    nb::class_<GenLlmReq>(m, "GenericLlmRequest")
+        .def("set_exclude_input_from_output", &GenLlmReq::setExcludeInputFromOutput, nb::arg("exclude"))
+        .def("get_num_tokens", &GenLlmReq::getNumTokens, nb::arg("beam"))
+        .def_prop_ro("max_beam_num_tokens", &GenLlmReq::getMaxBeamNumTokens)
+        .def("get_token", &GenLlmReq::getToken, nb::arg("beam"), nb::arg("pos"))
+        .def("get_tokens", nb::overload_cast<GenLlmReq::SizeType32>(&GenLlmReq::getTokens, nb::const_), nb::arg("beam"))
+        .def("get_tokens", nb::overload_cast<>(&GenLlmReq::getTokens, nb::const_))
+        .def("get_last_tokens", nb::overload_cast<GenLlmReq::SizeType32>(&GenLlmReq::getLastTokens), nb::arg("beam"))
+        .def("get_last_tokens", nb::overload_cast<>(&GenLlmReq::getLastTokens))
+        .def("get_beam_width_by_iter", &GenLlmReq::getBeamWidthByIter, nb::arg("for_next_iteration") = false)
+        .def_prop_ro("max_num_generated_tokens", &GenLlmReq::getMaxNumGeneratedTokens)
+        .def("add_new_token", &GenLlmReq::addNewToken, nb::arg("token"), nb::arg("beam"))
+        .def("add_new_tokens", &GenLlmReq::addNewTokens, nb::arg("beam_tokens"))
+        .def_prop_ro("num_draft_tokens", &GenLlmReq::getNumDraftTokens)
+        .def("set_generated_tokens", &GenLlmReq::setGeneratedTokens, nb::arg("generated_beam_tokens"))
+        .def("pause", &GenLlmReq::pause, nb::arg("max_input_len"))
+        .def_prop_rw("max_sent_token_len", &GenLlmReq::getMaxSentTokenLen, &GenLlmReq::setMaxSentTokenLen)
+        .def_prop_ro("prompt_embedding_table", &GenLlmReq::getPromptEmbeddingTable)
+        .def_prop_ro("multimodal_embedding", &GenLlmReq::getMultimodalEmbedding)
+        .def_prop_ro("mrope_rotary_cos_sin", &GenLlmReq::getMropeRotaryCosSin)
+        .def_prop_ro("bad_words_list", &GenLlmReq::getBadWordsList)
+        .def_prop_rw("draft_logits", &GenLlmReq::getDraftLogits, &GenLlmReq::setDraftLogits)
+        .def_prop_ro("embedding_bias", &GenLlmReq::getEmbeddingBias)
+        .def_prop_rw("lora_config", &GenLlmReq::getLoraConfig, &GenLlmReq::setLoraConfig)
+        .def_prop_rw("lora_weights", &GenLlmReq::getLoraWeights, &GenLlmReq::setLoraWeights)
+        .def_prop_ro("stop_words_list", &GenLlmReq::getStopWordsList)
+        .def_prop_ro("context_logits", &GenLlmReq::getContextLogitsHost)
+        .def_prop_ro("generation_logits", &GenLlmReq::getGenerationLogitsHost)
+        .def_prop_ro("prompt_vocab_size", &GenLlmReq::getPromptVocabSize)
+        .def_prop_ro("mrope_position_deltas", &GenLlmReq::getMropePositionDeltas)
+        .def_prop_ro("lora_task_id", &GenLlmReq::getLoraTaskId)
+        .def_prop_ro("lookahead_config", &GenLlmReq::getLookaheadConfig)
+        .def_prop_rw("context_chunk_size", &GenLlmReq::getContextChunkSize, &GenLlmReq::setContextChunkSize)
+        .def_prop_rw("decoding_iter", &GenLlmReq::getDecodingIter, &GenLlmReq::setDecodingIter)
+        .def_rw("request_id", &GenLlmReq::mRequestId)
+        .def_rw("prompt_len", &GenLlmReq::mPromptLen)
+        .def_rw("max_new_tokens", &GenLlmReq::mMaxNewTokens)
+        .def_rw("sampling_config", &GenLlmReq::mSamplingConfig)
+        .def_prop_rw("state", &GenLlmReq::getState, &GenLlmReq::setState)
+        .def_prop_rw("streaming", &GenLlmReq::isStreaming, &GenLlmReq::setStreaming)
+        .def_rw("end_id", &GenLlmReq::mEndId)
+        .def_rw("pad_id", &GenLlmReq::mPadId)
+        .def_rw("seq_slot", &GenLlmReq::mSeqSlot)
+        .def_prop_ro("return_log_probs", &GenLlmReq::returnLogProbs)
+        .def_prop_ro("return_context_logits", &GenLlmReq::getReturnContextLogits)
+        .def_prop_ro("return_generation_logits", &GenLlmReq::getReturnGenerationLogits)
+        .def_prop_ro("log_probs", nb::overload_cast<>(&GenLlmReq::getLogProbs, nb::const_))
+        .def("get_log_probs", nb::overload_cast<GenLlmReq::SizeType32>(&GenLlmReq::getLogProbs, nb::const_))
+        .def("set_log_probs", &GenLlmReq::setLogProbs, nb::arg("log_probs"), nb::arg("beam"))
+        .def("set_return_encoder_output", &GenLlmReq::setReturnEncoderOutput, nb::arg("return_encoder_output"))
+        .def("get_return_encoder_output", &GenLlmReq::getReturnEncoderOutput)
+        .def("priority", nb::overload_cast<>(&GenLlmReq::priority, nb::const_))
+        .def("set_priority", nb::overload_cast<tle::PriorityType>(&GenLlmReq::setPriority))
+        .def_prop_ro("cum_log_probs", &GenLlmReq::getCumLogProbs)
+        .def("set_cum_log_prob", &GenLlmReq::setCumLogProb, nb::arg("cum_log_prob"), nb::arg("beam"))
+        .def("update_num_tokens_per_iteration", &GenLlmReq::updateNumTokensPerIteration,
+            nb::arg("num_tokens_per_iteration"), nb::arg("model_config"))
+        .def_prop_ro("orig_prompt_len", &GenLlmReq::getOrigPromptLen)
+        .def("has_draft_tokens", &GenLlmReq::hasDraftTokens)
+        .def("move_to_next_context_chunk", &GenLlmReq::moveToNextContextChunk)
+        .def_prop_ro("is_last_context_chunk", &GenLlmReq::isLastContextChunk)
+        .def_prop_ro("is_first_context_chunk", &GenLlmReq::isFirstContextChunk)
+        .def_prop_ro("context_remaining_length", &GenLlmReq::getContextRemainingLength)
+        .def_prop_ro("context_logits", &GenLlmReq::getContextLogitsHost)
+        .def_prop_ro("num_draft_tokens", &GenLlmReq::getNumDraftTokens)
+        .def("set_finished_reason", &GenLlmReq::setFinishedReason, nb::arg("finish_reason"), nb::arg("beam"))
+        .def_prop_ro("is_finished", &GenLlmReq::isFinished)
+        .def_prop_ro("is_finished_due_to_length", &GenLlmReq::isFinishedDueToLength)
+        .def_prop_rw(
+            "context_current_position", &GenLlmReq::getContextCurrentPosition, &GenLlmReq::setContextCurrentPosition)
+        .def_prop_ro("prepopulated_prompt_len", &GenLlmReq::getPrepopulatedPromptLen)
+        .def_prop_rw("guided_decoding_params", &GenLlmReq::getGuidedDecodingParams, &GenLlmReq::setGuidedDecodingParams)
+        .def_prop_ro("context_phase_params", &GenLlmReq::getContextPhaseParams)
+        .def_prop_ro("is_context_only_request", &GenLlmReq::isContextOnlyRequest)
+        .def_prop_ro("is_generation_only_request", &GenLlmReq::isGenerationOnlyRequest)
+        .def_prop_ro("is_generation_complete_state", &GenLlmReq::isGenerationCompleteState)
+        .def_prop_ro("is_context_finished", &GenLlmReq::isContextFinished)
+        .def_prop_ro("is_disagg_generation_init_state", &GenLlmReq::isDisaggGenerationInitState)
+        .def_prop_ro("is_disagg_generation_transmission_complete", &GenLlmReq::isDisaggGenerationTransmissionComplete)
+        .def_prop_ro(
+            "is_disagg_generation_transmission_in_progress", &GenLlmReq::isDisaggGenerationTransmissionInProgress)
+        .def_prop_ro("is_context_init_state", &GenLlmReq::isContextInitState)
+        .def_prop_ro("is_generation_in_progress_state", &GenLlmReq::isGenerationInProgressState)
+        .def_prop_ro("is_disagg_context_transmission_state", &GenLlmReq::isDisaggContextTransmissionState)
+        .def_prop_ro("is_disagg_context_complete_state", &GenLlmReq::isDisaggContextCompleteState)
+        .def_prop_ro("stage", &GenLlmReq::getRequestStage)
+        .def_prop_ro("kv_cache_transfer_time_ms", &GenLlmReq::getKvCacheTransferTimeMS)
+        .def_prop_ro("kv_cache_size", &GenLlmReq::getKvCacheSize)
+        .def_prop_ro("avg_decoded_tokens_per_iter", &GenLlmReq::getAvgDecodedTokensPerIter)
+        .def_prop_ro("alloc_total_blocks", &GenLlmReq::getAllocTotalBlocksPerRequest)
+        .def_prop_ro("alloc_new_blocks", &GenLlmReq::getAllocNewBlocksPerRequest)
+        .def("alloc_context_logits", &GenLlmReq::allocContextLogitsHost, nb::arg("vocab_size"), nb::arg("logit_dtype"))
+        .def_prop_ro("reused_blocks", &GenLlmReq::getReusedBlocksPerRequest)
+        .def_prop_ro("missed_blocks", &GenLlmReq::getMissedBlocksPerRequest)
+        .def_prop_ro("kv_cache_hit_rate", &GenLlmReq::getKVCacheHitRatePerRequest)
+        .def_prop_ro("llm_request_type", &GenLlmReq::getLlmRequestType)
+        .def_prop_ro("multimodal_hashes",
+            [](GenLlmReq& self)
+            {
+                std::optional<std::vector<std::vector<GenLlmReq::SizeType32>>> hashes = std::nullopt;
+                if (self.getMultimodalHashes())
+                {
+                    hashes = *self.getMultimodalHashes().value();
+                }
+                return hashes;
+            })
+        .def_prop_ro("multimodal_positions",
+            [](GenLlmReq& self)
+            {
+                std::optional<std::vector<GenLlmReq::SizeType32>> positions = std::nullopt;
+                if (self.getMultimodalPositions())
+                {
+                    positions = *self.getMultimodalPositions().value();
+                }
+                return positions;
+            })
+        .def_prop_ro("multimodal_lengths",
+            [](GenLlmReq& self)
+            {
+                std::optional<std::vector<GenLlmReq::SizeType32>> lengths = std::nullopt;
+                if (self.getMultimodalLengths())
+                {
+                    lengths = *self.getMultimodalLengths().value();
+                }
+                return lengths;
+            })
+        .def_prop_ro("position_ids",
+            [](GenLlmReq& self)
+            {
+                std::optional<std::vector<GenLlmReq::SizeType32>> positionIds = std::nullopt;
+                if (self.getPositionIds())
+                {
+                    positionIds = *self.getPositionIds().value();
+                }
+                return positionIds;
+            })
+        .def_prop_rw(
+            "draft_tokens",
+            [](GenLlmReq& self)
+            {
+                std::optional<GenLlmReq::VecTokens> draftTokens = std::nullopt;
+                if (self.hasDraftTokens())
+                {
+                    draftTokens = *self.getDraftTokens();
+                }
+                return draftTokens;
+            },
+            [](GenLlmReq& self, std::optional<GenLlmReq::VecTokens> const& draftTokens)
+            {
+                if (draftTokens)
+                {
+                    self.setDraftTokens(std::make_shared<GenLlmReq::VecTokens>(draftTokens.value()));
+                }
+            })
+        .def_prop_rw("is_dummy_request", &GenLlmReq::isDummyRequest, &GenLlmReq::setIsDummyRequest)
+        .def_prop_ro("return_perf_metrics", &GenLlmReq::getReturnPerfMetrics);
+
+    nb::class_<tb::LlmRequest, GenLlmReq>(m, "LlmRequest", nb::dynamic_attr())
+        .def(
+            "__init__",
+            [](tb::LlmRequest* self, tb::LlmRequest::RequestIdType request_id,
+                tb::LlmRequest::SizeType32 max_new_tokens, std::vector<tb::LlmRequest::TokenIdType> input_tokens,
+                runtime::SamplingConfig sampling_config, bool is_streaming,
+                std::optional<tb::LlmRequest::SizeType32> end_id, std::optional<tb::LlmRequest::SizeType32> pad_id,
+                std::optional<at::Tensor> embedding_bias, std::optional<at::Tensor> bad_words_list,
+                std::optional<at::Tensor> stop_words_list,
+                std::optional<std::vector<tb::LlmRequest::SizeType32>> position_ids,
+                std::optional<at::Tensor> prompt_embedding_table,
+                std::optional<tb::LlmRequest::SizeType32> prompt_vocab_size,
+                std::optional<std::vector<std::vector<tb::LlmRequest::SizeType32>>> multimodal_hashes,
+                std::optional<std::vector<tb::LlmRequest::SizeType32>> multimodal_positions,
+                std::optional<std::vector<tb::LlmRequest::SizeType32>> multimodal_lengths,
+                std::optional<at::Tensor> multimodal_embedding, std::optional<at::Tensor> mrope_rotary_cos_sin,
+                std::optional<tb::LlmRequest::SizeType32> mrope_position_deltas,
+                std::optional<LoraTaskIdType> lora_task_id, std::optional<at::Tensor> lora_weights,
+                std::optional<at::Tensor> lora_config,
+                std::optional<executor::LookaheadDecodingConfig> lookahead_config,
+                std::optional<executor::KvCacheRetentionConfig> kv_cache_retention_config, bool return_log_probs,
+                bool return_context_logits, bool return_generation_logits,
+                std::optional<tb::LlmRequest::VecTokens> draft_tokens, std::optional<at::Tensor> draft_logits,
+                bool exclude_input_from_output,
+                std::optional<tb::LlmRequest::LogitsPostProcessor> logits_post_processor,
+                bool apply_logits_post_processor_batched, std::optional<tb::LlmRequest::VecTokens> encoder_input_tokens,
+                bool return_encoder_output, std::optional<tb::LlmRequest::RequestIdType> client_id,
+                executor::PriorityType priority, std::optional<at::Tensor> encoder_input_features,
+                std::optional<tb::LlmRequest::SizeType32> encoder_output_length,
+                std::optional<at::Tensor> cross_attention_mask, tb::LlmRequestType llm_request_type,
+                std::optional<tb::LlmRequest::VecTokenExtraIds> input_token_extra_ids,
+                tb::LlmRequest::SizeType32 num_return_sequences, std::optional<executor::EagleConfig> eagle_config,
+                std::optional<at::Tensor> skip_cross_attn_blocks, bool return_perf_metrics,
+                std::optional<executor::GuidedDecodingParams> guided_decoding_params,
+                std::optional<tb::LlmRequest::SizeType32> language_adapter_uid,
+                std::optional<tb::LlmRequest::MillisecondsType> allotted_time_ms,
+                std::optional<executor::ContextPhaseParams> context_phase_params)
+            {
+                auto makeOptionalTensor = [](std::optional<at::Tensor> const& atTensor, bool unsqueeze = false)
+                {
+                    std::optional<tb::LlmRequest::TensorPtr> tensorPtr = std::nullopt;
+                    if (atTensor)
+                    {
+                        tensorPtr = tr::TorchView::of(atTensor.value());
+                        if (unsqueeze)
+                        {
+                            (*tensorPtr)->unsqueeze(0);
+                        }
+                    }
+                    return tensorPtr;
+                };
+
+                auto embedding_bias_tensor_ptr = makeOptionalTensor(embedding_bias, true);
+                auto bad_words_list_tensor_ptr = makeOptionalTensor(bad_words_list, true);
+                auto stop_words_list_tensor_ptr = makeOptionalTensor(stop_words_list, true);
+                auto prompt_embedding_table_tensor_ptr = makeOptionalTensor(prompt_embedding_table);
+                auto multimodal_embedding_tensor_ptr = makeOptionalTensor(multimodal_embedding);
+                auto lora_weights_tensor_ptr = makeOptionalTensor(lora_weights);
+                auto mrope_rotary_cos_sin_tensor_ptr = makeOptionalTensor(mrope_rotary_cos_sin);
+                auto lora_config_tensor_ptr = makeOptionalTensor(lora_config);
+                auto draft_logits_tensor_ptr = makeOptionalTensor(draft_logits);
+                auto encoder_input_features_tensor_ptr = makeOptionalTensor(encoder_input_features);
+                auto cross_attention_mask_tensor_ptr = makeOptionalTensor(cross_attention_mask);
+                auto skip_cross_attn_blocks_tensor_ptr = makeOptionalTensor(skip_cross_attn_blocks);
+
+                // 49 parameters
+                new (self) tb::LlmRequest{request_id, max_new_tokens, input_tokens, sampling_config, is_streaming,
+                    end_id, pad_id, embedding_bias_tensor_ptr, bad_words_list_tensor_ptr, stop_words_list_tensor_ptr,
+                    position_ids, prompt_embedding_table_tensor_ptr, prompt_vocab_size, multimodal_hashes,
+                    multimodal_positions, multimodal_lengths, multimodal_embedding_tensor_ptr,
+                    mrope_rotary_cos_sin_tensor_ptr, mrope_position_deltas, lora_task_id, lora_weights_tensor_ptr,
+                    lora_config_tensor_ptr, lookahead_config, kv_cache_retention_config, return_log_probs,
+                    return_context_logits, return_generation_logits, draft_tokens, draft_logits_tensor_ptr,
+                    exclude_input_from_output, logits_post_processor, apply_logits_post_processor_batched,
+                    encoder_input_tokens, return_encoder_output, client_id, priority, encoder_input_features_tensor_ptr,
+                    encoder_output_length, cross_attention_mask_tensor_ptr, llm_request_type, input_token_extra_ids,
+                    num_return_sequences, eagle_config, skip_cross_attn_blocks_tensor_ptr, return_perf_metrics,
+                    guided_decoding_params, language_adapter_uid, allotted_time_ms, context_phase_params};
+            },
+            nb::arg("request_id"), nb::arg("max_new_tokens"), nb::arg("input_tokens"), nb::arg("sampling_config"),
+            nb::arg("is_streaming"), nb::arg("end_id") = std::nullopt, nb::arg("pad_id") = std::nullopt,
+            nb::arg("embedding_bias") = std::nullopt, nb::arg("bad_words_list") = std::nullopt,
+            nb::arg("stop_words_list") = std::nullopt, nb::arg("position_ids") = std::nullopt,
+            nb::arg("prompt_embedding_table") = std::nullopt, nb::arg("prompt_vocab_size") = std::nullopt,
+            nb::arg("multimodal_hashes") = std::nullopt, nb::arg("multimodal_positions") = std::nullopt,
+            nb::arg("multimodal_lengths") = std::nullopt, nb::arg("multimodal_embedding") = std::nullopt,
+            nb::arg("mrope_rotary_cos_sin") = std::nullopt, nb::arg("mrope_position_deltas") = std::nullopt,
+            nb::arg("lora_task_id") = std::nullopt, nb::arg("lora_weights") = std::nullopt,
+            nb::arg("lora_config") = std::nullopt, nb::arg("lookahead_config") = std::nullopt,
+            nb::arg("kv_cache_retention_config") = std::nullopt, nb::arg("return_log_probs") = false,
+            nb::arg("return_context_logits") = false, nb::arg("return_generation_logits") = false,
+            nb::arg("draft_tokens") = std::nullopt, nb::arg("draft_logits") = std::nullopt,
+            nb::arg("exclude_input_from_output") = false, nb::arg("logits_post_processor") = std::nullopt,
+            nb::arg("apply_logits_post_processor_batched") = false, nb::arg("encoder_input_tokens") = std::nullopt,
+            nb::arg("return_encoder_output") = false, nb::arg("client_id") = std::nullopt,
+            nb::arg("priority") = executor::Request::kDefaultPriority, nb::arg("encoder_input_features") = std::nullopt,
+            nb::arg("encoder_output_len") = std::nullopt, nb::arg("cross_attention_mask") = std::nullopt,
+            nb::arg("llm_request_type") = tb::LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION,
+            nb::arg("input_token_extra_ids") = std::nullopt, nb::arg("num_return_sequences") = 1,
+            nb::arg("eagle_config") = std::nullopt, nb::arg("skip_cross_attn_blocks") = std::nullopt,
+            nb::arg("return_perf_metrics") = false, nb::arg("guided_decoding_params") = std::nullopt,
+            nb::arg("language_adapter_uid") = std::nullopt, nb::arg("allotted_time_ms") = std::nullopt,
+            nb::arg("context_phase_params") = std::nullopt)
+        .def("validate", &tb::LlmRequest::validate, nb::arg("max_input_len"), nb::arg("max_seq_len"),
+            nb::arg("max_draft_len"), nb::arg("vocab_size_padded"), nb::arg("max_endocer_input_len") = std::nullopt,
+            nb::arg("enable_kv_cache_reuse") = false)
+        .def("create_response", &tb::LlmRequest::createResponse, nb::arg("use_fast_logits") = false,
+            nb::arg("mpi_world_rank") = 0)
+        .def("create_result", &tb::LlmRequest::createResult, nb::arg("use_fast_logits") = false,
+            nb::arg("mpi_world_rank") = 0)
+        .def("create_serialized_result",
+            [](tb::LlmRequest& self, bool use_fast_logits = false, int mpi_world_rank = 0)
+            {
+                std::vector<char> serialized_result;
+                bool is_final = false;
+                self.createSerializedResult(serialized_result, is_final, use_fast_logits, mpi_world_rank);
+                return std::make_tuple(nb::bytes(serialized_result.data(), serialized_result.size()), is_final);
+            })
+        .def("move_prompt_embedding_table_to_gpu", &tb::LlmRequest::movePromptEmbeddingTableToGpu, nb::arg("manager"))
+        .def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, nb::arg("manager"))
+        .def("finish_by_reason", &tb::LlmRequest::finishByReason, nb::arg("finish_reason"))
+        .def("set_first_scheduled_time", &tb::LlmRequest::setFirstScheduledTime)
+        .def("update_perf_metrics", &tb::LlmRequest::updatePerfMetrics, nb::arg("iter_counter"));
+
+    nb::class_<tb::SequenceSlotManager>(m, "SequenceSlotManager")
+        .def(nb::init<tb::SequenceSlotManager::SlotIdType, uint64_t>(), nb::arg("max_num_slots"),
+            nb::arg("max_sequence_idle_microseconds"))
+        .def("get_sequence_slot", &tb::SequenceSlotManager::getSequenceSlot, nb::arg("start_flag"),
+            nb::arg("sequence_id"))
+        .def("free_sequence_slot", &tb::SequenceSlotManager::freeSequenceSlot, nb::arg("sequence_id"))
+        .def("free_idle_sequence_slots", &tb::SequenceSlotManager::freeIdleSequenceSlots);
+
+    nb::class_<tb::rnn_state_manager::RnnStateManager>(m, "RnnStateManager")
+        .def(nb::init<tr::SizeType32, tr::ModelConfig, tr::WorldConfig, tr::BufferManager>(),
+            nb::arg("max_num_sequences"), nb::arg("model_config"), nb::arg("world_config"), nb::arg("buffer_manager"));
+
+    nb::class_<tb::DecoderInputBuffers>(m, "DecoderInputBuffers")
+        .def(nb::init<runtime::SizeType32, runtime::SizeType32, runtime::SizeType32, tr::BufferManager>(),
+            nb::arg("max_num_sequences"), nb::arg("max_batch_size"), nb::arg("max_tokens_per_engine_step"),
+            nb::arg("manager"))
+        .def_rw("setup_batch_slots", &tb::DecoderInputBuffers::setupBatchSlots)
+        .def_rw("setup_batch_slots_device", &tb::DecoderInputBuffers::setupBatchSlotsDevice)
+        .def_rw("fill_values", &tb::DecoderInputBuffers::fillValues)
+        .def_rw("fill_values_device", &tb::DecoderInputBuffers::fillValuesDevice)
+        .def_rw("inputs_ids", &tb::DecoderInputBuffers::inputsIds)
+        .def_rw("forward_batch_slots", &tb::DecoderInputBuffers::forwardBatchSlots)
+        .def_rw("logits", &tb::DecoderInputBuffers::logits);
+
+    nb::class_<tb::DecoderOutputBuffers>(m, "DecoderOutputBuffers")
+        .def_rw("sequence_lengths_host", &tb::DecoderOutputBuffers::sequenceLengthsHost)
+        .def_rw("finished_sum_host", &tb::DecoderOutputBuffers::finishedSumHost)
+        .def_prop_ro("new_output_tokens_host",
+            [](tb::DecoderOutputBuffers& self) { return tr::Torch::tensor(self.newOutputTokensHost); })
+        .def_rw("cum_log_probs_host", &tb::DecoderOutputBuffers::cumLogProbsHost)
+        .def_rw("log_probs_host", &tb::DecoderOutputBuffers::logProbsHost)
+        .def_rw("finish_reasons_host", &tb::DecoderOutputBuffers::finishReasonsHost);
+
+    nb::class_<tb::SlotDecoderBuffers>(m, "SlotDecoderBuffers")
+        .def(nb::init<runtime::SizeType32, runtime::SizeType32, runtime::BufferManager const&>(),
+            nb::arg("max_beam_width"), nb::arg("max_seq_len"), nb::arg("buffer_manager"))
+        .def_rw("output_ids", &tb::SlotDecoderBuffers::outputIds)
+        .def_rw("output_ids_host", &tb::SlotDecoderBuffers::outputIdsHost)
+        .def_rw("sequence_lengths_host", &tb::SlotDecoderBuffers::sequenceLengthsHost)
+        .def_rw("cum_log_probs", &tb::SlotDecoderBuffers::cumLogProbs)
+        .def_rw("cum_log_probs_host", &tb::SlotDecoderBuffers::cumLogProbsHost)
+        .def_rw("log_probs", &tb::SlotDecoderBuffers::logProbs)
+        .def_rw("log_probs_host", &tb::SlotDecoderBuffers::logProbsHost)
+        .def_rw("finish_reasons_host", &tb::SlotDecoderBuffers::finishReasonsHost);
+
+    nb::class_<tb::MedusaBuffers>(m, "MedusaBuffers")
+        .def(nb::init<runtime::SizeType32, runtime::SizeType32, runtime::BufferManager const&,
+                 runtime::ModelConfig const&, runtime::WorldConfig const&, executor::DecodingConfig const&,
+                 runtime::TllmRuntime const&>(),
+            nb::arg("max_beam_width"), nb::arg("max_seq_len"), nb::arg("buffer_manager"), nb::arg("model_config"),
+            nb::arg("world_config"), nb::arg("decoding_config"), nb::arg("runtime"));
+
+    m.def(
+        "add_new_tokens_to_requests",
+        [](std::vector<std::shared_ptr<tb::LlmRequest>>& requests,
+            std::vector<tb::LlmRequest::TokenIdType> const& tokens, int beam_idx)
+        {
+            TLLM_CHECK_WITH_INFO(requests.size() == tokens.size(), "Expected the same number of requests and tokens.");
+
+            for (int i = 0; i < requests.size(); ++i)
+            {
+                requests[i]->addNewToken(tokens[i], beam_idx);
+            }
+        },
+        nb::arg("requests"), nb::arg("tokens"), nb::arg("beam_idx"),
+        "Add new tokens to multiple LLM requests. The tokens vector should contain tokens for beam beam_idx of all "
+        "requests in order.");
+
+    m.def(
+        "make_decoding_batch_input",
+        [](std::vector<std::shared_ptr<tb::LlmRequest>>& contextRequests,
+            std::vector<std::shared_ptr<tb::LlmRequest>>& genRequests, tr::ITensor::SharedPtr logits, int beamWidth,
+            std::vector<int> const& numContextLogitsPrefixSum, tb::DecoderInputBuffers const& decoderInputBuffers,
+            runtime::decoder::DecoderState& decoderState, tr::BufferManager const& manager)
+        {
+            std::vector<int> activeSlots;
+            std::vector<int> generationSteps;
+            std::vector<std::vector<tr::ITensor::SharedConstPtr>> logitsVec = {{}};
+
+            for (int i = 0; i < contextRequests.size(); ++i)
+            {
+                if (contextRequests[i]->isLastContextChunk())
+                {
+                    activeSlots.push_back(*contextRequests[i]->mSeqSlot);
+                    generationSteps.push_back(contextRequests[i]->getDecodingIter());
+                    auto contextLogitsOffset = numContextLogitsPrefixSum[i + 1] - 1;
+                    tr::ITensor::SharedPtr logitsView = ITensor::slice(logits, contextLogitsOffset, 1);
+
+                    if (beamWidth > 1)
+                    {
+                        // Tile logits of context requests
+                        auto const logitsShape = logitsView->getShape();
+                        auto const logitsType = logitsView->getDataType();
+                        auto decoderLogits = manager.gpu(ITensor::makeShape({beamWidth, logitsShape.d[1]}), logitsType);
+                        tensorrt_llm::runtime::kernels::tileTensor(
+                            *decoderLogits, *logitsView, beamWidth, manager.getStream());
+                        decoderLogits->unsqueeze(0);
+                        logitsVec[0].push_back(std::move(decoderLogits));
+                    }
+                    else
+                    {
+                        logitsView->unsqueeze(1);
+                        logitsVec[0].push_back(std::move(logitsView));
+                    }
+                }
+            }
+
+            auto genLogitsOffset = numContextLogitsPrefixSum.back();
+            for (int i = 0; i < genRequests.size(); ++i)
+            {
+                if (genRequests[i]->isGenerationInProgressState())
+                {
+                    activeSlots.push_back(*genRequests[i]->mSeqSlot);
+                    generationSteps.push_back(genRequests[i]->getDecodingIter());
+
+                    auto logitsOffset = genLogitsOffset + i * beamWidth;
+                    auto numberOfLogits = beamWidth;
+                    tr::ITensor::SharedPtr logitsView = ITensor::slice(logits, logitsOffset, numberOfLogits);
+                    logitsView->unsqueeze(0);
+                    logitsVec[0].push_back(std::move(logitsView));
+                }
+            }
+
+            auto& batchSlots = decoderInputBuffers.forwardBatchSlots;
+            batchSlots[0]->resize(activeSlots.size());
+            auto batchSlotsRange = tr::BufferRange<SizeType32>(*batchSlots[0]);
+            for (int i = 0; i < activeSlots.size(); ++i)
+            {
+                batchSlotsRange[i] = activeSlots[i];
+            }
+
+            auto decodingInput = std::make_unique<tr::decoder_batch::Input>(logitsVec, 1);
+            decodingInput->batchSlots = batchSlots;
+
+            auto const maxBeamWidth = decoderState.getMaxBeamWidth();
+            if (maxBeamWidth > 1)
+            {
+                // For Variable-Beam-Width-Search
+                decoderState.getJointDecodingInput().generationSteps = generationSteps;
+            }
+
+            return decodingInput;
+        },
+        nb::arg("context_requests"), nb::arg("generation_requests"), nb::arg("logits"), nb::arg("beam_width"),
+        nb::arg("num_context_logits_prefix_sum"), nb::arg("decoder_input_buffers"), nb::arg("decoder_state"),
+        nb::arg("buffer_manager"), "Make decoding batch input.");
+}
+
+} // namespace tensorrt_llm::nanobind::batch_manager
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/bindings.h b/cpp/tensorrt_llm/nanobind/batch_manager/bindings.h
new file mode 100644
index 00000000000..3d5a0f5d5b2
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/batch_manager/bindings.h
@@ -0,0 +1,28 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <nanobind/nanobind.h>
+namespace nb = nanobind;
+
+namespace tensorrt_llm::nanobind::batch_manager
+{
+
+void initBindings(nb::module_& m);
+
+}
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/buffers.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/buffers.cpp
new file mode 100644
index 00000000000..b6edcca1c24
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/batch_manager/buffers.cpp
@@ -0,0 +1,108 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "buffers.h"
+#include "tensorrt_llm/nanobind/common/customCasters.h"
+
+#include "tensorrt_llm/batch_manager/kvCacheManager.h"
+#include "tensorrt_llm/batch_manager/runtimeBuffers.h"
+#include "tensorrt_llm/batch_manager/transformerBuffers.h"
+
+#include <ATen/ATen.h>
+#include <nanobind/nanobind.h>
+#include <nanobind/stl/optional.h>
+#include <nanobind/stl/shared_ptr.h>
+#include <nanobind/stl/vector.h>
+#include <torch/extension.h>
+
+namespace nb = nanobind;
+namespace tb = tensorrt_llm::batch_manager;
+namespace tr = tensorrt_llm::runtime;
+
+using tr::SizeType32;
+
+namespace tensorrt_llm::nanobind::batch_manager
+{
+
+void Buffers::initBindings(nb::module_& m)
+{
+    nb::class_<tb::TransformerBuffers>(m, "TransformerBuffers")
+        .def(nb::init<SizeType32, SizeType32, std::vector<SizeType32> const&, SizeType32, SizeType32,
+                 runtime::TllmRuntime const&, runtime::ModelConfig const&, runtime::WorldConfig const&>(),
+            nb::arg("max_batch_size"), nb::arg("max_beam_width"), nb::arg("max_attention_window_vec"),
+            nb::arg("max_attention_window"), nb::arg("sink_token_len"), nb::arg("runtime"), nb::arg("model_config"),
+            nb::arg("world_config"))
+        .def("reshape", &tb::TransformerBuffers::reshape, nb::arg("num_sequences"), nb::arg("num_input_tokens"))
+        .def("reshape_kv_tensors", &tb::TransformerBuffers::reshapeKvTensors, nb::arg("max_batch_size"),
+            nb::arg("max_beam_width"), nb::arg("max_blocks_per_seq"), nb::arg("kv_cache_type"), nb::arg("num_pools"),
+            nb::arg("buffer_manager"))
+        .def("get_buffers", &tb::TransformerBuffers::getBuffers, nb::arg("input_buffers"), nb::arg("output_buffers"),
+            nb::arg("model_config"))
+        .def("copy_position_ids", &tb::TransformerBuffers::copyPositionIds, nb::arg("runtime"),
+            nb::arg("position_ids_host"), nb::arg("is_chat_glm"), nb::arg("decoder_position_ids"))
+        .def("copy_kv_block_offsets", &tb::TransformerBuffers::copyKvBlockOffsets, nb::arg("context_requests"),
+            nb::arg("gen_requests"), nb::arg("kv_cache_manager"), nb::arg("cross_kv_cache_manager"),
+            nb::arg("buffer_manager"))
+        .def("copy_cache_indirection", &tb::TransformerBuffers::copyCacheIndirection, nb::arg("gen_requests"),
+            nb::arg("decoder_cache_indirection_output"), nb::arg("runtime"))
+        .def_rw("past_key_value_lengths", &tb::TransformerBuffers::pastKeyValueLengths)
+        .def_rw("position_ids", &tb::TransformerBuffers::positionIds)
+        .def_rw("max_attention_windows", &tb::TransformerBuffers::maxAttentionWindows)
+        .def_rw("sink_token_lengths", &tb::TransformerBuffers::sinkTokenLengths)
+        .def_rw("cache_indirection", &tb::TransformerBuffers::cacheIndirection)
+        .def_rw("kv_cache_block_offsets_host", &tb::TransformerBuffers::kvCacheBlockOffsetsHost)
+        .def_rw("kv_cache_block_offsets_device", &tb::TransformerBuffers::kvCacheBlockOffsetsDevice)
+        .def_rw("cross_kv_cache_block_pool_pointers", &tb::TransformerBuffers::crossKvCacheBlockPoolPointers)
+        .def_rw("cross_kv_cache_block_offsets_host", &tb::TransformerBuffers::crossKvCacheBlockOffsetsHost)
+        .def_rw("cross_kv_cache_block_offsets_device", &tb::TransformerBuffers::crossKvCacheBlockOffsetsDevice)
+        .def_rw("cache_indir_batched_copy_src_offsets", &tb::TransformerBuffers::cacheIndirBatchedCopySrcOffsets)
+        .def_rw("cache_indir_batched_copy_dst_offsets", &tb::TransformerBuffers::cacheIndirBatchedCopyDstOffsets)
+        .def_rw("cache_indir_batched_copy_sizes", &tb::TransformerBuffers::cacheIndirBatchedCopySizes)
+        .def_rw("fill_values_alt", &tb::TransformerBuffers::fillValuesAlt)
+        .def_rw("fill_values_alt_device", &tb::TransformerBuffers::fillValuesAltDevice)
+        .def_rw("seq_slots_alt", &tb::TransformerBuffers::seqSlotsAlt)
+        .def_rw("seq_slots_alt_device", &tb::TransformerBuffers::seqSlotsAltDevice);
+
+    nb::class_<tb::RuntimeBuffers>(m, "RuntimeBuffers")
+        .def(nb::init<SizeType32, SizeType32, std::vector<SizeType32> const&, SizeType32, SizeType32,
+                 runtime::TllmRuntime const&, runtime::ModelConfig const&, runtime::WorldConfig const&,
+                 executor::DecodingConfig const&, bool, std::optional<SizeType32>>(),
+            nb::arg("max_batch_size"), nb::arg("max_beam_width"), nb::arg("max_attention_window_vec"),
+            nb::arg("max_attention_window"), nb::arg("sink_token_len"), nb::arg("runtime"), nb::arg("model_config"),
+            nb::arg("world_config"), nb::arg("decoding_config"), nb::arg("gather_generation_logits"),
+            nb::arg("max_num_tokens") = std::nullopt)
+        .def_prop_rw(
+            "transformer_buffers", [](tb::RuntimeBuffers& self) { return self.transformerBuffers; },
+            [](tb::RuntimeBuffers& self, std::shared_ptr<tb::TransformerBuffers> val)
+            { self.transformerBuffers = val; })
+        .def_rw("num_context_logits", &tb::RuntimeBuffers::numContextLogits)
+        .def_rw("cache_indir_decoder_io_batched_copy_src_offsets",
+            &tb::RuntimeBuffers::cacheIndirDecoderIOBatchedCopySrcOffsets)
+        .def_rw("cache_indir_decoder_io_batched_copy_dst_offsets",
+            &tb::RuntimeBuffers::cacheIndirDecoderIOBatchedCopyDstOffsets)
+        .def_rw("cache_indir_decoder_io_batched_copy_sizes", &tb::RuntimeBuffers::cacheIndirDecoderIOBatchedCopySizes)
+        .def_rw("logits", &tb::RuntimeBuffers::logits)
+        .def_rw("seq_slots", &tb::RuntimeBuffers::seqSlots)
+        .def_rw("seq_slots_device", &tb::RuntimeBuffers::seqSlotsDevice)
+        .def_rw("cache_indir_decoder_io_batched_copy_src_offsets_slice_device",
+            &tb::RuntimeBuffers::mCacheIndirDecoderIOBatchedCopySrcOffsetsSliceDevice)
+        .def_rw("cache_indir_decoder_io_batched_copy_dst_offsets_slice_device",
+            &tb::RuntimeBuffers::mCacheIndirDecoderIOBatchedCopyDstOffsetsSliceDevice)
+        .def_rw("cache_indir_decoder_io_batched_copy_copy_sizes_device",
+            &tb::RuntimeBuffers::mCacheIndirDecoderIOBatchedCopyCopySizesDevice);
+}
+} // namespace tensorrt_llm::nanobind::batch_manager
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/buffers.h b/cpp/tensorrt_llm/nanobind/batch_manager/buffers.h
new file mode 100644
index 00000000000..34df07e4073
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/batch_manager/buffers.h
@@ -0,0 +1,29 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include <nanobind/nanobind.h>
+namespace nb = nanobind;
+
+namespace tensorrt_llm::nanobind::batch_manager
+{
+class Buffers
+{
+public:
+    static void initBindings(nb::module_& m);
+};
+} // namespace tensorrt_llm::nanobind::batch_manager
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/cacheTransceiver.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/cacheTransceiver.cpp
new file mode 100644
index 00000000000..abac6d17ed8
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/batch_manager/cacheTransceiver.cpp
@@ -0,0 +1,110 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "cacheTransceiver.h"
+#include "tensorrt_llm/batch_manager/cacheTransceiver.h"
+#include "tensorrt_llm/batch_manager/kvCacheManager.h"
+#include "tensorrt_llm/executor/executor.h"
+#include "tensorrt_llm/nanobind/common/customCasters.h"
+#include <ATen/ATen.h>
+#include <nanobind/nanobind.h>
+#include <nanobind/stl/optional.h>
+#include <nanobind/stl/shared_ptr.h>
+#include <nanobind/stl/unique_ptr.h>
+#include <nanobind/trampoline.h>
+#include <torch/extension.h>
+
+using SizeType32 = tensorrt_llm::runtime::SizeType32;
+
+namespace tb = tensorrt_llm::batch_manager;
+namespace nb = nanobind;
+
+namespace
+{
+
+class PyCacheTransceiver : public tb::BaseCacheTransceiver
+{
+public:
+    // using BaseCacheTransceiver::BaseCacheTransceiver; // Inherit constructors
+    NB_TRAMPOLINE(tb::BaseCacheTransceiver, 6);
+
+    void respondAndSendAsync(tb::LlmRequest* llmRequest) override
+    {
+        NB_OVERRIDE_PURE(respondAndSendAsync, llmRequest);
+    }
+
+    void requestAndReceiveSync(tb::LlmRequest* llmRequest) override
+    {
+        NB_OVERRIDE_PURE(requestAndReceiveSync, llmRequest);
+    }
+
+    void requestAndReceiveAsync(tb::LlmRequest* llmRequest) override
+    {
+        NB_OVERRIDE_PURE(requestAndReceiveAsync, llmRequest);
+    }
+
+    void checkContextTransferStatus(std::optional<int> const& atLeastRequestNum = std::nullopt) override
+    {
+        NB_OVERRIDE_PURE(checkContextTransferStatus, atLeastRequestNum);
+    }
+
+    void checkGenTransferStatus(std::optional<int> const& atLeastRequestNum = std::nullopt) override
+    {
+        NB_OVERRIDE_PURE(checkGenTransferStatus, atLeastRequestNum);
+    }
+
+    bool checkGenTransferComplete() const override
+    {
+        NB_OVERRIDE_PURE(checkGenTransferComplete);
+    }
+};
+} // namespace
+
+void tb::CacheTransceiverBindings::initBindings(nb::module_& m)
+{
+    nb::class_<tb::BaseCacheTransceiver, PyCacheTransceiver>(m, "BaseCacheTransceiver")
+        .def("respond_and_send_async", &BaseCacheTransceiver::respondAndSendAsync)
+        .def("request_and_receive_sync", &BaseCacheTransceiver::requestAndReceiveSync)
+        .def("request_and_receive_async", &BaseCacheTransceiver::requestAndReceiveAsync)
+        .def("check_context_transfer_status", &BaseCacheTransceiver::checkContextTransferStatus)
+        .def("check_gen_transfer_status", &BaseCacheTransceiver::checkGenTransferStatus)
+        .def("check_gen_transfer_complete", &BaseCacheTransceiver::checkGenTransferComplete);
+
+    nb::enum_<tb::CacheTransceiver::CommType>(m, "CommType")
+        .value("UNKNOWN", tb::CacheTransceiver::CommType::UNKNOWN)
+        .value("MPI", tb::CacheTransceiver::CommType::MPI)
+        .value("UCX", tb::CacheTransceiver::CommType::UCX)
+        .value("NIXL", tb::CacheTransceiver::CommType::NIXL);
+
+    nb::enum_<executor::kv_cache::CacheState::AttentionType>(m, "AttentionType")
+        .value("DEFAULT", executor::kv_cache::CacheState::AttentionType::kDEFAULT)
+        .value("MLA", executor::kv_cache::CacheState::AttentionType::kMLA);
+
+    nb::class_<tb::CacheTransceiver, tb::BaseCacheTransceiver>(m, "CacheTransceiver")
+        .def(nb::init<tb::kv_cache_manager::BaseKVCacheManager*, tb::CacheTransceiver::CommType,
+                 std::vector<SizeType32>, SizeType32, SizeType32, runtime::WorldConfig, nvinfer1::DataType,
+                 executor::kv_cache::CacheState::AttentionType, std::optional<executor::CacheTransceiverConfig>>(),
+            nb::arg("cache_manager"), nb::arg("comm_type"), nb::arg("num_kv_heads_per_layer"), nb::arg("size_per_head"),
+            nb::arg("tokens_per_block"), nb::arg("world_config"), nb::arg("dtype"), nb::arg("attention_type"),
+            nb::arg("cache_transceiver_config") = std::nullopt);
+
+    nb::class_<tb::kv_cache_manager::CacheTransBufferManager>(m, "CacheTransBufferManager")
+        .def(nb::init<tb::kv_cache_manager::BaseKVCacheManager*, std::optional<size_t>>(), nb::arg("cache_manager"),
+            nb::arg("max_num_tokens") = std::nullopt)
+        .def_static("pre_alloc_buffer_size", &tb::kv_cache_manager::CacheTransBufferManager::preAllocBufferSize,
+            nb::arg("max_num_tokens") = std::nullopt);
+}
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/cacheTransceiver.h b/cpp/tensorrt_llm/nanobind/batch_manager/cacheTransceiver.h
new file mode 100644
index 00000000000..90fc63d4fde
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/batch_manager/cacheTransceiver.h
@@ -0,0 +1,29 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+#include <nanobind/nanobind.h>
+namespace nb = nanobind;
+
+namespace tensorrt_llm::batch_manager
+{
+class CacheTransceiverBindings
+{
+public:
+    static void initBindings(nb::module_& m);
+};
+} // namespace tensorrt_llm::batch_manager
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp
new file mode 100644
index 00000000000..f1c398d31f0
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp
@@ -0,0 +1,478 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kvCacheManager.h"
+#include "tensorrt_llm/batch_manager/kvCacheManager.h"
+#include "tensorrt_llm/batch_manager/peftCacheManager.h"
+#include "tensorrt_llm/nanobind/common/bindTypes.h"
+#include "tensorrt_llm/nanobind/common/customCasters.h"
+#include "tensorrt_llm/runtime/torch.h"
+#include "tensorrt_llm/runtime/torchView.h"
+
+#include <ATen/ATen.h>
+#include <nanobind/nanobind.h>
+#include <nanobind/ndarray.h>
+#include <nanobind/operators.h>
+#include <nanobind/stl/bind_vector.h>
+#include <nanobind/stl/chrono.h>
+#include <nanobind/stl/map.h>
+#include <nanobind/stl/optional.h>
+#include <nanobind/stl/shared_ptr.h>
+#include <nanobind/stl/tuple.h>
+#include <nanobind/stl/unique_ptr.h>
+#include <nanobind/stl/vector.h>
+#include <nanobind/trampoline.h>
+#include <torch/extension.h>
+
+namespace tb = tensorrt_llm::batch_manager;
+namespace tbk = tensorrt_llm::batch_manager::kv_cache_manager;
+namespace tr = tensorrt_llm::runtime;
+namespace nb = nanobind;
+using BlockKey = tbk::BlockKey;
+using VecUniqueTokens = tensorrt_llm::runtime::VecUniqueTokens;
+using SizeType32 = tensorrt_llm::runtime::SizeType32;
+using TokenIdType = tensorrt_llm::runtime::TokenIdType;
+using VecTokens = std::vector<TokenIdType>;
+using CudaStreamPtr = std::shared_ptr<tensorrt_llm::runtime::CudaStream>;
+
+namespace
+{
+std::optional<tensorrt_llm::runtime::ITensor::UniquePtr> from_torch(std::optional<at::Tensor> torchPtr)
+{
+    if (torchPtr)
+    {
+        return tr::TorchView::of(torchPtr.value());
+    }
+    return std::nullopt;
+}
+
+class PyKvCacheManager : public tbk::BaseKVCacheManager
+{
+public:
+    NB_TRAMPOLINE(tbk::BaseKVCacheManager, 28);
+
+    // using BaseKVCacheManager::BaseKVCacheManager; // Inherit constructors
+    void allocatePools(bool useUvm = false) override
+    {
+        NB_OVERRIDE_PURE(allocatePools, useUvm);
+    }
+
+    void releasePools() override
+    {
+        NB_OVERRIDE_PURE(releasePools);
+    }
+
+    void startScheduling() override
+    {
+        NB_OVERRIDE_PURE(startScheduling);
+    }
+
+    SizeType32 getTokensPerBlock() const override
+    {
+        NB_OVERRIDE_PURE(getTokensPerBlock);
+    }
+
+    SizeType32 getMaxNumBlocks() const override
+    {
+        NB_OVERRIDE_PURE(getMaxNumBlocks);
+    }
+
+    SizeType32 getNumPools() const override
+    {
+        NB_OVERRIDE_PURE(getNumPools);
+    }
+
+    tbk::KvCacheStats getKvCacheStats() const override
+    {
+        NB_OVERRIDE_PURE(getKvCacheStats);
+    }
+
+    void addToken(tb::LlmRequest::RequestIdType requestId) override
+    {
+        NB_OVERRIDE_PURE(addToken, requestId);
+    }
+
+    void addSequence(tb::LlmRequest::RequestIdType requestId, SizeType32 inputLength, SizeType32 beamWidth,
+        tensorrt_llm::common::OptionalRef<tb::LlmRequest> llmRequest = std::nullopt) override
+    {
+        NB_OVERRIDE_PURE(addSequence, requestId, inputLength, beamWidth, llmRequest);
+    }
+
+    void removeSequence(tb::LlmRequest::RequestIdType requestId,
+        tensorrt_llm::common::OptionalRef<tb::LlmRequest const> llmRequest = std::nullopt) override
+    {
+        NB_OVERRIDE_PURE(removeSequence, requestId, llmRequest);
+    }
+
+    tbk::GenerationRequest const& getSequence(tb::LlmRequest::RequestIdType requestId) const override
+    {
+        NB_OVERRIDE_PURE(getSequence, requestId);
+    }
+
+    void schedulingRemoveSequence(tb::LlmRequest::RequestIdType requestId) override
+    {
+        NB_OVERRIDE_PURE(schedulingRemoveSequence, requestId);
+    }
+
+    tensorrt_llm::runtime::ITensor::SharedPtr getBlockPoolPointers() const override
+    {
+        NB_OVERRIDE_PURE(getBlockPoolPointers);
+    }
+
+    tensorrt_llm::runtime::ITensor::SharedPtr getLayerToPoolMapping() const override
+    {
+        NB_OVERRIDE_PURE(getLayerToPoolMapping);
+    }
+
+    void getBlockOffsetsOfBatch(tensorrt_llm::runtime::ITensor& output, SizeType32 firstBatchSlotIdx,
+        SizeType32 batchSize, SizeType32 beamWidth) const override
+    {
+        NB_OVERRIDE_PURE(getBlockOffsetsOfBatch, output, firstBatchSlotIdx, batchSize, beamWidth);
+    }
+
+    SizeType32 copyBlockOffsets(tensorrt_llm::runtime::ITensor& output, SizeType32 outputSlotOffset,
+        tb::LlmRequest::RequestIdType requestId) const override
+    {
+        NB_OVERRIDE_PURE(copyBlockOffsets, output, outputSlotOffset, requestId);
+    }
+
+    bool isEnableBlockReuse() const override
+    {
+        NB_OVERRIDE_PURE(isEnableBlockReuse);
+    }
+
+    void rewindKVCache(tb::LlmRequest::RequestIdType requestId, SizeType32 rewindLengths) override
+    {
+        NB_OVERRIDE_PURE(rewindKVCache, requestId, rewindLengths);
+    }
+
+    bool isCrossKv() const override
+    {
+        NB_OVERRIDE_PURE(isCrossKv);
+    }
+
+    std::optional<BlockKey> findNewContextBlock(
+        VecUniqueTokens const& uniqueTokens, tb::LlmRequest const& llmRequest) const override
+    {
+        NB_OVERRIDE_PURE(findNewContextBlock, uniqueTokens, llmRequest);
+    }
+
+    void storeContextBlocks(tb::LlmRequest const& llmRequest) override
+    {
+        NB_OVERRIDE_PURE(storeContextBlocks, llmRequest);
+    }
+
+    std::vector<std::vector<SizeType32>> const& getCacheBlockIds(
+        tb::LlmRequest::RequestIdType requestId, SizeType32 windowSize) const override
+    {
+        NB_OVERRIDE_PURE(getCacheBlockIds, requestId, windowSize);
+    }
+
+    std::vector<std::vector<std::vector<SizeType32>>> getBatchCacheBlockIds(
+        std::vector<tb::LlmRequest::RequestIdType> const& requestIds, SizeType32 windowSize) const override
+    {
+        NB_OVERRIDE_PURE(getBatchCacheBlockIds, requestIds, windowSize);
+    }
+
+    std::vector<SizeType32> getNewlyAllocatedBlockIds(
+        tb::LlmRequest::RequestIdType requestId, SizeType32 windowSize) const override
+    {
+        NB_OVERRIDE_PURE(getNewlyAllocatedBlockIds, requestId, windowSize);
+    }
+
+    SizeType32 getUsedNumBlocks() const override
+    {
+        NB_OVERRIDE_PURE(getUsedNumBlocks);
+    }
+
+    SizeType32 getNumFreeBlocks() const override
+    {
+        NB_OVERRIDE_PURE(getNumFreeBlocks);
+    }
+
+    tbk::BlockManager const& getBlockManager() const override
+    {
+        NB_OVERRIDE_PURE(getBlockManager);
+    }
+
+    std::deque<tensorrt_llm::executor::KVCacheEvent> getLatestEvents(
+        std::optional<std::chrono::milliseconds> timeout = std::nullopt) const override
+    {
+        NB_OVERRIDE_PURE(getLatestEvents, timeout);
+    }
+
+    tensorrt_llm::runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const override
+    {
+        NB_OVERRIDE_PURE(getPrimaryPool, layer_idx);
+    }
+
+    SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const override
+    {
+        NB_OVERRIDE_PURE(getPoolLayerIdx, layer_idx);
+    }
+
+    void refreshBlocks() override
+    {
+        NB_OVERRIDE_PURE(refreshBlocks);
+    }
+
+    void flushIterationEvents() override
+    {
+        NB_OVERRIDE_PURE(flushIterationEvents);
+    }
+};
+
+// TODO: Deduplicate executor bindings KvCacheStats
+class PyBasePeftCacheManager : public tb::BasePeftCacheManager
+{
+public:
+    ~PyBasePeftCacheManager() override = default;
+
+    NB_TRAMPOLINE(tb::BasePeftCacheManager, 8);
+
+    void addRequestPeft(tb::BasePeftCacheManager::LlmRequestPtr llmRequest, bool tryGpuCache = true) override
+    {
+        NB_OVERRIDE_PURE(addRequestPeft, llmRequest, tryGpuCache);
+    }
+
+    tb::BasePeftCacheManager::PeftTable ensureBatch(tb::RequestVector const& contextRequests,
+        tb::RequestVector const& generationRequests, bool resetGpuCache = false) override
+    {
+        NB_OVERRIDE_PURE(ensureBatch, contextRequests, generationRequests, resetGpuCache);
+    }
+
+    void resetDeviceCache() override
+    {
+        NB_OVERRIDE_PURE(resetDeviceCache);
+    }
+
+    void markRequestDone(tb::LlmRequest const& llmReq, bool pause = false) override
+    {
+        NB_OVERRIDE_PURE(markRequestDone, llmReq, pause);
+    }
+
+    tr::SizeType32 getMaxDevicePages() const override
+    {
+        NB_OVERRIDE_PURE(getMaxDevicePages);
+    }
+
+    tr::SizeType32 getMaxHostPages() const override
+    {
+        NB_OVERRIDE_PURE(getMaxHostPages);
+    }
+
+    tr::SizeType32 determineNumPages(std::shared_ptr<tb::LlmRequest> llmRequest) const override
+    {
+        NB_OVERRIDE_PURE(determineNumPages, llmRequest);
+    }
+
+    bool enabled() const override
+    {
+        NB_OVERRIDE_PURE(enabled);
+    }
+};
+} // namespace
+
+void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(nb::module_& m)
+{
+    nb::class_<tbk::KvCacheStats>(m, "KvCacheStats")
+        .def(nb::init<>())
+        .def_rw("max_num_blocks", &tbk::KvCacheStats::maxNumBlocks)
+        .def_rw("free_num_blocks", &tbk::KvCacheStats::freeNumBlocks)
+        .def_rw("used_num_blocks", &tbk::KvCacheStats::usedNumBlocks)
+        .def_rw("tokens_per_block", &tbk::KvCacheStats::toksPerBlock)
+        .def_rw("alloc_total_blocks", &tbk::KvCacheStats::allocTotalBlocks)
+        .def_rw("alloc_new_blocks", &tbk::KvCacheStats::allocNewBlocks)
+        .def_rw("reused_blocks", &tbk::KvCacheStats::reusedBlocks)
+        .def_rw("missed_blocks", &tbk::KvCacheStats::missedBlocks)
+        .def_rw("cache_hit_rate", &tbk::KvCacheStats::cacheHitRate)
+        .def_rw("num_free_blocks_per_window_size", &tbk::KvCacheStats::numFreeBlocksPerWindowSize);
+
+    nb::class_<tbk::TempAttentionWindowInputs>(m, "TempAttentionWindowInputs")
+        .def(nb::init<>())
+        .def_rw("paged_context_fmha", &tbk::TempAttentionWindowInputs::pagedContextFMHA)
+        .def_rw("max_input_len", &tbk::TempAttentionWindowInputs::maxInputLen)
+        .def_rw("max_num_tokens", &tbk::TempAttentionWindowInputs::maxNumTokens);
+
+    nb::class_<tbk::BlockKey>(m, "BlockKey")
+        .def(nb::init<>())
+        .def(nb::init<VecTokens const&, std::optional<tr::LoraTaskIdType>>(), nb::arg("tokens"),
+            nb::arg("lora_task_id") = std::nullopt)
+        .def(nb::init<bool, std::optional<tr::LoraTaskIdType>, VecUniqueTokens const&>(), nb::arg("uses_extra_ids"),
+            nb::arg("lora_task_id"), nb::arg("unique_tokens"))
+        .def_ro("uses_extra_ids", &tbk::BlockKey::usesExtraIds)
+        .def_ro("lora_task_id", &tbk::BlockKey::loraTaskId)
+        .def_ro("unique_tokens", &tbk::BlockKey::uniqueTokens);
+
+    nb::class_<tbk::BlockKeyHasher>(m, "BlockKeyHasher")
+        .def_static("hash", &tbk::BlockKeyHasher::hash, nb::arg("block_key"), nb::arg("parent_hash") = 0);
+
+    nb::class_<tbk::KVCacheEventManager>(m, "KVCacheEventManager")
+        .def(nb::init<size_t>(), nb::arg("max_kv_event_entries"));
+
+    nb::class_<tbk::BaseKVCacheManager, PyKvCacheManager>(m, "BaseKVCacheManager")
+        .def_static("calculate_max_num_blocks", &tbk::BaseKVCacheManager::calculateMaxNumBlocks, nb::arg("config"),
+            nb::arg("is_cross_attention"), nb::arg("dtype"), nb::arg("model_config"), nb::arg("world_config"),
+            nb::arg("window_size_to_layers"), nb::arg("allotted_primary_mem_bytes"),
+            nb::arg("allotted_secondary_mem_bytes"), nb::arg("extra_cost_memory"), nb::arg("kv_factor"))
+        .def("allocate_pools", &BaseKVCacheManager::allocatePools)
+        .def("release_pools", &BaseKVCacheManager::releasePools)
+        .def("start_scheduling", &BaseKVCacheManager::startScheduling)
+        .def_prop_ro("tokens_per_block", &BaseKVCacheManager::getTokensPerBlock)
+        .def_prop_ro("max_num_blocks", &BaseKVCacheManager::getMaxNumBlocks)
+        .def_prop_ro("num_pools", &BaseKVCacheManager::getNumPools)
+        .def("get_kv_cache_stats", &BaseKVCacheManager::getKvCacheStats)
+        .def_prop_ro("max_blocks_per_seq",
+            [](tbk::BaseKVCacheManager& self) { return self.getOffsetTableDimensions().maxBlocksPerSeq; })
+        .def("get_needed_blocks_one_step", &BaseKVCacheManager::getNeededBlocksOneStep)
+        .def("get_remaining_blocks_to_completion", &BaseKVCacheManager::getRemainingBlocksToCompletion)
+        .def("add_token", &BaseKVCacheManager::addToken)
+        .def("add_sequence", &BaseKVCacheManager::addSequence)
+        .def("remove_sequence", &BaseKVCacheManager::removeSequence)
+        .def("scheduling_remove_sequence", &BaseKVCacheManager::schedulingRemoveSequence)
+        .def("get_block_pool_pointers",
+            [](tbk::BaseKVCacheManager& self)
+            {
+                std::optional<at::Tensor> block_pool_pointers{std::nullopt};
+                auto tensor = self.getBlockPoolPointers();
+                if (tensor)
+                {
+                    std::shared_ptr<tensorrt_llm::runtime::ITensor> _tensor = std::move(tensor);
+                    block_pool_pointers = tr::Torch::tensor(_tensor);
+                }
+                return block_pool_pointers;
+            })
+        .def("get_layer_to_pool_mapping",
+            [](tbk::BaseKVCacheManager& self)
+            {
+                std::optional<at::Tensor> layer_to_pool_mapping{std::nullopt};
+                auto tensor = self.getLayerToPoolMapping();
+                if (tensor)
+                {
+                    std::shared_ptr<tensorrt_llm::runtime::ITensor> _tensor = std::move(tensor);
+                    layer_to_pool_mapping = tr::Torch::tensor(_tensor);
+                }
+                return layer_to_pool_mapping;
+            })
+        .def("get_primary_pool_data",
+            [](tbk::BaseKVCacheManager& self, SizeType32 layer_idx) -> at::Tensor
+            {
+                auto pool = tr::Torch::tensor(self.getPrimaryPool(layer_idx));
+                auto pool_layer_idx = self.getPoolLayerIdx(layer_idx);
+                return pool.index({torch::indexing::Slice(), pool_layer_idx});
+            })
+        .def("get_block_offsets_of_batch",
+            [](tbk::BaseKVCacheManager& self, at::Tensor output, SizeType32 firstBatchSlotIdx, SizeType32 batchSize,
+                SizeType32 beamWidth)
+            {
+                auto _output = from_torch(output);
+                TLLM_CHECK_WITH_INFO(_output.has_value(), "Invalid output tensor.");
+                self.getBlockOffsetsOfBatch(*(_output.value()), firstBatchSlotIdx, batchSize, beamWidth);
+            })
+        .def("copy_block_offsets",
+            [](tbk::BaseKVCacheManager& self, at::Tensor output, SizeType32 outputSlotOffset,
+                tb::LlmRequest::RequestIdType requestId)
+            {
+                auto _output = from_torch(output);
+                TLLM_CHECK_WITH_INFO(_output.has_value(), "Invalid output tensor.");
+                auto maxBlockCount = self.copyBlockOffsets(*(_output.value()), outputSlotOffset, requestId);
+                return maxBlockCount;
+            })
+        .def("copy_batch_block_offsets",
+            [](tbk::BaseKVCacheManager& self, at::Tensor output,
+                std::vector<tb::LlmRequest::RequestIdType> const& requestIds, SizeType32 const beamWidth,
+                SizeType32 const offset)
+            {
+                auto _output = from_torch(output);
+                TLLM_CHECK_WITH_INFO(_output.has_value(), "Invalid output tensor.");
+                for (size_t i = 0; i < requestIds.size(); ++i)
+                {
+                    self.copyBlockOffsets(*(_output.value()), i * beamWidth + offset, requestIds[i]);
+                }
+            })
+        .def(
+            "get_latest_events",
+            [](tbk::BaseKVCacheManager& self, std::optional<double> timeout_ms = std::nullopt)
+            {
+                if (timeout_ms)
+                {
+                    return self.getLatestEvents(std::chrono::milliseconds(static_cast<int64_t>(*timeout_ms)));
+                }
+                return self.getLatestEvents(std::nullopt);
+            },
+            nb::arg("timeout_ms") = std::nullopt)
+        .def_prop_ro("enable_block_reuse", &BaseKVCacheManager::isEnableBlockReuse)
+        .def("rewind_kv_cache", &BaseKVCacheManager::rewindKVCache)
+        .def_prop_ro("cross_kv", &BaseKVCacheManager::isCrossKv)
+        .def("store_context_blocks", &BaseKVCacheManager::storeContextBlocks)
+        .def("get_cache_block_ids", &BaseKVCacheManager::getCacheBlockIds)
+        .def("get_batch_cache_block_ids", &BaseKVCacheManager::getBatchCacheBlockIds)
+        .def("get_newly_allocated_block_ids", &BaseKVCacheManager::getNewlyAllocatedBlockIds)
+        .def("flush_iteration_events", &BaseKVCacheManager::flushIterationEvents);
+
+    nb::bind_vector<std::vector<std::vector<SizeType32>>>(m, "CacheBlockIds");
+
+    nb::enum_<tbk::CacheType>(m, "CacheType")
+        .value("SELF", tbk::CacheType::kSELF)
+        .value("CROSS", tbk::CacheType::kCROSS)
+        .value("SELFKONLY", tbk::CacheType::kSELFKONLY);
+
+    nb::class_<tbk::KVCacheManager, tbk::BaseKVCacheManager>(m, "KVCacheManager")
+        .def(nb::init<std::vector<SizeType32> const&, SizeType32, SizeType32,
+                 std::map<SizeType32, std::tuple<SizeType32, SizeType32>> const&, SizeType32, SizeType32,
+                 std::vector<SizeType32> const&, std::optional<tbk::TempAttentionWindowInputs> const&,
+                 nvinfer1::DataType, SizeType32, int64_t, std::optional<runtime::SizeType32>, bool, bool,
+                 tbk::CacheType, std::optional<tensorrt_llm::executor::RetentionPriority>,
+                 std::shared_ptr<tbk::KVCacheEventManager>, bool, bool>(),
+            nb::arg("num_kv_heads_per_layer"), nb::arg("size_per_head"), nb::arg("tokens_per_block"),
+            nb::arg("blocks_per_window"), nb::arg("max_num_sequences"), nb::arg("max_beam_width"),
+            nb::arg("max_attention_window_vec"), nb::arg("temp_attention_window_inputs").none(), nb::arg("dtype"),
+            nb::arg("sink_token_length"), nb::arg("stream"), nb::arg("max_sequence_length").none(),
+            nb::arg("enable_block_reuse") = false, nb::arg("onboard_blocks") = true,
+            nb::arg("cache_type") = tbk::CacheType::kSELF, nb::arg("secondary_offload_min_priority") = std::nullopt,
+            nb::arg("event_manager") = nullptr, nb::arg("enable_partial_reuse") = true,
+            nb::arg("copy_on_partial_reuse") = true);
+}
+
+void tb::BasePeftCacheManagerBindings::initBindings(nb::module_& m)
+{
+    nb::class_<tb::BasePeftCacheManager, PyBasePeftCacheManager>(m, "BasePeftCacheManager")
+        .def("add_request_peft", &tb::BasePeftCacheManager::addRequestPeft, nb::arg("request"),
+            nb::arg("try_gpu_cache") = true)
+        .def(
+            "ensure_batch",
+            [](tb::BasePeftCacheManager& self, tb::RequestVector const& contextRequests,
+                tb::RequestVector const& generationRequests, bool resetGpuCache)
+            {
+                nb::gil_scoped_release release;
+                return self.ensureBatch(contextRequests, generationRequests, resetGpuCache);
+            },
+            nb::arg("context_requests"), nb::arg("generation_requests"), nb::arg("reset_gpu_cache") = false)
+        .def("reset_device_cache", &tb::BasePeftCacheManager::resetDeviceCache)
+        .def("mark_request_done", &tb::BasePeftCacheManager::markRequestDone, nb::arg("request"),
+            nb::arg("pause") = false)
+        .def_prop_ro("max_device_pages", &tb::BasePeftCacheManager::getMaxDevicePages)
+        .def_prop_ro("max_host_pages", &tb::BasePeftCacheManager::getMaxHostPages)
+        .def("determine_num_pages", &tb::BasePeftCacheManager::determineNumPages, nb::arg("request"))
+        .def_prop_ro("enabled", &tb::BasePeftCacheManager::enabled);
+
+    nb::class_<tb::PeftCacheManager, tb::BasePeftCacheManager>(m, "PeftCacheManager")
+        .def(nb::init<tb::PeftCacheManagerConfig, tr::ModelConfig, tr::WorldConfig, tr::BufferManager>(),
+            nb::arg("config"), nb::arg("model_config"), nb::arg("world_config"), nb::arg("buffer_manager"));
+
+    nb::class_<tb::NoOpPeftCacheManager, tb::BasePeftCacheManager>(m, "NoOpPeftCacheManager").def(nb::init<>());
+}
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.h b/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.h
new file mode 100644
index 00000000000..786c0d391df
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.h
@@ -0,0 +1,39 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <nanobind/nanobind.h>
+namespace nb = nanobind;
+
+namespace tensorrt_llm::batch_manager::kv_cache_manager
+{
+class KVCacheManagerBindings
+{
+public:
+    static void initBindings(nb::module_& m);
+};
+} // namespace tensorrt_llm::batch_manager::kv_cache_manager
+
+namespace tensorrt_llm::batch_manager
+{
+class BasePeftCacheManagerBindings
+{
+public:
+    static void initBindings(nb::module_& m);
+};
+} // namespace tensorrt_llm::batch_manager
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.cpp
new file mode 100644
index 00000000000..d8f45cb865f
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.cpp
@@ -0,0 +1,131 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "llmRequest.h"
+#include "tensorrt_llm/nanobind/common/customCasters.h"
+
+#include "tensorrt_llm/batch_manager/llmRequest.h"
+#include "tensorrt_llm/nanobind/common/bindTypes.h"
+#include "tensorrt_llm/runtime/torch.h"
+#include "tensorrt_llm/runtime/torchUtils.h"
+#include "tensorrt_llm/runtime/torchView.h"
+
+#include <ATen/ATen.h>
+#include <torch/extension.h>
+
+#include <memory>
+
+namespace tb = tensorrt_llm::batch_manager;
+namespace tr = tensorrt_llm::runtime;
+namespace tle = tensorrt_llm::executor;
+
+using namespace tensorrt_llm::nanobind::batch_manager;
+
+using LlmRequestPtr = std::shared_ptr<tb::LlmRequest>;
+using RequestList = std::list<LlmRequestPtr>;
+
+namespace
+{
+
+std::optional<tb::LlmRequest::TensorPtr> from_torch(std::optional<LlmRequest::TensorPtr> torchPtr)
+{
+    if (torchPtr)
+    {
+        return tr::TorchView::of(torchPtr.value());
+    }
+    return std::nullopt;
+}
+
+} // namespace
+
+std::optional<tb::LlmRequest::LogitsPostProcessor> LlmRequest::callbackAdapter(
+    std::optional<LlmRequest::LogitsPostProcessor> callback)
+{
+    if (!callback)
+    {
+        return std::nullopt;
+    }
+
+    return [callback](RequestIdType reqId, tr::ITensor::SharedPtr& tensor, tb::LlmRequest::BeamTokens const& tokens,
+               tr::BufferManager::CudaStreamPtr stream, std::optional<RequestIdType> clientId)
+    {
+        at::Tensor atTensor = tr::Torch::tensor(tensor);
+        callback.value()(reqId, atTensor, tokens, runtime::TorchUtils::stream(*stream).unwrap(), clientId);
+    };
+}
+
+std::shared_ptr<tb::LlmRequest> LlmRequest::toTrtLlm() const
+{
+
+    auto const draftTokens = std::make_shared<std::vector<TokenIdType>>(*mDraftTokens.get());
+    auto const optDraftTokens = std::optional<std::shared_ptr<std::vector<TokenIdType>>>(draftTokens);
+    auto const encoderInputTokens = mEncoderTokens.has_value()
+        ? std::make_shared<std::vector<TokenIdType>>(*mEncoderTokens.value().get())
+        : nullptr;
+    auto const optEncoderInputTokens = std::optional<std::shared_ptr<std::vector<TokenIdType>>>(encoderInputTokens);
+    // 49 parameters
+    return std::make_shared<tb::LlmRequest>(                       //
+        mRequestId,                                                //
+        mMaxNewTokens,                                             //
+        std::make_shared<std::vector<TokenIdType>>(mTokens.at(0)), //
+        mSamplingConfig,                                           //
+        mIsStreaming,                                              //
+        mEndId,                                                    //
+        mPadId,                                                    //
+        from_torch(mEmbeddingBias),                                //
+        from_torch(mBadWordsList),                                 //
+        from_torch(mStopWordsList),                                //
+        mPositionIds,                                              //
+        from_torch(mPromptEmbeddingTable),                         //
+        mPromptVocabSize,                                          //
+        mMultimodalHashes,                                         //
+        mMultimodalPositions,                                      //
+        mMultimodalLengths,                                        //
+        from_torch(mMultimodalEmbedding),                          //
+        from_torch(mMropeRotaryCosSin),                            //
+        mMropePositionDeltas,                                      //
+        mLoraTaskId,                                               //
+        from_torch(mLoraWeights),                                  //
+        from_torch(mLoraConfig),                                   //
+        mLookaheadConfig,                                          //
+        mKvCacheRetentionConfig,                                   //
+        mReturnLogProbs,                                           //
+        mReturnContextLogits,                                      //
+        mReturnGenerationLogits,                                   //
+        optDraftTokens,                                            //
+        from_torch(mDraftLogits),                                  //
+        mExcludeInputFromOutput,                                   //
+        callbackAdapter(mLogitsPostProcessor),                     //
+        mApplyLogitsPostProcessorBatched,                          //
+        optEncoderInputTokens,                                     //
+        mReturnEncoderOutput,                                      //
+        mClientId,                                                 //
+        mPriority,                                                 //
+        from_torch(mEncoderInputFeatures),                         //
+        mEncoderOutputLength,                                      //
+        from_torch(mCrossAttentionMask),                           //
+        getLlmRequestType(),                                       //
+        std::nullopt,                                              // inputTokenExtraIds
+        mNumReturnSequences,                                       //
+        mEagleConfig,                                              //
+        from_torch(mSkipCrossAttnBlocks),                          //
+        false,                                                     // returnPerfMetrics
+        mGuidedDecodingParams,                                     //
+        mLanguageAdapterUid,                                       //
+        mAllottedTimeMs,                                           //
+        mContextPhaseParams                                        //
+    );
+}
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.h b/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.h
new file mode 100644
index 00000000000..624dc55112d
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.h
@@ -0,0 +1,160 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "tensorrt_llm/batch_manager/llmRequest.h"
+
+#include <ATen/ATen.h>
+#include <ATen/ops/tensor.h>
+#include <memory>
+#include <nanobind/nanobind.h>
+#include <optional>
+
+namespace nb = nanobind;
+
+namespace tensorrt_llm::nanobind::batch_manager
+{
+
+namespace tb = tensorrt_llm::batch_manager;
+
+/* Unfortunately, torch's default nanobind bindings don't know about c10::cuda::CUDAStream,
+ * so we have to pass the more generic c10::Stream, and convert it back to a full-fledged
+ * torch.cuda.Stream in python. See example in test/bindings/test_gpt_manager.py
+ */
+class LlmRequest : public tb::GenericLlmRequest<at::Tensor, c10::Stream>
+{
+public:
+    using Base = GenericLlmRequest<at::Tensor, c10::Stream>;
+    using TensorPtr = Base::TensorPtr;
+    using SizeType32 = Base::SizeType32;
+    using TokenIdType = Base::TokenIdType;
+    using RequestIdType = Base::RequestIdType;
+    using LoraTaskIdType = Base::LoraTaskIdType;
+    using VecLogProbs = Base::VecLogProbs;
+    using BeamTokens = Base::BeamTokens;
+    using VecTokens = Base::VecTokens;
+    using VecTokenExtraIds = Base::VecTokenExtraIds;
+    using LogitsPostProcessor = Base::LogitsPostProcessor;
+
+    // 49 parameters
+    LlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, std::vector<TokenIdType> inputTokens,
+        runtime::SamplingConfig samplingConfig, bool isStreaming, std::optional<SizeType32> endId = std::nullopt,
+        std::optional<SizeType32> padId = std::nullopt, std::optional<TensorPtr> embeddingBias = std::nullopt,
+        std::optional<TensorPtr> badWordsList = std::nullopt, std::optional<TensorPtr> stopWordsList = std::nullopt,
+        std::optional<std::vector<SizeType32>> positionIds = std::nullopt,
+        std::optional<TensorPtr> promptEmbeddingTable = std::nullopt,
+        std::optional<SizeType32> promptVocabSize = std::nullopt,
+        std::optional<std::vector<std::vector<SizeType32>>> multimodalHashes = std::nullopt,
+        std::optional<std::vector<SizeType32>> multimodalPositions = std::nullopt,
+        std::optional<std::vector<SizeType32>> multimodalLengths = std::nullopt,
+        std::optional<TensorPtr> multimodalEmbedding = std::nullopt,
+        std::optional<TensorPtr> mropeRotaryCosSin = std::nullopt,
+        std::optional<SizeType32> mropePositionDeltas = std::nullopt,
+        std::optional<LoraTaskIdType> loraTaskId = std::nullopt, std::optional<TensorPtr> loraWeights = std::nullopt,
+        std::optional<TensorPtr> loraConfig = std::nullopt,
+        std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt,
+        std::optional<executor::KvCacheRetentionConfig> kvCacheRetentionConfig = std::nullopt,
+        bool returnLogProbs = false, bool returnContextLogits = false, bool returnGenerationLogits = false,
+        std::optional<VecTokens> draftTokens = std::nullopt, std::optional<TensorPtr> draftLogits = std::nullopt,
+        bool excludeInputFromOutput = false, std::optional<LogitsPostProcessor> logitsPostProcessor = std::nullopt,
+        bool applyLogitsPostProcessorBatched = false, std::optional<VecTokens> encoderInputTokens = std::nullopt,
+        bool returnEncoderOutput = false, std::optional<RequestIdType> clientId = std::nullopt,
+        executor::PriorityType priority = executor::Request::kDefaultPriority,
+        std::optional<TensorPtr> encoderInputFeatures = std::nullopt,
+        std::optional<SizeType32> encoderOutputLength = std::nullopt,
+        std::optional<TensorPtr> crossAttentionMask = std::nullopt,
+        tb::LlmRequestType llmRequestType = tb::LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION,
+        std::optional<VecTokenExtraIds> inputTokenExtraIds = std::nullopt, SizeType32 numReturnSequences = 1,
+        std::optional<executor::EagleConfig> eagleConfig = std::nullopt,
+        std::optional<TensorPtr> skipCrossAttnBlocks = std::nullopt, bool returnPerfMetrics = false,
+        std::optional<executor::GuidedDecodingParams> guidedDecodingParams = std::nullopt,
+        std::optional<SizeType32> languageAdapterUid = std::nullopt,
+        std::optional<MillisecondsType> allottedTimeMs = std::nullopt,
+        std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt)
+        : Base(requestId,                                                                                       //
+            maxNewTokens,                                                                                       //
+            std::make_shared<std::vector<TokenIdType>>(std::move(inputTokens)),                                 //
+            samplingConfig,                                                                                     //
+            isStreaming,                                                                                        //
+            endId,                                                                                              //
+            padId,                                                                                              //
+            embeddingBias,                                                                                      //
+            badWordsList,                                                                                       //
+            stopWordsList,                                                                                      //
+            positionIds.has_value() ? std::make_shared<std::vector<SizeType32>>(std::move(positionIds.value())) //
+                                    : std::optional<std::shared_ptr<std::vector<SizeType32>>>(std::nullopt),    //
+            promptEmbeddingTable,                                                                               //
+            promptVocabSize,                                                                                    //
+            multimodalHashes.has_value()
+                ? std::make_optional(
+                    std::make_shared<std::vector<std::vector<SizeType32>>>(std::move(multimodalHashes.value()))) //
+                : std::optional<std::shared_ptr<std::vector<std::vector<SizeType32>>>>(std::nullopt),            //
+            multimodalPositions.has_value()
+                ? std::make_shared<std::vector<SizeType32>>(std::move(multimodalPositions.value()))              //
+                : std::optional<std::shared_ptr<std::vector<SizeType32>>>(std::nullopt),                         //
+            multimodalLengths.has_value()
+                ? std::make_shared<std::vector<SizeType32>>(std::move(multimodalLengths.value()))                //
+                : std::optional<std::shared_ptr<std::vector<SizeType32>>>(std::nullopt),                         //
+            multimodalEmbedding,                                                                                 //
+            mropeRotaryCosSin,                                                                                   //
+            mropePositionDeltas,                                                                                 //
+            loraTaskId,                                                                                          //
+            loraWeights,                                                                                         //
+            loraConfig,                                                                                          //
+            lookaheadConfig,                                                                                     //
+            kvCacheRetentionConfig,                                                                              //
+            returnLogProbs,                                                                                      //
+            returnContextLogits,                                                                                 //
+            returnGenerationLogits,                                                                              //
+            draftTokens.has_value() ? std::make_shared<VecTokens>(std::move(draftTokens.value()))                //
+                                    : std::make_shared<VecTokens>(),                                             //
+            draftLogits,                                                                                         //
+            excludeInputFromOutput,                                                                              //
+            logitsPostProcessor,                                                                                 //
+            applyLogitsPostProcessorBatched,                                                                     //
+            encoderInputTokens ? std::make_optional(std::make_shared<VecTokens>(std::move(*encoderInputTokens))) //
+                               : std::optional<std::shared_ptr<VecTokens>>(std::nullopt),                        //
+            returnEncoderOutput,                                                                                 //
+            clientId,                                                                                            //
+            priority,                                                                                            //
+            encoderInputFeatures,                                                                                //
+            encoderOutputLength,                                                                                 //
+            crossAttentionMask,                                                                                  //
+            llmRequestType,                                                                                      //
+            inputTokenExtraIds                                                                                   //
+                ? std::make_optional(std::make_shared<VecTokenExtraIds>(std::move(*inputTokenExtraIds)))         //
+                : std::optional<std::shared_ptr<VecTokenExtraIds>>(std::nullopt),                                //
+            numReturnSequences,                                                                                  //
+            eagleConfig,                                                                                         //
+            skipCrossAttnBlocks,                                                                                 //
+            returnPerfMetrics,                                                                                   //
+            guidedDecodingParams,                                                                                //
+            languageAdapterUid,                                                                                  //
+            allottedTimeMs,                                                                                      //
+            contextPhaseParams                                                                                   //
+        )
+    {
+    }
+
+    static std::optional<tb::LlmRequest::LogitsPostProcessor> callbackAdapter(
+        std::optional<LlmRequest::LogitsPostProcessor> callback);
+
+    [[nodiscard]] std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> toTrtLlm() const;
+};
+
+} // namespace tensorrt_llm::nanobind::batch_manager
diff --git a/cpp/tensorrt_llm/nanobind/bindings.cpp b/cpp/tensorrt_llm/nanobind/bindings.cpp
index adc82587433..dd01d21cced 100644
--- a/cpp/tensorrt_llm/nanobind/bindings.cpp
+++ b/cpp/tensorrt_llm/nanobind/bindings.cpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,14 +15,483 @@
  * limitations under the License.
  */
 
+#include "tensorrt_llm/nanobind/common/customCasters.h"
 #include <nanobind/nanobind.h>
+#include <nanobind/operators.h>
+#include <nanobind/stl/bind_vector.h>
+#include <nanobind/stl/filesystem.h>
+#include <nanobind/stl/optional.h>
+#include <nanobind/stl/shared_ptr.h>
+#include <nanobind/stl/string.h>
+#include <nanobind/stl/tuple.h>
+#include <nanobind/stl/unique_ptr.h>
+
+#include <torch/extension.h>
+#include <vector>
+
+#include "tensorrt_llm/batch_manager/peftCacheManagerConfig.h"
+#include "tensorrt_llm/common/quantization.h"
+#include "tensorrt_llm/nanobind/batch_manager/algorithms.h"
+#include "tensorrt_llm/nanobind/batch_manager/bindings.h"
+#include "tensorrt_llm/nanobind/batch_manager/buffers.h"
+#include "tensorrt_llm/nanobind/batch_manager/cacheTransceiver.h"
+#include "tensorrt_llm/nanobind/batch_manager/kvCacheManager.h"
+#include "tensorrt_llm/nanobind/batch_manager/llmRequest.h"
+#include "tensorrt_llm/nanobind/executor/bindings.h"
+#include "tensorrt_llm/nanobind/runtime/bindings.h"
+#include "tensorrt_llm/nanobind/testing/modelSpecBinding.h"
+#include "tensorrt_llm/nanobind/userbuffers/bindings.h"
+#include "tensorrt_llm/runtime/common.h"
+#include "tensorrt_llm/runtime/cudaStream.h"
+#include "tensorrt_llm/runtime/gptJsonConfig.h"
+#include "tensorrt_llm/runtime/ipcNvlsMemory.h"
+#include "tensorrt_llm/runtime/memoryCounters.h"
+#include "tensorrt_llm/runtime/samplingConfig.h"
+#include "tensorrt_llm/runtime/utils/mpiUtils.h"
+
+namespace nb = nanobind;
+namespace tb = tensorrt_llm::batch_manager;
+namespace tbk = tensorrt_llm::batch_manager::kv_cache_manager;
+namespace tpb = tensorrt_llm::nanobind::batch_manager;
+namespace tc = tensorrt_llm::common;
+namespace tr = tensorrt_llm::runtime;
+namespace tle = tensorrt_llm::executor;
+using SizeType32 = tr::SizeType32;
+using TokenIdType = tr::TokenIdType;
+template <typename T>
+using OptVec = std::optional<std::vector<T>>;
 
 #if not defined(TRTLLM_NB_MODULE)
 #error "TRTLLM_NB_MODULE must be defined"
 #endif
 
+namespace
+{
+tr::SamplingConfig makeSamplingConfig(std::vector<tr::SamplingConfig> const& configs)
+{
+    return tr::SamplingConfig(configs);
+}
+} // namespace
+
 NB_MODULE(TRTLLM_NB_MODULE, m)
 {
     m.doc() = "TensorRT-LLM Python bindings for C++ runtime";
     m.attr("binding_type") = "nanobind";
+    nb::set_leak_warnings(false);
+
+    // Create MpiComm binding first since it's used in the executor bindings
+    nb::class_<tensorrt_llm::mpi::MpiComm>(m, "MpiComm")
+        .def_static("rank",
+            []()
+            {
+                auto& session = tensorrt_llm::mpi::MpiComm::session();
+                return session.tensorrt_llm::mpi::MpiComm::getRank();
+            })
+        .def_static("size",
+            []()
+            {
+                auto& session = tensorrt_llm::mpi::MpiComm::session();
+                return session.tensorrt_llm::mpi::MpiComm::getSize();
+            })
+        .def_static("local_size",
+            []()
+            {
+                auto& session = tensorrt_llm::mpi::MpiComm::localSession();
+                return session.tensorrt_llm::mpi::MpiComm::getSize();
+            })
+        .def_static("local_init", []() { tensorrt_llm::mpi::MpiComm::localSession(); })
+        .def_static("set_raw_mpi_session_by_fortran_handle",
+            [](int64_t fortran_handle) { tensorrt_llm::mpi::MpiComm::setRawSessionByFortran(fortran_handle); })
+        .def_static("split",
+            [](size_t color, size_t rank)
+            {
+                auto& world = tensorrt_llm::mpi::MpiComm::world();
+                tensorrt_llm::mpi::MpiComm::setSession(world.split(color, rank));
+            });
+
+    nb::class_<tr::CudaStream>(m, "CudaStream")
+        .def(
+            "__init__",
+            [](tr::CudaStream* self, nb::object py_stream)
+            {
+                cudaStream_t stream = reinterpret_cast<cudaStream_t>(nb::cast<uintptr_t>(py_stream));
+                new (self) tr::CudaStream{stream};
+            },
+            nb::arg("stream_ptr"))
+        .def("get_device", &tr::CudaStream::getDevice);
+
+    // Create submodule for executor bindings.
+    auto mExecutor = m.def_submodule("executor", "Executor bindings");
+    auto mInternal = m.def_submodule("internal", "Internal submodule of TRTLLM runtime");
+    auto mInternalRuntime = mInternal.def_submodule("runtime", "Runtime internal bindings");
+    auto mInternalTesting = mInternal.def_submodule("testing", "Testing internal bindings");
+    auto mInternalBatchManager = mInternal.def_submodule("batch_manager", "Batch manager internal bindings");
+
+    tensorrt_llm::nanobind::executor::initBindings(mExecutor);
+    tensorrt_llm::nanobind::runtime::initBindingsEarly(mInternalRuntime);
+
+    auto buildInfo = m.def_submodule("BuildInfo");
+    buildInfo.attr("ENABLE_MULTI_DEVICE") = nb::int_(ENABLE_MULTI_DEVICE);
+
+    nb::class_<tb::PeftCacheManagerConfig>(m, "PeftCacheManagerConfig")
+        .def(nb::init<SizeType32, SizeType32, SizeType32, SizeType32, SizeType32, SizeType32, SizeType32, SizeType32,
+                 SizeType32, std::optional<float>, std::optional<size_t>, std::optional<std::string>>(),
+            nb::arg("num_host_module_layer") = 0, nb::arg("num_device_module_layer") = 0,
+            nb::arg("optimal_adapter_size") = 8, nb::arg("max_adapter_size") = 64, nb::arg("num_put_workers") = 1,
+            nb::arg("num_ensure_workers") = 1, nb::arg("num_copy_streams") = 1,
+            nb::arg("max_pages_per_block_host") = 24, nb::arg("max_pages_per_block_device") = 8,
+            nb::arg("device_cache_percent") = std::nullopt, nb::arg("host_cache_size") = std::nullopt,
+            nb::arg("lora_prefetch_dir") = std::nullopt)
+        .def_rw("num_host_module_layer", &tb::PeftCacheManagerConfig::numHostModuleLayer)
+        .def_rw("num_device_module_layer", &tb::PeftCacheManagerConfig::numDeviceModuleLayer)
+        .def_rw("optimal_adapter_size", &tb::PeftCacheManagerConfig::optimalAdapterSize)
+        .def_rw("max_adapter_size", &tb::PeftCacheManagerConfig::maxAdapterSize)
+        .def_rw("num_put_workers", &tb::PeftCacheManagerConfig::numPutWorkers)
+        .def_rw("num_ensure_workers", &tb::PeftCacheManagerConfig::numEnsureWorkers)
+        .def_rw("num_copy_streams", &tb::PeftCacheManagerConfig::numCopyStreams)
+        .def_rw("max_pages_per_block_host", &tb::PeftCacheManagerConfig::maxPagesPerBlockHost)
+        .def_rw("max_pages_per_block_device", &tb::PeftCacheManagerConfig::maxPagesPerBlockDevice)
+        .def_rw("device_cache_percent", &tb::PeftCacheManagerConfig::deviceCachePercent)
+        .def_rw("host_cache_size", &tb::PeftCacheManagerConfig::hostCacheSize)
+        .def_rw("lora_prefetch_dir", &tb::PeftCacheManagerConfig::loraPrefetchDir);
+
+    nb::enum_<nvinfer1::DataType>(m, "DataType")
+        .value("FLOAT", nvinfer1::DataType::kFLOAT)
+        .value("HALF", nvinfer1::DataType::kHALF)
+        .value("INT8", nvinfer1::DataType::kINT8)
+        .value("INT32", nvinfer1::DataType::kINT32)
+        .value("BOOL", nvinfer1::DataType::kBOOL)
+        .value("UINT8", nvinfer1::DataType::kUINT8)
+        .value("FP8", nvinfer1::DataType::kFP8)
+        .value("BF16", nvinfer1::DataType::kBF16)
+        .value("INT64", nvinfer1::DataType::kINT64)
+        .export_values();
+
+    nb::enum_<tr::ModelConfig::ModelVariant>(m, "GptModelVariant")
+        .value("GPT", tr::ModelConfig::ModelVariant::kGpt)
+        .value("GLM", tr::ModelConfig::ModelVariant::kGlm)
+        .value("CHATGLM", tr::ModelConfig::ModelVariant::kChatGlm)
+        .value("MAMBA", tr::ModelConfig::ModelVariant::kMamba)
+        .value("RECURRENTGEMMA", tr::ModelConfig::ModelVariant::kRecurrentGemma);
+
+    nb::enum_<tr::ModelConfig::KVCacheType>(m, "KVCacheType")
+        .value("CONTINUOUS", tr::ModelConfig::KVCacheType::kCONTINUOUS)
+        .value("PAGED", tr::ModelConfig::KVCacheType::kPAGED)
+        .value("DISABLED", tr::ModelConfig::KVCacheType::kDISABLED)
+        .def("from_string", tr::ModelConfig::KVCacheTypeFromString);
+
+    nb::enum_<tr::ModelConfig::LayerType>(m, "LayerType")
+        .value("ATTENTION", tr::ModelConfig::LayerType::kATTENTION)
+        .value("RECURRENT", tr::ModelConfig::LayerType::kRECURRENT);
+
+    nb::enum_<tr::LoraModule::ModuleType>(m, "LoraModuleType")
+        .value("INVALID", tr::LoraModule::ModuleType::kINVALID)
+        .value("ATTN_QKV", tr::LoraModule::ModuleType::kATTN_QKV)
+        .value("ATTN_Q", tr::LoraModule::ModuleType::kATTN_Q)
+        .value("ATTN_K", tr::LoraModule::ModuleType::kATTN_K)
+        .value("ATTN_V", tr::LoraModule::ModuleType::kATTN_V)
+        .value("ATTN_DENSE", tr::LoraModule::ModuleType::kATTN_DENSE)
+        .value("MLP_H_TO_4H", tr::LoraModule::ModuleType::kMLP_H_TO_4H)
+        .value("MLP_4H_TO_H", tr::LoraModule::ModuleType::kMLP_4H_TO_H)
+        .value("MLP_GATE", tr::LoraModule::ModuleType::kMLP_GATE)
+        .value("CROSS_ATTN_QKV", tr::LoraModule::ModuleType::kCROSS_ATTN_QKV)
+        .value("CROSS_ATTN_Q", tr::LoraModule::ModuleType::kCROSS_ATTN_Q)
+        .value("CROSS_ATTN_K", tr::LoraModule::ModuleType::kCROSS_ATTN_K)
+        .value("CROSS_ATTN_V", tr::LoraModule::ModuleType::kCROSS_ATTN_V)
+        .value("CROSS_ATTN_DENSE", tr::LoraModule::ModuleType::kCROSS_ATTN_DENSE)
+        .value("MOE_H_TO_4H", tr::LoraModule::ModuleType::kMOE_H_TO_4H)
+        .value("MOE_4H_TO_H", tr::LoraModule::ModuleType::kMOE_4H_TO_H)
+        .value("MOE_GATE", tr::LoraModule::ModuleType::kMOE_GATE)
+        .value("MOE_ROUTER", tr::LoraModule::ModuleType::kMOE_ROUTER)
+        .value("MLP_ROUTER", tr::LoraModule::ModuleType::kMLP_ROUTER)
+        .value("MLP_GATE_UP", tr::LoraModule::ModuleType::kMLP_GATE_UP);
+
+    nb::class_<tr::LoraModule>(m, "LoraModule")
+        .def(nb::init<tr::LoraModule::ModuleType, SizeType32, SizeType32, bool, bool, SizeType32, SizeType32>(),
+            nb::arg("module_type"), nb::arg("in_dim"), nb::arg("out_dim"), nb::arg("in_dim_first"),
+            nb::arg("out_dim_first"), nb::arg("in_tp_split_dim"), nb::arg("out_tp_split_dim"))
+        .def_prop_ro("module_type", &tr::LoraModule::name)
+        .def_prop_ro("in_dim", &tr::LoraModule::inDim)
+        .def_prop_ro("out_dim", &tr::LoraModule::outDim)
+        .def_prop_ro("in_dim_first", &tr::LoraModule::inDimFirst)
+        .def_prop_ro("out_dim_first", &tr::LoraModule::outDimFirst)
+        .def_prop_ro("in_tp_split_dim", &tr::LoraModule::inTpSplitDim)
+        .def_prop_ro("out_tp_split_dim", &tr::LoraModule::outTpSplitDim)
+        .def_static("create_lora_modules", &tr::LoraModule::createLoraModules, nb::arg("lora_module_names"),
+            nb::arg("hidden_size"), nb::arg("mlp_hidden_size"), nb::arg("num_attention_heads"),
+            nb::arg("num_kv_attention_heads"), nb::arg("attention_head_size"), nb::arg("tp_size") = 1,
+            nb::arg("num_experts") = 0);
+
+    nb::class_<tc::QuantMode>(m, "QuantMode")
+        .def_static("none", &tc::QuantMode::none)
+        .def_static("int4_weights", &tc::QuantMode::int4Weights)
+        .def_static("int8_weights", &tc::QuantMode::int8Weights)
+        .def_static("activations", &tc::QuantMode::activations)
+        .def_static("per_channel_scaling", &tc::QuantMode::perChannelScaling)
+        .def_static("per_token_scaling", &tc::QuantMode::perTokenScaling)
+        .def_static("per_group_scaling", &tc::QuantMode::perGroupScaling)
+        .def_static("int8_kv_cache", &tc::QuantMode::int8KvCache)
+        .def_static("fp8_kv_cache", &tc::QuantMode::fp8KvCache)
+        .def_static("fp8_qdq", &tc::QuantMode::fp8Qdq)
+        .def_prop_ro("value", &tc::QuantMode::value)
+        .def("is_set", &tc::QuantMode::isSet, nb::arg("mode"))
+        .def_prop_ro("has_int4_weights", &tc::QuantMode::hasInt4Weights)
+        .def_prop_ro("has_int8_weights", &tc::QuantMode::hasInt8Weights)
+        .def_prop_ro("has_activations", &tc::QuantMode::hasActivations)
+        .def_prop_ro("has_per_channel_scaling", &tc::QuantMode::hasPerChannelScaling)
+        .def_prop_ro("has_per_token_scaling", &tc::QuantMode::hasPerTokenScaling)
+        .def_prop_ro("has_per_group_scaling", &tc::QuantMode::hasPerGroupScaling)
+        .def_prop_ro("has_static_activation_scaling", &tc::QuantMode::hasStaticActivationScaling)
+        .def_prop_ro("has_int8_kv_cache", &tc::QuantMode::hasInt8KvCache)
+        .def_prop_ro("has_fp8_kv_cache", &tc::QuantMode::hasFp8KvCache)
+        .def_prop_ro("has_fp8_qdq", &tc::QuantMode::hasFp8Qdq)
+        .def_prop_ro("has_nvfp4", &tc::QuantMode::hasNvfp4)
+        .def_prop_ro("has_w4a8_mxfp4_fp8", &tc::QuantMode::hasW4a8Mxfp4Fp8)
+        .def_prop_ro("has_kv_cache_quant", &tc::QuantMode::hasKvCacheQuant)
+        .def_static("from_description", &tc::QuantMode::fromDescription, nb::arg("quantize_weights"),
+            nb::arg("quantize_activations"), nb::arg("per_token"), nb::arg("per_channel"), nb::arg("per_group"),
+            nb::arg("use_int4_weights"), nb::arg("use_int8_kv_cache"), nb::arg("use_fp8_kv_kache"),
+            nb::arg("use_fp8_qdq"), nb::arg("use_fp8_rowwise"), nb::arg("use_w4a8_qserve"), nb::arg("use_nvfp4"),
+            nb::arg("use_fp8_block_scales"), nb::arg("use_w4a8_mxfp4_fp8"))
+        .def_static("use_smooth_quant", &tc::QuantMode::useSmoothQuant, nb::arg("per_token") = false,
+            nb::arg("per_channel") = false)
+        .def_static("use_weight_only", &tc::QuantMode::useWeightOnly, nb::arg("use_int4_weights") = false,
+            nb::arg("per_group") = false)
+        .def_static("from_quant_algo", &tc::QuantMode::fromQuantAlgo, nb::arg("quant_algo") = nb::none(),
+            nb::arg("kv_cache_quant_algo") = nb::none())
+        .def(nb::self + nb::self)
+        .def(nb::self += nb::self)
+        .def(nb::self - nb::self)
+        .def(nb::self -= nb::self)
+        .def(nb::self == nb::self)
+        .def(nb::self != nb::self);
+
+    nb::class_<tr::ModelConfig>(m, "ModelConfig")
+        .def(nb::init<SizeType32, SizeType32, SizeType32, SizeType32, SizeType32, SizeType32, nvinfer1::DataType>(),
+            nb::arg("vocab_size"), nb::arg("num_layers"), nb::arg("num_attention_layers"), nb::arg("num_rnn_layers"),
+            nb::arg("num_heads"), nb::arg("hidden_size"), nb::arg("data_type"))
+        .def_prop_ro("vocab_size", &tr::ModelConfig::getVocabSize)
+        .def("vocab_size_padded", &tr::ModelConfig::getVocabSizePadded, nb::arg("world_size"))
+        .def("num_layers", &tr::ModelConfig::getNbLayers, nb::arg("pipeline_parallelism") = 1,
+            nb::arg("pipeline_parallelism_rank") = 0)
+        .def("num_attention_layers", &tr::ModelConfig::getNbAttentionLayers, nb::arg("pipeline_parallelism") = 1,
+            nb::arg("pipeline_parallelism_rank") = 0)
+        .def("num_rnn_layers", &tr::ModelConfig::getNbRnnLayers, nb::arg("pipeline_parallelism") = 1,
+            nb::arg("pipeline_parallelism_rank") = 0)
+        .def("num_kv_heads", &tr::ModelConfig::getNbKvHeads, nb::arg("layer_idx"))
+        .def("set_num_kv_heads", &tr::ModelConfig::setNbKvHeads, nb::arg("num_kv_heads"))
+        .def_prop_ro("num_heads", &tr::ModelConfig::getNbHeads)
+        .def_prop_ro("hidden_size", &tr::ModelConfig::getHiddenSize)
+        .def_prop_ro("size_per_head", &tr::ModelConfig::getSizePerHead)
+        .def_prop_ro("data_type", &tr::ModelConfig::getDataType)
+        .def_prop_ro("speculative_decoding_mode", &tr::ModelConfig::getSpeculativeDecodingMode)
+        .def_prop_rw("head_size", &tr::ModelConfig::getSizePerHead, &tr::ModelConfig::setSizePerHead)
+        .def_prop_rw(
+            "num_kv_heads_per_layer", &tr::ModelConfig::getNumKvHeadsPerLayer, &tr::ModelConfig::setNumKvHeadsPerLayer)
+        .def_prop_rw("use_gpt_attention_plugin",
+            nb::overload_cast<>(&tr::ModelConfig::useGptAttentionPlugin, nb::const_),
+            nb::overload_cast<bool>(&tr::ModelConfig::useGptAttentionPlugin))
+        .def_prop_rw("use_packed_input", nb::overload_cast<>(&tr::ModelConfig::usePackedInput, nb::const_),
+            nb::overload_cast<bool>(&tr::ModelConfig::usePackedInput))
+        .def_prop_rw("kv_cache_type", nb::overload_cast<>(&tr::ModelConfig::getKVCacheType, nb::const_),
+            nb::overload_cast<tr::ModelConfig::KVCacheType>(&tr::ModelConfig::setKVCacheType))
+        .def_prop_rw("tokens_per_block", &tr::ModelConfig::getTokensPerBlock, &tr::ModelConfig::setTokensPerBlock)
+        .def_prop_rw("quant_mode", &tr::ModelConfig::getQuantMode, &tr::ModelConfig::setQuantMode)
+        .def_prop_ro("supports_inflight_batching", &tr::ModelConfig::supportsInflightBatching)
+        .def_prop_rw("max_batch_size", &tr::ModelConfig::getMaxBatchSize, &tr::ModelConfig::setMaxBatchSize)
+        .def_prop_rw("max_beam_width", &tr::ModelConfig::getMaxBeamWidth, &tr::ModelConfig::setMaxBeamWidth)
+        .def_prop_rw("max_input_len", &tr::ModelConfig::getMaxInputLen, &tr::ModelConfig::setMaxInputLen)
+        .def_prop_rw("max_seq_len", &tr::ModelConfig::getMaxSequenceLen, &tr::ModelConfig::setMaxSequenceLen)
+        .def_prop_rw("max_num_tokens", &tr::ModelConfig::getMaxNumTokens, &tr::ModelConfig::setMaxNumTokens)
+        .def_prop_rw("max_prompt_embedding_table_size", &tr::ModelConfig::getMaxPromptEmbeddingTableSize,
+            &tr::ModelConfig::setMaxPromptEmbeddingTableSize)
+        .def_prop_ro("use_prompt_tuning", &tr::ModelConfig::usePromptTuning)
+        .def_prop_ro("use_mrope", &tr::ModelConfig::useMrope)
+        .def_prop_rw("use_lora_plugin", nb::overload_cast<>(&tr::ModelConfig::useLoraPlugin, nb::const_),
+            nb::overload_cast<bool>(&tr::ModelConfig::useLoraPlugin))
+        .def_prop_rw("layer_types", &tr::ModelConfig::getLayerTypes, &tr::ModelConfig::setLayerTypes)
+        .def_prop_rw("compute_context_logits", nb::overload_cast<>(&tr::ModelConfig::computeContextLogits, nb::const_),
+            nb::overload_cast<bool>(&tr::ModelConfig::computeContextLogits))
+        .def_prop_rw("compute_generation_logits",
+            nb::overload_cast<>(&tr::ModelConfig::computeGenerationLogits, nb::const_),
+            nb::overload_cast<bool>(&tr::ModelConfig::computeGenerationLogits))
+        .def_prop_rw("model_variant", &tr::ModelConfig::getModelVariant, &tr::ModelConfig::setModelVariant)
+        .def_prop_rw("use_cross_attention", &tr::ModelConfig::useCrossAttention, &tr::ModelConfig::setUseCrossAttention)
+        .def_prop_rw("lora_modules", &tr::ModelConfig::getLoraModules, &tr::ModelConfig::setLoraModules)
+        .def_prop_rw("max_lora_rank", &tr::ModelConfig::getMaxLoraRank, &tr::ModelConfig::setMaxLoraRank)
+        .def_prop_rw("mlp_hidden_size", &tr::ModelConfig::getMlpHiddenSize, &tr::ModelConfig::setMlpHiddenSize)
+        .def_prop_rw("size_per_head", &tr::ModelConfig::getSizePerHead, &tr::ModelConfig::setSizePerHead);
+
+    nb::class_<tr::WorldConfig>(m, "WorldConfig")
+        .def(nb::init<SizeType32, SizeType32, SizeType32, SizeType32, SizeType32,
+                 std::optional<std::vector<SizeType32>> const&, bool>(),
+            nb::arg("tensor_parallelism") = 1, nb::arg("pipeline_parallelism") = 1, nb::arg("context_parallelism") = 1,
+            nb::arg("rank") = 0, nb::arg("gpus_per_node") = tr::WorldConfig::kDefaultGpusPerNode,
+            nb::arg("device_ids") = nb::none(), nb::arg("enable_attention_dp") = false)
+        .def_prop_ro("size", &tr::WorldConfig::getSize)
+        .def_prop_ro("tensor_parallelism", &tr::WorldConfig::getTensorParallelism)
+        .def_prop_ro("pipeline_parallelism", &tr::WorldConfig::getPipelineParallelism)
+        .def_prop_ro("context_parallelism", &tr::WorldConfig::getContextParallelism)
+        .def_prop_ro("is_tensor_parallel", &tr::WorldConfig::isTensorParallel)
+        .def_prop_ro("is_pipeline_parallel", &tr::WorldConfig::isPipelineParallel)
+        .def_prop_ro("is_context_parallel", &tr::WorldConfig::isContextParallel)
+        .def_prop_ro("rank", &tr::WorldConfig::getRank)
+        .def_prop_ro("local_rank", &tr::WorldConfig::getLocalRank)
+        .def_prop_ro("node_rank", &tr::WorldConfig::getNodeRank)
+        .def_prop_ro("gpus_per_node", &tr::WorldConfig::getGpusPerNode)
+        .def_prop_ro("gpus_per_group", &tr::WorldConfig::getGpusPerGroup)
+        .def_prop_ro("device", &tr::WorldConfig::getDevice)
+        .def_prop_ro("pipeline_parallel_rank", &tr::WorldConfig::getPipelineParallelRank)
+        .def_prop_ro("tensor_parallel_rank", &tr::WorldConfig::getTensorParallelRank)
+        .def_prop_ro("context_parallel_rank", &tr::WorldConfig::getContextParallelRank)
+        .def_prop_ro("enable_attention_dp", &tr::WorldConfig::enableAttentionDP)
+        .def_static("mpi",
+            nb::overload_cast<SizeType32, std::optional<SizeType32>, std::optional<SizeType32>,
+                std::optional<SizeType32>, std::optional<std::vector<SizeType32>> const&, bool>(&tr::WorldConfig::mpi),
+            nb::arg("gpus_per_node") = tr::WorldConfig::kDefaultGpusPerNode, nb::arg("tensor_parallelism") = nb::none(),
+            nb::arg("pipeline_parallelism") = nb::none(), nb::arg("context_parallelism") = nb::none(),
+            nb::arg("device_ids") = nb::none(), nb::arg("enable_attention_dp") = false);
+
+    auto SamplingConfigGetState = [](tr::SamplingConfig const& config) -> nb::tuple
+    {
+        return nb::make_tuple(config.beamWidth, config.temperature, config.minLength, config.repetitionPenalty,
+            config.presencePenalty, config.frequencyPenalty, config.topK, config.topP, config.randomSeed,
+            config.topPDecay, config.topPMin, config.topPResetIds, config.beamSearchDiversityRate, config.lengthPenalty,
+            config.earlyStopping, config.noRepeatNgramSize, config.numReturnSequences, config.minP,
+            config.beamWidthArray);
+    };
+    auto SamplingConfigSetState = [](tr::SamplingConfig& self, nb::tuple t) -> tr::SamplingConfig
+    {
+        assert(t.size() == 19);
+
+        tr::SamplingConfig config;
+        config.beamWidth = nb::cast<SizeType32>(t[0]);
+        config.temperature = nb::cast<OptVec<float>>(t[1]);
+        config.minLength = nb::cast<OptVec<SizeType32>>(t[2]);
+        config.repetitionPenalty = nb::cast<OptVec<float>>(t[3]);
+        config.presencePenalty = nb::cast<OptVec<float>>(t[4]);
+        config.frequencyPenalty = nb::cast<OptVec<float>>(t[5]);
+        config.topK = nb::cast<OptVec<SizeType32>>(t[6]);
+        config.topP = nb::cast<OptVec<float>>(t[7]);
+        config.randomSeed = nb::cast<OptVec<uint64_t>>(t[8]);
+        config.topPDecay = nb::cast<OptVec<float>>(t[9]);
+        config.topPMin = nb::cast<OptVec<float>>(t[10]);
+        config.topPResetIds = nb::cast<OptVec<TokenIdType>>(t[11]);
+        config.beamSearchDiversityRate = nb::cast<OptVec<float>>(t[12]);
+        config.lengthPenalty = nb::cast<OptVec<float>>(t[13]);
+        config.earlyStopping = nb::cast<OptVec<SizeType32>>(t[14]);
+        config.noRepeatNgramSize = nb::cast<OptVec<SizeType32>>(t[15]);
+        config.numReturnSequences = nb::cast<SizeType32>(t[16]);
+        config.minP = nb::cast<OptVec<float>>(t[17]);
+        config.beamWidthArray = nb::cast<OptVec<std::vector<SizeType32>>>(t[18]);
+
+        return config;
+    };
+
+    nb::class_<tr::SamplingConfig>(m, "SamplingConfig")
+        .def(nb::init<SizeType32>(), nb::arg("beam_width") = 1)
+        .def(nb::init<tle::SamplingConfig, std::optional<tle::ExternalDraftTokensConfig>>(),
+            nb::arg("executor_sample_config"), nb::arg("external_draft_tokens_config") = std::nullopt)
+        .def_rw("beam_width", &tr::SamplingConfig::beamWidth)
+        .def_rw("temperature", &tr::SamplingConfig::temperature)
+        .def_rw("min_length", &tr::SamplingConfig::minLength)
+        .def_rw("repetition_penalty", &tr::SamplingConfig::repetitionPenalty)
+        .def_rw("presence_penalty", &tr::SamplingConfig::presencePenalty)
+        .def_rw("frequency_penalty", &tr::SamplingConfig::frequencyPenalty)
+        .def_rw("top_k", &tr::SamplingConfig::topK)
+        .def_rw("top_p", &tr::SamplingConfig::topP)
+        .def_rw("random_seed", &tr::SamplingConfig::randomSeed)
+        .def_rw("top_p_decay", &tr::SamplingConfig::topPDecay)
+        .def_rw("top_p_min", &tr::SamplingConfig::topPMin)
+        .def_rw("top_p_reset_ids", &tr::SamplingConfig::topPResetIds)
+        .def_rw("beam_search_diversity_rate", &tr::SamplingConfig::beamSearchDiversityRate)
+        .def_rw("length_penalty", &tr::SamplingConfig::lengthPenalty)
+        .def_rw("early_stopping", &tr::SamplingConfig::earlyStopping)
+        .def_rw("no_repeat_ngram_size", &tr::SamplingConfig::noRepeatNgramSize)
+        .def_rw("num_return_sequences", &tr::SamplingConfig::numReturnSequences)
+        .def_rw("min_p", &tr::SamplingConfig::minP)
+        .def_rw("beam_width_array", &tr::SamplingConfig::beamWidthArray)
+        .def_rw("normalize_log_probs", &tr::SamplingConfig::normalizeLogProbs)
+        .def("__getstate__", SamplingConfigGetState)
+        .def("__setstate__", SamplingConfigSetState)
+        .def("__eq__", &tr::SamplingConfig::operator==);
+
+    nb::bind_vector<std::vector<tr::SamplingConfig>>(m, "SamplingConfigVector");
+
+    m.def("make_sampling_config", &makeSamplingConfig, nb::arg("configs"));
+
+    nb::class_<tr::GptJsonConfig>(m, "GptJsonConfig")
+        .def(nb::init<std::string, std::string, std::string, SizeType32, SizeType32, SizeType32, SizeType32,
+                 tr::ModelConfig, std::optional<tr::RuntimeDefaults>>(),
+            nb::arg("name"), nb::arg("version"), nb::arg("precision"), nb::arg("tensor_parallelism"),
+            nb::arg("pipeline_parallelism"), nb::arg("context_parallelism"), nb::arg("gpus_per_node"),
+            nb::arg("model_config"), nb::arg("runtime_defaults") = nb::none())
+        .def_static("parse", nb::overload_cast<std::string const&>(&tr::GptJsonConfig::parse), nb::arg("json"))
+        .def_static(
+            "parse_file", nb::overload_cast<std::filesystem::path const&>(&tr::GptJsonConfig::parse), nb::arg("path"))
+        .def_prop_ro("model_config", &tr::GptJsonConfig::getModelConfig)
+        .def_prop_ro("name", &tr::GptJsonConfig::getName)
+        .def_prop_ro("version", &tr::GptJsonConfig::getVersion)
+        .def_prop_ro("precision", &tr::GptJsonConfig::getPrecision)
+        .def_prop_ro("tensor_parallelism", &tr::GptJsonConfig::getTensorParallelism)
+        .def_prop_ro("pipeline_parallelism", &tr::GptJsonConfig::getPipelineParallelism)
+        .def_prop_ro("context_parallelism", &tr::GptJsonConfig::getContextParallelism)
+        .def_prop_ro("gpus_per_node", &tr::GptJsonConfig::getGpusPerNode)
+        .def_prop_ro("world_size", &tr::GptJsonConfig::getWorldSize)
+        .def_prop_ro("runtime_defaults", &tr::GptJsonConfig::getRuntimeDefaults)
+        .def("engine_filename",
+            nb::overload_cast<tr::WorldConfig const&, std::string const&>(
+                &tr::GptJsonConfig::engineFilename, nb::const_),
+            nb::arg("world_config"), nb::arg("model"))
+        .def("engine_filename",
+            nb::overload_cast<tr::WorldConfig const&>(&tr::GptJsonConfig::engineFilename, nb::const_),
+            nb::arg("world_config"));
+
+    nb::enum_<tb::LlmRequestState>(m, "LlmRequestState")
+        .value("UNKNOWN", tb::LlmRequestState::kUNKNOWN)
+        .value("ENCODER_INIT", tb::LlmRequestState::kENCODER_INIT)
+        .value("CONTEXT_INIT", tb::LlmRequestState::kCONTEXT_INIT)
+        .value("GENERATION_IN_PROGRESS", tb::LlmRequestState::kGENERATION_IN_PROGRESS)
+        .value("GENERATION_TO_COMPLETE", tb::LlmRequestState::kGENERATION_TO_COMPLETE)
+        .value("GENERATION_COMPLETE", tb::LlmRequestState::kGENERATION_COMPLETE)
+        .value("DISAGG_GENERATION_INIT", tb::LlmRequestState::kDISAGG_GENERATION_INIT)
+        .value("DISAGG_CONTEXT_TRANS_IN_PROGRESS", tb::LlmRequestState::kDISAGG_CONTEXT_TRANS_IN_PROGRESS)
+        .value("DISAGG_CONTEXT_COMPLETE", tb::LlmRequestState::kDISAGG_CONTEXT_COMPLETE)
+        .value("DISAGG_GENERATION_TRANS_IN_PROGRESS", tb::LlmRequestState::kDISAGG_GENERATION_TRANS_IN_PROGRESS)
+        .value("DISAGG_GENERATION_TRANS_COMPLETE", tb::LlmRequestState::kDISAGG_GENERATION_TRANS_COMPLETE)
+        .value("DISAGG_CONTEXT_INIT_AND_TRANS", tb::LlmRequestState::kDISAGG_CONTEXT_INIT_AND_TRANS);
+
+    nb::class_<tr::MemoryCounters>(m, "MemoryCounters")
+        .def_static("instance", &tr::MemoryCounters::getInstance, nb::rv_policy::reference)
+        .def_prop_ro("gpu", &tr::MemoryCounters::getGpu)
+        .def_prop_ro("cpu", &tr::MemoryCounters::getCpu)
+        .def_prop_ro("pinned", &tr::MemoryCounters::getPinned)
+        .def_prop_ro("uvm", &tr::MemoryCounters::getUVM);
+
+    tensorrt_llm::nanobind::runtime::initBindings(mInternalRuntime);
+    tensorrt_llm::nanobind::testing::initBindings(mInternalTesting);
+    tpb::initBindings(mInternalBatchManager);
+    tb::kv_cache_manager::KVCacheManagerBindings::initBindings(mInternalBatchManager);
+    tb::BasePeftCacheManagerBindings::initBindings(mInternalBatchManager);
+    tb::CacheTransceiverBindings::initBindings(mInternalBatchManager);
+    tpb::Buffers::initBindings(mInternalBatchManager);
+
+    auto mInternalAlgorithms = mInternal.def_submodule("algorithms", "Algorithms internal bindings");
+    tpb::algorithms::initBindings(mInternalAlgorithms);
+
+    auto mUserbuffers = mInternal.def_submodule("userbuffers", "User buffers internal bindings");
+    tensorrt_llm::kernels::userbuffers::UserBufferBindings::initBindings(mUserbuffers);
+
+    // NVLS allocators
+    nb::class_<tr::IpcNvlsHandle>(m, "IpcNvlsHandle")
+        .def(nb::init<>())
+        .def_rw("uc_ptr", &tr::IpcNvlsHandle::uc_ptr)
+        .def_rw("mc_ptr", &tr::IpcNvlsHandle::mc_ptr)
+        .def_rw("size", &tr::IpcNvlsHandle::size)
+        .def("get_ipc_ptrs",
+            [](tr::IpcNvlsHandle& self) { return reinterpret_cast<uintptr_t>(self.ipc_uc_ptrs.data()); });
+
+    m.def("ipc_nvls_allocate", &tr::ipcNvlsAllocate, nb::rv_policy::reference);
+    m.def("ipc_nvls_free", &tr::ipcNvlsFree);
+    m.def("ipc_nvls_supported", &tr::ipcNvlsSupported);
 }
diff --git a/cpp/tensorrt_llm/nanobind/common/bindTypes.h b/cpp/tensorrt_llm/nanobind/common/bindTypes.h
new file mode 100644
index 00000000000..5cd714e458a
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/common/bindTypes.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <nanobind/make_iterator.h>
+#include <nanobind/nanobind.h>
+#include <nanobind/stl/string.h>
+
+namespace PybindUtils
+{
+
+namespace nb = nanobind;
+
+template <typename T>
+void bindList(nb::module_& m, std::string const& name)
+{
+    nb::class_<T>(m, name.c_str())
+        .def(nb::init<>())
+        .def("push_back", [](T& lst, const typename T::value_type& value) { lst.push_back(value); })
+        .def("pop_back", [](T& lst) { lst.pop_back(); })
+        .def("push_front", [](T& lst, const typename T::value_type& value) { lst.push_front(value); })
+        .def("pop_front", [](T& lst) { lst.pop_front(); })
+        .def("__len__", [](T const& lst) { return lst.size(); })
+        .def(
+            "__iter__", [](T& lst) { return nb::make_iterator(nb::type<T>(), "iterator", lst.begin(), lst.end()); },
+            nb::keep_alive<0, 1>())
+        .def("__getitem__",
+            [](T const& lst, size_t index)
+            {
+                if (index >= lst.size())
+                    throw nb::index_error();
+                auto it = lst.begin();
+                std::advance(it, index);
+                return *it;
+            })
+        .def("__setitem__",
+            [](T& lst, size_t index, const typename T::value_type& value)
+            {
+                if (index >= lst.size())
+                    throw nb::index_error();
+                auto it = lst.begin();
+                std::advance(it, index);
+                *it = value;
+            });
+}
+
+template <typename T>
+void bindSet(nb::module_& m, std::string const& name)
+{
+    nb::class_<T>(m, name.c_str())
+        .def(nb::init<>())
+        .def("clear", &T::clear)
+        .def("size", &T::size)
+        .def("insert", [](T& s, typename T::value_type const& value) { s.insert(value); })
+        .def("erase", nb::overload_cast<typename T::value_type const&>(&T::erase))
+        .def("__len__", [](T const& lst) { return lst.size(); })
+        .def("__contains__", [](T const& s, typename T::value_type x) { return s.find(x) != s.end(); })
+        .def(
+            "__iter__", [](T& s) { return nb::make_iterator(nb::type<T>(), "iterator", s.begin(), s.end()); },
+            nb::keep_alive<0, 1>())
+        .def("__eq__", [](T const& s, T const& other) { return s == other; })
+        .def("__getstate__",
+            [](T const& v)
+            {
+                /* Return a tuple that fully encodes the state of the object */
+                return nb::make_tuple(std::vector<typename T::value_type>(v.begin(), v.end()));
+            })
+        .def("__setstate__",
+            [](T& v, nb::tuple const& t)
+            {
+                if (t.size() != 1)
+                    throw std::runtime_error("Invalid state!");
+                /* Create a new C++ instance */
+                T s;
+                /* Assign any additional state */
+                auto state_list = nb::cast<std::vector<typename T::value_type>>(t[0]);
+                for (auto& item : state_list)
+                {
+                    s.insert(item);
+                }
+                return s;
+            });
+}
+
+} // namespace PybindUtils
diff --git a/cpp/tensorrt_llm/nanobind/common/customCasters.h b/cpp/tensorrt_llm/nanobind/common/customCasters.h
new file mode 100644
index 00000000000..7cfa07d249a
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/common/customCasters.h
@@ -0,0 +1,345 @@
+/*
+ * Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "tensorrt_llm/batch_manager/common.h"
+#include "tensorrt_llm/batch_manager/decoderBuffers.h"
+#include "tensorrt_llm/common/optionalRef.h"
+#include "tensorrt_llm/runtime/cudaStream.h"
+#include "tensorrt_llm/runtime/request.h"
+#include "tensorrt_llm/runtime/samplingConfig.h"
+#include "tensorrt_llm/runtime/torch.h"
+#include "tensorrt_llm/runtime/torchView.h"
+
+#include <ATen/DLConvertor.h>
+#include <deque>
+#include <filesystem>
+#include <nanobind/nanobind.h>
+#include <nanobind/stl/filesystem.h>
+#include <nanobind/stl/optional.h>
+#include <nanobind/stl/string.h>
+#include <nanobind/stl/vector.h>
+#include <torch/csrc/autograd/python_variable.h>
+#include <torch/csrc/autograd/variable.h>
+#include <torch/extension.h>
+#include <torch/torch.h>
+
+// Pybind requires to have a central include in order for type casters to work.
+// Opaque bindings add a type caster, so they have the same requirement.
+// See the warning in https://pybind11.readthedocs.io/en/stable/advanced/cast/custom.html
+
+// Opaque bindings
+NB_MAKE_OPAQUE(tensorrt_llm::batch_manager::ReqIdsSet)
+NB_MAKE_OPAQUE(std::vector<tensorrt_llm::batch_manager::SlotDecoderBuffers>)
+NB_MAKE_OPAQUE(std::vector<tensorrt_llm::runtime::decoder_batch::Request>)
+NB_MAKE_OPAQUE(std::vector<tensorrt_llm::runtime::SamplingConfig>)
+NB_MAKE_OPAQUE(std::vector<std::vector<tensorrt_llm::runtime::SizeType32>>)
+
+namespace nb = nanobind;
+
+// Custom casters
+namespace NB_NAMESPACE
+{
+
+namespace detail
+{
+
+template <typename T, typename Alloc>
+struct type_caster<std::deque<T, Alloc>>
+{
+    using Type = std::deque<T, Alloc>;
+    NB_TYPE_CASTER(Type, const_name("List"));
+
+    bool from_python(handle src, uint8_t flags, cleanup_list* cleanup) noexcept
+    {
+        sequence seq(src, nanobind::detail::borrow_t{});
+        value.clear();
+        make_caster<T> caster;
+        for (auto const& item : seq)
+        {
+            if (!caster.from_python(item, flags, cleanup))
+                return false;
+            value.push_back(caster.operator T&());
+        }
+        return true;
+    }
+
+    static handle from_cpp(Type const& deque, rv_policy policy, cleanup_list* cleanup) noexcept
+    {
+        nb::list list;
+
+        for (auto const& item : deque)
+        {
+            nb::object py_item = steal(make_caster<T>::from_cpp(item, policy, cleanup));
+            if (!py_item)
+                return {};
+            list.append(py_item);
+        }
+        return list.release();
+    }
+};
+
+template <typename T>
+struct type_caster<tensorrt_llm::common::OptionalRef<T>>
+{
+    using value_conv = make_caster<T>;
+
+    NB_TYPE_CASTER(tensorrt_llm::common::OptionalRef<T>, value_conv::Name);
+
+    bool from_python(handle src, uint8_t flags, cleanup_list* cleanup)
+    {
+        if (src.is_none())
+        {
+            // If the Python object is None, create an empty OptionalRef
+            value = tensorrt_llm::common::OptionalRef<T>();
+            return true;
+        }
+
+        value_conv conv;
+        if (!conv.from_python(src, flags, cleanup))
+            return false;
+
+        // Create an OptionalRef with a reference to the converted value
+        value = tensorrt_llm::common::OptionalRef<T>(conv);
+        return true;
+    }
+
+    static handle from_cpp(tensorrt_llm::common::OptionalRef<T> const& src, rv_policy policy, cleanup_list* cleanup)
+    {
+        if (!src.has_value())
+            return none().release();
+
+        return value_conv::from_cpp(*src, policy, cleanup);
+    }
+};
+
+template <typename T>
+struct PathCaster
+{
+
+private:
+    static PyObject* unicode_from_fs_native(std::string const& w)
+    {
+        return PyUnicode_DecodeFSDefaultAndSize(w.c_str(), ssize_t(w.size()));
+    }
+
+    static PyObject* unicode_from_fs_native(std::wstring const& w)
+    {
+        return PyUnicode_FromWideChar(w.c_str(), ssize_t(w.size()));
+    }
+
+public:
+    static handle from_cpp(T const& path, rv_policy, cleanup_list* cleanup)
+    {
+        if (auto py_str = unicode_from_fs_native(path.native()))
+        {
+            return module_::import_("pathlib").attr("Path")(steal<object>(py_str), cleanup).release();
+        }
+        return nullptr;
+    }
+
+    bool from_python(handle src, uint8_t flags, cleanup_list* cleanup)
+    {
+        PyObject* native = nullptr;
+        if constexpr (std::is_same_v<typename T::value_type, char>)
+        {
+            if (PyUnicode_FSConverter(src.ptr(), &native) != 0)
+            {
+                if (auto* c_str = PyBytes_AsString(native))
+                {
+                    // AsString returns a pointer to the internal buffer, which
+                    // must not be free'd.
+                    value = c_str;
+                }
+            }
+        }
+        else if constexpr (std::is_same_v<typename T::value_type, wchar_t>)
+        {
+            if (PyUnicode_FSDecoder(src.ptr(), &native) != 0)
+            {
+                if (auto* c_str = PyUnicode_AsWideCharString(native, nullptr))
+                {
+                    // AsWideCharString returns a new string that must be free'd.
+                    value = c_str; // Copies the string.
+                    PyMem_Free(c_str);
+                }
+            }
+        }
+        Py_XDECREF(native);
+        if (PyErr_Occurred())
+        {
+            PyErr_Clear();
+            return false;
+        }
+        return true;
+    }
+
+    NB_TYPE_CASTER(T, const_name("os.PathLike"));
+};
+
+template <>
+class type_caster<tensorrt_llm::executor::StreamPtr>
+{
+public:
+    NB_TYPE_CASTER(tensorrt_llm::executor::StreamPtr, const_name("int"));
+
+    bool from_python([[maybe_unused]] handle src, uint8_t flags, cleanup_list* cleanup)
+    {
+        auto stream_ptr = nanobind::cast<uintptr_t>(src);
+        value = std::make_shared<tensorrt_llm::runtime::CudaStream>(reinterpret_cast<cudaStream_t>(stream_ptr));
+
+        return true;
+    }
+
+    static handle from_cpp(
+        tensorrt_llm::executor::StreamPtr const& src, rv_policy /* policy */, cleanup_list* /* cleanup */)
+    {
+        // Return cudaStream_t as integer.
+        return PyLong_FromVoidPtr(src->get());
+    }
+};
+
+template <>
+struct type_caster<tensorrt_llm::executor::Tensor>
+{
+public:
+    NB_TYPE_CASTER(tensorrt_llm::executor::Tensor, const_name("torch.Tensor"));
+
+    // Convert PyObject(torch.Tensor) -> tensorrt_llm::executor::Tensor
+    bool from_python(handle src, uint8_t flags, cleanup_list* cleanup)
+    {
+        PyObject* obj = src.ptr();
+        if (THPVariable_Check(obj))
+        {
+            at::Tensor const& t = THPVariable_Unpack(obj);
+            value = tensorrt_llm::executor::detail::ofITensor(tensorrt_llm::runtime::TorchView::of(t));
+            return true;
+        }
+        return false;
+    }
+
+    // Convert tensorrt_llm::executor::Tensor -> PyObject(torch.Tensor)
+    static handle from_cpp(
+        tensorrt_llm::executor::Tensor const& src, rv_policy /* policy */, cleanup_list* /* cleanup */)
+    {
+        return THPVariable_Wrap(tensorrt_llm::runtime::Torch::tensor(tensorrt_llm::executor::detail::toITensor(src)));
+    }
+};
+
+template <>
+struct type_caster<tensorrt_llm::runtime::ITensor::SharedPtr>
+{
+public:
+    NB_TYPE_CASTER(tensorrt_llm::runtime::ITensor::SharedPtr, const_name("torch.Tensor"));
+
+    // Convert PyObject(torch.Tensor) -> tensorrt_llm::runtime::ITensor::SharedPtr
+    bool from_python(handle src, uint8_t, cleanup_list*)
+    {
+        PyObject* obj = src.ptr();
+        if (THPVariable_Check(obj))
+        {
+            at::Tensor const& t = THPVariable_Unpack(obj);
+            value = std::move(tensorrt_llm::runtime::TorchView::of(t));
+            return true;
+        }
+        return false;
+    }
+
+    // Convert tensorrt_llm::runtime::ITensor::SharedPtr -> PyObject(torch.Tensor)
+    static handle from_cpp(
+        tensorrt_llm::runtime::ITensor::SharedPtr const& src, rv_policy /* policy */, cleanup_list* /* cleanup */)
+    {
+        if (src == nullptr)
+        {
+            return none().release();
+        }
+        return THPVariable_Wrap(tensorrt_llm::runtime::Torch::tensor(src));
+    }
+};
+
+template <>
+struct type_caster<tensorrt_llm::runtime::ITensor::SharedConstPtr>
+{
+public:
+    NB_TYPE_CASTER(tensorrt_llm::runtime::ITensor::SharedConstPtr, const_name("torch.Tensor"));
+
+    // Convert PyObject(torch.Tensor) -> tensorrt_llm::runtime::ITensor::SharedConstPtr
+    bool from_python(handle src, uint8_t, cleanup_list*)
+    {
+        PyObject* obj = src.ptr();
+        if (THPVariable_Check(obj))
+        {
+            at::Tensor const& t = THPVariable_Unpack(obj);
+            value = std::move(tensorrt_llm::runtime::TorchView::of(t));
+            return true;
+        }
+        return false;
+    }
+
+    // Convert tensorrt_llm::runtime::ITensor::SharedConstPtr -> PyObject(torch.Tensor)
+    static handle from_cpp(
+        tensorrt_llm::runtime::ITensor::SharedConstPtr const& src, rv_policy /* policy */, cleanup_list* /* cleanup */)
+    {
+        if (src == nullptr)
+        {
+            return none().release();
+        }
+        return THPVariable_Wrap(tensorrt_llm::runtime::Torch::tensor(
+            reinterpret_cast<tensorrt_llm::runtime::ITensor::SharedPtr const&>(src)));
+    }
+};
+
+template <>
+struct type_caster<at::Tensor>
+{
+    NB_TYPE_CASTER(at::Tensor, const_name("torch.Tensor"));
+
+    bool from_python(nb::handle src, uint8_t, cleanup_list*) noexcept
+    {
+        nb::object capsule = nb::getattr(src, "__dlpack__")();
+        DLManagedTensor* dl_managed = static_cast<DLManagedTensor*>(PyCapsule_GetPointer(capsule.ptr(), "dltensor"));
+        PyCapsule_SetDestructor(capsule.ptr(), nullptr);
+        value = at::fromDLPack(dl_managed).alias();
+        return true;
+    }
+
+    static handle from_cpp(at::Tensor tensor, rv_policy, cleanup_list*) noexcept
+    {
+        DLManagedTensor* dl_managed = at::toDLPack(tensor);
+        if (!dl_managed)
+            return nullptr;
+
+        nanobind::object capsule = nb::steal(PyCapsule_New(dl_managed, "dltensor",
+            [](PyObject* obj)
+            {
+                DLManagedTensor* dl = static_cast<DLManagedTensor*>(PyCapsule_GetPointer(obj, "dltensor"));
+                dl->deleter(dl);
+            }));
+        if (!capsule.is_valid())
+        {
+            dl_managed->deleter(dl_managed);
+            return nullptr;
+        }
+        nanobind::module_ torch = nanobind::module_::import_("torch");
+        nanobind::object result = torch.attr("from_dlpack")(capsule);
+        capsule.release();
+        return result.release();
+    }
+};
+} // namespace detail
+} // namespace NB_NAMESPACE
diff --git a/cpp/tensorrt_llm/nanobind/executor/bindings.cpp b/cpp/tensorrt_llm/nanobind/executor/bindings.cpp
new file mode 100644
index 00000000000..d3f482df899
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/executor/bindings.cpp
@@ -0,0 +1,263 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "bindings.h"
+#include "executor.h"
+#include "executorConfig.h"
+#include "request.h"
+#include "tensorrt_llm/executor/executor.h"
+#include "tensorrt_llm/executor/types.h"
+#include "tensorrt_llm/nanobind/common/customCasters.h"
+
+#include <nanobind/nanobind.h>
+#include <nanobind/stl/optional.h>
+#include <nanobind/stl/string.h>
+#include <nanobind/stl/tuple.h>
+#include <nanobind/stl/variant.h>
+#include <optional>
+
+namespace nb = nanobind;
+namespace tle = tensorrt_llm::executor;
+using SizeType32 = tle::SizeType32;
+
+namespace tensorrt_llm::nanobind::executor
+{
+
+template <typename T>
+void instantiateEventDiff(nb::module_& m, std::string const& name)
+{
+    nb::class_<tle::KVCacheEventDiff<T>>(m, ("KVCacheEventDiff" + name).c_str())
+        .def_ro("old_value", &tle::KVCacheEventDiff<T>::oldValue)
+        .def_ro("new_value", &tle::KVCacheEventDiff<T>::newValue);
+}
+
+void initBindings(nb::module_& m)
+{
+    m.attr("__version__") = tle::version();
+    nb::enum_<tle::ModelType>(m, "ModelType")
+        .value("DECODER_ONLY", tle::ModelType::kDECODER_ONLY)
+        .value("ENCODER_ONLY", tle::ModelType::kENCODER_ONLY)
+        .value("ENCODER_DECODER", tle::ModelType::kENCODER_DECODER);
+
+    auto decodingModeGetstate = [](tle::DecodingMode const& self) { return nb::make_tuple(self.getState()); };
+    auto decodingModeSetstate = [](tle::DecodingMode& self, nb::tuple const& state)
+    {
+        if (state.size() != 1)
+        {
+            throw std::runtime_error("Invalid state!");
+        }
+        new (&self) tle::DecodingMode(nb::cast<tle::DecodingMode::UnderlyingType>(state[0]));
+    };
+    nb::class_<tle::DecodingMode>(m, "DecodingMode")
+        .def("Auto", &tle::DecodingMode::Auto)
+        .def("TopK", &tle::DecodingMode::TopK)
+        .def("TopP", &tle::DecodingMode::TopP)
+        .def("TopKTopP", &tle::DecodingMode::TopKTopP)
+        .def("BeamSearch", &tle::DecodingMode::BeamSearch)
+        .def("Medusa", &tle::DecodingMode::Medusa)
+        .def("Lookahead", &tle::DecodingMode::Lookahead)
+        .def("ExplicitDraftTokens", &tle::DecodingMode::ExplicitDraftTokens)
+        .def("Eagle", &tle::DecodingMode::Eagle)
+        .def("isAuto", &tle::DecodingMode::isAuto)
+        .def("isTopK", &tle::DecodingMode::isTopK)
+        .def("isTopP", &tle::DecodingMode::isTopP)
+        .def("isTopKorTopP", &tle::DecodingMode::isTopKorTopP)
+        .def("isTopKandTopP", &tle::DecodingMode::isTopKandTopP)
+        .def("isBeamSearch", &tle::DecodingMode::isBeamSearch)
+        .def("isMedusa", &tle::DecodingMode::isMedusa)
+        .def("isLookahead", &tle::DecodingMode::isLookahead)
+        .def("isExplicitDraftTokens", &tle::DecodingMode::isExplicitDraftTokens)
+        .def("isEagle", &tle::DecodingMode::isEagle)
+        .def("useVariableBeamWidthSearch", &tle::DecodingMode::useVariableBeamWidthSearch)
+        .def_prop_ro("name", &tle::DecodingMode::getName)
+        .def("__getstate__", decodingModeGetstate)
+        .def("__setstate__", decodingModeSetstate);
+
+    nb::enum_<tle::CapacitySchedulerPolicy>(m, "CapacitySchedulerPolicy")
+        .value("MAX_UTILIZATION", tle::CapacitySchedulerPolicy::kMAX_UTILIZATION)
+        .value("GUARANTEED_NO_EVICT", tle::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT)
+        .value("STATIC_BATCH", tle::CapacitySchedulerPolicy::kSTATIC_BATCH);
+
+    nb::enum_<tle::ContextChunkingPolicy>(m, "ContextChunkingPolicy")
+        .value("EQUAL_PROGRESS", tle::ContextChunkingPolicy::kEQUAL_PROGRESS)
+        .value("FIRST_COME_FIRST_SERVED", tle::ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED);
+
+    nb::enum_<tle::CommunicationType>(m, "CommunicationType").value("MPI", tle::CommunicationType::kMPI);
+
+    nb::enum_<tle::CommunicationMode>(m, "CommunicationMode")
+        .value("LEADER", tle::CommunicationMode::kLEADER)
+        .value("ORCHESTRATOR", tle::CommunicationMode::kORCHESTRATOR);
+
+    nb::class_<tle::KvCacheStats>(m, "KvCacheStats")
+        .def(nb::init<>())
+        .def_rw("max_num_blocks", &tle::KvCacheStats::maxNumBlocks)
+        .def_rw("free_num_blocks", &tle::KvCacheStats::freeNumBlocks)
+        .def_rw("used_num_blocks", &tle::KvCacheStats::usedNumBlocks)
+        .def_rw("tokens_per_block", &tle::KvCacheStats::tokensPerBlock)
+        .def_rw("alloc_total_blocks", &tle::KvCacheStats::allocTotalBlocks)
+        .def_rw("alloc_new_blocks", &tle::KvCacheStats::allocNewBlocks)
+        .def_rw("reused_blocks", &tle::KvCacheStats::reusedBlocks)
+        .def_rw("missed_blocks", &tle::KvCacheStats::missedBlocks)
+        .def_rw("cache_hit_rate", &tle::KvCacheStats::cacheHitRate);
+
+    nb::class_<tle::StaticBatchingStats>(m, "StaticBatchingStats")
+        .def(nb::init<>())
+        .def_rw("num_scheduled_requests", &tle::StaticBatchingStats::numScheduledRequests)
+        .def_rw("num_context_requests", &tle::StaticBatchingStats::numContextRequests)
+        .def_rw("num_ctx_tokens", &tle::StaticBatchingStats::numCtxTokens)
+        .def_rw("num_gen_tokens", &tle::StaticBatchingStats::numGenTokens)
+        .def_rw("empty_gen_slots", &tle::StaticBatchingStats::emptyGenSlots);
+
+    nb::class_<tle::InflightBatchingStats>(m, "InflightBatchingStats")
+        .def(nb::init<>())
+        .def_rw("num_scheduled_requests", &tle::InflightBatchingStats::numScheduledRequests)
+        .def_rw("num_context_requests", &tle::InflightBatchingStats::numContextRequests)
+        .def_rw("num_gen_requests", &tle::InflightBatchingStats::numGenRequests)
+        .def_rw("num_paused_requests", &tle::InflightBatchingStats::numPausedRequests)
+        .def_rw("num_ctx_tokens", &tle::InflightBatchingStats::numCtxTokens)
+        .def_rw("micro_batch_id", &tle::InflightBatchingStats::microBatchId)
+        .def_rw("avg_num_decoded_tokens_per_iter", &tle::InflightBatchingStats::avgNumDecodedTokensPerIter);
+
+    nb::class_<tle::SpecDecodingStats>(m, "SpecDecodingStats")
+        .def(nb::init<>())
+        .def_rw("num_draft_tokens", &tle::SpecDecodingStats::numDraftTokens)
+        .def_rw("num_accepted_tokens", &tle::SpecDecodingStats::numAcceptedTokens)
+        .def_rw("num_requests_with_draft_tokens", &tle::SpecDecodingStats::numRequestsWithDraftTokens)
+        .def_rw("acceptance_length", &tle::SpecDecodingStats::acceptanceLength)
+        .def_rw("iter_latency_ms", &tle::SpecDecodingStats::iterLatencyMS)
+        .def_rw("draft_overhead", &tle::SpecDecodingStats::draftOverhead);
+
+    nb::class_<tle::IterationStats>(m, "IterationStats")
+        .def(nb::init<>())
+        .def_rw("timestamp", &tle::IterationStats::timestamp)
+        .def_rw("iter", &tle::IterationStats::iter)
+        .def_rw("iter_latency_ms", &tle::IterationStats::iterLatencyMS)
+        .def_rw("new_active_requests_queue_latency_ms", &tle::IterationStats::newActiveRequestsQueueLatencyMS)
+        .def_rw("num_new_active_requests", &tle::IterationStats::numNewActiveRequests)
+        .def_rw("num_active_requests", &tle::IterationStats::numActiveRequests)
+        .def_rw("num_queued_requests", &tle::IterationStats::numQueuedRequests)
+        .def_rw("num_completed_requests", &tle::IterationStats::numCompletedRequests)
+        .def_rw("max_num_active_requests", &tle::IterationStats::maxNumActiveRequests)
+        .def_rw("gpu_mem_usage", &tle::IterationStats::gpuMemUsage)
+        .def_rw("cpu_mem_usage", &tle::IterationStats::cpuMemUsage)
+        .def_rw("pinned_mem_usage", &tle::IterationStats::pinnedMemUsage)
+        .def_rw("kv_cache_stats", &tle::IterationStats::kvCacheStats)
+        .def_rw("cross_kv_cache_stats", &tle::IterationStats::crossKvCacheStats)
+        .def_rw("static_batching_stats", &tle::IterationStats::staticBatchingStats)
+        .def_rw("inflight_batching_stats", &tle::IterationStats::inflightBatchingStats)
+        .def_rw("specdec_stats", &tle::IterationStats::specDecodingStats)
+        .def("to_json_str",
+            [](tle::IterationStats const& iterationStats)
+            { return tle::JsonSerialization::toJsonStr(iterationStats); });
+
+    nb::class_<tle::DebugTensorsPerIteration>(m, "DebugTensorsPerIteration")
+        .def(nb::init<>())
+        .def_rw("iter", &tle::DebugTensorsPerIteration::iter)
+        .def_rw("debug_tensors", &tle::DebugTensorsPerIteration::debugTensors);
+
+    nb::enum_<tle::RequestStage>(m, "RequestStage")
+        .value("QUEUED", tle::RequestStage::kQUEUED)
+        .value("ENCODER_IN_PROGRESS", tle::RequestStage::kENCODER_IN_PROGRESS)
+        .value("CONTEXT_IN_PROGRESS", tle::RequestStage::kCONTEXT_IN_PROGRESS)
+        .value("GENERATION_IN_PROGRESS", tle::RequestStage::kGENERATION_IN_PROGRESS)
+        .value("GENERATION_COMPLETE", tle::RequestStage::kGENERATION_COMPLETE);
+
+    nb::class_<tle::DisServingRequestStats>(m, "DisServingRequestStats")
+        .def(nb::init<>())
+        .def_rw("kv_cache_transfer_ms", &tle::DisServingRequestStats::kvCacheTransferMS)
+        .def_rw("kv_cache_size", &tle::DisServingRequestStats::kvCacheSize);
+
+    nb::class_<tle::RequestStats>(m, "RequestStats")
+        .def(nb::init<>())
+        .def_rw("id", &tle::RequestStats::id)
+        .def_rw("stage", &tle::RequestStats::stage)
+        .def_rw("context_prefill_position", &tle::RequestStats::contextPrefillPosition)
+        .def_rw("num_generated_tokens", &tle::RequestStats::numGeneratedTokens)
+        .def_rw("avg_num_decoded_tokens_per_iter", &tle::RequestStats::avgNumDecodedTokensPerIter)
+        .def_rw("scheduled", &tle::RequestStats::scheduled)
+        .def_rw("paused", &tle::RequestStats::paused)
+        .def_rw("dis_serving_stats", &tle::RequestStats::disServingStats)
+        .def_rw("alloc_total_blocks_per_request", &tle::RequestStats::allocTotalBlocksPerRequest)
+        .def_rw("alloc_new_blocks_per_request", &tle::RequestStats::allocNewBlocksPerRequest)
+        .def_rw("reused_blocks_per_request", &tle::RequestStats::reusedBlocksPerRequest)
+        .def_rw("missed_blocks_per_request", &tle::RequestStats::missedBlocksPerRequest)
+        .def_rw("kv_cache_hit_rate_per_request", &tle::RequestStats::kvCacheHitRatePerRequest)
+        .def("to_json_str",
+            [](tle::RequestStats const& iterationStats) { return tle::JsonSerialization::toJsonStr(iterationStats); });
+
+    nb::class_<tle::RequestStatsPerIteration>(m, "RequestStatsPerIteration")
+        .def(nb::init<>())
+        .def_rw("iter", &tle::RequestStatsPerIteration::iter)
+        .def_rw("request_stats", &tle::RequestStatsPerIteration::requestStats)
+        .def("to_json_str",
+            [](tle::RequestStatsPerIteration const& iterationStats)
+            { return tle::JsonSerialization::toJsonStr(iterationStats); });
+
+    nb::module_ executor_kv_cache = m.def_submodule("kv_cache", "Executor KV Cache Manager");
+
+    nb::class_<tle::KVCacheCreatedData>(executor_kv_cache, "KVCacheCreatedData")
+        .def_ro("num_blocks_per_cache_level", &tle::KVCacheCreatedData::numBlocksPerCacheLevel);
+
+    nb::class_<tensorrt_llm::runtime::UniqueToken>(executor_kv_cache, "UniqueToken")
+        .def_ro("token_id", &tensorrt_llm::runtime::UniqueToken::tokenId)
+        .def_ro("token_extra_id", &tensorrt_llm::runtime::UniqueToken::tokenExtraId);
+
+    nb::class_<tle::KVCacheStoredBlockData>(executor_kv_cache, "KVCacheStoredBlockData")
+        .def_ro("block_hash", &tle::KVCacheStoredBlockData::blockHash)
+        .def_ro("tokens", &tle::KVCacheStoredBlockData::tokens)
+        .def_ro("lora_id", &tle::KVCacheStoredBlockData::loraId)
+        .def_ro("cache_level", &tle::KVCacheStoredBlockData::cacheLevel)
+        .def_ro("priority", &tle::KVCacheStoredBlockData::priority);
+
+    nb::class_<tle::KVCacheStoredData>(executor_kv_cache, "KVCacheStoredData")
+        .def_ro("parent_hash", &tle::KVCacheStoredData::parentHash)
+        .def_ro("blocks", &tle::KVCacheStoredData::blocks);
+
+    nb::class_<tle::KVCacheRemovedData>(executor_kv_cache, "KVCacheRemovedData")
+        .def_ro("block_hashes", &tle::KVCacheRemovedData::blockHashes);
+
+    instantiateEventDiff<SizeType32>(executor_kv_cache, "Int");
+
+    nb::class_<tle::KVCacheUpdatedData>(executor_kv_cache, "KVCacheUpdatedData")
+        .def_ro("block_hash", &tle::KVCacheUpdatedData::blockHash)
+        .def_ro("cache_level", &tle::KVCacheUpdatedData::cacheLevel)
+        .def_ro("priority", &tle::KVCacheUpdatedData::priority);
+
+    nb::class_<tle::KVCacheEvent>(executor_kv_cache, "KVCacheEvent")
+        .def_ro("event_id", &tle::KVCacheEvent::eventId)
+        .def_ro("data", &tle::KVCacheEvent::data)
+        .def_ro("window_size", &tle::KVCacheEvent::windowSize);
+
+    nb::class_<tle::KVCacheEventManager>(executor_kv_cache, "KVCacheEventManager")
+        .def(
+            "get_latest_events",
+            [](tle::KVCacheEventManager& self, std::optional<double> timeout_ms = std::nullopt)
+            {
+                if (timeout_ms)
+                {
+                    return self.getLatestEvents(std::chrono::milliseconds(static_cast<int64_t>(*timeout_ms)));
+                }
+                return self.getLatestEvents(std::nullopt);
+            },
+            nb::arg("timeout_ms") = std::nullopt);
+
+    tensorrt_llm::nanobind::executor::initRequestBindings(m);
+    tensorrt_llm::nanobind::executor::initConfigBindings(m);
+    tensorrt_llm::nanobind::executor::Executor::initBindings(m);
+}
+
+} // namespace tensorrt_llm::nanobind::executor
diff --git a/cpp/tensorrt_llm/nanobind/executor/bindings.h b/cpp/tensorrt_llm/nanobind/executor/bindings.h
new file mode 100644
index 00000000000..4df52c2d34e
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/executor/bindings.h
@@ -0,0 +1,29 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <nanobind/nanobind.h>
+namespace nb = nanobind;
+
+namespace tensorrt_llm::nanobind::executor
+{
+
+// Register bindings for executor API.
+void initBindings(nb::module_& m);
+
+} // namespace tensorrt_llm::nanobind::executor
diff --git a/cpp/tensorrt_llm/nanobind/executor/executor.cpp b/cpp/tensorrt_llm/nanobind/executor/executor.cpp
new file mode 100644
index 00000000000..59c7d2a3dc1
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/executor/executor.cpp
@@ -0,0 +1,241 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "executor.h"
+#include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/logger.h"
+#include "tensorrt_llm/executor/tensor.h"
+#include "tensorrt_llm/nanobind/common/customCasters.h"
+
+#include <nanobind/nanobind.h>
+#include <nanobind/ndarray.h>
+#include <nanobind/stl/chrono.h>
+#include <nanobind/stl/filesystem.h>
+#include <nanobind/stl/map.h>
+#include <nanobind/stl/optional.h>
+#include <nanobind/stl/shared_ptr.h>
+#include <nanobind/stl/string.h>
+#include <nanobind/stl/vector.h>
+#include <torch/extension.h>
+
+namespace nb = nanobind;
+namespace tle = tensorrt_llm::executor;
+
+namespace nanobind::detail
+{
+
+template <>
+struct dtype_traits<half>
+{
+    static constexpr dlpack::dtype value{
+        (uint8_t) dlpack::dtype_code::Float, // type code
+        16,                                  // size in bits
+        1                                    // lanes (simd), usually set to 1
+    };
+    static constexpr auto name = const_name("float16");
+};
+} // namespace nanobind::detail
+
+namespace
+{
+// todo: Properly support FP8 and BF16 and verify functionality
+tle::Tensor numpyToTensor(nb::ndarray<nb::numpy> const& array)
+{
+    auto npDtype = array.dtype();
+    char kind = '\0';
+    switch (npDtype.code)
+    {
+    case static_cast<uint8_t>(nb::dlpack::dtype_code::Int):
+        kind = 'i'; // signed integer
+        break;
+    case static_cast<uint8_t>(nb::dlpack::dtype_code::UInt):
+        kind = 'u'; // unsigned integer
+        break;
+    case static_cast<uint8_t>(nb::dlpack::dtype_code::Float):
+        kind = 'f'; // floating point
+        break;
+    case static_cast<uint8_t>(nb::dlpack::dtype_code::Bfloat):
+        kind = 'f'; // brain floating point (treat as float kind)
+        break;
+    case static_cast<uint8_t>(nb::dlpack::dtype_code::Complex):
+        kind = 'c'; // complex
+        break;
+    default:
+        kind = 'V'; // void/other
+        break;
+    }
+    tle::DataType dtype;
+    if (npDtype == nb::dtype<half>())
+    {
+        dtype = tle::DataType::kFP16;
+    }
+    else if (npDtype == nb::dtype<float>())
+    {
+        dtype = tle::DataType::kFP32;
+    }
+    else if (npDtype == nb::dtype<int8_t>())
+    {
+        dtype = tle::DataType::kINT8;
+    }
+    else if (npDtype == nb::dtype<int32_t>())
+    {
+        dtype = tle::DataType::kINT32;
+    }
+    else if (npDtype == nb::dtype<int64_t>())
+    {
+        dtype = tle::DataType::kINT64;
+    }
+    else if (kind == 'V' && array.itemsize() == 1)
+    {
+        dtype = tle::DataType::kFP8;
+    }
+    else if (kind == 'V' && array.itemsize() == 2)
+    {
+        dtype = tle::DataType::kBF16;
+    }
+    else
+    {
+        TLLM_THROW("Unsupported numpy dtype.");
+    }
+
+    // todo: improve the following code
+    std::vector<int64_t> dims;
+    dims.reserve(array.ndim());
+    for (size_t i = 0; i < array.ndim(); ++i)
+    {
+        dims.push_back(static_cast<int64_t>(array.shape(i)));
+    }
+    tle::Shape shape(dims.data(), dims.size());
+
+    return tle::Tensor::of(dtype, const_cast<void*>(array.data()), shape);
+}
+
+} // namespace
+
+namespace tensorrt_llm::nanobind::executor
+{
+
+Executor::Executor(
+    std::filesystem::path const& modelPath, tle::ModelType modelType, tle::ExecutorConfig const& executorConfig)
+{
+    mExecutor = std::make_unique<tle::Executor>(modelPath, modelType, executorConfig);
+}
+
+Executor::Executor(std::filesystem::path const& encoderModelPath, std::filesystem::path const& decoderModelPath,
+    tle::ModelType modelType, tle::ExecutorConfig const& executorConfig)
+{
+    mExecutor = std::make_unique<tle::Executor>(encoderModelPath, decoderModelPath, modelType, executorConfig);
+}
+
+Executor::Executor(nb::bytes const& engineBuffer, std::string const& jsonConfigStr, tle::ModelType modelType,
+    tle::ExecutorConfig const& executorConfig, std::optional<nb::dict> managedWeights)
+{
+    uint8_t const* data = static_cast<uint8_t const*>(engineBuffer.data());
+    size_t size = engineBuffer.size();
+    std::optional<std::map<std::string, tle::Tensor>> managedWeightsMap = std::nullopt;
+    if (managedWeights.has_value() && !managedWeights.value().empty())
+    {
+        managedWeightsMap = std::map<std::string, tle::Tensor>();
+        for (auto const& [rawName, rawArray] : managedWeights.value())
+        {
+            std::string name = nb::cast<std::string>(rawName);
+            nb::ndarray<nb::numpy> array = nb::cast<nb::ndarray<nb::numpy>>(rawArray);
+            managedWeightsMap->emplace(name, numpyToTensor(array));
+        }
+    }
+    mExecutor = std::make_unique<tle::Executor>(
+        tle::BufferView(data, size), jsonConfigStr, modelType, executorConfig, managedWeightsMap);
+}
+
+Executor::Executor(std::string const& encoderEngineBuffer, std::string const& encoderJsonConfigStr,
+    std::string const& decoderEngineBuffer, std::string const& decoderJsonConfigStr, tle::ModelType modelType,
+    tle::ExecutorConfig const& executorConfig)
+{
+    uint8_t const* encoderData = reinterpret_cast<uint8_t const*>(encoderEngineBuffer.data());
+    size_t encoderSize = encoderEngineBuffer.size();
+    uint8_t const* decoderData = reinterpret_cast<uint8_t const*>(decoderEngineBuffer.data());
+    size_t decoderSize = decoderEngineBuffer.size();
+    mExecutor = std::make_unique<tle::Executor>(tle::BufferView(encoderData, encoderSize), encoderJsonConfigStr,
+        tle::BufferView(decoderData, decoderSize), decoderJsonConfigStr, modelType, executorConfig);
+}
+
+nb::object Executor::enter()
+{
+    TLLM_CHECK(static_cast<bool>(mExecutor));
+    return nb::cast(this);
+}
+
+void Executor::exit(
+    [[maybe_unused]] nb::handle type, [[maybe_unused]] nb::handle value, [[maybe_unused]] nb::handle traceback)
+{
+    shutdown();
+    mExecutor = nullptr;
+}
+
+void Executor::shutdown()
+{
+    // NOTE: we must release the GIL here. Executor has spawned a thread for the execution loop. That thread must be
+    // able to do forward progress for the shutdown process to succeed. It takes the GIL during its callbacks, so
+    // we release it now. Note that we shouldn't do anything related to python objects after that.
+    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
+    nb::gil_scoped_release release;
+    mExecutor->shutdown();
+    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
+}
+
+void Executor::initBindings(nb::module_& m)
+{
+    nb::class_<Executor>(m, "Executor")
+        .def(nb::init<std::filesystem::path const&, tle::ModelType, tle::ExecutorConfig const&>(),
+            nb::arg("model_path"), nb::arg("model_type"), nb::arg("executor_config"))
+        .def(nb::init<std::filesystem::path const&, std::filesystem::path const&, tle::ModelType,
+                 tle::ExecutorConfig const&>(),
+            nb::arg("encoder_model_path"), nb::arg("decoder_model_path"), nb::arg("model_type"),
+            nb::arg("executor_config"))
+        .def(nb::init<nb::bytes, std::string const&, tle::ModelType, tle::ExecutorConfig const&, nb::dict>(),
+            nb::arg("engine_buffer"), nb::arg("json_config_str"), nb::arg("model_type"), nb::arg("executor_config"),
+            nb::arg("managed_weights") = nb::dict())
+        .def(nb::init<std::string const&, std::string const&, std::string const&, std::string const&, tle::ModelType,
+                 tle::ExecutorConfig const&>(),
+            nb::arg("encoder_engine_buffer"), nb::arg("encoder_json_config_str"), nb::arg("decoder_engine_buffer"),
+            nb::arg("decoder_json_config_str"), nb::arg("model_type"), nb::arg("executor_config"))
+        .def("shutdown", &Executor::shutdown)
+        .def("__enter__", &Executor::enter)
+        .def("__exit__", &Executor::exit)
+        .def("enqueue_request", &Executor::enqueueRequest, nb::arg("request"))
+        .def("enqueue_requests", &Executor::enqueueRequests, nb::arg("requests"))
+        .def("await_responses",
+            nb::overload_cast<std::optional<std::chrono::milliseconds> const&>(&Executor::awaitResponses),
+            nb::arg("timeout") = nb::none())
+        .def("await_responses",
+            nb::overload_cast<tle::IdType const&, std::optional<std::chrono::milliseconds> const&>(
+                &Executor::awaitResponses),
+            nb::arg("id"), nb::arg("timeout") = nb::none())
+        .def("await_responses",
+            nb::overload_cast<std::vector<tle::IdType> const&, std::optional<std::chrono::milliseconds> const&>(
+                &Executor::awaitResponses),
+            nb::arg("ids"), nb::arg("timeout") = nb::none())
+        .def("get_num_responses_ready", &Executor::getNumResponsesReady, nb::arg("id") = nb::none())
+        .def("cancel_request", &Executor::cancelRequest, nb::arg("id") = nb::none())
+        .def("get_latest_iteration_stats", &Executor::getLatestIterationStats)
+        .def("get_latest_request_stats", &Executor::getLatestRequestStats)
+        .def("get_latest_debug_tensors", &Executor::getLatestDebugTensors)
+        .def("can_enqueue_requests", &Executor::canEnqueueRequests)
+        .def("get_kv_cache_event_manager", &Executor::getKVCacheEventManager);
+}
+
+} // namespace tensorrt_llm::nanobind::executor
diff --git a/cpp/tensorrt_llm/nanobind/executor/executor.h b/cpp/tensorrt_llm/nanobind/executor/executor.h
new file mode 100644
index 00000000000..22c24abb4bf
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/executor/executor.h
@@ -0,0 +1,129 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "tensorrt_llm/executor/executor.h"
+#include "tensorrt_llm/executor/types.h"
+#include <nanobind/nanobind.h>
+
+namespace nb = nanobind;
+namespace tle = tensorrt_llm::executor;
+
+namespace tensorrt_llm::nanobind::executor
+{
+
+class Executor
+{
+public:
+    Executor(
+        std::filesystem::path const& modelPath, tle::ModelType modelType, tle::ExecutorConfig const& executorConfig);
+
+    Executor(std::filesystem::path const& encoderModelPath, std::filesystem::path const& decoderModelPath,
+        tle::ModelType modelType, tle::ExecutorConfig const& executorConfig);
+
+    Executor(nb::bytes const& engineBuffer, std::string const& jsonConfigStr, tle::ModelType modelType,
+        tle::ExecutorConfig const& executorConfig, std::optional<nb::dict> managedWeights);
+
+    Executor(std::string const& encoderEngineBuffer, std::string const& encoderJsonConfigStr,
+        std::string const& decoderEngineBuffer, std::string const& decoderJsonConfigStr, tle::ModelType modelType,
+        tle::ExecutorConfig const& executorConfig);
+
+    nb::object enter();
+    void exit(
+        [[maybe_unused]] nb::handle type, [[maybe_unused]] nb::handle value, [[maybe_unused]] nb::handle traceback);
+    void shutdown();
+
+    [[nodiscard]] tle::IdType enqueueRequest(tle::Request const& request)
+    {
+        return mExecutor->enqueueRequest(request);
+    }
+
+    [[nodiscard]] std::vector<tle::IdType> enqueueRequests(std::vector<tle::Request> const& requests)
+    {
+        return mExecutor->enqueueRequests(requests);
+    }
+
+    [[nodiscard]] std::vector<tle::Response> awaitResponses(
+        std::optional<std::chrono::milliseconds> const& timeout = std::nullopt)
+    {
+        // Await responses blocks until a response is received. Release GIL so that it can be ran in a background
+        // thread.
+        nb::gil_scoped_release release;
+        return mExecutor->awaitResponses(timeout);
+    }
+
+    [[nodiscard]] std::vector<tle::Response> awaitResponses(
+        tle::IdType const& requestId, std::optional<std::chrono::milliseconds> const& timeout = std::nullopt)
+    {
+        // Await responses blocks until a response is received. Release GIL so that it can be ran in a background
+        // thread.
+        nb::gil_scoped_release release;
+        return mExecutor->awaitResponses(requestId, timeout);
+    }
+
+    [[nodiscard]] std::vector<std::vector<tle::Response>> awaitResponses(std::vector<tle::IdType> const& requestIds,
+        std::optional<std::chrono::milliseconds> const& timeout = std::nullopt)
+    {
+        // Await responses blocks until a response is received. Release GIL so that it can be ran in a background
+        // thread.
+        nb::gil_scoped_release release;
+        return mExecutor->awaitResponses(requestIds, timeout);
+    }
+
+    [[nodiscard]] tle::SizeType32 getNumResponsesReady(std::optional<tle::IdType> const& requestId = std::nullopt) const
+    {
+        return mExecutor->getNumResponsesReady(requestId);
+    }
+
+    void cancelRequest(tle::IdType requestId)
+    {
+        mExecutor->cancelRequest(requestId);
+    }
+
+    std::deque<tle::IterationStats> getLatestIterationStats()
+    {
+        return mExecutor->getLatestIterationStats();
+    }
+
+    std::deque<tle::RequestStatsPerIteration> getLatestRequestStats()
+    {
+        return mExecutor->getLatestRequestStats();
+    }
+
+    std::deque<tle::DebugTensorsPerIteration> getLatestDebugTensors()
+    {
+        return mExecutor->getLatestDebugTensors();
+    }
+
+    [[nodiscard]] bool canEnqueueRequests() const
+    {
+        return mExecutor->canEnqueueRequests();
+    }
+
+    [[nodiscard]] std::optional<std::shared_ptr<tle::KVCacheEventManager>> getKVCacheEventManager() const
+    {
+        return mExecutor->getKVCacheEventManager();
+    }
+
+    static void initBindings(nb::module_& m);
+
+private:
+    std::unique_ptr<tle::Executor> mExecutor;
+};
+
+} // namespace tensorrt_llm::nanobind::executor
diff --git a/cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp b/cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp
new file mode 100644
index 00000000000..c2d9fe25dff
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp
@@ -0,0 +1,616 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "executorConfig.h"
+#include "tensorrt_llm/executor/executor.h"
+#include "tensorrt_llm/executor/types.h"
+#include "tensorrt_llm/nanobind/common/customCasters.h"
+#include "tensorrt_llm/runtime/cudaStream.h"
+#include "tensorrt_llm/runtime/utils/mpiUtils.h"
+#include <nanobind/nanobind.h>
+#include <nanobind/stl/function.h>
+#include <nanobind/stl/map.h>
+#include <nanobind/stl/optional.h>
+#include <nanobind/stl/pair.h>
+#include <nanobind/stl/set.h>
+#include <nanobind/stl/shared_ptr.h>
+#include <nanobind/stl/string.h>
+#include <nanobind/stl/unordered_map.h>
+#include <nanobind/stl/unordered_set.h>
+#include <nanobind/stl/vector.h>
+#include <torch/torch.h>
+#include <vector>
+
+namespace nb = nanobind;
+namespace tle = tensorrt_llm::executor;
+using SizeType32 = tle::SizeType32;
+using RuntimeDefaults = tensorrt_llm::runtime::RuntimeDefaults;
+
+namespace tensorrt_llm::nanobind::executor
+{
+
+void initConfigBindings(nb::module_& m)
+{
+    nb::enum_<tle::BatchingType>(m, "BatchingType")
+        .value("STATIC", tle::BatchingType::kSTATIC)
+        .value("INFLIGHT", tle::BatchingType::kINFLIGHT);
+
+    auto dynamicBatchConfigGetstate = [](tle::DynamicBatchConfig const& self)
+    {
+        return nb::make_tuple(self.getEnableBatchSizeTuning(), self.getEnableMaxNumTokensTuning(),
+            self.getDynamicBatchMovingAverageWindow(), self.getBatchSizeTable());
+    };
+    auto dynamicBatchConfigSetstate = [](tle::DynamicBatchConfig& self, nb::tuple const& state)
+    {
+        if (state.size() != 4)
+        {
+            throw std::runtime_error("Invalid state!");
+        }
+        new (&self) tle::DynamicBatchConfig(nb::cast<bool>(state[0]), nb::cast<bool>(state[1]),
+            nb::cast<SizeType32>(state[2]), nb::cast<std::vector<std::pair<SizeType32, SizeType32>>>(state[3]));
+    };
+    nb::class_<tle::DynamicBatchConfig>(m, "DynamicBatchConfig")
+        .def(nb::init<bool, bool, SizeType32>(), nb::arg("enable_batch_size_tuning"),
+            nb::arg("enable_max_num_tokens_tuning"), nb::arg("dynamic_batch_moving_average_window"))
+        .def_prop_ro("enable_batch_size_tuning", &tle::DynamicBatchConfig::getEnableBatchSizeTuning)
+        .def_prop_ro("enable_max_num_tokens_tuning", &tle::DynamicBatchConfig::getEnableMaxNumTokensTuning)
+        .def_prop_ro(
+            "dynamic_batch_moving_average_window", &tle::DynamicBatchConfig::getDynamicBatchMovingAverageWindow)
+        .def("__getstate__", dynamicBatchConfigGetstate)
+        .def("__setstate__", dynamicBatchConfigSetstate);
+
+    auto schedulerConfigSetstate = [](tle::SchedulerConfig& self, nb::tuple const& state)
+    {
+        if (state.size() != 3)
+        {
+            throw std::runtime_error("Invalid state!");
+        }
+        new (&self) tle::SchedulerConfig(nb::cast<tle::CapacitySchedulerPolicy>(state[0]),
+            nb::cast<std::optional<tle::ContextChunkingPolicy>>(state[1]),
+            nb::cast<std::optional<tle::DynamicBatchConfig>>(state[2]));
+    };
+    auto schedulerConfigGetstate = [](tle::SchedulerConfig const& self)
+    {
+        return nb::make_tuple(
+            self.getCapacitySchedulerPolicy(), self.getContextChunkingPolicy(), self.getDynamicBatchConfig());
+    };
+    nb::class_<tle::SchedulerConfig>(m, "SchedulerConfig")
+        .def(nb::init<tle::CapacitySchedulerPolicy, std::optional<tle::ContextChunkingPolicy>,
+                 std::optional<tle::DynamicBatchConfig>>(),
+            nb::arg("capacity_scheduler_policy") = tle::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT,
+            nb::arg("context_chunking_policy") = nb::none(), nb::arg("dynamic_batch_config") = nb::none())
+        .def_prop_ro("capacity_scheduler_policy", &tle::SchedulerConfig::getCapacitySchedulerPolicy)
+        .def_prop_ro("context_chunking_policy", &tle::SchedulerConfig::getContextChunkingPolicy)
+        .def_prop_ro("dynamic_batch_config", &tle::SchedulerConfig::getDynamicBatchConfig)
+        .def("__getstate__", schedulerConfigGetstate)
+        .def("__setstate__", schedulerConfigSetstate);
+
+    nb::class_<RuntimeDefaults>(m, "RuntimeDefaults")
+        .def(nb::init<std::optional<std::vector<SizeType32>>, std::optional<SizeType32>>(),
+            nb::arg("max_attention_window") = nb::none(), nb::arg("sink_token_length") = nb::none())
+        .def_ro("max_attention_window", &RuntimeDefaults::maxAttentionWindowVec)
+        .def_ro("sink_token_length", &RuntimeDefaults::sinkTokenLength);
+
+    auto kvCacheConfigGetstate = [](tle::KvCacheConfig const& self)
+    {
+        return nb::make_tuple(self.getEnableBlockReuse(), self.getMaxTokens(), self.getMaxAttentionWindowVec(),
+            self.getSinkTokenLength(), self.getFreeGpuMemoryFraction(), self.getHostCacheSize(),
+            self.getOnboardBlocks(), self.getCrossKvCacheFraction(), self.getSecondaryOffloadMinPriority(),
+            self.getEventBufferMaxSize(), self.getEnablePartialReuse(), self.getCopyOnPartialReuse(), self.getUseUvm());
+    };
+    auto kvCacheConfigSetstate = [](tle::KvCacheConfig& self, nb::tuple const& state)
+    {
+        if (state.size() != 13)
+        {
+            throw std::runtime_error("Invalid state!");
+        }
+        new (&self) tle::KvCacheConfig(nb::cast<bool>(state[0]), nb::cast<std::optional<SizeType32>>(state[1]),
+            nb::cast<std::optional<std::vector<SizeType32>>>(state[2]), nb::cast<std::optional<SizeType32>>(state[3]),
+            nb::cast<std::optional<float>>(state[4]), nb::cast<std::optional<size_t>>(state[5]),
+            nb::cast<bool>(state[6]), nb::cast<std::optional<float>>(state[7]),
+            nb::cast<std::optional<tle::RetentionPriority>>(state[8]), nb::cast<size_t>(state[9]),
+            nb::cast<bool>(state[10]), nb::cast<bool>(state[11]), nb::cast<bool>(state[12]));
+    };
+    nb::class_<tle::KvCacheConfig>(m, "KvCacheConfig")
+        .def(nb::init<bool, std::optional<SizeType32> const&, std::optional<std::vector<SizeType32>> const&,
+                 std::optional<SizeType32> const&, std::optional<float> const&, std::optional<size_t> const&, bool,
+                 std::optional<float> const&, std::optional<tle::RetentionPriority>, size_t const&, bool, bool, bool,
+                 std::optional<RuntimeDefaults> const&>(),
+            nb::arg("enable_block_reuse") = true, nb::arg("max_tokens") = nb::none(),
+            nb::arg("max_attention_window") = nb::none(), nb::arg("sink_token_length") = nb::none(),
+            nb::arg("free_gpu_memory_fraction") = nb::none(), nb::arg("host_cache_size") = nb::none(),
+            nb::arg("onboard_blocks") = true, nb::arg("cross_kv_cache_fraction") = nb::none(),
+            nb::arg("secondary_offload_min_priority") = nb::none(), nb::arg("event_buffer_max_size") = 0, nb::kw_only(),
+            nb::arg("enable_partial_reuse") = true, nb::arg("copy_on_partial_reuse") = true, nb::arg("use_uvm") = false,
+            nb::arg("runtime_defaults") = nb::none())
+        .def_prop_rw(
+            "enable_block_reuse", &tle::KvCacheConfig::getEnableBlockReuse, &tle::KvCacheConfig::setEnableBlockReuse)
+        .def_prop_rw("max_tokens", &tle::KvCacheConfig::getMaxTokens, &tle::KvCacheConfig::setMaxTokens)
+        .def_prop_rw("max_attention_window", &tle::KvCacheConfig::getMaxAttentionWindowVec,
+            &tle::KvCacheConfig::setMaxAttentionWindowVec)
+        .def_prop_rw(
+            "sink_token_length", &tle::KvCacheConfig::getSinkTokenLength, &tle::KvCacheConfig::setSinkTokenLength)
+        .def_prop_rw("free_gpu_memory_fraction", &tle::KvCacheConfig::getFreeGpuMemoryFraction,
+            &tle::KvCacheConfig::setFreeGpuMemoryFraction)
+        .def_prop_rw("host_cache_size", &tle::KvCacheConfig::getHostCacheSize, &tle::KvCacheConfig::setHostCacheSize)
+        .def_prop_rw("onboard_blocks", &tle::KvCacheConfig::getOnboardBlocks, &tle::KvCacheConfig::setOnboardBlocks)
+        .def_prop_rw("cross_kv_cache_fraction", &tle::KvCacheConfig::getCrossKvCacheFraction,
+            &tle::KvCacheConfig::setCrossKvCacheFraction)
+        .def_prop_rw("secondary_offload_min_priority", &tle::KvCacheConfig::getSecondaryOffloadMinPriority,
+            &tle::KvCacheConfig::setSecondaryOffloadMinPriority)
+        .def_prop_rw("event_buffer_max_size", &tle::KvCacheConfig::getEventBufferMaxSize,
+            &tle::KvCacheConfig::setEventBufferMaxSize)
+        .def_prop_rw("enable_partial_reuse", &tle::KvCacheConfig::getEnablePartialReuse,
+            &tle::KvCacheConfig::setEnablePartialReuse)
+        .def_prop_rw("copy_on_partial_reuse", &tle::KvCacheConfig::getCopyOnPartialReuse,
+            &tle::KvCacheConfig::setCopyOnPartialReuse)
+        .def_prop_rw("use_uvm", &tle::KvCacheConfig::getUseUvm, &tle::KvCacheConfig::setUseUvm)
+        .def("fill_empty_fields_from_runtime_defaults", &tle::KvCacheConfig::fillEmptyFieldsFromRuntimeDefaults)
+        .def("__getstate__", kvCacheConfigGetstate)
+        .def("__setstate__", kvCacheConfigSetstate);
+
+    nb::class_<tle::OrchestratorConfig>(m, "OrchestratorConfig")
+        .def(nb::init<bool, std::string, std::shared_ptr<mpi::MpiComm>, bool>(), nb::arg("is_orchestrator") = true,
+            nb::arg("worker_executable_path") = "", nb::arg("orch_leader_comm").none() = nullptr,
+            nb::arg("spawn_processes") = true)
+        .def_prop_rw(
+            "is_orchestrator", &tle::OrchestratorConfig::getIsOrchestrator, &tle::OrchestratorConfig::setIsOrchestrator)
+        .def_prop_rw("worker_executable_path", &tle::OrchestratorConfig::getWorkerExecutablePath,
+            &tle::OrchestratorConfig::setWorkerExecutablePath)
+        .def_prop_rw("orch_leader_comm", &tle::OrchestratorConfig::getOrchLeaderComm,
+            &tle::OrchestratorConfig::setOrchLeaderComm)
+        .def_prop_rw("spawn_processes", &tle::OrchestratorConfig::getSpawnProcesses,
+            &tle::OrchestratorConfig::setSpawnProcesses);
+
+    auto parallelConfigGetstate = [](tle::ParallelConfig const& self)
+    {
+        return nb::make_tuple(self.getCommunicationType(), self.getCommunicationMode(), self.getDeviceIds(),
+            self.getParticipantIds(), self.getOrchestratorConfig(), self.getNumNodes());
+    };
+    auto parallelConfigSetstate = [](tle::ParallelConfig& self, nb::tuple const& state)
+    {
+        if (state.size() != 6)
+        {
+            throw std::runtime_error("Invalid state!");
+        }
+        new (&self) tle::ParallelConfig(nb::cast<tle::CommunicationType>(state[0]),
+            nb::cast<tle::CommunicationMode>(state[1]), nb::cast<std::optional<std::vector<SizeType32>>>(state[2]),
+            nb::cast<std::optional<std::vector<SizeType32>>>(state[3]),
+            nb::cast<std::optional<tle::OrchestratorConfig>>(state[4]), nb::cast<std::optional<SizeType32>>(state[5]));
+    };
+    nb::class_<tle::ParallelConfig>(m, "ParallelConfig")
+        .def(nb::init<tle::CommunicationType, tle::CommunicationMode, std::optional<std::vector<SizeType32>> const&,
+                 std::optional<std::vector<SizeType32>> const&, std::optional<tle::OrchestratorConfig> const&,
+                 std::optional<SizeType32> const&>(),
+            nb::arg("communication_type") = tle::CommunicationType::kMPI,
+            nb::arg("communication_mode") = tle::CommunicationMode::kLEADER, nb::arg("device_ids") = nb::none(),
+            nb::arg("participant_ids") = nb::none(), nb::arg("orchestrator_config") = nb::none(),
+            nb::arg("num_nodes") = nb::none())
+        .def_prop_rw("communication_type", &tle::ParallelConfig::getCommunicationType,
+            &tle::ParallelConfig::setCommunicationType)
+        .def_prop_rw("communication_mode", &tle::ParallelConfig::getCommunicationMode,
+            &tle::ParallelConfig::setCommunicationMode)
+        .def_prop_rw("device_ids", &tle::ParallelConfig::getDeviceIds, &tle::ParallelConfig::setDeviceIds)
+        .def_prop_rw(
+            "participant_ids", &tle::ParallelConfig::getParticipantIds, &tle::ParallelConfig::setParticipantIds)
+        .def_prop_rw("orchestrator_config", &tle::ParallelConfig::getOrchestratorConfig,
+            &tle::ParallelConfig::setOrchestratorConfig)
+        .def_prop_rw("num_nodes", &tle::ParallelConfig::getNumNodes, &tle::ParallelConfig::setNumNodes)
+        .def("__getstate__", parallelConfigGetstate)
+        .def("__setstate__", parallelConfigSetstate);
+
+    auto peftCacheConfigSetstate = [](tle::PeftCacheConfig& self, nb::tuple const& state)
+    {
+        if (state.size() != 11)
+        {
+            throw std::runtime_error("Invalid state!");
+        }
+        new (&self) tle::PeftCacheConfig(nb::cast<SizeType32>(state[0]), nb::cast<SizeType32>(state[1]),
+            nb::cast<SizeType32>(state[2]), nb::cast<SizeType32>(state[3]), nb::cast<SizeType32>(state[4]),
+            nb::cast<SizeType32>(state[5]), nb::cast<SizeType32>(state[6]), nb::cast<SizeType32>(state[7]),
+            nb::cast<SizeType32>(state[8]), nb::cast<std::optional<float>>(state[9]),
+            nb::cast<std::optional<size_t>>(state[10]));
+    };
+    auto peftCacheConfigGetstate = [](tle::PeftCacheConfig const& self)
+    {
+        return nb::make_tuple(self.getNumHostModuleLayer(), self.getNumDeviceModuleLayer(),
+            self.getOptimalAdapterSize(), self.getMaxAdapterSize(), self.getNumPutWorkers(), self.getNumEnsureWorkers(),
+            self.getNumCopyStreams(), self.getMaxPagesPerBlockHost(), self.getMaxPagesPerBlockDevice(),
+            self.getDeviceCachePercent(), self.getHostCacheSize());
+    };
+    nb::class_<tle::PeftCacheConfig>(m, "PeftCacheConfig")
+        .def(nb::init<SizeType32, SizeType32, SizeType32, SizeType32, SizeType32, SizeType32, SizeType32, SizeType32,
+                 SizeType32, std::optional<float> const&, std::optional<size_t> const&,
+                 std::optional<std::string> const&>(),
+            nb::arg("num_host_module_layer") = 0, nb::arg("num_device_module_layer") = 0,
+            nb::arg("optimal_adapter_size") = 8, nb::arg("max_adapter_size") = 64, nb::arg("num_put_workers") = 1,
+            nb::arg("num_ensure_workers") = 1, nb::arg("num_copy_streams") = 1,
+            nb::arg("max_pages_per_block_host") = 24, nb::arg("max_pages_per_block_device") = 8,
+            nb::arg("device_cache_percent") = nb::none(), nb::arg("host_cache_size") = nb::none(),
+            nb::arg("lora_prefetch_dir") = nb::none())
+        .def_prop_ro("num_host_module_layer", &tle::PeftCacheConfig::getNumHostModuleLayer)
+        .def_prop_ro("num_device_module_layer", &tle::PeftCacheConfig::getNumDeviceModuleLayer)
+        .def_prop_ro("optimal_adapter_size", &tle::PeftCacheConfig::getOptimalAdapterSize)
+        .def_prop_ro("max_adapter_size", &tle::PeftCacheConfig::getMaxAdapterSize)
+        .def_prop_ro("num_put_workers", &tle::PeftCacheConfig::getNumPutWorkers)
+        .def_prop_ro("num_ensure_workers", &tle::PeftCacheConfig::getNumEnsureWorkers)
+        .def_prop_ro("num_copy_streams", &tle::PeftCacheConfig::getNumCopyStreams)
+        .def_prop_ro("max_pages_per_block_host", &tle::PeftCacheConfig::getMaxPagesPerBlockHost)
+        .def_prop_ro("max_pages_per_block_device", &tle::PeftCacheConfig::getMaxPagesPerBlockDevice)
+        .def_prop_ro("device_cache_percent", &tle::PeftCacheConfig::getDeviceCachePercent)
+        .def_prop_ro("host_cache_size", &tle::PeftCacheConfig::getHostCacheSize)
+        .def_prop_ro("lora_prefetch_dir", &tle::PeftCacheConfig::getLoraPrefetchDir)
+        .def("__getstate__", peftCacheConfigGetstate)
+        .def("__setstate__", peftCacheConfigSetstate);
+
+    auto decodingConfigGetstate = [](tle::DecodingConfig const& self)
+    {
+        return nb::make_tuple(
+            self.getDecodingMode(), self.getLookaheadDecodingConfig(), self.getMedusaChoices(), self.getEagleConfig());
+    };
+    auto decodingConfigSetstate = [](tle::DecodingConfig& self, nb::tuple const& state)
+    {
+        if (state.size() != 4)
+        {
+            throw std::runtime_error("Invalid state!");
+        }
+        new (&self) tle::DecodingConfig(nb::cast<std::optional<tle::DecodingMode>>(state[0]), // DecodingMode
+            nb::cast<std::optional<tle::LookaheadDecodingConfig>>(state[1]),                  // LookaheadDecodingConfig
+            nb::cast<std::optional<tle::MedusaChoices>>(state[2]),                            // MedusaChoices
+            nb::cast<std::optional<tle::EagleConfig>>(state[3])                               // EagleConfig
+        );
+    };
+    nb::class_<tle::DecodingConfig>(m, "DecodingConfig")
+        .def(nb::init<std::optional<tle::DecodingMode>, std::optional<tle::LookaheadDecodingConfig>,
+                 std::optional<tle::MedusaChoices>, std::optional<tle::EagleConfig>>(),
+            nb::arg("decoding_mode") = nb::none(), nb::arg("lookahead_decoding_config") = nb::none(),
+            nb::arg("medusa_choices") = nb::none(), nb::arg("eagle_config") = nb::none())
+        .def_prop_rw("decoding_mode", &tle::DecodingConfig::getDecodingMode, &tle::DecodingConfig::setDecodingMode)
+        .def_prop_rw("lookahead_decoding_config", &tle::DecodingConfig::getLookaheadDecodingConfig,
+            &tle::DecodingConfig::setLookaheadDecodingConfig)
+        .def_prop_rw("medusa_choices", &tle::DecodingConfig::getMedusaChoices, &tle::DecodingConfig::setMedusaChoices)
+        .def_prop_rw("eagle_config", &tle::DecodingConfig::getEagleConfig, &tle::DecodingConfig::setEagleConfig)
+        .def("__getstate__", decodingConfigGetstate)
+        .def("__setstate__", decodingConfigSetstate);
+
+    auto debugConfigGetstate = [](tle::DebugConfig const& self)
+    {
+        return nb::make_tuple(self.getDebugInputTensors(), self.getDebugOutputTensors(), self.getDebugTensorNames(),
+            self.getDebugTensorsMaxIterations());
+    };
+    auto debugConfigSetstate = [](tle::DebugConfig& self, nb::tuple const& state)
+    {
+        if (state.size() != 4)
+        {
+            throw std::runtime_error("Invalid state!");
+        }
+        new (&self) tle::DebugConfig(nb::cast<bool>(state[0]), nb::cast<bool>(state[1]),
+            nb::cast<std::vector<std::string>>(state[2]), nb::cast<SizeType32>(state[3]));
+    };
+    nb::class_<tle::DebugConfig>(m, "DebugConfig")
+        .def(nb::init<bool, bool, std::vector<std::string>, SizeType32>(), nb::arg("debug_input_tensors") = false,
+            nb::arg("debug_output_tensors") = false, nb::arg("debug_tensor_names") = nb::none(),
+            nb::arg("debug_tensors_max_iterations") = false)
+        .def_prop_rw(
+            "debug_input_tensors", &tle::DebugConfig::getDebugInputTensors, &tle::DebugConfig::setDebugInputTensors)
+        .def_prop_rw(
+            "debug_output_tensors", &tle::DebugConfig::getDebugOutputTensors, &tle::DebugConfig::setDebugOutputTensors)
+        .def_prop_rw(
+            "debug_tensor_names", &tle::DebugConfig::getDebugTensorNames, &tle::DebugConfig::setDebugTensorNames)
+        .def_prop_rw("debug_tensors_max_iterations", &tle::DebugConfig::getDebugTensorsMaxIterations,
+            &tle::DebugConfig::setDebugTensorsMaxIterations)
+        .def("__getstate__", debugConfigGetstate)
+        .def("__setstate__", debugConfigSetstate);
+
+    auto logitsPostProcessorConfigGetstate = [](tle::LogitsPostProcessorConfig const& self)
+    { return nb::make_tuple(self.getProcessorMap(), self.getProcessorBatched(), self.getReplicate()); };
+
+    auto logitsPostProcessorConfigSetstate = [](tle::LogitsPostProcessorConfig& self, nb::tuple const& state)
+    {
+        if (state.size() != 3)
+        {
+            throw std::runtime_error("Invalid LogitsPostProcessorConfig state!");
+        }
+        new (&self) tle::LogitsPostProcessorConfig(nb::cast<std::optional<tle::LogitsPostProcessorMap>>(state[0]),
+            nb::cast<std::optional<tle::LogitsPostProcessorBatched>>(state[1]), nb::cast<bool>(state[2]));
+    };
+
+    nb::class_<tle::LogitsPostProcessorConfig>(m, "LogitsPostProcessorConfig")
+        .def(nb::init<std::optional<tle::LogitsPostProcessorMap>, std::optional<tle::LogitsPostProcessorBatched>,
+                 bool>(),
+            nb::arg("processor_map") = nb::none(), nb::arg("processor_batched") = nb::none(),
+            nb::arg("replicate") = true)
+        .def_prop_rw("processor_map", &tle::LogitsPostProcessorConfig::getProcessorMap,
+            &tle::LogitsPostProcessorConfig::setProcessorMap)
+        .def_prop_rw("processor_batched", &tle::LogitsPostProcessorConfig::getProcessorBatched,
+            &tle::LogitsPostProcessorConfig::setProcessorBatched)
+        .def_prop_rw(
+            "replicate", &tle::LogitsPostProcessorConfig::getReplicate, &tle::LogitsPostProcessorConfig::setReplicate)
+        .def("__getstate__", logitsPostProcessorConfigGetstate)
+        .def("__setstate__", logitsPostProcessorConfigSetstate);
+
+    auto extendedRuntimePerfKnobConfigSetstate = [](tle::ExtendedRuntimePerfKnobConfig& self, nb::tuple const& state)
+    {
+        if (state.size() != 4)
+        {
+            throw std::runtime_error("Invalid extendedRuntimePerfKnobConfig state!");
+        }
+        new (&self) tle::ExtendedRuntimePerfKnobConfig(nb::cast<bool>(state[0]), nb::cast<bool>(state[1]),
+            nb::cast<bool>(state[2]), nb::cast<SizeType32>(state[2]));
+    };
+    auto extendedRuntimePerfKnobConfigGetstate = [](tle::ExtendedRuntimePerfKnobConfig const& self)
+    {
+        return nb::make_tuple(self.getMultiBlockMode(), self.getEnableContextFMHAFP32Acc(), self.getCudaGraphMode(),
+            self.getCudaGraphCacheSize());
+    };
+    nb::class_<tle::ExtendedRuntimePerfKnobConfig>(m, "ExtendedRuntimePerfKnobConfig")
+        .def(
+            nb::init<bool, bool>(), nb::arg("multi_block_mode") = true, nb::arg("enable_context_fmha_fp32_acc") = false)
+        .def_prop_rw("multi_block_mode", &tle::ExtendedRuntimePerfKnobConfig::getMultiBlockMode,
+            &tle::ExtendedRuntimePerfKnobConfig::setMultiBlockMode)
+        .def_prop_rw("enable_context_fmha_fp32_acc", &tle::ExtendedRuntimePerfKnobConfig::getEnableContextFMHAFP32Acc,
+            &tle::ExtendedRuntimePerfKnobConfig::setEnableContextFMHAFP32Acc)
+        .def_prop_rw("cuda_graph_mode", &tle::ExtendedRuntimePerfKnobConfig::getCudaGraphMode,
+            &tle::ExtendedRuntimePerfKnobConfig::setCudaGraphMode)
+        .def_prop_rw("cuda_graph_cache_size", &tle::ExtendedRuntimePerfKnobConfig::getCudaGraphCacheSize,
+            &tle::ExtendedRuntimePerfKnobConfig::setCudaGraphCacheSize)
+        .def("__getstate__", extendedRuntimePerfKnobConfigGetstate)
+        .def("__setstate__", extendedRuntimePerfKnobConfigSetstate);
+
+    auto SpeculativeDecodingConfigGetState
+        = [](tle::SpeculativeDecodingConfig const& self) { return nb::make_tuple(self.fastLogits); };
+    auto SpeculativeDecodingConfigSetState = [](tle::SpeculativeDecodingConfig& self, nb::tuple const& state)
+    {
+        if (state.size() != 1)
+        {
+            throw std::runtime_error("Invalid SpeculativeDecodingConfig state!");
+        }
+        new (&self) tle::SpeculativeDecodingConfig(nb::cast<bool>(state[0]));
+    };
+    nb::class_<tle::SpeculativeDecodingConfig>(m, "SpeculativeDecodingConfig")
+        .def(nb::init<bool>(), nb::arg("fast_logits") = false)
+        .def_rw("fast_logits", &tle::SpeculativeDecodingConfig::fastLogits)
+        .def("__getstate__", SpeculativeDecodingConfigGetState)
+        .def("__setstate__", SpeculativeDecodingConfigSetState);
+
+    // Guided decoding config
+    auto pyGuidedDecodingConfig = nb::class_<tle::GuidedDecodingConfig>(m, "GuidedDecodingConfig");
+
+    nb::enum_<tle::GuidedDecodingConfig::GuidedDecodingBackend>(pyGuidedDecodingConfig, "GuidedDecodingBackend")
+        .value("XGRAMMAR", tle::GuidedDecodingConfig::GuidedDecodingBackend::kXGRAMMAR)
+        .value("LLGUIDANCE", tle::GuidedDecodingConfig::GuidedDecodingBackend::kLLGUIDANCE);
+
+    auto guidedDecodingConfigGetstate = [](tle::GuidedDecodingConfig const& self) {
+        return nb::make_tuple(
+            self.getBackend(), self.getEncodedVocab(), self.getTokenizerStr(), self.getStopTokenIds());
+    };
+    auto guidedDecodingConfigSetstate = [](tle::GuidedDecodingConfig& self, nb::tuple state)
+    {
+        if (state.size() != 4)
+        {
+            throw std::runtime_error("Invalid GuidedDecodingConfig state!");
+        }
+        new (&self) tle::GuidedDecodingConfig(nb::cast<tle::GuidedDecodingConfig::GuidedDecodingBackend>(state[0]),
+            nb::cast<std::optional<std::vector<std::string>>>(state[1]), nb::cast<std::optional<std::string>>(state[2]),
+            nb::cast<std::optional<std::vector<tle::TokenIdType>>>(state[3]));
+    };
+
+    pyGuidedDecodingConfig
+        .def(nb::init<tle::GuidedDecodingConfig::GuidedDecodingBackend, std::optional<std::vector<std::string>>,
+                 std::optional<std::string>, std::optional<std::vector<tle::TokenIdType>>>(),
+            nb::arg("backend"), nb::arg("encoded_vocab") = nb::none(), nb::arg("tokenizer_str") = nb::none(),
+            nb::arg("stop_token_ids") = nb::none())
+        .def_prop_rw("backend", &tle::GuidedDecodingConfig::getBackend, &tle::GuidedDecodingConfig::setBackend)
+        .def_prop_rw(
+            "encoded_vocab", &tle::GuidedDecodingConfig::getEncodedVocab, &tle::GuidedDecodingConfig::setEncodedVocab)
+        .def_prop_rw(
+            "tokenizer_str", &tle::GuidedDecodingConfig::getTokenizerStr, &tle::GuidedDecodingConfig::setTokenizerStr)
+        .def_prop_rw(
+            "stop_token_ids", &tle::GuidedDecodingConfig::getStopTokenIds, &tle::GuidedDecodingConfig::setStopTokenIds)
+        .def("__getstate__", guidedDecodingConfigGetstate)
+        .def("__setstate__", guidedDecodingConfigSetstate);
+
+    auto cacheTransceiverConfigGetstate
+        = [](tle::CacheTransceiverConfig const& self) { return nb::make_tuple(self.getMaxNumTokens()); };
+    auto cacheTransceiverConfigSetstate = [](tle::CacheTransceiverConfig& self, nb::tuple const& state)
+    {
+        if (state.size() != 1)
+        {
+            throw std::runtime_error("Invalid CacheTransceiverConfig state!");
+        }
+        new (&self) tle::CacheTransceiverConfig(nb::cast<std::optional<size_t>>(state[0]));
+    };
+
+    nb::class_<tle::CacheTransceiverConfig>(m, "CacheTransceiverConfig")
+        .def(nb::init<std::optional<size_t>>(), nb::arg("max_num_tokens") = nb::none())
+        .def_prop_rw("max_num_tokens", &tle::CacheTransceiverConfig::getMaxNumTokens,
+            &tle::CacheTransceiverConfig::setMaxNumTokens)
+        .def("__getstate__", cacheTransceiverConfigGetstate)
+        .def("__setstate__", cacheTransceiverConfigSetstate);
+
+    auto executorConfigGetState = [](nb::object const& self)
+    {
+        auto& c = nb::cast<tle::ExecutorConfig&>(self);
+        // Return a tuple containing C++ data and the Python __dict__
+        auto cpp_states = nb::make_tuple(c.getMaxBeamWidth(), c.getSchedulerConfig(), c.getKvCacheConfig(),
+            c.getEnableChunkedContext(), c.getNormalizeLogProbs(), c.getIterStatsMaxIterations(),
+            c.getRequestStatsMaxIterations(), c.getBatchingType(), c.getMaxBatchSize(), c.getMaxNumTokens(),
+            c.getParallelConfig(), c.getPeftCacheConfig(), c.getLogitsPostProcessorConfig(), c.getDecodingConfig(),
+            c.getUseGpuDirectStorage(), c.getGpuWeightsPercent(), c.getMaxQueueSize(),
+            c.getExtendedRuntimePerfKnobConfig(), c.getDebugConfig(), c.getRecvPollPeriodMs(),
+            c.getMaxSeqIdleMicroseconds(), c.getSpecDecConfig(), c.getGuidedDecodingConfig(),
+            c.getAdditionalModelOutputs(), c.getCacheTransceiverConfig(), c.getGatherGenerationLogits(),
+            c.getPromptTableOffloading(), c.getEnableTrtOverlap());
+        auto pickle_tuple = nb::make_tuple(cpp_states, nb::getattr(self, "__dict__"));
+        return pickle_tuple;
+    };
+
+    auto executorConfigSetState = [](nb::object self, nb::tuple const& state)
+    {
+        if (state.size() != 2)
+        {
+            throw std::runtime_error("Invalid state!");
+        }
+
+        auto cpp_states = nb::cast<nb::tuple>(state[0]);
+        if (cpp_states.size() != 28)
+        {
+            throw std::runtime_error("Invalid cpp_states!");
+        }
+
+        // Restore C++ data
+        tle::ExecutorConfig* cpp_self = nb::inst_ptr<tle::ExecutorConfig>(self);
+        new (cpp_self) tle::ExecutorConfig(                                          //
+            nb::cast<SizeType32>(cpp_states[0]),                                     // MaxBeamWidth
+            nb::cast<tle::SchedulerConfig>(cpp_states[1]),                           // SchedulerConfig
+            nb::cast<tle::KvCacheConfig>(cpp_states[2]),                             // KvCacheConfig
+            nb::cast<bool>(cpp_states[3]),                                           // EnableChunkedContext
+            nb::cast<bool>(cpp_states[4]),                                           // NormalizeLogProbs
+            nb::cast<SizeType32>(cpp_states[5]),                                     // IterStatsMaxIterations
+            nb::cast<SizeType32>(cpp_states[6]),                                     // RequestStatsMaxIterations
+            nb::cast<tle::BatchingType>(cpp_states[7]),                              // BatchingType
+            nb::cast<std::optional<SizeType32>>(cpp_states[8]),                      // MaxBatchSize
+            nb::cast<std::optional<SizeType32>>(cpp_states[9]),                      // MaxNumTokens
+            nb::cast<std::optional<tle::ParallelConfig>>(cpp_states[10]),            // ParallelConfig
+            nb::cast<std::optional<tle::PeftCacheConfig>>(cpp_states[11]),           // PeftCacheConfig
+            nb::cast<std::optional<tle::LogitsPostProcessorConfig>>(cpp_states[12]), // LogitsPostProcessorConfig
+            nb::cast<std::optional<tle::DecodingConfig>>(cpp_states[13]),            // DecodingConfig
+            nb::cast<bool>(cpp_states[14]),                                          // UseGpuDirectStorage
+            nb::cast<float>(cpp_states[15]),                                         // GpuWeightsPercent
+            nb::cast<std::optional<SizeType32>>(cpp_states[16]),                     // MaxQueueSize
+            nb::cast<tle::ExtendedRuntimePerfKnobConfig>(cpp_states[17]),            // ExtendedRuntimePerfKnobConfig
+            nb::cast<std::optional<tle::DebugConfig>>(cpp_states[18]),               // DebugConfig
+            nb::cast<SizeType32>(cpp_states[19]),                                    // RecvPollPeriodMs
+            nb::cast<uint64_t>(cpp_states[20]),                                      // MaxSeqIdleMicroseconds
+            nb::cast<std::optional<tle::SpeculativeDecodingConfig>>(cpp_states[21]), // SpecDecConfig
+            nb::cast<std::optional<tle::GuidedDecodingConfig>>(cpp_states[22]),      // GuidedDecodingConfig
+            nb::cast<std::optional<std::vector<tle::AdditionalModelOutput>>>(cpp_states[23]), // AdditionalModelOutputs
+            nb::cast<std::optional<tle::CacheTransceiverConfig>>(cpp_states[24]),             // CacheTransceiverConfig
+            nb::cast<bool>(cpp_states[25]),                                                   // GatherGenerationLogits
+            nb::cast<bool>(cpp_states[26]),                                                   // PromptTableOffloading
+            nb::cast<bool>(cpp_states[27])                                                    // EnableTrtOverlap
+        );
+
+        // Restore Python data
+        auto py_state = nb::cast<nb::dict>(state[1]);
+        self.attr("__dict__").attr("update")(py_state);
+
+        nb::inst_mark_ready(self);
+    };
+
+    nb::class_<tle::ExecutorConfig>(m, "ExecutorConfig", nb::dynamic_attr())
+        .def(nb::init<                                                   //
+                 SizeType32,                                             // MaxBeamWidth
+                 tle::SchedulerConfig const&,                            // SchedulerConfig
+                 tle::KvCacheConfig const&,                              // KvCacheConfig
+                 bool,                                                   // EnableChunkedContext
+                 bool,                                                   // NormalizeLogProbs
+                 SizeType32,                                             // IterStatsMaxIterations
+                 SizeType32,                                             // RequestStatsMaxIterations
+                 tle::BatchingType,                                      // BatchingType
+                 std::optional<SizeType32>,                              // MaxBatchSize
+                 std::optional<SizeType32>,                              // MaxNumTokens
+                 std::optional<tle::ParallelConfig>,                     // ParallelConfig
+                 tle::PeftCacheConfig const&,                            // PeftCacheConfig
+                 std::optional<tle::LogitsPostProcessorConfig>,          // LogitsPostProcessorConfig
+                 std::optional<tle::DecodingConfig>,                     // DecodingConfig
+                 bool,                                                   // UseGpuDirectStorage
+                 float,                                                  // GpuWeightsPercent
+                 std::optional<SizeType32>,                              // MaxQueueSize
+                 tle::ExtendedRuntimePerfKnobConfig const&,              // ExtendedRuntimePerfKnobConfig
+                 std::optional<tle::DebugConfig>,                        // DebugConfig
+                 SizeType32,                                             // RecvPollPeriodMs
+                 uint64_t,                                               // MaxSeqIdleMicroseconds
+                 std::optional<tle::SpeculativeDecodingConfig>,          // SpecDecConfig
+                 std::optional<tle::GuidedDecodingConfig>,               // GuidedDecodingConfig
+                 std::optional<std::vector<tle::AdditionalModelOutput>>, // AdditionalModelOutputs
+                 std::optional<tle::CacheTransceiverConfig>,             // CacheTransceiverConfig
+                 bool,                                                   // GatherGenerationLogits
+                 bool,                                                   // PromptTableOffloading
+                 bool                                                    // EnableTrtOverlap
+                 >(),
+            nb::arg("max_beam_width") = 1, nb::arg("scheduler_config") = tle::SchedulerConfig(),
+            nb::arg("kv_cache_config") = tle::KvCacheConfig(), nb::arg("enable_chunked_context") = false,
+            nb::arg("normalize_log_probs") = true,
+            nb::arg("iter_stats_max_iterations") = tle::ExecutorConfig::kDefaultIterStatsMaxIterations,
+            nb::arg("request_stats_max_iterations") = tle::ExecutorConfig::kDefaultRequestStatsMaxIterations,
+            nb::arg("batching_type") = tle::BatchingType::kINFLIGHT, nb::arg("max_batch_size") = nb::none(),
+            nb::arg("max_num_tokens") = nb::none(), nb::arg("parallel_config") = nb::none(),
+            nb::arg("peft_cache_config") = tle::PeftCacheConfig(), nb::arg("logits_post_processor_config") = nb::none(),
+            nb::arg("decoding_config") = nb::none(), nb::arg("use_gpu_direct_storage") = false,
+            nb::arg("gpu_weights_percent") = 1.0, nb::arg("max_queue_size") = nb::none(),
+            nb::arg("extended_runtime_perf_knob_config") = tle::ExtendedRuntimePerfKnobConfig(),
+            nb::arg("debug_config") = nb::none(), nb::arg("recv_poll_period_ms") = 0,
+            nb::arg("max_seq_idle_microseconds") = tle::ExecutorConfig::kDefaultMaxSeqIdleMicroseconds,
+            nb::arg("spec_dec_config") = nb::none(), nb::arg("guided_decoding_config") = nb::none(),
+            nb::arg("additional_model_outputs") = nb::none(), nb::arg("cache_transceiver_config") = nb::none(),
+            nb::arg("gather_generation_logits") = false, nb::arg("mm_embedding_offloading") = false,
+            nb::arg("enable_trt_overlap") = false)
+        .def_prop_rw("max_beam_width", &tle::ExecutorConfig::getMaxBeamWidth, &tle::ExecutorConfig::setMaxBeamWidth)
+        .def_prop_rw("max_batch_size", &tle::ExecutorConfig::getMaxBatchSize, &tle::ExecutorConfig::setMaxBatchSize)
+        .def_prop_rw("max_num_tokens", &tle::ExecutorConfig::getMaxNumTokens, &tle::ExecutorConfig::setMaxNumTokens)
+        .def_prop_rw(
+            "scheduler_config", &tle::ExecutorConfig::getSchedulerConfigRef, &tle::ExecutorConfig::setSchedulerConfig)
+        .def_prop_rw(
+            "kv_cache_config", &tle::ExecutorConfig::getKvCacheConfigRef, &tle::ExecutorConfig::setKvCacheConfig)
+        .def_prop_rw("enable_chunked_context", &tle::ExecutorConfig::getEnableChunkedContext,
+            &tle::ExecutorConfig::setEnableChunkedContext)
+        .def_prop_rw("normalize_log_probs", &tle::ExecutorConfig::getNormalizeLogProbs,
+            &tle::ExecutorConfig::setNormalizeLogProbs)
+        .def_prop_rw("iter_stats_max_iterations", &tle::ExecutorConfig::getIterStatsMaxIterations,
+            &tle::ExecutorConfig::setIterStatsMaxIterations)
+        .def_prop_rw("request_stats_max_iterations", &tle::ExecutorConfig::getRequestStatsMaxIterations,
+            &tle::ExecutorConfig::setRequestStatsMaxIterations)
+        .def_prop_rw("batching_type", &tle::ExecutorConfig::getBatchingType, &tle::ExecutorConfig::setBatchingType)
+        .def_prop_rw(
+            "parallel_config", &tle::ExecutorConfig::getParallelConfig, &tle::ExecutorConfig::setParallelConfig)
+        .def_prop_rw(
+            "peft_cache_config", &tle::ExecutorConfig::getPeftCacheConfig, &tle::ExecutorConfig::setPeftCacheConfig)
+        .def_prop_rw("logits_post_processor_config", &tle::ExecutorConfig::getLogitsPostProcessorConfig,
+            &tle::ExecutorConfig::setLogitsPostProcessorConfig)
+        .def_prop_rw(
+            "decoding_config", &tle::ExecutorConfig::getDecodingConfig, &tle::ExecutorConfig::setDecodingConfig)
+        .def_prop_rw("use_gpu_direct_storage", &tle::ExecutorConfig::getUseGpuDirectStorage,
+            &tle::ExecutorConfig::setUseGpuDirectStorage)
+        .def_prop_rw("gpu_weights_percent", &tle::ExecutorConfig::getGpuWeightsPercent,
+            &tle::ExecutorConfig::setGpuWeightsPercent)
+        .def_prop_rw("max_queue_size", &tle::ExecutorConfig::getMaxQueueSize, &tle::ExecutorConfig::setMaxQueueSize)
+        .def_prop_rw("extended_runtime_perf_knob_config", &tle::ExecutorConfig::getExtendedRuntimePerfKnobConfig,
+            &tle::ExecutorConfig::setExtendedRuntimePerfKnobConfig)
+        .def_prop_rw("debug_config", &tle::ExecutorConfig::getDebugConfig, &tle::ExecutorConfig::setDebugConfig)
+        .def_prop_rw(
+            "recv_poll_period_ms", &tle::ExecutorConfig::getRecvPollPeriodMs, &tle::ExecutorConfig::setRecvPollPeriodMs)
+        .def_prop_rw("max_seq_idle_microseconds", &tle::ExecutorConfig::getMaxSeqIdleMicroseconds,
+            &tle::ExecutorConfig::setMaxSeqIdleMicroseconds)
+        .def_prop_rw("spec_dec_config", &tle::ExecutorConfig::getSpecDecConfig, &tle::ExecutorConfig::setSpecDecConfig)
+        .def_prop_rw("guided_decoding_config", &tle::ExecutorConfig::getGuidedDecodingConfig,
+            &tle::ExecutorConfig::setGuidedDecodingConfig)
+        .def_prop_rw("additional_model_outputs", &tle::ExecutorConfig::getAdditionalModelOutputs,
+            &tle::ExecutorConfig::setAdditionalModelOutputs)
+        .def_prop_rw("cache_transceiver_config", &tle::ExecutorConfig::getCacheTransceiverConfig,
+            &tle::ExecutorConfig::setCacheTransceiverConfig)
+        .def_prop_rw("gather_generation_logits", &tle::ExecutorConfig::getGatherGenerationLogits,
+            &tle::ExecutorConfig::setGatherGenerationLogits)
+        .def_prop_rw("mm_embedding_offloading", &tle::ExecutorConfig::getPromptTableOffloading,
+            &tle::ExecutorConfig::setPromptTableOffloading)
+        .def_prop_rw(
+            "enable_trt_overlap", &tle::ExecutorConfig::getEnableTrtOverlap, &tle::ExecutorConfig::setEnableTrtOverlap)
+        .def("__getstate__", executorConfigGetState)
+        .def("__setstate__", executorConfigSetState);
+}
+
+} // namespace tensorrt_llm::nanobind::executor
diff --git a/cpp/tensorrt_llm/nanobind/executor/executorConfig.h b/cpp/tensorrt_llm/nanobind/executor/executorConfig.h
new file mode 100644
index 00000000000..5b63e7c5a3e
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/executor/executorConfig.h
@@ -0,0 +1,30 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <nanobind/nanobind.h>
+
+namespace nb = nanobind;
+
+namespace tensorrt_llm::nanobind::executor
+{
+
+// Register bindings for executor API.
+void initConfigBindings(nb::module_& m);
+
+} // namespace tensorrt_llm::nanobind::executor
diff --git a/cpp/tensorrt_llm/nanobind/executor/request.cpp b/cpp/tensorrt_llm/nanobind/executor/request.cpp
new file mode 100644
index 00000000000..9c3d34aa8fd
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/executor/request.cpp
@@ -0,0 +1,935 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "request.h"
+#include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/logger.h"
+#include "tensorrt_llm/executor/executor.h"
+#include "tensorrt_llm/executor/serializeUtils.h"
+#include "tensorrt_llm/executor/tensor.h"
+#include "tensorrt_llm/executor/types.h"
+#include "tensorrt_llm/nanobind/common/customCasters.h"
+#include "tensorrt_llm/runtime/cudaStream.h"
+
+#include <nanobind/nanobind.h>
+#include <nanobind/stl/chrono.h>
+#include <nanobind/stl/list.h>
+#include <nanobind/stl/optional.h>
+#include <nanobind/stl/shared_ptr.h>
+#include <nanobind/stl/string.h>
+#include <nanobind/stl/tuple.h>
+#include <nanobind/stl/vector.h>
+#include <sstream>
+
+#include <optional>
+#include <vector>
+
+namespace nb = nanobind;
+namespace tle = tensorrt_llm::executor;
+using Tensor = tle::Tensor;
+using SizeType32 = tle::SizeType32;
+using FloatType = tle::FloatType;
+using VecTokens = tle::VecTokens;
+using IdType = tle::IdType;
+using VecTokenExtraIds = tle::VecTokenExtraIds;
+
+namespace tensorrt_llm::nanobind::executor
+{
+
+void initRequestBindings(nb::module_& m)
+{
+    nb::enum_<tle::RequestType>(m, "RequestType")
+        .value("REQUEST_TYPE_CONTEXT_AND_GENERATION", tle::RequestType::REQUEST_TYPE_CONTEXT_AND_GENERATION)
+        .value("REQUEST_TYPE_CONTEXT_ONLY", tle::RequestType::REQUEST_TYPE_CONTEXT_ONLY)
+        .value("REQUEST_TYPE_GENERATION_ONLY", tle::RequestType::REQUEST_TYPE_GENERATION_ONLY);
+
+    nb::enum_<tle::FinishReason>(m, "FinishReason")
+        .value("NOT_FINISHED", tle::FinishReason::kNOT_FINISHED)
+        .value("END_ID", tle::FinishReason::kEND_ID)
+        .value("STOP_WORDS", tle::FinishReason::kSTOP_WORDS)
+        .value("LENGTH", tle::FinishReason::kLENGTH)
+        .value("TIMED_OUT", tle::FinishReason::kTIMED_OUT)
+        .value("CANCELLED", tle::FinishReason::kCANCELLED);
+
+    nb::enum_<tle::KvCacheTransferMode>(m, "KvCacheTransferMode")
+        .value("DRAM", tle::KvCacheTransferMode::DRAM)
+        .value("GDS", tle::KvCacheTransferMode::GDS)
+        .value("POSIX_DEBUG_FALLBACK", tle::KvCacheTransferMode::POSIX_DEBUG_FALLBACK);
+
+    auto samplingConfigGetstate = [](tle::SamplingConfig const& self)
+    {
+        return nb::make_tuple(self.getBeamWidth(), self.getTopK(), self.getTopP(), self.getTopPMin(),
+            self.getTopPResetIds(), self.getTopPDecay(), self.getSeed(), self.getTemperature(), self.getMinTokens(),
+            self.getBeamSearchDiversityRate(), self.getRepetitionPenalty(), self.getPresencePenalty(),
+            self.getFrequencyPenalty(), self.getLengthPenalty(), self.getEarlyStopping(), self.getNoRepeatNgramSize(),
+            self.getNumReturnSequences(), self.getMinP(), self.getBeamWidthArray());
+    };
+    auto samplingConfigSetstate = [](tle::SamplingConfig& samplingConfig, nb::tuple const& state)
+    {
+        if (state.size() != 19)
+        {
+            throw std::runtime_error("Invalid SamplingConfig state!");
+        }
+        new (&samplingConfig) tle::SamplingConfig(nb::cast<SizeType32>(state[0]), // BeamWidth
+            nb::cast<std::optional<SizeType32>>(state[1]),                        // TopK
+            nb::cast<std::optional<FloatType>>(state[2]),                         // TopP
+            nb::cast<std::optional<FloatType>>(state[3]),                         // TopPMin
+            nb::cast<std::optional<tle::TokenIdType>>(state[4]),                  // TopPResetIds
+            nb::cast<std::optional<FloatType>>(state[5]),                         // TopPDecay
+            nb::cast<std::optional<tle::RandomSeedType>>(state[6]),               // Seed
+            nb::cast<std::optional<FloatType>>(state[7]),                         // Temperature
+            nb::cast<std::optional<SizeType32>>(state[8]),                        // MinTokens
+            nb::cast<std::optional<FloatType>>(state[9]),                         // BeamSearchDiversityRate
+            nb::cast<std::optional<FloatType>>(state[10]),                        // RepetitionPenalty
+            nb::cast<std::optional<FloatType>>(state[11]),                        // PresencePenalty
+            nb::cast<std::optional<FloatType>>(state[12]),                        // FrequencyPenalty
+            nb::cast<std::optional<FloatType>>(state[13]),                        // LengthPenalty
+            nb::cast<std::optional<SizeType32>>(state[14]),                       // EarlyStopping
+            nb::cast<std::optional<SizeType32>>(state[15]),                       // NoRepeatNgramSize
+            nb::cast<std::optional<SizeType32>>(state[16]),                       // NumReturnSequences
+            nb::cast<std::optional<FloatType>>(state[17]),                        // MinP
+            nb::cast<std::optional<std::vector<SizeType32>>>(state[18])           // BeamWidthArray
+        );
+    };
+    nb::class_<tle::SamplingConfig>(m, "SamplingConfig")
+        .def(nb::init<tle::SizeType32,
+                 std::optional<tle::SizeType32> const&,             // beamWidth
+                 std::optional<tle::FloatType> const&,              // topP
+                 std::optional<tle::FloatType> const&,              // topPMin
+                 std::optional<tle::TokenIdType> const&,            // topPResetIds
+                 std::optional<tle::FloatType> const&,              // topPDecay
+                 std::optional<tle::RandomSeedType> const&,         // seed
+                 std::optional<tle::FloatType> const&,              // temperature
+                 std::optional<tle::SizeType32> const&,             // minTokens
+                 std::optional<tle::FloatType> const&,              // beamSearchDiversityRate
+                 std::optional<tle::FloatType> const&,              // repetitionPenalty
+                 std::optional<tle::FloatType> const&,              // presencePenalty
+                 std::optional<tle::FloatType> const&,              // frequencyPenalty
+                 std::optional<tle::FloatType> const&,              // lengthPenalty
+                 std::optional<tle::SizeType32> const&,             // earlyStopping
+                 std::optional<tle::SizeType32> const&,             // noRepeatNgramSize
+                 std::optional<tle::SizeType32> const&,             // numReturnSequences
+                 std::optional<tle::FloatType> const&,              // minP
+                 std::optional<std::vector<tle::SizeType32>> const& // beamWidthArray
+                 >(),
+            // clang-format off
+            nb::arg("beam_width") = 1,
+            nb::kw_only(),
+            nb::arg("top_k") = nb::none(),
+            nb::arg("top_p") = nb::none(),
+            nb::arg("top_p_min") = nb::none(),
+            nb::arg("top_p_reset_ids") = nb::none(),
+            nb::arg("top_p_decay") = nb::none(),
+            nb::arg("seed") = nb::none(),
+            nb::arg("temperature") = nb::none(),
+            nb::arg("min_tokens") = nb::none(),
+            nb::arg("beam_search_diversity_rate") = nb::none(),
+            nb::arg("repetition_penalty") = nb::none(),
+            nb::arg("presence_penalty") = nb::none(),
+            nb::arg("frequency_penalty") = nb::none(),
+            nb::arg("length_penalty") = nb::none(),
+            nb::arg("early_stopping") = nb::none(),
+            nb::arg("no_repeat_ngram_size") = nb::none(),
+            nb::arg("num_return_sequences") = nb::none(),
+            nb::arg("min_p") = nb::none(),
+            nb::arg("beam_width_array") = nb::none())               // clang-format on
+        .def_prop_rw("beam_width", &tle::SamplingConfig::getBeamWidth, &tle::SamplingConfig::setBeamWidth)
+        .def_prop_rw("top_k", &tle::SamplingConfig::getTopK, &tle::SamplingConfig::setTopK)
+        .def_prop_rw("top_p", &tle::SamplingConfig::getTopP, &tle::SamplingConfig::setTopP)
+        .def_prop_rw("top_p_min", &tle::SamplingConfig::getTopPMin, &tle::SamplingConfig::setTopPMin)
+        .def_prop_rw("top_p_reset_ids", &tle::SamplingConfig::getTopPResetIds, &tle::SamplingConfig::setTopPResetIds)
+        .def_prop_rw("top_p_decay", &tle::SamplingConfig::getTopPDecay, &tle::SamplingConfig::setTopPDecay)
+        .def_prop_rw("seed", &tle::SamplingConfig::getSeed, &tle::SamplingConfig::setSeed)
+        .def_prop_rw("temperature", &tle::SamplingConfig::getTemperature, &tle::SamplingConfig::setTemperature)
+        .def_prop_rw("min_tokens", &tle::SamplingConfig::getMinTokens, &tle::SamplingConfig::setMinTokens)
+        .def_prop_rw("beam_search_diversity_rate", &tle::SamplingConfig::getBeamSearchDiversityRate,
+            &tle::SamplingConfig::setBeamSearchDiversityRate)
+        .def_prop_rw("repetition_penalty", &tle::SamplingConfig::getRepetitionPenalty,
+            &tle::SamplingConfig::setRepetitionPenalty)
+        .def_prop_rw("presence_penalty", &tle::SamplingConfig::getPresencePenalty,
+            [](tle::SamplingConfig& self, std::optional<FloatType> v) { self.setPresencePenalty(v); })
+        .def_prop_rw(
+            "frequency_penalty", &tle::SamplingConfig::getFrequencyPenalty, &tle::SamplingConfig::setFrequencyPenalty)
+        .def_prop_rw("length_penalty", &tle::SamplingConfig::getLengthPenalty, &tle::SamplingConfig::setLengthPenalty)
+        .def_prop_rw("early_stopping", &tle::SamplingConfig::getEarlyStopping, &tle::SamplingConfig::setEarlyStopping)
+        .def_prop_rw("no_repeat_ngram_size", &tle::SamplingConfig::getNoRepeatNgramSize,
+            &tle::SamplingConfig::setNoRepeatNgramSize)
+        .def_prop_rw("num_return_sequences", &tle::SamplingConfig::getNumReturnSequences,
+            &tle::SamplingConfig::setNumReturnSequences)
+        .def_prop_rw("min_p", &tle::SamplingConfig::getMinP, &tle::SamplingConfig::setMinP)
+        .def_prop_rw(
+            "beam_width_array", &tle::SamplingConfig::getBeamWidthArray, &tle::SamplingConfig::setBeamWidthArray)
+        .def("__getstate__", samplingConfigGetstate)
+        .def("__setstate__", samplingConfigSetstate);
+
+    auto additionalModelOutputGetstate
+        = [](tle::AdditionalModelOutput const& self) { return nb::make_tuple(self.name, self.gatherContext); };
+    auto additionalModelOutputSetstate = [](tle::AdditionalModelOutput& additionalModelOutput, nb::tuple const& state)
+    {
+        if (state.size() != 2)
+        {
+            throw std::runtime_error("Invalid AdditionalModelOutput state!");
+        }
+        new (&additionalModelOutput)
+            tle::AdditionalModelOutput(nb::cast<std::string>(state[0]), nb::cast<bool>(state[1]));
+    };
+    nb::class_<tle::AdditionalModelOutput>(m, "AdditionalModelOutput")
+        .def(nb::init<std::string, bool>(), nb::arg("name"), nb::arg("gather_context") = false)
+        .def_rw("name", &tle::AdditionalModelOutput::name)
+        .def_rw("gather_context", &tle::AdditionalModelOutput::gatherContext)
+        .def("__getstate__", additionalModelOutputGetstate)
+        .def("__setstate__", additionalModelOutputSetstate);
+
+    auto outputConfigGetstate = [](tle::OutputConfig const& self)
+    {
+        return nb::make_tuple(self.returnLogProbs, self.returnContextLogits, self.returnGenerationLogits,
+            self.excludeInputFromOutput, self.returnEncoderOutput, self.returnPerfMetrics, self.additionalModelOutputs);
+    };
+    auto outputConfigSetstate = [](tle::OutputConfig& outputConfig, nb::tuple const& state)
+    {
+        if (state.size() != 7)
+        {
+            throw std::runtime_error("Invalid OutputConfig state!");
+        }
+        new (&outputConfig) tle::OutputConfig(nb::cast<bool>(state[0]), nb::cast<bool>(state[1]),
+            nb::cast<bool>(state[2]), nb::cast<bool>(state[3]), nb::cast<bool>(state[4]), nb::cast<bool>(state[5]),
+            nb::cast<std::optional<std::vector<tle::AdditionalModelOutput>>>(state[6]));
+    };
+    nb::class_<tle::OutputConfig>(m, "OutputConfig")
+        .def(nb::init<bool, bool, bool, bool, bool, bool, std::optional<std::vector<tle::AdditionalModelOutput>>>(),
+            nb::arg("return_log_probs").none() = false, nb::arg("return_context_logits") = false,
+            nb::arg("return_generation_logits") = false, nb::arg("exclude_input_from_output") = false,
+            nb::arg("return_encoder_output") = false, nb::arg("return_perf_metrics") = false,
+            nb::arg("additional_model_outputs") = nb::none())
+        .def_rw("return_log_probs", &tle::OutputConfig::returnLogProbs)
+        .def_rw("return_context_logits", &tle::OutputConfig::returnContextLogits)
+        .def_rw("return_generation_logits", &tle::OutputConfig::returnGenerationLogits)
+        .def_rw("exclude_input_from_output", &tle::OutputConfig::excludeInputFromOutput)
+        .def_rw("return_encoder_output", &tle::OutputConfig::returnEncoderOutput)
+        .def_rw("return_perf_metrics", &tle::OutputConfig::returnPerfMetrics)
+        .def_rw("additional_model_outputs", &tle::OutputConfig::additionalModelOutputs)
+        .def("__getstate__", outputConfigGetstate)
+        .def("__setstate__", outputConfigSetstate);
+
+    auto externalDraftTokensConfigGetstate = [](tle::ExternalDraftTokensConfig const& self)
+    { return nb::make_tuple(self.getTokens(), self.getLogits(), self.getAcceptanceThreshold()); };
+    auto externalDraftTokensConfigSetstate
+        = [](tle::ExternalDraftTokensConfig& externalDraftTokensConfig, nb::tuple const& state)
+    {
+        if (state.size() != 3)
+        {
+            throw std::runtime_error("Invalid ExternalDraftTokensConfig state!");
+        }
+        new (&externalDraftTokensConfig) tle::ExternalDraftTokensConfig(nb::cast<VecTokens>(state[0]),
+            nb::cast<std::optional<Tensor>>(state[1]), nb::cast<std::optional<FloatType>>(state[2]));
+    };
+    nb::class_<tle::ExternalDraftTokensConfig>(m, "ExternalDraftTokensConfig")
+        .def(nb::init<VecTokens, std::optional<Tensor>, std::optional<FloatType> const&, std::optional<bool>>(),
+            nb::arg("tokens"), nb::arg("logits") = nb::none(), nb::arg("acceptance_threshold") = nb::none(),
+            nb::arg("fast_logits") = nb::none())
+        .def_prop_ro("tokens", &tle::ExternalDraftTokensConfig::getTokens)
+        .def_prop_ro("logits", &tle::ExternalDraftTokensConfig::getLogits)
+        .def_prop_ro("acceptance_threshold", &tle::ExternalDraftTokensConfig::getAcceptanceThreshold)
+        .def("__getstate__", externalDraftTokensConfigGetstate)
+        .def("__setstate__", externalDraftTokensConfigSetstate)
+        .def_prop_ro("fast_logits", &tle::ExternalDraftTokensConfig::getFastLogits);
+
+    auto promptTuningConfigGetstate = [](tle::PromptTuningConfig const& self)
+    { return nb::make_tuple(self.getEmbeddingTable(), self.getInputTokenExtraIds()); };
+    auto promptTuningConfigSetstate = [](tle::PromptTuningConfig& promptTuningConfig, nb::tuple const& state)
+    {
+        if (state.size() != 2)
+        {
+            throw std::runtime_error("Invalid PromptTuningConfig state!");
+        }
+        new (&promptTuningConfig)
+            tle::PromptTuningConfig(nb::cast<Tensor>(state[0]), nb::cast<std::optional<VecTokenExtraIds>>(state[1]));
+    };
+    nb::class_<tle::PromptTuningConfig>(m, "PromptTuningConfig")
+        .def(nb::init<Tensor, std::optional<VecTokenExtraIds>>(), nb::arg("embedding_table"),
+            nb::arg("input_token_extra_ids") = nb::none())
+        .def_prop_ro("embedding_table", &tle::PromptTuningConfig::getEmbeddingTable)
+        .def_prop_ro("input_token_extra_ids", &tle::PromptTuningConfig::getInputTokenExtraIds)
+        .def("__getstate__", promptTuningConfigGetstate)
+        .def("__setstate__", promptTuningConfigSetstate);
+
+    auto loraConfigGetstate = [](tle::LoraConfig const& self)
+    { return nb::make_tuple(self.getTaskId(), self.getWeights(), self.getConfig()); };
+    auto loraConfigSetstate = [](tle::LoraConfig& loraConfig, nb::tuple const& state)
+    {
+        if (state.size() != 3)
+        {
+            throw std::runtime_error("Invalid LoraConfig state!");
+        }
+        new (&loraConfig) tle::LoraConfig(nb::cast<IdType>(state[0]), nb::cast<std::optional<Tensor>>(state[1]),
+            nb::cast<std::optional<Tensor>>(state[2]));
+    };
+    nb::class_<tle::LoraConfig>(m, "LoraConfig")
+        .def(nb::init<uint64_t, std::optional<Tensor>, std::optional<Tensor>>(), nb::arg("task_id"),
+            nb::arg("weights") = nb::none(), nb::arg("config") = nb::none())
+        .def_prop_ro("task_id", &tle::LoraConfig::getTaskId)
+        .def_prop_ro("weights", &tle::LoraConfig::getWeights)
+        .def_prop_ro("config", &tle::LoraConfig::getConfig)
+        .def("__getstate__", loraConfigGetstate)
+        .def("__setstate__", loraConfigSetstate);
+
+    auto multimodalInputGetstate = [](tle::MultimodalInput const& self)
+    { return nb::make_tuple(self.getMultimodalHashes(), self.getMultimodalPositions(), self.getMultimodalLengths()); };
+    auto multimodalInputSetstate = [](tle::MultimodalInput& multimodalInput, nb::tuple const& state)
+    {
+        if (state.size() != 3)
+        {
+            throw std::runtime_error("Invalid MultimodalInput state!");
+        }
+        new (&multimodalInput) tle::MultimodalInput(nb::cast<std::vector<std::vector<SizeType32>>>(state[0]),
+            nb::cast<std::vector<SizeType32>>(state[1]), nb::cast<std::vector<SizeType32>>(state[2]));
+    };
+    nb::class_<tle::MultimodalInput>(m, "MultimodalInput")
+        .def(nb::init<std::vector<std::vector<SizeType32>>, std::vector<SizeType32>, std::vector<SizeType32>>(),
+            nb::arg("multimodal_hashes"), nb::arg("multimodal_positions"), nb::arg("multimodal_lengths"))
+        .def_prop_ro("multimodal_hashes", &tle::MultimodalInput::getMultimodalHashes)
+        .def_prop_ro("multimodal_positions", &tle::MultimodalInput::getMultimodalPositions)
+        .def_prop_ro("multimodal_lengths", &tle::MultimodalInput::getMultimodalLengths)
+        .def("__getstate__", multimodalInputGetstate)
+        .def("__setstate__", multimodalInputSetstate);
+
+    auto MropeConfigGetstate = [](tle::MropeConfig const& self)
+    { return nb::make_tuple(self.getMRopeRotaryCosSin(), self.getMRopePositionDeltas()); };
+    auto MropeConfigSetstate = [](tle::MropeConfig& mropeConfig, nb::tuple const& state)
+    {
+        if (state.size() != 2)
+        {
+            throw std::runtime_error("Invalid MropeConfig state!");
+        }
+        new (&mropeConfig) tle::MropeConfig(nb::cast<tle::Tensor>(state[0]), nb::cast<SizeType32>(state[1]));
+    };
+    nb::class_<tle::MropeConfig>(m, "MropeConfig")
+        .def(nb::init<Tensor, SizeType32>(), nb::arg("mrope_rotary_cos_sin"), nb::arg("mrope_position_deltas"))
+        .def_prop_ro("mrope_rotary_cos_sin", &tle::MropeConfig::getMRopeRotaryCosSin)
+        .def_prop_ro("mrope_position_deltas", &tle::MropeConfig::getMRopePositionDeltas)
+        .def("__getstate__", MropeConfigGetstate)
+        .def("__setstate__", MropeConfigSetstate);
+
+    auto lookaheadDecodingConfigGetstate = [](tle::LookaheadDecodingConfig const& self)
+    { return nb::make_tuple(self.getWindowSize(), self.getNgramSize(), self.getVerificationSetSize()); };
+    auto lookaheadDecodingConfigSetstate
+        = [](tle::LookaheadDecodingConfig& lookaheadDecodingConfig, nb::tuple const& state)
+    {
+        if (state.size() != 3)
+        {
+            throw std::runtime_error("Invalid LookaheadDecodingConfig state!");
+        }
+        new (&lookaheadDecodingConfig) tle::LookaheadDecodingConfig(
+            nb::cast<SizeType32>(state[0]), nb::cast<SizeType32>(state[1]), nb::cast<SizeType32>(state[2]));
+    };
+    nb::class_<tle::LookaheadDecodingConfig>(m, "LookaheadDecodingConfig")
+        .def(nb::init<SizeType32, SizeType32, SizeType32>(), nb::arg("max_window_size"), nb::arg("max_ngram_size"),
+            nb::arg("max_verification_set_size"))
+        .def_prop_ro("max_window_size", &tle::LookaheadDecodingConfig::getWindowSize)
+        .def_prop_ro("max_ngram_size", &tle::LookaheadDecodingConfig::getNgramSize)
+        .def_prop_ro("max_verification_set_size", &tle::LookaheadDecodingConfig::getVerificationSetSize)
+        .def("calculate_speculative_resource", &tle::LookaheadDecodingConfig::calculateSpeculativeResource)
+        .def_static(
+            "calculate_speculative_resource_tuple", &tle::LookaheadDecodingConfig::calculateSpeculativeResourceTuple)
+        .def("__getstate__", lookaheadDecodingConfigGetstate)
+        .def("__setstate__", lookaheadDecodingConfigSetstate)
+        .def_static("get_default_lookahead_decoding_window",
+            []() { return tle::LookaheadDecodingConfig::kDefaultLookaheadDecodingWindow; })
+        .def_static("get_default_lookahead_decoding_ngram",
+            []() { return tle::LookaheadDecodingConfig::kDefaultLookaheadDecodingNgram; })
+        .def_static("get_default_lookahead_decoding_verification_set",
+            []() { return tle::LookaheadDecodingConfig::kDefaultLookaheadDecodingVerificationSet; });
+
+    auto TokenRangeRetentionConfigGetstate = [](tle::KvCacheRetentionConfig::TokenRangeRetentionConfig const& self)
+    { return nb::make_tuple(self.tokenStart, self.tokenEnd, self.priority, self.durationMs); };
+    auto TokenRangeRetentionConfigSetstate
+        = [](tle::KvCacheRetentionConfig::TokenRangeRetentionConfig& tokenRangeRetentionConfig, nb::tuple const& state)
+    {
+        if (state.size() != 4)
+        {
+            throw std::runtime_error("Invalid state!");
+        }
+        new (&tokenRangeRetentionConfig) tle::KvCacheRetentionConfig::TokenRangeRetentionConfig(
+            nb::cast<SizeType32>(state[0]), nb::cast<std::optional<SizeType32>>(state[1]),
+            nb::cast<tle::RetentionPriority>(state[2]), nb::cast<std::optional<std::chrono::milliseconds>>(state[3]));
+    };
+    auto kvCacheRetentionConfigGetstate = [](tle::KvCacheRetentionConfig const& self)
+    {
+        return nb::make_tuple(self.getTokenRangeRetentionConfigs(), self.getDecodeRetentionPriority(),
+            self.getDecodeDurationMs(), self.getTransferMode(), self.getDirectory());
+    };
+    auto kvCacheRetentionConfigSetstate
+        = [](tle::KvCacheRetentionConfig& kvCacheRetentionConfig, nb::tuple const& state)
+    {
+        if (state.size() != 5)
+        {
+            throw std::runtime_error("Invalid state!");
+        }
+        new (&kvCacheRetentionConfig) tle::KvCacheRetentionConfig(
+            nb::cast<std::vector<tle::KvCacheRetentionConfig::TokenRangeRetentionConfig>>(state[0]),
+            nb::cast<tle::RetentionPriority>(state[1]), nb::cast<std::optional<std::chrono::milliseconds>>(state[2]),
+            nb::cast<tle::KvCacheTransferMode>(state[3]), nb::cast<std::optional<std::string>>(state[4]));
+    };
+
+    auto kvCacheRetentionConfig = nb::class_<tle::KvCacheRetentionConfig>(m, "KvCacheRetentionConfig");
+
+    nb::class_<tle::KvCacheRetentionConfig::TokenRangeRetentionConfig>(
+        kvCacheRetentionConfig, "TokenRangeRetentionConfig")
+        .def(nb::init<SizeType32, std::optional<SizeType32>, tle::RetentionPriority,
+                 std::optional<std::chrono::milliseconds>>(),
+            nb::arg("token_start"), nb::arg("token_end"), nb::arg("priority"), nb::arg("duration_ms") = nb::none())
+        .def_rw("token_start", &tle::KvCacheRetentionConfig::TokenRangeRetentionConfig::tokenStart)
+        .def_rw("token_end", &tle::KvCacheRetentionConfig::TokenRangeRetentionConfig::tokenEnd)
+        .def_rw("priority", &tle::KvCacheRetentionConfig::TokenRangeRetentionConfig::priority)
+        .def_rw("duration_ms", &tle::KvCacheRetentionConfig::TokenRangeRetentionConfig::durationMs)
+        .def("__getstate__", TokenRangeRetentionConfigGetstate)
+        .def("__setstate__", TokenRangeRetentionConfigSetstate)
+        .def("__eq__", &tle::KvCacheRetentionConfig::TokenRangeRetentionConfig::operator==);
+
+    // There's a circular dependency between the declaration of the TokenRangeRetentionPriority and
+    // KvCacheRetentionConfig bindings. Defer definition of the KvCacheRetentionConfig bindings until the
+    // TokenRangeRetentionPriority bindings have been defined.
+    kvCacheRetentionConfig
+        .def(nb::init<std::vector<tle::KvCacheRetentionConfig::TokenRangeRetentionConfig>, tle::RetentionPriority,
+                 std::optional<std::chrono::milliseconds>, tle::KvCacheTransferMode, std::optional<std::string>>(),
+            nb::arg("token_range_retention_configs"),
+            nb::arg("decode_retention_priority") = tle::KvCacheRetentionConfig::kDefaultRetentionPriority,
+            nb::arg("decode_duration_ms") = nb::none(), nb::arg("transfer_mode") = tle::KvCacheTransferMode::DRAM,
+            nb::arg("directory") = nb::none())
+        .def_prop_ro("token_range_retention_configs", &tle::KvCacheRetentionConfig::getTokenRangeRetentionConfigs)
+        .def_prop_ro("decode_retention_priority", &tle::KvCacheRetentionConfig::getDecodeRetentionPriority)
+        .def_prop_ro("decode_duration_ms", &tle::KvCacheRetentionConfig::getDecodeDurationMs)
+        .def_prop_ro("transfer_mode", &tle::KvCacheRetentionConfig::getTransferMode)
+        .def_prop_ro("directory", &tle::KvCacheRetentionConfig::getDirectory)
+        .def("__getstate__", kvCacheRetentionConfigGetstate)
+        .def("__setstate__", kvCacheRetentionConfigSetstate)
+        .def("__eq__", &tle::KvCacheRetentionConfig::operator==);
+
+    auto ContextPhaseParamsGetState = [](tle::ContextPhaseParams const& self)
+    {
+        if (self.getState() != nullptr)
+        {
+            auto serializedState = self.getSerializedState();
+            return nb::make_tuple(self.getFirstGenTokens(), self.getReqId(),
+                nb::bytes(serializedState.data(), serializedState.size()), self.getDraftTokens());
+        }
+        return nb::make_tuple(self.getFirstGenTokens(), self.getReqId(), nb::none(), self.getDraftTokens());
+    };
+
+    auto ContextPhaseParamsSetState = [](tle::ContextPhaseParams& contextPhaseParams, nb::tuple const& state)
+    {
+        if (state.size() != 4)
+        {
+            throw std::runtime_error("Invalid ContextPhaseParams state!");
+        }
+        if (!state[2].is_none())
+        {
+            auto opaque_state = nb::cast<nb::bytes>(state[2]);
+            auto opaque_state_str_view = std::string_view(opaque_state.c_str(), opaque_state.size());
+            new (&contextPhaseParams) tle::ContextPhaseParams(nb::cast<VecTokens>(state[0]),
+                nb::cast<tle::ContextPhaseParams::RequestIdType>(state[1]),
+                std::vector<char>(opaque_state_str_view.begin(), opaque_state_str_view.end()),
+                nb::cast<std::optional<VecTokens>>(state[3]));
+        }
+        new (&contextPhaseParams) tle::ContextPhaseParams(nb::cast<VecTokens>(state[0]),
+            nb::cast<tle::ContextPhaseParams::RequestIdType>(state[1]), nb::cast<std::optional<VecTokens>>(state[3]));
+    };
+
+    nb::class_<tle::ContextPhaseParams>(m, "ContextPhaseParams")
+        .def("__init__",
+            [](tle::ContextPhaseParams const& self, VecTokens const& first_gen_tokens,
+                tle::ContextPhaseParams::RequestIdType req_id, std::optional<nb::bytes> const& opaque_state,
+                std::optional<VecTokens> const& draft_tokens)
+            {
+                if (opaque_state)
+                {
+                    auto opaque_state_str_view
+                        = std::string_view(opaque_state.value().c_str(), opaque_state.value().size());
+                    return std::make_unique<tle::ContextPhaseParams>(first_gen_tokens, req_id,
+                        std::vector<char>(opaque_state_str_view.begin(), opaque_state_str_view.end()), draft_tokens);
+                }
+                return std::make_unique<tle::ContextPhaseParams>(first_gen_tokens, req_id, draft_tokens);
+            })
+        .def_prop_ro("first_gen_tokens", [](tle::ContextPhaseParams const& self) { return self.getFirstGenTokens(); })
+        .def_prop_ro("draft_tokens", [](tle::ContextPhaseParams const& self) { return self.getDraftTokens(); })
+        .def_prop_ro("req_id", &tle::ContextPhaseParams::getReqId)
+        .def_prop_ro("opaque_state",
+            [](tle::ContextPhaseParams const& self)
+            {
+                std::optional<nb::bytes> opaque_state{std::nullopt};
+                if (self.getState() != nullptr)
+                {
+                    auto serializedState = self.getSerializedState();
+                    opaque_state = nb::bytes(serializedState.data(), serializedState.size());
+                }
+                return opaque_state;
+            })
+        .def("__getstate__", ContextPhaseParamsGetState)
+        .def("__setstate__", ContextPhaseParamsSetState);
+
+    auto EagleDecodingConfigGetstate = [](tle::EagleConfig const& self)
+    {
+        return nb::make_tuple(self.getEagleChoices(), self.isGreedySampling(), self.getPosteriorThreshold(),
+            self.useDynamicTree(), self.getDynamicTreeMaxTopK());
+    };
+    auto EagleDecodingConfigSetstate = [](tle::EagleConfig& eagleConfig, nb::tuple const& state)
+    {
+        if (state.size() != 5)
+        {
+            throw std::runtime_error("Invalid EagleConfig state!");
+        }
+        new (&eagleConfig) tle::EagleConfig(nb::cast<std::optional<tle::EagleChoices>>(state[0]),
+            nb::cast<bool>(state[1]), nb::cast<std::optional<float>>(state[2]), nb::cast<bool>(state[3]),
+            nb::cast<std::optional<SizeType32>>(state[4]));
+    };
+    nb::class_<tle::EagleConfig>(m, "EagleConfig")
+        .def(nb::init<std::optional<tle::EagleChoices>, bool, std::optional<float>, bool, std::optional<SizeType32>>(),
+            nb::arg("eagle_choices") = nb::none(), nb::arg("greedy_sampling") = true,
+            nb::arg("posterior_threshold") = nb::none(), nb::arg("use_dynamic_tree") = false,
+            nb::arg("dynamic_tree_max_topK") = nb::none())
+        .def_prop_ro("eagle_choices", &tle::EagleConfig::getEagleChoices)
+        .def_prop_ro("greedy_sampling", &tle::EagleConfig::isGreedySampling)
+        .def_prop_ro("posterior_threshold", &tle::EagleConfig::getPosteriorThreshold)
+        .def_prop_ro("use_dynamic_tree", &tle::EagleConfig::useDynamicTree)
+        .def_prop_ro("dynamic_tree_max_topK", &tle::EagleConfig::getDynamicTreeMaxTopK)
+        .def("__getstate__", EagleDecodingConfigGetstate)
+        .def("__setstate__", EagleDecodingConfigSetstate);
+
+    // Guided decoding params
+    auto pyGuidedDecodingParams = nb::class_<tle::GuidedDecodingParams>(m, "GuidedDecodingParams");
+
+    nb::enum_<tle::GuidedDecodingParams::GuideType>(pyGuidedDecodingParams, "GuideType")
+        .value("JSON", tle::GuidedDecodingParams::GuideType::kJSON)
+        .value("JSON_SCHEMA", tle::GuidedDecodingParams::GuideType::kJSON_SCHEMA)
+        .value("REGEX", tle::GuidedDecodingParams::GuideType::kREGEX)
+        .value("EBNF_GRAMMAR", tle::GuidedDecodingParams::GuideType::kEBNF_GRAMMAR)
+        .value("STRUCTURAL_TAG", tle::GuidedDecodingParams::GuideType::kSTRUCTURAL_TAG);
+
+    auto guidedDecodingParamsGetstate
+        = [](tle::GuidedDecodingParams const& self) { return nb::make_tuple(self.getGuideType(), self.getGuide()); };
+
+    auto guidedDecodingParamsSetstate = [](tle::GuidedDecodingParams& guidedDecodingParams, nb::tuple const& state)
+    {
+        if (state.size() != 2)
+        {
+            throw std::runtime_error("Invalid GuidedDecodingParams state!");
+        }
+        new (&guidedDecodingParams) tle::GuidedDecodingParams(
+            nb::cast<tle::GuidedDecodingParams::GuideType>(state[0]), nb::cast<std::optional<std::string>>(state[1]));
+    };
+
+    pyGuidedDecodingParams
+        .def(nb::init<tle::GuidedDecodingParams::GuideType, std::optional<std::string>>(), nb::arg("guide_type"),
+            nb::arg("guide") = nb::none())
+        .def_prop_ro("guide_type", &tle::GuidedDecodingParams::getGuideType)
+        .def_prop_ro("guide", &tle::GuidedDecodingParams::getGuide)
+        .def("__getstate__", guidedDecodingParamsGetstate)
+        .def("__setstate__", guidedDecodingParamsSetstate);
+
+    auto requestGetstate = [](tle::Request const& self)
+    {
+        return nb::make_tuple(self.getInputTokenIds(), self.getMaxTokens(), self.getStreaming(),
+            self.getSamplingConfig(), self.getOutputConfig(), self.getEndId(), self.getPadId(), self.getPositionIds(),
+            self.getBadWords(), self.getStopWords(), self.getEmbeddingBias(), self.getExternalDraftTokensConfig(),
+            self.getPromptTuningConfig(), self.getMultimodalInput(), self.getMultimodalEmbedding(),
+            self.getMropeConfig(), self.getLoraConfig(), self.getLookaheadConfig(), self.getKvCacheRetentionConfig(),
+            self.getLogitsPostProcessorName(), self.getLogitsPostProcessor(), self.getEncoderInputTokenIds(),
+            self.getClientId(), self.getReturnAllGeneratedTokens(), self.getPriority(), self.getRequestType(),
+            self.getContextPhaseParams(), self.getEncoderInputFeatures(), self.getEncoderOutputLength(),
+            self.getCrossAttentionMask(), self.getEagleConfig(), self.getSkipCrossAttnBlocks(),
+            self.getGuidedDecodingParams());
+    };
+    auto requestSetstate = [](tle::Request& request, nb::tuple const& state)
+    {
+        if (state.size() != 33)
+        {
+            throw std::runtime_error("Invalid Request state!");
+        }
+        new (&request) tle::Request(nb::cast<VecTokens>(state[0]), nb::cast<SizeType32>(state[1]),
+            nb::cast<bool>(state[2]), nb::cast<tle::SamplingConfig>(state[3]), nb::cast<tle::OutputConfig>(state[4]),
+            nb::cast<std::optional<SizeType32>>(state[5]), nb::cast<std::optional<SizeType32>>(state[6]),
+            nb::cast<std::optional<std::vector<SizeType32>>>(state[7]),
+            nb::cast<std::optional<std::list<VecTokens>>>(state[8]),
+            nb::cast<std::optional<std::list<VecTokens>>>(state[9]), nb::cast<std::optional<Tensor>>(state[10]),
+            nb::cast<std::optional<tle::ExternalDraftTokensConfig>>(state[11]),
+            nb::cast<std::optional<tle::PromptTuningConfig>>(state[12]),
+            nb::cast<std::optional<tle::MultimodalInput>>(state[13]), nb::cast<std::optional<Tensor>>(state[14]),
+            nb::cast<std::optional<tle::MropeConfig>>(state[15]), nb::cast<std::optional<tle::LoraConfig>>(state[16]),
+            nb::cast<std::optional<tle::LookaheadDecodingConfig>>(state[17]),
+            nb::cast<std::optional<tle::KvCacheRetentionConfig>>(state[18]),
+            nb::cast<std::optional<std::string>>(state[19]),
+            nb::cast<std::optional<tle::LogitsPostProcessor>>(state[20]), nb::cast<std::optional<VecTokens>>(state[21]),
+            nb::cast<std::optional<IdType>>(state[22]), nb::cast<bool>(state[23]),
+            nb::cast<tle::PriorityType>(state[24]), nb::cast<tle::RequestType>(state[25]),
+            nb::cast<std::optional<tle::ContextPhaseParams>>(state[26]),
+            nb::cast<std::optional<tle::Tensor>>(state[27]), nb::cast<std::optional<SizeType32>>(state[28]),
+            nb::cast<std::optional<tle::Tensor>>(state[29]), 1, nb::cast<std::optional<tle::EagleConfig>>(state[30]),
+            nb::cast<std::optional<tle::Tensor>>(state[31]),
+            nb::cast<std::optional<tle::GuidedDecodingParams>>(state[32]));
+    };
+
+    nb::class_<tle::Request> request(m, "Request", nb::dynamic_attr());
+    request
+        .def(nb::init<tle::VecTokens,                           // inputTokenIds
+                 tle::SizeType32,                               // maxTokens
+                 bool,                                          // streaming
+                 tle::SamplingConfig const&,                    // samplingConfig
+                 tle::OutputConfig const&,                      // outputConfig
+                 std::optional<tle::SizeType32> const&,         // endId
+                 std::optional<tle::SizeType32> const&,         // padId
+                 std::optional<std::vector<SizeType32>>,        // positionIds
+                 std::optional<std::list<tle::VecTokens>>,      // badWords
+                 std::optional<std::list<tle::VecTokens>>,      // stopWords
+                 std::optional<tle::Tensor>,                    // embeddingBias
+                 std::optional<tle::ExternalDraftTokensConfig>, // externalDraftTokensConfig
+                 std::optional<tle::PromptTuningConfig>,        // pTuningConfig
+                 std::optional<tle::MultimodalInput>,           // multimodalInput
+                 std::optional<tle::Tensor>,                    // multimodalEmbedding
+                 std::optional<tle::MropeConfig>,               // mRopeConfig
+                 std::optional<tle::LoraConfig>,                // loraConfig
+                 std::optional<tle::LookaheadDecodingConfig>,   // lookaheadConfig
+                 std::optional<tle::KvCacheRetentionConfig>,    // kvCacheRetentionConfig
+                 std::optional<std::string>,                    // logitsPostProcessorName
+                 std::optional<tle::LogitsPostProcessor>,       // logitsPostProcessor
+                 std::optional<tle::VecTokens>,                 // encoderInputTokenIds
+                 std::optional<tle::IdType>,                    // clientId
+                 bool,                                          // returnAllGeneratedTokens
+                 tle::PriorityType,                             // priority
+                 tle::RequestType,                              // type
+                 std::optional<tle::ContextPhaseParams>,        // contextPhaseParams
+                 std::optional<tle::Tensor>,                    // encoderInputFeatures
+                 std::optional<tle::SizeType32>,                // encoderOutputLength
+                 std::optional<tle::Tensor>,                    // crossAttentionMask
+                 SizeType32,                                    // numReturnSequences
+                 std::optional<tle::EagleConfig>,               // eagleConfig
+                 std::optional<tle::Tensor>,                    // skipCrossAttnBlocks
+                 std::optional<tle::GuidedDecodingParams>,      // guidedDecodingParams
+                 std::optional<tle::SizeType32>,                // languageAdapterUid
+                 std::optional<tle::MillisecondsType>           // allottedTimeMs
+                 >(),
+            // clang-format off
+        nb::arg("input_token_ids"),
+        nb::arg("max_tokens"),
+        nb::kw_only(),
+        nb::arg("streaming") = false,
+        nb::arg("sampling_config") = tle::SamplingConfig(),
+        nb::arg("output_config") = tle::OutputConfig(),
+        nb::arg("end_id") = nb::none(),
+        nb::arg("pad_id") = nb::none(),
+        nb::arg("position_ids") = nb::none(),
+        nb::arg("bad_words") = nb::none(),
+        nb::arg("stop_words") = nb::none(),
+        nb::arg("embedding_bias") = nb::none(),
+        nb::arg("external_draft_tokens_config") = nb::none(),
+        nb::arg("prompt_tuning_config") = nb::none(),
+        nb::arg("multimodal_input") = nb::none(),
+        nb::arg("multimodal_embedding") = nb::none(),
+        nb::arg("mrope_config") = nb::none(),
+        nb::arg("lora_config") = nb::none(),
+        nb::arg("lookahead_config") = nb::none(),
+        nb::arg("kv_cache_retention_config") = nb::none(),
+        nb::arg("logits_post_processor_name") = nb::none(),
+        nb::arg("logits_post_processor") = nb::none(),
+        nb::arg("encoder_input_token_ids") = nb::none(),
+        nb::arg("client_id") = nb::none(),
+        nb::arg("return_all_generated_tokens") = false,
+        nb::arg("priority") = tle::Request::kDefaultPriority,
+        nb::arg("type") = tle::RequestType::REQUEST_TYPE_CONTEXT_AND_GENERATION,
+        nb::arg("context_phase_params") = nb::none(),
+        nb::arg("encoder_input_features") = nb::none(),
+        nb::arg("encoder_output_length") = nb::none(),
+        nb::arg("cross_attention_mask") = nb::none(),
+        nb::arg("num_return_sequences") = 1,
+        nb::arg("eagle_config") = nb::none(),
+        nb::arg("skip_cross_attn_blocks") = nb::none(),
+        nb::arg("guided_decoding_params") = nb::none(),
+        nb::arg("language_adapter_uid") = nb::none(),
+        nb::arg("allotted_time_ms") = nb::none()
+    )          // clang-format on
+        .def_prop_ro("input_token_ids", &tle::Request::getInputTokenIds)
+        .def_prop_ro("max_tokens", &tle::Request::getMaxTokens)
+        .def_prop_rw("streaming", &tle::Request::getStreaming, &tle::Request::setStreaming)
+        .def_prop_rw("sampling_config", &tle::Request::getSamplingConfig, &tle::Request::setSamplingConfig)
+        .def_prop_rw("output_config", &tle::Request::getOutputConfig, &tle::Request::setOutputConfig)
+        .def_prop_rw("end_id", &tle::Request::getEndId, &tle::Request::setEndId)
+        .def_prop_rw("pad_id", &tle::Request::getPadId, &tle::Request::setPadId)
+        .def_prop_rw("position_ids", &tle::Request::getPositionIds, &tle::Request::setPositionIds)
+        .def_prop_rw("bad_words", &tle::Request::getBadWords, &tle::Request::setBadWords)
+        .def_prop_rw("stop_words", &tle::Request::getStopWords, &tle::Request::setStopWords)
+        .def_prop_rw("embedding_bias", &tle::Request::getEmbeddingBias, &tle::Request::setEmbeddingBias)
+        .def_prop_rw("external_draft_tokens_config", &tle::Request::getExternalDraftTokensConfig,
+            &tle::Request::setExternalDraftTokensConfig)
+        .def_prop_rw("prompt_tuning_config", &tle::Request::getPromptTuningConfig, &tle::Request::setPromptTuningConfig)
+        .def_prop_rw("multimodal_input", &tle::Request::getMultimodalInput, &tle::Request::setMultimodalInput)
+        .def_prop_rw(
+            "multimodal_embedding", &tle::Request::getMultimodalEmbedding, &tle::Request::setMultimodalEmbedding)
+        .def_prop_rw("mrope_config", &tle::Request::getMropeConfig, &tle::Request::setMropeConfig)
+        .def_prop_rw("lora_config", &tle::Request::getLoraConfig, &tle::Request::setLoraConfig)
+        .def_prop_rw("lookahead_config", &tle::Request::getLookaheadConfig, &tle::Request::setLookaheadConfig)
+        .def_prop_rw("kv_cache_retention_config", &tle::Request::getKvCacheRetentionConfig,
+            &tle::Request::setKvCacheRetentionConfig)
+        .def_prop_rw("logits_post_processor_name", &tle::Request::getLogitsPostProcessorName,
+            &tle::Request::setLogitsPostProcessorName)
+        .def_prop_rw(
+            "logits_post_processor", &tle::Request::getLogitsPostProcessor, &tle::Request::setLogitsPostProcessor)
+        .def_prop_rw(
+            "encoder_input_token_ids", &tle::Request::getEncoderInputTokenIds, &tle::Request::setEncoderInputTokenIds)
+        .def_prop_rw("client_id", &tle::Request::getClientId, &tle::Request::setClientId)
+        .def_prop_rw("return_all_generated_tokens", &tle::Request::getReturnAllGeneratedTokens,
+            &tle::Request::setReturnAllGeneratedTokens)
+        .def_prop_rw("request_type", &tle::Request::getRequestType, &tle::Request::setRequestType)
+        .def_prop_rw(
+            "encoder_input_features", &tle::Request::getEncoderInputFeatures, &tle::Request::setEncoderInputFeatures)
+        .def_prop_rw("cross_attention_mask", &tle::Request::getCrossAttentionMask, &tle::Request::setCrossAttentionMask)
+        .def_prop_rw("eagle_config", &tle::Request::getEagleConfig, &tle::Request::setEagleConfig)
+        .def_prop_rw(
+            "skip_cross_attn_blocks", &tle::Request::getSkipCrossAttnBlocks, &tle::Request::setSkipCrossAttnBlocks)
+        .def_prop_rw(
+            "guided_decoding_params", &tle::Request::getGuidedDecodingParams, &tle::Request::setGuidedDecodingParams)
+        .def_prop_rw("allotted_time_ms", &tle::Request::getAllottedTimeMs, &tle::Request::setAllottedTimeMs)
+        .def_prop_rw("context_phase_params", &tle::Request::getContextPhaseParams, &tle::Request::setContextPhaseParams)
+        .def("__getstate__", requestGetstate)
+        .def("__setstate__", requestSetstate);
+    request.attr("BATCHED_POST_PROCESSOR_NAME") = tle::Request::kBatchedPostProcessorName;
+
+    nb::class_<tle::SpeculativeDecodingFastLogitsInfo>(m, "SpeculativeDecodingFastLogitsInfo")
+        .def(nb::init<>())
+        .def_rw("draft_request_id", &tle::SpeculativeDecodingFastLogitsInfo::draftRequestId)
+        .def_rw("draft_participant_id", &tle::SpeculativeDecodingFastLogitsInfo::draftParticipantId)
+        .def("to_tensor", &tle::SpeculativeDecodingFastLogitsInfo::toTensor);
+
+    auto requestPerfMetrics = nb::class_<tle::RequestPerfMetrics>(m, "RequestPerfMetrics");
+
+    auto timingMetricsGetstate = [](tle::RequestPerfMetrics::TimingMetrics const& self)
+    {
+        return nb::make_tuple(self.arrivalTime, self.firstScheduledTime, self.firstTokenTime, self.lastTokenTime,
+            self.kvCacheTransferStart, self.kvCacheTransferEnd, self.kvCacheSize);
+    };
+    auto timingMetricsSetstate = [](tle::RequestPerfMetrics::TimingMetrics& timingMetrics, nb::tuple const& state)
+    {
+        if (state.size() != 7)
+        {
+            throw std::runtime_error("Invalid TimingMetrics state!");
+        }
+        new (&timingMetrics)
+            tle::RequestPerfMetrics::TimingMetrics{nb::cast<tle::RequestPerfMetrics::TimePoint>(state[0]),
+                nb::cast<tle::RequestPerfMetrics::TimePoint>(state[1]),
+                nb::cast<tle::RequestPerfMetrics::TimePoint>(state[2]),
+                nb::cast<tle::RequestPerfMetrics::TimePoint>(state[3]),
+                nb::cast<tle::RequestPerfMetrics::TimePoint>(state[4]),
+                nb::cast<tle::RequestPerfMetrics::TimePoint>(state[5]), nb::cast<size_t>(state[6])};
+    };
+    nb::class_<tle::RequestPerfMetrics::TimingMetrics>(m, "TimingMetrics")
+        .def(nb::init<>())
+        .def_rw("arrival_time", &tle::RequestPerfMetrics::TimingMetrics::arrivalTime)
+        .def_rw("first_scheduled_time", &tle::RequestPerfMetrics::TimingMetrics::firstScheduledTime)
+        .def_rw("first_token_time", &tle::RequestPerfMetrics::TimingMetrics::firstTokenTime)
+        .def_rw("last_token_time", &tle::RequestPerfMetrics::TimingMetrics::lastTokenTime)
+        .def_rw("kv_cache_transfer_start", &tle::RequestPerfMetrics::TimingMetrics::kvCacheTransferStart)
+        .def_rw("kv_cache_transfer_end", &tle::RequestPerfMetrics::TimingMetrics::kvCacheTransferEnd)
+        .def_rw("kv_cache_size", &tle::RequestPerfMetrics::TimingMetrics::kvCacheSize)
+        .def("__getstate__", timingMetricsGetstate)
+        .def("__setstate__", timingMetricsSetstate);
+
+    auto kvCacheMetricsGetstate = [](tle::RequestPerfMetrics::KvCacheMetrics const& self)
+    {
+        return nb::make_tuple(self.numTotalAllocatedBlocks, self.numNewAllocatedBlocks, self.numReusedBlocks,
+            self.numMissedBlocks, self.kvCacheHitRate);
+    };
+    auto kvCacheMetricsSetstate = [](tle::RequestPerfMetrics::KvCacheMetrics& kvCacheMetrics, nb::tuple const& state)
+    {
+        if (state.size() != 5)
+        {
+            throw std::runtime_error("Invalid KvCacheMetrics state!");
+        }
+        new (&kvCacheMetrics)
+            tle::RequestPerfMetrics::KvCacheMetrics{nb::cast<SizeType32>(state[0]), nb::cast<SizeType32>(state[1]),
+                nb::cast<SizeType32>(state[2]), nb::cast<SizeType32>(state[3]), nb::cast<float>(state[4])};
+    };
+    nb::class_<tle::RequestPerfMetrics::KvCacheMetrics>(m, "KvCacheMetrics")
+        .def(nb::init<>())
+        .def_rw("num_total_allocated_blocks", &tle::RequestPerfMetrics::KvCacheMetrics::numTotalAllocatedBlocks)
+        .def_rw("num_new_allocated_blocks", &tle::RequestPerfMetrics::KvCacheMetrics::numNewAllocatedBlocks)
+        .def_rw("num_reused_blocks", &tle::RequestPerfMetrics::KvCacheMetrics::numReusedBlocks)
+        .def_rw("num_missed_blocks", &tle::RequestPerfMetrics::KvCacheMetrics::numMissedBlocks)
+        .def_rw("kv_cache_hit_rate", &tle::RequestPerfMetrics::KvCacheMetrics::kvCacheHitRate)
+        .def("__getstate__", kvCacheMetricsGetstate)
+        .def("__setstate__", kvCacheMetricsSetstate);
+
+    auto speculativeDecodingMetricsGetstate = [](tle::RequestPerfMetrics::SpeculativeDecodingMetrics const& self)
+    { return nb::make_tuple(self.acceptanceRate, self.totalAcceptedDraftTokens, self.totalDraftTokens); };
+    auto speculativeDecodingMetricsSetstate
+        = [](tle::RequestPerfMetrics::SpeculativeDecodingMetrics& speculativeDecodingMetrics, nb::tuple const& state)
+    {
+        if (state.size() != 3)
+        {
+            throw std::runtime_error("Invalid SpeculativeDecodingMetrics state!");
+        }
+        new (&speculativeDecodingMetrics) tle::RequestPerfMetrics::SpeculativeDecodingMetrics{
+            nb::cast<float>(state[0]), nb::cast<SizeType32>(state[1]), nb::cast<SizeType32>(state[2])};
+    };
+
+    nb::class_<tle::RequestPerfMetrics::SpeculativeDecodingMetrics>(m, "SpeculativeDecodingMetrics")
+        .def(nb::init<>())
+        .def_rw("acceptance_rate", &tle::RequestPerfMetrics::SpeculativeDecodingMetrics::acceptanceRate)
+        .def_rw("total_accepted_draft_tokens",
+            &tle::RequestPerfMetrics::SpeculativeDecodingMetrics::totalAcceptedDraftTokens)
+        .def_rw("total_draft_tokens", &tle::RequestPerfMetrics::SpeculativeDecodingMetrics::totalDraftTokens)
+        .def("__getstate__", speculativeDecodingMetricsGetstate)
+        .def("__setstate__", speculativeDecodingMetricsSetstate);
+
+    auto requestPerfMetricsGetstate = [](tle::RequestPerfMetrics const& self)
+    {
+        return nb::make_tuple(self.timingMetrics, self.kvCacheMetrics, self.speculativeDecoding, self.firstIter,
+            self.lastIter, self.iter);
+    };
+    auto requestPerfMetricsSetstate = [](tle::RequestPerfMetrics& requestPerfMetrics, nb::tuple const& state)
+    {
+        if (state.size() != 6)
+        {
+            throw std::runtime_error("Invalid RequestPerfMetrics state!");
+        }
+        new (&requestPerfMetrics) tle::RequestPerfMetrics{nb::cast<tle::RequestPerfMetrics::TimingMetrics>(state[0]),
+            nb::cast<tle::RequestPerfMetrics::KvCacheMetrics>(state[1]),
+            nb::cast<tle::RequestPerfMetrics::SpeculativeDecodingMetrics>(state[2]),
+            nb::cast<std::optional<tle::IterationType>>(state[3]),
+            nb::cast<std::optional<tle::IterationType>>(state[4]),
+            nb::cast<std::optional<tle::IterationType>>(state[5])};
+    };
+
+    // There's a circular dependency between the declaration of the TimingMetrics and RequestPerfMetrics bindings.
+    // Defer definition of the RequestPerfMetrics bindings until the TimingMetrics have been defined.
+    requestPerfMetrics.def(nb::init<>())
+        .def_rw("timing_metrics", &tle::RequestPerfMetrics::timingMetrics)
+        .def_rw("kv_cache_metrics", &tle::RequestPerfMetrics::kvCacheMetrics)
+        .def_rw("speculative_decoding", &tle::RequestPerfMetrics::speculativeDecoding)
+        .def_rw("first_iter", &tle::RequestPerfMetrics::firstIter)
+        .def_rw("last_iter", &tle::RequestPerfMetrics::lastIter)
+        .def_rw("iter", &tle::RequestPerfMetrics::iter)
+        .def("__getstate__", requestPerfMetricsGetstate)
+        .def("__setstate__", requestPerfMetricsSetstate);
+
+    nb::class_<tle::AdditionalOutput>(m, "AdditionalOutput")
+        .def("__init__ ",
+            [](tle::AdditionalOutput const& self, std::string const& name, tle::Tensor const& output)
+            { return std::make_unique<tle::AdditionalOutput>(name, output); })
+        .def_rw("name", &tle::AdditionalOutput::name)
+        .def_rw("output", &tle::AdditionalOutput::output);
+
+    auto resultSetstate = [](tle::Result& result, nb::tuple const& state)
+    {
+        if (state.size() != 13)
+        {
+            throw std::runtime_error("Invalid Request state!");
+        }
+        new (&result) tle::Result();
+        result.isFinal = nb::cast<bool>(state[0]);
+        result.outputTokenIds = nb::cast<std::vector<VecTokens>>(state[1]);
+        result.cumLogProbs = nb::cast<std::optional<std::vector<float>>>(state[2]);
+        result.logProbs = nb::cast<std::optional<std::vector<std::vector<float>>>>(state[3]);
+        result.contextLogits = nb::cast<std::optional<Tensor>>(state[4]);
+        result.generationLogits = nb::cast<std::optional<Tensor>>(state[5]);
+        result.encoderOutput = nb::cast<std::optional<Tensor>>(state[6]);
+        result.finishReasons = nb::cast<std::vector<tle::FinishReason>>(state[7]);
+        result.sequenceIndex = nb::cast<SizeType32>(state[8]);
+        result.isSequenceFinal = nb::cast<bool>(state[9]);
+        result.decodingIter = nb::cast<SizeType32>(state[10]);
+        result.contextPhaseParams = nb::cast<std::optional<tle::ContextPhaseParams>>(state[11]);
+        result.requestPerfMetrics = nb::cast<std::optional<tle::RequestPerfMetrics>>(state[12]);
+    };
+
+    auto resultGetstate = [](tle::Result const& self)
+    {
+        return nb::make_tuple(self.isFinal, self.outputTokenIds, self.cumLogProbs, self.logProbs, self.contextLogits,
+            self.generationLogits, self.encoderOutput, self.finishReasons, self.sequenceIndex, self.isSequenceFinal,
+            self.decodingIter, self.contextPhaseParams, self.requestPerfMetrics);
+    };
+
+    nb::class_<tle::Result>(m, "Result")
+        .def(nb::init<>())
+        .def_rw("is_final", &tle::Result::isFinal)
+        .def_rw("output_token_ids", &tle::Result::outputTokenIds)
+        .def_rw("cum_log_probs", &tle::Result::cumLogProbs)
+        .def_rw("log_probs", &tle::Result::logProbs)
+        .def_rw("context_logits", &tle::Result::contextLogits)
+        .def_rw("generation_logits", &tle::Result::generationLogits)
+        .def_rw("spec_dec_fast_logits_info", &tle::Result::specDecFastLogitsInfo)
+        .def_rw("encoder_output", &tle::Result::encoderOutput)
+        .def_rw("finish_reasons", &tle::Result::finishReasons)
+        .def_rw("sequence_index", &tle::Result::sequenceIndex)
+        .def_rw("is_sequence_final", &tle::Result::isSequenceFinal)
+        .def_rw("decoding_iter", &tle::Result::decodingIter)
+        .def_rw("context_phase_params", &tle::Result::contextPhaseParams)
+        .def_rw("request_perf_metrics", &tle::Result::requestPerfMetrics)
+        .def_rw("additional_outputs", &tle::Result::additionalOutputs)
+        .def("__getstate__", resultGetstate)
+        .def("__setstate__", resultSetstate);
+
+    m.def("deserialize_result",
+        [](nb::bytes& x)
+        {
+            std::string str(x.c_str(), x.size());
+            std::istringstream is(str);
+            return tle::serialize_utils::deserialize<tle::Result>(is);
+        });
+
+    auto responseGetstate = [](tle::Response const& self)
+    { return nb::make_tuple(self.getRequestId(), self.getResult(), self.getClientId()); };
+
+    auto responseSetstate = [](tle::Response& response, nb::tuple const& state)
+    {
+        if (state.size() != 3)
+        {
+            throw std::runtime_error("Invalid Request state!");
+        }
+        new (&response) tle::Response(
+            nb::cast<SizeType32>(state[0]), nb::cast<tle::Result>(state[1]), nb::cast<SizeType32>(state[2]));
+    };
+
+    nb::class_<tle::Response>(m, "Response")
+        .def(nb::init<IdType, std::string, std::optional<IdType>>(), nb::arg("request_id"), nb::arg("error_msg"),
+            nb::arg("client_id") = std::nullopt)
+        .def(nb::init<IdType, tle::Result, std::optional<IdType>>(), nb::arg("request_id"), nb::arg("result"),
+            nb::arg("client_id") = std::nullopt)
+        .def_prop_ro("request_id", &tle::Response::getRequestId)
+        .def_prop_ro("client_id", &tle::Response::getClientId)
+        .def("has_error", &tle::Response::hasError)
+        .def_prop_ro("error_msg", &tle::Response::getErrorMsg)
+        .def_prop_ro("result", &tle::Response::getResult)
+        .def("clear_context_logits",
+            [](tle::Response& self)
+            {
+                if (!self.hasError())
+                {
+                    auto& result = const_cast<tle::Result&>(self.getResult());
+                    result.contextLogits.reset();
+                }
+            })
+        .def("clear_generation_logits",
+            [](tle::Response& self)
+            {
+                if (!self.hasError())
+                {
+                    auto& result = const_cast<tle::Result&>(self.getResult());
+                    result.generationLogits.reset();
+                }
+            })
+        .def("__getstate__", responseGetstate)
+        .def("__setstate__", responseSetstate);
+}
+
+} // namespace tensorrt_llm::nanobind::executor
diff --git a/cpp/tensorrt_llm/nanobind/executor/request.h b/cpp/tensorrt_llm/nanobind/executor/request.h
new file mode 100644
index 00000000000..5a5cf9acbee
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/executor/request.h
@@ -0,0 +1,29 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <nanobind/nanobind.h>
+namespace nb = nanobind;
+
+namespace tensorrt_llm::nanobind::executor
+{
+
+// Register bindings for executor API.
+void initRequestBindings(nb::module_& m);
+
+} // namespace tensorrt_llm::nanobind::executor
diff --git a/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp b/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp
new file mode 100644
index 00000000000..f3be85bbbf2
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp
@@ -0,0 +1,388 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "bindings.h"
+#include "moeBindings.h"
+#include "tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.h"
+#include "tensorrt_llm/kernels/communicationKernels/customLowPrecisionAllReduceKernels.h"
+#include "tensorrt_llm/kernels/customAllReduceKernels.h"
+#include "tensorrt_llm/kernels/delayStream.h"
+#include "tensorrt_llm/nanobind/common/customCasters.h"
+#include "tensorrt_llm/runtime/cudaEvent.h"
+#include "tensorrt_llm/runtime/cudaStream.h"
+#include "tensorrt_llm/runtime/decoderState.h"
+#include "tensorrt_llm/runtime/decodingInput.h"
+#include "tensorrt_llm/runtime/decodingOutput.h"
+#include "tensorrt_llm/runtime/gptDecoder.h"
+#include "tensorrt_llm/runtime/gptDecoderBatched.h"
+#include "tensorrt_llm/runtime/iBuffer.h"
+#include "tensorrt_llm/runtime/iGptDecoderBatched.h"
+#include "tensorrt_llm/runtime/iTensor.h"
+#include "tensorrt_llm/runtime/ipcUtils.h"
+#include "tensorrt_llm/runtime/lookaheadBuffers.h"
+#include "tensorrt_llm/runtime/loraCache.h"
+#include "tensorrt_llm/runtime/mcastGPUBuffer.h"
+#include "tensorrt_llm/runtime/request.h"
+#include "tensorrt_llm/runtime/speculativeDecodingMode.h"
+#include "tensorrt_llm/runtime/tllmRuntime.h"
+#include "tensorrt_llm/runtime/torchView.h"
+
+#include <ATen/ATen.h>
+#include <c10/cuda/CUDAStream.h>
+#include <nanobind/stl/vector.h>
+
+#include <nanobind/nanobind.h>
+#include <nanobind/ndarray.h>
+#include <nanobind/operators.h>
+#include <nanobind/stl/bind_vector.h>
+#include <nanobind/stl/filesystem.h>
+#include <nanobind/stl/optional.h>
+#include <nanobind/stl/shared_ptr.h>
+#include <nanobind/stl/unique_ptr.h>
+#include <nanobind/trampoline.h>
+#include <torch/extension.h>
+namespace tr = tensorrt_llm::runtime;
+namespace te = tensorrt_llm::executor;
+
+class PyIGptDecoder : public tr::IGptDecoder
+{
+public:
+    NB_TRAMPOLINE(tr::IGptDecoder, 5);
+
+    void setup(tr::SamplingConfig const& samplingConfig, size_t batchSize,
+        tr::DecodingInput::TensorConstPtr const& batchSlots,
+        std::optional<tr::DecodingOutput> const& output = std::nullopt,
+        std::optional<nvinfer1::DataType> explicitDraftTokensDType = std::nullopt,
+        std::optional<std::vector<tr::ITensor::SharedConstPtr>> const& lookaheadPrompt = std::nullopt,
+        std::optional<std::vector<te::LookaheadDecodingConfig>> const& lookaheadAlgoConfigs = std::nullopt) override
+    {
+        NB_OVERRIDE_PURE(setup, samplingConfig, batchSize, batchSlots, output, explicitDraftTokensDType,
+            lookaheadPrompt, lookaheadAlgoConfigs);
+    }
+
+    void forwardAsync(tr::DecodingOutput& output, tr::DecodingInput const& input) override
+    {
+        NB_OVERRIDE_PURE(forwardAsync, output, input);
+    }
+
+    void forwardSync(tr::DecodingOutput& output, tr::DecodingInput const& input) override
+    {
+        NB_OVERRIDE_PURE(forwardSync, output, input);
+    }
+
+    tr::SamplingConfig const& getSamplingConfig() override
+    {
+        NB_OVERRIDE_PURE(getSamplingConfig);
+    }
+
+    void disableLookahead(std::optional<tr::SamplingConfig> const& samplingConfig, tr::SizeType32 batchSize,
+        tr::DecodingInput::TensorConstPtr batchSlots) override
+    {
+        NB_OVERRIDE_PURE(disableLookahead, samplingConfig, batchSize, batchSlots);
+    }
+};
+
+namespace tensorrt_llm::nanobind::runtime
+{
+
+void initBindings(nb::module_& m)
+{
+
+    nb::class_<tr::LoraCache::TaskLayerModuleConfig>(m, "TaskLayerModuleConfig")
+        .def(nb::init<>())
+        .def_rw("page_id", &tr::LoraCache::TaskLayerModuleConfig::pageId)
+        .def_rw("slot_idx", &tr::LoraCache::TaskLayerModuleConfig::slotIdx)
+        .def_rw("in_size", &tr::LoraCache::TaskLayerModuleConfig::inSize)
+        .def_rw("out_size", &tr::LoraCache::TaskLayerModuleConfig::outSize)
+        .def_rw("module_id", &tr::LoraCache::TaskLayerModuleConfig::moduleId)
+        .def_rw("layer_id", &tr::LoraCache::TaskLayerModuleConfig::layerId)
+        .def_rw("adapter_size", &tr::LoraCache::TaskLayerModuleConfig::adapterSize)
+        .def_rw("num_slots", &tr::LoraCache::TaskLayerModuleConfig::numSlots)
+        .def_rw("weights_in_pointer", &tr::LoraCache::TaskLayerModuleConfig::weightsInPointer)
+        .def_rw("weights_out_pointer", &tr::LoraCache::TaskLayerModuleConfig::weightsOutPointer)
+        .def_rw("scaling_vec_pointer", &tr::LoraCache::TaskLayerModuleConfig::scalingVecPointer)
+        .def(nb::self == nb::self);
+
+    nb::class_<tr::BufferManager>(m, "BufferManager")
+        .def(nb::init<tr::BufferManager::CudaStreamPtr, bool>(), nb::arg("stream"), nb::arg("trim_pool") = false)
+        .def_prop_ro("stream", &tr::BufferManager::getStream);
+
+    nb::class_<tr::TllmRuntime>(m, "TllmRuntime")
+        .def(
+            "__init__",
+            [](tr::TllmRuntime* self, std::filesystem::path engine_path, float gpu_weights_percent = 1.0f,
+                bool use_shape_inference = true)
+            {
+                // Using default logger by passing nullptr
+                new (self)
+                    tr::TllmRuntime(tr::RawEngine(engine_path), nullptr, gpu_weights_percent, use_shape_inference);
+            },
+            nb::arg("engine_path"), nb::arg("gpu_weights_percent") = 1.0f, nb::arg("use_shape_inference") = true)
+        .def(
+            "__init__",
+            [](tr::TllmRuntime* self, nb::ndarray<nb::numpy, uint8_t> engine_buffer, float gpu_weights_percent = 1.0f,
+                bool use_shape_inference = true)
+            {
+                if (engine_buffer.ndim() != 1)
+                    throw std::runtime_error("Expected 1-D array for engine buffer");
+                new (self) tr::TllmRuntime(tr::RawEngine(engine_buffer.data(), engine_buffer.size()), nullptr,
+                    gpu_weights_percent, use_shape_inference);
+            },
+            nb::arg("engine_buffer"), nb::arg("gpu_weights_percent") = 1.0f, nb::arg("use_shape_inference") = true)
+        .def_prop_ro("num_contexts", &tr::TllmRuntime::getNbContexts)
+        .def_prop_ro("num_profiles", &tr::TllmRuntime::getNbProfiles)
+        .def("get_opt_profile_id", &tr::TllmRuntime::getOptProfileId, nb::arg("num_tokens"), nb::arg("split_points"))
+        .def("clear_contexts", &tr::TllmRuntime::clearContexts)
+        .def("execute_context", &tr::TllmRuntime::executeContext, nb::arg("context_id"))
+        .def_prop_ro("stream_ptr", &tr::TllmRuntime::getStreamPtr)
+        .def_prop_ro("buffer_manager",
+            static_cast<tr::BufferManager& (tr::TllmRuntime::*) ()>(&tr::TllmRuntime::getBufferManager))
+        .def("set_layer_profiler", &tr::TllmRuntime::setLayerProfiler)
+        .def("has_layer_profiler", &tr::TllmRuntime::hasLayerProfiler, nb::arg("context_id"))
+        .def_prop_ro("layer_profiler_info", &tr::TllmRuntime::getLayerProfileInfo)
+        .def("report_to_profiler", &tr::TllmRuntime::reportToProfiler, nb::arg("context_id"))
+        .def_prop_ro("logits_dtype_from_engine",
+            [](tr::TllmRuntime& self) { return self.getEngine().getTensorDataType("logits"); });
+
+    nb::class_<tr::decoder_batch::Request>(m, "Request")
+        .def(nb::init<tr::decoder_batch::Request::TensorConstPtr, tr::SizeType32, std::optional<tr::SizeType32>,
+                 std::optional<tr::SizeType32>>(),
+            nb::arg("ids"), nb::arg("input_len"), nb::arg("max_new_tokens") = std::nullopt,
+            nb::arg("end_id") = std::nullopt)
+        .def_rw("ids", &tr::decoder_batch::Request::ids)
+        .def_rw("input_len", &tr::decoder_batch::Request::inputLen)
+        .def_rw("max_new_tokens", &tr::decoder_batch::Request::maxNewTokens)
+        .def_rw("end_id", &tr::decoder_batch::Request::endId)
+        .def_rw("draft_logits", &tr::decoder_batch::Request::draftLogits)
+        .def_rw("embedding_bias", &tr::decoder_batch::Request::embeddingBias)
+        .def_rw("bad_words_list", &tr::decoder_batch::Request::badWordsList)
+        .def_rw("stop_words_list", &tr::decoder_batch::Request::stopWordsList)
+        .def_rw("generated_tokens_per_engine_step", &tr::decoder_batch::Request::generatedTokensPerEngineStep)
+        .def_rw("medusa_paths", &tr::decoder_batch::Request::medusaPaths)
+        .def_rw("medusa_tree_ids", &tr::decoder_batch::Request::medusaTreeIds)
+        .def_rw("lookahead_runtime_config", &tr::decoder_batch::Request::lookaheadRuntimeConfig);
+    nb::bind_vector<std::vector<tr::decoder_batch::Request>>(m, "RequestVector");
+
+    nb::class_<tr::decoder_batch::Input>(m, "DecoderBatchInput")
+        .def(nb::init<std::vector<std::vector<tr::ITensor::SharedConstPtr>>, tr::SizeType32>(), nb::arg("logits"),
+            nb::arg("max_decoding_engine_tokens"))
+        .def(nb::init<std::vector<tr::ITensor::SharedConstPtr>>(), nb::arg("logits"))
+        .def_rw("logits", &tr::decoder_batch::Input::logits)
+        .def_rw("max_decoder_steps", &tr::decoder_batch::Input::maxDecoderSteps)
+        .def_rw("batch_slots", &tr::decoder_batch::Input::batchSlots);
+
+    nb::class_<tr::LookaheadDecodingBuffers>(m, "LookaheadDecodingBuffers")
+        .def(nb::init<tr::SizeType32, tr::SizeType32, tr::BufferManager const&>(), nb::arg("max_num_sequences"),
+            nb::arg("max_tokens_per_step"), nb::arg("buffer_manager"))
+        .def_rw("generation_lengths", &tr::LookaheadDecodingBuffers::generationLengths)
+        .def_rw("position_offsets", &tr::LookaheadDecodingBuffers::positionOffsets)
+        .def_rw("packed_masks", &tr::LookaheadDecodingBuffers::packedMasks)
+        .def_rw("position_ids", &tr::LookaheadDecodingBuffers::positionIds);
+
+    nb::class_<tr::ExplicitDraftTokensBuffers::Inputs>(m, "ExplicitDraftTokensBuffersInputs")
+        .def("create", &tr::ExplicitDraftTokensBuffers::Inputs::create, nb::arg("max_num_sequences"),
+            nb::arg("runtime"), nb::arg("model_config"), nb::arg("world_config"))
+        .def_rw("temperatures", &tr::ExplicitDraftTokensBuffers::Inputs::temperatures)
+        .def_rw("position_ids_base", &tr::ExplicitDraftTokensBuffers::Inputs::positionIdsBase)
+        .def_rw("generation_lengths", &tr::ExplicitDraftTokensBuffers::Inputs::generationLengths)
+        .def_rw("random_data_sample", &tr::ExplicitDraftTokensBuffers::Inputs::randomDataSample)
+        .def_rw("random_data_validation", &tr::ExplicitDraftTokensBuffers::Inputs::randomDataValidation)
+        .def_rw("draft_tokens", &tr::ExplicitDraftTokensBuffers::Inputs::draftTokens)
+        .def_rw("draft_indices", &tr::ExplicitDraftTokensBuffers::Inputs::draftIndices)
+        .def_rw("draft_probs", &tr::ExplicitDraftTokensBuffers::Inputs::draftProbs)
+        .def_rw("packed_masks", &tr::ExplicitDraftTokensBuffers::Inputs::packedMasks)
+        .def_rw("position_ids", &tr::ExplicitDraftTokensBuffers::Inputs::positionIds)
+        .def_rw("max_gen_length_host", &tr::ExplicitDraftTokensBuffers::Inputs::maxGenLengthHost)
+        .def_rw("generation_lengths_host", &tr::ExplicitDraftTokensBuffers::Inputs::generationLengthsHost);
+
+    nb::class_<tr::DecodingInput>(m, "DecodingInput");
+    nb::class_<tr::DecodingOutput>(m, "DecodingOutput");
+
+    nb::class_<tr::CudaEvent>(m, "CudaEvent")
+        .def(nb::init<unsigned int>(), nb::arg("flags") = cudaEventDisableTiming)
+        .def("synchronize", &tr::CudaEvent::synchronize);
+
+    nb::class_<tr::IGptDecoder, PyIGptDecoder>(m, "IGptDecoder")
+        .def(
+            "setup",
+            [](tr::IGptDecoder& self, tr::SamplingConfig const& samplingConfig, size_t batchSize,
+                at::Tensor const& batchSlots, std::optional<tr::DecodingOutput> const& output = std::nullopt,
+                std::optional<nvinfer1::DataType> explicitDraftTokensDType = std::nullopt,
+                std::optional<std::vector<tr::ITensor::SharedConstPtr>> const& lookaheadPrompt = std::nullopt,
+                std::optional<std::vector<te::LookaheadDecodingConfig>> const& lookaheadAlgoConfigs = std::nullopt)
+            {
+                auto tensorPtrBatchSlots = tr::TorchView::of(batchSlots);
+                self.setup(samplingConfig, batchSize, std::move(tensorPtrBatchSlots), output, explicitDraftTokensDType,
+                    lookaheadPrompt, lookaheadAlgoConfigs);
+            },
+            nb::arg("sampling_config"), nb::arg("batch_size"), nb::arg("batch_slots"), nb::arg("output") = std::nullopt,
+            nb::arg("explicit_draft_tokens_d_type") = std::nullopt, nb::arg("lookahead_prompt") = std::nullopt,
+            nb::arg("lookahead_algo_configs") = std::nullopt);
+
+    nb::class_<tr::decoder::DecoderState>(m, "DecoderState")
+        .def(nb::init<>())
+        .def("setup", &tr::decoder::DecoderState::setup, nb::arg("max_batch_size"), nb::arg("max_beam_width"),
+            nb::arg("max_attention_window"), nb::arg("sink_token_length"), nb::arg("max_sequence_length"),
+            nb::arg("dtype"), nb::arg("model_config"), nb::arg("world_config"), nb::arg("buffer_manager"))
+        .def("setup_cache_indirection", &tr::decoder::DecoderState::setupCacheIndirection, nb::arg("max_batch_size"),
+            nb::arg("max_beam_width"), nb::arg("max_attention_window"), nb::arg("buffer_manager"))
+        .def("setup_speculative_decoding", &tr::decoder::DecoderState::setupSpeculativeDecoding,
+            nb::arg("speculative_decoding_mode"), nb::arg("max_tokens_per_engine_step"), nb::arg("dtype"),
+            nb::arg("model_config"), nb::arg("world_config"), nb::arg("buffer_manager"))
+        .def_prop_ro("joint_decoding_input", &tr::decoder::DecoderState::getJointDecodingInput)
+        .def_prop_ro("joint_decoding_output", &tr::decoder::DecoderState::getJointDecodingOutput)
+        .def_prop_ro("cache_indirection_input", &tr::decoder::DecoderState::getCacheIndirectionInput)
+        .def_prop_ro("cache_indirection_output", &tr::decoder::DecoderState::getCacheIndirectionOutput)
+        .def_prop_ro(
+            "sequence_lengths", nb::overload_cast<>(&tr::decoder::DecoderState::getSequenceLengths, nb::const_))
+        .def("get_sequence_lengths",
+            nb::overload_cast<tr::SizeType32>(&tr::decoder::DecoderState::getSequenceLengths, nb::const_),
+            nb::arg("batch_idx"))
+        .def_prop_ro("all_new_tokens", &tr::decoder::DecoderState::getAllNewTokens)
+        .def_prop_ro("finished_sum", &tr::decoder::DecoderState::getFinishedSum)
+        .def_prop_ro("finish_reasons", &tr::decoder::DecoderState::getFinishReasons)
+        .def_prop_ro("ids", nb::overload_cast<>(&tr::decoder::DecoderState::getIds, nb::const_))
+        .def("get_ids", nb::overload_cast<tr::SizeType32>(&tr::decoder::DecoderState::getIds, nb::const_),
+            nb::arg("batch_idx"))
+        .def_prop_ro("gathered_ids", nb::overload_cast<>(&tr::decoder::DecoderState::getGatheredIds, nb::const_))
+        .def("get_gathered_ids",
+            nb::overload_cast<tr::SizeType32>(&tr::decoder::DecoderState::getGatheredIds, nb::const_),
+            nb::arg("batch_idx"))
+        .def_prop_ro("parent_ids", &tr::decoder::DecoderState::getParentIds)
+        .def_prop_ro("cum_log_probs", nb::overload_cast<>(&tr::decoder::DecoderState::getCumLogProbs, nb::const_))
+        .def("get_cum_log_probs",
+            nb::overload_cast<tr::SizeType32>(&tr::decoder::DecoderState::getCumLogProbs, nb::const_),
+            nb::arg("batch_idx"))
+        .def_prop_ro("log_probs", nb::overload_cast<>(&tr::decoder::DecoderState::getLogProbs, nb::const_))
+        .def("get_log_probs", nb::overload_cast<tr::SizeType32>(&tr::decoder::DecoderState::getLogProbs, nb::const_),
+            nb::arg("batch_idx"))
+        .def_prop_ro("next_draft_tokens", &tr::decoder::DecoderState::getNextDraftTokens)
+        .def_prop_ro("prev_draft_tokens_lengths", &tr::decoder::DecoderState::getPrevDraftTokensLengths)
+        .def_prop_ro("next_draft_tokens_lengths", &tr::decoder::DecoderState::getNextDraftTokensLengths)
+        .def_prop_ro("accepted_lengths_cum_sum", &tr::decoder::DecoderState::getAcceptedLengthsCumSum)
+        .def_prop_ro("accepted_packed_paths", &tr::decoder::DecoderState::getAcceptedPackedPaths)
+        .def_prop_ro("finished_steps", &tr::decoder::DecoderState::getFinishedSteps)
+        .def_prop_ro("max_beam_width", &tr::decoder::DecoderState::getMaxBeamWidth)
+        .def_prop_ro("max_sequence_length", &tr::decoder::DecoderState::getMaxSequenceLength)
+        .def_prop_ro("max_decoding_decoder_tokens", &tr::decoder::DecoderState::getMaxDecodingDecoderTokens)
+        .def_prop_ro("max_decoding_engine_tokens", &tr::decoder::DecoderState::getMaxDecodingEngineTokens)
+        .def_prop_ro("num_decoding_engine_tokens",
+            nb::overload_cast<>(&tr::decoder::DecoderState::getNumDecodingEngineTokens, nb::const_))
+        .def("get_num_decoding_engine_tokens",
+            nb::overload_cast<tr::SizeType32>(&tr::decoder::DecoderState::getNumDecodingEngineTokens, nb::const_),
+            nb::arg("batch_idx"))
+        .def("set_num_decoding_engine_tokens", &tr::decoder::DecoderState::setNumDecodingEngineTokens,
+            nb::arg("batch_idx"), nb::arg("num_tokens"))
+        .def_prop_ro("speculative_decoding_mode", &tr::decoder::DecoderState::getSpeculativeDecodingMode)
+        .def_prop_rw("generation_steps", &tr::decoder::DecoderState::getGenerationSteps,
+            &tr::decoder::DecoderState::setGenerationSteps);
+
+    nb::class_<tr::GptDecoderBatched>(m, "GptDecoderBatched")
+        .def(nb::init<tr::GptDecoderBatched::CudaStreamPtr>(), nb::arg("stream"))
+        .def("setup", &tr::GptDecoderBatched::setup, nb::arg("mode"), nb::arg("max_batch_size"),
+            nb::arg("max_beam_width"), nb::arg("dtype"), nb::arg("model_config"), nb::arg("world_config"))
+        .def("forward_async", &tr::GptDecoderBatched::forwardAsync, nb::arg("output"), nb::arg("input"))
+        .def("underlying_decoder", &tr::GptDecoderBatched::getUnderlyingDecoder, nb::rv_policy::reference)
+        .def("finalize", &tr::GptDecoderBatched::finalize, nb::arg("decoder_state"), nb::arg("batch_idx"),
+            nb::arg("sampling_config"), nb::arg("streaming"))
+        .def_prop_ro(
+            "decoder_stream",
+            [](tr::GptDecoderBatched& self) -> tr::CudaStream const& { return *self.getDecoderStream(); },
+            nb::rv_policy::reference);
+
+    m.def(
+        "lamport_initialize_all",
+        [](intptr_t buffer_0, intptr_t buffer_1, intptr_t buffer_2, size_t size)
+        {
+            tr::lamportInitializeAll(reinterpret_cast<void*>(buffer_0), reinterpret_cast<void*>(buffer_1),
+                reinterpret_cast<void*>(buffer_2), size);
+        },
+        "Lamport initialize all buffers");
+    m.def(
+        "lamport_initialize",
+        [](intptr_t buffer, size_t size)
+        { tensorrt_llm::kernels::ar_fusion::lamport_initialize(reinterpret_cast<void*>(buffer), size, 0); },
+        "Lmaport initialize buffer");
+    m.def(
+        "delay_kernel",
+        [](int64_t delay_micro_secs, nb::object py_stream)
+        {
+            // Get the raw stream handle from PyTorch stream object
+            auto stream_ptr = nb::cast<int64_t>(py_stream.attr("cuda_stream"));
+            cudaStream_t stream = reinterpret_cast<cudaStream_t>(stream_ptr);
+            tensorrt_llm::kernels::invokeDelayStreamKernel(delay_micro_secs, stream);
+        },
+        "Delay kernel launch on the default stream");
+    m.def(
+        "max_workspace_size_lowprecision",
+        [](int32_t tp_size) { return tensorrt_llm::kernels::max_workspace_size_lowprecision(tp_size); },
+        "Calculate the maximum workspace size needed for low precision all-reduce operations");
+
+    nb::class_<tensorrt_llm::runtime::McastGPUBuffer>(m, "McastGPUBuffer")
+        .def(nb::init<size_t, uint32_t, uint32_t, at::Device, bool>())
+        .def("get_uc_buffer", &tensorrt_llm::runtime::McastGPUBuffer::getUCBuffer)
+        .def("get_mc_buffer", &tensorrt_llm::runtime::McastGPUBuffer::getMCBuffer);
+
+    nb::enum_<tensorrt_llm::kernels::AllReduceFusionOp>(m, "AllReduceFusionOp")
+        .value("NONE", tensorrt_llm::kernels::AllReduceFusionOp::NONE)
+        .value("RESIDUAL_RMS_NORM", tensorrt_llm::kernels::AllReduceFusionOp::RESIDUAL_RMS_NORM)
+        .value("LAST_PROCESS_FOR_UB", tensorrt_llm::kernels::AllReduceFusionOp::LAST_PROCESS_FOR_UB)
+        .value("RESIDUAL_RMS_PREPOST_NORM", tensorrt_llm::kernels::AllReduceFusionOp::RESIDUAL_RMS_PREPOST_NORM)
+        .value("RESIDUAL_RMS_NORM_QUANT_FP8", tensorrt_llm::kernels::AllReduceFusionOp::RESIDUAL_RMS_NORM_QUANT_FP8)
+        .value("RESIDUAL_RMS_NORM_QUANT_NVFP4", tensorrt_llm::kernels::AllReduceFusionOp::RESIDUAL_RMS_NORM_QUANT_NVFP4)
+        .value("RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4",
+            tensorrt_llm::kernels::AllReduceFusionOp::RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4)
+        .value("RESIDUAL_RMS_NORM_OUT_QUANT_FP8",
+            tensorrt_llm::kernels::AllReduceFusionOp::RESIDUAL_RMS_NORM_OUT_QUANT_FP8);
+
+    nb::enum_<tensorrt_llm::kernels::AllReduceStrategyType>(m, "AllReduceStrategy")
+        .value("NCCL", tensorrt_llm::kernels::AllReduceStrategyType::NCCL)
+        .value("MIN_LATENCY", tensorrt_llm::kernels::AllReduceStrategyType::MIN_LATENCY)
+        .value("AUTO", tensorrt_llm::kernels::AllReduceStrategyType::AUTO)
+        .value("UB", tensorrt_llm::kernels::AllReduceStrategyType::UB)
+        .value("ONESHOT", tensorrt_llm::kernels::AllReduceStrategyType::ONESHOT)
+        .value("TWOSHOT", tensorrt_llm::kernels::AllReduceStrategyType::TWOSHOT);
+
+    // Initialize MoeLoadBalancer bindings
+    initMoeBindings(m);
+}
+
+void initBindingsEarly(nb::module_& m)
+{
+    nb::class_<tr::SpeculativeDecodingMode>(m, "SpeculativeDecodingMode")
+        .def(nb::init<tr::SpeculativeDecodingMode::UnderlyingType>(), nb::arg("state"))
+        .def_static("NoneType", &tr::SpeculativeDecodingMode::None)
+        .def_static("DraftTokensExternal", &tr::SpeculativeDecodingMode::DraftTokensExternal)
+        .def_static("Medusa", &tr::SpeculativeDecodingMode::Medusa)
+        .def_static("Eagle", &tr::SpeculativeDecodingMode::Eagle)
+        .def_static("LookaheadDecoding", &tr::SpeculativeDecodingMode::LookaheadDecoding)
+        .def_static("ExplicitDraftTokens", &tr::SpeculativeDecodingMode::ExplicitDraftTokens)
+        .def_prop_ro("is_none", &tr::SpeculativeDecodingMode::isNone)
+        .def_prop_ro("is_draft_tokens_external", &tr::SpeculativeDecodingMode::isDraftTokensExternal)
+        .def_prop_ro("is_medusa", &tr::SpeculativeDecodingMode::isMedusa)
+        .def_prop_ro("is_eagle", &tr::SpeculativeDecodingMode::isEagle)
+        .def_prop_ro("is_lookahead_decoding", &tr::SpeculativeDecodingMode::isLookaheadDecoding)
+        .def_prop_ro("is_explicit_draft_tokens", &tr::SpeculativeDecodingMode::isExplicitDraftTokens)
+        .def_prop_ro("updates_position_ids", &tr::SpeculativeDecodingMode::updatesPositionIds)
+        .def_prop_ro("requires_attention_mask", &tr::SpeculativeDecodingMode::requiresAttentionMask)
+        .def_prop_ro("predicts_draft_tokens", &tr::SpeculativeDecodingMode::predictsDraftTokens)
+        .def_prop_ro("needs_kv_cache_rewind", &tr::SpeculativeDecodingMode::needsKVCacheRewind)
+        .def_prop_ro("variable_draft_length", &tr::SpeculativeDecodingMode::variableDraftLength)
+        .def_prop_ro("has_draft_logits", &tr::SpeculativeDecodingMode::hasDraftLogits)
+        .def_prop_ro("needs_decoder_prologue", &tr::SpeculativeDecodingMode::needsDecoderPrologue);
+}
+} // namespace tensorrt_llm::nanobind::runtime
diff --git a/cpp/tensorrt_llm/nanobind/runtime/bindings.h b/cpp/tensorrt_llm/nanobind/runtime/bindings.h
new file mode 100644
index 00000000000..410dac80b05
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/runtime/bindings.h
@@ -0,0 +1,30 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <nanobind/nanobind.h>
+
+namespace nb = nanobind;
+
+namespace tensorrt_llm::nanobind::runtime
+{
+
+void initBindings(nb::module_& m);
+void initBindingsEarly(nb::module_& m);
+
+} // namespace tensorrt_llm::nanobind::runtime
diff --git a/cpp/tensorrt_llm/nanobind/runtime/moeBindings.cpp b/cpp/tensorrt_llm/nanobind/runtime/moeBindings.cpp
new file mode 100644
index 00000000000..c26fa84b661
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/runtime/moeBindings.cpp
@@ -0,0 +1,124 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "moeBindings.h"
+#include "tensorrt_llm/nanobind/common/customCasters.h"
+#include "tensorrt_llm/runtime/moeLoadBalancer/hostAccessibleDeviceAllocator.h"
+#include "tensorrt_llm/runtime/moeLoadBalancer/moeLoadBalancer.h"
+#include <nanobind/nanobind.h>
+#include <nanobind/stl/shared_ptr.h>
+#include <vector>
+
+namespace nb = nanobind;
+namespace tr = tensorrt_llm::runtime;
+namespace tk = tensorrt_llm::kernels;
+
+namespace tensorrt_llm::nanobind::runtime
+{
+
+void pyDoReplication(tk::MoeLoadBalanceMetaInfo const& metaInfo, std::vector<float>& expertLoadFactor,
+    tr::MoePlacementCpuInfo* cpuPlacement)
+{
+    TLLM_CHECK_WITH_INFO(
+        metaInfo.expertCount == expertLoadFactor.size(), "expert_count and expert_load_factor size mismatch");
+    tr::doReplication(metaInfo, expertLoadFactor.data(), cpuPlacement);
+};
+
+void pyDoPlacement(tk::MoeLoadBalanceMetaInfo const& metaInfo, std::vector<float>& expertLoadFactor,
+    tr::MoePlacementCpuInfo* cpuPlacement)
+{
+    TLLM_CHECK_WITH_INFO(
+        metaInfo.expertCount == expertLoadFactor.size(), "expert_count and expert_load_factor size mismatch");
+    tr::doPlacement(metaInfo, expertLoadFactor.data(), cpuPlacement);
+};
+
+void initMoeBindings(nb::module_& m)
+{
+    // Bind MoeWeight struct
+    nb::class_<tr::MoeWeight>(m, "MoeWeight")
+        .def(nb::init<>())
+        .def_prop_rw("weight_ptr", &tr::MoeWeight::getWeightPtr, &tr::MoeWeight::setWeightPtr)
+        .def_rw("height", &tr::MoeWeight::mHeight)
+        .def_rw("width", &tr::MoeWeight::mWidth)
+        .def_rw("pitch", &tr::MoeWeight::mPitch)
+        .def("__repr__",
+            [](tr::MoeWeight const& self)
+            {
+                return "<MoeWeight ptr=" + std::to_string(self.getWeightPtr())
+                    + " height=" + std::to_string(self.mHeight) + " width=" + std::to_string(self.mWidth)
+                    + " pitch=" + std::to_string(self.mPitch) + ">";
+            });
+
+    // Bind MoeLoadBalanceMetaInfo struct
+    nb::class_<tk::MoeLoadBalanceMetaInfo>(m, "MoeLoadBalanceMetaInfo")
+        .def(nb::init<int, int, int, int, int>(), nb::arg("expert_count"), nb::arg("top_k"), nb::arg("ep_rank"),
+            nb::arg("ep_size"), nb::arg("slot_count_per_rank"))
+        .def_rw("expert_count", &tk::MoeLoadBalanceMetaInfo::expertCount)
+        .def_rw("top_k", &tk::MoeLoadBalanceMetaInfo::topK)
+        .def_rw("ep_rank", &tk::MoeLoadBalanceMetaInfo::epRank)
+        .def_rw("ep_size", &tk::MoeLoadBalanceMetaInfo::epSize)
+        .def_rw("slot_count_per_rank", &tk::MoeLoadBalanceMetaInfo::slotCountPerRank);
+
+    // Bind MoePlacementCpuInfo struct
+    nb::class_<tr::MoePlacementCpuInfo>(m, "MoePlacementCpuInfo")
+        .def(nb::init<>())
+        .def_rw("expert_replica_count", &tr::MoePlacementCpuInfo::expertReplicaCount)
+        .def_rw("rank_expert_ids", &tr::MoePlacementCpuInfo::rankExpertIds);
+
+    // Bind SingleLayerMoeLoadBalancer class
+    nb::class_<tr::SingleLayerMoeLoadBalancer>(m, "SingleLayerMoeLoadBalancer")
+        .def("add_single_weight_slot", &tr::SingleLayerMoeLoadBalancer::addSingleWeightSlot, nb::arg("slot_id"),
+            nb::arg("name"), nb::arg("weight_slot"), "Add a single weight slot for a specific slot ID")
+        .def("add_single_host_weight", &tr::SingleLayerMoeLoadBalancer::addSingleHostWeight, nb::arg("expert_id"),
+            nb::arg("name"), nb::arg("host_weight"), "Add a single host weight for a specific expert ID")
+        .def("set_initial_weight_assignments", &tr::SingleLayerMoeLoadBalancer::setInitialWeightAssignments,
+            nb::arg("initial_weight_assignments"), "Set initial weight assignments for each slot")
+        .def("get_pointer", &tr::SingleLayerMoeLoadBalancer::getSelfPtr,
+            "Get the pointer of the SingleLayerMoeLoadBalancer")
+        .def("get_layer_id", &tr::SingleLayerMoeLoadBalancer::getLayerId,
+            "Get the layer id of the SingleLayerMoeLoadBalancer");
+
+    // Bind MoeLoadBalancer class
+    nb::class_<tr::MoeLoadBalancer>(m, "MoeLoadBalancer")
+        .def(nb::init<int, int, int>(), nb::arg("ep_rank"), nb::arg("ep_size"), nb::arg("layer_updates_per_iter"),
+            "Initialize the MoeLoadBalancer with the specified expert parallel rank, size, and update frequency")
+        .def("set_use_gpu_memcpy", &tr::MoeLoadBalancer::setUseGpuMemcpy, nb::arg("use_gpu_memcpy"),
+            "Set whether to use GPU memcpy for weight updates")
+        .def("add_layer", &tr::MoeLoadBalancer::AddLayer, nb::arg("expert_count"), nb::arg("top_k"),
+            nb::arg("slot_count_per_rank"), "Add a new MOE layer to the load balancer")
+        .def("finalize_model", &tr::MoeLoadBalancer::finalizeModel,
+            "Finalize the model structure, must be called after all layers are added")
+        .def("set_warm_up_iter_count", &tr::MoeLoadBalancer::setWarmUpIterCount, nb::arg("iter_count"),
+            "Set the number of warm-up iterations")
+        .def("start_iter", &tr::MoeLoadBalancer::startIter, nb::arg("iter_id"), nb::arg("enable_statistic"),
+            nb::arg("enable_update_weights"), "Start a new iteration with the given ID and settings")
+        .def("end_iter", &tr::MoeLoadBalancer::endIter, nb::arg("iter_id"), "End the iteration with the given ID")
+        .def("shutdown", &tr::MoeLoadBalancer::shutdown, "Shutdown the load balancer and clean up resources");
+
+    m.def("is_host_accessible_device_memory_supported", &tr::HostAccessibleDeviceAllocator::isSupported,
+        "If current system support host accessible device memory");
+
+    // Bind do_replication function for testing
+    m.def("do_replication", &pyDoReplication, nb::arg("meta_info"), nb::arg("expert_load_factor"),
+        nb::arg("cpu_placement"), "Do replication");
+
+    // Bind do_placement function for testing
+    m.def("do_placement", &pyDoPlacement, nb::arg("meta_info"), nb::arg("expert_load_factor"), nb::arg("cpu_placement"),
+        "Do placement");
+}
+
+} // namespace tensorrt_llm::nanobind::runtime
diff --git a/cpp/tensorrt_llm/nanobind/runtime/moeBindings.h b/cpp/tensorrt_llm/nanobind/runtime/moeBindings.h
new file mode 100644
index 00000000000..73b9a3ceec8
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/runtime/moeBindings.h
@@ -0,0 +1,29 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <nanobind/nanobind.h>
+
+namespace nb = nanobind;
+
+namespace tensorrt_llm::nanobind::runtime
+{
+
+void initMoeBindings(nb::module_& m);
+
+} // namespace tensorrt_llm::nanobind::runtime
diff --git a/cpp/tensorrt_llm/nanobind/testing/modelSpecBinding.cpp b/cpp/tensorrt_llm/nanobind/testing/modelSpecBinding.cpp
new file mode 100644
index 00000000000..caef94c5def
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/testing/modelSpecBinding.cpp
@@ -0,0 +1,87 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "modelSpecBinding.h"
+#include "tensorrt_llm/nanobind/common/customCasters.h"
+#include "tensorrt_llm/testing/modelSpec.h"
+
+#include <nanobind/nanobind.h>
+
+namespace nb = nanobind;
+using tensorrt_llm::testing::ModelSpec;
+using tensorrt_llm::testing::KVCacheType;
+using tensorrt_llm::testing::QuantMethod;
+using tensorrt_llm::testing::OutputContentType;
+
+namespace tensorrt_llm::nanobind::testing
+{
+
+void initBindings(nb::module_& m)
+{
+    nb::enum_<QuantMethod>(m, "QuantMethod", nb::is_arithmetic(), "Quantization Method")
+        .value("NONE", QuantMethod::kNONE, "No Quantization")
+        .value("SMOOTH_QUANT", QuantMethod::kSMOOTH_QUANT, "Smooth Quantization");
+
+    nb::enum_<OutputContentType>(m, "OutputContentType", nb::is_arithmetic(), "Output Content Type")
+        .value("NONE", OutputContentType::kNONE, "No Output Content")
+        .value("CONTEXT_LOGITS", OutputContentType::kCONTEXT_LOGITS, "Context Logits")
+        .value("GENERATION_LOGITS", OutputContentType::kGENERATION_LOGITS, "Generation Logits")
+        .value("LOG_PROBS", OutputContentType::kLOG_PROBS, "Log Probs")
+        .value("CUM_LOG_PROBS", OutputContentType::kCUM_LOG_PROBS, "Cumulative Log");
+
+    nb::class_<ModelSpec>(m, "ModelSpec")
+        .def(nb::init<std::string const&, nvinfer1::DataType>())
+        .def("use_gpt_plugin", &ModelSpec::useGptAttentionPlugin, nb::rv_policy::reference_internal)
+        .def("use_packed_input", &ModelSpec::usePackedInput, nb::rv_policy::reference_internal)
+        .def("set_kv_cache_type", &ModelSpec::setKVCacheType, nb::rv_policy::reference_internal)
+        .def("use_decoder_per_request", &ModelSpec::useDecoderPerRequest, nb::rv_policy::reference_internal)
+        .def("use_tensor_parallelism", &ModelSpec::useTensorParallelism, nb::rv_policy::reference_internal)
+        .def("use_pipeline_parallelism", &ModelSpec::usePipelineParallelism, nb::rv_policy::reference_internal)
+        .def("use_context_parallelism", &ModelSpec::useContextParallelism, nb::rv_policy::reference_internal)
+        .def("set_draft_tokens", &ModelSpec::setDraftTokens, nb::rv_policy::reference_internal)
+        .def("use_accept_by_logits", &ModelSpec::useAcceptByLogits, nb::rv_policy::reference_internal)
+        .def("use_mamba_plugin", &ModelSpec::useMambaPlugin, nb::rv_policy::reference_internal)
+        .def("gather_logits", &ModelSpec::gatherLogits, nb::rv_policy::reference_internal)
+        .def("replace_logits", &ModelSpec::replaceLogits, nb::rv_policy::reference_internal)
+        .def("return_log_probs", &ModelSpec::returnLogProbs, nb::rv_policy::reference_internal)
+        .def("smoke_test", &ModelSpec::smokeTest, nb::rv_policy::reference_internal)
+        .def("use_medusa", &ModelSpec::useMedusa, nb::rv_policy::reference_internal)
+        .def("use_eagle", &ModelSpec::useEagle, nb::rv_policy::reference_internal)
+        .def("use_lookahead_decoding", &ModelSpec::useLookaheadDecoding, nb::rv_policy::reference_internal)
+        .def("use_explicit_draft_tokens_decoding", &ModelSpec::useExplicitDraftTokensDecoding,
+            nb::rv_policy::reference_internal)
+        .def("use_draft_tokens_external_decoding", &ModelSpec::useDraftTokensExternalDecoding,
+            nb::rv_policy::reference_internal)
+        .def("use_logits", &ModelSpec::useLogits)
+        .def("use_multiple_profiles", &ModelSpec::useMultipleProfiles, nb::rv_policy::reference_internal)
+        .def("set_max_input_length", &ModelSpec::setMaxInputLength, nb::rv_policy::reference_internal)
+        .def("set_max_output_length", &ModelSpec::setMaxOutputLength, nb::rv_policy::reference_internal)
+        .def("set_quant_method", &ModelSpec::setQuantMethod, nb::rv_policy::reference_internal)
+        .def("use_lora_plugin", &ModelSpec::useLoraPlugin, nb::rv_policy::reference_internal)
+        .def("get_input_file", &ModelSpec::getInputFile)
+        .def("get_model_path", &ModelSpec::getModelPath)
+        .def("get_results_file", &ModelSpec::getResultsFile)
+        .def("get_generation_logits_file", &ModelSpec::getGenerationLogitsFile)
+        .def("get_context_logits_file", &ModelSpec::getContextLogitsFile)
+        .def("get_cum_log_probs_file", &ModelSpec::getCumLogProbsFile)
+        .def("get_log_probs_file", &ModelSpec::getLogProbsFile)
+        .def("enable_context_fmha_fp32_acc", &ModelSpec::enableContextFMHAFp32Acc, nb::rv_policy::reference_internal)
+        .def("get_enable_context_fmha_fp32_acc", &ModelSpec::getEnableContextFMHAFp32Acc)
+        .def("__copy__", [](ModelSpec const& self) { return ModelSpec(self); });
+}
+
+} // namespace tensorrt_llm::nanobind::testing
diff --git a/cpp/tensorrt_llm/nanobind/testing/modelSpecBinding.h b/cpp/tensorrt_llm/nanobind/testing/modelSpecBinding.h
new file mode 100644
index 00000000000..1aababc6ff8
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/testing/modelSpecBinding.h
@@ -0,0 +1,29 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <nanobind/nanobind.h>
+
+namespace nb = nanobind;
+
+namespace tensorrt_llm::nanobind::testing
+{
+
+void initBindings(nb::module_& m);
+
+} // namespace tensorrt_llm::nanobind::testing
diff --git a/cpp/tensorrt_llm/nanobind/userbuffers/bindings.cpp b/cpp/tensorrt_llm/nanobind/userbuffers/bindings.cpp
new file mode 100644
index 00000000000..82e0d0a1f0c
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/userbuffers/bindings.cpp
@@ -0,0 +1,47 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "bindings.h"
+#include "tensorrt_llm/kernels/userbuffers/ub_interface.h"
+#include "tensorrt_llm/kernels/userbuffers/userbuffersManager.h"
+#include "tensorrt_llm/nanobind/common/customCasters.h"
+#include <nanobind/nanobind.h>
+
+namespace nb = nanobind;
+namespace tub = tensorrt_llm::runtime::ub;
+
+namespace tensorrt_llm::kernels::userbuffers
+{
+
+void UserBufferBindings::initBindings(nb::module_& m)
+{
+    nb::class_<tub::UBBuffer>(m, "UBBuffer")
+        .def_ro("size", &tub::UBBuffer::size)
+        .def_prop_ro("addr", [](tub::UBBuffer& self) { return reinterpret_cast<intptr_t>(self.addr); })
+        .def_ro("handle", &tub::UBBuffer::handle)
+        .def("invalid", &tub::UBBuffer::invalid);
+
+    m.def("ub_initialize", [](int tp_size) { tub::ub_initialize(tp_size); });
+    m.def("ub_is_initialized", &tub::ub_is_initialized);
+    m.def("ub_allocate", [](size_t bytes) { return tub::ub_allocate(bytes); });
+    m.def("ub_deallocate", [](intptr_t addr) { return tub::ub_deallocate(reinterpret_cast<void*>(addr)); });
+    m.def("ub_get", &tub::ub_get);
+    m.def("ub_supported", &tub::ub_supported);
+
+    m.def("initialize_userbuffers_manager", &tub::initialize_userbuffers_manager);
+}
+} // namespace tensorrt_llm::kernels::userbuffers
diff --git a/cpp/tensorrt_llm/nanobind/userbuffers/bindings.h b/cpp/tensorrt_llm/nanobind/userbuffers/bindings.h
new file mode 100644
index 00000000000..15728bf6c1d
--- /dev/null
+++ b/cpp/tensorrt_llm/nanobind/userbuffers/bindings.h
@@ -0,0 +1,30 @@
+/*
+ * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <nanobind/nanobind.h>
+namespace nb = nanobind;
+
+namespace tensorrt_llm::kernels::userbuffers
+{
+class UserBufferBindings
+{
+public:
+    static void initBindings(nb::module_& m);
+};
+} // namespace tensorrt_llm::kernels::userbuffers
diff --git a/cpp/tensorrt_llm/pybind/bindings.cpp b/cpp/tensorrt_llm/pybind/bindings.cpp
index 1a5841d4b7a..962071c4857 100644
--- a/cpp/tensorrt_llm/pybind/bindings.cpp
+++ b/cpp/tensorrt_llm/pybind/bindings.cpp
@@ -170,7 +170,7 @@ PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m)
         .value("CONTINUOUS", tr::ModelConfig::KVCacheType::kCONTINUOUS)
         .value("PAGED", tr::ModelConfig::KVCacheType::kPAGED)
         .value("DISABLED", tr::ModelConfig::KVCacheType::kDISABLED)
-        .def(py::init(&tr::ModelConfig::KVCacheTypeFromString));
+        .def("from_string", &tr::ModelConfig::KVCacheTypeFromString);
 
     py::enum_<tr::ModelConfig::LayerType>(m, "LayerType")
         .value("ATTENTION", tr::ModelConfig::LayerType::kATTENTION)
diff --git a/cpp/tensorrt_llm/pybind/executor/bindings.cpp b/cpp/tensorrt_llm/pybind/executor/bindings.cpp
index d09157e1a8b..a8f6aaef73d 100644
--- a/cpp/tensorrt_llm/pybind/executor/bindings.cpp
+++ b/cpp/tensorrt_llm/pybind/executor/bindings.cpp
@@ -244,7 +244,17 @@ void initBindings(pybind11::module_& m)
 
     py::class_<tle::KVCacheEventManager, std::shared_ptr<tle::KVCacheEventManager>>(
         executor_kv_cache, "KVCacheEventManager")
-        .def("get_latest_events", &tle::KVCacheEventManager::getLatestEvents, py::arg("timeout") = std::nullopt);
+        .def(
+            "get_latest_events",
+            [](tle::KVCacheEventManager& self, std::optional<double> timeout_ms = std::nullopt)
+            {
+                if (timeout_ms)
+                {
+                    return self.getLatestEvents(std::chrono::milliseconds(static_cast<int64_t>(*timeout_ms)));
+                }
+                return self.getLatestEvents(std::nullopt);
+            },
+            py::arg("timeout_ms") = std::nullopt);
 
     tensorrt_llm::pybind::executor::initRequestBindings(m);
     tensorrt_llm::pybind::executor::initConfigBindings(m);
diff --git a/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp b/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp
index bc0d997e337..1153ca13a8e 100644
--- a/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp
+++ b/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp
@@ -336,7 +336,7 @@ void initConfigBindings(pybind11::module_& m)
             throw std::runtime_error("Invalid extendedRuntimePerfKnobConfig state!");
         }
         return tle::ExtendedRuntimePerfKnobConfig(
-            state[0].cast<bool>(), state[1].cast<bool>(), state[2].cast<bool>(), state[2].cast<SizeType32>());
+            state[0].cast<bool>(), state[1].cast<bool>(), state[2].cast<bool>(), state[3].cast<SizeType32>());
     };
     auto extendedRuntimePerfKnobConfigGetstate = [](tle::ExtendedRuntimePerfKnobConfig const& self)
     {
diff --git a/examples/models/core/llama/summarize_long.py b/examples/models/core/llama/summarize_long.py
index 9f127bc32a6..cee2e07fdd5 100644
--- a/examples/models/core/llama/summarize_long.py
+++ b/examples/models/core/llama/summarize_long.py
@@ -97,7 +97,7 @@ def TRTLLaMA(args, config):
     quantization_config = pretrained_config['quantization']
 
     build_config = config['build_config']
-    kv_cache_type = KVCacheType(build_config['kv_cache_type'])
+    kv_cache_type = KVCacheType.from_string(build_config['kv_cache_type'])
     plugin_config = build_config['plugin_config']
 
     dtype = pretrained_config['dtype']
diff --git a/examples/models/core/qwen2audio/run.py b/examples/models/core/qwen2audio/run.py
index e0d495a67f8..93e161c7e08 100644
--- a/examples/models/core/qwen2audio/run.py
+++ b/examples/models/core/qwen2audio/run.py
@@ -122,7 +122,8 @@ def get_model(self):
         num_kv_heads = config["pretrained_config"].get("num_key_value_heads",
                                                        num_heads)
         if "kv_cache_type" in config["build_config"]:
-            kv_cache_type = KVCacheType(config["build_config"]["kv_cache_type"])
+            kv_cache_type = KVCacheType.from_string(
+                config["build_config"]["kv_cache_type"])
         else:
             kv_cache_type = KVCacheType.CONTINUOUS
 
diff --git a/examples/models/core/qwenvl/run.py b/examples/models/core/qwenvl/run.py
index a04c2b142e3..06ce341a9a0 100644
--- a/examples/models/core/qwenvl/run.py
+++ b/examples/models/core/qwenvl/run.py
@@ -118,7 +118,8 @@ def get_model(self):
         num_kv_heads = config["pretrained_config"].get("num_key_value_heads",
                                                        num_heads)
         if "kv_cache_type" in config["build_config"]:
-            kv_cache_type = KVCacheType(config["build_config"]["kv_cache_type"])
+            kv_cache_type = KVCacheType.from_string(
+                config["build_config"]["kv_cache_type"])
         else:
             kv_cache_type = KVCacheType.CONTINUOUS
 
diff --git a/jenkins/Build.groovy b/jenkins/Build.groovy
index bb8fd7816ce..77e12ee5100 100644
--- a/jenkins/Build.groovy
+++ b/jenkins/Build.groovy
@@ -47,6 +47,12 @@ CONFIG_LINUX_AARCH64 = "linux_aarch64"
 @Field
 def CONFIG_LINUX_AARCH64_LLVM = "linux_aarch64_LLVM"
 
+@Field
+def CONFIG_LINUX_X86_64_NANOBIND = "linux_x86_64_Nanobind"
+
+@Field
+def CONFIG_LINUX_AARCH64_NANOBIND = "linux_aarch64_Nanobind"
+
 @Field
 def BUILD_CONFIGS = [
   // Vanilla TARNAME is used for packaging in runLLMPackage
@@ -56,6 +62,11 @@ def BUILD_CONFIGS = [
     (TARNAME) : "TensorRT-LLM.tar.gz",
     (WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;120-real",
   ],
+  (CONFIG_LINUX_X86_64_NANOBIND) : [
+    (WHEEL_EXTRA_ARGS) : "--binding_type nanobind --extra-cmake-vars ENABLE_MULTI_DEVICE=1 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --micro_benchmarks",
+    (TARNAME) : "nanobind-TensorRT-LLM.tar.gz",
+    (WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;120-real",
+  ],
   (CONFIG_LINUX_X86_64_SINGLE_DEVICE) : [
     (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars ENABLE_MULTI_DEVICE=0 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars ENABLE_UCX=0 --micro_benchmarks",
     (TARNAME) : "single-device-TensorRT-LLM.tar.gz",
@@ -71,6 +82,11 @@ def BUILD_CONFIGS = [
     (TARNAME) : "TensorRT-LLM-GH200.tar.gz",
     (WHEEL_ARCHS): "90-real;100-real;120-real",
   ],
+  (CONFIG_LINUX_AARCH64_NANOBIND): [
+    (WHEEL_EXTRA_ARGS) : "--binding_type nanobind --extra-cmake-vars WARNING_IS_ERROR=ON",
+    (TARNAME) : "nanobind-TensorRT-LLM-GH200.tar.gz",
+    (WHEEL_ARCHS): "90-real;100-real;120-real",
+  ],
   (CONFIG_LINUX_AARCH64_LLVM) : [
     (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars WARNING_IS_ERROR=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CUDA_HOST_COMPILER=clang -DCMAKE_LINKER_TYPE=LLD",
     (TARNAME) : "llvm-TensorRT-LLM-GH200.tar.gz",
@@ -523,6 +539,8 @@ def launchStages(pipeline, cpu_arch, enableFailFast, globalVars)
             pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64 : CONFIG_LINUX_X86_64_VANILLA),
         "Build TRT-LLM LLVM": [LLM_DOCKER_IMAGE] + prepareLLMBuild(
             pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64_LLVM : CONFIG_LINUX_X86_64_LLVM),
+        "Build TRT-LLM Nanobind": [LLM_DOCKER_IMAGE] + prepareLLMBuild(
+            pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64_NANOBIND : CONFIG_LINUX_X86_64_NANOBIND),
     ]
 
     if (cpu_arch == X86_64_TRIPLE) {
diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index 6f6ae7c1186..35e7140ebda 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -64,6 +64,9 @@ def LLVM_CONFIG = "LLVM"
 @Field
 LINUX_AARCH64_CONFIG = "linux_aarch64"
 
+@Field
+def NANOBIND_CONFIG = "Nanobind"
+
 @Field
 def BUILD_CONFIGS = [
   // Vanilla TARNAME is used for packaging in runLLMPackage
@@ -71,6 +74,7 @@ def BUILD_CONFIGS = [
   (SINGLE_DEVICE_CONFIG) : [(TARNAME) : "single-device-TensorRT-LLM.tar.gz"],
   (LLVM_CONFIG) : [(TARNAME) : "llvm-TensorRT-LLM.tar.gz"],
   (LINUX_AARCH64_CONFIG) : [(TARNAME) : "TensorRT-LLM-GH200.tar.gz"],
+  (NANOBIND_CONFIG) : [(TARNAME) : "nanobind-TensorRT-LLM.tar.gz"],
 ]
 
 // TODO: Move common variables to an unified location
@@ -1724,6 +1728,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         "A10-TensorRT-4": ["a10", "l0_a10", 4, 6],
         "A10-TensorRT-5": ["a10", "l0_a10", 5, 6],
         "A10-TensorRT-6": ["a10", "l0_a10", 6, 6],
+        "A10-Nanobind": ["a10", "l0_a10_nanobind", 1, 1],
         "A30-Triton-1": ["a30", "l0_a30", 1, 1],
         "A30-PyTorch-1": ["a30", "l0_a30", 1, 2],
         "A30-PyTorch-2": ["a30", "l0_a30", 2, 2],
@@ -1800,6 +1805,9 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         if (key.contains("llvm")) {
             config = LLVM_CONFIG
         }
+        if (key.contains("Nanobind")) {
+            config = NANOBIND_CONFIG
+        }
         runLLMTestlistOnPlatform(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3])
     }]]}
     fullSet = parallelJobs.keySet()
diff --git a/tensorrt_llm/builder.py b/tensorrt_llm/builder.py
index e2dc543ac42..11d528a853d 100644
--- a/tensorrt_llm/builder.py
+++ b/tensorrt_llm/builder.py
@@ -593,7 +593,7 @@ def from_dict(cls, config, plugin_config=None):
             defaults.get('max_prompt_embedding_table_size'))
 
         if "kv_cache_type" in config and config["kv_cache_type"] is not None:
-            kv_cache_type = KVCacheType(config.pop('kv_cache_type'))
+            kv_cache_type = KVCacheType.from_string(config.pop('kv_cache_type'))
         else:
             kv_cache_type = None
         gather_context_logits = config.pop(
diff --git a/tensorrt_llm/commands/build.py b/tensorrt_llm/commands/build.py
index a47e1485b71..e6b55f6e040 100644
--- a/tensorrt_llm/commands/build.py
+++ b/tensorrt_llm/commands/build.py
@@ -38,6 +38,23 @@
 from tensorrt_llm.quantization.mode import QuantAlgo
 
 
+def enum_type(enum_class):
+
+    def parse_enum(value):
+        if isinstance(value, enum_class):
+            return value
+
+        if isinstance(value, str):
+            return enum_class.from_string(value)
+
+        valid_values = [e.name for e in enum_class]
+        raise argparse.ArgumentTypeError(
+            f"Invalid value '{value}' of type {type(value).__name__}. Expected one of {valid_values}"
+        )
+
+    return parse_enum
+
+
 def parse_arguments():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@@ -131,7 +148,7 @@ def parse_arguments():
     parser.add_argument(
         '--kv_cache_type',
         default=argparse.SUPPRESS,
-        type=KVCacheType,
+        type=enum_type(KVCacheType),
         help=
         "Set KV cache type (continuous, paged, or disabled). For disabled case, KV cache is disabled and only context phase is allowed."
     )
diff --git a/tensorrt_llm/runtime/model_runner.py b/tensorrt_llm/runtime/model_runner.py
index 486c58f6d15..a9f0fe8de40 100644
--- a/tensorrt_llm/runtime/model_runner.py
+++ b/tensorrt_llm/runtime/model_runner.py
@@ -86,7 +86,7 @@ def _builder_to_model_config(config: dict) -> Tuple[ModelConfig, dict]:
     dtype = builder_config['precision']
     tp_size = builder_config['tensor_parallel']
     pp_size = builder_config.get('pipeline_parallel', 1)
-    kv_cache_type = KVCacheType(builder_config.get('kv_cache_type'))
+    kv_cache_type = KVCacheType.from_string(builder_config.get('kv_cache_type'))
     world_size = tp_size * pp_size
     assert world_size == mpi_world_size(), \
         f'Engine world size ({tp_size} * {pp_size}) != Runtime world size ({mpi_world_size()})'
diff --git a/tests/integration/test_lists/test-db/l0_a10.yml b/tests/integration/test_lists/test-db/l0_a10.yml
index 2f63ab45f3a..5799ea27945 100644
--- a/tests/integration/test_lists/test-db/l0_a10.yml
+++ b/tests/integration/test_lists/test-db/l0_a10.yml
@@ -190,3 +190,18 @@ l0_a10:
   tests:
   - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-pytorch-stress-test]
   - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-GUARANTEED_NO_EVICT-pytorch-stress-test]
+l0_a10_nanobind:
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*a10*'
+      linux_distribution_name: ubuntu*
+    terms:
+      stage: pre_merge
+      backend: tensorrt
+  tests:
+  - unittest/bindings
diff --git a/tests/unittest/bindings/test_bindings_ut.py b/tests/unittest/bindings/test_bindings_ut.py
index 774accb080f..6fd46040b66 100644
--- a/tests/unittest/bindings/test_bindings_ut.py
+++ b/tests/unittest/bindings/test_bindings_ut.py
@@ -5,6 +5,7 @@
 from pathlib import Path
 
 import numpy as np
+import pytest
 import torch
 from utils.runtime_defaults import assert_runtime_defaults_are_parsed_correctly
 
@@ -309,6 +310,8 @@ def parse_runtime_defaults(defaults_dict: dict | None = None):
                                                  strict_keys=strict_keys)
 
 
+@pytest.mark.skipif(_tb.binding_type == "nanobind",
+                    reason="Test not supported for nanobind yet")
 def test_llm_request():
     beam_width = 2
     sampling_config = _tb.SamplingConfig(beam_width)
@@ -418,6 +421,8 @@ def test_Mpicomm():
     assert size2 == session_size
 
 
+@pytest.mark.skipif(_tb.binding_type == "nanobind",
+                    reason="Test not supported for nanobind yet")
 def test_SamplingConfig_pickle():
     config = _tb.SamplingConfig()
     config.beam_width = 5
@@ -497,6 +502,8 @@ def test_KvCache_events_binding():
     torch.cuda.empty_cache()
 
 
+@pytest.mark.skipif(_tb.binding_type == "nanobind",
+                    reason="Test not supported for nanobind yet")
 def test_ReqIdsSet_pickle():
     ids = _tb.internal.batch_manager.ReqIdsSet()
     ids1 = pickle.loads(pickle.dumps(ids))
diff --git a/tests/unittest/bindings/test_executor_bindings.py b/tests/unittest/bindings/test_executor_bindings.py
index 935c4c9bfc3..af72d9ac44b 100644
--- a/tests/unittest/bindings/test_executor_bindings.py
+++ b/tests/unittest/bindings/test_executor_bindings.py
@@ -14,6 +14,7 @@
 from binding_test_utils import *
 from pydantic import BaseModel
 
+import tensorrt_llm.bindings as _tb
 import tensorrt_llm.bindings.executor as trtllm
 import tensorrt_llm.version as trtllm_version
 from tensorrt_llm.models.modeling_utils import PretrainedConfig
@@ -484,6 +485,8 @@ def test_get_num_responses_ready(streaming: bool,
     assert executor.get_num_responses_ready() == num_expected_responses
 
 
+@pytest.mark.skipif(_tb.binding_type == "nanobind",
+                    reason="Test not supported for nanobind yet")
 @pytest.mark.parametrize("batching_type", [trtllm.BatchingType.INFLIGHT])
 @pytest.mark.parametrize("streaming", [False, True])
 @pytest.mark.parametrize("beam_width", [1])
@@ -688,6 +691,8 @@ def verify_output(beam_tokens, test_data, given_input_lengths):
     verify_output(tokens, test_data, given_input_lengths)
 
 
+@pytest.mark.skipif(_tb.binding_type == "nanobind",
+                    reason="Test not supported for nanobind yet")
 @pytest.mark.parametrize("streaming", [False, True])
 @pytest.mark.parametrize("beam_width", [1])
 def test_finish_reason(streaming: bool, beam_width: int, model_files,
@@ -1112,6 +1117,8 @@ def test_spec_dec_fast_logits_info():
     assert fast_logits_info.draft_participant_id == 5
 
 
+@pytest.mark.skipif(_tb.binding_type == "nanobind",
+                    reason="Test not supported for nanobind yet")
 def test_result():
     result = trtllm.Result()
     result.is_final = True
@@ -1149,6 +1156,8 @@ def test_result():
     assert (additional_output.output == torch.ones(1, 4, 100)).all()
 
 
+@pytest.mark.skipif(_tb.binding_type == "nanobind",
+                    reason="Test not supported for nanobind yet")
 def test_result_pickle():
     result = trtllm.Result()
     result.is_final = True
@@ -1495,6 +1504,8 @@ def test_eagle_config():
         assert getattr(config, k) == v
 
 
+@pytest.mark.skipif(_tb.binding_type == "nanobind",
+                    reason="Test not supported for nanobind yet")
 def test_eagle_config_pickle():
     config = trtllm.EagleConfig([[0, 0], [0, 1]], False, 0.5)
     config_copy = pickle.loads(pickle.dumps(config))
@@ -1867,6 +1878,8 @@ def logits_post_processor(req_id: int, logits: torch.Tensor,
     assert tokens[-max_tokens:] == [42] * max_tokens
 
 
+@pytest.mark.skipif(_tb.binding_type == "nanobind",
+                    reason="Test not supported for nanobind yet")
 def test_logits_post_processor_batched(model_files, model_path):
 
     # Define the logits post-processor callback
@@ -2141,6 +2154,8 @@ def test_request_perf_metrics_kv_cache(model_path):
     assert kv_cache_metrics.kv_cache_hit_rate == 1.0
 
 
+@pytest.mark.skipif(_tb.binding_type == "nanobind",
+                    reason="Test not supported for nanobind yet")
 @pytest.mark.parametrize("exclude_input_from_output", [False, True])
 def test_request_perf_metrics_draft(model_path_draft_tokens_external,
                                     exclude_input_from_output: bool):
@@ -2221,7 +2236,7 @@ def test_kv_event_stream_timeout(model_path):
     assert len(events) == 1
 
     start = datetime.datetime.now()
-    events = cache_manager.get_latest_events(datetime.timedelta(seconds=1))
+    events = cache_manager.get_latest_events(1000)
     end = datetime.datetime.now()
     # Make sure that it actually waited
     assert abs(end - start) > datetime.timedelta(milliseconds=900)

From d71c6fe5267f4b61c51cc39d4594cdcb417f0703 Mon Sep 17 00:00:00 2001
From: ixlmar <206748156+ixlmar@users.noreply.github.com>
Date: Thu, 17 Jul 2025 17:22:25 +0200
Subject: [PATCH 66/88] [fix] Update jenkins container images (#6094)

Signed-off-by: ixlmar <206748156+ixlmar@users.noreply.github.com>
---
 docker/Makefile                       |  3 +-
 docker/README.md                      | 41 +++++++++++++++++++++++----
 jenkins/current_image_tags.properties | 11 ++++---
 3 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/docker/Makefile b/docker/Makefile
index 926c8cea1aa..2b5022b1ee8 100644
--- a/docker/Makefile
+++ b/docker/Makefile
@@ -180,7 +180,8 @@ jenkins-aarch64_%: IMAGE_WITH_TAG = $(shell . ../jenkins/current_image_tags.prop
 jenkins-aarch64_%: STAGE = tritondevel
 
 # For x86_64
-jenkins-rockylinux8_%: IMAGE_WITH_TAG = $(shell . ../jenkins/current_image_tags.properties && echo $$LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE)
+jenkins-rockylinux8_%: PYTHON_VERSION_TAG_ID = $(if $(findstring 3.12,${PYTHON_VERSION}),PY312,$(if $(findstring 3.10,${PYTHON_VERSION}),PY310,$(error Unknown PYTHON_VERSION specified)))
+jenkins-rockylinux8_%: IMAGE_WITH_TAG = $(shell . ../jenkins/current_image_tags.properties && echo $$LLM_ROCKYLINUX8_${PYTHON_VERSION_TAG_ID}_DOCKER_IMAGE)
 jenkins-rockylinux8_%: STAGE = tritondevel
 jenkins-rockylinux8_%: BASE_IMAGE = nvidia/cuda
 jenkins-rockylinux8_%: BASE_TAG = 12.9.0-devel-rockylinux8
diff --git a/docker/README.md b/docker/README.md
index 3bfac62a2c4..fa1b80a9fd7 100644
--- a/docker/README.md
+++ b/docker/README.md
@@ -89,13 +89,10 @@ equivalent containers as [described above](#building-docker-images-with-gnu-make
 ### Jenkins Integration
 
 [`Makefile`](Makefile) has special targets for building, pushing and running the Docker build image used on Jenkins.
-The full image name and tag is defined in [`L0_MergeRequest.groovy`](../jenkins/L0_MergeRequest.groovy). The `make`
-system will parse this name as the value of `LLM_DOCKER_IMAGE`. To build and push a new Docker image for Jenkins,
-define a new image name and tag in [`L0_MergeRequest.groovy`](../jenkins/L0_MergeRequest.groovy) and run
+The full image names and tags are defined in [`current_image_tags.properties`](../jenkins/current_image_tags.properties). The `make`
+system will parse the names/tags from this file.
 
-```bash
-make -C docker jenkins_push
-```
+#### Running
 
 Start a new container using the same image as Jenkins using your local user account with
 
@@ -134,6 +131,38 @@ make -C docker trtllm_run LOCAL_USER=1 DOCKER_PULL=1
 The argument `DOCKER_PULL=1` instructs `make` to pull the latest version of the image before deploying it in the container.
 By default, the release images built in the above manner are tagged by their `git` branch name and may be frequently updated.
 
+#### Building CI images
+
+To build and push a new Docker image for Jenkins, define new image names and tags in [`current_image_tags.properties`](../jenkins/current_image_tags.properties) and run
+
+```bash
+# Commands assume an amd64 host
+make -C docker jenkins_build
+#
+docker buildx create --name multi-builder
+make -C docker jenkins-aarch64_build \
+    DOCKER_BUILD_ARGS="--platform arm64 --builder=multi-builder"
+#
+# check jenkins/BuildDockerImage.groovy for current Python versions
+make -C docker jenkins-rockylinux8_build PYTHON_VERSION=3.12.3
+make -C docker jenkins-rockylinux8_build PYTHON_VERSION=3.10.12
+```
+
+The resulting images then need to be pushed:
+
+```bash
+sh -c '. jenkins/current_image_tags.properties && echo $LLM_DOCKER_IMAGE $LLM_SBSA_DOCKER_IMAGE $LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE $LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE' | tr ' ' '\n' | xargs -I{} docker push {}
+```
+
+Alternatively, it is possible to trigger the image build by opening a new pull request and commenting
+
+```text
+/bot run --stage-list "Build-Docker-Images"
+```
+
+The resulting images can then be re-tagged using `scripts/rename_docker_images.py`
+and the new tags included in [`current_image_tags.properties`](../jenkins/current_image_tags.properties).
+
 ### Docker rootless
 
 Some aspects require special treatment when using [Docker rootless mode](https://docs.docker.com/engine/security/rootless/). The `docker/Makefile` contains heuristics to detect Docker rootless mode. When assuming
diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties
index 5836d212c5e..6e4863a11ed 100644
--- a/jenkins/current_image_tags.properties
+++ b/jenkins/current_image_tags.properties
@@ -8,7 +8,10 @@
 # NB: Although string interpolation is supported, redundant substrings are
 #     kept in the variables below for interoperability with
 #     scripts/rename_docker_images.py
-LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.05-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202507150652-9504
-LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.05-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202507150652-9504
-LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202507150652-9504
-LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202507150652-9504
+#
+# NB: Typically, the suffix indicates the PR whose CI pipeline generated the images. In case that
+#     images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead.
+LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.05-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202507162011-ec3ebae
+LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.05-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202507162011-ec3ebae
+LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202507162011-ec3ebae
+LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202507162011-ec3ebae

From 10dbf4f0f4565ff9f241b89cab4634c7205734f1 Mon Sep 17 00:00:00 2001
From: Iman Tabrizian <10105175+Tabrizian@users.noreply.github.com>
Date: Thu, 17 Jul 2025 09:02:19 -0700
Subject: [PATCH 67/88] [fix] Remove duplicated KVCache transmission check
 (#6022)

Signed-off-by: Iman Tabrizian <10105175+tabrizian@users.noreply.github.com>
---
 tensorrt_llm/_torch/pyexecutor/py_executor.py | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py
index 6826cda6114..3514ce3e351 100644
--- a/tensorrt_llm/_torch/pyexecutor/py_executor.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -966,19 +966,14 @@ def _executor_loop(self):
                         self._prepare_disagg_gen_transmission_complete(
                             scheduled_batch)
 
+                        # Return the first token to the client
+                        self._handle_first_token_response(scheduled_batch)
+
                     self.resource_manager.prepare_resources(scheduled_batch)
                     if self.drafter is not None:
                         self.drafter.prepare_draft_tokens(
                             scheduled_batch, self.resource_manager)
 
-                    if self.kv_cache_transceiver:
-                        # For generation requests which have completed KV cache transfer
-                        self._prepare_disagg_gen_transmission_complete(
-                            scheduled_batch)
-
-                        # Return the first token to the client
-                        self._handle_first_token_response(scheduled_batch)
-
                     batch_outputs = self._forward_step(scheduled_batch)
 
                     if self.guided_decoder is not None:

From 8480c120b1c6546a44fb4f47f7b24ceeeaf4b114 Mon Sep 17 00:00:00 2001
From: 2ez4bz <133824995+2ez4bz@users.noreply.github.com>
Date: Thu, 17 Jul 2025 11:04:17 -0700
Subject: [PATCH 68/88] [fix] Fix Mistral3VLM weight-loading & enable in
 pre-merge (#6105)

Signed-off-by: William Zhang <133824995+2ez4bz@users.noreply.github.com>
---
 tensorrt_llm/_torch/models/__init__.py         |  3 ++-
 tensorrt_llm/_torch/models/modeling_mistral.py |  2 ++
 tests/integration/defs/local_venv.py           | 18 ++++++++++++------
 .../integration/test_lists/test-db/l0_h100.yml |  1 +
 4 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/tensorrt_llm/_torch/models/__init__.py b/tensorrt_llm/_torch/models/__init__.py
index c5acbef804a..e4da7aff5a9 100644
--- a/tensorrt_llm/_torch/models/__init__.py
+++ b/tensorrt_llm/_torch/models/__init__.py
@@ -10,7 +10,7 @@
 from .modeling_hyperclovax import HCXVisionForCausalLM
 from .modeling_llama import LlamaForCausalLM
 from .modeling_llava_next import LlavaNextModel
-from .modeling_mistral import MistralForCausalLM
+from .modeling_mistral import Mistral3VLM, MistralForCausalLM
 from .modeling_mixtral import MixtralForCausalLM
 from .modeling_nemotron import NemotronForCausalLM
 from .modeling_nemotron_h import NemotronHForCausalLM
@@ -39,6 +39,7 @@
     "HCXVisionForCausalLM",
     "LlamaForCausalLM",
     "LlavaNextModel",
+    "Mistral3VLM",
     "MistralForCausalLM",
     "MixtralForCausalLM",
     "NemotronForCausalLM",
diff --git a/tensorrt_llm/_torch/models/modeling_mistral.py b/tensorrt_llm/_torch/models/modeling_mistral.py
index 594ba4a56cf..a8e07f24d7f 100644
--- a/tensorrt_llm/_torch/models/modeling_mistral.py
+++ b/tensorrt_llm/_torch/models/modeling_mistral.py
@@ -296,6 +296,8 @@ def __init__(
 
         llm_model_config = self._get_sub_model_config(model_config,
                                                       "text_config")
+        # This is necessary for the auto weight mapper to figure out what it needs.
+        llm_model_config.pretrained_config.architectures = config.architectures
         self.llm = MistralForCausalLM(llm_model_config)
 
         self._device = "cuda"
diff --git a/tests/integration/defs/local_venv.py b/tests/integration/defs/local_venv.py
index a98662852e1..4e72ad8ecbe 100644
--- a/tests/integration/defs/local_venv.py
+++ b/tests/integration/defs/local_venv.py
@@ -4,6 +4,7 @@
 """
 import copy
 import os
+import shlex
 import subprocess
 import tempfile
 import textwrap as tw
@@ -116,12 +117,17 @@ def run_cmd(self,
             new_env = os.environ
 
         if caller.__name__ == 'check_output':
-            result = subprocess.run(call_args,
-                                    env=new_env,
-                                    check=True,
-                                    capture_output=True,
-                                    **kwargs)
-            return result.stdout.decode('utf-8')
+            try:
+                result = subprocess.run(call_args,
+                                        env=new_env,
+                                        check=True,
+                                        capture_output=True,
+                                        **kwargs)
+                return result.stdout.decode('utf-8')
+            except subprocess.CalledProcessError as e:
+                raise RuntimeError(f"Failed to run `{shlex.join(e.cmd)}`:\n"
+                                   f"Stdout: {e.stdout.decode()}\n"
+                                   f"Stderr: {e.stderr.decode()}\n")
         else:
             print(f"Start subprocess with {caller}({call_args}, env={new_env})")
             return caller(call_args, env=new_env, **kwargs)
diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml
index 66ce79bb239..cfa03bc10ce 100644
--- a/tests/integration/test_lists/test-db/l0_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_h100.yml
@@ -193,6 +193,7 @@ l0_h100:
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[llguidance]
   - test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it-image-True]
+  - test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True]
 - condition:
     ranges:
       system_gpu_count:

From 161490f03948abb21fcac3f4a64372c7801815f3 Mon Sep 17 00:00:00 2001
From: Frank <3429989+FrankD412@users.noreply.github.com>
Date: Thu, 17 Jul 2025 12:44:44 -0700
Subject: [PATCH 69/88] [fix] Fixes KV Cache overrides in trtllm-bench (#6103)

Signed-off-by: Frank Di Natale <3429989+FrankD412@users.noreply.github.com>
---
 tensorrt_llm/bench/dataclasses/configuration.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorrt_llm/bench/dataclasses/configuration.py b/tensorrt_llm/bench/dataclasses/configuration.py
index 77f80632088..a693333230c 100755
--- a/tensorrt_llm/bench/dataclasses/configuration.py
+++ b/tensorrt_llm/bench/dataclasses/configuration.py
@@ -58,8 +58,6 @@ def get_llm_args(self) -> Dict:
             self.world_config.cluster_size,
             "trust_remote_code":
             True,
-            "kv_cache_config":
-            self.settings_config.get_kvcache_config(),
             "enable_chunked_prefill":
             self.settings_config.chunking,
             "extended_runtime_perf_knob_config":
@@ -82,6 +80,10 @@ def get_llm_args(self) -> Dict:
         if self.backend in backend_config_map:
             llm_args.update(backend_config_map[self.backend]())
 
+        kv_cache_config = self.settings_config.get_kvcache_config().__dict__
+        backend_cache_config = llm_args.pop("kv_cache_config", {})
+        llm_args["kv_cache_config"] = backend_cache_config | kv_cache_config
+
         return update_llm_args_with_extra_options(llm_args,
                                                   self.extra_llm_api_options)
 

From 2c90203c36a8a97938d364a6624a2f36c5d949b2 Mon Sep 17 00:00:00 2001
From: qixiang-99 <203170375+qixiang-99@users.noreply.github.com>
Date: Thu, 17 Jul 2025 13:33:33 -0700
Subject: [PATCH 70/88] =?UTF-8?q?Refactor=20KVCacheManager:=20Simplify=20t?=
 =?UTF-8?q?oken=20availability=20calculation=20and=20=E2=80=A6=20(#6134)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: qixiang-99 <203170375+qixiang-99@users.noreply.github.com>
---
 tensorrt_llm/_torch/pyexecutor/resource_manager.py | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/tensorrt_llm/_torch/pyexecutor/resource_manager.py b/tensorrt_llm/_torch/pyexecutor/resource_manager.py
index c5a9f264b01..df577bc7e89 100644
--- a/tensorrt_llm/_torch/pyexecutor/resource_manager.py
+++ b/tensorrt_llm/_torch/pyexecutor/resource_manager.py
@@ -536,16 +536,8 @@ def get_num_kv_blocks(self, num_tokens: int) -> int:
         return (num_tokens + self.tokens_per_block - 1) // self.tokens_per_block
 
     def get_num_available_tokens(self, max_num_draft_tokens: int = 0) -> int:
-        if self.max_attention_window_vec and len(
-                self.max_attention_window_vec) > 1:
-            # VSWA case, the available tokens should the the minimum of the available tokens for each window size
-            min_free_blocks = min(self.impl.get_kv_cache_stats().
-                                  num_free_blocks_per_window_size.values())
-            res = min_free_blocks * self.tokens_per_block - self.num_extra_kv_tokens - max_num_draft_tokens
-        else:
-            res = (self.get_num_free_blocks() * self.tokens_per_block -
-                   self.num_extra_kv_tokens - max_num_draft_tokens)
-        return res
+        return (self.get_num_free_blocks() * self.tokens_per_block -
+                self.num_extra_kv_tokens - max_num_draft_tokens)
 
     def get_buffers(self, layer_idx: int) -> Optional[torch.Tensor]:
         layer_offset = self.layer_offsets[layer_idx]
@@ -732,6 +724,8 @@ def calculate_max_num_blocks_from_cpp(
 
         # VSWA on Torch backend has not supported the cross attention.
         is_cross_attention = False
+        # check model config
+        assert model_config.layer_types is not None, "layer_types have to be set correctly for VSWA"
 
         # Construct WorldConfig from self.mapping
         world_config_cpp = WorldConfig(

From ae28b3a664e5b278d8412b72cff3e13915062d3b Mon Sep 17 00:00:00 2001
From: Daniel Stokes <40156487+djns99@users.noreply.github.com>
Date: Fri, 18 Jul 2025 09:00:12 +1200
Subject: [PATCH 71/88] feat: Add support for benchmarking individual gemms in
 MOE benchmark (#6080)

Signed-off-by: Daniel Stokes <40156487+djns99@users.noreply.github.com>
---
 cpp/micro_benchmarks/README.md                |   3 +
 .../gen-moe-benchmark-file.py                 |  66 +--
 .../mixtureOfExpertsBackendBenchmarkFixture.h | 390 ++++++++++++------
 ...ixtureOfExpertsBackendBenchmarkLauncher.cu |  60 ++-
 .../cutlass_kernels/include/moe_kernels.h     |   4 +-
 5 files changed, 348 insertions(+), 175 deletions(-)

diff --git a/cpp/micro_benchmarks/README.md b/cpp/micro_benchmarks/README.md
index 39fc5e102c4..a1504a2dee9 100644
--- a/cpp/micro_benchmarks/README.md
+++ b/cpp/micro_benchmarks/README.md
@@ -11,6 +11,9 @@ To build add the `--micro_benchmark` flag to `build_wheel.py` or pass `-DBUILD_M
 
 ### Mixture Of Experts Backend Benchmark
 
+> [!CAUTION]
+> Disclaimer this benchmark is intended for developers to help evaluating the impact of new optimisations. This benchmark does not meet the same quality standards as other parts of TRT-LLM. Please use with caution
+
 Target `mixtureOfExpertsBackendBenchmark`
 
 This benchmark covers the backend used by the `MixtureOfExperts` plugin. It allows you to benchmark different MOE
diff --git a/cpp/micro_benchmarks/gen-moe-benchmark-file.py b/cpp/micro_benchmarks/gen-moe-benchmark-file.py
index 571edd976da..c8f72b4ef65 100644
--- a/cpp/micro_benchmarks/gen-moe-benchmark-file.py
+++ b/cpp/micro_benchmarks/gen-moe-benchmark-file.py
@@ -14,7 +14,8 @@
   {dtype_string}
   {routing_string}
   {tactic_string}
-  "bias": 0
+  "bias": 0,
+  "gemm_to_profile": {gemm_to_profile}
 }}'''
 
 
@@ -54,39 +55,50 @@ def populate_benchmark_config(**kwargs):
 
 
 # Default Mixtral configurations
-num_experts = 256
-k = 8
+num_experts = 8
+k = 2
 hidden_size = 4096
-inter_size = 2048
-tp_size = 8
-ep_size = 1
+inter_size = 14336
+# tp_size = 8
+# ep_size = 1
 world_rank = 0
 act_fn = 3
-dtype_string = make_dtype_string(["fp4", "wfp4afp8"])  # All dtypes
-routing_string = make_routing_string(
-    name="uniform",
-    is_distribution=True)  # Use the default uniform random distribution
+dtype_string = make_dtype_string()  # All dtypes
 tactic_id1 = '"auto"'
 tactic_id2 = '"auto"'
+gemms_to_profile = [1, 2, 3]
 
 configs = []
-for num_tokens in [1, 8, 64, 2048, 65536]:
-    configs.append(
-        populate_benchmark_config(
-            num_experts=num_experts,
-            k=k,
-            hidden_size=hidden_size,
-            inter_size=inter_size,
-            tp_size=tp_size,
-            ep_size=ep_size,
-            world_rank=world_rank,
-            num_tokens=num_tokens,
-            act_fn=act_fn,
-            dtype_string=dtype_string,
-            routing_string=routing_string,
-            tactic_string=make_tactic_string(tactic_id1=tactic_id1,
-                                             tactic_id2=tactic_id2),
-        ))
+for ep_size in [1, num_experts]:
+    for num_tokens in [1, 8, 64, 2048, 16384]:
+        tp_size = 8 // ep_size
+        if inter_size % (tp_size * 128) != 0:
+            continue  # Insufficient alignment
+        if num_tokens <= num_experts:
+            routing_string = make_routing_string(
+                name="balanced",
+                is_distribution=False)  # Use the balanced distribution
+        else:
+            routing_string = make_routing_string(
+                name="uniform", is_distribution=True
+            )  # Use the default uniform random distribution
+        for gemm_to_profile in gemms_to_profile:
+            configs.append(
+                populate_benchmark_config(num_experts=num_experts,
+                                          k=k,
+                                          hidden_size=hidden_size,
+                                          inter_size=inter_size,
+                                          tp_size=tp_size,
+                                          ep_size=ep_size,
+                                          world_rank=world_rank,
+                                          num_tokens=num_tokens,
+                                          act_fn=act_fn,
+                                          dtype_string=dtype_string,
+                                          routing_string=routing_string,
+                                          tactic_string=make_tactic_string(
+                                              tactic_id1=tactic_id1,
+                                              tactic_id2=tactic_id2),
+                                          gemm_to_profile=gemm_to_profile))
 
 full_string = "[\n" + ",\n".join(configs) + "\n]"
 
diff --git a/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h b/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h
index 0790b842d45..565c170e1df 100644
--- a/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h
+++ b/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkFixture.h
@@ -71,6 +71,13 @@ enum VERBOSE_LEVEL
 
 constexpr int LOG_LEVEL = ERROR;
 
+enum class GemmToProfile : int
+{
+    GEMM_1 = static_cast<int>(GemmProfilerBackend::GemmToProfile::GEMM_1),
+    GEMM_2 = static_cast<int>(GemmProfilerBackend::GemmToProfile::GEMM_2),
+    LAYER = static_cast<int>(3),
+};
+
 namespace
 {
 // Abstract class for routing config
@@ -358,6 +365,10 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
 
     constexpr static int64_t FP4_VECTOR_SIZE = NVFP4 ? TmaWarpSpecializedGroupedGemmInput::NVFP4BlockScaleVectorSize
                                                      : TmaWarpSpecializedGroupedGemmInput::MXFPXBlockScaleVectorSize;
+    constexpr static int64_t MinNDimAlignment = NVFP4 ? TmaWarpSpecializedGroupedGemmInput::MinNDimAlignmentNVFP4
+                                                      : TmaWarpSpecializedGroupedGemmInput::MinNDimAlignmentMXFPX;
+    constexpr static int64_t MinKDimAlignment = NVFP4 ? TmaWarpSpecializedGroupedGemmInput::MinKDimAlignmentNVFP4
+                                                      : TmaWarpSpecializedGroupedGemmInput::MinKDimAlignmentMXFPX;
 
     std::vector<BufferManager::IBufferPtr> managed_buffers;
     int* mSelectedExperts{};
@@ -365,6 +376,7 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
 
     int64_t mHiddenSize{};
     int64_t mNumExperts{};
+    int64_t mNumExpertsPerNode{};
     int64_t mK{};
 
     constexpr static nvinfer1::DataType toDTypeID()
@@ -497,6 +509,8 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
     }
 
     CutlassMoeFCRunner<DataType, WeightType, OutputType, InputType> mMoERunner{};
+    GemmProfilerBackend mGemmProfilerBackend{};
+    char* mGemmProfilerWorkspace{};
     char* mWorkspace{};
     float* mScaleProbs{};
     WeightStorage* mExpertWeight1{};
@@ -544,6 +558,7 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
     std::optional<tensorrt_llm::cutlass_extensions::CutlassGemmConfig> mSelectedConfig = std::nullopt;
 
     int64_t mBufferIndex = 0;
+    size_t mGemmProfilerWorkspaceSize = 0;
     size_t mWorkspaceSize = 0;
     size_t mExpertWeight1Size = 0;
     size_t mExpertWeight2Size = 0;
@@ -559,10 +574,15 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
     size_t mExpertIntScale1Size = 0;
     size_t mExpertIntScale2Size = 0;
 
+    size_t padSize(size_t size)
+    {
+        return ceilDiv(size, 128) * 128;
+    }
+
     template <class T>
     T* allocBuffer(size_t size)
     {
-        size_t size_padded = ceilDiv(size * sizeof(T), 128) * 128;
+        size_t size_padded = padSize(size) * sizeof(T);
         auto i_buffer = bufferManager->gpu(size_padded);
         check_cuda_error(cudaGetLastError());
         managed_buffers.emplace_back(std::move(i_buffer));
@@ -572,7 +592,7 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
     }
 
     void initBuffersPermute(int64_t num_tokens, int64_t hidden_size, int64_t inter_size, int64_t num_experts, int64_t k,
-        int64_t routing_config, MOEParallelismConfig parallelism_config)
+        int64_t routing_config, MOEParallelismConfig parallelism_config, GemmToProfile gemm_to_profile)
     {
         assert(hidden_size % BASE_HIDDEN_SIZE == 0);
 
@@ -582,104 +602,160 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
         mHiddenSize = hidden_size;
         mInterSize = inter_size / parallelism_config.tp_size;
         mNumExperts = num_experts;
+        mNumExpertsPerNode = num_experts / parallelism_config.ep_size;
         mK = k;
         mIsGated = isGatedActivation(mActType);
         mGatedMultiplier = mIsGated ? 2 : 1;
         auto const gated_inter = mInterSize * mGatedMultiplier;
+        size_t const expert_matrix_size = padSize(mNumExpertsPerNode * mHiddenSize * mInterSize);
 
-        mWorkspaceSize = mMoERunner.getWorkspaceSize(mTotalTokens, mHiddenSize, mInterSize, mNumExperts, mK, mActType,
-            {}, mUseLora, /*use_deepseek_fp8_block_scale=*/false, /*min_latency_mode=*/false, mUsePrequantScale);
-
-        mWorkspace = allocBuffer<char>(mWorkspaceSize * NUM_BUFFERS);
-        size_t const expert_matrix_size = mNumExperts * mHiddenSize * mInterSize;
-
-        mExpertWeight1Size = expert_matrix_size * mGatedMultiplier / WEIGHT_ELEM_PER_BYTE;
-        mExpertWeight2Size = expert_matrix_size / WEIGHT_ELEM_PER_BYTE;
-        mExpertWeight1 = allocBuffer<WeightStorage>(mExpertWeight1Size * NUM_BUFFERS);
-        mExpertWeight2 = allocBuffer<WeightStorage>(mExpertWeight2Size * NUM_BUFFERS);
+        bool need_weight_1 = gemm_to_profile == GemmToProfile::GEMM_1 || gemm_to_profile == GemmToProfile::LAYER;
+        bool need_weight_2 = gemm_to_profile == GemmToProfile::GEMM_2 || gemm_to_profile == GemmToProfile::LAYER;
+        mExpertWeight1Size = need_weight_1 ? expert_matrix_size * mGatedMultiplier / WEIGHT_ELEM_PER_BYTE : 0;
+        mExpertWeight2Size = need_weight_2 ? expert_matrix_size / WEIGHT_ELEM_PER_BYTE : 0;
+        mExpertWeight1 = need_weight_1 ? allocBuffer<WeightStorage>(mExpertWeight1Size * NUM_BUFFERS) : nullptr;
+        mExpertWeight2 = need_weight_2 ? allocBuffer<WeightStorage>(mExpertWeight2Size * NUM_BUFFERS) : nullptr;
 
-        mExpertBias1 = nullptr;
-        mExpertBias2 = nullptr;
-        if (mUseBias)
+        if (gemm_to_profile == GemmToProfile::LAYER)
         {
-            mExpertBias1Size = mNumExperts * gated_inter;
-            mExpertBias2Size = mNumExperts * mHiddenSize;
-            mExpertBias1 = allocBuffer<DataType>(mExpertBias1Size * NUM_BUFFERS);
-            mExpertBias2 = allocBuffer<DataType>(mExpertBias2Size * NUM_BUFFERS);
-        }
 
-        if constexpr (INT_QUANT)
-        {
-            mExpertIntScale1Size = mNumExperts * gated_inter;
-            mExpertIntScale2Size = mNumExperts * mHiddenSize;
-            mExpertIntScale1 = allocBuffer<DataType>(mExpertIntScale1Size * NUM_BUFFERS);
-            mExpertIntScale2 = allocBuffer<DataType>(mExpertIntScale2Size * NUM_BUFFERS);
+            mWorkspaceSize = mMoERunner.getWorkspaceSize(mTotalTokens, mHiddenSize, mInterSize, mNumExperts, mK,
+                mActType, parallelism_config, mUseLora, /*use_deepseek_fp8_block_scale=*/false,
+                /*min_latency_mode=*/false, mUsePrequantScale);
 
-            for (int i = 0; i < NUM_BUFFERS; i++)
+            mWorkspace = allocBuffer<char>(mWorkspaceSize * NUM_BUFFERS);
+
+            mExpertBias1 = nullptr;
+            mExpertBias2 = nullptr;
+            if (mUseBias)
             {
-                mQuantParams[i] = QuantParams::Int(
-                    mExpertIntScale1 + mExpertIntScale1Size * i, mExpertIntScale2 + mExpertIntScale2Size * i);
+                mExpertBias1Size = padSize(mNumExpertsPerNode * gated_inter);
+                mExpertBias2Size = padSize(mNumExpertsPerNode * mHiddenSize);
+                mExpertBias1 = allocBuffer<DataType>(mExpertBias1Size * NUM_BUFFERS);
+                mExpertBias2 = allocBuffer<DataType>(mExpertBias2Size * NUM_BUFFERS);
             }
-        }
-        else if constexpr (FP8)
-        {
-            mExpertFP8Scale1 = allocBuffer<float>(mNumExperts);
-            mExpertFP8Scale2 = allocBuffer<float>(1);
-            mExpertFP8Scale3 = allocBuffer<float>(mNumExperts);
 
-            for (int i = 0; i < NUM_BUFFERS; i++)
+            if constexpr (INT_QUANT)
             {
-                mQuantParams[i] = QuantParams::FP8(mExpertFP8Scale1, mExpertFP8Scale2, mExpertFP8Scale3);
+                mExpertIntScale1Size = padSize(mNumExpertsPerNode * gated_inter);
+                mExpertIntScale2Size = padSize(mNumExpertsPerNode * mHiddenSize);
+                mExpertIntScale1 = allocBuffer<DataType>(mExpertIntScale1Size * NUM_BUFFERS);
+                mExpertIntScale2 = allocBuffer<DataType>(mExpertIntScale2Size * NUM_BUFFERS);
+
+                for (int i = 0; i < NUM_BUFFERS; i++)
+                {
+                    mQuantParams[i] = QuantParams::Int(
+                        mExpertIntScale1 + mExpertIntScale1Size * i, mExpertIntScale2 + mExpertIntScale2Size * i);
+                }
             }
-        }
-        else if constexpr (ANY_FP4)
-        {
-            mExpertFP4ActScale1 = allocBuffer<float>(1);
-            mExpertFP4WeightSf1Size = num_experts * gated_inter * mHiddenSize / FP4_VECTOR_SIZE;
-            mExpertFP4WeightSf1 = allocBuffer<ElementSF>(mExpertFP4WeightSf1Size * NUM_BUFFERS);
-            mExpertFP4GlobalScale1 = allocBuffer<float>(num_experts);
+            else if constexpr (FP8)
+            {
+                mExpertFP8Scale1 = allocBuffer<float>(mNumExpertsPerNode);
+                mExpertFP8Scale2 = allocBuffer<float>(1);
+                mExpertFP8Scale3 = allocBuffer<float>(mNumExpertsPerNode);
 
-            mExpertFP4ActScale2 = allocBuffer<float>(1);
-            mExpertFP4WeightSf2Size = num_experts * mInterSize * mHiddenSize / FP4_VECTOR_SIZE;
-            mExpertFP4WeightSf2 = allocBuffer<ElementSF>(mExpertFP4WeightSf2Size * NUM_BUFFERS);
-            mExpertFP4GlobalScale2 = allocBuffer<float>(num_experts);
+                for (int i = 0; i < NUM_BUFFERS; i++)
+                {
+                    mQuantParams[i] = QuantParams::FP8(mExpertFP8Scale1, mExpertFP8Scale2, mExpertFP8Scale3);
+                }
+            }
+            else if constexpr (ANY_FP4)
+            {
+                mExpertFP4ActScale1 = allocBuffer<float>(mNumExpertsPerNode);
+                mExpertFP4WeightSf1Size = mNumExpertsPerNode
+                    * TmaWarpSpecializedGroupedGemmInput::alignToSfDim(gated_inter, MinNDimAlignment)
+                    * TmaWarpSpecializedGroupedGemmInput::alignToSfDim(mHiddenSize, MinKDimAlignment) / FP4_VECTOR_SIZE;
+                mExpertFP4WeightSf1 = allocBuffer<ElementSF>(mExpertFP4WeightSf1Size * NUM_BUFFERS);
+                mExpertFP4GlobalScale1 = allocBuffer<float>(mNumExpertsPerNode);
+
+                mExpertFP4ActScale2 = allocBuffer<float>(mNumExpertsPerNode);
+                mExpertFP4WeightSf2Size = mNumExpertsPerNode
+                    * TmaWarpSpecializedGroupedGemmInput::alignToSfDim(mInterSize, MinNDimAlignment)
+                    * TmaWarpSpecializedGroupedGemmInput::alignToSfDim(mHiddenSize, MinKDimAlignment) / FP4_VECTOR_SIZE;
+                mExpertFP4WeightSf2 = allocBuffer<ElementSF>(mExpertFP4WeightSf2Size * NUM_BUFFERS);
+                mExpertFP4GlobalScale2 = allocBuffer<float>(mNumExpertsPerNode);
+
+                auto func = NVFP4 ? QuantParams::FP4 : QuantParams::FP8MXFP4;
+                for (int i = 0; i < NUM_BUFFERS; i++)
+                {
+                    mQuantParams[i] = func(mExpertFP4ActScale1, mExpertFP4WeightSf1 + mExpertFP4WeightSf1Size * i,
+                        mExpertFP4GlobalScale1, mExpertFP4ActScale2, mExpertFP4WeightSf2 + mExpertFP4WeightSf2Size * i,
+                        mExpertFP4GlobalScale2, false, false);
+                }
+            }
 
-            auto func = NVFP4 ? QuantParams::FP4 : QuantParams::FP8MXFP4;
+            mSelectedExpertsSize = padSize(mTotalTokens * mK);
+            mSelectedExperts = allocBuffer<int>(mSelectedExpertsSize * NUM_BUFFERS);
+            mScaleProbsSize = padSize(mTotalTokens * mK);
+            mScaleProbs = allocBuffer<float>(mScaleProbsSize * NUM_BUFFERS);
+            mInputTensorSize = padSize(mTotalTokens * mHiddenSize);
+            mInputTensor = allocBuffer<DataType>(mInputTensorSize * NUM_BUFFERS);
+            mFinalOutputSize = padSize(mTotalTokens * mHiddenSize);
+            mFinalOutput = allocBuffer<OutputType>(mFinalOutputSize * NUM_BUFFERS);
+
+            mSourceToExpandedMapSize = padSize(mTotalTokens * mK);
+            mSourceToExpandedMap = allocBuffer<int>(mSourceToExpandedMapSize * NUM_BUFFERS);
+            mRoutingConfigIndex = routing_config;
+            auto tactic = routingConfigCache.at(routing_config);
+            tactic->start();
             for (int i = 0; i < NUM_BUFFERS; i++)
             {
-                mQuantParams[i] = func(mExpertFP4ActScale1, mExpertFP4WeightSf1 + mExpertFP4WeightSf1Size * i,
-                    mExpertFP4GlobalScale1, mExpertFP4ActScale2, mExpertFP4WeightSf2 + mExpertFP4WeightSf2Size * i,
-                    mExpertFP4GlobalScale2, false, false);
+                tactic->setRouting(mSelectedExperts + mSelectedExpertsSize * i, mNumExperts, mK, mTotalTokens);
             }
         }
 
-        mSelectedExpertsSize = mTotalTokens * mK;
-        mSelectedExperts = allocBuffer<int>(mSelectedExpertsSize * NUM_BUFFERS);
-        mScaleProbsSize = mTotalTokens * mK;
-        mScaleProbs = allocBuffer<float>(mScaleProbsSize * NUM_BUFFERS);
-        mInputTensorSize = mTotalTokens * mHiddenSize;
-        mInputTensor = allocBuffer<DataType>(mInputTensorSize * NUM_BUFFERS);
-        mFinalOutputSize = mTotalTokens * mHiddenSize;
-        mFinalOutput = allocBuffer<OutputType>(mFinalOutputSize * NUM_BUFFERS);
-
-        mSourceToExpandedMapSize = mTotalTokens * mK;
-        mSourceToExpandedMap = allocBuffer<int>(mSourceToExpandedMapSize * NUM_BUFFERS);
-
-        mRoutingConfigIndex = routing_config;
-        auto tactic = routingConfigCache.at(routing_config);
-        tactic->start();
-        for (int i = 0; i < NUM_BUFFERS; i++)
+#ifdef USING_OSS_CUTLASS_MOE_GEMM
+        mGemmProfilerBackend.init(mMoERunner, GemmProfilerBackend::GemmToProfile::Undefined, typeToDtypeID<DataType>(),
+            typeToDtypeID<WeightType>(), typeToDtypeID<OutputType>(), mNumExperts, mK, mHiddenSize, mInterSize,
+            mGroupSize, mActType, mUseBias, mUseLora, /*min_latency_mode=*/false,
+            /*need_weights=*/false, parallelism_config, /*enable_alltoall=*/false);
+#else
+        mGemmProfilerBackend.init(mMoERunner, GemmProfilerBackend::GemmToProfile::Undefined, typeToDtypeID<DataType>(),
+            typeToDtypeID<WeightType>(), typeToDtypeID<OutputType>(), mNumExperts, mK, mHiddenSize, mInterSize,
+            mGroupSize, mActType, mUseBias, mUseLora, /*min_latency_mode=*/false,
+            /*need_weights=*/false, parallelism_config);
+#endif
+
+        mGemmProfilerWorkspaceSize = 0;
+        if (gemm_to_profile == GemmToProfile::GEMM_1 || gemm_to_profile == GemmToProfile::LAYER)
+        {
+            mGemmProfilerBackend.mGemmToProfile = GemmProfilerBackend::GemmToProfile::GEMM_1;
+            mGemmProfilerWorkspaceSize
+                = std::max(mGemmProfilerWorkspaceSize, mGemmProfilerBackend.getWorkspaceSize(mTotalTokens));
+        }
+
+        if (gemm_to_profile == GemmToProfile::GEMM_2 || gemm_to_profile == GemmToProfile::LAYER)
         {
-            tactic->setRouting(mSelectedExperts + mSelectedExpertsSize * i, mNumExperts, mK, mTotalTokens);
+            mGemmProfilerBackend.mGemmToProfile = GemmProfilerBackend::GemmToProfile::GEMM_2;
+            mGemmProfilerWorkspaceSize
+                = std::max(mGemmProfilerWorkspaceSize, mGemmProfilerBackend.getWorkspaceSize(mTotalTokens));
         }
 
+        int64_t num_gemm_buffers = gemm_to_profile == GemmToProfile::LAYER ? 1 : NUM_BUFFERS;
+        mGemmProfilerWorkspaceSize = padSize(mGemmProfilerWorkspaceSize);
+        mGemmProfilerWorkspace = mGemmProfilerWorkspaceSize > 0
+            ? allocBuffer<char>(mGemmProfilerWorkspaceSize * num_gemm_buffers)
+            : nullptr;
+
         check_cuda_error(cudaStreamSynchronize(streamPtr->get()));
     }
 
+    void prepareGemmProfiler(GemmToProfile gemm_to_profile)
+    {
+        if (gemm_to_profile == GemmToProfile::LAYER)
+            return;
+        mGemmProfilerBackend.mGemmToProfile = static_cast<GemmProfilerBackend::GemmToProfile>(gemm_to_profile);
+        auto* expert_weights = gemm_to_profile == GemmToProfile::GEMM_1 ? mExpertWeight1 : mExpertWeight2;
+        auto expert_weights_size = gemm_to_profile == GemmToProfile::GEMM_1 ? mExpertWeight1Size : mExpertWeight2Size;
+        mGemmProfilerBackend.prepare(mTotalTokens, mGemmProfilerWorkspace + mGemmProfilerWorkspaceSize * mBufferIndex,
+            /*expert_weights=*/expert_weights + expert_weights_size * mBufferIndex, streamPtr->get());
+    }
+
     std::array<cudaGraph_t, NUM_BUFFERS> mGraph{};
+
     std::array<cudaGraphExec_t, NUM_BUFFERS> mGraphInstance{};
 
-    void createGraph(MOEParallelismConfig parallelism_config)
+    void createGraph(MOEParallelismConfig parallelism_config, GemmToProfile gemm_to_profile)
     {
         if (!useCudaGraph)
             return;
@@ -689,9 +765,11 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
         for (int i = 0; i < NUM_BUFFERS; i++)
         {
             mBufferIndex = i;
+            // Each buffer will have a different routing config for the gemm profiler
+            prepareGemmProfiler(gemm_to_profile);
             check_cuda_error(cudaGraphCreate(&mGraph[i], 0));
             check_cuda_error(cudaStreamBeginCapture(streamPtr->get(), cudaStreamCaptureModeThreadLocal));
-            runMoEPermute(parallelism_config);
+            runMoEPermute(parallelism_config, gemm_to_profile);
             check_cuda_error(cudaStreamEndCapture(streamPtr->get(), &mGraph[i]));
             check_cuda_error(cudaGraphInstantiate(&mGraphInstance[i], mGraph[i], nullptr, nullptr, 0));
         }
@@ -711,13 +789,23 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
         }
     }
 
-    float benchmarkLoop(MOEParallelismConfig parallelism_config)
+    float benchmarkLoop(MOEParallelismConfig parallelism_config, GemmToProfile gemm_to_profile)
     {
         mBufferIndex = (mBufferIndex + 1) % NUM_BUFFERS;
-        auto tactic = routingConfigCache.at(mRoutingConfigIndex);
-        if (!tactic->isDeterministic())
+
+        // Setup the profiler state for this iteration. CUDA Graphs will do this when it captures the graph.
+        if (gemm_to_profile != GemmToProfile::LAYER && !useCudaGraph)
+        {
+            prepareGemmProfiler(gemm_to_profile);
+        }
+        else if (gemm_to_profile == GemmToProfile::LAYER)
         {
-            tactic->setRouting(mSelectedExperts + mSelectedExpertsSize * mBufferIndex, mNumExperts, mK, mTotalTokens);
+            auto tactic = routingConfigCache.at(mRoutingConfigIndex);
+            if (!tactic->isDeterministic())
+            {
+                tactic->setRouting(
+                    mSelectedExperts + mSelectedExpertsSize * mBufferIndex, mNumExperts, mK, mTotalTokens);
+            }
         }
 
         {
@@ -729,7 +817,7 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
             }
             else
             {
-                runMoEPermute(parallelism_config);
+                runMoEPermute(parallelism_config, gemm_to_profile);
             }
             check_cuda_error(cudaEventRecord(mEndEvent, streamPtr->get()));
             check_cuda_error(cudaStreamSynchronize(streamPtr->get()));
@@ -742,27 +830,19 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
 
     // An imprecise benchmark pass for picking the best tactic.
     // Runs for 3 iterations or 1 second and picks the best option
-    int pickBestTactic(MOEParallelismConfig parallelism_config, GemmProfilerBackend::GemmToProfile gemm_to_profile)
+    int pickBestTactic(MOEParallelismConfig parallelism_config, GemmToProfile gemm_to_profile)
     {
         auto tactics = mMoERunner.getTactics();
         ::nvtx3::scoped_range nvtx(tensorrt_llm::common::nvtx::nextColor(),
             "Tactic Profiling GEMM " + std::to_string(static_cast<int>(gemm_to_profile)));
+        // We save space by reusing the same workspace buffer for all tactics when doing full layer profiling. So we
+        // need to hardcode the buffer index to 0.
+        auto old_buffer_index = mBufferIndex;
+        mBufferIndex = 0;
+        prepareGemmProfiler(gemm_to_profile);
+        mBufferIndex = old_buffer_index;
 
-        GemmProfilerBackend profiler;
-#ifdef USING_OSS_CUTLASS_MOE_GEMM
-        profiler.init(mMoERunner, gemm_to_profile, typeToDtypeID<DataType>(), typeToDtypeID<WeightType>(),
-            typeToDtypeID<OutputType>(), mNumExperts, mK, mHiddenSize, mInterSize, mGroupSize, mActType, mUseBias,
-            mUseLora, /*min_latency_mode=*/false, /*need_weights=*/true, parallelism_config, /*enable_alltoall=*/false);
-#else
-        profiler.init(mMoERunner, gemm_to_profile, typeToDtypeID<DataType>(), typeToDtypeID<WeightType>(),
-            typeToDtypeID<OutputType>(), mNumExperts, mK, mHiddenSize, mInterSize, mGroupSize, mActType, mUseBias,
-            mUseLora, /*min_latency_mode=*/false, /*need_weights=*/true, parallelism_config);
-#endif
-        auto workspace_size = profiler.getWorkspaceSize(mTotalTokens);
-        auto workspace = bufferManager->gpu(workspace_size);
-
-        profiler.prepare(
-            mTotalTokens, static_cast<char*>(workspace->data()), /*expert_weights=*/nullptr, streamPtr->get());
+        auto* mGemmProfilerExpertWeights = gemm_to_profile == GemmToProfile::GEMM_1 ? mExpertWeight1 : mExpertWeight2;
 
         float best_time = INFINITY;
         int best_idx = -1;
@@ -778,13 +858,13 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
                 {
                     ::nvtx3::scoped_range nvtx(tensorrt_llm::common::nvtx::nextColor(), "Tactic Profiling Warm-Up");
                     // Warm-Up run
-                    profiler.runProfiler(mTotalTokens, t, static_cast<char*>(workspace->data()),
-                        /*expert_weights=*/nullptr, streamPtr->get());
+                    mGemmProfilerBackend.runProfiler(mTotalTokens, t, mGemmProfilerWorkspace,
+                        /*expert_weights=*/mGemmProfilerExpertWeights, streamPtr->get());
                     check_cuda_error(cudaStreamSynchronize(streamPtr->get()));
                 }
 
                 // Profile all samples or for 1 sec
-                int const max_iters = profiler.NUM_ROUTING_SAMPLES;
+                int const max_iters = mGemmProfilerBackend.NUM_ROUTING_SAMPLES;
                 float const max_time_ms = 1000.f;
 
                 float time = 0.f;
@@ -796,8 +876,8 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
                             "Tactic Profiling Iteration " + std::to_string(iter));
 
                         check_cuda_error(cudaEventRecord(mStartEvent, streamPtr->get()));
-                        profiler.runProfiler(mTotalTokens, t, static_cast<char*>(workspace->data()),
-                            /*expert_weights=*/nullptr, streamPtr->get());
+                        mGemmProfilerBackend.runProfiler(mTotalTokens, t, mGemmProfilerWorkspace,
+                            /*expert_weights=*/mGemmProfilerExpertWeights, streamPtr->get());
                         check_cuda_error(cudaEventRecord(mEndEvent, streamPtr->get()));
                         check_cuda_error(cudaStreamSynchronize(streamPtr->get()));
                     }
@@ -838,17 +918,26 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
         return best_idx;
     }
 
-    std::pair<int, int> setTactic(int tactic_idx1, int tactic_idx2, MOEParallelismConfig parallelism_config)
+    int mBestTacticGemm1 = -1;
+    int mBestTacticGemm2 = -1;
+
+    std::pair<int, int> setTactic(
+        int tactic_idx1, int tactic_idx2, MOEParallelismConfig parallelism_config, GemmToProfile gemm_to_profile)
     {
         auto tactics = mMoERunner.getTactics();
-        for (auto& t_ptr : {&tactic_idx1, &tactic_idx2})
+        std::vector<std::pair<std::reference_wrapper<int>, GemmToProfile>> tactics_to_profile{
+            {tactic_idx1, GemmToProfile::GEMM_1}, {tactic_idx2, GemmToProfile::GEMM_2}};
+        for (auto& combo : tactics_to_profile)
         {
-            auto& t = *t_ptr;
+            auto& t = combo.first.get();
+            if (combo.second != gemm_to_profile && gemm_to_profile != GemmToProfile::LAYER)
+            {
+                t = 0; // Unneeded tactic, set to 0
+                continue;
+            }
             if (t == -1)
             {
-                t = pickBestTactic(parallelism_config,
-                    t_ptr == &tactic_idx1 ? GemmProfilerBackend::GemmToProfile::GEMM_1
-                                          : GemmProfilerBackend::GemmToProfile::GEMM_2);
+                t = pickBestTactic(parallelism_config, combo.second);
             }
 
             if (t < 0 || t >= tactics.size())
@@ -858,38 +947,66 @@ class MixtureOfExpertsBenchmark : public ::benchmark::Fixture
         }
 
         mMoERunner.setTactic(tactics[tactic_idx1], tactics[tactic_idx2]);
+        mBestTacticGemm1 = tactic_idx1;
+        mBestTacticGemm2 = tactic_idx2;
         return {tactic_idx1, tactic_idx2};
     }
 
-    void runMoEPermute(MOEParallelismConfig parallelism_config)
+    void runMoEPermute(MOEParallelismConfig parallelism_config, GemmToProfile gemm_to_profile)
     {
-        auto stream = streamPtr->get();
-        MoeMinLatencyParams min_latency_params;
+        switch (gemm_to_profile)
+        {
+        case GemmToProfile::GEMM_1:
+        case GemmToProfile::GEMM_2:
+        {
+            auto tactic_idx = gemm_to_profile == GemmToProfile::GEMM_1 ? mBestTacticGemm1 : mBestTacticGemm2;
+            auto* expert_weights = gemm_to_profile == GemmToProfile::GEMM_1 ? mExpertWeight1 : mExpertWeight2;
+            auto expert_weights_size
+                = gemm_to_profile == GemmToProfile::GEMM_1 ? mExpertWeight1Size : mExpertWeight2Size;
+
+            auto tactics = mMoERunner.getTactics()[tactic_idx];
+            if (static_cast<int>(gemm_to_profile) != static_cast<int>(mGemmProfilerBackend.mGemmToProfile))
+            {
+                throw std::runtime_error("Configuration mismatch between mGemmProfilerBackend and runMoEPermute");
+            }
+            mGemmProfilerBackend.mSampleIndex = mBufferIndex % mGemmProfilerBackend.NUM_ROUTING_SAMPLES;
+            mGemmProfilerBackend.runProfiler(mTotalTokens, tactics,
+                mGemmProfilerWorkspace + mGemmProfilerWorkspaceSize * mBufferIndex,
+                /*expert_weights=*/expert_weights + expert_weights_size * mBufferIndex, streamPtr->get());
+            break;
+        }
+        case GemmToProfile::LAYER:
+        {
+            auto stream = streamPtr->get();
+            MoeMinLatencyParams min_latency_params;
 #ifdef USING_OSS_CUTLASS_MOE_GEMM
-        mMoERunner.runMoe(mInputTensor + mInputTensorSize * mBufferIndex, nullptr,
-            mSelectedExperts + mSelectedExpertsSize * mBufferIndex,
-            mUseFinalScale ? mScaleProbs + mScaleProbsSize * mBufferIndex : nullptr,
-            mExpertWeight1 + mExpertWeight1Size * mBufferIndex, mExpertBias1 + mExpertBias1Size * mBufferIndex,
-            mActType, mExpertWeight2 + mExpertWeight2Size * mBufferIndex,
-            mExpertBias2 + mExpertBias2Size * mBufferIndex, mQuantParams[mBufferIndex], mTotalTokens, mHiddenSize,
-            mInterSize, mNumExperts, mK, mWorkspace + mWorkspaceSize * mBufferIndex,
-            mFinalOutput + mFinalOutputSize * mBufferIndex,
-            mSourceToExpandedMap + mSourceToExpandedMapSize * mBufferIndex, parallelism_config,
-            /*enable_alltoall=*/false, mUseLora, mLoraParams[mBufferIndex],
-            /*use_fp8_block_scaling=*/false, /*min_latency_mode=*/false, min_latency_params, stream);
+            mMoERunner.runMoe(mInputTensor + mInputTensorSize * mBufferIndex, nullptr,
+                mSelectedExperts + mSelectedExpertsSize * mBufferIndex,
+                mUseFinalScale ? mScaleProbs + mScaleProbsSize * mBufferIndex : nullptr,
+                mExpertWeight1 + mExpertWeight1Size * mBufferIndex, mExpertBias1 + mExpertBias1Size * mBufferIndex,
+                mActType, mExpertWeight2 + mExpertWeight2Size * mBufferIndex,
+                mExpertBias2 + mExpertBias2Size * mBufferIndex, mQuantParams[mBufferIndex], mTotalTokens, mHiddenSize,
+                mInterSize, mNumExperts, mK, mWorkspace + mWorkspaceSize * mBufferIndex,
+                mFinalOutput + mFinalOutputSize * mBufferIndex,
+                mSourceToExpandedMap + mSourceToExpandedMapSize * mBufferIndex, parallelism_config,
+                /*enable_alltoall=*/false, mUseLora, mLoraParams[mBufferIndex],
+                /*use_fp8_block_scaling=*/false, /*min_latency_mode=*/false, min_latency_params, stream);
 #else
-        mMoERunner.runMoe(mInputTensor + mInputTensorSize * mBufferIndex, nullptr,
-            mSelectedExperts + mSelectedExpertsSize * mBufferIndex,
-            mUseFinalScale ? mScaleProbs + mScaleProbsSize * mBufferIndex : nullptr,
-            mExpertWeight1 + mExpertWeight1Size * mBufferIndex, mExpertBias1 + mExpertBias1Size * mBufferIndex,
-            mActType, mExpertWeight2 + mExpertWeight2Size * mBufferIndex,
-            mExpertBias2 + mExpertBias2Size * mBufferIndex, mQuantParams[mBufferIndex], mTotalTokens, mHiddenSize,
-            mInterSize, mNumExperts, mK, mWorkspace + mWorkspaceSize * mBufferIndex,
-            mFinalOutput + mFinalOutputSize * mBufferIndex,
-            mSourceToExpandedMap + mSourceToExpandedMapSize * mBufferIndex, parallelism_config, mUseLora,
-            mLoraParams[mBufferIndex],
-            /*use_fp8_block_scaling=*/false, /*min_latency_mode=*/false, min_latency_params, stream);
+            mMoERunner.runMoe(mInputTensor + mInputTensorSize * mBufferIndex, nullptr,
+                mSelectedExperts + mSelectedExpertsSize * mBufferIndex,
+                mUseFinalScale ? mScaleProbs + mScaleProbsSize * mBufferIndex : nullptr,
+                mExpertWeight1 + mExpertWeight1Size * mBufferIndex, mExpertBias1 + mExpertBias1Size * mBufferIndex,
+                mActType, mExpertWeight2 + mExpertWeight2Size * mBufferIndex,
+                mExpertBias2 + mExpertBias2Size * mBufferIndex, mQuantParams[mBufferIndex], mTotalTokens, mHiddenSize,
+                mInterSize, mNumExperts, mK, mWorkspace + mWorkspaceSize * mBufferIndex,
+                mFinalOutput + mFinalOutputSize * mBufferIndex,
+                mSourceToExpandedMap + mSourceToExpandedMapSize * mBufferIndex, parallelism_config, mUseLora,
+                mLoraParams[mBufferIndex],
+                /*use_fp8_block_scaling=*/false, /*min_latency_mode=*/false, min_latency_params, stream);
 #endif
+            break;
+        }
+        }
     }
 
     void runBenchmark(benchmark::State& state);
@@ -913,6 +1030,7 @@ void MixtureOfExpertsBenchmark<TypeTuple_>::runBenchmark(benchmark::State& state
     int tactic_idx1 = state.range(11);
     int tactic_idx2 = state.range(12);
     int const routing_config = state.range(13);
+    GemmToProfile const gemm_to_profile = static_cast<GemmToProfile>(state.range(14));
 
     state.counters["num_experts"] = num_experts;
     state.counters["top_k"] = top_k;
@@ -928,11 +1046,12 @@ void MixtureOfExpertsBenchmark<TypeTuple_>::runBenchmark(benchmark::State& state
     state.counters["routing_config"] = (int) routing_config;
     state.counters["dtype"] = (int) toDTypeID();
     state.counters["wtype"] = (int) toWTypeID();
+    state.counters["gemm_to_profile"] = (int) gemm_to_profile;
 
     std::stringstream ss;
-    ss << "Experts,K,Hidden,Inter,TP,EP,Rank,Tokens,Bias,Scale,Actfn,Tactic,Routing=";
+    ss << "Experts,K,Hidden,Inter,TP,EP,Rank,Tokens,Bias,Scale,Actfn,Tactic1,Tactic2,Gemm,Routing=";
     for (auto v : {num_experts, top_k, hidden_size, inter_size, tp_size, ep_size, world_rank, num_tokens,
-             (int) mUseBias, (int) mUseFinalScale, (int) mActType, tactic_idx1, tactic_idx2})
+             (int) mUseBias, (int) mUseFinalScale, (int) mActType, tactic_idx1, tactic_idx2, (int) gemm_to_profile})
     {
         ss << v << ",";
     }
@@ -942,10 +1061,11 @@ void MixtureOfExpertsBenchmark<TypeTuple_>::runBenchmark(benchmark::State& state
 
     // Always use EP size for moe config until we support TP+EP, we just divide the inter size for TP
     MOEParallelismConfig parallelism_config{tp_size, world_rank / ep_size, ep_size, world_rank % ep_size};
-    initBuffersPermute(num_tokens, hidden_size, inter_size, num_experts, top_k, routing_config, parallelism_config);
+    initBuffersPermute(
+        num_tokens, hidden_size, inter_size, num_experts, top_k, routing_config, parallelism_config, gemm_to_profile);
 
     // Parse the tactic, does checks for "auto" mode and out of range
-    std::tie(tactic_idx1, tactic_idx2) = setTactic(tactic_idx1, tactic_idx2, parallelism_config);
+    std::tie(tactic_idx1, tactic_idx2) = setTactic(tactic_idx1, tactic_idx2, parallelism_config, gemm_to_profile);
     if (tactic_idx1 < 0 || tactic_idx2 < 0)
     {
         state.SkipWithMessage("Out of range tactic");
@@ -962,13 +1082,13 @@ void MixtureOfExpertsBenchmark<TypeTuple_>::runBenchmark(benchmark::State& state
     state.counters["tactic_idx1"] = tactic_idx1;
     state.counters["tactic_idx2"] = tactic_idx2;
 
-    createGraph(parallelism_config);
+    createGraph(parallelism_config, gemm_to_profile);
 
     {
-        NVTX3_SCOPED_RANGE(BenchmarkRun);
+        ::nvtx3::scoped_range nvtx(tensorrt_llm::common::nvtx::nextColor(), "BenchmarkRun " + ss.str());
         for (auto _ : state)
         {
-            float ms = benchmarkLoop(parallelism_config);
+            float ms = benchmarkLoop(parallelism_config, gemm_to_profile);
             state.SetIterationTime(ms / 1000.f);
         }
     }
diff --git a/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkLauncher.cu b/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkLauncher.cu
index 663759e3ff7..b784c6d0bc4 100644
--- a/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkLauncher.cu
+++ b/cpp/micro_benchmarks/mixtureOfExpertsBackendBenchmarkLauncher.cu
@@ -389,11 +389,11 @@ void argGenLoadFile(benchmark::internal::Benchmark* benchmark)
             {
                 continue;
             }
-            else if (std::is_same_v<typename BenchClass::WeightType, float> && !hasDtype("float")
-                && !hasDtype("float32"))
-            {
-                continue;
-            }
+            // else if (std::is_same_v<typename BenchClass::WeightType, float> && !hasDtype("float")
+            //     && !hasDtype("float32"))
+            // {
+            //     continue;
+            // }
             else if (std::is_same_v<typename BenchClass::WeightType, half> && !hasDtype("float16") && !hasDtype("half"))
             {
                 continue;
@@ -452,8 +452,38 @@ void argGenLoadFile(benchmark::internal::Benchmark* benchmark)
         int world_rank = get_or("world_rank", 0);
         int bias = get_or("bias", 0);
         int do_final_scale = get_or("do_final_scale", 1); // Default to scales on
+        int gemm_to_profile = get_or("gemm_to_profile", (int) GemmToProfile::LAYER);
         TLLM_CHECK_WITH_INFO(world_rank < tp_size * ep_size, "Rank is out of bounds of tp*ep");
 
+        if (gemm_to_profile != (int) GemmToProfile::LAYER && routing_config != UNIFORM_ROUTING_CONFIG)
+        {
+            static bool info_printed = false;
+            if (!info_printed && LOG_LEVEL >= INFO)
+            {
+                std::cerr << "Warning: GEMM profiling is experimental, results may be inaccurate" << std::endl;
+                info_printed = true;
+            }
+
+            static bool printed = false;
+            if (LOG_LEVEL >= ERROR && !printed)
+            {
+                std::cerr << "Warning: Profiling a specific GEMM will always use uniform random token distribution"
+                          << std::endl;
+                printed = true;
+            }
+            routing_config = UNIFORM_ROUTING_CONFIG;
+            if (gemm_to_profile == (int) GemmToProfile::GEMM_1)
+            {
+                tactic_ids2 = {-1};
+            }
+            else if (gemm_to_profile == (int) GemmToProfile::GEMM_2)
+            {
+                if (!has_tactic_ids2)
+                    tactic_ids2 = std::move(tactic_ids1);
+                tactic_ids1 = {-1};
+            }
+        }
+
         auto get_range = [&](std::string name, int min = 1, int max = INT32_MAX)
         {
             auto val = run_config.at(name).get<int>();
@@ -482,7 +512,7 @@ void argGenLoadFile(benchmark::internal::Benchmark* benchmark)
                     get_range("act_fn", 0, (int) ActivationType::Identity), //
                     t1,                                                     //
                     t2,                                                     //
-                    *routing_config});
+                    *routing_config, gemm_to_profile});
             }
         }
     }
@@ -518,7 +548,8 @@ void argGenHardcoded(benchmark::internal::Benchmark* benchmark)
                                         for (auto tactic2 : cutlass_tactic)
                                             for (auto routing : routing_config)
                                                 benchmark->Args({num_expert, k, size, inter_size, 1, 1, 0, tokens, bias,
-                                                    1, (int) act, tactic1, tactic2, routing});
+                                                    1, (int) act, tactic1, tactic2, routing,
+                                                    (int) GemmToProfile::LAYER});
                     }
 }
 
@@ -542,7 +573,7 @@ void argGen(benchmark::internal::Benchmark* benchmark)
     benchmark->UseManualTime();
     benchmark->ArgNames(
         {"Num Experts", "K", "Hidden Size", "Inter Size", "TP Size", "EP Size", "World Rank", "Num Tokens", "Use Bias",
-            "Use Final Scale", "Activation Function", "Tactic ID 1", "Tactic ID 2", "Routing ID"});
+            "Use Final Scale", "Activation Function", "Tactic ID 1", "Tactic ID 2", "Routing ID", "Gemm To Profile"});
 
     if (workloadFile)
         argGenLoadFile<BenchClass>(benchmark);
@@ -550,7 +581,8 @@ void argGen(benchmark::internal::Benchmark* benchmark)
         argGenHardcoded<BenchClass>(benchmark);
 }
 
-BENCHMARK_BASIC(float, float, float)
+// No one cares about float32
+// BENCHMARK_BASIC(float, float, float)
 BENCHMARK_BASIC(half, half, half)
 using uint8 = uint8_t;
 BENCHMARK_BASIC(half, uint8, half)
@@ -576,7 +608,7 @@ void delayedRegisterBenchmark()
     if (workloadFile)
     {
         // Extra ones we don't want for hardcoded runs
-        BENCHMARK_BASIC_DO_REGISTER(float, float, float);
+        // BENCHMARK_BASIC_DO_REGISTER(float, float, float);
         BENCHMARK_BASIC_DO_REGISTER(half, uint8, half);
         BENCHMARK_BASIC_DO_REGISTER(half, uint4b_t, half);
 #ifdef ENABLE_BF16
@@ -597,6 +629,9 @@ void doCleanup()
 
 void help()
 {
+    std::cout << "**Disclaimer: This benchmark is intended for developers to help evaluating the impact of new "
+                 "optimisations. This benchmark does not meet the same quality standards as other parts of TRT-LLM. "
+                 "Please use with caution**\n\n";
     std::cout << "Usage: mixtureOfExpertsBackendBenchmark [--disable_cuda_graphs] [--input_file <file>] [benchmark "
                  "options]\n";
     std::cout
@@ -624,6 +659,7 @@ void help()
            "    \"routing_name\": string, (optional)\n"
            "    \"selected_experts\": [int, ...], or string, (optional, length is a multiple of k)\n"
            "    \"expert_distribution\": [float, ...], or string, (optional, length is num_experts)\n"
+           "    \"gemm_to_profile\": int, (experimental, optional, 1 = gemm1, 2 = gemm2, 3 = layer)\n"
            "  },\n"
            "  ...\n"
            "]\n"
@@ -664,7 +700,7 @@ void help()
            "Useful for quick perf tests, prefer a full sweep and manually setting the tactic for more accurate "
            "results"
            "- dtypes - A list of dtypes to run this config through.\n"
-           "Allowed values are: fp8, fp4, wfp4afp8, int4, int8, float, half, bfloat16\n"
+           "Allowed values are: fp8, fp4, wfp4afp8, int4, int8, half, bfloat16\n"
            "If this argument is omitted all dtypes will be run. Note, not all tactics are supported for all "
            "dtypes,\n"
            "unsupported tactics will be skipped with a warning.\n"
@@ -681,6 +717,8 @@ void help()
            "- \"expert_distribution\" - instead of explicitly setting selected_experts, define a random distribution "
            "that experts will be randomly sampled from."
            "There is also pre-defined config \"uniform\", which is short-hand for a random uniform distribution\n"
+           "- \"gemm_to_profile\" - the gemm to profile, 1 = gemm1, 2 = gemm2, 3 = full layer. (default layer). If a "
+           "specific GEMM is profiled, it will always use uniform random token distribution\n"
            "\n";
 
     std::cout << "benchmark options:\n";
diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h
index 912c3553bb0..c7c9a55b959 100644
--- a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h
+++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h
@@ -845,10 +845,10 @@ struct GemmProfilerBackend
         mWType = wtype;
         mOType = otype;
         mNumExperts = num_experts;
-        mNumExpertsPerNode = num_experts / (parallelism_config.ep_size * parallelism_config.tp_size);
+        mNumExpertsPerNode = num_experts / parallelism_config.ep_size;
         mK = k;
         mExpertHiddenSize = hidden_size;
-        mExpertInterSize = inter_size;
+        mExpertInterSize = inter_size; // Already divided by tp_size
         mGroupSize = group_size;
         mActivationType = activation_type;
         mBias = bias;

From b75e53ab695308f9464d5b3fc0e1d6441d053f71 Mon Sep 17 00:00:00 2001
From: Iman Tabrizian <10105175+Tabrizian@users.noreply.github.com>
Date: Thu, 17 Jul 2025 19:12:54 -0700
Subject: [PATCH 72/88] Revert "feat: nanobind bindings (#5961)" (#6160)

Signed-off-by: Iman Tabrizian <10105175+tabrizian@users.noreply.github.com>
---
 cpp/CMakeLists.txt                            |   4 +-
 .../batch_manager/runtimeBuffers.h            |   2 +-
 .../batch_manager/runtimeBuffers.cpp          |   2 +-
 cpp/tensorrt_llm/nanobind/CMakeLists.txt      |  37 +-
 .../nanobind/batch_manager/algorithms.cpp     | 178 ----
 .../nanobind/batch_manager/algorithms.h       |  29 -
 .../nanobind/batch_manager/bindings.cpp       | 525 ----------
 .../nanobind/batch_manager/bindings.h         |  28 -
 .../nanobind/batch_manager/buffers.cpp        | 108 --
 .../nanobind/batch_manager/buffers.h          |  29 -
 .../batch_manager/cacheTransceiver.cpp        | 110 ---
 .../nanobind/batch_manager/cacheTransceiver.h |  29 -
 .../nanobind/batch_manager/kvCacheManager.cpp | 478 ---------
 .../nanobind/batch_manager/kvCacheManager.h   |  39 -
 .../nanobind/batch_manager/llmRequest.cpp     | 131 ---
 .../nanobind/batch_manager/llmRequest.h       | 160 ---
 cpp/tensorrt_llm/nanobind/bindings.cpp        | 471 +--------
 cpp/tensorrt_llm/nanobind/common/bindTypes.h  | 100 --
 .../nanobind/common/customCasters.h           | 345 -------
 .../nanobind/executor/bindings.cpp            | 263 -----
 cpp/tensorrt_llm/nanobind/executor/bindings.h |  29 -
 .../nanobind/executor/executor.cpp            | 241 -----
 cpp/tensorrt_llm/nanobind/executor/executor.h | 129 ---
 .../nanobind/executor/executorConfig.cpp      | 616 ------------
 .../nanobind/executor/executorConfig.h        |  30 -
 .../nanobind/executor/request.cpp             | 935 ------------------
 cpp/tensorrt_llm/nanobind/executor/request.h  |  29 -
 .../nanobind/runtime/bindings.cpp             | 388 --------
 cpp/tensorrt_llm/nanobind/runtime/bindings.h  |  30 -
 .../nanobind/runtime/moeBindings.cpp          | 124 ---
 .../nanobind/runtime/moeBindings.h            |  29 -
 .../nanobind/testing/modelSpecBinding.cpp     |  87 --
 .../nanobind/testing/modelSpecBinding.h       |  29 -
 .../nanobind/userbuffers/bindings.cpp         |  47 -
 .../nanobind/userbuffers/bindings.h           |  30 -
 cpp/tensorrt_llm/pybind/bindings.cpp          |   2 +-
 cpp/tensorrt_llm/pybind/executor/bindings.cpp |  12 +-
 .../pybind/executor/executorConfig.cpp        |   2 +-
 examples/models/core/llama/summarize_long.py  |   2 +-
 examples/models/core/qwen2audio/run.py        |   3 +-
 examples/models/core/qwenvl/run.py            |   3 +-
 jenkins/Build.groovy                          |  18 -
 jenkins/L0_Test.groovy                        |   8 -
 tensorrt_llm/builder.py                       |   2 +-
 tensorrt_llm/commands/build.py                |  19 +-
 tensorrt_llm/runtime/model_runner.py          |   2 +-
 .../integration/test_lists/test-db/l0_a10.yml |  15 -
 tests/unittest/bindings/test_bindings_ut.py   |   7 -
 .../bindings/test_executor_bindings.py        |  17 +-
 49 files changed, 21 insertions(+), 5932 deletions(-)
 delete mode 100644 cpp/tensorrt_llm/nanobind/batch_manager/algorithms.cpp
 delete mode 100644 cpp/tensorrt_llm/nanobind/batch_manager/algorithms.h
 delete mode 100644 cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
 delete mode 100644 cpp/tensorrt_llm/nanobind/batch_manager/bindings.h
 delete mode 100644 cpp/tensorrt_llm/nanobind/batch_manager/buffers.cpp
 delete mode 100644 cpp/tensorrt_llm/nanobind/batch_manager/buffers.h
 delete mode 100644 cpp/tensorrt_llm/nanobind/batch_manager/cacheTransceiver.cpp
 delete mode 100644 cpp/tensorrt_llm/nanobind/batch_manager/cacheTransceiver.h
 delete mode 100644 cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp
 delete mode 100644 cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.h
 delete mode 100644 cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.cpp
 delete mode 100644 cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.h
 delete mode 100644 cpp/tensorrt_llm/nanobind/common/bindTypes.h
 delete mode 100644 cpp/tensorrt_llm/nanobind/common/customCasters.h
 delete mode 100644 cpp/tensorrt_llm/nanobind/executor/bindings.cpp
 delete mode 100644 cpp/tensorrt_llm/nanobind/executor/bindings.h
 delete mode 100644 cpp/tensorrt_llm/nanobind/executor/executor.cpp
 delete mode 100644 cpp/tensorrt_llm/nanobind/executor/executor.h
 delete mode 100644 cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp
 delete mode 100644 cpp/tensorrt_llm/nanobind/executor/executorConfig.h
 delete mode 100644 cpp/tensorrt_llm/nanobind/executor/request.cpp
 delete mode 100644 cpp/tensorrt_llm/nanobind/executor/request.h
 delete mode 100644 cpp/tensorrt_llm/nanobind/runtime/bindings.cpp
 delete mode 100644 cpp/tensorrt_llm/nanobind/runtime/bindings.h
 delete mode 100644 cpp/tensorrt_llm/nanobind/runtime/moeBindings.cpp
 delete mode 100644 cpp/tensorrt_llm/nanobind/runtime/moeBindings.h
 delete mode 100644 cpp/tensorrt_llm/nanobind/testing/modelSpecBinding.cpp
 delete mode 100644 cpp/tensorrt_llm/nanobind/testing/modelSpecBinding.h
 delete mode 100644 cpp/tensorrt_llm/nanobind/userbuffers/bindings.cpp
 delete mode 100644 cpp/tensorrt_llm/nanobind/userbuffers/bindings.h

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index d9e8c206f46..a76b3e21558 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -198,7 +198,7 @@ set(TRT_LIB TensorRT::NvInfer)
 get_filename_component(TRT_LLM_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR} PATH)
 
 set(3RDPARTY_DIR ${TRT_LLM_ROOT_DIR}/3rdparty)
-if(BINDING_TYPE STREQUAL "pybind" OR BUILD_DEEP_EP)
+if(BINDING_TYPE STREQUAL "pybind")
   add_subdirectory(${3RDPARTY_DIR}/pybind11
                    ${CMAKE_CURRENT_BINARY_DIR}/pybind11)
 endif()
@@ -217,7 +217,7 @@ include_directories(
   ${3RDPARTY_DIR}/cutlass/tools/util/include
   ${3RDPARTY_DIR}/NVTX/include
   ${3RDPARTY_DIR}/json/include)
-if(BINDING_TYPE STREQUAL "pybind" OR BUILD_DEEP_EP)
+if(BINDING_TYPE STREQUAL "pybind")
   include_directories(${3RDPARTY_DIR}/pybind11/include)
 endif()
 if(BINDING_TYPE STREQUAL "nanobind")
diff --git a/cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h b/cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h
index fa43d084b27..13bde6d07a5 100644
--- a/cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h
+++ b/cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h
@@ -168,7 +168,7 @@ class RuntimeBuffers
 
 public:
     //! Additional buffers depending on model type
-    std::shared_ptr<TransformerBuffers> transformerBuffers;
+    std::unique_ptr<TransformerBuffers> transformerBuffers;
     std::unique_ptr<RnnStateBuffers> rnnStateBuffers;
 
     //! Encoder-Decoder
diff --git a/cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp b/cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp
index e8b71d065f3..691fb9c7efd 100644
--- a/cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp
+++ b/cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp
@@ -84,7 +84,7 @@ void RuntimeBuffers::create(SizeType32 maxBatchSize, SizeType32 maxBeamWidth,
 
     if (modelConfig.isTransformerBased())
     {
-        transformerBuffers = std::make_shared<TransformerBuffers>(maxBatchSize, maxBeamWidth, maxAttentionWindowVec,
+        transformerBuffers = std::make_unique<TransformerBuffers>(maxBatchSize, maxBeamWidth, maxAttentionWindowVec,
             maxAttentionWindow, sinkTokenLen, runtime, modelConfig, worldConfig);
     }
     if (modelConfig.isRnnBased())
diff --git a/cpp/tensorrt_llm/nanobind/CMakeLists.txt b/cpp/tensorrt_llm/nanobind/CMakeLists.txt
index 3d570f024d7..d2e7eac20c2 100755
--- a/cpp/tensorrt_llm/nanobind/CMakeLists.txt
+++ b/cpp/tensorrt_llm/nanobind/CMakeLists.txt
@@ -3,23 +3,7 @@ set(TRTLLM_NB_MODULE
     ${TRTLLM_NB_MODULE}
     PARENT_SCOPE)
 
-set(SRCS
-    batch_manager/algorithms.cpp
-    batch_manager/bindings.cpp
-    batch_manager/buffers.cpp
-    batch_manager/cacheTransceiver.cpp
-    batch_manager/kvCacheManager.cpp
-    batch_manager/llmRequest.cpp
-    executor/bindings.cpp
-    executor/executor.cpp
-    executor/executorConfig.cpp
-    executor/request.cpp
-    runtime/bindings.cpp
-    testing/modelSpecBinding.cpp
-    runtime/moeBindings.cpp
-    userbuffers/bindings.cpp
-    ../runtime/ipcNvlsMemory.cu
-    bindings.cpp)
+set(SRCS ../runtime/ipcNvlsMemory.cu bindings.cpp)
 
 include_directories(${PROJECT_SOURCE_DIR}/include)
 
@@ -30,29 +14,20 @@ set_property(TARGET ${TRTLLM_NB_MODULE} PROPERTY POSITION_INDEPENDENT_CODE ON)
 target_link_directories(${TRTLLM_NB_MODULE} PUBLIC
                         "${TORCH_INSTALL_PREFIX}/lib")
 
-if(ENABLE_NVSHMEM)
-  target_link_libraries(${TRTLLM_NB_MODULE} PUBLIC nvshmem::nvshmem_host
-                                                   nvshmem::nvshmem_device)
-endif()
-
 target_link_libraries(
   ${TRTLLM_NB_MODULE}
-  PUBLIC ${SHARED_TARGET}
-         ${UNDEFINED_FLAG}
-         ${NO_AS_NEEDED_FLAG}
-         ${Python3_LIBRARIES}
-         ${TORCH_LIBRARIES}
-         torch_python
-         ${CUDA_NVML_LIB})
+  PUBLIC ${SHARED_TARGET} ${UNDEFINED_FLAG} ${NO_AS_NEEDED_FLAG}
+         ${Python3_LIBRARIES} ${TORCH_LIBRARIES} torch_python)
+
 target_compile_definitions(
   ${TRTLLM_NB_MODULE} PUBLIC TRTLLM_NB_MODULE=${TRTLLM_NB_MODULE}
-                             PYBIND11_DETAILED_ERROR_MESSAGES=1)
+                             NB_DETAILED_ERROR_MESSAGES=1)
 
 if(NOT WIN32)
   set_target_properties(
     ${TRTLLM_NB_MODULE}
     PROPERTIES
       LINK_FLAGS
-      "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' -Wl,-rpath,'${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib/stubs' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
+      "-Wl,-rpath,'$ORIGIN/libs' -Wl,-rpath,'$ORIGIN/../nvidia/nccl/lib' ${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}"
   )
 endif()
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/algorithms.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/algorithms.cpp
deleted file mode 100644
index 637401555e8..00000000000
--- a/cpp/tensorrt_llm/nanobind/batch_manager/algorithms.cpp
+++ /dev/null
@@ -1,178 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "algorithms.h"
-#include "tensorrt_llm/batch_manager/allocateKvCache.h"
-#include "tensorrt_llm/batch_manager/assignReqSeqSlots.h"
-#include "tensorrt_llm/batch_manager/capacityScheduler.h"
-#include "tensorrt_llm/batch_manager/createNewDecoderRequests.h"
-#include "tensorrt_llm/batch_manager/handleContextLogits.h"
-#include "tensorrt_llm/batch_manager/handleGenerationLogits.h"
-#include "tensorrt_llm/batch_manager/kvCacheManager.h"
-#include "tensorrt_llm/batch_manager/llmRequest.h"
-#include "tensorrt_llm/batch_manager/logitsPostProcessor.h"
-#include "tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h"
-#include "tensorrt_llm/batch_manager/medusaBuffers.h"
-#include "tensorrt_llm/batch_manager/microBatchScheduler.h"
-#include "tensorrt_llm/batch_manager/pauseRequests.h"
-#include "tensorrt_llm/batch_manager/peftCacheManager.h"
-#include "tensorrt_llm/batch_manager/runtimeBuffers.h"
-#include "tensorrt_llm/batch_manager/updateDecoderBuffers.h"
-#include "tensorrt_llm/nanobind/common/customCasters.h"
-#include "tensorrt_llm/runtime/decoderState.h"
-#include "tensorrt_llm/runtime/torch.h"
-#include "tensorrt_llm/runtime/torchView.h"
-
-#include <ATen/core/TensorBody.h>
-#include <nanobind/nanobind.h>
-#include <nanobind/stl/list.h>
-#include <nanobind/stl/shared_ptr.h>
-#include <nanobind/stl/tuple.h>
-#include <nanobind/stl/vector.h>
-#include <torch/extension.h>
-
-#include <optional>
-
-namespace nb = nanobind;
-
-namespace tr = tensorrt_llm::runtime;
-using namespace tensorrt_llm::batch_manager;
-
-void tensorrt_llm::nanobind::batch_manager::algorithms::initBindings(nb::module_& m)
-{
-    nb::class_<CapacityScheduler>(m, CapacityScheduler::name)
-        .def(nb::init<SizeType32, executor::CapacitySchedulerPolicy, bool, bool, LlmRequestState, LlmRequestState>(),
-            nb::arg("max_num_requests"), nb::arg("capacity_scheduler_policy"), nb::arg("has_kv_cache_manager"),
-            nb::arg("two_step_lookahead") = false, nb::arg("no_schedule_until_state") = LlmRequestState::kCONTEXT_INIT,
-            nb::arg("no_schedule_after_state") = LlmRequestState::kGENERATION_COMPLETE)
-        .def("__call__", &CapacityScheduler::operator(), nb::arg("active_requests"),
-            nb::arg("kv_cache_manager") = nullptr, nb::arg("peft_cache_manager") = nullptr,
-            nb::arg("cross_kv_cache_manager") = nullptr)
-        .def("name", [](CapacityScheduler const&) { return CapacityScheduler::name; });
-
-    nb::class_<MicroBatchScheduler>(m, MicroBatchScheduler::name)
-        .def(nb::init<std::optional<batch_scheduler::ContextChunkingConfig>, std::optional<SizeType32>, LlmRequestState,
-                 LlmRequestState>(),
-            nb::arg("ctx_chunk_config") = std::nullopt, nb::arg("max_context_length") = std::nullopt,
-            nb::arg("no_schedule_until_state") = LlmRequestState::kCONTEXT_INIT,
-            nb::arg("no_schedule_after_state") = LlmRequestState::kGENERATION_COMPLETE)
-        .def("__call__", &MicroBatchScheduler::operator(), nb::arg("active_requests"), nb::arg("inflight_req_ids"),
-            nb::arg("max_batch_size_runtime"), nb::arg("max_num_tokens_runtime"))
-        .def("name", [](MicroBatchScheduler const&) { return MicroBatchScheduler::name; });
-
-    nb::class_<PauseRequests>(m, PauseRequests::name)
-        .def(nb::init<SizeType32>(), nb::arg("max_input_len"))
-        .def("__call__", &PauseRequests::operator(), nb::arg("requests_to_pause"), nb::arg("inflight_req_ids"),
-            nb::arg("req_ids_to_pause"), nb::arg("pause_flagged"), nb::arg("seq_slot_manager"),
-            nb::arg("kv_cache_manager") = std::nullopt, nb::arg("cross_kv_cache_manager") = std::nullopt,
-            nb::arg("peft_cache_manager") = std::nullopt)
-        .def("name", [](PauseRequests const&) { return PauseRequests::name; });
-
-    nb::class_<AssignReqSeqSlots>(m, AssignReqSeqSlots::name)
-        .def(nb::init<>())
-        .def("__call__", &AssignReqSeqSlots::operator(), nb::arg("seq_slot_manager"), nb::arg("context_requests"),
-            nb::arg("generation_requests"))
-        .def("name", [](AssignReqSeqSlots const&) { return AssignReqSeqSlots::name; });
-
-    nb::class_<AllocateKvCache>(m, AllocateKvCache::name)
-        .def(nb::init<>())
-        .def("__call__", &AllocateKvCache::operator(), nb::arg("kv_cache_manager"), nb::arg("context_requests"),
-            nb::arg("generation_requests"), nb::arg("model_config"), nb::arg("cross_kv_cache_manager") = std::nullopt)
-        .def("name", [](AllocateKvCache const&) { return AllocateKvCache::name; });
-
-    nb::class_<HandleContextLogits>(m, HandleContextLogits::name)
-        .def(nb::init<>())
-        .def(
-            "__call__",
-            [](HandleContextLogits const& self, DecoderInputBuffers& inputBuffers, RequestVector const& contextRequests,
-                at::Tensor const& logits, std::vector<tr::SizeType32> const& numContextLogitsVec,
-                tr::ModelConfig const& modelConfig, tr::BufferManager const& manager,
-                OptionalRef<MedusaBuffers> medusaBuffers = std::nullopt)
-            {
-                return self(inputBuffers, contextRequests, tr::TorchView::of(logits), numContextLogitsVec, modelConfig,
-                    manager, medusaBuffers);
-            },
-            nb::arg("decoder_input_buffers"), nb::arg("context_requests"), nb::arg("logits"),
-            nb::arg("num_context_logits"), nb::arg("model_config"), nb::arg("buffer_manager"),
-            nb::arg("medusa_buffers") = std::nullopt)
-        .def("name", [](HandleContextLogits const&) { return HandleContextLogits::name; });
-
-    nb::class_<HandleGenerationLogits>(m, HandleGenerationLogits::name)
-        .def(nb::init<>())
-        .def(
-            "__call__",
-            [](HandleGenerationLogits const& self, DecoderInputBuffers& inputBuffers,
-                RequestVector const& generationRequests, at::Tensor const& logits, tr::SizeType32 logitsIndex,
-                tr::ModelConfig const& modelConfig, tr::BufferManager const& manager,
-                OptionalRef<RuntimeBuffers> genRuntimeBuffers = std::nullopt,
-                OptionalRef<MedusaBuffers> medusaBuffers = std::nullopt)
-            {
-                self(inputBuffers, generationRequests, tr::TorchView::of(logits), logitsIndex, modelConfig, manager,
-                    genRuntimeBuffers, medusaBuffers);
-            },
-            nb::arg("decoder_input_buffers"), nb::arg("generation_requests"), nb::arg("logits"),
-            nb::arg("logits_index"), nb::arg("model_config"), nb::arg("buffer_manager"),
-            nb::arg("gen_runtime_buffers") = std::nullopt, nb::arg("medusa_buffers") = std::nullopt)
-        .def("name", [](HandleGenerationLogits const&) { return HandleGenerationLogits::name; });
-
-    nb::class_<MakeDecodingBatchInputOutput>(m, MakeDecodingBatchInputOutput::name)
-        .def(nb::init<>())
-        .def("__call__", &MakeDecodingBatchInputOutput::operator(), nb::arg("context_requests"),
-            nb::arg("generation_requests"), nb::arg("decoder_input_buffers"), nb::arg("decoder_state"),
-            nb::arg("model_config"), nb::arg("max_num_sequences"), nb::arg("fused_runtime_buffers") = std::nullopt)
-        .def("name", [](MakeDecodingBatchInputOutput const&) { return MakeDecodingBatchInputOutput::name; });
-
-    nb::class_<LogitsPostProcessor>(m, LogitsPostProcessor::name)
-        .def(nb::init<>())
-        .def("__call__", &LogitsPostProcessor::operator(), nb::arg("context_requests"), nb::arg("generation_requests"),
-            nb::arg("replicate_logits_post_processor"), nb::arg("decoder_buffers"), nb::arg("world_config"),
-            nb::arg("runtime"), nb::arg("logits_post_processor_batched") = std::nullopt)
-        .def("name", [](LogitsPostProcessor const&) { return LogitsPostProcessor::name; });
-
-    nb::class_<CreateNewDecoderRequests>(m, CreateNewDecoderRequests::name)
-        .def(nb::init<bool, bool, bool>(), nb::arg("speculative_decoding_fast_logits"),
-            nb::arg("is_leader_in_orch_mode"), nb::arg("is_normalize_log_probs"))
-        .def(
-            "__call__",
-            [](CreateNewDecoderRequests& self, tr::ModelConfig const& modelConfig, tr::WorldConfig const& worldConfig,
-                executor::DecodingConfig const& decodingConfig, RequestVector const& contextRequests,
-                tr::BufferManager const& bufferManager, nvinfer1::DataType logitsType,
-                DecoderInputBuffers& inputBuffers, runtime::decoder::DecoderState& decoderState,
-                tensorrt_llm::runtime::CudaStream const& runtimeStream,
-                tensorrt_llm::runtime::CudaStream const& decoderStream, SizeType32 maxSequenceLength,
-                SizeType32 beamWidth, OptionalRef<MedusaBuffers const> medusaBuffers = std::nullopt)
-            {
-                auto [batchSlots, samplingConfigs, lookaheadPrompt, lookaheadAlgoConfigs] = self(modelConfig,
-                    worldConfig, decodingConfig, contextRequests, bufferManager, logitsType, inputBuffers, decoderState,
-                    runtimeStream, decoderStream, maxSequenceLength, beamWidth, medusaBuffers);
-
-                return std::tuple{runtime::Torch::tensor(batchSlots), std::move(samplingConfigs),
-                    std::move(lookaheadPrompt), std::move(lookaheadAlgoConfigs)};
-            },
-            nb::arg("model_config"), nb::arg("world_config"), nb::arg("decoding_config"), nb::arg("context_requests"),
-            nb::arg("buffer_manager"), nb::arg("logits_type"), nb::arg("decoder_input_buffers"),
-            nb::arg("decoder_state"), nb::arg("runtime_stream"), nb::arg("decoder_stream"),
-            nb::arg("max_sequence_length"), nb::arg("beam_width"), nb::arg("medusa_buffers") = std::nullopt)
-        .def("name", [](CreateNewDecoderRequests const&) { return CreateNewDecoderRequests::name; });
-
-    nb::class_<UpdateDecoderBuffers>(m, UpdateDecoderBuffers::name)
-        .def(nb::init<>())
-        .def("__call__", &UpdateDecoderBuffers::operator(), nb::arg("model_config"), nb::arg("decoder_output_buffers"),
-            nb::arg("copy_buffer_manager"), nb::arg("decoder_state"), nb::arg("return_log_probs"),
-            nb::arg("decoder_finish_event"))
-        .def("name", [](UpdateDecoderBuffers const&) { return UpdateDecoderBuffers::name; });
-}
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/algorithms.h b/cpp/tensorrt_llm/nanobind/batch_manager/algorithms.h
deleted file mode 100644
index cac81d73f27..00000000000
--- a/cpp/tensorrt_llm/nanobind/batch_manager/algorithms.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <nanobind/nanobind.h>
-
-namespace nb = nanobind;
-
-namespace tensorrt_llm::nanobind::batch_manager::algorithms
-{
-
-void initBindings(nb::module_& m);
-
-}
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
deleted file mode 100644
index d44a957aad9..00000000000
--- a/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
+++ /dev/null
@@ -1,525 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "bindings.h"
-#include "tensorrt_llm/nanobind/common/customCasters.h"
-
-#include "tensorrt_llm/batch_manager/common.h"
-#include "tensorrt_llm/batch_manager/decoderBuffers.h"
-#include "tensorrt_llm/batch_manager/medusaBuffers.h"
-#include "tensorrt_llm/batch_manager/microBatchScheduler.h"
-#include "tensorrt_llm/batch_manager/peftCacheManager.h"
-#include "tensorrt_llm/batch_manager/rnnStateManager.h"
-#include "tensorrt_llm/batch_manager/runtimeBuffers.h"
-#include "tensorrt_llm/batch_manager/sequenceSlotManager.h"
-#include "tensorrt_llm/nanobind/common/bindTypes.h"
-#include "tensorrt_llm/runtime/gptDecoderBatched.h"
-#include "tensorrt_llm/runtime/runtimeKernels.h"
-#include "tensorrt_llm/runtime/torch.h"
-#include "tensorrt_llm/runtime/torchView.h"
-
-#include <ATen/ATen.h>
-#include <nanobind/nanobind.h>
-#include <nanobind/stl/optional.h>
-#include <nanobind/stl/shared_ptr.h>
-#include <nanobind/stl/tuple.h>
-#include <nanobind/stl/unique_ptr.h>
-#include <nanobind/stl/vector.h>
-#include <torch/extension.h>
-#include <tuple>
-
-namespace nb = nanobind;
-namespace tb = tensorrt_llm::batch_manager;
-namespace tle = tensorrt_llm::executor;
-namespace tr = tensorrt_llm::runtime;
-
-using namespace tensorrt_llm::runtime;
-
-namespace tensorrt_llm::nanobind::batch_manager
-{
-
-void initBindings(nb::module_& m)
-{
-    using GenLlmReq = tb::GenericLlmRequest<runtime::ITensor::SharedPtr>;
-
-    // Create and register exceptions in module scope
-    nb::exception<tb::PeftTaskNotCachedException>(m, "PeftTaskNotCachedException");
-    nb::exception<tr::LoraCacheFullException>(m, "LoraCacheFullException");
-
-    // Register with no captures
-    nb::register_exception_translator(
-        [](std::exception_ptr const& p, void*)
-        {
-            try
-            {
-                if (p)
-                    std::rethrow_exception(p);
-            }
-            catch (const tb::PeftTaskNotCachedException& e)
-            {
-                PyErr_SetString(nb::type<tb::PeftTaskNotCachedException>().ptr(), e.what());
-            }
-            catch (const tr::LoraCacheFullException& e)
-            {
-                PyErr_SetString(nb::type<tr::LoraCacheFullException>().ptr(), e.what());
-            }
-        });
-
-    PybindUtils::bindSet<tb::ReqIdsSet>(m, "ReqIdsSet");
-
-    nb::enum_<tb::LlmRequestType>(m, "LlmRequestType")
-        .value("LLMREQUEST_TYPE_CONTEXT_AND_GENERATION", tb::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION)
-        .value("LLMREQUEST_TYPE_CONTEXT_ONLY", tb::LLMREQUEST_TYPE_CONTEXT_ONLY)
-        .value("LLMREQUEST_TYPE_GENERATION_ONLY", tb::LLMREQUEST_TYPE_GENERATION_ONLY)
-        .export_values();
-
-    nb::class_<tb::batch_scheduler::ContextChunkingConfig>(m, "ContextChunkingConfig")
-        .def(nb::init<tle::ContextChunkingPolicy, tensorrt_llm::runtime::SizeType32>(), nb::arg("chunking_policy"),
-            nb::arg("chunk_unit_size"))
-        .def_rw("chunking_policy", &tb::batch_scheduler::ContextChunkingConfig::chunkingPolicy)
-        .def_rw("chunk_unit_size", &tb::batch_scheduler::ContextChunkingConfig::chunkUnitSize);
-
-    nb::class_<GenLlmReq>(m, "GenericLlmRequest")
-        .def("set_exclude_input_from_output", &GenLlmReq::setExcludeInputFromOutput, nb::arg("exclude"))
-        .def("get_num_tokens", &GenLlmReq::getNumTokens, nb::arg("beam"))
-        .def_prop_ro("max_beam_num_tokens", &GenLlmReq::getMaxBeamNumTokens)
-        .def("get_token", &GenLlmReq::getToken, nb::arg("beam"), nb::arg("pos"))
-        .def("get_tokens", nb::overload_cast<GenLlmReq::SizeType32>(&GenLlmReq::getTokens, nb::const_), nb::arg("beam"))
-        .def("get_tokens", nb::overload_cast<>(&GenLlmReq::getTokens, nb::const_))
-        .def("get_last_tokens", nb::overload_cast<GenLlmReq::SizeType32>(&GenLlmReq::getLastTokens), nb::arg("beam"))
-        .def("get_last_tokens", nb::overload_cast<>(&GenLlmReq::getLastTokens))
-        .def("get_beam_width_by_iter", &GenLlmReq::getBeamWidthByIter, nb::arg("for_next_iteration") = false)
-        .def_prop_ro("max_num_generated_tokens", &GenLlmReq::getMaxNumGeneratedTokens)
-        .def("add_new_token", &GenLlmReq::addNewToken, nb::arg("token"), nb::arg("beam"))
-        .def("add_new_tokens", &GenLlmReq::addNewTokens, nb::arg("beam_tokens"))
-        .def_prop_ro("num_draft_tokens", &GenLlmReq::getNumDraftTokens)
-        .def("set_generated_tokens", &GenLlmReq::setGeneratedTokens, nb::arg("generated_beam_tokens"))
-        .def("pause", &GenLlmReq::pause, nb::arg("max_input_len"))
-        .def_prop_rw("max_sent_token_len", &GenLlmReq::getMaxSentTokenLen, &GenLlmReq::setMaxSentTokenLen)
-        .def_prop_ro("prompt_embedding_table", &GenLlmReq::getPromptEmbeddingTable)
-        .def_prop_ro("multimodal_embedding", &GenLlmReq::getMultimodalEmbedding)
-        .def_prop_ro("mrope_rotary_cos_sin", &GenLlmReq::getMropeRotaryCosSin)
-        .def_prop_ro("bad_words_list", &GenLlmReq::getBadWordsList)
-        .def_prop_rw("draft_logits", &GenLlmReq::getDraftLogits, &GenLlmReq::setDraftLogits)
-        .def_prop_ro("embedding_bias", &GenLlmReq::getEmbeddingBias)
-        .def_prop_rw("lora_config", &GenLlmReq::getLoraConfig, &GenLlmReq::setLoraConfig)
-        .def_prop_rw("lora_weights", &GenLlmReq::getLoraWeights, &GenLlmReq::setLoraWeights)
-        .def_prop_ro("stop_words_list", &GenLlmReq::getStopWordsList)
-        .def_prop_ro("context_logits", &GenLlmReq::getContextLogitsHost)
-        .def_prop_ro("generation_logits", &GenLlmReq::getGenerationLogitsHost)
-        .def_prop_ro("prompt_vocab_size", &GenLlmReq::getPromptVocabSize)
-        .def_prop_ro("mrope_position_deltas", &GenLlmReq::getMropePositionDeltas)
-        .def_prop_ro("lora_task_id", &GenLlmReq::getLoraTaskId)
-        .def_prop_ro("lookahead_config", &GenLlmReq::getLookaheadConfig)
-        .def_prop_rw("context_chunk_size", &GenLlmReq::getContextChunkSize, &GenLlmReq::setContextChunkSize)
-        .def_prop_rw("decoding_iter", &GenLlmReq::getDecodingIter, &GenLlmReq::setDecodingIter)
-        .def_rw("request_id", &GenLlmReq::mRequestId)
-        .def_rw("prompt_len", &GenLlmReq::mPromptLen)
-        .def_rw("max_new_tokens", &GenLlmReq::mMaxNewTokens)
-        .def_rw("sampling_config", &GenLlmReq::mSamplingConfig)
-        .def_prop_rw("state", &GenLlmReq::getState, &GenLlmReq::setState)
-        .def_prop_rw("streaming", &GenLlmReq::isStreaming, &GenLlmReq::setStreaming)
-        .def_rw("end_id", &GenLlmReq::mEndId)
-        .def_rw("pad_id", &GenLlmReq::mPadId)
-        .def_rw("seq_slot", &GenLlmReq::mSeqSlot)
-        .def_prop_ro("return_log_probs", &GenLlmReq::returnLogProbs)
-        .def_prop_ro("return_context_logits", &GenLlmReq::getReturnContextLogits)
-        .def_prop_ro("return_generation_logits", &GenLlmReq::getReturnGenerationLogits)
-        .def_prop_ro("log_probs", nb::overload_cast<>(&GenLlmReq::getLogProbs, nb::const_))
-        .def("get_log_probs", nb::overload_cast<GenLlmReq::SizeType32>(&GenLlmReq::getLogProbs, nb::const_))
-        .def("set_log_probs", &GenLlmReq::setLogProbs, nb::arg("log_probs"), nb::arg("beam"))
-        .def("set_return_encoder_output", &GenLlmReq::setReturnEncoderOutput, nb::arg("return_encoder_output"))
-        .def("get_return_encoder_output", &GenLlmReq::getReturnEncoderOutput)
-        .def("priority", nb::overload_cast<>(&GenLlmReq::priority, nb::const_))
-        .def("set_priority", nb::overload_cast<tle::PriorityType>(&GenLlmReq::setPriority))
-        .def_prop_ro("cum_log_probs", &GenLlmReq::getCumLogProbs)
-        .def("set_cum_log_prob", &GenLlmReq::setCumLogProb, nb::arg("cum_log_prob"), nb::arg("beam"))
-        .def("update_num_tokens_per_iteration", &GenLlmReq::updateNumTokensPerIteration,
-            nb::arg("num_tokens_per_iteration"), nb::arg("model_config"))
-        .def_prop_ro("orig_prompt_len", &GenLlmReq::getOrigPromptLen)
-        .def("has_draft_tokens", &GenLlmReq::hasDraftTokens)
-        .def("move_to_next_context_chunk", &GenLlmReq::moveToNextContextChunk)
-        .def_prop_ro("is_last_context_chunk", &GenLlmReq::isLastContextChunk)
-        .def_prop_ro("is_first_context_chunk", &GenLlmReq::isFirstContextChunk)
-        .def_prop_ro("context_remaining_length", &GenLlmReq::getContextRemainingLength)
-        .def_prop_ro("context_logits", &GenLlmReq::getContextLogitsHost)
-        .def_prop_ro("num_draft_tokens", &GenLlmReq::getNumDraftTokens)
-        .def("set_finished_reason", &GenLlmReq::setFinishedReason, nb::arg("finish_reason"), nb::arg("beam"))
-        .def_prop_ro("is_finished", &GenLlmReq::isFinished)
-        .def_prop_ro("is_finished_due_to_length", &GenLlmReq::isFinishedDueToLength)
-        .def_prop_rw(
-            "context_current_position", &GenLlmReq::getContextCurrentPosition, &GenLlmReq::setContextCurrentPosition)
-        .def_prop_ro("prepopulated_prompt_len", &GenLlmReq::getPrepopulatedPromptLen)
-        .def_prop_rw("guided_decoding_params", &GenLlmReq::getGuidedDecodingParams, &GenLlmReq::setGuidedDecodingParams)
-        .def_prop_ro("context_phase_params", &GenLlmReq::getContextPhaseParams)
-        .def_prop_ro("is_context_only_request", &GenLlmReq::isContextOnlyRequest)
-        .def_prop_ro("is_generation_only_request", &GenLlmReq::isGenerationOnlyRequest)
-        .def_prop_ro("is_generation_complete_state", &GenLlmReq::isGenerationCompleteState)
-        .def_prop_ro("is_context_finished", &GenLlmReq::isContextFinished)
-        .def_prop_ro("is_disagg_generation_init_state", &GenLlmReq::isDisaggGenerationInitState)
-        .def_prop_ro("is_disagg_generation_transmission_complete", &GenLlmReq::isDisaggGenerationTransmissionComplete)
-        .def_prop_ro(
-            "is_disagg_generation_transmission_in_progress", &GenLlmReq::isDisaggGenerationTransmissionInProgress)
-        .def_prop_ro("is_context_init_state", &GenLlmReq::isContextInitState)
-        .def_prop_ro("is_generation_in_progress_state", &GenLlmReq::isGenerationInProgressState)
-        .def_prop_ro("is_disagg_context_transmission_state", &GenLlmReq::isDisaggContextTransmissionState)
-        .def_prop_ro("is_disagg_context_complete_state", &GenLlmReq::isDisaggContextCompleteState)
-        .def_prop_ro("stage", &GenLlmReq::getRequestStage)
-        .def_prop_ro("kv_cache_transfer_time_ms", &GenLlmReq::getKvCacheTransferTimeMS)
-        .def_prop_ro("kv_cache_size", &GenLlmReq::getKvCacheSize)
-        .def_prop_ro("avg_decoded_tokens_per_iter", &GenLlmReq::getAvgDecodedTokensPerIter)
-        .def_prop_ro("alloc_total_blocks", &GenLlmReq::getAllocTotalBlocksPerRequest)
-        .def_prop_ro("alloc_new_blocks", &GenLlmReq::getAllocNewBlocksPerRequest)
-        .def("alloc_context_logits", &GenLlmReq::allocContextLogitsHost, nb::arg("vocab_size"), nb::arg("logit_dtype"))
-        .def_prop_ro("reused_blocks", &GenLlmReq::getReusedBlocksPerRequest)
-        .def_prop_ro("missed_blocks", &GenLlmReq::getMissedBlocksPerRequest)
-        .def_prop_ro("kv_cache_hit_rate", &GenLlmReq::getKVCacheHitRatePerRequest)
-        .def_prop_ro("llm_request_type", &GenLlmReq::getLlmRequestType)
-        .def_prop_ro("multimodal_hashes",
-            [](GenLlmReq& self)
-            {
-                std::optional<std::vector<std::vector<GenLlmReq::SizeType32>>> hashes = std::nullopt;
-                if (self.getMultimodalHashes())
-                {
-                    hashes = *self.getMultimodalHashes().value();
-                }
-                return hashes;
-            })
-        .def_prop_ro("multimodal_positions",
-            [](GenLlmReq& self)
-            {
-                std::optional<std::vector<GenLlmReq::SizeType32>> positions = std::nullopt;
-                if (self.getMultimodalPositions())
-                {
-                    positions = *self.getMultimodalPositions().value();
-                }
-                return positions;
-            })
-        .def_prop_ro("multimodal_lengths",
-            [](GenLlmReq& self)
-            {
-                std::optional<std::vector<GenLlmReq::SizeType32>> lengths = std::nullopt;
-                if (self.getMultimodalLengths())
-                {
-                    lengths = *self.getMultimodalLengths().value();
-                }
-                return lengths;
-            })
-        .def_prop_ro("position_ids",
-            [](GenLlmReq& self)
-            {
-                std::optional<std::vector<GenLlmReq::SizeType32>> positionIds = std::nullopt;
-                if (self.getPositionIds())
-                {
-                    positionIds = *self.getPositionIds().value();
-                }
-                return positionIds;
-            })
-        .def_prop_rw(
-            "draft_tokens",
-            [](GenLlmReq& self)
-            {
-                std::optional<GenLlmReq::VecTokens> draftTokens = std::nullopt;
-                if (self.hasDraftTokens())
-                {
-                    draftTokens = *self.getDraftTokens();
-                }
-                return draftTokens;
-            },
-            [](GenLlmReq& self, std::optional<GenLlmReq::VecTokens> const& draftTokens)
-            {
-                if (draftTokens)
-                {
-                    self.setDraftTokens(std::make_shared<GenLlmReq::VecTokens>(draftTokens.value()));
-                }
-            })
-        .def_prop_rw("is_dummy_request", &GenLlmReq::isDummyRequest, &GenLlmReq::setIsDummyRequest)
-        .def_prop_ro("return_perf_metrics", &GenLlmReq::getReturnPerfMetrics);
-
-    nb::class_<tb::LlmRequest, GenLlmReq>(m, "LlmRequest", nb::dynamic_attr())
-        .def(
-            "__init__",
-            [](tb::LlmRequest* self, tb::LlmRequest::RequestIdType request_id,
-                tb::LlmRequest::SizeType32 max_new_tokens, std::vector<tb::LlmRequest::TokenIdType> input_tokens,
-                runtime::SamplingConfig sampling_config, bool is_streaming,
-                std::optional<tb::LlmRequest::SizeType32> end_id, std::optional<tb::LlmRequest::SizeType32> pad_id,
-                std::optional<at::Tensor> embedding_bias, std::optional<at::Tensor> bad_words_list,
-                std::optional<at::Tensor> stop_words_list,
-                std::optional<std::vector<tb::LlmRequest::SizeType32>> position_ids,
-                std::optional<at::Tensor> prompt_embedding_table,
-                std::optional<tb::LlmRequest::SizeType32> prompt_vocab_size,
-                std::optional<std::vector<std::vector<tb::LlmRequest::SizeType32>>> multimodal_hashes,
-                std::optional<std::vector<tb::LlmRequest::SizeType32>> multimodal_positions,
-                std::optional<std::vector<tb::LlmRequest::SizeType32>> multimodal_lengths,
-                std::optional<at::Tensor> multimodal_embedding, std::optional<at::Tensor> mrope_rotary_cos_sin,
-                std::optional<tb::LlmRequest::SizeType32> mrope_position_deltas,
-                std::optional<LoraTaskIdType> lora_task_id, std::optional<at::Tensor> lora_weights,
-                std::optional<at::Tensor> lora_config,
-                std::optional<executor::LookaheadDecodingConfig> lookahead_config,
-                std::optional<executor::KvCacheRetentionConfig> kv_cache_retention_config, bool return_log_probs,
-                bool return_context_logits, bool return_generation_logits,
-                std::optional<tb::LlmRequest::VecTokens> draft_tokens, std::optional<at::Tensor> draft_logits,
-                bool exclude_input_from_output,
-                std::optional<tb::LlmRequest::LogitsPostProcessor> logits_post_processor,
-                bool apply_logits_post_processor_batched, std::optional<tb::LlmRequest::VecTokens> encoder_input_tokens,
-                bool return_encoder_output, std::optional<tb::LlmRequest::RequestIdType> client_id,
-                executor::PriorityType priority, std::optional<at::Tensor> encoder_input_features,
-                std::optional<tb::LlmRequest::SizeType32> encoder_output_length,
-                std::optional<at::Tensor> cross_attention_mask, tb::LlmRequestType llm_request_type,
-                std::optional<tb::LlmRequest::VecTokenExtraIds> input_token_extra_ids,
-                tb::LlmRequest::SizeType32 num_return_sequences, std::optional<executor::EagleConfig> eagle_config,
-                std::optional<at::Tensor> skip_cross_attn_blocks, bool return_perf_metrics,
-                std::optional<executor::GuidedDecodingParams> guided_decoding_params,
-                std::optional<tb::LlmRequest::SizeType32> language_adapter_uid,
-                std::optional<tb::LlmRequest::MillisecondsType> allotted_time_ms,
-                std::optional<executor::ContextPhaseParams> context_phase_params)
-            {
-                auto makeOptionalTensor = [](std::optional<at::Tensor> const& atTensor, bool unsqueeze = false)
-                {
-                    std::optional<tb::LlmRequest::TensorPtr> tensorPtr = std::nullopt;
-                    if (atTensor)
-                    {
-                        tensorPtr = tr::TorchView::of(atTensor.value());
-                        if (unsqueeze)
-                        {
-                            (*tensorPtr)->unsqueeze(0);
-                        }
-                    }
-                    return tensorPtr;
-                };
-
-                auto embedding_bias_tensor_ptr = makeOptionalTensor(embedding_bias, true);
-                auto bad_words_list_tensor_ptr = makeOptionalTensor(bad_words_list, true);
-                auto stop_words_list_tensor_ptr = makeOptionalTensor(stop_words_list, true);
-                auto prompt_embedding_table_tensor_ptr = makeOptionalTensor(prompt_embedding_table);
-                auto multimodal_embedding_tensor_ptr = makeOptionalTensor(multimodal_embedding);
-                auto lora_weights_tensor_ptr = makeOptionalTensor(lora_weights);
-                auto mrope_rotary_cos_sin_tensor_ptr = makeOptionalTensor(mrope_rotary_cos_sin);
-                auto lora_config_tensor_ptr = makeOptionalTensor(lora_config);
-                auto draft_logits_tensor_ptr = makeOptionalTensor(draft_logits);
-                auto encoder_input_features_tensor_ptr = makeOptionalTensor(encoder_input_features);
-                auto cross_attention_mask_tensor_ptr = makeOptionalTensor(cross_attention_mask);
-                auto skip_cross_attn_blocks_tensor_ptr = makeOptionalTensor(skip_cross_attn_blocks);
-
-                // 49 parameters
-                new (self) tb::LlmRequest{request_id, max_new_tokens, input_tokens, sampling_config, is_streaming,
-                    end_id, pad_id, embedding_bias_tensor_ptr, bad_words_list_tensor_ptr, stop_words_list_tensor_ptr,
-                    position_ids, prompt_embedding_table_tensor_ptr, prompt_vocab_size, multimodal_hashes,
-                    multimodal_positions, multimodal_lengths, multimodal_embedding_tensor_ptr,
-                    mrope_rotary_cos_sin_tensor_ptr, mrope_position_deltas, lora_task_id, lora_weights_tensor_ptr,
-                    lora_config_tensor_ptr, lookahead_config, kv_cache_retention_config, return_log_probs,
-                    return_context_logits, return_generation_logits, draft_tokens, draft_logits_tensor_ptr,
-                    exclude_input_from_output, logits_post_processor, apply_logits_post_processor_batched,
-                    encoder_input_tokens, return_encoder_output, client_id, priority, encoder_input_features_tensor_ptr,
-                    encoder_output_length, cross_attention_mask_tensor_ptr, llm_request_type, input_token_extra_ids,
-                    num_return_sequences, eagle_config, skip_cross_attn_blocks_tensor_ptr, return_perf_metrics,
-                    guided_decoding_params, language_adapter_uid, allotted_time_ms, context_phase_params};
-            },
-            nb::arg("request_id"), nb::arg("max_new_tokens"), nb::arg("input_tokens"), nb::arg("sampling_config"),
-            nb::arg("is_streaming"), nb::arg("end_id") = std::nullopt, nb::arg("pad_id") = std::nullopt,
-            nb::arg("embedding_bias") = std::nullopt, nb::arg("bad_words_list") = std::nullopt,
-            nb::arg("stop_words_list") = std::nullopt, nb::arg("position_ids") = std::nullopt,
-            nb::arg("prompt_embedding_table") = std::nullopt, nb::arg("prompt_vocab_size") = std::nullopt,
-            nb::arg("multimodal_hashes") = std::nullopt, nb::arg("multimodal_positions") = std::nullopt,
-            nb::arg("multimodal_lengths") = std::nullopt, nb::arg("multimodal_embedding") = std::nullopt,
-            nb::arg("mrope_rotary_cos_sin") = std::nullopt, nb::arg("mrope_position_deltas") = std::nullopt,
-            nb::arg("lora_task_id") = std::nullopt, nb::arg("lora_weights") = std::nullopt,
-            nb::arg("lora_config") = std::nullopt, nb::arg("lookahead_config") = std::nullopt,
-            nb::arg("kv_cache_retention_config") = std::nullopt, nb::arg("return_log_probs") = false,
-            nb::arg("return_context_logits") = false, nb::arg("return_generation_logits") = false,
-            nb::arg("draft_tokens") = std::nullopt, nb::arg("draft_logits") = std::nullopt,
-            nb::arg("exclude_input_from_output") = false, nb::arg("logits_post_processor") = std::nullopt,
-            nb::arg("apply_logits_post_processor_batched") = false, nb::arg("encoder_input_tokens") = std::nullopt,
-            nb::arg("return_encoder_output") = false, nb::arg("client_id") = std::nullopt,
-            nb::arg("priority") = executor::Request::kDefaultPriority, nb::arg("encoder_input_features") = std::nullopt,
-            nb::arg("encoder_output_len") = std::nullopt, nb::arg("cross_attention_mask") = std::nullopt,
-            nb::arg("llm_request_type") = tb::LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION,
-            nb::arg("input_token_extra_ids") = std::nullopt, nb::arg("num_return_sequences") = 1,
-            nb::arg("eagle_config") = std::nullopt, nb::arg("skip_cross_attn_blocks") = std::nullopt,
-            nb::arg("return_perf_metrics") = false, nb::arg("guided_decoding_params") = std::nullopt,
-            nb::arg("language_adapter_uid") = std::nullopt, nb::arg("allotted_time_ms") = std::nullopt,
-            nb::arg("context_phase_params") = std::nullopt)
-        .def("validate", &tb::LlmRequest::validate, nb::arg("max_input_len"), nb::arg("max_seq_len"),
-            nb::arg("max_draft_len"), nb::arg("vocab_size_padded"), nb::arg("max_endocer_input_len") = std::nullopt,
-            nb::arg("enable_kv_cache_reuse") = false)
-        .def("create_response", &tb::LlmRequest::createResponse, nb::arg("use_fast_logits") = false,
-            nb::arg("mpi_world_rank") = 0)
-        .def("create_result", &tb::LlmRequest::createResult, nb::arg("use_fast_logits") = false,
-            nb::arg("mpi_world_rank") = 0)
-        .def("create_serialized_result",
-            [](tb::LlmRequest& self, bool use_fast_logits = false, int mpi_world_rank = 0)
-            {
-                std::vector<char> serialized_result;
-                bool is_final = false;
-                self.createSerializedResult(serialized_result, is_final, use_fast_logits, mpi_world_rank);
-                return std::make_tuple(nb::bytes(serialized_result.data(), serialized_result.size()), is_final);
-            })
-        .def("move_prompt_embedding_table_to_gpu", &tb::LlmRequest::movePromptEmbeddingTableToGpu, nb::arg("manager"))
-        .def("move_lora_weights_to_gpu", &tb::LlmRequest::moveLoraWeightsToGpu, nb::arg("manager"))
-        .def("finish_by_reason", &tb::LlmRequest::finishByReason, nb::arg("finish_reason"))
-        .def("set_first_scheduled_time", &tb::LlmRequest::setFirstScheduledTime)
-        .def("update_perf_metrics", &tb::LlmRequest::updatePerfMetrics, nb::arg("iter_counter"));
-
-    nb::class_<tb::SequenceSlotManager>(m, "SequenceSlotManager")
-        .def(nb::init<tb::SequenceSlotManager::SlotIdType, uint64_t>(), nb::arg("max_num_slots"),
-            nb::arg("max_sequence_idle_microseconds"))
-        .def("get_sequence_slot", &tb::SequenceSlotManager::getSequenceSlot, nb::arg("start_flag"),
-            nb::arg("sequence_id"))
-        .def("free_sequence_slot", &tb::SequenceSlotManager::freeSequenceSlot, nb::arg("sequence_id"))
-        .def("free_idle_sequence_slots", &tb::SequenceSlotManager::freeIdleSequenceSlots);
-
-    nb::class_<tb::rnn_state_manager::RnnStateManager>(m, "RnnStateManager")
-        .def(nb::init<tr::SizeType32, tr::ModelConfig, tr::WorldConfig, tr::BufferManager>(),
-            nb::arg("max_num_sequences"), nb::arg("model_config"), nb::arg("world_config"), nb::arg("buffer_manager"));
-
-    nb::class_<tb::DecoderInputBuffers>(m, "DecoderInputBuffers")
-        .def(nb::init<runtime::SizeType32, runtime::SizeType32, runtime::SizeType32, tr::BufferManager>(),
-            nb::arg("max_num_sequences"), nb::arg("max_batch_size"), nb::arg("max_tokens_per_engine_step"),
-            nb::arg("manager"))
-        .def_rw("setup_batch_slots", &tb::DecoderInputBuffers::setupBatchSlots)
-        .def_rw("setup_batch_slots_device", &tb::DecoderInputBuffers::setupBatchSlotsDevice)
-        .def_rw("fill_values", &tb::DecoderInputBuffers::fillValues)
-        .def_rw("fill_values_device", &tb::DecoderInputBuffers::fillValuesDevice)
-        .def_rw("inputs_ids", &tb::DecoderInputBuffers::inputsIds)
-        .def_rw("forward_batch_slots", &tb::DecoderInputBuffers::forwardBatchSlots)
-        .def_rw("logits", &tb::DecoderInputBuffers::logits);
-
-    nb::class_<tb::DecoderOutputBuffers>(m, "DecoderOutputBuffers")
-        .def_rw("sequence_lengths_host", &tb::DecoderOutputBuffers::sequenceLengthsHost)
-        .def_rw("finished_sum_host", &tb::DecoderOutputBuffers::finishedSumHost)
-        .def_prop_ro("new_output_tokens_host",
-            [](tb::DecoderOutputBuffers& self) { return tr::Torch::tensor(self.newOutputTokensHost); })
-        .def_rw("cum_log_probs_host", &tb::DecoderOutputBuffers::cumLogProbsHost)
-        .def_rw("log_probs_host", &tb::DecoderOutputBuffers::logProbsHost)
-        .def_rw("finish_reasons_host", &tb::DecoderOutputBuffers::finishReasonsHost);
-
-    nb::class_<tb::SlotDecoderBuffers>(m, "SlotDecoderBuffers")
-        .def(nb::init<runtime::SizeType32, runtime::SizeType32, runtime::BufferManager const&>(),
-            nb::arg("max_beam_width"), nb::arg("max_seq_len"), nb::arg("buffer_manager"))
-        .def_rw("output_ids", &tb::SlotDecoderBuffers::outputIds)
-        .def_rw("output_ids_host", &tb::SlotDecoderBuffers::outputIdsHost)
-        .def_rw("sequence_lengths_host", &tb::SlotDecoderBuffers::sequenceLengthsHost)
-        .def_rw("cum_log_probs", &tb::SlotDecoderBuffers::cumLogProbs)
-        .def_rw("cum_log_probs_host", &tb::SlotDecoderBuffers::cumLogProbsHost)
-        .def_rw("log_probs", &tb::SlotDecoderBuffers::logProbs)
-        .def_rw("log_probs_host", &tb::SlotDecoderBuffers::logProbsHost)
-        .def_rw("finish_reasons_host", &tb::SlotDecoderBuffers::finishReasonsHost);
-
-    nb::class_<tb::MedusaBuffers>(m, "MedusaBuffers")
-        .def(nb::init<runtime::SizeType32, runtime::SizeType32, runtime::BufferManager const&,
-                 runtime::ModelConfig const&, runtime::WorldConfig const&, executor::DecodingConfig const&,
-                 runtime::TllmRuntime const&>(),
-            nb::arg("max_beam_width"), nb::arg("max_seq_len"), nb::arg("buffer_manager"), nb::arg("model_config"),
-            nb::arg("world_config"), nb::arg("decoding_config"), nb::arg("runtime"));
-
-    m.def(
-        "add_new_tokens_to_requests",
-        [](std::vector<std::shared_ptr<tb::LlmRequest>>& requests,
-            std::vector<tb::LlmRequest::TokenIdType> const& tokens, int beam_idx)
-        {
-            TLLM_CHECK_WITH_INFO(requests.size() == tokens.size(), "Expected the same number of requests and tokens.");
-
-            for (int i = 0; i < requests.size(); ++i)
-            {
-                requests[i]->addNewToken(tokens[i], beam_idx);
-            }
-        },
-        nb::arg("requests"), nb::arg("tokens"), nb::arg("beam_idx"),
-        "Add new tokens to multiple LLM requests. The tokens vector should contain tokens for beam beam_idx of all "
-        "requests in order.");
-
-    m.def(
-        "make_decoding_batch_input",
-        [](std::vector<std::shared_ptr<tb::LlmRequest>>& contextRequests,
-            std::vector<std::shared_ptr<tb::LlmRequest>>& genRequests, tr::ITensor::SharedPtr logits, int beamWidth,
-            std::vector<int> const& numContextLogitsPrefixSum, tb::DecoderInputBuffers const& decoderInputBuffers,
-            runtime::decoder::DecoderState& decoderState, tr::BufferManager const& manager)
-        {
-            std::vector<int> activeSlots;
-            std::vector<int> generationSteps;
-            std::vector<std::vector<tr::ITensor::SharedConstPtr>> logitsVec = {{}};
-
-            for (int i = 0; i < contextRequests.size(); ++i)
-            {
-                if (contextRequests[i]->isLastContextChunk())
-                {
-                    activeSlots.push_back(*contextRequests[i]->mSeqSlot);
-                    generationSteps.push_back(contextRequests[i]->getDecodingIter());
-                    auto contextLogitsOffset = numContextLogitsPrefixSum[i + 1] - 1;
-                    tr::ITensor::SharedPtr logitsView = ITensor::slice(logits, contextLogitsOffset, 1);
-
-                    if (beamWidth > 1)
-                    {
-                        // Tile logits of context requests
-                        auto const logitsShape = logitsView->getShape();
-                        auto const logitsType = logitsView->getDataType();
-                        auto decoderLogits = manager.gpu(ITensor::makeShape({beamWidth, logitsShape.d[1]}), logitsType);
-                        tensorrt_llm::runtime::kernels::tileTensor(
-                            *decoderLogits, *logitsView, beamWidth, manager.getStream());
-                        decoderLogits->unsqueeze(0);
-                        logitsVec[0].push_back(std::move(decoderLogits));
-                    }
-                    else
-                    {
-                        logitsView->unsqueeze(1);
-                        logitsVec[0].push_back(std::move(logitsView));
-                    }
-                }
-            }
-
-            auto genLogitsOffset = numContextLogitsPrefixSum.back();
-            for (int i = 0; i < genRequests.size(); ++i)
-            {
-                if (genRequests[i]->isGenerationInProgressState())
-                {
-                    activeSlots.push_back(*genRequests[i]->mSeqSlot);
-                    generationSteps.push_back(genRequests[i]->getDecodingIter());
-
-                    auto logitsOffset = genLogitsOffset + i * beamWidth;
-                    auto numberOfLogits = beamWidth;
-                    tr::ITensor::SharedPtr logitsView = ITensor::slice(logits, logitsOffset, numberOfLogits);
-                    logitsView->unsqueeze(0);
-                    logitsVec[0].push_back(std::move(logitsView));
-                }
-            }
-
-            auto& batchSlots = decoderInputBuffers.forwardBatchSlots;
-            batchSlots[0]->resize(activeSlots.size());
-            auto batchSlotsRange = tr::BufferRange<SizeType32>(*batchSlots[0]);
-            for (int i = 0; i < activeSlots.size(); ++i)
-            {
-                batchSlotsRange[i] = activeSlots[i];
-            }
-
-            auto decodingInput = std::make_unique<tr::decoder_batch::Input>(logitsVec, 1);
-            decodingInput->batchSlots = batchSlots;
-
-            auto const maxBeamWidth = decoderState.getMaxBeamWidth();
-            if (maxBeamWidth > 1)
-            {
-                // For Variable-Beam-Width-Search
-                decoderState.getJointDecodingInput().generationSteps = generationSteps;
-            }
-
-            return decodingInput;
-        },
-        nb::arg("context_requests"), nb::arg("generation_requests"), nb::arg("logits"), nb::arg("beam_width"),
-        nb::arg("num_context_logits_prefix_sum"), nb::arg("decoder_input_buffers"), nb::arg("decoder_state"),
-        nb::arg("buffer_manager"), "Make decoding batch input.");
-}
-
-} // namespace tensorrt_llm::nanobind::batch_manager
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/bindings.h b/cpp/tensorrt_llm/nanobind/batch_manager/bindings.h
deleted file mode 100644
index 3d5a0f5d5b2..00000000000
--- a/cpp/tensorrt_llm/nanobind/batch_manager/bindings.h
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <nanobind/nanobind.h>
-namespace nb = nanobind;
-
-namespace tensorrt_llm::nanobind::batch_manager
-{
-
-void initBindings(nb::module_& m);
-
-}
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/buffers.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/buffers.cpp
deleted file mode 100644
index b6edcca1c24..00000000000
--- a/cpp/tensorrt_llm/nanobind/batch_manager/buffers.cpp
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "buffers.h"
-#include "tensorrt_llm/nanobind/common/customCasters.h"
-
-#include "tensorrt_llm/batch_manager/kvCacheManager.h"
-#include "tensorrt_llm/batch_manager/runtimeBuffers.h"
-#include "tensorrt_llm/batch_manager/transformerBuffers.h"
-
-#include <ATen/ATen.h>
-#include <nanobind/nanobind.h>
-#include <nanobind/stl/optional.h>
-#include <nanobind/stl/shared_ptr.h>
-#include <nanobind/stl/vector.h>
-#include <torch/extension.h>
-
-namespace nb = nanobind;
-namespace tb = tensorrt_llm::batch_manager;
-namespace tr = tensorrt_llm::runtime;
-
-using tr::SizeType32;
-
-namespace tensorrt_llm::nanobind::batch_manager
-{
-
-void Buffers::initBindings(nb::module_& m)
-{
-    nb::class_<tb::TransformerBuffers>(m, "TransformerBuffers")
-        .def(nb::init<SizeType32, SizeType32, std::vector<SizeType32> const&, SizeType32, SizeType32,
-                 runtime::TllmRuntime const&, runtime::ModelConfig const&, runtime::WorldConfig const&>(),
-            nb::arg("max_batch_size"), nb::arg("max_beam_width"), nb::arg("max_attention_window_vec"),
-            nb::arg("max_attention_window"), nb::arg("sink_token_len"), nb::arg("runtime"), nb::arg("model_config"),
-            nb::arg("world_config"))
-        .def("reshape", &tb::TransformerBuffers::reshape, nb::arg("num_sequences"), nb::arg("num_input_tokens"))
-        .def("reshape_kv_tensors", &tb::TransformerBuffers::reshapeKvTensors, nb::arg("max_batch_size"),
-            nb::arg("max_beam_width"), nb::arg("max_blocks_per_seq"), nb::arg("kv_cache_type"), nb::arg("num_pools"),
-            nb::arg("buffer_manager"))
-        .def("get_buffers", &tb::TransformerBuffers::getBuffers, nb::arg("input_buffers"), nb::arg("output_buffers"),
-            nb::arg("model_config"))
-        .def("copy_position_ids", &tb::TransformerBuffers::copyPositionIds, nb::arg("runtime"),
-            nb::arg("position_ids_host"), nb::arg("is_chat_glm"), nb::arg("decoder_position_ids"))
-        .def("copy_kv_block_offsets", &tb::TransformerBuffers::copyKvBlockOffsets, nb::arg("context_requests"),
-            nb::arg("gen_requests"), nb::arg("kv_cache_manager"), nb::arg("cross_kv_cache_manager"),
-            nb::arg("buffer_manager"))
-        .def("copy_cache_indirection", &tb::TransformerBuffers::copyCacheIndirection, nb::arg("gen_requests"),
-            nb::arg("decoder_cache_indirection_output"), nb::arg("runtime"))
-        .def_rw("past_key_value_lengths", &tb::TransformerBuffers::pastKeyValueLengths)
-        .def_rw("position_ids", &tb::TransformerBuffers::positionIds)
-        .def_rw("max_attention_windows", &tb::TransformerBuffers::maxAttentionWindows)
-        .def_rw("sink_token_lengths", &tb::TransformerBuffers::sinkTokenLengths)
-        .def_rw("cache_indirection", &tb::TransformerBuffers::cacheIndirection)
-        .def_rw("kv_cache_block_offsets_host", &tb::TransformerBuffers::kvCacheBlockOffsetsHost)
-        .def_rw("kv_cache_block_offsets_device", &tb::TransformerBuffers::kvCacheBlockOffsetsDevice)
-        .def_rw("cross_kv_cache_block_pool_pointers", &tb::TransformerBuffers::crossKvCacheBlockPoolPointers)
-        .def_rw("cross_kv_cache_block_offsets_host", &tb::TransformerBuffers::crossKvCacheBlockOffsetsHost)
-        .def_rw("cross_kv_cache_block_offsets_device", &tb::TransformerBuffers::crossKvCacheBlockOffsetsDevice)
-        .def_rw("cache_indir_batched_copy_src_offsets", &tb::TransformerBuffers::cacheIndirBatchedCopySrcOffsets)
-        .def_rw("cache_indir_batched_copy_dst_offsets", &tb::TransformerBuffers::cacheIndirBatchedCopyDstOffsets)
-        .def_rw("cache_indir_batched_copy_sizes", &tb::TransformerBuffers::cacheIndirBatchedCopySizes)
-        .def_rw("fill_values_alt", &tb::TransformerBuffers::fillValuesAlt)
-        .def_rw("fill_values_alt_device", &tb::TransformerBuffers::fillValuesAltDevice)
-        .def_rw("seq_slots_alt", &tb::TransformerBuffers::seqSlotsAlt)
-        .def_rw("seq_slots_alt_device", &tb::TransformerBuffers::seqSlotsAltDevice);
-
-    nb::class_<tb::RuntimeBuffers>(m, "RuntimeBuffers")
-        .def(nb::init<SizeType32, SizeType32, std::vector<SizeType32> const&, SizeType32, SizeType32,
-                 runtime::TllmRuntime const&, runtime::ModelConfig const&, runtime::WorldConfig const&,
-                 executor::DecodingConfig const&, bool, std::optional<SizeType32>>(),
-            nb::arg("max_batch_size"), nb::arg("max_beam_width"), nb::arg("max_attention_window_vec"),
-            nb::arg("max_attention_window"), nb::arg("sink_token_len"), nb::arg("runtime"), nb::arg("model_config"),
-            nb::arg("world_config"), nb::arg("decoding_config"), nb::arg("gather_generation_logits"),
-            nb::arg("max_num_tokens") = std::nullopt)
-        .def_prop_rw(
-            "transformer_buffers", [](tb::RuntimeBuffers& self) { return self.transformerBuffers; },
-            [](tb::RuntimeBuffers& self, std::shared_ptr<tb::TransformerBuffers> val)
-            { self.transformerBuffers = val; })
-        .def_rw("num_context_logits", &tb::RuntimeBuffers::numContextLogits)
-        .def_rw("cache_indir_decoder_io_batched_copy_src_offsets",
-            &tb::RuntimeBuffers::cacheIndirDecoderIOBatchedCopySrcOffsets)
-        .def_rw("cache_indir_decoder_io_batched_copy_dst_offsets",
-            &tb::RuntimeBuffers::cacheIndirDecoderIOBatchedCopyDstOffsets)
-        .def_rw("cache_indir_decoder_io_batched_copy_sizes", &tb::RuntimeBuffers::cacheIndirDecoderIOBatchedCopySizes)
-        .def_rw("logits", &tb::RuntimeBuffers::logits)
-        .def_rw("seq_slots", &tb::RuntimeBuffers::seqSlots)
-        .def_rw("seq_slots_device", &tb::RuntimeBuffers::seqSlotsDevice)
-        .def_rw("cache_indir_decoder_io_batched_copy_src_offsets_slice_device",
-            &tb::RuntimeBuffers::mCacheIndirDecoderIOBatchedCopySrcOffsetsSliceDevice)
-        .def_rw("cache_indir_decoder_io_batched_copy_dst_offsets_slice_device",
-            &tb::RuntimeBuffers::mCacheIndirDecoderIOBatchedCopyDstOffsetsSliceDevice)
-        .def_rw("cache_indir_decoder_io_batched_copy_copy_sizes_device",
-            &tb::RuntimeBuffers::mCacheIndirDecoderIOBatchedCopyCopySizesDevice);
-}
-} // namespace tensorrt_llm::nanobind::batch_manager
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/buffers.h b/cpp/tensorrt_llm/nanobind/batch_manager/buffers.h
deleted file mode 100644
index 34df07e4073..00000000000
--- a/cpp/tensorrt_llm/nanobind/batch_manager/buffers.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-#include <nanobind/nanobind.h>
-namespace nb = nanobind;
-
-namespace tensorrt_llm::nanobind::batch_manager
-{
-class Buffers
-{
-public:
-    static void initBindings(nb::module_& m);
-};
-} // namespace tensorrt_llm::nanobind::batch_manager
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/cacheTransceiver.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/cacheTransceiver.cpp
deleted file mode 100644
index abac6d17ed8..00000000000
--- a/cpp/tensorrt_llm/nanobind/batch_manager/cacheTransceiver.cpp
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "cacheTransceiver.h"
-#include "tensorrt_llm/batch_manager/cacheTransceiver.h"
-#include "tensorrt_llm/batch_manager/kvCacheManager.h"
-#include "tensorrt_llm/executor/executor.h"
-#include "tensorrt_llm/nanobind/common/customCasters.h"
-#include <ATen/ATen.h>
-#include <nanobind/nanobind.h>
-#include <nanobind/stl/optional.h>
-#include <nanobind/stl/shared_ptr.h>
-#include <nanobind/stl/unique_ptr.h>
-#include <nanobind/trampoline.h>
-#include <torch/extension.h>
-
-using SizeType32 = tensorrt_llm::runtime::SizeType32;
-
-namespace tb = tensorrt_llm::batch_manager;
-namespace nb = nanobind;
-
-namespace
-{
-
-class PyCacheTransceiver : public tb::BaseCacheTransceiver
-{
-public:
-    // using BaseCacheTransceiver::BaseCacheTransceiver; // Inherit constructors
-    NB_TRAMPOLINE(tb::BaseCacheTransceiver, 6);
-
-    void respondAndSendAsync(tb::LlmRequest* llmRequest) override
-    {
-        NB_OVERRIDE_PURE(respondAndSendAsync, llmRequest);
-    }
-
-    void requestAndReceiveSync(tb::LlmRequest* llmRequest) override
-    {
-        NB_OVERRIDE_PURE(requestAndReceiveSync, llmRequest);
-    }
-
-    void requestAndReceiveAsync(tb::LlmRequest* llmRequest) override
-    {
-        NB_OVERRIDE_PURE(requestAndReceiveAsync, llmRequest);
-    }
-
-    void checkContextTransferStatus(std::optional<int> const& atLeastRequestNum = std::nullopt) override
-    {
-        NB_OVERRIDE_PURE(checkContextTransferStatus, atLeastRequestNum);
-    }
-
-    void checkGenTransferStatus(std::optional<int> const& atLeastRequestNum = std::nullopt) override
-    {
-        NB_OVERRIDE_PURE(checkGenTransferStatus, atLeastRequestNum);
-    }
-
-    bool checkGenTransferComplete() const override
-    {
-        NB_OVERRIDE_PURE(checkGenTransferComplete);
-    }
-};
-} // namespace
-
-void tb::CacheTransceiverBindings::initBindings(nb::module_& m)
-{
-    nb::class_<tb::BaseCacheTransceiver, PyCacheTransceiver>(m, "BaseCacheTransceiver")
-        .def("respond_and_send_async", &BaseCacheTransceiver::respondAndSendAsync)
-        .def("request_and_receive_sync", &BaseCacheTransceiver::requestAndReceiveSync)
-        .def("request_and_receive_async", &BaseCacheTransceiver::requestAndReceiveAsync)
-        .def("check_context_transfer_status", &BaseCacheTransceiver::checkContextTransferStatus)
-        .def("check_gen_transfer_status", &BaseCacheTransceiver::checkGenTransferStatus)
-        .def("check_gen_transfer_complete", &BaseCacheTransceiver::checkGenTransferComplete);
-
-    nb::enum_<tb::CacheTransceiver::CommType>(m, "CommType")
-        .value("UNKNOWN", tb::CacheTransceiver::CommType::UNKNOWN)
-        .value("MPI", tb::CacheTransceiver::CommType::MPI)
-        .value("UCX", tb::CacheTransceiver::CommType::UCX)
-        .value("NIXL", tb::CacheTransceiver::CommType::NIXL);
-
-    nb::enum_<executor::kv_cache::CacheState::AttentionType>(m, "AttentionType")
-        .value("DEFAULT", executor::kv_cache::CacheState::AttentionType::kDEFAULT)
-        .value("MLA", executor::kv_cache::CacheState::AttentionType::kMLA);
-
-    nb::class_<tb::CacheTransceiver, tb::BaseCacheTransceiver>(m, "CacheTransceiver")
-        .def(nb::init<tb::kv_cache_manager::BaseKVCacheManager*, tb::CacheTransceiver::CommType,
-                 std::vector<SizeType32>, SizeType32, SizeType32, runtime::WorldConfig, nvinfer1::DataType,
-                 executor::kv_cache::CacheState::AttentionType, std::optional<executor::CacheTransceiverConfig>>(),
-            nb::arg("cache_manager"), nb::arg("comm_type"), nb::arg("num_kv_heads_per_layer"), nb::arg("size_per_head"),
-            nb::arg("tokens_per_block"), nb::arg("world_config"), nb::arg("dtype"), nb::arg("attention_type"),
-            nb::arg("cache_transceiver_config") = std::nullopt);
-
-    nb::class_<tb::kv_cache_manager::CacheTransBufferManager>(m, "CacheTransBufferManager")
-        .def(nb::init<tb::kv_cache_manager::BaseKVCacheManager*, std::optional<size_t>>(), nb::arg("cache_manager"),
-            nb::arg("max_num_tokens") = std::nullopt)
-        .def_static("pre_alloc_buffer_size", &tb::kv_cache_manager::CacheTransBufferManager::preAllocBufferSize,
-            nb::arg("max_num_tokens") = std::nullopt);
-}
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/cacheTransceiver.h b/cpp/tensorrt_llm/nanobind/batch_manager/cacheTransceiver.h
deleted file mode 100644
index 90fc63d4fde..00000000000
--- a/cpp/tensorrt_llm/nanobind/batch_manager/cacheTransceiver.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-#include <nanobind/nanobind.h>
-namespace nb = nanobind;
-
-namespace tensorrt_llm::batch_manager
-{
-class CacheTransceiverBindings
-{
-public:
-    static void initBindings(nb::module_& m);
-};
-} // namespace tensorrt_llm::batch_manager
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp
deleted file mode 100644
index f1c398d31f0..00000000000
--- a/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp
+++ /dev/null
@@ -1,478 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "kvCacheManager.h"
-#include "tensorrt_llm/batch_manager/kvCacheManager.h"
-#include "tensorrt_llm/batch_manager/peftCacheManager.h"
-#include "tensorrt_llm/nanobind/common/bindTypes.h"
-#include "tensorrt_llm/nanobind/common/customCasters.h"
-#include "tensorrt_llm/runtime/torch.h"
-#include "tensorrt_llm/runtime/torchView.h"
-
-#include <ATen/ATen.h>
-#include <nanobind/nanobind.h>
-#include <nanobind/ndarray.h>
-#include <nanobind/operators.h>
-#include <nanobind/stl/bind_vector.h>
-#include <nanobind/stl/chrono.h>
-#include <nanobind/stl/map.h>
-#include <nanobind/stl/optional.h>
-#include <nanobind/stl/shared_ptr.h>
-#include <nanobind/stl/tuple.h>
-#include <nanobind/stl/unique_ptr.h>
-#include <nanobind/stl/vector.h>
-#include <nanobind/trampoline.h>
-#include <torch/extension.h>
-
-namespace tb = tensorrt_llm::batch_manager;
-namespace tbk = tensorrt_llm::batch_manager::kv_cache_manager;
-namespace tr = tensorrt_llm::runtime;
-namespace nb = nanobind;
-using BlockKey = tbk::BlockKey;
-using VecUniqueTokens = tensorrt_llm::runtime::VecUniqueTokens;
-using SizeType32 = tensorrt_llm::runtime::SizeType32;
-using TokenIdType = tensorrt_llm::runtime::TokenIdType;
-using VecTokens = std::vector<TokenIdType>;
-using CudaStreamPtr = std::shared_ptr<tensorrt_llm::runtime::CudaStream>;
-
-namespace
-{
-std::optional<tensorrt_llm::runtime::ITensor::UniquePtr> from_torch(std::optional<at::Tensor> torchPtr)
-{
-    if (torchPtr)
-    {
-        return tr::TorchView::of(torchPtr.value());
-    }
-    return std::nullopt;
-}
-
-class PyKvCacheManager : public tbk::BaseKVCacheManager
-{
-public:
-    NB_TRAMPOLINE(tbk::BaseKVCacheManager, 28);
-
-    // using BaseKVCacheManager::BaseKVCacheManager; // Inherit constructors
-    void allocatePools(bool useUvm = false) override
-    {
-        NB_OVERRIDE_PURE(allocatePools, useUvm);
-    }
-
-    void releasePools() override
-    {
-        NB_OVERRIDE_PURE(releasePools);
-    }
-
-    void startScheduling() override
-    {
-        NB_OVERRIDE_PURE(startScheduling);
-    }
-
-    SizeType32 getTokensPerBlock() const override
-    {
-        NB_OVERRIDE_PURE(getTokensPerBlock);
-    }
-
-    SizeType32 getMaxNumBlocks() const override
-    {
-        NB_OVERRIDE_PURE(getMaxNumBlocks);
-    }
-
-    SizeType32 getNumPools() const override
-    {
-        NB_OVERRIDE_PURE(getNumPools);
-    }
-
-    tbk::KvCacheStats getKvCacheStats() const override
-    {
-        NB_OVERRIDE_PURE(getKvCacheStats);
-    }
-
-    void addToken(tb::LlmRequest::RequestIdType requestId) override
-    {
-        NB_OVERRIDE_PURE(addToken, requestId);
-    }
-
-    void addSequence(tb::LlmRequest::RequestIdType requestId, SizeType32 inputLength, SizeType32 beamWidth,
-        tensorrt_llm::common::OptionalRef<tb::LlmRequest> llmRequest = std::nullopt) override
-    {
-        NB_OVERRIDE_PURE(addSequence, requestId, inputLength, beamWidth, llmRequest);
-    }
-
-    void removeSequence(tb::LlmRequest::RequestIdType requestId,
-        tensorrt_llm::common::OptionalRef<tb::LlmRequest const> llmRequest = std::nullopt) override
-    {
-        NB_OVERRIDE_PURE(removeSequence, requestId, llmRequest);
-    }
-
-    tbk::GenerationRequest const& getSequence(tb::LlmRequest::RequestIdType requestId) const override
-    {
-        NB_OVERRIDE_PURE(getSequence, requestId);
-    }
-
-    void schedulingRemoveSequence(tb::LlmRequest::RequestIdType requestId) override
-    {
-        NB_OVERRIDE_PURE(schedulingRemoveSequence, requestId);
-    }
-
-    tensorrt_llm::runtime::ITensor::SharedPtr getBlockPoolPointers() const override
-    {
-        NB_OVERRIDE_PURE(getBlockPoolPointers);
-    }
-
-    tensorrt_llm::runtime::ITensor::SharedPtr getLayerToPoolMapping() const override
-    {
-        NB_OVERRIDE_PURE(getLayerToPoolMapping);
-    }
-
-    void getBlockOffsetsOfBatch(tensorrt_llm::runtime::ITensor& output, SizeType32 firstBatchSlotIdx,
-        SizeType32 batchSize, SizeType32 beamWidth) const override
-    {
-        NB_OVERRIDE_PURE(getBlockOffsetsOfBatch, output, firstBatchSlotIdx, batchSize, beamWidth);
-    }
-
-    SizeType32 copyBlockOffsets(tensorrt_llm::runtime::ITensor& output, SizeType32 outputSlotOffset,
-        tb::LlmRequest::RequestIdType requestId) const override
-    {
-        NB_OVERRIDE_PURE(copyBlockOffsets, output, outputSlotOffset, requestId);
-    }
-
-    bool isEnableBlockReuse() const override
-    {
-        NB_OVERRIDE_PURE(isEnableBlockReuse);
-    }
-
-    void rewindKVCache(tb::LlmRequest::RequestIdType requestId, SizeType32 rewindLengths) override
-    {
-        NB_OVERRIDE_PURE(rewindKVCache, requestId, rewindLengths);
-    }
-
-    bool isCrossKv() const override
-    {
-        NB_OVERRIDE_PURE(isCrossKv);
-    }
-
-    std::optional<BlockKey> findNewContextBlock(
-        VecUniqueTokens const& uniqueTokens, tb::LlmRequest const& llmRequest) const override
-    {
-        NB_OVERRIDE_PURE(findNewContextBlock, uniqueTokens, llmRequest);
-    }
-
-    void storeContextBlocks(tb::LlmRequest const& llmRequest) override
-    {
-        NB_OVERRIDE_PURE(storeContextBlocks, llmRequest);
-    }
-
-    std::vector<std::vector<SizeType32>> const& getCacheBlockIds(
-        tb::LlmRequest::RequestIdType requestId, SizeType32 windowSize) const override
-    {
-        NB_OVERRIDE_PURE(getCacheBlockIds, requestId, windowSize);
-    }
-
-    std::vector<std::vector<std::vector<SizeType32>>> getBatchCacheBlockIds(
-        std::vector<tb::LlmRequest::RequestIdType> const& requestIds, SizeType32 windowSize) const override
-    {
-        NB_OVERRIDE_PURE(getBatchCacheBlockIds, requestIds, windowSize);
-    }
-
-    std::vector<SizeType32> getNewlyAllocatedBlockIds(
-        tb::LlmRequest::RequestIdType requestId, SizeType32 windowSize) const override
-    {
-        NB_OVERRIDE_PURE(getNewlyAllocatedBlockIds, requestId, windowSize);
-    }
-
-    SizeType32 getUsedNumBlocks() const override
-    {
-        NB_OVERRIDE_PURE(getUsedNumBlocks);
-    }
-
-    SizeType32 getNumFreeBlocks() const override
-    {
-        NB_OVERRIDE_PURE(getNumFreeBlocks);
-    }
-
-    tbk::BlockManager const& getBlockManager() const override
-    {
-        NB_OVERRIDE_PURE(getBlockManager);
-    }
-
-    std::deque<tensorrt_llm::executor::KVCacheEvent> getLatestEvents(
-        std::optional<std::chrono::milliseconds> timeout = std::nullopt) const override
-    {
-        NB_OVERRIDE_PURE(getLatestEvents, timeout);
-    }
-
-    tensorrt_llm::runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const override
-    {
-        NB_OVERRIDE_PURE(getPrimaryPool, layer_idx);
-    }
-
-    SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const override
-    {
-        NB_OVERRIDE_PURE(getPoolLayerIdx, layer_idx);
-    }
-
-    void refreshBlocks() override
-    {
-        NB_OVERRIDE_PURE(refreshBlocks);
-    }
-
-    void flushIterationEvents() override
-    {
-        NB_OVERRIDE_PURE(flushIterationEvents);
-    }
-};
-
-// TODO: Deduplicate executor bindings KvCacheStats
-class PyBasePeftCacheManager : public tb::BasePeftCacheManager
-{
-public:
-    ~PyBasePeftCacheManager() override = default;
-
-    NB_TRAMPOLINE(tb::BasePeftCacheManager, 8);
-
-    void addRequestPeft(tb::BasePeftCacheManager::LlmRequestPtr llmRequest, bool tryGpuCache = true) override
-    {
-        NB_OVERRIDE_PURE(addRequestPeft, llmRequest, tryGpuCache);
-    }
-
-    tb::BasePeftCacheManager::PeftTable ensureBatch(tb::RequestVector const& contextRequests,
-        tb::RequestVector const& generationRequests, bool resetGpuCache = false) override
-    {
-        NB_OVERRIDE_PURE(ensureBatch, contextRequests, generationRequests, resetGpuCache);
-    }
-
-    void resetDeviceCache() override
-    {
-        NB_OVERRIDE_PURE(resetDeviceCache);
-    }
-
-    void markRequestDone(tb::LlmRequest const& llmReq, bool pause = false) override
-    {
-        NB_OVERRIDE_PURE(markRequestDone, llmReq, pause);
-    }
-
-    tr::SizeType32 getMaxDevicePages() const override
-    {
-        NB_OVERRIDE_PURE(getMaxDevicePages);
-    }
-
-    tr::SizeType32 getMaxHostPages() const override
-    {
-        NB_OVERRIDE_PURE(getMaxHostPages);
-    }
-
-    tr::SizeType32 determineNumPages(std::shared_ptr<tb::LlmRequest> llmRequest) const override
-    {
-        NB_OVERRIDE_PURE(determineNumPages, llmRequest);
-    }
-
-    bool enabled() const override
-    {
-        NB_OVERRIDE_PURE(enabled);
-    }
-};
-} // namespace
-
-void tb::kv_cache_manager::KVCacheManagerBindings::initBindings(nb::module_& m)
-{
-    nb::class_<tbk::KvCacheStats>(m, "KvCacheStats")
-        .def(nb::init<>())
-        .def_rw("max_num_blocks", &tbk::KvCacheStats::maxNumBlocks)
-        .def_rw("free_num_blocks", &tbk::KvCacheStats::freeNumBlocks)
-        .def_rw("used_num_blocks", &tbk::KvCacheStats::usedNumBlocks)
-        .def_rw("tokens_per_block", &tbk::KvCacheStats::toksPerBlock)
-        .def_rw("alloc_total_blocks", &tbk::KvCacheStats::allocTotalBlocks)
-        .def_rw("alloc_new_blocks", &tbk::KvCacheStats::allocNewBlocks)
-        .def_rw("reused_blocks", &tbk::KvCacheStats::reusedBlocks)
-        .def_rw("missed_blocks", &tbk::KvCacheStats::missedBlocks)
-        .def_rw("cache_hit_rate", &tbk::KvCacheStats::cacheHitRate)
-        .def_rw("num_free_blocks_per_window_size", &tbk::KvCacheStats::numFreeBlocksPerWindowSize);
-
-    nb::class_<tbk::TempAttentionWindowInputs>(m, "TempAttentionWindowInputs")
-        .def(nb::init<>())
-        .def_rw("paged_context_fmha", &tbk::TempAttentionWindowInputs::pagedContextFMHA)
-        .def_rw("max_input_len", &tbk::TempAttentionWindowInputs::maxInputLen)
-        .def_rw("max_num_tokens", &tbk::TempAttentionWindowInputs::maxNumTokens);
-
-    nb::class_<tbk::BlockKey>(m, "BlockKey")
-        .def(nb::init<>())
-        .def(nb::init<VecTokens const&, std::optional<tr::LoraTaskIdType>>(), nb::arg("tokens"),
-            nb::arg("lora_task_id") = std::nullopt)
-        .def(nb::init<bool, std::optional<tr::LoraTaskIdType>, VecUniqueTokens const&>(), nb::arg("uses_extra_ids"),
-            nb::arg("lora_task_id"), nb::arg("unique_tokens"))
-        .def_ro("uses_extra_ids", &tbk::BlockKey::usesExtraIds)
-        .def_ro("lora_task_id", &tbk::BlockKey::loraTaskId)
-        .def_ro("unique_tokens", &tbk::BlockKey::uniqueTokens);
-
-    nb::class_<tbk::BlockKeyHasher>(m, "BlockKeyHasher")
-        .def_static("hash", &tbk::BlockKeyHasher::hash, nb::arg("block_key"), nb::arg("parent_hash") = 0);
-
-    nb::class_<tbk::KVCacheEventManager>(m, "KVCacheEventManager")
-        .def(nb::init<size_t>(), nb::arg("max_kv_event_entries"));
-
-    nb::class_<tbk::BaseKVCacheManager, PyKvCacheManager>(m, "BaseKVCacheManager")
-        .def_static("calculate_max_num_blocks", &tbk::BaseKVCacheManager::calculateMaxNumBlocks, nb::arg("config"),
-            nb::arg("is_cross_attention"), nb::arg("dtype"), nb::arg("model_config"), nb::arg("world_config"),
-            nb::arg("window_size_to_layers"), nb::arg("allotted_primary_mem_bytes"),
-            nb::arg("allotted_secondary_mem_bytes"), nb::arg("extra_cost_memory"), nb::arg("kv_factor"))
-        .def("allocate_pools", &BaseKVCacheManager::allocatePools)
-        .def("release_pools", &BaseKVCacheManager::releasePools)
-        .def("start_scheduling", &BaseKVCacheManager::startScheduling)
-        .def_prop_ro("tokens_per_block", &BaseKVCacheManager::getTokensPerBlock)
-        .def_prop_ro("max_num_blocks", &BaseKVCacheManager::getMaxNumBlocks)
-        .def_prop_ro("num_pools", &BaseKVCacheManager::getNumPools)
-        .def("get_kv_cache_stats", &BaseKVCacheManager::getKvCacheStats)
-        .def_prop_ro("max_blocks_per_seq",
-            [](tbk::BaseKVCacheManager& self) { return self.getOffsetTableDimensions().maxBlocksPerSeq; })
-        .def("get_needed_blocks_one_step", &BaseKVCacheManager::getNeededBlocksOneStep)
-        .def("get_remaining_blocks_to_completion", &BaseKVCacheManager::getRemainingBlocksToCompletion)
-        .def("add_token", &BaseKVCacheManager::addToken)
-        .def("add_sequence", &BaseKVCacheManager::addSequence)
-        .def("remove_sequence", &BaseKVCacheManager::removeSequence)
-        .def("scheduling_remove_sequence", &BaseKVCacheManager::schedulingRemoveSequence)
-        .def("get_block_pool_pointers",
-            [](tbk::BaseKVCacheManager& self)
-            {
-                std::optional<at::Tensor> block_pool_pointers{std::nullopt};
-                auto tensor = self.getBlockPoolPointers();
-                if (tensor)
-                {
-                    std::shared_ptr<tensorrt_llm::runtime::ITensor> _tensor = std::move(tensor);
-                    block_pool_pointers = tr::Torch::tensor(_tensor);
-                }
-                return block_pool_pointers;
-            })
-        .def("get_layer_to_pool_mapping",
-            [](tbk::BaseKVCacheManager& self)
-            {
-                std::optional<at::Tensor> layer_to_pool_mapping{std::nullopt};
-                auto tensor = self.getLayerToPoolMapping();
-                if (tensor)
-                {
-                    std::shared_ptr<tensorrt_llm::runtime::ITensor> _tensor = std::move(tensor);
-                    layer_to_pool_mapping = tr::Torch::tensor(_tensor);
-                }
-                return layer_to_pool_mapping;
-            })
-        .def("get_primary_pool_data",
-            [](tbk::BaseKVCacheManager& self, SizeType32 layer_idx) -> at::Tensor
-            {
-                auto pool = tr::Torch::tensor(self.getPrimaryPool(layer_idx));
-                auto pool_layer_idx = self.getPoolLayerIdx(layer_idx);
-                return pool.index({torch::indexing::Slice(), pool_layer_idx});
-            })
-        .def("get_block_offsets_of_batch",
-            [](tbk::BaseKVCacheManager& self, at::Tensor output, SizeType32 firstBatchSlotIdx, SizeType32 batchSize,
-                SizeType32 beamWidth)
-            {
-                auto _output = from_torch(output);
-                TLLM_CHECK_WITH_INFO(_output.has_value(), "Invalid output tensor.");
-                self.getBlockOffsetsOfBatch(*(_output.value()), firstBatchSlotIdx, batchSize, beamWidth);
-            })
-        .def("copy_block_offsets",
-            [](tbk::BaseKVCacheManager& self, at::Tensor output, SizeType32 outputSlotOffset,
-                tb::LlmRequest::RequestIdType requestId)
-            {
-                auto _output = from_torch(output);
-                TLLM_CHECK_WITH_INFO(_output.has_value(), "Invalid output tensor.");
-                auto maxBlockCount = self.copyBlockOffsets(*(_output.value()), outputSlotOffset, requestId);
-                return maxBlockCount;
-            })
-        .def("copy_batch_block_offsets",
-            [](tbk::BaseKVCacheManager& self, at::Tensor output,
-                std::vector<tb::LlmRequest::RequestIdType> const& requestIds, SizeType32 const beamWidth,
-                SizeType32 const offset)
-            {
-                auto _output = from_torch(output);
-                TLLM_CHECK_WITH_INFO(_output.has_value(), "Invalid output tensor.");
-                for (size_t i = 0; i < requestIds.size(); ++i)
-                {
-                    self.copyBlockOffsets(*(_output.value()), i * beamWidth + offset, requestIds[i]);
-                }
-            })
-        .def(
-            "get_latest_events",
-            [](tbk::BaseKVCacheManager& self, std::optional<double> timeout_ms = std::nullopt)
-            {
-                if (timeout_ms)
-                {
-                    return self.getLatestEvents(std::chrono::milliseconds(static_cast<int64_t>(*timeout_ms)));
-                }
-                return self.getLatestEvents(std::nullopt);
-            },
-            nb::arg("timeout_ms") = std::nullopt)
-        .def_prop_ro("enable_block_reuse", &BaseKVCacheManager::isEnableBlockReuse)
-        .def("rewind_kv_cache", &BaseKVCacheManager::rewindKVCache)
-        .def_prop_ro("cross_kv", &BaseKVCacheManager::isCrossKv)
-        .def("store_context_blocks", &BaseKVCacheManager::storeContextBlocks)
-        .def("get_cache_block_ids", &BaseKVCacheManager::getCacheBlockIds)
-        .def("get_batch_cache_block_ids", &BaseKVCacheManager::getBatchCacheBlockIds)
-        .def("get_newly_allocated_block_ids", &BaseKVCacheManager::getNewlyAllocatedBlockIds)
-        .def("flush_iteration_events", &BaseKVCacheManager::flushIterationEvents);
-
-    nb::bind_vector<std::vector<std::vector<SizeType32>>>(m, "CacheBlockIds");
-
-    nb::enum_<tbk::CacheType>(m, "CacheType")
-        .value("SELF", tbk::CacheType::kSELF)
-        .value("CROSS", tbk::CacheType::kCROSS)
-        .value("SELFKONLY", tbk::CacheType::kSELFKONLY);
-
-    nb::class_<tbk::KVCacheManager, tbk::BaseKVCacheManager>(m, "KVCacheManager")
-        .def(nb::init<std::vector<SizeType32> const&, SizeType32, SizeType32,
-                 std::map<SizeType32, std::tuple<SizeType32, SizeType32>> const&, SizeType32, SizeType32,
-                 std::vector<SizeType32> const&, std::optional<tbk::TempAttentionWindowInputs> const&,
-                 nvinfer1::DataType, SizeType32, int64_t, std::optional<runtime::SizeType32>, bool, bool,
-                 tbk::CacheType, std::optional<tensorrt_llm::executor::RetentionPriority>,
-                 std::shared_ptr<tbk::KVCacheEventManager>, bool, bool>(),
-            nb::arg("num_kv_heads_per_layer"), nb::arg("size_per_head"), nb::arg("tokens_per_block"),
-            nb::arg("blocks_per_window"), nb::arg("max_num_sequences"), nb::arg("max_beam_width"),
-            nb::arg("max_attention_window_vec"), nb::arg("temp_attention_window_inputs").none(), nb::arg("dtype"),
-            nb::arg("sink_token_length"), nb::arg("stream"), nb::arg("max_sequence_length").none(),
-            nb::arg("enable_block_reuse") = false, nb::arg("onboard_blocks") = true,
-            nb::arg("cache_type") = tbk::CacheType::kSELF, nb::arg("secondary_offload_min_priority") = std::nullopt,
-            nb::arg("event_manager") = nullptr, nb::arg("enable_partial_reuse") = true,
-            nb::arg("copy_on_partial_reuse") = true);
-}
-
-void tb::BasePeftCacheManagerBindings::initBindings(nb::module_& m)
-{
-    nb::class_<tb::BasePeftCacheManager, PyBasePeftCacheManager>(m, "BasePeftCacheManager")
-        .def("add_request_peft", &tb::BasePeftCacheManager::addRequestPeft, nb::arg("request"),
-            nb::arg("try_gpu_cache") = true)
-        .def(
-            "ensure_batch",
-            [](tb::BasePeftCacheManager& self, tb::RequestVector const& contextRequests,
-                tb::RequestVector const& generationRequests, bool resetGpuCache)
-            {
-                nb::gil_scoped_release release;
-                return self.ensureBatch(contextRequests, generationRequests, resetGpuCache);
-            },
-            nb::arg("context_requests"), nb::arg("generation_requests"), nb::arg("reset_gpu_cache") = false)
-        .def("reset_device_cache", &tb::BasePeftCacheManager::resetDeviceCache)
-        .def("mark_request_done", &tb::BasePeftCacheManager::markRequestDone, nb::arg("request"),
-            nb::arg("pause") = false)
-        .def_prop_ro("max_device_pages", &tb::BasePeftCacheManager::getMaxDevicePages)
-        .def_prop_ro("max_host_pages", &tb::BasePeftCacheManager::getMaxHostPages)
-        .def("determine_num_pages", &tb::BasePeftCacheManager::determineNumPages, nb::arg("request"))
-        .def_prop_ro("enabled", &tb::BasePeftCacheManager::enabled);
-
-    nb::class_<tb::PeftCacheManager, tb::BasePeftCacheManager>(m, "PeftCacheManager")
-        .def(nb::init<tb::PeftCacheManagerConfig, tr::ModelConfig, tr::WorldConfig, tr::BufferManager>(),
-            nb::arg("config"), nb::arg("model_config"), nb::arg("world_config"), nb::arg("buffer_manager"));
-
-    nb::class_<tb::NoOpPeftCacheManager, tb::BasePeftCacheManager>(m, "NoOpPeftCacheManager").def(nb::init<>());
-}
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.h b/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.h
deleted file mode 100644
index 786c0d391df..00000000000
--- a/cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <nanobind/nanobind.h>
-namespace nb = nanobind;
-
-namespace tensorrt_llm::batch_manager::kv_cache_manager
-{
-class KVCacheManagerBindings
-{
-public:
-    static void initBindings(nb::module_& m);
-};
-} // namespace tensorrt_llm::batch_manager::kv_cache_manager
-
-namespace tensorrt_llm::batch_manager
-{
-class BasePeftCacheManagerBindings
-{
-public:
-    static void initBindings(nb::module_& m);
-};
-} // namespace tensorrt_llm::batch_manager
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.cpp
deleted file mode 100644
index d8f45cb865f..00000000000
--- a/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.cpp
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "llmRequest.h"
-#include "tensorrt_llm/nanobind/common/customCasters.h"
-
-#include "tensorrt_llm/batch_manager/llmRequest.h"
-#include "tensorrt_llm/nanobind/common/bindTypes.h"
-#include "tensorrt_llm/runtime/torch.h"
-#include "tensorrt_llm/runtime/torchUtils.h"
-#include "tensorrt_llm/runtime/torchView.h"
-
-#include <ATen/ATen.h>
-#include <torch/extension.h>
-
-#include <memory>
-
-namespace tb = tensorrt_llm::batch_manager;
-namespace tr = tensorrt_llm::runtime;
-namespace tle = tensorrt_llm::executor;
-
-using namespace tensorrt_llm::nanobind::batch_manager;
-
-using LlmRequestPtr = std::shared_ptr<tb::LlmRequest>;
-using RequestList = std::list<LlmRequestPtr>;
-
-namespace
-{
-
-std::optional<tb::LlmRequest::TensorPtr> from_torch(std::optional<LlmRequest::TensorPtr> torchPtr)
-{
-    if (torchPtr)
-    {
-        return tr::TorchView::of(torchPtr.value());
-    }
-    return std::nullopt;
-}
-
-} // namespace
-
-std::optional<tb::LlmRequest::LogitsPostProcessor> LlmRequest::callbackAdapter(
-    std::optional<LlmRequest::LogitsPostProcessor> callback)
-{
-    if (!callback)
-    {
-        return std::nullopt;
-    }
-
-    return [callback](RequestIdType reqId, tr::ITensor::SharedPtr& tensor, tb::LlmRequest::BeamTokens const& tokens,
-               tr::BufferManager::CudaStreamPtr stream, std::optional<RequestIdType> clientId)
-    {
-        at::Tensor atTensor = tr::Torch::tensor(tensor);
-        callback.value()(reqId, atTensor, tokens, runtime::TorchUtils::stream(*stream).unwrap(), clientId);
-    };
-}
-
-std::shared_ptr<tb::LlmRequest> LlmRequest::toTrtLlm() const
-{
-
-    auto const draftTokens = std::make_shared<std::vector<TokenIdType>>(*mDraftTokens.get());
-    auto const optDraftTokens = std::optional<std::shared_ptr<std::vector<TokenIdType>>>(draftTokens);
-    auto const encoderInputTokens = mEncoderTokens.has_value()
-        ? std::make_shared<std::vector<TokenIdType>>(*mEncoderTokens.value().get())
-        : nullptr;
-    auto const optEncoderInputTokens = std::optional<std::shared_ptr<std::vector<TokenIdType>>>(encoderInputTokens);
-    // 49 parameters
-    return std::make_shared<tb::LlmRequest>(                       //
-        mRequestId,                                                //
-        mMaxNewTokens,                                             //
-        std::make_shared<std::vector<TokenIdType>>(mTokens.at(0)), //
-        mSamplingConfig,                                           //
-        mIsStreaming,                                              //
-        mEndId,                                                    //
-        mPadId,                                                    //
-        from_torch(mEmbeddingBias),                                //
-        from_torch(mBadWordsList),                                 //
-        from_torch(mStopWordsList),                                //
-        mPositionIds,                                              //
-        from_torch(mPromptEmbeddingTable),                         //
-        mPromptVocabSize,                                          //
-        mMultimodalHashes,                                         //
-        mMultimodalPositions,                                      //
-        mMultimodalLengths,                                        //
-        from_torch(mMultimodalEmbedding),                          //
-        from_torch(mMropeRotaryCosSin),                            //
-        mMropePositionDeltas,                                      //
-        mLoraTaskId,                                               //
-        from_torch(mLoraWeights),                                  //
-        from_torch(mLoraConfig),                                   //
-        mLookaheadConfig,                                          //
-        mKvCacheRetentionConfig,                                   //
-        mReturnLogProbs,                                           //
-        mReturnContextLogits,                                      //
-        mReturnGenerationLogits,                                   //
-        optDraftTokens,                                            //
-        from_torch(mDraftLogits),                                  //
-        mExcludeInputFromOutput,                                   //
-        callbackAdapter(mLogitsPostProcessor),                     //
-        mApplyLogitsPostProcessorBatched,                          //
-        optEncoderInputTokens,                                     //
-        mReturnEncoderOutput,                                      //
-        mClientId,                                                 //
-        mPriority,                                                 //
-        from_torch(mEncoderInputFeatures),                         //
-        mEncoderOutputLength,                                      //
-        from_torch(mCrossAttentionMask),                           //
-        getLlmRequestType(),                                       //
-        std::nullopt,                                              // inputTokenExtraIds
-        mNumReturnSequences,                                       //
-        mEagleConfig,                                              //
-        from_torch(mSkipCrossAttnBlocks),                          //
-        false,                                                     // returnPerfMetrics
-        mGuidedDecodingParams,                                     //
-        mLanguageAdapterUid,                                       //
-        mAllottedTimeMs,                                           //
-        mContextPhaseParams                                        //
-    );
-}
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.h b/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.h
deleted file mode 100644
index 624dc55112d..00000000000
--- a/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.h
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "tensorrt_llm/batch_manager/llmRequest.h"
-
-#include <ATen/ATen.h>
-#include <ATen/ops/tensor.h>
-#include <memory>
-#include <nanobind/nanobind.h>
-#include <optional>
-
-namespace nb = nanobind;
-
-namespace tensorrt_llm::nanobind::batch_manager
-{
-
-namespace tb = tensorrt_llm::batch_manager;
-
-/* Unfortunately, torch's default nanobind bindings don't know about c10::cuda::CUDAStream,
- * so we have to pass the more generic c10::Stream, and convert it back to a full-fledged
- * torch.cuda.Stream in python. See example in test/bindings/test_gpt_manager.py
- */
-class LlmRequest : public tb::GenericLlmRequest<at::Tensor, c10::Stream>
-{
-public:
-    using Base = GenericLlmRequest<at::Tensor, c10::Stream>;
-    using TensorPtr = Base::TensorPtr;
-    using SizeType32 = Base::SizeType32;
-    using TokenIdType = Base::TokenIdType;
-    using RequestIdType = Base::RequestIdType;
-    using LoraTaskIdType = Base::LoraTaskIdType;
-    using VecLogProbs = Base::VecLogProbs;
-    using BeamTokens = Base::BeamTokens;
-    using VecTokens = Base::VecTokens;
-    using VecTokenExtraIds = Base::VecTokenExtraIds;
-    using LogitsPostProcessor = Base::LogitsPostProcessor;
-
-    // 49 parameters
-    LlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, std::vector<TokenIdType> inputTokens,
-        runtime::SamplingConfig samplingConfig, bool isStreaming, std::optional<SizeType32> endId = std::nullopt,
-        std::optional<SizeType32> padId = std::nullopt, std::optional<TensorPtr> embeddingBias = std::nullopt,
-        std::optional<TensorPtr> badWordsList = std::nullopt, std::optional<TensorPtr> stopWordsList = std::nullopt,
-        std::optional<std::vector<SizeType32>> positionIds = std::nullopt,
-        std::optional<TensorPtr> promptEmbeddingTable = std::nullopt,
-        std::optional<SizeType32> promptVocabSize = std::nullopt,
-        std::optional<std::vector<std::vector<SizeType32>>> multimodalHashes = std::nullopt,
-        std::optional<std::vector<SizeType32>> multimodalPositions = std::nullopt,
-        std::optional<std::vector<SizeType32>> multimodalLengths = std::nullopt,
-        std::optional<TensorPtr> multimodalEmbedding = std::nullopt,
-        std::optional<TensorPtr> mropeRotaryCosSin = std::nullopt,
-        std::optional<SizeType32> mropePositionDeltas = std::nullopt,
-        std::optional<LoraTaskIdType> loraTaskId = std::nullopt, std::optional<TensorPtr> loraWeights = std::nullopt,
-        std::optional<TensorPtr> loraConfig = std::nullopt,
-        std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt,
-        std::optional<executor::KvCacheRetentionConfig> kvCacheRetentionConfig = std::nullopt,
-        bool returnLogProbs = false, bool returnContextLogits = false, bool returnGenerationLogits = false,
-        std::optional<VecTokens> draftTokens = std::nullopt, std::optional<TensorPtr> draftLogits = std::nullopt,
-        bool excludeInputFromOutput = false, std::optional<LogitsPostProcessor> logitsPostProcessor = std::nullopt,
-        bool applyLogitsPostProcessorBatched = false, std::optional<VecTokens> encoderInputTokens = std::nullopt,
-        bool returnEncoderOutput = false, std::optional<RequestIdType> clientId = std::nullopt,
-        executor::PriorityType priority = executor::Request::kDefaultPriority,
-        std::optional<TensorPtr> encoderInputFeatures = std::nullopt,
-        std::optional<SizeType32> encoderOutputLength = std::nullopt,
-        std::optional<TensorPtr> crossAttentionMask = std::nullopt,
-        tb::LlmRequestType llmRequestType = tb::LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION,
-        std::optional<VecTokenExtraIds> inputTokenExtraIds = std::nullopt, SizeType32 numReturnSequences = 1,
-        std::optional<executor::EagleConfig> eagleConfig = std::nullopt,
-        std::optional<TensorPtr> skipCrossAttnBlocks = std::nullopt, bool returnPerfMetrics = false,
-        std::optional<executor::GuidedDecodingParams> guidedDecodingParams = std::nullopt,
-        std::optional<SizeType32> languageAdapterUid = std::nullopt,
-        std::optional<MillisecondsType> allottedTimeMs = std::nullopt,
-        std::optional<executor::ContextPhaseParams> const& contextPhaseParams = std::nullopt)
-        : Base(requestId,                                                                                       //
-            maxNewTokens,                                                                                       //
-            std::make_shared<std::vector<TokenIdType>>(std::move(inputTokens)),                                 //
-            samplingConfig,                                                                                     //
-            isStreaming,                                                                                        //
-            endId,                                                                                              //
-            padId,                                                                                              //
-            embeddingBias,                                                                                      //
-            badWordsList,                                                                                       //
-            stopWordsList,                                                                                      //
-            positionIds.has_value() ? std::make_shared<std::vector<SizeType32>>(std::move(positionIds.value())) //
-                                    : std::optional<std::shared_ptr<std::vector<SizeType32>>>(std::nullopt),    //
-            promptEmbeddingTable,                                                                               //
-            promptVocabSize,                                                                                    //
-            multimodalHashes.has_value()
-                ? std::make_optional(
-                    std::make_shared<std::vector<std::vector<SizeType32>>>(std::move(multimodalHashes.value()))) //
-                : std::optional<std::shared_ptr<std::vector<std::vector<SizeType32>>>>(std::nullopt),            //
-            multimodalPositions.has_value()
-                ? std::make_shared<std::vector<SizeType32>>(std::move(multimodalPositions.value()))              //
-                : std::optional<std::shared_ptr<std::vector<SizeType32>>>(std::nullopt),                         //
-            multimodalLengths.has_value()
-                ? std::make_shared<std::vector<SizeType32>>(std::move(multimodalLengths.value()))                //
-                : std::optional<std::shared_ptr<std::vector<SizeType32>>>(std::nullopt),                         //
-            multimodalEmbedding,                                                                                 //
-            mropeRotaryCosSin,                                                                                   //
-            mropePositionDeltas,                                                                                 //
-            loraTaskId,                                                                                          //
-            loraWeights,                                                                                         //
-            loraConfig,                                                                                          //
-            lookaheadConfig,                                                                                     //
-            kvCacheRetentionConfig,                                                                              //
-            returnLogProbs,                                                                                      //
-            returnContextLogits,                                                                                 //
-            returnGenerationLogits,                                                                              //
-            draftTokens.has_value() ? std::make_shared<VecTokens>(std::move(draftTokens.value()))                //
-                                    : std::make_shared<VecTokens>(),                                             //
-            draftLogits,                                                                                         //
-            excludeInputFromOutput,                                                                              //
-            logitsPostProcessor,                                                                                 //
-            applyLogitsPostProcessorBatched,                                                                     //
-            encoderInputTokens ? std::make_optional(std::make_shared<VecTokens>(std::move(*encoderInputTokens))) //
-                               : std::optional<std::shared_ptr<VecTokens>>(std::nullopt),                        //
-            returnEncoderOutput,                                                                                 //
-            clientId,                                                                                            //
-            priority,                                                                                            //
-            encoderInputFeatures,                                                                                //
-            encoderOutputLength,                                                                                 //
-            crossAttentionMask,                                                                                  //
-            llmRequestType,                                                                                      //
-            inputTokenExtraIds                                                                                   //
-                ? std::make_optional(std::make_shared<VecTokenExtraIds>(std::move(*inputTokenExtraIds)))         //
-                : std::optional<std::shared_ptr<VecTokenExtraIds>>(std::nullopt),                                //
-            numReturnSequences,                                                                                  //
-            eagleConfig,                                                                                         //
-            skipCrossAttnBlocks,                                                                                 //
-            returnPerfMetrics,                                                                                   //
-            guidedDecodingParams,                                                                                //
-            languageAdapterUid,                                                                                  //
-            allottedTimeMs,                                                                                      //
-            contextPhaseParams                                                                                   //
-        )
-    {
-    }
-
-    static std::optional<tb::LlmRequest::LogitsPostProcessor> callbackAdapter(
-        std::optional<LlmRequest::LogitsPostProcessor> callback);
-
-    [[nodiscard]] std::shared_ptr<tensorrt_llm::batch_manager::LlmRequest> toTrtLlm() const;
-};
-
-} // namespace tensorrt_llm::nanobind::batch_manager
diff --git a/cpp/tensorrt_llm/nanobind/bindings.cpp b/cpp/tensorrt_llm/nanobind/bindings.cpp
index dd01d21cced..adc82587433 100644
--- a/cpp/tensorrt_llm/nanobind/bindings.cpp
+++ b/cpp/tensorrt_llm/nanobind/bindings.cpp
@@ -1,5 +1,5 @@
 /*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,483 +15,14 @@
  * limitations under the License.
  */
 
-#include "tensorrt_llm/nanobind/common/customCasters.h"
 #include <nanobind/nanobind.h>
-#include <nanobind/operators.h>
-#include <nanobind/stl/bind_vector.h>
-#include <nanobind/stl/filesystem.h>
-#include <nanobind/stl/optional.h>
-#include <nanobind/stl/shared_ptr.h>
-#include <nanobind/stl/string.h>
-#include <nanobind/stl/tuple.h>
-#include <nanobind/stl/unique_ptr.h>
-
-#include <torch/extension.h>
-#include <vector>
-
-#include "tensorrt_llm/batch_manager/peftCacheManagerConfig.h"
-#include "tensorrt_llm/common/quantization.h"
-#include "tensorrt_llm/nanobind/batch_manager/algorithms.h"
-#include "tensorrt_llm/nanobind/batch_manager/bindings.h"
-#include "tensorrt_llm/nanobind/batch_manager/buffers.h"
-#include "tensorrt_llm/nanobind/batch_manager/cacheTransceiver.h"
-#include "tensorrt_llm/nanobind/batch_manager/kvCacheManager.h"
-#include "tensorrt_llm/nanobind/batch_manager/llmRequest.h"
-#include "tensorrt_llm/nanobind/executor/bindings.h"
-#include "tensorrt_llm/nanobind/runtime/bindings.h"
-#include "tensorrt_llm/nanobind/testing/modelSpecBinding.h"
-#include "tensorrt_llm/nanobind/userbuffers/bindings.h"
-#include "tensorrt_llm/runtime/common.h"
-#include "tensorrt_llm/runtime/cudaStream.h"
-#include "tensorrt_llm/runtime/gptJsonConfig.h"
-#include "tensorrt_llm/runtime/ipcNvlsMemory.h"
-#include "tensorrt_llm/runtime/memoryCounters.h"
-#include "tensorrt_llm/runtime/samplingConfig.h"
-#include "tensorrt_llm/runtime/utils/mpiUtils.h"
-
-namespace nb = nanobind;
-namespace tb = tensorrt_llm::batch_manager;
-namespace tbk = tensorrt_llm::batch_manager::kv_cache_manager;
-namespace tpb = tensorrt_llm::nanobind::batch_manager;
-namespace tc = tensorrt_llm::common;
-namespace tr = tensorrt_llm::runtime;
-namespace tle = tensorrt_llm::executor;
-using SizeType32 = tr::SizeType32;
-using TokenIdType = tr::TokenIdType;
-template <typename T>
-using OptVec = std::optional<std::vector<T>>;
 
 #if not defined(TRTLLM_NB_MODULE)
 #error "TRTLLM_NB_MODULE must be defined"
 #endif
 
-namespace
-{
-tr::SamplingConfig makeSamplingConfig(std::vector<tr::SamplingConfig> const& configs)
-{
-    return tr::SamplingConfig(configs);
-}
-} // namespace
-
 NB_MODULE(TRTLLM_NB_MODULE, m)
 {
     m.doc() = "TensorRT-LLM Python bindings for C++ runtime";
     m.attr("binding_type") = "nanobind";
-    nb::set_leak_warnings(false);
-
-    // Create MpiComm binding first since it's used in the executor bindings
-    nb::class_<tensorrt_llm::mpi::MpiComm>(m, "MpiComm")
-        .def_static("rank",
-            []()
-            {
-                auto& session = tensorrt_llm::mpi::MpiComm::session();
-                return session.tensorrt_llm::mpi::MpiComm::getRank();
-            })
-        .def_static("size",
-            []()
-            {
-                auto& session = tensorrt_llm::mpi::MpiComm::session();
-                return session.tensorrt_llm::mpi::MpiComm::getSize();
-            })
-        .def_static("local_size",
-            []()
-            {
-                auto& session = tensorrt_llm::mpi::MpiComm::localSession();
-                return session.tensorrt_llm::mpi::MpiComm::getSize();
-            })
-        .def_static("local_init", []() { tensorrt_llm::mpi::MpiComm::localSession(); })
-        .def_static("set_raw_mpi_session_by_fortran_handle",
-            [](int64_t fortran_handle) { tensorrt_llm::mpi::MpiComm::setRawSessionByFortran(fortran_handle); })
-        .def_static("split",
-            [](size_t color, size_t rank)
-            {
-                auto& world = tensorrt_llm::mpi::MpiComm::world();
-                tensorrt_llm::mpi::MpiComm::setSession(world.split(color, rank));
-            });
-
-    nb::class_<tr::CudaStream>(m, "CudaStream")
-        .def(
-            "__init__",
-            [](tr::CudaStream* self, nb::object py_stream)
-            {
-                cudaStream_t stream = reinterpret_cast<cudaStream_t>(nb::cast<uintptr_t>(py_stream));
-                new (self) tr::CudaStream{stream};
-            },
-            nb::arg("stream_ptr"))
-        .def("get_device", &tr::CudaStream::getDevice);
-
-    // Create submodule for executor bindings.
-    auto mExecutor = m.def_submodule("executor", "Executor bindings");
-    auto mInternal = m.def_submodule("internal", "Internal submodule of TRTLLM runtime");
-    auto mInternalRuntime = mInternal.def_submodule("runtime", "Runtime internal bindings");
-    auto mInternalTesting = mInternal.def_submodule("testing", "Testing internal bindings");
-    auto mInternalBatchManager = mInternal.def_submodule("batch_manager", "Batch manager internal bindings");
-
-    tensorrt_llm::nanobind::executor::initBindings(mExecutor);
-    tensorrt_llm::nanobind::runtime::initBindingsEarly(mInternalRuntime);
-
-    auto buildInfo = m.def_submodule("BuildInfo");
-    buildInfo.attr("ENABLE_MULTI_DEVICE") = nb::int_(ENABLE_MULTI_DEVICE);
-
-    nb::class_<tb::PeftCacheManagerConfig>(m, "PeftCacheManagerConfig")
-        .def(nb::init<SizeType32, SizeType32, SizeType32, SizeType32, SizeType32, SizeType32, SizeType32, SizeType32,
-                 SizeType32, std::optional<float>, std::optional<size_t>, std::optional<std::string>>(),
-            nb::arg("num_host_module_layer") = 0, nb::arg("num_device_module_layer") = 0,
-            nb::arg("optimal_adapter_size") = 8, nb::arg("max_adapter_size") = 64, nb::arg("num_put_workers") = 1,
-            nb::arg("num_ensure_workers") = 1, nb::arg("num_copy_streams") = 1,
-            nb::arg("max_pages_per_block_host") = 24, nb::arg("max_pages_per_block_device") = 8,
-            nb::arg("device_cache_percent") = std::nullopt, nb::arg("host_cache_size") = std::nullopt,
-            nb::arg("lora_prefetch_dir") = std::nullopt)
-        .def_rw("num_host_module_layer", &tb::PeftCacheManagerConfig::numHostModuleLayer)
-        .def_rw("num_device_module_layer", &tb::PeftCacheManagerConfig::numDeviceModuleLayer)
-        .def_rw("optimal_adapter_size", &tb::PeftCacheManagerConfig::optimalAdapterSize)
-        .def_rw("max_adapter_size", &tb::PeftCacheManagerConfig::maxAdapterSize)
-        .def_rw("num_put_workers", &tb::PeftCacheManagerConfig::numPutWorkers)
-        .def_rw("num_ensure_workers", &tb::PeftCacheManagerConfig::numEnsureWorkers)
-        .def_rw("num_copy_streams", &tb::PeftCacheManagerConfig::numCopyStreams)
-        .def_rw("max_pages_per_block_host", &tb::PeftCacheManagerConfig::maxPagesPerBlockHost)
-        .def_rw("max_pages_per_block_device", &tb::PeftCacheManagerConfig::maxPagesPerBlockDevice)
-        .def_rw("device_cache_percent", &tb::PeftCacheManagerConfig::deviceCachePercent)
-        .def_rw("host_cache_size", &tb::PeftCacheManagerConfig::hostCacheSize)
-        .def_rw("lora_prefetch_dir", &tb::PeftCacheManagerConfig::loraPrefetchDir);
-
-    nb::enum_<nvinfer1::DataType>(m, "DataType")
-        .value("FLOAT", nvinfer1::DataType::kFLOAT)
-        .value("HALF", nvinfer1::DataType::kHALF)
-        .value("INT8", nvinfer1::DataType::kINT8)
-        .value("INT32", nvinfer1::DataType::kINT32)
-        .value("BOOL", nvinfer1::DataType::kBOOL)
-        .value("UINT8", nvinfer1::DataType::kUINT8)
-        .value("FP8", nvinfer1::DataType::kFP8)
-        .value("BF16", nvinfer1::DataType::kBF16)
-        .value("INT64", nvinfer1::DataType::kINT64)
-        .export_values();
-
-    nb::enum_<tr::ModelConfig::ModelVariant>(m, "GptModelVariant")
-        .value("GPT", tr::ModelConfig::ModelVariant::kGpt)
-        .value("GLM", tr::ModelConfig::ModelVariant::kGlm)
-        .value("CHATGLM", tr::ModelConfig::ModelVariant::kChatGlm)
-        .value("MAMBA", tr::ModelConfig::ModelVariant::kMamba)
-        .value("RECURRENTGEMMA", tr::ModelConfig::ModelVariant::kRecurrentGemma);
-
-    nb::enum_<tr::ModelConfig::KVCacheType>(m, "KVCacheType")
-        .value("CONTINUOUS", tr::ModelConfig::KVCacheType::kCONTINUOUS)
-        .value("PAGED", tr::ModelConfig::KVCacheType::kPAGED)
-        .value("DISABLED", tr::ModelConfig::KVCacheType::kDISABLED)
-        .def("from_string", tr::ModelConfig::KVCacheTypeFromString);
-
-    nb::enum_<tr::ModelConfig::LayerType>(m, "LayerType")
-        .value("ATTENTION", tr::ModelConfig::LayerType::kATTENTION)
-        .value("RECURRENT", tr::ModelConfig::LayerType::kRECURRENT);
-
-    nb::enum_<tr::LoraModule::ModuleType>(m, "LoraModuleType")
-        .value("INVALID", tr::LoraModule::ModuleType::kINVALID)
-        .value("ATTN_QKV", tr::LoraModule::ModuleType::kATTN_QKV)
-        .value("ATTN_Q", tr::LoraModule::ModuleType::kATTN_Q)
-        .value("ATTN_K", tr::LoraModule::ModuleType::kATTN_K)
-        .value("ATTN_V", tr::LoraModule::ModuleType::kATTN_V)
-        .value("ATTN_DENSE", tr::LoraModule::ModuleType::kATTN_DENSE)
-        .value("MLP_H_TO_4H", tr::LoraModule::ModuleType::kMLP_H_TO_4H)
-        .value("MLP_4H_TO_H", tr::LoraModule::ModuleType::kMLP_4H_TO_H)
-        .value("MLP_GATE", tr::LoraModule::ModuleType::kMLP_GATE)
-        .value("CROSS_ATTN_QKV", tr::LoraModule::ModuleType::kCROSS_ATTN_QKV)
-        .value("CROSS_ATTN_Q", tr::LoraModule::ModuleType::kCROSS_ATTN_Q)
-        .value("CROSS_ATTN_K", tr::LoraModule::ModuleType::kCROSS_ATTN_K)
-        .value("CROSS_ATTN_V", tr::LoraModule::ModuleType::kCROSS_ATTN_V)
-        .value("CROSS_ATTN_DENSE", tr::LoraModule::ModuleType::kCROSS_ATTN_DENSE)
-        .value("MOE_H_TO_4H", tr::LoraModule::ModuleType::kMOE_H_TO_4H)
-        .value("MOE_4H_TO_H", tr::LoraModule::ModuleType::kMOE_4H_TO_H)
-        .value("MOE_GATE", tr::LoraModule::ModuleType::kMOE_GATE)
-        .value("MOE_ROUTER", tr::LoraModule::ModuleType::kMOE_ROUTER)
-        .value("MLP_ROUTER", tr::LoraModule::ModuleType::kMLP_ROUTER)
-        .value("MLP_GATE_UP", tr::LoraModule::ModuleType::kMLP_GATE_UP);
-
-    nb::class_<tr::LoraModule>(m, "LoraModule")
-        .def(nb::init<tr::LoraModule::ModuleType, SizeType32, SizeType32, bool, bool, SizeType32, SizeType32>(),
-            nb::arg("module_type"), nb::arg("in_dim"), nb::arg("out_dim"), nb::arg("in_dim_first"),
-            nb::arg("out_dim_first"), nb::arg("in_tp_split_dim"), nb::arg("out_tp_split_dim"))
-        .def_prop_ro("module_type", &tr::LoraModule::name)
-        .def_prop_ro("in_dim", &tr::LoraModule::inDim)
-        .def_prop_ro("out_dim", &tr::LoraModule::outDim)
-        .def_prop_ro("in_dim_first", &tr::LoraModule::inDimFirst)
-        .def_prop_ro("out_dim_first", &tr::LoraModule::outDimFirst)
-        .def_prop_ro("in_tp_split_dim", &tr::LoraModule::inTpSplitDim)
-        .def_prop_ro("out_tp_split_dim", &tr::LoraModule::outTpSplitDim)
-        .def_static("create_lora_modules", &tr::LoraModule::createLoraModules, nb::arg("lora_module_names"),
-            nb::arg("hidden_size"), nb::arg("mlp_hidden_size"), nb::arg("num_attention_heads"),
-            nb::arg("num_kv_attention_heads"), nb::arg("attention_head_size"), nb::arg("tp_size") = 1,
-            nb::arg("num_experts") = 0);
-
-    nb::class_<tc::QuantMode>(m, "QuantMode")
-        .def_static("none", &tc::QuantMode::none)
-        .def_static("int4_weights", &tc::QuantMode::int4Weights)
-        .def_static("int8_weights", &tc::QuantMode::int8Weights)
-        .def_static("activations", &tc::QuantMode::activations)
-        .def_static("per_channel_scaling", &tc::QuantMode::perChannelScaling)
-        .def_static("per_token_scaling", &tc::QuantMode::perTokenScaling)
-        .def_static("per_group_scaling", &tc::QuantMode::perGroupScaling)
-        .def_static("int8_kv_cache", &tc::QuantMode::int8KvCache)
-        .def_static("fp8_kv_cache", &tc::QuantMode::fp8KvCache)
-        .def_static("fp8_qdq", &tc::QuantMode::fp8Qdq)
-        .def_prop_ro("value", &tc::QuantMode::value)
-        .def("is_set", &tc::QuantMode::isSet, nb::arg("mode"))
-        .def_prop_ro("has_int4_weights", &tc::QuantMode::hasInt4Weights)
-        .def_prop_ro("has_int8_weights", &tc::QuantMode::hasInt8Weights)
-        .def_prop_ro("has_activations", &tc::QuantMode::hasActivations)
-        .def_prop_ro("has_per_channel_scaling", &tc::QuantMode::hasPerChannelScaling)
-        .def_prop_ro("has_per_token_scaling", &tc::QuantMode::hasPerTokenScaling)
-        .def_prop_ro("has_per_group_scaling", &tc::QuantMode::hasPerGroupScaling)
-        .def_prop_ro("has_static_activation_scaling", &tc::QuantMode::hasStaticActivationScaling)
-        .def_prop_ro("has_int8_kv_cache", &tc::QuantMode::hasInt8KvCache)
-        .def_prop_ro("has_fp8_kv_cache", &tc::QuantMode::hasFp8KvCache)
-        .def_prop_ro("has_fp8_qdq", &tc::QuantMode::hasFp8Qdq)
-        .def_prop_ro("has_nvfp4", &tc::QuantMode::hasNvfp4)
-        .def_prop_ro("has_w4a8_mxfp4_fp8", &tc::QuantMode::hasW4a8Mxfp4Fp8)
-        .def_prop_ro("has_kv_cache_quant", &tc::QuantMode::hasKvCacheQuant)
-        .def_static("from_description", &tc::QuantMode::fromDescription, nb::arg("quantize_weights"),
-            nb::arg("quantize_activations"), nb::arg("per_token"), nb::arg("per_channel"), nb::arg("per_group"),
-            nb::arg("use_int4_weights"), nb::arg("use_int8_kv_cache"), nb::arg("use_fp8_kv_kache"),
-            nb::arg("use_fp8_qdq"), nb::arg("use_fp8_rowwise"), nb::arg("use_w4a8_qserve"), nb::arg("use_nvfp4"),
-            nb::arg("use_fp8_block_scales"), nb::arg("use_w4a8_mxfp4_fp8"))
-        .def_static("use_smooth_quant", &tc::QuantMode::useSmoothQuant, nb::arg("per_token") = false,
-            nb::arg("per_channel") = false)
-        .def_static("use_weight_only", &tc::QuantMode::useWeightOnly, nb::arg("use_int4_weights") = false,
-            nb::arg("per_group") = false)
-        .def_static("from_quant_algo", &tc::QuantMode::fromQuantAlgo, nb::arg("quant_algo") = nb::none(),
-            nb::arg("kv_cache_quant_algo") = nb::none())
-        .def(nb::self + nb::self)
-        .def(nb::self += nb::self)
-        .def(nb::self - nb::self)
-        .def(nb::self -= nb::self)
-        .def(nb::self == nb::self)
-        .def(nb::self != nb::self);
-
-    nb::class_<tr::ModelConfig>(m, "ModelConfig")
-        .def(nb::init<SizeType32, SizeType32, SizeType32, SizeType32, SizeType32, SizeType32, nvinfer1::DataType>(),
-            nb::arg("vocab_size"), nb::arg("num_layers"), nb::arg("num_attention_layers"), nb::arg("num_rnn_layers"),
-            nb::arg("num_heads"), nb::arg("hidden_size"), nb::arg("data_type"))
-        .def_prop_ro("vocab_size", &tr::ModelConfig::getVocabSize)
-        .def("vocab_size_padded", &tr::ModelConfig::getVocabSizePadded, nb::arg("world_size"))
-        .def("num_layers", &tr::ModelConfig::getNbLayers, nb::arg("pipeline_parallelism") = 1,
-            nb::arg("pipeline_parallelism_rank") = 0)
-        .def("num_attention_layers", &tr::ModelConfig::getNbAttentionLayers, nb::arg("pipeline_parallelism") = 1,
-            nb::arg("pipeline_parallelism_rank") = 0)
-        .def("num_rnn_layers", &tr::ModelConfig::getNbRnnLayers, nb::arg("pipeline_parallelism") = 1,
-            nb::arg("pipeline_parallelism_rank") = 0)
-        .def("num_kv_heads", &tr::ModelConfig::getNbKvHeads, nb::arg("layer_idx"))
-        .def("set_num_kv_heads", &tr::ModelConfig::setNbKvHeads, nb::arg("num_kv_heads"))
-        .def_prop_ro("num_heads", &tr::ModelConfig::getNbHeads)
-        .def_prop_ro("hidden_size", &tr::ModelConfig::getHiddenSize)
-        .def_prop_ro("size_per_head", &tr::ModelConfig::getSizePerHead)
-        .def_prop_ro("data_type", &tr::ModelConfig::getDataType)
-        .def_prop_ro("speculative_decoding_mode", &tr::ModelConfig::getSpeculativeDecodingMode)
-        .def_prop_rw("head_size", &tr::ModelConfig::getSizePerHead, &tr::ModelConfig::setSizePerHead)
-        .def_prop_rw(
-            "num_kv_heads_per_layer", &tr::ModelConfig::getNumKvHeadsPerLayer, &tr::ModelConfig::setNumKvHeadsPerLayer)
-        .def_prop_rw("use_gpt_attention_plugin",
-            nb::overload_cast<>(&tr::ModelConfig::useGptAttentionPlugin, nb::const_),
-            nb::overload_cast<bool>(&tr::ModelConfig::useGptAttentionPlugin))
-        .def_prop_rw("use_packed_input", nb::overload_cast<>(&tr::ModelConfig::usePackedInput, nb::const_),
-            nb::overload_cast<bool>(&tr::ModelConfig::usePackedInput))
-        .def_prop_rw("kv_cache_type", nb::overload_cast<>(&tr::ModelConfig::getKVCacheType, nb::const_),
-            nb::overload_cast<tr::ModelConfig::KVCacheType>(&tr::ModelConfig::setKVCacheType))
-        .def_prop_rw("tokens_per_block", &tr::ModelConfig::getTokensPerBlock, &tr::ModelConfig::setTokensPerBlock)
-        .def_prop_rw("quant_mode", &tr::ModelConfig::getQuantMode, &tr::ModelConfig::setQuantMode)
-        .def_prop_ro("supports_inflight_batching", &tr::ModelConfig::supportsInflightBatching)
-        .def_prop_rw("max_batch_size", &tr::ModelConfig::getMaxBatchSize, &tr::ModelConfig::setMaxBatchSize)
-        .def_prop_rw("max_beam_width", &tr::ModelConfig::getMaxBeamWidth, &tr::ModelConfig::setMaxBeamWidth)
-        .def_prop_rw("max_input_len", &tr::ModelConfig::getMaxInputLen, &tr::ModelConfig::setMaxInputLen)
-        .def_prop_rw("max_seq_len", &tr::ModelConfig::getMaxSequenceLen, &tr::ModelConfig::setMaxSequenceLen)
-        .def_prop_rw("max_num_tokens", &tr::ModelConfig::getMaxNumTokens, &tr::ModelConfig::setMaxNumTokens)
-        .def_prop_rw("max_prompt_embedding_table_size", &tr::ModelConfig::getMaxPromptEmbeddingTableSize,
-            &tr::ModelConfig::setMaxPromptEmbeddingTableSize)
-        .def_prop_ro("use_prompt_tuning", &tr::ModelConfig::usePromptTuning)
-        .def_prop_ro("use_mrope", &tr::ModelConfig::useMrope)
-        .def_prop_rw("use_lora_plugin", nb::overload_cast<>(&tr::ModelConfig::useLoraPlugin, nb::const_),
-            nb::overload_cast<bool>(&tr::ModelConfig::useLoraPlugin))
-        .def_prop_rw("layer_types", &tr::ModelConfig::getLayerTypes, &tr::ModelConfig::setLayerTypes)
-        .def_prop_rw("compute_context_logits", nb::overload_cast<>(&tr::ModelConfig::computeContextLogits, nb::const_),
-            nb::overload_cast<bool>(&tr::ModelConfig::computeContextLogits))
-        .def_prop_rw("compute_generation_logits",
-            nb::overload_cast<>(&tr::ModelConfig::computeGenerationLogits, nb::const_),
-            nb::overload_cast<bool>(&tr::ModelConfig::computeGenerationLogits))
-        .def_prop_rw("model_variant", &tr::ModelConfig::getModelVariant, &tr::ModelConfig::setModelVariant)
-        .def_prop_rw("use_cross_attention", &tr::ModelConfig::useCrossAttention, &tr::ModelConfig::setUseCrossAttention)
-        .def_prop_rw("lora_modules", &tr::ModelConfig::getLoraModules, &tr::ModelConfig::setLoraModules)
-        .def_prop_rw("max_lora_rank", &tr::ModelConfig::getMaxLoraRank, &tr::ModelConfig::setMaxLoraRank)
-        .def_prop_rw("mlp_hidden_size", &tr::ModelConfig::getMlpHiddenSize, &tr::ModelConfig::setMlpHiddenSize)
-        .def_prop_rw("size_per_head", &tr::ModelConfig::getSizePerHead, &tr::ModelConfig::setSizePerHead);
-
-    nb::class_<tr::WorldConfig>(m, "WorldConfig")
-        .def(nb::init<SizeType32, SizeType32, SizeType32, SizeType32, SizeType32,
-                 std::optional<std::vector<SizeType32>> const&, bool>(),
-            nb::arg("tensor_parallelism") = 1, nb::arg("pipeline_parallelism") = 1, nb::arg("context_parallelism") = 1,
-            nb::arg("rank") = 0, nb::arg("gpus_per_node") = tr::WorldConfig::kDefaultGpusPerNode,
-            nb::arg("device_ids") = nb::none(), nb::arg("enable_attention_dp") = false)
-        .def_prop_ro("size", &tr::WorldConfig::getSize)
-        .def_prop_ro("tensor_parallelism", &tr::WorldConfig::getTensorParallelism)
-        .def_prop_ro("pipeline_parallelism", &tr::WorldConfig::getPipelineParallelism)
-        .def_prop_ro("context_parallelism", &tr::WorldConfig::getContextParallelism)
-        .def_prop_ro("is_tensor_parallel", &tr::WorldConfig::isTensorParallel)
-        .def_prop_ro("is_pipeline_parallel", &tr::WorldConfig::isPipelineParallel)
-        .def_prop_ro("is_context_parallel", &tr::WorldConfig::isContextParallel)
-        .def_prop_ro("rank", &tr::WorldConfig::getRank)
-        .def_prop_ro("local_rank", &tr::WorldConfig::getLocalRank)
-        .def_prop_ro("node_rank", &tr::WorldConfig::getNodeRank)
-        .def_prop_ro("gpus_per_node", &tr::WorldConfig::getGpusPerNode)
-        .def_prop_ro("gpus_per_group", &tr::WorldConfig::getGpusPerGroup)
-        .def_prop_ro("device", &tr::WorldConfig::getDevice)
-        .def_prop_ro("pipeline_parallel_rank", &tr::WorldConfig::getPipelineParallelRank)
-        .def_prop_ro("tensor_parallel_rank", &tr::WorldConfig::getTensorParallelRank)
-        .def_prop_ro("context_parallel_rank", &tr::WorldConfig::getContextParallelRank)
-        .def_prop_ro("enable_attention_dp", &tr::WorldConfig::enableAttentionDP)
-        .def_static("mpi",
-            nb::overload_cast<SizeType32, std::optional<SizeType32>, std::optional<SizeType32>,
-                std::optional<SizeType32>, std::optional<std::vector<SizeType32>> const&, bool>(&tr::WorldConfig::mpi),
-            nb::arg("gpus_per_node") = tr::WorldConfig::kDefaultGpusPerNode, nb::arg("tensor_parallelism") = nb::none(),
-            nb::arg("pipeline_parallelism") = nb::none(), nb::arg("context_parallelism") = nb::none(),
-            nb::arg("device_ids") = nb::none(), nb::arg("enable_attention_dp") = false);
-
-    auto SamplingConfigGetState = [](tr::SamplingConfig const& config) -> nb::tuple
-    {
-        return nb::make_tuple(config.beamWidth, config.temperature, config.minLength, config.repetitionPenalty,
-            config.presencePenalty, config.frequencyPenalty, config.topK, config.topP, config.randomSeed,
-            config.topPDecay, config.topPMin, config.topPResetIds, config.beamSearchDiversityRate, config.lengthPenalty,
-            config.earlyStopping, config.noRepeatNgramSize, config.numReturnSequences, config.minP,
-            config.beamWidthArray);
-    };
-    auto SamplingConfigSetState = [](tr::SamplingConfig& self, nb::tuple t) -> tr::SamplingConfig
-    {
-        assert(t.size() == 19);
-
-        tr::SamplingConfig config;
-        config.beamWidth = nb::cast<SizeType32>(t[0]);
-        config.temperature = nb::cast<OptVec<float>>(t[1]);
-        config.minLength = nb::cast<OptVec<SizeType32>>(t[2]);
-        config.repetitionPenalty = nb::cast<OptVec<float>>(t[3]);
-        config.presencePenalty = nb::cast<OptVec<float>>(t[4]);
-        config.frequencyPenalty = nb::cast<OptVec<float>>(t[5]);
-        config.topK = nb::cast<OptVec<SizeType32>>(t[6]);
-        config.topP = nb::cast<OptVec<float>>(t[7]);
-        config.randomSeed = nb::cast<OptVec<uint64_t>>(t[8]);
-        config.topPDecay = nb::cast<OptVec<float>>(t[9]);
-        config.topPMin = nb::cast<OptVec<float>>(t[10]);
-        config.topPResetIds = nb::cast<OptVec<TokenIdType>>(t[11]);
-        config.beamSearchDiversityRate = nb::cast<OptVec<float>>(t[12]);
-        config.lengthPenalty = nb::cast<OptVec<float>>(t[13]);
-        config.earlyStopping = nb::cast<OptVec<SizeType32>>(t[14]);
-        config.noRepeatNgramSize = nb::cast<OptVec<SizeType32>>(t[15]);
-        config.numReturnSequences = nb::cast<SizeType32>(t[16]);
-        config.minP = nb::cast<OptVec<float>>(t[17]);
-        config.beamWidthArray = nb::cast<OptVec<std::vector<SizeType32>>>(t[18]);
-
-        return config;
-    };
-
-    nb::class_<tr::SamplingConfig>(m, "SamplingConfig")
-        .def(nb::init<SizeType32>(), nb::arg("beam_width") = 1)
-        .def(nb::init<tle::SamplingConfig, std::optional<tle::ExternalDraftTokensConfig>>(),
-            nb::arg("executor_sample_config"), nb::arg("external_draft_tokens_config") = std::nullopt)
-        .def_rw("beam_width", &tr::SamplingConfig::beamWidth)
-        .def_rw("temperature", &tr::SamplingConfig::temperature)
-        .def_rw("min_length", &tr::SamplingConfig::minLength)
-        .def_rw("repetition_penalty", &tr::SamplingConfig::repetitionPenalty)
-        .def_rw("presence_penalty", &tr::SamplingConfig::presencePenalty)
-        .def_rw("frequency_penalty", &tr::SamplingConfig::frequencyPenalty)
-        .def_rw("top_k", &tr::SamplingConfig::topK)
-        .def_rw("top_p", &tr::SamplingConfig::topP)
-        .def_rw("random_seed", &tr::SamplingConfig::randomSeed)
-        .def_rw("top_p_decay", &tr::SamplingConfig::topPDecay)
-        .def_rw("top_p_min", &tr::SamplingConfig::topPMin)
-        .def_rw("top_p_reset_ids", &tr::SamplingConfig::topPResetIds)
-        .def_rw("beam_search_diversity_rate", &tr::SamplingConfig::beamSearchDiversityRate)
-        .def_rw("length_penalty", &tr::SamplingConfig::lengthPenalty)
-        .def_rw("early_stopping", &tr::SamplingConfig::earlyStopping)
-        .def_rw("no_repeat_ngram_size", &tr::SamplingConfig::noRepeatNgramSize)
-        .def_rw("num_return_sequences", &tr::SamplingConfig::numReturnSequences)
-        .def_rw("min_p", &tr::SamplingConfig::minP)
-        .def_rw("beam_width_array", &tr::SamplingConfig::beamWidthArray)
-        .def_rw("normalize_log_probs", &tr::SamplingConfig::normalizeLogProbs)
-        .def("__getstate__", SamplingConfigGetState)
-        .def("__setstate__", SamplingConfigSetState)
-        .def("__eq__", &tr::SamplingConfig::operator==);
-
-    nb::bind_vector<std::vector<tr::SamplingConfig>>(m, "SamplingConfigVector");
-
-    m.def("make_sampling_config", &makeSamplingConfig, nb::arg("configs"));
-
-    nb::class_<tr::GptJsonConfig>(m, "GptJsonConfig")
-        .def(nb::init<std::string, std::string, std::string, SizeType32, SizeType32, SizeType32, SizeType32,
-                 tr::ModelConfig, std::optional<tr::RuntimeDefaults>>(),
-            nb::arg("name"), nb::arg("version"), nb::arg("precision"), nb::arg("tensor_parallelism"),
-            nb::arg("pipeline_parallelism"), nb::arg("context_parallelism"), nb::arg("gpus_per_node"),
-            nb::arg("model_config"), nb::arg("runtime_defaults") = nb::none())
-        .def_static("parse", nb::overload_cast<std::string const&>(&tr::GptJsonConfig::parse), nb::arg("json"))
-        .def_static(
-            "parse_file", nb::overload_cast<std::filesystem::path const&>(&tr::GptJsonConfig::parse), nb::arg("path"))
-        .def_prop_ro("model_config", &tr::GptJsonConfig::getModelConfig)
-        .def_prop_ro("name", &tr::GptJsonConfig::getName)
-        .def_prop_ro("version", &tr::GptJsonConfig::getVersion)
-        .def_prop_ro("precision", &tr::GptJsonConfig::getPrecision)
-        .def_prop_ro("tensor_parallelism", &tr::GptJsonConfig::getTensorParallelism)
-        .def_prop_ro("pipeline_parallelism", &tr::GptJsonConfig::getPipelineParallelism)
-        .def_prop_ro("context_parallelism", &tr::GptJsonConfig::getContextParallelism)
-        .def_prop_ro("gpus_per_node", &tr::GptJsonConfig::getGpusPerNode)
-        .def_prop_ro("world_size", &tr::GptJsonConfig::getWorldSize)
-        .def_prop_ro("runtime_defaults", &tr::GptJsonConfig::getRuntimeDefaults)
-        .def("engine_filename",
-            nb::overload_cast<tr::WorldConfig const&, std::string const&>(
-                &tr::GptJsonConfig::engineFilename, nb::const_),
-            nb::arg("world_config"), nb::arg("model"))
-        .def("engine_filename",
-            nb::overload_cast<tr::WorldConfig const&>(&tr::GptJsonConfig::engineFilename, nb::const_),
-            nb::arg("world_config"));
-
-    nb::enum_<tb::LlmRequestState>(m, "LlmRequestState")
-        .value("UNKNOWN", tb::LlmRequestState::kUNKNOWN)
-        .value("ENCODER_INIT", tb::LlmRequestState::kENCODER_INIT)
-        .value("CONTEXT_INIT", tb::LlmRequestState::kCONTEXT_INIT)
-        .value("GENERATION_IN_PROGRESS", tb::LlmRequestState::kGENERATION_IN_PROGRESS)
-        .value("GENERATION_TO_COMPLETE", tb::LlmRequestState::kGENERATION_TO_COMPLETE)
-        .value("GENERATION_COMPLETE", tb::LlmRequestState::kGENERATION_COMPLETE)
-        .value("DISAGG_GENERATION_INIT", tb::LlmRequestState::kDISAGG_GENERATION_INIT)
-        .value("DISAGG_CONTEXT_TRANS_IN_PROGRESS", tb::LlmRequestState::kDISAGG_CONTEXT_TRANS_IN_PROGRESS)
-        .value("DISAGG_CONTEXT_COMPLETE", tb::LlmRequestState::kDISAGG_CONTEXT_COMPLETE)
-        .value("DISAGG_GENERATION_TRANS_IN_PROGRESS", tb::LlmRequestState::kDISAGG_GENERATION_TRANS_IN_PROGRESS)
-        .value("DISAGG_GENERATION_TRANS_COMPLETE", tb::LlmRequestState::kDISAGG_GENERATION_TRANS_COMPLETE)
-        .value("DISAGG_CONTEXT_INIT_AND_TRANS", tb::LlmRequestState::kDISAGG_CONTEXT_INIT_AND_TRANS);
-
-    nb::class_<tr::MemoryCounters>(m, "MemoryCounters")
-        .def_static("instance", &tr::MemoryCounters::getInstance, nb::rv_policy::reference)
-        .def_prop_ro("gpu", &tr::MemoryCounters::getGpu)
-        .def_prop_ro("cpu", &tr::MemoryCounters::getCpu)
-        .def_prop_ro("pinned", &tr::MemoryCounters::getPinned)
-        .def_prop_ro("uvm", &tr::MemoryCounters::getUVM);
-
-    tensorrt_llm::nanobind::runtime::initBindings(mInternalRuntime);
-    tensorrt_llm::nanobind::testing::initBindings(mInternalTesting);
-    tpb::initBindings(mInternalBatchManager);
-    tb::kv_cache_manager::KVCacheManagerBindings::initBindings(mInternalBatchManager);
-    tb::BasePeftCacheManagerBindings::initBindings(mInternalBatchManager);
-    tb::CacheTransceiverBindings::initBindings(mInternalBatchManager);
-    tpb::Buffers::initBindings(mInternalBatchManager);
-
-    auto mInternalAlgorithms = mInternal.def_submodule("algorithms", "Algorithms internal bindings");
-    tpb::algorithms::initBindings(mInternalAlgorithms);
-
-    auto mUserbuffers = mInternal.def_submodule("userbuffers", "User buffers internal bindings");
-    tensorrt_llm::kernels::userbuffers::UserBufferBindings::initBindings(mUserbuffers);
-
-    // NVLS allocators
-    nb::class_<tr::IpcNvlsHandle>(m, "IpcNvlsHandle")
-        .def(nb::init<>())
-        .def_rw("uc_ptr", &tr::IpcNvlsHandle::uc_ptr)
-        .def_rw("mc_ptr", &tr::IpcNvlsHandle::mc_ptr)
-        .def_rw("size", &tr::IpcNvlsHandle::size)
-        .def("get_ipc_ptrs",
-            [](tr::IpcNvlsHandle& self) { return reinterpret_cast<uintptr_t>(self.ipc_uc_ptrs.data()); });
-
-    m.def("ipc_nvls_allocate", &tr::ipcNvlsAllocate, nb::rv_policy::reference);
-    m.def("ipc_nvls_free", &tr::ipcNvlsFree);
-    m.def("ipc_nvls_supported", &tr::ipcNvlsSupported);
 }
diff --git a/cpp/tensorrt_llm/nanobind/common/bindTypes.h b/cpp/tensorrt_llm/nanobind/common/bindTypes.h
deleted file mode 100644
index 5cd714e458a..00000000000
--- a/cpp/tensorrt_llm/nanobind/common/bindTypes.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <nanobind/make_iterator.h>
-#include <nanobind/nanobind.h>
-#include <nanobind/stl/string.h>
-
-namespace PybindUtils
-{
-
-namespace nb = nanobind;
-
-template <typename T>
-void bindList(nb::module_& m, std::string const& name)
-{
-    nb::class_<T>(m, name.c_str())
-        .def(nb::init<>())
-        .def("push_back", [](T& lst, const typename T::value_type& value) { lst.push_back(value); })
-        .def("pop_back", [](T& lst) { lst.pop_back(); })
-        .def("push_front", [](T& lst, const typename T::value_type& value) { lst.push_front(value); })
-        .def("pop_front", [](T& lst) { lst.pop_front(); })
-        .def("__len__", [](T const& lst) { return lst.size(); })
-        .def(
-            "__iter__", [](T& lst) { return nb::make_iterator(nb::type<T>(), "iterator", lst.begin(), lst.end()); },
-            nb::keep_alive<0, 1>())
-        .def("__getitem__",
-            [](T const& lst, size_t index)
-            {
-                if (index >= lst.size())
-                    throw nb::index_error();
-                auto it = lst.begin();
-                std::advance(it, index);
-                return *it;
-            })
-        .def("__setitem__",
-            [](T& lst, size_t index, const typename T::value_type& value)
-            {
-                if (index >= lst.size())
-                    throw nb::index_error();
-                auto it = lst.begin();
-                std::advance(it, index);
-                *it = value;
-            });
-}
-
-template <typename T>
-void bindSet(nb::module_& m, std::string const& name)
-{
-    nb::class_<T>(m, name.c_str())
-        .def(nb::init<>())
-        .def("clear", &T::clear)
-        .def("size", &T::size)
-        .def("insert", [](T& s, typename T::value_type const& value) { s.insert(value); })
-        .def("erase", nb::overload_cast<typename T::value_type const&>(&T::erase))
-        .def("__len__", [](T const& lst) { return lst.size(); })
-        .def("__contains__", [](T const& s, typename T::value_type x) { return s.find(x) != s.end(); })
-        .def(
-            "__iter__", [](T& s) { return nb::make_iterator(nb::type<T>(), "iterator", s.begin(), s.end()); },
-            nb::keep_alive<0, 1>())
-        .def("__eq__", [](T const& s, T const& other) { return s == other; })
-        .def("__getstate__",
-            [](T const& v)
-            {
-                /* Return a tuple that fully encodes the state of the object */
-                return nb::make_tuple(std::vector<typename T::value_type>(v.begin(), v.end()));
-            })
-        .def("__setstate__",
-            [](T& v, nb::tuple const& t)
-            {
-                if (t.size() != 1)
-                    throw std::runtime_error("Invalid state!");
-                /* Create a new C++ instance */
-                T s;
-                /* Assign any additional state */
-                auto state_list = nb::cast<std::vector<typename T::value_type>>(t[0]);
-                for (auto& item : state_list)
-                {
-                    s.insert(item);
-                }
-                return s;
-            });
-}
-
-} // namespace PybindUtils
diff --git a/cpp/tensorrt_llm/nanobind/common/customCasters.h b/cpp/tensorrt_llm/nanobind/common/customCasters.h
deleted file mode 100644
index 7cfa07d249a..00000000000
--- a/cpp/tensorrt_llm/nanobind/common/customCasters.h
+++ /dev/null
@@ -1,345 +0,0 @@
-/*
- * Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "tensorrt_llm/batch_manager/common.h"
-#include "tensorrt_llm/batch_manager/decoderBuffers.h"
-#include "tensorrt_llm/common/optionalRef.h"
-#include "tensorrt_llm/runtime/cudaStream.h"
-#include "tensorrt_llm/runtime/request.h"
-#include "tensorrt_llm/runtime/samplingConfig.h"
-#include "tensorrt_llm/runtime/torch.h"
-#include "tensorrt_llm/runtime/torchView.h"
-
-#include <ATen/DLConvertor.h>
-#include <deque>
-#include <filesystem>
-#include <nanobind/nanobind.h>
-#include <nanobind/stl/filesystem.h>
-#include <nanobind/stl/optional.h>
-#include <nanobind/stl/string.h>
-#include <nanobind/stl/vector.h>
-#include <torch/csrc/autograd/python_variable.h>
-#include <torch/csrc/autograd/variable.h>
-#include <torch/extension.h>
-#include <torch/torch.h>
-
-// Pybind requires to have a central include in order for type casters to work.
-// Opaque bindings add a type caster, so they have the same requirement.
-// See the warning in https://pybind11.readthedocs.io/en/stable/advanced/cast/custom.html
-
-// Opaque bindings
-NB_MAKE_OPAQUE(tensorrt_llm::batch_manager::ReqIdsSet)
-NB_MAKE_OPAQUE(std::vector<tensorrt_llm::batch_manager::SlotDecoderBuffers>)
-NB_MAKE_OPAQUE(std::vector<tensorrt_llm::runtime::decoder_batch::Request>)
-NB_MAKE_OPAQUE(std::vector<tensorrt_llm::runtime::SamplingConfig>)
-NB_MAKE_OPAQUE(std::vector<std::vector<tensorrt_llm::runtime::SizeType32>>)
-
-namespace nb = nanobind;
-
-// Custom casters
-namespace NB_NAMESPACE
-{
-
-namespace detail
-{
-
-template <typename T, typename Alloc>
-struct type_caster<std::deque<T, Alloc>>
-{
-    using Type = std::deque<T, Alloc>;
-    NB_TYPE_CASTER(Type, const_name("List"));
-
-    bool from_python(handle src, uint8_t flags, cleanup_list* cleanup) noexcept
-    {
-        sequence seq(src, nanobind::detail::borrow_t{});
-        value.clear();
-        make_caster<T> caster;
-        for (auto const& item : seq)
-        {
-            if (!caster.from_python(item, flags, cleanup))
-                return false;
-            value.push_back(caster.operator T&());
-        }
-        return true;
-    }
-
-    static handle from_cpp(Type const& deque, rv_policy policy, cleanup_list* cleanup) noexcept
-    {
-        nb::list list;
-
-        for (auto const& item : deque)
-        {
-            nb::object py_item = steal(make_caster<T>::from_cpp(item, policy, cleanup));
-            if (!py_item)
-                return {};
-            list.append(py_item);
-        }
-        return list.release();
-    }
-};
-
-template <typename T>
-struct type_caster<tensorrt_llm::common::OptionalRef<T>>
-{
-    using value_conv = make_caster<T>;
-
-    NB_TYPE_CASTER(tensorrt_llm::common::OptionalRef<T>, value_conv::Name);
-
-    bool from_python(handle src, uint8_t flags, cleanup_list* cleanup)
-    {
-        if (src.is_none())
-        {
-            // If the Python object is None, create an empty OptionalRef
-            value = tensorrt_llm::common::OptionalRef<T>();
-            return true;
-        }
-
-        value_conv conv;
-        if (!conv.from_python(src, flags, cleanup))
-            return false;
-
-        // Create an OptionalRef with a reference to the converted value
-        value = tensorrt_llm::common::OptionalRef<T>(conv);
-        return true;
-    }
-
-    static handle from_cpp(tensorrt_llm::common::OptionalRef<T> const& src, rv_policy policy, cleanup_list* cleanup)
-    {
-        if (!src.has_value())
-            return none().release();
-
-        return value_conv::from_cpp(*src, policy, cleanup);
-    }
-};
-
-template <typename T>
-struct PathCaster
-{
-
-private:
-    static PyObject* unicode_from_fs_native(std::string const& w)
-    {
-        return PyUnicode_DecodeFSDefaultAndSize(w.c_str(), ssize_t(w.size()));
-    }
-
-    static PyObject* unicode_from_fs_native(std::wstring const& w)
-    {
-        return PyUnicode_FromWideChar(w.c_str(), ssize_t(w.size()));
-    }
-
-public:
-    static handle from_cpp(T const& path, rv_policy, cleanup_list* cleanup)
-    {
-        if (auto py_str = unicode_from_fs_native(path.native()))
-        {
-            return module_::import_("pathlib").attr("Path")(steal<object>(py_str), cleanup).release();
-        }
-        return nullptr;
-    }
-
-    bool from_python(handle src, uint8_t flags, cleanup_list* cleanup)
-    {
-        PyObject* native = nullptr;
-        if constexpr (std::is_same_v<typename T::value_type, char>)
-        {
-            if (PyUnicode_FSConverter(src.ptr(), &native) != 0)
-            {
-                if (auto* c_str = PyBytes_AsString(native))
-                {
-                    // AsString returns a pointer to the internal buffer, which
-                    // must not be free'd.
-                    value = c_str;
-                }
-            }
-        }
-        else if constexpr (std::is_same_v<typename T::value_type, wchar_t>)
-        {
-            if (PyUnicode_FSDecoder(src.ptr(), &native) != 0)
-            {
-                if (auto* c_str = PyUnicode_AsWideCharString(native, nullptr))
-                {
-                    // AsWideCharString returns a new string that must be free'd.
-                    value = c_str; // Copies the string.
-                    PyMem_Free(c_str);
-                }
-            }
-        }
-        Py_XDECREF(native);
-        if (PyErr_Occurred())
-        {
-            PyErr_Clear();
-            return false;
-        }
-        return true;
-    }
-
-    NB_TYPE_CASTER(T, const_name("os.PathLike"));
-};
-
-template <>
-class type_caster<tensorrt_llm::executor::StreamPtr>
-{
-public:
-    NB_TYPE_CASTER(tensorrt_llm::executor::StreamPtr, const_name("int"));
-
-    bool from_python([[maybe_unused]] handle src, uint8_t flags, cleanup_list* cleanup)
-    {
-        auto stream_ptr = nanobind::cast<uintptr_t>(src);
-        value = std::make_shared<tensorrt_llm::runtime::CudaStream>(reinterpret_cast<cudaStream_t>(stream_ptr));
-
-        return true;
-    }
-
-    static handle from_cpp(
-        tensorrt_llm::executor::StreamPtr const& src, rv_policy /* policy */, cleanup_list* /* cleanup */)
-    {
-        // Return cudaStream_t as integer.
-        return PyLong_FromVoidPtr(src->get());
-    }
-};
-
-template <>
-struct type_caster<tensorrt_llm::executor::Tensor>
-{
-public:
-    NB_TYPE_CASTER(tensorrt_llm::executor::Tensor, const_name("torch.Tensor"));
-
-    // Convert PyObject(torch.Tensor) -> tensorrt_llm::executor::Tensor
-    bool from_python(handle src, uint8_t flags, cleanup_list* cleanup)
-    {
-        PyObject* obj = src.ptr();
-        if (THPVariable_Check(obj))
-        {
-            at::Tensor const& t = THPVariable_Unpack(obj);
-            value = tensorrt_llm::executor::detail::ofITensor(tensorrt_llm::runtime::TorchView::of(t));
-            return true;
-        }
-        return false;
-    }
-
-    // Convert tensorrt_llm::executor::Tensor -> PyObject(torch.Tensor)
-    static handle from_cpp(
-        tensorrt_llm::executor::Tensor const& src, rv_policy /* policy */, cleanup_list* /* cleanup */)
-    {
-        return THPVariable_Wrap(tensorrt_llm::runtime::Torch::tensor(tensorrt_llm::executor::detail::toITensor(src)));
-    }
-};
-
-template <>
-struct type_caster<tensorrt_llm::runtime::ITensor::SharedPtr>
-{
-public:
-    NB_TYPE_CASTER(tensorrt_llm::runtime::ITensor::SharedPtr, const_name("torch.Tensor"));
-
-    // Convert PyObject(torch.Tensor) -> tensorrt_llm::runtime::ITensor::SharedPtr
-    bool from_python(handle src, uint8_t, cleanup_list*)
-    {
-        PyObject* obj = src.ptr();
-        if (THPVariable_Check(obj))
-        {
-            at::Tensor const& t = THPVariable_Unpack(obj);
-            value = std::move(tensorrt_llm::runtime::TorchView::of(t));
-            return true;
-        }
-        return false;
-    }
-
-    // Convert tensorrt_llm::runtime::ITensor::SharedPtr -> PyObject(torch.Tensor)
-    static handle from_cpp(
-        tensorrt_llm::runtime::ITensor::SharedPtr const& src, rv_policy /* policy */, cleanup_list* /* cleanup */)
-    {
-        if (src == nullptr)
-        {
-            return none().release();
-        }
-        return THPVariable_Wrap(tensorrt_llm::runtime::Torch::tensor(src));
-    }
-};
-
-template <>
-struct type_caster<tensorrt_llm::runtime::ITensor::SharedConstPtr>
-{
-public:
-    NB_TYPE_CASTER(tensorrt_llm::runtime::ITensor::SharedConstPtr, const_name("torch.Tensor"));
-
-    // Convert PyObject(torch.Tensor) -> tensorrt_llm::runtime::ITensor::SharedConstPtr
-    bool from_python(handle src, uint8_t, cleanup_list*)
-    {
-        PyObject* obj = src.ptr();
-        if (THPVariable_Check(obj))
-        {
-            at::Tensor const& t = THPVariable_Unpack(obj);
-            value = std::move(tensorrt_llm::runtime::TorchView::of(t));
-            return true;
-        }
-        return false;
-    }
-
-    // Convert tensorrt_llm::runtime::ITensor::SharedConstPtr -> PyObject(torch.Tensor)
-    static handle from_cpp(
-        tensorrt_llm::runtime::ITensor::SharedConstPtr const& src, rv_policy /* policy */, cleanup_list* /* cleanup */)
-    {
-        if (src == nullptr)
-        {
-            return none().release();
-        }
-        return THPVariable_Wrap(tensorrt_llm::runtime::Torch::tensor(
-            reinterpret_cast<tensorrt_llm::runtime::ITensor::SharedPtr const&>(src)));
-    }
-};
-
-template <>
-struct type_caster<at::Tensor>
-{
-    NB_TYPE_CASTER(at::Tensor, const_name("torch.Tensor"));
-
-    bool from_python(nb::handle src, uint8_t, cleanup_list*) noexcept
-    {
-        nb::object capsule = nb::getattr(src, "__dlpack__")();
-        DLManagedTensor* dl_managed = static_cast<DLManagedTensor*>(PyCapsule_GetPointer(capsule.ptr(), "dltensor"));
-        PyCapsule_SetDestructor(capsule.ptr(), nullptr);
-        value = at::fromDLPack(dl_managed).alias();
-        return true;
-    }
-
-    static handle from_cpp(at::Tensor tensor, rv_policy, cleanup_list*) noexcept
-    {
-        DLManagedTensor* dl_managed = at::toDLPack(tensor);
-        if (!dl_managed)
-            return nullptr;
-
-        nanobind::object capsule = nb::steal(PyCapsule_New(dl_managed, "dltensor",
-            [](PyObject* obj)
-            {
-                DLManagedTensor* dl = static_cast<DLManagedTensor*>(PyCapsule_GetPointer(obj, "dltensor"));
-                dl->deleter(dl);
-            }));
-        if (!capsule.is_valid())
-        {
-            dl_managed->deleter(dl_managed);
-            return nullptr;
-        }
-        nanobind::module_ torch = nanobind::module_::import_("torch");
-        nanobind::object result = torch.attr("from_dlpack")(capsule);
-        capsule.release();
-        return result.release();
-    }
-};
-} // namespace detail
-} // namespace NB_NAMESPACE
diff --git a/cpp/tensorrt_llm/nanobind/executor/bindings.cpp b/cpp/tensorrt_llm/nanobind/executor/bindings.cpp
deleted file mode 100644
index d3f482df899..00000000000
--- a/cpp/tensorrt_llm/nanobind/executor/bindings.cpp
+++ /dev/null
@@ -1,263 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "bindings.h"
-#include "executor.h"
-#include "executorConfig.h"
-#include "request.h"
-#include "tensorrt_llm/executor/executor.h"
-#include "tensorrt_llm/executor/types.h"
-#include "tensorrt_llm/nanobind/common/customCasters.h"
-
-#include <nanobind/nanobind.h>
-#include <nanobind/stl/optional.h>
-#include <nanobind/stl/string.h>
-#include <nanobind/stl/tuple.h>
-#include <nanobind/stl/variant.h>
-#include <optional>
-
-namespace nb = nanobind;
-namespace tle = tensorrt_llm::executor;
-using SizeType32 = tle::SizeType32;
-
-namespace tensorrt_llm::nanobind::executor
-{
-
-template <typename T>
-void instantiateEventDiff(nb::module_& m, std::string const& name)
-{
-    nb::class_<tle::KVCacheEventDiff<T>>(m, ("KVCacheEventDiff" + name).c_str())
-        .def_ro("old_value", &tle::KVCacheEventDiff<T>::oldValue)
-        .def_ro("new_value", &tle::KVCacheEventDiff<T>::newValue);
-}
-
-void initBindings(nb::module_& m)
-{
-    m.attr("__version__") = tle::version();
-    nb::enum_<tle::ModelType>(m, "ModelType")
-        .value("DECODER_ONLY", tle::ModelType::kDECODER_ONLY)
-        .value("ENCODER_ONLY", tle::ModelType::kENCODER_ONLY)
-        .value("ENCODER_DECODER", tle::ModelType::kENCODER_DECODER);
-
-    auto decodingModeGetstate = [](tle::DecodingMode const& self) { return nb::make_tuple(self.getState()); };
-    auto decodingModeSetstate = [](tle::DecodingMode& self, nb::tuple const& state)
-    {
-        if (state.size() != 1)
-        {
-            throw std::runtime_error("Invalid state!");
-        }
-        new (&self) tle::DecodingMode(nb::cast<tle::DecodingMode::UnderlyingType>(state[0]));
-    };
-    nb::class_<tle::DecodingMode>(m, "DecodingMode")
-        .def("Auto", &tle::DecodingMode::Auto)
-        .def("TopK", &tle::DecodingMode::TopK)
-        .def("TopP", &tle::DecodingMode::TopP)
-        .def("TopKTopP", &tle::DecodingMode::TopKTopP)
-        .def("BeamSearch", &tle::DecodingMode::BeamSearch)
-        .def("Medusa", &tle::DecodingMode::Medusa)
-        .def("Lookahead", &tle::DecodingMode::Lookahead)
-        .def("ExplicitDraftTokens", &tle::DecodingMode::ExplicitDraftTokens)
-        .def("Eagle", &tle::DecodingMode::Eagle)
-        .def("isAuto", &tle::DecodingMode::isAuto)
-        .def("isTopK", &tle::DecodingMode::isTopK)
-        .def("isTopP", &tle::DecodingMode::isTopP)
-        .def("isTopKorTopP", &tle::DecodingMode::isTopKorTopP)
-        .def("isTopKandTopP", &tle::DecodingMode::isTopKandTopP)
-        .def("isBeamSearch", &tle::DecodingMode::isBeamSearch)
-        .def("isMedusa", &tle::DecodingMode::isMedusa)
-        .def("isLookahead", &tle::DecodingMode::isLookahead)
-        .def("isExplicitDraftTokens", &tle::DecodingMode::isExplicitDraftTokens)
-        .def("isEagle", &tle::DecodingMode::isEagle)
-        .def("useVariableBeamWidthSearch", &tle::DecodingMode::useVariableBeamWidthSearch)
-        .def_prop_ro("name", &tle::DecodingMode::getName)
-        .def("__getstate__", decodingModeGetstate)
-        .def("__setstate__", decodingModeSetstate);
-
-    nb::enum_<tle::CapacitySchedulerPolicy>(m, "CapacitySchedulerPolicy")
-        .value("MAX_UTILIZATION", tle::CapacitySchedulerPolicy::kMAX_UTILIZATION)
-        .value("GUARANTEED_NO_EVICT", tle::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT)
-        .value("STATIC_BATCH", tle::CapacitySchedulerPolicy::kSTATIC_BATCH);
-
-    nb::enum_<tle::ContextChunkingPolicy>(m, "ContextChunkingPolicy")
-        .value("EQUAL_PROGRESS", tle::ContextChunkingPolicy::kEQUAL_PROGRESS)
-        .value("FIRST_COME_FIRST_SERVED", tle::ContextChunkingPolicy::kFIRST_COME_FIRST_SERVED);
-
-    nb::enum_<tle::CommunicationType>(m, "CommunicationType").value("MPI", tle::CommunicationType::kMPI);
-
-    nb::enum_<tle::CommunicationMode>(m, "CommunicationMode")
-        .value("LEADER", tle::CommunicationMode::kLEADER)
-        .value("ORCHESTRATOR", tle::CommunicationMode::kORCHESTRATOR);
-
-    nb::class_<tle::KvCacheStats>(m, "KvCacheStats")
-        .def(nb::init<>())
-        .def_rw("max_num_blocks", &tle::KvCacheStats::maxNumBlocks)
-        .def_rw("free_num_blocks", &tle::KvCacheStats::freeNumBlocks)
-        .def_rw("used_num_blocks", &tle::KvCacheStats::usedNumBlocks)
-        .def_rw("tokens_per_block", &tle::KvCacheStats::tokensPerBlock)
-        .def_rw("alloc_total_blocks", &tle::KvCacheStats::allocTotalBlocks)
-        .def_rw("alloc_new_blocks", &tle::KvCacheStats::allocNewBlocks)
-        .def_rw("reused_blocks", &tle::KvCacheStats::reusedBlocks)
-        .def_rw("missed_blocks", &tle::KvCacheStats::missedBlocks)
-        .def_rw("cache_hit_rate", &tle::KvCacheStats::cacheHitRate);
-
-    nb::class_<tle::StaticBatchingStats>(m, "StaticBatchingStats")
-        .def(nb::init<>())
-        .def_rw("num_scheduled_requests", &tle::StaticBatchingStats::numScheduledRequests)
-        .def_rw("num_context_requests", &tle::StaticBatchingStats::numContextRequests)
-        .def_rw("num_ctx_tokens", &tle::StaticBatchingStats::numCtxTokens)
-        .def_rw("num_gen_tokens", &tle::StaticBatchingStats::numGenTokens)
-        .def_rw("empty_gen_slots", &tle::StaticBatchingStats::emptyGenSlots);
-
-    nb::class_<tle::InflightBatchingStats>(m, "InflightBatchingStats")
-        .def(nb::init<>())
-        .def_rw("num_scheduled_requests", &tle::InflightBatchingStats::numScheduledRequests)
-        .def_rw("num_context_requests", &tle::InflightBatchingStats::numContextRequests)
-        .def_rw("num_gen_requests", &tle::InflightBatchingStats::numGenRequests)
-        .def_rw("num_paused_requests", &tle::InflightBatchingStats::numPausedRequests)
-        .def_rw("num_ctx_tokens", &tle::InflightBatchingStats::numCtxTokens)
-        .def_rw("micro_batch_id", &tle::InflightBatchingStats::microBatchId)
-        .def_rw("avg_num_decoded_tokens_per_iter", &tle::InflightBatchingStats::avgNumDecodedTokensPerIter);
-
-    nb::class_<tle::SpecDecodingStats>(m, "SpecDecodingStats")
-        .def(nb::init<>())
-        .def_rw("num_draft_tokens", &tle::SpecDecodingStats::numDraftTokens)
-        .def_rw("num_accepted_tokens", &tle::SpecDecodingStats::numAcceptedTokens)
-        .def_rw("num_requests_with_draft_tokens", &tle::SpecDecodingStats::numRequestsWithDraftTokens)
-        .def_rw("acceptance_length", &tle::SpecDecodingStats::acceptanceLength)
-        .def_rw("iter_latency_ms", &tle::SpecDecodingStats::iterLatencyMS)
-        .def_rw("draft_overhead", &tle::SpecDecodingStats::draftOverhead);
-
-    nb::class_<tle::IterationStats>(m, "IterationStats")
-        .def(nb::init<>())
-        .def_rw("timestamp", &tle::IterationStats::timestamp)
-        .def_rw("iter", &tle::IterationStats::iter)
-        .def_rw("iter_latency_ms", &tle::IterationStats::iterLatencyMS)
-        .def_rw("new_active_requests_queue_latency_ms", &tle::IterationStats::newActiveRequestsQueueLatencyMS)
-        .def_rw("num_new_active_requests", &tle::IterationStats::numNewActiveRequests)
-        .def_rw("num_active_requests", &tle::IterationStats::numActiveRequests)
-        .def_rw("num_queued_requests", &tle::IterationStats::numQueuedRequests)
-        .def_rw("num_completed_requests", &tle::IterationStats::numCompletedRequests)
-        .def_rw("max_num_active_requests", &tle::IterationStats::maxNumActiveRequests)
-        .def_rw("gpu_mem_usage", &tle::IterationStats::gpuMemUsage)
-        .def_rw("cpu_mem_usage", &tle::IterationStats::cpuMemUsage)
-        .def_rw("pinned_mem_usage", &tle::IterationStats::pinnedMemUsage)
-        .def_rw("kv_cache_stats", &tle::IterationStats::kvCacheStats)
-        .def_rw("cross_kv_cache_stats", &tle::IterationStats::crossKvCacheStats)
-        .def_rw("static_batching_stats", &tle::IterationStats::staticBatchingStats)
-        .def_rw("inflight_batching_stats", &tle::IterationStats::inflightBatchingStats)
-        .def_rw("specdec_stats", &tle::IterationStats::specDecodingStats)
-        .def("to_json_str",
-            [](tle::IterationStats const& iterationStats)
-            { return tle::JsonSerialization::toJsonStr(iterationStats); });
-
-    nb::class_<tle::DebugTensorsPerIteration>(m, "DebugTensorsPerIteration")
-        .def(nb::init<>())
-        .def_rw("iter", &tle::DebugTensorsPerIteration::iter)
-        .def_rw("debug_tensors", &tle::DebugTensorsPerIteration::debugTensors);
-
-    nb::enum_<tle::RequestStage>(m, "RequestStage")
-        .value("QUEUED", tle::RequestStage::kQUEUED)
-        .value("ENCODER_IN_PROGRESS", tle::RequestStage::kENCODER_IN_PROGRESS)
-        .value("CONTEXT_IN_PROGRESS", tle::RequestStage::kCONTEXT_IN_PROGRESS)
-        .value("GENERATION_IN_PROGRESS", tle::RequestStage::kGENERATION_IN_PROGRESS)
-        .value("GENERATION_COMPLETE", tle::RequestStage::kGENERATION_COMPLETE);
-
-    nb::class_<tle::DisServingRequestStats>(m, "DisServingRequestStats")
-        .def(nb::init<>())
-        .def_rw("kv_cache_transfer_ms", &tle::DisServingRequestStats::kvCacheTransferMS)
-        .def_rw("kv_cache_size", &tle::DisServingRequestStats::kvCacheSize);
-
-    nb::class_<tle::RequestStats>(m, "RequestStats")
-        .def(nb::init<>())
-        .def_rw("id", &tle::RequestStats::id)
-        .def_rw("stage", &tle::RequestStats::stage)
-        .def_rw("context_prefill_position", &tle::RequestStats::contextPrefillPosition)
-        .def_rw("num_generated_tokens", &tle::RequestStats::numGeneratedTokens)
-        .def_rw("avg_num_decoded_tokens_per_iter", &tle::RequestStats::avgNumDecodedTokensPerIter)
-        .def_rw("scheduled", &tle::RequestStats::scheduled)
-        .def_rw("paused", &tle::RequestStats::paused)
-        .def_rw("dis_serving_stats", &tle::RequestStats::disServingStats)
-        .def_rw("alloc_total_blocks_per_request", &tle::RequestStats::allocTotalBlocksPerRequest)
-        .def_rw("alloc_new_blocks_per_request", &tle::RequestStats::allocNewBlocksPerRequest)
-        .def_rw("reused_blocks_per_request", &tle::RequestStats::reusedBlocksPerRequest)
-        .def_rw("missed_blocks_per_request", &tle::RequestStats::missedBlocksPerRequest)
-        .def_rw("kv_cache_hit_rate_per_request", &tle::RequestStats::kvCacheHitRatePerRequest)
-        .def("to_json_str",
-            [](tle::RequestStats const& iterationStats) { return tle::JsonSerialization::toJsonStr(iterationStats); });
-
-    nb::class_<tle::RequestStatsPerIteration>(m, "RequestStatsPerIteration")
-        .def(nb::init<>())
-        .def_rw("iter", &tle::RequestStatsPerIteration::iter)
-        .def_rw("request_stats", &tle::RequestStatsPerIteration::requestStats)
-        .def("to_json_str",
-            [](tle::RequestStatsPerIteration const& iterationStats)
-            { return tle::JsonSerialization::toJsonStr(iterationStats); });
-
-    nb::module_ executor_kv_cache = m.def_submodule("kv_cache", "Executor KV Cache Manager");
-
-    nb::class_<tle::KVCacheCreatedData>(executor_kv_cache, "KVCacheCreatedData")
-        .def_ro("num_blocks_per_cache_level", &tle::KVCacheCreatedData::numBlocksPerCacheLevel);
-
-    nb::class_<tensorrt_llm::runtime::UniqueToken>(executor_kv_cache, "UniqueToken")
-        .def_ro("token_id", &tensorrt_llm::runtime::UniqueToken::tokenId)
-        .def_ro("token_extra_id", &tensorrt_llm::runtime::UniqueToken::tokenExtraId);
-
-    nb::class_<tle::KVCacheStoredBlockData>(executor_kv_cache, "KVCacheStoredBlockData")
-        .def_ro("block_hash", &tle::KVCacheStoredBlockData::blockHash)
-        .def_ro("tokens", &tle::KVCacheStoredBlockData::tokens)
-        .def_ro("lora_id", &tle::KVCacheStoredBlockData::loraId)
-        .def_ro("cache_level", &tle::KVCacheStoredBlockData::cacheLevel)
-        .def_ro("priority", &tle::KVCacheStoredBlockData::priority);
-
-    nb::class_<tle::KVCacheStoredData>(executor_kv_cache, "KVCacheStoredData")
-        .def_ro("parent_hash", &tle::KVCacheStoredData::parentHash)
-        .def_ro("blocks", &tle::KVCacheStoredData::blocks);
-
-    nb::class_<tle::KVCacheRemovedData>(executor_kv_cache, "KVCacheRemovedData")
-        .def_ro("block_hashes", &tle::KVCacheRemovedData::blockHashes);
-
-    instantiateEventDiff<SizeType32>(executor_kv_cache, "Int");
-
-    nb::class_<tle::KVCacheUpdatedData>(executor_kv_cache, "KVCacheUpdatedData")
-        .def_ro("block_hash", &tle::KVCacheUpdatedData::blockHash)
-        .def_ro("cache_level", &tle::KVCacheUpdatedData::cacheLevel)
-        .def_ro("priority", &tle::KVCacheUpdatedData::priority);
-
-    nb::class_<tle::KVCacheEvent>(executor_kv_cache, "KVCacheEvent")
-        .def_ro("event_id", &tle::KVCacheEvent::eventId)
-        .def_ro("data", &tle::KVCacheEvent::data)
-        .def_ro("window_size", &tle::KVCacheEvent::windowSize);
-
-    nb::class_<tle::KVCacheEventManager>(executor_kv_cache, "KVCacheEventManager")
-        .def(
-            "get_latest_events",
-            [](tle::KVCacheEventManager& self, std::optional<double> timeout_ms = std::nullopt)
-            {
-                if (timeout_ms)
-                {
-                    return self.getLatestEvents(std::chrono::milliseconds(static_cast<int64_t>(*timeout_ms)));
-                }
-                return self.getLatestEvents(std::nullopt);
-            },
-            nb::arg("timeout_ms") = std::nullopt);
-
-    tensorrt_llm::nanobind::executor::initRequestBindings(m);
-    tensorrt_llm::nanobind::executor::initConfigBindings(m);
-    tensorrt_llm::nanobind::executor::Executor::initBindings(m);
-}
-
-} // namespace tensorrt_llm::nanobind::executor
diff --git a/cpp/tensorrt_llm/nanobind/executor/bindings.h b/cpp/tensorrt_llm/nanobind/executor/bindings.h
deleted file mode 100644
index 4df52c2d34e..00000000000
--- a/cpp/tensorrt_llm/nanobind/executor/bindings.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <nanobind/nanobind.h>
-namespace nb = nanobind;
-
-namespace tensorrt_llm::nanobind::executor
-{
-
-// Register bindings for executor API.
-void initBindings(nb::module_& m);
-
-} // namespace tensorrt_llm::nanobind::executor
diff --git a/cpp/tensorrt_llm/nanobind/executor/executor.cpp b/cpp/tensorrt_llm/nanobind/executor/executor.cpp
deleted file mode 100644
index 59c7d2a3dc1..00000000000
--- a/cpp/tensorrt_llm/nanobind/executor/executor.cpp
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "executor.h"
-#include "tensorrt_llm/common/assert.h"
-#include "tensorrt_llm/common/logger.h"
-#include "tensorrt_llm/executor/tensor.h"
-#include "tensorrt_llm/nanobind/common/customCasters.h"
-
-#include <nanobind/nanobind.h>
-#include <nanobind/ndarray.h>
-#include <nanobind/stl/chrono.h>
-#include <nanobind/stl/filesystem.h>
-#include <nanobind/stl/map.h>
-#include <nanobind/stl/optional.h>
-#include <nanobind/stl/shared_ptr.h>
-#include <nanobind/stl/string.h>
-#include <nanobind/stl/vector.h>
-#include <torch/extension.h>
-
-namespace nb = nanobind;
-namespace tle = tensorrt_llm::executor;
-
-namespace nanobind::detail
-{
-
-template <>
-struct dtype_traits<half>
-{
-    static constexpr dlpack::dtype value{
-        (uint8_t) dlpack::dtype_code::Float, // type code
-        16,                                  // size in bits
-        1                                    // lanes (simd), usually set to 1
-    };
-    static constexpr auto name = const_name("float16");
-};
-} // namespace nanobind::detail
-
-namespace
-{
-// todo: Properly support FP8 and BF16 and verify functionality
-tle::Tensor numpyToTensor(nb::ndarray<nb::numpy> const& array)
-{
-    auto npDtype = array.dtype();
-    char kind = '\0';
-    switch (npDtype.code)
-    {
-    case static_cast<uint8_t>(nb::dlpack::dtype_code::Int):
-        kind = 'i'; // signed integer
-        break;
-    case static_cast<uint8_t>(nb::dlpack::dtype_code::UInt):
-        kind = 'u'; // unsigned integer
-        break;
-    case static_cast<uint8_t>(nb::dlpack::dtype_code::Float):
-        kind = 'f'; // floating point
-        break;
-    case static_cast<uint8_t>(nb::dlpack::dtype_code::Bfloat):
-        kind = 'f'; // brain floating point (treat as float kind)
-        break;
-    case static_cast<uint8_t>(nb::dlpack::dtype_code::Complex):
-        kind = 'c'; // complex
-        break;
-    default:
-        kind = 'V'; // void/other
-        break;
-    }
-    tle::DataType dtype;
-    if (npDtype == nb::dtype<half>())
-    {
-        dtype = tle::DataType::kFP16;
-    }
-    else if (npDtype == nb::dtype<float>())
-    {
-        dtype = tle::DataType::kFP32;
-    }
-    else if (npDtype == nb::dtype<int8_t>())
-    {
-        dtype = tle::DataType::kINT8;
-    }
-    else if (npDtype == nb::dtype<int32_t>())
-    {
-        dtype = tle::DataType::kINT32;
-    }
-    else if (npDtype == nb::dtype<int64_t>())
-    {
-        dtype = tle::DataType::kINT64;
-    }
-    else if (kind == 'V' && array.itemsize() == 1)
-    {
-        dtype = tle::DataType::kFP8;
-    }
-    else if (kind == 'V' && array.itemsize() == 2)
-    {
-        dtype = tle::DataType::kBF16;
-    }
-    else
-    {
-        TLLM_THROW("Unsupported numpy dtype.");
-    }
-
-    // todo: improve the following code
-    std::vector<int64_t> dims;
-    dims.reserve(array.ndim());
-    for (size_t i = 0; i < array.ndim(); ++i)
-    {
-        dims.push_back(static_cast<int64_t>(array.shape(i)));
-    }
-    tle::Shape shape(dims.data(), dims.size());
-
-    return tle::Tensor::of(dtype, const_cast<void*>(array.data()), shape);
-}
-
-} // namespace
-
-namespace tensorrt_llm::nanobind::executor
-{
-
-Executor::Executor(
-    std::filesystem::path const& modelPath, tle::ModelType modelType, tle::ExecutorConfig const& executorConfig)
-{
-    mExecutor = std::make_unique<tle::Executor>(modelPath, modelType, executorConfig);
-}
-
-Executor::Executor(std::filesystem::path const& encoderModelPath, std::filesystem::path const& decoderModelPath,
-    tle::ModelType modelType, tle::ExecutorConfig const& executorConfig)
-{
-    mExecutor = std::make_unique<tle::Executor>(encoderModelPath, decoderModelPath, modelType, executorConfig);
-}
-
-Executor::Executor(nb::bytes const& engineBuffer, std::string const& jsonConfigStr, tle::ModelType modelType,
-    tle::ExecutorConfig const& executorConfig, std::optional<nb::dict> managedWeights)
-{
-    uint8_t const* data = static_cast<uint8_t const*>(engineBuffer.data());
-    size_t size = engineBuffer.size();
-    std::optional<std::map<std::string, tle::Tensor>> managedWeightsMap = std::nullopt;
-    if (managedWeights.has_value() && !managedWeights.value().empty())
-    {
-        managedWeightsMap = std::map<std::string, tle::Tensor>();
-        for (auto const& [rawName, rawArray] : managedWeights.value())
-        {
-            std::string name = nb::cast<std::string>(rawName);
-            nb::ndarray<nb::numpy> array = nb::cast<nb::ndarray<nb::numpy>>(rawArray);
-            managedWeightsMap->emplace(name, numpyToTensor(array));
-        }
-    }
-    mExecutor = std::make_unique<tle::Executor>(
-        tle::BufferView(data, size), jsonConfigStr, modelType, executorConfig, managedWeightsMap);
-}
-
-Executor::Executor(std::string const& encoderEngineBuffer, std::string const& encoderJsonConfigStr,
-    std::string const& decoderEngineBuffer, std::string const& decoderJsonConfigStr, tle::ModelType modelType,
-    tle::ExecutorConfig const& executorConfig)
-{
-    uint8_t const* encoderData = reinterpret_cast<uint8_t const*>(encoderEngineBuffer.data());
-    size_t encoderSize = encoderEngineBuffer.size();
-    uint8_t const* decoderData = reinterpret_cast<uint8_t const*>(decoderEngineBuffer.data());
-    size_t decoderSize = decoderEngineBuffer.size();
-    mExecutor = std::make_unique<tle::Executor>(tle::BufferView(encoderData, encoderSize), encoderJsonConfigStr,
-        tle::BufferView(decoderData, decoderSize), decoderJsonConfigStr, modelType, executorConfig);
-}
-
-nb::object Executor::enter()
-{
-    TLLM_CHECK(static_cast<bool>(mExecutor));
-    return nb::cast(this);
-}
-
-void Executor::exit(
-    [[maybe_unused]] nb::handle type, [[maybe_unused]] nb::handle value, [[maybe_unused]] nb::handle traceback)
-{
-    shutdown();
-    mExecutor = nullptr;
-}
-
-void Executor::shutdown()
-{
-    // NOTE: we must release the GIL here. Executor has spawned a thread for the execution loop. That thread must be
-    // able to do forward progress for the shutdown process to succeed. It takes the GIL during its callbacks, so
-    // we release it now. Note that we shouldn't do anything related to python objects after that.
-    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
-    nb::gil_scoped_release release;
-    mExecutor->shutdown();
-    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
-}
-
-void Executor::initBindings(nb::module_& m)
-{
-    nb::class_<Executor>(m, "Executor")
-        .def(nb::init<std::filesystem::path const&, tle::ModelType, tle::ExecutorConfig const&>(),
-            nb::arg("model_path"), nb::arg("model_type"), nb::arg("executor_config"))
-        .def(nb::init<std::filesystem::path const&, std::filesystem::path const&, tle::ModelType,
-                 tle::ExecutorConfig const&>(),
-            nb::arg("encoder_model_path"), nb::arg("decoder_model_path"), nb::arg("model_type"),
-            nb::arg("executor_config"))
-        .def(nb::init<nb::bytes, std::string const&, tle::ModelType, tle::ExecutorConfig const&, nb::dict>(),
-            nb::arg("engine_buffer"), nb::arg("json_config_str"), nb::arg("model_type"), nb::arg("executor_config"),
-            nb::arg("managed_weights") = nb::dict())
-        .def(nb::init<std::string const&, std::string const&, std::string const&, std::string const&, tle::ModelType,
-                 tle::ExecutorConfig const&>(),
-            nb::arg("encoder_engine_buffer"), nb::arg("encoder_json_config_str"), nb::arg("decoder_engine_buffer"),
-            nb::arg("decoder_json_config_str"), nb::arg("model_type"), nb::arg("executor_config"))
-        .def("shutdown", &Executor::shutdown)
-        .def("__enter__", &Executor::enter)
-        .def("__exit__", &Executor::exit)
-        .def("enqueue_request", &Executor::enqueueRequest, nb::arg("request"))
-        .def("enqueue_requests", &Executor::enqueueRequests, nb::arg("requests"))
-        .def("await_responses",
-            nb::overload_cast<std::optional<std::chrono::milliseconds> const&>(&Executor::awaitResponses),
-            nb::arg("timeout") = nb::none())
-        .def("await_responses",
-            nb::overload_cast<tle::IdType const&, std::optional<std::chrono::milliseconds> const&>(
-                &Executor::awaitResponses),
-            nb::arg("id"), nb::arg("timeout") = nb::none())
-        .def("await_responses",
-            nb::overload_cast<std::vector<tle::IdType> const&, std::optional<std::chrono::milliseconds> const&>(
-                &Executor::awaitResponses),
-            nb::arg("ids"), nb::arg("timeout") = nb::none())
-        .def("get_num_responses_ready", &Executor::getNumResponsesReady, nb::arg("id") = nb::none())
-        .def("cancel_request", &Executor::cancelRequest, nb::arg("id") = nb::none())
-        .def("get_latest_iteration_stats", &Executor::getLatestIterationStats)
-        .def("get_latest_request_stats", &Executor::getLatestRequestStats)
-        .def("get_latest_debug_tensors", &Executor::getLatestDebugTensors)
-        .def("can_enqueue_requests", &Executor::canEnqueueRequests)
-        .def("get_kv_cache_event_manager", &Executor::getKVCacheEventManager);
-}
-
-} // namespace tensorrt_llm::nanobind::executor
diff --git a/cpp/tensorrt_llm/nanobind/executor/executor.h b/cpp/tensorrt_llm/nanobind/executor/executor.h
deleted file mode 100644
index 22c24abb4bf..00000000000
--- a/cpp/tensorrt_llm/nanobind/executor/executor.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "tensorrt_llm/executor/executor.h"
-#include "tensorrt_llm/executor/types.h"
-#include <nanobind/nanobind.h>
-
-namespace nb = nanobind;
-namespace tle = tensorrt_llm::executor;
-
-namespace tensorrt_llm::nanobind::executor
-{
-
-class Executor
-{
-public:
-    Executor(
-        std::filesystem::path const& modelPath, tle::ModelType modelType, tle::ExecutorConfig const& executorConfig);
-
-    Executor(std::filesystem::path const& encoderModelPath, std::filesystem::path const& decoderModelPath,
-        tle::ModelType modelType, tle::ExecutorConfig const& executorConfig);
-
-    Executor(nb::bytes const& engineBuffer, std::string const& jsonConfigStr, tle::ModelType modelType,
-        tle::ExecutorConfig const& executorConfig, std::optional<nb::dict> managedWeights);
-
-    Executor(std::string const& encoderEngineBuffer, std::string const& encoderJsonConfigStr,
-        std::string const& decoderEngineBuffer, std::string const& decoderJsonConfigStr, tle::ModelType modelType,
-        tle::ExecutorConfig const& executorConfig);
-
-    nb::object enter();
-    void exit(
-        [[maybe_unused]] nb::handle type, [[maybe_unused]] nb::handle value, [[maybe_unused]] nb::handle traceback);
-    void shutdown();
-
-    [[nodiscard]] tle::IdType enqueueRequest(tle::Request const& request)
-    {
-        return mExecutor->enqueueRequest(request);
-    }
-
-    [[nodiscard]] std::vector<tle::IdType> enqueueRequests(std::vector<tle::Request> const& requests)
-    {
-        return mExecutor->enqueueRequests(requests);
-    }
-
-    [[nodiscard]] std::vector<tle::Response> awaitResponses(
-        std::optional<std::chrono::milliseconds> const& timeout = std::nullopt)
-    {
-        // Await responses blocks until a response is received. Release GIL so that it can be ran in a background
-        // thread.
-        nb::gil_scoped_release release;
-        return mExecutor->awaitResponses(timeout);
-    }
-
-    [[nodiscard]] std::vector<tle::Response> awaitResponses(
-        tle::IdType const& requestId, std::optional<std::chrono::milliseconds> const& timeout = std::nullopt)
-    {
-        // Await responses blocks until a response is received. Release GIL so that it can be ran in a background
-        // thread.
-        nb::gil_scoped_release release;
-        return mExecutor->awaitResponses(requestId, timeout);
-    }
-
-    [[nodiscard]] std::vector<std::vector<tle::Response>> awaitResponses(std::vector<tle::IdType> const& requestIds,
-        std::optional<std::chrono::milliseconds> const& timeout = std::nullopt)
-    {
-        // Await responses blocks until a response is received. Release GIL so that it can be ran in a background
-        // thread.
-        nb::gil_scoped_release release;
-        return mExecutor->awaitResponses(requestIds, timeout);
-    }
-
-    [[nodiscard]] tle::SizeType32 getNumResponsesReady(std::optional<tle::IdType> const& requestId = std::nullopt) const
-    {
-        return mExecutor->getNumResponsesReady(requestId);
-    }
-
-    void cancelRequest(tle::IdType requestId)
-    {
-        mExecutor->cancelRequest(requestId);
-    }
-
-    std::deque<tle::IterationStats> getLatestIterationStats()
-    {
-        return mExecutor->getLatestIterationStats();
-    }
-
-    std::deque<tle::RequestStatsPerIteration> getLatestRequestStats()
-    {
-        return mExecutor->getLatestRequestStats();
-    }
-
-    std::deque<tle::DebugTensorsPerIteration> getLatestDebugTensors()
-    {
-        return mExecutor->getLatestDebugTensors();
-    }
-
-    [[nodiscard]] bool canEnqueueRequests() const
-    {
-        return mExecutor->canEnqueueRequests();
-    }
-
-    [[nodiscard]] std::optional<std::shared_ptr<tle::KVCacheEventManager>> getKVCacheEventManager() const
-    {
-        return mExecutor->getKVCacheEventManager();
-    }
-
-    static void initBindings(nb::module_& m);
-
-private:
-    std::unique_ptr<tle::Executor> mExecutor;
-};
-
-} // namespace tensorrt_llm::nanobind::executor
diff --git a/cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp b/cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp
deleted file mode 100644
index c2d9fe25dff..00000000000
--- a/cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp
+++ /dev/null
@@ -1,616 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "executorConfig.h"
-#include "tensorrt_llm/executor/executor.h"
-#include "tensorrt_llm/executor/types.h"
-#include "tensorrt_llm/nanobind/common/customCasters.h"
-#include "tensorrt_llm/runtime/cudaStream.h"
-#include "tensorrt_llm/runtime/utils/mpiUtils.h"
-#include <nanobind/nanobind.h>
-#include <nanobind/stl/function.h>
-#include <nanobind/stl/map.h>
-#include <nanobind/stl/optional.h>
-#include <nanobind/stl/pair.h>
-#include <nanobind/stl/set.h>
-#include <nanobind/stl/shared_ptr.h>
-#include <nanobind/stl/string.h>
-#include <nanobind/stl/unordered_map.h>
-#include <nanobind/stl/unordered_set.h>
-#include <nanobind/stl/vector.h>
-#include <torch/torch.h>
-#include <vector>
-
-namespace nb = nanobind;
-namespace tle = tensorrt_llm::executor;
-using SizeType32 = tle::SizeType32;
-using RuntimeDefaults = tensorrt_llm::runtime::RuntimeDefaults;
-
-namespace tensorrt_llm::nanobind::executor
-{
-
-void initConfigBindings(nb::module_& m)
-{
-    nb::enum_<tle::BatchingType>(m, "BatchingType")
-        .value("STATIC", tle::BatchingType::kSTATIC)
-        .value("INFLIGHT", tle::BatchingType::kINFLIGHT);
-
-    auto dynamicBatchConfigGetstate = [](tle::DynamicBatchConfig const& self)
-    {
-        return nb::make_tuple(self.getEnableBatchSizeTuning(), self.getEnableMaxNumTokensTuning(),
-            self.getDynamicBatchMovingAverageWindow(), self.getBatchSizeTable());
-    };
-    auto dynamicBatchConfigSetstate = [](tle::DynamicBatchConfig& self, nb::tuple const& state)
-    {
-        if (state.size() != 4)
-        {
-            throw std::runtime_error("Invalid state!");
-        }
-        new (&self) tle::DynamicBatchConfig(nb::cast<bool>(state[0]), nb::cast<bool>(state[1]),
-            nb::cast<SizeType32>(state[2]), nb::cast<std::vector<std::pair<SizeType32, SizeType32>>>(state[3]));
-    };
-    nb::class_<tle::DynamicBatchConfig>(m, "DynamicBatchConfig")
-        .def(nb::init<bool, bool, SizeType32>(), nb::arg("enable_batch_size_tuning"),
-            nb::arg("enable_max_num_tokens_tuning"), nb::arg("dynamic_batch_moving_average_window"))
-        .def_prop_ro("enable_batch_size_tuning", &tle::DynamicBatchConfig::getEnableBatchSizeTuning)
-        .def_prop_ro("enable_max_num_tokens_tuning", &tle::DynamicBatchConfig::getEnableMaxNumTokensTuning)
-        .def_prop_ro(
-            "dynamic_batch_moving_average_window", &tle::DynamicBatchConfig::getDynamicBatchMovingAverageWindow)
-        .def("__getstate__", dynamicBatchConfigGetstate)
-        .def("__setstate__", dynamicBatchConfigSetstate);
-
-    auto schedulerConfigSetstate = [](tle::SchedulerConfig& self, nb::tuple const& state)
-    {
-        if (state.size() != 3)
-        {
-            throw std::runtime_error("Invalid state!");
-        }
-        new (&self) tle::SchedulerConfig(nb::cast<tle::CapacitySchedulerPolicy>(state[0]),
-            nb::cast<std::optional<tle::ContextChunkingPolicy>>(state[1]),
-            nb::cast<std::optional<tle::DynamicBatchConfig>>(state[2]));
-    };
-    auto schedulerConfigGetstate = [](tle::SchedulerConfig const& self)
-    {
-        return nb::make_tuple(
-            self.getCapacitySchedulerPolicy(), self.getContextChunkingPolicy(), self.getDynamicBatchConfig());
-    };
-    nb::class_<tle::SchedulerConfig>(m, "SchedulerConfig")
-        .def(nb::init<tle::CapacitySchedulerPolicy, std::optional<tle::ContextChunkingPolicy>,
-                 std::optional<tle::DynamicBatchConfig>>(),
-            nb::arg("capacity_scheduler_policy") = tle::CapacitySchedulerPolicy::kGUARANTEED_NO_EVICT,
-            nb::arg("context_chunking_policy") = nb::none(), nb::arg("dynamic_batch_config") = nb::none())
-        .def_prop_ro("capacity_scheduler_policy", &tle::SchedulerConfig::getCapacitySchedulerPolicy)
-        .def_prop_ro("context_chunking_policy", &tle::SchedulerConfig::getContextChunkingPolicy)
-        .def_prop_ro("dynamic_batch_config", &tle::SchedulerConfig::getDynamicBatchConfig)
-        .def("__getstate__", schedulerConfigGetstate)
-        .def("__setstate__", schedulerConfigSetstate);
-
-    nb::class_<RuntimeDefaults>(m, "RuntimeDefaults")
-        .def(nb::init<std::optional<std::vector<SizeType32>>, std::optional<SizeType32>>(),
-            nb::arg("max_attention_window") = nb::none(), nb::arg("sink_token_length") = nb::none())
-        .def_ro("max_attention_window", &RuntimeDefaults::maxAttentionWindowVec)
-        .def_ro("sink_token_length", &RuntimeDefaults::sinkTokenLength);
-
-    auto kvCacheConfigGetstate = [](tle::KvCacheConfig const& self)
-    {
-        return nb::make_tuple(self.getEnableBlockReuse(), self.getMaxTokens(), self.getMaxAttentionWindowVec(),
-            self.getSinkTokenLength(), self.getFreeGpuMemoryFraction(), self.getHostCacheSize(),
-            self.getOnboardBlocks(), self.getCrossKvCacheFraction(), self.getSecondaryOffloadMinPriority(),
-            self.getEventBufferMaxSize(), self.getEnablePartialReuse(), self.getCopyOnPartialReuse(), self.getUseUvm());
-    };
-    auto kvCacheConfigSetstate = [](tle::KvCacheConfig& self, nb::tuple const& state)
-    {
-        if (state.size() != 13)
-        {
-            throw std::runtime_error("Invalid state!");
-        }
-        new (&self) tle::KvCacheConfig(nb::cast<bool>(state[0]), nb::cast<std::optional<SizeType32>>(state[1]),
-            nb::cast<std::optional<std::vector<SizeType32>>>(state[2]), nb::cast<std::optional<SizeType32>>(state[3]),
-            nb::cast<std::optional<float>>(state[4]), nb::cast<std::optional<size_t>>(state[5]),
-            nb::cast<bool>(state[6]), nb::cast<std::optional<float>>(state[7]),
-            nb::cast<std::optional<tle::RetentionPriority>>(state[8]), nb::cast<size_t>(state[9]),
-            nb::cast<bool>(state[10]), nb::cast<bool>(state[11]), nb::cast<bool>(state[12]));
-    };
-    nb::class_<tle::KvCacheConfig>(m, "KvCacheConfig")
-        .def(nb::init<bool, std::optional<SizeType32> const&, std::optional<std::vector<SizeType32>> const&,
-                 std::optional<SizeType32> const&, std::optional<float> const&, std::optional<size_t> const&, bool,
-                 std::optional<float> const&, std::optional<tle::RetentionPriority>, size_t const&, bool, bool, bool,
-                 std::optional<RuntimeDefaults> const&>(),
-            nb::arg("enable_block_reuse") = true, nb::arg("max_tokens") = nb::none(),
-            nb::arg("max_attention_window") = nb::none(), nb::arg("sink_token_length") = nb::none(),
-            nb::arg("free_gpu_memory_fraction") = nb::none(), nb::arg("host_cache_size") = nb::none(),
-            nb::arg("onboard_blocks") = true, nb::arg("cross_kv_cache_fraction") = nb::none(),
-            nb::arg("secondary_offload_min_priority") = nb::none(), nb::arg("event_buffer_max_size") = 0, nb::kw_only(),
-            nb::arg("enable_partial_reuse") = true, nb::arg("copy_on_partial_reuse") = true, nb::arg("use_uvm") = false,
-            nb::arg("runtime_defaults") = nb::none())
-        .def_prop_rw(
-            "enable_block_reuse", &tle::KvCacheConfig::getEnableBlockReuse, &tle::KvCacheConfig::setEnableBlockReuse)
-        .def_prop_rw("max_tokens", &tle::KvCacheConfig::getMaxTokens, &tle::KvCacheConfig::setMaxTokens)
-        .def_prop_rw("max_attention_window", &tle::KvCacheConfig::getMaxAttentionWindowVec,
-            &tle::KvCacheConfig::setMaxAttentionWindowVec)
-        .def_prop_rw(
-            "sink_token_length", &tle::KvCacheConfig::getSinkTokenLength, &tle::KvCacheConfig::setSinkTokenLength)
-        .def_prop_rw("free_gpu_memory_fraction", &tle::KvCacheConfig::getFreeGpuMemoryFraction,
-            &tle::KvCacheConfig::setFreeGpuMemoryFraction)
-        .def_prop_rw("host_cache_size", &tle::KvCacheConfig::getHostCacheSize, &tle::KvCacheConfig::setHostCacheSize)
-        .def_prop_rw("onboard_blocks", &tle::KvCacheConfig::getOnboardBlocks, &tle::KvCacheConfig::setOnboardBlocks)
-        .def_prop_rw("cross_kv_cache_fraction", &tle::KvCacheConfig::getCrossKvCacheFraction,
-            &tle::KvCacheConfig::setCrossKvCacheFraction)
-        .def_prop_rw("secondary_offload_min_priority", &tle::KvCacheConfig::getSecondaryOffloadMinPriority,
-            &tle::KvCacheConfig::setSecondaryOffloadMinPriority)
-        .def_prop_rw("event_buffer_max_size", &tle::KvCacheConfig::getEventBufferMaxSize,
-            &tle::KvCacheConfig::setEventBufferMaxSize)
-        .def_prop_rw("enable_partial_reuse", &tle::KvCacheConfig::getEnablePartialReuse,
-            &tle::KvCacheConfig::setEnablePartialReuse)
-        .def_prop_rw("copy_on_partial_reuse", &tle::KvCacheConfig::getCopyOnPartialReuse,
-            &tle::KvCacheConfig::setCopyOnPartialReuse)
-        .def_prop_rw("use_uvm", &tle::KvCacheConfig::getUseUvm, &tle::KvCacheConfig::setUseUvm)
-        .def("fill_empty_fields_from_runtime_defaults", &tle::KvCacheConfig::fillEmptyFieldsFromRuntimeDefaults)
-        .def("__getstate__", kvCacheConfigGetstate)
-        .def("__setstate__", kvCacheConfigSetstate);
-
-    nb::class_<tle::OrchestratorConfig>(m, "OrchestratorConfig")
-        .def(nb::init<bool, std::string, std::shared_ptr<mpi::MpiComm>, bool>(), nb::arg("is_orchestrator") = true,
-            nb::arg("worker_executable_path") = "", nb::arg("orch_leader_comm").none() = nullptr,
-            nb::arg("spawn_processes") = true)
-        .def_prop_rw(
-            "is_orchestrator", &tle::OrchestratorConfig::getIsOrchestrator, &tle::OrchestratorConfig::setIsOrchestrator)
-        .def_prop_rw("worker_executable_path", &tle::OrchestratorConfig::getWorkerExecutablePath,
-            &tle::OrchestratorConfig::setWorkerExecutablePath)
-        .def_prop_rw("orch_leader_comm", &tle::OrchestratorConfig::getOrchLeaderComm,
-            &tle::OrchestratorConfig::setOrchLeaderComm)
-        .def_prop_rw("spawn_processes", &tle::OrchestratorConfig::getSpawnProcesses,
-            &tle::OrchestratorConfig::setSpawnProcesses);
-
-    auto parallelConfigGetstate = [](tle::ParallelConfig const& self)
-    {
-        return nb::make_tuple(self.getCommunicationType(), self.getCommunicationMode(), self.getDeviceIds(),
-            self.getParticipantIds(), self.getOrchestratorConfig(), self.getNumNodes());
-    };
-    auto parallelConfigSetstate = [](tle::ParallelConfig& self, nb::tuple const& state)
-    {
-        if (state.size() != 6)
-        {
-            throw std::runtime_error("Invalid state!");
-        }
-        new (&self) tle::ParallelConfig(nb::cast<tle::CommunicationType>(state[0]),
-            nb::cast<tle::CommunicationMode>(state[1]), nb::cast<std::optional<std::vector<SizeType32>>>(state[2]),
-            nb::cast<std::optional<std::vector<SizeType32>>>(state[3]),
-            nb::cast<std::optional<tle::OrchestratorConfig>>(state[4]), nb::cast<std::optional<SizeType32>>(state[5]));
-    };
-    nb::class_<tle::ParallelConfig>(m, "ParallelConfig")
-        .def(nb::init<tle::CommunicationType, tle::CommunicationMode, std::optional<std::vector<SizeType32>> const&,
-                 std::optional<std::vector<SizeType32>> const&, std::optional<tle::OrchestratorConfig> const&,
-                 std::optional<SizeType32> const&>(),
-            nb::arg("communication_type") = tle::CommunicationType::kMPI,
-            nb::arg("communication_mode") = tle::CommunicationMode::kLEADER, nb::arg("device_ids") = nb::none(),
-            nb::arg("participant_ids") = nb::none(), nb::arg("orchestrator_config") = nb::none(),
-            nb::arg("num_nodes") = nb::none())
-        .def_prop_rw("communication_type", &tle::ParallelConfig::getCommunicationType,
-            &tle::ParallelConfig::setCommunicationType)
-        .def_prop_rw("communication_mode", &tle::ParallelConfig::getCommunicationMode,
-            &tle::ParallelConfig::setCommunicationMode)
-        .def_prop_rw("device_ids", &tle::ParallelConfig::getDeviceIds, &tle::ParallelConfig::setDeviceIds)
-        .def_prop_rw(
-            "participant_ids", &tle::ParallelConfig::getParticipantIds, &tle::ParallelConfig::setParticipantIds)
-        .def_prop_rw("orchestrator_config", &tle::ParallelConfig::getOrchestratorConfig,
-            &tle::ParallelConfig::setOrchestratorConfig)
-        .def_prop_rw("num_nodes", &tle::ParallelConfig::getNumNodes, &tle::ParallelConfig::setNumNodes)
-        .def("__getstate__", parallelConfigGetstate)
-        .def("__setstate__", parallelConfigSetstate);
-
-    auto peftCacheConfigSetstate = [](tle::PeftCacheConfig& self, nb::tuple const& state)
-    {
-        if (state.size() != 11)
-        {
-            throw std::runtime_error("Invalid state!");
-        }
-        new (&self) tle::PeftCacheConfig(nb::cast<SizeType32>(state[0]), nb::cast<SizeType32>(state[1]),
-            nb::cast<SizeType32>(state[2]), nb::cast<SizeType32>(state[3]), nb::cast<SizeType32>(state[4]),
-            nb::cast<SizeType32>(state[5]), nb::cast<SizeType32>(state[6]), nb::cast<SizeType32>(state[7]),
-            nb::cast<SizeType32>(state[8]), nb::cast<std::optional<float>>(state[9]),
-            nb::cast<std::optional<size_t>>(state[10]));
-    };
-    auto peftCacheConfigGetstate = [](tle::PeftCacheConfig const& self)
-    {
-        return nb::make_tuple(self.getNumHostModuleLayer(), self.getNumDeviceModuleLayer(),
-            self.getOptimalAdapterSize(), self.getMaxAdapterSize(), self.getNumPutWorkers(), self.getNumEnsureWorkers(),
-            self.getNumCopyStreams(), self.getMaxPagesPerBlockHost(), self.getMaxPagesPerBlockDevice(),
-            self.getDeviceCachePercent(), self.getHostCacheSize());
-    };
-    nb::class_<tle::PeftCacheConfig>(m, "PeftCacheConfig")
-        .def(nb::init<SizeType32, SizeType32, SizeType32, SizeType32, SizeType32, SizeType32, SizeType32, SizeType32,
-                 SizeType32, std::optional<float> const&, std::optional<size_t> const&,
-                 std::optional<std::string> const&>(),
-            nb::arg("num_host_module_layer") = 0, nb::arg("num_device_module_layer") = 0,
-            nb::arg("optimal_adapter_size") = 8, nb::arg("max_adapter_size") = 64, nb::arg("num_put_workers") = 1,
-            nb::arg("num_ensure_workers") = 1, nb::arg("num_copy_streams") = 1,
-            nb::arg("max_pages_per_block_host") = 24, nb::arg("max_pages_per_block_device") = 8,
-            nb::arg("device_cache_percent") = nb::none(), nb::arg("host_cache_size") = nb::none(),
-            nb::arg("lora_prefetch_dir") = nb::none())
-        .def_prop_ro("num_host_module_layer", &tle::PeftCacheConfig::getNumHostModuleLayer)
-        .def_prop_ro("num_device_module_layer", &tle::PeftCacheConfig::getNumDeviceModuleLayer)
-        .def_prop_ro("optimal_adapter_size", &tle::PeftCacheConfig::getOptimalAdapterSize)
-        .def_prop_ro("max_adapter_size", &tle::PeftCacheConfig::getMaxAdapterSize)
-        .def_prop_ro("num_put_workers", &tle::PeftCacheConfig::getNumPutWorkers)
-        .def_prop_ro("num_ensure_workers", &tle::PeftCacheConfig::getNumEnsureWorkers)
-        .def_prop_ro("num_copy_streams", &tle::PeftCacheConfig::getNumCopyStreams)
-        .def_prop_ro("max_pages_per_block_host", &tle::PeftCacheConfig::getMaxPagesPerBlockHost)
-        .def_prop_ro("max_pages_per_block_device", &tle::PeftCacheConfig::getMaxPagesPerBlockDevice)
-        .def_prop_ro("device_cache_percent", &tle::PeftCacheConfig::getDeviceCachePercent)
-        .def_prop_ro("host_cache_size", &tle::PeftCacheConfig::getHostCacheSize)
-        .def_prop_ro("lora_prefetch_dir", &tle::PeftCacheConfig::getLoraPrefetchDir)
-        .def("__getstate__", peftCacheConfigGetstate)
-        .def("__setstate__", peftCacheConfigSetstate);
-
-    auto decodingConfigGetstate = [](tle::DecodingConfig const& self)
-    {
-        return nb::make_tuple(
-            self.getDecodingMode(), self.getLookaheadDecodingConfig(), self.getMedusaChoices(), self.getEagleConfig());
-    };
-    auto decodingConfigSetstate = [](tle::DecodingConfig& self, nb::tuple const& state)
-    {
-        if (state.size() != 4)
-        {
-            throw std::runtime_error("Invalid state!");
-        }
-        new (&self) tle::DecodingConfig(nb::cast<std::optional<tle::DecodingMode>>(state[0]), // DecodingMode
-            nb::cast<std::optional<tle::LookaheadDecodingConfig>>(state[1]),                  // LookaheadDecodingConfig
-            nb::cast<std::optional<tle::MedusaChoices>>(state[2]),                            // MedusaChoices
-            nb::cast<std::optional<tle::EagleConfig>>(state[3])                               // EagleConfig
-        );
-    };
-    nb::class_<tle::DecodingConfig>(m, "DecodingConfig")
-        .def(nb::init<std::optional<tle::DecodingMode>, std::optional<tle::LookaheadDecodingConfig>,
-                 std::optional<tle::MedusaChoices>, std::optional<tle::EagleConfig>>(),
-            nb::arg("decoding_mode") = nb::none(), nb::arg("lookahead_decoding_config") = nb::none(),
-            nb::arg("medusa_choices") = nb::none(), nb::arg("eagle_config") = nb::none())
-        .def_prop_rw("decoding_mode", &tle::DecodingConfig::getDecodingMode, &tle::DecodingConfig::setDecodingMode)
-        .def_prop_rw("lookahead_decoding_config", &tle::DecodingConfig::getLookaheadDecodingConfig,
-            &tle::DecodingConfig::setLookaheadDecodingConfig)
-        .def_prop_rw("medusa_choices", &tle::DecodingConfig::getMedusaChoices, &tle::DecodingConfig::setMedusaChoices)
-        .def_prop_rw("eagle_config", &tle::DecodingConfig::getEagleConfig, &tle::DecodingConfig::setEagleConfig)
-        .def("__getstate__", decodingConfigGetstate)
-        .def("__setstate__", decodingConfigSetstate);
-
-    auto debugConfigGetstate = [](tle::DebugConfig const& self)
-    {
-        return nb::make_tuple(self.getDebugInputTensors(), self.getDebugOutputTensors(), self.getDebugTensorNames(),
-            self.getDebugTensorsMaxIterations());
-    };
-    auto debugConfigSetstate = [](tle::DebugConfig& self, nb::tuple const& state)
-    {
-        if (state.size() != 4)
-        {
-            throw std::runtime_error("Invalid state!");
-        }
-        new (&self) tle::DebugConfig(nb::cast<bool>(state[0]), nb::cast<bool>(state[1]),
-            nb::cast<std::vector<std::string>>(state[2]), nb::cast<SizeType32>(state[3]));
-    };
-    nb::class_<tle::DebugConfig>(m, "DebugConfig")
-        .def(nb::init<bool, bool, std::vector<std::string>, SizeType32>(), nb::arg("debug_input_tensors") = false,
-            nb::arg("debug_output_tensors") = false, nb::arg("debug_tensor_names") = nb::none(),
-            nb::arg("debug_tensors_max_iterations") = false)
-        .def_prop_rw(
-            "debug_input_tensors", &tle::DebugConfig::getDebugInputTensors, &tle::DebugConfig::setDebugInputTensors)
-        .def_prop_rw(
-            "debug_output_tensors", &tle::DebugConfig::getDebugOutputTensors, &tle::DebugConfig::setDebugOutputTensors)
-        .def_prop_rw(
-            "debug_tensor_names", &tle::DebugConfig::getDebugTensorNames, &tle::DebugConfig::setDebugTensorNames)
-        .def_prop_rw("debug_tensors_max_iterations", &tle::DebugConfig::getDebugTensorsMaxIterations,
-            &tle::DebugConfig::setDebugTensorsMaxIterations)
-        .def("__getstate__", debugConfigGetstate)
-        .def("__setstate__", debugConfigSetstate);
-
-    auto logitsPostProcessorConfigGetstate = [](tle::LogitsPostProcessorConfig const& self)
-    { return nb::make_tuple(self.getProcessorMap(), self.getProcessorBatched(), self.getReplicate()); };
-
-    auto logitsPostProcessorConfigSetstate = [](tle::LogitsPostProcessorConfig& self, nb::tuple const& state)
-    {
-        if (state.size() != 3)
-        {
-            throw std::runtime_error("Invalid LogitsPostProcessorConfig state!");
-        }
-        new (&self) tle::LogitsPostProcessorConfig(nb::cast<std::optional<tle::LogitsPostProcessorMap>>(state[0]),
-            nb::cast<std::optional<tle::LogitsPostProcessorBatched>>(state[1]), nb::cast<bool>(state[2]));
-    };
-
-    nb::class_<tle::LogitsPostProcessorConfig>(m, "LogitsPostProcessorConfig")
-        .def(nb::init<std::optional<tle::LogitsPostProcessorMap>, std::optional<tle::LogitsPostProcessorBatched>,
-                 bool>(),
-            nb::arg("processor_map") = nb::none(), nb::arg("processor_batched") = nb::none(),
-            nb::arg("replicate") = true)
-        .def_prop_rw("processor_map", &tle::LogitsPostProcessorConfig::getProcessorMap,
-            &tle::LogitsPostProcessorConfig::setProcessorMap)
-        .def_prop_rw("processor_batched", &tle::LogitsPostProcessorConfig::getProcessorBatched,
-            &tle::LogitsPostProcessorConfig::setProcessorBatched)
-        .def_prop_rw(
-            "replicate", &tle::LogitsPostProcessorConfig::getReplicate, &tle::LogitsPostProcessorConfig::setReplicate)
-        .def("__getstate__", logitsPostProcessorConfigGetstate)
-        .def("__setstate__", logitsPostProcessorConfigSetstate);
-
-    auto extendedRuntimePerfKnobConfigSetstate = [](tle::ExtendedRuntimePerfKnobConfig& self, nb::tuple const& state)
-    {
-        if (state.size() != 4)
-        {
-            throw std::runtime_error("Invalid extendedRuntimePerfKnobConfig state!");
-        }
-        new (&self) tle::ExtendedRuntimePerfKnobConfig(nb::cast<bool>(state[0]), nb::cast<bool>(state[1]),
-            nb::cast<bool>(state[2]), nb::cast<SizeType32>(state[2]));
-    };
-    auto extendedRuntimePerfKnobConfigGetstate = [](tle::ExtendedRuntimePerfKnobConfig const& self)
-    {
-        return nb::make_tuple(self.getMultiBlockMode(), self.getEnableContextFMHAFP32Acc(), self.getCudaGraphMode(),
-            self.getCudaGraphCacheSize());
-    };
-    nb::class_<tle::ExtendedRuntimePerfKnobConfig>(m, "ExtendedRuntimePerfKnobConfig")
-        .def(
-            nb::init<bool, bool>(), nb::arg("multi_block_mode") = true, nb::arg("enable_context_fmha_fp32_acc") = false)
-        .def_prop_rw("multi_block_mode", &tle::ExtendedRuntimePerfKnobConfig::getMultiBlockMode,
-            &tle::ExtendedRuntimePerfKnobConfig::setMultiBlockMode)
-        .def_prop_rw("enable_context_fmha_fp32_acc", &tle::ExtendedRuntimePerfKnobConfig::getEnableContextFMHAFP32Acc,
-            &tle::ExtendedRuntimePerfKnobConfig::setEnableContextFMHAFP32Acc)
-        .def_prop_rw("cuda_graph_mode", &tle::ExtendedRuntimePerfKnobConfig::getCudaGraphMode,
-            &tle::ExtendedRuntimePerfKnobConfig::setCudaGraphMode)
-        .def_prop_rw("cuda_graph_cache_size", &tle::ExtendedRuntimePerfKnobConfig::getCudaGraphCacheSize,
-            &tle::ExtendedRuntimePerfKnobConfig::setCudaGraphCacheSize)
-        .def("__getstate__", extendedRuntimePerfKnobConfigGetstate)
-        .def("__setstate__", extendedRuntimePerfKnobConfigSetstate);
-
-    auto SpeculativeDecodingConfigGetState
-        = [](tle::SpeculativeDecodingConfig const& self) { return nb::make_tuple(self.fastLogits); };
-    auto SpeculativeDecodingConfigSetState = [](tle::SpeculativeDecodingConfig& self, nb::tuple const& state)
-    {
-        if (state.size() != 1)
-        {
-            throw std::runtime_error("Invalid SpeculativeDecodingConfig state!");
-        }
-        new (&self) tle::SpeculativeDecodingConfig(nb::cast<bool>(state[0]));
-    };
-    nb::class_<tle::SpeculativeDecodingConfig>(m, "SpeculativeDecodingConfig")
-        .def(nb::init<bool>(), nb::arg("fast_logits") = false)
-        .def_rw("fast_logits", &tle::SpeculativeDecodingConfig::fastLogits)
-        .def("__getstate__", SpeculativeDecodingConfigGetState)
-        .def("__setstate__", SpeculativeDecodingConfigSetState);
-
-    // Guided decoding config
-    auto pyGuidedDecodingConfig = nb::class_<tle::GuidedDecodingConfig>(m, "GuidedDecodingConfig");
-
-    nb::enum_<tle::GuidedDecodingConfig::GuidedDecodingBackend>(pyGuidedDecodingConfig, "GuidedDecodingBackend")
-        .value("XGRAMMAR", tle::GuidedDecodingConfig::GuidedDecodingBackend::kXGRAMMAR)
-        .value("LLGUIDANCE", tle::GuidedDecodingConfig::GuidedDecodingBackend::kLLGUIDANCE);
-
-    auto guidedDecodingConfigGetstate = [](tle::GuidedDecodingConfig const& self) {
-        return nb::make_tuple(
-            self.getBackend(), self.getEncodedVocab(), self.getTokenizerStr(), self.getStopTokenIds());
-    };
-    auto guidedDecodingConfigSetstate = [](tle::GuidedDecodingConfig& self, nb::tuple state)
-    {
-        if (state.size() != 4)
-        {
-            throw std::runtime_error("Invalid GuidedDecodingConfig state!");
-        }
-        new (&self) tle::GuidedDecodingConfig(nb::cast<tle::GuidedDecodingConfig::GuidedDecodingBackend>(state[0]),
-            nb::cast<std::optional<std::vector<std::string>>>(state[1]), nb::cast<std::optional<std::string>>(state[2]),
-            nb::cast<std::optional<std::vector<tle::TokenIdType>>>(state[3]));
-    };
-
-    pyGuidedDecodingConfig
-        .def(nb::init<tle::GuidedDecodingConfig::GuidedDecodingBackend, std::optional<std::vector<std::string>>,
-                 std::optional<std::string>, std::optional<std::vector<tle::TokenIdType>>>(),
-            nb::arg("backend"), nb::arg("encoded_vocab") = nb::none(), nb::arg("tokenizer_str") = nb::none(),
-            nb::arg("stop_token_ids") = nb::none())
-        .def_prop_rw("backend", &tle::GuidedDecodingConfig::getBackend, &tle::GuidedDecodingConfig::setBackend)
-        .def_prop_rw(
-            "encoded_vocab", &tle::GuidedDecodingConfig::getEncodedVocab, &tle::GuidedDecodingConfig::setEncodedVocab)
-        .def_prop_rw(
-            "tokenizer_str", &tle::GuidedDecodingConfig::getTokenizerStr, &tle::GuidedDecodingConfig::setTokenizerStr)
-        .def_prop_rw(
-            "stop_token_ids", &tle::GuidedDecodingConfig::getStopTokenIds, &tle::GuidedDecodingConfig::setStopTokenIds)
-        .def("__getstate__", guidedDecodingConfigGetstate)
-        .def("__setstate__", guidedDecodingConfigSetstate);
-
-    auto cacheTransceiverConfigGetstate
-        = [](tle::CacheTransceiverConfig const& self) { return nb::make_tuple(self.getMaxNumTokens()); };
-    auto cacheTransceiverConfigSetstate = [](tle::CacheTransceiverConfig& self, nb::tuple const& state)
-    {
-        if (state.size() != 1)
-        {
-            throw std::runtime_error("Invalid CacheTransceiverConfig state!");
-        }
-        new (&self) tle::CacheTransceiverConfig(nb::cast<std::optional<size_t>>(state[0]));
-    };
-
-    nb::class_<tle::CacheTransceiverConfig>(m, "CacheTransceiverConfig")
-        .def(nb::init<std::optional<size_t>>(), nb::arg("max_num_tokens") = nb::none())
-        .def_prop_rw("max_num_tokens", &tle::CacheTransceiverConfig::getMaxNumTokens,
-            &tle::CacheTransceiverConfig::setMaxNumTokens)
-        .def("__getstate__", cacheTransceiverConfigGetstate)
-        .def("__setstate__", cacheTransceiverConfigSetstate);
-
-    auto executorConfigGetState = [](nb::object const& self)
-    {
-        auto& c = nb::cast<tle::ExecutorConfig&>(self);
-        // Return a tuple containing C++ data and the Python __dict__
-        auto cpp_states = nb::make_tuple(c.getMaxBeamWidth(), c.getSchedulerConfig(), c.getKvCacheConfig(),
-            c.getEnableChunkedContext(), c.getNormalizeLogProbs(), c.getIterStatsMaxIterations(),
-            c.getRequestStatsMaxIterations(), c.getBatchingType(), c.getMaxBatchSize(), c.getMaxNumTokens(),
-            c.getParallelConfig(), c.getPeftCacheConfig(), c.getLogitsPostProcessorConfig(), c.getDecodingConfig(),
-            c.getUseGpuDirectStorage(), c.getGpuWeightsPercent(), c.getMaxQueueSize(),
-            c.getExtendedRuntimePerfKnobConfig(), c.getDebugConfig(), c.getRecvPollPeriodMs(),
-            c.getMaxSeqIdleMicroseconds(), c.getSpecDecConfig(), c.getGuidedDecodingConfig(),
-            c.getAdditionalModelOutputs(), c.getCacheTransceiverConfig(), c.getGatherGenerationLogits(),
-            c.getPromptTableOffloading(), c.getEnableTrtOverlap());
-        auto pickle_tuple = nb::make_tuple(cpp_states, nb::getattr(self, "__dict__"));
-        return pickle_tuple;
-    };
-
-    auto executorConfigSetState = [](nb::object self, nb::tuple const& state)
-    {
-        if (state.size() != 2)
-        {
-            throw std::runtime_error("Invalid state!");
-        }
-
-        auto cpp_states = nb::cast<nb::tuple>(state[0]);
-        if (cpp_states.size() != 28)
-        {
-            throw std::runtime_error("Invalid cpp_states!");
-        }
-
-        // Restore C++ data
-        tle::ExecutorConfig* cpp_self = nb::inst_ptr<tle::ExecutorConfig>(self);
-        new (cpp_self) tle::ExecutorConfig(                                          //
-            nb::cast<SizeType32>(cpp_states[0]),                                     // MaxBeamWidth
-            nb::cast<tle::SchedulerConfig>(cpp_states[1]),                           // SchedulerConfig
-            nb::cast<tle::KvCacheConfig>(cpp_states[2]),                             // KvCacheConfig
-            nb::cast<bool>(cpp_states[3]),                                           // EnableChunkedContext
-            nb::cast<bool>(cpp_states[4]),                                           // NormalizeLogProbs
-            nb::cast<SizeType32>(cpp_states[5]),                                     // IterStatsMaxIterations
-            nb::cast<SizeType32>(cpp_states[6]),                                     // RequestStatsMaxIterations
-            nb::cast<tle::BatchingType>(cpp_states[7]),                              // BatchingType
-            nb::cast<std::optional<SizeType32>>(cpp_states[8]),                      // MaxBatchSize
-            nb::cast<std::optional<SizeType32>>(cpp_states[9]),                      // MaxNumTokens
-            nb::cast<std::optional<tle::ParallelConfig>>(cpp_states[10]),            // ParallelConfig
-            nb::cast<std::optional<tle::PeftCacheConfig>>(cpp_states[11]),           // PeftCacheConfig
-            nb::cast<std::optional<tle::LogitsPostProcessorConfig>>(cpp_states[12]), // LogitsPostProcessorConfig
-            nb::cast<std::optional<tle::DecodingConfig>>(cpp_states[13]),            // DecodingConfig
-            nb::cast<bool>(cpp_states[14]),                                          // UseGpuDirectStorage
-            nb::cast<float>(cpp_states[15]),                                         // GpuWeightsPercent
-            nb::cast<std::optional<SizeType32>>(cpp_states[16]),                     // MaxQueueSize
-            nb::cast<tle::ExtendedRuntimePerfKnobConfig>(cpp_states[17]),            // ExtendedRuntimePerfKnobConfig
-            nb::cast<std::optional<tle::DebugConfig>>(cpp_states[18]),               // DebugConfig
-            nb::cast<SizeType32>(cpp_states[19]),                                    // RecvPollPeriodMs
-            nb::cast<uint64_t>(cpp_states[20]),                                      // MaxSeqIdleMicroseconds
-            nb::cast<std::optional<tle::SpeculativeDecodingConfig>>(cpp_states[21]), // SpecDecConfig
-            nb::cast<std::optional<tle::GuidedDecodingConfig>>(cpp_states[22]),      // GuidedDecodingConfig
-            nb::cast<std::optional<std::vector<tle::AdditionalModelOutput>>>(cpp_states[23]), // AdditionalModelOutputs
-            nb::cast<std::optional<tle::CacheTransceiverConfig>>(cpp_states[24]),             // CacheTransceiverConfig
-            nb::cast<bool>(cpp_states[25]),                                                   // GatherGenerationLogits
-            nb::cast<bool>(cpp_states[26]),                                                   // PromptTableOffloading
-            nb::cast<bool>(cpp_states[27])                                                    // EnableTrtOverlap
-        );
-
-        // Restore Python data
-        auto py_state = nb::cast<nb::dict>(state[1]);
-        self.attr("__dict__").attr("update")(py_state);
-
-        nb::inst_mark_ready(self);
-    };
-
-    nb::class_<tle::ExecutorConfig>(m, "ExecutorConfig", nb::dynamic_attr())
-        .def(nb::init<                                                   //
-                 SizeType32,                                             // MaxBeamWidth
-                 tle::SchedulerConfig const&,                            // SchedulerConfig
-                 tle::KvCacheConfig const&,                              // KvCacheConfig
-                 bool,                                                   // EnableChunkedContext
-                 bool,                                                   // NormalizeLogProbs
-                 SizeType32,                                             // IterStatsMaxIterations
-                 SizeType32,                                             // RequestStatsMaxIterations
-                 tle::BatchingType,                                      // BatchingType
-                 std::optional<SizeType32>,                              // MaxBatchSize
-                 std::optional<SizeType32>,                              // MaxNumTokens
-                 std::optional<tle::ParallelConfig>,                     // ParallelConfig
-                 tle::PeftCacheConfig const&,                            // PeftCacheConfig
-                 std::optional<tle::LogitsPostProcessorConfig>,          // LogitsPostProcessorConfig
-                 std::optional<tle::DecodingConfig>,                     // DecodingConfig
-                 bool,                                                   // UseGpuDirectStorage
-                 float,                                                  // GpuWeightsPercent
-                 std::optional<SizeType32>,                              // MaxQueueSize
-                 tle::ExtendedRuntimePerfKnobConfig const&,              // ExtendedRuntimePerfKnobConfig
-                 std::optional<tle::DebugConfig>,                        // DebugConfig
-                 SizeType32,                                             // RecvPollPeriodMs
-                 uint64_t,                                               // MaxSeqIdleMicroseconds
-                 std::optional<tle::SpeculativeDecodingConfig>,          // SpecDecConfig
-                 std::optional<tle::GuidedDecodingConfig>,               // GuidedDecodingConfig
-                 std::optional<std::vector<tle::AdditionalModelOutput>>, // AdditionalModelOutputs
-                 std::optional<tle::CacheTransceiverConfig>,             // CacheTransceiverConfig
-                 bool,                                                   // GatherGenerationLogits
-                 bool,                                                   // PromptTableOffloading
-                 bool                                                    // EnableTrtOverlap
-                 >(),
-            nb::arg("max_beam_width") = 1, nb::arg("scheduler_config") = tle::SchedulerConfig(),
-            nb::arg("kv_cache_config") = tle::KvCacheConfig(), nb::arg("enable_chunked_context") = false,
-            nb::arg("normalize_log_probs") = true,
-            nb::arg("iter_stats_max_iterations") = tle::ExecutorConfig::kDefaultIterStatsMaxIterations,
-            nb::arg("request_stats_max_iterations") = tle::ExecutorConfig::kDefaultRequestStatsMaxIterations,
-            nb::arg("batching_type") = tle::BatchingType::kINFLIGHT, nb::arg("max_batch_size") = nb::none(),
-            nb::arg("max_num_tokens") = nb::none(), nb::arg("parallel_config") = nb::none(),
-            nb::arg("peft_cache_config") = tle::PeftCacheConfig(), nb::arg("logits_post_processor_config") = nb::none(),
-            nb::arg("decoding_config") = nb::none(), nb::arg("use_gpu_direct_storage") = false,
-            nb::arg("gpu_weights_percent") = 1.0, nb::arg("max_queue_size") = nb::none(),
-            nb::arg("extended_runtime_perf_knob_config") = tle::ExtendedRuntimePerfKnobConfig(),
-            nb::arg("debug_config") = nb::none(), nb::arg("recv_poll_period_ms") = 0,
-            nb::arg("max_seq_idle_microseconds") = tle::ExecutorConfig::kDefaultMaxSeqIdleMicroseconds,
-            nb::arg("spec_dec_config") = nb::none(), nb::arg("guided_decoding_config") = nb::none(),
-            nb::arg("additional_model_outputs") = nb::none(), nb::arg("cache_transceiver_config") = nb::none(),
-            nb::arg("gather_generation_logits") = false, nb::arg("mm_embedding_offloading") = false,
-            nb::arg("enable_trt_overlap") = false)
-        .def_prop_rw("max_beam_width", &tle::ExecutorConfig::getMaxBeamWidth, &tle::ExecutorConfig::setMaxBeamWidth)
-        .def_prop_rw("max_batch_size", &tle::ExecutorConfig::getMaxBatchSize, &tle::ExecutorConfig::setMaxBatchSize)
-        .def_prop_rw("max_num_tokens", &tle::ExecutorConfig::getMaxNumTokens, &tle::ExecutorConfig::setMaxNumTokens)
-        .def_prop_rw(
-            "scheduler_config", &tle::ExecutorConfig::getSchedulerConfigRef, &tle::ExecutorConfig::setSchedulerConfig)
-        .def_prop_rw(
-            "kv_cache_config", &tle::ExecutorConfig::getKvCacheConfigRef, &tle::ExecutorConfig::setKvCacheConfig)
-        .def_prop_rw("enable_chunked_context", &tle::ExecutorConfig::getEnableChunkedContext,
-            &tle::ExecutorConfig::setEnableChunkedContext)
-        .def_prop_rw("normalize_log_probs", &tle::ExecutorConfig::getNormalizeLogProbs,
-            &tle::ExecutorConfig::setNormalizeLogProbs)
-        .def_prop_rw("iter_stats_max_iterations", &tle::ExecutorConfig::getIterStatsMaxIterations,
-            &tle::ExecutorConfig::setIterStatsMaxIterations)
-        .def_prop_rw("request_stats_max_iterations", &tle::ExecutorConfig::getRequestStatsMaxIterations,
-            &tle::ExecutorConfig::setRequestStatsMaxIterations)
-        .def_prop_rw("batching_type", &tle::ExecutorConfig::getBatchingType, &tle::ExecutorConfig::setBatchingType)
-        .def_prop_rw(
-            "parallel_config", &tle::ExecutorConfig::getParallelConfig, &tle::ExecutorConfig::setParallelConfig)
-        .def_prop_rw(
-            "peft_cache_config", &tle::ExecutorConfig::getPeftCacheConfig, &tle::ExecutorConfig::setPeftCacheConfig)
-        .def_prop_rw("logits_post_processor_config", &tle::ExecutorConfig::getLogitsPostProcessorConfig,
-            &tle::ExecutorConfig::setLogitsPostProcessorConfig)
-        .def_prop_rw(
-            "decoding_config", &tle::ExecutorConfig::getDecodingConfig, &tle::ExecutorConfig::setDecodingConfig)
-        .def_prop_rw("use_gpu_direct_storage", &tle::ExecutorConfig::getUseGpuDirectStorage,
-            &tle::ExecutorConfig::setUseGpuDirectStorage)
-        .def_prop_rw("gpu_weights_percent", &tle::ExecutorConfig::getGpuWeightsPercent,
-            &tle::ExecutorConfig::setGpuWeightsPercent)
-        .def_prop_rw("max_queue_size", &tle::ExecutorConfig::getMaxQueueSize, &tle::ExecutorConfig::setMaxQueueSize)
-        .def_prop_rw("extended_runtime_perf_knob_config", &tle::ExecutorConfig::getExtendedRuntimePerfKnobConfig,
-            &tle::ExecutorConfig::setExtendedRuntimePerfKnobConfig)
-        .def_prop_rw("debug_config", &tle::ExecutorConfig::getDebugConfig, &tle::ExecutorConfig::setDebugConfig)
-        .def_prop_rw(
-            "recv_poll_period_ms", &tle::ExecutorConfig::getRecvPollPeriodMs, &tle::ExecutorConfig::setRecvPollPeriodMs)
-        .def_prop_rw("max_seq_idle_microseconds", &tle::ExecutorConfig::getMaxSeqIdleMicroseconds,
-            &tle::ExecutorConfig::setMaxSeqIdleMicroseconds)
-        .def_prop_rw("spec_dec_config", &tle::ExecutorConfig::getSpecDecConfig, &tle::ExecutorConfig::setSpecDecConfig)
-        .def_prop_rw("guided_decoding_config", &tle::ExecutorConfig::getGuidedDecodingConfig,
-            &tle::ExecutorConfig::setGuidedDecodingConfig)
-        .def_prop_rw("additional_model_outputs", &tle::ExecutorConfig::getAdditionalModelOutputs,
-            &tle::ExecutorConfig::setAdditionalModelOutputs)
-        .def_prop_rw("cache_transceiver_config", &tle::ExecutorConfig::getCacheTransceiverConfig,
-            &tle::ExecutorConfig::setCacheTransceiverConfig)
-        .def_prop_rw("gather_generation_logits", &tle::ExecutorConfig::getGatherGenerationLogits,
-            &tle::ExecutorConfig::setGatherGenerationLogits)
-        .def_prop_rw("mm_embedding_offloading", &tle::ExecutorConfig::getPromptTableOffloading,
-            &tle::ExecutorConfig::setPromptTableOffloading)
-        .def_prop_rw(
-            "enable_trt_overlap", &tle::ExecutorConfig::getEnableTrtOverlap, &tle::ExecutorConfig::setEnableTrtOverlap)
-        .def("__getstate__", executorConfigGetState)
-        .def("__setstate__", executorConfigSetState);
-}
-
-} // namespace tensorrt_llm::nanobind::executor
diff --git a/cpp/tensorrt_llm/nanobind/executor/executorConfig.h b/cpp/tensorrt_llm/nanobind/executor/executorConfig.h
deleted file mode 100644
index 5b63e7c5a3e..00000000000
--- a/cpp/tensorrt_llm/nanobind/executor/executorConfig.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <nanobind/nanobind.h>
-
-namespace nb = nanobind;
-
-namespace tensorrt_llm::nanobind::executor
-{
-
-// Register bindings for executor API.
-void initConfigBindings(nb::module_& m);
-
-} // namespace tensorrt_llm::nanobind::executor
diff --git a/cpp/tensorrt_llm/nanobind/executor/request.cpp b/cpp/tensorrt_llm/nanobind/executor/request.cpp
deleted file mode 100644
index 9c3d34aa8fd..00000000000
--- a/cpp/tensorrt_llm/nanobind/executor/request.cpp
+++ /dev/null
@@ -1,935 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "request.h"
-#include "tensorrt_llm/common/assert.h"
-#include "tensorrt_llm/common/logger.h"
-#include "tensorrt_llm/executor/executor.h"
-#include "tensorrt_llm/executor/serializeUtils.h"
-#include "tensorrt_llm/executor/tensor.h"
-#include "tensorrt_llm/executor/types.h"
-#include "tensorrt_llm/nanobind/common/customCasters.h"
-#include "tensorrt_llm/runtime/cudaStream.h"
-
-#include <nanobind/nanobind.h>
-#include <nanobind/stl/chrono.h>
-#include <nanobind/stl/list.h>
-#include <nanobind/stl/optional.h>
-#include <nanobind/stl/shared_ptr.h>
-#include <nanobind/stl/string.h>
-#include <nanobind/stl/tuple.h>
-#include <nanobind/stl/vector.h>
-#include <sstream>
-
-#include <optional>
-#include <vector>
-
-namespace nb = nanobind;
-namespace tle = tensorrt_llm::executor;
-using Tensor = tle::Tensor;
-using SizeType32 = tle::SizeType32;
-using FloatType = tle::FloatType;
-using VecTokens = tle::VecTokens;
-using IdType = tle::IdType;
-using VecTokenExtraIds = tle::VecTokenExtraIds;
-
-namespace tensorrt_llm::nanobind::executor
-{
-
-void initRequestBindings(nb::module_& m)
-{
-    nb::enum_<tle::RequestType>(m, "RequestType")
-        .value("REQUEST_TYPE_CONTEXT_AND_GENERATION", tle::RequestType::REQUEST_TYPE_CONTEXT_AND_GENERATION)
-        .value("REQUEST_TYPE_CONTEXT_ONLY", tle::RequestType::REQUEST_TYPE_CONTEXT_ONLY)
-        .value("REQUEST_TYPE_GENERATION_ONLY", tle::RequestType::REQUEST_TYPE_GENERATION_ONLY);
-
-    nb::enum_<tle::FinishReason>(m, "FinishReason")
-        .value("NOT_FINISHED", tle::FinishReason::kNOT_FINISHED)
-        .value("END_ID", tle::FinishReason::kEND_ID)
-        .value("STOP_WORDS", tle::FinishReason::kSTOP_WORDS)
-        .value("LENGTH", tle::FinishReason::kLENGTH)
-        .value("TIMED_OUT", tle::FinishReason::kTIMED_OUT)
-        .value("CANCELLED", tle::FinishReason::kCANCELLED);
-
-    nb::enum_<tle::KvCacheTransferMode>(m, "KvCacheTransferMode")
-        .value("DRAM", tle::KvCacheTransferMode::DRAM)
-        .value("GDS", tle::KvCacheTransferMode::GDS)
-        .value("POSIX_DEBUG_FALLBACK", tle::KvCacheTransferMode::POSIX_DEBUG_FALLBACK);
-
-    auto samplingConfigGetstate = [](tle::SamplingConfig const& self)
-    {
-        return nb::make_tuple(self.getBeamWidth(), self.getTopK(), self.getTopP(), self.getTopPMin(),
-            self.getTopPResetIds(), self.getTopPDecay(), self.getSeed(), self.getTemperature(), self.getMinTokens(),
-            self.getBeamSearchDiversityRate(), self.getRepetitionPenalty(), self.getPresencePenalty(),
-            self.getFrequencyPenalty(), self.getLengthPenalty(), self.getEarlyStopping(), self.getNoRepeatNgramSize(),
-            self.getNumReturnSequences(), self.getMinP(), self.getBeamWidthArray());
-    };
-    auto samplingConfigSetstate = [](tle::SamplingConfig& samplingConfig, nb::tuple const& state)
-    {
-        if (state.size() != 19)
-        {
-            throw std::runtime_error("Invalid SamplingConfig state!");
-        }
-        new (&samplingConfig) tle::SamplingConfig(nb::cast<SizeType32>(state[0]), // BeamWidth
-            nb::cast<std::optional<SizeType32>>(state[1]),                        // TopK
-            nb::cast<std::optional<FloatType>>(state[2]),                         // TopP
-            nb::cast<std::optional<FloatType>>(state[3]),                         // TopPMin
-            nb::cast<std::optional<tle::TokenIdType>>(state[4]),                  // TopPResetIds
-            nb::cast<std::optional<FloatType>>(state[5]),                         // TopPDecay
-            nb::cast<std::optional<tle::RandomSeedType>>(state[6]),               // Seed
-            nb::cast<std::optional<FloatType>>(state[7]),                         // Temperature
-            nb::cast<std::optional<SizeType32>>(state[8]),                        // MinTokens
-            nb::cast<std::optional<FloatType>>(state[9]),                         // BeamSearchDiversityRate
-            nb::cast<std::optional<FloatType>>(state[10]),                        // RepetitionPenalty
-            nb::cast<std::optional<FloatType>>(state[11]),                        // PresencePenalty
-            nb::cast<std::optional<FloatType>>(state[12]),                        // FrequencyPenalty
-            nb::cast<std::optional<FloatType>>(state[13]),                        // LengthPenalty
-            nb::cast<std::optional<SizeType32>>(state[14]),                       // EarlyStopping
-            nb::cast<std::optional<SizeType32>>(state[15]),                       // NoRepeatNgramSize
-            nb::cast<std::optional<SizeType32>>(state[16]),                       // NumReturnSequences
-            nb::cast<std::optional<FloatType>>(state[17]),                        // MinP
-            nb::cast<std::optional<std::vector<SizeType32>>>(state[18])           // BeamWidthArray
-        );
-    };
-    nb::class_<tle::SamplingConfig>(m, "SamplingConfig")
-        .def(nb::init<tle::SizeType32,
-                 std::optional<tle::SizeType32> const&,             // beamWidth
-                 std::optional<tle::FloatType> const&,              // topP
-                 std::optional<tle::FloatType> const&,              // topPMin
-                 std::optional<tle::TokenIdType> const&,            // topPResetIds
-                 std::optional<tle::FloatType> const&,              // topPDecay
-                 std::optional<tle::RandomSeedType> const&,         // seed
-                 std::optional<tle::FloatType> const&,              // temperature
-                 std::optional<tle::SizeType32> const&,             // minTokens
-                 std::optional<tle::FloatType> const&,              // beamSearchDiversityRate
-                 std::optional<tle::FloatType> const&,              // repetitionPenalty
-                 std::optional<tle::FloatType> const&,              // presencePenalty
-                 std::optional<tle::FloatType> const&,              // frequencyPenalty
-                 std::optional<tle::FloatType> const&,              // lengthPenalty
-                 std::optional<tle::SizeType32> const&,             // earlyStopping
-                 std::optional<tle::SizeType32> const&,             // noRepeatNgramSize
-                 std::optional<tle::SizeType32> const&,             // numReturnSequences
-                 std::optional<tle::FloatType> const&,              // minP
-                 std::optional<std::vector<tle::SizeType32>> const& // beamWidthArray
-                 >(),
-            // clang-format off
-            nb::arg("beam_width") = 1,
-            nb::kw_only(),
-            nb::arg("top_k") = nb::none(),
-            nb::arg("top_p") = nb::none(),
-            nb::arg("top_p_min") = nb::none(),
-            nb::arg("top_p_reset_ids") = nb::none(),
-            nb::arg("top_p_decay") = nb::none(),
-            nb::arg("seed") = nb::none(),
-            nb::arg("temperature") = nb::none(),
-            nb::arg("min_tokens") = nb::none(),
-            nb::arg("beam_search_diversity_rate") = nb::none(),
-            nb::arg("repetition_penalty") = nb::none(),
-            nb::arg("presence_penalty") = nb::none(),
-            nb::arg("frequency_penalty") = nb::none(),
-            nb::arg("length_penalty") = nb::none(),
-            nb::arg("early_stopping") = nb::none(),
-            nb::arg("no_repeat_ngram_size") = nb::none(),
-            nb::arg("num_return_sequences") = nb::none(),
-            nb::arg("min_p") = nb::none(),
-            nb::arg("beam_width_array") = nb::none())               // clang-format on
-        .def_prop_rw("beam_width", &tle::SamplingConfig::getBeamWidth, &tle::SamplingConfig::setBeamWidth)
-        .def_prop_rw("top_k", &tle::SamplingConfig::getTopK, &tle::SamplingConfig::setTopK)
-        .def_prop_rw("top_p", &tle::SamplingConfig::getTopP, &tle::SamplingConfig::setTopP)
-        .def_prop_rw("top_p_min", &tle::SamplingConfig::getTopPMin, &tle::SamplingConfig::setTopPMin)
-        .def_prop_rw("top_p_reset_ids", &tle::SamplingConfig::getTopPResetIds, &tle::SamplingConfig::setTopPResetIds)
-        .def_prop_rw("top_p_decay", &tle::SamplingConfig::getTopPDecay, &tle::SamplingConfig::setTopPDecay)
-        .def_prop_rw("seed", &tle::SamplingConfig::getSeed, &tle::SamplingConfig::setSeed)
-        .def_prop_rw("temperature", &tle::SamplingConfig::getTemperature, &tle::SamplingConfig::setTemperature)
-        .def_prop_rw("min_tokens", &tle::SamplingConfig::getMinTokens, &tle::SamplingConfig::setMinTokens)
-        .def_prop_rw("beam_search_diversity_rate", &tle::SamplingConfig::getBeamSearchDiversityRate,
-            &tle::SamplingConfig::setBeamSearchDiversityRate)
-        .def_prop_rw("repetition_penalty", &tle::SamplingConfig::getRepetitionPenalty,
-            &tle::SamplingConfig::setRepetitionPenalty)
-        .def_prop_rw("presence_penalty", &tle::SamplingConfig::getPresencePenalty,
-            [](tle::SamplingConfig& self, std::optional<FloatType> v) { self.setPresencePenalty(v); })
-        .def_prop_rw(
-            "frequency_penalty", &tle::SamplingConfig::getFrequencyPenalty, &tle::SamplingConfig::setFrequencyPenalty)
-        .def_prop_rw("length_penalty", &tle::SamplingConfig::getLengthPenalty, &tle::SamplingConfig::setLengthPenalty)
-        .def_prop_rw("early_stopping", &tle::SamplingConfig::getEarlyStopping, &tle::SamplingConfig::setEarlyStopping)
-        .def_prop_rw("no_repeat_ngram_size", &tle::SamplingConfig::getNoRepeatNgramSize,
-            &tle::SamplingConfig::setNoRepeatNgramSize)
-        .def_prop_rw("num_return_sequences", &tle::SamplingConfig::getNumReturnSequences,
-            &tle::SamplingConfig::setNumReturnSequences)
-        .def_prop_rw("min_p", &tle::SamplingConfig::getMinP, &tle::SamplingConfig::setMinP)
-        .def_prop_rw(
-            "beam_width_array", &tle::SamplingConfig::getBeamWidthArray, &tle::SamplingConfig::setBeamWidthArray)
-        .def("__getstate__", samplingConfigGetstate)
-        .def("__setstate__", samplingConfigSetstate);
-
-    auto additionalModelOutputGetstate
-        = [](tle::AdditionalModelOutput const& self) { return nb::make_tuple(self.name, self.gatherContext); };
-    auto additionalModelOutputSetstate = [](tle::AdditionalModelOutput& additionalModelOutput, nb::tuple const& state)
-    {
-        if (state.size() != 2)
-        {
-            throw std::runtime_error("Invalid AdditionalModelOutput state!");
-        }
-        new (&additionalModelOutput)
-            tle::AdditionalModelOutput(nb::cast<std::string>(state[0]), nb::cast<bool>(state[1]));
-    };
-    nb::class_<tle::AdditionalModelOutput>(m, "AdditionalModelOutput")
-        .def(nb::init<std::string, bool>(), nb::arg("name"), nb::arg("gather_context") = false)
-        .def_rw("name", &tle::AdditionalModelOutput::name)
-        .def_rw("gather_context", &tle::AdditionalModelOutput::gatherContext)
-        .def("__getstate__", additionalModelOutputGetstate)
-        .def("__setstate__", additionalModelOutputSetstate);
-
-    auto outputConfigGetstate = [](tle::OutputConfig const& self)
-    {
-        return nb::make_tuple(self.returnLogProbs, self.returnContextLogits, self.returnGenerationLogits,
-            self.excludeInputFromOutput, self.returnEncoderOutput, self.returnPerfMetrics, self.additionalModelOutputs);
-    };
-    auto outputConfigSetstate = [](tle::OutputConfig& outputConfig, nb::tuple const& state)
-    {
-        if (state.size() != 7)
-        {
-            throw std::runtime_error("Invalid OutputConfig state!");
-        }
-        new (&outputConfig) tle::OutputConfig(nb::cast<bool>(state[0]), nb::cast<bool>(state[1]),
-            nb::cast<bool>(state[2]), nb::cast<bool>(state[3]), nb::cast<bool>(state[4]), nb::cast<bool>(state[5]),
-            nb::cast<std::optional<std::vector<tle::AdditionalModelOutput>>>(state[6]));
-    };
-    nb::class_<tle::OutputConfig>(m, "OutputConfig")
-        .def(nb::init<bool, bool, bool, bool, bool, bool, std::optional<std::vector<tle::AdditionalModelOutput>>>(),
-            nb::arg("return_log_probs").none() = false, nb::arg("return_context_logits") = false,
-            nb::arg("return_generation_logits") = false, nb::arg("exclude_input_from_output") = false,
-            nb::arg("return_encoder_output") = false, nb::arg("return_perf_metrics") = false,
-            nb::arg("additional_model_outputs") = nb::none())
-        .def_rw("return_log_probs", &tle::OutputConfig::returnLogProbs)
-        .def_rw("return_context_logits", &tle::OutputConfig::returnContextLogits)
-        .def_rw("return_generation_logits", &tle::OutputConfig::returnGenerationLogits)
-        .def_rw("exclude_input_from_output", &tle::OutputConfig::excludeInputFromOutput)
-        .def_rw("return_encoder_output", &tle::OutputConfig::returnEncoderOutput)
-        .def_rw("return_perf_metrics", &tle::OutputConfig::returnPerfMetrics)
-        .def_rw("additional_model_outputs", &tle::OutputConfig::additionalModelOutputs)
-        .def("__getstate__", outputConfigGetstate)
-        .def("__setstate__", outputConfigSetstate);
-
-    auto externalDraftTokensConfigGetstate = [](tle::ExternalDraftTokensConfig const& self)
-    { return nb::make_tuple(self.getTokens(), self.getLogits(), self.getAcceptanceThreshold()); };
-    auto externalDraftTokensConfigSetstate
-        = [](tle::ExternalDraftTokensConfig& externalDraftTokensConfig, nb::tuple const& state)
-    {
-        if (state.size() != 3)
-        {
-            throw std::runtime_error("Invalid ExternalDraftTokensConfig state!");
-        }
-        new (&externalDraftTokensConfig) tle::ExternalDraftTokensConfig(nb::cast<VecTokens>(state[0]),
-            nb::cast<std::optional<Tensor>>(state[1]), nb::cast<std::optional<FloatType>>(state[2]));
-    };
-    nb::class_<tle::ExternalDraftTokensConfig>(m, "ExternalDraftTokensConfig")
-        .def(nb::init<VecTokens, std::optional<Tensor>, std::optional<FloatType> const&, std::optional<bool>>(),
-            nb::arg("tokens"), nb::arg("logits") = nb::none(), nb::arg("acceptance_threshold") = nb::none(),
-            nb::arg("fast_logits") = nb::none())
-        .def_prop_ro("tokens", &tle::ExternalDraftTokensConfig::getTokens)
-        .def_prop_ro("logits", &tle::ExternalDraftTokensConfig::getLogits)
-        .def_prop_ro("acceptance_threshold", &tle::ExternalDraftTokensConfig::getAcceptanceThreshold)
-        .def("__getstate__", externalDraftTokensConfigGetstate)
-        .def("__setstate__", externalDraftTokensConfigSetstate)
-        .def_prop_ro("fast_logits", &tle::ExternalDraftTokensConfig::getFastLogits);
-
-    auto promptTuningConfigGetstate = [](tle::PromptTuningConfig const& self)
-    { return nb::make_tuple(self.getEmbeddingTable(), self.getInputTokenExtraIds()); };
-    auto promptTuningConfigSetstate = [](tle::PromptTuningConfig& promptTuningConfig, nb::tuple const& state)
-    {
-        if (state.size() != 2)
-        {
-            throw std::runtime_error("Invalid PromptTuningConfig state!");
-        }
-        new (&promptTuningConfig)
-            tle::PromptTuningConfig(nb::cast<Tensor>(state[0]), nb::cast<std::optional<VecTokenExtraIds>>(state[1]));
-    };
-    nb::class_<tle::PromptTuningConfig>(m, "PromptTuningConfig")
-        .def(nb::init<Tensor, std::optional<VecTokenExtraIds>>(), nb::arg("embedding_table"),
-            nb::arg("input_token_extra_ids") = nb::none())
-        .def_prop_ro("embedding_table", &tle::PromptTuningConfig::getEmbeddingTable)
-        .def_prop_ro("input_token_extra_ids", &tle::PromptTuningConfig::getInputTokenExtraIds)
-        .def("__getstate__", promptTuningConfigGetstate)
-        .def("__setstate__", promptTuningConfigSetstate);
-
-    auto loraConfigGetstate = [](tle::LoraConfig const& self)
-    { return nb::make_tuple(self.getTaskId(), self.getWeights(), self.getConfig()); };
-    auto loraConfigSetstate = [](tle::LoraConfig& loraConfig, nb::tuple const& state)
-    {
-        if (state.size() != 3)
-        {
-            throw std::runtime_error("Invalid LoraConfig state!");
-        }
-        new (&loraConfig) tle::LoraConfig(nb::cast<IdType>(state[0]), nb::cast<std::optional<Tensor>>(state[1]),
-            nb::cast<std::optional<Tensor>>(state[2]));
-    };
-    nb::class_<tle::LoraConfig>(m, "LoraConfig")
-        .def(nb::init<uint64_t, std::optional<Tensor>, std::optional<Tensor>>(), nb::arg("task_id"),
-            nb::arg("weights") = nb::none(), nb::arg("config") = nb::none())
-        .def_prop_ro("task_id", &tle::LoraConfig::getTaskId)
-        .def_prop_ro("weights", &tle::LoraConfig::getWeights)
-        .def_prop_ro("config", &tle::LoraConfig::getConfig)
-        .def("__getstate__", loraConfigGetstate)
-        .def("__setstate__", loraConfigSetstate);
-
-    auto multimodalInputGetstate = [](tle::MultimodalInput const& self)
-    { return nb::make_tuple(self.getMultimodalHashes(), self.getMultimodalPositions(), self.getMultimodalLengths()); };
-    auto multimodalInputSetstate = [](tle::MultimodalInput& multimodalInput, nb::tuple const& state)
-    {
-        if (state.size() != 3)
-        {
-            throw std::runtime_error("Invalid MultimodalInput state!");
-        }
-        new (&multimodalInput) tle::MultimodalInput(nb::cast<std::vector<std::vector<SizeType32>>>(state[0]),
-            nb::cast<std::vector<SizeType32>>(state[1]), nb::cast<std::vector<SizeType32>>(state[2]));
-    };
-    nb::class_<tle::MultimodalInput>(m, "MultimodalInput")
-        .def(nb::init<std::vector<std::vector<SizeType32>>, std::vector<SizeType32>, std::vector<SizeType32>>(),
-            nb::arg("multimodal_hashes"), nb::arg("multimodal_positions"), nb::arg("multimodal_lengths"))
-        .def_prop_ro("multimodal_hashes", &tle::MultimodalInput::getMultimodalHashes)
-        .def_prop_ro("multimodal_positions", &tle::MultimodalInput::getMultimodalPositions)
-        .def_prop_ro("multimodal_lengths", &tle::MultimodalInput::getMultimodalLengths)
-        .def("__getstate__", multimodalInputGetstate)
-        .def("__setstate__", multimodalInputSetstate);
-
-    auto MropeConfigGetstate = [](tle::MropeConfig const& self)
-    { return nb::make_tuple(self.getMRopeRotaryCosSin(), self.getMRopePositionDeltas()); };
-    auto MropeConfigSetstate = [](tle::MropeConfig& mropeConfig, nb::tuple const& state)
-    {
-        if (state.size() != 2)
-        {
-            throw std::runtime_error("Invalid MropeConfig state!");
-        }
-        new (&mropeConfig) tle::MropeConfig(nb::cast<tle::Tensor>(state[0]), nb::cast<SizeType32>(state[1]));
-    };
-    nb::class_<tle::MropeConfig>(m, "MropeConfig")
-        .def(nb::init<Tensor, SizeType32>(), nb::arg("mrope_rotary_cos_sin"), nb::arg("mrope_position_deltas"))
-        .def_prop_ro("mrope_rotary_cos_sin", &tle::MropeConfig::getMRopeRotaryCosSin)
-        .def_prop_ro("mrope_position_deltas", &tle::MropeConfig::getMRopePositionDeltas)
-        .def("__getstate__", MropeConfigGetstate)
-        .def("__setstate__", MropeConfigSetstate);
-
-    auto lookaheadDecodingConfigGetstate = [](tle::LookaheadDecodingConfig const& self)
-    { return nb::make_tuple(self.getWindowSize(), self.getNgramSize(), self.getVerificationSetSize()); };
-    auto lookaheadDecodingConfigSetstate
-        = [](tle::LookaheadDecodingConfig& lookaheadDecodingConfig, nb::tuple const& state)
-    {
-        if (state.size() != 3)
-        {
-            throw std::runtime_error("Invalid LookaheadDecodingConfig state!");
-        }
-        new (&lookaheadDecodingConfig) tle::LookaheadDecodingConfig(
-            nb::cast<SizeType32>(state[0]), nb::cast<SizeType32>(state[1]), nb::cast<SizeType32>(state[2]));
-    };
-    nb::class_<tle::LookaheadDecodingConfig>(m, "LookaheadDecodingConfig")
-        .def(nb::init<SizeType32, SizeType32, SizeType32>(), nb::arg("max_window_size"), nb::arg("max_ngram_size"),
-            nb::arg("max_verification_set_size"))
-        .def_prop_ro("max_window_size", &tle::LookaheadDecodingConfig::getWindowSize)
-        .def_prop_ro("max_ngram_size", &tle::LookaheadDecodingConfig::getNgramSize)
-        .def_prop_ro("max_verification_set_size", &tle::LookaheadDecodingConfig::getVerificationSetSize)
-        .def("calculate_speculative_resource", &tle::LookaheadDecodingConfig::calculateSpeculativeResource)
-        .def_static(
-            "calculate_speculative_resource_tuple", &tle::LookaheadDecodingConfig::calculateSpeculativeResourceTuple)
-        .def("__getstate__", lookaheadDecodingConfigGetstate)
-        .def("__setstate__", lookaheadDecodingConfigSetstate)
-        .def_static("get_default_lookahead_decoding_window",
-            []() { return tle::LookaheadDecodingConfig::kDefaultLookaheadDecodingWindow; })
-        .def_static("get_default_lookahead_decoding_ngram",
-            []() { return tle::LookaheadDecodingConfig::kDefaultLookaheadDecodingNgram; })
-        .def_static("get_default_lookahead_decoding_verification_set",
-            []() { return tle::LookaheadDecodingConfig::kDefaultLookaheadDecodingVerificationSet; });
-
-    auto TokenRangeRetentionConfigGetstate = [](tle::KvCacheRetentionConfig::TokenRangeRetentionConfig const& self)
-    { return nb::make_tuple(self.tokenStart, self.tokenEnd, self.priority, self.durationMs); };
-    auto TokenRangeRetentionConfigSetstate
-        = [](tle::KvCacheRetentionConfig::TokenRangeRetentionConfig& tokenRangeRetentionConfig, nb::tuple const& state)
-    {
-        if (state.size() != 4)
-        {
-            throw std::runtime_error("Invalid state!");
-        }
-        new (&tokenRangeRetentionConfig) tle::KvCacheRetentionConfig::TokenRangeRetentionConfig(
-            nb::cast<SizeType32>(state[0]), nb::cast<std::optional<SizeType32>>(state[1]),
-            nb::cast<tle::RetentionPriority>(state[2]), nb::cast<std::optional<std::chrono::milliseconds>>(state[3]));
-    };
-    auto kvCacheRetentionConfigGetstate = [](tle::KvCacheRetentionConfig const& self)
-    {
-        return nb::make_tuple(self.getTokenRangeRetentionConfigs(), self.getDecodeRetentionPriority(),
-            self.getDecodeDurationMs(), self.getTransferMode(), self.getDirectory());
-    };
-    auto kvCacheRetentionConfigSetstate
-        = [](tle::KvCacheRetentionConfig& kvCacheRetentionConfig, nb::tuple const& state)
-    {
-        if (state.size() != 5)
-        {
-            throw std::runtime_error("Invalid state!");
-        }
-        new (&kvCacheRetentionConfig) tle::KvCacheRetentionConfig(
-            nb::cast<std::vector<tle::KvCacheRetentionConfig::TokenRangeRetentionConfig>>(state[0]),
-            nb::cast<tle::RetentionPriority>(state[1]), nb::cast<std::optional<std::chrono::milliseconds>>(state[2]),
-            nb::cast<tle::KvCacheTransferMode>(state[3]), nb::cast<std::optional<std::string>>(state[4]));
-    };
-
-    auto kvCacheRetentionConfig = nb::class_<tle::KvCacheRetentionConfig>(m, "KvCacheRetentionConfig");
-
-    nb::class_<tle::KvCacheRetentionConfig::TokenRangeRetentionConfig>(
-        kvCacheRetentionConfig, "TokenRangeRetentionConfig")
-        .def(nb::init<SizeType32, std::optional<SizeType32>, tle::RetentionPriority,
-                 std::optional<std::chrono::milliseconds>>(),
-            nb::arg("token_start"), nb::arg("token_end"), nb::arg("priority"), nb::arg("duration_ms") = nb::none())
-        .def_rw("token_start", &tle::KvCacheRetentionConfig::TokenRangeRetentionConfig::tokenStart)
-        .def_rw("token_end", &tle::KvCacheRetentionConfig::TokenRangeRetentionConfig::tokenEnd)
-        .def_rw("priority", &tle::KvCacheRetentionConfig::TokenRangeRetentionConfig::priority)
-        .def_rw("duration_ms", &tle::KvCacheRetentionConfig::TokenRangeRetentionConfig::durationMs)
-        .def("__getstate__", TokenRangeRetentionConfigGetstate)
-        .def("__setstate__", TokenRangeRetentionConfigSetstate)
-        .def("__eq__", &tle::KvCacheRetentionConfig::TokenRangeRetentionConfig::operator==);
-
-    // There's a circular dependency between the declaration of the TokenRangeRetentionPriority and
-    // KvCacheRetentionConfig bindings. Defer definition of the KvCacheRetentionConfig bindings until the
-    // TokenRangeRetentionPriority bindings have been defined.
-    kvCacheRetentionConfig
-        .def(nb::init<std::vector<tle::KvCacheRetentionConfig::TokenRangeRetentionConfig>, tle::RetentionPriority,
-                 std::optional<std::chrono::milliseconds>, tle::KvCacheTransferMode, std::optional<std::string>>(),
-            nb::arg("token_range_retention_configs"),
-            nb::arg("decode_retention_priority") = tle::KvCacheRetentionConfig::kDefaultRetentionPriority,
-            nb::arg("decode_duration_ms") = nb::none(), nb::arg("transfer_mode") = tle::KvCacheTransferMode::DRAM,
-            nb::arg("directory") = nb::none())
-        .def_prop_ro("token_range_retention_configs", &tle::KvCacheRetentionConfig::getTokenRangeRetentionConfigs)
-        .def_prop_ro("decode_retention_priority", &tle::KvCacheRetentionConfig::getDecodeRetentionPriority)
-        .def_prop_ro("decode_duration_ms", &tle::KvCacheRetentionConfig::getDecodeDurationMs)
-        .def_prop_ro("transfer_mode", &tle::KvCacheRetentionConfig::getTransferMode)
-        .def_prop_ro("directory", &tle::KvCacheRetentionConfig::getDirectory)
-        .def("__getstate__", kvCacheRetentionConfigGetstate)
-        .def("__setstate__", kvCacheRetentionConfigSetstate)
-        .def("__eq__", &tle::KvCacheRetentionConfig::operator==);
-
-    auto ContextPhaseParamsGetState = [](tle::ContextPhaseParams const& self)
-    {
-        if (self.getState() != nullptr)
-        {
-            auto serializedState = self.getSerializedState();
-            return nb::make_tuple(self.getFirstGenTokens(), self.getReqId(),
-                nb::bytes(serializedState.data(), serializedState.size()), self.getDraftTokens());
-        }
-        return nb::make_tuple(self.getFirstGenTokens(), self.getReqId(), nb::none(), self.getDraftTokens());
-    };
-
-    auto ContextPhaseParamsSetState = [](tle::ContextPhaseParams& contextPhaseParams, nb::tuple const& state)
-    {
-        if (state.size() != 4)
-        {
-            throw std::runtime_error("Invalid ContextPhaseParams state!");
-        }
-        if (!state[2].is_none())
-        {
-            auto opaque_state = nb::cast<nb::bytes>(state[2]);
-            auto opaque_state_str_view = std::string_view(opaque_state.c_str(), opaque_state.size());
-            new (&contextPhaseParams) tle::ContextPhaseParams(nb::cast<VecTokens>(state[0]),
-                nb::cast<tle::ContextPhaseParams::RequestIdType>(state[1]),
-                std::vector<char>(opaque_state_str_view.begin(), opaque_state_str_view.end()),
-                nb::cast<std::optional<VecTokens>>(state[3]));
-        }
-        new (&contextPhaseParams) tle::ContextPhaseParams(nb::cast<VecTokens>(state[0]),
-            nb::cast<tle::ContextPhaseParams::RequestIdType>(state[1]), nb::cast<std::optional<VecTokens>>(state[3]));
-    };
-
-    nb::class_<tle::ContextPhaseParams>(m, "ContextPhaseParams")
-        .def("__init__",
-            [](tle::ContextPhaseParams const& self, VecTokens const& first_gen_tokens,
-                tle::ContextPhaseParams::RequestIdType req_id, std::optional<nb::bytes> const& opaque_state,
-                std::optional<VecTokens> const& draft_tokens)
-            {
-                if (opaque_state)
-                {
-                    auto opaque_state_str_view
-                        = std::string_view(opaque_state.value().c_str(), opaque_state.value().size());
-                    return std::make_unique<tle::ContextPhaseParams>(first_gen_tokens, req_id,
-                        std::vector<char>(opaque_state_str_view.begin(), opaque_state_str_view.end()), draft_tokens);
-                }
-                return std::make_unique<tle::ContextPhaseParams>(first_gen_tokens, req_id, draft_tokens);
-            })
-        .def_prop_ro("first_gen_tokens", [](tle::ContextPhaseParams const& self) { return self.getFirstGenTokens(); })
-        .def_prop_ro("draft_tokens", [](tle::ContextPhaseParams const& self) { return self.getDraftTokens(); })
-        .def_prop_ro("req_id", &tle::ContextPhaseParams::getReqId)
-        .def_prop_ro("opaque_state",
-            [](tle::ContextPhaseParams const& self)
-            {
-                std::optional<nb::bytes> opaque_state{std::nullopt};
-                if (self.getState() != nullptr)
-                {
-                    auto serializedState = self.getSerializedState();
-                    opaque_state = nb::bytes(serializedState.data(), serializedState.size());
-                }
-                return opaque_state;
-            })
-        .def("__getstate__", ContextPhaseParamsGetState)
-        .def("__setstate__", ContextPhaseParamsSetState);
-
-    auto EagleDecodingConfigGetstate = [](tle::EagleConfig const& self)
-    {
-        return nb::make_tuple(self.getEagleChoices(), self.isGreedySampling(), self.getPosteriorThreshold(),
-            self.useDynamicTree(), self.getDynamicTreeMaxTopK());
-    };
-    auto EagleDecodingConfigSetstate = [](tle::EagleConfig& eagleConfig, nb::tuple const& state)
-    {
-        if (state.size() != 5)
-        {
-            throw std::runtime_error("Invalid EagleConfig state!");
-        }
-        new (&eagleConfig) tle::EagleConfig(nb::cast<std::optional<tle::EagleChoices>>(state[0]),
-            nb::cast<bool>(state[1]), nb::cast<std::optional<float>>(state[2]), nb::cast<bool>(state[3]),
-            nb::cast<std::optional<SizeType32>>(state[4]));
-    };
-    nb::class_<tle::EagleConfig>(m, "EagleConfig")
-        .def(nb::init<std::optional<tle::EagleChoices>, bool, std::optional<float>, bool, std::optional<SizeType32>>(),
-            nb::arg("eagle_choices") = nb::none(), nb::arg("greedy_sampling") = true,
-            nb::arg("posterior_threshold") = nb::none(), nb::arg("use_dynamic_tree") = false,
-            nb::arg("dynamic_tree_max_topK") = nb::none())
-        .def_prop_ro("eagle_choices", &tle::EagleConfig::getEagleChoices)
-        .def_prop_ro("greedy_sampling", &tle::EagleConfig::isGreedySampling)
-        .def_prop_ro("posterior_threshold", &tle::EagleConfig::getPosteriorThreshold)
-        .def_prop_ro("use_dynamic_tree", &tle::EagleConfig::useDynamicTree)
-        .def_prop_ro("dynamic_tree_max_topK", &tle::EagleConfig::getDynamicTreeMaxTopK)
-        .def("__getstate__", EagleDecodingConfigGetstate)
-        .def("__setstate__", EagleDecodingConfigSetstate);
-
-    // Guided decoding params
-    auto pyGuidedDecodingParams = nb::class_<tle::GuidedDecodingParams>(m, "GuidedDecodingParams");
-
-    nb::enum_<tle::GuidedDecodingParams::GuideType>(pyGuidedDecodingParams, "GuideType")
-        .value("JSON", tle::GuidedDecodingParams::GuideType::kJSON)
-        .value("JSON_SCHEMA", tle::GuidedDecodingParams::GuideType::kJSON_SCHEMA)
-        .value("REGEX", tle::GuidedDecodingParams::GuideType::kREGEX)
-        .value("EBNF_GRAMMAR", tle::GuidedDecodingParams::GuideType::kEBNF_GRAMMAR)
-        .value("STRUCTURAL_TAG", tle::GuidedDecodingParams::GuideType::kSTRUCTURAL_TAG);
-
-    auto guidedDecodingParamsGetstate
-        = [](tle::GuidedDecodingParams const& self) { return nb::make_tuple(self.getGuideType(), self.getGuide()); };
-
-    auto guidedDecodingParamsSetstate = [](tle::GuidedDecodingParams& guidedDecodingParams, nb::tuple const& state)
-    {
-        if (state.size() != 2)
-        {
-            throw std::runtime_error("Invalid GuidedDecodingParams state!");
-        }
-        new (&guidedDecodingParams) tle::GuidedDecodingParams(
-            nb::cast<tle::GuidedDecodingParams::GuideType>(state[0]), nb::cast<std::optional<std::string>>(state[1]));
-    };
-
-    pyGuidedDecodingParams
-        .def(nb::init<tle::GuidedDecodingParams::GuideType, std::optional<std::string>>(), nb::arg("guide_type"),
-            nb::arg("guide") = nb::none())
-        .def_prop_ro("guide_type", &tle::GuidedDecodingParams::getGuideType)
-        .def_prop_ro("guide", &tle::GuidedDecodingParams::getGuide)
-        .def("__getstate__", guidedDecodingParamsGetstate)
-        .def("__setstate__", guidedDecodingParamsSetstate);
-
-    auto requestGetstate = [](tle::Request const& self)
-    {
-        return nb::make_tuple(self.getInputTokenIds(), self.getMaxTokens(), self.getStreaming(),
-            self.getSamplingConfig(), self.getOutputConfig(), self.getEndId(), self.getPadId(), self.getPositionIds(),
-            self.getBadWords(), self.getStopWords(), self.getEmbeddingBias(), self.getExternalDraftTokensConfig(),
-            self.getPromptTuningConfig(), self.getMultimodalInput(), self.getMultimodalEmbedding(),
-            self.getMropeConfig(), self.getLoraConfig(), self.getLookaheadConfig(), self.getKvCacheRetentionConfig(),
-            self.getLogitsPostProcessorName(), self.getLogitsPostProcessor(), self.getEncoderInputTokenIds(),
-            self.getClientId(), self.getReturnAllGeneratedTokens(), self.getPriority(), self.getRequestType(),
-            self.getContextPhaseParams(), self.getEncoderInputFeatures(), self.getEncoderOutputLength(),
-            self.getCrossAttentionMask(), self.getEagleConfig(), self.getSkipCrossAttnBlocks(),
-            self.getGuidedDecodingParams());
-    };
-    auto requestSetstate = [](tle::Request& request, nb::tuple const& state)
-    {
-        if (state.size() != 33)
-        {
-            throw std::runtime_error("Invalid Request state!");
-        }
-        new (&request) tle::Request(nb::cast<VecTokens>(state[0]), nb::cast<SizeType32>(state[1]),
-            nb::cast<bool>(state[2]), nb::cast<tle::SamplingConfig>(state[3]), nb::cast<tle::OutputConfig>(state[4]),
-            nb::cast<std::optional<SizeType32>>(state[5]), nb::cast<std::optional<SizeType32>>(state[6]),
-            nb::cast<std::optional<std::vector<SizeType32>>>(state[7]),
-            nb::cast<std::optional<std::list<VecTokens>>>(state[8]),
-            nb::cast<std::optional<std::list<VecTokens>>>(state[9]), nb::cast<std::optional<Tensor>>(state[10]),
-            nb::cast<std::optional<tle::ExternalDraftTokensConfig>>(state[11]),
-            nb::cast<std::optional<tle::PromptTuningConfig>>(state[12]),
-            nb::cast<std::optional<tle::MultimodalInput>>(state[13]), nb::cast<std::optional<Tensor>>(state[14]),
-            nb::cast<std::optional<tle::MropeConfig>>(state[15]), nb::cast<std::optional<tle::LoraConfig>>(state[16]),
-            nb::cast<std::optional<tle::LookaheadDecodingConfig>>(state[17]),
-            nb::cast<std::optional<tle::KvCacheRetentionConfig>>(state[18]),
-            nb::cast<std::optional<std::string>>(state[19]),
-            nb::cast<std::optional<tle::LogitsPostProcessor>>(state[20]), nb::cast<std::optional<VecTokens>>(state[21]),
-            nb::cast<std::optional<IdType>>(state[22]), nb::cast<bool>(state[23]),
-            nb::cast<tle::PriorityType>(state[24]), nb::cast<tle::RequestType>(state[25]),
-            nb::cast<std::optional<tle::ContextPhaseParams>>(state[26]),
-            nb::cast<std::optional<tle::Tensor>>(state[27]), nb::cast<std::optional<SizeType32>>(state[28]),
-            nb::cast<std::optional<tle::Tensor>>(state[29]), 1, nb::cast<std::optional<tle::EagleConfig>>(state[30]),
-            nb::cast<std::optional<tle::Tensor>>(state[31]),
-            nb::cast<std::optional<tle::GuidedDecodingParams>>(state[32]));
-    };
-
-    nb::class_<tle::Request> request(m, "Request", nb::dynamic_attr());
-    request
-        .def(nb::init<tle::VecTokens,                           // inputTokenIds
-                 tle::SizeType32,                               // maxTokens
-                 bool,                                          // streaming
-                 tle::SamplingConfig const&,                    // samplingConfig
-                 tle::OutputConfig const&,                      // outputConfig
-                 std::optional<tle::SizeType32> const&,         // endId
-                 std::optional<tle::SizeType32> const&,         // padId
-                 std::optional<std::vector<SizeType32>>,        // positionIds
-                 std::optional<std::list<tle::VecTokens>>,      // badWords
-                 std::optional<std::list<tle::VecTokens>>,      // stopWords
-                 std::optional<tle::Tensor>,                    // embeddingBias
-                 std::optional<tle::ExternalDraftTokensConfig>, // externalDraftTokensConfig
-                 std::optional<tle::PromptTuningConfig>,        // pTuningConfig
-                 std::optional<tle::MultimodalInput>,           // multimodalInput
-                 std::optional<tle::Tensor>,                    // multimodalEmbedding
-                 std::optional<tle::MropeConfig>,               // mRopeConfig
-                 std::optional<tle::LoraConfig>,                // loraConfig
-                 std::optional<tle::LookaheadDecodingConfig>,   // lookaheadConfig
-                 std::optional<tle::KvCacheRetentionConfig>,    // kvCacheRetentionConfig
-                 std::optional<std::string>,                    // logitsPostProcessorName
-                 std::optional<tle::LogitsPostProcessor>,       // logitsPostProcessor
-                 std::optional<tle::VecTokens>,                 // encoderInputTokenIds
-                 std::optional<tle::IdType>,                    // clientId
-                 bool,                                          // returnAllGeneratedTokens
-                 tle::PriorityType,                             // priority
-                 tle::RequestType,                              // type
-                 std::optional<tle::ContextPhaseParams>,        // contextPhaseParams
-                 std::optional<tle::Tensor>,                    // encoderInputFeatures
-                 std::optional<tle::SizeType32>,                // encoderOutputLength
-                 std::optional<tle::Tensor>,                    // crossAttentionMask
-                 SizeType32,                                    // numReturnSequences
-                 std::optional<tle::EagleConfig>,               // eagleConfig
-                 std::optional<tle::Tensor>,                    // skipCrossAttnBlocks
-                 std::optional<tle::GuidedDecodingParams>,      // guidedDecodingParams
-                 std::optional<tle::SizeType32>,                // languageAdapterUid
-                 std::optional<tle::MillisecondsType>           // allottedTimeMs
-                 >(),
-            // clang-format off
-        nb::arg("input_token_ids"),
-        nb::arg("max_tokens"),
-        nb::kw_only(),
-        nb::arg("streaming") = false,
-        nb::arg("sampling_config") = tle::SamplingConfig(),
-        nb::arg("output_config") = tle::OutputConfig(),
-        nb::arg("end_id") = nb::none(),
-        nb::arg("pad_id") = nb::none(),
-        nb::arg("position_ids") = nb::none(),
-        nb::arg("bad_words") = nb::none(),
-        nb::arg("stop_words") = nb::none(),
-        nb::arg("embedding_bias") = nb::none(),
-        nb::arg("external_draft_tokens_config") = nb::none(),
-        nb::arg("prompt_tuning_config") = nb::none(),
-        nb::arg("multimodal_input") = nb::none(),
-        nb::arg("multimodal_embedding") = nb::none(),
-        nb::arg("mrope_config") = nb::none(),
-        nb::arg("lora_config") = nb::none(),
-        nb::arg("lookahead_config") = nb::none(),
-        nb::arg("kv_cache_retention_config") = nb::none(),
-        nb::arg("logits_post_processor_name") = nb::none(),
-        nb::arg("logits_post_processor") = nb::none(),
-        nb::arg("encoder_input_token_ids") = nb::none(),
-        nb::arg("client_id") = nb::none(),
-        nb::arg("return_all_generated_tokens") = false,
-        nb::arg("priority") = tle::Request::kDefaultPriority,
-        nb::arg("type") = tle::RequestType::REQUEST_TYPE_CONTEXT_AND_GENERATION,
-        nb::arg("context_phase_params") = nb::none(),
-        nb::arg("encoder_input_features") = nb::none(),
-        nb::arg("encoder_output_length") = nb::none(),
-        nb::arg("cross_attention_mask") = nb::none(),
-        nb::arg("num_return_sequences") = 1,
-        nb::arg("eagle_config") = nb::none(),
-        nb::arg("skip_cross_attn_blocks") = nb::none(),
-        nb::arg("guided_decoding_params") = nb::none(),
-        nb::arg("language_adapter_uid") = nb::none(),
-        nb::arg("allotted_time_ms") = nb::none()
-    )          // clang-format on
-        .def_prop_ro("input_token_ids", &tle::Request::getInputTokenIds)
-        .def_prop_ro("max_tokens", &tle::Request::getMaxTokens)
-        .def_prop_rw("streaming", &tle::Request::getStreaming, &tle::Request::setStreaming)
-        .def_prop_rw("sampling_config", &tle::Request::getSamplingConfig, &tle::Request::setSamplingConfig)
-        .def_prop_rw("output_config", &tle::Request::getOutputConfig, &tle::Request::setOutputConfig)
-        .def_prop_rw("end_id", &tle::Request::getEndId, &tle::Request::setEndId)
-        .def_prop_rw("pad_id", &tle::Request::getPadId, &tle::Request::setPadId)
-        .def_prop_rw("position_ids", &tle::Request::getPositionIds, &tle::Request::setPositionIds)
-        .def_prop_rw("bad_words", &tle::Request::getBadWords, &tle::Request::setBadWords)
-        .def_prop_rw("stop_words", &tle::Request::getStopWords, &tle::Request::setStopWords)
-        .def_prop_rw("embedding_bias", &tle::Request::getEmbeddingBias, &tle::Request::setEmbeddingBias)
-        .def_prop_rw("external_draft_tokens_config", &tle::Request::getExternalDraftTokensConfig,
-            &tle::Request::setExternalDraftTokensConfig)
-        .def_prop_rw("prompt_tuning_config", &tle::Request::getPromptTuningConfig, &tle::Request::setPromptTuningConfig)
-        .def_prop_rw("multimodal_input", &tle::Request::getMultimodalInput, &tle::Request::setMultimodalInput)
-        .def_prop_rw(
-            "multimodal_embedding", &tle::Request::getMultimodalEmbedding, &tle::Request::setMultimodalEmbedding)
-        .def_prop_rw("mrope_config", &tle::Request::getMropeConfig, &tle::Request::setMropeConfig)
-        .def_prop_rw("lora_config", &tle::Request::getLoraConfig, &tle::Request::setLoraConfig)
-        .def_prop_rw("lookahead_config", &tle::Request::getLookaheadConfig, &tle::Request::setLookaheadConfig)
-        .def_prop_rw("kv_cache_retention_config", &tle::Request::getKvCacheRetentionConfig,
-            &tle::Request::setKvCacheRetentionConfig)
-        .def_prop_rw("logits_post_processor_name", &tle::Request::getLogitsPostProcessorName,
-            &tle::Request::setLogitsPostProcessorName)
-        .def_prop_rw(
-            "logits_post_processor", &tle::Request::getLogitsPostProcessor, &tle::Request::setLogitsPostProcessor)
-        .def_prop_rw(
-            "encoder_input_token_ids", &tle::Request::getEncoderInputTokenIds, &tle::Request::setEncoderInputTokenIds)
-        .def_prop_rw("client_id", &tle::Request::getClientId, &tle::Request::setClientId)
-        .def_prop_rw("return_all_generated_tokens", &tle::Request::getReturnAllGeneratedTokens,
-            &tle::Request::setReturnAllGeneratedTokens)
-        .def_prop_rw("request_type", &tle::Request::getRequestType, &tle::Request::setRequestType)
-        .def_prop_rw(
-            "encoder_input_features", &tle::Request::getEncoderInputFeatures, &tle::Request::setEncoderInputFeatures)
-        .def_prop_rw("cross_attention_mask", &tle::Request::getCrossAttentionMask, &tle::Request::setCrossAttentionMask)
-        .def_prop_rw("eagle_config", &tle::Request::getEagleConfig, &tle::Request::setEagleConfig)
-        .def_prop_rw(
-            "skip_cross_attn_blocks", &tle::Request::getSkipCrossAttnBlocks, &tle::Request::setSkipCrossAttnBlocks)
-        .def_prop_rw(
-            "guided_decoding_params", &tle::Request::getGuidedDecodingParams, &tle::Request::setGuidedDecodingParams)
-        .def_prop_rw("allotted_time_ms", &tle::Request::getAllottedTimeMs, &tle::Request::setAllottedTimeMs)
-        .def_prop_rw("context_phase_params", &tle::Request::getContextPhaseParams, &tle::Request::setContextPhaseParams)
-        .def("__getstate__", requestGetstate)
-        .def("__setstate__", requestSetstate);
-    request.attr("BATCHED_POST_PROCESSOR_NAME") = tle::Request::kBatchedPostProcessorName;
-
-    nb::class_<tle::SpeculativeDecodingFastLogitsInfo>(m, "SpeculativeDecodingFastLogitsInfo")
-        .def(nb::init<>())
-        .def_rw("draft_request_id", &tle::SpeculativeDecodingFastLogitsInfo::draftRequestId)
-        .def_rw("draft_participant_id", &tle::SpeculativeDecodingFastLogitsInfo::draftParticipantId)
-        .def("to_tensor", &tle::SpeculativeDecodingFastLogitsInfo::toTensor);
-
-    auto requestPerfMetrics = nb::class_<tle::RequestPerfMetrics>(m, "RequestPerfMetrics");
-
-    auto timingMetricsGetstate = [](tle::RequestPerfMetrics::TimingMetrics const& self)
-    {
-        return nb::make_tuple(self.arrivalTime, self.firstScheduledTime, self.firstTokenTime, self.lastTokenTime,
-            self.kvCacheTransferStart, self.kvCacheTransferEnd, self.kvCacheSize);
-    };
-    auto timingMetricsSetstate = [](tle::RequestPerfMetrics::TimingMetrics& timingMetrics, nb::tuple const& state)
-    {
-        if (state.size() != 7)
-        {
-            throw std::runtime_error("Invalid TimingMetrics state!");
-        }
-        new (&timingMetrics)
-            tle::RequestPerfMetrics::TimingMetrics{nb::cast<tle::RequestPerfMetrics::TimePoint>(state[0]),
-                nb::cast<tle::RequestPerfMetrics::TimePoint>(state[1]),
-                nb::cast<tle::RequestPerfMetrics::TimePoint>(state[2]),
-                nb::cast<tle::RequestPerfMetrics::TimePoint>(state[3]),
-                nb::cast<tle::RequestPerfMetrics::TimePoint>(state[4]),
-                nb::cast<tle::RequestPerfMetrics::TimePoint>(state[5]), nb::cast<size_t>(state[6])};
-    };
-    nb::class_<tle::RequestPerfMetrics::TimingMetrics>(m, "TimingMetrics")
-        .def(nb::init<>())
-        .def_rw("arrival_time", &tle::RequestPerfMetrics::TimingMetrics::arrivalTime)
-        .def_rw("first_scheduled_time", &tle::RequestPerfMetrics::TimingMetrics::firstScheduledTime)
-        .def_rw("first_token_time", &tle::RequestPerfMetrics::TimingMetrics::firstTokenTime)
-        .def_rw("last_token_time", &tle::RequestPerfMetrics::TimingMetrics::lastTokenTime)
-        .def_rw("kv_cache_transfer_start", &tle::RequestPerfMetrics::TimingMetrics::kvCacheTransferStart)
-        .def_rw("kv_cache_transfer_end", &tle::RequestPerfMetrics::TimingMetrics::kvCacheTransferEnd)
-        .def_rw("kv_cache_size", &tle::RequestPerfMetrics::TimingMetrics::kvCacheSize)
-        .def("__getstate__", timingMetricsGetstate)
-        .def("__setstate__", timingMetricsSetstate);
-
-    auto kvCacheMetricsGetstate = [](tle::RequestPerfMetrics::KvCacheMetrics const& self)
-    {
-        return nb::make_tuple(self.numTotalAllocatedBlocks, self.numNewAllocatedBlocks, self.numReusedBlocks,
-            self.numMissedBlocks, self.kvCacheHitRate);
-    };
-    auto kvCacheMetricsSetstate = [](tle::RequestPerfMetrics::KvCacheMetrics& kvCacheMetrics, nb::tuple const& state)
-    {
-        if (state.size() != 5)
-        {
-            throw std::runtime_error("Invalid KvCacheMetrics state!");
-        }
-        new (&kvCacheMetrics)
-            tle::RequestPerfMetrics::KvCacheMetrics{nb::cast<SizeType32>(state[0]), nb::cast<SizeType32>(state[1]),
-                nb::cast<SizeType32>(state[2]), nb::cast<SizeType32>(state[3]), nb::cast<float>(state[4])};
-    };
-    nb::class_<tle::RequestPerfMetrics::KvCacheMetrics>(m, "KvCacheMetrics")
-        .def(nb::init<>())
-        .def_rw("num_total_allocated_blocks", &tle::RequestPerfMetrics::KvCacheMetrics::numTotalAllocatedBlocks)
-        .def_rw("num_new_allocated_blocks", &tle::RequestPerfMetrics::KvCacheMetrics::numNewAllocatedBlocks)
-        .def_rw("num_reused_blocks", &tle::RequestPerfMetrics::KvCacheMetrics::numReusedBlocks)
-        .def_rw("num_missed_blocks", &tle::RequestPerfMetrics::KvCacheMetrics::numMissedBlocks)
-        .def_rw("kv_cache_hit_rate", &tle::RequestPerfMetrics::KvCacheMetrics::kvCacheHitRate)
-        .def("__getstate__", kvCacheMetricsGetstate)
-        .def("__setstate__", kvCacheMetricsSetstate);
-
-    auto speculativeDecodingMetricsGetstate = [](tle::RequestPerfMetrics::SpeculativeDecodingMetrics const& self)
-    { return nb::make_tuple(self.acceptanceRate, self.totalAcceptedDraftTokens, self.totalDraftTokens); };
-    auto speculativeDecodingMetricsSetstate
-        = [](tle::RequestPerfMetrics::SpeculativeDecodingMetrics& speculativeDecodingMetrics, nb::tuple const& state)
-    {
-        if (state.size() != 3)
-        {
-            throw std::runtime_error("Invalid SpeculativeDecodingMetrics state!");
-        }
-        new (&speculativeDecodingMetrics) tle::RequestPerfMetrics::SpeculativeDecodingMetrics{
-            nb::cast<float>(state[0]), nb::cast<SizeType32>(state[1]), nb::cast<SizeType32>(state[2])};
-    };
-
-    nb::class_<tle::RequestPerfMetrics::SpeculativeDecodingMetrics>(m, "SpeculativeDecodingMetrics")
-        .def(nb::init<>())
-        .def_rw("acceptance_rate", &tle::RequestPerfMetrics::SpeculativeDecodingMetrics::acceptanceRate)
-        .def_rw("total_accepted_draft_tokens",
-            &tle::RequestPerfMetrics::SpeculativeDecodingMetrics::totalAcceptedDraftTokens)
-        .def_rw("total_draft_tokens", &tle::RequestPerfMetrics::SpeculativeDecodingMetrics::totalDraftTokens)
-        .def("__getstate__", speculativeDecodingMetricsGetstate)
-        .def("__setstate__", speculativeDecodingMetricsSetstate);
-
-    auto requestPerfMetricsGetstate = [](tle::RequestPerfMetrics const& self)
-    {
-        return nb::make_tuple(self.timingMetrics, self.kvCacheMetrics, self.speculativeDecoding, self.firstIter,
-            self.lastIter, self.iter);
-    };
-    auto requestPerfMetricsSetstate = [](tle::RequestPerfMetrics& requestPerfMetrics, nb::tuple const& state)
-    {
-        if (state.size() != 6)
-        {
-            throw std::runtime_error("Invalid RequestPerfMetrics state!");
-        }
-        new (&requestPerfMetrics) tle::RequestPerfMetrics{nb::cast<tle::RequestPerfMetrics::TimingMetrics>(state[0]),
-            nb::cast<tle::RequestPerfMetrics::KvCacheMetrics>(state[1]),
-            nb::cast<tle::RequestPerfMetrics::SpeculativeDecodingMetrics>(state[2]),
-            nb::cast<std::optional<tle::IterationType>>(state[3]),
-            nb::cast<std::optional<tle::IterationType>>(state[4]),
-            nb::cast<std::optional<tle::IterationType>>(state[5])};
-    };
-
-    // There's a circular dependency between the declaration of the TimingMetrics and RequestPerfMetrics bindings.
-    // Defer definition of the RequestPerfMetrics bindings until the TimingMetrics have been defined.
-    requestPerfMetrics.def(nb::init<>())
-        .def_rw("timing_metrics", &tle::RequestPerfMetrics::timingMetrics)
-        .def_rw("kv_cache_metrics", &tle::RequestPerfMetrics::kvCacheMetrics)
-        .def_rw("speculative_decoding", &tle::RequestPerfMetrics::speculativeDecoding)
-        .def_rw("first_iter", &tle::RequestPerfMetrics::firstIter)
-        .def_rw("last_iter", &tle::RequestPerfMetrics::lastIter)
-        .def_rw("iter", &tle::RequestPerfMetrics::iter)
-        .def("__getstate__", requestPerfMetricsGetstate)
-        .def("__setstate__", requestPerfMetricsSetstate);
-
-    nb::class_<tle::AdditionalOutput>(m, "AdditionalOutput")
-        .def("__init__ ",
-            [](tle::AdditionalOutput const& self, std::string const& name, tle::Tensor const& output)
-            { return std::make_unique<tle::AdditionalOutput>(name, output); })
-        .def_rw("name", &tle::AdditionalOutput::name)
-        .def_rw("output", &tle::AdditionalOutput::output);
-
-    auto resultSetstate = [](tle::Result& result, nb::tuple const& state)
-    {
-        if (state.size() != 13)
-        {
-            throw std::runtime_error("Invalid Request state!");
-        }
-        new (&result) tle::Result();
-        result.isFinal = nb::cast<bool>(state[0]);
-        result.outputTokenIds = nb::cast<std::vector<VecTokens>>(state[1]);
-        result.cumLogProbs = nb::cast<std::optional<std::vector<float>>>(state[2]);
-        result.logProbs = nb::cast<std::optional<std::vector<std::vector<float>>>>(state[3]);
-        result.contextLogits = nb::cast<std::optional<Tensor>>(state[4]);
-        result.generationLogits = nb::cast<std::optional<Tensor>>(state[5]);
-        result.encoderOutput = nb::cast<std::optional<Tensor>>(state[6]);
-        result.finishReasons = nb::cast<std::vector<tle::FinishReason>>(state[7]);
-        result.sequenceIndex = nb::cast<SizeType32>(state[8]);
-        result.isSequenceFinal = nb::cast<bool>(state[9]);
-        result.decodingIter = nb::cast<SizeType32>(state[10]);
-        result.contextPhaseParams = nb::cast<std::optional<tle::ContextPhaseParams>>(state[11]);
-        result.requestPerfMetrics = nb::cast<std::optional<tle::RequestPerfMetrics>>(state[12]);
-    };
-
-    auto resultGetstate = [](tle::Result const& self)
-    {
-        return nb::make_tuple(self.isFinal, self.outputTokenIds, self.cumLogProbs, self.logProbs, self.contextLogits,
-            self.generationLogits, self.encoderOutput, self.finishReasons, self.sequenceIndex, self.isSequenceFinal,
-            self.decodingIter, self.contextPhaseParams, self.requestPerfMetrics);
-    };
-
-    nb::class_<tle::Result>(m, "Result")
-        .def(nb::init<>())
-        .def_rw("is_final", &tle::Result::isFinal)
-        .def_rw("output_token_ids", &tle::Result::outputTokenIds)
-        .def_rw("cum_log_probs", &tle::Result::cumLogProbs)
-        .def_rw("log_probs", &tle::Result::logProbs)
-        .def_rw("context_logits", &tle::Result::contextLogits)
-        .def_rw("generation_logits", &tle::Result::generationLogits)
-        .def_rw("spec_dec_fast_logits_info", &tle::Result::specDecFastLogitsInfo)
-        .def_rw("encoder_output", &tle::Result::encoderOutput)
-        .def_rw("finish_reasons", &tle::Result::finishReasons)
-        .def_rw("sequence_index", &tle::Result::sequenceIndex)
-        .def_rw("is_sequence_final", &tle::Result::isSequenceFinal)
-        .def_rw("decoding_iter", &tle::Result::decodingIter)
-        .def_rw("context_phase_params", &tle::Result::contextPhaseParams)
-        .def_rw("request_perf_metrics", &tle::Result::requestPerfMetrics)
-        .def_rw("additional_outputs", &tle::Result::additionalOutputs)
-        .def("__getstate__", resultGetstate)
-        .def("__setstate__", resultSetstate);
-
-    m.def("deserialize_result",
-        [](nb::bytes& x)
-        {
-            std::string str(x.c_str(), x.size());
-            std::istringstream is(str);
-            return tle::serialize_utils::deserialize<tle::Result>(is);
-        });
-
-    auto responseGetstate = [](tle::Response const& self)
-    { return nb::make_tuple(self.getRequestId(), self.getResult(), self.getClientId()); };
-
-    auto responseSetstate = [](tle::Response& response, nb::tuple const& state)
-    {
-        if (state.size() != 3)
-        {
-            throw std::runtime_error("Invalid Request state!");
-        }
-        new (&response) tle::Response(
-            nb::cast<SizeType32>(state[0]), nb::cast<tle::Result>(state[1]), nb::cast<SizeType32>(state[2]));
-    };
-
-    nb::class_<tle::Response>(m, "Response")
-        .def(nb::init<IdType, std::string, std::optional<IdType>>(), nb::arg("request_id"), nb::arg("error_msg"),
-            nb::arg("client_id") = std::nullopt)
-        .def(nb::init<IdType, tle::Result, std::optional<IdType>>(), nb::arg("request_id"), nb::arg("result"),
-            nb::arg("client_id") = std::nullopt)
-        .def_prop_ro("request_id", &tle::Response::getRequestId)
-        .def_prop_ro("client_id", &tle::Response::getClientId)
-        .def("has_error", &tle::Response::hasError)
-        .def_prop_ro("error_msg", &tle::Response::getErrorMsg)
-        .def_prop_ro("result", &tle::Response::getResult)
-        .def("clear_context_logits",
-            [](tle::Response& self)
-            {
-                if (!self.hasError())
-                {
-                    auto& result = const_cast<tle::Result&>(self.getResult());
-                    result.contextLogits.reset();
-                }
-            })
-        .def("clear_generation_logits",
-            [](tle::Response& self)
-            {
-                if (!self.hasError())
-                {
-                    auto& result = const_cast<tle::Result&>(self.getResult());
-                    result.generationLogits.reset();
-                }
-            })
-        .def("__getstate__", responseGetstate)
-        .def("__setstate__", responseSetstate);
-}
-
-} // namespace tensorrt_llm::nanobind::executor
diff --git a/cpp/tensorrt_llm/nanobind/executor/request.h b/cpp/tensorrt_llm/nanobind/executor/request.h
deleted file mode 100644
index 5a5cf9acbee..00000000000
--- a/cpp/tensorrt_llm/nanobind/executor/request.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <nanobind/nanobind.h>
-namespace nb = nanobind;
-
-namespace tensorrt_llm::nanobind::executor
-{
-
-// Register bindings for executor API.
-void initRequestBindings(nb::module_& m);
-
-} // namespace tensorrt_llm::nanobind::executor
diff --git a/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp b/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp
deleted file mode 100644
index f3be85bbbf2..00000000000
--- a/cpp/tensorrt_llm/nanobind/runtime/bindings.cpp
+++ /dev/null
@@ -1,388 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "bindings.h"
-#include "moeBindings.h"
-#include "tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.h"
-#include "tensorrt_llm/kernels/communicationKernels/customLowPrecisionAllReduceKernels.h"
-#include "tensorrt_llm/kernels/customAllReduceKernels.h"
-#include "tensorrt_llm/kernels/delayStream.h"
-#include "tensorrt_llm/nanobind/common/customCasters.h"
-#include "tensorrt_llm/runtime/cudaEvent.h"
-#include "tensorrt_llm/runtime/cudaStream.h"
-#include "tensorrt_llm/runtime/decoderState.h"
-#include "tensorrt_llm/runtime/decodingInput.h"
-#include "tensorrt_llm/runtime/decodingOutput.h"
-#include "tensorrt_llm/runtime/gptDecoder.h"
-#include "tensorrt_llm/runtime/gptDecoderBatched.h"
-#include "tensorrt_llm/runtime/iBuffer.h"
-#include "tensorrt_llm/runtime/iGptDecoderBatched.h"
-#include "tensorrt_llm/runtime/iTensor.h"
-#include "tensorrt_llm/runtime/ipcUtils.h"
-#include "tensorrt_llm/runtime/lookaheadBuffers.h"
-#include "tensorrt_llm/runtime/loraCache.h"
-#include "tensorrt_llm/runtime/mcastGPUBuffer.h"
-#include "tensorrt_llm/runtime/request.h"
-#include "tensorrt_llm/runtime/speculativeDecodingMode.h"
-#include "tensorrt_llm/runtime/tllmRuntime.h"
-#include "tensorrt_llm/runtime/torchView.h"
-
-#include <ATen/ATen.h>
-#include <c10/cuda/CUDAStream.h>
-#include <nanobind/stl/vector.h>
-
-#include <nanobind/nanobind.h>
-#include <nanobind/ndarray.h>
-#include <nanobind/operators.h>
-#include <nanobind/stl/bind_vector.h>
-#include <nanobind/stl/filesystem.h>
-#include <nanobind/stl/optional.h>
-#include <nanobind/stl/shared_ptr.h>
-#include <nanobind/stl/unique_ptr.h>
-#include <nanobind/trampoline.h>
-#include <torch/extension.h>
-namespace tr = tensorrt_llm::runtime;
-namespace te = tensorrt_llm::executor;
-
-class PyIGptDecoder : public tr::IGptDecoder
-{
-public:
-    NB_TRAMPOLINE(tr::IGptDecoder, 5);
-
-    void setup(tr::SamplingConfig const& samplingConfig, size_t batchSize,
-        tr::DecodingInput::TensorConstPtr const& batchSlots,
-        std::optional<tr::DecodingOutput> const& output = std::nullopt,
-        std::optional<nvinfer1::DataType> explicitDraftTokensDType = std::nullopt,
-        std::optional<std::vector<tr::ITensor::SharedConstPtr>> const& lookaheadPrompt = std::nullopt,
-        std::optional<std::vector<te::LookaheadDecodingConfig>> const& lookaheadAlgoConfigs = std::nullopt) override
-    {
-        NB_OVERRIDE_PURE(setup, samplingConfig, batchSize, batchSlots, output, explicitDraftTokensDType,
-            lookaheadPrompt, lookaheadAlgoConfigs);
-    }
-
-    void forwardAsync(tr::DecodingOutput& output, tr::DecodingInput const& input) override
-    {
-        NB_OVERRIDE_PURE(forwardAsync, output, input);
-    }
-
-    void forwardSync(tr::DecodingOutput& output, tr::DecodingInput const& input) override
-    {
-        NB_OVERRIDE_PURE(forwardSync, output, input);
-    }
-
-    tr::SamplingConfig const& getSamplingConfig() override
-    {
-        NB_OVERRIDE_PURE(getSamplingConfig);
-    }
-
-    void disableLookahead(std::optional<tr::SamplingConfig> const& samplingConfig, tr::SizeType32 batchSize,
-        tr::DecodingInput::TensorConstPtr batchSlots) override
-    {
-        NB_OVERRIDE_PURE(disableLookahead, samplingConfig, batchSize, batchSlots);
-    }
-};
-
-namespace tensorrt_llm::nanobind::runtime
-{
-
-void initBindings(nb::module_& m)
-{
-
-    nb::class_<tr::LoraCache::TaskLayerModuleConfig>(m, "TaskLayerModuleConfig")
-        .def(nb::init<>())
-        .def_rw("page_id", &tr::LoraCache::TaskLayerModuleConfig::pageId)
-        .def_rw("slot_idx", &tr::LoraCache::TaskLayerModuleConfig::slotIdx)
-        .def_rw("in_size", &tr::LoraCache::TaskLayerModuleConfig::inSize)
-        .def_rw("out_size", &tr::LoraCache::TaskLayerModuleConfig::outSize)
-        .def_rw("module_id", &tr::LoraCache::TaskLayerModuleConfig::moduleId)
-        .def_rw("layer_id", &tr::LoraCache::TaskLayerModuleConfig::layerId)
-        .def_rw("adapter_size", &tr::LoraCache::TaskLayerModuleConfig::adapterSize)
-        .def_rw("num_slots", &tr::LoraCache::TaskLayerModuleConfig::numSlots)
-        .def_rw("weights_in_pointer", &tr::LoraCache::TaskLayerModuleConfig::weightsInPointer)
-        .def_rw("weights_out_pointer", &tr::LoraCache::TaskLayerModuleConfig::weightsOutPointer)
-        .def_rw("scaling_vec_pointer", &tr::LoraCache::TaskLayerModuleConfig::scalingVecPointer)
-        .def(nb::self == nb::self);
-
-    nb::class_<tr::BufferManager>(m, "BufferManager")
-        .def(nb::init<tr::BufferManager::CudaStreamPtr, bool>(), nb::arg("stream"), nb::arg("trim_pool") = false)
-        .def_prop_ro("stream", &tr::BufferManager::getStream);
-
-    nb::class_<tr::TllmRuntime>(m, "TllmRuntime")
-        .def(
-            "__init__",
-            [](tr::TllmRuntime* self, std::filesystem::path engine_path, float gpu_weights_percent = 1.0f,
-                bool use_shape_inference = true)
-            {
-                // Using default logger by passing nullptr
-                new (self)
-                    tr::TllmRuntime(tr::RawEngine(engine_path), nullptr, gpu_weights_percent, use_shape_inference);
-            },
-            nb::arg("engine_path"), nb::arg("gpu_weights_percent") = 1.0f, nb::arg("use_shape_inference") = true)
-        .def(
-            "__init__",
-            [](tr::TllmRuntime* self, nb::ndarray<nb::numpy, uint8_t> engine_buffer, float gpu_weights_percent = 1.0f,
-                bool use_shape_inference = true)
-            {
-                if (engine_buffer.ndim() != 1)
-                    throw std::runtime_error("Expected 1-D array for engine buffer");
-                new (self) tr::TllmRuntime(tr::RawEngine(engine_buffer.data(), engine_buffer.size()), nullptr,
-                    gpu_weights_percent, use_shape_inference);
-            },
-            nb::arg("engine_buffer"), nb::arg("gpu_weights_percent") = 1.0f, nb::arg("use_shape_inference") = true)
-        .def_prop_ro("num_contexts", &tr::TllmRuntime::getNbContexts)
-        .def_prop_ro("num_profiles", &tr::TllmRuntime::getNbProfiles)
-        .def("get_opt_profile_id", &tr::TllmRuntime::getOptProfileId, nb::arg("num_tokens"), nb::arg("split_points"))
-        .def("clear_contexts", &tr::TllmRuntime::clearContexts)
-        .def("execute_context", &tr::TllmRuntime::executeContext, nb::arg("context_id"))
-        .def_prop_ro("stream_ptr", &tr::TllmRuntime::getStreamPtr)
-        .def_prop_ro("buffer_manager",
-            static_cast<tr::BufferManager& (tr::TllmRuntime::*) ()>(&tr::TllmRuntime::getBufferManager))
-        .def("set_layer_profiler", &tr::TllmRuntime::setLayerProfiler)
-        .def("has_layer_profiler", &tr::TllmRuntime::hasLayerProfiler, nb::arg("context_id"))
-        .def_prop_ro("layer_profiler_info", &tr::TllmRuntime::getLayerProfileInfo)
-        .def("report_to_profiler", &tr::TllmRuntime::reportToProfiler, nb::arg("context_id"))
-        .def_prop_ro("logits_dtype_from_engine",
-            [](tr::TllmRuntime& self) { return self.getEngine().getTensorDataType("logits"); });
-
-    nb::class_<tr::decoder_batch::Request>(m, "Request")
-        .def(nb::init<tr::decoder_batch::Request::TensorConstPtr, tr::SizeType32, std::optional<tr::SizeType32>,
-                 std::optional<tr::SizeType32>>(),
-            nb::arg("ids"), nb::arg("input_len"), nb::arg("max_new_tokens") = std::nullopt,
-            nb::arg("end_id") = std::nullopt)
-        .def_rw("ids", &tr::decoder_batch::Request::ids)
-        .def_rw("input_len", &tr::decoder_batch::Request::inputLen)
-        .def_rw("max_new_tokens", &tr::decoder_batch::Request::maxNewTokens)
-        .def_rw("end_id", &tr::decoder_batch::Request::endId)
-        .def_rw("draft_logits", &tr::decoder_batch::Request::draftLogits)
-        .def_rw("embedding_bias", &tr::decoder_batch::Request::embeddingBias)
-        .def_rw("bad_words_list", &tr::decoder_batch::Request::badWordsList)
-        .def_rw("stop_words_list", &tr::decoder_batch::Request::stopWordsList)
-        .def_rw("generated_tokens_per_engine_step", &tr::decoder_batch::Request::generatedTokensPerEngineStep)
-        .def_rw("medusa_paths", &tr::decoder_batch::Request::medusaPaths)
-        .def_rw("medusa_tree_ids", &tr::decoder_batch::Request::medusaTreeIds)
-        .def_rw("lookahead_runtime_config", &tr::decoder_batch::Request::lookaheadRuntimeConfig);
-    nb::bind_vector<std::vector<tr::decoder_batch::Request>>(m, "RequestVector");
-
-    nb::class_<tr::decoder_batch::Input>(m, "DecoderBatchInput")
-        .def(nb::init<std::vector<std::vector<tr::ITensor::SharedConstPtr>>, tr::SizeType32>(), nb::arg("logits"),
-            nb::arg("max_decoding_engine_tokens"))
-        .def(nb::init<std::vector<tr::ITensor::SharedConstPtr>>(), nb::arg("logits"))
-        .def_rw("logits", &tr::decoder_batch::Input::logits)
-        .def_rw("max_decoder_steps", &tr::decoder_batch::Input::maxDecoderSteps)
-        .def_rw("batch_slots", &tr::decoder_batch::Input::batchSlots);
-
-    nb::class_<tr::LookaheadDecodingBuffers>(m, "LookaheadDecodingBuffers")
-        .def(nb::init<tr::SizeType32, tr::SizeType32, tr::BufferManager const&>(), nb::arg("max_num_sequences"),
-            nb::arg("max_tokens_per_step"), nb::arg("buffer_manager"))
-        .def_rw("generation_lengths", &tr::LookaheadDecodingBuffers::generationLengths)
-        .def_rw("position_offsets", &tr::LookaheadDecodingBuffers::positionOffsets)
-        .def_rw("packed_masks", &tr::LookaheadDecodingBuffers::packedMasks)
-        .def_rw("position_ids", &tr::LookaheadDecodingBuffers::positionIds);
-
-    nb::class_<tr::ExplicitDraftTokensBuffers::Inputs>(m, "ExplicitDraftTokensBuffersInputs")
-        .def("create", &tr::ExplicitDraftTokensBuffers::Inputs::create, nb::arg("max_num_sequences"),
-            nb::arg("runtime"), nb::arg("model_config"), nb::arg("world_config"))
-        .def_rw("temperatures", &tr::ExplicitDraftTokensBuffers::Inputs::temperatures)
-        .def_rw("position_ids_base", &tr::ExplicitDraftTokensBuffers::Inputs::positionIdsBase)
-        .def_rw("generation_lengths", &tr::ExplicitDraftTokensBuffers::Inputs::generationLengths)
-        .def_rw("random_data_sample", &tr::ExplicitDraftTokensBuffers::Inputs::randomDataSample)
-        .def_rw("random_data_validation", &tr::ExplicitDraftTokensBuffers::Inputs::randomDataValidation)
-        .def_rw("draft_tokens", &tr::ExplicitDraftTokensBuffers::Inputs::draftTokens)
-        .def_rw("draft_indices", &tr::ExplicitDraftTokensBuffers::Inputs::draftIndices)
-        .def_rw("draft_probs", &tr::ExplicitDraftTokensBuffers::Inputs::draftProbs)
-        .def_rw("packed_masks", &tr::ExplicitDraftTokensBuffers::Inputs::packedMasks)
-        .def_rw("position_ids", &tr::ExplicitDraftTokensBuffers::Inputs::positionIds)
-        .def_rw("max_gen_length_host", &tr::ExplicitDraftTokensBuffers::Inputs::maxGenLengthHost)
-        .def_rw("generation_lengths_host", &tr::ExplicitDraftTokensBuffers::Inputs::generationLengthsHost);
-
-    nb::class_<tr::DecodingInput>(m, "DecodingInput");
-    nb::class_<tr::DecodingOutput>(m, "DecodingOutput");
-
-    nb::class_<tr::CudaEvent>(m, "CudaEvent")
-        .def(nb::init<unsigned int>(), nb::arg("flags") = cudaEventDisableTiming)
-        .def("synchronize", &tr::CudaEvent::synchronize);
-
-    nb::class_<tr::IGptDecoder, PyIGptDecoder>(m, "IGptDecoder")
-        .def(
-            "setup",
-            [](tr::IGptDecoder& self, tr::SamplingConfig const& samplingConfig, size_t batchSize,
-                at::Tensor const& batchSlots, std::optional<tr::DecodingOutput> const& output = std::nullopt,
-                std::optional<nvinfer1::DataType> explicitDraftTokensDType = std::nullopt,
-                std::optional<std::vector<tr::ITensor::SharedConstPtr>> const& lookaheadPrompt = std::nullopt,
-                std::optional<std::vector<te::LookaheadDecodingConfig>> const& lookaheadAlgoConfigs = std::nullopt)
-            {
-                auto tensorPtrBatchSlots = tr::TorchView::of(batchSlots);
-                self.setup(samplingConfig, batchSize, std::move(tensorPtrBatchSlots), output, explicitDraftTokensDType,
-                    lookaheadPrompt, lookaheadAlgoConfigs);
-            },
-            nb::arg("sampling_config"), nb::arg("batch_size"), nb::arg("batch_slots"), nb::arg("output") = std::nullopt,
-            nb::arg("explicit_draft_tokens_d_type") = std::nullopt, nb::arg("lookahead_prompt") = std::nullopt,
-            nb::arg("lookahead_algo_configs") = std::nullopt);
-
-    nb::class_<tr::decoder::DecoderState>(m, "DecoderState")
-        .def(nb::init<>())
-        .def("setup", &tr::decoder::DecoderState::setup, nb::arg("max_batch_size"), nb::arg("max_beam_width"),
-            nb::arg("max_attention_window"), nb::arg("sink_token_length"), nb::arg("max_sequence_length"),
-            nb::arg("dtype"), nb::arg("model_config"), nb::arg("world_config"), nb::arg("buffer_manager"))
-        .def("setup_cache_indirection", &tr::decoder::DecoderState::setupCacheIndirection, nb::arg("max_batch_size"),
-            nb::arg("max_beam_width"), nb::arg("max_attention_window"), nb::arg("buffer_manager"))
-        .def("setup_speculative_decoding", &tr::decoder::DecoderState::setupSpeculativeDecoding,
-            nb::arg("speculative_decoding_mode"), nb::arg("max_tokens_per_engine_step"), nb::arg("dtype"),
-            nb::arg("model_config"), nb::arg("world_config"), nb::arg("buffer_manager"))
-        .def_prop_ro("joint_decoding_input", &tr::decoder::DecoderState::getJointDecodingInput)
-        .def_prop_ro("joint_decoding_output", &tr::decoder::DecoderState::getJointDecodingOutput)
-        .def_prop_ro("cache_indirection_input", &tr::decoder::DecoderState::getCacheIndirectionInput)
-        .def_prop_ro("cache_indirection_output", &tr::decoder::DecoderState::getCacheIndirectionOutput)
-        .def_prop_ro(
-            "sequence_lengths", nb::overload_cast<>(&tr::decoder::DecoderState::getSequenceLengths, nb::const_))
-        .def("get_sequence_lengths",
-            nb::overload_cast<tr::SizeType32>(&tr::decoder::DecoderState::getSequenceLengths, nb::const_),
-            nb::arg("batch_idx"))
-        .def_prop_ro("all_new_tokens", &tr::decoder::DecoderState::getAllNewTokens)
-        .def_prop_ro("finished_sum", &tr::decoder::DecoderState::getFinishedSum)
-        .def_prop_ro("finish_reasons", &tr::decoder::DecoderState::getFinishReasons)
-        .def_prop_ro("ids", nb::overload_cast<>(&tr::decoder::DecoderState::getIds, nb::const_))
-        .def("get_ids", nb::overload_cast<tr::SizeType32>(&tr::decoder::DecoderState::getIds, nb::const_),
-            nb::arg("batch_idx"))
-        .def_prop_ro("gathered_ids", nb::overload_cast<>(&tr::decoder::DecoderState::getGatheredIds, nb::const_))
-        .def("get_gathered_ids",
-            nb::overload_cast<tr::SizeType32>(&tr::decoder::DecoderState::getGatheredIds, nb::const_),
-            nb::arg("batch_idx"))
-        .def_prop_ro("parent_ids", &tr::decoder::DecoderState::getParentIds)
-        .def_prop_ro("cum_log_probs", nb::overload_cast<>(&tr::decoder::DecoderState::getCumLogProbs, nb::const_))
-        .def("get_cum_log_probs",
-            nb::overload_cast<tr::SizeType32>(&tr::decoder::DecoderState::getCumLogProbs, nb::const_),
-            nb::arg("batch_idx"))
-        .def_prop_ro("log_probs", nb::overload_cast<>(&tr::decoder::DecoderState::getLogProbs, nb::const_))
-        .def("get_log_probs", nb::overload_cast<tr::SizeType32>(&tr::decoder::DecoderState::getLogProbs, nb::const_),
-            nb::arg("batch_idx"))
-        .def_prop_ro("next_draft_tokens", &tr::decoder::DecoderState::getNextDraftTokens)
-        .def_prop_ro("prev_draft_tokens_lengths", &tr::decoder::DecoderState::getPrevDraftTokensLengths)
-        .def_prop_ro("next_draft_tokens_lengths", &tr::decoder::DecoderState::getNextDraftTokensLengths)
-        .def_prop_ro("accepted_lengths_cum_sum", &tr::decoder::DecoderState::getAcceptedLengthsCumSum)
-        .def_prop_ro("accepted_packed_paths", &tr::decoder::DecoderState::getAcceptedPackedPaths)
-        .def_prop_ro("finished_steps", &tr::decoder::DecoderState::getFinishedSteps)
-        .def_prop_ro("max_beam_width", &tr::decoder::DecoderState::getMaxBeamWidth)
-        .def_prop_ro("max_sequence_length", &tr::decoder::DecoderState::getMaxSequenceLength)
-        .def_prop_ro("max_decoding_decoder_tokens", &tr::decoder::DecoderState::getMaxDecodingDecoderTokens)
-        .def_prop_ro("max_decoding_engine_tokens", &tr::decoder::DecoderState::getMaxDecodingEngineTokens)
-        .def_prop_ro("num_decoding_engine_tokens",
-            nb::overload_cast<>(&tr::decoder::DecoderState::getNumDecodingEngineTokens, nb::const_))
-        .def("get_num_decoding_engine_tokens",
-            nb::overload_cast<tr::SizeType32>(&tr::decoder::DecoderState::getNumDecodingEngineTokens, nb::const_),
-            nb::arg("batch_idx"))
-        .def("set_num_decoding_engine_tokens", &tr::decoder::DecoderState::setNumDecodingEngineTokens,
-            nb::arg("batch_idx"), nb::arg("num_tokens"))
-        .def_prop_ro("speculative_decoding_mode", &tr::decoder::DecoderState::getSpeculativeDecodingMode)
-        .def_prop_rw("generation_steps", &tr::decoder::DecoderState::getGenerationSteps,
-            &tr::decoder::DecoderState::setGenerationSteps);
-
-    nb::class_<tr::GptDecoderBatched>(m, "GptDecoderBatched")
-        .def(nb::init<tr::GptDecoderBatched::CudaStreamPtr>(), nb::arg("stream"))
-        .def("setup", &tr::GptDecoderBatched::setup, nb::arg("mode"), nb::arg("max_batch_size"),
-            nb::arg("max_beam_width"), nb::arg("dtype"), nb::arg("model_config"), nb::arg("world_config"))
-        .def("forward_async", &tr::GptDecoderBatched::forwardAsync, nb::arg("output"), nb::arg("input"))
-        .def("underlying_decoder", &tr::GptDecoderBatched::getUnderlyingDecoder, nb::rv_policy::reference)
-        .def("finalize", &tr::GptDecoderBatched::finalize, nb::arg("decoder_state"), nb::arg("batch_idx"),
-            nb::arg("sampling_config"), nb::arg("streaming"))
-        .def_prop_ro(
-            "decoder_stream",
-            [](tr::GptDecoderBatched& self) -> tr::CudaStream const& { return *self.getDecoderStream(); },
-            nb::rv_policy::reference);
-
-    m.def(
-        "lamport_initialize_all",
-        [](intptr_t buffer_0, intptr_t buffer_1, intptr_t buffer_2, size_t size)
-        {
-            tr::lamportInitializeAll(reinterpret_cast<void*>(buffer_0), reinterpret_cast<void*>(buffer_1),
-                reinterpret_cast<void*>(buffer_2), size);
-        },
-        "Lamport initialize all buffers");
-    m.def(
-        "lamport_initialize",
-        [](intptr_t buffer, size_t size)
-        { tensorrt_llm::kernels::ar_fusion::lamport_initialize(reinterpret_cast<void*>(buffer), size, 0); },
-        "Lmaport initialize buffer");
-    m.def(
-        "delay_kernel",
-        [](int64_t delay_micro_secs, nb::object py_stream)
-        {
-            // Get the raw stream handle from PyTorch stream object
-            auto stream_ptr = nb::cast<int64_t>(py_stream.attr("cuda_stream"));
-            cudaStream_t stream = reinterpret_cast<cudaStream_t>(stream_ptr);
-            tensorrt_llm::kernels::invokeDelayStreamKernel(delay_micro_secs, stream);
-        },
-        "Delay kernel launch on the default stream");
-    m.def(
-        "max_workspace_size_lowprecision",
-        [](int32_t tp_size) { return tensorrt_llm::kernels::max_workspace_size_lowprecision(tp_size); },
-        "Calculate the maximum workspace size needed for low precision all-reduce operations");
-
-    nb::class_<tensorrt_llm::runtime::McastGPUBuffer>(m, "McastGPUBuffer")
-        .def(nb::init<size_t, uint32_t, uint32_t, at::Device, bool>())
-        .def("get_uc_buffer", &tensorrt_llm::runtime::McastGPUBuffer::getUCBuffer)
-        .def("get_mc_buffer", &tensorrt_llm::runtime::McastGPUBuffer::getMCBuffer);
-
-    nb::enum_<tensorrt_llm::kernels::AllReduceFusionOp>(m, "AllReduceFusionOp")
-        .value("NONE", tensorrt_llm::kernels::AllReduceFusionOp::NONE)
-        .value("RESIDUAL_RMS_NORM", tensorrt_llm::kernels::AllReduceFusionOp::RESIDUAL_RMS_NORM)
-        .value("LAST_PROCESS_FOR_UB", tensorrt_llm::kernels::AllReduceFusionOp::LAST_PROCESS_FOR_UB)
-        .value("RESIDUAL_RMS_PREPOST_NORM", tensorrt_llm::kernels::AllReduceFusionOp::RESIDUAL_RMS_PREPOST_NORM)
-        .value("RESIDUAL_RMS_NORM_QUANT_FP8", tensorrt_llm::kernels::AllReduceFusionOp::RESIDUAL_RMS_NORM_QUANT_FP8)
-        .value("RESIDUAL_RMS_NORM_QUANT_NVFP4", tensorrt_llm::kernels::AllReduceFusionOp::RESIDUAL_RMS_NORM_QUANT_NVFP4)
-        .value("RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4",
-            tensorrt_llm::kernels::AllReduceFusionOp::RESIDUAL_RMS_NORM_OUT_QUANT_NVFP4)
-        .value("RESIDUAL_RMS_NORM_OUT_QUANT_FP8",
-            tensorrt_llm::kernels::AllReduceFusionOp::RESIDUAL_RMS_NORM_OUT_QUANT_FP8);
-
-    nb::enum_<tensorrt_llm::kernels::AllReduceStrategyType>(m, "AllReduceStrategy")
-        .value("NCCL", tensorrt_llm::kernels::AllReduceStrategyType::NCCL)
-        .value("MIN_LATENCY", tensorrt_llm::kernels::AllReduceStrategyType::MIN_LATENCY)
-        .value("AUTO", tensorrt_llm::kernels::AllReduceStrategyType::AUTO)
-        .value("UB", tensorrt_llm::kernels::AllReduceStrategyType::UB)
-        .value("ONESHOT", tensorrt_llm::kernels::AllReduceStrategyType::ONESHOT)
-        .value("TWOSHOT", tensorrt_llm::kernels::AllReduceStrategyType::TWOSHOT);
-
-    // Initialize MoeLoadBalancer bindings
-    initMoeBindings(m);
-}
-
-void initBindingsEarly(nb::module_& m)
-{
-    nb::class_<tr::SpeculativeDecodingMode>(m, "SpeculativeDecodingMode")
-        .def(nb::init<tr::SpeculativeDecodingMode::UnderlyingType>(), nb::arg("state"))
-        .def_static("NoneType", &tr::SpeculativeDecodingMode::None)
-        .def_static("DraftTokensExternal", &tr::SpeculativeDecodingMode::DraftTokensExternal)
-        .def_static("Medusa", &tr::SpeculativeDecodingMode::Medusa)
-        .def_static("Eagle", &tr::SpeculativeDecodingMode::Eagle)
-        .def_static("LookaheadDecoding", &tr::SpeculativeDecodingMode::LookaheadDecoding)
-        .def_static("ExplicitDraftTokens", &tr::SpeculativeDecodingMode::ExplicitDraftTokens)
-        .def_prop_ro("is_none", &tr::SpeculativeDecodingMode::isNone)
-        .def_prop_ro("is_draft_tokens_external", &tr::SpeculativeDecodingMode::isDraftTokensExternal)
-        .def_prop_ro("is_medusa", &tr::SpeculativeDecodingMode::isMedusa)
-        .def_prop_ro("is_eagle", &tr::SpeculativeDecodingMode::isEagle)
-        .def_prop_ro("is_lookahead_decoding", &tr::SpeculativeDecodingMode::isLookaheadDecoding)
-        .def_prop_ro("is_explicit_draft_tokens", &tr::SpeculativeDecodingMode::isExplicitDraftTokens)
-        .def_prop_ro("updates_position_ids", &tr::SpeculativeDecodingMode::updatesPositionIds)
-        .def_prop_ro("requires_attention_mask", &tr::SpeculativeDecodingMode::requiresAttentionMask)
-        .def_prop_ro("predicts_draft_tokens", &tr::SpeculativeDecodingMode::predictsDraftTokens)
-        .def_prop_ro("needs_kv_cache_rewind", &tr::SpeculativeDecodingMode::needsKVCacheRewind)
-        .def_prop_ro("variable_draft_length", &tr::SpeculativeDecodingMode::variableDraftLength)
-        .def_prop_ro("has_draft_logits", &tr::SpeculativeDecodingMode::hasDraftLogits)
-        .def_prop_ro("needs_decoder_prologue", &tr::SpeculativeDecodingMode::needsDecoderPrologue);
-}
-} // namespace tensorrt_llm::nanobind::runtime
diff --git a/cpp/tensorrt_llm/nanobind/runtime/bindings.h b/cpp/tensorrt_llm/nanobind/runtime/bindings.h
deleted file mode 100644
index 410dac80b05..00000000000
--- a/cpp/tensorrt_llm/nanobind/runtime/bindings.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <nanobind/nanobind.h>
-
-namespace nb = nanobind;
-
-namespace tensorrt_llm::nanobind::runtime
-{
-
-void initBindings(nb::module_& m);
-void initBindingsEarly(nb::module_& m);
-
-} // namespace tensorrt_llm::nanobind::runtime
diff --git a/cpp/tensorrt_llm/nanobind/runtime/moeBindings.cpp b/cpp/tensorrt_llm/nanobind/runtime/moeBindings.cpp
deleted file mode 100644
index c26fa84b661..00000000000
--- a/cpp/tensorrt_llm/nanobind/runtime/moeBindings.cpp
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "moeBindings.h"
-#include "tensorrt_llm/nanobind/common/customCasters.h"
-#include "tensorrt_llm/runtime/moeLoadBalancer/hostAccessibleDeviceAllocator.h"
-#include "tensorrt_llm/runtime/moeLoadBalancer/moeLoadBalancer.h"
-#include <nanobind/nanobind.h>
-#include <nanobind/stl/shared_ptr.h>
-#include <vector>
-
-namespace nb = nanobind;
-namespace tr = tensorrt_llm::runtime;
-namespace tk = tensorrt_llm::kernels;
-
-namespace tensorrt_llm::nanobind::runtime
-{
-
-void pyDoReplication(tk::MoeLoadBalanceMetaInfo const& metaInfo, std::vector<float>& expertLoadFactor,
-    tr::MoePlacementCpuInfo* cpuPlacement)
-{
-    TLLM_CHECK_WITH_INFO(
-        metaInfo.expertCount == expertLoadFactor.size(), "expert_count and expert_load_factor size mismatch");
-    tr::doReplication(metaInfo, expertLoadFactor.data(), cpuPlacement);
-};
-
-void pyDoPlacement(tk::MoeLoadBalanceMetaInfo const& metaInfo, std::vector<float>& expertLoadFactor,
-    tr::MoePlacementCpuInfo* cpuPlacement)
-{
-    TLLM_CHECK_WITH_INFO(
-        metaInfo.expertCount == expertLoadFactor.size(), "expert_count and expert_load_factor size mismatch");
-    tr::doPlacement(metaInfo, expertLoadFactor.data(), cpuPlacement);
-};
-
-void initMoeBindings(nb::module_& m)
-{
-    // Bind MoeWeight struct
-    nb::class_<tr::MoeWeight>(m, "MoeWeight")
-        .def(nb::init<>())
-        .def_prop_rw("weight_ptr", &tr::MoeWeight::getWeightPtr, &tr::MoeWeight::setWeightPtr)
-        .def_rw("height", &tr::MoeWeight::mHeight)
-        .def_rw("width", &tr::MoeWeight::mWidth)
-        .def_rw("pitch", &tr::MoeWeight::mPitch)
-        .def("__repr__",
-            [](tr::MoeWeight const& self)
-            {
-                return "<MoeWeight ptr=" + std::to_string(self.getWeightPtr())
-                    + " height=" + std::to_string(self.mHeight) + " width=" + std::to_string(self.mWidth)
-                    + " pitch=" + std::to_string(self.mPitch) + ">";
-            });
-
-    // Bind MoeLoadBalanceMetaInfo struct
-    nb::class_<tk::MoeLoadBalanceMetaInfo>(m, "MoeLoadBalanceMetaInfo")
-        .def(nb::init<int, int, int, int, int>(), nb::arg("expert_count"), nb::arg("top_k"), nb::arg("ep_rank"),
-            nb::arg("ep_size"), nb::arg("slot_count_per_rank"))
-        .def_rw("expert_count", &tk::MoeLoadBalanceMetaInfo::expertCount)
-        .def_rw("top_k", &tk::MoeLoadBalanceMetaInfo::topK)
-        .def_rw("ep_rank", &tk::MoeLoadBalanceMetaInfo::epRank)
-        .def_rw("ep_size", &tk::MoeLoadBalanceMetaInfo::epSize)
-        .def_rw("slot_count_per_rank", &tk::MoeLoadBalanceMetaInfo::slotCountPerRank);
-
-    // Bind MoePlacementCpuInfo struct
-    nb::class_<tr::MoePlacementCpuInfo>(m, "MoePlacementCpuInfo")
-        .def(nb::init<>())
-        .def_rw("expert_replica_count", &tr::MoePlacementCpuInfo::expertReplicaCount)
-        .def_rw("rank_expert_ids", &tr::MoePlacementCpuInfo::rankExpertIds);
-
-    // Bind SingleLayerMoeLoadBalancer class
-    nb::class_<tr::SingleLayerMoeLoadBalancer>(m, "SingleLayerMoeLoadBalancer")
-        .def("add_single_weight_slot", &tr::SingleLayerMoeLoadBalancer::addSingleWeightSlot, nb::arg("slot_id"),
-            nb::arg("name"), nb::arg("weight_slot"), "Add a single weight slot for a specific slot ID")
-        .def("add_single_host_weight", &tr::SingleLayerMoeLoadBalancer::addSingleHostWeight, nb::arg("expert_id"),
-            nb::arg("name"), nb::arg("host_weight"), "Add a single host weight for a specific expert ID")
-        .def("set_initial_weight_assignments", &tr::SingleLayerMoeLoadBalancer::setInitialWeightAssignments,
-            nb::arg("initial_weight_assignments"), "Set initial weight assignments for each slot")
-        .def("get_pointer", &tr::SingleLayerMoeLoadBalancer::getSelfPtr,
-            "Get the pointer of the SingleLayerMoeLoadBalancer")
-        .def("get_layer_id", &tr::SingleLayerMoeLoadBalancer::getLayerId,
-            "Get the layer id of the SingleLayerMoeLoadBalancer");
-
-    // Bind MoeLoadBalancer class
-    nb::class_<tr::MoeLoadBalancer>(m, "MoeLoadBalancer")
-        .def(nb::init<int, int, int>(), nb::arg("ep_rank"), nb::arg("ep_size"), nb::arg("layer_updates_per_iter"),
-            "Initialize the MoeLoadBalancer with the specified expert parallel rank, size, and update frequency")
-        .def("set_use_gpu_memcpy", &tr::MoeLoadBalancer::setUseGpuMemcpy, nb::arg("use_gpu_memcpy"),
-            "Set whether to use GPU memcpy for weight updates")
-        .def("add_layer", &tr::MoeLoadBalancer::AddLayer, nb::arg("expert_count"), nb::arg("top_k"),
-            nb::arg("slot_count_per_rank"), "Add a new MOE layer to the load balancer")
-        .def("finalize_model", &tr::MoeLoadBalancer::finalizeModel,
-            "Finalize the model structure, must be called after all layers are added")
-        .def("set_warm_up_iter_count", &tr::MoeLoadBalancer::setWarmUpIterCount, nb::arg("iter_count"),
-            "Set the number of warm-up iterations")
-        .def("start_iter", &tr::MoeLoadBalancer::startIter, nb::arg("iter_id"), nb::arg("enable_statistic"),
-            nb::arg("enable_update_weights"), "Start a new iteration with the given ID and settings")
-        .def("end_iter", &tr::MoeLoadBalancer::endIter, nb::arg("iter_id"), "End the iteration with the given ID")
-        .def("shutdown", &tr::MoeLoadBalancer::shutdown, "Shutdown the load balancer and clean up resources");
-
-    m.def("is_host_accessible_device_memory_supported", &tr::HostAccessibleDeviceAllocator::isSupported,
-        "If current system support host accessible device memory");
-
-    // Bind do_replication function for testing
-    m.def("do_replication", &pyDoReplication, nb::arg("meta_info"), nb::arg("expert_load_factor"),
-        nb::arg("cpu_placement"), "Do replication");
-
-    // Bind do_placement function for testing
-    m.def("do_placement", &pyDoPlacement, nb::arg("meta_info"), nb::arg("expert_load_factor"), nb::arg("cpu_placement"),
-        "Do placement");
-}
-
-} // namespace tensorrt_llm::nanobind::runtime
diff --git a/cpp/tensorrt_llm/nanobind/runtime/moeBindings.h b/cpp/tensorrt_llm/nanobind/runtime/moeBindings.h
deleted file mode 100644
index 73b9a3ceec8..00000000000
--- a/cpp/tensorrt_llm/nanobind/runtime/moeBindings.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <nanobind/nanobind.h>
-
-namespace nb = nanobind;
-
-namespace tensorrt_llm::nanobind::runtime
-{
-
-void initMoeBindings(nb::module_& m);
-
-} // namespace tensorrt_llm::nanobind::runtime
diff --git a/cpp/tensorrt_llm/nanobind/testing/modelSpecBinding.cpp b/cpp/tensorrt_llm/nanobind/testing/modelSpecBinding.cpp
deleted file mode 100644
index caef94c5def..00000000000
--- a/cpp/tensorrt_llm/nanobind/testing/modelSpecBinding.cpp
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "modelSpecBinding.h"
-#include "tensorrt_llm/nanobind/common/customCasters.h"
-#include "tensorrt_llm/testing/modelSpec.h"
-
-#include <nanobind/nanobind.h>
-
-namespace nb = nanobind;
-using tensorrt_llm::testing::ModelSpec;
-using tensorrt_llm::testing::KVCacheType;
-using tensorrt_llm::testing::QuantMethod;
-using tensorrt_llm::testing::OutputContentType;
-
-namespace tensorrt_llm::nanobind::testing
-{
-
-void initBindings(nb::module_& m)
-{
-    nb::enum_<QuantMethod>(m, "QuantMethod", nb::is_arithmetic(), "Quantization Method")
-        .value("NONE", QuantMethod::kNONE, "No Quantization")
-        .value("SMOOTH_QUANT", QuantMethod::kSMOOTH_QUANT, "Smooth Quantization");
-
-    nb::enum_<OutputContentType>(m, "OutputContentType", nb::is_arithmetic(), "Output Content Type")
-        .value("NONE", OutputContentType::kNONE, "No Output Content")
-        .value("CONTEXT_LOGITS", OutputContentType::kCONTEXT_LOGITS, "Context Logits")
-        .value("GENERATION_LOGITS", OutputContentType::kGENERATION_LOGITS, "Generation Logits")
-        .value("LOG_PROBS", OutputContentType::kLOG_PROBS, "Log Probs")
-        .value("CUM_LOG_PROBS", OutputContentType::kCUM_LOG_PROBS, "Cumulative Log");
-
-    nb::class_<ModelSpec>(m, "ModelSpec")
-        .def(nb::init<std::string const&, nvinfer1::DataType>())
-        .def("use_gpt_plugin", &ModelSpec::useGptAttentionPlugin, nb::rv_policy::reference_internal)
-        .def("use_packed_input", &ModelSpec::usePackedInput, nb::rv_policy::reference_internal)
-        .def("set_kv_cache_type", &ModelSpec::setKVCacheType, nb::rv_policy::reference_internal)
-        .def("use_decoder_per_request", &ModelSpec::useDecoderPerRequest, nb::rv_policy::reference_internal)
-        .def("use_tensor_parallelism", &ModelSpec::useTensorParallelism, nb::rv_policy::reference_internal)
-        .def("use_pipeline_parallelism", &ModelSpec::usePipelineParallelism, nb::rv_policy::reference_internal)
-        .def("use_context_parallelism", &ModelSpec::useContextParallelism, nb::rv_policy::reference_internal)
-        .def("set_draft_tokens", &ModelSpec::setDraftTokens, nb::rv_policy::reference_internal)
-        .def("use_accept_by_logits", &ModelSpec::useAcceptByLogits, nb::rv_policy::reference_internal)
-        .def("use_mamba_plugin", &ModelSpec::useMambaPlugin, nb::rv_policy::reference_internal)
-        .def("gather_logits", &ModelSpec::gatherLogits, nb::rv_policy::reference_internal)
-        .def("replace_logits", &ModelSpec::replaceLogits, nb::rv_policy::reference_internal)
-        .def("return_log_probs", &ModelSpec::returnLogProbs, nb::rv_policy::reference_internal)
-        .def("smoke_test", &ModelSpec::smokeTest, nb::rv_policy::reference_internal)
-        .def("use_medusa", &ModelSpec::useMedusa, nb::rv_policy::reference_internal)
-        .def("use_eagle", &ModelSpec::useEagle, nb::rv_policy::reference_internal)
-        .def("use_lookahead_decoding", &ModelSpec::useLookaheadDecoding, nb::rv_policy::reference_internal)
-        .def("use_explicit_draft_tokens_decoding", &ModelSpec::useExplicitDraftTokensDecoding,
-            nb::rv_policy::reference_internal)
-        .def("use_draft_tokens_external_decoding", &ModelSpec::useDraftTokensExternalDecoding,
-            nb::rv_policy::reference_internal)
-        .def("use_logits", &ModelSpec::useLogits)
-        .def("use_multiple_profiles", &ModelSpec::useMultipleProfiles, nb::rv_policy::reference_internal)
-        .def("set_max_input_length", &ModelSpec::setMaxInputLength, nb::rv_policy::reference_internal)
-        .def("set_max_output_length", &ModelSpec::setMaxOutputLength, nb::rv_policy::reference_internal)
-        .def("set_quant_method", &ModelSpec::setQuantMethod, nb::rv_policy::reference_internal)
-        .def("use_lora_plugin", &ModelSpec::useLoraPlugin, nb::rv_policy::reference_internal)
-        .def("get_input_file", &ModelSpec::getInputFile)
-        .def("get_model_path", &ModelSpec::getModelPath)
-        .def("get_results_file", &ModelSpec::getResultsFile)
-        .def("get_generation_logits_file", &ModelSpec::getGenerationLogitsFile)
-        .def("get_context_logits_file", &ModelSpec::getContextLogitsFile)
-        .def("get_cum_log_probs_file", &ModelSpec::getCumLogProbsFile)
-        .def("get_log_probs_file", &ModelSpec::getLogProbsFile)
-        .def("enable_context_fmha_fp32_acc", &ModelSpec::enableContextFMHAFp32Acc, nb::rv_policy::reference_internal)
-        .def("get_enable_context_fmha_fp32_acc", &ModelSpec::getEnableContextFMHAFp32Acc)
-        .def("__copy__", [](ModelSpec const& self) { return ModelSpec(self); });
-}
-
-} // namespace tensorrt_llm::nanobind::testing
diff --git a/cpp/tensorrt_llm/nanobind/testing/modelSpecBinding.h b/cpp/tensorrt_llm/nanobind/testing/modelSpecBinding.h
deleted file mode 100644
index 1aababc6ff8..00000000000
--- a/cpp/tensorrt_llm/nanobind/testing/modelSpecBinding.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <nanobind/nanobind.h>
-
-namespace nb = nanobind;
-
-namespace tensorrt_llm::nanobind::testing
-{
-
-void initBindings(nb::module_& m);
-
-} // namespace tensorrt_llm::nanobind::testing
diff --git a/cpp/tensorrt_llm/nanobind/userbuffers/bindings.cpp b/cpp/tensorrt_llm/nanobind/userbuffers/bindings.cpp
deleted file mode 100644
index 82e0d0a1f0c..00000000000
--- a/cpp/tensorrt_llm/nanobind/userbuffers/bindings.cpp
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "bindings.h"
-#include "tensorrt_llm/kernels/userbuffers/ub_interface.h"
-#include "tensorrt_llm/kernels/userbuffers/userbuffersManager.h"
-#include "tensorrt_llm/nanobind/common/customCasters.h"
-#include <nanobind/nanobind.h>
-
-namespace nb = nanobind;
-namespace tub = tensorrt_llm::runtime::ub;
-
-namespace tensorrt_llm::kernels::userbuffers
-{
-
-void UserBufferBindings::initBindings(nb::module_& m)
-{
-    nb::class_<tub::UBBuffer>(m, "UBBuffer")
-        .def_ro("size", &tub::UBBuffer::size)
-        .def_prop_ro("addr", [](tub::UBBuffer& self) { return reinterpret_cast<intptr_t>(self.addr); })
-        .def_ro("handle", &tub::UBBuffer::handle)
-        .def("invalid", &tub::UBBuffer::invalid);
-
-    m.def("ub_initialize", [](int tp_size) { tub::ub_initialize(tp_size); });
-    m.def("ub_is_initialized", &tub::ub_is_initialized);
-    m.def("ub_allocate", [](size_t bytes) { return tub::ub_allocate(bytes); });
-    m.def("ub_deallocate", [](intptr_t addr) { return tub::ub_deallocate(reinterpret_cast<void*>(addr)); });
-    m.def("ub_get", &tub::ub_get);
-    m.def("ub_supported", &tub::ub_supported);
-
-    m.def("initialize_userbuffers_manager", &tub::initialize_userbuffers_manager);
-}
-} // namespace tensorrt_llm::kernels::userbuffers
diff --git a/cpp/tensorrt_llm/nanobind/userbuffers/bindings.h b/cpp/tensorrt_llm/nanobind/userbuffers/bindings.h
deleted file mode 100644
index 15728bf6c1d..00000000000
--- a/cpp/tensorrt_llm/nanobind/userbuffers/bindings.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <nanobind/nanobind.h>
-namespace nb = nanobind;
-
-namespace tensorrt_llm::kernels::userbuffers
-{
-class UserBufferBindings
-{
-public:
-    static void initBindings(nb::module_& m);
-};
-} // namespace tensorrt_llm::kernels::userbuffers
diff --git a/cpp/tensorrt_llm/pybind/bindings.cpp b/cpp/tensorrt_llm/pybind/bindings.cpp
index 962071c4857..1a5841d4b7a 100644
--- a/cpp/tensorrt_llm/pybind/bindings.cpp
+++ b/cpp/tensorrt_llm/pybind/bindings.cpp
@@ -170,7 +170,7 @@ PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m)
         .value("CONTINUOUS", tr::ModelConfig::KVCacheType::kCONTINUOUS)
         .value("PAGED", tr::ModelConfig::KVCacheType::kPAGED)
         .value("DISABLED", tr::ModelConfig::KVCacheType::kDISABLED)
-        .def("from_string", &tr::ModelConfig::KVCacheTypeFromString);
+        .def(py::init(&tr::ModelConfig::KVCacheTypeFromString));
 
     py::enum_<tr::ModelConfig::LayerType>(m, "LayerType")
         .value("ATTENTION", tr::ModelConfig::LayerType::kATTENTION)
diff --git a/cpp/tensorrt_llm/pybind/executor/bindings.cpp b/cpp/tensorrt_llm/pybind/executor/bindings.cpp
index a8f6aaef73d..d09157e1a8b 100644
--- a/cpp/tensorrt_llm/pybind/executor/bindings.cpp
+++ b/cpp/tensorrt_llm/pybind/executor/bindings.cpp
@@ -244,17 +244,7 @@ void initBindings(pybind11::module_& m)
 
     py::class_<tle::KVCacheEventManager, std::shared_ptr<tle::KVCacheEventManager>>(
         executor_kv_cache, "KVCacheEventManager")
-        .def(
-            "get_latest_events",
-            [](tle::KVCacheEventManager& self, std::optional<double> timeout_ms = std::nullopt)
-            {
-                if (timeout_ms)
-                {
-                    return self.getLatestEvents(std::chrono::milliseconds(static_cast<int64_t>(*timeout_ms)));
-                }
-                return self.getLatestEvents(std::nullopt);
-            },
-            py::arg("timeout_ms") = std::nullopt);
+        .def("get_latest_events", &tle::KVCacheEventManager::getLatestEvents, py::arg("timeout") = std::nullopt);
 
     tensorrt_llm::pybind::executor::initRequestBindings(m);
     tensorrt_llm::pybind::executor::initConfigBindings(m);
diff --git a/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp b/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp
index 1153ca13a8e..bc0d997e337 100644
--- a/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp
+++ b/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp
@@ -336,7 +336,7 @@ void initConfigBindings(pybind11::module_& m)
             throw std::runtime_error("Invalid extendedRuntimePerfKnobConfig state!");
         }
         return tle::ExtendedRuntimePerfKnobConfig(
-            state[0].cast<bool>(), state[1].cast<bool>(), state[2].cast<bool>(), state[3].cast<SizeType32>());
+            state[0].cast<bool>(), state[1].cast<bool>(), state[2].cast<bool>(), state[2].cast<SizeType32>());
     };
     auto extendedRuntimePerfKnobConfigGetstate = [](tle::ExtendedRuntimePerfKnobConfig const& self)
     {
diff --git a/examples/models/core/llama/summarize_long.py b/examples/models/core/llama/summarize_long.py
index cee2e07fdd5..9f127bc32a6 100644
--- a/examples/models/core/llama/summarize_long.py
+++ b/examples/models/core/llama/summarize_long.py
@@ -97,7 +97,7 @@ def TRTLLaMA(args, config):
     quantization_config = pretrained_config['quantization']
 
     build_config = config['build_config']
-    kv_cache_type = KVCacheType.from_string(build_config['kv_cache_type'])
+    kv_cache_type = KVCacheType(build_config['kv_cache_type'])
     plugin_config = build_config['plugin_config']
 
     dtype = pretrained_config['dtype']
diff --git a/examples/models/core/qwen2audio/run.py b/examples/models/core/qwen2audio/run.py
index 93e161c7e08..e0d495a67f8 100644
--- a/examples/models/core/qwen2audio/run.py
+++ b/examples/models/core/qwen2audio/run.py
@@ -122,8 +122,7 @@ def get_model(self):
         num_kv_heads = config["pretrained_config"].get("num_key_value_heads",
                                                        num_heads)
         if "kv_cache_type" in config["build_config"]:
-            kv_cache_type = KVCacheType.from_string(
-                config["build_config"]["kv_cache_type"])
+            kv_cache_type = KVCacheType(config["build_config"]["kv_cache_type"])
         else:
             kv_cache_type = KVCacheType.CONTINUOUS
 
diff --git a/examples/models/core/qwenvl/run.py b/examples/models/core/qwenvl/run.py
index 06ce341a9a0..a04c2b142e3 100644
--- a/examples/models/core/qwenvl/run.py
+++ b/examples/models/core/qwenvl/run.py
@@ -118,8 +118,7 @@ def get_model(self):
         num_kv_heads = config["pretrained_config"].get("num_key_value_heads",
                                                        num_heads)
         if "kv_cache_type" in config["build_config"]:
-            kv_cache_type = KVCacheType.from_string(
-                config["build_config"]["kv_cache_type"])
+            kv_cache_type = KVCacheType(config["build_config"]["kv_cache_type"])
         else:
             kv_cache_type = KVCacheType.CONTINUOUS
 
diff --git a/jenkins/Build.groovy b/jenkins/Build.groovy
index 77e12ee5100..bb8fd7816ce 100644
--- a/jenkins/Build.groovy
+++ b/jenkins/Build.groovy
@@ -47,12 +47,6 @@ CONFIG_LINUX_AARCH64 = "linux_aarch64"
 @Field
 def CONFIG_LINUX_AARCH64_LLVM = "linux_aarch64_LLVM"
 
-@Field
-def CONFIG_LINUX_X86_64_NANOBIND = "linux_x86_64_Nanobind"
-
-@Field
-def CONFIG_LINUX_AARCH64_NANOBIND = "linux_aarch64_Nanobind"
-
 @Field
 def BUILD_CONFIGS = [
   // Vanilla TARNAME is used for packaging in runLLMPackage
@@ -62,11 +56,6 @@ def BUILD_CONFIGS = [
     (TARNAME) : "TensorRT-LLM.tar.gz",
     (WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;120-real",
   ],
-  (CONFIG_LINUX_X86_64_NANOBIND) : [
-    (WHEEL_EXTRA_ARGS) : "--binding_type nanobind --extra-cmake-vars ENABLE_MULTI_DEVICE=1 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --micro_benchmarks",
-    (TARNAME) : "nanobind-TensorRT-LLM.tar.gz",
-    (WHEEL_ARCHS): "80-real;86-real;89-real;90-real;100-real;120-real",
-  ],
   (CONFIG_LINUX_X86_64_SINGLE_DEVICE) : [
     (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars ENABLE_MULTI_DEVICE=0 --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars ENABLE_UCX=0 --micro_benchmarks",
     (TARNAME) : "single-device-TensorRT-LLM.tar.gz",
@@ -82,11 +71,6 @@ def BUILD_CONFIGS = [
     (TARNAME) : "TensorRT-LLM-GH200.tar.gz",
     (WHEEL_ARCHS): "90-real;100-real;120-real",
   ],
-  (CONFIG_LINUX_AARCH64_NANOBIND): [
-    (WHEEL_EXTRA_ARGS) : "--binding_type nanobind --extra-cmake-vars WARNING_IS_ERROR=ON",
-    (TARNAME) : "nanobind-TensorRT-LLM-GH200.tar.gz",
-    (WHEEL_ARCHS): "90-real;100-real;120-real",
-  ],
   (CONFIG_LINUX_AARCH64_LLVM) : [
     (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars WARNING_IS_ERROR=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CUDA_HOST_COMPILER=clang -DCMAKE_LINKER_TYPE=LLD",
     (TARNAME) : "llvm-TensorRT-LLM-GH200.tar.gz",
@@ -539,8 +523,6 @@ def launchStages(pipeline, cpu_arch, enableFailFast, globalVars)
             pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64 : CONFIG_LINUX_X86_64_VANILLA),
         "Build TRT-LLM LLVM": [LLM_DOCKER_IMAGE] + prepareLLMBuild(
             pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64_LLVM : CONFIG_LINUX_X86_64_LLVM),
-        "Build TRT-LLM Nanobind": [LLM_DOCKER_IMAGE] + prepareLLMBuild(
-            pipeline, cpu_arch == AARCH64_TRIPLE ? CONFIG_LINUX_AARCH64_NANOBIND : CONFIG_LINUX_X86_64_NANOBIND),
     ]
 
     if (cpu_arch == X86_64_TRIPLE) {
diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index 35e7140ebda..6f6ae7c1186 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -64,9 +64,6 @@ def LLVM_CONFIG = "LLVM"
 @Field
 LINUX_AARCH64_CONFIG = "linux_aarch64"
 
-@Field
-def NANOBIND_CONFIG = "Nanobind"
-
 @Field
 def BUILD_CONFIGS = [
   // Vanilla TARNAME is used for packaging in runLLMPackage
@@ -74,7 +71,6 @@ def BUILD_CONFIGS = [
   (SINGLE_DEVICE_CONFIG) : [(TARNAME) : "single-device-TensorRT-LLM.tar.gz"],
   (LLVM_CONFIG) : [(TARNAME) : "llvm-TensorRT-LLM.tar.gz"],
   (LINUX_AARCH64_CONFIG) : [(TARNAME) : "TensorRT-LLM-GH200.tar.gz"],
-  (NANOBIND_CONFIG) : [(TARNAME) : "nanobind-TensorRT-LLM.tar.gz"],
 ]
 
 // TODO: Move common variables to an unified location
@@ -1728,7 +1724,6 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         "A10-TensorRT-4": ["a10", "l0_a10", 4, 6],
         "A10-TensorRT-5": ["a10", "l0_a10", 5, 6],
         "A10-TensorRT-6": ["a10", "l0_a10", 6, 6],
-        "A10-Nanobind": ["a10", "l0_a10_nanobind", 1, 1],
         "A30-Triton-1": ["a30", "l0_a30", 1, 1],
         "A30-PyTorch-1": ["a30", "l0_a30", 1, 2],
         "A30-PyTorch-2": ["a30", "l0_a30", 2, 2],
@@ -1805,9 +1800,6 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         if (key.contains("llvm")) {
             config = LLVM_CONFIG
         }
-        if (key.contains("Nanobind")) {
-            config = NANOBIND_CONFIG
-        }
         runLLMTestlistOnPlatform(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3])
     }]]}
     fullSet = parallelJobs.keySet()
diff --git a/tensorrt_llm/builder.py b/tensorrt_llm/builder.py
index 11d528a853d..e2dc543ac42 100644
--- a/tensorrt_llm/builder.py
+++ b/tensorrt_llm/builder.py
@@ -593,7 +593,7 @@ def from_dict(cls, config, plugin_config=None):
             defaults.get('max_prompt_embedding_table_size'))
 
         if "kv_cache_type" in config and config["kv_cache_type"] is not None:
-            kv_cache_type = KVCacheType.from_string(config.pop('kv_cache_type'))
+            kv_cache_type = KVCacheType(config.pop('kv_cache_type'))
         else:
             kv_cache_type = None
         gather_context_logits = config.pop(
diff --git a/tensorrt_llm/commands/build.py b/tensorrt_llm/commands/build.py
index e6b55f6e040..a47e1485b71 100644
--- a/tensorrt_llm/commands/build.py
+++ b/tensorrt_llm/commands/build.py
@@ -38,23 +38,6 @@
 from tensorrt_llm.quantization.mode import QuantAlgo
 
 
-def enum_type(enum_class):
-
-    def parse_enum(value):
-        if isinstance(value, enum_class):
-            return value
-
-        if isinstance(value, str):
-            return enum_class.from_string(value)
-
-        valid_values = [e.name for e in enum_class]
-        raise argparse.ArgumentTypeError(
-            f"Invalid value '{value}' of type {type(value).__name__}. Expected one of {valid_values}"
-        )
-
-    return parse_enum
-
-
 def parse_arguments():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
@@ -148,7 +131,7 @@ def parse_arguments():
     parser.add_argument(
         '--kv_cache_type',
         default=argparse.SUPPRESS,
-        type=enum_type(KVCacheType),
+        type=KVCacheType,
         help=
         "Set KV cache type (continuous, paged, or disabled). For disabled case, KV cache is disabled and only context phase is allowed."
     )
diff --git a/tensorrt_llm/runtime/model_runner.py b/tensorrt_llm/runtime/model_runner.py
index a9f0fe8de40..486c58f6d15 100644
--- a/tensorrt_llm/runtime/model_runner.py
+++ b/tensorrt_llm/runtime/model_runner.py
@@ -86,7 +86,7 @@ def _builder_to_model_config(config: dict) -> Tuple[ModelConfig, dict]:
     dtype = builder_config['precision']
     tp_size = builder_config['tensor_parallel']
     pp_size = builder_config.get('pipeline_parallel', 1)
-    kv_cache_type = KVCacheType.from_string(builder_config.get('kv_cache_type'))
+    kv_cache_type = KVCacheType(builder_config.get('kv_cache_type'))
     world_size = tp_size * pp_size
     assert world_size == mpi_world_size(), \
         f'Engine world size ({tp_size} * {pp_size}) != Runtime world size ({mpi_world_size()})'
diff --git a/tests/integration/test_lists/test-db/l0_a10.yml b/tests/integration/test_lists/test-db/l0_a10.yml
index 5799ea27945..2f63ab45f3a 100644
--- a/tests/integration/test_lists/test-db/l0_a10.yml
+++ b/tests/integration/test_lists/test-db/l0_a10.yml
@@ -190,18 +190,3 @@ l0_a10:
   tests:
   - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-pytorch-stress-test]
   - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-GUARANTEED_NO_EVICT-pytorch-stress-test]
-l0_a10_nanobind:
-- condition:
-    ranges:
-      system_gpu_count:
-        gte: 1
-        lte: 1
-    wildcards:
-      gpu:
-      - '*a10*'
-      linux_distribution_name: ubuntu*
-    terms:
-      stage: pre_merge
-      backend: tensorrt
-  tests:
-  - unittest/bindings
diff --git a/tests/unittest/bindings/test_bindings_ut.py b/tests/unittest/bindings/test_bindings_ut.py
index 6fd46040b66..774accb080f 100644
--- a/tests/unittest/bindings/test_bindings_ut.py
+++ b/tests/unittest/bindings/test_bindings_ut.py
@@ -5,7 +5,6 @@
 from pathlib import Path
 
 import numpy as np
-import pytest
 import torch
 from utils.runtime_defaults import assert_runtime_defaults_are_parsed_correctly
 
@@ -310,8 +309,6 @@ def parse_runtime_defaults(defaults_dict: dict | None = None):
                                                  strict_keys=strict_keys)
 
 
-@pytest.mark.skipif(_tb.binding_type == "nanobind",
-                    reason="Test not supported for nanobind yet")
 def test_llm_request():
     beam_width = 2
     sampling_config = _tb.SamplingConfig(beam_width)
@@ -421,8 +418,6 @@ def test_Mpicomm():
     assert size2 == session_size
 
 
-@pytest.mark.skipif(_tb.binding_type == "nanobind",
-                    reason="Test not supported for nanobind yet")
 def test_SamplingConfig_pickle():
     config = _tb.SamplingConfig()
     config.beam_width = 5
@@ -502,8 +497,6 @@ def test_KvCache_events_binding():
     torch.cuda.empty_cache()
 
 
-@pytest.mark.skipif(_tb.binding_type == "nanobind",
-                    reason="Test not supported for nanobind yet")
 def test_ReqIdsSet_pickle():
     ids = _tb.internal.batch_manager.ReqIdsSet()
     ids1 = pickle.loads(pickle.dumps(ids))
diff --git a/tests/unittest/bindings/test_executor_bindings.py b/tests/unittest/bindings/test_executor_bindings.py
index af72d9ac44b..935c4c9bfc3 100644
--- a/tests/unittest/bindings/test_executor_bindings.py
+++ b/tests/unittest/bindings/test_executor_bindings.py
@@ -14,7 +14,6 @@
 from binding_test_utils import *
 from pydantic import BaseModel
 
-import tensorrt_llm.bindings as _tb
 import tensorrt_llm.bindings.executor as trtllm
 import tensorrt_llm.version as trtllm_version
 from tensorrt_llm.models.modeling_utils import PretrainedConfig
@@ -485,8 +484,6 @@ def test_get_num_responses_ready(streaming: bool,
     assert executor.get_num_responses_ready() == num_expected_responses
 
 
-@pytest.mark.skipif(_tb.binding_type == "nanobind",
-                    reason="Test not supported for nanobind yet")
 @pytest.mark.parametrize("batching_type", [trtllm.BatchingType.INFLIGHT])
 @pytest.mark.parametrize("streaming", [False, True])
 @pytest.mark.parametrize("beam_width", [1])
@@ -691,8 +688,6 @@ def verify_output(beam_tokens, test_data, given_input_lengths):
     verify_output(tokens, test_data, given_input_lengths)
 
 
-@pytest.mark.skipif(_tb.binding_type == "nanobind",
-                    reason="Test not supported for nanobind yet")
 @pytest.mark.parametrize("streaming", [False, True])
 @pytest.mark.parametrize("beam_width", [1])
 def test_finish_reason(streaming: bool, beam_width: int, model_files,
@@ -1117,8 +1112,6 @@ def test_spec_dec_fast_logits_info():
     assert fast_logits_info.draft_participant_id == 5
 
 
-@pytest.mark.skipif(_tb.binding_type == "nanobind",
-                    reason="Test not supported for nanobind yet")
 def test_result():
     result = trtllm.Result()
     result.is_final = True
@@ -1156,8 +1149,6 @@ def test_result():
     assert (additional_output.output == torch.ones(1, 4, 100)).all()
 
 
-@pytest.mark.skipif(_tb.binding_type == "nanobind",
-                    reason="Test not supported for nanobind yet")
 def test_result_pickle():
     result = trtllm.Result()
     result.is_final = True
@@ -1504,8 +1495,6 @@ def test_eagle_config():
         assert getattr(config, k) == v
 
 
-@pytest.mark.skipif(_tb.binding_type == "nanobind",
-                    reason="Test not supported for nanobind yet")
 def test_eagle_config_pickle():
     config = trtllm.EagleConfig([[0, 0], [0, 1]], False, 0.5)
     config_copy = pickle.loads(pickle.dumps(config))
@@ -1878,8 +1867,6 @@ def logits_post_processor(req_id: int, logits: torch.Tensor,
     assert tokens[-max_tokens:] == [42] * max_tokens
 
 
-@pytest.mark.skipif(_tb.binding_type == "nanobind",
-                    reason="Test not supported for nanobind yet")
 def test_logits_post_processor_batched(model_files, model_path):
 
     # Define the logits post-processor callback
@@ -2154,8 +2141,6 @@ def test_request_perf_metrics_kv_cache(model_path):
     assert kv_cache_metrics.kv_cache_hit_rate == 1.0
 
 
-@pytest.mark.skipif(_tb.binding_type == "nanobind",
-                    reason="Test not supported for nanobind yet")
 @pytest.mark.parametrize("exclude_input_from_output", [False, True])
 def test_request_perf_metrics_draft(model_path_draft_tokens_external,
                                     exclude_input_from_output: bool):
@@ -2236,7 +2221,7 @@ def test_kv_event_stream_timeout(model_path):
     assert len(events) == 1
 
     start = datetime.datetime.now()
-    events = cache_manager.get_latest_events(1000)
+    events = cache_manager.get_latest_events(datetime.timedelta(seconds=1))
     end = datetime.datetime.now()
     # Make sure that it actually waited
     assert abs(end - start) > datetime.timedelta(milliseconds=900)

From 0155e7a3a17d2575d18123951e0a5d645ef9a154 Mon Sep 17 00:00:00 2001
From: yifeizhang-c <219273404+yifeizhang-c@users.noreply.github.com>
Date: Fri, 18 Jul 2025 10:13:31 +0800
Subject: [PATCH 73/88] [TRTLLM-6368] Update deepep dispatch API (#6037)

Signed-off-by: Yifei Zhang <219273404+yifeizhang-c@users.noreply.github.com>
---
 cpp/tensorrt_llm/deep_ep/CMakeLists.txt       |  2 +-
 .../_torch/modules/fused_moe/deep_ep_utils.py |  5 ++--
 .../modules/fused_moe/fused_moe_wide_ep.py    | 23 +++++++------------
 3 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt
index 603f26796e6..a404013aad3 100644
--- a/cpp/tensorrt_llm/deep_ep/CMakeLists.txt
+++ b/cpp/tensorrt_llm/deep_ep/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(DEEP_EP_COMMIT c381dadf43a85062f6a8947592017ee513abc70b)
+set(DEEP_EP_COMMIT eb3f072664251c05074c3ecc3c3f5dad179c29a9)
 set(NVSHMEM_URL_HASH
     SHA256=eb2c8fb3b7084c2db86bd9fd905387909f1dfd483e7b45f7b3c3d5fcf5374b5a)
 
diff --git a/tensorrt_llm/_torch/modules/fused_moe/deep_ep_utils.py b/tensorrt_llm/_torch/modules/fused_moe/deep_ep_utils.py
index 62146d9295f..bf808c93c1d 100644
--- a/tensorrt_llm/_torch/modules/fused_moe/deep_ep_utils.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/deep_ep_utils.py
@@ -59,7 +59,7 @@ def reserve(self, hidden_size: int, hidden_dtype: torch.dtype):
 
     def dispatch(self, x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
                  topk_idx: torch.Tensor, topk_weights: torch.Tensor,
-                 num_experts: int) -> \
+                 num_experts: int, global_expert_id_offset: int) -> \
             Tuple[Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]], torch.Tensor, torch.Tensor, List, Tuple]:
         # NOTES: an optional `previous_event` means a CUDA event captured that you want to make it as a dependency
         # of the dispatch kernel, it may be useful with communication-computation overlap. For more information, please
@@ -76,7 +76,8 @@ def dispatch(self, x: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
         recv_x, recv_topk_idx, recv_topk_weights, num_recv_tokens_per_expert_list, handle, event = \
             self.buffer.dispatch(x, topk_idx=topk_idx, topk_weights=topk_weights,
                                  num_tokens_per_rank=num_tokens_per_rank, num_tokens_per_rdma_rank=num_tokens_per_rdma_rank,
-                                 is_token_in_rank=is_token_in_rank, num_tokens_per_expert=num_tokens_per_expert)
+                                 is_token_in_rank=is_token_in_rank, num_tokens_per_expert=num_tokens_per_expert,
+                                 global_expert_id_offset=global_expert_id_offset)
         assert event.event is None
 
         # For event management, please refer to the docs of the `EventOverlap` class
diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py
index 1d46d0712ff..2bf7a45c7fc 100755
--- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py
@@ -455,12 +455,13 @@ def forward_chunk(
             elif self.alltoall_method_type == AlltoallMethodType.DeepEP:
                 if not use_postquant_alltoall:
                     x, recv_topk_idx, token_final_scales, num_recv_tokens_per_expert_list, deep_ep_handle = \
-                        self.deep_ep_buffer.dispatch(x, token_selected_slots.to(torch.int64), token_final_scales, self.num_slots)
-                    padded, x, _, recv_topk_idx, token_final_scales = self.pad_empty_recv_tensors(
+                        self.deep_ep_buffer.dispatch(x, token_selected_slots, token_final_scales, self.num_slots,
+                        self.expert_size_per_partition * self.mapping.moe_ep_rank)
+                    padded, x, _, token_selected_slots, token_final_scales = self.pad_empty_recv_tensors(
                         x, None, recv_topk_idx, token_final_scales)
             elif self.alltoall_method_type == AlltoallMethodType.DeepEPLowLatency:
                 if not use_postquant_alltoall:
-                    deep_ep_topk_idx = token_selected_slots.to(torch.int64)
+                    deep_ep_topk_idx = token_selected_slots
                     deep_ep_topk_weights = token_final_scales
                     x, recv_expert_count, deep_ep_handle = \
                         self.deep_ep_buffer.low_latency_dispatch(x, deep_ep_topk_idx, self.deep_ep_max_num_tokens, self.num_slots)
@@ -588,8 +589,9 @@ def forward_chunk(
                     x_sf_dtype = x_sf.dtype
                     x_sf = x_sf.view(torch.float32)
                 (x, x_sf), recv_topk_idx, token_final_scales, num_recv_tokens_per_expert_list, deep_ep_handle = \
-                    self.deep_ep_buffer.dispatch((x, x_sf), token_selected_slots.to(torch.int64), token_final_scales, self.num_slots)
-                padded, x, x_sf, recv_topk_idx, token_final_scales = self.pad_empty_recv_tensors(
+                    self.deep_ep_buffer.dispatch((x, x_sf), token_selected_slots, token_final_scales, self.num_slots,
+                    self.expert_size_per_partition * self.mapping.moe_ep_rank)
+                padded, x, x_sf, token_selected_slots, token_final_scales = self.pad_empty_recv_tensors(
                     x, x_sf, recv_topk_idx, token_final_scales)
                 if x_sf is not None:
                     x_sf = x_sf.view(x_sf_dtype)
@@ -619,7 +621,7 @@ def forward_chunk(
                 fp4_packed_tensor[:,
                                   x.shape[1]:x.shape[1] + x_sf.shape[1]] = x_sf
 
-                deep_ep_topk_idx = token_selected_slots.to(torch.int64)
+                deep_ep_topk_idx = token_selected_slots
                 deep_ep_topk_weights = token_final_scales
                 # Each LL combine/dispatch kernel call requires that the `dispatch_rdma_recv_count_buffer` be properly cleaned.
                 # However, the offset of this buffer within the entire RDMA buffer changes according to the hidden size.
@@ -668,15 +670,6 @@ def forward_chunk(
                     f"Not available alltoall method type: {self.alltoall_method_type!r}"
                 )
 
-        if use_all_to_all:
-            # Adapter between `torch.ops.trtllm.fused_moe` and DeepEP
-            # TODO: remove the adapter by changing APIs
-            if self.alltoall_method_type == AlltoallMethodType.DeepEP:
-                token_selected_slots = recv_topk_idx.to(torch.int32)
-                mask = token_selected_slots == -1
-                token_selected_slots += self.expert_size_per_partition * self.mapping.moe_ep_rank
-                token_selected_slots[mask] = self.num_slots
-
         final_hidden_states = torch.ops.trtllm.fused_moe(
             x,
             token_selected_slots,

From 200ea9ee819ddcbbf65a4ea08826d0ac6a50f18b Mon Sep 17 00:00:00 2001
From: xavier-nvidia <xsimmons@nvidia.com>
Date: Thu, 17 Jul 2025 19:26:08 -0700
Subject: [PATCH 74/88] fix TMA error with GEMM+AR on TP=2 (#6075)

Signed-off-by: Xavier Simmons <xsimmons@nvidia.com>
---
 .../allreduce_gemm/allreduce_gemm_impl_sm100.h            | 5 -----
 .../allreduce_gemm/allreduce_gemm_impl_sm90.h             | 5 -----
 .../plugins/gemmAllReducePlugin/gemmAllReducePlugin.cpp   | 7 +++++--
 .../plugins/gemmAllReducePlugin/gemmAllReducePlugin.h     | 2 +-
 .../gemmAllReducePlugin/gemmAllReducePluginProfiler.cpp   | 8 ++++++--
 cpp/tensorrt_llm/runtime/ipcNvlsMemory.cu                 | 7 +++++--
 6 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_impl_sm100.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_impl_sm100.h
index ed18541d0ac..a4be82607a8 100644
--- a/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_impl_sm100.h
+++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_impl_sm100.h
@@ -221,9 +221,6 @@ class GemmAllReduceImplTwoshot_Sm100 : public GemmAllReduceImplInterface
             {
                 MPI_group_barrier(_ranks);
             }
-
-            TLLM_CUDA_CHECK(cudaStreamCreate(&_memcpy_stream));
-            TLLM_CUDA_CHECK(cudaEventCreate(&_fork_join_event));
         }
 
         int free() override
@@ -267,8 +264,6 @@ class GemmAllReduceImplTwoshot_Sm100 : public GemmAllReduceImplInterface
         DeviceAllocationNvls<BarrierT> _tile_barriers;
         DeviceAllocationNvls<BarrierT> _completion_barriers;
         DeviceAllocationNvls<ElementD> _stage_buf;
-        cudaStream_t _memcpy_stream;
-        cudaEvent_t _fork_join_event;
     };
 
     GemmAllReduceImplTwoshot_Sm100()
diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_impl_sm90.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_impl_sm90.h
index ab867b69a87..fb446b451d8 100644
--- a/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_impl_sm90.h
+++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/allreduce_gemm/allreduce_gemm_impl_sm90.h
@@ -186,9 +186,6 @@ class GemmAllReduceImplTwoshot_Sm90 : public GemmAllReduceImplInterface
             {
                 MPI_group_barrier(_ranks);
             }
-
-            TLLM_CUDA_CHECK(cudaStreamCreate(&_memcpy_stream));
-            TLLM_CUDA_CHECK(cudaEventCreate(&_fork_join_event));
         }
 
         int free() override
@@ -232,8 +229,6 @@ class GemmAllReduceImplTwoshot_Sm90 : public GemmAllReduceImplInterface
         DeviceAllocationNvls<BarrierT> _tile_barriers;
         DeviceAllocationNvls<BarrierT> _completion_barriers;
         DeviceAllocationNvls<ElementD> _stage_buf;
-        cudaStream_t _memcpy_stream;
-        cudaEvent_t _fork_join_event;
     };
 
     GemmAllReduceImplTwoshot_Sm90()
diff --git a/cpp/tensorrt_llm/plugins/gemmAllReducePlugin/gemmAllReducePlugin.cpp b/cpp/tensorrt_llm/plugins/gemmAllReducePlugin/gemmAllReducePlugin.cpp
index 8d80827b900..4cec38b046a 100644
--- a/cpp/tensorrt_llm/plugins/gemmAllReducePlugin/gemmAllReducePlugin.cpp
+++ b/cpp/tensorrt_llm/plugins/gemmAllReducePlugin/gemmAllReducePlugin.cpp
@@ -108,6 +108,8 @@ void GemmAllReducePlugin::allocatePersistentWorkspace()
 {
     TLLM_CHECK(mOptions.maxProblemShape.isInitialized());
 
+    mWorkspaceKey = "gemm_allreduce_workspace_m" + std::to_string(mOptions.maxProblemShape.maxM);
+
     cutlass_kernels::GemmAllReduceImplInterface::LaunchConfig smallest_tile_config
         = mGemm->getSupportedLaunchConfigs()[0];
     cutlass_kernels::GemmAllReduceImplInterface::ProblemArgs args;
@@ -123,7 +125,7 @@ void GemmAllReducePlugin::allocatePersistentWorkspace()
 
     // Register and allocate workspace
     mWorkspace = static_cast<GemmAllReducePersistentWorkspace*>(
-        getPluginRegistry()->acquirePluginResource(mWorkspaceKey, &unallocated_resource));
+        getPluginRegistry()->acquirePluginResource(mWorkspaceKey.c_str(), &unallocated_resource));
     TLLM_CHECK(mWorkspace != nullptr);
 }
 
@@ -395,6 +397,7 @@ int GemmAllReducePlugin::enqueue(PluginTensorDesc const* inputDesc, PluginTensor
     auto const N = utils::computeNDimension(mOptions.transB, inputDesc[1].dims);
     auto const K = mOptions.transA ? inputDesc[0].dims.d[0] : inputDesc[0].dims.d[nbDimsA - 1];
 
+    TLLM_CHECK_WITH_INFO(M <= mOptions.maxProblemShape.maxM, "GemmAllReducePlugin M > maxM.");
     TLLM_CHECK_WITH_INFO(M > 0, "GemmAllReducePlugin M is 0.");
     TLLM_CHECK_WITH_INFO(N > 0, "GemmAllReducePlugin N is 0.");
     TLLM_CHECK_WITH_INFO(K > 0, "GemmAllReducePlugin K is 0.");
@@ -513,7 +516,7 @@ void GemmAllReducePlugin::terminate() noexcept
     // free mWorkspace
     if (mWorkspace)
     {
-        getPluginRegistry()->releasePluginResource(mWorkspaceKey);
+        getPluginRegistry()->releasePluginResource(mWorkspaceKey.c_str());
         mWorkspace = nullptr;
     }
 }
diff --git a/cpp/tensorrt_llm/plugins/gemmAllReducePlugin/gemmAllReducePlugin.h b/cpp/tensorrt_llm/plugins/gemmAllReducePlugin/gemmAllReducePlugin.h
index 4cd2a77a5c4..45792624600 100644
--- a/cpp/tensorrt_llm/plugins/gemmAllReducePlugin/gemmAllReducePlugin.h
+++ b/cpp/tensorrt_llm/plugins/gemmAllReducePlugin/gemmAllReducePlugin.h
@@ -154,7 +154,7 @@ class GemmAllReducePlugin : public BasePlugin
     int mNbOutputs = 0;
 
     std::map<KeyType, ValueType> mTypedInstantiators;
-    char const* mWorkspaceKey = "gemm_allreduce_workspace";
+    std::string mWorkspaceKey;
     std::shared_ptr<cutlass_kernels::GemmAllReduceImplInterface> mGemm;
     // Params that are initialized during configurePlugin()
     GemmAllReducePersistentWorkspace* mWorkspace = nullptr;
diff --git a/cpp/tensorrt_llm/plugins/gemmAllReducePlugin/gemmAllReducePluginProfiler.cpp b/cpp/tensorrt_llm/plugins/gemmAllReducePlugin/gemmAllReducePluginProfiler.cpp
index d6e0f3b8ac6..a6f7ca2615d 100644
--- a/cpp/tensorrt_llm/plugins/gemmAllReducePlugin/gemmAllReducePluginProfiler.cpp
+++ b/cpp/tensorrt_llm/plugins/gemmAllReducePlugin/gemmAllReducePluginProfiler.cpp
@@ -60,8 +60,12 @@ void GemmAllReducePluginProfiler::deserializeFromOwnFile(GemmIdCore gemmId, Gemm
 
 bool GemmAllReducePluginProfiler::useProfiler()
 {
-    char const* envDir = getenv("GEMM_AR_PLUGIN_PROFILE_DIR");
-    return envDir != nullptr;
+    // char const* envDir = getenv("GEMM_AR_PLUGIN_PROFILE_DIR");
+    // return envDir != nullptr;
+    // TODO(xsimmons): currently the profiler does not add any perf gain
+    // due to static heuristics being sufficient. We can re-enable this
+    // when we need more configurations.
+    return false;
 }
 
 std::string GemmAllReducePluginProfiler::getCacheFileName(GemmIdCore gemmId)
diff --git a/cpp/tensorrt_llm/runtime/ipcNvlsMemory.cu b/cpp/tensorrt_llm/runtime/ipcNvlsMemory.cu
index c685966148f..031ac92168a 100644
--- a/cpp/tensorrt_llm/runtime/ipcNvlsMemory.cu
+++ b/cpp/tensorrt_llm/runtime/ipcNvlsMemory.cu
@@ -295,6 +295,7 @@ public:
         // Clean up
         MPI_Group_free(&new_group);
         MPI_Group_free(&world_group);
+        MPI_Comm_free(&new_comm);
 
         return nvls_handle;
     }
@@ -401,14 +402,14 @@ void MPI_group_barrier(std::set<int> group)
     MPI_Comm new_comm;
 
     // Get the group of the world communicator
-    MPI_Comm_group(MPI_COMM_WORLD, &world_group);
+    MPI_Comm_group(COMM_SESSION, &world_group);
 
     // Create a new group containing only the ranks we want
     std::vector<int> ranks(group.begin(), group.end());
     MPI_Group_incl(world_group, ranks.size(), ranks.data(), &new_group);
 
     // Create a new communicator from the group
-    MPI_Comm_create_group(MPI_COMM_WORLD, new_group, 0, &new_comm);
+    MPI_Comm_create_group(COMM_SESSION, new_group, 0, &new_comm);
 
     // Use the new communicator for the barrier
     MPI_Barrier(new_comm);
@@ -510,6 +511,8 @@ IpcNvlsHandle* ipcNvlsAllocate(size_t size, std::set<int> group)
 
     MPI_Barrier(new_comm);
 
+    MPI_Comm_free(&new_comm);
+
     return handle;
 #else
     TLLM_THROW("ipcNvlsAllocate needs to be compiled with ENABLE_MULTI_DEVICE");

From 992b2730451be96a2a52dff85a33f6295f81091d Mon Sep 17 00:00:00 2001
From: Zhenhuan Chen <chenzhh3671@gmail.com>
Date: Fri, 18 Jul 2025 10:34:37 +0800
Subject: [PATCH 75/88] [https://nvbugs/5387375] fix(scaffolding): fix
 scaffolding aime test in test_e2e (#6140)

Signed-off-by: Zhenhuan Chen <chenzhh3671@gmail.com>
---
 .../scaffolding/run_best_of_n_with_reward.py  |  2 +-
 .../scaffolding/run_majority_vote_aime24.py   |  5 ++-
 tensorrt_llm/scaffolding/__init__.py          |  1 -
 tensorrt_llm/scaffolding/controller.py        | 13 +++----
 tensorrt_llm/scaffolding/math_utils.py        | 34 ++++++++++---------
 tensorrt_llm/scaffolding/result.py            | 10 +-----
 tensorrt_llm/scaffolding/scaffolding_llm.py   |  2 +-
 tensorrt_llm/scaffolding/task.py              | 27 +++++++--------
 tests/integration/test_lists/waives.txt       |  1 -
 tests/unittest/scaffolding/test_bench.py      |  6 ++--
 .../scaffolding/test_parallel_process.py      |  8 -----
 .../scaffolding/test_task_collection.py       |  7 ----
 12 files changed, 46 insertions(+), 70 deletions(-)

diff --git a/examples/scaffolding/run_best_of_n_with_reward.py b/examples/scaffolding/run_best_of_n_with_reward.py
index e451cf6b2c0..6ff9ed1228a 100644
--- a/examples/scaffolding/run_best_of_n_with_reward.py
+++ b/examples/scaffolding/run_best_of_n_with_reward.py
@@ -60,7 +60,7 @@ def main():
     prompts = [query]
 
     results = llm.generate(prompts)
-    print(results[0].output.output_str)
+    print(results[0].outputs[0].text)
     llm.shutdown(shutdown_workers=True)
     print(f'main shut down done')
 
diff --git a/examples/scaffolding/run_majority_vote_aime24.py b/examples/scaffolding/run_majority_vote_aime24.py
index 64b4510b19d..a3587a13663 100644
--- a/examples/scaffolding/run_majority_vote_aime24.py
+++ b/examples/scaffolding/run_majority_vote_aime24.py
@@ -101,9 +101,8 @@ def main():
         result = results[i]
         test_case = test_dataset[i]
         ref_answer = int(test_case["answer"])
-        result.result()
-        output = result.output
-        extracted_answer = extract_answer_from_boxed(output.output_str)
+        output = result.outputs[0]
+        extracted_answer = extract_answer_from_boxed(output.text)
         try:
             # print(f"[QUESTION]:\n{prompt}\n\n[OUTPUT]\n\n{output.output_str}\n\n")
             answer = int(extracted_answer)
diff --git a/tensorrt_llm/scaffolding/__init__.py b/tensorrt_llm/scaffolding/__init__.py
index 87ece61f90c..a07c30ac72a 100644
--- a/tensorrt_llm/scaffolding/__init__.py
+++ b/tensorrt_llm/scaffolding/__init__.py
@@ -12,7 +12,6 @@
 
 __all__ = [
     "ScaffoldingLlm",
-    "ScaffoldingOutput",
     "ParallelProcess",
     "Controller",
     "NativeGenerationController",
diff --git a/tensorrt_llm/scaffolding/controller.py b/tensorrt_llm/scaffolding/controller.py
index 10d7e5e0876..2e032cbb163 100644
--- a/tensorrt_llm/scaffolding/controller.py
+++ b/tensorrt_llm/scaffolding/controller.py
@@ -1,7 +1,7 @@
 import copy
 from abc import ABC
 from enum import Enum
-from typing import Any, List, Mapping
+from typing import Any, List, Mapping, Tuple
 
 import torch
 from torch.nn import functional as F
@@ -231,13 +231,14 @@ def process(self,
                               generation_kwargs_list)
 
         candidates = [tasks[0].output_str for tasks in tasks_list]
-        result = self.majority_vote(candidates, **majority_vote_kwargs)
+        majority_index, majority_answer = self.majority_vote(
+            candidates, **majority_vote_kwargs)
 
-        assert isinstance(result, str), "majority_vote failed"
+        assert isinstance(majority_answer, str), "majority_vote failed"
         # The task returned by majority vote does not have output_tokens and logits.
-        tasks[0].output_str = result
+        tasks[0].result = tasks_list[majority_index][0].result
 
-    def majority_vote(self, candidates: List[str], **kwargs) -> str:
+    def majority_vote(self, candidates: List[str], **kwargs) -> Tuple[int, str]:
         return get_digit_majority_vote_result(candidates)
 
 
@@ -292,7 +293,7 @@ def process(self,
 
         best_task, best_idx = self.select_best(generation_tasks, reward_values,
                                                **select_best_kwargs)
-        task.output_str = best_task.output_str
+        task.result = best_task.result
 
     def select_best(self, tasks: List[Task], reward_values, **kwargs) -> Task:
         max_index = torch.argmax(torch.tensor(reward_values)).item()
diff --git a/tensorrt_llm/scaffolding/math_utils.py b/tensorrt_llm/scaffolding/math_utils.py
index 71036d67129..df8417657f3 100644
--- a/tensorrt_llm/scaffolding/math_utils.py
+++ b/tensorrt_llm/scaffolding/math_utils.py
@@ -1,5 +1,4 @@
 import re
-from collections import Counter
 from typing import List
 
 
@@ -59,28 +58,31 @@ def get_majority_result(
     result_extractor=lambda x: x,
     result_validator=lambda x: True,
 ):
-    valid_answers_and_results = [(result, result_extractor(result))
-                                 for result in results
-                                 if result_validator(result) is True
-                                 and result_extractor(result) is not None]
-    if len(valid_answers_and_results) == 0:
+    extract_answers = [result_extractor(result) for result in results]
+    valid_answers = [
+        result for result in extract_answers
+        if result is not None and result_validator(result) is True
+    ]
+    if len(valid_answers) == 0:
         return None, None
 
-    majority_result = Counter(valid_answers_and_results).most_common(1)[0][0]
-    # return result and extracted result
-    return majority_result[0], majority_result[1]
+    answer_counts = {}
+    for answer in valid_answers:
+        answer_counts[answer] = answer_counts.get(answer, 0) + 1
+    majority_answer = max(answer_counts, key=answer_counts.get)
+    majority_index = next(
+        filter(lambda x: x[1] == majority_answer,
+               enumerate(extract_answers)))[0]
+    return majority_index, majority_answer
 
 
 def get_digit_majority_vote_result(results: List[str]) -> str:
 
     def is_digit(result: str):
-        extracted_answer = extract_answer_from_boxed(result)
-        if extracted_answer is None:
-            return False
-        return extracted_answer.isdigit()
+        return result.isdigit()
 
-    vote_result = get_majority_result(
+    index, extract_answer = get_majority_result(
         results,
         result_extractor=extract_answer_from_boxed,
-        result_validator=is_digit)[0]
-    return vote_result if vote_result else results[0]
+        result_validator=is_digit)
+    return (index, extract_answer) if extract_answer else (0, None)
diff --git a/tensorrt_llm/scaffolding/result.py b/tensorrt_llm/scaffolding/result.py
index b0571c8d60b..9ebb978d9b1 100644
--- a/tensorrt_llm/scaffolding/result.py
+++ b/tensorrt_llm/scaffolding/result.py
@@ -1,23 +1,15 @@
 import asyncio
-from dataclasses import dataclass
 from typing import Mapping, Optional
 
 from tensorrt_llm.executor.result import GenerationResult
 
 
-@dataclass(slots=True)
-class ScaffoldingOutput:
-
-    def __init__(self):
-        self.output_str = None
-
-
 class ScaffoldingResult:
 
     def __init__(self, streaming_event: Optional[asyncio.Event] = None):
         super().__init__()
         self.aqueue = asyncio.Queue()
-        self.cur_output = None
+        self.cur_output: GenerationResult = None
         self._done = False
         self.task_collections = None
         self.streaming_event = streaming_event
diff --git a/tensorrt_llm/scaffolding/scaffolding_llm.py b/tensorrt_llm/scaffolding/scaffolding_llm.py
index feda3e416cb..9eb79fdd657 100644
--- a/tensorrt_llm/scaffolding/scaffolding_llm.py
+++ b/tensorrt_llm/scaffolding/scaffolding_llm.py
@@ -82,7 +82,7 @@ async def _handle_task_list(self,
         ]
         await asyncio.gather(*async_tasks)
         for task in tasks:
-            if task.streaming:
+            if getattr(task, 'streaming', False):
                 await request.result.set_output_async(task.result)
                 self.streaming_event.clear()
                 await self.streaming_event.wait()
diff --git a/tensorrt_llm/scaffolding/task.py b/tensorrt_llm/scaffolding/task.py
index 5426e6d38fe..0abf666d981 100644
--- a/tensorrt_llm/scaffolding/task.py
+++ b/tensorrt_llm/scaffolding/task.py
@@ -62,8 +62,6 @@ class GenerationTask(Task):
     worker_tag: Union[str, "Controller.WorkerTag"] = None
 
     # result field
-    _outputs: Optional[List[dict]] = None
-
     # link to TRTLLM's GenerationResult, for async update in streaming mode
     _result: Optional[GenerationResult] = None
 
@@ -74,35 +72,36 @@ def result(self) -> GenerationResult:
     @result.setter
     def result(self, result: GenerationResult) -> None:
         self._result = result
-        self._outputs = result.outputs
+
+    @property
+    def outputs(self) -> Optional[List[dict]]:
+        return self._result.outputs if self._result else None
 
     @property
     def output_tokens(self) -> List[int]:
-        return self._outputs[
-            0].token_ids if self.result and self._outputs else None
+        return self._result.outputs[0].token_ids if self._result else None
 
     @property
     def output_str(self) -> Optional[str]:
-        return self._outputs[0].text if self.result and self._outputs else None
+        return self._result.outputs[0].text if self._result else None
 
     @output_str.setter
     def output_str(self, output) -> Optional[str]:
-        assert self.result and self._outputs
-        self._outputs[0].text = output
+        assert self.result
+        self._result.outputs[0].text = output
 
     @property
     def cumulative_logprob(self) -> Optional[float]:
-        return self._outputs[
-            0].cumulative_logprob if self.result and self._outputs else None
+        return self._result.outputs[
+            0].cumulative_logprob if self._result else None
 
     @property
     def logprobs(self) -> Optional[List[float]]:
-        return self._outputs[
-            0].logprobs if self.result and self._outputs else None
+        return self._result.outputs[0].logprobs if self._result else None
 
     @property
     def context_logits(self) -> Optional[torch.Tensor]:
-        return self.result.context_logits if self.result else None
+        return self._result.context_logits if self._result else None
 
     @staticmethod
     def create_from_prompt(prompt: str) -> "GenerationTask":
@@ -113,7 +112,7 @@ def create_from_prompt(prompt: str) -> "GenerationTask":
         return task
 
     def create_scaffolding_output(self) -> GenerationResult:
-        return self.result
+        return self._result
 
 
 @dataclass
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index cd453839d9a..630f62ab670 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -433,7 +433,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[Qwen2-VL-7B-Instruct-pp
 examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-cnn_dailymail-Qwen2-VL-7B-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False] SKIP (https://nvbugs/5385987)
 examples/test_multimodal.py::test_llm_multimodal_general[Phi-4-multimodal-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5385992)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] SKIP (https://nvbugs/5377914)
-test_e2e.py::test_ptp_scaffolding[DeepSeek-R1-Distill-Qwen-7B-DeepSeek-R1/DeepSeek-R1-Distill-Qwen-7B] SKIP (https://nvbugs/5387375)
 examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387422)
 examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387424)
 test_e2e.py::test_ptp_quickstart SKIP (https://nvbugs/5387762)
diff --git a/tests/unittest/scaffolding/test_bench.py b/tests/unittest/scaffolding/test_bench.py
index 27988e8453e..a65584d4c44 100644
--- a/tests/unittest/scaffolding/test_bench.py
+++ b/tests/unittest/scaffolding/test_bench.py
@@ -13,7 +13,7 @@
 class DummyWorker(Worker):
 
     async def dummy_generation_handler(self, task: GenerationTask):
-        task.output_str = OUTPUT_STR
+        task.result = OUTPUT_STR
         return TaskStatus.SUCCESS
 
     task_handlers = {GenerationTask: dummy_generation_handler}
@@ -29,7 +29,7 @@ def before_yield(self, tasks: List[Task]):
         pass
 
     def after_yield(self, tasks: List[Task]):
-        self.output_len = len(tasks[0].output_str)
+        self.output_len = len(tasks[0].result)
 
 
 def test_scaffolding_benchmark():
@@ -56,6 +56,6 @@ def test_scaffolding_benchmark():
 
     assert len(results) == requests_num
     assert len(requests_execution_time) == requests_num
-    assert results[0].output.output_str == OUTPUT_STR
+    assert results[0].cur_output == OUTPUT_STR
     assert results[0].task_collections[
         "bench_dummy_collection"].output_len == len(OUTPUT_STR)
diff --git a/tests/unittest/scaffolding/test_parallel_process.py b/tests/unittest/scaffolding/test_parallel_process.py
index 7b2e7d4c4cb..e277b9d97ac 100644
--- a/tests/unittest/scaffolding/test_parallel_process.py
+++ b/tests/unittest/scaffolding/test_parallel_process.py
@@ -4,8 +4,6 @@
 from enum import Enum
 from typing import List
 
-import pytest
-
 from tensorrt_llm.scaffolding import (Controller, ParallelProcess,
                                       ScaffoldingLlm, Task, TaskStatus, Worker)
 
@@ -21,8 +19,6 @@ def create_from_prompt(prompt: str) -> "DummyTask":
         task = DummyTask(2)
         return task
 
-    # TODO: Fix when ScaffoldingOutput is replaced with GenerationResult
-    # def create_scaffolding_output(self) -> "ScaffoldingOutput":
     def create_scaffolding_output(self):
         self.verify()
         return None
@@ -34,8 +30,6 @@ def verify(self):
 
 class DummyControllerBase(Controller):
 
-    # TODO: Fix when ScaffoldingOutput is replaced with GenerationResult
-    # def generate(self, prompt: str, **kwargs) -> ScaffoldingOutput:
     def generate(self, prompt: str, **kwargs):
         task = DummyTask.create_from_prompt(prompt)
         yield from self.process([task], **kwargs)
@@ -125,7 +119,6 @@ def parallel_process_helper_run_and_verify(controllers):
     llm.shutdown()
 
 
-@pytest.skip(reason="ScaffoldingOutput removed in PR #5345, needs refactoring")
 def test_parallel_process_helper():
     NUM_CONTROLLERS = 3
     controllers = []
@@ -137,7 +130,6 @@ def test_parallel_process_helper():
     parallel_process_helper_run_and_verify(controllers)
 
 
-@pytest.skip(reason="ScaffoldingOutput removed in PR #5345, needs refactoring")
 def test_parallel_process_helper_with_two_level():
     NUM_CONTROLLERS_LEVEL_1 = 2
     NUM_CONTROLLERS_LEVEL_2 = 2
diff --git a/tests/unittest/scaffolding/test_task_collection.py b/tests/unittest/scaffolding/test_task_collection.py
index 53ce7c590ed..6f611ab57fc 100644
--- a/tests/unittest/scaffolding/test_task_collection.py
+++ b/tests/unittest/scaffolding/test_task_collection.py
@@ -2,8 +2,6 @@
 from enum import Enum
 from typing import List
 
-import pytest
-
 from tensorrt_llm.scaffolding import (Controller, ParallelProcess,
                                       ScaffoldingLlm, Task, TaskCollection,
                                       TaskStatus, Worker, with_task_collection)
@@ -20,8 +18,6 @@ def create_from_prompt(prompt: str) -> "DummyTask":
         task = DummyTask()
         return task
 
-    # TODO: Fix when ScaffoldingOutput is replaced with GenerationResult
-    # def create_scaffolding_output(self) -> "ScaffoldingOutput":
     def create_scaffolding_output(self):
         return None
 
@@ -55,8 +51,6 @@ def __init__(self, expected_task_count: int):
         super().__init__()
         self.expected_task_count = expected_task_count
 
-    # TODO: Fix when ScaffoldingOutput is replaced with GenerationResult
-    # def generate(self, prompt: str, **kwargs) -> ScaffoldingOutput:
     def generate(self, prompt: str, **kwargs):
         task = DummyTask.create_from_prompt(prompt)
         yield from self.process([task], **kwargs)
@@ -127,7 +121,6 @@ def run(controller, expected_task_count):
     llm.shutdown()
 
 
-@pytest.skip(reason="ScaffoldingOutput removed in PR #5345, needs refactoring")
 def test_dummy_task_collection():
     controller = DummyController(1)
     run(controller, 1)

From 812243bdd6a4596e1775039bb79db0dea6318adf Mon Sep 17 00:00:00 2001
From: Aurelien Chartier <2567591+achartier@users.noreply.github.com>
Date: Thu, 17 Jul 2025 19:35:12 -0700
Subject: [PATCH 76/88] feat: add support for Modelopt fp8_pb_wo quantization
 scheme (#6106)

Signed-off-by: Aurelien Chartier <2567591+achartier@users.noreply.github.com>
Co-authored-by: Haohang Huang <31998628+symphonylyh@users.noreply.github.com>
---
 tensorrt_llm/_torch/model_config.py   | 3 +++
 tensorrt_llm/_torch/modules/linear.py | 8 +++++---
 tensorrt_llm/llmapi/llm_utils.py      | 6 +++++-
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/tensorrt_llm/_torch/model_config.py b/tensorrt_llm/_torch/model_config.py
index 671564baadc..3de3edd3a9b 100644
--- a/tensorrt_llm/_torch/model_config.py
+++ b/tensorrt_llm/_torch/model_config.py
@@ -202,6 +202,9 @@ def from_pretrained(cls,
             json_quant_configs = quant_config_dict['quantization']
 
             quant_config.quant_algo = json_quant_configs.get('quant_algo', None)
+            # fp8_pb_wo from modelopt is the same as FP8_BLOCK_SCALES
+            if quant_config.quant_algo == "fp8_pb_wo":
+                quant_config.quant_algo = 'FP8_BLOCK_SCALES'
             quant_config.kv_cache_quant_algo = json_quant_configs.get(
                 'kv_cache_quant_algo', None)
             quant_config.group_size = json_quant_configs.get('group_size', None)
diff --git a/tensorrt_llm/_torch/modules/linear.py b/tensorrt_llm/_torch/modules/linear.py
index ca9cb6501d0..134f1c8ebf8 100644
--- a/tensorrt_llm/_torch/modules/linear.py
+++ b/tensorrt_llm/_torch/modules/linear.py
@@ -562,7 +562,8 @@ def load_weights_vanilla(self, module: Linear, weights: List[Dict]) -> None:
 
         scale_name = self._get_scale_name(weights)
         weight_scale = load_weight_shard(weights[0][scale_name], module.tp_size,
-                                         module.tp_rank, module.tp_mode)
+                                         module.tp_rank,
+                                         module.tp_mode).squeeze()
         copy_weight(module.weight_scale, weight_scale)
         if "input_scale" in weights[0]:
             copy_weight(module.input_scale, weights[0]["input_scale"])
@@ -582,7 +583,8 @@ def load_weights_fused_qkv_linear(self, module: Linear,
                                     module.tp_rank, module.tp_mode)
         v_scale = load_weight_shard(weights[2][scale_name], module.tp_size,
                                     module.tp_rank, module.tp_mode)
-        fused_fp8_block_scale = torch.cat((q_scale, k_scale, v_scale))
+        fused_fp8_block_scale = torch.cat((q_scale, k_scale, v_scale)).squeeze()
+
         copy_weight(module.weight_scale, fused_fp8_block_scale)
 
     def load_weights_fused_gate_up_linear(self, module: Linear,
@@ -597,7 +599,7 @@ def load_weights_fused_gate_up_linear(self, module: Linear,
                                        module.tp_rank, module.tp_mode)
         right_scale = load_weight_shard(weights[1][scale_name], module.tp_size,
                                         module.tp_rank, module.tp_mode)
-        fused_scale = torch.cat([left_scale, right_scale], dim=0)
+        fused_scale = torch.cat([left_scale, right_scale], dim=0).squeeze()
         copy_weight(module.weight_scale, fused_scale)
 
 
diff --git a/tensorrt_llm/llmapi/llm_utils.py b/tensorrt_llm/llmapi/llm_utils.py
index 31f853f3705..a62568a54e8 100644
--- a/tensorrt_llm/llmapi/llm_utils.py
+++ b/tensorrt_llm/llmapi/llm_utils.py
@@ -362,7 +362,11 @@ def _update_from_hf_quant_config(self) -> bool:
 
             hf_quant_algo = hf_quant_config.pop("quant_algo", None)
             if hf_quant_algo is not None:
-                hf_quant_algo = QuantAlgo(hf_quant_algo)
+                # fp8_pb_wo from modelopt is the same as fp8_block_scales
+                if hf_quant_algo == "fp8_pb_wo":
+                    hf_quant_algo = QuantAlgo.FP8_BLOCK_SCALES
+                else:
+                    hf_quant_algo = QuantAlgo(hf_quant_algo)
                 if quant_config.quant_algo is None:
                     logger.info(
                         f"Setting quant_algo={hf_quant_algo} form HF quant config."

From c0e416535e830fabacb49f2f671bd662b50d85cc Mon Sep 17 00:00:00 2001
From: Chuang Zhu <111838961+chuangz0@users.noreply.github.com>
Date: Fri, 18 Jul 2025 13:18:37 +0800
Subject: [PATCH 77/88] fix single_disagg_test (#6166)

Signed-off-by: Chuang Zhu <111838961+chuangz0@users.noreply.github.com>
---
 .../defs/disaggregated/test_disaggregated_single_gpu.py  | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py
index 1e1859f5aa6..5ed5c3e2710 100644
--- a/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py
+++ b/tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py
@@ -360,18 +360,21 @@ def test_disaggregated_spec_dec_batch_slot_limit(model, spec_dec_model_path,
         KvCacheConfig(max_tokens=128, enable_block_reuse=False)
         for _ in range(2)
     ]
+    cache_transceiver_configs = [
+        CacheTransceiverConfig(backend="default") for _ in range(2)
+    ]
     model_names = [model_path(model) for _ in range(2)]
     ranks = [0, 1]
     worker_args = list(
-        zip(kv_cache_configs, worker_pytorch_configs, model_names, ranks))
+        zip(kv_cache_configs, cache_transceiver_configs, worker_pytorch_configs,
+            model_names, ranks))
 
     port_name = MPI.Open_port()
     MPI.Publish_name('my_port', port_name)
 
     prompt = "What is the capital of Germany?"
 
-    with MPIPoolExecutor(max_workers=2, env={"TRTLLM_USE_MPI_KVCACHE":
-                                             "1"}) as executor:
+    with MPIPoolExecutor(max_workers=2, env={"UCX_TLS": "^ib"}) as executor:
         futures = []
         try:
             for worker_arg in worker_args:

From f32169269a233ea5c3e7f2d6a712befb7548bbee Mon Sep 17 00:00:00 2001
From: Yiqing Yan <yiqingy@nvidia.com>
Date: Fri, 18 Jul 2025 15:25:05 +0800
Subject: [PATCH 78/88] [TRTLLM-5179] - Update bot help messages (#5277)

Signed-off-by: Yiqing Yan <yiqingy@nvidia.com>
---
 .github/pull_request_template.md  | 18 ++++++++++++++----
 .github/workflows/bot-command.yml | 13 +++++++++----
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
index f4bb9f33c48..202a38d90d0 100644
--- a/.github/pull_request_template.md
+++ b/.github/pull_request_template.md
@@ -38,27 +38,37 @@ See details below for each supported subcommand.
 
 <details>
 
-`run  [--disable-fail-fast --skip-test --stage-list "A10-1, xxx" --gpu-type "A30, H100_PCIe" --add-multi-gpu-test --only-multi-gpu-test --disable-multi-gpu-test --post-merge --extra-stage "H100_PCIe-[Post-Merge]-1, xxx"]`
+`run  [--reuse-test (optional)pipeline-id --disable-fail-fast --skip-test --stage-list "A10-PyTorch-1, xxx" --gpu-type "A30, H100_PCIe" --test-backend "pytorch, cpp" --add-multi-gpu-test --only-multi-gpu-test --disable-multi-gpu-test --post-merge --extra-stage "H100_PCIe-TensorRT-Post-Merge-1, xxx" --detailed-log --debug(experimental)]`
 
 Launch build/test pipelines. All previously running jobs will be killed.
 
+`--reuse-test (optional)pipeline-id ` *(OPTIONAL)* : Allow the new pipeline to reuse build artifacts and skip successful test stages from a specified pipeline or the last pipeline if no pipeline-id is indicated. If the Git commit ID has changed, this option will be always ignored. The DEFAULT behavior of the bot is to reuse build artifacts and successful test results from the last pipeline.
+
+`--disable-reuse-test ` *(OPTIONAL)* : Explicitly prevent the pipeline from reusing build artifacts and skipping successful test stages from a previous pipeline. Ensure that all builds and tests are run regardless of previous successes.
+
 `--disable-fail-fast ` *(OPTIONAL)* : Disable fail fast on build/tests/infra failures.
 
 `--skip-test ` *(OPTIONAL)* : Skip all test stages, but still run build stages, package stages and sanity check stages. Note: Does **NOT** update GitHub check status.
 
-`--stage-list "A10-1, xxx"` *(OPTIONAL)* : Only run the specified test stages. Examples: "A10-1, xxx". Note: Does **NOT** update GitHub check status.
+`--stage-list "A10-PyTorch-1, xxx"` *(OPTIONAL)* : Only run the specified test stages. Examples: "A10-PyTorch-1, xxx". Note: Does **NOT** update GitHub check status.
 
 `--gpu-type "A30, H100_PCIe"` *(OPTIONAL)* : Only run the test stages on the specified GPU types. Examples: "A30, H100_PCIe". Note: Does **NOT** update GitHub check status.
 
+`--test-backend "pytorch, cpp"` *(OPTIONAL)* : Skip test stages which don't match the specified backends. Only support [pytorch, cpp, tensorrt, triton]. Examples: "pytorch, cpp" (does not run test stages with tensorrt or triton backend). Note: Does **NOT** update GitHub pipeline status.
+
 `--only-multi-gpu-test ` *(OPTIONAL)* : Only run the multi-GPU tests. Note: Does **NOT** update GitHub check status.
 
 `--disable-multi-gpu-test ` *(OPTIONAL)* : Disable the multi-GPU tests. Note: Does **NOT** update GitHub check status.
 
-`--add-multi-gpu-test ` *(OPTIONAL)* : Force run the multi-GPU tests. Will also run L0 pre-merge pipeline.
+`--add-multi-gpu-test ` *(OPTIONAL)* : Force run the multi-GPU tests in addition to running L0 pre-merge pipeline.
 
 `--post-merge ` *(OPTIONAL)* : Run the L0 post-merge pipeline instead of the ordinary L0 pre-merge pipeline.
 
-`--extra-stage "H100_PCIe-[Post-Merge]-1, xxx"` *(OPTIONAL)* : Run the ordinary L0 pre-merge pipeline and specified test stages. Examples: --extra-stage "H100_PCIe-[Post-Merge]-1, xxx".
+`--extra-stage "H100_PCIe-TensorRT-Post-Merge-1, xxx"` *(OPTIONAL)* : Run the ordinary L0 pre-merge pipeline and specified test stages. Examples: --extra-stage "H100_PCIe-TensorRT-Post-Merge-1, xxx".
+
+`--detailed-log ` *(OPTIONAL)* : Enable flushing out all logs to the Jenkins console. This will significantly increase the log volume and may slow down the job.
+
+`--debug ` *(OPTIONAL)* : **Experimental feature**. Enable access to the CI container for debugging purpose. Note: Specify exactly one stage in the `stage-list` parameter to access the appropriate container environment. Note: Does **NOT** update GitHub check status.
 
 For guidance on mapping tests to stage names, see `docs/source/reference/ci-overview.md`.
 
diff --git a/.github/workflows/bot-command.yml b/.github/workflows/bot-command.yml
index 573e7f499ab..6689ab619d3 100644
--- a/.github/workflows/bot-command.yml
+++ b/.github/workflows/bot-command.yml
@@ -46,17 +46,22 @@ jobs:
             "Run `/bot [-h|--help]` to print this help message.\n\n" +
             "See details below for each supported subcommand.\n\n" +
             "<details>\n\n" +
-            "`run  [--disable-fail-fast --skip-test --stage-list \"A10-1, xxx\" --gpu-type \"A30, H100_PCIe\" --add-multi-gpu-test --only-multi-gpu-test --disable-multi-gpu-test --post-merge --extra-stage \"H100_PCIe-[Post-Merge]-1, xxx\"]`\n\n" +
+            "`run  [--reuse-test (optional)pipeline-id --disable-fail-fast --skip-test --stage-list \"A10-PyTorch-1, xxx\" --gpu-type \"A30, H100_PCIe\" --test-backend \"pytorch, cpp\" --add-multi-gpu-test --only-multi-gpu-test --disable-multi-gpu-test --post-merge --extra-stage \"H100_PCIe-TensorRT-Post-Merge-1, xxx\" --detailed-log --debug(experimental)]`\n\n" +
             "Launch build/test pipelines. All previously running jobs will be killed.\n\n" +
+            "`--reuse-test (optional)pipeline-id ` *(OPTIONAL)* : Allow the new pipeline to reuse build artifacts and skip successful test stages from a specified pipeline or the last pipeline if no pipeline-id is indicated. If the Git commit ID has changed, this option will be always ignored. The DEFAULT behavior of the bot is to reuse build artifacts and successful test results from the last pipeline.\n\n" +
+            "`--disable-reuse-test ` *(OPTIONAL)* : Explicitly prevent the pipeline from reusing build artifacts and skipping successful test stages from a previous pipeline. Ensure that all builds and tests are run regardless of previous successes.\n\n" +
             "`--disable-fail-fast ` *(OPTIONAL)* : Disable fail fast on build/tests/infra failures.\n\n" +
             "`--skip-test ` *(OPTIONAL)* : Skip all test stages, but still run build stages, package stages and sanity check stages. Note: Does **NOT** update GitHub check status.\n\n" +
-            "`--stage-list \"A10-1, xxx\"` *(OPTIONAL)* : Only run the specified test stages. Examples: \"A10-1, xxx\". Note: Does **NOT** update GitHub check status.\n\n" +
+            "`--stage-list \"A10-PyTorch-1, xxx\"` *(OPTIONAL)* : Only run the specified test stages. Examples: \"A10-PyTorch-1, xxx\". Note: Does **NOT** update GitHub check status.\n\n" +
             "`--gpu-type \"A30, H100_PCIe\"` *(OPTIONAL)* : Only run the test stages on the specified GPU types. Examples: \"A30, H100_PCIe\". Note: Does **NOT** update GitHub check status.\n\n" +
+            "`--test-backend \"pytorch, cpp\"` *(OPTIONAL)* : Skip test stages which don't match the specified backends. Only support [pytorch, cpp, tensorrt, triton]. Examples: \"pytorch, cpp\" (does not run test stages with tensorrt or triton backend). Note: Does **NOT** update GitHub pipeline status.\n\n" +
             "`--only-multi-gpu-test ` *(OPTIONAL)* : Only run the multi-GPU tests. Note: Does **NOT** update GitHub check status.\n\n" +
             "`--disable-multi-gpu-test ` *(OPTIONAL)* : Disable the multi-GPU tests. Note: Does **NOT** update GitHub check status.\n\n" +
-            "`--add-multi-gpu-test ` *(OPTIONAL)* : Force run the multi-GPU tests. Will also run L0 pre-merge pipeline.\n\n" +
+            "`--add-multi-gpu-test ` *(OPTIONAL)* : Force run the multi-GPU tests in addition to running L0 pre-merge pipeline.\n\n" +
             "`--post-merge ` *(OPTIONAL)* : Run the L0 post-merge pipeline instead of the ordinary L0 pre-merge pipeline.\n\n" +
-            "`--extra-stage \"H100_PCIe-[Post-Merge]-1, xxx\"` *(OPTIONAL)* : Run the ordinary L0 pre-merge pipeline and specified test stages. Examples: --extra-stage \"H100_PCIe-[Post-Merge]-1, xxx\".\n\n" +
+            "`--extra-stage \"H100_PCIe-TensorRT-Post-Merge-1, xxx\"` *(OPTIONAL)* : Run the ordinary L0 pre-merge pipeline and specified test stages. Examples: --extra-stage \"H100_PCIe-TensorRT-Post-Merge-1, xxx\".\n\n" +
+            "`--detailed-log ` *(OPTIONAL)* : Enable flushing out all logs to the Jenkins console. This will significantly increase the log volume and may slow down the job.\n\n" +
+            "`--debug ` *(OPTIONAL)* : **Experimental feature**. Enable access to the CI container for debugging purpose. Note: Specify exactly one stage in the `stage-list` parameter to access the appropriate container environment. Note: Does **NOT** update GitHub check status.\n\n" +
             "### kill\n\n" +
             "`kill  `\n\n" +
             "Kill all running builds associated with pull request.\n\n" +

From 519a2116b5c4d0c945654a8eacb52817c1ad8f93 Mon Sep 17 00:00:00 2001
From: Yiteng Niu <6831097+niukuo@users.noreply.github.com>
Date: Fri, 18 Jul 2025 15:38:38 +0800
Subject: [PATCH 79/88] [None][infra] Update the allow list of CI trigger
 (#6168)

Signed-off-by: tensorrt-cicd <90828364+tensorrt-cicd@users.noreply.github.com>
Co-authored-by: tensorrt-cicd <90828364+tensorrt-cicd@users.noreply.github.com>
---
 .github/workflows/blossom-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml
index 7690a85e22d..b2b253b2f6c 100644
--- a/.github/workflows/blossom-ci.yml
+++ b/.github/workflows/blossom-ci.yml
@@ -40,7 +40,7 @@ jobs:
         startsWith(github.event.comment.body, '/bot skip --comment') ||
         startsWith(github.event.comment.body, '/bot reuse-pipeline') ||
         startsWith(github.event.comment.body, '/bot kill')) && contains(
-        fromJson('["byshiue","chuangz0","funatiq","hypdeb","jdemouth-nvidia","joyang-nv","lowsfer","Tabrizian","yweng0828","Shixiaowei02","MartinMarciniszyn","schetlur-nv","dcampora","pcastonguay","Naveassaf","lfr-0531","nekorobov","PerkzZheng","kaiyux","nv-guomingz","LinPoly","thorjohnsen","jiahanc","latency1024","tburt-nv","zeroepoch","chzblych","niukuo","ZhanruiSunCh","EmmaQiaoCh","yiqingy0","achartier","suyoggupta","amukkara","mk-nvidia","QiJune","lucaslie","davidmlw","hlu1","nvzhou","syuoni","NVGaryJi","symphonylyh","hello-11","zongfeijing","Jackch-NV","jinyangyuan-nvidia","LarryXFly","crazydemo","jaedeok-nvidia","wm2012011492","rosenrodt","zhuoyao1012","xinhe-nv","Yuening-wa","Shunkangz","zhengd-nv","yibinl-nvidia","StanleySun639","KingsleyLiu-NV","kxdc","yingcanw","BestJuly","ChristinaZ","bobboli","xueweilnvidia","kunlunl","cherichy","lucifer1004","Autumn1998","litaotju","peaceh-nv","liji-nv","SimengLiu-nv","yuxianq","yechank-nvidia","vallis-neria","DylanChen-NV","Tracin","zhhuang-nv","ISEEKYAN","xupinjie","tongyuantongyu","laikhtewari","zhuolingwang","dominicshanshan","jershi425","shifangx","StudyingShao","Superjomn","dongjiyingdjy","guangyunh-nv","wili-65535","tiffany940107","DanBlanaru","mikeiovine","djns99","ruodil","xiaoweiw-nv","xuwchen","bashimao","yizhang-nv","hyukn","nvpohanh","yuki-666","juney-nvidia","barry-delaney","Kefeng-Duan","MinaHuai","yilin-void","jhaotingc","jmydurant","katec846","CarstyYou","Njuapp","Jie-Fang","nvbrantz","inocsin","ruoqianguo","chenfeiz0326","ming-wei","eopXD","longlee0622","dongfengy","georgeliu95","evezhier","rakib-hasan","shangz-ai","JyChang012","wangsiping1997","yuanjings-nvda","tomeras91","roikoren755","amirkl94","shaharmor98","danielafrimi","amitz-nv","hijkzzz","rzilberstein-nvidia","dc3671","hchings","yuhengxnv","dongxuy04","qiaoxj07","omera-nv","DomBrown","brb-nv","FrankD412","yuhsuan-t","Fridah-nv","a-mccarthy","HuiGao-NV","alexmsettle","meenchen","sugunav14","cjluo-nv","kyleliang-nv","chang-l","WeiHaocheng","qixiang-99","BatshevaBlack","ebarilanM","xmchen1987","lingjiew","heyuhhh","netanel-haber","jiefangz-nv","wyw1267","yunruis","sklevtsov-nvidia","jgangani","pamelap-nvidia","ixlmar","GalSha","Dido0o0","rabiel","nvzhihanj","milesial","fzmu727","zackyoray","RoeyAzran1992","viraatc","v-shobhit","yuanjingx87","uchihatmtkinu","nvrohanv","vegaluisjose","qsang-nv","ChunhuanLin","timlee0212","venkywonka","zbpatel","tijyojwad","shyeh25","zihaok","nv-yilinf","ttyio","farazkh80","yuantailing","JennyLiu-nv","moraxu","IzzyPutterman","nvchenghaoz","nvxuanyuc","poweiw","stnie","zhanga5","nzmora-nvidia","greg-kwasniewski1","linda-stadter","Tom-Zheng","vanshilshah97","ixlmar","MatthiasKohl","Wanli-Jiang", "arekay", "davidclark-nv", "2ez4bz", "tcherckez-nvidia", "MrGeva", "galagam", "limin2021", "dhansen-nvidia","talorabr","kanghui0204","wu6u3tw","hvagadia","xavier-nvidia","raayandhar","dbari","nvjullin","elvischenv","zhenhuaw-me","weireweire","yifeizhang-c","jiaganc","ziyixiong-nv","FelixXidddd","JunyiXu-nv","bo-nv","zerollzeng","RayenTian","ameynaik-hub"]'),
+        fromJson('["byshiue","chuangz0","funatiq","hypdeb","jdemouth-nvidia","joyang-nv","lowsfer","Tabrizian","yweng0828","Shixiaowei02","MartinMarciniszyn","schetlur-nv","dcampora","pcastonguay","Naveassaf","lfr-0531","nekorobov","PerkzZheng","kaiyux","nv-guomingz","LinPoly","thorjohnsen","jiahanc","latency1024","tburt-nv","zeroepoch","chzblych","niukuo","ZhanruiSunCh","EmmaQiaoCh","yiqingy0","achartier","suyoggupta","amukkara","mk-nvidia","QiJune","lucaslie","davidmlw","hlu1","nvzhou","syuoni","NVGaryJi","symphonylyh","hello-11","zongfeijing","Jackch-NV","jinyangyuan-nvidia","LarryXFly","crazydemo","jaedeok-nvidia","wm2012011492","rosenrodt","zhuoyao1012","xinhe-nv","Yuening-wa","Shunkangz","zhengd-nv","yibinl-nvidia","StanleySun639","KingsleyLiu-NV","kxdc","yingcanw","BestJuly","ChristinaZ","bobboli","xueweilnvidia","kunlunl","cherichy","lucifer1004","Autumn1998","litaotju","peaceh-nv","liji-nv","SimengLiu-nv","yuxianq","yechank-nvidia","vallis-neria","DylanChen-NV","Tracin","zhhuang-nv","ISEEKYAN","xupinjie","tongyuantongyu","laikhtewari","zhuolingwang","dominicshanshan","jershi425","shifangx","StudyingShao","Superjomn","dongjiyingdjy","guangyunh-nv","wili-65535","tiffany940107","DanBlanaru","mikeiovine","djns99","ruodil","xiaoweiw-nv","xuwchen","bashimao","yizhang-nv","hyukn","nvpohanh","yuki-666","juney-nvidia","barry-delaney","Kefeng-Duan","MinaHuai","yilin-void","jhaotingc","jmydurant","katec846","CarstyYou","Njuapp","Jie-Fang","nvbrantz","inocsin","ruoqianguo","chenfeiz0326","ming-wei","eopXD","longlee0622","dongfengy","georgeliu95","evezhier","rakib-hasan","shangz-ai","JyChang012","wangsiping1997","yuanjings-nvda","tomeras91","roikoren755","amirkl94","shaharmor98","danielafrimi","amitz-nv","hijkzzz","rzilberstein-nvidia","dc3671","hchings","yuhengxnv","dongxuy04","qiaoxj07","omera-nv","DomBrown","brb-nv","FrankD412","yuhsuan-t","Fridah-nv","a-mccarthy","HuiGao-NV","alexmsettle","meenchen","sugunav14","cjluo-nv","kyleliang-nv","chang-l","WeiHaocheng","qixiang-99","BatshevaBlack","ebarilanM","xmchen1987","lingjiew","heyuhhh","netanel-haber","jiefangz-nv","wyw1267","yunruis","sklevtsov-nvidia","jgangani","pamelap-nvidia","ixlmar","GalSha","Dido0o0","rabiel","nvzhihanj","milesial","fzmu727","zackyoray","RoeyAzran1992","viraatc","v-shobhit","yuanjingx87","uchihatmtkinu","nvrohanv","vegaluisjose","qsang-nv","ChunhuanLin","timlee0212","venkywonka","zbpatel","tijyojwad","shyeh25","zihaok","nv-yilinf","ttyio","farazkh80","yuantailing","JennyLiu-nv","moraxu","IzzyPutterman","nvchenghaoz","nvxuanyuc","poweiw","stnie","zhanga5","nzmora-nvidia","greg-kwasniewski1","linda-stadter","Tom-Zheng","vanshilshah97","ixlmar","MatthiasKohl","Wanli-Jiang", "arekay", "davidclark-nv", "2ez4bz", "tcherckez-nvidia", "MrGeva", "galagam", "limin2021", "dhansen-nvidia","talorabr","kanghui0204","wu6u3tw","hvagadia","xavier-nvidia","raayandhar","dbari","nvjullin","elvischenv","zhenhuaw-me","weireweire","yifeizhang-c","jiaganc","ziyixiong-nv","FelixXidddd","JunyiXu-nv","bo-nv","zerollzeng","RayenTian","ameynaik-hub","raymochen","shuyixiong","johncalesp","leslie-fang25","reasonsolo","zhou-yuxin","vadiklyutiy","yali-arch","NVShreyas","h-guo18","pengbowang-nv"]'),
         github.actor)
     steps:
       - name: Check if comment is issued by authorized person

From a95f31e72aeac0a07ad7f7c0cb219a9b8e800a43 Mon Sep 17 00:00:00 2001
From: QI JUN <22017000+QiJune@users.noreply.github.com>
Date: Fri, 18 Jul 2025 16:53:02 +0800
Subject: [PATCH 80/88] chore: add more log in FmhaDispatcher (#6170)

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>
---
 cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp b/cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp
index 7eb6682ec7a..52471c70d7f 100644
--- a/cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp
+++ b/cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp
@@ -56,7 +56,8 @@ FmhaDispatcher::FmhaDispatcher(MHARunnerFixedParams fixedParams)
     else
     {
         TLLM_CHECK_WITH_INFO(mFixedParams.dataType == mFixedParams.dataTypeKv,
-            "KV cache data type should be the same as input data type.");
+            "KV cache data type %s is not the same as input data type %s.",
+            data_type_to_string(mFixedParams.dataTypeKv).c_str(), data_type_to_string(mFixedParams.dataType).c_str());
 
         // For FP8 MLA generation, the output type is BF16, which could be different from the input type.
         // So we shouldn't do this check anymore.

From 77acb4f753e1d2cb9385a7f0880f3ea05a2d5f52 Mon Sep 17 00:00:00 2001
From: Emma Qiao <qqiao@nvidia.com>
Date: Fri, 18 Jul 2025 17:34:34 +0800
Subject: [PATCH 81/88] [Infra] - Waive failed tests in post-merge (#6176)

Signed-off-by: qqiao <qqiao@nvidia.com>
---
 tests/integration/test_lists/waives.txt   | 10 ++++++++++
 tests/unittest/llmapi/test_llm_pytorch.py |  1 +
 2 files changed, 11 insertions(+)

diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 630f62ab670..d1ed978c99e 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -438,3 +438,13 @@ examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float
 test_e2e.py::test_ptp_quickstart SKIP (https://nvbugs/5387762)
 triton_server/test_triton_llm.py::test_llava_onevision[test_basic-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-max_utilization---1-1-1-False-tensorrt_llm_bls] SKIP (https://nvbugs/5396437)
 triton_server/test_triton_llm.py::test_llava_onevision[test_video-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls] SKIP (https://nvbugs/5396437)
+triton_server/test_triton.py::test_cpp_unit_tests[cpp-unit-tests] SKIP (https://nvbugs/5401088)
+accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype SKIP (https://nvbugs/5401114)
+test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it-image-True] SKIP (https://nvbugs/5401114)
+accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_trtllm] SKIP (https://nvbugs/5401163)
+accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm] SKIP (https://nvbugs/5401163)
+examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int4_awq-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5401233)
+triton_server/test_triton_llm.py::test_gpt_disaggregated_serving_bls[test_basic-False-1-top_k_top_p--False-True-True-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-max_utilization---1-1-1-True-tensorrt_llm_bls] SKIP (https://nvbugs/5401261)
+triton_server/test_triton.py::test_gpt_disaggregated_serving_bls[gpt-disaggregated-serving-bls] SKIP (https://nvbugs/5401261)
+examples/test_recurrentgemma.py::test_llm_recurrentgemma_2gpu[recurrentgemma-2b] SKIP (https://nvbugs/5401233)
+examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5401156)
diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py
index fbf97c88117..2a91c42192b 100644
--- a/tests/unittest/llmapi/test_llm_pytorch.py
+++ b/tests/unittest/llmapi/test_llm_pytorch.py
@@ -254,6 +254,7 @@ def test_llama_7b_multi_lora():
 
 # TODO smor: currently Nemotron-Super-49B-v1 with LoRA memory consumption is overly high
 # https://jirasw.nvidia.com/browse/TRTLLM-5045
+@pytest.mark.skip(reason="https://nvbugs/5401210")
 @skip_gpu_memory_less_than_138gb
 def test_nemotron_nas_lora() -> None:
     lora_config = LoraConfig(lora_dir=[

From ec2b953e7e05f9fc9fa2e1cf5d831707a6d812c5 Mon Sep 17 00:00:00 2001
From: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
Date: Fri, 18 Jul 2025 12:12:08 +0200
Subject: [PATCH 82/88] refactor: Enhanced handling of decoder requests and
 logits within the batch manager (#6055)

Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
---
 .../batch_manager/decoderBuffers.h            | 11 ++--
 .../batch_manager/guidedDecoder.h             |  4 +-
 .../batch_manager/logitsPostProcessor.h       | 13 +++--
 .../makeDecodingBatchInputOutput.h            |  3 +-
 .../batch_manager/decoderBuffers.cpp          |  4 +-
 .../batch_manager/guidedDecoder.cpp           | 40 ++++++-------
 .../batch_manager/handleContextLogits.cpp     | 44 ++++++++------
 .../batch_manager/handleGenerationLogits.cpp  | 17 ++++--
 .../batch_manager/logitsPostProcessor.cpp     | 52 +++++++----------
 .../makeDecodingBatchInputOutput.cpp          | 57 +++++++------------
 .../trtGptModelInflightBatching.cpp           | 15 ++---
 .../pybind/batch_manager/algorithms.cpp       | 12 ++--
 .../pybind/batch_manager/bindings.cpp         |  8 +--
 cpp/tests/batch_manager/guidedDecoderTest.cpp | 34 ++++++++---
 cpp/tests/runtime/gptDecoderBatchedTest.cpp   |  6 +-
 tensorrt_llm/_torch/pyexecutor/sampler.py     |  3 +-
 16 files changed, 168 insertions(+), 155 deletions(-)

diff --git a/cpp/include/tensorrt_llm/batch_manager/decoderBuffers.h b/cpp/include/tensorrt_llm/batch_manager/decoderBuffers.h
index 831a4179ecb..2af03c0af71 100644
--- a/cpp/include/tensorrt_llm/batch_manager/decoderBuffers.h
+++ b/cpp/include/tensorrt_llm/batch_manager/decoderBuffers.h
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include "tensorrt_llm/batch_manager/common.h"
 #include "tensorrt_llm/runtime/bufferManager.h"
 #include "tensorrt_llm/runtime/iTensor.h"
 #include "tensorrt_llm/runtime/modelConfig.h"
@@ -38,8 +39,8 @@ class DecoderInputBuffers
     using SizeType32 = runtime::SizeType32;
     using TensorPtr = runtime::ITensor::SharedPtr;
 
-    explicit DecoderInputBuffers(SizeType32 maxNumSequences, SizeType32 maxBatchSize, SizeType32 maxDecoderSteps,
-        runtime::BufferManager const& manager);
+    explicit DecoderInputBuffers(
+        SizeType32 maxBatchSize, SizeType32 maxDecoderSteps, runtime::BufferManager const& manager);
 
     void setupMedusaLogits(SizeType32 maxNumSequences, runtime::ModelConfig const& modelConfig);
 
@@ -56,11 +57,13 @@ class DecoderInputBuffers
 
     //! Buffers for decoder forward
 
+    //! Requests for considered in decoder forward
+    RequestVector decoderRequests;
+
     //! Batch slots for all decoder steps, [maxDecoderSteps][maxBatchSize]
     std::vector<TensorPtr> forwardBatchSlots;
 
-    //! Logits for all batch slots, [maxNumSequences]
-    //! The vector is sparse, only slots in forwardBatchSlots are used.
+    //! Logits of decoder requests
     std::vector<TensorPtr> logits;
 
     //! Logits for speculative decoding (Medusa)
diff --git a/cpp/include/tensorrt_llm/batch_manager/guidedDecoder.h b/cpp/include/tensorrt_llm/batch_manager/guidedDecoder.h
index 26d20cc9fa3..9a577b61ad5 100644
--- a/cpp/include/tensorrt_llm/batch_manager/guidedDecoder.h
+++ b/cpp/include/tensorrt_llm/batch_manager/guidedDecoder.h
@@ -29,6 +29,7 @@ class GrammarCompiler;
 
 namespace tensorrt_llm::batch_manager
 {
+class DecoderInputBuffers;
 
 class GuidedDecoder
 {
@@ -40,8 +41,7 @@ class GuidedDecoder
     GuidedDecoder(executor::GuidedDecodingConfig const& guidedDecodingConfig, SizeType32 maxNumSequences,
         SizeType32 vocabSizePadded, nvinfer1::DataType logitsDtype, runtime::BufferManager const& runtimeBufferManager);
     void build(ScheduledRequests const& scheduledRequests);
-    void execute(ScheduledRequests const& scheduledRequests, runtime::BufferManager const& runtimeBufferManager,
-        std::vector<TensorPtr> const& decoderBuffersLogits);
+    void execute(DecoderInputBuffers const& decoderInputBuffers, runtime::BufferManager const& runtimeBufferManager);
 
 private:
     executor::GuidedDecodingConfig::GuidedDecodingBackend mGuidedDecodingBackend;
diff --git a/cpp/include/tensorrt_llm/batch_manager/logitsPostProcessor.h b/cpp/include/tensorrt_llm/batch_manager/logitsPostProcessor.h
index 9610b96763b..048a84ecca3 100644
--- a/cpp/include/tensorrt_llm/batch_manager/logitsPostProcessor.h
+++ b/cpp/include/tensorrt_llm/batch_manager/logitsPostProcessor.h
@@ -24,28 +24,29 @@
 
 namespace tensorrt_llm::runtime
 {
-class TllmRuntime;
+class CudaStream;
 }
 
 namespace tensorrt_llm::batch_manager
 {
+class DecoderInputBuffers;
 
 class LogitsPostProcessor : Algorithm
 {
 public:
+    using CudaStreamPtr = std::shared_ptr<runtime::CudaStream>;
+
     using LogitsPostProcessorBatched = std::function<void(std::vector<batch_manager::LlmRequest::RequestIdType> const&,
         std::vector<batch_manager::LlmRequest::TensorPtr>&,
-        std::vector<std::reference_wrapper<batch_manager::LlmRequest::BeamTokens const>> const&,
-        runtime::BufferManager::CudaStreamPtr const&,
+        std::vector<std::reference_wrapper<batch_manager::LlmRequest::BeamTokens const>> const&, CudaStreamPtr const&,
         std::vector<std::optional<batch_manager::LlmRequest::RequestIdType>> const&)>;
 
     constexpr static auto name{"LogitsPostProcessor"};
 
     LogitsPostProcessor() = default;
 
-    bool operator()(RequestVector const& contextRequests, RequestVector const& generationRequests,
-        bool replicateLogitsPostProcessor, std::vector<batch_manager::LlmRequest::TensorPtr>& seqSlotLogits,
-        runtime::WorldConfig const& worldConfig, runtime::TllmRuntime& runtime,
+    bool operator()(DecoderInputBuffers& inputBuffers, bool replicateLogitsPostProcessor,
+        runtime::WorldConfig const& worldConfig, CudaStreamPtr const& stream,
         std::optional<LogitsPostProcessorBatched> logitsPostProcessorBatched = std::nullopt) const;
 };
 
diff --git a/cpp/include/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h b/cpp/include/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h
index 1757a9f076e..cea23a4e7ec 100644
--- a/cpp/include/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h
+++ b/cpp/include/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h
@@ -46,8 +46,7 @@ class MakeDecodingBatchInputOutput : Algorithm
 
     MakeDecodingBatchInputOutput() = default;
 
-    std::unique_ptr<runtime::decoder_batch::Input> operator()(RequestVector const& contextRequests,
-        RequestVector const& generationRequests, DecoderInputBuffers const& inputBuffers,
+    std::unique_ptr<runtime::decoder_batch::Input> operator()(DecoderInputBuffers& inputBuffers,
         runtime::decoder::DecoderState& decoderState, runtime::ModelConfig const& modelConfig,
         SizeType32 maxNumSequences, OptionalRef<RuntimeBuffers> fusedRuntimeBuffers) const;
 
diff --git a/cpp/tensorrt_llm/batch_manager/decoderBuffers.cpp b/cpp/tensorrt_llm/batch_manager/decoderBuffers.cpp
index f48e12d6c88..fd67bb55e89 100644
--- a/cpp/tensorrt_llm/batch_manager/decoderBuffers.cpp
+++ b/cpp/tensorrt_llm/batch_manager/decoderBuffers.cpp
@@ -31,7 +31,7 @@ namespace tensorrt_llm::batch_manager
 {
 
 DecoderInputBuffers::DecoderInputBuffers(
-    SizeType32 maxNumSequences, SizeType32 maxBatchSize, SizeType32 maxDecoderSteps, BufferManager const& manager)
+    SizeType32 maxBatchSize, SizeType32 maxDecoderSteps, BufferManager const& manager)
 {
     auto const maxBatchSizeShape = ITensor::makeShape({maxBatchSize});
     auto const nvSizeType = TRTDataType<SizeType32>::value;
@@ -49,8 +49,6 @@ DecoderInputBuffers::DecoderInputBuffers(
     {
         forwardBatchSlots.emplace_back(BufferManager::pinnedPool(ITensor::makeShape({maxBatchSize}), nvSizeType));
     }
-
-    logits.resize(maxNumSequences);
 }
 
 void DecoderInputBuffers::setupMedusaLogits(SizeType32 maxNumSequences, ModelConfig const& modelConfig)
diff --git a/cpp/tensorrt_llm/batch_manager/guidedDecoder.cpp b/cpp/tensorrt_llm/batch_manager/guidedDecoder.cpp
index 871a33e3ee5..a5a7502c330 100644
--- a/cpp/tensorrt_llm/batch_manager/guidedDecoder.cpp
+++ b/cpp/tensorrt_llm/batch_manager/guidedDecoder.cpp
@@ -16,6 +16,7 @@
  */
 
 #include "tensorrt_llm/batch_manager/guidedDecoder.h"
+#include "tensorrt_llm/batch_manager/decoderBuffers.h"
 #include "tensorrt_llm/batch_manager/llmRequest.h"
 #include "tensorrt_llm/kernels/logitsBitmask.h"
 
@@ -136,8 +137,7 @@ void GuidedDecoder::build(ScheduledRequests const& scheduledRequests)
     }
 }
 
-void GuidedDecoder::execute(ScheduledRequests const& scheduledRequests, BufferManager const& runtimeBufferManager,
-    std::vector<TensorPtr> const& decoderBuffersLogits)
+void GuidedDecoder::execute(DecoderInputBuffers const& decoderInputBuffers, BufferManager const& runtimeBufferManager)
 {
     auto const& stream = runtimeBufferManager.getStream();
 
@@ -150,32 +150,28 @@ void GuidedDecoder::execute(ScheduledRequests const& scheduledRequests, BufferMa
     mCopyBufferManager.getStream().record(event);
     stream.wait(event);
 
-    SizeType32 batchIdx{0};
-    if (mGuidedDecodingBackend == executor::GuidedDecodingConfig::GuidedDecodingBackend::kXGRAMMAR)
+    if (mGuidedDecodingBackend == executor::GuidedDecodingConfig::GuidedDecodingBackend::kXGRAMMAR
+        && !decoderInputBuffers.decoderRequests.empty())
     {
-        for (auto const& requests : {scheduledRequests.contextRequests, scheduledRequests.generationRequests})
+        SizeType32 batchIdx{0};
+        for (size_t requestIdx = 0; requestIdx < decoderInputBuffers.decoderRequests.size(); ++requestIdx)
         {
-            for (auto const& llmReq : requests)
+            auto const& llmReq = decoderInputBuffers.decoderRequests.at(requestIdx);
+
+            auto const& guidedDecodingParams = llmReq->getGuidedDecodingParams();
+            if (guidedDecodingParams.has_value())
             {
-                if (llmReq->isContextInitState() && !llmReq->isLastContextChunk())
-                {
-                    continue;
-                }
-                auto const& guidedDecodingParams = llmReq->getGuidedDecodingParams();
-                if (guidedDecodingParams.has_value())
-                {
-                    auto const seqSlot = llmReq->mSeqSlot.value();
+                auto const seqSlot = llmReq->mSeqSlot.value();
 
-                    auto const& logits = decoderBuffersLogits.at(seqSlot);
-                    auto const logitsBitmask = ITensor::at(mLogitsBitmask, {seqSlot});
+                auto const& logits = decoderInputBuffers.logits.at(requestIdx);
+                auto const logitsBitmask = ITensor::at(mLogitsBitmask, {seqSlot});
 
-                    // Use void* to unify the code for different mLogitsDtype
-                    *reinterpret_cast<void**>(ITensor::at(mLogitsPtrVecHost, {batchIdx})->data()) = logits->data();
-                    *reinterpret_cast<void**>(ITensor::at(mLogitsBitmaskPtrVecHost, {batchIdx})->data())
-                        = logitsBitmask->data();
+                // Use void* to unify the code for different mLogitsDtype
+                *reinterpret_cast<void**>(ITensor::at(mLogitsPtrVecHost, {batchIdx})->data()) = logits->data();
+                *reinterpret_cast<void**>(ITensor::at(mLogitsBitmaskPtrVecHost, {batchIdx})->data())
+                    = logitsBitmask->data();
 
-                    ++batchIdx;
-                }
+                ++batchIdx;
             }
         }
         if (batchIdx > 0)
diff --git a/cpp/tensorrt_llm/batch_manager/handleContextLogits.cpp b/cpp/tensorrt_llm/batch_manager/handleContextLogits.cpp
index e7ead88fb34..df3840c14b4 100644
--- a/cpp/tensorrt_llm/batch_manager/handleContextLogits.cpp
+++ b/cpp/tensorrt_llm/batch_manager/handleContextLogits.cpp
@@ -76,6 +76,13 @@ SizeType32 HandleContextLogits::operator()(DecoderInputBuffers& inputBuffers, Re
     TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
     NVTX3_SCOPED_RANGE(HandleContextLogits);
 
+    auto& decoderRequests = inputBuffers.decoderRequests;
+    decoderRequests.clear();
+    decoderRequests.reserve(contextRequests.size());
+    auto& allDecoderLogits = inputBuffers.logits;
+    allDecoderLogits.clear();
+    allDecoderLogits.reserve(contextRequests.size());
+
     SizeType32 batchIndex{0};
     SizeType32 logitsIndex{0};
     // Copy logits into decoderBuffers.logits
@@ -115,7 +122,6 @@ SizeType32 HandleContextLogits::operator()(DecoderInputBuffers& inputBuffers, Re
         // Get the logits from the last context token and draft tokens
         auto const numDecoderLogits = 1 + draftLength;
         auto const seqSlot = llmReq->mSeqSlot.value();
-        auto& decoderLogits = inputBuffers.logits.at(seqSlot);
         TensorPtr logitsView = ITensor::slice(logits, logitsIndex - numDecoderLogits, numDecoderLogits);
 
         if (modelConfig.getSpeculativeDecodingMode().hasDraftLogits())
@@ -136,22 +142,28 @@ SizeType32 HandleContextLogits::operator()(DecoderInputBuffers& inputBuffers, Re
 
         TLLM_CHECK_DEBUG_WITH_INFO(tru::tensorHasInvalid<float>(*logitsView, manager, "logits") == false,
             "Found invalid number (NaN or Inf) in logits");
-        // Scatter the output logits to the decoderLogits
-        auto const reqBeamWidth = llmReq->getBeamWidthByIter();
-        if (reqBeamWidth > 1)
-        {
-            // Tile logits of context requests
-            auto const logitsShape = logitsView->getShape();
-            auto const logitsType = logitsView->getDataType();
-            decoderLogits = manager.gpu(ITensor::makeShape({reqBeamWidth, logitsShape.d[1]}), logitsType);
-            tensorrt_llm::runtime::kernels::tileTensor(*decoderLogits, *logitsView, reqBeamWidth, manager.getStream());
-            decoderLogits->unsqueeze(0);
-        }
-        else
+
+        if (llmReq->isLastContextChunk())
         {
-            auto const logitsViewShape = logitsView->getShape();
-            decoderLogits
-                = ITensor::view(logitsView, ITensor::makeShape({logitsViewShape.d[0], 1, logitsViewShape.d[1]}));
+            TensorPtr decoderLogits;
+            auto const reqBeamWidth = llmReq->getBeamWidthByIter();
+            if (reqBeamWidth > 1)
+            {
+                // Tile logits of context requests
+                auto const& logitsShape = logitsView->getShape();
+                auto const logitsType = logitsView->getDataType();
+                decoderLogits = manager.gpu(ITensor::makeShape({reqBeamWidth, logitsShape.d[1]}), logitsType);
+                tensorrt_llm::runtime::kernels::tileTensor(
+                    *decoderLogits, *logitsView, reqBeamWidth, manager.getStream());
+                decoderLogits->unsqueeze(0);
+            }
+            else
+            {
+                decoderLogits = logitsView;
+                decoderLogits->unsqueeze(1);
+            }
+            decoderRequests.push_back(llmReq);
+            allDecoderLogits.emplace_back(std::move(decoderLogits));
         }
 
         ++batchIndex;
diff --git a/cpp/tensorrt_llm/batch_manager/handleGenerationLogits.cpp b/cpp/tensorrt_llm/batch_manager/handleGenerationLogits.cpp
index a5cecc54751..5018ae36290 100644
--- a/cpp/tensorrt_llm/batch_manager/handleGenerationLogits.cpp
+++ b/cpp/tensorrt_llm/batch_manager/handleGenerationLogits.cpp
@@ -22,6 +22,7 @@
 #include "tensorrt_llm/batch_manager/medusaBuffers.h"
 #include "tensorrt_llm/batch_manager/runtimeBuffers.h"
 #include "tensorrt_llm/batch_manager/utils/inflightBatchingUtils.h"
+#include "tensorrt_llm/common/assert.h"
 #include "tensorrt_llm/common/nvtxUtils.h"
 #include "tensorrt_llm/runtime/iTensor.h"
 #include "tensorrt_llm/runtime/utils/debugUtils.h"
@@ -82,6 +83,11 @@ void HandleGenerationLogits::operator()(DecoderInputBuffers& inputBuffers, Reque
     TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
     NVTX3_SCOPED_RANGE(HandleGenerationLogits);
 
+    auto& decoderRequests = inputBuffers.decoderRequests;
+    decoderRequests.reserve(decoderRequests.size() + generationRequests.size());
+    auto& allDecoderLogits = inputBuffers.logits;
+    allDecoderLogits.reserve(allDecoderLogits.size() + generationRequests.size());
+
     for (auto const& llmReq : generationRequests)
     {
         auto const reqBeamWidth = llmReq->getBeamWidthByIter();
@@ -101,8 +107,9 @@ void HandleGenerationLogits::operator()(DecoderInputBuffers& inputBuffers, Reque
         TensorPtr logitsView = ITensor::slice(logits, logitsIndex, numLogits);
         TLLM_CHECK_DEBUG_WITH_INFO(tru::tensorHasInvalid<float>(*logitsView, manager, "logits") == false,
             "Found invalid number (NaN or Inf) in logits");
-        auto& decoderLogits = inputBuffers.logits.at(seqSlot);
-        auto const logitsViewShape = logitsView->getShape();
+
+        TLLM_CHECK(llmReq->isGenerationInProgressState());
+        TensorPtr decoderLogits;
         if (reqBeamWidth > 1)
         {
             decoderLogits = logitsView;
@@ -110,9 +117,11 @@ void HandleGenerationLogits::operator()(DecoderInputBuffers& inputBuffers, Reque
         }
         else
         {
-            decoderLogits
-                = ITensor::view(logitsView, ITensor::makeShape({logitsViewShape.d[0], 1, logitsViewShape.d[1]}));
+            decoderLogits = logitsView;
+            decoderLogits->unsqueeze(1);
         }
+        decoderRequests.push_back(llmReq);
+        allDecoderLogits.emplace_back(std::move(decoderLogits));
 
         if (llmReq->getReturnGenerationLogits())
         {
diff --git a/cpp/tensorrt_llm/batch_manager/logitsPostProcessor.cpp b/cpp/tensorrt_llm/batch_manager/logitsPostProcessor.cpp
index 10210c3f4eb..dd34de0ef9a 100644
--- a/cpp/tensorrt_llm/batch_manager/logitsPostProcessor.cpp
+++ b/cpp/tensorrt_llm/batch_manager/logitsPostProcessor.cpp
@@ -17,25 +17,24 @@
 
 #include "tensorrt_llm/batch_manager/logitsPostProcessor.h"
 
+#include "tensorrt_llm/batch_manager/decoderBuffers.h"
 #include "tensorrt_llm/batch_manager/llmRequest.h"
 #include "tensorrt_llm/batch_manager/runtimeBuffers.h"
 #include "tensorrt_llm/common/nvtxUtils.h"
 #include "tensorrt_llm/runtime/iTensor.h"
-#include "tensorrt_llm/runtime/tllmRuntime.h"
 
 namespace tr = tensorrt_llm::runtime;
 
 namespace tensorrt_llm::batch_manager
 {
 
-using BufferManager = tensorrt_llm::runtime::BufferManager;
 using TensorPtr = runtime::ITensor::SharedPtr;
 using ITensor = runtime::ITensor;
 using SizeType32 = tensorrt_llm::runtime::SizeType32;
 
-bool LogitsPostProcessor::operator()(RequestVector const& contextRequests, RequestVector const& generationRequests,
-    bool replicateLogitsPostProcessor, std::vector<TensorPtr>& seqSlotLogits, tr::WorldConfig const& worldConfig,
-    tr::TllmRuntime& runtime, std::optional<LogitsPostProcessorBatched> logitsPostProcessorBatched) const
+bool LogitsPostProcessor::operator()(DecoderInputBuffers& inputBuffers, bool replicateLogitsPostProcessor,
+    tr::WorldConfig const& worldConfig, CudaStreamPtr const& stream,
+    std::optional<LogitsPostProcessorBatched> logitsPostProcessorBatched) const
 {
     TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
     NVTX3_SCOPED_RANGE(LogitsPostProcessor);
@@ -47,35 +46,28 @@ bool LogitsPostProcessor::operator()(RequestVector const& contextRequests, Reque
     std::vector<std::optional<LlmRequest::RequestIdType>> clientIdsVec;
 
     bool logitsPostProcessorIsApplied = false;
-    for (auto const& requests : {contextRequests, generationRequests})
+    for (size_t batchIdx = 0; batchIdx < inputBuffers.decoderRequests.size(); ++batchIdx)
     {
-        for (auto const& llmReq : requests)
+        auto const& llmReq = inputBuffers.decoderRequests.at(batchIdx);
+        auto& logits = inputBuffers.logits.at(batchIdx);
+
+        // Invoke non-batched processor or collect arguments for batched processor
+        if (llmReq->mLogitsPostProcessor)
         {
-            if (llmReq->isContextInitState() ? llmReq->isLastContextChunk() : llmReq->isGenerationInProgressState())
+            logitsPostProcessorIsApplied = true;
+            if (replicateLogitsPostProcessor || worldConfig.isFirstTensorParallelRank())
             {
-                // Invoke non-batched processor or collect arguments for batched processor
-                if (llmReq->mLogitsPostProcessor)
-                {
-                    logitsPostProcessorIsApplied = true;
-                    if (replicateLogitsPostProcessor || worldConfig.isFirstTensorParallelRank())
-                    {
-                        auto& logits = seqSlotLogits.at(llmReq->mSeqSlot.value());
-                        (*llmReq->mLogitsPostProcessor)(
-                            llmReq->mRequestId, logits, llmReq->getTokens(), runtime.getStreamPtr(), llmReq->mClientId);
-                    }
-                }
-                else if (llmReq->mApplyLogitsPostProcessorBatched)
-                {
-                    reqIdsVec.push_back(llmReq->mRequestId);
-
-                    auto& logits = seqSlotLogits.at(llmReq->mSeqSlot.value());
-                    logitsVec.push_back(logits);
-
-                    beamTokensVec.emplace_back(llmReq->getTokens());
-                    clientIdsVec.push_back(llmReq->mClientId);
-                }
+                (*llmReq->mLogitsPostProcessor)(
+                    llmReq->mRequestId, logits, llmReq->getTokens(), stream, llmReq->mClientId);
             }
         }
+        else if (llmReq->mApplyLogitsPostProcessorBatched)
+        {
+            reqIdsVec.push_back(llmReq->mRequestId);
+            logitsVec.push_back(logits);
+            beamTokensVec.emplace_back(llmReq->getTokens());
+            clientIdsVec.push_back(llmReq->mClientId);
+        }
     }
 
     // Invoke batched processor
@@ -84,7 +76,7 @@ bool LogitsPostProcessor::operator()(RequestVector const& contextRequests, Reque
         logitsPostProcessorIsApplied = true;
         if (replicateLogitsPostProcessor || worldConfig.isFirstTensorParallelRank())
         {
-            (*logitsPostProcessorBatched)(reqIdsVec, logitsVec, beamTokensVec, runtime.getStreamPtr(), clientIdsVec);
+            (*logitsPostProcessorBatched)(reqIdsVec, logitsVec, beamTokensVec, stream, clientIdsVec);
         }
     }
 
diff --git a/cpp/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.cpp b/cpp/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.cpp
index 64dedbc4497..c9b2bb0b937 100644
--- a/cpp/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.cpp
+++ b/cpp/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.cpp
@@ -33,7 +33,7 @@ using TensorPtr = MakeDecodingBatchInputOutput::TensorPtr;
 
 std::unique_ptr<tr::decoder_batch::Input> MakeDecodingBatchInputOutput::createDecoderBatchInputs(
     std::vector<SizeType32> const& activeSlots, runtime::decoder::DecoderState const& decoderState,
-    std::vector<TensorPtr> const& logits, SizeType32 maxNumSequences, std::vector<TensorPtr> const& batchSlots)
+    std::vector<TensorPtr> const& decoderLogits, SizeType32 maxNumSequences, std::vector<TensorPtr> const& batchSlots)
 {
     TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
 
@@ -47,40 +47,35 @@ std::unique_ptr<tr::decoder_batch::Input> MakeDecodingBatchInputOutput::createDe
         batchSlots.at(step)->resize(maxNumSequences);
     }
 
-    std::vector<SizeType32> batchIdx(maxDecoderSteps);
+    auto constexpr singleRequest = 1;
+
+    std::vector<SizeType32> batchSizes(maxDecoderSteps);
+    std::vector<std::vector<tr::ITensor::SharedConstPtr>> batchLogits(maxDecoderSteps);
     auto maxActiveDecoderSteps = 1;
-    for (auto const slot : activeSlots)
+    for (size_t batchIdx = 0; batchIdx < activeSlots.size(); ++batchIdx)
     {
+        auto const slot = activeSlots.at(batchIdx);
+        auto const& logits = decoderLogits.at(batchIdx);
+
         auto const numDecoderSteps = common::ceilDiv(numDecodingEngineTokens.at(slot), maxDecodingDecoderTokens);
         maxActiveDecoderSteps = std::max(maxActiveDecoderSteps, numDecoderSteps);
         for (SizeType32 step = 0; step < numDecoderSteps; ++step)
         {
             auto batchSlotsRange = tr::BufferRange<SizeType32>(*batchSlots.at(step));
-            batchSlotsRange[batchIdx[step]] = slot;
-            batchIdx[step]++;
+            batchSlotsRange[batchSizes[step]] = slot;
+            batchSizes[step]++;
+            TensorPtr logitsSlice = tr::ITensor::slice(logits, step, singleRequest);
+            batchLogits[step].emplace_back(std::move(logitsSlice));
         }
     }
 
     for (SizeType32 step = 0; step < maxDecoderSteps; ++step)
     {
-        batchSlots.at(step)->resize(batchIdx[step]);
-    }
-
-    auto constexpr singleRequest = 1;
-    std::vector<std::vector<tr::ITensor::SharedConstPtr>> logitsVec(maxActiveDecoderSteps);
-    for (SizeType32 step = 0; step < maxActiveDecoderSteps; ++step)
-    {
-        auto batchSlotsRange = tr::BufferRange<SizeType32>(*batchSlots.at(step));
-
-        for (auto slot : batchSlotsRange)
-        {
-            auto const& targetLogits = logits.at(slot);
-            TensorPtr logitsSlice = tr::ITensor::slice(targetLogits, step, singleRequest);
-            logitsVec.at(step).push_back(logitsSlice);
-        }
+        batchSlots.at(step)->resize(batchSizes[step]);
     }
+    batchLogits.resize(maxActiveDecoderSteps);
 
-    auto decodingInput = std::make_unique<tr::decoder_batch::Input>(logitsVec, maxActiveDecoderSteps);
+    auto decodingInput = std::make_unique<tr::decoder_batch::Input>(batchLogits, maxActiveDecoderSteps);
     decodingInput->batchSlots = batchSlots;
     TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
     return decodingInput;
@@ -89,21 +84,14 @@ std::unique_ptr<tr::decoder_batch::Input> MakeDecodingBatchInputOutput::createDe
 namespace
 {
 
-std::pair<std::vector<SizeType32>, std::vector<SizeType32>> getActiveSlots(
-    RequestVector const& contextRequests, RequestVector const& generationRequests)
+std::pair<std::vector<SizeType32>, std::vector<SizeType32>> getActiveSlots(RequestVector const& decoderRequests)
 {
     std::vector<SizeType32> activeSlots;
     std::vector<SizeType32> generationSteps;
-    for (auto const& requests : {contextRequests, generationRequests})
+    for (auto const& llmReq : decoderRequests)
     {
-        for (auto const& llmReq : requests)
-        {
-            if (llmReq->isGenerationInProgressState() || llmReq->isLastContextChunk())
-            {
-                activeSlots.push_back(llmReq->mSeqSlot.value());
-                generationSteps.push_back(llmReq->getDecodingIter());
-            }
-        }
+        activeSlots.push_back(llmReq->mSeqSlot.value());
+        generationSteps.push_back(llmReq->getDecodingIter());
     }
 
     return {activeSlots, generationSteps};
@@ -167,14 +155,13 @@ void setEagleInputs(tr::DecodingInput& dInput, RuntimeBuffers const& fusedRuntim
 
 } // namespace
 
-std::unique_ptr<tr::decoder_batch::Input> MakeDecodingBatchInputOutput::operator()(RequestVector const& contextRequests,
-    RequestVector const& generationRequests, DecoderInputBuffers const& inputBuffers,
+std::unique_ptr<tr::decoder_batch::Input> MakeDecodingBatchInputOutput::operator()(DecoderInputBuffers& inputBuffers,
     runtime::decoder::DecoderState& decoderState, runtime::ModelConfig const& modelConfig, SizeType32 maxNumSequences,
     OptionalRef<RuntimeBuffers> fusedRuntimeBuffers) const
 {
     TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
 
-    auto [activeSlots, generationSteps] = getActiveSlots(contextRequests, generationRequests);
+    auto [activeSlots, generationSteps] = getActiveSlots(inputBuffers.decoderRequests);
 
     auto decodingInput = createDecoderBatchInputs(
         activeSlots, decoderState, inputBuffers.logits, maxNumSequences, inputBuffers.forwardBatchSlots);
diff --git a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
index b36f0856fd5..80418b2bc73 100644
--- a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
+++ b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
@@ -1530,7 +1530,7 @@ void TrtGptModelInflightBatching::createBuffers(executor::DecodingConfig const&
     for (SizeType32 i = 0; i < mNumMicroBatches; ++i)
     {
         mDecoderInputBuffers.emplace_back(
-            getMaxNumSequences(), getMaxBatchSize(), mModelConfig.getMaxDecodingTokens(), mRuntime->getBufferManager());
+            getMaxBatchSize(), mModelConfig.getMaxDecodingTokens(), mRuntime->getBufferManager());
         mDecoderInputBuffers.back().setupMedusaLogits(getMaxNumSequences(), mModelConfig);
         mDecoderOutputBuffers.emplace_back(getMaxNumSequences(), mOperatingBeamWidth, getMaxSequenceLen(),
             mModelConfig.getMaxDecodingTokens(), mRuntime->getBufferManager());
@@ -2029,7 +2029,6 @@ runtime::CudaEvent TrtGptModelInflightBatching::decoderStepAsync(ScheduledReques
     NVTX3_SCOPED_RANGE(decoderStepAsync);
 
     auto& decoderInputBuffers = mDecoderInputBuffers.at(getFusedBufferId());
-    auto& seqSlotLogits = decoderInputBuffers.logits;
 
     auto const contextBufferId = mCtxGenFusion ? getFusedBufferId() : getContextBufferId();
     auto& contextRuntimeBuffers = mBuffers.at(contextBufferId);
@@ -2049,22 +2048,20 @@ runtime::CudaEvent TrtGptModelInflightBatching::decoderStepAsync(ScheduledReques
         copyCacheIndirectionFromOutputsToInputs(scheduledRequests, genBufferId);
     }
 
-    mLogitsPostProcessorIsApplied
-        = (*mLogitsPostProcessor)(scheduledRequests.contextRequests, scheduledRequests.generationRequests,
-            mReplicateLogitsPostProcessor, seqSlotLogits, mWorldConfig, *mRuntime, mLogitsPostProcessorBatched);
+    mLogitsPostProcessorIsApplied = (*mLogitsPostProcessor)(decoderInputBuffers, mReplicateLogitsPostProcessor,
+        mWorldConfig, mRuntime->getStreamPtr(), mLogitsPostProcessorBatched);
 
     if (mGuidedDecoder)
     {
-        mGuidedDecoder->execute(scheduledRequests, mRuntime->getBufferManager(), seqSlotLogits);
+        mGuidedDecoder->execute(decoderInputBuffers, mRuntime->getBufferManager());
     }
 
     auto const fusedBufferId = getFusedBufferId();
     auto& fusedRuntimeBuffers = mBuffers.at(fusedBufferId);
 
     auto& decodingInput = mDecodingInputs.at(mMicroBatchId);
-    decodingInput = (*mMakeDecodingBatchInputOutput)(scheduledRequests.contextRequests,
-        scheduledRequests.generationRequests, mDecoderInputBuffers.at(fusedBufferId), *mDecoderState, mModelConfig,
-        getMaxNumSequences(), *fusedRuntimeBuffers);
+    decodingInput = (*mMakeDecodingBatchInputOutput)(mDecoderInputBuffers.at(fusedBufferId), *mDecoderState,
+        mModelConfig, getMaxNumSequences(), *fusedRuntimeBuffers);
 
     auto decoderFinishEvent = mDecoder->forwardAsync(*mDecoderState, *decodingInput);
 
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp b/cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp
index 0f391d16650..f6bd8f02491 100644
--- a/cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp
+++ b/cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp
@@ -133,16 +133,16 @@ void tensorrt_llm::pybind::batch_manager::algorithms::initBindings(pybind11::mod
 
     py::class_<MakeDecodingBatchInputOutput>(m, MakeDecodingBatchInputOutput::name)
         .def(py::init())
-        .def("__call__", &MakeDecodingBatchInputOutput::operator(), py::arg("context_requests"),
-            py::arg("generation_requests"), py::arg("decoder_input_buffers"), py::arg("decoder_state"),
-            py::arg("model_config"), py::arg("max_num_sequences"), py::arg("fused_runtime_buffers") = std::nullopt)
+        .def("__call__", &MakeDecodingBatchInputOutput::operator(), py::arg("decoder_input_buffers"),
+            py::arg("decoder_state"), py::arg("model_config"), py::arg("max_num_sequences"),
+            py::arg("fused_runtime_buffers") = std::nullopt)
         .def("name", [](MakeDecodingBatchInputOutput const&) { return MakeDecodingBatchInputOutput::name; });
 
     py::class_<LogitsPostProcessor>(m, LogitsPostProcessor::name)
         .def(py::init())
-        .def("__call__", &LogitsPostProcessor::operator(), py::arg("context_requests"), py::arg("generation_requests"),
-            py::arg("replicate_logits_post_processor"), py::arg("decoder_buffers"), py::arg("world_config"),
-            py::arg("runtime"), py::arg("logits_post_processor_batched") = std::nullopt)
+        .def("__call__", &LogitsPostProcessor::operator(), py::arg("decoder_input_buffers"),
+            py::arg("replicate_logits_post_processor"), py::arg("world_config"), py::arg("stream"),
+            py::arg("logits_post_processor_batched") = std::nullopt)
         .def("name", [](LogitsPostProcessor const&) { return LogitsPostProcessor::name; });
 
     py::class_<CreateNewDecoderRequests>(m, CreateNewDecoderRequests::name)
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp
index f7ba20920c9..63d91ddab3d 100644
--- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp
+++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp
@@ -393,16 +393,16 @@ void initBindings(pybind11::module_& m)
             py::arg("max_num_sequences"), py::arg("model_config"), py::arg("world_config"), py::arg("buffer_manager"));
 
     py::class_<tb::DecoderInputBuffers>(m, "DecoderInputBuffers")
-        .def(py::init<runtime::SizeType32, runtime::SizeType32, runtime::SizeType32, tr::BufferManager>(),
-            py::arg("max_num_sequences"), py::arg("max_batch_size"), py::arg("max_tokens_per_engine_step"),
-            py::arg("manager"))
+        .def(py::init<runtime::SizeType32, runtime::SizeType32, tr::BufferManager>(), py::arg("max_batch_size"),
+            py::arg("max_tokens_per_engine_step"), py::arg("manager"))
         .def_readwrite("setup_batch_slots", &tb::DecoderInputBuffers::setupBatchSlots)
         .def_readwrite("setup_batch_slots_device", &tb::DecoderInputBuffers::setupBatchSlotsDevice)
         .def_readwrite("fill_values", &tb::DecoderInputBuffers::fillValues)
         .def_readwrite("fill_values_device", &tb::DecoderInputBuffers::fillValuesDevice)
         .def_readwrite("inputs_ids", &tb::DecoderInputBuffers::inputsIds)
         .def_readwrite("forward_batch_slots", &tb::DecoderInputBuffers::forwardBatchSlots)
-        .def_readwrite("logits", &tb::DecoderInputBuffers::logits);
+        .def_readwrite("logits", &tb::DecoderInputBuffers::logits)
+        .def_readwrite("decoder_requests", &tb::DecoderInputBuffers::decoderRequests);
 
     py::class_<tb::DecoderOutputBuffers>(m, "DecoderOutputBuffers")
         .def_readwrite("sequence_lengths_host", &tb::DecoderOutputBuffers::sequenceLengthsHost)
diff --git a/cpp/tests/batch_manager/guidedDecoderTest.cpp b/cpp/tests/batch_manager/guidedDecoderTest.cpp
index 4b193ba3498..8358e987334 100644
--- a/cpp/tests/batch_manager/guidedDecoderTest.cpp
+++ b/cpp/tests/batch_manager/guidedDecoderTest.cpp
@@ -17,9 +17,9 @@
 #include <fstream>
 #include <gtest/gtest.h>
 #include <nlohmann/json.hpp>
-#include <random>
 
 #include "tensorrt_llm/batch_manager/common.h"
+#include "tensorrt_llm/batch_manager/decoderBuffers.h"
 #include "tensorrt_llm/batch_manager/guidedDecoder.h"
 #include "tensorrt_llm/batch_manager/llmRequest.h"
 #include "tensorrt_llm/executor/executor.h"
@@ -128,11 +128,21 @@ class GuidedDecoderTest : public ::testing::Test
         RequestVector contextRequests{llmReq1, llmReq2};
         RequestVector generationRequests{};
         ScheduledRequests scheduledRequests{contextRequests, generationRequests};
+        DecoderInputBuffers decoderInputBuffers(mMaxNumRequests, 1, *mRuntimeBufferManager);
+
+        for (auto const& requests : {scheduledRequests.contextRequests, scheduledRequests.generationRequests})
+        {
+            for (auto const& llmReq : requests)
+            {
+                decoderInputBuffers.decoderRequests.push_back(llmReq);
+            }
+        }
+        decoderInputBuffers.logits = mLogits;
 
         // Context phase
         resetLogits();
         mGuidedDecoder->build(scheduledRequests);
-        mGuidedDecoder->execute(scheduledRequests, *mRuntimeBufferManager, mLogits);
+        mGuidedDecoder->execute(decoderInputBuffers, *mRuntimeBufferManager);
         syncLogitsToHost();
         mRuntimeBufferManager->getStream().synchronize();
 
@@ -143,8 +153,18 @@ class GuidedDecoderTest : public ::testing::Test
         generationRequests.push_back(llmReq1);
         llmReq2->setState(LlmRequestState::kGENERATION_IN_PROGRESS);
         generationRequests.push_back(llmReq2);
-        EXPECT_EQ(countRejected(1), mExpectedNumRejected[0]);
-        EXPECT_EQ(countRejected(2), 0);
+
+        decoderInputBuffers.decoderRequests.clear();
+        for (auto const& requests : {scheduledRequests.contextRequests, scheduledRequests.generationRequests})
+        {
+            for (auto const& llmReq : requests)
+            {
+                decoderInputBuffers.decoderRequests.push_back(llmReq);
+            }
+        }
+
+        EXPECT_EQ(countRejected(0), mExpectedNumRejected[0]);
+        EXPECT_EQ(countRejected(1), 0);
 
         // Generation phase
         for (int i = 0; i < mOutputIds.size(); i++)
@@ -154,12 +174,12 @@ class GuidedDecoderTest : public ::testing::Test
 
             resetLogits();
             mGuidedDecoder->build(scheduledRequests);
-            mGuidedDecoder->execute(scheduledRequests, *mRuntimeBufferManager, mLogits);
+            mGuidedDecoder->execute(decoderInputBuffers, *mRuntimeBufferManager);
             syncLogitsToHost();
             mRuntimeBufferManager->getStream().synchronize();
 
-            EXPECT_EQ(countRejected(1), mExpectedNumRejected[i + 1]);
-            EXPECT_EQ(countRejected(2), 0);
+            EXPECT_EQ(countRejected(0), mExpectedNumRejected[i + 1]);
+            EXPECT_EQ(countRejected(1), 0);
         }
     }
 
diff --git a/cpp/tests/runtime/gptDecoderBatchedTest.cpp b/cpp/tests/runtime/gptDecoderBatchedTest.cpp
index e1a86e4479a..7c152f48a9e 100644
--- a/cpp/tests/runtime/gptDecoderBatchedTest.cpp
+++ b/cpp/tests/runtime/gptDecoderBatchedTest.cpp
@@ -322,7 +322,7 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector<SamplingConfig>& sa
         modelConfig, worldConfig, manager);
 
     // set up inputs and outputs
-    tb::DecoderInputBuffers inputBuffers(batchSize, batchSize, maxGeneratedTokensPerStep, manager);
+    tb::DecoderInputBuffers inputBuffers(batchSize, maxGeneratedTokensPerStep, manager);
     auto batchSlotsRange = BufferRange<SizeType32>(*inputBuffers.setupBatchSlots);
     std::iota(batchSlotsRange.begin(), batchSlotsRange.end(), 0);
 
@@ -456,7 +456,7 @@ void testDecoderWavefront(nvinfer1::DataType const dtype, std::vector<SamplingCo
         modelConfig, worldConfig, manager);
 
     // set up inputs and outputs
-    tb::DecoderInputBuffers inputBuffers(batchSize, batchSize, maxGeneratedTokensPerStep, manager);
+    tb::DecoderInputBuffers inputBuffers(batchSize, maxGeneratedTokensPerStep, manager);
 
     auto decoderInputs = createDecoderInputs(
         batchSize, vocabSizePadded, dataType, samplingConfigs, generatedTokensPerSteps, computeLogProbs, manager);
@@ -610,7 +610,7 @@ void testDecoderDraft(nvinfer1::DataType const dtype, std::vector<SamplingConfig
     }
 
     // set up inputs and outputs
-    tb::DecoderInputBuffers inputBuffers(batchSize, batchSize, maxGeneratedTokensPerStep, manager);
+    tb::DecoderInputBuffers inputBuffers(batchSize, maxGeneratedTokensPerStep, manager);
 
     auto decoderInputs = createDecoderInputs(
         batchSize, vocabSizePadded, dataType, samplingConfigs, generatedTokensPerSteps, false, manager);
diff --git a/tensorrt_llm/_torch/pyexecutor/sampler.py b/tensorrt_llm/_torch/pyexecutor/sampler.py
index b4dfdf25d45..87b21328292 100644
--- a/tensorrt_llm/_torch/pyexecutor/sampler.py
+++ b/tensorrt_llm/_torch/pyexecutor/sampler.py
@@ -536,8 +536,7 @@ def _initialize_store(self):
             "buffer_manager":
             buffer_manager,
             "decoder_input_buffers": [
-                DecoderInputBuffers(self.max_num_sequences,
-                                    self.executor_config.max_batch_size,
+                DecoderInputBuffers(self.executor_config.max_batch_size,
                                     self.MAX_DECODING_TOKENS, buffer_manager)
                 for _ in range(self.num_micro_batches)
             ],

From 44040edbf0e4111a89b724cb74a9fef12eccfd3a Mon Sep 17 00:00:00 2001
From: Leslie Fang <leslief@nvidia.com>
Date: Fri, 18 Jul 2025 19:53:38 +0800
Subject: [PATCH 83/88] update broken link of PyTorchModelEngine in
 arch_overview (#6171)

Signed-off-by: leslie-fang25 <leslief@nvidia.com>
---
 docs/source/torch/arch_overview.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/torch/arch_overview.md b/docs/source/torch/arch_overview.md
index 11b12781cea..ec7f6e51abf 100644
--- a/docs/source/torch/arch_overview.md
+++ b/docs/source/torch/arch_overview.md
@@ -37,7 +37,7 @@ The single-step flow of PyExecutor involves:
 
 The core component of `PyExecutor` is the `ModelEngine`, responsible for executing the model's forward pass efficiently on the GPU.
 The key method of `ModelEngine` is `forward`, which handles the forward pass computation.
-For the PyTorch backend, the derived class is `PyTorchModelEngine`, declared in [pytorch_model_engine.py](../../../tensorrt_llm/_torch/pyexecutor/pytorch_model_engine.py).
+For the PyTorch backend, the derived class is `PyTorchModelEngine`, declared in [model_engine.py](../../../tensorrt_llm/_torch/pyexecutor/model_engine.py).
 
 ## Decoder
 

From 1c89971d95e9e6737ba2a1e2bf6ff0529dfc93c8 Mon Sep 17 00:00:00 2001
From: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
Date: Thu, 10 Jul 2025 15:28:13 +0000
Subject: [PATCH 84/88] refactor: Set decoder inputs in DecoderInputBuffers

Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
---
 .../batch_manager/decoderBuffers.h            | 16 +++++++++++---
 .../makeDecodingBatchInputOutput.h            |  5 ++---
 .../batch_manager/guidedDecoder.cpp           |  2 +-
 .../batch_manager/handleContextLogits.cpp     |  2 +-
 .../batch_manager/handleGenerationLogits.cpp  |  2 +-
 .../batch_manager/logitsPostProcessor.cpp     |  2 +-
 .../makeDecodingBatchInputOutput.cpp          | 21 ++++++++++++-------
 .../pybind/batch_manager/bindings.cpp         |  3 ++-
 8 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/cpp/include/tensorrt_llm/batch_manager/decoderBuffers.h b/cpp/include/tensorrt_llm/batch_manager/decoderBuffers.h
index 2af03c0af71..df507cf1001 100644
--- a/cpp/include/tensorrt_llm/batch_manager/decoderBuffers.h
+++ b/cpp/include/tensorrt_llm/batch_manager/decoderBuffers.h
@@ -38,6 +38,7 @@ class DecoderInputBuffers
 public:
     using SizeType32 = runtime::SizeType32;
     using TensorPtr = runtime::ITensor::SharedPtr;
+    using TensorConstPtr = runtime::ITensor::SharedConstPtr;
 
     explicit DecoderInputBuffers(
         SizeType32 maxBatchSize, SizeType32 maxDecoderSteps, runtime::BufferManager const& manager);
@@ -60,13 +61,22 @@ class DecoderInputBuffers
     //! Requests for considered in decoder forward
     RequestVector decoderRequests;
 
+    //! Logits of decoder requests
+    std::vector<TensorPtr> decoderLogits;
+
+    //! Maximum number of decoding steps of decoder requests.
+    //! This is only more than 1 for external draft tokens speculative decoding.
+    SizeType32 maxDecoderSteps{1};
+
     //! Batch slots for all decoder steps, [maxDecoderSteps][maxBatchSize]
     std::vector<TensorPtr> forwardBatchSlots;
 
-    //! Logits of decoder requests
-    std::vector<TensorPtr> logits;
+    //! Logits for requests in forwardBatchSlots (in the same order).
+    //! [maxDecoderSteps][batchSize][1, beamWidth, vocabSizePadded], on gpu
+    std::vector<std::vector<TensorConstPtr>> batchLogits;
 
-    //! Logits for speculative decoding (Medusa)
+    //! Logits for speculative decoding (Medusa).
+    //! The vector is sparse, only slots in forwardBatchSlots are used.
     //! [maxBatchSize][maxAcceptedDraftTokensPerStep][maxDraftTokens + 1, vocabSizePadded]
     std::vector<std::vector<runtime::ITensor::SharedPtr>> predictedDraftLogits;
 };
diff --git a/cpp/include/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h b/cpp/include/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h
index cea23a4e7ec..2cd13906c00 100644
--- a/cpp/include/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h
+++ b/cpp/include/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h
@@ -50,9 +50,8 @@ class MakeDecodingBatchInputOutput : Algorithm
         runtime::decoder::DecoderState& decoderState, runtime::ModelConfig const& modelConfig,
         SizeType32 maxNumSequences, OptionalRef<RuntimeBuffers> fusedRuntimeBuffers) const;
 
-    [[nodiscard]] static std::unique_ptr<runtime::decoder_batch::Input> createDecoderBatchInputs(
-        std::vector<SizeType32> const& activeSlots, runtime::decoder::DecoderState const& decoderState,
-        std::vector<TensorPtr> const& logits, SizeType32 maxNumSequences, std::vector<TensorPtr> const& batchSlots);
+    static void createDecoderBatchInputs(DecoderInputBuffers& inputBuffers, std::vector<SizeType32> const& activeSlots,
+        runtime::decoder::DecoderState const& decoderState, SizeType32 maxNumSequences);
 };
 
 } // namespace tensorrt_llm::batch_manager
diff --git a/cpp/tensorrt_llm/batch_manager/guidedDecoder.cpp b/cpp/tensorrt_llm/batch_manager/guidedDecoder.cpp
index a5a7502c330..646f730cc3a 100644
--- a/cpp/tensorrt_llm/batch_manager/guidedDecoder.cpp
+++ b/cpp/tensorrt_llm/batch_manager/guidedDecoder.cpp
@@ -163,7 +163,7 @@ void GuidedDecoder::execute(DecoderInputBuffers const& decoderInputBuffers, Buff
             {
                 auto const seqSlot = llmReq->mSeqSlot.value();
 
-                auto const& logits = decoderInputBuffers.logits.at(requestIdx);
+                auto const& logits = decoderInputBuffers.decoderLogits.at(requestIdx);
                 auto const logitsBitmask = ITensor::at(mLogitsBitmask, {seqSlot});
 
                 // Use void* to unify the code for different mLogitsDtype
diff --git a/cpp/tensorrt_llm/batch_manager/handleContextLogits.cpp b/cpp/tensorrt_llm/batch_manager/handleContextLogits.cpp
index df3840c14b4..6f4a541ffcb 100644
--- a/cpp/tensorrt_llm/batch_manager/handleContextLogits.cpp
+++ b/cpp/tensorrt_llm/batch_manager/handleContextLogits.cpp
@@ -79,7 +79,7 @@ SizeType32 HandleContextLogits::operator()(DecoderInputBuffers& inputBuffers, Re
     auto& decoderRequests = inputBuffers.decoderRequests;
     decoderRequests.clear();
     decoderRequests.reserve(contextRequests.size());
-    auto& allDecoderLogits = inputBuffers.logits;
+    auto& allDecoderLogits = inputBuffers.decoderLogits;
     allDecoderLogits.clear();
     allDecoderLogits.reserve(contextRequests.size());
 
diff --git a/cpp/tensorrt_llm/batch_manager/handleGenerationLogits.cpp b/cpp/tensorrt_llm/batch_manager/handleGenerationLogits.cpp
index 5018ae36290..e2a7486b050 100644
--- a/cpp/tensorrt_llm/batch_manager/handleGenerationLogits.cpp
+++ b/cpp/tensorrt_llm/batch_manager/handleGenerationLogits.cpp
@@ -85,7 +85,7 @@ void HandleGenerationLogits::operator()(DecoderInputBuffers& inputBuffers, Reque
 
     auto& decoderRequests = inputBuffers.decoderRequests;
     decoderRequests.reserve(decoderRequests.size() + generationRequests.size());
-    auto& allDecoderLogits = inputBuffers.logits;
+    auto& allDecoderLogits = inputBuffers.decoderLogits;
     allDecoderLogits.reserve(allDecoderLogits.size() + generationRequests.size());
 
     for (auto const& llmReq : generationRequests)
diff --git a/cpp/tensorrt_llm/batch_manager/logitsPostProcessor.cpp b/cpp/tensorrt_llm/batch_manager/logitsPostProcessor.cpp
index dd34de0ef9a..99b4fc192de 100644
--- a/cpp/tensorrt_llm/batch_manager/logitsPostProcessor.cpp
+++ b/cpp/tensorrt_llm/batch_manager/logitsPostProcessor.cpp
@@ -49,7 +49,7 @@ bool LogitsPostProcessor::operator()(DecoderInputBuffers& inputBuffers, bool rep
     for (size_t batchIdx = 0; batchIdx < inputBuffers.decoderRequests.size(); ++batchIdx)
     {
         auto const& llmReq = inputBuffers.decoderRequests.at(batchIdx);
-        auto& logits = inputBuffers.logits.at(batchIdx);
+        auto& logits = inputBuffers.decoderLogits.at(batchIdx);
 
         // Invoke non-batched processor or collect arguments for batched processor
         if (llmReq->mLogitsPostProcessor)
diff --git a/cpp/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.cpp b/cpp/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.cpp
index c9b2bb0b937..ae05c8981d0 100644
--- a/cpp/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.cpp
+++ b/cpp/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.cpp
@@ -31,9 +31,9 @@ namespace tensorrt_llm::batch_manager
 using SizeType32 = MakeDecodingBatchInputOutput::SizeType32;
 using TensorPtr = MakeDecodingBatchInputOutput::TensorPtr;
 
-std::unique_ptr<tr::decoder_batch::Input> MakeDecodingBatchInputOutput::createDecoderBatchInputs(
+void MakeDecodingBatchInputOutput::createDecoderBatchInputs(DecoderInputBuffers& inputBuffers,
     std::vector<SizeType32> const& activeSlots, runtime::decoder::DecoderState const& decoderState,
-    std::vector<TensorPtr> const& decoderLogits, SizeType32 maxNumSequences, std::vector<TensorPtr> const& batchSlots)
+    SizeType32 maxNumSequences)
 {
     TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
 
@@ -42,6 +42,9 @@ std::unique_ptr<tr::decoder_batch::Input> MakeDecodingBatchInputOutput::createDe
     auto const& maxDecodingDecoderTokens = decoderState.getMaxDecodingDecoderTokens();
     auto const maxDecoderSteps = common::ceilDiv(maxDecodingEngineTokens, maxDecodingDecoderTokens);
 
+    auto& batchSlots = inputBuffers.forwardBatchSlots;
+    auto& decoderLogits = inputBuffers.decoderLogits;
+
     for (SizeType32 step = 0; step < maxDecoderSteps; ++step)
     {
         batchSlots.at(step)->resize(maxNumSequences);
@@ -64,7 +67,7 @@ std::unique_ptr<tr::decoder_batch::Input> MakeDecodingBatchInputOutput::createDe
             auto batchSlotsRange = tr::BufferRange<SizeType32>(*batchSlots.at(step));
             batchSlotsRange[batchSizes[step]] = slot;
             batchSizes[step]++;
-            TensorPtr logitsSlice = tr::ITensor::slice(logits, step, singleRequest);
+            auto logitsSlice = tr::ITensor::slice(logits, step, singleRequest);
             batchLogits[step].emplace_back(std::move(logitsSlice));
         }
     }
@@ -75,10 +78,10 @@ std::unique_ptr<tr::decoder_batch::Input> MakeDecodingBatchInputOutput::createDe
     }
     batchLogits.resize(maxActiveDecoderSteps);
 
-    auto decodingInput = std::make_unique<tr::decoder_batch::Input>(batchLogits, maxActiveDecoderSteps);
-    decodingInput->batchSlots = batchSlots;
+    inputBuffers.maxDecoderSteps = maxActiveDecoderSteps;
+    inputBuffers.batchLogits = batchLogits;
+
     TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
-    return decodingInput;
 }
 
 namespace
@@ -163,8 +166,10 @@ std::unique_ptr<tr::decoder_batch::Input> MakeDecodingBatchInputOutput::operator
 
     auto [activeSlots, generationSteps] = getActiveSlots(inputBuffers.decoderRequests);
 
-    auto decodingInput = createDecoderBatchInputs(
-        activeSlots, decoderState, inputBuffers.logits, maxNumSequences, inputBuffers.forwardBatchSlots);
+    createDecoderBatchInputs(inputBuffers, activeSlots, decoderState, maxNumSequences);
+
+    auto decodingInput = std::make_unique<tr::decoder_batch::Input>(inputBuffers.batchLogits, inputBuffers.maxDecoderSteps);
+    decodingInput->batchSlots = inputBuffers.forwardBatchSlots;
 
     auto const maxBeamWidth = decoderState.getMaxBeamWidth();
     if (maxBeamWidth > 1)
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp
index 63d91ddab3d..2bda3b4f1a7 100644
--- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp
+++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp
@@ -401,7 +401,8 @@ void initBindings(pybind11::module_& m)
         .def_readwrite("fill_values_device", &tb::DecoderInputBuffers::fillValuesDevice)
         .def_readwrite("inputs_ids", &tb::DecoderInputBuffers::inputsIds)
         .def_readwrite("forward_batch_slots", &tb::DecoderInputBuffers::forwardBatchSlots)
-        .def_readwrite("logits", &tb::DecoderInputBuffers::logits)
+        .def_readwrite("batch_logits", &tb::DecoderInputBuffers::batchLogits)
+        .def_readwrite("decoder_logits", &tb::DecoderInputBuffers::decoderLogits)
         .def_readwrite("decoder_requests", &tb::DecoderInputBuffers::decoderRequests);
 
     py::class_<tb::DecoderOutputBuffers>(m, "DecoderOutputBuffers")

From 45a2694cf2ffdbce873b88213c39b8479257e792 Mon Sep 17 00:00:00 2001
From: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
Date: Thu, 10 Jul 2025 15:33:29 +0000
Subject: [PATCH 85/88] refactor: Remove decoder_batch::Input and related
 interfaces

- Modified MakeDecodingBatchInputOutput to accept DecoderInputBuffers directly, simplifying the interface.
- Removed decoder_batch::Input and related interfaces.
- Updated GptDecoderBatched to work with the new DecoderInputBuffers structure.
- Adjusted Python bindings to reflect changes in the C++ interface.

Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
---
 .../tensorrt_llm/runtime/gptDecoderBatched.h  |  7 +++--
 .../tensorrt_llm/runtime/iGptDecoderBatched.h | 10 +++++--
 .../trtGptModelInflightBatching.cpp           |  2 +-
 .../runtime/gptDecoderBatched.cpp             | 14 +++++----
 cpp/tests/runtime/gptDecoderBatchedTest.cpp   | 30 +++++++++----------
 5 files changed, 35 insertions(+), 28 deletions(-)

diff --git a/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h b/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h
index d5dfe9b7b19..fa367134cd6 100644
--- a/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h
+++ b/cpp/include/tensorrt_llm/runtime/gptDecoderBatched.h
@@ -52,8 +52,9 @@ class GptDecoderBatched : public IGptDecoderBatched
 
     void disableLookahead(RequestVector const& genRequests, TensorPtr const& batchSlots) override;
 
-    CudaEvent forwardAsync(decoder::DecoderState const& decoderState, decoder_batch::Input const& input) override;
-    void forward(decoder::DecoderState const& decoderState, decoder_batch::Input const& input) override;
+    CudaEvent forwardAsync(
+        decoder::DecoderState const& decoderState, batch_manager::DecoderInputBuffers const& input) override;
+    void forward(decoder::DecoderState const& decoderState, batch_manager::DecoderInputBuffers const& input) override;
 
     //! @brief Gather final beam search results for request `batchSlot`.
     //! Result will only be available after event returned.
@@ -77,7 +78,7 @@ class GptDecoderBatched : public IGptDecoderBatched
 
 private:
     //! @brief Calls decoders for tokens per engine step
-    void forwardDispatch(decoder::DecoderState const& decoderState, decoder_batch::Input const& input);
+    void forwardDispatch(decoder::DecoderState const& decoderState, batch_manager::DecoderInputBuffers const& input);
 
 private:
     CudaStreamPtr mRuntimeStream;
diff --git a/cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h b/cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h
index 327af71f8a7..8d7bb02739e 100644
--- a/cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h
+++ b/cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h
@@ -27,8 +27,9 @@
 
 namespace tensorrt_llm::batch_manager
 {
+class DecoderInputBuffers;
 class LlmRequest;
-}
+} // namespace tensorrt_llm::batch_manager
 
 namespace tensorrt_llm::runtime
 {
@@ -94,10 +95,13 @@ class IGptDecoderBatched
     virtual void disableLookahead(RequestVector const& genRequests, TensorPtr const& batchSlots) = 0;
 
     //! @brief Run one step for all requests without blocking the host process and return the token for synchronization.
-    virtual CudaEvent forwardAsync(decoder::DecoderState const& decoderState, decoder_batch::Input const& input) = 0;
+    virtual CudaEvent forwardAsync(
+        decoder::DecoderState const& decoderState, batch_manager::DecoderInputBuffers const& input)
+        = 0;
 
     //! @brief Run one step for all requests and wait for completion on the host.
-    virtual void forward(decoder::DecoderState const& decoderState, decoder_batch::Input const& input) = 0;
+    virtual void forward(decoder::DecoderState const& decoderState, batch_manager::DecoderInputBuffers const& input)
+        = 0;
 
     //! @brief Gather final beam search results for request `batchIdx`.
     //! Result will only be available after event returned
diff --git a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
index 80418b2bc73..311507e73d3 100644
--- a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
+++ b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
@@ -2063,7 +2063,7 @@ runtime::CudaEvent TrtGptModelInflightBatching::decoderStepAsync(ScheduledReques
     decodingInput = (*mMakeDecodingBatchInputOutput)(mDecoderInputBuffers.at(fusedBufferId), *mDecoderState,
         mModelConfig, getMaxNumSequences(), *fusedRuntimeBuffers);
 
-    auto decoderFinishEvent = mDecoder->forwardAsync(*mDecoderState, *decodingInput);
+    auto decoderFinishEvent = mDecoder->forwardAsync(*mDecoderState, decoderInputBuffers);
 
     auto const returnLogProbs = batchReturnLogProbs(scheduledRequests);
     auto updateDecoderBuffersEvent = (*mUpdateDecoderBuffers)(mModelConfig, mDecoderOutputBuffers.at(fusedBufferId),
diff --git a/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp b/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp
index 6e22b8f2f49..52d518b277a 100644
--- a/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp
+++ b/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp
@@ -19,7 +19,7 @@
 #include "common.h"
 #include "decoderState.h"
 #include "iBuffer.h"
-#include "tensorrt_llm/batch_manager/createNewDecoderRequests.h"
+#include "tensorrt_llm/batch_manager/decoderBuffers.h"
 #include "tensorrt_llm/batch_manager/llmRequest.h"
 #include "tensorrt_llm/common/assert.h"
 #include "tensorrt_llm/executor/types.h"
@@ -33,6 +33,7 @@
 #include <vector>
 
 using namespace tensorrt_llm::runtime;
+namespace tb = tensorrt_llm::batch_manager;
 using TensorPtr = ITensor::SharedPtr;
 
 GptDecoderBatched::GptDecoderBatched(GptDecoderBatched::CudaStreamPtr stream)
@@ -102,7 +103,7 @@ namespace
 {
 //! @brief Prepare Input and Output for decoder step.
 // TODO: produce new input and output objects
-void prepareForward(decoder::DecoderState const& decoderState, SizeType32 step, decoder_batch::Input const& input,
+void prepareForward(decoder::DecoderState const& decoderState, SizeType32 step, tb::DecoderInputBuffers const& input,
     BufferManager const& bufferManager)
 {
     TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
@@ -112,7 +113,7 @@ void prepareForward(decoder::DecoderState const& decoderState, SizeType32 step,
     auto& dInput = decoderState.getJointDecodingInput();
     auto& dOutput = decoderState.getJointDecodingOutput();
 
-    dInput.batchSlots = input.batchSlots.at(step);
+    dInput.batchSlots = input.forwardBatchSlots.at(step);
     dInput.batchSize = static_cast<SizeType32>(dInput.batchSlots->getSize());
     dInput.logitsVec = input.logits.at(step);
 
@@ -150,7 +151,7 @@ void prepareForward(decoder::DecoderState const& decoderState, SizeType32 step,
 
 } // namespace
 
-void GptDecoderBatched::forwardDispatch(decoder::DecoderState const& decoderState, decoder_batch::Input const& input)
+void GptDecoderBatched::forwardDispatch(decoder::DecoderState const& decoderState, tb::DecoderInputBuffers const& input)
 {
     TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
 
@@ -168,7 +169,8 @@ void GptDecoderBatched::forwardDispatch(decoder::DecoderState const& decoderStat
     TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
 }
 
-CudaEvent GptDecoderBatched::forwardAsync(decoder::DecoderState const& decoderState, decoder_batch::Input const& input)
+CudaEvent GptDecoderBatched::forwardAsync(
+    decoder::DecoderState const& decoderState, tb::DecoderInputBuffers const& input)
 {
     TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
 
@@ -188,7 +190,7 @@ CudaEvent GptDecoderBatched::forwardAsync(decoder::DecoderState const& decoderSt
     return eventStop;
 }
 
-void GptDecoderBatched::forward(decoder::DecoderState const& decoderState, decoder_batch::Input const& input)
+void GptDecoderBatched::forward(decoder::DecoderState const& decoderState, tb::DecoderInputBuffers const& input)
 {
     TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
     auto decoderFinishEvent = forwardAsync(decoderState, input);
diff --git a/cpp/tests/runtime/gptDecoderBatchedTest.cpp b/cpp/tests/runtime/gptDecoderBatchedTest.cpp
index 7c152f48a9e..e2dbdf9d326 100644
--- a/cpp/tests/runtime/gptDecoderBatchedTest.cpp
+++ b/cpp/tests/runtime/gptDecoderBatchedTest.cpp
@@ -43,7 +43,7 @@ namespace tle = tensorrt_llm::executor;
 namespace tc = tensorrt_llm::common;
 namespace tb = tensorrt_llm::batch_manager;
 
-using TensorPtr = decoder_batch::Input::TensorPtr;
+using TensorPtr = ITensor::SharedPtr;
 
 namespace
 {
@@ -353,9 +353,9 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector<SamplingConfig>& sa
 
     auto activeSlots = std::vector<SizeType32>(batchSize);
     std::iota(activeSlots.begin(), activeSlots.end(), 0);
-    auto inputs = tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(
-        activeSlots, decoderState, decoderInputs.logits, batchSize, inputBuffers.forwardBatchSlots);
-    decoder.forward(decoderState, *inputs);
+    tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(
+        inputBuffers, activeSlots, decoderState, decoderInputs.logits, batchSize);
+    decoder.forward(decoderState, inputBuffers);
 
     checkSequenceLengths(*decoderState.getSequenceLengths(), expectedLengths, manager);
     EXPECT_THAT(getFinished(*decoderState.getFinishedSum(), samplingConfigs, manager), ::testing::Each(false));
@@ -366,14 +366,14 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector<SamplingConfig>& sa
     // run decoder for 1 step
     advanceSequenceLengths(expectedLengths, acceptedTokensPerStep, samplingConfigs,
         getFinished(*decoderState.getFinishedSum(), samplingConfigs, manager), batchSize, maxBeamWidth);
-    decoder.forward(decoderState, *inputs);
+    decoder.forward(decoderState, inputBuffers);
     checkSequenceLengths(*decoderState.getSequenceLengths(), expectedLengths, manager);
     EXPECT_THAT(getFinished(*decoderState.getFinishedSum(), samplingConfigs, manager), ::testing::Each(true));
 
     verifyResults(manager, decoderState, samplingConfigs, inputLengths, expectedLengths, batchSize, maxBeamWidth,
         maxSeqLength, inputTokenId, expectedTokenId, endId);
 
-    EXPECT_NO_THROW(decoder.forward(decoderState, *inputs));
+    EXPECT_NO_THROW(decoder.forward(decoderState, inputBuffers));
     checkSequenceLengths(*decoderState.getSequenceLengths(), expectedLengths, manager);
 
     TensorPtr batchSlotsView = ITensor::slice(inputBuffers.setupBatchSlots, 0, 1);
@@ -483,9 +483,9 @@ void testDecoderWavefront(nvinfer1::DataType const dtype, std::vector<SamplingCo
 
         auto activeSlots = std::vector<SizeType32>(batchIdx + 1);
         std::iota(activeSlots.begin(), activeSlots.end(), 0);
-        auto inputs = tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(
-            activeSlots, decoderState, decoderInputs.logits, batchSize, inputBuffers.forwardBatchSlots);
-        decoder.forward(decoderState, *inputs);
+        tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(
+            inputBuffers, activeSlots, decoderState, decoderInputs.logits, batchSize);
+        decoder.forward(decoderState, inputBuffers);
 
         advanceSequenceLengths(
             expectedLengths, acceptedTokensPerStep, samplingConfigs, expectedFinished, batchIdx + 1, maxBeamWidth);
@@ -506,9 +506,9 @@ void testDecoderWavefront(nvinfer1::DataType const dtype, std::vector<SamplingCo
     auto finishedVec = getFinished(*decoderState.getFinishedSum(), samplingConfigs, manager);
     while (!std::all_of(expectedFinished.begin(), expectedFinished.end(), [](bool finish) { return finish; }))
     {
-        auto inputs = tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(
-            activeSlots, decoderState, decoderInputs.logits, batchSize, inputBuffers.forwardBatchSlots);
-        decoder.forward(decoderState, *inputs);
+        tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(
+            inputBuffers, activeSlots, decoderState, decoderInputs.logits, batchSize);
+        decoder.forward(decoderState, inputBuffers);
         finishedVec = getFinished(*decoderState.getFinishedSum(), samplingConfigs, manager);
 
         advanceSequenceLengths(
@@ -642,9 +642,9 @@ void testDecoderDraft(nvinfer1::DataType const dtype, std::vector<SamplingConfig
 
     auto activeSlots = std::vector<SizeType32>(batchSize);
     std::iota(activeSlots.begin(), activeSlots.end(), 0);
-    auto inputs = tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(
-        activeSlots, decoderState, decoderInputs.logits, batchSize, inputBuffers.forwardBatchSlots);
-    decoder.forward(decoderState, *inputs);
+    tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(
+        inputBuffers, activeSlots, decoderState, decoderInputs.logits, batchSize);
+    decoder.forward(decoderState, inputBuffers);
     checkSequenceLengths(*decoderState.getSequenceLengths(), expectedLengths, manager);
     EXPECT_THAT(getFinished(*decoderState.getFinishedSum(), samplingConfigs, manager), ::testing::Each(false));
 

From b8091b993684c5aeabce2511fe6ae239b988a7ec Mon Sep 17 00:00:00 2001
From: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
Date: Thu, 10 Jul 2025 15:35:24 +0000
Subject: [PATCH 86/88] refactor: Remove decoder_batch::Input and related
 interfaces

- Modified MakeDecodingBatchInputOutput to accept DecoderInputBuffers directly, simplifying the interface.
- Removed decoder_batch::Input and related interfaces.
- Updated GptDecoderBatched to work with the new DecoderInputBuffers structure.
- Adjusted Python bindings to reflect changes in the C++ interface.

Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
---
 .../makeDecodingBatchInputOutput.h            |  8 ++--
 .../tensorrt_llm/runtime/iGptDecoderBatched.h | 37 -------------------
 .../makeDecodingBatchInputOutput.cpp          |  6 +--
 .../trtGptModelInflightBatching.cpp           |  7 +---
 .../trtGptModelInflightBatching.h             |  2 -
 .../pybind/batch_manager/algorithms.cpp       |  1 +
 .../pybind/batch_manager/bindings.cpp         | 32 ++++------------
 cpp/tensorrt_llm/pybind/runtime/bindings.cpp  | 23 ++++++++----
 .../runtime/gptDecoderBatched.cpp             |  2 +-
 cpp/tests/runtime/gptDecoderBatchedTest.cpp   | 12 ++----
 .../make_decoding_batch_input_output.py       | 20 +++++-----
 tensorrt_llm/_torch/pyexecutor/sampler.py     | 37 ++++++++++++-------
 12 files changed, 69 insertions(+), 118 deletions(-)

diff --git a/cpp/include/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h b/cpp/include/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h
index 2cd13906c00..db32b70ff63 100644
--- a/cpp/include/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h
+++ b/cpp/include/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h
@@ -40,15 +40,15 @@ class MakeDecodingBatchInputOutput : Algorithm
     constexpr static auto name{"MakeDecodingBatchInputOutput"};
 
     using SizeType32 = tensorrt_llm::runtime::SizeType32;
-    using TensorPtr = runtime::decoder_batch::Input::TensorPtr;
+    using TensorPtr = runtime::ITensor::SharedPtr;
     template <typename T>
     using OptionalRef = tensorrt_llm::common::OptionalRef<T>;
 
     MakeDecodingBatchInputOutput() = default;
 
-    std::unique_ptr<runtime::decoder_batch::Input> operator()(DecoderInputBuffers& inputBuffers,
-        runtime::decoder::DecoderState& decoderState, runtime::ModelConfig const& modelConfig,
-        SizeType32 maxNumSequences, OptionalRef<RuntimeBuffers> fusedRuntimeBuffers) const;
+    void operator()(DecoderInputBuffers& inputBuffers, runtime::decoder::DecoderState& decoderState,
+        runtime::ModelConfig const& modelConfig, SizeType32 maxNumSequences,
+        OptionalRef<RuntimeBuffers> fusedRuntimeBuffers) const;
 
     static void createDecoderBatchInputs(DecoderInputBuffers& inputBuffers, std::vector<SizeType32> const& activeSlots,
         runtime::decoder::DecoderState const& decoderState, SizeType32 maxNumSequences);
diff --git a/cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h b/cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h
index 8d7bb02739e..f89f5979ec9 100644
--- a/cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h
+++ b/cpp/include/tensorrt_llm/runtime/iGptDecoderBatched.h
@@ -40,43 +40,6 @@ namespace decoder
 class DecoderState;
 }
 
-namespace decoder_batch
-{
-
-class Input
-{
-public:
-    using TensorConstPtr = ITensor::SharedConstPtr;
-    using TensorPtr = ITensor::SharedPtr;
-
-    explicit Input(std::vector<std::vector<TensorConstPtr>> const& logits, SizeType32 maxDecoderSteps)
-        : logits{logits}
-        , maxDecoderSteps{maxDecoderSteps}
-    {
-        TLLM_CHECK_WITH_INFO(
-            logits.size() == static_cast<size_t>(maxDecoderSteps), "logits vector size does not match maxDecoderSteps");
-    }
-
-    explicit Input(std::vector<TensorConstPtr> const& logits)
-        : Input{{logits}, 1}
-    {
-    }
-
-    //! Mandatory parameters
-    //! Logits
-    // FIXME: remove first dimension of tensors
-    //! [maxDecoderSteps][batchSize][1, beamWidth, vocabSizePadded], on gpu
-    std::vector<std::vector<TensorConstPtr>> logits;
-
-    //! Maximum number of decoding tokens of active slots
-    SizeType32 maxDecoderSteps;
-
-    //! Batch of active decoder slots, sorted by slots, [maxDecoderSteps][batchSize]
-    std::vector<TensorPtr> batchSlots;
-};
-
-} // namespace decoder_batch
-
 //! GPT decoder class with support for in-flight batching
 class IGptDecoderBatched
 {
diff --git a/cpp/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.cpp b/cpp/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.cpp
index ae05c8981d0..a7bceb1bf69 100644
--- a/cpp/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.cpp
+++ b/cpp/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.cpp
@@ -158,7 +158,7 @@ void setEagleInputs(tr::DecodingInput& dInput, RuntimeBuffers const& fusedRuntim
 
 } // namespace
 
-std::unique_ptr<tr::decoder_batch::Input> MakeDecodingBatchInputOutput::operator()(DecoderInputBuffers& inputBuffers,
+void MakeDecodingBatchInputOutput::operator()(DecoderInputBuffers& inputBuffers,
     runtime::decoder::DecoderState& decoderState, runtime::ModelConfig const& modelConfig, SizeType32 maxNumSequences,
     OptionalRef<RuntimeBuffers> fusedRuntimeBuffers) const
 {
@@ -168,9 +168,6 @@ std::unique_ptr<tr::decoder_batch::Input> MakeDecodingBatchInputOutput::operator
 
     createDecoderBatchInputs(inputBuffers, activeSlots, decoderState, maxNumSequences);
 
-    auto decodingInput = std::make_unique<tr::decoder_batch::Input>(inputBuffers.batchLogits, inputBuffers.maxDecoderSteps);
-    decodingInput->batchSlots = inputBuffers.forwardBatchSlots;
-
     auto const maxBeamWidth = decoderState.getMaxBeamWidth();
     if (maxBeamWidth > 1)
     {
@@ -197,7 +194,6 @@ std::unique_ptr<tr::decoder_batch::Input> MakeDecodingBatchInputOutput::operator
     }
 
     TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
-    return decodingInput;
 }
 
 } // namespace tensorrt_llm::batch_manager
diff --git a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
index 311507e73d3..66d86522963 100644
--- a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
+++ b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
@@ -1545,8 +1545,6 @@ void TrtGptModelInflightBatching::createBuffers(executor::DecodingConfig const&
             mOperatingBeamWidth, getMaxSequenceLen(), mRuntime->getBufferManager()));
     }
 
-    mDecodingInputs.resize(mNumMicroBatches);
-
     TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
 }
 
@@ -2059,9 +2057,8 @@ runtime::CudaEvent TrtGptModelInflightBatching::decoderStepAsync(ScheduledReques
     auto const fusedBufferId = getFusedBufferId();
     auto& fusedRuntimeBuffers = mBuffers.at(fusedBufferId);
 
-    auto& decodingInput = mDecodingInputs.at(mMicroBatchId);
-    decodingInput = (*mMakeDecodingBatchInputOutput)(mDecoderInputBuffers.at(fusedBufferId), *mDecoderState,
-        mModelConfig, getMaxNumSequences(), *fusedRuntimeBuffers);
+    (*mMakeDecodingBatchInputOutput)(decoderInputBuffers, *mDecoderState, mModelConfig, getMaxNumSequences(),
+        *fusedRuntimeBuffers);
 
     auto decoderFinishEvent = mDecoder->forwardAsync(*mDecoderState, decoderInputBuffers);
 
diff --git a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.h b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.h
index 6e9f1c8ce0f..3b6cd709b25 100644
--- a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.h
+++ b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.h
@@ -548,8 +548,6 @@ class TrtGptModelInflightBatching : public TrtGptModel
     std::vector<std::unique_ptr<SlotDecoderBuffers>> mSlotDecoderBuffers;
     // PEFT table for each micro batch
     std::vector<PeftTable> mPeftTables;
-    // Decoder input for each micro batch.
-    std::vector<std::unique_ptr<runtime::decoder_batch::Input>> mDecodingInputs;
 
     /******************** Book keeping ********************/
     // List of requests in each micro batch
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp b/cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp
index f6bd8f02491..c26706f7444 100644
--- a/cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp
+++ b/cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp
@@ -20,6 +20,7 @@
 #include "tensorrt_llm/batch_manager/assignReqSeqSlots.h"
 #include "tensorrt_llm/batch_manager/capacityScheduler.h"
 #include "tensorrt_llm/batch_manager/createNewDecoderRequests.h"
+#include "tensorrt_llm/batch_manager/decoderBuffers.h"
 #include "tensorrt_llm/batch_manager/handleContextLogits.h"
 #include "tensorrt_llm/batch_manager/handleGenerationLogits.h"
 #include "tensorrt_llm/batch_manager/kvCacheManager.h"
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp
index 2bda3b4f1a7..5237300dc3b 100644
--- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp
+++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp
@@ -392,19 +392,6 @@ void initBindings(pybind11::module_& m)
         .def(py::init<tr::SizeType32, tr::ModelConfig, tr::WorldConfig, tr::BufferManager>(),
             py::arg("max_num_sequences"), py::arg("model_config"), py::arg("world_config"), py::arg("buffer_manager"));
 
-    py::class_<tb::DecoderInputBuffers>(m, "DecoderInputBuffers")
-        .def(py::init<runtime::SizeType32, runtime::SizeType32, tr::BufferManager>(), py::arg("max_batch_size"),
-            py::arg("max_tokens_per_engine_step"), py::arg("manager"))
-        .def_readwrite("setup_batch_slots", &tb::DecoderInputBuffers::setupBatchSlots)
-        .def_readwrite("setup_batch_slots_device", &tb::DecoderInputBuffers::setupBatchSlotsDevice)
-        .def_readwrite("fill_values", &tb::DecoderInputBuffers::fillValues)
-        .def_readwrite("fill_values_device", &tb::DecoderInputBuffers::fillValuesDevice)
-        .def_readwrite("inputs_ids", &tb::DecoderInputBuffers::inputsIds)
-        .def_readwrite("forward_batch_slots", &tb::DecoderInputBuffers::forwardBatchSlots)
-        .def_readwrite("batch_logits", &tb::DecoderInputBuffers::batchLogits)
-        .def_readwrite("decoder_logits", &tb::DecoderInputBuffers::decoderLogits)
-        .def_readwrite("decoder_requests", &tb::DecoderInputBuffers::decoderRequests);
-
     py::class_<tb::DecoderOutputBuffers>(m, "DecoderOutputBuffers")
         .def_readwrite("sequence_lengths_host", &tb::DecoderOutputBuffers::sequenceLengthsHost)
         .def_readwrite("finished_sum_host", &tb::DecoderOutputBuffers::finishedSumHost)
@@ -451,10 +438,10 @@ void initBindings(pybind11::module_& m)
 
     m.def(
         "make_decoding_batch_input",
-        [](std::vector<std::shared_ptr<tb::LlmRequest>>& contextRequests,
-            std::vector<std::shared_ptr<tb::LlmRequest>>& genRequests, tr::ITensor::SharedPtr logits, int beamWidth,
-            std::vector<int> const& numContextLogitsPrefixSum, tb::DecoderInputBuffers const& decoderInputBuffers,
-            runtime::decoder::DecoderState& decoderState, tr::BufferManager const& manager)
+        [](tb::DecoderInputBuffers& decoderInputBuffers, runtime::decoder::DecoderState& decoderState,
+            std::vector<std::shared_ptr<tb::LlmRequest>> const& contextRequests,
+            std::vector<std::shared_ptr<tb::LlmRequest>> const& genRequests, tr::ITensor::SharedPtr const& logits,
+            int beamWidth, std::vector<int> const& numContextLogitsPrefixSum, tr::BufferManager const& manager)
         {
             std::vector<int> activeSlots;
             std::vector<int> generationSteps;
@@ -512,8 +499,7 @@ void initBindings(pybind11::module_& m)
                 batchSlotsRange[i] = activeSlots[i];
             }
 
-            auto decodingInput = std::make_unique<tr::decoder_batch::Input>(logitsVec, 1);
-            decodingInput->batchSlots = batchSlots;
+            decoderInputBuffers.batchLogits = logitsVec;
 
             auto const maxBeamWidth = decoderState.getMaxBeamWidth();
             if (maxBeamWidth > 1)
@@ -521,12 +507,10 @@ void initBindings(pybind11::module_& m)
                 // For Variable-Beam-Width-Search
                 decoderState.getJointDecodingInput().generationSteps = generationSteps;
             }
-
-            return decodingInput;
         },
-        py::arg("context_requests"), py::arg("generation_requests"), py::arg("logits"), py::arg("beam_width"),
-        py::arg("num_context_logits_prefix_sum"), py::arg("decoder_input_buffers"), py::arg("decoder_state"),
-        py::arg("buffer_manager"), "Make decoding batch input.");
+        py::arg("decoder_input_buffers"), py::arg("decoder_state"), py::arg("context_requests"),
+        py::arg("generation_requests"), py::arg("logits"), py::arg("beam_width"),
+        py::arg("num_context_logits_prefix_sum"), py::arg("buffer_manager"), "Make decoding batch input.");
 }
 
 } // namespace tensorrt_llm::pybind::batch_manager
diff --git a/cpp/tensorrt_llm/pybind/runtime/bindings.cpp b/cpp/tensorrt_llm/pybind/runtime/bindings.cpp
index c4adb08567b..115c1a389c7 100644
--- a/cpp/tensorrt_llm/pybind/runtime/bindings.cpp
+++ b/cpp/tensorrt_llm/pybind/runtime/bindings.cpp
@@ -17,6 +17,7 @@
 
 #include "bindings.h"
 #include "moeBindings.h"
+#include "tensorrt_llm/batch_manager/decoderBuffers.h"
 #include "tensorrt_llm/kernels/communicationKernels/allReduceWorkspace.h"
 #include "tensorrt_llm/kernels/communicationKernels/customLowPrecisionAllReduceKernels.h"
 #include "tensorrt_llm/kernels/customAllReduceKernels.h"
@@ -48,6 +49,7 @@
 
 namespace tr = tensorrt_llm::runtime;
 namespace te = tensorrt_llm::executor;
+namespace tb = tensorrt_llm::batch_manager;
 
 class PyITensor : public tensorrt_llm::runtime::ITensor
 {
@@ -268,14 +270,6 @@ void initBindings(pybind11::module_& m)
         .def_readwrite("medusa_tree_ids", &tr::decoder_batch::Request::medusaTreeIds)
         .def_readwrite("lookahead_runtime_config", &tr::decoder_batch::Request::lookaheadRuntimeConfig);
 
-    py::class_<tr::decoder_batch::Input>(m, "DecoderBatchInput")
-        .def(py::init<std::vector<std::vector<tr::ITensor::SharedConstPtr>>, tr::SizeType32>(), py::arg("logits"),
-            py::arg("max_decoding_engine_tokens"))
-        .def(py::init<std::vector<tr::ITensor::SharedConstPtr>>(), py::arg("logits"))
-        .def_readwrite("logits", &tr::decoder_batch::Input::logits)
-        .def_readwrite("max_decoder_steps", &tr::decoder_batch::Input::maxDecoderSteps)
-        .def_readwrite("batch_slots", &tr::decoder_batch::Input::batchSlots);
-
     py::class_<tr::LookaheadDecodingBuffers>(m, "LookaheadDecodingBuffers")
         .def(py::init<tr::SizeType32, tr::SizeType32, tr::BufferManager const&>(), py::arg("max_num_sequences"),
             py::arg("max_tokens_per_step"), py::arg("buffer_manager"))
@@ -384,6 +378,19 @@ void initBindings(pybind11::module_& m)
         .def_property("generation_steps", &tr::decoder::DecoderState::getGenerationSteps,
             &tr::decoder::DecoderState::setGenerationSteps);
 
+    py::class_<tb::DecoderInputBuffers>(m, "DecoderInputBuffers")
+        .def(py::init<tr::SizeType32, tr::SizeType32, tr::BufferManager>(), py::arg("max_batch_size"),
+            py::arg("max_tokens_per_engine_step"), py::arg("manager"))
+        .def_readwrite("setup_batch_slots", &tb::DecoderInputBuffers::setupBatchSlots)
+        .def_readwrite("setup_batch_slots_device", &tb::DecoderInputBuffers::setupBatchSlotsDevice)
+        .def_readwrite("fill_values", &tb::DecoderInputBuffers::fillValues)
+        .def_readwrite("fill_values_device", &tb::DecoderInputBuffers::fillValuesDevice)
+        .def_readwrite("inputs_ids", &tb::DecoderInputBuffers::inputsIds)
+        .def_readwrite("batch_logits", &tb::DecoderInputBuffers::batchLogits)
+        .def_readwrite("forward_batch_slots", &tb::DecoderInputBuffers::forwardBatchSlots)
+        .def_readwrite("decoder_logits", &tb::DecoderInputBuffers::decoderLogits)
+        .def_readwrite("max_decoder_steps", &tb::DecoderInputBuffers::maxDecoderSteps);
+
     py::class_<tr::GptDecoderBatched>(m, "GptDecoderBatched")
         .def(py::init<tr::GptDecoderBatched::CudaStreamPtr>(), py::arg("stream"))
         .def("setup", &tr::GptDecoderBatched::setup, py::arg("mode"), py::arg("max_batch_size"),
diff --git a/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp b/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp
index 52d518b277a..10f86bade0d 100644
--- a/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp
+++ b/cpp/tensorrt_llm/runtime/gptDecoderBatched.cpp
@@ -115,7 +115,7 @@ void prepareForward(decoder::DecoderState const& decoderState, SizeType32 step,
 
     dInput.batchSlots = input.forwardBatchSlots.at(step);
     dInput.batchSize = static_cast<SizeType32>(dInput.batchSlots->getSize());
-    dInput.logitsVec = input.logits.at(step);
+    dInput.logitsVec = input.batchLogits.at(step);
 
     TensorPtr finishedStepsInput = ITensor::slice(decoderState.getFinishedSteps(), step, 1);
     TensorPtr finishedStepsOutput
diff --git a/cpp/tests/runtime/gptDecoderBatchedTest.cpp b/cpp/tests/runtime/gptDecoderBatchedTest.cpp
index e2dbdf9d326..9c9ed4a762e 100644
--- a/cpp/tests/runtime/gptDecoderBatchedTest.cpp
+++ b/cpp/tests/runtime/gptDecoderBatchedTest.cpp
@@ -353,8 +353,7 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector<SamplingConfig>& sa
 
     auto activeSlots = std::vector<SizeType32>(batchSize);
     std::iota(activeSlots.begin(), activeSlots.end(), 0);
-    tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(
-        inputBuffers, activeSlots, decoderState, decoderInputs.logits, batchSize);
+    tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(inputBuffers, activeSlots, decoderState, batchSize);
     decoder.forward(decoderState, inputBuffers);
 
     checkSequenceLengths(*decoderState.getSequenceLengths(), expectedLengths, manager);
@@ -483,8 +482,7 @@ void testDecoderWavefront(nvinfer1::DataType const dtype, std::vector<SamplingCo
 
         auto activeSlots = std::vector<SizeType32>(batchIdx + 1);
         std::iota(activeSlots.begin(), activeSlots.end(), 0);
-        tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(
-            inputBuffers, activeSlots, decoderState, decoderInputs.logits, batchSize);
+        tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(inputBuffers, activeSlots, decoderState, batchSize);
         decoder.forward(decoderState, inputBuffers);
 
         advanceSequenceLengths(
@@ -506,8 +504,7 @@ void testDecoderWavefront(nvinfer1::DataType const dtype, std::vector<SamplingCo
     auto finishedVec = getFinished(*decoderState.getFinishedSum(), samplingConfigs, manager);
     while (!std::all_of(expectedFinished.begin(), expectedFinished.end(), [](bool finish) { return finish; }))
     {
-        tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(
-            inputBuffers, activeSlots, decoderState, decoderInputs.logits, batchSize);
+        tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(inputBuffers, activeSlots, decoderState, batchSize);
         decoder.forward(decoderState, inputBuffers);
         finishedVec = getFinished(*decoderState.getFinishedSum(), samplingConfigs, manager);
 
@@ -642,8 +639,7 @@ void testDecoderDraft(nvinfer1::DataType const dtype, std::vector<SamplingConfig
 
     auto activeSlots = std::vector<SizeType32>(batchSize);
     std::iota(activeSlots.begin(), activeSlots.end(), 0);
-    tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(
-        inputBuffers, activeSlots, decoderState, decoderInputs.logits, batchSize);
+    tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(inputBuffers, activeSlots, decoderState, batchSize);
     decoder.forward(decoderState, inputBuffers);
     checkSequenceLengths(*decoderState.getSequenceLengths(), expectedLengths, manager);
     EXPECT_THAT(getFinished(*decoderState.getFinishedSum(), samplingConfigs, manager), ::testing::Each(false));
diff --git a/tensorrt_llm/_torch/pyexecutor/make_decoding_batch_input_output.py b/tensorrt_llm/_torch/pyexecutor/make_decoding_batch_input_output.py
index 1f92b87ff16..d83b9ba2acf 100644
--- a/tensorrt_llm/_torch/pyexecutor/make_decoding_batch_input_output.py
+++ b/tensorrt_llm/_torch/pyexecutor/make_decoding_batch_input_output.py
@@ -4,7 +4,7 @@
 import torch
 
 from tensorrt_llm._utils import nvtx_range
-from tensorrt_llm.bindings.internal.runtime import DecoderBatchInput
+from tensorrt_llm.bindings.internal.runtime import DecoderInputBuffers
 
 
 @dataclass
@@ -19,21 +19,22 @@ class MakeDecodingBatchInputOutput:
     @nvtx_range("make_decoding_batch_input_output")
     def __call__(
         self,
+        decoder_input_buffers: DecoderInputBuffers,
+        decoder_state,
         scheduled_requests,
         logits: torch.Tensor,
         beam_width: int,
         num_context_logits_prefix_sum: List[int],
-    ) -> DecoderBatchInput:
+    ):
         """Create decoder batch inputs and outputs for the given requests.
 
         Args:
+            decoder_input_buffers: Decoder input buffers
+            decoder_state: Current decoder state
             scheduled_requests: Scheduled requests
             logits: Logits tensor
             beam_width: Beam width
             num_context_logits_prefix_sum: Number of context logits prefix sum
-
-        Returns:
-            DecoderBatchInput
         """
         # In order to make a decoding_input assuming no drafting, we need:
         # 1. logits_vec = [[logits_slice of each active slot]]
@@ -61,10 +62,9 @@ def __call__(
                                   start=logits_index + i * beam_width,
                                   length=beam_width).unsqueeze(0))
 
-        decoding_input = DecoderBatchInput(logits_vec, 1)
-        decoding_input.generation_steps = generation_steps
-        decoding_input.batch_slots = [
+        decoder_state.generation_steps = generation_steps
+        decoder_input_buffers.forward_batch_slots = [
             torch.tensor(active_slots[0], dtype=torch.int32)
         ]
-
-        return decoding_input
+        decoder_input_buffers.logits = logits_vec
+        decoder_input_buffers.max_decoder_steps = 1
diff --git a/tensorrt_llm/_torch/pyexecutor/sampler.py b/tensorrt_llm/_torch/pyexecutor/sampler.py
index 87b21328292..1315a868054 100644
--- a/tensorrt_llm/_torch/pyexecutor/sampler.py
+++ b/tensorrt_llm/_torch/pyexecutor/sampler.py
@@ -15,8 +15,9 @@
                                             ExecutorConfig, FinishReason)
 from tensorrt_llm.bindings.internal.algorithms import CreateNewDecoderRequests
 from tensorrt_llm.bindings.internal.batch_manager import (
-    DecoderInputBuffers, add_new_tokens_to_requests, make_decoding_batch_input)
+    add_new_tokens_to_requests, make_decoding_batch_input)
 from tensorrt_llm.bindings.internal.runtime import (BufferManager, CudaEvent,
+                                                    DecoderInputBuffers,
                                                     DecoderState,
                                                     GptDecoderBatched)
 from tensorrt_llm.executor.result import Logprob
@@ -548,7 +549,6 @@ def _initialize_store(self):
                         dtype=torch.int),
             "decoder_state":
             DecoderState(),
-            "decoding_input": [None] * self.num_micro_batches,
         }
 
         self.store["decoder_state"].setup(
@@ -656,21 +656,30 @@ def sample_async(self, scheduled_requests: ScheduledRequests,
             self._update_cache_indirection_buffer(scheduled_requests)
 
         # TODO: Enable this back once nanobind is merged and/or llm request is a pure python object
-        # decoding_input = self.algs.make_decoding_batch_input_output(
-        #     scheduled_requests, model_outputs["logits"], beam_width,
-        #     num_context_logits_prefix_sum)
-
-        self.store["decoding_input"][
-            self.micro_batch_idx] = make_decoding_batch_input(
-                scheduled_requests.context_requests,
-                scheduled_requests.generation_requests, model_outputs["logits"],
-                beam_width, num_context_logits_prefix_sum,
-                self.store["decoder_input_buffers"][self.micro_batch_idx],
-                self.store["decoder_state"], self.store["buffer_manager"])
+        # self.algs.make_decoding_batch_input_output(
+        #     self.store["decoder_input_buffers"][self.micro_batch_idx],
+        #     self.store["decoder_state"],
+        #     scheduled_requests,
+        #     model_outputs["logits"],
+        #     beam_width,
+        #     num_context_logits_prefix_sum,
+        # )
+
+        make_decoding_batch_input(
+            self.store["decoder_input_buffers"][self.micro_batch_idx],
+            self.store["decoder_state"],
+            scheduled_requests.context_requests,
+            scheduled_requests.generation_requests,
+            model_outputs["logits"],
+            beam_width,
+            num_context_logits_prefix_sum,
+            self.store["buffer_manager"],
+        )
 
         self.algs.decoder.forward_async(
             self.store["decoder_state"],
-            self.store["decoding_input"][self.micro_batch_idx])
+            self.store["decoder_input_buffers"][self.micro_batch_idx],
+        )
 
         new_output_tokens = self.store["decoder_state"].all_new_tokens.to(
             'cpu', non_blocking=True)

From 043aaf46e41620da77e65c9a10d4fb6bb96186fd Mon Sep 17 00:00:00 2001
From: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
Date: Mon, 7 Jul 2025 15:17:24 +0000
Subject: [PATCH 87/88] refactor: Remove maxNumSequences parameter from
 MakeDecodingBatchInputOutput

- Removed maxNumSequences parameter from createDecoderBatchInputs and related function calls, streamlining the interface.
- Updated all relevant implementations and tests to reflect the changes in function signatures.

Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
---
 .../batch_manager/makeDecodingBatchInputOutput.h         | 5 ++---
 .../batch_manager/makeDecodingBatchInputOutput.cpp       | 9 ++++-----
 .../batch_manager/trtGptModelInflightBatching.cpp        | 3 +--
 cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp     | 3 +--
 cpp/tests/runtime/gptDecoderBatchedTest.cpp              | 9 ++++-----
 5 files changed, 12 insertions(+), 17 deletions(-)

diff --git a/cpp/include/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h b/cpp/include/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h
index db32b70ff63..245f4b4b528 100644
--- a/cpp/include/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h
+++ b/cpp/include/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h
@@ -47,11 +47,10 @@ class MakeDecodingBatchInputOutput : Algorithm
     MakeDecodingBatchInputOutput() = default;
 
     void operator()(DecoderInputBuffers& inputBuffers, runtime::decoder::DecoderState& decoderState,
-        runtime::ModelConfig const& modelConfig, SizeType32 maxNumSequences,
-        OptionalRef<RuntimeBuffers> fusedRuntimeBuffers) const;
+        runtime::ModelConfig const& modelConfig, OptionalRef<RuntimeBuffers> fusedRuntimeBuffers) const;
 
     static void createDecoderBatchInputs(DecoderInputBuffers& inputBuffers, std::vector<SizeType32> const& activeSlots,
-        runtime::decoder::DecoderState const& decoderState, SizeType32 maxNumSequences);
+        runtime::decoder::DecoderState const& decoderState);
 };
 
 } // namespace tensorrt_llm::batch_manager
diff --git a/cpp/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.cpp b/cpp/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.cpp
index a7bceb1bf69..3e494a6383e 100644
--- a/cpp/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.cpp
+++ b/cpp/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.cpp
@@ -32,8 +32,7 @@ using SizeType32 = MakeDecodingBatchInputOutput::SizeType32;
 using TensorPtr = MakeDecodingBatchInputOutput::TensorPtr;
 
 void MakeDecodingBatchInputOutput::createDecoderBatchInputs(DecoderInputBuffers& inputBuffers,
-    std::vector<SizeType32> const& activeSlots, runtime::decoder::DecoderState const& decoderState,
-    SizeType32 maxNumSequences)
+    std::vector<SizeType32> const& activeSlots, runtime::decoder::DecoderState const& decoderState)
 {
     TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
 
@@ -47,7 +46,7 @@ void MakeDecodingBatchInputOutput::createDecoderBatchInputs(DecoderInputBuffers&
 
     for (SizeType32 step = 0; step < maxDecoderSteps; ++step)
     {
-        batchSlots.at(step)->resize(maxNumSequences);
+        batchSlots.at(step)->resize(activeSlots.size());
     }
 
     auto constexpr singleRequest = 1;
@@ -159,14 +158,14 @@ void setEagleInputs(tr::DecodingInput& dInput, RuntimeBuffers const& fusedRuntim
 } // namespace
 
 void MakeDecodingBatchInputOutput::operator()(DecoderInputBuffers& inputBuffers,
-    runtime::decoder::DecoderState& decoderState, runtime::ModelConfig const& modelConfig, SizeType32 maxNumSequences,
+    runtime::decoder::DecoderState& decoderState, runtime::ModelConfig const& modelConfig,
     OptionalRef<RuntimeBuffers> fusedRuntimeBuffers) const
 {
     TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
 
     auto [activeSlots, generationSteps] = getActiveSlots(inputBuffers.decoderRequests);
 
-    createDecoderBatchInputs(inputBuffers, activeSlots, decoderState, maxNumSequences);
+    createDecoderBatchInputs(inputBuffers, activeSlots, decoderState);
 
     auto const maxBeamWidth = decoderState.getMaxBeamWidth();
     if (maxBeamWidth > 1)
diff --git a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
index 66d86522963..bcc07ed9576 100644
--- a/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
+++ b/cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp
@@ -2057,8 +2057,7 @@ runtime::CudaEvent TrtGptModelInflightBatching::decoderStepAsync(ScheduledReques
     auto const fusedBufferId = getFusedBufferId();
     auto& fusedRuntimeBuffers = mBuffers.at(fusedBufferId);
 
-    (*mMakeDecodingBatchInputOutput)(decoderInputBuffers, *mDecoderState, mModelConfig, getMaxNumSequences(),
-        *fusedRuntimeBuffers);
+    (*mMakeDecodingBatchInputOutput)(decoderInputBuffers, *mDecoderState, mModelConfig, *fusedRuntimeBuffers);
 
     auto decoderFinishEvent = mDecoder->forwardAsync(*mDecoderState, decoderInputBuffers);
 
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp b/cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp
index c26706f7444..2210d120f80 100644
--- a/cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp
+++ b/cpp/tensorrt_llm/pybind/batch_manager/algorithms.cpp
@@ -135,8 +135,7 @@ void tensorrt_llm::pybind::batch_manager::algorithms::initBindings(pybind11::mod
     py::class_<MakeDecodingBatchInputOutput>(m, MakeDecodingBatchInputOutput::name)
         .def(py::init())
         .def("__call__", &MakeDecodingBatchInputOutput::operator(), py::arg("decoder_input_buffers"),
-            py::arg("decoder_state"), py::arg("model_config"), py::arg("max_num_sequences"),
-            py::arg("fused_runtime_buffers") = std::nullopt)
+            py::arg("decoder_state"), py::arg("model_config"), py::arg("fused_runtime_buffers") = std::nullopt)
         .def("name", [](MakeDecodingBatchInputOutput const&) { return MakeDecodingBatchInputOutput::name; });
 
     py::class_<LogitsPostProcessor>(m, LogitsPostProcessor::name)
diff --git a/cpp/tests/runtime/gptDecoderBatchedTest.cpp b/cpp/tests/runtime/gptDecoderBatchedTest.cpp
index 9c9ed4a762e..bd6478d708b 100644
--- a/cpp/tests/runtime/gptDecoderBatchedTest.cpp
+++ b/cpp/tests/runtime/gptDecoderBatchedTest.cpp
@@ -27,7 +27,6 @@
 #include "tensorrt_llm/runtime/iBuffer.h"
 #include "tensorrt_llm/runtime/iTensor.h"
 #include "tensorrt_llm/runtime/modelConfig.h"
-#include "tensorrt_llm/runtime/runtimeKernels.h"
 #include "tensorrt_llm/runtime/worldConfig.h"
 
 #include <gmock/gmock-matchers.h>
@@ -353,7 +352,7 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector<SamplingConfig>& sa
 
     auto activeSlots = std::vector<SizeType32>(batchSize);
     std::iota(activeSlots.begin(), activeSlots.end(), 0);
-    tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(inputBuffers, activeSlots, decoderState, batchSize);
+    tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(inputBuffers, activeSlots, decoderState);
     decoder.forward(decoderState, inputBuffers);
 
     checkSequenceLengths(*decoderState.getSequenceLengths(), expectedLengths, manager);
@@ -482,7 +481,7 @@ void testDecoderWavefront(nvinfer1::DataType const dtype, std::vector<SamplingCo
 
         auto activeSlots = std::vector<SizeType32>(batchIdx + 1);
         std::iota(activeSlots.begin(), activeSlots.end(), 0);
-        tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(inputBuffers, activeSlots, decoderState, batchSize);
+        tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(inputBuffers, activeSlots, decoderState);
         decoder.forward(decoderState, inputBuffers);
 
         advanceSequenceLengths(
@@ -504,7 +503,7 @@ void testDecoderWavefront(nvinfer1::DataType const dtype, std::vector<SamplingCo
     auto finishedVec = getFinished(*decoderState.getFinishedSum(), samplingConfigs, manager);
     while (!std::all_of(expectedFinished.begin(), expectedFinished.end(), [](bool finish) { return finish; }))
     {
-        tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(inputBuffers, activeSlots, decoderState, batchSize);
+        tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(inputBuffers, activeSlots, decoderState);
         decoder.forward(decoderState, inputBuffers);
         finishedVec = getFinished(*decoderState.getFinishedSum(), samplingConfigs, manager);
 
@@ -639,7 +638,7 @@ void testDecoderDraft(nvinfer1::DataType const dtype, std::vector<SamplingConfig
 
     auto activeSlots = std::vector<SizeType32>(batchSize);
     std::iota(activeSlots.begin(), activeSlots.end(), 0);
-    tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(inputBuffers, activeSlots, decoderState, batchSize);
+    tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(inputBuffers, activeSlots, decoderState);
     decoder.forward(decoderState, inputBuffers);
     checkSequenceLengths(*decoderState.getSequenceLengths(), expectedLengths, manager);
     EXPECT_THAT(getFinished(*decoderState.getFinishedSum(), samplingConfigs, manager), ::testing::Each(false));

From 079793fe419dfb2d805961de7602baf5c3d7c16d Mon Sep 17 00:00:00 2001
From: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
Date: Fri, 18 Jul 2025 14:30:06 +0000
Subject: [PATCH 88/88] fixup! refactor: Set decoder inputs in
 DecoderInputBuffers

Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
---
 cpp/tests/batch_manager/guidedDecoderTest.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/tests/batch_manager/guidedDecoderTest.cpp b/cpp/tests/batch_manager/guidedDecoderTest.cpp
index 8358e987334..a9dc6ff785f 100644
--- a/cpp/tests/batch_manager/guidedDecoderTest.cpp
+++ b/cpp/tests/batch_manager/guidedDecoderTest.cpp
@@ -137,7 +137,7 @@ class GuidedDecoderTest : public ::testing::Test
                 decoderInputBuffers.decoderRequests.push_back(llmReq);
             }
         }
-        decoderInputBuffers.logits = mLogits;
+        decoderInputBuffers.decoderLogits = mLogits;
 
         // Context phase
         resetLogits();