diff --git a/docs/source/installation/linux.md b/docs/source/installation/linux.md
index 6f1383f3ef8..9bccba451c7 100644
--- a/docs/source/installation/linux.md
+++ b/docs/source/installation/linux.md
@@ -32,6 +32,7 @@
 ```bash
 pip3 install --upgrade pip setuptools && pip3 install tensorrt_llm
 ```
+ **This project will download and install additional third-party open source software projects. Review the license terms of these open source projects before use.**

2. Sanity check the installation by running the following in Python (tested on Python 3.12):

diff --git a/docs/source/performance/perf-overview.md b/docs/source/performance/perf-overview.md
index 3f55a4e1095..9e316617186 100644
--- a/docs/source/performance/perf-overview.md
+++ b/docs/source/performance/perf-overview.md
@@ -28,101 +28,119 @@
nvidia/Llama-3.1-405B-Instruct-FP4
```

#### Llama 3.3 70B FP4
+
| | GPU | B200 | | | |
-|:-----------------------------|:---|:----------|:----------|:----------|:----------|
-| | TP Size | 1 | 2 | 4 | 8 |
-| ISL, OSL| | | | | |
-| | | | | | |
-| 128, 128 | | 11,253.28 | 17,867.66 | 24,944.50 | 27,471.49 |
-| 128, 2048 | | 9,925.00 | 15,459.71 | 23,608.58 | 30,742.86 |
-| 128, 4096 | | 6,318.92 | 8,711.88 | 17,659.74 | 24,947.05 |
-| 500, 2000 | | 7,559.88 | 10,602.27 | 20,910.23 | 28,182.34 |
-| 1000, 1000 | | 6,866.96 | 10,838.01 | 16,567.86 | 19,991.64 |
-| 1000, 2000 | | 6,736.88 | 9,132.08 | 15,737.02 | 20,518.04 |
-| 1024, 2048 | | 6,580.56 | 8,767.45 | 15,722.55 | 20,437.96 |
-| 2048, 128 | | 1,375.49 | 1,610.69 | 2,707.58 | 3,717.82 |
-| 2048, 2048 | | 4,544.73 | 6,956.14 | 12,292.23 | 15,661.22 |
-| 5000, 500 | | 1,488.19 | 2,379.73 | 3,588.45 | 4,810.21 |
-| 20000, 2000 | | 580.96 | 1,043.58 | 1,957.84 | 3,167.30 |
+|:------------------------|:--------|:----------|:----------|:----------|:----------|
+| | TP Size | 1 | 2 | 4 | 8 |
+| ISL, OSL | | | | | |
+| | | | | | |
+| 128, 128 | | 10,994.48 | 17,542.11 | 24,667.31 | 27,272.27 |
+| 128, 2048 | | 9,580.46 | 15,432.35 | 23,568.12 | 31,174.31 |
+| 128, 4096 | | 6,418.39 | 9,841.53 | 17,808.76 | 25,229.25 |
+| 500, 2000 | | 7,343.32 | 11,850.57 | 20,709.67 | 28,038.78 |
+| 1000, 1000 | | 6,752.53 | 10,815.88 | 16,413.04 | 20,060.66 |
+| 1000, 2000 | | 6,670.07 | 9,830.73 | 15,597.49 | 20,672.37 |
+| 1024, 2048 | | 6,636.75 | 9,807.13 | 15,519.23 | 20,617.28 |
+| 2048, 128 | | 1,342.17 | 1,989.41 | 3,033.14 | 4,035.64 |
+| 5000, 500 | | 1,429.67 | 2,419.67 | 3,686.84 | 5,182.96 |
+| 20000, 2000 | | 629.77 | 1,177.01 | 2,120.66 | 3,429.03 |

#### Llama 3.1 405B FP4
-| | GPU | B200 |
-|:-----------------------------|:---|:----------|
-| | TP Size | 8 |
-| ISL, OSL| | |
-| | | |
-| 128, 128 | | 9,184.83 |
-| 128, 2048 | | 10,387.23 |
-| 128, 4096 | | 8,741.80 |
-| 500, 2000 | | 9,242.34 |
-| 1000, 1000 | | 7,565.50 |
-| 1000, 2000 | | 7,696.76 |
-| 1024, 2048 | | 7,568.93 |
-| 2048, 128 | | 953.57 |
-| 2048, 2048 | | 6,092.32 |
-| 5000, 500 | | 1,332.22 |
-| 20000, 2000 | | 961.58 |
+
+| | GPU | B200 | |
+|:------------------------|:------- |:---------|:----------|
+| | TP Size | 4 | 8 |
+| ISL, OSL | | | |
+| | | | |
+| 128, 128 | | 6,163.81 | 9,002.90 |
+| 128, 2048 | | 7,081.21 | 10,288.28 |
+| 128, 4096 | | 6,028.37 | 8,713.77 |
+| 500, 2000 | | 5,858.75 | 9,125.86 |
+| 1000, 1000 | | 4,848.00 | 7,582.97 |
+| 1000, 2000 | | 5,375.25 | 7,626.28 |
+| 1024, 2048 | | 5,345.70 | 7,464.03 |
+| 2048, 128 | | 693.55 | 1,086.56 |
+| 5000, 500 | | 947.49 | 1,532.45 |
+| 20000, 2000 | | 641.11 | 1,097.84 |

### FP8 Models:
```
nvidia/Llama-3.1-8B-Instruct-FP8
-nvidia/Llama-3.1-70B-Instruct-FP8
+nvidia/Llama-3.3-70B-Instruct-FP8
nvidia/Llama-3.1-405B-Instruct-FP8
+nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8
```

#### Llama 3.1 8B FP8
-| | GPU | H200 141GB HBM3 | H100 80GB HBM3 |
+
+| | GPU | H200 141GB HBM3 | H100 80GB HBM3 |
|:-----------------------------|:---|:------------------|:-----------------|
-| | TP Size | 1 | 1 |
+| | TP Size | 1 | 1 |
| ISL, OSL | | | |
| | | | |
-| 128, 128 | | 28,447.38 | 27,568.68 |
-| 128, 2048 | | 23,294.74 | 22,003.62 |
-| 128, 4096 | | 17,481.48 | 13,640.35 |
-| 500, 2000 | | 21,462.57 | 17,794.39 |
-| 1000, 1000 | | 17,590.60 | 15,270.02 |
-| 1000, 2000 | | 17,139.51 | 13,850.22 |
-| 1024, 2048 | | 16,970.63 | 13,374.15 |
-| 2048, 128 | | 3,531.33 | 3,495.05 |
-| 2048, 2048 | | 12,022.38 | 9,653.67 |
-| 5000, 500 | | 3,851.65 | 3,371.16 |
-| 20000, 2000 | | 1,706.06 | 1,340.92 |
-
-#### Llama 3.1 70B FP8
-| | GPU | H200 141GB HBM3 | | | | H100 80GB HBM3 | | | |
+| 128, 128 | | 27,970.14 | 27,688.36 |
+| 128, 2048 | | 23,326.38 | 21,841.15 |
+| 128, 4096 | | 17,508.51 | 13,730.89 |
+| 500, 2000 | | 21,390.41 | 17,833.34 |
+| 1000, 1000 | | 17,366.89 | 15,270.62 |
+| 1000, 2000 | | 16,831.31 | 13,798.08 |
+| 1024, 2048 | | 16,737.03 | 13,385.50 |
+| 2048, 128 | | 3,488.03 | 3,414.67 |
+| 5000, 500 | | 3,813.69 | 3,394.54 |
+| 20000, 2000 | | 1,696.66 | 1,345.42 |
+
+#### Llama 3.3 70B FP8
+
+| | GPU | H200 141GB HBM3 | | | | H100 80GB HBM3 | | | |
|:-----------------------------|:---|:------------------|:---------|:----------|:----------|:-----------------|:---------|:----------|:----------|
-| | TP Size | 1 | 2 | 4 | 8 | 1 | 2 | 4 | 8 |
-| ISL, OSL| | | | | | | | | |
+| | TP Size | 1 | 2 | 4 | 8 | 1 | 2 | 4 | 8 |
+| ISL, OSL | | | | | | | | | |
| | | | | | | | | | |
-| 128, 128 | | 3,657.58 | 6,477.50 | 10,466.04 | 15,554.57 | 3,191.27 | 6,183.41 | 10,260.68 | 14,686.01 |
-| 128, 2048 | | 4,351.07 | 8,450.31 | 13,438.71 | 20,750.58 | 745.19 | 5,822.02 | 11,442.01 | 17,463.99 |
-| 128, 4096 | | 2,696.61 | 5,598.92 | 11,524.93 | 16,634.90 | | 3,714.87 | 8,209.91 | 12,598.55 |
-| 500, 2000 | | 3,475.58 | 6,712.35 | 12,332.32 | 17,311.28 | | 4,704.31 | 10,278.02 | 14,630.41 |
-| 1000, 1000 | | 2,727.42 | 5,097.36 | 8,698.15 | 12,794.92 | 734.67 | 4,191.26 | 7,427.35 | 11,082.48 |
-| 1000, 2000 | | 2,913.54 | 5,841.15 | 9,016.49 | 13,174.68 | 526.31 | 3,920.44 | 7,590.35 | 11,108.11 |
-| 1024, 2048 | | 2,893.02 | 5,565.28 | 9,017.72 | 13,117.34 | 525.43 | 3,896.14 | 7,557.32 | 11,028.32 |
-| 2048, 128 | | 433.30 | 772.97 | 1,278.26 | 1,947.33 | 315.90 | 747.51 | 1,240.12 | 1,840.12 |
-| 2048, 2048 | | 1,990.25 | 3,822.83 | 7,068.68 | 10,529.06 | 357.98 | 2,732.86 | 5,640.31 | 8,772.88 |
-| 5000, 500 | | 543.88 | 1,005.81 | 1,714.77 | 2,683.22 | 203.27 | 866.77 | 1,571.92 | 2,399.78 |
-| 20000, 2000 | | 276.99 | 618.01 | 1,175.35 | 2,021.08 | | 408.43 | 910.77 | 1,568.84 |
+| 128, 128 | | 3,605.47 | 6,427.69 | 10,407.42 | 15,434.37 | 3,128.33 | 6,216.91 | | |
+| 128, 2048 | | 4,315.80 | 8,464.03 | 13,508.59 | 20,759.72 | 756.42 | 5,782.57 | 11,464.94 | 17,424.32 |
+| 128, 4096 | | 2,701.17 | 5,573.55 | 11,458.56 | 16,668.75 | | 3,868.37 | 8,206.39 | 12,624.61 |
+| 500, 2000 | | 3,478.76 | 6,740.06 | 12,200.18 | | | 4,684.06 | 9,903.53 | 14,553.93 |
+| 1000, 1000 | | 2,744.32 | 5,119.72 | 8,685.44 | 12,744.51 | 742.14 | 4,247.19 | 7,435.65 | 11,018.81 |
+| 1000, 2000 | | 2,896.44 | 5,847.26 | 9,031.21 | 13,141.17 | 533.74 | 3,866.53 | 7,611.12 | 11,139.22 |
+| 1024, 2048 | | 2,874.18 | 5,568.61 | 8,946.71 | 13,082.62 | 530.16 | 3,796.68 | 7,575.24 | 11,004.31 |
+| 2048, 128 | | 435.90 | 772.67 | 1,264.76 | | | 736.89 | 1,213.33 | 1,839.22 |
+| 2048, 2048 | | | | | 10,412.85 | | | | |
+| 5000, 500 | | 545.96 | 997.15 | 1,698.22 | 2,655.28 | 204.94 | 862.91 | 1,552.68 | 2,369.84 |
+| 20000, 2000 | | 276.66 | 620.33 | 1,161.29 | 1,985.85 | | 416.13 | 903.66 | 1,554.10 |

#### Llama 3.1 405B FP8
-| | GPU | H200 141GB HBM3 | H100 80GB HBM3 |
+
+| | GPU | H200 141GB HBM3 | H100 80GB HBM3 |
|:-----------------------------|:---|:------------------|:-----------------|
-| | TP Size | 8 | 8 |
+| | TP Size | 8 | 8 |
| ISL, OSL | | | |
| | | | |
-| 128, 128 | | 3,800.11 | 3,732.40 |
-| 128, 2048 | | 5,661.13 | 4,572.23 |
-| 128, 4096 | | 5,167.18 | 2,911.42 |
-| 500, 2000 | | 4,854.29 | 3,661.85 |
-| 1000, 1000 | | 3,332.15 | 2,963.36 |
-| 1000, 2000 | | 3,682.15 | 3,253.17 |
-| 1024, 2048 | | 3,685.56 | 3,089.16 |
-| 2048, 128 | | 453.42 | 448.89 |
-| 2048, 2048 | | 3,055.73 | 2,139.94 |
-| 5000, 500 | | 656.11 | 579.14 |
-| 20000, 2000 | | 514.02 | 370.26 |
+| 128, 2048 | | 5,567.87 | |
+| 128, 4096 | | 5,136.85 | |
+| 500, 2000 | | 4,787.61 | 3,673.91 |
+| 1000, 1000 | | 3,286.30 | 3,012.22 |
+| 1000, 2000 | | 3,636.76 | 3,262.20 |
+| 1024, 2048 | | 3,618.66 | 3,109.70 |
+| 2048, 128 | | 443.10 | 449.02 |
+| 5000, 500 | | 645.46 | |
+| 20000, 2000 | | | 372.12 |
+
+#### Llama 4 Maverick FP8
+
+| | GPU | H200 141GB HBM3 | H100 80GB HBM3 |
+|:-----------------------------|:---|:------------------|:-----------------|
+| | TP Size | 8 | 8 |
+| ISL, OSL | | | |
+| | | | |
+| 128, 2048 | | 27,543.87 | |
+| 128, 4096 | | 18,541.01 | 11,163.12 |
+| 500, 2000 | | 21,117.34 | |
+| 1000, 2000 | | | 10,556.00 |
+| 1024, 2048 | | 16,859.45 | 11,584.33 |
+| 2048, 128 | | 4,364.06 | 3,832.38 |
+| 2048, 2048 | | 12,800.89 | |
+| 5000, 500 | | 5,128.60 | |
+| 20000, 2000 | | 1,764.27 | 1,400.79 |

## Reproducing Benchmarked Results

@@ -198,6 +216,8 @@ a model name (HuggingFace reference or path to a local model), a [generated data
trtllm-bench --model $model_name throughput --dataset $dataset_file --backend pytorch --extra_llm_api_options $llm_options
```
+The data collected for the v0.20 benchmarks was run with the following file:
+
`llm_options.yml`
```yaml
cuda_graph_config:
@@ -220,7 +240,7 @@ cuda_graph_config:
    - 8192
```

-In majority of cases, we also use a higher KV cache percentage by setting `--kv_cache_free_gpu_mem_fraction 0.95` in the benchmark command. This allows us to obtain better performance than the default setting of `0.90`. We fall back to `0.90` if we hit an out of memory issue.
+In a majority of cases, we also use a higher KV cache percentage by setting `--kv_cache_free_gpu_mem_fraction 0.95` in the benchmark command. This allows us to obtain better performance than the default setting of `0.90`. We fall back to `0.90` if we hit an out of memory issue.

The results will be printed to the terminal upon benchmark completion. For example,

diff --git a/docs/source/quick-start-guide.md b/docs/source/quick-start-guide.md
index b3027e0737a..12b9a5ec037 100644
--- a/docs/source/quick-start-guide.md
+++ b/docs/source/quick-start-guide.md
@@ -8,13 +8,15 @@ This is the starting point to try out TensorRT-LLM. Specifically, this Quick Sta
There are multiple ways to install and run TensorRT-LLM. For most users, the options below should be ordered from simple to complex. The approaches are equivalent in terms of the supported features.
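Putting the benchmark options above together, a complete invocation with the higher KV cache fraction might look like the following sketch. This is not part of the patch itself: `$model_name` and `$dataset_file` are the same placeholders used in the documentation above, and the exact flag placement should be checked against `trtllm-bench --help` for your installed version.

```bash
# Sketch only: combines the documented throughput command, the llm_options.yml
# extra-options file, and the higher KV cache fraction mentioned above.
trtllm-bench --model $model_name throughput \
  --dataset $dataset_file \
  --backend pytorch \
  --extra_llm_api_options llm_options.yml \
  --kv_cache_free_gpu_mem_fraction 0.95
```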
+Note: **This project will download and install additional third-party open source software projects. Review the license terms of these open source projects before use.**
+
1. [](installation/containers)
1. Pre-built release wheels on [PyPI](https://pypi.org/project/tensorrt-llm) (see [](installation/linux))
1. [Building from source](installation/build-from-source-linux)

-The following examples can most easily be executed using the prebuilt [Docker release container available on NGC](https://registry.ngc.nvidia.com/orgs/nvstaging/teams/tensorrt-llm/containers/release) (see also [release.md](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docker/release.md) on GitHub).
+The following examples can most easily be executed using the prebuilt [Docker release container available on NGC](https://registry.ngc.nvidia.com/orgs/nvstaging/teams/tensorrt-llm/containers/release) (see also [release.md](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docker/release.md) on GitHub). Ensure you run these commands as a user with sufficient permissions to use Docker.

## LLM API

@@ -92,7 +94,7 @@ For detailed examples and command syntax, refer to the [trtllm-serve](commands/t
2. Open a new terminal and use the following command to directly attach to the running container:

-```bash
+```bash
docker exec -it bash
```

diff --git a/docs/source/reference/support-matrix.md b/docs/source/reference/support-matrix.md
index 37fada2c0de..0c59baf992b 100644
--- a/docs/source/reference/support-matrix.md
+++ b/docs/source/reference/support-matrix.md
@@ -25,6 +25,8 @@ TensorRT-LLM optimizes the performance of a range of well-known models on NVIDIA
| `Qwen2ForRewardModel` | Qwen2-based | `Qwen/Qwen2.5-Math-RM-72B` | L |
| `Qwen2VLForConditionalGeneration` | Qwen2-VL | `Qwen/Qwen2-VL-7B-Instruct` | L + V |
| `Qwen2_5_VLForConditionalGeneration` | Qwen2.5-VL | `Qwen/Qwen2.5-VL-7B-Instruct` | L + V |
+| `Qwen3ForCausalLM` | Qwen3 | `Qwen/Qwen3-8B` | L |
+| `Qwen3MoeForCausalLM` | Qwen3MoE | `Qwen/Qwen3-30B-A3B` | L |

Note:
- L: Language only
@@ -72,7 +74,7 @@ Note:
- [mT5](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/enc_dec)
- [OPT](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/contrib/opt)
- [Phi-1.5/Phi-2/Phi-3](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/phi)
-- [Qwen/Qwen1.5/Qwen2](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/qwen)
+- [Qwen/Qwen1.5/Qwen2/Qwen3](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/qwen)
- [Qwen-VL](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/qwenvl)
- [RecurrentGemma](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/recurrentgemma)
- [Replit Code](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/contrib/mpt) [^replitcode]

diff --git a/docs/source/release-notes.md b/docs/source/release-notes.md
index bb663aba7d2..dee84ecfde5 100644
--- a/docs/source/release-notes.md
+++ b/docs/source/release-notes.md
@@ -4,6 +4,152 @@
All published functionality in the Release Notes has been fully tested and verified with known limitations documented. To share feedback about this release, access our [NVIDIA Developer Forum](https://forums.developer.nvidia.com/).
+## TensorRT-LLM Release 0.21.0
+
+### Key Features and Enhancements
+- **Model Support**
+  - Added Gemma3 VLM support
+- **Features**
+  - Added large-scale EP support
+  - Integrated NIXL into the communication layer of the disaggregated service
+  - Added Fabric Memory support for KV cache transfer
+  - Added MCP support in ScaffoldingLLM
+  - Added support for w4a8_mxfp4_fp8 quantization
+  - Added support for FP8 rowwise quantization
+  - Added generation logits support in the TRTLLM Sampler
+  - Added log probs support in the TRTLLM Sampler
+  - Optimized TRTLLM Sampler performance for the single-beam, single-step case
+  - Enabled disaggregated serving for Qwen-3
+  - Added EAGLE3 support for Qwen-3
+  - Fused finalize and allreduce for the Qwen-MoE model
+  - Refactored the fused MoE module
+  - Added support for chunked attention on Blackwell and Hopper
+  - Introduced sliding-window attention kernels for the generation phase on Blackwell
+  - Updated DeepSeek FP8 TRT-LLM Gen cubins to improve performance in large batch size scenarios
+  - Added FP8 block-scale GEMM support on SM89
+  - Enabled the overlap scheduler between draft forwards
+  - Added piecewise CUDA graph support for MLA
+  - Added model-agnostic one-engine EAGLE3
+  - Enabled Finalize + Allreduce + add + rmsnorm fusion
+  - Integrated TRT-LLM Gen FP8 block scale MoE with the PyTorch workflow kernel autotuner
+  - Added support for EAGLE3 + disaggregated serving in the two-model speculative decoding flow
+  - Validated Llama 3.1 models on H200 NVL
+- Benchmark:
+  - Added an all_reduce.py benchmark script for testing
+  - Added beam width to the trtllm-bench latency command
+  - Fixed trtllm-bench iter_stats and cuda_graph_batch_sizes errors
+  - Enabled trtllm-bench to run LoRA and added basic end-to-end perf testing capability for LoRA
+  - Supported post_proc for trtllm-bench
+  - Added a no_kv_cache_reuse option and streaming support for trtllm serve bench
+
+### Infrastructure Changes
+- The base Docker image for TensorRT-LLM is updated to `nvcr.io/nvidia/pytorch:25.05-py3`.
+- The base Docker image for TensorRT-LLM Backend is updated to `nvcr.io/nvidia/tritonserver:25.05-py3`.
+- The dependent public PyTorch version is updated to 2.7.1.
+- The dependent TensorRT version is updated to 10.11.
+- The dependent NVIDIA ModelOpt version is updated to 0.31.
+- The dependent NCCL version is updated to 2.27.5.
+
+### API Changes
+- Set _AutoDeployLlmArgs as the primary config object
+- Removed the decoder request from the decoder interface
+- Enhanced the torch_compile_config in LLM args
+- Removed the redundant use_kv_cache field from PytorchConfig
+- Moved allreduce_strategy from the committed API to reference
+
+### Fixed Issues
+- Fixed a disaggregated service hang when MNNVL two-shot AllReduce is enabled (#4678)
+- Fixed the EP load balancer with the MTP layer and route offset by EP rank (#4767)
+- Fixed CUDA graph padding for speculative decoding (#4853)
+- Fixed a Llama 4 long-context issue (#4809)
+- Fixed the max_num_sequences calculation with overlap scheduling (#4532)
+- Fixed chunked prefill + overlap scheduling (#5761)
+- Fixed a trtllm-bench hang caused by LLM API IPC (#4798)
+- Fixed an index-out-of-bounds error in speculative decoding (#5954)
+- Fixed an MTP illegal memory access in CUDA graph warmup (#5947)
+- Fixed a "no free slots" error with speculative decoding + disaggregated serving (#5975)
+- Fixed an off-by-one attention window size for Gemma3 1B (#5564)
+
+### Known Issues
+- accuracy/test_cli_flow::TestGpt2::test_beam_search_large is broken.
+- Enabling disaggregated serving, MTP, and the overlap scheduler at the same time can lead to accuracy problems.
+
+## TensorRT-LLM Release 0.20.0
+
+### Key Features and Enhancements
+- **Model Support**
+  - Added Qwen3 support. Refer to the “Qwen3” section in `examples/models/core/qwen/README.md`.
+  - Added HyperCLOVAX-SEED-Vision support in the PyTorch flow. Refer to `examples/models/contrib/hyperclovax/README.md`.
+  - Added Dynasor-CoT in scaffolding examples. Refer to `examples/scaffolding/contrib/Dynasor/README.md`.
+  - Added Mistral Small 3.1 24B VLM support in the TRT workflow
+  - Added Gemma3-1b-it support in the PyTorch workflow
+  - Added Nemotron-H model support
+  - Added Eagle-3 support for LLAMA4
+- **PyTorch workflow**
+  - Added LoRA support
+  - Added return logits support
+  - Adopted the new logprob definition in the PyTorch flow
+  - Enabled per-request stats with the PyTorch backend
+  - Enabled LogitsProcessor in the PyTorch backend
+- Benchmark:
+  - Added beam width to the low latency benchmark.
+  - Fixed trtllm-bench iter_stats and cuda_graph_batch_sizes errors.
+  - Removed the deprecated Python runtime benchmark.
+  - Added benchmark support for scaffolding.
+- Multimodal models
+  - Added support in trtllm-serve
+  - Added support in trtllm-bench; the support is limited to image-only for now
+- Supported DeepSeek-R1 W4A8 on Hopper
+- Added RTX Pro 6000 support on a single GPU
+- Integrated the Llama4 input processor
+- Added CGA reduction FMHA kernels on Blackwell
+- Enabled chunked context for FlashInfer
+- Supported KV cache reuse for MLA
+- Added piecewise CUDA graph support
+- Supported multiple LoRA adapters and TP
+- Added a KV cache-aware router for disaggregated serving
+- Added unfused attention for native support
+- Added a group_rms_norm kernel to normalize multiple inputs in a single operator
+- Added a smart router for the MoE module
+- Added head size 72 support for the QKV preprocessing kernel
+- Added MNNVL MoE A2A support
+- Optimized large embedding tables in multimodal models
+- Supported Top-K logprobs and prompt_logprobs in the LLM API
+- Enabled the overlap scheduler in the TRT workflow via the executor API
+
+### Infrastructure Changes
+- **The TRT-LLM team formally releases the docker image on [NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags).**
+- The pre-built TensorRT-LLM wheel on PyPI is now linked against PyTorch 2.7.0, which uses the CXX11 ABI
+- The dependent TensorRT version is updated to 10.10.0
+- The dependent CUDA version is updated to 12.9.0
+- The dependent public PyTorch version is updated to 2.7.0
+- The dependent NVIDIA ModelOpt version is updated to 0.29.0
+- The dependent NCCL version is maintained at 2.25.1
+- Open-sourced XQA kernels
+- The dependent datasets version was upgraded to 3.1.0
+- Migrated the Triton backend into the TensorRT-LLM repo as a TensorRT-LLM submodule
+- Downgraded the GCC toolset version from 13 to 11
+
+### API Changes
+- [Breaking Change]: Enabled scheduling overlap by default
+- Removed the deprecated GptSession/V1 from the TRT workflow
+- Set _AutoDeployLlmArgs as the primary config object
+- Allowed overriding CLI arguments with a YAML file in trtllm-serve
+- Introduced a multimodal embedding field in LlmRequest
+
+### Fixed Issues
+- Fixed a hang when the context server doesn't have enough capacity for KV cache (#3095)
+- Fixed C++ decoder synchronization in PyTorch (#3106)
+- Fixed a bug related to creating a CUDA stream as a default parameter, which would be initialized during import (#3764)
+- Fixed an attention DP bug on the Qwen3 MoE model (#4141)
+- Fixed an illegal memory access when running LLaMA 4 with CUDA Graph enabled (#4101)
+- Reset planned states to avoid a memory leak in TrtllmAttentionWrapper (#4227)
+
+### Known Issues
+- Multi-GPU model support on RTX Pro 6000
+
## TensorRT-LLM Release 0.19.0

diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
index b1653951ac5..c8523deea2e 100644
--- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py
+++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
@@ -38,6 +38,7 @@
from tqdm import tqdm
from transformers import PretrainedConfig

+from tensorrt_llm._ipc_utils import can_access_peer
from tensorrt_llm._utils import get_sm_version
from tensorrt_llm.functional import PositionEmbeddingType
from tensorrt_llm.llmapi.utils import enable_llm_debug
@@ -602,6 +603,7 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig],
        self.enable_attention_dp = mapping.enable_attention_dp

        self.mlp_tp_size = mapping.tp_size
+        self.is_p2p_supported = can_access_peer(mapping)

        self.fusion_config = EagerFusionConfig()
        self.enable_fusion = os.environ.get(
@@ -796,7 +798,7 @@ def _run_MoE(hidden_states, hidden_states_fp4, do_finalize):
                not (hidden_states.shape[0] <= self.moe_allreduce.max_token
                     and self.fusion_config.POST_MOE_FUSION
                     and self.model_config.moe_backend == "TRTLLM"
-                     and self.mlp.experts.has_nvfp4))
+                     and self.mlp.experts.has_nvfp4 and self.is_p2p_supported))

        hidden_states = _run_MoE(hidden_states,
                                 hidden_states_fp4=None,

diff --git a/tensorrt_llm/_torch/pyexecutor/model_engine.py b/tensorrt_llm/_torch/pyexecutor/model_engine.py
index 1a22caf2d7d..3e364ac9a91 100644
--- a/tensorrt_llm/_torch/pyexecutor/model_engine.py
+++ b/tensorrt_llm/_torch/pyexecutor/model_engine.py
@@ -1216,7 +1216,8 @@ def _prepare_tp_inputs(
            if next_draft_tokens_device is None or request.is_dummy or request.py_batch_idx is None:
                # get token ids, including input token ids and draft token ids. For these dummy requests,
                # no need to copy the token ids.
-                if not request.is_dummy:
+                if not (request.is_attention_dp_dummy
+                        or request.is_cuda_graph_dummy):
                    input_ids.append(request.get_last_tokens(0))
                    input_ids.extend(request.py_draft_tokens)
                    draft_tokens.extend(request.py_draft_tokens)

diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py
index 5b440e8b90e..934813aa4c4 100644
--- a/tensorrt_llm/llmapi/llm.py
+++ b/tensorrt_llm/llmapi/llm.py
@@ -544,14 +544,6 @@ def _check_arguments(self, prompt_len: int, query_len: int,
                raise ValueError(
                    f"PyTorch backend currently only supports `logprobs=1`. Received `logprobs={sampling_params.logprobs}` (Top{sampling_params.logprobs} logprobs). Please set `logprobs=1` in `sampling_params` instead."
                )
-            # Check prompt length and query length against max_num_tokens to filter illegal requests.
-            # Skip check for gen-only requests
-            if self.args.backend == "pytorch" and not self.args.enable_chunked_prefill and not is_gen_only:
-                max_num_tokens = self.args.max_num_tokens
-                if max_num_tokens and prompt_len / self.args.parallel_config.cp_size + query_len > max_num_tokens:
-                    raise ValueError(
-                        f"The sum of prompt length ({prompt_len/self.args.parallel_config.cp_size}), query length ({query_len}) should not exceed "
-                        f"max_num_tokens ({max_num_tokens})")
            return

        build_config = self.args.build_config
@@ -568,7 +560,7 @@ def _check_arguments(self, prompt_len: int, query_len: int,
            (sampling_params.max_tokens or 0) > max_seq_len):
            raise ValueError(
                f"The sum of prompt length ({prompt_len/self.args.parallel_config.cp_size}) and query length ({query_len}) max_tokens ({sampling_params.max_tokens}) should not exceed "
-                f"max_seq_len ({max_seq_len})")
+                f"max_seq_len ({build_config.max_seq_len})")

        if sampling_params.use_beam_search and sampling_params.best_of > build_config.max_beam_width:
            if sampling_params.n == sampling_params.best_of:

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 61f8c199e9d..fb46cd337e8 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -647,7 +647,7 @@ def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
        if torch_compile and mtp_nextn > 0:
            pytest.skip("https://nvbugs/5252313")
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
        torch_compile_config = TorchCompileConfig(
            enable_fullgraph=True,
            enable_piecewise_cuda_graph=cuda_graph,
@@ -687,7 +687,7 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
            pytest.skip("https://nvbugs/5252313")
        if torch_compile and pp_size > 1:
            pytest.skip("PP with torch.compile is not supported yet.")
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
        torch_compile_config = TorchCompileConfig(
            enable_fullgraph=True,
            enable_piecewise_cuda_graph=cuda_graph and not attention_dp,
@@ -725,7 +725,7 @@ def test_fp8_block_scales(self, mtp, fp8kv, attention_dp, cuda_graph,
                              overlap_scheduler, torch_compile):
        if torch_compile and mtp != "disable":
            pytest.skip("https://nvbugs/5252313")
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
        torch_compile_config = TorchCompileConfig(
            enable_fullgraph=True,
            enable_piecewise_cuda_graph=cuda_graph,
@@ -813,7 +813,7 @@ def test_cute_dsl_fp8_block_scales(
    @pytest.mark.skip_device_not_contain(["H100"])
    @parametrize_with_ids("mtp_nextn", [0, 2])
    def test_fp8_block_scales_cuda_graph_padding(self, mtp_nextn):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
        mtp_config = None
        if mtp_nextn > 0:
            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
@@ -838,7 +838,7 @@ def test_fp8_block_scales_cuda_graph_padding(self, mtp_nextn):
    @parametrize_with_ids("attention_dp", [False, True])
    def test_fp8_block_scales_cuda_graph_padding_4gpus(self, mtp_nextn,
                                                       attention_dp):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
        mtp_config = None
        if mtp_nextn > 0:
            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
@@ -879,7 +879,7 @@ def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
            pytest.skip("https://nvbugs/5252313")
        if torch_compile and pp_size > 1:
            pytest.skip("PP with torch.compile is not supported yet.")
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
        torch_compile_config = TorchCompileConfig(
            enable_fullgraph=True,
            enable_piecewise_cuda_graph=cuda_graph and not attention_dp,
@@ -979,7 +979,7 @@ def test_cute_dsl_fp8_block_scales_4gpus(
    @pytest.mark.skip_less_device(4)
    @pytest.mark.skip_device_not_contain(["H100", "H200"])
    def test_fp8_block_scales_4gpus_static_eplb(self):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
        num_experts = 72
        num_slots = 80
@@ -1070,7 +1070,7 @@ def test_nvfp4(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler,
                   torch_compile, mtp_nextn, moe_backend):
        if torch_compile and mtp_nextn > 0:
            pytest.skip("https://nvbugs/5252313")
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
        torch_compile_config = TorchCompileConfig(
            enable_fullgraph=True,
            enable_piecewise_cuda_graph=cuda_graph,
@@ -1121,7 +1121,7 @@ def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
            pytest.skip("PP with torch.compile is not supported yet.")
        if moe_backend == "TRTLLM" and get_sm_version() == 120:
            pytest.skip("MOE TRTLLM backend does not support SM version 120")
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
        # Picewise Cuda Graph cannot be enabled for nvfp4 attention dp.
        torch_compile_config = TorchCompileConfig(
            enable_fullgraph=True,
@@ -1178,7 +1178,7 @@ def test_no_kv_cache_reuse(self, quant_dtype, mtp_nextn, fp8kv,
        elif quant_dtype == "nvfp4":
            model_path = f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only"

-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9,
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75,
                                        enable_block_reuse=False)
        pytorch_config = dict(
            disable_overlap_scheduler=not overlap_scheduler,

diff --git a/tests/integration/defs/triton_server/test_triton.py b/tests/integration/defs/triton_server/test_triton.py
index c25d82d271b..44b95dddf5f 100644
--- a/tests/integration/defs/triton_server/test_triton.py
+++ b/tests/integration/defs/triton_server/test_triton.py
@@ -508,7 +508,7 @@ def test_cpp_unit_tests(tritonserver_test_root, test_name, llm_root):

    run_shell_command(
        f"cd {llm_root}/triton_backend/inflight_batcher_llm/build && "
-        f"cmake .. -DTRTLLM_DIR={llm_root} -DCMAKE_INSTALL_PREFIX=install/ -DBUILD_TESTS=ON -DUSE_CXX11_ABI=ON "
+        f"cmake .. -DTRTLLM_DIR={llm_root} -DCMAKE_INSTALL_PREFIX=install/ -DBUILD_TESTS=ON -DUSE_CXX11_ABI=ON -DTRITON_COMMON_REPO_TAG=r25.05 -DTRITON_CORE_REPO_TAG=r25.05 -DTRITON_THIRD_PARTY_REPO_TAG=r25.05 -DTRITON_BACKEND_REPO_TAG=r25.05 "
        "&& make -j8 install", llm_root)
    # Run the cpp unit tests

diff --git a/tests/integration/test_lists/test-db/l0_a100.yml b/tests/integration/test_lists/test-db/l0_a100.yml
index d46287d629e..b8a846ccff6 100644
--- a/tests/integration/test_lists/test-db/l0_a100.yml
+++ b/tests/integration/test_lists/test-db/l0_a100.yml
@@ -14,6 +14,7 @@ l0_a100:
      backend: "pytorch"
      tests:
      - unittest/llmapi/test_llm_pytorch.py
+      - unittest/llmapi/test_mpi_session.py
  # generic tests
- condition:
    ranges:
      system_gpu_count:
@@ -27,7 +28,7 @@ l0_a100:
      stage: post_merge
      backend: tensorrt
      tests:
-      - unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/llmapi/test_mpi_session.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others
+      - unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others
      - unittest/llmapi/test_llm_models.py -m "part1"
      - unittest/llmapi/test_llm_models.py -m "not (part0 or part1)"
      - unittest/llmapi/test_llm.py -m "part0"

diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml
index bbe1c1b8a27..0aa3e9e5fb8 100644
--- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml
+++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml
@@ -15,5 +15,6 @@ l0_gb200_multi_nodes:
  tests:
  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] TIMEOUT (180)
  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] TIMEOUT (180)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] TIMEOUT (180)
  - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (180)
  - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (180)

diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index cc790ce4eb3..346aab5adf5 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -83,7 +83,7 @@ full:B200_PCIe/unittest/trt/model/test_mamba.py SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_py_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (Disable for Blackwell)
full:B200_PCIe/unittest/bindings SKIP (Disable for Blackwell)
-full:B200_PCIe/unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/llmapi/test_mpi_session.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others SKIP (Disable for Blackwell)
+full:B200_PCIe/unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others SKIP (Disable for Blackwell)
full:B200_PCIe/unittest/trt/quantization/test_weight_only_quant_matmul.py SKIP (Disable for Blackwell)
full:B200_PCIe/unittest/trt/quantization/test_weight_only_groupwise_quant_matmul.py SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoder2-int8-float16] SKIP (Disable for Blackwell)
@@ -155,7 +155,7 @@ full:B200/unittest/trt/model/test_mamba.py SKIP (Disable for Blackwell)
full:B200/examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (Disable for Blackwell)
full:B200/examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_py_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (Disable for Blackwell)
full:B200/unittest/bindings SKIP (Disable for Blackwell)
-full:B200/unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/llmapi/test_mpi_session.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others SKIP (Disable for Blackwell)
+full:B200/unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others SKIP (Disable for Blackwell)
full:B200/unittest/trt/quantization/test_weight_only_quant_matmul.py SKIP (Disable for Blackwell)
full:B200/unittest/trt/quantization/test_weight_only_groupwise_quant_matmul.py SKIP (Disable for Blackwell)
full:B200/examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoder2-int8-float16] SKIP (Disable for Blackwell)

diff --git a/tests/unittest/_torch/test_fp8_per_tensor_scale_tllmg_gemm.py b/tests/unittest/_torch/test_fp8_per_tensor_scale_tllmg_gemm.py
index 6f3a7e6320d..df8214c4a55 100644
--- a/tests/unittest/_torch/test_fp8_per_tensor_scale_tllmg_gemm.py
+++ b/tests/unittest/_torch/test_fp8_per_tensor_scale_tllmg_gemm.py
@@ -100,7 +100,7 @@ def test_fp8_block_scale_gemm(dtype, m, k, n, inference_mode):
    output_expected = output_expected.to(torch.float)
    diff = calc_diff(output, output_expected)
    assert diff < 1e-3
-    torch.testing.assert_close(output, output_expected, atol=1e-3, rtol=1e-3)
+    torch.testing.assert_close(output, output_expected, atol=1e-2, rtol=1e-2)


@pytest.mark.skipif(

diff --git a/tests/unittest/llmapi/_test_remote_mpi_session.sh b/tests/unittest/llmapi/_test_remote_mpi_session.sh
index 01eff4b2725..792ef70dc85 100644
--- a/tests/unittest/llmapi/_test_remote_mpi_session.sh
+++ b/tests/unittest/llmapi/_test_remote_mpi_session.sh
@@ -7,6 +7,6 @@ echo "Starting remote MPI session test with task: $task"
echo "MPI processes: 2"

# Add timeout to prevent infinite hanging
-timeout 60 mpirun -np 2 trtllm-llmapi-launch python3 _run_mpi_comm_task.py --task_type $task
+timeout 60 mpirun --allow-run-as-root -np 2 trtllm-llmapi-launch python3 _run_mpi_comm_task.py --task_type $task

echo "Remote MPI session test completed"

diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py
index bda6fdf3fed..ec9bac2c5b6 100644
--- a/tests/unittest/llmapi/test_llm.py
+++ b/tests/unittest/llmapi/test_llm.py
@@ -2090,36 +2090,24 @@ def success_path():
    success_path()


-def _test_llm_capture_request_error(pytorch_backend: bool, tp_size: int = 1):
-    llm_args_extra = {}
-    if pytorch_backend:
-        LLM_CLASS = LLM_torch
-        llm_args_extra["max_num_tokens"] = 64
-    else:
-        LLM_CLASS = LLM
-        build_config = BuildConfig()
-        build_config.max_num_tokens = 64
-        llm_args_extra["fast_build"] = True
-        llm_args_extra["build_config"] = build_config
+def _test_llm_capture_request_error(tp_size: int = 1):
+    build_config = BuildConfig()
+    build_config.max_num_tokens = 64

-    llm = LLM_CLASS(
+    llm = LLM(
        model=llama_model_path,
-        tensor_parallel_size=tp_size,
-        **llm_args_extra,
+        build_config=build_config,
+        fast_build=True,
    )

    prompt = 'A ' * 65  # the minimum max_num_tokens is 64
-    if pytorch_backend:
-        # pytorch backend will raise ValueError for max_num_tokens
-        with pytest.raises(ValueError):
-            llm.generate(prompt)
-    else:
-        with pytest.raises(RequestError):
-            llm.generate(prompt)
+
+    with pytest.raises(RequestError):
+        llm.generate(prompt)


def test_llm_capture_request_error():
-    _test_llm_capture_request_error(pytorch_backend=False, tp_size=1)
+    _test_llm_capture_request_error(tp_size=1)


def test_llm_shutdown_executor():

diff --git a/tests/unittest/llmapi/test_llm_multi_gpu.py b/tests/unittest/llmapi/test_llm_multi_gpu.py
index 40e657e7894..ecddfbe6a04 100644
--- a/tests/unittest/llmapi/test_llm_multi_gpu.py
+++ b/tests/unittest/llmapi/test_llm_multi_gpu.py
@@ -466,7 +466,7 @@ def test_llm_get_stats_async_tp2(pytorch_backend):


def test_llm_capture_request_error():
-    _test_llm_capture_request_error(pytorch_backend=False, tp_size=2)
+    _test_llm_capture_request_error(tp_size=2)


def test_llm_with_postprocess_parallel_tp2():

diff --git a/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py
index cb8dbf03c07..38b9e56d086 100644
--- a/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py
+++ b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py
@@ -7,17 +7,11 @@
from tensorrt_llm.lora_manager import LoraConfig
from .lora_test_utils import check_llama_7b_multi_lora_from_request_test_harness
from .test_llm_pytorch import llama_7b_lora_from_dir_test_harness
-from .test_llm import _test_llm_capture_request_error
# isort: on

global_kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)


-@pytest.mark.gpu2
-def test_llm_capture_request_error():
-    _test_llm_capture_request_error(pytorch_backend=True, tp_size=2)
-
-
@pytest.mark.gpu4
def test_tinyllama_logits_processor_tp2pp2():
    tinyllama_logits_processor_test_harness(backend="pytorch",

diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py
index dd6d2b4be31..486ceb301f5 100644
--- a/tests/unittest/llmapi/test_llm_pytorch.py
+++ b/tests/unittest/llmapi/test_llm_pytorch.py
@@ -6,11 +6,12 @@

# isort: off
from .lora_test_utils import check_llama_7b_multi_unique_lora_adapters_from_request
-from .test_llm import (
-    get_model_path, global_kvcache_config, llama_model_path,
-    llm_get_stats_async_test_harness, llm_get_stats_test_harness, prompts,
-    run_llm_abort_request, run_llm_with_postprocess_parallel_and_result_handler,
-    tinyllama_logits_processor_test_harness, _test_llm_capture_request_error)
+from .test_llm import (get_model_path, global_kvcache_config, llama_model_path,
+                       llm_get_stats_async_test_harness,
+                       llm_get_stats_test_harness, prompts,
+                       run_llm_abort_request,
+                       run_llm_with_postprocess_parallel_and_result_handler,
+                       tinyllama_logits_processor_test_harness)
from utils.util import (EnvVarsContextManager, force_ampere,
                        run_function_in_sub_process, similar,
                        skip_gpu_memory_less_than_40gb,
@@ -69,10 +70,6 @@ def test_llm_get_stats_async(return_context_logits, use_overlap,
                             enable_iter_req_stats=enable_iter_req_stats)


-def test_llm_capture_request_error():
-    _test_llm_capture_request_error(pytorch_backend=True, tp_size=1)
-
-
@force_ampere
@pytest.mark.parametrize(
    "sampling_params",

diff --git a/tests/unittest/llmapi/test_mpi_session.py b/tests/unittest/llmapi/test_mpi_session.py
index ae8b0eba7a0..484caf7381e 100644
--- a/tests/unittest/llmapi/test_mpi_session.py
+++ b/tests/unittest/llmapi/test_mpi_session.py
@@ -60,13 +60,15 @@ def test_remote_mpi_session(task_type: Literal["submit", "submit_sync"]):
    """Test RemoteMpiPoolSessionClient and RemoteMpiPoolSessionServer interaction"""
    command = ["bash", "_test_remote_mpi_session.sh", task_type]
    print(' '.join(command))
+
    with Popen(command,
               env=os.environ,
               stdout=PIPE,
               stderr=PIPE,
               bufsize=1,
               start_new_session=True,
-               universal_newlines=True) as process:
+               universal_newlines=True,
+               cwd=os.path.dirname(os.path.abspath(__file__))) as process:

        # Function to read from a stream and write to output
        def read_stream(stream, output_stream):

diff --git a/triton_backend/ci/L0_backend_trtllm/custom_metrics_verification_tests.py b/triton_backend/ci/L0_backend_trtllm/custom_metrics_verification_tests.py
index db3093a5b47..3523dff6819 100644
--- a/triton_backend/ci/L0_backend_trtllm/custom_metrics_verification_tests.py
+++ b/triton_backend/ci/L0_backend_trtllm/custom_metrics_verification_tests.py
@@ -82,7 +82,7 @@ def _parse_log_file(self, filename):

        return json.loads(json_string)

-    def _parse_triton_metrics(self, filename, is_v1):
+    def _parse_triton_metrics(self, filename):
        curl_counts = {}
        with open(filename) as metrics_file:
            for line in metrics_file:
@@ -91,12 +91,11 @@ def _parse_triton_metrics(self, filename, is_v1):
                    metric_output = re.sub(r"^.*?{", "{", line).split()
                    metric_key = metric_output[0]
                    metric_value = metric_output[1]
-                    key = self._convert_metric_key_to_stats_key(
-                        metric_key, is_v1)
+                    key = self._convert_metric_key_to_stats_key(metric_key)
                    curl_counts[key] = metric_value
        return curl_counts

-    def _convert_metric_key_to_stats_key(self, metric_output, is_v1):
+    def _convert_metric_key_to_stats_key(self, metric_output):
        # Converts:
        # '{model="tensorrt_llm",request_type="context",version="1"}'
        # to:
@@ -107,15 +106,12 @@ def _convert_metric_key_to_stats_key(self, metric_output, is_v1):
            if not i.startswith('model') and not i.startswith('version')
        ][0]
        self.assertIn(key, metric_to_stat_dict)
-        if (is_v1):
-            self.assertNotIn("inflight_batcher_specific_metric", key)
-        else:
-            self.assertNotIn("v1_specific_metric", key)
+        self.assertNotIn("v1_specific_metric", key)
        return metric_to_stat_dict[key]

-    def _base_test(self, stats_file, metrics_file, is_v1):
+    def _base_test(self, stats_file, metrics_file):
        stats = self._parse_log_file(stats_file)
-        metrics = self._parse_triton_metrics(metrics_file, is_v1)
+        metrics = self._parse_triton_metrics(metrics_file)
        self.assertEqual(len(stats.keys()), len(metrics.keys()))
        self.assertEqual(list(stats.keys()).sort(), list(metrics.keys()).sort())
        for metric_key in stats.keys():
@@ -140,45 +136,33 @@ def _base_test(self, stats_file, metrics_file, is_v1):
                    timedelta(seconds=-1) <= difference,
                    difference <= timedelta(seconds=1))

-    def test_1_gpu_v1(self):
-        self._base_test("1gpu_v1_no_streaming_server.log",
-                        "1gpu_v1_no_stream_metrics.out", True)
-
    def test_1_gpu_IFB_no_stream(self):
        self._base_test("1gpu_IFB_no_streaming_server.log",
-                        "1gpu_IFB_no_stream_metrics.out", False)
+                        "1gpu_IFB_no_stream_metrics.out")

    def test_1_gpu_IFB_stream(self):
        self._base_test("1gpu_IFB_streaming_server.log",
-                        "1gpu_IFB_stream_metrics.out", False)
+                        "1gpu_IFB_stream_metrics.out")

    if AVAILABLE_GPUS >= 2:

-        def test_2_gpu_v1(self):
-            self._base_test("2gpu_v1_no_streaming_server.log",
-                            "2gpu_v1_no_stream_metrics.out", True)
-
        def test_2_gpu_IFB_no_stream(self):
            self._base_test("2gpu_IFB_no_streaming_server.log",
-                            "2gpu_IFB_no_stream_metrics.out", False)
+                            "2gpu_IFB_no_stream_metrics.out")

        def test_2_gpu_IFB_stream(self):
            self._base_test("2gpu_IFB_streaming_server.log",
-                            "2gpu_IFB_stream_metrics.out", False)
+                            "2gpu_IFB_stream_metrics.out")

    if AVAILABLE_GPUS >= 4:

-        def test_4_gpu_v1(self):
-            self._base_test("4gpu_v1_no_streaming_server.log",
-                            "4gpu_v1_no_stream_metrics.out", True)
-
        def test_4_gpu_IFB_no_stream(self):
            self._base_test("4gpu_IFB_no_streaming_server.log",
-                            "4gpu_IFB_no_stream_metrics.out", False)
+                            "4gpu_IFB_no_stream_metrics.out")

        def test_4_gpu_IFB_stream(self):
            self._base_test("4gpu_IFB_streaming_server.log",
-                            "4gpu_IFB_stream_metrics.out", False)
+                            "4gpu_IFB_stream_metrics.out")


if __name__ == "__main__":

diff --git a/triton_backend/ci/L0_backend_trtllm/test.sh b/triton_backend/ci/L0_backend_trtllm/test.sh
index c09e985a266..83967d1c58c 100644
--- a/triton_backend/ci/L0_backend_trtllm/test.sh
+++ b/triton_backend/ci/L0_backend_trtllm/test.sh
@@ -228,49 +228,13 @@ for NUM_GPU in "${NUM_GPUS_TO_TEST[@]}"; do
    run_server "${SERVER_ARGS}"
    wait_for_server_ready ${SERVER_TIMEOUT} ${SERVER_PID[@]}
-    if [ "$WAIT_RET" != "0" ]; then
-        # Cleanup
-        kill $SERVER_PID > /dev/null 2>&1 || true
-        echo -e "\n***\n*** Failed to start $SERVER\n***"
-        cat $SERVER_LOG
-        exit 1
-    fi
-
-    set -e
-    python3 ${TOOLS_DIR}/inflight_batcher_llm/benchmark_core_model.py \
-        --max-input-len=500 \
-        dataset --dataset=${DATASET} \
-        --tokenizer-dir=${TOKENIZER_DIR}
-
-    if [ $? -ne 0 ]; then
-        cat $SERVER_LOG
-        echo -e "\n***\n*** Error executing v1 benchmark_core_model test with ${NUM_GPU}GPU(s): line ${LINENO}\n***"
-        kill_server
-        wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}
-        RET=1
-    fi
-    set +e
-
-    set -e
-    python3 ${TOOLS_DIR}/inflight_batcher_llm/end_to_end_test.py \
-        --max-input-len=500 \
-        --dataset=${DATASET}
-    if [ $? -ne 0 ]; then
+    # Expect invalid GPT model type error to be gracefully handled
+    if [ `grep -c "Static batching type is deprecated" $SERVER_LOG` == "0" ]; then
+        echo -e "\n***\n*** GPT model type error not handled gracefully: line ${LINENO}\n***"
        cat $SERVER_LOG
-        echo -e "\n***\n*** Error executing v1 end-to-end test with ${NUM_GPU}GPU(s): line ${LINENO}\n***"
-        kill_server
-        wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}
-        RET=1
+        exit 1
    fi
-    set +e
-
-    # Make sure the metrics is retrieved after the server has updated the metrics internally
-    sleep ${SLEEP_DURATION}
-    curl localhost:8002/metrics -o ${NUM_GPU}gpu_v1_no_stream_metrics.out
-
-    kill_server
-    wait_for_server_terminated ${SERVER_TIMEOUT} ${SERVER_PID[@]}

    # inflight batching ON
    # streaming OFF

diff --git a/triton_backend/inflight_batcher_llm/scripts/build.sh b/triton_backend/inflight_batcher_llm/scripts/build.sh
index 8aafc4b0f81..d077746bb51 100644
--- a/triton_backend/inflight_batcher_llm/scripts/build.sh
+++ b/triton_backend/inflight_batcher_llm/scripts/build.sh
@@ -51,7 +51,8 @@ if [[ "$BUILD_UNIT_TESTS" == "true" ]]; then
  BUILD_TESTS_ARG="-DBUILD_TESTS=ON -DUSE_CXX11_ABI=ON"
fi

-cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install ${BUILD_TESTS_ARG} ..
+# TODO: Remove specifying Triton version after cmake version is upgraded to 3.31.8
+cmake -DCMAKE_INSTALL_PREFIX:PATH=`pwd`/install ${BUILD_TESTS_ARG} -DTRITON_COMMON_REPO_TAG=r25.05 -DTRITON_CORE_REPO_TAG=r25.05 -DTRITON_THIRD_PARTY_REPO_TAG=r25.05 -DTRITON_BACKEND_REPO_TAG=r25.05 ..
make install

mkdir -p /opt/tritonserver/backends/tensorrtllm
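The same r25.05 Triton repository tags that this patch pins in `build.sh` and `test_triton.py` can also be passed when configuring the backend build by hand. A minimal sketch, assuming a TensorRT-LLM checkout as the current directory and an existing build directory (the flags mirror the ones added by the patch; paths are illustrative):

```bash
# Sketch only: configure the inflight_batcher_llm build with the pinned r25.05 tags.
cd triton_backend/inflight_batcher_llm/build
cmake .. \
  -DCMAKE_INSTALL_PREFIX=install/ \
  -DBUILD_TESTS=ON \
  -DUSE_CXX11_ABI=ON \
  -DTRITON_COMMON_REPO_TAG=r25.05 \
  -DTRITON_CORE_REPO_TAG=r25.05 \
  -DTRITON_THIRD_PARTY_REPO_TAG=r25.05 \
  -DTRITON_BACKEND_REPO_TAG=r25.05
make -j8 install
```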