@@ -1336,6 +1336,8 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
// Note max_attention_window_size is maximum of cyclic_attention_window_size among all layers.
// By default, you can assume that they are the same.
auto const cyclic_kv_cache_len = static_cast<unsigned>(params.cyclic_attention_window_size);
// The chunked attention size.
auto const chunked_attention_size = static_cast<unsigned>(params.chunked_attention_size);
// The number of sink tokens in kv cache to support streamingllm
auto const sink_token_len = static_cast<unsigned>(params.sink_token_length);
// The current timestep (including paddings).
@@ -1361,7 +1363,7 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
#ifndef MMHA_USE_FP32_ACCUM_FOR_LOGITS
if (sizeof(Tk) != 4)
{
auto const max_timesteps = min(timestep, cyclic_kv_cache_len);
auto const max_timesteps = min(timestep, min(cyclic_kv_cache_len, chunked_attention_size));
logits_smem_ += divUp(max_timesteps + 1, 4u) * 16;
}
Tk* logits_smem = reinterpret_cast<Tk*>(logits_smem_);
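To make the effect of this change concrete, below is a minimal, standalone C++ sketch of the shared-memory offset computation, with `divUp` re-declared host-side and a hypothetical helper `logitsSmemOffset` that mirrors the kernel logic above. This is an illustration of the new `min` bound under those assumptions, not the kernel code itself.

```cpp
// Host-side sketch (hypothetical helper, not the kernel) of how the
// chunked attention size now participates in the logits shared-memory offset.
#include <algorithm>
#include <cstdio>

// Round-up integer division, as used by the kernel's divUp(x, y).
constexpr unsigned divUp(unsigned x, unsigned y) { return (x + y - 1) / y; }

// Byte offset applied to logits_smem_ when logits are stored in a
// reduced-precision type (sizeof(Tk) != 4) and FP32 accumulation for
// logits is disabled (MMHA_USE_FP32_ACCUM_FOR_LOGITS undefined).
unsigned logitsSmemOffset(unsigned timestep, unsigned cyclic_kv_cache_len,
                          unsigned chunked_attention_size, std::size_t sizeof_Tk)
{
    if (sizeof_Tk == 4) // FP32 logits take no extra offset here
    {
        return 0;
    }
    // The attention span is now additionally capped by the chunk size.
    unsigned const max_timesteps
        = std::min(timestep, std::min(cyclic_kv_cache_len, chunked_attention_size));
    // One 16-byte slot per group of 4 timesteps, inclusive of the current one.
    return divUp(max_timesteps + 1, 4u) * 16;
}

int main()
{
    // Example: an 8K chunk caps the span even though the cyclic window is larger.
    std::printf("%u\n",
        logitsSmemOffset(/*timestep=*/20000, /*cyclic_kv_cache_len=*/32768,
                         /*chunked_attention_size=*/8192, /*sizeof_Tk=*/2));
    return 0;
}
```

Note that once the chunk cap is reached, the offset stops growing with `timestep`, which is what keeps the logits buffer bounded for long sequences.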
189 changes: 104 additions & 85 deletions docs/source/performance/perf-overview.md
@@ -12,6 +12,8 @@ Tuning batch sizes, parallelism configurations, and other options may lead to im

For DeepSeek R1 performance, please check out our [performance guide](../blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md)

For more information on benchmarking with `trtllm-bench`, see this NVIDIA [blog post](https://developer.nvidia.com/blog/llm-inference-benchmarking-performance-tuning-with-tensorrt-llm/).

## Throughput Measurements

The table below shows performance data where a local inference client is fed requests at an infinite rate (no delay between messages),
@@ -21,50 +23,64 @@ The performance numbers below were collected using the steps described in this d

Testing was performed on models with weights quantized using [ModelOpt](https://nvidia.github.io/TensorRT-Model-Optimizer/#) and published by NVIDIA on the [Model Optimizer HuggingFace Collection](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4).

### FP4 Models:
```
### Hardware
The following GPU variants were used for testing:
- H100 SXM 80GB (DGX H100)
- H200 SXM 141GB (DGX H200)
- GH200 96GB HBM3 (480GB LPDDR5X)
- B200 180GB (DGX B200)
- GB200 192GB (GB200 NVL72)

Other hardware variants may have different TDP, memory bandwidth, core count, or other features leading to performance differences on these workloads.

### FP4 Models

```text
nvidia/Llama-3.3-70B-Instruct-FP4
nvidia/Llama-3.1-405B-Instruct-FP4
```

#### Llama 3.3 70B FP4

| | GPU | B200 | | | |
|:------------------------|:--------|:----------|:----------|:----------|:----------|
| | TP Size | 1 | 2 | 4 | 8 |
| ISL, OSL | | | | | |
| | | | | | |
| 128, 128 | | 10,994.48 | 17,542.11 | 24,667.31 | 27,272.27 |
| 128, 2048 | | 9,580.46 | 15,432.35 | 23,568.12 | 31,174.31 |
| 128, 4096 | | 6,418.39 | 9,841.53 | 17,808.76 | 25,229.25 |
| 500, 2000 | | 7,343.32 | 11,850.57 | 20,709.67 | 28,038.78 |
| 1000, 1000 | | 6,752.53 | 10,815.88 | 16,413.04 | 20,060.66 |
| 1000, 2000 | | 6,670.07 | 9,830.73 | 15,597.49 | 20,672.37 |
| 1024, 2048 | | 6,636.75 | 9,807.13 | 15,519.23 | 20,617.28 |
| 2048, 128 | | 1,342.17 | 1,989.41 | 3,033.14 | 4,035.64 |
| 5000, 500 | | 1,429.67 | 2,419.67 | 3,686.84 | 5,182.96 |
| 20000, 2000 | | 629.77 | 1,177.01 | 2,120.66 | 3,429.03 |
| | GPU: | B200 | GB200 |
|:-----------------------------|:---|:----------|:--------------|
| | TP Size | 1 | 1 |
| ISL, OSL | | | |
| | | | |
| 128, 128 | | 10,613.84 | 11,100.97 |
| 128, 2048 | | 9,445.51 | 10,276.05 |
| 128, 4096 | | 6,276.85 | 7,351.12 |
| 500, 2000 | | 6,983.27 | 8,194.30 |
| 1000, 1000 | | 6,434.29 | 7,401.80 |
| 1000, 2000 | | 6,725.03 | 6,478.72 |
| 1024, 2048 | | 6,546.61 | 7,922.88 |
| 2048, 128 | | 1,330.35 | 1,418.47 |
| 2048, 2048 | | 4,528.48 | 5,326.77 |
| 5000, 500 | | 1,427.44 | 1,502.44 |
| 20000, 2000 | | 636.36 | 732.43 |

#### Llama 3.1 405B FP4

| | GPU | B200 | |
|:------------------------|:------- |:---------|:----------|
| | TP Size | 4 | 8 |
| ISL, OSL | | | |
| | | | |
| 128, 128 | | 6,163.81 | 9,002.90 |
| 128, 2048 | | 7,081.21 | 10,288.28 |
| 128, 4096 | | 6,028.37 | 8,713.77 |
| 500, 2000 | | 5,858.75 | 9,125.86 |
| 1000, 1000 | | 4,848.00 | 7,582.97 |
| 1000, 2000 | | 5,375.25 | 7,626.28 |
| 1024, 2048 | | 5,345.70 | 7,464.03 |
| 2048, 128 | | 693.55 | 1,086.56 |
| 5000, 500 | | 947.49 | 1,532.45 |
| 20000, 2000 | | 641.11 | 1,097.84 |

### FP8 Models:
```
| | GPU: | B200 | GB200 |
|:-----------------------------|:---|:---------|:--------------|
| | TP Size | 4 | 4 |
| ISL, OSL | | | |
| | | | |
| 128, 128 | | 6,218.89 | 6,598.97 |
| 128, 2048 | | 7,178.10 | 7,497.40 |
| 128, 4096 | | 5,890.89 | 5,898.19 |
| 500, 2000 | | 5,844.37 | 6,198.33 |
| 1000, 1000 | | 4,958.53 | 5,243.35 |
| 1000, 2000 | | 4,874.16 | 4,905.51 |
| 1024, 2048 | | 4,833.19 | 4,686.38 |
| 2048, 128 | | 737.95 | 761.58 |
| 2048, 2048 | | 4,024.02 | 4,326.56 |
| 5000, 500 | | 1,032.40 | 1,078.87 |
| 20000, 2000 | | 667.39 | 649.95 |

### FP8 Models

```text
nvidia/Llama-3.1-8B-Instruct-FP8
nvidia/Llama-3.3-70B-Instruct-FP8
nvidia/Llama-3.1-405B-Instruct-FP8
@@ -73,61 +89,65 @@ nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8

#### Llama 3.1 8B FP8

| | GPU | H200 141GB HBM3 | H100 80GB HBM3 |
|:-----------------------------|:---|:------------------|:-----------------|
| | TP Size | 1 | 1 |
| ISL, OSL | | | |
| | | | |
| 128, 128 | | 27,970.14 | 27,688.36 |
| 128, 2048 | | 23,326.38 | 21,841.15 |
| 128, 4096 | | 17,508.51 | 13,730.89 |
| 500, 2000 | | 21,390.41 | 17,833.34 |
| 1000, 1000 | | 17,366.89 | 15,270.62 |
| 1000, 2000 | | 16,831.31 | 13,798.08 |
| 1024, 2048 | | 16,737.03 | 13,385.50 |
| 2048, 128 | | 3,488.03 | 3,414.67 |
| 5000, 500 | | 3,813.69 | 3,394.54 |
| 20000, 2000 | | 1,696.66 | 1,345.42 |
| | GPU: | GH200 | H100 | H200 |
|:-----------------------------|:---|:--------------|:-----------------|:------------------|
| | TP Size | 1 | 1 | 1 |
| ISL, OSL | | | | |
| | | | | |
| 128, 128 | | 27,304.25 | 26,401.48 | 27,027.80 |
| 128, 2048 | | 24,045.60 | 21,413.21 | 23,102.25 |
| 128, 4096 | | 15,409.85 | 13,541.54 | 17,396.83 |
| 500, 2000 | | 20,123.88 | 17,571.01 | 19,759.16 |
| 1000, 1000 | | 16,352.99 | 14,991.62 | 17,162.49 |
| 1000, 2000 | | 15,705.82 | 13,505.23 | 16,227.11 |
| 1024, 2048 | | 16,102.52 | 13,165.91 | 16,057.66 |
| 2048, 128 | | 3,573.85 | 3,275.55 | 3,390.69 |
| 2048, 2048 | | 10,767.05 | 9,462.43 | 11,822.14 |
| 5000, 500 | | 3,584.74 | 3,276.47 | 3,758.08 |
| 20000, 2000 | | 1,393.31 | 1,340.69 | 1,705.68 |

#### Llama 3.3 70B FP8

| | GPU | H200 141GB HBM3 | | | | H100 80GB HBM3 | | | |
|:-----------------------------|:---|:------------------|:---------|:----------|:----------|:-----------------|:---------|:----------|:----------|
| | TP Size | 1 | 2 | 4 | 8 | 1 | 2 | 4 | 8 |
| ISL, OSL | | | | | | | | | |
| | | | | | | | | | |
| 128, 128 | | 3,605.47 | 6,427.69 | 10,407.42 | 15,434.37 | 3,128.33 | 6,216.91 | | |
| 128, 2048 | | 4,315.80 | 8,464.03 | 13,508.59 | 20,759.72 | 756.42 | 5,782.57 | 11,464.94 | 17,424.32 |
| 128, 4096 | | 2,701.17 | 5,573.55 | 11,458.56 | 16,668.75 | | 3,868.37 | 8,206.39 | 12,624.61 |
| 500, 2000 | | 3,478.76 | 6,740.06 | 12,200.18 | | | 4,684.06 | 9,903.53 | 14,553.93 |
| 1000, 1000 | | 2,744.32 | 5,119.72 | 8,685.44 | 12,744.51 | 742.14 | 4,247.19 | 7,435.65 | 11,018.81 |
| 1000, 2000 | | 2,896.44 | 5,847.26 | 9,031.21 | 13,141.17 | 533.74 | 3,866.53 | 7,611.12 | 11,139.22 |
| 1024, 2048 | | 2,874.18 | 5,568.61 | 8,946.71 | 13,082.62 | 530.16 | 3,796.68 | 7,575.24 | 11,004.31 |
| 2048, 128 | | 435.90 | 772.67 | 1,264.76 | | | 736.89 | 1,213.33 | 1,839.22 |
| 2048, 2048 | | | | | 10,412.85 | | | | |
| 5000, 500 | | 545.96 | 997.15 | 1,698.22 | 2,655.28 | 204.94 | 862.91 | 1,552.68 | 2,369.84 |
| 20000, 2000 | | 276.66 | 620.33 | 1,161.29 | 1,985.85 | | 416.13 | 903.66 | 1,554.10 |
| | GPU: | H100 | H200 |
|:-----------------------------|:---|:-----------------|:------------------|
| | TP Size | 2 | 2 |
| ISL, OSL | | | |
| | | | |
| 128, 128 | | 6,092.28 | 6,327.98 |
| 128, 2048 | | 5,892.94 | 7,467.36 |
| 128, 4096 | | 3,828.46 | 5,526.42 |
| 500, 2000 | | 4,654.74 | 6,639.15 |
| 1000, 1000 | | 4,181.06 | 4,773.33 |
| 1000, 2000 | | 3,708.93 | 5,790.36 |
| 1024, 2048 | | 3,785.04 | 5,480.44 |
| 2048, 128 | | 723.40 | 747.55 |
| 2048, 2048 | | 2,785.53 | 3,775.80 |
| 5000, 500 | | 865.55 | 978.28 |
| 20000, 2000 | | 411.85 | 609.42 |

#### Llama 3.1 405B FP8

| | GPU | H200 141GB HBM3 | H100 80GB HBM3 |
|:-----------------------------|:---|:------------------|:-----------------|
| | TP Size | 8 | 8 |
| ISL, OSL | | | |
| | | | |
| 128, 2048 | | 5,567.87 | |
| 128, 4096 | | 5,136.85 | |
| 500, 2000 | | 4,787.61 | 3,673.91 |
| 1000, 1000 | | 3,286.30 | 3,012.22 |
| 1000, 2000 | | 3,636.76 | 3,262.20 |
| 1024, 2048 | | 3,618.66 | 3,109.70 |
| 2048, 128 | | 443.10 | 449.02 |
| 5000, 500 | | 645.46 | |
| 20000, 2000 | | | 372.12 |
| | GPU: | H100 | H200 |
|:-----------------------------|:---|:-----------------|:------------------|
| | TP Size | 8 | 8 |
| Runtime Input/Output Lengths | | | |
| | | | |
| 128, 128 | | | 3,705.18 |
| 128, 2048 | | 4,517.39 | 4,715.13 |
| 128, 4096 | | 2,910.31 | 4,475.91 |
| 500, 2000 | | 3,664.62 | 4,804.10 |
| 1000, 1000 | | 2,955.50 | 3,208.25 |
| 1000, 2000 | | 2,884.69 | 3,630.29 |
| 1024, 2048 | | 3,237.41 | 3,609.50 |
| 2048, 128 | | 433.47 | 441.35 |
| 2048, 2048 | | 2,216.55 | 2,840.86 |
| 5000, 500 | | 579.05 | 645.26 |
| 20000, 2000 | | 363.27 | 509.87 |

#### Llama 4 Maverick FP8

| | GPU | H200 141GB HBM3 | H100 80GB HBM3 |
Note: Performance for Llama 4 on sequence lengths less than 8,192 tokens is affected by an issue introduced in v0.21. To reproduce the Llama 4 performance noted here, please use v0.20.

| | GPU | H200 | H100 |
|:-----------------------------|:---|:------------------|:-----------------|
| | TP Size | 8 | 8 |
| ISL, OSL | | | |
@@ -140,7 +160,6 @@ nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8
| 2048, 128 | | 4,364.06 | 3,832.38 |
| 2048, 2048 | | 12,800.89 | |
| 5000, 500 | | 5,128.60 | |
| 20000, 2000 | | 1,764.27 | 1,400.79 |

## Reproducing Benchmarked Results

@@ -216,7 +235,7 @@ a model name (HuggingFace reference or path to a local model), a [generated data
trtllm-bench --model $model_name throughput --dataset $dataset_file --backend pytorch --extra_llm_api_options $llm_options
```

The data collected for the v0.20 benchmarks was run with the following file:
The data collected for the v0.21 benchmarks was run with the following file:

`llm_options.yml`
```yaml
@@ -240,7 +259,7 @@ cuda_graph_config:
- 8192
```

In a majority of cases, we also use a higher KV cache percentage by setting `--kv_cache_free_gpu_mem_fraction 0.95` in the benchmark command. This allows us to obtain better performance than the default setting of `0.90`. We fall back to `0.90` if we hit an out of memory issue.
In many cases, we also use a higher KV cache percentage by setting `--kv_cache_free_gpu_mem_fraction 0.95` in the benchmark command. This allows us to obtain better performance than the default setting of `0.90`. We fall back to `0.90` or lower if out-of-memory errors are encountered.
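As a sketch of what that looks like in practice, the invocation below extends the throughput command shown earlier with the raised KV-cache fraction; the flag value and its placement after the `throughput` subcommand are assumptions based on the flag named in this paragraph.

```shell
# Same benchmark invocation as above, with the KV cache fraction raised to 0.95;
# fall back to the 0.90 default (or lower) if the run hits out-of-memory errors.
trtllm-bench --model $model_name throughput \
  --dataset $dataset_file \
  --backend pytorch \
  --extra_llm_api_options $llm_options \
  --kv_cache_free_gpu_mem_fraction 0.95
```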

The results will be printed to the terminal upon benchmark completion. For example,

1 change: 1 addition & 0 deletions docs/source/release-notes.md
@@ -73,6 +73,7 @@ All published functionality in the Release Notes has been fully tested and verif
### Known Issues
- accuracy/test_cli_flow::TestGpt2::test_beam_search_large is broken.
- Enabling disaggregated serving, MTP, and the overlap scheduler at the same time can lead to accuracy problems.
- In 0.21, full chunked attention support has been added so that the LLaMA4 model can functionally run with sequence lengths greater than 8K. There is a known performance regression on Hopper (affecting only the LLaMA4 model) due to this functional enhancement. The root cause of the regression has been identified, and the fix will be part of a future release.

## TensorRT-LLM Release 0.20.0
