Commit 7add6eb: resolve comments
1 parent 7c803d1

File tree

6 files changed (+22 additions, -7 deletions)

components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_agg.yml

Lines changed: 0 additions & 1 deletion

@@ -19,7 +19,6 @@ moe_expert_parallel_size: 8
 max_batch_size: 8
 max_num_tokens: 4096
 disable_overlap_scheduler: true # disable_overlap_scheduler is having acc issue on both aggregated and disaggregated serving
-enable_autotuner: false

 # Enable Speculative Decoding in the model engine
 speculative_config:

components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_decode.yaml

Lines changed: 0 additions & 1 deletion

@@ -21,7 +21,6 @@ max_num_tokens: 1024
 # 8704 = 8192 ISL + 512 OSL
 max_seq_len: 8704
 disable_overlap_scheduler: true
-enable_autotuner: false

 # Enable Speculative Decoding in the model engine
 speculative_config:

components/backends/trtllm/engine_configs/llama4/eagle_one_model/eagle_prefill.yaml

Lines changed: 0 additions & 1 deletion

@@ -21,7 +21,6 @@ max_num_tokens: 8192
 max_seq_len: 8192
 print_iter_log: true
 disable_overlap_scheduler: true
-enable_autotuner: false

 # Enable Speculative Decoding in the model engine
 speculative_config:

components/backends/trtllm/gemma3_sliding_window_attention.md

Lines changed: 3 additions & 3 deletions

@@ -21,11 +21,11 @@ This guide demonstrates how to deploy google/gemma-3-1b-it with Variable Sliding
 VSWA is a mechanism in which a model’s layers alternate between multiple sliding window sizes. An example of this is Gemma 3, which incorporates both global attention layers and sliding window layers.

 ## Notes
-* To run Gemma 3 with VSWA, ensure that the container has TensorRT-LLM v1.0.0rc4 installed.
-* To run Gemma 3 with VSWA and KV Routing, ensure that the container is built with the default experimental TRT-LLM commit.
+* To run Gemma 3 with VSWA and KV Routing with KV block reuse, ensure that the container is built using commit ID `c9eebcb4541d961ab390f0bd0a22e2c89f1bcc78` from Tensorrt-LLM.
 ```bash
-./container/build.sh --framework TENSORRTLLM --use-default-experimental-tensorrtllm-commit
+./container/build.sh --framework TENSORRTLLM --tensorrtllm-commit c9eebcb4541d961ab390f0bd0a22e2c89f1bcc78
 ```
+* The 1.0.0rc4 release version of TensorRT-LLM can also run Gemma 3 with VSWA, but KV block reuse cannot be turned on in that version.

 ### Aggregated Serving
 ```bash

components/backends/trtllm/src/dynamo/trtllm/publisher.py

Lines changed: 18 additions & 0 deletions

@@ -418,7 +418,25 @@ def update_max_window_size(self, event):
             f"kv events max_window_size has been updated to {self.max_window_size}"
         )

+    # The global attention layer will emit the KV event with the max_window_size.
+    # We only want to keep the KV event that has the max_window_size to ensure
+    # the accuracy of KV routing.
+    # TRTLLM emits a "created" event at the very beginning when it creates the KV cache,
+    # so we can use the "created" event to identify the max_window_size of the global
+    # attention layer in the model engine.
     def should_drop_event(self, event):
+        # There are two cases for KV event filtering:
+        #
+        # 1. If "window_size" is NOT in the KV event:
+        #    "window_size" was added to KV events only recently, so some older versions of TRTLLM
+        #    might not include it. In this case, the publisher will assume that all events are
+        #    from the global attention layer.
+        #
+        # 2. If "window_size" is present in the KV event:
+        #    The publisher will not drop any KV events until all initial "created" KV events
+        #    have been processed in order to capture the max_window_size.
+        #    After processing all "created" events, the publisher will only accept KV events
+        #    whose window_size is equal to the max_window_size to ensure accurate routing.
         if "window_size" not in event or self.processing_initial_created_events:
             return False

container/build.sh

Lines changed: 1 addition & 1 deletion

@@ -88,7 +88,7 @@ TENSORRTLLM_PIP_WHEEL_DIR="/tmp/trtllm_wheel/"
 # TensorRT-LLM commit to use for building the trtllm wheel if not provided.
 # Important Note: This commit is not used in our CI pipeline. See the CI
 # variables to learn how to run a pipeline with a specific commit.
-DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="c9eebcb4541d961ab390f0bd0a22e2c89f1bcc78"
+DEFAULT_EXPERIMENTAL_TRTLLM_COMMIT="69e9f6d48944b2ae0124ff57aa59340aa4dfae15"
 TRTLLM_COMMIT=""
 TRTLLM_USE_NIXL_KVCACHE_EXPERIMENTAL="0"
 TRTLLM_GIT_URL=""
