Skip to content
Prev Previous commit
Next Next commit
Fix the TRTLLM engine config locations and references
  • Loading branch information
tanmayv25 committed Nov 12, 2025
commit a49a4342e737757a4670c217a4f71733e6286a4f
10 changes: 5 additions & 5 deletions benchmarks/router/run_engines.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
NUM_WORKERS=8
MODEL_PATH="deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
RECIPE_PATH="$DYNAMO_HOME/recipes/deepseek-r1-distill-llama-8b/trtllm"
ENGINE_CONFIG_PATH="$DYNAMO_HOME/examples/backends/trtllm/engine_configs/deepseek-r1-distill-llama-8b"
TENSOR_PARALLEL_SIZE=1
DATA_PARALLEL_SIZE=1
USE_MOCKERS=false
Expand Down Expand Up @@ -86,13 +86,13 @@ if [ ${#EXTRA_ARGS[@]} -eq 0 ]; then
)
elif [ "$USE_TRTLLM" = true ]; then
# Default args for TensorRT-LLM engine using predefined YAML configs
# Config files located at: $RECIPE_PATH/{agg,decode,prefill}.yaml
# Config files located at: $ENGINE_CONFIG_PATH/{agg,decode,prefill}.yaml
if [ "$MODE" = "prefill" ]; then
ENGINE_CONFIG="$RECIPE_PATH/prefill.yaml"
ENGINE_CONFIG="$ENGINE_CONFIG_PATH/prefill.yaml"
elif [ "$MODE" = "decode" ]; then
ENGINE_CONFIG="$RECIPE_PATH/decode.yaml"
ENGINE_CONFIG="$ENGINE_CONFIG_PATH/decode.yaml"
else
ENGINE_CONFIG="$RECIPE_PATH/agg.yaml"
ENGINE_CONFIG="$ENGINE_CONFIG_PATH/agg.yaml"
fi

EXTRA_ARGS=(
Expand Down
2 changes: 1 addition & 1 deletion docs/backends/trtllm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ cd $DYNAMO_HOME/examples/backends/trtllm
```bash
cd $DYNAMO_HOME/examples/backends/trtllm

export AGG_ENGINE_ARGS=./recipes/deepseek-r1/trtllm/agg/mtp/mtp_agg.yaml
export AGG_ENGINE_ARGS=./engine_configs/deepseek-r1/agg/mtp/mtp_agg.yaml
export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4"
# nvidia/DeepSeek-R1-FP4 is a large model
export MODEL_PATH="nvidia/DeepSeek-R1-FP4"
Expand Down
12 changes: 6 additions & 6 deletions docs/backends/trtllm/gemma3_sliding_window_attention.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ VSWA is a mechanism in which a model’s layers alternate between multiple slidi
cd $DYNAMO_HOME/examples/backends/trtllm
export MODEL_PATH=google/gemma-3-1b-it
export SERVED_MODEL_NAME=$MODEL_PATH
export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml
export AGG_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_agg.yaml
./launch/agg.sh
```

Expand All @@ -39,7 +39,7 @@ export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml
cd $DYNAMO_HOME/examples/backends/trtllm
export MODEL_PATH=google/gemma-3-1b-it
export SERVED_MODEL_NAME=$MODEL_PATH
export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml
export AGG_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_agg.yaml
./launch/agg_router.sh
```

Expand All @@ -48,8 +48,8 @@ export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml
cd $DYNAMO_HOME/examples/backends/trtllm
export MODEL_PATH=google/gemma-3-1b-it
export SERVED_MODEL_NAME=$MODEL_PATH
export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_prefill.yaml
export DECODE_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_decode.yaml
export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_prefill.yaml
export DECODE_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_decode.yaml
./launch/disagg.sh
```

Expand All @@ -58,7 +58,7 @@ export DECODE_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_decode.yaml
cd $DYNAMO_HOME/examples/backends/trtllm
export MODEL_PATH=google/gemma-3-1b-it
export SERVED_MODEL_NAME=$MODEL_PATH
export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_prefill.yaml
export DECODE_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_decode.yaml
export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_prefill.yaml
export DECODE_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_decode.yaml
./launch/disagg_router.sh
```
8 changes: 4 additions & 4 deletions docs/backends/trtllm/gpt-oss.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,14 +90,14 @@ The deployment uses configuration files and command-line arguments to control be

#### Configuration Files

**Prefill Configuration (`recipes/gpt-oss-120b/trtllm/disagg/prefill.yaml`)**:
**Prefill Configuration (`examples/backends/trtllm/engine_configs/gpt-oss-120b/prefill.yaml`)**:
- `enable_attention_dp: false` - Attention data parallelism disabled for prefill
- `enable_chunked_prefill: true` - Enables efficient chunked prefill processing
- `moe_config.backend: CUTLASS` - Uses optimized CUTLASS kernels for MoE layers
- `cache_transceiver_config.backend: ucx` - Uses UCX for efficient KV cache transfer
- `cuda_graph_config.max_batch_size: 32` - Maximum batch size for CUDA graphs

**Decode Configuration (`recipes/gpt-oss-120b/trtllm/disagg/decode.yaml`)**:
**Decode Configuration (`examples/backends/trtllm/engine_configs/gpt-oss-120b/decode.yaml`)**:
- `enable_attention_dp: true` - Attention data parallelism enabled for decode
- `disable_overlap_scheduler: false` - Enables overlapping for decode efficiency
- `moe_config.backend: CUTLASS` - Uses optimized CUTLASS kernels for MoE layers
Expand Down Expand Up @@ -145,7 +145,7 @@ python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 &
CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.trtllm \
--model-path /model \
--served-model-name openai/gpt-oss-120b \
--extra-engine-args recipes/gpt-oss-120b/trtllm/disagg/prefill.yaml \
--extra-engine-args examples/backends/trtllm/engine_configs/gpt-oss-120b/prefill.yaml \
--dyn-reasoning-parser gpt_oss \
--dyn-tool-call-parser harmony \
--disaggregation-mode prefill \
Expand All @@ -161,7 +161,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.trtllm \
CUDA_VISIBLE_DEVICES=4,5,6,7 python3 -m dynamo.trtllm \
--model-path /model \
--served-model-name openai/gpt-oss-120b \
--extra-engine-args recipes/gpt-oss-120b/trtllm/disagg/decode.yaml \
--extra-engine-args examples/backends/trtllm/engine_configs/gpt-oss-120b/decode.yaml \
--dyn-reasoning-parser gpt_oss \
--dyn-tool-call-parser harmony \
--disaggregation-mode decode \
Expand Down
8 changes: 4 additions & 4 deletions docs/backends/trtllm/llama4_plus_eagle.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ This guide demonstrates how to deploy Llama 4 Maverick Instruct with Eagle Specu
- The other node runs the prefill worker.

## Notes
* Make sure the (`eagle3_one_model: true`) is set in the LLM API config inside the `recipes/llama4/trtllm/eagle` folder.
* Make sure `eagle3_one_model: true` is set in the LLM API config inside the `examples/backends/trtllm/engine_configs/llama4/eagle` folder.

## Setup

Expand All @@ -52,17 +52,17 @@ See [this](./multinode/multinode-examples.md#setup) section from multinode guide
## Aggregated Serving
```bash
export NUM_NODES=1
export ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/eagle/eagle_agg.yml"
export ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_agg.yml"
./multinode/srun_aggregated.sh
```

## Disaggregated Serving

```bash
export NUM_PREFILL_NODES=1
export PREFILL_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/eagle/eagle_prefill.yaml"
export PREFILL_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yml"
export NUM_DECODE_NODES=1
export DECODE_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/eagle/eagle_decode.yaml"
export DECODE_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yml"
./multinode/srun_disaggregated.sh
```

Expand Down
6 changes: 3 additions & 3 deletions docs/backends/trtllm/multimodal_support.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ Here are quick steps to launch Llama-4 Maverick BF16 in aggregated mode
```bash
cd $DYNAMO_HOME

export AGG_ENGINE_ARGS=./recipes/llama4/trtllm/multimodal/agg.yaml
export AGG_ENGINE_ARGS=./examples/backends/trtllm/engine_configs/llama4/multimodal/agg.yaml
export SERVED_MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct"
export MODEL_PATH="meta-llama/Llama-4-Maverick-17B-128E-Instruct"
./launch/agg.sh
Expand Down Expand Up @@ -79,8 +79,8 @@ cd $DYNAMO_HOME

export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-VL-7B-Instruct"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen2-VL-7B-Instruct"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"recipes/qwen2-vl-7b-instruct/trtllm/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"recipes/qwen2-vl-7b-instruct/trtllm/decode.yaml"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/decode.yaml"}
export MODALITY=${MODALITY:-"multimodal"}

./launch/disagg.sh
Expand Down
6 changes: 3 additions & 3 deletions docs/backends/trtllm/multinode/multinode-examples.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ follow these steps below to launch an **aggregated** deployment across 4 nodes:

```bash
# Default set in srun_aggregated.sh, but can customize here.
# export ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/agg/wide_ep/wide_ep_agg.yaml"
# export ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/deepseek-r1/agg/wide_ep/wide_ep_agg.yaml"

# Customize NUM_NODES to match the desired parallelism in ENGINE_CONFIG
# The product of NUM_NODES*NUM_GPUS_PER_NODE should match the number of
Expand Down Expand Up @@ -165,8 +165,8 @@ deployment across 8 nodes:

```bash
# Defaults set in srun_disaggregated.sh, but can customize here.
# export PREFILL_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_prefill.yaml"
# export DECODE_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_decode.yaml"
# export PREFILL_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_prefill.yaml"
# export DECODE_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_decode.yaml"

# Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG
# Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ limitations under the License.
>
> Before running the deployment, you must update the engine configuration files to change `backend: DEFAULT` to `backend: default` (lowercase). Run the following command:
> ```bash
> sed -i 's/backend: DEFAULT/backend: default/g' /mnt/recipes/llama4/trtllm/multimodal/prefill.yaml /mnt/recipes/llama4/trtllm/multimodal/decode.yaml
> sed -i 's/backend: DEFAULT/backend: default/g' /mnt/examples/backends/trtllm/engine_configs/llama4/multimodal/prefill.yaml /mnt/examples/backends/trtllm/engine_configs/llama4/multimodal/decode.yaml
> ```


Expand Down Expand Up @@ -100,8 +100,8 @@ deployment across 4 nodes:

```bash
# Defaults set in srun_disaggregated.sh, but can customize here.
# export PREFILL_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/multimodal/prefill.yaml"
# export DECODE_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/multimodal/decode.yaml"
# export PREFILL_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/multimodal/prefill.yaml"
# export DECODE_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/multimodal/decode.yaml"

# Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG
# Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG
Expand Down
2 changes: 1 addition & 1 deletion docs/kubernetes/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ args:
- python3 -m dynamo.trtllm
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--extra-engine-args /workspace/recipes/deepseek-r1-distill-llama-8b/agg.yaml
--extra-engine-args /workspace/examples/backends/trtllm/engine_configs/deepseek-r1-distill-llama-8b/agg.yaml
```

Key customization points include:
Expand Down
2 changes: 1 addition & 1 deletion examples/backends/trtllm/deploy/agg-with-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,4 +67,4 @@ spec:
- --served-model-name
- Qwen/Qwen3-0.6B
- --extra-engine-args
- ./recipes/qwen3/trtllm/agg.yaml
- ./examples/backends/trtllm/engine_configs/qwen3/agg.yaml
2 changes: 1 addition & 1 deletion examples/backends/trtllm/deploy/agg.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,4 @@ spec:
- --served-model-name
- Qwen/Qwen3-0.6B
- --extra-engine-args
- ./recipes/qwen3/trtllm/agg.yaml
- ./examples/backends/trtllm/engine_configs/qwen3/agg.yaml
2 changes: 1 addition & 1 deletion examples/backends/trtllm/deploy/agg_router.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,5 @@ spec:
- --served-model-name
- Qwen/Qwen3-0.6B
- --extra-engine-args
- ./recipes/qwen3/trtllm/agg.yaml
- ./examples/backends/trtllm/engine_configs/qwen3/agg.yaml
- --publish-events-and-metrics
4 changes: 2 additions & 2 deletions examples/backends/trtllm/deploy/disagg.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ spec:
- --served-model-name
- Qwen/Qwen3-0.6B
- --extra-engine-args
- ./recipes/qwen3/trtllm/prefill.yaml
- ./examples/backends/trtllm/engine_configs/qwen3/prefill.yaml
- --disaggregation-mode
- prefill
TRTLLMDecodeWorker:
Expand All @@ -63,6 +63,6 @@ spec:
- --served-model-name
- Qwen/Qwen3-0.6B
- --extra-engine-args
- ./recipes/qwen3/trtllm/decode.yaml
- ./examples/backends/trtllm/engine_configs/qwen3/decode.yaml
- --disaggregation-mode
- decode
4 changes: 2 additions & 2 deletions examples/backends/trtllm/deploy/disagg_planner.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ spec:
- --served-model-name
- Qwen/Qwen3-0.6B
- --extra-engine-args
- ./recipes/qwen3/trtllm/decode.yaml
- ./examples/backends/trtllm/engine_configs/qwen3/decode.yaml
- --disaggregation-mode
- decode
TRTLLMPrefillWorker:
Expand All @@ -124,6 +124,6 @@ spec:
- --served-model-name
- Qwen/Qwen3-0.6B
- --extra-engine-args
- ./recipes/qwen3/trtllm/prefill.yaml
- ./examples/backends/trtllm/engine_configs/qwen3/prefill.yaml
- --disaggregation-mode
- prefill
4 changes: 2 additions & 2 deletions examples/backends/trtllm/deploy/disagg_router.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ spec:
- --served-model-name
- Qwen/Qwen3-0.6B
- --extra-engine-args
- ./recipes/qwen3/trtllm/prefill.yaml
- ./examples/backends/trtllm/engine_configs/qwen3/prefill.yaml
- --disaggregation-mode
- prefill
- --publish-events-and-metrics
Expand All @@ -65,6 +65,6 @@ spec:
- --served-model-name
- Qwen/Qwen3-0.6B
- --extra-engine-args
- ./recipes/qwen3/trtllm/decode.yaml
- ./examples/backends/trtllm/engine_configs/qwen3/decode.yaml
- --disaggregation-mode
- decode
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ moe_config:
# moe_max_num_tokens = max_batch_size * moe_expert_parallel_size
# 4096 = 256 * 16
# moe_max_num_tokens: 4096
load_balancer: /mnt/recipes/deepseek-r1/trtllm/agg/wide_ep/eplb.yaml
load_balancer: /mnt/examples/backends/trtllm/engine_configs/deepseek-r1/agg/wide_ep/eplb.yaml

tensor_parallel_size: 16
moe_expert_parallel_size: 16
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ backend: pytorch
# WideEP related settings
moe_config:
backend: WIDEEP
load_balancer: /mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/eplb.yaml
load_balancer: /mnt/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/eplb.yaml

# TP/EP/PP/DP
tensor_parallel_size: 16
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ backend: pytorch
# WideEP related settings
moe_config:
backend: WIDEEP
load_balancer: /mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/eplb.yaml
load_balancer: /mnt/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/eplb.yaml

# TP/EP/PP/DP
tensor_parallel_size: 16
Expand Down
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
enable_attention_dp: true
disable_overlap_scheduler: false
moe_config:
backend: CUTLASS
cuda_graph_config:
enable_padding: true
cache_transceiver_config:
backend: UCX
max_tokens_in_buffer: 65536
Comment on lines +21 to +23
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Fix YAML indentation inconsistency in cache_transceiver_config.

Lines 22-23 use 2-space indentation while other nested configs (moe_config, cuda_graph_config) use 4-space indentation. Standardize to 4-space indentation for consistency.

 cache_transceiver_config:
-  backend: UCX
-  max_tokens_in_buffer: 65536
+    backend: UCX
+    max_tokens_in_buffer: 65536
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
cache_transceiver_config:
backend: UCX
max_tokens_in_buffer: 65536
cache_transceiver_config:
backend: UCX
max_tokens_in_buffer: 65536
🤖 Prompt for AI Agents
In examples/backends/trtllm/engine_configs/gpt-oss-120b/decode.yaml around lines
21 to 23, the cache_transceiver_config block uses 2-space indentation for its
nested keys while other nested blocks use 4-space indentation; update the nested
lines (backend and max_tokens_in_buffer) to use 4-space indentation so the YAML
nesting matches the rest of the file and remains consistent.

print_iter_log: false
stream_interval: 10
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
enable_attention_dp: false
disable_overlap_scheduler: true
moe_config:
backend: CUTLASS
enable_chunked_prefill: true
cuda_graph_config:
max_batch_size: 32
enable_padding: true
cache_transceiver_config:
backend: UCX
max_tokens_in_buffer: 65536
Comment on lines +23 to +25
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Fix YAML indentation inconsistency in cache_transceiver_config.

Lines 24-25 use 2-space indentation while other nested configs (moe_config, cuda_graph_config) use 4-space indentation. Standardize to 4-space indentation for consistency.

 cache_transceiver_config:
-  backend: UCX
-  max_tokens_in_buffer: 65536
+    backend: UCX
+    max_tokens_in_buffer: 65536
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
cache_transceiver_config:
backend: UCX
max_tokens_in_buffer: 65536
cache_transceiver_config:
backend: UCX
max_tokens_in_buffer: 65536
🤖 Prompt for AI Agents
In examples/backends/trtllm/engine_configs/gpt-oss-120b/prefill.yaml around
lines 23 to 25, the cache_transceiver_config block uses 2-space indentation for
its nested keys while other nested configs use 4 spaces; update the indentation
of the keys under cache_transceiver_config (backend and max_tokens_in_buffer) to
use 4 spaces so the file uses a consistent 4-space indentation for nested
configuration blocks.

print_iter_log: false
stream_interval: 10
2 changes: 1 addition & 1 deletion examples/backends/trtllm/launch/agg.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/agg.yaml"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/agg.yaml"}
export MODALITY=${MODALITY:-"text"}
# If you want to use multimodal, set MODALITY to "multimodal"
#export MODALITY=${MODALITY:-"multimodal"}
Expand Down
2 changes: 1 addition & 1 deletion examples/backends/trtllm/launch/agg_metrics.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/agg.yaml"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/agg.yaml"}
export MODALITY=${MODALITY:-"text"}

# Setup cleanup trap
Expand Down
2 changes: 1 addition & 1 deletion examples/backends/trtllm/launch/agg_router.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/agg.yaml"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/agg.yaml"}

# Setup cleanup trap
cleanup() {
Expand Down
4 changes: 2 additions & 2 deletions examples/backends/trtllm/launch/disagg.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/decode.yaml"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/decode.yaml"}
export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"}
export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"}
export MODALITY=${MODALITY:-"text"}
Expand Down
Loading
Loading