Skip to content
Open
10 changes: 5 additions & 5 deletions benchmarks/router/run_engines.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
NUM_WORKERS=8
MODEL_PATH="deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
RECIPE_PATH="$DYNAMO_HOME/recipes/deepseek-r1-distill-llama-8b/trtllm"
ENGINE_CONFIG_PATH="$DYNAMO_HOME/examples/backends/trtllm/engine_configs/deepseek-r1-distill-llama-8b"
TENSOR_PARALLEL_SIZE=1
DATA_PARALLEL_SIZE=1
USE_MOCKERS=false
Expand Down Expand Up @@ -86,13 +86,13 @@ if [ ${#EXTRA_ARGS[@]} -eq 0 ]; then
)
elif [ "$USE_TRTLLM" = true ]; then
# Default args for TensorRT-LLM engine using predefined YAML configs
# Config files located at: $RECIPE_PATH/{agg,decode,prefill}.yaml
# Config files located at: $ENGINE_CONFIG_PATH/{agg,decode,prefill}.yaml
if [ "$MODE" = "prefill" ]; then
ENGINE_CONFIG="$RECIPE_PATH/prefill.yaml"
ENGINE_CONFIG="$ENGINE_CONFIG_PATH/prefill.yaml"
elif [ "$MODE" = "decode" ]; then
ENGINE_CONFIG="$RECIPE_PATH/decode.yaml"
ENGINE_CONFIG="$ENGINE_CONFIG_PATH/decode.yaml"
else
ENGINE_CONFIG="$RECIPE_PATH/agg.yaml"
ENGINE_CONFIG="$ENGINE_CONFIG_PATH/agg.yaml"
fi

EXTRA_ARGS=(
Expand Down
2 changes: 1 addition & 1 deletion docs/backends/trtllm/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ cd $DYNAMO_HOME/examples/backends/trtllm
```bash
cd $DYNAMO_HOME/examples/backends/trtllm

export AGG_ENGINE_ARGS=./recipes/deepseek-r1/trtllm/agg/mtp/mtp_agg.yaml
export AGG_ENGINE_ARGS=./engine_configs/deepseek-r1/agg/mtp/mtp_agg.yaml
export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4"
# nvidia/DeepSeek-R1-FP4 is a large model
export MODEL_PATH="nvidia/DeepSeek-R1-FP4"
Expand Down
12 changes: 6 additions & 6 deletions docs/backends/trtllm/gemma3_sliding_window_attention.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ VSWA is a mechanism in which a model’s layers alternate between multiple slidi
cd $DYNAMO_HOME/examples/backends/trtllm
export MODEL_PATH=google/gemma-3-1b-it
export SERVED_MODEL_NAME=$MODEL_PATH
export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml
export AGG_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_agg.yaml
./launch/agg.sh
```

Expand All @@ -39,7 +39,7 @@ export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml
cd $DYNAMO_HOME/examples/backends/trtllm
export MODEL_PATH=google/gemma-3-1b-it
export SERVED_MODEL_NAME=$MODEL_PATH
export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml
export AGG_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_agg.yaml
./launch/agg_router.sh
```

Expand All @@ -48,8 +48,8 @@ export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml
cd $DYNAMO_HOME/examples/backends/trtllm
export MODEL_PATH=google/gemma-3-1b-it
export SERVED_MODEL_NAME=$MODEL_PATH
export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_prefill.yaml
export DECODE_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_decode.yaml
export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_prefill.yaml
export DECODE_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_decode.yaml
./launch/disagg.sh
```

Expand All @@ -58,7 +58,7 @@ export DECODE_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_decode.yaml
cd $DYNAMO_HOME/examples/backends/trtllm
export MODEL_PATH=google/gemma-3-1b-it
export SERVED_MODEL_NAME=$MODEL_PATH
export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_prefill.yaml
export DECODE_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_decode.yaml
export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_prefill.yaml
export DECODE_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_decode.yaml
./launch/disagg_router.sh
```
8 changes: 4 additions & 4 deletions docs/backends/trtllm/gpt-oss.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,14 +90,14 @@ The deployment uses configuration files and command-line arguments to control be

#### Configuration Files

**Prefill Configuration (`recipes/gpt-oss-120b/trtllm/disagg/prefill.yaml`)**:
**Prefill Configuration (`examples/backends/trtllm/engine_configs/gpt-oss-120b/prefill.yaml`)**:
- `enable_attention_dp: false` - Attention data parallelism disabled for prefill
- `enable_chunked_prefill: true` - Enables efficient chunked prefill processing
- `moe_config.backend: CUTLASS` - Uses optimized CUTLASS kernels for MoE layers
- `cache_transceiver_config.backend: ucx` - Uses UCX for efficient KV cache transfer
- `cuda_graph_config.max_batch_size: 32` - Maximum batch size for CUDA graphs

**Decode Configuration (`recipes/gpt-oss-120b/trtllm/disagg/decode.yaml`)**:
**Decode Configuration (`examples/backends/trtllm/engine_configs/gpt-oss-120b/decode.yaml`)**:
- `enable_attention_dp: true` - Attention data parallelism enabled for decode
- `disable_overlap_scheduler: false` - Enables overlapping for decode efficiency
- `moe_config.backend: CUTLASS` - Uses optimized CUTLASS kernels for MoE layers
Expand Down Expand Up @@ -145,7 +145,7 @@ python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 &
CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.trtllm \
--model-path /model \
--served-model-name openai/gpt-oss-120b \
--extra-engine-args recipes/gpt-oss-120b/trtllm/disagg/prefill.yaml \
--extra-engine-args examples/backends/trtllm/engine_configs/gpt-oss-120b/prefill.yaml \
--dyn-reasoning-parser gpt_oss \
--dyn-tool-call-parser harmony \
--disaggregation-mode prefill \
Expand All @@ -161,7 +161,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.trtllm \
CUDA_VISIBLE_DEVICES=4,5,6,7 python3 -m dynamo.trtllm \
--model-path /model \
--served-model-name openai/gpt-oss-120b \
--extra-engine-args recipes/gpt-oss-120b/trtllm/disagg/decode.yaml \
--extra-engine-args examples/backends/trtllm/engine_configs/gpt-oss-120b/decode.yaml \
--dyn-reasoning-parser gpt_oss \
--dyn-tool-call-parser harmony \
--disaggregation-mode decode \
Expand Down
8 changes: 4 additions & 4 deletions docs/backends/trtllm/llama4_plus_eagle.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ This guide demonstrates how to deploy Llama 4 Maverick Instruct with Eagle Specu
- The other node runs the prefill worker.

## Notes
* Make sure the (`eagle3_one_model: true`) is set in the LLM API config inside the `recipes/llama4/trtllm/eagle` folder.
* Make sure `eagle3_one_model: true` is set in the LLM API config inside the `examples/backends/trtllm/engine_configs/llama4/eagle` folder.

## Setup

Expand All @@ -52,17 +52,17 @@ See [this](./multinode/multinode-examples.md#setup) section from multinode guide
## Aggregated Serving
```bash
export NUM_NODES=1
export ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/eagle/eagle_agg.yml"
export ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_agg.yml"
./multinode/srun_aggregated.sh
```

## Disaggregated Serving

```bash
export NUM_PREFILL_NODES=1
export PREFILL_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/eagle/eagle_prefill.yaml"
export PREFILL_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yml"
export NUM_DECODE_NODES=1
export DECODE_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/eagle/eagle_decode.yaml"
export DECODE_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yml"
./multinode/srun_disaggregated.sh
```

Expand Down
6 changes: 3 additions & 3 deletions docs/backends/trtllm/multimodal_support.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ Here are quick steps to launch Llama-4 Maverick BF16 in aggregated mode
```bash
cd $DYNAMO_HOME

export AGG_ENGINE_ARGS=./recipes/llama4/trtllm/multimodal/agg.yaml
export AGG_ENGINE_ARGS=./examples/backends/trtllm/engine_configs/llama4/multimodal/agg.yaml
export SERVED_MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct"
export MODEL_PATH="meta-llama/Llama-4-Maverick-17B-128E-Instruct"
./launch/agg.sh
Expand Down Expand Up @@ -79,8 +79,8 @@ cd $DYNAMO_HOME

export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-VL-7B-Instruct"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen2-VL-7B-Instruct"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"recipes/qwen2-vl-7b-instruct/trtllm/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"recipes/qwen2-vl-7b-instruct/trtllm/decode.yaml"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/decode.yaml"}
export MODALITY=${MODALITY:-"multimodal"}

./launch/disagg.sh
Expand Down
12 changes: 7 additions & 5 deletions docs/backends/trtllm/multinode/multinode-examples.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ limitations under the License.

# Example: Multi-node TRTLLM Workers with Dynamo on Slurm

> **Note:** The scripts referenced in this example (such as `srun_aggregated.sh` and `srun_disaggregated.sh`) can be found in [`examples/basics/multinode/trtllm/`](https://github.com/ai-dynamo/dynamo/tree/main/examples/basics/multinode/trtllm/).

To run a single Dynamo+TRTLLM Worker that spans multiple nodes (ex: TP16),
the set of nodes need to be launched together in the same MPI world, such as
via `mpirun` or `srun`. This is true regardless of whether the worker is
Expand Down Expand Up @@ -106,8 +108,8 @@ export IMAGE="<dynamo_trtllm_image>"
# For example, assuming your cluster had a `/lustre` directory on the host, you
# could add that as a mount like so:
#
# export MOUNTS="${PWD}/../:/mnt,/lustre:/lustre"
export MOUNTS="${PWD}/../:/mnt"
# export MOUNTS="${PWD}/../../../../:/mnt,/lustre:/lustre"
export MOUNTS="${PWD}/../../../../:/mnt"

# NOTE: In general, Deepseek R1 is very large, so it is recommended to
# pre-download the model weights and save them in some shared location,
Expand Down Expand Up @@ -136,7 +138,7 @@ follow these steps below to launch an **aggregated** deployment across 4 nodes:

```bash
# Default set in srun_aggregated.sh, but can customize here.
# export ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/agg/wide_ep/wide_ep_agg.yaml"
# export ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/deepseek-r1/agg/wide_ep/wide_ep_agg.yaml"

# Customize NUM_NODES to match the desired parallelism in ENGINE_CONFIG
# The product of NUM_NODES*NUM_GPUS_PER_NODE should match the number of
Expand Down Expand Up @@ -165,8 +167,8 @@ deployment across 8 nodes:

```bash
# Defaults set in srun_disaggregated.sh, but can customize here.
# export PREFILL_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_prefill.yaml"
# export DECODE_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_decode.yaml"
# export PREFILL_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_prefill.yaml"
# export DECODE_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_decode.yaml"

# Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG
# Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG
Expand Down
14 changes: 8 additions & 6 deletions docs/backends/trtllm/multinode/multinode-multimodal-example.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ limitations under the License.

# Example: Multi-node TRTLLM Workers with Dynamo on Slurm for multimodal models

> **Note:** The scripts referenced in this example (such as `srun_aggregated.sh` and `srun_disaggregated.sh`) can be found in [`examples/basics/multinode/trtllm/`](https://github.com/ai-dynamo/dynamo/tree/main/examples/basics/multinode/trtllm/).

> [!IMPORTANT]
> There are some known issues in tensorrt_llm==1.1.0rc5 version for multinode multimodal support. It is important to rebuild the dynamo container with a specific version of tensorrt_llm commit to use multimodal feature.
>
Expand All @@ -34,7 +36,7 @@ limitations under the License.
>
> Before running the deployment, you must update the engine configuration files to change `backend: DEFAULT` to `backend: default` (lowercase). Run the following command:
> ```bash
> sed -i 's/backend: DEFAULT/backend: default/g' /mnt/recipes/llama4/trtllm/multimodal/prefill.yaml /mnt/recipes/llama4/trtllm/multimodal/decode.yaml
> sed -i 's/backend: DEFAULT/backend: default/g' /mnt/examples/backends/trtllm/engine_configs/llama4/multimodal/prefill.yaml /mnt/examples/backends/trtllm/engine_configs/llama4/multimodal/decode.yaml
> ```


Expand Down Expand Up @@ -71,8 +73,8 @@ export IMAGE="<dynamo_trtllm_image>"
# For example, assuming your cluster had a `/lustre` directory on the host, you
# could add that as a mount like so:
#
# export MOUNTS="${PWD}/../:/mnt,/lustre:/lustre"
export MOUNTS="${PWD}/../:/mnt"
# export MOUNTS="${PWD}/../../../../:/mnt,/lustre:/lustre"
export MOUNTS="${PWD}/../../../../:/mnt"

# Can point to local FS as well
# export MODEL_PATH="/location/to/model"
Expand Down Expand Up @@ -100,8 +102,8 @@ deployment across 4 nodes:

```bash
# Defaults set in srun_disaggregated.sh, but can customize here.
# export PREFILL_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/multimodal/prefill.yaml"
# export DECODE_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/multimodal/decode.yaml"
# export PREFILL_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/multimodal/prefill.yaml"
# export DECODE_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/multimodal/decode.yaml"

# Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG
# Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG
Expand All @@ -123,7 +125,7 @@ deployment across 4 nodes:

## Understanding the Output

1. The `srun_disaggregated.sh` launches three srun jobs instead of two. One for frontend, one for prefill worker, and one for decode worker.
1. The `srun_disaggregated.sh` script launches three srun jobs instead of two: one for the frontend, one for the prefill worker, and one for the decode worker.

2. The OpenAI frontend will listen for and dynamically discover workers as
they register themselves with Dynamo's distributed runtime:
Expand Down
2 changes: 1 addition & 1 deletion docs/kubernetes/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ args:
- python3 -m dynamo.trtllm
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--extra-engine-args /workspace/recipes/deepseek-r1-distill-llama-8b/agg.yaml
--extra-engine-args /workspace/examples/backends/trtllm/engine_configs/deepseek-r1-distill-llama-8b/agg.yaml
```

Key customization points include:
Expand Down
2 changes: 1 addition & 1 deletion examples/backends/trtllm/deploy/agg-with-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -67,4 +67,4 @@ spec:
- --served-model-name
- Qwen/Qwen3-0.6B
- --extra-engine-args
- ./recipes/qwen3/trtllm/agg.yaml
- ./examples/backends/trtllm/engine_configs/qwen3/agg.yaml
2 changes: 1 addition & 1 deletion examples/backends/trtllm/deploy/agg.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,4 @@ spec:
- --served-model-name
- Qwen/Qwen3-0.6B
- --extra-engine-args
- ./recipes/qwen3/trtllm/agg.yaml
- ./examples/backends/trtllm/engine_configs/qwen3/agg.yaml
2 changes: 1 addition & 1 deletion examples/backends/trtllm/deploy/agg_router.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,5 +39,5 @@ spec:
- --served-model-name
- Qwen/Qwen3-0.6B
- --extra-engine-args
- ./recipes/qwen3/trtllm/agg.yaml
- ./examples/backends/trtllm/engine_configs/qwen3/agg.yaml
- --publish-events-and-metrics
4 changes: 2 additions & 2 deletions examples/backends/trtllm/deploy/disagg.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ spec:
- --served-model-name
- Qwen/Qwen3-0.6B
- --extra-engine-args
- ./recipes/qwen3/trtllm/prefill.yaml
- ./examples/backends/trtllm/engine_configs/qwen3/prefill.yaml
- --disaggregation-mode
- prefill
TRTLLMDecodeWorker:
Expand All @@ -63,6 +63,6 @@ spec:
- --served-model-name
- Qwen/Qwen3-0.6B
- --extra-engine-args
- ./recipes/qwen3/trtllm/decode.yaml
- ./examples/backends/trtllm/engine_configs/qwen3/decode.yaml
- --disaggregation-mode
- decode
4 changes: 2 additions & 2 deletions examples/backends/trtllm/deploy/disagg_planner.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ spec:
- --served-model-name
- Qwen/Qwen3-0.6B
- --extra-engine-args
- ./recipes/qwen3/trtllm/decode.yaml
- ./examples/backends/trtllm/engine_configs/qwen3/decode.yaml
- --disaggregation-mode
- decode
TRTLLMPrefillWorker:
Expand All @@ -124,6 +124,6 @@ spec:
- --served-model-name
- Qwen/Qwen3-0.6B
- --extra-engine-args
- ./recipes/qwen3/trtllm/prefill.yaml
- ./examples/backends/trtllm/engine_configs/qwen3/prefill.yaml
- --disaggregation-mode
- prefill
4 changes: 2 additions & 2 deletions examples/backends/trtllm/deploy/disagg_router.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ spec:
- --served-model-name
- Qwen/Qwen3-0.6B
- --extra-engine-args
- ./recipes/qwen3/trtllm/prefill.yaml
- ./examples/backends/trtllm/engine_configs/qwen3/prefill.yaml
- --disaggregation-mode
- prefill
- --publish-events-and-metrics
Expand All @@ -65,6 +65,6 @@ spec:
- --served-model-name
- Qwen/Qwen3-0.6B
- --extra-engine-args
- ./recipes/qwen3/trtllm/decode.yaml
- ./examples/backends/trtllm/engine_configs/qwen3/decode.yaml
- --disaggregation-mode
- decode
43 changes: 43 additions & 0 deletions examples/backends/trtllm/engine_configs/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# TensorRT-LLM Engine Configurations

This directory contains TensorRT-LLM engine configuration files for various model deployments.


## Usage

These YAML configuration files can be passed to TensorRT-LLM workers using the `--extra-engine-args` parameter:

```bash
python3 -m dynamo.trtllm \
--extra-engine-args "${ENGINE_ARGS}" \
...
```

Where `ENGINE_ARGS` points to one of the configuration files in this directory.

## Configuration Types

### Aggregated (agg/)
Configurations in which a single worker handles both prefill and decode operations (the worker itself may span multiple nodes, as with `wide_ep/`):
- **simple/**: Basic aggregated setup
- **mtp/**: Multi-token prediction configurations
- **wide_ep/**: Wide expert parallel configurations

### Disaggregated (disagg/)
Separate configurations for prefill and decode workers:
- **simple/**: Basic prefill/decode split
- **mtp/**: Multi-token prediction with separate prefill/decode
- **wide_ep/**: Wide expert parallel with expert load balancer

## Key Configuration Parameters

- **Parallelism**: `tensor_parallel_size`, `moe_expert_parallel_size`, `pipeline_parallel_size`
- **Memory**: `kv_cache_config.free_gpu_memory_fraction`, `kv_cache_config.dtype`
- **Batching**: `max_batch_size`, `max_num_tokens`, `max_seq_len`
- **Scheduling**: `disable_overlap_scheduler`, `cuda_graph_config`

## Notes

- For disaggregated setups, ensure `kv_cache_config.dtype` matches between prefill and decode configs
- WideEP configurations require an expert load balancer config (`eplb.yaml`)
- Adjust `free_gpu_memory_fraction` based on your workload and attention DP settings
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ moe_config:
# moe_max_num_tokens = max_batch_size * moe_expert_parallel_size
# 4096 = 256 * 16
# moe_max_num_tokens: 4096
load_balancer: /mnt/recipes/deepseek-r1/trtllm/agg/wide_ep/eplb.yaml
load_balancer: /mnt/examples/backends/trtllm/engine_configs/deepseek-r1/agg/wide_ep/eplb.yaml

tensor_parallel_size: 16
moe_expert_parallel_size: 16
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ backend: pytorch
# WideEP related settings
moe_config:
backend: WIDEEP
load_balancer: /mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/eplb.yaml
load_balancer: /mnt/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/eplb.yaml

# TP/EP/PP/DP
tensor_parallel_size: 16
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ backend: pytorch
# WideEP related settings
moe_config:
backend: WIDEEP
load_balancer: /mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/eplb.yaml
load_balancer: /mnt/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/eplb.yaml

# TP/EP/PP/DP
tensor_parallel_size: 16
Expand Down
Empty file.
Loading
Loading