diff --git a/benchmarks/router/run_engines.sh b/benchmarks/router/run_engines.sh index d03fbe26de..e186e6bd31 100755 --- a/benchmarks/router/run_engines.sh +++ b/benchmarks/router/run_engines.sh @@ -7,7 +7,7 @@ export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} NUM_WORKERS=8 MODEL_PATH="deepseek-ai/DeepSeek-R1-Distill-Llama-8B" -RECIPE_PATH="$DYNAMO_HOME/recipes/deepseek-r1-distill-llama-8b/trtllm" +ENGINE_CONFIG_PATH="$DYNAMO_HOME/examples/backends/trtllm/engine_configs/deepseek-r1-distill-llama-8b" TENSOR_PARALLEL_SIZE=1 DATA_PARALLEL_SIZE=1 USE_MOCKERS=false @@ -86,13 +86,13 @@ if [ ${#EXTRA_ARGS[@]} -eq 0 ]; then ) elif [ "$USE_TRTLLM" = true ]; then # Default args for TensorRT-LLM engine using predefined YAML configs - # Config files located at: $RECIPE_PATH/{agg,decode,prefill}.yaml + # Config files located at: $ENGINE_CONFIG_PATH/{agg,decode,prefill}.yaml if [ "$MODE" = "prefill" ]; then - ENGINE_CONFIG="$RECIPE_PATH/prefill.yaml" + ENGINE_CONFIG="$ENGINE_CONFIG_PATH/prefill.yaml" elif [ "$MODE" = "decode" ]; then - ENGINE_CONFIG="$RECIPE_PATH/decode.yaml" + ENGINE_CONFIG="$ENGINE_CONFIG_PATH/decode.yaml" else - ENGINE_CONFIG="$RECIPE_PATH/agg.yaml" + ENGINE_CONFIG="$ENGINE_CONFIG_PATH/agg.yaml" fi EXTRA_ARGS=( diff --git a/docs/backends/trtllm/README.md b/docs/backends/trtllm/README.md index 517fadd3f4..56c83c69b3 100644 --- a/docs/backends/trtllm/README.md +++ b/docs/backends/trtllm/README.md @@ -158,7 +158,7 @@ cd $DYNAMO_HOME/examples/backends/trtllm ```bash cd $DYNAMO_HOME/examples/backends/trtllm -export AGG_ENGINE_ARGS=./recipes/deepseek-r1/trtllm/agg/mtp/mtp_agg.yaml +export AGG_ENGINE_ARGS=./engine_configs/deepseek-r1/agg/mtp/mtp_agg.yaml export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4" # nvidia/DeepSeek-R1-FP4 is a large model export MODEL_PATH="nvidia/DeepSeek-R1-FP4" diff --git a/docs/backends/trtllm/gemma3_sliding_window_attention.md b/docs/backends/trtllm/gemma3_sliding_window_attention.md index 9898e25f8d..aeccf070c4 100644 --- a/docs/backends/trtllm/gemma3_sliding_window_attention.md +++ b/docs/backends/trtllm/gemma3_sliding_window_attention.md @@ -30,7 +30,7 @@ VSWA is a mechanism in which a model’s layers alternate between multiple slidi cd $DYNAMO_HOME/examples/backends/trtllm export MODEL_PATH=google/gemma-3-1b-it export SERVED_MODEL_NAME=$MODEL_PATH -export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml +export AGG_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_agg.yaml ./launch/agg.sh ``` @@ -39,7 +39,7 @@ export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml cd $DYNAMO_HOME/examples/backends/trtllm export MODEL_PATH=google/gemma-3-1b-it export SERVED_MODEL_NAME=$MODEL_PATH -export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml +export AGG_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_agg.yaml ./launch/agg_router.sh ``` @@ -48,8 +48,8 @@ export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml cd $DYNAMO_HOME/examples/backends/trtllm export MODEL_PATH=google/gemma-3-1b-it export SERVED_MODEL_NAME=$MODEL_PATH -export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_prefill.yaml -export DECODE_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_decode.yaml +export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_prefill.yaml +export DECODE_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_decode.yaml ./launch/disagg.sh ``` @@ -58,7 +58,7 @@ export 
DECODE_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_decode.yaml cd $DYNAMO_HOME/examples/backends/trtllm export MODEL_PATH=google/gemma-3-1b-it export SERVED_MODEL_NAME=$MODEL_PATH -export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_prefill.yaml -export DECODE_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_decode.yaml +export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_prefill.yaml +export DECODE_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_decode.yaml ./launch/disagg_router.sh ``` diff --git a/docs/backends/trtllm/gpt-oss.md b/docs/backends/trtllm/gpt-oss.md index 6f7cec195d..124f170f8e 100644 --- a/docs/backends/trtllm/gpt-oss.md +++ b/docs/backends/trtllm/gpt-oss.md @@ -90,14 +90,14 @@ The deployment uses configuration files and command-line arguments to control be #### Configuration Files -**Prefill Configuration (`recipes/gpt-oss-120b/trtllm/disagg/prefill.yaml`)**: +**Prefill Configuration (`examples/backends/trtllm/engine_configs/gpt-oss-120b/prefill.yaml`)**: - `enable_attention_dp: false` - Attention data parallelism disabled for prefill - `enable_chunked_prefill: true` - Enables efficient chunked prefill processing - `moe_config.backend: CUTLASS` - Uses optimized CUTLASS kernels for MoE layers - `cache_transceiver_config.backend: ucx` - Uses UCX for efficient KV cache transfer - `cuda_graph_config.max_batch_size: 32` - Maximum batch size for CUDA graphs -**Decode Configuration (`recipes/gpt-oss-120b/trtllm/disagg/decode.yaml`)**: +**Decode Configuration (`examples/backends/trtllm/engine_configs/gpt-oss-120b/decode.yaml`)**: - `enable_attention_dp: true` - Attention data parallelism enabled for decode - `disable_overlap_scheduler: false` - Enables overlapping for decode efficiency - `moe_config.backend: CUTLASS` - Uses optimized CUTLASS kernels for MoE layers @@ -145,7 +145,7 @@ python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 & CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.trtllm \ --model-path /model \ --served-model-name openai/gpt-oss-120b \ - --extra-engine-args recipes/gpt-oss-120b/trtllm/disagg/prefill.yaml \ + --extra-engine-args examples/backends/trtllm/engine_configs/gpt-oss-120b/prefill.yaml \ --dyn-reasoning-parser gpt_oss \ --dyn-tool-call-parser harmony \ --disaggregation-mode prefill \ @@ -161,7 +161,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.trtllm \ CUDA_VISIBLE_DEVICES=4,5,6,7 python3 -m dynamo.trtllm \ --model-path /model \ --served-model-name openai/gpt-oss-120b \ - --extra-engine-args recipes/gpt-oss-120b/trtllm/disagg/decode.yaml \ + --extra-engine-args examples/backends/trtllm/engine_configs/gpt-oss-120b/decode.yaml \ --dyn-reasoning-parser gpt_oss \ --dyn-tool-call-parser harmony \ --disaggregation-mode decode \ diff --git a/docs/backends/trtllm/llama4_plus_eagle.md b/docs/backends/trtllm/llama4_plus_eagle.md index 2be8ba6509..2cd59a2e5b 100644 --- a/docs/backends/trtllm/llama4_plus_eagle.md +++ b/docs/backends/trtllm/llama4_plus_eagle.md @@ -28,7 +28,7 @@ This guide demonstrates how to deploy Llama 4 Maverick Instruct with Eagle Specu - The other node runs the prefill worker. ## Notes -* Make sure the (`eagle3_one_model: true`) is set in the LLM API config inside the `recipes/llama4/trtllm/eagle` folder. +* Make sure `eagle3_one_model: true` is set in the LLM API config inside the `examples/backends/trtllm/engine_configs/llama4/eagle` folder.
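A quick way to verify that flag before launching is to grep the Eagle configs directly. A minimal sketch, assuming the `engine_configs` layout introduced in this change:

```bash
# Sketch: confirm eagle3_one_model is set in each Eagle config file.
grep -n "eagle3_one_model" \
  examples/backends/trtllm/engine_configs/llama4/eagle/eagle_agg.yml \
  examples/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yaml \
  examples/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yaml
```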
## Setup @@ -52,7 +52,7 @@ See [this](./multinode/multinode-examples.md#setup) section from multinode guide ## Aggregated Serving ```bash export NUM_NODES=1 -export ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/eagle/eagle_agg.yml" +export ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_agg.yml" ./multinode/srun_aggregated.sh ``` @@ -60,9 +60,9 @@ export ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/eagle/eagle_agg.yml" ```bash export NUM_PREFILL_NODES=1 -export PREFILL_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/eagle/eagle_prefill.yaml" +export PREFILL_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yaml" export NUM_DECODE_NODES=1 -export DECODE_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/eagle/eagle_decode.yaml" +export DECODE_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yaml" ./multinode/srun_disaggregated.sh ``` diff --git a/docs/backends/trtllm/multimodal_support.md b/docs/backends/trtllm/multimodal_support.md index 1bc80d7132..7f90874be7 100644 --- a/docs/backends/trtllm/multimodal_support.md +++ b/docs/backends/trtllm/multimodal_support.md @@ -27,7 +27,7 @@ Here are quick steps to launch Llama-4 Maverick BF16 in aggregated mode ```bash cd $DYNAMO_HOME -export AGG_ENGINE_ARGS=./recipes/llama4/trtllm/multimodal/agg.yaml +export AGG_ENGINE_ARGS=./examples/backends/trtllm/engine_configs/llama4/multimodal/agg.yaml export SERVED_MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct" export MODEL_PATH="meta-llama/Llama-4-Maverick-17B-128E-Instruct" ./launch/agg.sh @@ -79,8 +79,8 @@ cd $DYNAMO_HOME export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-VL-7B-Instruct"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen2-VL-7B-Instruct"} -export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"recipes/qwen2-vl-7b-instruct/trtllm/prefill.yaml"} -export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"recipes/qwen2-vl-7b-instruct/trtllm/decode.yaml"} +export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/prefill.yaml"} +export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/decode.yaml"} export MODALITY=${MODALITY:-"multimodal"} ./launch/disagg.sh diff --git a/docs/backends/trtllm/multinode/multinode-examples.md b/docs/backends/trtllm/multinode/multinode-examples.md index c7b18594bc..a37ead4c2d 100644 --- a/docs/backends/trtllm/multinode/multinode-examples.md +++ b/docs/backends/trtllm/multinode/multinode-examples.md @@ -17,6 +17,8 @@ limitations under the License. # Example: Multi-node TRTLLM Workers with Dynamo on Slurm +> **Note:** The scripts referenced in this example (such as `srun_aggregated.sh` and `srun_disaggregated.sh`) can be found in [`examples/basics/multinode/trtllm/`](https://github.com/ai-dynamo/dynamo/tree/main/examples/basics/multinode/trtllm/). + To run a single Dynamo+TRTLLM Worker that spans multiple nodes (ex: TP16), the set of nodes need to be launched together in the same MPI world, such as via `mpirun` or `srun`.
This is true regardless of whether the worker is @@ -106,8 +108,8 @@ export IMAGE="" # For example, assuming your cluster had a `/lustre` directory on the host, you # could add that as a mount like so: # -# export MOUNTS="${PWD}/../:/mnt,/lustre:/lustre" -export MOUNTS="${PWD}/../:/mnt" +# export MOUNTS="${PWD}/../../../../:/mnt,/lustre:/lustre" +export MOUNTS="${PWD}/../../../../:/mnt" # NOTE: In general, Deepseek R1 is very large, so it is recommended to # pre-download the model weights and save them in some shared location, @@ -136,7 +138,7 @@ follow these steps below to launch an **aggregated** deployment across 4 nodes: ```bash # Default set in srun_aggregated.sh, but can customize here. -# export ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/agg/wide_ep/wide_ep_agg.yaml" +# export ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/deepseek-r1/agg/wide_ep/wide_ep_agg.yaml" # Customize NUM_NODES to match the desired parallelism in ENGINE_CONFIG # The product of NUM_NODES*NUM_GPUS_PER_NODE should match the number of @@ -165,8 +167,8 @@ deployment across 8 nodes: ```bash # Defaults set in srun_disaggregated.sh, but can customize here. -# export PREFILL_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_prefill.yaml" -# export DECODE_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_decode.yaml" +# export PREFILL_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_prefill.yaml" +# export DECODE_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_decode.yaml" # Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG # Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG diff --git a/docs/backends/trtllm/multinode/multinode-multimodal-example.md b/docs/backends/trtllm/multinode/multinode-multimodal-example.md index e6ba318364..7295c5dfac 100644 --- a/docs/backends/trtllm/multinode/multinode-multimodal-example.md +++ b/docs/backends/trtllm/multinode/multinode-multimodal-example.md @@ -17,6 +17,8 @@ limitations under the License. # Example: Multi-node TRTLLM Workers with Dynamo on Slurm for multimodal models +> **Note:** The scripts referenced in this example (such as `srun_aggregated.sh` and `srun_disaggregated.sh`) can be found in [`examples/basics/multinode/trtllm/`](https://github.com/ai-dynamo/dynamo/tree/main/examples/basics/multinode/trtllm/). + > [!IMPORTANT] > There are some known issues in tensorrt_llm==1.1.0rc5 version for multinode multimodal support. It is important to rebuild the dynamo container with a specific version of tensorrt_llm commit to use multimodal feature. > @@ -34,7 +36,7 @@ limitations under the License. > > Before running the deployment, you must update the engine configuration files to change `backend: DEFAULT` to `backend: default` (lowercase). 
Run the following command: > ```bash -> sed -i 's/backend: DEFAULT/backend: default/g' /mnt/recipes/llama4/trtllm/multimodal/prefill.yaml /mnt/recipes/llama4/trtllm/multimodal/decode.yaml +> sed -i 's/backend: DEFAULT/backend: default/g' /mnt/examples/backends/trtllm/engine_configs/llama4/multimodal/prefill.yaml /mnt/examples/backends/trtllm/engine_configs/llama4/multimodal/decode.yaml > ``` @@ -71,8 +73,8 @@ export IMAGE="" # For example, assuming your cluster had a `/lustre` directory on the host, you # could add that as a mount like so: # -# export MOUNTS="${PWD}/../:/mnt,/lustre:/lustre" -export MOUNTS="${PWD}/../:/mnt" +# export MOUNTS="${PWD}/../../../../:/mnt,/lustre:/lustre" +export MOUNTS="${PWD}/../../../../:/mnt" # Can point to local FS as weel # export MODEL_PATH="/location/to/model" @@ -100,8 +102,8 @@ deployment across 4 nodes: ```bash # Defaults set in srun_disaggregated.sh, but can customize here. -# export PREFILL_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/multimodal/prefill.yaml" -# export DECODE_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/multimodal/decode.yaml" +# export PREFILL_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/multimodal/prefill.yaml" +# export DECODE_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/multimodal/decode.yaml" # Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG # Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG @@ -123,7 +125,7 @@ deployment across 4 nodes: ## Understanding the Output -1. The `srun_disaggregated.sh` launches three srun jobs instead of two. One for frontend, one for prefill worker, and one for decode worker. +1. The `srun_disaggregated.sh` launches three srun jobs instead of two. One for frontend, one for prefill worker, and one for decode worker. 2. 
The OpenAI frontend will listen for and dynamically discover workers as they register themselves with Dynamo's distributed runtime: diff --git a/docs/kubernetes/README.md b/docs/kubernetes/README.md index 6ffdd17a72..2505d8ceec 100644 --- a/docs/kubernetes/README.md +++ b/docs/kubernetes/README.md @@ -203,7 +203,7 @@ args: - python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B - --extra-engine-args /workspace/recipes/deepseek-r1-distill-llama-8b/agg.yaml + --extra-engine-args /workspace/examples/backends/trtllm/engine_configs/deepseek-r1-distill-llama-8b/agg.yaml ``` Key customization points include: diff --git a/examples/backends/trtllm/deploy/agg-with-config.yaml b/examples/backends/trtllm/deploy/agg-with-config.yaml index d18d1b0fb2..839df47cf0 100644 --- a/examples/backends/trtllm/deploy/agg-with-config.yaml +++ b/examples/backends/trtllm/deploy/agg-with-config.yaml @@ -67,4 +67,4 @@ spec: - --served-model-name - Qwen/Qwen3-0.6B - --extra-engine-args - - ./recipes/qwen3/trtllm/agg.yaml + - ./examples/backends/trtllm/engine_configs/qwen3/agg.yaml diff --git a/examples/backends/trtllm/deploy/agg.yaml b/examples/backends/trtllm/deploy/agg.yaml index 54412576a2..c66825f118 100644 --- a/examples/backends/trtllm/deploy/agg.yaml +++ b/examples/backends/trtllm/deploy/agg.yaml @@ -36,4 +36,4 @@ spec: - --served-model-name - Qwen/Qwen3-0.6B - --extra-engine-args - - ./recipes/qwen3/trtllm/agg.yaml + - ./examples/backends/trtllm/engine_configs/qwen3/agg.yaml diff --git a/examples/backends/trtllm/deploy/agg_router.yaml b/examples/backends/trtllm/deploy/agg_router.yaml index ed42129fb4..1696ab55a9 100644 --- a/examples/backends/trtllm/deploy/agg_router.yaml +++ b/examples/backends/trtllm/deploy/agg_router.yaml @@ -39,5 +39,5 @@ spec: - --served-model-name - Qwen/Qwen3-0.6B - --extra-engine-args - - ./recipes/qwen3/trtllm/agg.yaml + - ./examples/backends/trtllm/engine_configs/qwen3/agg.yaml - --publish-events-and-metrics diff --git a/examples/backends/trtllm/deploy/disagg.yaml b/examples/backends/trtllm/deploy/disagg.yaml index 12c0c0ae3e..7f5585e4be 100644 --- a/examples/backends/trtllm/deploy/disagg.yaml +++ b/examples/backends/trtllm/deploy/disagg.yaml @@ -37,7 +37,7 @@ spec: - --served-model-name - Qwen/Qwen3-0.6B - --extra-engine-args - - ./recipes/qwen3/trtllm/prefill.yaml + - ./examples/backends/trtllm/engine_configs/qwen3/prefill.yaml - --disaggregation-mode - prefill TRTLLMDecodeWorker: @@ -63,6 +63,6 @@ spec: - --served-model-name - Qwen/Qwen3-0.6B - --extra-engine-args - - ./recipes/qwen3/trtllm/decode.yaml + - ./examples/backends/trtllm/engine_configs/qwen3/decode.yaml - --disaggregation-mode - decode diff --git a/examples/backends/trtllm/deploy/disagg_planner.yaml b/examples/backends/trtllm/deploy/disagg_planner.yaml index 4bb5501eab..8f877402a0 100644 --- a/examples/backends/trtllm/deploy/disagg_planner.yaml +++ b/examples/backends/trtllm/deploy/disagg_planner.yaml @@ -97,7 +97,7 @@ spec: - --served-model-name - Qwen/Qwen3-0.6B - --extra-engine-args - - ./recipes/qwen3/trtllm/decode.yaml + - ./examples/backends/trtllm/engine_configs/qwen3/decode.yaml - --disaggregation-mode - decode TRTLLMPrefillWorker: @@ -124,6 +124,6 @@ spec: - --served-model-name - Qwen/Qwen3-0.6B - --extra-engine-args - - ./recipes/qwen3/trtllm/prefill.yaml + - ./examples/backends/trtllm/engine_configs/qwen3/prefill.yaml - --disaggregation-mode - prefill diff --git a/examples/backends/trtllm/deploy/disagg_router.yaml 
b/examples/backends/trtllm/deploy/disagg_router.yaml index f07e56a335..aa6db5056b 100644 --- a/examples/backends/trtllm/deploy/disagg_router.yaml +++ b/examples/backends/trtllm/deploy/disagg_router.yaml @@ -39,7 +39,7 @@ spec: - --served-model-name - Qwen/Qwen3-0.6B - --extra-engine-args - - ./recipes/qwen3/trtllm/prefill.yaml + - ./examples/backends/trtllm/engine_configs/qwen3/prefill.yaml - --disaggregation-mode - prefill - --publish-events-and-metrics @@ -65,6 +65,6 @@ spec: - --served-model-name - Qwen/Qwen3-0.6B - --extra-engine-args - - ./recipes/qwen3/trtllm/decode.yaml + - ./examples/backends/trtllm/engine_configs/qwen3/decode.yaml - --disaggregation-mode - decode diff --git a/examples/backends/trtllm/engine_configs/README.md b/examples/backends/trtllm/engine_configs/README.md new file mode 100644 index 0000000000..03790575a4 --- /dev/null +++ b/examples/backends/trtllm/engine_configs/README.md @@ -0,0 +1,43 @@ +# TensorRT-LLM Engine Configurations + +This directory contains TensorRT-LLM engine configuration files for various model deployments. + + +## Usage + +These YAML configuration files can be passed to TensorRT-LLM workers using the `--extra-engine-args` parameter: + +```bash +python3 -m dynamo.trtllm \ + --extra-engine-args "${ENGINE_ARGS}" \ + ... +``` + +Where `ENGINE_ARGS` points to one of the configuration files in this directory. + +## Configuration Types + +### Aggregated (agg/) +Single-node configurations that combine prefill and decode operations: +- **simple/**: Basic aggregated setup +- **mtp/**: Multi-token prediction configurations +- **wide_ep/**: Wide expert parallel configurations + +### Disaggregated (disagg/) +Separate configurations for prefill and decode workers: +- **simple/**: Basic prefill/decode split +- **mtp/**: Multi-token prediction with separate prefill/decode +- **wide_ep/**: Wide expert parallel with expert load balancer + +## Key Configuration Parameters + +- **Parallelism**: `tensor_parallel_size`, `moe_expert_parallel_size`, `pipeline_parallel_size` +- **Memory**: `kv_cache_config.free_gpu_memory_fraction`, `kv_cache_config.dtype` +- **Batching**: `max_batch_size`, `max_num_tokens`, `max_seq_len` +- **Scheduling**: `disable_overlap_scheduler`, `cuda_graph_config` + +## Notes + +- For disaggregated setups, ensure `kv_cache_config.dtype` matches between prefill and decode configs +- WideEP configurations require an expert load balancer config (`eplb.yaml`) +- Adjust `free_gpu_memory_fraction` based on your workload and attention DP settings diff --git a/recipes/deepseek-r1-distill-llama-8b/trtllm/agg.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1-distill-llama-8b/agg.yaml similarity index 100% rename from recipes/deepseek-r1-distill-llama-8b/trtllm/agg.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1-distill-llama-8b/agg.yaml diff --git a/recipes/deepseek-r1-distill-llama-8b/trtllm/decode.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1-distill-llama-8b/decode.yaml similarity index 100% rename from recipes/deepseek-r1-distill-llama-8b/trtllm/decode.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1-distill-llama-8b/decode.yaml diff --git a/recipes/deepseek-r1-distill-llama-8b/trtllm/prefill.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1-distill-llama-8b/prefill.yaml similarity index 100% rename from recipes/deepseek-r1-distill-llama-8b/trtllm/prefill.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1-distill-llama-8b/prefill.yaml diff --git 
a/recipes/deepseek-r1/trtllm/agg/mtp/mtp_agg.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1/agg/mtp/mtp_agg.yaml similarity index 100% rename from recipes/deepseek-r1/trtllm/agg/mtp/mtp_agg.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1/agg/mtp/mtp_agg.yaml diff --git a/recipes/deepseek-r1/trtllm/agg/simple/agg.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1/agg/simple/agg.yaml similarity index 100% rename from recipes/deepseek-r1/trtllm/agg/simple/agg.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1/agg/simple/agg.yaml diff --git a/recipes/deepseek-r1/trtllm/agg/wide_ep/dep16_agg.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1/agg/wide_ep/dep16_agg.yaml similarity index 100% rename from recipes/deepseek-r1/trtllm/agg/wide_ep/dep16_agg.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1/agg/wide_ep/dep16_agg.yaml diff --git a/recipes/deepseek-r1/trtllm/agg/wide_ep/eplb.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1/agg/wide_ep/eplb.yaml similarity index 100% rename from recipes/deepseek-r1/trtllm/agg/wide_ep/eplb.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1/agg/wide_ep/eplb.yaml diff --git a/recipes/deepseek-r1/trtllm/agg/wide_ep/wide_ep_agg.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1/agg/wide_ep/wide_ep_agg.yaml similarity index 89% rename from recipes/deepseek-r1/trtllm/agg/wide_ep/wide_ep_agg.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1/agg/wide_ep/wide_ep_agg.yaml index bcd6ae87e0..0d645c412a 100644 --- a/recipes/deepseek-r1/trtllm/agg/wide_ep/wide_ep_agg.yaml +++ b/examples/backends/trtllm/engine_configs/deepseek-r1/agg/wide_ep/wide_ep_agg.yaml @@ -11,7 +11,7 @@ moe_config: # moe_max_num_tokens = max_batch_size * moe_expert_parallel_size # 4096 = 256 * 16 # moe_max_num_tokens: 4096 - load_balancer: /mnt/recipes/deepseek-r1/trtllm/agg/wide_ep/eplb.yaml + load_balancer: /mnt/examples/backends/trtllm/engine_configs/deepseek-r1/agg/wide_ep/eplb.yaml tensor_parallel_size: 16 moe_expert_parallel_size: 16 diff --git a/recipes/deepseek-r1/trtllm/disagg/mtp/mtp_decode.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/mtp/mtp_decode.yaml similarity index 100% rename from recipes/deepseek-r1/trtllm/disagg/mtp/mtp_decode.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1/disagg/mtp/mtp_decode.yaml diff --git a/recipes/deepseek-r1/trtllm/disagg/mtp/mtp_prefill.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/mtp/mtp_prefill.yaml similarity index 100% rename from recipes/deepseek-r1/trtllm/disagg/mtp/mtp_prefill.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1/disagg/mtp/mtp_prefill.yaml diff --git a/recipes/deepseek-r1/trtllm/disagg/simple/decode.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/simple/decode.yaml similarity index 100% rename from recipes/deepseek-r1/trtllm/disagg/simple/decode.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1/disagg/simple/decode.yaml diff --git a/recipes/deepseek-r1/trtllm/disagg/simple/prefill.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/simple/prefill.yaml similarity index 100% rename from recipes/deepseek-r1/trtllm/disagg/simple/prefill.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1/disagg/simple/prefill.yaml diff --git a/recipes/deepseek-r1/trtllm/disagg/wide_ep/eplb.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/eplb.yaml similarity 
index 100% rename from recipes/deepseek-r1/trtllm/disagg/wide_ep/eplb.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/eplb.yaml diff --git a/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_decode.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_decode.yaml similarity index 95% rename from recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_decode.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_decode.yaml index 39d392afe9..8b5814c023 100644 --- a/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_decode.yaml +++ b/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_decode.yaml @@ -17,7 +17,7 @@ backend: pytorch # WideEP related settings moe_config: backend: WIDEEP - load_balancer: /mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/eplb.yaml + load_balancer: /mnt/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/eplb.yaml # TP/EP/PP/DP tensor_parallel_size: 16 diff --git a/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_prefill.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_prefill.yaml similarity index 93% rename from recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_prefill.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_prefill.yaml index 56e862a855..9f707f0129 100644 --- a/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_prefill.yaml +++ b/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_prefill.yaml @@ -17,7 +17,7 @@ backend: pytorch # WideEP related settings moe_config: backend: WIDEEP - load_balancer: /mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/eplb.yaml + load_balancer: /mnt/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/eplb.yaml # TP/EP/PP/DP tensor_parallel_size: 16 diff --git a/recipes/gemma3/trtllm/vswa_agg.yaml b/examples/backends/trtllm/engine_configs/gemma3/vswa_agg.yaml similarity index 100% rename from recipes/gemma3/trtllm/vswa_agg.yaml rename to examples/backends/trtllm/engine_configs/gemma3/vswa_agg.yaml diff --git a/recipes/gemma3/trtllm/vswa_decode.yaml b/examples/backends/trtllm/engine_configs/gemma3/vswa_decode.yaml similarity index 100% rename from recipes/gemma3/trtllm/vswa_decode.yaml rename to examples/backends/trtllm/engine_configs/gemma3/vswa_decode.yaml diff --git a/recipes/gemma3/trtllm/vswa_prefill.yaml b/examples/backends/trtllm/engine_configs/gemma3/vswa_prefill.yaml similarity index 100% rename from recipes/gemma3/trtllm/vswa_prefill.yaml rename to examples/backends/trtllm/engine_configs/gemma3/vswa_prefill.yaml diff --git a/examples/backends/trtllm/engine_configs/gpt-oss-120b/agg.yaml b/examples/backends/trtllm/engine_configs/gpt-oss-120b/agg.yaml new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/backends/trtllm/engine_configs/gpt-oss-120b/decode.yaml b/examples/backends/trtllm/engine_configs/gpt-oss-120b/decode.yaml new file mode 100644 index 0000000000..1ba9844545 --- /dev/null +++ b/examples/backends/trtllm/engine_configs/gpt-oss-120b/decode.yaml @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +enable_attention_dp: true +disable_overlap_scheduler: false +moe_config: + backend: CUTLASS +cuda_graph_config: + enable_padding: true +cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 65536 +print_iter_log: false +stream_interval: 10 diff --git a/examples/backends/trtllm/engine_configs/gpt-oss-120b/prefill.yaml b/examples/backends/trtllm/engine_configs/gpt-oss-120b/prefill.yaml new file mode 100644 index 0000000000..87bab09fd4 --- /dev/null +++ b/examples/backends/trtllm/engine_configs/gpt-oss-120b/prefill.yaml @@ -0,0 +1,27 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +enable_attention_dp: false +disable_overlap_scheduler: true +moe_config: + backend: CUTLASS +enable_chunked_prefill: true +cuda_graph_config: + max_batch_size: 32 + enable_padding: true +cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 65536 +print_iter_log: false +stream_interval: 10 diff --git a/recipes/llama4/trtllm/eagle/eagle_agg.yml b/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_agg.yml similarity index 100% rename from recipes/llama4/trtllm/eagle/eagle_agg.yml rename to examples/backends/trtllm/engine_configs/llama4/eagle/eagle_agg.yml diff --git a/recipes/llama4/trtllm/eagle/eagle_decode.yaml b/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yaml similarity index 100% rename from recipes/llama4/trtllm/eagle/eagle_decode.yaml rename to examples/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yaml diff --git a/recipes/llama4/trtllm/eagle/eagle_prefill.yaml b/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yaml similarity index 100% rename from recipes/llama4/trtllm/eagle/eagle_prefill.yaml rename to examples/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yaml diff --git a/recipes/llama4/trtllm/multimodal/agg.yaml b/examples/backends/trtllm/engine_configs/llama4/multimodal/agg.yaml similarity index 100% rename from recipes/llama4/trtllm/multimodal/agg.yaml rename to examples/backends/trtllm/engine_configs/llama4/multimodal/agg.yaml diff --git a/recipes/llama4/trtllm/multimodal/decode.yaml b/examples/backends/trtllm/engine_configs/llama4/multimodal/decode.yaml similarity index 100% rename from recipes/llama4/trtllm/multimodal/decode.yaml rename to examples/backends/trtllm/engine_configs/llama4/multimodal/decode.yaml diff --git a/recipes/llama4/trtllm/multimodal/prefill.yaml b/examples/backends/trtllm/engine_configs/llama4/multimodal/prefill.yaml similarity index 100% 
rename from recipes/llama4/trtllm/multimodal/prefill.yaml rename to examples/backends/trtllm/engine_configs/llama4/multimodal/prefill.yaml diff --git a/recipes/qwen2-vl-7b-instruct/trtllm/agg.yaml b/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/agg.yaml similarity index 100% rename from recipes/qwen2-vl-7b-instruct/trtllm/agg.yaml rename to examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/agg.yaml diff --git a/recipes/qwen2-vl-7b-instruct/trtllm/decode.yaml b/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/decode.yaml similarity index 100% rename from recipes/qwen2-vl-7b-instruct/trtllm/decode.yaml rename to examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/decode.yaml diff --git a/recipes/qwen2-vl-7b-instruct/trtllm/encode.yaml b/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/encode.yaml similarity index 100% rename from recipes/qwen2-vl-7b-instruct/trtllm/encode.yaml rename to examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/encode.yaml diff --git a/recipes/qwen2-vl-7b-instruct/trtllm/prefill.yaml b/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/prefill.yaml similarity index 100% rename from recipes/qwen2-vl-7b-instruct/trtllm/prefill.yaml rename to examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/prefill.yaml diff --git a/recipes/qwen3/trtllm/agg.yaml b/examples/backends/trtllm/engine_configs/qwen3/agg.yaml similarity index 100% rename from recipes/qwen3/trtllm/agg.yaml rename to examples/backends/trtllm/engine_configs/qwen3/agg.yaml diff --git a/recipes/qwen3/trtllm/decode.yaml b/examples/backends/trtllm/engine_configs/qwen3/decode.yaml similarity index 100% rename from recipes/qwen3/trtllm/decode.yaml rename to examples/backends/trtllm/engine_configs/qwen3/decode.yaml diff --git a/recipes/qwen3/trtllm/prefill.yaml b/examples/backends/trtllm/engine_configs/qwen3/prefill.yaml similarity index 100% rename from recipes/qwen3/trtllm/prefill.yaml rename to examples/backends/trtllm/engine_configs/qwen3/prefill.yaml diff --git a/examples/backends/trtllm/launch/agg.sh b/examples/backends/trtllm/launch/agg.sh index f141531d7d..56a842eb52 100755 --- a/examples/backends/trtllm/launch/agg.sh +++ b/examples/backends/trtllm/launch/agg.sh @@ -6,7 +6,7 @@ export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"} -export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/agg.yaml"} +export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/agg.yaml"} export MODALITY=${MODALITY:-"text"} # If you want to use multimodal, set MODALITY to "multimodal" #export MODALITY=${MODALITY:-"multimodal"} diff --git a/examples/backends/trtllm/launch/agg_metrics.sh b/examples/backends/trtllm/launch/agg_metrics.sh index ad01482a8c..64d27dd2cf 100755 --- a/examples/backends/trtllm/launch/agg_metrics.sh +++ b/examples/backends/trtllm/launch/agg_metrics.sh @@ -6,7 +6,7 @@ export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"} -export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/agg.yaml"} +export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/agg.yaml"} export MODALITY=${MODALITY:-"text"} # Setup cleanup trap diff --git a/examples/backends/trtllm/launch/agg_router.sh 
b/examples/backends/trtllm/launch/agg_router.sh index bb69762735..1b0568535a 100755 --- a/examples/backends/trtllm/launch/agg_router.sh +++ b/examples/backends/trtllm/launch/agg_router.sh @@ -6,7 +6,7 @@ export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"} -export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/agg.yaml"} +export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/agg.yaml"} # Setup cleanup trap cleanup() { diff --git a/examples/backends/trtllm/launch/disagg.sh b/examples/backends/trtllm/launch/disagg.sh index 695fd94779..7f75ee908e 100755 --- a/examples/backends/trtllm/launch/disagg.sh +++ b/examples/backends/trtllm/launch/disagg.sh @@ -6,8 +6,8 @@ export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"} -export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/prefill.yaml"} -export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/decode.yaml"} +export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/prefill.yaml"} +export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/decode.yaml"} export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"} export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"} export MODALITY=${MODALITY:-"text"} diff --git a/examples/backends/trtllm/launch/disagg_router.sh b/examples/backends/trtllm/launch/disagg_router.sh index b8f8bbf5cb..1b005a44ae 100755 --- a/examples/backends/trtllm/launch/disagg_router.sh +++ b/examples/backends/trtllm/launch/disagg_router.sh @@ -6,8 +6,8 @@ export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"} -export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/prefill.yaml"} -export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/decode.yaml"} +export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/prefill.yaml"} +export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/decode.yaml"} export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"} export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"} diff --git a/examples/backends/trtllm/launch/disagg_same_gpu.sh b/examples/backends/trtllm/launch/disagg_same_gpu.sh index 695b32b637..5975b20856 100755 --- a/examples/backends/trtllm/launch/disagg_same_gpu.sh +++ b/examples/backends/trtllm/launch/disagg_same_gpu.sh @@ -32,8 +32,8 @@ echo "GPU memory check passed: ${FREE_GPU_GB}GB available (required: ${REQUIRED_ export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"} -export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/tests/serve/configs/trtllm/prefill.yaml"} -export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/tests/serve/configs/trtllm/decode.yaml"} +export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/tests/serve/trtllm/engine_configs/qwen3/prefill.yaml"} +export 
DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/tests/serve/trtllm/engine_configs/qwen3/decode.yaml"} export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-"0"} export MODALITY=${MODALITY:-"text"} diff --git a/examples/backends/trtllm/launch/epd_disagg.sh b/examples/backends/trtllm/launch/epd_disagg.sh index c52d57ce0c..a2843d28ba 100755 --- a/examples/backends/trtllm/launch/epd_disagg.sh +++ b/examples/backends/trtllm/launch/epd_disagg.sh @@ -6,9 +6,9 @@ export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-VL-7B-Instruct"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen2-VL-7B-Instruct"} -export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen2-vl-7b-instruct/trtllm/prefill.yaml"} -export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen2-vl-7b-instruct/trtllm/decode.yaml"} -export ENCODE_ENGINE_ARGS=${ENCODE_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen2-vl-7b-instruct/trtllm/encode.yaml"} +export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/prefill.yaml"} +export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/decode.yaml"} +export ENCODE_ENGINE_ARGS=${ENCODE_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/encode.yaml"} export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"} export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"} export ENCODE_CUDA_VISIBLE_DEVICES=${ENCODE_CUDA_VISIBLE_DEVICES:-"2"} diff --git a/examples/backends/trtllm/launch/gpt_oss_disagg.sh b/examples/backends/trtllm/launch/gpt_oss_disagg.sh index 9ada0c76ef..bbe560b231 100755 --- a/examples/backends/trtllm/launch/gpt_oss_disagg.sh +++ b/examples/backends/trtllm/launch/gpt_oss_disagg.sh @@ -6,8 +6,8 @@ export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export MODEL_PATH=${MODEL_PATH:-"/model"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"openai/gpt-oss-120b"} -export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/gpt-oss-120b/trtllm/disagg/prefill.yaml"} -export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/gpt-oss-120b/trtllm/disagg/decode.yaml"} +export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gpt-oss-120b/prefill.yaml"} +export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gpt-oss-120b/decode.yaml"} set -e trap 'echo Cleaning up...; kill 0' EXIT diff --git a/examples/basics/multinode/trtllm/README.md b/examples/basics/multinode/trtllm/README.md new file mode 100644 index 0000000000..a36abc833d --- /dev/null +++ b/examples/basics/multinode/trtllm/README.md @@ -0,0 +1,20 @@ + + +# Example: Multi-node TRTLLM Workers with Dynamo on Slurm + +See [here](/docs/backends/trtllm/multinode) for how to set up this example.
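Because the launch scripts above read their engine-config paths from environment variables with the defaults shown, pointing a worker at a different config requires no script edits. A minimal sketch, assuming the default `DYNAMO_HOME=/workspace` layout and the qwen3 configs relocated by this change:

```bash
# Sketch: override the default engine configs before invoking a launch script.
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export PREFILL_ENGINE_ARGS="$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/prefill.yaml"
export DECODE_ENGINE_ARGS="$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/decode.yaml"
cd $DYNAMO_HOME/examples/backends/trtllm
./launch/disagg.sh
```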
diff --git a/examples/basics/multinode/trtllm/srun_aggregated.sh b/examples/basics/multinode/trtllm/srun_aggregated.sh index 654c8ef691..c3d9792b45 100755 --- a/examples/basics/multinode/trtllm/srun_aggregated.sh +++ b/examples/basics/multinode/trtllm/srun_aggregated.sh @@ -18,7 +18,7 @@ MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}" NUM_NODES=${NUM_NODES:-4} NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4} -export ENGINE_CONFIG="${ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/agg/wide_ep/wide_ep_agg.yaml}" +export ENGINE_CONFIG="${ENGINE_CONFIG:-/mnt/examples/backends/trtllm/engine_configs/deepseek-r1/agg/wide_ep/wide_ep_agg.yaml}" # Automate settings of certain variables for convenience, but you are free # to manually set these for more control as well. diff --git a/examples/basics/multinode/trtllm/srun_disaggregated.sh b/examples/basics/multinode/trtllm/srun_disaggregated.sh index 13f66b14b0..219108529a 100755 --- a/examples/basics/multinode/trtllm/srun_disaggregated.sh +++ b/examples/basics/multinode/trtllm/srun_disaggregated.sh @@ -17,11 +17,11 @@ NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4} NUM_PREFILL_NODES=${NUM_PREFILL_NODES:-4} NUM_PREFILL_WORKERS=${NUM_PREFILL_WORKERS:-1} -PREFILL_ENGINE_CONFIG="${PREFILL_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_prefill.yaml}" +PREFILL_ENGINE_CONFIG="${PREFILL_ENGINE_CONFIG:-/mnt/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_prefill.yaml}" NUM_DECODE_NODES=${NUM_DECODE_NODES:-4} NUM_DECODE_WORKERS=${NUM_DECODE_WORKERS:-1} -DECODE_ENGINE_CONFIG="${DECODE_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_decode.yaml}" +DECODE_ENGINE_CONFIG="${DECODE_ENGINE_CONFIG:-/mnt/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_decode.yaml}" # Automate settings of certain variables for convenience, but you are free # to manually set these for more control as well. diff --git a/recipes/README.md b/recipes/README.md index 236a38a71a..9aaf95a3c2 100644 --- a/recipes/README.md +++ b/recipes/README.md @@ -1,297 +1,283 @@ -# Dynamo Model Serving Recipes +# Dynamo Production-Ready Recipes -This repository contains production-ready recipes for deploying large language models using the Dynamo platform. Each recipe includes deployment configurations, performance benchmarking, and model caching setup. +Production-tested Kubernetes deployment recipes for LLM inference using NVIDIA Dynamo. -## Contents -- [Available Models](#available-models) -- [Quick Start](#quick-start) -- [Prerequisites](#prerequisites) -- Deployment Methods - - [Option 1: Automated Deployment](#option-1-automated-deployment) - - [Option 2: Manual Deployment](#option-2-manual-deployment) +> **Prerequisites:** This guide assumes you have already installed the Dynamo Kubernetes Platform. +> If not, follow the **[Kubernetes Deployment Guide](../docs/kubernetes/README.md)** first. 
+## Available Recipes -## Available Models - -| Model Family | Framework | Deployment Mode | GPU Requirements | Status | Benchmark |GAIE-integration | -|-----------------|-----------|---------------------|------------------|--------|-----------|------------------| -| llama-3-70b | vllm | agg | 4x H100/H200 | ✅ | ✅ |✅ | -| llama-3-70b | vllm | disagg (1 node) | 8x H100/H200 | ✅ | ✅ | 🚧 | -| llama-3-70b | vllm | disagg (multi-node) | 16x H100/H200 | ✅ | ✅ |🚧 | -| deepseek-r1 | sglang | disagg (1 node, wide-ep) | 8x H200 | ✅ | 🚧 |🚧 | -| deepseek-r1 | sglang | disagg (multi-node, wide-ep) | 16x H200 | ✅ | 🚧 |🚧 | -| gpt-oss-120b | trtllm | agg | 4x GB200 | ✅ | ✅ |🚧 | +| Model | Framework | Mode | GPUs | Deployment | Benchmark Recipe | Notes | GAIE integration | +|-------|-----------|------|------|------------|------------------|-------|------------------| +| **[Llama-3-70B](llama-3-70b/vllm/agg/)** | vLLM | Aggregated | 4x H100/H200 | ✅ | ✅ | FP8 dynamic quantization | ✅ | +| **[Llama-3-70B](llama-3-70b/vllm/disagg-single-node/)** | vLLM | Disagg (Single-Node) | 8x H100/H200 | ✅ | ✅ | Prefill + Decode separation | +| **[Llama-3-70B](llama-3-70b/vllm/disagg-multi-node/)** | vLLM | Disagg (Multi-Node) | 16x H100/H200 | ✅ | ✅ | 2 nodes, 8 GPUs each | +| **[Qwen3-32B-FP8](qwen3-32b-fp8/trtllm/agg/)** | TensorRT-LLM | Aggregated | 4x GPU | ✅ | ✅ | FP8 quantization | +| **[Qwen3-32B-FP8](qwen3-32b-fp8/trtllm/disagg/)** | TensorRT-LLM | Disaggregated | 8x GPU | ✅ | ✅ | Prefill + Decode separation | +| **[GPT-OSS-120B](gpt-oss-120b/trtllm/agg/)** | TensorRT-LLM | Aggregated | 4x GB200 | ✅ | ✅ | Blackwell only, WideEP | +| **[GPT-OSS-120B](gpt-oss-120b/trtllm/disagg/)** | TensorRT-LLM | Disaggregated | TBD | ❌ | ❌ | Engine configs only, no K8s manifest | +| **[DeepSeek-R1](deepseek-r1/sglang/disagg-8gpu/)** | SGLang | Disagg WideEP | 8x H200 | ✅ | ❌ | Benchmark recipe pending | +| **[DeepSeek-R1](deepseek-r1/sglang/disagg-16gpu/)** | SGLang | Disagg WideEP | 16x H200 | ✅ | ❌ | Benchmark recipe pending | +| **[DeepSeek-R1](deepseek-r1/trtllm/disagg/wide_ep/gb200/)** | TensorRT-LLM | Disagg WideEP (GB200) | 32+4 GB200 | ✅ | ✅ | Multi-node: 8 decode + 1 prefill nodes | **Legend:** -- ✅ Functional -- 🚧 Under development +- **Deployment**: ✅ = Complete `deploy.yaml` manifest available | ❌ = Missing or incomplete +- **Benchmark Recipe**: ✅ = Includes `perf.yaml` for running AIPerf benchmarks | ❌ = No benchmark recipe provided + +## Recipe Structure +Each complete recipe follows this standard structure: -**Recipe Directory Structure:** -Recipes are organized into a directory structure that follows the pattern: -```text +``` / +├── README.md (optional) # Model-specific deployment notes ├── model-cache/ -│ ├── model-cache.yaml # PVC for model cache -│ └── model-download.yaml # Job for model download -├── / -│ └── / -│ ├── deploy.yaml # DynamoGraphDeployment CRD and optional configmap for custom configuration -│ └── perf.yaml (optional) # Performance benchmark -└── README.md (optional) # Model documentation +│ ├── model-cache.yaml # PersistentVolumeClaim for model storage +│ └── model-download.yaml # Job to download model from HuggingFace +└── / # vllm, sglang, or trtllm + └── / # agg, disagg, disagg-single-node, etc. + ├── deploy.yaml # Complete DynamoGraphDeployment manifest + └── perf.yaml (optional) # AIPerf benchmark job ``` ## Quick Start -Follow the instructions in the [Prerequisites](#prerequisites) section to set up your environment. 
+### Prerequisites -Choose your preferred deployment method: using the `run.sh` script or manual deployment steps. +**1. Dynamo Platform Installed** +The recipes require the Dynamo Kubernetes Platform to be installed. Follow the installation guide: -## Prerequisites +- **[Kubernetes Deployment Guide](../docs/kubernetes/README.md)** - Quickstart (~10 minutes) +- **[Detailed Installation Guide](../docs/kubernetes/installation_guide.md)** - Advanced options -### 1. Environment Setup +**2. GPU Cluster Requirements** + +Ensure your cluster has: +- GPU nodes matching recipe requirements (see table above) +- GPU operator installed +- Appropriate GPU drivers and container runtime -Create a Kubernetes namespace and set environment variable: +**3. HuggingFace Access** + +Configure authentication to download models: ```bash export NAMESPACE=your-namespace kubectl create namespace ${NAMESPACE} -``` - -### 2. Deploy Dynamo Platform - -Install the Dynamo Cloud Platform following the [Quickstart Guide](../docs/kubernetes/README.md). -### 3. GPU Cluster - -Ensure your Kubernetes cluster has: -- GPU nodes with appropriate GPU types (see model requirements above) -- GPU operator installed -- Sufficient GPU memory and compute resources - -### 4. Container Registry Access - -Ensure access to NVIDIA container registry for runtime images: -- `nvcr.io/nvidia/ai-dynamo/vllm-runtime:x.y.z` -- `nvcr.io/nvidia/ai-dynamo/trtllm-runtime:x.y.z` -- `nvcr.io/nvidia/ai-dynamo/sglang-runtime:x.y.z` +# Create HuggingFace token secret +kubectl create secret generic hf-token-secret \ + --from-literal=HF_TOKEN="your-token-here" \ + -n ${NAMESPACE} +``` -### 5. HuggingFace Access and Kubernetes Secret Creation +**4. Storage Configuration** -Set up a kubernetes secret with the HuggingFace token for model download: +Update the `storageClassName` in `/model-cache/model-cache.yaml` to match your cluster: ```bash -# Update the token in the secret file -vim hf_hub_secret/hf_hub_secret.yaml +# Find your storage class name +kubectl get storageclass -# Apply the secret -kubectl apply -f hf_hub_secret/hf_hub_secret.yaml -n ${NAMESPACE} +# Edit the model-cache.yaml file and update: +# spec: +# storageClassName: "your-actual-storage-class" ``` -6. Configure Storage Class +### Deploy a Recipe + +**Step 1: Download Model** ```bash -# Check available storage classes -kubectl get storageclass -``` +# Update storageClassName in model-cache.yaml first! 
+kubectl apply -f /model-cache/ -n ${NAMESPACE} + +# Wait for download to complete (may take 10-60 minutes depending on model size) +kubectl wait --for=condition=Complete job/model-download -n ${NAMESPACE} --timeout=6000s + +# Monitor progress +kubectl logs -f job/model-download -n ${NAMESPACE} +``` + +**Step 2: Deploy Service** + +```bash +kubectl apply -f ///deploy.yaml -n ${NAMESPACE} + +# Check deployment status +kubectl get dynamographdeployment -n ${NAMESPACE} + +# Check pod status +kubectl get pods -n ${NAMESPACE} + +# Wait for pods to be ready +kubectl wait --for=condition=ready pod -l nvidia.com/dynamo-graph-deployment-name= -n ${NAMESPACE} --timeout=600s +``` + +**Step 3: Test Deployment** + +```bash +# Port forward to access the service locally +kubectl port-forward svc/-frontend 8000:8000 -n ${NAMESPACE} + +# In another terminal, test the endpoint +curl http://localhost:8000/v1/models + +# Send a test request +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "", + "messages": [{"role": "user", "content": "Hello!"}], + "max_tokens": 50 + }' +``` + +**Step 4: Run Benchmark (Optional)** + +```bash +# Only if perf.yaml exists in the recipe directory +kubectl apply -f ///perf.yaml -n ${NAMESPACE} + +# Monitor benchmark progress +kubectl logs -f job/ -n ${NAMESPACE} + +# View results after completion +kubectl logs job/ -n ${NAMESPACE} | tail -50 +``` + +**Step 5: GAIE Integration (Optional)** + +For Llama-3-70B with vLLM (Aggregated), an example of integration with the Inference Gateway is provided. -## If deploying with Gateway API Inference extension GAIE -1.
Follow [Deploy Inference Gateway Section 2](../deploy/inference-gateway/README.md#2-deploy-inference-gateway) to install GAIE. -2. Apply manifests by running a script. +Follow [Deploy Inference Gateway Section 2](../deploy/inference-gateway/README.md#2-deploy-inference-gateway) to install GAIE. Then apply the manifests. ```bash -# Match the block size to the cli value in your deployment file deploy.yaml: - "python3 -m dynamo.vllm ... --block-size 128" -export DYNAMO_KV_BLOCK_SIZE=128 -export EPP_IMAGE=nvcr.io/you/epp:tag -# Add --gaie argument to the script i.e.: -./run.sh --model llama-3-70b --framework vllm --gaie agg +export DEPLOY_PATH=llama-3-70b/vllm/agg/ +#DEPLOY_PATH=/// +kubectl apply -R -f "$DEPLOY_PATH/gaie/k8s-manifests" -n "$NAMESPACE" ``` -The script will perform gateway checks and apply the manifests. -## Option 2: Manual Deployment +## Example Deployments -For step-by-step manual deployment follow these steps : +### Llama-3-70B with vLLM (Aggregated) ```bash -# 0. Set up environment (see Prerequisites section) -export NAMESPACE=your-namespace +export NAMESPACE=dynamo-demo kubectl create namespace ${NAMESPACE} -kubectl apply -f hf_hub_secret/hf_hub_secret.yaml -n ${NAMESPACE} +# Create HF token secret +kubectl create secret generic hf-token-secret \ + --from-literal=HF_TOKEN="your-token" \ + -n ${NAMESPACE} -# 1. Download model (see Model Download section) -kubectl apply -n $NAMESPACE -f /model-cache/ +# Deploy +kubectl apply -f llama-3-70b/model-cache/ -n ${NAMESPACE} +kubectl wait --for=condition=Complete job/model-download -n ${NAMESPACE} --timeout=6000s +kubectl apply -f llama-3-70b/vllm/agg/deploy.yaml -n ${NAMESPACE} -# 2. Deploy model (see Deployment section) -kubectl apply -n $NAMESPACE -f ///deploy.yaml +# Test +kubectl port-forward svc/llama3-70b-agg-frontend 8000:8000 -n ${NAMESPACE} -# 3. Run benchmarks (optional, if perf.yaml exists) -kubectl apply -n $NAMESPACE -f ///perf.yaml ``` -### Step 1: Download Model +### DeepSeek-R1 on GB200 (Multi-node) -```bash -# Start the download job -kubectl apply -n $NAMESPACE -f /model-cache - -# Verify job creation -kubectl get jobs -n $NAMESPACE | grep model-download -``` +See [deepseek-r1/trtllm/disagg/wide_ep/gb200/deploy.yaml](deepseek-r1/trtllm/disagg/wide_ep/gb200/deploy.yaml) for the complete multi-node WideEP configuration.
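To round out the Llama-3-70B example above, a smoke test against the forwarded frontend might look like the following sketch. The model name passed to the API must match the `--served-model-name` configured in the recipe's `deploy.yaml`; `meta-llama/Llama-3-70B-Instruct` here is an assumption, not a value confirmed by the recipe:

```bash
# Sketch: verify the frontend lists the model, then send one short request.
curl -s http://localhost:8000/v1/models

curl -s http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "meta-llama/Llama-3-70B-Instruct",
    "messages": [{"role": "user", "content": "Hello!"}],
    "max_tokens": 32
  }'
```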
-Monitor and wait for the model download to complete:
-
-```bash
+## Customization
-# Wait for job completion (timeout after 100 minutes)
-kubectl wait --for=condition=Complete job/model-download -n $NAMESPACE --timeout=6000s
+Each `deploy.yaml` contains:
+- **ConfigMap**: Engine-specific configuration (embedded in the manifest)
+- **DynamoGraphDeployment**: Kubernetes resource definitions
+- **Resource limits**: GPU count, memory, CPU requests/limits
+- **Image references**: Container images with version tags
-# Check job status
-kubectl get job model-download -n $NAMESPACE
+### Key Customization Points
-# View download logs
-kubectl logs job/model-download -n $NAMESPACE
+**Model Configuration:**
+```yaml
+# In deploy.yaml under worker args:
+args:
+  - python3 -m dynamo.vllm --model <model-path> --served-model-name <served-model-name>
 ```
-### Step 2: Deploy Model Service
-
-```bash
-# Navigate to the specific deployment configuration
-cd <model>/<framework>/<deployment>/
-
-# Deploy the model service
-kubectl apply -n $NAMESPACE -f deploy.yaml
-
-# Verify deployment creation
-kubectl get deployments -n $NAMESPACE
+**GPU Resources:**
+```yaml
+resources:
+  limits:
+    gpu: "4"  # Adjust based on your requirements
+  requests:
+    gpu: "4"
 ```
-#### Wait for Deployment Ready
-
-```bash
-# Get deployment name from the deploy.yaml file
-DEPLOYMENT_NAME=$(grep "name:" deploy.yaml | head -1 | awk '{print $2}')
-
-# Wait for deployment to be ready (timeout after 10 minutes)
-kubectl wait --for=condition=available deployment/$DEPLOYMENT_NAME -n $NAMESPACE --timeout=1200s
-
-# Check deployment status
-kubectl get deployment $DEPLOYMENT_NAME -n $NAMESPACE
-
-# Check pod status
-kubectl get pods -n $NAMESPACE -l app=$DEPLOYMENT_NAME
+**Scaling:**
+```yaml
+services:
+  VllmDecodeWorker:
+    replicas: 2  # Scale to multiple workers
 ```
-#### Verify Model Service
-
-```bash
-# Check if service is running
-kubectl get services -n $NAMESPACE
-
-# Test model endpoint (port-forward to test locally)
-kubectl port-forward service/${DEPLOYMENT_NAME}-frontend 8000:8000 -n $NAMESPACE
-
-# Test the model API (in another terminal)
-curl http://localhost:8000/v1/models
+**Router Mode:**
+```yaml
+# In Frontend args:
+args:
+  - python3 -m dynamo.frontend --router-mode kv --http-port 8000
+# Options: round-robin, kv (KV-aware routing)
+```
-# Stop port-forward when done
-pkill -f "kubectl port-forward"
+**Container Images:**
+```yaml
+image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:x.y.z
+# Update version tag as needed
 ```
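+
+After editing any of these fields, the change can be previewed before it is re-applied. A minimal sketch (the recipe path is a placeholder):
+
+```bash
+# Show what would change on the cluster without applying anything
+kubectl diff -f <model>/<framework>/<deployment>/deploy.yaml -n ${NAMESPACE}
+
+# Re-apply once the diff looks right
+kubectl apply -f <model>/<framework>/<deployment>/deploy.yaml -n ${NAMESPACE}
+```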
-### Step 3: Performance Benchmarking (Optional)
+## Troubleshooting
-Run performance benchmarks to evaluate model performance. Note that benchmarking is only available for models that include a `perf.yaml` file (optional):
+### Common Issues
-#### Launch Benchmark Job
+**Pods stuck in Pending:**
+- Check GPU availability: `kubectl describe node <node-name>`
+- Verify storage class exists: `kubectl get storageclass`
+- Check resource requests vs. available resources
-```bash
-# From the deployment directory
-kubectl apply -n $NAMESPACE -f perf.yaml
+**Model download fails:**
+- Verify HuggingFace token is correct
+- Check network connectivity from cluster
+- Review job logs: `kubectl logs job/model-download -n ${NAMESPACE}`
-# Verify benchmark job creation
-kubectl get jobs -n $NAMESPACE
-```
+**Workers fail to start:**
+- Check GPU compatibility (driver version, CUDA version)
+- Verify image pull secrets if using private registries
+- Review pod logs: `kubectl logs <pod-name> -n ${NAMESPACE}`
-#### Monitor Benchmark Progress
+**For more troubleshooting:**
+- [Kubernetes Deployment Guide](../docs/kubernetes/README.md#troubleshooting)
+- [Observability Documentation](../docs/kubernetes/observability/)
-```bash
-# Get benchmark job name
-PERF_JOB_NAME=$(grep "name:" perf.yaml | head -1 | awk '{print $2}')
+## Related Documentation
-# Monitor benchmark logs in real-time
-kubectl logs -f job/$PERF_JOB_NAME -n $NAMESPACE
+- **[Kubernetes Deployment Guide](../docs/kubernetes/README.md)** - Platform installation and concepts
+- **[API Reference](../docs/kubernetes/api_reference.md)** - DynamoGraphDeployment CRD specification
+- **[vLLM Backend Guide](../docs/backends/vllm/README.md)** - vLLM-specific features
+- **[SGLang Backend Guide](../docs/backends/sglang/README.md)** - SGLang-specific features
+- **[TensorRT-LLM Backend Guide](../docs/backends/trtllm/README.md)** - TensorRT-LLM features
+- **[Observability](../docs/kubernetes/observability/)** - Monitoring and logging
+- **[Benchmarking Guide](../docs/benchmarks/benchmarking.md)** - Performance testing
-# Wait for benchmark completion (timeout after 100 minutes)
-kubectl wait --for=condition=Complete job/$PERF_JOB_NAME -n $NAMESPACE --timeout=6000s
-```
+## Contributing
-#### View Benchmark Results
+We welcome contributions of new recipes! See [CONTRIBUTING.md](CONTRIBUTING.md) for:
+- Recipe submission guidelines
+- Required components checklist
+- Testing and validation requirements
+- Documentation standards
-```bash
-# Check final benchmark results
-kubectl logs job/$PERF_JOB_NAME -n $NAMESPACE | tail -50
-```
\ No newline at end of file
+### Recipe Quality Standards
+
+A production-ready recipe must include:
+- ✅ Complete `deploy.yaml` with DynamoGraphDeployment
+- ✅ Model cache PVC and download job
+- ✅ Benchmark recipe (`perf.yaml`) for performance testing
+- ✅ Verification on target hardware
+- ✅ Documentation of GPU requirements
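+
+Manifests can also be validated against a live cluster without creating anything, which is a useful pre-submission check. A minimal sketch (the recipe path is a placeholder):
+
+```bash
+# Server-side dry run: the API server validates the manifest but persists nothing
+kubectl apply --dry-run=server -f <model>/<framework>/<deployment>/deploy.yaml -n ${NAMESPACE}
+```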
diff --git a/recipes/gpt-oss-120b/trtllm/disagg/README.md b/recipes/gpt-oss-120b/trtllm/disagg/README.md
new file mode 100644
index 0000000000..10390c9587
--- /dev/null
+++ b/recipes/gpt-oss-120b/trtllm/disagg/README.md
@@ -0,0 +1,25 @@
+# GPT-OSS-120B Disaggregated Mode
+
+> **⚠️ INCOMPLETE**: This directory contains only engine configuration files and is not ready for Kubernetes deployment.
+
+## Current Status
+
+This directory contains TensorRT-LLM engine configurations for disaggregated serving:
+- `decode.yaml` - Decode worker engine configuration
+- `prefill.yaml` - Prefill worker engine configuration
+
+## Missing Components
+
+To complete this recipe, the following files are needed:
+- `deploy.yaml` - Kubernetes DynamoGraphDeployment manifest
+- `perf.yaml` - Performance benchmarking job (optional)
+
+## Alternative
+
+For a production-ready GPT-OSS-120B deployment, use the **aggregated mode**:
+- [gpt-oss-120b/trtllm/agg/](../agg/) - Complete with `deploy.yaml` and `perf.yaml`
+
+## Contributing
+
+If you'd like to complete this recipe, see [recipes/CONTRIBUTING.md](../../../CONTRIBUTING.md) for guidelines on creating proper Kubernetes deployment manifests.
+
diff --git a/recipes/run.sh b/recipes/run.sh
deleted file mode 100755
index 980c9333b6..0000000000
--- a/recipes/run.sh
+++ /dev/null
@@ -1,261 +0,0 @@
-#!/usr/bin/env bash
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -euo pipefail
-IFS=$'\n\t'
-
-RECIPES_DIR="$( cd "$( dirname "$0" )" && pwd )"
-# Default values
-NAMESPACE="${NAMESPACE:-dynamo}"
-DEPLOY_TYPE=""
-GAIE="${GAIE:-false}"
-DEPLOYMENT=""
-MODEL=""
-FRAMEWORK=""
-DRY_RUN=""
-
-# Frameworks - following container/build.sh pattern
-declare -A FRAMEWORKS=(["VLLM"]=1 ["TRTLLM"]=2 ["SGLANG"]=3)
-DEFAULT_FRAMEWORK=VLLM
-
-# Function to show usage
-usage() {
-    echo "Usage: $0 [OPTIONS] --model <model> --framework <framework> --deployment <deployment>"
-    echo ""
-    echo "Required Options:"
-    echo "  --model <model>            Model name (e.g., llama-3-70b)"
-    echo "  --framework <framework>    Framework one of ${!FRAMEWORKS[*]} (default: ${DEFAULT_FRAMEWORK})"
-    echo "  --deployment <deployment>  Deployment type (e.g., agg, disagg etc, please refer to the README.md for available deployment types)"
-    echo ""
-    echo "Optional:"
-    echo "  --namespace <namespace>    Kubernetes namespace (default: dynamo)"
-    echo "  --dry-run                  Print commands without executing them"
-    echo "  --gaie[=true|false]        Enable GAIE integration subfolder (applies GAIE manifests, skips benchmark) (default: ${GAIE})"
-    echo "  -h, --help                 Show this help message"
-    echo ""
-    echo "Environment Variables:"
-    echo "  NAMESPACE      Kubernetes namespace (default: dynamo)"
-    echo ""
-    echo "Examples:"
-    echo "  $0 --model llama-3-70b --framework vllm --deployment agg"
-    echo "  $0 --model llama-3-70b --framework trtllm --deployment disagg-single-node"
-    echo "  $0 --namespace my-ns --model llama-3-70b --framework vllm --deployment disagg-multi-node"
-    exit 1
-}
-
-missing_requirement() {
-    echo "ERROR: $1 requires an argument."
-    usage
-}
-
-error() {
-    printf '%s %s\n' "$1" "$2" >&2
-    exit 1
-}
-
-while [[ $# -gt 0 ]]; do
-    case $1 in
-    --dry-run)
-        DRY_RUN="echo"
-        shift
-        ;;
-    --model)
-        if [ "$2" ]; then
-            MODEL=$2
-            shift 2
-        else
-            missing_requirement "$1"
-        fi
-        ;;
-    --framework)
-        if [ "$2" ]; then
-            FRAMEWORK=$2
-            shift 2
-        else
-            missing_requirement "$1"
-        fi
-        ;;
-    --deployment)
-        if [ "$2" ]; then
-            DEPLOYMENT=$2
-            shift 2
-        else
-            missing_requirement "$1"
-        fi
-        ;;
-    --namespace)
-        if [ "$2" ]; then
-            NAMESPACE=$2
-            shift 2
-        else
-            missing_requirement "$1"
-        fi
-        ;;
-    --gaie)
-        GAIE=true
-        shift
-        ;;
-    --gaie=false)
-        GAIE=false
-        shift
-        ;;
-    --gaie=*)
-        GAIE="${1#*=}"
-        case "${GAIE,,}" in
-            true|false) GAIE="${GAIE,,}";;
-            *) echo "ERROR: --gaie must be true or false"; exit 1;;
-        esac
-        shift
-        ;;
-    -h|--help)
-        usage
-        ;;
-    -*)
-        error 'ERROR: Unknown option: ' "$1"
-        ;;
-    *)
-        error "ERROR: Unknown argument: " "$1"
-        ;;
-    esac
-done
-
-if [ -z "$FRAMEWORK" ]; then
-    FRAMEWORK=$DEFAULT_FRAMEWORK
-fi
-
-if [ -n "$FRAMEWORK" ]; then
-    FRAMEWORK=${FRAMEWORK^^}
-    if [[ -z "${FRAMEWORKS[$FRAMEWORK]}" ]]; then
-        error 'ERROR: Unknown framework: ' "$FRAMEWORK"
-    fi
-fi
-
-# Validate required arguments
-if [[ -z "$MODEL" ]] || [[ -z "$DEPLOYMENT" ]]; then
-    if [[ -z "$MODEL" ]]; then
-        echo "ERROR: --model argument is required"
-    fi
-    if [[ -z "$DEPLOYMENT" ]]; then
-        echo "ERROR: --deployment argument is required"
-    fi
-    echo ""
-    usage
-fi
-
-# Construct paths based on new structure: recipes/<model>/<framework>/<deployment>/
-MODEL_DIR="$RECIPES_DIR/$MODEL"
-FRAMEWORK_DIR="$MODEL_DIR/${FRAMEWORK,,}"
-DEPLOY_PATH="$FRAMEWORK_DIR/$DEPLOYMENT"
-INTEGRATION="$([[ "${GAIE,,}" == "true" ]] && echo gaie || echo "")"
-
-# Check if model directory exists
-if [[ ! -d "$MODEL_DIR" ]]; then
-    echo "Error: Model directory '$MODEL' does not exist in $RECIPES_DIR"
-    echo "Available models:"
-    ls -1 "$RECIPES_DIR" | grep -v "\.sh$\|\.md$\|model-cache$" | sed 's/^/  /'
-    exit 1
-fi
-
-# Check if framework directory exists
-if [[ ! -d "$FRAMEWORK_DIR" ]]; then
-    echo "Error: Framework directory '${FRAMEWORK,,}' does not exist in $MODEL_DIR"
-    echo "Available frameworks for $MODEL:"
-    ls -1 "$MODEL_DIR" | grep -v "\.sh$\|\.md$" | sed 's/^/  /'
-    exit 1
-fi
-
-# Check if deployment directory exists
-if [[ ! -d "$DEPLOY_PATH" ]]; then
-    echo "Error: Deployment type '$DEPLOYMENT' does not exist in $FRAMEWORK_DIR"
-    echo "Available deployment types for $MODEL/${FRAMEWORK,,}:"
-    ls -1 "$FRAMEWORK_DIR" | grep -v "\.sh$\|\.md$" | sed 's/^/  /'
-    exit 1
-fi
-
-# Check if deployment files exist
-DEPLOY_FILE="$DEPLOY_PATH/deploy.yaml"
-PERF_FILE="$DEPLOY_PATH/perf.yaml"
-
-if [[ ! -f "$DEPLOY_FILE" ]]; then
-    echo "Error: Deployment file '$DEPLOY_FILE' not found"
-    exit 1
-fi
-
-# Check if perf file exists (optional)
-PERF_AVAILABLE=false
-if [[ -f "$PERF_FILE" ]]; then
-    PERF_AVAILABLE=true
-    echo "Performance benchmark file found: $PERF_FILE"
-else
-    echo "Performance benchmark file not found: $PERF_FILE (skipping benchmarks)"
-fi
-
-# Show deployment information
-echo "======================================"
-echo "Dynamo Recipe Deployment"
-echo "======================================"
-echo "Model: $MODEL"
-echo "Framework: ${FRAMEWORK,,}"
-echo "Deployment Type: $DEPLOYMENT"
-echo "Namespace: $NAMESPACE"
-echo "GAIE integration: $GAIE"
-echo "======================================"
-
-# Handle model downloading
-MODEL_CACHE_DIR="$MODEL_DIR/model-cache"
-echo "Creating PVC for model cache and downloading model..."
-$DRY_RUN kubectl apply -n $NAMESPACE -f $MODEL_CACHE_DIR/model-cache.yaml
-$DRY_RUN kubectl apply -n $NAMESPACE -f $MODEL_CACHE_DIR/model-download.yaml
-
-# Wait for the model download to complete
-MODEL_DOWNLOAD_JOB_NAME=$(grep "name:" $MODEL_CACHE_DIR/model-download.yaml | head -1 | awk '{print $2}')
-echo "Waiting for job '$MODEL_DOWNLOAD_JOB_NAME' to complete..."
-$DRY_RUN kubectl wait --for=condition=Complete job/$MODEL_DOWNLOAD_JOB_NAME -n $NAMESPACE --timeout=6000s
-
-# Deploy the specified configuration
-echo "Deploying $MODEL ${FRAMEWORK,,} $DEPLOYMENT configuration..."
-$DRY_RUN kubectl apply -n $NAMESPACE -f $DEPLOY_FILE
-
-if [[ "$INTEGRATION" == "gaie" ]]; then
-    # run gaie checks.
-    SCRIPT_DIR="$(cd -- "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-    "${SCRIPT_DIR}/gaie_checks.sh"
-    kubectl apply -f "$DEPLOY_PATH/gaie/k8s-manifests" -n "$NAMESPACE"
-    # For now do not run the benchmark
-    exit
-fi
-
-# Launch the benchmark job (if available)
-if [[ "$PERF_AVAILABLE" == "true" ]]; then
-    echo "Launching benchmark job..."
-    $DRY_RUN kubectl apply -n $NAMESPACE -f $PERF_FILE
-
-    # Construct job name from the perf file
-    JOB_NAME=$(grep "name:" $PERF_FILE | head -1 | awk '{print $2}')
-    echo "Waiting for job '$JOB_NAME' to complete..."
-    $DRY_RUN kubectl wait --for=condition=Complete job/$JOB_NAME -n $NAMESPACE --timeout=6000s
-
-    # Print logs from the benchmark job
-    echo "======================================"
-    echo "Benchmark completed. Logs:"
-    echo "======================================"
-    $DRY_RUN kubectl logs job/$JOB_NAME -n $NAMESPACE
-else
-    echo "======================================"
-    echo "Deployment completed successfully!"
-    echo "No performance benchmark available for this configuration."
-    echo "======================================"
-fi
\ No newline at end of file
diff --git a/tests/serve/configs/trtllm/agg.yaml b/tests/serve/trtllm/engine_configs/qwen3/agg.yaml
similarity index 100%
rename from tests/serve/configs/trtllm/agg.yaml
rename to tests/serve/trtllm/engine_configs/qwen3/agg.yaml
diff --git a/tests/serve/configs/trtllm/decode.yaml b/tests/serve/trtllm/engine_configs/qwen3/decode.yaml
similarity index 100%
rename from tests/serve/configs/trtllm/decode.yaml
rename to tests/serve/trtllm/engine_configs/qwen3/decode.yaml
diff --git a/tests/serve/configs/trtllm/prefill.yaml b/tests/serve/trtllm/engine_configs/qwen3/prefill.yaml
similarity index 100%
rename from tests/serve/configs/trtllm/prefill.yaml
rename to tests/serve/trtllm/engine_configs/qwen3/prefill.yaml