diff --git a/benchmarks/router/run_engines.sh b/benchmarks/router/run_engines.sh index d03fbe26de..e186e6bd31 100755 --- a/benchmarks/router/run_engines.sh +++ b/benchmarks/router/run_engines.sh @@ -7,7 +7,7 @@ export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} NUM_WORKERS=8 MODEL_PATH="deepseek-ai/DeepSeek-R1-Distill-Llama-8B" -RECIPE_PATH="$DYNAMO_HOME/recipes/deepseek-r1-distill-llama-8b/trtllm" +ENGINE_CONFIG_PATH="$DYNAMO_HOME/examples/backends/trtllm/engine_configs/deepseek-r1-distill-llama-8b" TENSOR_PARALLEL_SIZE=1 DATA_PARALLEL_SIZE=1 USE_MOCKERS=false @@ -86,13 +86,13 @@ if [ ${#EXTRA_ARGS[@]} -eq 0 ]; then ) elif [ "$USE_TRTLLM" = true ]; then # Default args for TensorRT-LLM engine using predefined YAML configs - # Config files located at: $RECIPE_PATH/{agg,decode,prefill}.yaml + # Config files located at: $ENGINE_CONFIG_PATH/{agg,decode,prefill}.yaml if [ "$MODE" = "prefill" ]; then - ENGINE_CONFIG="$RECIPE_PATH/prefill.yaml" + ENGINE_CONFIG="$ENGINE_CONFIG_PATH/prefill.yaml" elif [ "$MODE" = "decode" ]; then - ENGINE_CONFIG="$RECIPE_PATH/decode.yaml" + ENGINE_CONFIG="$ENGINE_CONFIG_PATH/decode.yaml" else - ENGINE_CONFIG="$RECIPE_PATH/agg.yaml" + ENGINE_CONFIG="$ENGINE_CONFIG_PATH/agg.yaml" fi EXTRA_ARGS=( diff --git a/docs/backends/trtllm/README.md b/docs/backends/trtllm/README.md index 517fadd3f4..56c83c69b3 100644 --- a/docs/backends/trtllm/README.md +++ b/docs/backends/trtllm/README.md @@ -158,7 +158,7 @@ cd $DYNAMO_HOME/examples/backends/trtllm ```bash cd $DYNAMO_HOME/examples/backends/trtllm -export AGG_ENGINE_ARGS=./recipes/deepseek-r1/trtllm/agg/mtp/mtp_agg.yaml +export AGG_ENGINE_ARGS=./engine_configs/deepseek-r1/agg/mtp/mtp_agg.yaml export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4" # nvidia/DeepSeek-R1-FP4 is a large model export MODEL_PATH="nvidia/DeepSeek-R1-FP4" diff --git a/docs/backends/trtllm/gemma3_sliding_window_attention.md b/docs/backends/trtllm/gemma3_sliding_window_attention.md index 9898e25f8d..aeccf070c4 100644 --- a/docs/backends/trtllm/gemma3_sliding_window_attention.md +++ b/docs/backends/trtllm/gemma3_sliding_window_attention.md @@ -30,7 +30,7 @@ VSWA is a mechanism in which a model’s layers alternate between multiple slidi cd $DYNAMO_HOME/examples/backends/trtllm export MODEL_PATH=google/gemma-3-1b-it export SERVED_MODEL_NAME=$MODEL_PATH -export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml +export AGG_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_agg.yaml ./launch/agg.sh ``` @@ -39,7 +39,7 @@ export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml cd $DYNAMO_HOME/examples/backends/trtllm export MODEL_PATH=google/gemma-3-1b-it export SERVED_MODEL_NAME=$MODEL_PATH -export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml +export AGG_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_agg.yaml ./launch/agg_router.sh ``` @@ -48,8 +48,8 @@ export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml cd $DYNAMO_HOME/examples/backends/trtllm export MODEL_PATH=google/gemma-3-1b-it export SERVED_MODEL_NAME=$MODEL_PATH -export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_prefill.yaml -export DECODE_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_decode.yaml +export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_prefill.yaml +export DECODE_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_decode.yaml ./launch/disagg.sh ``` @@ -58,7 +58,7 @@ export 
DECODE_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_decode.yaml cd $DYNAMO_HOME/examples/backends/trtllm export MODEL_PATH=google/gemma-3-1b-it export SERVED_MODEL_NAME=$MODEL_PATH -export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_prefill.yaml -export DECODE_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_decode.yaml +export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_prefill.yaml +export DECODE_ENGINE_ARGS=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gemma3/vswa_decode.yaml ./launch/disagg_router.sh ``` diff --git a/docs/backends/trtllm/gpt-oss.md b/docs/backends/trtllm/gpt-oss.md index 6f7cec195d..124f170f8e 100644 --- a/docs/backends/trtllm/gpt-oss.md +++ b/docs/backends/trtllm/gpt-oss.md @@ -90,14 +90,14 @@ The deployment uses configuration files and command-line arguments to control be #### Configuration Files -**Prefill Configuration (`recipes/gpt-oss-120b/trtllm/disagg/prefill.yaml`)**: +**Prefill Configuration (`examples/backends/trtllm/engine_configs/gpt-oss-120b/prefill.yaml`)**: - `enable_attention_dp: false` - Attention data parallelism disabled for prefill - `enable_chunked_prefill: true` - Enables efficient chunked prefill processing - `moe_config.backend: CUTLASS` - Uses optimized CUTLASS kernels for MoE layers - `cache_transceiver_config.backend: ucx` - Uses UCX for efficient KV cache transfer - `cuda_graph_config.max_batch_size: 32` - Maximum batch size for CUDA graphs -**Decode Configuration (`recipes/gpt-oss-120b/trtllm/disagg/decode.yaml`)**: +**Decode Configuration (`examples/backends/trtllm/engine_configs/gpt-oss-120b/decode.yaml`)**: - `enable_attention_dp: true` - Attention data parallelism enabled for decode - `disable_overlap_scheduler: false` - Enables overlapping for decode efficiency - `moe_config.backend: CUTLASS` - Uses optimized CUTLASS kernels for MoE layers @@ -145,7 +145,7 @@ python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 & CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.trtllm \ --model-path /model \ --served-model-name openai/gpt-oss-120b \ - --extra-engine-args recipes/gpt-oss-120b/trtllm/disagg/prefill.yaml \ + --extra-engine-args examples/backends/trtllm/engine_configs/gpt-oss-120b/prefill.yaml \ --dyn-reasoning-parser gpt_oss \ --dyn-tool-call-parser harmony \ --disaggregation-mode prefill \ @@ -161,7 +161,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.trtllm \ CUDA_VISIBLE_DEVICES=4,5,6,7 python3 -m dynamo.trtllm \ --model-path /model \ --served-model-name openai/gpt-oss-120b \ - --extra-engine-args recipes/gpt-oss-120b/trtllm/disagg/decode.yaml \ + --extra-engine-args examples/backends/trtllm/engine_configs/gpt-oss-120b/decode.yaml \ --dyn-reasoning-parser gpt_oss \ --dyn-tool-call-parser harmony \ --disaggregation-mode decode \ diff --git a/docs/backends/trtllm/llama4_plus_eagle.md b/docs/backends/trtllm/llama4_plus_eagle.md index 2be8ba6509..2cd59a2e5b 100644 --- a/docs/backends/trtllm/llama4_plus_eagle.md +++ b/docs/backends/trtllm/llama4_plus_eagle.md @@ -28,7 +28,7 @@ This guide demonstrates how to deploy Llama 4 Maverick Instruct with Eagle Specu - The other node runs the prefill worker. ## Notes -* Make sure the (`eagle3_one_model: true`) is set in the LLM API config inside the `recipes/llama4/trtllm/eagle` folder. +* Make sure `eagle3_one_model: true` is set in the LLM API config inside the `examples/backends/trtllm/engine_configs/llama4/eagle` folder.
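A quick way to sanity-check that flag before launching: a minimal sketch, assuming `DYNAMO_HOME` points at the repo root and the post-rename layout from this PR.

```bash
# Hypothetical verification step (not part of the original guide): confirm
# that every eagle engine config enables eagle3_one_model before deploying.
grep -rn "eagle3_one_model" "$DYNAMO_HOME/examples/backends/trtllm/engine_configs/llama4/eagle/"
```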
## Setup @@ -52,7 +52,7 @@ See [this](./multinode/multinode-examples.md#setup) section from multinode guide ## Aggregated Serving ```bash export NUM_NODES=1 -export ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/eagle/eagle_agg.yml" +export ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_agg.yml" ./multinode/srun_aggregated.sh ``` @@ -60,9 +60,9 @@ export ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/eagle/eagle_agg.yml" ```bash export NUM_PREFILL_NODES=1 -export PREFILL_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/eagle/eagle_prefill.yaml" +export PREFILL_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yml" export NUM_DECODE_NODES=1 -export DECODE_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/eagle/eagle_decode.yaml" +export DECODE_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yml" ./multinode/srun_disaggregated.sh ``` diff --git a/docs/backends/trtllm/multimodal_support.md b/docs/backends/trtllm/multimodal_support.md index 1bc80d7132..7f90874be7 100644 --- a/docs/backends/trtllm/multimodal_support.md +++ b/docs/backends/trtllm/multimodal_support.md @@ -27,7 +27,7 @@ Here are quick steps to launch Llama-4 Maverick BF16 in aggregated mode ```bash cd $DYNAMO_HOME -export AGG_ENGINE_ARGS=./recipes/llama4/trtllm/multimodal/agg.yaml +export AGG_ENGINE_ARGS=./examples/backends/trtllm/engine_configs/llama4/multimodal/agg.yaml export SERVED_MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct" export MODEL_PATH="meta-llama/Llama-4-Maverick-17B-128E-Instruct" ./launch/agg.sh @@ -79,8 +79,8 @@ cd $DYNAMO_HOME export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-VL-7B-Instruct"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen2-VL-7B-Instruct"} -export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"recipes/qwen2-vl-7b-instruct/trtllm/prefill.yaml"} -export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"recipes/qwen2-vl-7b-instruct/trtllm/decode.yaml"} +export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/prefill.yaml"} +export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/decode.yaml"} export MODALITY=${MODALITY:-"multimodal"} ./launch/disagg.sh diff --git a/docs/backends/trtllm/multinode/multinode-examples.md b/docs/backends/trtllm/multinode/multinode-examples.md index c7b18594bc..a37ead4c2d 100644 --- a/docs/backends/trtllm/multinode/multinode-examples.md +++ b/docs/backends/trtllm/multinode/multinode-examples.md @@ -17,6 +17,8 @@ limitations under the License. # Example: Multi-node TRTLLM Workers with Dynamo on Slurm +> **Note:** The scripts referenced in this example (such as `srun_aggregated.sh` and `srun_disaggregated.sh`) can be found in [`examples/basics/multinode/trtllm/`](https://github.com/ai-dynamo/dynamo/tree/main/examples/basics/multinode/trtllm/). + To run a single Dynamo+TRTLLM Worker that spans multiple nodes (ex: TP16), the set of nodes need to be launched together in the same MPI world, such as via `mpirun` or `srun`. 
This is true regardless of whether the worker is @@ -106,8 +108,8 @@ export IMAGE="" # For example, assuming your cluster had a `/lustre` directory on the host, you # could add that as a mount like so: # -# export MOUNTS="${PWD}/../:/mnt,/lustre:/lustre" -export MOUNTS="${PWD}/../:/mnt" +# export MOUNTS="${PWD}/../../../../:/mnt,/lustre:/lustre" +export MOUNTS="${PWD}/../../../../:/mnt" # NOTE: In general, Deepseek R1 is very large, so it is recommended to # pre-download the model weights and save them in some shared location, @@ -136,7 +138,7 @@ follow these steps below to launch an **aggregated** deployment across 4 nodes: ```bash # Default set in srun_aggregated.sh, but can customize here. -# export ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/agg/wide_ep/wide_ep_agg.yaml" +# export ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/deepseek-r1/agg/wide_ep/wide_ep_agg.yaml" # Customize NUM_NODES to match the desired parallelism in ENGINE_CONFIG # The product of NUM_NODES*NUM_GPUS_PER_NODE should match the number of @@ -165,8 +167,8 @@ deployment across 8 nodes: ```bash # Defaults set in srun_disaggregated.sh, but can customize here. -# export PREFILL_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_prefill.yaml" -# export DECODE_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_decode.yaml" +# export PREFILL_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_prefill.yaml" +# export DECODE_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_decode.yaml" # Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG # Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG diff --git a/docs/backends/trtllm/multinode/multinode-multimodal-example.md b/docs/backends/trtllm/multinode/multinode-multimodal-example.md index e6ba318364..7295c5dfac 100644 --- a/docs/backends/trtllm/multinode/multinode-multimodal-example.md +++ b/docs/backends/trtllm/multinode/multinode-multimodal-example.md @@ -17,6 +17,8 @@ limitations under the License. # Example: Multi-node TRTLLM Workers with Dynamo on Slurm for multimodal models +> **Note:** The scripts referenced in this example (such as `srun_aggregated.sh` and `srun_disaggregated.sh`) can be found in [`examples/basics/multinode/trtllm/`](https://github.com/ai-dynamo/dynamo/tree/main/examples/basics/multinode/trtllm/). + > [!IMPORTANT] > There are some known issues in tensorrt_llm==1.1.0rc5 version for multinode multimodal support. It is important to rebuild the dynamo container with a specific version of tensorrt_llm commit to use multimodal feature. > @@ -34,7 +36,7 @@ limitations under the License. > > Before running the deployment, you must update the engine configuration files to change `backend: DEFAULT` to `backend: default` (lowercase). 
Run the following command: > ```bash -> sed -i 's/backend: DEFAULT/backend: default/g' /mnt/recipes/llama4/trtllm/multimodal/prefill.yaml /mnt/recipes/llama4/trtllm/multimodal/decode.yaml +> sed -i 's/backend: DEFAULT/backend: default/g' /mnt/examples/backends/trtllm/engine_configs/llama4/multimodal/prefill.yaml /mnt/examples/backends/trtllm/engine_configs/llama4/multimodal/decode.yaml > ``` @@ -71,8 +73,8 @@ export IMAGE="" # For example, assuming your cluster had a `/lustre` directory on the host, you # could add that as a mount like so: # -# export MOUNTS="${PWD}/../:/mnt,/lustre:/lustre" -export MOUNTS="${PWD}/../:/mnt" +# export MOUNTS="${PWD}/../../../../:/mnt,/lustre:/lustre" +export MOUNTS="${PWD}/../../../../:/mnt" # Can point to local FS as well # export MODEL_PATH="/location/to/model" @@ -100,8 +102,8 @@ deployment across 4 nodes: ```bash # Defaults set in srun_disaggregated.sh, but can customize here. -# export PREFILL_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/multimodal/prefill.yaml" -# export DECODE_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/multimodal/decode.yaml" +# export PREFILL_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/multimodal/prefill.yaml" +# export DECODE_ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/llama4/multimodal/decode.yaml" # Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG # Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG @@ -123,7 +125,7 @@ deployment across 4 nodes: ## Understanding the Output -1. The `srun_disaggregated.sh` launches three srun jobs instead of two. One for frontend, one for prefill worker, and one for decode worker. +1. The `srun_disaggregated.sh` launches three srun jobs instead of two. One for frontend, one for prefill worker, and one for decode worker. 2.
The OpenAI frontend will listen for and dynamically discover workers as they register themselves with Dynamo's distributed runtime: diff --git a/docs/kubernetes/README.md b/docs/kubernetes/README.md index 2a20637bac..59e8df4541 100644 --- a/docs/kubernetes/README.md +++ b/docs/kubernetes/README.md @@ -203,7 +203,7 @@ args: - python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B - --extra-engine-args /workspace/recipes/deepseek-r1-distill-llama-8b/agg.yaml + --extra-engine-args /workspace/examples/backends/trtllm/engine_configs/deepseek-r1-distill-llama-8b/agg.yaml ``` Key customization points include: diff --git a/examples/backends/trtllm/deploy/agg-with-config.yaml b/examples/backends/trtllm/deploy/agg-with-config.yaml index d18d1b0fb2..839df47cf0 100644 --- a/examples/backends/trtllm/deploy/agg-with-config.yaml +++ b/examples/backends/trtllm/deploy/agg-with-config.yaml @@ -67,4 +67,4 @@ spec: - --served-model-name - Qwen/Qwen3-0.6B - --extra-engine-args - - ./recipes/qwen3/trtllm/agg.yaml + - ./examples/backends/trtllm/engine_configs/qwen3/agg.yaml diff --git a/examples/backends/trtllm/deploy/agg.yaml b/examples/backends/trtllm/deploy/agg.yaml index e85d5287f8..fb9f31f0bf 100644 --- a/examples/backends/trtllm/deploy/agg.yaml +++ b/examples/backends/trtllm/deploy/agg.yaml @@ -36,4 +36,4 @@ spec: - --served-model-name - Qwen/Qwen3-0.6B - --extra-engine-args - - ./recipes/qwen3/trtllm/agg.yaml + - ./examples/backends/trtllm/engine_configs/qwen3/agg.yaml diff --git a/examples/backends/trtllm/deploy/agg_router.yaml b/examples/backends/trtllm/deploy/agg_router.yaml index a500c84f7c..819301b748 100644 --- a/examples/backends/trtllm/deploy/agg_router.yaml +++ b/examples/backends/trtllm/deploy/agg_router.yaml @@ -39,5 +39,5 @@ spec: - --served-model-name - Qwen/Qwen3-0.6B - --extra-engine-args - - ./recipes/qwen3/trtllm/agg.yaml + - ./examples/backends/trtllm/engine_configs/qwen3/agg.yaml - --publish-events-and-metrics diff --git a/examples/backends/trtllm/deploy/disagg.yaml b/examples/backends/trtllm/deploy/disagg.yaml index 49b42146cf..20925b154e 100644 --- a/examples/backends/trtllm/deploy/disagg.yaml +++ b/examples/backends/trtllm/deploy/disagg.yaml @@ -37,7 +37,7 @@ spec: - --served-model-name - Qwen/Qwen3-0.6B - --extra-engine-args - - ./recipes/qwen3/trtllm/prefill.yaml + - ./examples/backends/trtllm/engine_configs/qwen3/prefill.yaml - --disaggregation-mode - prefill TRTLLMDecodeWorker: @@ -63,6 +63,6 @@ spec: - --served-model-name - Qwen/Qwen3-0.6B - --extra-engine-args - - ./recipes/qwen3/trtllm/decode.yaml + - ./examples/backends/trtllm/engine_configs/qwen3/decode.yaml - --disaggregation-mode - decode diff --git a/examples/backends/trtllm/deploy/disagg_planner.yaml b/examples/backends/trtllm/deploy/disagg_planner.yaml index d4d7c9041e..e2f4396839 100644 --- a/examples/backends/trtllm/deploy/disagg_planner.yaml +++ b/examples/backends/trtllm/deploy/disagg_planner.yaml @@ -101,7 +101,7 @@ spec: - --served-model-name - Qwen/Qwen3-0.6B - --extra-engine-args - - ./recipes/qwen3/trtllm/decode.yaml + - ./examples/backends/trtllm/engine_configs/qwen3/decode.yaml - --disaggregation-mode - decode TRTLLMPrefillWorker: @@ -128,6 +128,6 @@ spec: - --served-model-name - Qwen/Qwen3-0.6B - --extra-engine-args - - ./recipes/qwen3/trtllm/prefill.yaml + - ./examples/backends/trtllm/engine_configs/qwen3/prefill.yaml - --disaggregation-mode - prefill diff --git a/examples/backends/trtllm/deploy/disagg_router.yaml 
b/examples/backends/trtllm/deploy/disagg_router.yaml index 1afe7488ca..ea66224ae3 100644 --- a/examples/backends/trtllm/deploy/disagg_router.yaml +++ b/examples/backends/trtllm/deploy/disagg_router.yaml @@ -39,7 +39,7 @@ spec: - --served-model-name - Qwen/Qwen3-0.6B - --extra-engine-args - - ./recipes/qwen3/trtllm/prefill.yaml + - ./examples/backends/trtllm/engine_configs/qwen3/prefill.yaml - --disaggregation-mode - prefill - --publish-events-and-metrics @@ -65,6 +65,6 @@ spec: - --served-model-name - Qwen/Qwen3-0.6B - --extra-engine-args - - ./recipes/qwen3/trtllm/decode.yaml + - ./examples/backends/trtllm/engine_configs/qwen3/decode.yaml - --disaggregation-mode - decode diff --git a/examples/backends/trtllm/engine_configs/README.md b/examples/backends/trtllm/engine_configs/README.md new file mode 100644 index 0000000000..03790575a4 --- /dev/null +++ b/examples/backends/trtllm/engine_configs/README.md @@ -0,0 +1,43 @@ +# TensorRT-LLM Engine Configurations + +This directory contains TensorRT-LLM engine configuration files for various model deployments. + + +## Usage + +These YAML configuration files can be passed to TensorRT-LLM workers using the `--extra-engine-args` parameter: + +```bash +python3 -m dynamo.trtllm \ + --extra-engine-args "${ENGINE_ARGS}" \ + ... +``` + +Where `ENGINE_ARGS` points to one of the configuration files in this directory. + +## Configuration Types + +### Aggregated (agg/) +Single-node configurations that combine prefill and decode operations: +- **simple/**: Basic aggregated setup +- **mtp/**: Multi-token prediction configurations +- **wide_ep/**: Wide expert parallel configurations + +### Disaggregated (disagg/) +Separate configurations for prefill and decode workers: +- **simple/**: Basic prefill/decode split +- **mtp/**: Multi-token prediction with separate prefill/decode +- **wide_ep/**: Wide expert parallel with expert load balancer + +## Key Configuration Parameters + +- **Parallelism**: `tensor_parallel_size`, `moe_expert_parallel_size`, `pipeline_parallel_size` +- **Memory**: `kv_cache_config.free_gpu_memory_fraction`, `kv_cache_config.dtype` +- **Batching**: `max_batch_size`, `max_num_tokens`, `max_seq_len` +- **Scheduling**: `disable_overlap_scheduler`, `cuda_graph_config` + +## Notes + +- For disaggregated setups, ensure `kv_cache_config.dtype` matches between prefill and decode configs +- WideEP configurations require an expert load balancer config (`eplb.yaml`) +- Adjust `free_gpu_memory_fraction` based on your workload and attention DP settings diff --git a/recipes/deepseek-r1-distill-llama-8b/trtllm/agg.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1-distill-llama-8b/agg.yaml similarity index 100% rename from recipes/deepseek-r1-distill-llama-8b/trtllm/agg.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1-distill-llama-8b/agg.yaml diff --git a/recipes/deepseek-r1-distill-llama-8b/trtllm/decode.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1-distill-llama-8b/decode.yaml similarity index 100% rename from recipes/deepseek-r1-distill-llama-8b/trtllm/decode.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1-distill-llama-8b/decode.yaml diff --git a/recipes/deepseek-r1-distill-llama-8b/trtllm/prefill.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1-distill-llama-8b/prefill.yaml similarity index 100% rename from recipes/deepseek-r1-distill-llama-8b/trtllm/prefill.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1-distill-llama-8b/prefill.yaml diff --git 
a/recipes/deepseek-r1/trtllm/agg/mtp/mtp_agg.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1/agg/mtp/mtp_agg.yaml similarity index 100% rename from recipes/deepseek-r1/trtllm/agg/mtp/mtp_agg.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1/agg/mtp/mtp_agg.yaml diff --git a/recipes/deepseek-r1/trtllm/agg/simple/agg.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1/agg/simple/agg.yaml similarity index 100% rename from recipes/deepseek-r1/trtllm/agg/simple/agg.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1/agg/simple/agg.yaml diff --git a/recipes/deepseek-r1/trtllm/agg/wide_ep/dep16_agg.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1/agg/wide_ep/dep16_agg.yaml similarity index 100% rename from recipes/deepseek-r1/trtllm/agg/wide_ep/dep16_agg.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1/agg/wide_ep/dep16_agg.yaml diff --git a/recipes/deepseek-r1/trtllm/agg/wide_ep/eplb.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1/agg/wide_ep/eplb.yaml similarity index 100% rename from recipes/deepseek-r1/trtllm/agg/wide_ep/eplb.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1/agg/wide_ep/eplb.yaml diff --git a/recipes/deepseek-r1/trtllm/agg/wide_ep/wide_ep_agg.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1/agg/wide_ep/wide_ep_agg.yaml similarity index 89% rename from recipes/deepseek-r1/trtllm/agg/wide_ep/wide_ep_agg.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1/agg/wide_ep/wide_ep_agg.yaml index bcd6ae87e0..0d645c412a 100644 --- a/recipes/deepseek-r1/trtllm/agg/wide_ep/wide_ep_agg.yaml +++ b/examples/backends/trtllm/engine_configs/deepseek-r1/agg/wide_ep/wide_ep_agg.yaml @@ -11,7 +11,7 @@ moe_config: # moe_max_num_tokens = max_batch_size * moe_expert_parallel_size # 4096 = 256 * 16 # moe_max_num_tokens: 4096 - load_balancer: /mnt/recipes/deepseek-r1/trtllm/agg/wide_ep/eplb.yaml + load_balancer: /mnt/examples/backends/trtllm/engine_configs/deepseek-r1/agg/wide_ep/eplb.yaml tensor_parallel_size: 16 moe_expert_parallel_size: 16 diff --git a/recipes/deepseek-r1/trtllm/disagg/mtp/mtp_decode.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/mtp/mtp_decode.yaml similarity index 100% rename from recipes/deepseek-r1/trtllm/disagg/mtp/mtp_decode.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1/disagg/mtp/mtp_decode.yaml diff --git a/recipes/deepseek-r1/trtllm/disagg/mtp/mtp_prefill.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/mtp/mtp_prefill.yaml similarity index 100% rename from recipes/deepseek-r1/trtllm/disagg/mtp/mtp_prefill.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1/disagg/mtp/mtp_prefill.yaml diff --git a/recipes/deepseek-r1/trtllm/disagg/simple/decode.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/simple/decode.yaml similarity index 100% rename from recipes/deepseek-r1/trtllm/disagg/simple/decode.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1/disagg/simple/decode.yaml diff --git a/recipes/deepseek-r1/trtllm/disagg/simple/prefill.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/simple/prefill.yaml similarity index 100% rename from recipes/deepseek-r1/trtllm/disagg/simple/prefill.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1/disagg/simple/prefill.yaml diff --git a/recipes/deepseek-r1/trtllm/disagg/wide_ep/eplb.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/eplb.yaml similarity 
index 100% rename from recipes/deepseek-r1/trtllm/disagg/wide_ep/eplb.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/eplb.yaml diff --git a/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_decode.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_decode.yaml similarity index 95% rename from recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_decode.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_decode.yaml index 39d392afe9..8b5814c023 100644 --- a/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_decode.yaml +++ b/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_decode.yaml @@ -17,7 +17,7 @@ backend: pytorch # WideEP related settings moe_config: backend: WIDEEP - load_balancer: /mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/eplb.yaml + load_balancer: /mnt/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/eplb.yaml # TP/EP/PP/DP tensor_parallel_size: 16 diff --git a/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_prefill.yaml b/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_prefill.yaml similarity index 93% rename from recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_prefill.yaml rename to examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_prefill.yaml index 56e862a855..9f707f0129 100644 --- a/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_prefill.yaml +++ b/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_prefill.yaml @@ -17,7 +17,7 @@ backend: pytorch # WideEP related settings moe_config: backend: WIDEEP - load_balancer: /mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/eplb.yaml + load_balancer: /mnt/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/eplb.yaml # TP/EP/PP/DP tensor_parallel_size: 16 diff --git a/recipes/gemma3/trtllm/vswa_agg.yaml b/examples/backends/trtllm/engine_configs/gemma3/vswa_agg.yaml similarity index 100% rename from recipes/gemma3/trtllm/vswa_agg.yaml rename to examples/backends/trtllm/engine_configs/gemma3/vswa_agg.yaml diff --git a/recipes/gemma3/trtllm/vswa_decode.yaml b/examples/backends/trtllm/engine_configs/gemma3/vswa_decode.yaml similarity index 100% rename from recipes/gemma3/trtllm/vswa_decode.yaml rename to examples/backends/trtllm/engine_configs/gemma3/vswa_decode.yaml diff --git a/recipes/gemma3/trtllm/vswa_prefill.yaml b/examples/backends/trtllm/engine_configs/gemma3/vswa_prefill.yaml similarity index 100% rename from recipes/gemma3/trtllm/vswa_prefill.yaml rename to examples/backends/trtllm/engine_configs/gemma3/vswa_prefill.yaml diff --git a/examples/backends/trtllm/engine_configs/gpt-oss-120b/agg.yaml b/examples/backends/trtllm/engine_configs/gpt-oss-120b/agg.yaml new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/backends/trtllm/engine_configs/gpt-oss-120b/decode.yaml b/examples/backends/trtllm/engine_configs/gpt-oss-120b/decode.yaml new file mode 100644 index 0000000000..1ba9844545 --- /dev/null +++ b/examples/backends/trtllm/engine_configs/gpt-oss-120b/decode.yaml @@ -0,0 +1,25 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +enable_attention_dp: true +disable_overlap_scheduler: false +moe_config: + backend: CUTLASS +cuda_graph_config: + enable_padding: true +cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 65536 +print_iter_log: false +stream_interval: 10 diff --git a/examples/backends/trtllm/engine_configs/gpt-oss-120b/prefill.yaml b/examples/backends/trtllm/engine_configs/gpt-oss-120b/prefill.yaml new file mode 100644 index 0000000000..87bab09fd4 --- /dev/null +++ b/examples/backends/trtllm/engine_configs/gpt-oss-120b/prefill.yaml @@ -0,0 +1,27 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +enable_attention_dp: false +disable_overlap_scheduler: true +moe_config: + backend: CUTLASS +enable_chunked_prefill: true +cuda_graph_config: + max_batch_size: 32 + enable_padding: true +cache_transceiver_config: + backend: UCX + max_tokens_in_buffer: 65536 +print_iter_log: false +stream_interval: 10 diff --git a/recipes/llama4/trtllm/eagle/eagle_agg.yml b/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_agg.yml similarity index 100% rename from recipes/llama4/trtllm/eagle/eagle_agg.yml rename to examples/backends/trtllm/engine_configs/llama4/eagle/eagle_agg.yml diff --git a/recipes/llama4/trtllm/eagle/eagle_decode.yaml b/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yaml similarity index 100% rename from recipes/llama4/trtllm/eagle/eagle_decode.yaml rename to examples/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yaml diff --git a/recipes/llama4/trtllm/eagle/eagle_prefill.yaml b/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yaml similarity index 100% rename from recipes/llama4/trtllm/eagle/eagle_prefill.yaml rename to examples/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yaml diff --git a/recipes/llama4/trtllm/multimodal/agg.yaml b/examples/backends/trtllm/engine_configs/llama4/multimodal/agg.yaml similarity index 100% rename from recipes/llama4/trtllm/multimodal/agg.yaml rename to examples/backends/trtllm/engine_configs/llama4/multimodal/agg.yaml diff --git a/recipes/llama4/trtllm/multimodal/decode.yaml b/examples/backends/trtllm/engine_configs/llama4/multimodal/decode.yaml similarity index 100% rename from recipes/llama4/trtllm/multimodal/decode.yaml rename to examples/backends/trtllm/engine_configs/llama4/multimodal/decode.yaml diff --git a/recipes/llama4/trtllm/multimodal/prefill.yaml b/examples/backends/trtllm/engine_configs/llama4/multimodal/prefill.yaml similarity index 100% 
rename from recipes/llama4/trtllm/multimodal/prefill.yaml rename to examples/backends/trtllm/engine_configs/llama4/multimodal/prefill.yaml diff --git a/recipes/qwen2-vl-7b-instruct/trtllm/agg.yaml b/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/agg.yaml similarity index 100% rename from recipes/qwen2-vl-7b-instruct/trtllm/agg.yaml rename to examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/agg.yaml diff --git a/recipes/qwen2-vl-7b-instruct/trtllm/decode.yaml b/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/decode.yaml similarity index 100% rename from recipes/qwen2-vl-7b-instruct/trtllm/decode.yaml rename to examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/decode.yaml diff --git a/recipes/qwen2-vl-7b-instruct/trtllm/encode.yaml b/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/encode.yaml similarity index 100% rename from recipes/qwen2-vl-7b-instruct/trtllm/encode.yaml rename to examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/encode.yaml diff --git a/recipes/qwen2-vl-7b-instruct/trtllm/prefill.yaml b/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/prefill.yaml similarity index 100% rename from recipes/qwen2-vl-7b-instruct/trtllm/prefill.yaml rename to examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/prefill.yaml diff --git a/recipes/qwen3/trtllm/agg.yaml b/examples/backends/trtllm/engine_configs/qwen3/agg.yaml similarity index 100% rename from recipes/qwen3/trtllm/agg.yaml rename to examples/backends/trtllm/engine_configs/qwen3/agg.yaml diff --git a/recipes/qwen3/trtllm/decode.yaml b/examples/backends/trtllm/engine_configs/qwen3/decode.yaml similarity index 100% rename from recipes/qwen3/trtllm/decode.yaml rename to examples/backends/trtllm/engine_configs/qwen3/decode.yaml diff --git a/recipes/qwen3/trtllm/prefill.yaml b/examples/backends/trtllm/engine_configs/qwen3/prefill.yaml similarity index 100% rename from recipes/qwen3/trtllm/prefill.yaml rename to examples/backends/trtllm/engine_configs/qwen3/prefill.yaml diff --git a/examples/backends/trtllm/launch/agg.sh b/examples/backends/trtllm/launch/agg.sh index f141531d7d..56a842eb52 100755 --- a/examples/backends/trtllm/launch/agg.sh +++ b/examples/backends/trtllm/launch/agg.sh @@ -6,7 +6,7 @@ export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"} -export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/agg.yaml"} +export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/agg.yaml"} export MODALITY=${MODALITY:-"text"} # If you want to use multimodal, set MODALITY to "multimodal" #export MODALITY=${MODALITY:-"multimodal"} diff --git a/examples/backends/trtllm/launch/agg_metrics.sh b/examples/backends/trtllm/launch/agg_metrics.sh index 2a69e41ea3..61671b4960 100755 --- a/examples/backends/trtllm/launch/agg_metrics.sh +++ b/examples/backends/trtllm/launch/agg_metrics.sh @@ -6,7 +6,7 @@ export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"} -export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/agg.yaml"} +export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/agg.yaml"} export MODALITY=${MODALITY:-"text"} # Setup cleanup trap diff --git a/examples/backends/trtllm/launch/agg_router.sh 
b/examples/backends/trtllm/launch/agg_router.sh index bb69762735..1b0568535a 100755 --- a/examples/backends/trtllm/launch/agg_router.sh +++ b/examples/backends/trtllm/launch/agg_router.sh @@ -6,7 +6,7 @@ export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"} -export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/agg.yaml"} +export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/agg.yaml"} # Setup cleanup trap cleanup() { diff --git a/examples/backends/trtllm/launch/disagg.sh b/examples/backends/trtllm/launch/disagg.sh index 695fd94779..7f75ee908e 100755 --- a/examples/backends/trtllm/launch/disagg.sh +++ b/examples/backends/trtllm/launch/disagg.sh @@ -6,8 +6,8 @@ export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"} -export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/prefill.yaml"} -export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/decode.yaml"} +export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/prefill.yaml"} +export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/decode.yaml"} export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"} export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"} export MODALITY=${MODALITY:-"text"} diff --git a/examples/backends/trtllm/launch/disagg_router.sh b/examples/backends/trtllm/launch/disagg_router.sh index b8f8bbf5cb..1b005a44ae 100755 --- a/examples/backends/trtllm/launch/disagg_router.sh +++ b/examples/backends/trtllm/launch/disagg_router.sh @@ -6,8 +6,8 @@ export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"} -export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/prefill.yaml"} -export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/decode.yaml"} +export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/prefill.yaml"} +export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3/decode.yaml"} export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"} export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"} diff --git a/examples/backends/trtllm/launch/disagg_same_gpu.sh b/examples/backends/trtllm/launch/disagg_same_gpu.sh index 1036329e8d..348c1ce61e 100755 --- a/examples/backends/trtllm/launch/disagg_same_gpu.sh +++ b/examples/backends/trtllm/launch/disagg_same_gpu.sh @@ -32,8 +32,8 @@ echo "GPU memory check passed: ${FREE_GPU_GB}GB available (required: ${REQUIRED_ export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"} -export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/tests/serve/configs/trtllm/prefill.yaml"} -export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/tests/serve/configs/trtllm/decode.yaml"} +export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/tests/serve/trtllm/engine_configs/qwen3/prefill.yaml"} +export 
DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/tests/serve/trtllm/engine_configs/qwen3/decode.yaml"} export CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES:-"0"} export MODALITY=${MODALITY:-"text"} diff --git a/examples/backends/trtllm/launch/epd_disagg.sh b/examples/backends/trtllm/launch/epd_disagg.sh index c52d57ce0c..a2843d28ba 100755 --- a/examples/backends/trtllm/launch/epd_disagg.sh +++ b/examples/backends/trtllm/launch/epd_disagg.sh @@ -6,9 +6,9 @@ export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-VL-7B-Instruct"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen2-VL-7B-Instruct"} -export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen2-vl-7b-instruct/trtllm/prefill.yaml"} -export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen2-vl-7b-instruct/trtllm/decode.yaml"} -export ENCODE_ENGINE_ARGS=${ENCODE_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen2-vl-7b-instruct/trtllm/encode.yaml"} +export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/prefill.yaml"} +export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/decode.yaml"} +export ENCODE_ENGINE_ARGS=${ENCODE_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/encode.yaml"} export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"} export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"} export ENCODE_CUDA_VISIBLE_DEVICES=${ENCODE_CUDA_VISIBLE_DEVICES:-"2"} diff --git a/examples/backends/trtllm/launch/gpt_oss_disagg.sh b/examples/backends/trtllm/launch/gpt_oss_disagg.sh index 9ada0c76ef..bbe560b231 100755 --- a/examples/backends/trtllm/launch/gpt_oss_disagg.sh +++ b/examples/backends/trtllm/launch/gpt_oss_disagg.sh @@ -6,8 +6,8 @@ export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export MODEL_PATH=${MODEL_PATH:-"/model"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"openai/gpt-oss-120b"} -export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/gpt-oss-120b/trtllm/disagg/prefill.yaml"} -export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/gpt-oss-120b/trtllm/disagg/decode.yaml"} +export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gpt-oss-120b/prefill.yaml"} +export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/gpt-oss-120b/decode.yaml"} set -e trap 'echo Cleaning up...; kill 0' EXIT diff --git a/examples/basics/multinode/trtllm/README.md b/examples/basics/multinode/trtllm/README.md new file mode 100644 index 0000000000..a36abc833d --- /dev/null +++ b/examples/basics/multinode/trtllm/README.md @@ -0,0 +1,20 @@ + + +# Example: Multi-node TRTLLM Workers with Dynamo on Slurm + +See [here](/docs/backends/trtllm/multinode) for how to set up this example.
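For orientation, a minimal sketch of how the renamed config paths plug into these scripts (the `IMAGE` value is a placeholder; `MOUNTS`, `NUM_NODES`, and `ENGINE_CONFIG` are the defaults shown elsewhere in this PR):

```bash
# Sketch: drive the aggregated multi-node example from
# examples/basics/multinode/trtllm/ with the post-rename config path.
export IMAGE="<dynamo-trtllm-image>"      # assumption: your built container image
export MOUNTS="${PWD}/../../../../:/mnt"  # mounts the repo root at /mnt
export NUM_NODES=4
export ENGINE_CONFIG="/mnt/examples/backends/trtllm/engine_configs/deepseek-r1/agg/wide_ep/wide_ep_agg.yaml"
./srun_aggregated.sh
```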
diff --git a/examples/basics/multinode/trtllm/srun_aggregated.sh b/examples/basics/multinode/trtllm/srun_aggregated.sh index 654c8ef691..c3d9792b45 100755 --- a/examples/basics/multinode/trtllm/srun_aggregated.sh +++ b/examples/basics/multinode/trtllm/srun_aggregated.sh @@ -18,7 +18,7 @@ MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}" NUM_NODES=${NUM_NODES:-4} NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4} -export ENGINE_CONFIG="${ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/agg/wide_ep/wide_ep_agg.yaml}" +export ENGINE_CONFIG="${ENGINE_CONFIG:-/mnt/examples/backends/trtllm/engine_configs/deepseek-r1/agg/wide_ep/wide_ep_agg.yaml}" # Automate settings of certain variables for convenience, but you are free # to manually set these for more control as well. diff --git a/examples/basics/multinode/trtllm/srun_disaggregated.sh b/examples/basics/multinode/trtllm/srun_disaggregated.sh index 13f66b14b0..219108529a 100755 --- a/examples/basics/multinode/trtllm/srun_disaggregated.sh +++ b/examples/basics/multinode/trtllm/srun_disaggregated.sh @@ -17,11 +17,11 @@ NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4} NUM_PREFILL_NODES=${NUM_PREFILL_NODES:-4} NUM_PREFILL_WORKERS=${NUM_PREFILL_WORKERS:-1} -PREFILL_ENGINE_CONFIG="${PREFILL_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_prefill.yaml}" +PREFILL_ENGINE_CONFIG="${PREFILL_ENGINE_CONFIG:-/mnt/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_prefill.yaml}" NUM_DECODE_NODES=${NUM_DECODE_NODES:-4} NUM_DECODE_WORKERS=${NUM_DECODE_WORKERS:-1} -DECODE_ENGINE_CONFIG="${DECODE_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/disagg/wide_ep/wide_ep_decode.yaml}" +DECODE_ENGINE_CONFIG="${DECODE_ENGINE_CONFIG:-/mnt/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_decode.yaml}" # Automate settings of certain variables for convenience, but you are free # to manually set these for more control as well. diff --git a/recipes/README.md b/recipes/README.md index b8c6981dc4..f7ef60decf 100644 --- a/recipes/README.md +++ b/recipes/README.md @@ -1,297 +1,282 @@ -# Dynamo Model Serving Recipes +# Dynamo Production-Ready Recipes -This repository contains production-ready recipes for deploying large language models using the Dynamo platform. Each recipe includes deployment configurations, performance benchmarking, and model caching setup. +Production-tested Kubernetes deployment recipes for LLM inference using NVIDIA Dynamo. -## Contents -- [Available Models](#available-models) -- [Quick Start](#quick-start) -- [Prerequisites](#prerequisites) -- Deployment Methods - - [Option 1: Automated Deployment](#option-1-automated-deployment) - - [Option 2: Manual Deployment](#option-2-manual-deployment) +> **Prerequisites:** This guide assumes you have already installed the Dynamo Kubernetes Platform. +> If not, follow the **[Kubernetes Deployment Guide](../docs/kubernetes/README.md)** first. 
+## Available Recipes -## Available Models - -| Model Family | Framework | Deployment Mode | GPU Requirements | Status | Benchmark |GAIE-integration | -|-----------------|-----------|---------------------|------------------|--------|-----------|------------------| -| llama-3-70b | vllm | agg | 4x H100/H200 | ✅ | ✅ |✅ | -| llama-3-70b | vllm | disagg (1 node) | 8x H100/H200 | ✅ | ✅ | 🚧 | -| llama-3-70b | vllm | disagg (multi-node) | 16x H100/H200 | ✅ | ✅ |🚧 | -| deepseek-r1 | sglang | disagg (1 node, wide-ep) | 8x H200 | ✅ | 🚧 |🚧 | -| deepseek-r1 | sglang | disagg (multi-node, wide-ep) | 16x H200 | ✅ | 🚧 |🚧 | -| gpt-oss-120b | trtllm | agg | 4x GB200 | ✅ | ✅ |🚧 | +| Model | Framework | Mode | GPUs | Deployment | Benchmark Recipe | Notes | GAIE integration | +|-------|-----------|------|------|------------|------------------|-------|------------------| +| **[Llama-3-70B](llama-3-70b/vllm/agg/)** | vLLM | Aggregated | 4x H100/H200 | ✅ | ✅ | FP8 dynamic quantization | ✅ | +| **[Llama-3-70B](llama-3-70b/vllm/disagg-single-node/)** | vLLM | Disagg (Single-Node) | 8x H100/H200 | ✅ | ✅ | Prefill + Decode separation | ❌ | +| **[Llama-3-70B](llama-3-70b/vllm/disagg-multi-node/)** | vLLM | Disagg (Multi-Node) | 16x H100/H200 | ✅ | ✅ | 2 nodes, 8 GPUs each | ❌ | +| **[Qwen3-32B-FP8](qwen3-32b-fp8/trtllm/agg/)** | TensorRT-LLM | Aggregated | 4x GPU | ✅ | ✅ | FP8 quantization | ❌ | +| **[Qwen3-32B-FP8](qwen3-32b-fp8/trtllm/disagg/)** | TensorRT-LLM | Disaggregated | 8x GPU | ✅ | ✅ | Prefill + Decode separation | ❌ | +| **[GPT-OSS-120B](gpt-oss-120b/trtllm/agg/)** | TensorRT-LLM | Aggregated | 4x GB200 | ✅ | ✅ | Blackwell only, WideEP | ❌ | +| **[GPT-OSS-120B](gpt-oss-120b/trtllm/disagg/)** | TensorRT-LLM | Disaggregated | TBD | ❌ | ❌ | Engine configs only, no K8s manifest | ❌ | +| **[DeepSeek-R1](deepseek-r1/sglang/disagg-8gpu/)** | SGLang | Disagg WideEP | 8x H200 | ✅ | ❌ | Benchmark recipe pending | ❌ | +| **[DeepSeek-R1](deepseek-r1/sglang/disagg-16gpu/)** | SGLang | Disagg WideEP | 16x H200 | ✅ | ❌ | Benchmark recipe pending | ❌ | +| **[DeepSeek-R1](deepseek-r1/trtllm/disagg/wide_ep/gb200/)** | TensorRT-LLM | Disagg WideEP (GB200) | 32+4 GB200 | ✅ | ✅ | Multi-node: 8 decode + 1 prefill nodes | ❌ | **Legend:** -- ✅ Functional -- 🚧 Under development +- **Deployment**: ✅ = Complete `deploy.yaml` manifest available | ❌ = Missing or incomplete +- **Benchmark Recipe**: ✅ = Includes `perf.yaml` for running AIPerf benchmarks | ❌ = No benchmark recipe provided + +## Recipe Structure +Each complete recipe follows this standard structure: -**Recipe Directory Structure:** -Recipes are organized into a directory structure that follows the pattern: -```text / +``` <model>/ +├── README.md (optional) # Model-specific deployment notes ├── model-cache/ -│ ├── model-cache.yaml # PVC for model cache -│ └── model-download.yaml # Job for model download -├── / -│ └── / -│ ├── deploy.yaml # DynamoGraphDeployment CRD and optional configmap for custom configuration -│ └── perf.yaml (optional) # Performance benchmark -└── README.md (optional) # Model documentation +│ ├── model-cache.yaml # PersistentVolumeClaim for model storage +│ └── model-download.yaml # Job to download model from HuggingFace +└── <framework>/ # vllm, sglang, or trtllm + └── <mode>/ # agg, disagg, disagg-single-node, etc. + ├── deploy.yaml # Complete DynamoGraphDeployment manifest + └── perf.yaml (optional) # AIPerf benchmark job ``` ## Quick Start -Follow the instructions in the [Prerequisites](#prerequisites) section to set up your environment.
- -Choose your preferred deployment method: using the `run.sh` script or manual deployment steps. - +### Prerequisites -## Prerequisites +**1. Dynamo Platform Installed** -### 1. Environment Setup +The recipes require the Dynamo Kubernetes Platform to be installed. Follow the installation guide: -Create a Kubernetes namespace and set environment variable: +- **[Kubernetes Deployment Guide](../docs/kubernetes/README.md)** - Quickstart (~10 minutes) +- **[Detailed Installation Guide](../docs/kubernetes/installation_guide.md)** - Advanced options -```bash -export NAMESPACE=your-namespace -kubectl create namespace ${NAMESPACE} -``` +**2. GPU Cluster Requirements** -### 2. Deploy Dynamo Platform - -Install the Dynamo Cloud Platform following the [Quickstart Guide](../docs/kubernetes/README.md). - -### 3. GPU Cluster - -Ensure your Kubernetes cluster has: -- GPU nodes with appropriate GPU types (see model requirements above) +Ensure your cluster has: +- GPU nodes matching recipe requirements (see table above) - GPU operator installed -- Sufficient GPU memory and compute resources - -### 4. Container Registry Access +- Appropriate GPU drivers and container runtime -Ensure access to NVIDIA container registry for runtime images: -- `nvcr.io/nvidia/ai-dynamo/vllm-runtime:x.y.z` -- `nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:x.y.z` -- `nvcr.io/nvidia/ai-dynamo/sglang-runtime:x.y.z` +**3. HuggingFace Access** -### 5. HuggingFace Access and Kubernetes Secret Creation - -Set up a kubernetes secret with the HuggingFace token for model download: +Configure authentication to download models: ```bash -# Update the token in the secret file -vim hf_hub_secret/hf_hub_secret.yaml +export NAMESPACE=your-namespace +kubectl create namespace ${NAMESPACE} -# Apply the secret -kubectl apply -f hf_hub_secret/hf_hub_secret.yaml -n ${NAMESPACE} +# Create HuggingFace token secret +kubectl create secret generic hf-token-secret \ + --from-literal=HF_TOKEN="your-token-here" \ + -n ${NAMESPACE} ``` -6. Configure Storage Class +**4. Storage Configuration** + +Update the `storageClassName` in `<model>/model-cache/model-cache.yaml` to match your cluster: ```bash -# Check available storage classes +# Find your storage class name kubectl get storageclass -``` - -Replace "your-storage-class-name" with your actual storage class in the file: `/model-cache/model-cache.yaml` -```yaml -# In /model-cache/model-cache.yaml -spec: - storageClassName: "your-actual-storage-class" # Replace this +# Edit the model-cache.yaml file and update: +# spec: +# storageClassName: "your-actual-storage-class" ``` -## Option 1: Automated Deployment - -Use the `run.sh` script for fully automated deployment: - -**Note:** The script automatically: -- Create model cache PVC and downloads the model -- Deploy the model service -- Runs performance benchmark if a `perf.yaml` file is present in the deployment directory - -#### Script Usage ```bash -./run.sh [OPTIONS] --model --framework --deployment -``` **Step 1: Download Model** ```bash # Update storageClassName in model-cache.yaml first!
+kubectl apply -f <model>/model-cache/ -n ${NAMESPACE} **Required Options:** -- `--model `: Model name matching the directory name in the recipes directory (e.g., llama-3-70b, gpt-oss-120b, deepseek-r1) -`--framework `: Backend framework (`vllm`, `trtllm`, `sglang`) -`--deployment `: Deployment mode (e.g., agg, disagg, disagg-single-node, disagg-multi-node) +# Wait for download to complete (may take 10-60 minutes depending on model size) +kubectl wait --for=condition=Complete job/model-download -n ${NAMESPACE} --timeout=6000s -**Optional Options:** -- `--namespace `: Kubernetes namespace (default: dynamo) -`--dry-run`: Show commands without executing them -`-h, --help`: Show help message +# Monitor progress +kubectl logs -f job/model-download -n ${NAMESPACE} +``` -**Environment Variables:** -- `NAMESPACE`: Kubernetes namespace (default: dynamo) **Step 2: Deploy Service** -#### Example Usage ```bash -# Set up environment -export NAMESPACE=your-namespace -kubectl create namespace ${NAMESPACE} -# Configure HuggingFace token -kubectl apply -f hf_hub_secret/hf_hub_secret.yaml -n ${NAMESPACE} - -# use run.sh script to deploy the model -# Deploy Llama-3-70B with vLLM (aggregated mode) -./run.sh --model llama-3-70b --framework vllm --deployment agg +kubectl apply -f <model>/<framework>/<mode>/deploy.yaml -n ${NAMESPACE} -# Deploy GPT-OSS-120B with TensorRT-LLM -./run.sh --model gpt-oss-120b --framework trtllm --deployment agg - -# Deploy DeepSeek-R1 with SGLang (disaggregated mode) -./run.sh --model deepseek-r1 --framework sglang --deployment disagg +# Check deployment status +kubectl get dynamographdeployment -n ${NAMESPACE} -# Deploy with custom namespace -./run.sh --namespace my-namespace --model llama-3-70b --framework vllm --deployment agg +# Check pod status +kubectl get pods -n ${NAMESPACE} -# Dry run to see what would be executed -./run.sh --dry-run --model llama-3-70b --framework vllm --deployment agg +# Wait for pods to be ready +kubectl wait --for=condition=ready pod -l nvidia.com/dynamo-graph-deployment-name=<deployment-name> -n ${NAMESPACE} --timeout=600s ``` ## If deploying with Gateway API Inference extension GAIE **Step 3: Test Deployment** 1. Follow [Deploy Inference Gateway Section 2](../deploy/inference-gateway/README.md#2-deploy-inference-gateway) to install GAIE. ```bash # Port forward to access the service locally kubectl port-forward svc/<deployment-name>-frontend 8000:8000 -n ${NAMESPACE} 2. Apply manifests by running a script. +# In another terminal, test the endpoint +curl http://localhost:8000/v1/models ```bash -# Match the block size to the cli value in your deployment file deploy.yaml: - "python3 -m dynamo.vllm ... --block-size 128" -export DYNAMO_KV_BLOCK_SIZE=128 -export EPP_IMAGE=nvcr.io/you/epp:tag -# Add --gaie argument to the script i.e.: -./run.sh --model llama-3-70b --framework vllm --gaie agg --deployment agg +# Send a test request +curl http://localhost:8000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "<served-model-name>", + "messages": [{"role": "user", "content": "Hello!"}], + "max_tokens": 50 + }' ``` -The script will perform gateway checks and apply the manifests. -## Option 2: Manual Deployment - -For step-by-step manual deployment follow these steps : **Step 4: Run Benchmark (Optional)** ```bash +# Only if perf.yaml exists in the recipe directory +kubectl apply -f <model>/<framework>/<mode>/perf.yaml -n ${NAMESPACE} +# Monitor benchmark progress +kubectl logs -f job/<perf-job-name> -n ${NAMESPACE} +# View results after completion +kubectl logs job/<perf-job-name> -n ${NAMESPACE} | tail -50 ``` **Inference Gateway (GAIE) Integration (Optional)** For Llama-3-70B with vLLM (Aggregated), an example of integration with the Inference Gateway is provided. Follow [Deploy Inference Gateway Section 2](../deploy/inference-gateway/README.md#2-deploy-inference-gateway) to install GAIE, then apply the manifests: ```bash +export DEPLOY_PATH=llama-3-70b/vllm/agg/ +#DEPLOY_PATH=<model>/<framework>/<mode>/ +kubectl apply -R -f "$DEPLOY_PATH/gaie/k8s-manifests" -n "$NAMESPACE" ``` ## Example Deployments ### Llama-3-70B with vLLM (Aggregated) ```bash export NAMESPACE=dynamo-demo kubectl create namespace ${NAMESPACE} # Create HF token secret kubectl create secret generic hf-token-secret \ --from-literal=HF_TOKEN="your-token" \ -n ${NAMESPACE} # Deploy kubectl apply -f llama-3-70b/model-cache/ -n ${NAMESPACE} kubectl wait --for=condition=Complete job/model-download -n ${NAMESPACE} --timeout=6000s kubectl apply -f llama-3-70b/vllm/agg/deploy.yaml -n ${NAMESPACE} # Test kubectl port-forward svc/llama3-70b-agg-frontend 8000:8000 -n ${NAMESPACE} ``` ### DeepSeek-R1 on GB200 (Multi-node) See [deepseek-r1/trtllm/disagg/wide_ep/gb200/deploy.yaml](deepseek-r1/trtllm/disagg/wide_ep/gb200/deploy.yaml) for the complete multi-node WideEP configuration. 
Set up environment (see Prerequisites section) -export NAMESPACE=your-namespace -kubectl create namespace ${NAMESPACE} -kubectl apply -f hf_hub_secret/hf_hub_secret.yaml -n ${NAMESPACE} +# Only if perf.yaml exists in the recipe directory +kubectl apply -f ///perf.yaml -n ${NAMESPACE} -# 1. Download model (see Model Download section) -kubectl apply -n $NAMESPACE -f /model-cache/ +# Monitor benchmark progress +kubectl logs -f job/ -n ${NAMESPACE} -# 2. Deploy model (see Deployment section) -kubectl apply -n $NAMESPACE -f ///deploy.yaml - -# 3. Run benchmarks (optional, if perf.yaml exists) -kubectl apply -n $NAMESPACE -f ///perf.yaml +# View results after completion +kubectl logs job/ -n ${NAMESPACE} | tail -50 ``` -### Step 1: Download Model +** Inference Gateway (GAIE) Integration (Optional)** -```bash -# Start the download job -kubectl apply -n $NAMESPACE -f /model-cache +For Llama-3-70B with vLLM (Aggregated), an example of integration with the Inference Gateway is provided. -# Verify job creation -kubectl get jobs -n $NAMESPACE | grep model-download -``` - -Monitor and wait for the model download to complete: +Follow to Follow [Deploy Inference Gateway Section 2](../deploy/inference-gateway/README.md#2-deploy-inference-gateway) to install GAIE. Then apply manifests. ```bash +export DEPLOY_PATH=llama-3-70b/vllm/agg/ +#DEPLOY_PATH=/// +kubectl apply -R -f "$DEPLOY_PATH/gaie/k8s-manifests" -n "$NAMESPACE" -# Wait for job completion (timeout after 100 minutes) -kubectl wait --for=condition=Complete job/model-download -n $NAMESPACE --timeout=6000s +## Example Deployments -# Check job status -kubectl get job model-download -n $NAMESPACE - -# View download logs -kubectl logs job/model-download -n $NAMESPACE -``` - -### Step 2: Deploy Model Service +### Llama-3-70B with vLLM (Aggregated) ```bash -# Navigate to the specific deployment configuration -cd /// +export NAMESPACE=dynamo-demo +kubectl create namespace ${NAMESPACE} -# Deploy the model service -kubectl apply -n $NAMESPACE -f deploy.yaml +# Create HF token secret +kubectl create secret generic hf-token-secret \ + --from-literal=HF_TOKEN="your-token" \ + -n ${NAMESPACE} -# Verify deployment creation -kubectl get deployments -n $NAMESPACE +# Deploy +kubectl apply -f llama-3-70b/model-cache/ -n ${NAMESPACE} +kubectl wait --for=condition=Complete job/model-download -n ${NAMESPACE} --timeout=6000s +kubectl apply -f llama-3-70b/vllm/agg/deploy.yaml -n ${NAMESPACE} + +# Test +kubectl port-forward svc/llama3-70b-agg-frontend 8000:8000 -n ${NAMESPACE} ``` -#### Wait for Deployment Ready +### DeepSeek-R1 on GB200 (Multi-node) -```bash -# Get deployment name from the deploy.yaml file -DEPLOYMENT_NAME=$(grep "name:" deploy.yaml | head -1 | awk '{print $2}') +See [deepseek-r1/trtllm/disagg/wide_ep/gb200/deploy.yaml](deepseek-r1/trtllm/disagg/wide_ep/gb200/deploy.yaml) for the complete multi-node WideEP configuration. 
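A sketch of the same flow for that recipe, assuming the namespace and HF-token setup from the Llama example above (the `deepseek-r1/model-cache/` path follows the standard recipe structure rather than being spelled out here):

```bash
# Hypothetical walkthrough for the GB200 WideEP recipe.
kubectl apply -f deepseek-r1/model-cache/ -n ${NAMESPACE}
kubectl wait --for=condition=Complete job/model-download -n ${NAMESPACE} --timeout=6000s
kubectl apply -f deepseek-r1/trtllm/disagg/wide_ep/gb200/deploy.yaml -n ${NAMESPACE}
kubectl get pods -n ${NAMESPACE}
```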
-# Wait for deployment to be ready (timeout after 10 minutes)
-kubectl wait --for=condition=available deployment/$DEPLOYMENT_NAME -n $NAMESPACE --timeout=1200s
+## Customization

-# Check deployment status
-kubectl get deployment $DEPLOYMENT_NAME -n $NAMESPACE
+Each `deploy.yaml` contains:
+- **ConfigMap**: Engine-specific configuration (embedded in the manifest)
+- **DynamoGraphDeployment**: Kubernetes resource definitions
+- **Resource limits**: GPU count, memory, CPU requests/limits
+- **Image references**: Container images with version tags

-# Check pod status
-kubectl get pods -n $NAMESPACE -l app=$DEPLOYMENT_NAME
-```
+### Key Customization Points

-#### Verify Model Service
+**Model Configuration:**
+```yaml
+# In deploy.yaml under worker args:
+args:
+  - python3 -m dynamo.vllm --model <model> --served-model-name <served-model-name>
+```

-```bash
-# Check if service is running
-kubectl get services -n $NAMESPACE
+**GPU Resources:**
+```yaml
+resources:
+  limits:
+    gpu: "4"  # Adjust based on your requirements
+  requests:
+    gpu: "4"
+```

-# Test model endpoint (port-forward to test locally)
-kubectl port-forward service/${DEPLOYMENT_NAME}-frontend 8000:8000 -n $NAMESPACE
+**Scaling:**
+```yaml
+services:
+  VllmDecodeWorker:
+    replicas: 2  # Scale to multiple workers
+```

-# Test the model API (in another terminal)
-curl http://localhost:8000/v1/models
+**Router Mode:**
+```yaml
+# In Frontend args:
+args:
+  - python3 -m dynamo.frontend --router-mode kv --http-port 8000
+# Options: round-robin, kv (KV-aware routing)
+```

-# Stop port-forward when done
-pkill -f "kubectl port-forward"
+**Container Images:**
+```yaml
+image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:x.y.z
+# Update version tag as needed
+```
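+
+These settings can be edited directly in `deploy.yaml` and re-applied, or patched in a script. A minimal sketch using `yq` (v4), assuming the worker service is named `VllmDecodeWorker` and services are nested under `spec.services` (hypothetical field paths; verify against your `deploy.yaml` and the API Reference):
+
+```bash
+# deploy.yaml may contain several YAML documents; patch only the
+# DynamoGraphDeployment document, then re-apply the manifest
+yq -i '(select(.kind == "DynamoGraphDeployment") | .spec.services.VllmDecodeWorker.replicas) = 2' deploy.yaml
+kubectl apply -f deploy.yaml -n ${NAMESPACE}
+```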
-### Step 3: Performance Benchmarking (Optional)
+## Troubleshooting

-Run performance benchmarks to evaluate model performance. Note that benchmarking is only available for models that include a `perf.yaml` file (optional):
+### Common Issues

-#### Launch Benchmark Job
+**Pods stuck in Pending:**
+- Check GPU availability: `kubectl describe node <node-name>`
+- Verify storage class exists: `kubectl get storageclass`
+- Check resource requests vs. available resources

-```bash
-# From the deployment directory
-kubectl apply -n $NAMESPACE -f perf.yaml
+**Model download fails:**
+- Verify HuggingFace token is correct
+- Check network connectivity from cluster
+- Review job logs: `kubectl logs job/model-download -n ${NAMESPACE}`

-# Verify benchmark job creation
-kubectl get jobs -n $NAMESPACE
-```
+**Workers fail to start:**
+- Check GPU compatibility (driver version, CUDA version)
+- Verify image pull secrets if using private registries
+- Review pod logs: `kubectl logs <pod-name> -n ${NAMESPACE}`

-#### Monitor Benchmark Progress
+**For more troubleshooting:**
+- [Kubernetes Deployment Guide](../docs/kubernetes/README.md#troubleshooting)
+- [Observability Documentation](../docs/kubernetes/observability/)

-```bash
-# Get benchmark job name
-PERF_JOB_NAME=$(grep "name:" perf.yaml | head -1 | awk '{print $2}')
+## Related Documentation

-# Monitor benchmark logs in real-time
-kubectl logs -f job/$PERF_JOB_NAME -n $NAMESPACE
+- **[Kubernetes Deployment Guide](../docs/kubernetes/README.md)** - Platform installation and concepts
+- **[API Reference](../docs/kubernetes/api_reference.md)** - DynamoGraphDeployment CRD specification
+- **[vLLM Backend Guide](../docs/backends/vllm/README.md)** - vLLM-specific features
+- **[SGLang Backend Guide](../docs/backends/sglang/README.md)** - SGLang-specific features
+- **[TensorRT-LLM Backend Guide](../docs/backends/trtllm/README.md)** - TensorRT-LLM features
+- **[Observability](../docs/kubernetes/observability/)** - Monitoring and logging
+- **[Benchmarking Guide](../docs/benchmarks/benchmarking.md)** - Performance testing

-# Wait for benchmark completion (timeout after 100 minutes)
-kubectl wait --for=condition=Complete job/$PERF_JOB_NAME -n $NAMESPACE --timeout=6000s
-```
+## Contributing

-#### View Benchmark Results
+We welcome contributions of new recipes! See [CONTRIBUTING.md](CONTRIBUTING.md) for:
+- Recipe submission guidelines
+- Required components checklist
+- Testing and validation requirements
+- Documentation standards

-```bash
-# Check final benchmark results
-kubectl logs job/$PERF_JOB_NAME -n $NAMESPACE | tail -50
-```
\ No newline at end of file
+
+### Recipe Quality Standards
+
+A production-ready recipe must include:
+- ✅ Complete `deploy.yaml` with DynamoGraphDeployment
+- ✅ Model cache PVC and download job
+- ✅ Benchmark recipe (`perf.yaml`) for performance testing
+- ✅ Verification on target hardware
+- ✅ Documentation of GPU requirements
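+
+Manifests can be sanity-checked before submission without creating any resources. A minimal sketch (a server-side dry run validates against the cluster's API server, so the Dynamo CRDs must already be installed):
+
+```bash
+# Validate the manifests without applying them
+kubectl apply --dry-run=server -f deploy.yaml -n ${NAMESPACE}
+kubectl apply --dry-run=server -f perf.yaml -n ${NAMESPACE}
+```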
diff --git a/recipes/gpt-oss-120b/trtllm/disagg/README.md b/recipes/gpt-oss-120b/trtllm/disagg/README.md
new file mode 100644
index 0000000000..10390c9587
--- /dev/null
+++ b/recipes/gpt-oss-120b/trtllm/disagg/README.md
@@ -0,0 +1,25 @@
+# GPT-OSS-120B Disaggregated Mode
+
+> **⚠️ INCOMPLETE**: This directory contains only engine configuration files and is not ready for Kubernetes deployment.
+
+## Current Status
+
+This directory contains TensorRT-LLM engine configurations for disaggregated serving:
+- `decode.yaml` - Decode worker engine configuration
+- `prefill.yaml` - Prefill worker engine configuration
+
+## Missing Components
+
+To complete this recipe, the following files are needed:
+- `deploy.yaml` - Kubernetes DynamoGraphDeployment manifest
+- `perf.yaml` - Performance benchmarking job (optional)
+
+## Alternative
+
+For a production-ready GPT-OSS-120B deployment, use the **aggregated mode**:
+- [gpt-oss-120b/trtllm/agg/](../agg/) - Complete with `deploy.yaml` and `perf.yaml`
+
+## Contributing
+
+If you'd like to complete this recipe, see [recipes/CONTRIBUTING.md](../../../CONTRIBUTING.md) for guidelines on creating proper Kubernetes deployment manifests.
+
diff --git a/recipes/run.sh b/recipes/run.sh
deleted file mode 100755
index 185f30a467..0000000000
--- a/recipes/run.sh
+++ /dev/null
@@ -1,261 +0,0 @@
-#!/usr/bin/env bash
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-set -euo pipefail
-IFS=$'\n\t'
-
-RECIPES_DIR="$( cd "$( dirname "$0" )" && pwd )"
-# Default values
-NAMESPACE="${NAMESPACE:-dynamo}"
-DEPLOY_TYPE=""
-GAIE="${GAIE:-false}"
-DEPLOYMENT=""
-MODEL=""
-FRAMEWORK=""
-DRY_RUN=""
-
-# Frameworks - following container/build.sh pattern
-declare -A FRAMEWORKS=(["VLLM"]=1 ["TRTLLM"]=2 ["SGLANG"]=3)
-DEFAULT_FRAMEWORK=VLLM
-
-# Function to show usage
-usage() {
-    echo "Usage: $0 [OPTIONS] --model <model> --framework <framework> --deployment <deployment>"
-    echo ""
-    echo "Required Options:"
-    echo "  --model <model>            Model name (e.g., llama-3-70b)"
-    echo "  --framework <framework>    Framework one of ${!FRAMEWORKS[*]} (default: ${DEFAULT_FRAMEWORK})"
-    echo "  --deployment <deployment>  Deployment type (e.g., agg, disagg etc, please refer to the README.md for available deployment types)"
-    echo ""
-    echo "Optional:"
-    echo "  --namespace <namespace>    Kubernetes namespace (default: dynamo)"
-    echo "  --dry-run                  Print commands without executing them"
-    echo "  --gaie[=true|false]        Enable GAIE integration subfolder (applies GAIE manifests skips benchmark) (default: ${GAIE})"
-    echo "  -h, --help                 Show this help message"
-    echo ""
-    echo "Environment Variables:"
-    echo "  NAMESPACE                  Kubernetes namespace (default: dynamo)"
-    echo ""
-    echo "Examples:"
-    echo "  $0 --model llama-3-70b --framework vllm --deployment agg"
-    echo "  $0 --model llama-3-70b --framework trtllm --deployment disagg-single-node"
-    echo "  $0 --namespace my-ns --model llama-3-70b --framework vllm --deployment disagg-multi-node"
-    exit 1
-}
-
-missing_requirement() {
-    echo "ERROR: $1 requires an argument."
-    usage
-}
-
-error() {
-    printf '%s %s\n' "$1" "$2" >&2
-    exit 1
-}
-
-while [[ $# -gt 0 ]]; do
-    case $1 in
-    --dry-run)
-        DRY_RUN="echo"
-        shift
-        ;;
-    --model)
-        if [ "$2" ]; then
-            MODEL=$2
-            shift 2
-        else
-            missing_requirement "$1"
-        fi
-        ;;
-    --framework)
-        if [ "$2" ]; then
-            FRAMEWORK=$2
-            shift 2
-        else
-            missing_requirement "$1"
-        fi
-        ;;
-    --deployment)
-        if [ "$2" ]; then
-            DEPLOYMENT=$2
-            shift 2
-        else
-            missing_requirement "$1"
-        fi
-        ;;
-    --namespace)
-        if [ "$2" ]; then
-            NAMESPACE=$2
-            shift 2
-        else
-            missing_requirement "$1"
-        fi
-        ;;
-    --gaie)
-        GAIE=true
-        shift
-        ;;
-    --gaie=false)
-        GAIE=false
-        shift
-        ;;
-    --gaie=*)
-        GAIE="${1#*=}"
-        case "${GAIE,,}" in
-        true|false) GAIE="${GAIE,,}";;
-        *) echo "ERROR: --gaie must be true or false"; exit 1;;
-        esac
-        shift
-        ;;
-    -h|--help)
-        usage
-        ;;
-    -*)
-        error 'ERROR: Unknown option: ' "$1"
-        ;;
-    *)
-        error "ERROR: Unknown argument: " "$1"
-        ;;
-    esac
-done
-
-if [ -z "$FRAMEWORK" ]; then
-    FRAMEWORK=$DEFAULT_FRAMEWORK
-fi
-
-if [ -n "$FRAMEWORK" ]; then
-    FRAMEWORK=${FRAMEWORK^^}
-    if [[ -z "${FRAMEWORKS[$FRAMEWORK]}" ]]; then
-        error 'ERROR: Unknown framework: ' "$FRAMEWORK"
-    fi
-fi
-
-# Validate required arguments
-if [[ -z "$MODEL" ]] || [[ -z "$DEPLOYMENT" ]]; then
-    if [[ -z "$MODEL" ]]; then
-        echo "ERROR: --model argument is required"
-    fi
-    if [[ -z "$DEPLOYMENT" ]]; then
-        echo "ERROR: --deployment argument is required"
-    fi
-    echo ""
-    usage
-fi
-
-# Construct paths based on new structure: recipes/<model>/<framework>/<deployment>/
-MODEL_DIR="$RECIPES_DIR/$MODEL"
-FRAMEWORK_DIR="$MODEL_DIR/${FRAMEWORK,,}"
-DEPLOY_PATH="$FRAMEWORK_DIR/$DEPLOYMENT"
-INTEGRATION="$([[ "${GAIE,,}" == "true" ]] && echo gaie || echo "")"
-
-# Check if model directory exists
-if [[ ! -d "$MODEL_DIR" ]]; then
-    echo "Error: Model directory '$MODEL' does not exist in $RECIPES_DIR"
-    echo "Available models:"
-    ls -1 "$RECIPES_DIR" | grep -v "\.sh$\|\.md$\|model-cache$" | sed 's/^/  /'
-    exit 1
-fi
-
-# Check if framework directory exists
-if [[ ! -d "$FRAMEWORK_DIR" ]]; then
-    echo "Error: Framework directory '${FRAMEWORK,,}' does not exist in $MODEL_DIR"
-    echo "Available frameworks for $MODEL:"
-    ls -1 "$MODEL_DIR" | grep -v "\.sh$\|\.md$" | sed 's/^/  /'
-    exit 1
-fi
-
-# Check if deployment directory exists
-if [[ ! -d "$DEPLOY_PATH" ]]; then
-    echo "Error: Deployment type '$DEPLOYMENT' does not exist in $FRAMEWORK_DIR"
-    echo "Available deployment types for $MODEL/${FRAMEWORK,,}:"
-    ls -1 "$FRAMEWORK_DIR" | grep -v "\.sh$\|\.md$" | sed 's/^/  /'
-    exit 1
-fi
-
-# Check if deployment files exist
-DEPLOY_FILE="$DEPLOY_PATH/deploy.yaml"
-PERF_FILE="$DEPLOY_PATH/perf.yaml"
-
-if [[ ! -f "$DEPLOY_FILE" ]]; then
-    echo "Error: Deployment file '$DEPLOY_FILE' not found"
-    exit 1
-fi
-
-# Check if perf file exists (optional)
-PERF_AVAILABLE=false
-if [[ -f "$PERF_FILE" ]]; then
-    PERF_AVAILABLE=true
-    echo "Performance benchmark file found: $PERF_FILE"
-else
-    echo "Performance benchmark file not found: $PERF_FILE (skipping benchmarks)"
-fi
-
-# Show deployment information
-echo "======================================"
-echo "Dynamo Recipe Deployment"
-echo "======================================"
-echo "Model: $MODEL"
-echo "Framework: ${FRAMEWORK,,}"
-echo "Deployment Type: $DEPLOYMENT"
-echo "Namespace: $NAMESPACE"
-echo "GAIE integration: $GAIE"
-echo "======================================"
-
-# Handle model downloading
-MODEL_CACHE_DIR="$MODEL_DIR/model-cache"
-echo "Creating PVC for model cache and downloading model..."
-$DRY_RUN kubectl apply -n $NAMESPACE -f $MODEL_CACHE_DIR/model-cache.yaml -$DRY_RUN kubectl apply -n $NAMESPACE -f $MODEL_CACHE_DIR/model-download.yaml - -# Wait for the model download to complete -MODEL_DOWNLOAD_JOB_NAME=$(grep "name:" $MODEL_CACHE_DIR/model-download.yaml | head -1 | awk '{print $2}') -echo "Waiting for job '$MODEL_DOWNLOAD_JOB_NAME' to complete..." -$DRY_RUN kubectl wait --for=condition=Complete job/$MODEL_DOWNLOAD_JOB_NAME -n $NAMESPACE --timeout=6000s - -# Deploy the specified configuration -echo "Deploying $MODEL ${FRAMEWORK,,} $DEPLOYMENT configuration..." -$DRY_RUN kubectl apply -n $NAMESPACE -f $DEPLOY_FILE - -if [[ "$INTEGRATION" == "gaie" ]]; then - # run gaie checks. - SCRIPT_DIR="$(cd -- "$(dirname "${BASH_SOURCE[0]}")" && pwd)" - "${SCRIPT_DIR}/gaie_checks.sh" - $DRY_RUN kubectl apply -R -f "$DEPLOY_PATH/gaie/k8s-manifests" -n "$NAMESPACE" - # For now do not run the benchmark - exit - fi - -# Launch the benchmark job (if available) -if [[ "$PERF_AVAILABLE" == "true" ]]; then - echo "Launching benchmark job..." - $DRY_RUN kubectl apply -n $NAMESPACE -f $PERF_FILE - - # Construct job name from the perf file - JOB_NAME=$(grep "name:" $PERF_FILE | head -1 | awk '{print $2}') - echo "Waiting for job '$JOB_NAME' to complete..." - $DRY_RUN kubectl wait --for=condition=Complete job/$JOB_NAME -n $NAMESPACE --timeout=6000s - - # Print logs from the benchmark job - echo "======================================" - echo "Benchmark completed. Logs:" - echo "======================================" - $DRY_RUN kubectl logs job/$JOB_NAME -n $NAMESPACE -else - echo "======================================" - echo "Deployment completed successfully!" - echo "No performance benchmark available for this configuration." - echo "======================================" -fi \ No newline at end of file diff --git a/tests/serve/configs/trtllm/agg.yaml b/tests/serve/trtllm/engine_configs/qwen3/agg.yaml similarity index 100% rename from tests/serve/configs/trtllm/agg.yaml rename to tests/serve/trtllm/engine_configs/qwen3/agg.yaml diff --git a/tests/serve/configs/trtllm/decode.yaml b/tests/serve/trtllm/engine_configs/qwen3/decode.yaml similarity index 100% rename from tests/serve/configs/trtllm/decode.yaml rename to tests/serve/trtllm/engine_configs/qwen3/decode.yaml diff --git a/tests/serve/configs/trtllm/prefill.yaml b/tests/serve/trtllm/engine_configs/qwen3/prefill.yaml similarity index 100% rename from tests/serve/configs/trtllm/prefill.yaml rename to tests/serve/trtllm/engine_configs/qwen3/prefill.yaml