From 95a7fbeccbf299b3b25fe0f01a99b99568ea020c Mon Sep 17 00:00:00 2001
From: Anant Sharma
Date: Tue, 21 Oct 2025 07:02:44 -0700
Subject: [PATCH 1/8] refactor: move engine configs out of components directory

Signed-off-by: Anant Sharma
---
 .../trtllm/engine_configs/multimodal/agg.yaml | 33 -------------------
 .../engine_configs/multimodal/decode.yaml     | 29 ----------------
 .../engine_configs/multimodal/prefill.yaml    | 31 -----------------
 .../backends/trtllm/launch/gpt_oss_disagg.sh  |  4 +--
 .../trtllm/multinode/srun_aggregated.sh       |  6 ++--
 .../trtllm/multinode/srun_disaggregated.sh    | 10 +++---
 docs/backends/trtllm/README.md                |  2 +-
 .../trtllm/gemma3_sliding_window_attention.md | 12 +++----
 docs/backends/trtllm/gpt-oss.md               |  8 ++---
 docs/backends/trtllm/llama4_plus_eagle.md     |  8 ++---
 docs/backends/trtllm/multimodal_support.md    |  6 ++--
 .../trtllm/multinode/multinode-examples.md    |  6 ++--
 .../multinode/multinode-multimodal-example.md |  6 ++--
 .../scripts}/start_frontend_services.sh       |  0
 .../scripts}/start_trtllm_worker.sh           |  0
 .../deepseek-r1/trtllm}/mtp/mtp_agg.yaml      |  0
 .../deepseek-r1/trtllm}/mtp/mtp_decode.yaml   |  0
 .../deepseek-r1/trtllm}/mtp/mtp_prefill.yaml  |  0
 .../deepseek-r1/trtllm}/simple/agg.yaml       |  0
 .../deepseek-r1/trtllm}/simple/decode.yaml    |  0
 .../deepseek-r1/trtllm}/simple/prefill.yaml   |  0
 .../trtllm}/wide_ep/dep16_agg.yaml            |  0
 .../deepseek-r1/trtllm}/wide_ep/eplb.yaml     |  0
 .../trtllm}/wide_ep/wide_ep_agg.yaml          |  2 +-
 .../trtllm}/wide_ep/wide_ep_decode.yaml       |  2 +-
 .../trtllm}/wide_ep/wide_ep_prefill.yaml      |  2 +-
 .../gemma3/trtllm}/vswa_agg.yaml              |  0
 .../gemma3/trtllm}/vswa_decode.yaml           |  0
 .../gemma3/trtllm}/vswa_prefill.yaml          |  0
 .../gpt-oss-120b/trtllm/disagg}/decode.yaml   |  0
 .../gpt-oss-120b/trtllm/disagg}/prefill.yaml  |  0
 .../llama4/trtllm}/eagle/eagle_agg.yml        |  0
 .../llama4/trtllm}/eagle/eagle_decode.yaml    |  0
 .../llama4/trtllm}/eagle/eagle_prefill.yaml   |  0
 .../llama4/trtllm/multimodal}/decode.yaml     |  0
 .../llama4/trtllm/multimodal}/prefill.yaml    |  0
 36 files changed, 37 insertions(+), 130 deletions(-)
 delete mode 100644 components/backends/trtllm/engine_configs/multimodal/agg.yaml
 delete mode 100644 components/backends/trtllm/engine_configs/multimodal/decode.yaml
 delete mode 100644 components/backends/trtllm/engine_configs/multimodal/prefill.yaml
 rename {components/backends/trtllm/multinode => examples/multimodal/scripts}/start_frontend_services.sh (100%)
 rename {components/backends/trtllm/multinode => examples/multimodal/scripts}/start_trtllm_worker.sh (100%)
 rename {components/backends/trtllm/engine_configs/deepseek_r1 => recipes/deepseek-r1/trtllm}/mtp/mtp_agg.yaml (100%)
 rename {components/backends/trtllm/engine_configs/deepseek_r1 => recipes/deepseek-r1/trtllm}/mtp/mtp_decode.yaml (100%)
 rename {components/backends/trtllm/engine_configs/deepseek_r1 => recipes/deepseek-r1/trtllm}/mtp/mtp_prefill.yaml (100%)
 rename {components/backends/trtllm/engine_configs/deepseek_r1 => recipes/deepseek-r1/trtllm}/simple/agg.yaml (100%)
 rename {components/backends/trtllm/engine_configs/deepseek_r1 => recipes/deepseek-r1/trtllm}/simple/decode.yaml (100%)
 rename {components/backends/trtllm/engine_configs/deepseek_r1 => recipes/deepseek-r1/trtllm}/simple/prefill.yaml (100%)
 rename {components/backends/trtllm/engine_configs/deepseek_r1 => recipes/deepseek-r1/trtllm}/wide_ep/dep16_agg.yaml (100%)
 rename {components/backends/trtllm/engine_configs/deepseek_r1 => recipes/deepseek-r1/trtllm}/wide_ep/eplb.yaml (100%)
 rename {components/backends/trtllm/engine_configs/deepseek_r1 => recipes/deepseek-r1/trtllm}/wide_ep/wide_ep_agg.yaml (92%)
 rename {components/backends/trtllm/engine_configs/deepseek_r1 => recipes/deepseek-r1/trtllm}/wide_ep/wide_ep_decode.yaml (96%)
 rename {components/backends/trtllm/engine_configs/deepseek_r1 => recipes/deepseek-r1/trtllm}/wide_ep/wide_ep_prefill.yaml (95%)
 rename {components/backends/trtllm/engine_configs/gemma3 => recipes/gemma3/trtllm}/vswa_agg.yaml (100%)
 rename {components/backends/trtllm/engine_configs/gemma3 => recipes/gemma3/trtllm}/vswa_decode.yaml (100%)
 rename {components/backends/trtllm/engine_configs/gemma3 => recipes/gemma3/trtllm}/vswa_prefill.yaml (100%)
 rename {components/backends/trtllm/engine_configs/gpt_oss => recipes/gpt-oss-120b/trtllm/disagg}/decode.yaml (100%)
 rename {components/backends/trtllm/engine_configs/gpt_oss => recipes/gpt-oss-120b/trtllm/disagg}/prefill.yaml (100%)
 rename {components/backends/trtllm/engine_configs/llama4 => recipes/llama4/trtllm}/eagle/eagle_agg.yml (100%)
 rename {components/backends/trtllm/engine_configs/llama4 => recipes/llama4/trtllm}/eagle/eagle_decode.yaml (100%)
 rename {components/backends/trtllm/engine_configs/llama4 => recipes/llama4/trtllm}/eagle/eagle_prefill.yaml (100%)
 rename {components/backends/trtllm/engine_configs/multimodal/llama4 => recipes/llama4/trtllm/multimodal}/decode.yaml (100%)
 rename {components/backends/trtllm/engine_configs/multimodal/llama4 => recipes/llama4/trtllm/multimodal}/prefill.yaml (100%)

diff --git a/components/backends/trtllm/engine_configs/multimodal/agg.yaml b/components/backends/trtllm/engine_configs/multimodal/agg.yaml
deleted file mode 100644
index 754f8ce759..0000000000
--- a/components/backends/trtllm/engine_configs/multimodal/agg.yaml
+++ /dev/null
@@ -1,33 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-tensor_parallel_size: 8
-moe_expert_parallel_size: 1
-enable_attention_dp: false
-max_num_tokens: 4096
-max_batch_size: 8
-trust_remote_code: true
-backend: pytorch
-enable_chunked_prefill: true
-
-kv_cache_config:
-  free_gpu_memory_fraction: 0.3
-  enable_block_reuse: false
-
-cache_transceiver_config:
-  backend: DEFAULT
-# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
-# NOTE: overlap_scheduler enabled by default since this commit and changed
-# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
-# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
diff --git a/components/backends/trtllm/engine_configs/multimodal/decode.yaml b/components/backends/trtllm/engine_configs/multimodal/decode.yaml
deleted file mode 100644
index 6dbd676ee4..0000000000
--- a/components/backends/trtllm/engine_configs/multimodal/decode.yaml
+++ /dev/null
@@ -1,29 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-tensor_parallel_size: 1
-moe_expert_parallel_size: 1
-enable_attention_dp: false
-max_num_tokens: 8192
-max_batch_size: 16
-trust_remote_code: true
-backend: pytorch
-enable_chunked_prefill: true
-disable_overlap_scheduler: false
-kv_cache_config:
-  free_gpu_memory_fraction: 0.30
-  enable_block_reuse: false
-
-cache_transceiver_config:
-  backend: DEFAULT
\ No newline at end of file
diff --git a/components/backends/trtllm/engine_configs/multimodal/prefill.yaml b/components/backends/trtllm/engine_configs/multimodal/prefill.yaml
deleted file mode 100644
index 83a65e8bf3..0000000000
--- a/components/backends/trtllm/engine_configs/multimodal/prefill.yaml
+++ /dev/null
@@ -1,31 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-tensor_parallel_size: 1
-moe_expert_parallel_size: 1
-enable_attention_dp: false
-max_num_tokens: 8192
-max_batch_size: 16
-trust_remote_code: true
-backend: pytorch
-enable_chunked_prefill: true
-# Overlap scheduler not currently supported in prefill only workers.
-disable_overlap_scheduler: true
-
-kv_cache_config:
-  free_gpu_memory_fraction: 0.30
-  enable_block_reuse: false
-
-cache_transceiver_config:
-  backend: DEFAULT
\ No newline at end of file
diff --git a/components/backends/trtllm/launch/gpt_oss_disagg.sh b/components/backends/trtllm/launch/gpt_oss_disagg.sh
index db42c01771..606ba2a8a2 100755
--- a/components/backends/trtllm/launch/gpt_oss_disagg.sh
+++ b/components/backends/trtllm/launch/gpt_oss_disagg.sh
@@ -6,8 +6,8 @@
 export MODEL_PATH=${MODEL_PATH:-"/model"}
 export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"openai/gpt-oss-120b"}
 export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"prefill_first"}
-export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/gpt_oss/prefill.yaml"}
-export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/gpt_oss/decode.yaml"}
+export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"recipes/gpt-oss-120b/trtllm/disagg/prefill.yaml"}
+export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"recipes/gpt-oss-120b/trtllm/disagg/decode.yaml"}
 
 set -e
 trap 'echo Cleaning up...; kill 0' EXIT
diff --git a/components/backends/trtllm/multinode/srun_aggregated.sh b/components/backends/trtllm/multinode/srun_aggregated.sh
index ac1187bf00..e9568db9cd 100755
--- a/components/backends/trtllm/multinode/srun_aggregated.sh
+++ b/components/backends/trtllm/multinode/srun_aggregated.sh
@@ -18,7 +18,7 @@ MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}"
 NUM_NODES=${NUM_NODES:-4}
 NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4}
 
-export ENGINE_CONFIG="${ENGINE_CONFIG:-/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml}"
+export ENGINE_CONFIG="${ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_agg.yaml}"
 
 # Automate settings of certain variables for convenience, but you are free
 # to manually set these for more control as well.
@@ -51,7 +51,7 @@ srun \
   --nodelist "${HEAD_NODE}" \
   --nodes 1 \
   --jobid "${SLURM_JOB_ID}" \
-  /mnt/multinode/start_frontend_services.sh &
+  /mnt/examples/multimodal/scripts/start_frontend_services.sh &
 
 # NOTE: Output streamed to stdout for ease of understanding the example, but
 # in practice you would probably set `srun --output ... --error ...` to pipe
@@ -71,4 +71,4 @@ srun \
   --nodes "${NUM_NODES}" \
   --ntasks-per-node "${NUM_GPUS_PER_NODE}" \
   --jobid "${SLURM_JOB_ID}" \
-  /mnt/multinode/start_trtllm_worker.sh &
\ No newline at end of file
+  /mnt/examples/multimodal/scripts/start_trtllm_worker.sh &
\ No newline at end of file
diff --git a/components/backends/trtllm/multinode/srun_disaggregated.sh b/components/backends/trtllm/multinode/srun_disaggregated.sh
index c8d9ac99cb..fddac1ccbe 100755
--- a/components/backends/trtllm/multinode/srun_disaggregated.sh
+++ b/components/backends/trtllm/multinode/srun_disaggregated.sh
@@ -17,11 +17,11 @@ NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4}
 NUM_PREFILL_NODES=${NUM_PREFILL_NODES:-4}
 NUM_PREFILL_WORKERS=${NUM_PREFILL_WORKERS:-1}
-PREFILL_ENGINE_CONFIG="${PREFILL_ENGINE_CONFIG:-/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml}"
+PREFILL_ENGINE_CONFIG="${PREFILL_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_prefill.yaml}"
 
 NUM_DECODE_NODES=${NUM_DECODE_NODES:-4}
 NUM_DECODE_WORKERS=${NUM_DECODE_WORKERS:-1}
-DECODE_ENGINE_CONFIG="${DECODE_ENGINE_CONFIG:-/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml}"
+DECODE_ENGINE_CONFIG="${DECODE_ENGINE_CONFIG:-/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_decode.yaml}"
 
 DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"}
 
@@ -56,7 +56,7 @@ srun \
   --nodelist "${HEAD_NODE}" \
   --nodes 1 \
   --jobid "${SLURM_JOB_ID}" \
-  /mnt/multinode/start_frontend_services.sh &
+  /mnt/examples/multimodal/scripts/start_frontend_services.sh &
 
 # NOTE: Output streamed to stdout for ease of understanding the example, but
 # in practice you would probably set `srun --output ... --error ...` to pipe
@@ -78,7 +78,7 @@ for ((i=1; i<=${NUM_PREFILL_WORKERS}; i++)); do
     --nodes "${NUM_PREFILL_NODES}" \
     --ntasks-per-node "${NUM_GPUS_PER_NODE}" \
     --jobid "${SLURM_JOB_ID}" \
-    /mnt/multinode/start_trtllm_worker.sh &
+    /mnt/examples/multimodal/scripts/start_trtllm_worker.sh &
 done
 
 for ((i=1; i<=${NUM_DECODE_WORKERS}; i++)); do
@@ -98,5 +98,5 @@ for ((i=1; i<=${NUM_DECODE_WORKERS}; i++)); do
     --nodes "${NUM_DECODE_NODES}" \
     --ntasks-per-node "${NUM_GPUS_PER_NODE}" \
    --jobid "${SLURM_JOB_ID}" \
-    /mnt/multinode/start_trtllm_worker.sh &
+    /mnt/examples/multimodal/scripts/start_trtllm_worker.sh &
 done
\ No newline at end of file
diff --git a/docs/backends/trtllm/README.md b/docs/backends/trtllm/README.md
index a7e3107659..a43176e24f 100644
--- a/docs/backends/trtllm/README.md
+++ b/docs/backends/trtllm/README.md
@@ -162,7 +162,7 @@ cd $DYNAMO_HOME/components/backends/trtllm
 ```bash
 cd $DYNAMO_HOME/components/backends/trtllm
-export AGG_ENGINE_ARGS=./engine_configs/deepseek_r1/mtp/mtp_agg.yaml
+export AGG_ENGINE_ARGS=./recipes/deepseek-r1/trtllm/mtp/mtp_agg.yaml
 export SERVED_MODEL_NAME="nvidia/DeepSeek-R1-FP4"
 # nvidia/DeepSeek-R1-FP4 is a large model
 export MODEL_PATH="nvidia/DeepSeek-R1-FP4"
diff --git a/docs/backends/trtllm/gemma3_sliding_window_attention.md b/docs/backends/trtllm/gemma3_sliding_window_attention.md
index 5f9cca904c..5161332205 100644
--- a/docs/backends/trtllm/gemma3_sliding_window_attention.md
+++ b/docs/backends/trtllm/gemma3_sliding_window_attention.md
@@ -30,7 +30,7 @@ VSWA is a mechanism in which a model’s layers alternate between multiple slidi
 cd $DYNAMO_HOME/components/backends/trtllm
 export MODEL_PATH=google/gemma-3-1b-it
 export SERVED_MODEL_NAME=$MODEL_PATH
-export AGG_ENGINE_ARGS=engine_configs/gemma3/vswa_agg.yaml
+export AGG_ENGINE_ARGS=recipes/gemma3/trtllm/vswa_agg.yaml
 ./launch/agg.sh
 ```
@@ -39,7 +39,7 @@ export AGG_ENGINE_ARGS=engine_configs/gemma3/vswa_agg.yaml
 cd $DYNAMO_HOME/components/backends/trtllm
 export MODEL_PATH=google/gemma-3-1b-it
 export SERVED_MODEL_NAME=$MODEL_PATH
-export AGG_ENGINE_ARGS=engine_configs/gemma3/vswa_agg.yaml
+export AGG_ENGINE_ARGS=recipes/gemma3/trtllm/vswa_agg.yaml
 ./launch/agg_router.sh
 ```
@@ -48,8 +48,8 @@ export AGG_ENGINE_ARGS=engine_configs/gemma3/vswa_agg.yaml
 cd $DYNAMO_HOME/components/backends/trtllm
 export MODEL_PATH=google/gemma-3-1b-it
 export SERVED_MODEL_NAME=$MODEL_PATH
-export PREFILL_ENGINE_ARGS=engine_configs/gemma3/vswa_prefill.yaml
-export DECODE_ENGINE_ARGS=engine_configs/gemma3/vswa_decode.yaml
+export PREFILL_ENGINE_ARGS=recipes/gemma3/trtllm/vswa_prefill.yaml
+export DECODE_ENGINE_ARGS=recipes/gemma3/trtllm/vswa_decode.yaml
 ./launch/disagg.sh
 ```
@@ -58,7 +58,7 @@ export DECODE_ENGINE_ARGS=engine_configs/gemma3/vswa_decode.yaml
 cd $DYNAMO_HOME/components/backends/trtllm
 export MODEL_PATH=google/gemma-3-1b-it
 export SERVED_MODEL_NAME=$MODEL_PATH
-export PREFILL_ENGINE_ARGS=engine_configs/gemma3/vswa_prefill.yaml
-export DECODE_ENGINE_ARGS=engine_configs/gemma3/vswa_decode.yaml
+export PREFILL_ENGINE_ARGS=recipes/gemma3/trtllm/vswa_prefill.yaml
+export DECODE_ENGINE_ARGS=recipes/gemma3/trtllm/vswa_decode.yaml
 ./launch/disagg_router.sh
 ```
diff --git a/docs/backends/trtllm/gpt-oss.md b/docs/backends/trtllm/gpt-oss.md
index 9c1f130522..279f2fd1c2 100644
--- a/docs/backends/trtllm/gpt-oss.md
+++ b/docs/backends/trtllm/gpt-oss.md
@@ -90,14 +90,14 @@ The deployment uses configuration files and command-line arguments to control be
 #### Configuration Files
 
-**Prefill Configuration (`engine_configs/gpt_oss/prefill.yaml`)**:
+**Prefill Configuration (`recipes/gpt-oss-120b/trtllm/disagg/prefill.yaml`)**:
 - `enable_attention_dp: false` - Attention data parallelism disabled for prefill
 - `enable_chunked_prefill: true` - Enables efficient chunked prefill processing
 - `moe_config.backend: CUTLASS` - Uses optimized CUTLASS kernels for MoE layers
 - `cache_transceiver_config.backend: ucx` - Uses UCX for efficient KV cache transfer
 - `cuda_graph_config.max_batch_size: 32` - Maximum batch size for CUDA graphs
 
-**Decode Configuration (`engine_configs/gpt_oss/decode.yaml`)**:
+**Decode Configuration (`recipes/gpt-oss-120b/trtllm/disagg/decode.yaml`)**:
 - `enable_attention_dp: true` - Attention data parallelism enabled for decode
 - `disable_overlap_scheduler: false` - Enables overlapping for decode efficiency
 - `moe_config.backend: CUTLASS` - Uses optimized CUTLASS kernels for MoE layers
@@ -147,7 +147,7 @@ python3 -m dynamo.frontend --router-mode round-robin --http-port 8000 &
 CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.trtllm \
   --model-path /model \
   --served-model-name openai/gpt-oss-120b \
-  --extra-engine-args engine_configs/gpt_oss/prefill.yaml \
+  --extra-engine-args recipes/gpt-oss-120b/trtllm/disagg/prefill.yaml \
   --dyn-reasoning-parser gpt_oss \
   --dyn-tool-call-parser harmony \
   --disaggregation-mode prefill \
@@ -164,7 +164,7 @@ CUDA_VISIBLE_DEVICES=0,1,2,3 python3 -m dynamo.trtllm \
 CUDA_VISIBLE_DEVICES=4,5,6,7 python3 -m dynamo.trtllm \
   --model-path /model \
   --served-model-name openai/gpt-oss-120b \
-  --extra-engine-args engine_configs/gpt_oss/decode.yaml \
+  --extra-engine-args recipes/gpt-oss-120b/trtllm/disagg/decode.yaml \
   --dyn-reasoning-parser gpt_oss \
   --dyn-tool-call-parser harmony \
   --disaggregation-mode decode \
diff --git a/docs/backends/trtllm/llama4_plus_eagle.md b/docs/backends/trtllm/llama4_plus_eagle.md
index 201b185243..f15bfa669f 100644
--- a/docs/backends/trtllm/llama4_plus_eagle.md
+++ b/docs/backends/trtllm/llama4_plus_eagle.md
@@ -30,7 +30,7 @@ This guide demonstrates how to deploy Llama 4 Maverick Instruct with Eagle Specu
 For advanced control over how requests are routed between prefill and decode workers in disaggregated mode, refer to the [Disaggregation Strategy](./README.md#disaggregation-strategy) section.
 
 ## Notes
-* Make sure the (`eagle3_one_model: true`) is set in the LLM API config inside the `engine_configs/llama4/eagle` folder.
+* Make sure the (`eagle3_one_model: true`) is set in the LLM API config inside the `recipes/llama4/trtllm/eagle` folder.
 
 ## Setup
@@ -54,7 +54,7 @@ See [this](./multinode/multinode-examples.md#setup) section from multinode guide
 ## Aggregated Serving
 ```bash
 export NUM_NODES=1
-export ENGINE_CONFIG="/mnt/engine_configs/llama4/eagle/eagle_agg.yaml"
+export ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/eagle/eagle_agg.yml"
 ./multinode/srun_aggregated.sh
 ```
@@ -62,9 +62,9 @@ export ENGINE_CONFIG="/mnt/engine_configs/llama4/eagle/eagle_agg.yaml"
 ```bash
 export NUM_PREFILL_NODES=1
-export PREFILL_ENGINE_CONFIG="/mnt/engine_configs/llama4/eagle/eagle_prefill.yaml"
+export PREFILL_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/eagle/eagle_prefill.yaml"
 export NUM_DECODE_NODES=1
-export DECODE_ENGINE_CONFIG="/mnt/engine_configs/llama4/eagle/eagle_decode.yaml"
+export DECODE_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/eagle/eagle_decode.yaml"
 ./multinode/srun_disaggregated.sh
 ```
diff --git a/docs/backends/trtllm/multimodal_support.md b/docs/backends/trtllm/multimodal_support.md
index a8cb246f41..3e6fffc353 100644
--- a/docs/backends/trtllm/multimodal_support.md
+++ b/docs/backends/trtllm/multimodal_support.md
@@ -27,7 +27,7 @@ Here are quick steps to launch Llama-4 Maverick BF16 in aggregated mode
 ```bash
 cd $DYNAMO_HOME/components/backends/trtllm
-export AGG_ENGINE_ARGS=./engine_configs/multinode/agg.yaml
+export AGG_ENGINE_ARGS=./recipes/llama4/trtllm/multimodal/agg.yaml
 export SERVED_MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct"
 export MODEL_PATH="meta-llama/Llama-4-Maverick-17B-128E-Instruct"
 ./launch/agg.sh
@@ -80,8 +80,8 @@ cd $DYNAMO_HOME/components/backends/trtllm
 export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-VL-7B-Instruct"}
 export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen2-VL-7B-Instruct"}
 export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"}
-export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/multimodal/prefill.yaml"}
-export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/multimodal/decode.yaml"}
+export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"recipes/llama4/trtllm/multimodal/prefill.yaml"}
+export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"recipes/llama4/trtllm/multimodal/decode.yaml"}
 export MODALITY=${MODALITY:-"multimodal"}
 ./launch/disagg.sh
diff --git a/docs/backends/trtllm/multinode/multinode-examples.md b/docs/backends/trtllm/multinode/multinode-examples.md
index be76bad1ba..622ab10637 100644
--- a/docs/backends/trtllm/multinode/multinode-examples.md
+++ b/docs/backends/trtllm/multinode/multinode-examples.md
@@ -136,7 +136,7 @@ follow these steps below to launch an **aggregated** deployment across 4 nodes:
 ```bash
 # Default set in srun_aggregated.sh, but can customize here.
-# export ENGINE_CONFIG="/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml"
+# export ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_agg.yaml"
 
 # Customize NUM_NODES to match the desired parallelism in ENGINE_CONFIG
 # The product of NUM_NODES*NUM_GPUS_PER_NODE should match the number of
@@ -165,8 +165,8 @@ deployment across 8 nodes:
 ```bash
 # Defaults set in srun_disaggregated.sh, but can customize here.
-# export PREFILL_ENGINE_CONFIG="/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml"
-# export DECODE_ENGINE_CONFIG="/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml"
+# export PREFILL_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_prefill.yaml"
+# export DECODE_ENGINE_CONFIG="/mnt/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_decode.yaml"
 
 # Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG
 # Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG
diff --git a/docs/backends/trtllm/multinode/multinode-multimodal-example.md b/docs/backends/trtllm/multinode/multinode-multimodal-example.md
index fe050efd3c..9546f7a210 100644
--- a/docs/backends/trtllm/multinode/multinode-multimodal-example.md
+++ b/docs/backends/trtllm/multinode/multinode-multimodal-example.md
@@ -34,7 +34,7 @@ limitations under the License.
 >
 > Before running the deployment, you must update the engine configuration files to change `backend: DEFAULT` to `backend: default` (lowercase). Run the following command:
 > ```bash
-> sed -i 's/backend: DEFAULT/backend: default/g' /mnt/engine_configs/multimodal/llama4/prefill.yaml /mnt/engine_configs/multimodal/llama4/decode.yaml
+> sed -i 's/backend: DEFAULT/backend: default/g' /mnt/recipes/llama4/trtllm/multimodal/prefill.yaml /mnt/recipes/llama4/trtllm/multimodal/decode.yaml
 > ```
@@ -100,8 +100,8 @@ deployment across 4 nodes:
 ```bash
 # Defaults set in srun_disaggregated.sh, but can customize here.
-# export PREFILL_ENGINE_CONFIG="/mnt/engine_configs/multimodal/llama4/prefill.yaml"
-# export DECODE_ENGINE_CONFIG="/mnt/engine_configs/multimodal/llama4/decode.yaml"
+# export PREFILL_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/multimodal/prefill.yaml"
+# export DECODE_ENGINE_CONFIG="/mnt/recipes/llama4/trtllm/multimodal/decode.yaml"
 
 # Customize NUM_PREFILL_NODES to match the desired parallelism in PREFILL_ENGINE_CONFIG
 # Customize NUM_DECODE_NODES to match the desired parallelism in DECODE_ENGINE_CONFIG
diff --git a/components/backends/trtllm/multinode/start_frontend_services.sh b/examples/multimodal/scripts/start_frontend_services.sh
similarity index 100%
rename from components/backends/trtllm/multinode/start_frontend_services.sh
rename to examples/multimodal/scripts/start_frontend_services.sh
diff --git a/components/backends/trtllm/multinode/start_trtllm_worker.sh b/examples/multimodal/scripts/start_trtllm_worker.sh
similarity index 100%
rename from components/backends/trtllm/multinode/start_trtllm_worker.sh
rename to examples/multimodal/scripts/start_trtllm_worker.sh
diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_agg.yaml b/recipes/deepseek-r1/trtllm/mtp/mtp_agg.yaml
similarity index 100%
rename from components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_agg.yaml
rename to recipes/deepseek-r1/trtllm/mtp/mtp_agg.yaml
diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_decode.yaml b/recipes/deepseek-r1/trtllm/mtp/mtp_decode.yaml
similarity index 100%
rename from components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_decode.yaml
rename to recipes/deepseek-r1/trtllm/mtp/mtp_decode.yaml
diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_prefill.yaml b/recipes/deepseek-r1/trtllm/mtp/mtp_prefill.yaml
similarity index 100%
rename from components/backends/trtllm/engine_configs/deepseek_r1/mtp/mtp_prefill.yaml
rename to recipes/deepseek-r1/trtllm/mtp/mtp_prefill.yaml
diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/simple/agg.yaml b/recipes/deepseek-r1/trtllm/simple/agg.yaml
similarity index 100%
rename from components/backends/trtllm/engine_configs/deepseek_r1/simple/agg.yaml
rename to recipes/deepseek-r1/trtllm/simple/agg.yaml
diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/simple/decode.yaml b/recipes/deepseek-r1/trtllm/simple/decode.yaml
similarity index 100%
rename from components/backends/trtllm/engine_configs/deepseek_r1/simple/decode.yaml
rename to recipes/deepseek-r1/trtllm/simple/decode.yaml
diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/simple/prefill.yaml b/recipes/deepseek-r1/trtllm/simple/prefill.yaml
similarity index 100%
rename from components/backends/trtllm/engine_configs/deepseek_r1/simple/prefill.yaml
rename to recipes/deepseek-r1/trtllm/simple/prefill.yaml
diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/dep16_agg.yaml b/recipes/deepseek-r1/trtllm/wide_ep/dep16_agg.yaml
similarity index 100%
rename from components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/dep16_agg.yaml
rename to recipes/deepseek-r1/trtllm/wide_ep/dep16_agg.yaml
diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/eplb.yaml b/recipes/deepseek-r1/trtllm/wide_ep/eplb.yaml
similarity index 100%
rename from components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/eplb.yaml
rename to recipes/deepseek-r1/trtllm/wide_ep/eplb.yaml
diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml b/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_agg.yaml
similarity index 92%
rename from components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml
rename to recipes/deepseek-r1/trtllm/wide_ep/wide_ep_agg.yaml
index d32aab2dd3..31d7e395bd 100644
--- a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_agg.yaml
+++ b/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_agg.yaml
@@ -11,7 +11,7 @@ moe_config:
   # moe_max_num_tokens = max_batch_size * moe_expert_parallel_size
   # 4096 = 256 * 16
   # moe_max_num_tokens: 4096
-  load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml
+  load_balancer: /mnt/recipes/deepseek-r1/trtllm/wide_ep/eplb.yaml
 
 tensor_parallel_size: 16
 moe_expert_parallel_size: 16
diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml b/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_decode.yaml
similarity index 96%
rename from components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml
rename to recipes/deepseek-r1/trtllm/wide_ep/wide_ep_decode.yaml
index 8f953c6472..6d36ab5ce6 100644
--- a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml
+++ b/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_decode.yaml
@@ -17,7 +17,7 @@ backend: pytorch
 # WideEP related settings
 moe_config:
   backend: WIDEEP
-  load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml
+  load_balancer: /mnt/recipes/deepseek-r1/trtllm/wide_ep/eplb.yaml
 
 # TP/EP/PP/DP
 tensor_parallel_size: 16
diff --git a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml b/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_prefill.yaml
similarity index 95%
rename from components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml
rename to recipes/deepseek-r1/trtllm/wide_ep/wide_ep_prefill.yaml
index 8a756cc32b..7af74c74ae 100644
--- a/components/backends/trtllm/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml
+++ b/recipes/deepseek-r1/trtllm/wide_ep/wide_ep_prefill.yaml
@@ -17,7 +17,7 @@ backend: pytorch
 # WideEP related settings
 moe_config:
   backend: WIDEEP
-  load_balancer: /mnt/engine_configs/deepseek_r1/wide_ep/eplb.yaml
+  load_balancer: /mnt/recipes/deepseek-r1/trtllm/wide_ep/eplb.yaml
 
 # TP/EP/PP/DP
 tensor_parallel_size: 16
diff --git a/components/backends/trtllm/engine_configs/gemma3/vswa_agg.yaml b/recipes/gemma3/trtllm/vswa_agg.yaml
similarity index 100%
rename from components/backends/trtllm/engine_configs/gemma3/vswa_agg.yaml
rename to recipes/gemma3/trtllm/vswa_agg.yaml
diff --git a/components/backends/trtllm/engine_configs/gemma3/vswa_decode.yaml b/recipes/gemma3/trtllm/vswa_decode.yaml
similarity index 100%
rename from components/backends/trtllm/engine_configs/gemma3/vswa_decode.yaml
rename to recipes/gemma3/trtllm/vswa_decode.yaml
diff --git a/components/backends/trtllm/engine_configs/gemma3/vswa_prefill.yaml b/recipes/gemma3/trtllm/vswa_prefill.yaml
similarity index 100%
rename from components/backends/trtllm/engine_configs/gemma3/vswa_prefill.yaml
rename to recipes/gemma3/trtllm/vswa_prefill.yaml
diff --git a/components/backends/trtllm/engine_configs/gpt_oss/decode.yaml b/recipes/gpt-oss-120b/trtllm/disagg/decode.yaml
similarity index 100%
rename from components/backends/trtllm/engine_configs/gpt_oss/decode.yaml
rename to recipes/gpt-oss-120b/trtllm/disagg/decode.yaml
diff --git a/components/backends/trtllm/engine_configs/gpt_oss/prefill.yaml b/recipes/gpt-oss-120b/trtllm/disagg/prefill.yaml
similarity index 100%
rename from components/backends/trtllm/engine_configs/gpt_oss/prefill.yaml
rename to recipes/gpt-oss-120b/trtllm/disagg/prefill.yaml
diff --git a/components/backends/trtllm/engine_configs/llama4/eagle/eagle_agg.yml b/recipes/llama4/trtllm/eagle/eagle_agg.yml
similarity index 100%
rename from components/backends/trtllm/engine_configs/llama4/eagle/eagle_agg.yml
rename to recipes/llama4/trtllm/eagle/eagle_agg.yml
diff --git a/components/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yaml b/recipes/llama4/trtllm/eagle/eagle_decode.yaml
similarity index 100%
rename from components/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yaml
rename to recipes/llama4/trtllm/eagle/eagle_decode.yaml
diff --git a/components/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yaml b/recipes/llama4/trtllm/eagle/eagle_prefill.yaml
similarity index 100%
rename from components/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yaml
rename to recipes/llama4/trtllm/eagle/eagle_prefill.yaml
diff --git a/components/backends/trtllm/engine_configs/multimodal/llama4/decode.yaml b/recipes/llama4/trtllm/multimodal/decode.yaml
similarity index 100%
rename from components/backends/trtllm/engine_configs/multimodal/llama4/decode.yaml
rename to recipes/llama4/trtllm/multimodal/decode.yaml
diff --git a/components/backends/trtllm/engine_configs/multimodal/llama4/prefill.yaml b/recipes/llama4/trtllm/multimodal/prefill.yaml
similarity index 100%
rename from components/backends/trtllm/engine_configs/multimodal/llama4/prefill.yaml
rename to recipes/llama4/trtllm/multimodal/prefill.yaml

From 1c5116458e6503e6c4bc309b979151cd4c74692b Mon Sep 17 00:00:00 2001
From: Anant Sharma
Date: Tue, 21 Oct 2025 07:18:41 -0700
Subject: [PATCH 2/8] fix multi node files

Signed-off-by: Anant Sharma
---
 .../trtllm/engine_configs/multimodal/agg.yaml | 29 +++++++++++++++++
 .../engine_configs/multimodal/decode.yaml     | 29 +++++++++++++++++
 .../engine_configs/multimodal/prefill.yaml    | 31 +++++++++++++++++++
 docs/backends/trtllm/multimodal_support.md    |  6 ++--
 .../multinode/trtllm}/srun_aggregated.sh      |  2 +-
 .../multinode/trtllm}/srun_disaggregated.sh   |  2 +-
 .../trtllm}/start_frontend_services.sh        |  0
 .../multinode/trtllm}/start_trtllm_worker.sh  |  0
 8 files changed, 94 insertions(+), 5 deletions(-)
 create mode 100644 components/backends/trtllm/engine_configs/multimodal/agg.yaml
 create mode 100644 components/backends/trtllm/engine_configs/multimodal/decode.yaml
 create mode 100644 components/backends/trtllm/engine_configs/multimodal/prefill.yaml
 rename {components/backends/trtllm/multinode => examples/basics/multinode/trtllm}/srun_aggregated.sh (98%)
 rename {components/backends/trtllm/multinode => examples/basics/multinode/trtllm}/srun_disaggregated.sh (99%)
 rename examples/{multimodal/scripts => basics/multinode/trtllm}/start_frontend_services.sh (100%)
 rename examples/{multimodal/scripts => basics/multinode/trtllm}/start_trtllm_worker.sh (100%)

diff --git a/components/backends/trtllm/engine_configs/multimodal/agg.yaml b/components/backends/trtllm/engine_configs/multimodal/agg.yaml
new file mode 100644
index 0000000000..6dbd676ee4
--- /dev/null
+++ b/components/backends/trtllm/engine_configs/multimodal/agg.yaml
@@ -0,0 +1,29 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+tensor_parallel_size: 1
+moe_expert_parallel_size: 1
+enable_attention_dp: false
+max_num_tokens: 8192
+max_batch_size: 16
+trust_remote_code: true
+backend: pytorch
+enable_chunked_prefill: true
+disable_overlap_scheduler: false
+kv_cache_config:
+  free_gpu_memory_fraction: 0.30
+  enable_block_reuse: false
+
+cache_transceiver_config:
+  backend: DEFAULT
\ No newline at end of file
diff --git a/components/backends/trtllm/engine_configs/multimodal/decode.yaml b/components/backends/trtllm/engine_configs/multimodal/decode.yaml
new file mode 100644
index 0000000000..6dbd676ee4
--- /dev/null
+++ b/components/backends/trtllm/engine_configs/multimodal/decode.yaml
@@ -0,0 +1,29 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+tensor_parallel_size: 1
+moe_expert_parallel_size: 1
+enable_attention_dp: false
+max_num_tokens: 8192
+max_batch_size: 16
+trust_remote_code: true
+backend: pytorch
+enable_chunked_prefill: true
+disable_overlap_scheduler: false
+kv_cache_config:
+  free_gpu_memory_fraction: 0.30
+  enable_block_reuse: false
+
+cache_transceiver_config:
+  backend: DEFAULT
\ No newline at end of file
diff --git a/components/backends/trtllm/engine_configs/multimodal/prefill.yaml b/components/backends/trtllm/engine_configs/multimodal/prefill.yaml
new file mode 100644
index 0000000000..83a65e8bf3
--- /dev/null
+++ b/components/backends/trtllm/engine_configs/multimodal/prefill.yaml
@@ -0,0 +1,31 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+tensor_parallel_size: 1
+moe_expert_parallel_size: 1
+enable_attention_dp: false
+max_num_tokens: 8192
+max_batch_size: 16
+trust_remote_code: true
+backend: pytorch
+enable_chunked_prefill: true
+# Overlap scheduler not currently supported in prefill only workers.
+disable_overlap_scheduler: true
+
+kv_cache_config:
+  free_gpu_memory_fraction: 0.30
+  enable_block_reuse: false
+
+cache_transceiver_config:
+  backend: DEFAULT
\ No newline at end of file
diff --git a/docs/backends/trtllm/multimodal_support.md b/docs/backends/trtllm/multimodal_support.md
index 3e6fffc353..a8cb246f41 100644
--- a/docs/backends/trtllm/multimodal_support.md
+++ b/docs/backends/trtllm/multimodal_support.md
@@ -27,7 +27,7 @@ Here are quick steps to launch Llama-4 Maverick BF16 in aggregated mode
 ```bash
 cd $DYNAMO_HOME/components/backends/trtllm
-export AGG_ENGINE_ARGS=./recipes/llama4/trtllm/multimodal/agg.yaml
+export AGG_ENGINE_ARGS=./engine_configs/multinode/agg.yaml
 export SERVED_MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct"
 export MODEL_PATH="meta-llama/Llama-4-Maverick-17B-128E-Instruct"
 ./launch/agg.sh
@@ -80,8 +80,8 @@ cd $DYNAMO_HOME/components/backends/trtllm
 export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-VL-7B-Instruct"}
 export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen2-VL-7B-Instruct"}
 export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"}
-export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"recipes/llama4/trtllm/multimodal/prefill.yaml"}
-export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"recipes/llama4/trtllm/multimodal/decode.yaml"}
+export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/multimodal/prefill.yaml"}
+export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/multimodal/decode.yaml"}
 export MODALITY=${MODALITY:-"multimodal"}
 ./launch/disagg.sh
diff --git a/components/backends/trtllm/multinode/srun_aggregated.sh b/examples/basics/multinode/trtllm/srun_aggregated.sh
similarity index 98%
rename from components/backends/trtllm/multinode/srun_aggregated.sh
rename to examples/basics/multinode/trtllm/srun_aggregated.sh
index e9568db9cd..d1645e522b 100755
--- a/components/backends/trtllm/multinode/srun_aggregated.sh
+++ b/examples/basics/multinode/trtllm/srun_aggregated.sh
@@ -10,7 +10,7 @@ IMAGE="${IMAGE:-""}"
 # but you may freely customize the mounts based on your cluster. A common practice
 # is to mount paths to NFS storage for common scripts, model weights, etc.
 # NOTE: This can be a comma separated list of multiple mounts as well.
-DEFAULT_MOUNT="${PWD}/../:/mnt"
+DEFAULT_MOUNT="${PWD}/../../../:/mnt"
 MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}"
 
 # Example values, assuming 4 nodes with 4 GPUs on each node, such as 4xGB200 nodes.
diff --git a/components/backends/trtllm/multinode/srun_disaggregated.sh b/examples/basics/multinode/trtllm/srun_disaggregated.sh
similarity index 99%
rename from components/backends/trtllm/multinode/srun_disaggregated.sh
rename to examples/basics/multinode/trtllm/srun_disaggregated.sh
index fddac1ccbe..5c8e5bd755 100755
--- a/components/backends/trtllm/multinode/srun_disaggregated.sh
+++ b/examples/basics/multinode/trtllm/srun_disaggregated.sh
@@ -10,7 +10,7 @@ IMAGE="${IMAGE:-""}"
 # but you may freely customize the mounts based on your cluster. A common practice
 # is to mount paths to NFS storage for common scripts, model weights, etc.
 # NOTE: This can be a comma separated list of multiple mounts as well.
-DEFAULT_MOUNT="${PWD}/../:/mnt"
+DEFAULT_MOUNT="${PWD}/../../../:/mnt"
 MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}"
 
 NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4}
diff --git a/examples/multimodal/scripts/start_frontend_services.sh b/examples/basics/multinode/trtllm/start_frontend_services.sh
similarity index 100%
rename from examples/multimodal/scripts/start_frontend_services.sh
rename to examples/basics/multinode/trtllm/start_frontend_services.sh
diff --git a/examples/multimodal/scripts/start_trtllm_worker.sh b/examples/basics/multinode/trtllm/start_trtllm_worker.sh
similarity index 100%
rename from examples/multimodal/scripts/start_trtllm_worker.sh
rename to examples/basics/multinode/trtllm/start_trtllm_worker.sh

From d2fcf83ab07cb03b4223aefb15e7aef7f6f36107 Mon Sep 17 00:00:00 2001
From: Anant Sharma
Date: Tue, 21 Oct 2025 07:26:16 -0700
Subject: [PATCH 3/8] copy paste fix

Signed-off-by: Anant Sharma
---
 .../trtllm/engine_configs/multimodal/agg.yaml | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/components/backends/trtllm/engine_configs/multimodal/agg.yaml b/components/backends/trtllm/engine_configs/multimodal/agg.yaml
index 6dbd676ee4..24bc75601c 100644
--- a/components/backends/trtllm/engine_configs/multimodal/agg.yaml
+++ b/components/backends/trtllm/engine_configs/multimodal/agg.yaml
@@ -12,18 +12,22 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-tensor_parallel_size: 1
+tensor_parallel_size: 8
 moe_expert_parallel_size: 1
 enable_attention_dp: false
-max_num_tokens: 8192
-max_batch_size: 16
+max_num_tokens: 4096
+max_batch_size: 8
 trust_remote_code: true
 backend: pytorch
 enable_chunked_prefill: true
-disable_overlap_scheduler: false
+
 kv_cache_config:
-  free_gpu_memory_fraction: 0.30
+  free_gpu_memory_fraction: 0.3
   enable_block_reuse: false
 
 cache_transceiver_config:
-  backend: DEFAULT
\ No newline at end of file
+  backend: DEFAULT
+# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
+# NOTE: overlap_scheduler enabled by default since this commit and changed
+# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
+# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
\ No newline at end of file

From 126ccd4b6c5c86ba0410f87b47599b7a2cee8317 Mon Sep 17 00:00:00 2001
From: Anant Sharma
Date: Tue, 21 Oct 2025 07:26:57 -0700
Subject: [PATCH 4/8] fix

Signed-off-by: Anant Sharma
---
 components/backends/trtllm/engine_configs/multimodal/agg.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/components/backends/trtllm/engine_configs/multimodal/agg.yaml b/components/backends/trtllm/engine_configs/multimodal/agg.yaml
index 24bc75601c..754f8ce759 100644
--- a/components/backends/trtllm/engine_configs/multimodal/agg.yaml
+++ b/components/backends/trtllm/engine_configs/multimodal/agg.yaml
@@ -30,4 +30,4 @@ cache_transceiver_config:
 # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
 # NOTE: overlap_scheduler enabled by default since this commit and changed
 # config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
-# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
\ No newline at end of file
+# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428

From b14ec2723a0198c9dc7bcd5c240d12a5b3afa2ce Mon Sep 17 00:00:00 2001
From: tanmayv25
Date: Wed, 22 Oct 2025 15:14:33 -0700
Subject: [PATCH 5/8] move remaining recipes

---
 benchmarks/router/run_engines.sh              | 10 +++---
 .../trtllm/deploy/agg-with-config.yaml        |  4 +--
 components/backends/trtllm/deploy/agg.yaml    |  4 +--
 .../backends/trtllm/deploy/agg_router.yaml    |  4 +--
 .../trtllm/deploy/disagg-multinode.yaml       | 12 +++----
 components/backends/trtllm/deploy/disagg.yaml |  8 ++---
 .../trtllm/deploy/disagg_planner.yaml         |  8 ++---
 .../backends/trtllm/deploy/disagg_router.yaml |  8 ++---
 components/backends/trtllm/launch/agg.sh      |  3 +-
 .../backends/trtllm/launch/agg_metrics.sh     |  3 +-
 .../backends/trtllm/launch/agg_router.sh      |  3 +-
 components/backends/trtllm/launch/disagg.sh   |  5 +--
 .../backends/trtllm/launch/disagg_router.sh   |  5 +--
 .../backends/trtllm/launch/epd_disagg.sh      |  7 ++--
 .../backends/trtllm/launch/gpt_oss_disagg.sh  |  5 +--
 docs/backends/trtllm/multimodal_support.md    | 10 +++---
 docs/kubernetes/README.md                     |  2 +-
 .../trtllm}/agg.yaml                          |  0
 .../trtllm}/decode.yaml                       |  0
 .../trtllm}/prefill.yaml                      |  0
 .../llama4/trtllm}/multimodal/agg.yaml        |  0
 recipes/qwen2-vl-7b-instruct/trtllm/agg.yaml  | 33 ++++++++++++++++++
 .../qwen2-vl-7b-instruct/trtllm}/decode.yaml  |  0
 .../qwen2-vl-7b-instruct/trtllm}/encode.yaml  |  0
 .../qwen2-vl-7b-instruct/trtllm}/prefill.yaml |  0
 recipes/qwen3/trtllm/agg.yaml                 | 34 +++++++++++++++++++
 recipes/qwen3/trtllm/decode.yaml              | 31 +++++++++++++++++
 recipes/qwen3/trtllm/prefill.yaml             | 30 ++++++++++++++++
 28 files changed, 183 insertions(+), 46 deletions(-)
 rename {components/backends/trtllm/engine_configs => recipes/deepseek-r1-distill-llama-8b/trtllm}/agg.yaml (100%)
 rename {components/backends/trtllm/engine_configs => recipes/deepseek-r1-distill-llama-8b/trtllm}/decode.yaml (100%)
 rename {components/backends/trtllm/engine_configs => recipes/deepseek-r1-distill-llama-8b/trtllm}/prefill.yaml (100%)
 rename {components/backends/trtllm/engine_configs => recipes/llama4/trtllm}/multimodal/agg.yaml (100%)
 create mode 100644 recipes/qwen2-vl-7b-instruct/trtllm/agg.yaml
 rename {components/backends/trtllm/engine_configs/multimodal => recipes/qwen2-vl-7b-instruct/trtllm}/decode.yaml (100%)
 rename {components/backends/trtllm/engine_configs => recipes/qwen2-vl-7b-instruct/trtllm}/encode.yaml (100%)
 rename {components/backends/trtllm/engine_configs/multimodal => recipes/qwen2-vl-7b-instruct/trtllm}/prefill.yaml (100%)
 create mode 100644 recipes/qwen3/trtllm/agg.yaml
 create mode 100644 recipes/qwen3/trtllm/decode.yaml
 create mode 100644 recipes/qwen3/trtllm/prefill.yaml

diff --git a/benchmarks/router/run_engines.sh b/benchmarks/router/run_engines.sh
index 18a97c8e28..2bda93c632 100755
--- a/benchmarks/router/run_engines.sh
+++ b/benchmarks/router/run_engines.sh
@@ -4,8 +4,10 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # Parse command-line arguments
+export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
 NUM_WORKERS=8
 MODEL_PATH="deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
+RECIPE_PATH="$DYNAMO_HOME/recipes/deepseek-r1-distill-llama-8b/trtllm"
 TENSOR_PARALLEL_SIZE=1
 DATA_PARALLEL_SIZE=1
 USE_MOCKERS=false
@@ -84,13 +86,13 @@ if [ ${#EXTRA_ARGS[@]} -eq 0 ]; then
     )
 elif [ "$USE_TRTLLM" = true ]; then
     # Default args for TensorRT-LLM engine using predefined YAML configs
-    # Config files located at: ../../components/backends/trtllm/engine_configs/{agg,decode,prefill}.yaml
+    # Config files located at: $RECIPE_PATH/{agg,decode,prefill}.yaml
     if [ "$MODE" = "prefill" ]; then
-        ENGINE_CONFIG="../../components/backends/trtllm/engine_configs/prefill.yaml"
+        ENGINE_CONFIG="$RECIPE_PATH/prefill.yaml"
     elif [ "$MODE" = "decode" ]; then
-        ENGINE_CONFIG="../../components/backends/trtllm/engine_configs/decode.yaml"
+        ENGINE_CONFIG="$RECIPE_PATH/decode.yaml"
     else
-        ENGINE_CONFIG="../../components/backends/trtllm/engine_configs/agg.yaml"
+        ENGINE_CONFIG="$RECIPE_PATH/agg.yaml"
     fi
 
     EXTRA_ARGS=(
diff --git a/components/backends/trtllm/deploy/agg-with-config.yaml b/components/backends/trtllm/deploy/agg-with-config.yaml
index dd15e56e65..e40ca48ada 100644
--- a/components/backends/trtllm/deploy/agg-with-config.yaml
+++ b/components/backends/trtllm/deploy/agg-with-config.yaml
@@ -55,7 +55,7 @@ spec:
   # mount the configmap as a volume
   volumeMounts:
   - name: nvidia-config
-    mountPath: /workspace/components/backends/trtllm/engine_configs
+    mountPath: /workspace/
     readOnly: true
   command:
   - python3
@@ -67,4 +67,4 @@ spec:
   - --served-model-name
   - Qwen/Qwen3-0.6B
   - --extra-engine-args
-  - engine_configs/agg.yaml
+  - ./recipes/qwen3/trtllm/agg.yaml
diff --git a/components/backends/trtllm/deploy/agg.yaml b/components/backends/trtllm/deploy/agg.yaml
index c7187673e4..54412576a2 100644
--- a/components/backends/trtllm/deploy/agg.yaml
+++ b/components/backends/trtllm/deploy/agg.yaml
@@ -25,7 +25,7 @@ spec:
   extraPodSpec:
     mainContainer:
       image: my-registry/trtllm-runtime:my-tag
-      workingDir: /workspace/components/backends/trtllm
+      workingDir: /workspace/
       command:
       - python3
      - -m
@@ -36,4 +36,4 @@ spec:
   - --served-model-name
   - Qwen/Qwen3-0.6B
   - --extra-engine-args
-  - engine_configs/agg.yaml
+  - ./recipes/qwen3/trtllm/agg.yaml
diff --git a/components/backends/trtllm/deploy/agg_router.yaml b/components/backends/trtllm/deploy/agg_router.yaml
index 787deb9847..ed42129fb4 100644
--- a/components/backends/trtllm/deploy/agg_router.yaml
+++ b/components/backends/trtllm/deploy/agg_router.yaml
@@ -28,7 +28,7 @@ spec:
   extraPodSpec:
     mainContainer:
      image: my-registry/trtllm-runtime:my-tag
-      workingDir: /workspace/components/backends/trtllm
+      workingDir: /workspace/
       command:
       - python3
      - -m
@@ -39,5 +39,5 @@ spec:
   - --served-model-name
   - Qwen/Qwen3-0.6B
   - --extra-engine-args
-  - engine_configs/agg.yaml
+  - ./recipes/qwen3/trtllm/agg.yaml
   - --publish-events-and-metrics
diff --git a/components/backends/trtllm/deploy/disagg-multinode.yaml b/components/backends/trtllm/deploy/disagg-multinode.yaml
index 3da492107a..2906cfd193 100644
--- a/components/backends/trtllm/deploy/disagg-multinode.yaml
+++ b/components/backends/trtllm/deploy/disagg-multinode.yaml
@@ -125,10 +125,10 @@ spec:
   mainContainer:
     volumeMounts:
     - name: nvidia-config
-      mountPath: /workspace/components/backends/trtllm/engine_configs
+      mountPath: /workspace/
      readOnly: true
     image: my-registry/trtllm-runtime:my-tag
-    workingDir: /workspace/components/backends/trtllm
+    workingDir: /workspace/
     command:
     - python3
    - -m
@@ -139,7 +139,7 @@ spec:
   - --served-model-name
   - Qwen/Qwen3-0.6B
   - --extra-engine-args
-  - engine_configs/prefill.yaml
+  - ./recipes/qwen3/trtllm/prefill.yaml
   - --disaggregation-mode
   - prefill
   - --disaggregation-strategy
@@ -165,10 +165,10 @@ spec:
   mainContainer:
     volumeMounts:
    - name: nvidia-config
-      mountPath: /workspace/components/backends/trtllm/engine_configs
+      mountPath: /workspace/
      readOnly: true
     image: my-registry/trtllm-runtime:my-tag
-    workingDir: /workspace/components/backends/trtllm
+    workingDir: /workspace/
     command:
     - python3
    - -m
@@ -179,7 +179,7 @@ spec:
   - --served-model-name
   - Qwen/Qwen3-0.6B
   - --extra-engine-args
-  - engine_configs/decode.yaml
+  - ./recipes/qwen3/trtllm/decode.yaml
   - --disaggregation-mode
   - decode
   - --disaggregation-strategy
diff --git a/components/backends/trtllm/deploy/disagg.yaml b/components/backends/trtllm/deploy/disagg.yaml
index 9055967dfe..501d2a4c20 100644
--- a/components/backends/trtllm/deploy/disagg.yaml
+++ b/components/backends/trtllm/deploy/disagg.yaml
@@ -26,7 +26,7 @@ spec:
   extraPodSpec:
     mainContainer:
       image: my-registry/trtllm-runtime:my-tag
-      workingDir: /workspace/components/backends/trtllm
+      workingDir: /workspace/
       command:
       - python3
      - -m
@@ -37,7 +37,7 @@ spec:
   - --served-model-name
   - Qwen/Qwen3-0.6B
   - --extra-engine-args
-  - engine_configs/prefill.yaml
+  - ./recipes/qwen3/trtllm/prefill.yaml
   - --disaggregation-mode
   - prefill
   - --disaggregation-strategy
@@ -54,7 +54,7 @@ spec:
   extraPodSpec:
     mainContainer:
       image: my-registry/trtllm-runtime:my-tag
-      workingDir: /workspace/components/backends/trtllm
+      workingDir: /workspace/
       command:
       - python3
      - -m
@@ -65,7 +65,7 @@ spec:
   - --served-model-name
   - Qwen/Qwen3-0.6B
   - --extra-engine-args
-  - engine_configs/decode.yaml
+  - ./recipes/qwen3/trtllm/decode.yaml
   - --disaggregation-mode
   - decode
   - --disaggregation-strategy
diff --git a/components/backends/trtllm/deploy/disagg_planner.yaml b/components/backends/trtllm/deploy/disagg_planner.yaml
index 09326e786d..5f0ef6b808 100644
--- a/components/backends/trtllm/deploy/disagg_planner.yaml
+++ b/components/backends/trtllm/deploy/disagg_planner.yaml
@@ -86,7 +86,7 @@ spec:
   terminationGracePeriodSeconds: 600
   mainContainer:
     image: my-registry/trtllm-runtime:my-tag
-    workingDir: /workspace/components/backends/trtllm
+    workingDir: /workspace/
     command:
     - python3
     args:
@@ -97,7 +97,7 @@
   - --served-model-name
   - Qwen/Qwen3-0.6B
   - --extra-engine-args
-  - engine_configs/decode.yaml
+  - ./recipes/qwen3/trtllm/decode.yaml
   - --disaggregation-mode
   - decode
   - --disaggregation-strategy
@@ -115,7 +115,7 @@
   terminationGracePeriodSeconds: 600
   mainContainer:
     image: my-registry/trtllm-runtime:my-tag
-    workingDir: /workspace/components/backends/trtllm
+    workingDir: /workspace/
     command:
     - python3
     args:
@@ -126,7 +126,7 @@
   - --served-model-name
   - Qwen/Qwen3-0.6B
   - --extra-engine-args
-  - engine_configs/prefill.yaml
+  - ./recipes/qwen3/trtllm/prefill.yaml
   - --disaggregation-mode
   - prefill
   - --disaggregation-strategy
diff --git a/components/backends/trtllm/deploy/disagg_router.yaml b/components/backends/trtllm/deploy/disagg_router.yaml
index 31fde39e05..f687354a95 100644
--- a/components/backends/trtllm/deploy/disagg_router.yaml
+++ b/components/backends/trtllm/deploy/disagg_router.yaml
@@ -28,7 +28,7 @@ spec:
   extraPodSpec:
     mainContainer:
       image: my-registry/trtllm-runtime:my-tag
-      workingDir: /workspace/components/backends/trtllm
+      workingDir: /workspace/
       command:
       - python3
      - -m
@@ -39,7 +39,7 @@ spec:
   - --served-model-name
   - Qwen/Qwen3-0.6B
   - --extra-engine-args
-  - engine_configs/prefill.yaml
+  - ./recipes/qwen3/trtllm/prefill.yaml
   - --disaggregation-mode
   - prefill
   - --disaggregation-strategy
@@ -56,7 +56,7 @@ spec:
   extraPodSpec:
     mainContainer:
       image: my-registry/trtllm-runtime:my-tag
-      workingDir: /workspace/components/backends/trtllm
+      workingDir: /workspace/
       command:
       - python3
      - -m
@@ -67,7 +67,7 @@ spec:
   - --served-model-name
   - Qwen/Qwen3-0.6B
   - --extra-engine-args
-  - engine_configs/decode.yaml
+  - ./recipes/qwen3/trtllm/decode.yaml
   - --disaggregation-mode
   - decode
   - --disaggregation-strategy
diff --git a/components/backends/trtllm/launch/agg.sh b/components/backends/trtllm/launch/agg.sh
index 5c7021c59c..f141531d7d 100755
--- a/components/backends/trtllm/launch/agg.sh
+++ b/components/backends/trtllm/launch/agg.sh
@@ -3,9 +3,10 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # Environment variables with defaults
+export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
 export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
 export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
-export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"engine_configs/agg.yaml"}
+export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/agg.yaml"}
 export MODALITY=${MODALITY:-"text"}
 # If you want to use multimodal, set MODALITY to "multimodal"
 #export MODALITY=${MODALITY:-"multimodal"}
diff --git a/components/backends/trtllm/launch/agg_metrics.sh b/components/backends/trtllm/launch/agg_metrics.sh
index 3232576d76..ad01482a8c 100755
--- a/components/backends/trtllm/launch/agg_metrics.sh
+++ b/components/backends/trtllm/launch/agg_metrics.sh
@@ -3,9 +3,10 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # Environment variables with defaults
+export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
 export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
 export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
-export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"engine_configs/agg.yaml"}
+export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/agg.yaml"}
 export MODALITY=${MODALITY:-"text"}
 
 # Setup cleanup trap
diff --git a/components/backends/trtllm/launch/agg_router.sh b/components/backends/trtllm/launch/agg_router.sh
index ca6d439e63..bb69762735 100755
--- a/components/backends/trtllm/launch/agg_router.sh
+++ b/components/backends/trtllm/launch/agg_router.sh
@@ -3,9 +3,10 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # Environment variables with defaults
+export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
 export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
 export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
-export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"engine_configs/agg.yaml"}
+export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/agg.yaml"}
 
 # Setup cleanup trap
 cleanup() {
diff --git a/components/backends/trtllm/launch/disagg.sh b/components/backends/trtllm/launch/disagg.sh
index f89eba5c9e..a068c2979a 100755
--- a/components/backends/trtllm/launch/disagg.sh
+++ b/components/backends/trtllm/launch/disagg.sh
@@ -3,11 +3,12 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # Environment variables with defaults
+export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
 export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
 export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
 export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"}
-export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/prefill.yaml"}
-export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/decode.yaml"}
+export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/prefill.yaml"}
+export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/decode.yaml"}
 export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"}
 export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"}
 export MODALITY=${MODALITY:-"text"}
diff --git a/components/backends/trtllm/launch/disagg_router.sh b/components/backends/trtllm/launch/disagg_router.sh
index e29c851a56..7fdfee7746 100755
--- a/components/backends/trtllm/launch/disagg_router.sh
+++ b/components/backends/trtllm/launch/disagg_router.sh
@@ -3,11 +3,12 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # Environment variables with defaults
+export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
 export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
 export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
 export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"prefill_first"}
-export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/prefill.yaml"}
-export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/decode.yaml"}
+export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/prefill.yaml"}
+export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen3/trtllm/decode.yaml"}
 export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"}
 export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"}
diff --git a/components/backends/trtllm/launch/epd_disagg.sh b/components/backends/trtllm/launch/epd_disagg.sh
index 60cfa1c249..ebe2e42330 100755
--- a/components/backends/trtllm/launch/epd_disagg.sh
+++ b/components/backends/trtllm/launch/epd_disagg.sh
@@ -3,12 +3,13 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # Environment variables with defaults
+export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
 export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-VL-7B-Instruct"}
 export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen2-VL-7B-Instruct"}
 export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"}
-export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/prefill.yaml"}
-export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/decode.yaml"}
-export ENCODE_ENGINE_ARGS=${ENCODE_ENGINE_ARGS:-"engine_configs/encode.yaml"}
+export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen2-vl-7b-instruct/trtllm/prefill.yaml"}
+export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen2-vl-7b-instruct/trtllm/decode.yaml"}
+export ENCODE_ENGINE_ARGS=${ENCODE_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/qwen2-vl-7b-instruct/trtllm/encode.yaml"}
 export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"}
 export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"}
 export ENCODE_CUDA_VISIBLE_DEVICES=${ENCODE_CUDA_VISIBLE_DEVICES:-"2"}
diff --git a/components/backends/trtllm/launch/gpt_oss_disagg.sh b/components/backends/trtllm/launch/gpt_oss_disagg.sh
index 606ba2a8a2..931b505804 100755
--- a/components/backends/trtllm/launch/gpt_oss_disagg.sh
+++ b/components/backends/trtllm/launch/gpt_oss_disagg.sh
@@ -3,11 +3,12 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # Environment variables with defaults
+export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
 export MODEL_PATH=${MODEL_PATH:-"/model"}
 export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"openai/gpt-oss-120b"}
 export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"prefill_first"}
-export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"recipes/gpt-oss-120b/trtllm/disagg/prefill.yaml"}
-export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"recipes/gpt-oss-120b/trtllm/disagg/decode.yaml"}
+export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/gpt-oss-120b/trtllm/disagg/prefill.yaml"}
+export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/recipes/gpt-oss-120b/trtllm/disagg/decode.yaml"}
 
 set -e
 trap 'echo Cleaning up...; kill 0' EXIT
diff --git a/docs/backends/trtllm/multimodal_support.md b/docs/backends/trtllm/multimodal_support.md
index a8cb246f41..0a589840b7 100644
--- a/docs/backends/trtllm/multimodal_support.md
+++ b/docs/backends/trtllm/multimodal_support.md
@@ -25,9 +25,9 @@ Please note that you should provide **either image URLs or embedding file paths*
 Here are quick steps to launch Llama-4 Maverick BF16 in aggregated mode
 ```bash
-cd $DYNAMO_HOME/components/backends/trtllm
+cd $DYNAMO_HOME
 
-export AGG_ENGINE_ARGS=./engine_configs/multinode/agg.yaml
+export AGG_ENGINE_ARGS=./recipes/llama4/trtllm/multimodal/agg.yaml
 export SERVED_MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct"
 export MODEL_PATH="meta-llama/Llama-4-Maverick-17B-128E-Instruct"
 ./launch/agg.sh
@@ -75,13 +75,13 @@ Here are quick steps to launch in disaggregated mode.
 The following is an example of launching a model in disaggregated mode. While this example uses `Qwen/Qwen2-VL-7B-Instruct`, you can adapt it for other models by modifying the environment variables for the model path and engine configurations.
 ```bash
-cd $DYNAMO_HOME/components/backends/trtllm
+cd $DYNAMO_HOME
 
 export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-VL-7B-Instruct"}
 export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen2-VL-7B-Instruct"}
 export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"}
-export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/multimodal/prefill.yaml"}
-export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/multimodal/decode.yaml"}
+export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"recipes/qwen2-vl-7b-instruct/trtllm/prefill.yaml"}
+export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"recipes/qwen2-vl-7b-instruct/trtllm/decode.yaml"}
 export MODALITY=${MODALITY:-"multimodal"}
 ./launch/disagg.sh
diff --git a/docs/kubernetes/README.md b/docs/kubernetes/README.md
index fb2408ba82..519796f901 100644
--- a/docs/kubernetes/README.md
+++ b/docs/kubernetes/README.md
@@ -182,7 +182,7 @@ args:
   - python3 -m dynamo.trtllm
     --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
     --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
-    --extra-engine-args engine_configs/agg.yaml
+    --extra-engine-args $DYNAMO_HOME/recipes/deepseek-r1-distill-llama-8b/agg.yaml
 ```
 
 Key customization points include:
diff --git a/components/backends/trtllm/engine_configs/agg.yaml b/recipes/deepseek-r1-distill-llama-8b/trtllm/agg.yaml
similarity index 100%
rename from components/backends/trtllm/engine_configs/agg.yaml
rename to recipes/deepseek-r1-distill-llama-8b/trtllm/agg.yaml
diff --git a/components/backends/trtllm/engine_configs/decode.yaml b/recipes/deepseek-r1-distill-llama-8b/trtllm/decode.yaml
similarity index 100%
rename from components/backends/trtllm/engine_configs/decode.yaml
rename to recipes/deepseek-r1-distill-llama-8b/trtllm/decode.yaml
diff --git a/components/backends/trtllm/engine_configs/prefill.yaml b/recipes/deepseek-r1-distill-llama-8b/trtllm/prefill.yaml
similarity index 100%
rename from components/backends/trtllm/engine_configs/prefill.yaml
rename to recipes/deepseek-r1-distill-llama-8b/trtllm/prefill.yaml
diff --git a/components/backends/trtllm/engine_configs/multimodal/agg.yaml b/recipes/llama4/trtllm/multimodal/agg.yaml
similarity index 100%
rename from components/backends/trtllm/engine_configs/multimodal/agg.yaml
rename to recipes/llama4/trtllm/multimodal/agg.yaml
diff --git a/recipes/qwen2-vl-7b-instruct/trtllm/agg.yaml b/recipes/qwen2-vl-7b-instruct/trtllm/agg.yaml
new file mode 100644
index 0000000000..754f8ce759
--- /dev/null
+++ b/recipes/qwen2-vl-7b-instruct/trtllm/agg.yaml
@@ -0,0 +1,33 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA
CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +tensor_parallel_size: 8 +moe_expert_parallel_size: 1 +enable_attention_dp: false +max_num_tokens: 4096 +max_batch_size: 8 +trust_remote_code: true +backend: pytorch +enable_chunked_prefill: true + +kv_cache_config: + free_gpu_memory_fraction: 0.3 + enable_block_reuse: false + +cache_transceiver_config: + backend: DEFAULT +# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 +# NOTE: overlap_scheduler enabled by default since this commit and changed +# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': +# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 diff --git a/components/backends/trtllm/engine_configs/multimodal/decode.yaml b/recipes/qwen2-vl-7b-instruct/trtllm/decode.yaml similarity index 100% rename from components/backends/trtllm/engine_configs/multimodal/decode.yaml rename to recipes/qwen2-vl-7b-instruct/trtllm/decode.yaml diff --git a/components/backends/trtllm/engine_configs/encode.yaml b/recipes/qwen2-vl-7b-instruct/trtllm/encode.yaml similarity index 100% rename from components/backends/trtllm/engine_configs/encode.yaml rename to recipes/qwen2-vl-7b-instruct/trtllm/encode.yaml diff --git a/components/backends/trtllm/engine_configs/multimodal/prefill.yaml b/recipes/qwen2-vl-7b-instruct/trtllm/prefill.yaml similarity index 100% rename from components/backends/trtllm/engine_configs/multimodal/prefill.yaml rename to recipes/qwen2-vl-7b-instruct/trtllm/prefill.yaml diff --git a/recipes/qwen3/trtllm/agg.yaml b/recipes/qwen3/trtllm/agg.yaml new file mode 100644 index 0000000000..53e0e6ce38 --- /dev/null +++ b/recipes/qwen3/trtllm/agg.yaml @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +enable_attention_dp: false +max_num_tokens: 8192 +max_batch_size: 16 +trust_remote_code: true +backend: pytorch +enable_chunked_prefill: true + +kv_cache_config: + free_gpu_memory_fraction: 0.85 + +# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603 +# NOTE: overlap_scheduler enabled by default since this commit and changed +# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler': +# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428 + + +cuda_graph_config: + max_batch_size: 16 \ No newline at end of file diff --git a/recipes/qwen3/trtllm/decode.yaml b/recipes/qwen3/trtllm/decode.yaml new file mode 100644 index 0000000000..a0154bb6e3 --- /dev/null +++ b/recipes/qwen3/trtllm/decode.yaml @@ -0,0 +1,31 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +enable_attention_dp: false +max_num_tokens: 8192 +trust_remote_code: true +backend: pytorch +enable_chunked_prefill: true +disable_overlap_scheduler: false + +cuda_graph_config: + max_batch_size: 16 + +kv_cache_config: + free_gpu_memory_fraction: 0.85 + +cache_transceiver_config: + backend: DEFAULT diff --git a/recipes/qwen3/trtllm/prefill.yaml b/recipes/qwen3/trtllm/prefill.yaml new file mode 100644 index 0000000000..4996c1fdc6 --- /dev/null +++ b/recipes/qwen3/trtllm/prefill.yaml @@ -0,0 +1,30 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +tensor_parallel_size: 1 +moe_expert_parallel_size: 1 +enable_attention_dp: false +max_num_tokens: 8192 +trust_remote_code: true +backend: pytorch +enable_chunked_prefill: true +# Overlap scheduler not currently supported in prefill only workers. 
+disable_overlap_scheduler: true +cuda_graph_config: + max_batch_size: 16 +kv_cache_config: + free_gpu_memory_fraction: 0.85 + +cache_transceiver_config: + backend: DEFAULT \ No newline at end of file From c499173efd575d4d8dc56bf3581b75b771adea17 Mon Sep 17 00:00:00 2001 From: Anant Sharma Date: Thu, 23 Oct 2025 08:51:14 -0700 Subject: [PATCH 6/8] rabbit Signed-off-by: Anant Sharma --- .../trtllm/gemma3_sliding_window_attention.md | 12 ++++++------ examples/basics/multinode/trtllm/srun_aggregated.sh | 2 +- .../basics/multinode/trtllm/srun_disaggregated.sh | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/backends/trtllm/gemma3_sliding_window_attention.md b/docs/backends/trtllm/gemma3_sliding_window_attention.md index 5161332205..5226ad5338 100644 --- a/docs/backends/trtllm/gemma3_sliding_window_attention.md +++ b/docs/backends/trtllm/gemma3_sliding_window_attention.md @@ -30,7 +30,7 @@ VSWA is a mechanism in which a model’s layers alternate between multiple slidi cd $DYNAMO_HOME/components/backends/trtllm export MODEL_PATH=google/gemma-3-1b-it export SERVED_MODEL_NAME=$MODEL_PATH -export AGG_ENGINE_ARGS=recipes/gemma3/trtllm/vswa_agg.yaml +export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml ./launch/agg.sh ``` @@ -39,7 +39,7 @@ export AGG_ENGINE_ARGS=recipes/gemma3/trtllm/vswa_agg.yaml cd $DYNAMO_HOME/components/backends/trtllm export MODEL_PATH=google/gemma-3-1b-it export SERVED_MODEL_NAME=$MODEL_PATH -export AGG_ENGINE_ARGS=recipes/gemma3/trtllm/vswa_agg.yaml +export AGG_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_agg.yaml ./launch/agg_router.sh ``` @@ -48,8 +48,8 @@ export AGG_ENGINE_ARGS=recipes/gemma3/trtllm/vswa_agg.yaml cd $DYNAMO_HOME/components/backends/trtllm export MODEL_PATH=google/gemma-3-1b-it export SERVED_MODEL_NAME=$MODEL_PATH -export PREFILL_ENGINE_ARGS=recipes/gemma3/trtllm/vswa_prefill.yaml -export DECODE_ENGINE_ARGS=recipes/gemma3/trtllm/vswa_decode.yaml +export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_prefill.yaml +export DECODE_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_decode.yaml ./launch/disagg.sh ``` @@ -58,7 +58,7 @@ export DECODE_ENGINE_ARGS=recipes/gemma3/trtllm/vswa_decode.yaml cd $DYNAMO_HOME/components/backends/trtllm export MODEL_PATH=google/gemma-3-1b-it export SERVED_MODEL_NAME=$MODEL_PATH -export PREFILL_ENGINE_ARGS=recipes/gemma3/trtllm/vswa_prefill.yaml -export DECODE_ENGINE_ARGS=recipes/gemma3/trtllm/vswa_decode.yaml +export PREFILL_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_prefill.yaml +export DECODE_ENGINE_ARGS=$DYNAMO_HOME/recipes/gemma3/trtllm/vswa_decode.yaml ./launch/disagg_router.sh ``` diff --git a/examples/basics/multinode/trtllm/srun_aggregated.sh b/examples/basics/multinode/trtllm/srun_aggregated.sh index d1645e522b..18f41160d4 100755 --- a/examples/basics/multinode/trtllm/srun_aggregated.sh +++ b/examples/basics/multinode/trtllm/srun_aggregated.sh @@ -10,7 +10,7 @@ IMAGE="${IMAGE:-""}" # but you may freely customize the mounts based on your cluster. A common practice # is to mount paths to NFS storage for common scripts, model weights, etc. # NOTE: This can be a comma separated list of multiple mounts as well. -DEFAULT_MOUNT="${PWD}/../../../:/mnt" +DEFAULT_MOUNT="${PWD}/../../../../:/mnt" MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}" # Example values, assuming 4 nodes with 4 GPUs on each node, such as 4xGB200 nodes. 
diff --git a/examples/basics/multinode/trtllm/srun_disaggregated.sh b/examples/basics/multinode/trtllm/srun_disaggregated.sh index 5c8e5bd755..9d55784084 100755 --- a/examples/basics/multinode/trtllm/srun_disaggregated.sh +++ b/examples/basics/multinode/trtllm/srun_disaggregated.sh @@ -10,7 +10,7 @@ IMAGE="${IMAGE:-""}" # but you may freely customize the mounts based on your cluster. A common practice # is to mount paths to NFS storage for common scripts, model weights, etc. # NOTE: This can be a comma separated list of multiple mounts as well. -DEFAULT_MOUNT="${PWD}/../../../:/mnt" +DEFAULT_MOUNT="${PWD}/../../../../:/mnt" MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}" NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4} From d9d40d280ee8caf1d462c03cc77344523dda3096 Mon Sep 17 00:00:00 2001 From: Anant Sharma Date: Thu, 23 Oct 2025 09:01:56 -0700 Subject: [PATCH 7/8] more rabbit Signed-off-by: Anant Sharma --- docs/kubernetes/README.md | 2 +- examples/basics/multinode/trtllm/srun_aggregated.sh | 4 ++-- examples/basics/multinode/trtllm/srun_disaggregated.sh | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/kubernetes/README.md b/docs/kubernetes/README.md index 46dea4ad80..844c3858dd 100644 --- a/docs/kubernetes/README.md +++ b/docs/kubernetes/README.md @@ -203,7 +203,7 @@ args: - python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B - --extra-engine-args $DYNAMO_HOME/recipes/deepseek-r1-distill-llama-8b/agg.yaml + --extra-engine-args /workspace/recipes/deepseek-r1-distill-llama-8b/agg.yaml ``` Key customization points include: diff --git a/examples/basics/multinode/trtllm/srun_aggregated.sh b/examples/basics/multinode/trtllm/srun_aggregated.sh index 18f41160d4..46044c9265 100755 --- a/examples/basics/multinode/trtllm/srun_aggregated.sh +++ b/examples/basics/multinode/trtllm/srun_aggregated.sh @@ -51,7 +51,7 @@ srun \ --nodelist "${HEAD_NODE}" \ --nodes 1 \ --jobid "${SLURM_JOB_ID}" \ - /mnt/examples/multimodal/scripts/start_frontend_services.sh & + /mnt/examples/basics/multinode/trtllm/start_frontend_services.sh & # NOTE: Output streamed to stdout for ease of understanding the example, but # in practice you would probably set `srun --output ... --error ...` to pipe @@ -71,4 +71,4 @@ srun \ --nodes "${NUM_NODES}" \ --ntasks-per-node "${NUM_GPUS_PER_NODE}" \ --jobid "${SLURM_JOB_ID}" \ - /mnt/examples/multimodal/scripts/start_trtllm_worker.sh & \ No newline at end of file + /mnt/examples/basics/multinode/trtllm/start_trtllm_worker.sh & \ No newline at end of file diff --git a/examples/basics/multinode/trtllm/srun_disaggregated.sh b/examples/basics/multinode/trtllm/srun_disaggregated.sh index 9d55784084..8b6aabf3b6 100755 --- a/examples/basics/multinode/trtllm/srun_disaggregated.sh +++ b/examples/basics/multinode/trtllm/srun_disaggregated.sh @@ -56,7 +56,7 @@ srun \ --nodelist "${HEAD_NODE}" \ --nodes 1 \ --jobid "${SLURM_JOB_ID}" \ - /mnt/examples/multimodal/scripts/start_frontend_services.sh & + /mnt/examples/basics/multinode/trtllm/start_frontend_services.sh & # NOTE: Output streamed to stdout for ease of understanding the example, but # in practice you would probably set `srun --output ... 
--error ...` to pipe @@ -78,7 +78,7 @@ for ((i=1; i<=${NUM_PREFILL_WORKERS}; i++)); do --nodes "${NUM_PREFILL_NODES}" \ --ntasks-per-node "${NUM_GPUS_PER_NODE}" \ --jobid "${SLURM_JOB_ID}" \ - /mnt/examples/multimodal/scripts/start_trtllm_worker.sh & + /mnt/examples/basics/multinode/trtllm/start_trtllm_worker.sh & done for ((i=1; i<=${NUM_DECODE_WORKERS}; i++)); do @@ -98,5 +98,5 @@ for ((i=1; i<=${NUM_DECODE_WORKERS}; i++)); do --nodes "${NUM_DECODE_NODES}" \ --ntasks-per-node "${NUM_GPUS_PER_NODE}" \ --jobid "${SLURM_JOB_ID}" \ - /mnt/examples/multimodal/scripts/start_trtllm_worker.sh & + /mnt/examples/basics/multinode/trtllm/start_trtllm_worker.sh & done \ No newline at end of file From 42bd2dfc231ff0c2e303b462ab5de028d3940ff6 Mon Sep 17 00:00:00 2001 From: Anant Sharma Date: Fri, 24 Oct 2025 08:52:43 -0700 Subject: [PATCH 8/8] add recipes to docker Signed-off-by: Anant Sharma --- container/Dockerfile.trtllm | 1 + 1 file changed, 1 insertion(+) diff --git a/container/Dockerfile.trtllm b/container/Dockerfile.trtllm index 58af47f354..2334813543 100644 --- a/container/Dockerfile.trtllm +++ b/container/Dockerfile.trtllm @@ -272,6 +272,7 @@ COPY examples /workspace/examples COPY benchmarks /workspace/benchmarks COPY deploy /workspace/deploy COPY components/ /workspace/components/ +COPY recipes/ /workspace/recipes/ # Copy attribution files COPY ATTRIBUTION* LICENSE /workspace/