@@ -16,9 +16,11 @@ MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}"
1616NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:- 4}
1717
1818NUM_PREFILL_NODES=${NUM_PREFILL_NODES:- 4}
19+ NUM_PREFILL_WORKERS=${NUM_PREFILL_WORKERS:- 1}
1920PREFILL_ENGINE_CONFIG=" ${PREFILL_ENGINE_CONFIG:-/ mnt/ engine_configs/ deepseek_r1/ wide_ep/ wide_ep_prefill.yaml} "
2021
2122NUM_DECODE_NODES=${NUM_DECODE_NODES:- 4}
23+ NUM_DECODE_WORKERS=${NUM_DECODE_WORKERS:- 1}
2224DECODE_ENGINE_CONFIG=" ${DECODE_ENGINE_CONFIG:-/ mnt/ engine_configs/ deepseek_r1/ wide_ep/ wide_ep_decode.yaml} "
2325
2426DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:- " decode_first" }
@@ -59,38 +61,42 @@ srun \
5961# NOTE: Output streamed to stdout for ease of understanding the example, but
6062# in practice you would probably set `srun --output ... --error ...` to pipe
6163# the stdout/stderr to files.
62- echo " Launching multi-node prefill worker in background."
63- DISAGGREGATION_MODE=prefill \
64- ENGINE_CONFIG=${PREFILL_ENGINE_CONFIG} \
65- srun \
66- --mpi pmix \
67- --oversubscribe \
68- --container-image " ${IMAGE} " \
69- --container-mounts " ${MOUNTS} " \
70- --container-env ETCD_ENDPOINTS,NATS_SERVER,HEAD_NODE_IP,HEAD_NODE,DISAGGREGATION_MODE,DISAGGREGATION_STRATEGY,ENGINE_CONFIG \
71- --verbose \
72- --label \
73- -A " ${ACCOUNT} " \
74- -J " ${ACCOUNT} -dynamo.trtllm" \
75- --nodes " ${NUM_PREFILL_NODES} " \
76- --ntasks-per-node " ${NUM_GPUS_PER_NODE} " \
77- --jobid " ${SLURM_JOB_ID} " \
78- /mnt/multinode/start_trtllm_worker.sh &
64+ for (( i= 1 ; i<= ${NUM_PREFILL_WORKERS} ; i++ )) ; do
65+ echo " Launching multi-node prefill worker in background."
66+ DISAGGREGATION_MODE=prefill \
67+ ENGINE_CONFIG=${PREFILL_ENGINE_CONFIG} \
68+ srun \
69+ --mpi pmix \
70+ --oversubscribe \
71+ --container-image " ${IMAGE} " \
72+ --container-mounts " ${MOUNTS} " \
73+ --container-env ETCD_ENDPOINTS,NATS_SERVER,HEAD_NODE_IP,HEAD_NODE,DISAGGREGATION_MODE,DISAGGREGATION_STRATEGY,ENGINE_CONFIG \
74+ --verbose \
75+ --label \
76+ -A " ${ACCOUNT} " \
77+ -J " ${ACCOUNT} -dynamo.trtllm" \
78+ --nodes " ${NUM_PREFILL_NODES} " \
79+ --ntasks-per-node " ${NUM_GPUS_PER_NODE} " \
80+ --jobid " ${SLURM_JOB_ID} " \
81+ /mnt/multinode/start_trtllm_worker.sh &
82+ done
7983
80- echo " Launching multi-node decode worker in background."
81- DISAGGREGATION_MODE=decode \
82- ENGINE_CONFIG=${DECODE_ENGINE_CONFIG} \
83- srun \
84- --mpi pmix \
85- --oversubscribe \
86- --container-image " ${IMAGE} " \
87- --container-mounts " ${MOUNTS} " \
88- --container-env ETCD_ENDPOINTS,NATS_SERVER,HEAD_NODE_IP,HEAD_NODE,DISAGGREGATION_MODE,DISAGGREGATION_STRATEGY,ENGINE_CONFIG \
89- --verbose \
90- --label \
91- -A " ${ACCOUNT} " \
92- -J " ${ACCOUNT} -dynamo.trtllm" \
93- --nodes " ${NUM_DECODE_NODES} " \
94- --ntasks-per-node " ${NUM_GPUS_PER_NODE} " \
95- --jobid " ${SLURM_JOB_ID} " \
96- /mnt/multinode/start_trtllm_worker.sh &
84+ for (( i= 1 ; i<= ${NUM_DECODE_WORKERS} ; i++ )) ; do
85+ echo " Launching multi-node decode worker in background."
86+ DISAGGREGATION_MODE=decode \
87+ ENGINE_CONFIG=${DECODE_ENGINE_CONFIG} \
88+ srun \
89+ --mpi pmix \
90+ --oversubscribe \
91+ --container-image " ${IMAGE} " \
92+ --container-mounts " ${MOUNTS} " \
93+ --container-env ETCD_ENDPOINTS,NATS_SERVER,HEAD_NODE_IP,HEAD_NODE,DISAGGREGATION_MODE,DISAGGREGATION_STRATEGY,ENGINE_CONFIG \
94+ --verbose \
95+ --label \
96+ -A " ${ACCOUNT} " \
97+ -J " ${ACCOUNT} -dynamo.trtllm" \
98+ --nodes " ${NUM_DECODE_NODES} " \
99+ --ntasks-per-node " ${NUM_GPUS_PER_NODE} " \
100+ --jobid " ${SLURM_JOB_ID} " \
101+ /mnt/multinode/start_trtllm_worker.sh &
102+ done
0 commit comments