port 0.7.0 fixes + aiperf/sa bench support
Signed-off-by: jthomson04 <[email protected]>
jthomson04 committed Dec 8, 2025
commit bb6486a3bcdb9e7f7bc688e4919e140d3f13aa05
components/backends/trtllm/performance_sweeps/benchmark_disagg.slurm
@@ -20,7 +20,7 @@ ctx_max_num_tokens=$6
num_gen_servers=$7
gen_tp_size=$8
gen_ep_size=$9
-gen_batch_size=$10
+gen_batch_size=${10}
gen_max_num_tokens=${11}
gen_enable_attention_dp=${12}
gen_gpu_memory_fraction=${13}
@@ -34,6 +34,7 @@ served_model_name=${20}
image=${21}
isl=${22}
osl=${23}
+benchmark_kind=${24}

CACHE_TRANSCEIVER_MAX_NUM_TOKENS=${CACHE_TRANSCEIVER_MAX_NUM_TOKENS:-$((${isl} + ${osl} + 512))}

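Context for the gen_batch_size fix above: bash parses $10 as ${1}0, i.e. the first positional parameter followed by a literal 0, so positional parameters beyond $9 must be written with braces. A minimal illustration:

    set -- a b c d e f g h i j
    echo "$10"    # prints "a0" (value of $1, then a literal 0)
    echo "${10}"  # prints "j"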
@@ -179,7 +180,7 @@ for ((i=1; i<=DECODE_COUNT; i++)); do
--ntasks $gen_tp_size \
--oversubscribe \
--overlap \
-bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/decode_config.yaml ${ctx_gpus} ${served_model_name} ${model_path} 'decode' &> ${full_logdir}/output_decode_worker_${i}.log &
+bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/decode_config.yaml ${ctx_gpus} ${served_model_name} ${model_path} 'decode' $gen_enable_attention_dp &> ${full_logdir}/output_decode_worker_${i}.log &
echo "$!" >> "$PID_FILE"
done

@@ -204,7 +205,7 @@ for ((i=1; i<=PREFILL_COUNT; i++)); do
--overlap \
--ntasks $(( ctx_tp_size < 4 ? ctx_tp_size : 4 )) \
--nodes 1 \
-bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/prefill_config.yaml ${ctx_gpus} ${served_model_name} ${model_path} 'prefill' &> ${full_logdir}/output_prefill_worker_${i}.log &
+bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh ${full_logdir}/prefill_config.yaml ${ctx_gpus} ${served_model_name} ${model_path} 'prefill' $ctx_enable_attention_dp &> ${full_logdir}/output_prefill_worker_${i}.log &
prefill_pids+=($!)
echo "$!" >> "$PID_FILE"
done
@@ -216,7 +217,7 @@ srun -l --container-name=${CONTAINER_NAME} \
--container-env HEAD_NODE_IP,HEAD_NODE,SCRIPTS_DIR \
--mpi=pmix --overlap -N 1 -n 1 \
-w ${nodes[0]} \
-bash ${SCRIPTS_DIR}/scripts/bench.sh ${served_model_name} ${MULTI_ROUND} ${num_gen_servers} "${concurrency_list}" ${STREAMING} ${full_logdir} ${ctx_gpus} ${gen_gpus} ${model_path} ${isl} ${osl} ${kind} > ${full_logdir}/bench.log 2>&1
+bash ${SCRIPTS_DIR}/scripts/bench.sh ${served_model_name} ${MULTI_ROUND} ${num_gen_servers} "${concurrency_list}" ${STREAMING} ${full_logdir} ${ctx_gpus} ${gen_gpus} ${model_path} ${isl} ${osl} ${kind} ${benchmark_kind} > ${full_logdir}/bench.log 2>&1


# Cleanup will be handled by the EXIT trap
72 changes: 64 additions & 8 deletions components/backends/trtllm/performance_sweeps/scripts/bench.sh
@@ -37,10 +37,11 @@ model_path=${9}
isl=${10}
osl=${11}
kind=${12}
+benchmark_kind=${13}

if [ "$#" -ne 12 ]; then
echo "Error: Expected 12 arguments, got $#"
echo "Usage: $0 <model> <multi_round> <num_gen_servers> <concurrency_list> <streaming> <log_path> <prefill_gpus> <decode_gpus> <model_path> <isl> <osl> <kind>"
if [ "$#" -ne 13 ]; then
echo "Error: Expected 13 arguments, got $#"
echo "Usage: $0 <model> <multi_round> <num_gen_servers> <concurrency_list> <streaming> <log_path> <prefill_gpus> <decode_gpus> <model_path> <isl> <osl> <kind> <benchmark_kind>"
exit 1
fi

@@ -58,8 +59,12 @@ echo " model_path: $model_path"
echo " isl: $isl"
echo " osl: $osl"
echo " kind: $kind"
echo " benchmark_kind: $benchmark_kind"


+if [[ "$benchmark_kind" != "sa" && "$benchmark_kind" != "aiperf" ]]; then
+    echo "Invalid benchmark kind! Expected 'sa' or 'aiperf', got '$benchmark_kind'"
+    exit 1
+fi
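# The same validation, sketched as a behavior-preserving case statement:
#   case "$benchmark_kind" in
#       sa|aiperf) ;;
#       *) echo "Invalid benchmark kind! Expected 'sa' or 'aiperf'"; exit 1 ;;
#   esac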

# check process id is not 0
if [[ ${SLURM_PROCID} != "0" ]]; then
@@ -112,13 +117,13 @@ for ((i=1; i<=50; i++)); do
# https://github.com/ai-dynamo/dynamo/pull/2683
if [[ "$http_code" == "200" ]] && echo "$body" | grep -q '"status":"healthy"' && echo "$body" | grep -q '"endpoints":\[[^]]*"dyn://dynamo.tensorrt_llm.generate"'; then
if [[ "$kind" == *disagg* ]]; then
if echo "$body" | grep -q '"tensorrt_llm_next"'; then
if echo "$body" | grep -q '"dyn://dynamo.prefill.generate"'; then
echo "Health check succeeded on attempt $i"
echo "$body"
failed=false
break
else
echo "Attempt $i: tensorrt_llm_next key not found in etcd."
echo "Attempt $i: prefill generate endpoint not found in etcd."
fi
else
echo "Health check succeeded on attempt $i"
@@ -150,7 +155,9 @@ curl -v -w "%{http_code}" "${hostname}:${port}/v1/chat/completions" \
"max_tokens": 30
}'

-python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
+# aiperf already does its own warmup; only the "sa" path needs this explicit pass
+if [[ "$benchmark_kind" == "sa" ]]; then
+python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
--served-model-name ${model} \
--model ${model_path} \
--dataset-name random \
@@ -166,6 +173,7 @@ python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
--max-concurrency "1" \
--host ${hostname} \
--port ${port}
+fi

mkdir -p ${log_path}/results
echo "Starting benchmark..."
@@ -175,7 +183,55 @@ for concurrency in ${concurrency_list}; do
num_prompts=$((concurrency * multi_round))
echo "Benchmarking with concurrency ${concurrency} ... ${num_prompts} prompts"
mkdir -p ${log_path}/concurrency_${concurrency}


if [[ "$benchmark_kind" == "sa" ]]; then
python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
--served-model-name ${model} \
--model ${model_path} \
--dataset-name random \
--num-prompts "$num_prompts" \
--random-input-len ${isl} \
--random-output-len ${osl} \
--random-range-ratio 0.8 \
--use-chat-template \
--ignore-eos \
--use-chat-template \
--backend "dynamo" \
--endpoint "/v1/completions" \
--percentile-metrics ttft,tpot,itl,e2el \
--max-concurrency "$concurrency" \
--host ${hostname} \
--port ${port} \
--save-result \
--result-dir "${log_path}/results" \
--result-filename "results_concurrency_${original_concurrency}_gpus_${total_gpus}_ctx_${prefill_gpus}_gen_${decode_gpus}.json"
else
aiperf profile \
--model ${model} \
--tokenizer ${model_path} \
--endpoint-type completions \
--endpoint /v1/completions \
--streaming \
--url ${hostname}:${port} \
--synthetic-input-tokens-mean ${isl} \
--synthetic-input-tokens-stddev 0 \
--output-tokens-mean ${osl} \
--output-tokens-stddev 0 \
--extra-inputs max_tokens:${osl} \
--extra-inputs min_tokens:${osl} \
--extra-inputs ignore_eos:true \
--extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
--concurrency $concurrency \
--request-count $num_prompts \
--warmup-request-count $(($concurrency*2)) \
--num-dataset-entries ${num_prompts} \
--random-seed 100 \
--artifact-dir "${log_path}/results/concurrency_${original_concurrency}" \
--ui simple \
-v \
-H 'Authorization: Bearer NOT USED' \
-H 'Accept: text/event-stream'
fi
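# A sketch of the request body the --extra-inputs flags above should produce
# (field placement assumed from the flags, not captured from a real run):
#   {"model": "...", "prompt": "...", "stream": true,
#    "max_tokens": <osl>, "min_tokens": <osl>, "ignore_eos": true,
#    "nvext": {"ignore_eos": true}}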
-python3 ${SCRIPTS_DIR}/scripts/bench/benchmark_serving.py \
-    --served-model-name ${model} \
-    --model ${model_path} \
components/backends/trtllm/performance_sweeps/scripts/start_disagg_worker.sh
@@ -7,8 +7,10 @@ ctx_gpus=$2
model_name=$3
model_path=$4
disaggregation_mode=$5
+is_dep=$6

unset UCX_TLS
echo "config_file: ${config_file}, ctx_gpus: ${ctx_gpus}, disaggregation_mode: ${disaggregation_mode}"
echo "config_file: ${config_file}, ctx_gpus: ${ctx_gpus}, disaggregation_mode: ${disaggregation_mode}, is_dep: ${is_dep}"

# Read configuration values from the YAML config file
if [ ! -f "${config_file}" ]; then
@@ -39,16 +41,15 @@ echo " max_batch_size: ${max_batch_size}"
echo " max_seq_len: ${max_seq_len}"

export TLLM_LOG_LEVEL=INFO
-# NOTE: This var is default behavior in recent trtllm commits, and can
-# be removed. Keeping it here in case the script is ran with older commits.
-export TRTLLM_MOE_ENABLE_ALLTOALL_WITHOUT_ALLGATHER=1
-# NOTE: This var was replaced with an LLM API / yaml engine config field
-# "moe_backend.use_low_precision_combine: true" in recent trtllm commits, and
-# can be removed. Keeping it here in case the script is ran with older commits.
-export TRTLLM_MOE_USE_LOW_PRECISION_COMBINE=1
# TODO: Is there ever a case where we don't want this enabled?
export TRTLLM_ENABLE_PDL=1

if [ "$is_dep" = "true" ]; then
echo "Using DEP. Setting env vars."
export TRTLLM_MOE_ALLTOALL_BACKEND="mnnvlthroughput"
export TRTLLM_FORCE_ALLTOALL_METHOD="MNNVL"
export TRTLLM_MOE_A2A_WORKSPACE_MB="2048"
fi

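# Example wiring (copied from the decode launch in benchmark_disagg.slurm above;
# the new sixth positional argument is what toggles these DEP env vars):
#   bash ${SCRIPTS_DIR}/scripts/start_disagg_worker.sh \
#       ${full_logdir}/decode_config.yaml ${ctx_gpus} ${served_model_name} \
#       ${model_path} 'decode' $gen_enable_attention_dp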
if [[ "${model_path,,}" != *r1* ]]; then
echo "Inferred gpt-oss style model. Setting OVERRIDE_QUANT_ALGO to W4A8_MXFP4_MXFP8"
export OVERRIDE_QUANT_ALGO=W4A8_MXFP4_MXFP8
@@ -22,6 +22,7 @@ NTASKS_PER_NODE="${NTASKS_PER_NODE:-4}"

ISL="${ISL:-8150}"
OSL="${OSL:-1024}"
+BENCHMARK_KIND="${BENCHMARK_KIND:-sa}"

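# Usage sketch (this submit script's own filename is not shown in the diff):
#   BENCHMARK_KIND=aiperf ISL=8150 OSL=1024 bash <submit script> ...
# The default, BENCHMARK_KIND=sa, keeps the existing benchmark_serving.py path.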
# Build slurm_args step-by-step with validation and defaults
slurm_args="--time=04:00:00"
@@ -98,7 +99,7 @@ run_single() {
total_nodes=$((ctx_num + gen_nodes))
total_tasks=$((total_nodes * 4))
set -x
-sbatch --nodes=${total_nodes} --ntasks=${total_tasks} --ntasks-per-node=${NTASKS_PER_NODE} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} ${ctx_tp_size} ${ctx_ep_size} ${ctx_enable_attention_dp} 30 20000 ${gen_num} ${gen_tp_size} ${gen_ep_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL}
+sbatch --nodes=${total_nodes} --ntasks=${total_tasks} --ntasks-per-node=${NTASKS_PER_NODE} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} ${ctx_tp_size} ${ctx_ep_size} ${ctx_enable_attention_dp} 30 20000 ${gen_num} ${gen_tp_size} ${gen_ep_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL} ${BENCHMARK_KIND}
set +x
}

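# Worked sizing example (assumed inputs): ctx_num=2 and gen_nodes=4 give
#   total_nodes = 2 + 4 = 6, total_tasks = 6 * 4 = 24
# Note that total_tasks hardcodes 4 tasks per node, so overriding
# NTASKS_PER_NODE above would leave the two values out of sync.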