NVIDIA · kaiyux · Aug 28, 2025 · Aug 27, 2025 · Aug 28, 2025
@@ -83,8 +83,11 @@ echo "ntasks_per_node: ${ntasks_per_node}"
 echo "==========================================="
 
 
-ctx_max_seq_len=$((isl + 1))
-gen_max_seq_len=$((isl + osl))
+nsys_on="" # Empty by default; passed as the last argument to start_worker.sh, where an empty value means "nsys is not enabled"
+# nsys_on=${full_logdir} # Uncomment this line to enable Nsys profiling
+numa_bind=true # When "true", start_worker.sh launches the server under "numactl -m 0,1" (allocate memory only from NUMA nodes 0 and 1); intended for GB200 only
+ctx_max_seq_len=$((isl + 10)) # isl + 10 — NOTE(review): presumably headroom over the input length; the reason for the margin of 10 is not visible here
+gen_max_seq_len=$((isl + osl + 10)) # isl + osl + 10 — NOTE(review): same margin of 10 as ctx_max_seq_len; confirm it is intentional
 ctx_gpu_frac=${ctx_gpu_memory_fraction}
 cache_transceiver_max_num_tokens=8448
 
@@ -120,9 +123,6 @@ if [ -z "${TRT_LLM_GIT_COMMIT:-}" ]; then
     echo "TRT_LLM_GIT_COMMIT: ${TRT_LLM_GIT_COMMIT}"
 fi
 
-nsys_on=""
-# nsys_on=${full_logdir} # Uncomment this line to enable Nsys profiling
-
 # start the container
 srun -l --container-image=${container_image} \
         --container-name=${container_name} \
@@ -193,7 +193,7 @@ for i in $(seq 0 $((num_gen_servers - 1))); do
         --container-name=${container_name} \
         --container-mounts=${mounts} \
         --mpi=pmix \
-        bash ${workdir}/start_worker.sh "GEN" ${i} ${model_dir} "8336" ${benchmark_mode} ${concurrency} ${enable_pdl} ${full_logdir} ${nsys_on} \
+        bash ${workdir}/start_worker.sh "GEN" ${i} ${model_dir} "8336" ${benchmark_mode} ${concurrency} ${enable_pdl} ${numa_bind} ${full_logdir} ${nsys_on} \
         &> ${full_logdir}/output_gen_${i}.log &
 done
 
@@ -206,7 +206,7 @@ for i in $(seq 0 $((num_ctx_servers - 1))); do
         --container-name=${container_name} \
         --container-mounts=${mounts} \
         --mpi=pmix \
-        bash ${workdir}/start_worker.sh "CTX" ${i} ${model_dir} "8336" ${benchmark_mode} ${concurrency} ${enable_pdl} ${full_logdir} ${nsys_on} \
+        bash ${workdir}/start_worker.sh "CTX" ${i} ${model_dir} "8336" ${benchmark_mode} ${concurrency} ${enable_pdl} ${numa_bind} ${full_logdir} ${nsys_on} \
             &> ${full_logdir}/output_ctx_${i}.log &
 done
 

@@ -10,8 +10,9 @@ port=$4
 benchmark_mode=$5
 concurrency=$6
 enable_pdl=$7
-work_dir=$8
-nsys_folder=${9:-}
+numa_bind=$8 # "true" makes the server launch run under "numactl -m 0,1"; any other value leaves memory unbound
+work_dir=$9 # The launcher passes its log directory (full_logdir) in this position
+nsys_folder=${10:-} # Optional 10th argument (braces are required for two-digit positionals); empty or missing means nsys is not enabled
 
 unset UCX_TLS
 echo "concurrency: ${concurrency}, enable_pdl: ${enable_pdl}, work_dir: ${work_dir}"
@@ -23,6 +24,14 @@ if [ "${enable_pdl}" = "true" ]; then
     export TRTLLM_ENABLE_PDL=1
 fi
 
+if [ "${numa_bind}" = "true" ]; then # Build the optional numactl launch prefix; only the exact string "true" enables it
+    numa_bind_cmd="numactl -m 0,1" # Expanded unquoted on the launch line so it splits into separate words (command + options)
+    echo "numactl -m 0,1 - Only allocate memory from nodes on GB200"
+else
+    numa_bind_cmd="" # Unquoted empty expansion disappears, so the launch command is unchanged
+    echo "Not binding memory. If on GB200, use \"numactl -m 0,1\" to only allocate memory from nodes."
+fi
+
 if [ "${benchmark_mode}" = "gen_only" ]; then
     export TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP=1
     export TLLM_BENCHMARK_REQ_QUEUES_SIZE=${concurrency}
@@ -50,7 +59,7 @@ fi
 #check if nsys_folder is provided
 if [ -z "${nsys_folder:-}" ]; then
     echo "nsys is not enabled, start normal flow"
-    trtllm-llmapi-launch trtllm-serve ${model_path} --host $(hostname) --port ${port} --extra_llm_api_options ${config_file}
+    trtllm-llmapi-launch ${numa_bind_cmd} trtllm-serve ${model_path} --host $(hostname) --port ${port} --extra_llm_api_options ${config_file}
 else
     nsys_prefix=""
     nsys_file=${nsys_folder}/nsys_worker_proc_${instance_id}_${SLURM_PROCID}
@@ -63,7 +72,7 @@ else
     elif [ "${role}" = "CTX" ]; then
         echo "nsys is not enabled on ctx_gpus"
     fi
-    trtllm-llmapi-launch ${nsys_prefix} \
+    trtllm-llmapi-launch ${numa_bind_cmd} ${nsys_prefix} \
         trtllm-serve ${model_path} \
             --host $(hostname) --port ${port} \
             --extra_llm_api_options ${config_file}