Merge branch 'main' of https://github.com/ai-dynamo/dynamo into mabdulwahhab/defaults

mohammedabdulwahhab · mohammedabdulwahhab · commit bf8db83a5535 · 2025-08-13T17:05:08.000-07:00
diff --git a/components/backends/trtllm/engine_configs/agg.yaml b/components/backends/trtllm/engine_configs/agg.yaml
@@ -22,7 +22,7 @@ backend: pytorch
 enable_chunked_prefill: true
 
 kv_cache_config:
-  free_gpu_memory_fraction: 0.95
+  free_gpu_memory_fraction: 0.85
 
 # NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
 # NOTE: overlap_scheduler enabled by default since this commit and changed
diff --git a/components/backends/trtllm/engine_configs/decode.yaml b/components/backends/trtllm/engine_configs/decode.yaml
@@ -25,7 +25,7 @@ cuda_graph_config:
   max_batch_size: 16
 
 kv_cache_config:
-  free_gpu_memory_fraction: 0.95
+  free_gpu_memory_fraction: 0.85
 
 cache_transceiver_config:
   backend: default
diff --git a/components/backends/trtllm/engine_configs/prefill.yaml b/components/backends/trtllm/engine_configs/prefill.yaml
@@ -24,7 +24,7 @@ disable_overlap_scheduler: true
 cuda_graph_config:
   max_batch_size: 16
 kv_cache_config:
-  free_gpu_memory_fraction: 0.95
+  free_gpu_memory_fraction: 0.85
 
 cache_transceiver_config:
   backend: default
diff --git a/components/backends/trtllm/multinode/multinode-examples.md b/components/backends/trtllm/multinode/multinode-examples.md
@@ -186,6 +186,10 @@ deployment across 8 nodes:
 ./srun_disaggregated.sh
 ```
 
+> [!TIP]
+> To launch multiple replicas of the configured prefill/decode workers, set the
+> `NUM_PREFILL_WORKERS` and `NUM_DECODE_WORKERS` environment variables, respectively (default: 1 each).
+
 ## Understanding the Output
 
 1. The `srun_aggregated.sh` launches two `srun` jobs. The first launches
diff --git a/components/backends/trtllm/multinode/srun_disaggregated.sh b/components/backends/trtllm/multinode/srun_disaggregated.sh
@@ -16,9 +16,11 @@ MOUNTS="${MOUNTS:-${DEFAULT_MOUNT}}"
 NUM_GPUS_PER_NODE=${NUM_GPUS_PER_NODE:-4}
 
 NUM_PREFILL_NODES=${NUM_PREFILL_NODES:-4}
+NUM_PREFILL_WORKERS=${NUM_PREFILL_WORKERS:-1}
 PREFILL_ENGINE_CONFIG="${PREFILL_ENGINE_CONFIG:-/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_prefill.yaml}"
 
 NUM_DECODE_NODES=${NUM_DECODE_NODES:-4}
+NUM_DECODE_WORKERS=${NUM_DECODE_WORKERS:-1}
 DECODE_ENGINE_CONFIG="${DECODE_ENGINE_CONFIG:-/mnt/engine_configs/deepseek_r1/wide_ep/wide_ep_decode.yaml}"
 
 DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"}
@@ -59,38 +61,42 @@ srun \
 # NOTE: Output streamed to stdout for ease of understanding the example, but
 # in practice you would probably set `srun --output ... --error ...` to pipe
 # the stdout/stderr to files.
-echo "Launching multi-node prefill worker in background."
-DISAGGREGATION_MODE=prefill \
-ENGINE_CONFIG=${PREFILL_ENGINE_CONFIG} \
-srun \
-  --mpi pmix \
-  --oversubscribe \
-  --container-image "${IMAGE}" \
-  --container-mounts "${MOUNTS}" \
-  --container-env ETCD_ENDPOINTS,NATS_SERVER,HEAD_NODE_IP,HEAD_NODE,DISAGGREGATION_MODE,DISAGGREGATION_STRATEGY,ENGINE_CONFIG \
-  --verbose \
-  --label \
-  -A "${ACCOUNT}" \
-  -J "${ACCOUNT}-dynamo.trtllm" \
-  --nodes "${NUM_PREFILL_NODES}" \
-  --ntasks-per-node "${NUM_GPUS_PER_NODE}" \
-  --jobid "${SLURM_JOB_ID}" \
-  /mnt/multinode/start_trtllm_worker.sh &
+for ((i=1; i<=${NUM_PREFILL_WORKERS}; i++)); do
+  echo "Launching multi-node prefill worker in background."
+  DISAGGREGATION_MODE=prefill \
+  ENGINE_CONFIG=${PREFILL_ENGINE_CONFIG} \
+  srun \
+    --mpi pmix \
+    --oversubscribe \
+    --container-image "${IMAGE}" \
+    --container-mounts "${MOUNTS}" \
+    --container-env ETCD_ENDPOINTS,NATS_SERVER,HEAD_NODE_IP,HEAD_NODE,DISAGGREGATION_MODE,DISAGGREGATION_STRATEGY,ENGINE_CONFIG \
+    --verbose \
+    --label \
+    -A "${ACCOUNT}" \
+    -J "${ACCOUNT}-dynamo.trtllm" \
+    --nodes "${NUM_PREFILL_NODES}" \
+    --ntasks-per-node "${NUM_GPUS_PER_NODE}" \
+    --jobid "${SLURM_JOB_ID}" \
+    /mnt/multinode/start_trtllm_worker.sh &
+done
 
-echo "Launching multi-node decode worker in background."
-DISAGGREGATION_MODE=decode \
-ENGINE_CONFIG=${DECODE_ENGINE_CONFIG} \
-srun \
-  --mpi pmix \
-  --oversubscribe \
-  --container-image "${IMAGE}" \
-  --container-mounts "${MOUNTS}" \
-  --container-env ETCD_ENDPOINTS,NATS_SERVER,HEAD_NODE_IP,HEAD_NODE,DISAGGREGATION_MODE,DISAGGREGATION_STRATEGY,ENGINE_CONFIG \
-  --verbose \
-  --label \
-  -A "${ACCOUNT}" \
-  -J "${ACCOUNT}-dynamo.trtllm" \
-  --nodes "${NUM_DECODE_NODES}" \
-  --ntasks-per-node "${NUM_GPUS_PER_NODE}" \
-  --jobid "${SLURM_JOB_ID}" \
-  /mnt/multinode/start_trtllm_worker.sh &
+for ((i=1; i<=${NUM_DECODE_WORKERS}; i++)); do
+  echo "Launching multi-node decode worker in background."
+  DISAGGREGATION_MODE=decode \
+  ENGINE_CONFIG=${DECODE_ENGINE_CONFIG} \
+  srun \
+    --mpi pmix \
+    --oversubscribe \
+    --container-image "${IMAGE}" \
+    --container-mounts "${MOUNTS}" \
+    --container-env ETCD_ENDPOINTS,NATS_SERVER,HEAD_NODE_IP,HEAD_NODE,DISAGGREGATION_MODE,DISAGGREGATION_STRATEGY,ENGINE_CONFIG \
+    --verbose \
+    --label \
+    -A "${ACCOUNT}" \
+    -J "${ACCOUNT}-dynamo.trtllm" \
+    --nodes "${NUM_DECODE_NODES}" \
+    --ntasks-per-node "${NUM_GPUS_PER_NODE}" \
+    --jobid "${SLURM_JOB_ID}" \
+    /mnt/multinode/start_trtllm_worker.sh &
+done
diff --git a/lib/runtime/src/distributed.rs b/lib/runtime/src/distributed.rs
@@ -44,38 +44,17 @@ impl MetricsRegistry for DistributedRuntime {
 
 impl DistributedRuntime {
     pub async fn new(runtime: Runtime, config: DistributedConfig) -> Result<Self> {
-        let secondary = runtime.secondary();
         let (etcd_config, nats_config, is_static) = config.dissolve();
 
         let runtime_clone = runtime.clone();
 
         let etcd_client = if is_static {
             None
         } else {
-            Some(
-                secondary
-                    .spawn(async move {
-                        let client = etcd::Client::new(etcd_config.clone(), runtime_clone)
-                            .await
-                            .context(format!(
-                                "Failed to connect to etcd server with config {:?}",
-                                etcd_config
-                            ))?;
-                        OK(client)
-                    })
-                    .await??,
-            )
+            Some(etcd::Client::new(etcd_config.clone(), runtime_clone).await?)
         };
 
-        let nats_client = secondary
-            .spawn(async move {
-                let client = nats_config.clone().connect().await.context(format!(
-                    "Failed to connect to NATS server with config {:?}",
-                    nats_config
-                ))?;
-                anyhow::Ok(client)
-            })
-            .await??;
+        let nats_client = nats_config.clone().connect().await?;
 
         // Start system status server for health and metrics if enabled in configuration
         let config = crate::config::RuntimeConfig::from_settings().unwrap_or_default();
diff --git a/lib/runtime/src/transports.rs b/lib/runtime/src/transports.rs
@@ -21,4 +21,5 @@
 pub mod etcd;
 pub mod nats;
 pub mod tcp;
+mod utils;
 pub mod zmq;
diff --git a/lib/runtime/src/transports/etcd.rs b/lib/runtime/src/transports/etcd.rs
@@ -37,6 +37,8 @@ mod path;
 use lease::*;
 pub use path::*;
 
+use super::utils::build_in_runtime;
+
 //pub use etcd::ConnectOptions as EtcdConnectOptions;
 
 /// ETCD Client
@@ -45,6 +47,7 @@ pub struct Client {
     client: etcd_client::Client,
     primary_lease: i64,
     runtime: Runtime,
+    rt: Arc<tokio::runtime::Runtime>,
 }
 
 #[derive(Debug, Clone)]
@@ -101,33 +104,36 @@ impl Client {
     /// If the lease expires, the [`Runtime`] will be shutdown.
     /// If the [`Runtime`] is shutdown, the lease will be revoked.
     pub async fn new(config: ClientOptions, runtime: Runtime) -> Result<Self> {
-        runtime
-            .secondary()
-            .spawn(Self::create(config, runtime.clone()))
-            .await?
-    }
-
-    /// Create a new etcd client and tie the primary [`CancellationToken`] to the primary etcd lease.
-    async fn create(config: ClientOptions, runtime: Runtime) -> Result<Self> {
         let token = runtime.primary_token();
-        let client =
-            etcd_client::Client::connect(config.etcd_url, config.etcd_connect_options).await?;
 
-        let lease_id = if config.attach_lease {
-            let lease_client = client.lease_client();
+        let ((client, lease_id), rt) = build_in_runtime(
+            async move {
+                let client =
+                    etcd_client::Client::connect(config.etcd_url, config.etcd_connect_options)
+                        .await?;
 
-            let lease = create_lease(lease_client, 10, token)
-                .await
-                .context("creating primary lease")?;
+                let lease_id = if config.attach_lease {
+                    let lease_client = client.lease_client();
 
-            lease.id
-        } else {
-            0
-        };
+                    let lease = create_lease(lease_client, 10, token)
+                        .await
+                        .context("creating primary lease")?;
+
+                    lease.id
+                } else {
+                    0
+                };
+
+                Ok((client, lease_id))
+            },
+            1,
+        )
+        .await?;
 
         Ok(Client {
             client,
             primary_lease: lease_id,
+            rt,
             runtime,
         })
     }
@@ -155,19 +161,15 @@ impl Client {
     pub async fn create_lease(&self, ttl: i64) -> Result<Lease> {
         let token = self.runtime.child_token();
         let lease_client = self.client.lease_client();
-        self.runtime
-            .secondary()
+        self.rt
             .spawn(create_lease(lease_client, ttl, token))
             .await?
     }
 
     // Revoke an etcd lease given its lease id. A wrapper over etcd_client::LeaseClient::revoke
     pub async fn revoke_lease(&self, lease_id: i64) -> Result<()> {
         let lease_client = self.client.lease_client();
-        self.runtime
-            .secondary()
-            .spawn(revoke_lease(lease_client, lease_id))
-            .await?
+        self.rt.spawn(revoke_lease(lease_client, lease_id)).await?
     }
 
     pub async fn kv_create(&self, key: &str, value: Vec<u8>, lease_id: Option<i64>) -> Result<()> {
@@ -340,7 +342,7 @@ impl Client {
 
         let (tx, rx) = mpsc::channel(32);
 
-        self.runtime.secondary().spawn(async move {
+        self.rt.spawn(async move {
             for kv in kvs {
                 if tx.send(WatchEvent::Put(kv)).await.is_err() {
                     // receiver is already closed
diff --git a/lib/runtime/src/transports/nats.rs b/lib/runtime/src/transports/nats.rs
@@ -35,6 +35,7 @@ use bytes::Bytes;
 use derive_builder::Builder;
 use futures::{StreamExt, TryStreamExt};
 use std::path::{Path, PathBuf};
+use std::sync::Arc;
 use tokio::fs::File as TokioFile;
 use tokio::io::AsyncRead;
 use tokio::time;
@@ -44,6 +45,8 @@ use validator::{Validate, ValidationError};
 pub use crate::slug::Slug;
 use tracing as log;
 
+use super::utils::build_in_runtime;
+
 pub const URL_PREFIX: &str = "nats://";
 
 #[derive(Clone)]
@@ -236,7 +239,9 @@ fn validate_nats_server(server: &str) -> Result<(), ValidationError> {
     }
 }
 
-#[allow(dead_code)]
+// TODO(jthomson04): The worker-thread count of the dedicated NATS runtime should be configurable, not hardcoded.
+const NATS_WORKER_THREADS: usize = 4;
+
 impl ClientOptions {
     /// Create a new [`ClientOptionsBuilder`]
     pub fn builder() -> ClientOptionsBuilder {
@@ -258,7 +263,17 @@ impl ClientOptions {
             }
         };
 
-        let client = client.connect(self.server).await?;
+        let (client, _) = build_in_runtime(
+            async move {
+                client
+                    .connect(self.server)
+                    .await
+                    .map_err(|e| anyhow::anyhow!("Failed to connect to NATS: {e}"))
+            },
+            NATS_WORKER_THREADS,
+        )
+        .await?;
+
         let js_ctx = jetstream::new(client.clone());
 
         Ok(Client { client, js_ctx })
diff --git a/lib/runtime/src/transports/utils.rs b/lib/runtime/src/transports/utils.rs
@@ -0,0 +1,57 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+use std::{future::Future, sync::Arc};
+
+use anyhow::Result;
+
+/// Runs `f` to completion on a freshly built, dedicated multi-threaded Tokio runtime.
+///
+/// A new runtime with `num_threads` worker threads is created and driven from its own OS
+/// thread. The output of `f` is handed back to the caller over a oneshot channel, together
+/// with a shared handle to the runtime so that further tasks can be spawned onto it.
+///
+/// After a successful hand-off the driver thread parks forever, so it keeps its own
+/// reference to the runtime alive even if the caller drops the returned handle.
+///
+/// # Errors
+///
+/// Returns an error if the runtime cannot be built, if the driver thread goes away before
+/// delivering a result, or if `f` itself resolves to an error.
+pub async fn build_in_runtime<
+    T: Send + Sync + 'static,
+    F: Future<Output = Result<T>> + Send + 'static,
+>(
+    f: F,
+    num_threads: usize,
+) -> Result<(T, Arc<tokio::runtime::Runtime>)> {
+    let (tx, rx) = tokio::sync::oneshot::channel();
+
+    let runtime = Arc::new(
+        tokio::runtime::Builder::new_multi_thread()
+            .worker_threads(num_threads)
+            .enable_all()
+            .build()?,
+    );
+
+    let runtime_clone = runtime.clone();
+    std::thread::spawn(move || {
+        runtime_clone.block_on(async move {
+            let result = f.await;
+
+            // The receiver is gone when the calling future was dropped (e.g. cancelled by a
+            // timeout or `select!`) before `f` finished. Nobody can use the result or the
+            // runtime in that case, so let this thread exit instead of panicking.
+            if tx.send(result).is_err() {
+                return;
+            }
+
+            // Keep this thread (and its reference to the runtime) alive indefinitely.
+            std::future::pending::<()>().await;
+        })
+    });
+
+    let result = rx.await??;
+
+    Ok((result, runtime))
+}