ai-dynamo · grahamking · Jun 30, 2025 · Jun 27, 2025 · Jun 27, 2025 · Jun 30, 2025
diff --git a/Cargo.lock b/Cargo.lock
@@ -47,7 +47,7 @@ struct Args {
 
     /// Block size for the router
     #[arg(long)]
-    block_size: usize,
+    block_size: u32,
 }
 
 fn main() -> Result<()> {
@@ -88,7 +88,7 @@ impl WorkerSelector for CustomWorkerSelector {
         &self,
         workers: &ProcessedEndpoints,
         request: &SchedulingRequest,
-        block_size: usize,
+        block_size: u32,
     ) -> Result<WorkerSelectionResult, KvSchedulerError> {
         // customize logic here
         // F12 into [DefaultWorkerSelector] to see the original logic

@@ -34,7 +34,6 @@ anyhow = { workspace = true }
 async-stream = { workspace = true }
 async-trait = { workspace = true }
 futures = { workspace = true }
-humantime = { workspace = true }
 libc = { workspace = true }
 serde = { workspace = true }
 serde_json = { workspace = true }
@@ -47,7 +46,6 @@ uuid = { workspace = true }
 
 async-openai = { workspace = true }
 clap = { version = "4.5", features = ["derive", "env"] }
-dialoguer = { version = "0.11", default-features = false, features = ["editor", "history"] }
 futures-util = { version = "0.3" }
 regex = "1"
 

@@ -17,9 +17,13 @@ use std::collections::HashMap;
 use std::path::PathBuf;
 
 use clap::ValueEnum;
+use dynamo_llm::entrypoint::RouterConfig;
 use dynamo_llm::kv_router::KvRouterConfig;
+use dynamo_llm::local_model::LocalModel;
 use dynamo_runtime::pipeline::RouterMode as RuntimeRouterMode;
 
+use crate::Output;
+
 /// Required options depend on the in and out choices
 #[derive(clap::Parser, Debug, Clone)]
 #[command(version, about, long_about = None)]
@@ -125,11 +129,11 @@ pub struct Flags {
     /// context length (e.g. Llama 4).
     /// Defaults to the model's max, which is usually model_max_length in tokenizer_config.json.
     #[arg(long)]
-    pub context_length: Option<usize>,
+    pub context_length: Option<u32>,
 
     /// KV cache block size (vllm only)
     #[arg(long)]
-    pub kv_cache_block_size: Option<usize>,
+    pub kv_cache_block_size: Option<u32>,
 
     /// Additional engine-specific arguments from a JSON file.
     /// Contains a mapping of parameter names to values.
@@ -154,66 +158,63 @@ pub struct Flags {
 }
 
 impl Flags {
-    /// Get KV router configuration
-    pub fn kv_router_config(&self) -> KvRouterConfig {
-        KvRouterConfig::new(
-            self.kv_overlap_score_weight,
-            self.kv_gpu_cache_usage_weight,
-            self.kv_waiting_requests_weight,
-        )
+    /// Check, per `Output` variant, that these flags and `local_model` would let it run.
+    /// Returns `Err` on the first failed check. Keeps validation out of the engine creation path.
+    pub fn validate(&self, local_model: &LocalModel, out_opt: &Output) -> anyhow::Result<()> {
+        match out_opt {
+            Output::Dynamic => { // both flags below are rejected here; per the messages they belong on the worker
+                if self.context_length.is_some() {
+                    anyhow::bail!("'--context-length' flag should only be used on the worker node, not on the ingress");
+                }
+                if self.kv_cache_block_size.is_some() {
+                    anyhow::bail!("'--kv-cache-block-size' flag should only be used on the worker node, not on the ingress");
+                }
+            }
+            Output::EchoFull => {} // nothing to validate
+            Output::EchoCore => {
+                if !local_model.card().has_tokenizer() { // the model card must report a tokenizer
+                    anyhow::bail!(
+                        "out=echo_core need to find the tokenizer. Pass flag --model-path <path>"
+                    );
+                };
+            }
+            #[cfg(feature = "mistralrs")]
+            Output::MistralRs => {} // nothing to validate
+            Output::SgLang => {
+                if !local_model.path().is_dir() { // requires the model path to be a directory
+                    // TODO GGUF support for sglang: https://github.com/ai-dynamo/dynamo/issues/572
+                    anyhow::bail!("`--model-path should point at a HuggingFace repo checkout"); // NOTE(review): message has an unmatched backtick before --model-path
+                }
+            }
+            Output::Vllm => {
+                if self.base_gpu_id != 0 { // only 0 is accepted for this engine
+                    anyhow::bail!("vllm does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
+                }
+            }
+            Output::Trtllm => {
+                if self.base_gpu_id != 0 { // same check as the Vllm arm
+                    anyhow::bail!("TRTLLM does not support base_gpu_id. Set environment variable CUDA_VISIBLE_DEVICES instead.");
+                }
+            }
+            #[cfg(feature = "llamacpp")]
+            Output::LlamaCpp => {
+                if !local_model.path().is_file() { // requires the model path to be a file
+                    anyhow::bail!("--model-path should refer to a GGUF file. llama_cpp does not support safetensors.");
+                }
+            }
+        }
+        Ok(()) // reached only when no check above bailed
     }
 
-    /// Convert the flags back to a command line. Including only the non-null values, but
-    /// include the defaults. Includes the canonicalized model path and normalized model name.
-    ///
-    /// Used to pass arguments to python engines via `pystr` and `pytok`.
-    pub fn as_vec(&self, path: &str, name: &str) -> Vec<String> {
-        let mut out = vec![
-            "--model-path".to_string(),
-            path.to_string(),
-            "--model-name".to_string(),
-            name.to_string(),
-            "--http-port".to_string(),
-            self.http_port.to_string(),
-            // Default 1
-            "--tensor-parallel-size".to_string(),
-            self.tensor_parallel_size.to_string(),
-            // Default 0
-            "--base-gpu-id".to_string(),
-            self.base_gpu_id.to_string(),
-            // Default 1
-            "--num-nodes".to_string(),
-            self.num_nodes.to_string(),
-            // Default 0
-            "--node-rank".to_string(),
-            self.node_rank.to_string(),
-        ];
-        if let Some(model_config_path) = self.model_config.as_ref() {
-            out.push("--model-config".to_string());
-            out.push(model_config_path.display().to_string());
-        }
-        if let Some(leader) = self.leader_addr.as_ref() {
-            out.push("--leader-addr".to_string());
-            out.push(leader.to_string());
-        }
-        if let Some(extra_engine_args) = self.extra_engine_args.as_ref() {
-            out.push("--extra-engine-args".to_string());
-            out.push(extra_engine_args.display().to_string());
-        }
-        if let Some(weight) = self.kv_overlap_score_weight {
-            out.push("--kv-overlap-score-weight".to_string());
-            out.push(weight.to_string());
-        }
-        if let Some(weight) = self.kv_gpu_cache_usage_weight {
-            out.push("--kv-gpu-cache-usage-weight".to_string());
-            out.push(weight.to_string());
-        }
-        if let Some(weight) = self.kv_waiting_requests_weight {
-            out.push("--kv-waiting-requests-weight".to_string());
-            out.push(weight.to_string());
-        }
-        out.extend(self.last.clone());
-        out
+    pub fn router_config(&self) -> RouterConfig { // builds a RouterConfig from the router-related flags
+        RouterConfig::new(
+            self.router_mode.into(), // converted with Into to the type RouterConfig::new takes
+            KvRouterConfig::new( // the three KV weight flags are passed through unchanged
+                self.kv_overlap_score_weight,
+                self.kv_gpu_cache_usage_weight,
+                self.kv_waiting_requests_weight,
+            ),
+        )
     }
 
     /// Load extra engine arguments from a JSON file