Merged
41 commits
All commits authored by PeaBrane.

- `74bc13f` use BTreeSet, and allow for push_front (preemption) (May 27, 2025)
- `f2343d5` preemption is push_front (May 27, 2025)
- `6fe3154` use Hongkuan's quadratic formulas for decode and prefill (May 27, 2025)
- `cccebad` cleaner scheduling + generation separation, and waterline bug fix (May 28, 2025)
- `793d1d1` Merge branch 'main' into rupei/mocker-v0 (May 28, 2025)
- `394c2bf` restore printing out fwd pass metrics in test (May 30, 2025)
- `f5ab2e1` Merge remote-tracking branch 'origin/main' into rupei/mocker-v0 (Jun 11, 2025)
- `dad183f` multi-dp mocker engine (Jun 11, 2025)
- `009ec78` fixed prefill cost, and more conservative watermarking (Jun 12, 2025)
- `ee11427` fwd pass metrics (Jun 12, 2025)
- `8e8d0b4` can emit kv event, not tested (Jun 13, 2025)
- `e96f810` move block resp test in kv manager (Jun 13, 2025)
- `c09f007` basic test passes for both load metrics and kv events (Jun 14, 2025)
- `4502e5e` better tracing (Jun 14, 2025)
- `fe20aa3` async engine core (Jun 16, 2025)
- `2fbf998` hook up with dynamo run (Jun 17, 2025)
- `b548050` docs (Jun 17, 2025)
- `c7c4be5` fmt (Jun 17, 2025)
- `1845a8d` Merge branch 'main' into rupei/mocker-v0 (Jun 17, 2025)
- `3ad7780` refactor (Jun 17, 2025)
- `c78bef2` works with kv router (Jun 17, 2025)
- `a206569` actually load extra mocker args in guide (Jun 17, 2025)
- `d3730ff` free blocks if failed to send (receiver dropped) (Jun 23, 2025)
- `68d822a` do not regenereate tokens after pre-emption (Jun 23, 2025)
- `d69edcf` evictor cleanup (Jun 30, 2025)
- `c08f9ea` only need runtime in dynamic arms (Jun 30, 2025)
- `dee1413` no separate extra-mocker-args (Jun 30, 2025)
- `082bcec` Merge branch 'main' into rupei/mocker-v0 (Jun 30, 2025)
- `99fd3f2` update to match batched tokens (Jun 30, 2025)
- `85c7ccf` max-num-seqs (Jun 30, 2025)
- `ec1f360` enable_prefix_caching arg (Jun 30, 2025)
- `94abc0d` only publish kv events if enable_prefix_caching set true (Jun 30, 2025)
- `35da284` small note on chunked prefill being false for now (Jun 30, 2025)
- `c7c072d` revert flags (Jul 1, 2025)
- `de54247` revert dynamo-run changes (Jul 1, 2025)
- `81c12aa` tiny reversion (Jul 1, 2025)
- `b959df4` another reversion (Jul 1, 2025)
- `f07e28d` Merge remote-tracking branch 'origin/main' into rupei/mocker-v0 (Jul 1, 2025)
- `b15070a` usize reversion (Jul 1, 2025)
- `3a20b9d` clippy (Jul 1, 2025)
- `c747606` more clippy (Jul 1, 2025)
29 changes: 28 additions & 1 deletion docs/guides/dynamo_run.md
@@ -8,7 +8,7 @@ It supports these engines: mistralrs, llamacpp, sglang, vllm, and tensorrt-llm.

Usage:
```
dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=echo_core|echo_full|mistralrs|llamacpp|sglang|vllm|dyn [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--context-length=N] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin|kv] [--kv-overlap-score-weight=2.0] [--kv-gpu-cache-usage-weight=1.0] [--kv-waiting-requests-weight=1.0] [--verbosity (-v|-vv)]
dynamo-run in=[http|text|dyn://<path>|batch:<folder>] out=echo_core|echo_full|mocker|mistralrs|llamacpp|sglang|vllm|dyn [--http-port 8080] [--model-path <path>] [--model-name <served-model-name>] [--model-config <hf-repo>] [--tensor-parallel-size=1] [--context-length=N] [--num-nodes=1] [--node-rank=0] [--leader-addr=127.0.0.1:9876] [--base-gpu-id=0] [--extra-engine-args=args.json] [--router-mode random|round-robin|kv] [--kv-overlap-score-weight=2.0] [--kv-gpu-cache-usage-weight=1.0] [--kv-waiting-requests-weight=1.0] [--verbosity (-v|-vv)]
```

Example: `dynamo run Qwen/Qwen3-0.6B`
@@ -514,6 +514,33 @@ The output looks like this:
{"text":"What is the capital of Spain?","response":".The capital of Spain is Madrid.","tokens_in":7,"tokens_out":7,"elapsed_ms":855}
```

#### Mocker engine

The mocker engine is a mock vLLM implementation designed for testing and development purposes. It simulates realistic token generation timing without requiring actual model inference, making it useful for:

- Testing distributed system components without GPU resources
- Benchmarking infrastructure and networking overhead
- Developing and debugging Dynamo components
- Load testing and performance analysis

**Basic usage:**

The `--model-path` argument is required but can point to any valid model path; the mocker does not actually load the model weights. The arguments `block-size`, `num-gpu-blocks`, `max-num-seqs`, `max-num-batched-tokens`, and `enable-prefix-caching` are shared with the real vLLM engine.

The following arguments are mocker-specific:
- `speedup_ratio`: Speed multiplier for token generation (default: 1.0). Higher values make the simulated engine run faster.
- `dp_size`: Number of data parallel workers to simulate (default: 1).
- `watermark`: KV cache watermark threshold as a fraction (default: 0.01). This argument also exists for the real vLLM engine but cannot be passed as an engine arg.

>[!NOTE]
>Currently, `enable_chunked_prefill` is always assumed to be false, which mirrors the vllm v0 behavior. This is also the current behavior in `examples/llm`. This will be updated in the near future as we move to support vllm v1 (and deprecate support for vllm v0).

```bash
echo '{"speedup_ratio": 10.0}' > mocker_args.json
dynamo-run in=dyn://dynamo.mocker.generate out=mocker --model-path TinyLlama/TinyLlama-1.1B-Chat-v1.0 --extra-engine-args mocker_args.json
dynamo-run in=http out=dyn --router-mode kv
```
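A complete args file combining the shared vLLM-style arguments with the mocker-specific ones might look like this (the keys match those read by `dynamo-run` from the JSON file; the values are illustrative, not recommendations):

```json
{
  "block_size": 16,
  "num_gpu_blocks": 4096,
  "max_num_seqs": 256,
  "max_num_batched_tokens": 8192,
  "enable_prefix_caching": true,
  "speedup_ratio": 10.0,
  "dp_size": 2,
  "watermark": 0.01
}
```

Note that the keys use snake_case, while the equivalent command-line flags use dashes.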

### Extra engine arguments
The vllm and sglang backends support passing any argument the engine accepts.
Put the arguments in a JSON file:
16 changes: 12 additions & 4 deletions launch/dynamo-run/src/flags.rs
@@ -216,19 +216,27 @@ impl Flags {
out
}

/// Load extra engine arguments from a JSON file
/// Load extra arguments from a JSON file
/// Returns a HashMap of parameter names to values
pub fn load_extra_engine_args(
&self,
fn load_json_args(
path: &Option<PathBuf>,
) -> anyhow::Result<Option<HashMap<String, serde_json::Value>>> {
if let Some(path) = &self.extra_engine_args {
if let Some(path) = path {
let file_content = std::fs::read_to_string(path)?;
let args: HashMap<String, serde_json::Value> = serde_json::from_str(&file_content)?;
Ok(Some(args))
} else {
Ok(None)
}
}

/// Load extra engine arguments from a JSON file
/// Returns a HashMap of parameter names to values
pub fn load_extra_engine_args(
&self,
) -> anyhow::Result<Option<HashMap<String, serde_json::Value>>> {
Self::load_json_args(&self.extra_engine_args)
}
}

#[derive(Default, PartialEq, Eq, ValueEnum, Clone, Debug, Copy)]
88 changes: 86 additions & 2 deletions launch/dynamo-run/src/lib.rs
@@ -9,6 +9,7 @@ use dynamo_llm::{backend::ExecutionContext, engines::StreamingEngine, local_mode
use dynamo_runtime::protocols::Endpoint as EndpointId;
use dynamo_runtime::slug::Slug;
use dynamo_runtime::{CancellationToken, DistributedRuntime};
use tokio::sync::OnceCell;

mod flags;
pub use flags::Flags;
@@ -64,6 +65,21 @@ pub async fn run(
.clone()
.or(flags.model_path_flag.clone());

// Create a OnceCell for lazy initialization of distributed runtime
let distributed_runtime_cell: OnceCell<DistributedRuntime> = OnceCell::new();
let runtime_clone = runtime.clone();

// Helper closure to get or initialize the distributed runtime
let get_distributed_runtime = || async {
distributed_runtime_cell
.get_or_init(|| async {
DistributedRuntime::from_settings(runtime_clone.clone())
.await
.expect("Failed to create distributed runtime")
})
.await
};

let mut local_model: LocalModel = if is_out_dynamic(&out_opt) {
// If output is dynamic we are ingress and don't have a local model, but making an
// empty one cleans up the code.
@@ -285,6 +301,73 @@ pub async fn run(
model: Box::new(local_model),
}
}

Output::Mocker => {
let endpoint = match &in_opt {
Input::Endpoint(path) => path.parse()?,
_ => internal_endpoint("mocker"),
};

// Load mocker args from JSON file if provided
let engine_args = flags.load_extra_engine_args()?;

let mut builder = dynamo_llm::mocker::protocols::MockEngineArgs::builder();

// Use kv_cache_block_size flag as block_size if provided
if let Some(block_size) = flags.kv_cache_block_size {
builder = builder.block_size(block_size);
}

// Apply args from JSON file if provided
if let Some(args) = engine_args {
// This overwrites the kv_cache_block_size passed in
if let Some(v) = args.get("block_size").and_then(|v| v.as_u64()) {
builder = builder.block_size(v as usize);
}
if let Some(v) = args.get("num_gpu_blocks").and_then(|v| v.as_u64()) {
builder = builder.num_gpu_blocks(v as usize);
}
if let Some(v) = args.get("max_num_seqs").and_then(|v| v.as_u64()) {
builder = builder.max_num_seqs(Some(v as usize));
}
if let Some(v) = args.get("max_num_batched_tokens").and_then(|v| v.as_u64()) {
builder = builder.max_num_batched_tokens(Some(v as usize));
}
if let Some(v) = args.get("enable_prefix_caching").and_then(|v| v.as_bool()) {
builder = builder.enable_prefix_caching(v);
}

// These are mocker-specific args
if let Some(v) = args.get("speedup_ratio").and_then(|v| v.as_f64()) {
builder = builder.speedup_ratio(v);
}
if let Some(v) = args.get("watermark").and_then(|v| v.as_f64()) {
builder = builder.watermark(v);
}
if let Some(v) = args.get("dp_size").and_then(|v| v.as_u64()) {
builder = builder.dp_size(v as u32);
}
}

let args = builder
.build()
.map_err(|e| anyhow::anyhow!("Failed to build MockEngineArgs: {e}"))?;

// Get or initialize the distributed runtime
let distributed_runtime = get_distributed_runtime().await;
let engine = dynamo_llm::mocker::engine::make_mocker_engine(
distributed_runtime.clone(),
endpoint,
args,
)
.await
.map_err(|e| anyhow::anyhow!("Failed to create mocker engine: {e}"))?;

EngineConfig::StaticCore {
engine,
model: Box::new(local_model),
}
}
};

match in_opt {
@@ -311,8 +394,9 @@
.await?;
}
Input::Endpoint(path) => {
let distributed_runtime = DistributedRuntime::from_settings(runtime.clone()).await?;
crate::input::endpoint::run(distributed_runtime, path, engine_config).await?;
// Get or initialize the distributed runtime
let distributed_runtime = get_distributed_runtime().await;
crate::input::endpoint::run(distributed_runtime.clone(), path, engine_config).await?;
}
}

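The lazy `OnceCell` initialization added in `lib.rs` ensures the `DistributedRuntime` is constructed at most once, and only when some code path actually needs it. A minimal standalone sketch of the same pattern, using std's synchronous `OnceLock` in place of `tokio::sync::OnceCell`, with a hypothetical `Runtime` struct standing in for `DistributedRuntime`:

```rust
use std::sync::OnceLock;

// Hypothetical stand-in for the expensive-to-build DistributedRuntime.
#[derive(Debug)]
struct Runtime {
    id: u32,
}

// Construct the runtime on first use; every later caller gets the same instance.
fn get_or_init_runtime(cell: &OnceLock<Runtime>) -> &Runtime {
    cell.get_or_init(|| {
        // Expensive construction runs only once, on the first call.
        Runtime { id: 1 }
    })
}

fn main() {
    let cell: OnceLock<Runtime> = OnceLock::new();
    let first = get_or_init_runtime(&cell);
    let second = get_or_init_runtime(&cell);
    // Both call sites observe the identical instance.
    assert!(std::ptr::eq(first, second));
}
```

The PR uses the async `tokio::sync::OnceCell::get_or_init` variant instead, so that `DistributedRuntime::from_settings` can be awaited inside the initializer closure.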
12 changes: 11 additions & 1 deletion launch/dynamo-run/src/opt.rs
@@ -90,6 +90,9 @@ pub enum Output {
/// Listen for models on nats/etcd, add/remove dynamically
Dynamic,

/// Mock vLLM engine for testing and development
Mocker,

#[cfg(feature = "mistralrs")]
/// Run inference on a model in a GGUF file using mistralrs w/ candle
MistralRs,
@@ -126,6 +129,7 @@ impl TryFrom<&str> for Output {

"echo_full" => Ok(Output::EchoFull),
"echo_core" => Ok(Output::EchoCore),
"mocker" => Ok(Output::Mocker),

"dyn" => Ok(Output::Dynamic),

@@ -160,6 +164,8 @@ impl fmt::Display for Output {
Output::EchoCore => "echo_core",

Output::Dynamic => "dyn",

Output::Mocker => "mocker",
};
write!(f, "{s}")
}
@@ -168,7 +174,11 @@
impl Output {
#[allow(unused_mut)]
pub fn available_engines() -> Vec<String> {
let mut out = vec!["echo_core".to_string(), "echo_full".to_string()];
let mut out = vec![
"echo_core".to_string(),
"echo_full".to_string(),
"mocker".to_string(),
];
#[cfg(feature = "mistralrs")]
{
out.push(Output::MistralRs.to_string());
1 change: 1 addition & 0 deletions lib/llm/src/mocker.rs
@@ -13,6 +13,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

pub mod engine;
pub mod evictor;
pub mod kv_manager;
pub mod protocols;