Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
feat(python): Python bindings for the Dynamo CLI tools
Example dynamo-run style Python CLI in `examples/cli/cli.py`.

It is slower than the pure-Rust binary; we still need to find out why.

There are extra steps for using mistralrs or llamacpp, see the README.
  • Loading branch information
grahamking committed Jul 8, 2025
commit 9def849c39855dea6abb27c8bd181b47d91d00d0
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

116 changes: 116 additions & 0 deletions examples/cli/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Example cli using the Python bindings.
# Usage: `python cli.py text mistralrs --model-path <your-model>`.
# If `--model-path` not provided defaults to Qwen3 0.6B.
# Must be in a virtualenv with the bindings (or wheel) installed.

import argparse
import sys
from pathlib import Path

import uvloop

from dynamo.llm import EngineType, EntrypointArgs, make_engine, run_input
from dynamo.runtime import DistributedRuntime, dynamo_worker


def parse_args():
    """
    Parse command-line options for this example CLI.

    Returns:
        argparse.Namespace with the required positionals ``input_source`` and
        ``output_type``, plus the optional engine-configuration flags.
    """
    parser = argparse.ArgumentParser(
        description="Run a Dynamo LLM engine with configurable parameters.",
        # Print each option's default value in --help output.
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Required positionals: where requests come from, and which engine serves them.
    parser.add_argument(
        "input_source",
        type=str,
        help="Input source for the engine: 'text', 'http', 'stdin', 'batch:file.jsonl', 'dyn://<name>'",
    )
    parser.add_argument(
        "output_type",
        type=str,
        help="Output type (engine type): 'echo', 'mistralrs', 'llamacpp', 'dyn'",
    )

    # Optional flags, one per field of the Rust-side EntrypointArgs struct.
    # Each entry is (flag, keyword arguments for add_argument).
    optional_flags = [
        (
            "--model-path",
            dict(
                type=Path,
                default=Path("Qwen/Qwen3-0.6B"),
                help="Path to the model directory.",
            ),
        ),
        ("--model-name", dict(type=str, help="Name of the model to load.")),
        (
            "--model-config",
            dict(type=Path, help="Path to the model configuration file."),
        ),
        (
            "--context-length",
            dict(type=int, help="Maximum context length for the model (u32)."),
        ),
        (
            "--template-file",
            dict(type=Path, help="Path to the template file for text generation."),
        ),
        ("--kv-cache-block-size", dict(type=int, help="KV cache block size (u32).")),
        ("--http-port", dict(type=int, help="HTTP port for the engine (u16).")),
    ]
    for flag, kwargs in optional_flags:
        parser.add_argument(flag, **kwargs)

    return parser.parse_args()


@dynamo_worker(static=False)
async def run(runtime: DistributedRuntime):
    """
    Worker entrypoint: build the engine selected on the command line and wire
    it to the requested input source.

    Exits with status 1 (message on stderr) if the output type is unknown.
    """
    args = parse_args()

    # Dispatch table instead of an if/elif chain; also avoids shadowing the
    # `input` builtin with a local name.
    engine_types = {
        "echo": EngineType.Echo,
        "mistralrs": EngineType.MistralRs,
        "llamacpp": EngineType.LlamaCpp,
        "dyn": EngineType.Dynamic,
    }
    engine_type = engine_types.get(args.output_type)
    if engine_type is None:
        # Error goes to stderr so shell pipelines see a clean stdout.
        print(f"Unsupported output type: {args.output_type}", file=sys.stderr)
        sys.exit(1)

    # TODO: The "vllm", "sglang" and "trtllm" cases should call Python directly

    # Only forward flags the user actually set; the Rust side applies its own
    # defaults for anything omitted here.
    entrypoint_kwargs = {"model_path": args.model_path}
    for field in (
        "model_name",
        "model_config",
        "context_length",
        "template_file",
        "kv_cache_block_size",
        "http_port",
    ):
        value = getattr(args, field)
        if value is not None:
            entrypoint_kwargs[field] = value

    entrypoint_args = EntrypointArgs(engine_type, **entrypoint_kwargs)
    engine = await make_engine(runtime, entrypoint_args)
    await run_input(runtime, args.input_source, engine)


if __name__ == "__main__":
    # uvloop supplies the event loop and drives the async worker to completion.
    uvloop.run(run())
1 change: 1 addition & 0 deletions launch/dynamo-run/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ anyhow = { workspace = true }
async-openai = { workspace = true }
async-stream = { workspace = true }
async-trait = { workspace = true }
either = { workspace = true }
futures = { workspace = true }
libc = { workspace = true }
serde = { workspace = true }
Expand Down
13 changes: 9 additions & 4 deletions launch/dynamo-run/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use dynamo_llm::local_model::{LocalModel, LocalModelBuilder};
use dynamo_runtime::CancellationToken;

mod flags;
use either::Either;
pub use flags::Flags;
mod opt;
pub use dynamo_llm::request_template::RequestTemplate;
Expand Down Expand Up @@ -41,14 +42,19 @@ pub async fn run(
.kv_cache_block_size(flags.kv_cache_block_size)
// Only set if user provides. Usually loaded from tokenizer_config.json
.context_length(flags.context_length)
.http_port(flags.http_port)
.http_port(Some(flags.http_port))
.router_config(flags.router_config())
.request_template(flags.request_template.clone());

// If `in=dyn` we want the trtllm/sglang/vllm subprocess to listen on that endpoint.
// If not, then the endpoint isn't exposed so we let LocalModel invent one.
let mut rt = Either::Left(runtime.clone());
if let Input::Endpoint(path) = &in_opt {
builder.endpoint_id(path.parse().with_context(|| path.clone())?);
builder.endpoint_id(Some(path.parse().with_context(|| path.clone())?));

let distributed_runtime =
dynamo_runtime::DistributedRuntime::from_settings(runtime.clone()).await?;
rt = Either::Right(distributed_runtime);
};

let local_model = builder.build().await?;
Expand All @@ -70,8 +76,7 @@ pub async fn run(
//
// Run in from an input
//

dynamo_llm::entrypoint::input::run_input(in_opt, runtime, engine_config).await?;
dynamo_llm::entrypoint::input::run_input(rt, in_opt, engine_config).await?;

// Allow engines to ask main thread to wait on an extra future.
// We use this to stop the vllm and sglang sub-process
Expand Down
Loading