Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
feat(python): Python bindings for the Dynamo CLI tools
Example dynamo-run style Python CLI in `examples/cli/cli.py`.

It is slower than the pure-Rust binary; we still need to find out why.

There are extra steps for using mistralrs or llamacpp, see the README.
  • Loading branch information
grahamking committed Jul 8, 2025
commit 9def849c39855dea6abb27c8bd181b47d91d00d0
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

116 changes: 116 additions & 0 deletions examples/cli/cli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Example cli using the Python bindings.
# Usage: `python cli.py text mistralrs --model-path <your-model>`.
# If `--model-path` not provided defaults to Qwen3 0.6B.
# Must be in a virtualenv with the bindings (or wheel) installed.

import argparse
import sys
from pathlib import Path

import uvloop

from dynamo.llm import EngineType, EntrypointArgs, make_engine, run_input
from dynamo.runtime import DistributedRuntime, dynamo_worker


def parse_args():
    """
    Parse command-line options for this example CLI.

    Returns:
        argparse.Namespace with the required positionals ``input_source`` and
        ``output_type``, plus the optional engine-configuration flags.
    """
    parser = argparse.ArgumentParser(
        description="Run a Dynamo LLM engine with configurable parameters.",
        # Print each option's default value in --help output.
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Required positionals: where requests come from, and which engine serves them.
    parser.add_argument(
        "input_source",
        type=str,
        help="Input source for the engine: 'text', 'http', 'stdin', 'batch:file.jsonl', 'dyn://<name>'",
    )
    parser.add_argument(
        "output_type",
        type=str,
        help="Output type (engine type): 'echo', 'mistralrs', 'llamacpp', 'dyn'",
    )

    # Optional flags, one per field of the Rust-side EntrypointArgs struct.
    # Each entry is (flag, keyword arguments for add_argument).
    optional_flags = [
        (
            "--model-path",
            dict(
                type=Path,
                default=Path("Qwen/Qwen3-0.6B"),
                help="Path to the model directory.",
            ),
        ),
        ("--model-name", dict(type=str, help="Name of the model to load.")),
        (
            "--model-config",
            dict(type=Path, help="Path to the model configuration file."),
        ),
        (
            "--context-length",
            dict(type=int, help="Maximum context length for the model (u32)."),
        ),
        (
            "--template-file",
            dict(type=Path, help="Path to the template file for text generation."),
        ),
        ("--kv-cache-block-size", dict(type=int, help="KV cache block size (u32).")),
        ("--http-port", dict(type=int, help="HTTP port for the engine (u16).")),
    ]
    for flag, kwargs in optional_flags:
        parser.add_argument(flag, **kwargs)

    return parser.parse_args()


@dynamo_worker(static=False)
async def run(runtime: DistributedRuntime):
    """
    Worker entrypoint: build the engine selected on the command line and wire
    it to the requested input source.

    Exits with status 1 (message on stderr) if the output type is unknown.
    """
    args = parse_args()

    # Dispatch table instead of an if/elif chain; also avoids shadowing the
    # `input` builtin with a local name.
    engine_types = {
        "echo": EngineType.Echo,
        "mistralrs": EngineType.MistralRs,
        "llamacpp": EngineType.LlamaCpp,
        "dyn": EngineType.Dynamic,
    }
    engine_type = engine_types.get(args.output_type)
    if engine_type is None:
        # Error goes to stderr so shell pipelines see a clean stdout.
        print(f"Unsupported output type: {args.output_type}", file=sys.stderr)
        sys.exit(1)

    # TODO: The "vllm", "sglang" and "trtllm" cases should call Python directly

    # Only forward flags the user actually set; the Rust side applies its own
    # defaults for anything omitted here.
    entrypoint_kwargs = {"model_path": args.model_path}
    for field in (
        "model_name",
        "model_config",
        "context_length",
        "template_file",
        "kv_cache_block_size",
        "http_port",
    ):
        value = getattr(args, field)
        if value is not None:
            entrypoint_kwargs[field] = value

    entrypoint_args = EntrypointArgs(engine_type, **entrypoint_kwargs)
    engine = await make_engine(runtime, entrypoint_args)
    await run_input(runtime, args.input_source, engine)


if __name__ == "__main__":
    # uvloop supplies the event loop and drives the async worker to completion.
    uvloop.run(run())
1 change: 1 addition & 0 deletions launch/dynamo-run/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ anyhow = { workspace = true }
async-openai = { workspace = true }
async-stream = { workspace = true }
async-trait = { workspace = true }
either = { workspace = true }
futures = { workspace = true }
libc = { workspace = true }
serde = { workspace = true }
Expand Down
13 changes: 9 additions & 4 deletions launch/dynamo-run/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use dynamo_llm::local_model::{LocalModel, LocalModelBuilder};
use dynamo_runtime::CancellationToken;

mod flags;
use either::Either;
pub use flags::Flags;
mod opt;
pub use dynamo_llm::request_template::RequestTemplate;
Expand Down Expand Up @@ -41,14 +42,19 @@ pub async fn run(
.kv_cache_block_size(flags.kv_cache_block_size)
// Only set if user provides. Usually loaded from tokenizer_config.json
.context_length(flags.context_length)
.http_port(flags.http_port)
.http_port(Some(flags.http_port))
.router_config(flags.router_config())
.request_template(flags.request_template.clone());

// If `in=dyn` we want the trtllm/sglang/vllm subprocess to listen on that endpoint.
// If not, then the endpoint isn't exposed so we let LocalModel invent one.
let mut rt = Either::Left(runtime.clone());
if let Input::Endpoint(path) = &in_opt {
builder.endpoint_id(path.parse().with_context(|| path.clone())?);
builder.endpoint_id(Some(path.parse().with_context(|| path.clone())?));

let distributed_runtime =
dynamo_runtime::DistributedRuntime::from_settings(runtime.clone()).await?;
rt = Either::Right(distributed_runtime);
};

let local_model = builder.build().await?;
Expand All @@ -70,8 +76,7 @@ pub async fn run(
//
// Run in from an input
//

dynamo_llm::entrypoint::input::run_input(in_opt, runtime, engine_config).await?;
dynamo_llm::entrypoint::input::run_input(rt, in_opt, engine_config).await?;

// Allow engines to ask main thread to wait on an extra future.
// We use this to stop the vllm and sglang sub-process
Expand Down
Loading