Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion components/backends/sglang/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,7 @@ Send a test request to verify your deployment:
curl localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"model": "Qwen/Qwen3-0.6B",
"messages": [
{
"role": "user",
Expand Down
4 changes: 2 additions & 2 deletions components/backends/sglang/deploy/agg.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ spec:
args:
- >-
python3 -m dynamo.sglang
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--model-path Qwen/Qwen3-0.6B
--served-model-name Qwen/Qwen3-0.6B
--page-size 16
--tp 1
--trust-remote-code
Expand Down
4 changes: 2 additions & 2 deletions components/backends/sglang/deploy/agg_router.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ spec:
args:
- >-
python3 -m dynamo.sglang
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--model-path Qwen/Qwen3-0.6B
--served-model-name Qwen/Qwen3-0.6B
--page-size 16
--tp 1
--trust-remote-code
Expand Down
4 changes: 2 additions & 2 deletions components/backends/sglang/deploy/disagg-multinode.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ spec:
args:
- >-
python3 -m dynamo.sglang
--model-path meta-llama/Llama-3.3-70B-Instruct
--served-model-name meta-llama/Llama-3.3-70B-Instruct
--model-path Qwen/Qwen3-0.6B
--served-model-name Qwen/Qwen3-0.6B
--tp-size 8
--trust-remote-code
--skip-tokenizer-init
Expand Down
8 changes: 4 additions & 4 deletions components/backends/sglang/deploy/disagg.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ spec:
args:
- >-
python3 -m dynamo.sglang
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--model-path Qwen/Qwen3-0.6B
--served-model-name Qwen/Qwen3-0.6B
--page-size 16
--tp 1
--trust-remote-code
Expand All @@ -59,8 +59,8 @@ spec:
args:
- >-
python3 -m dynamo.sglang
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--model-path Qwen/Qwen3-0.6B
--served-model-name Qwen/Qwen3-0.6B
--page-size 16
--tp 1
--trust-remote-code
Expand Down
8 changes: 4 additions & 4 deletions components/backends/sglang/deploy/disagg_planner.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -116,8 +116,8 @@ spec:
args:
- >-
python3 -m dynamo.sglang
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--model-path Qwen/Qwen3-0.6B
--served-model-name Qwen/Qwen3-0.6B
--page-size 16
--tp 1
--trust-remote-code
Expand All @@ -142,8 +142,8 @@ spec:
args:
- >-
python3 -m dynamo.sglang
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--model-path Qwen/Qwen3-0.6B
--served-model-name Qwen/Qwen3-0.6B
--page-size 16
--tp 1
--trust-remote-code
Expand Down
6 changes: 3 additions & 3 deletions components/backends/sglang/docs/sgl-hicache-example.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ This guide shows how to enable SGLang's Hierarchical Cache (HiCache) inside Dyna

```bash
python -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--model-path Qwen/Qwen3-0.6B \
--host 0.0.0.0 --port 8000 \
--page-size 64 \
--enable-hierarchical-cache \
Expand Down Expand Up @@ -39,7 +39,7 @@ python -m dynamo.frontend --http-port 8000
curl localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"model": "Qwen/Qwen3-0.6B",
"messages": [
{
"role": "user",
Expand All @@ -56,7 +56,7 @@ curl localhost:8000/v1/chat/completions \
Run the perf script:
```bash
bash -x /workspace/benchmarks/llm/perf.sh \
--model deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--model Qwen/Qwen3-0.6B \
--tensor-parallelism 1 \
--data-parallelism 1 \
--concurrency "2,4,8" \
Expand Down
4 changes: 2 additions & 2 deletions components/backends/sglang/launch/agg.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ DYNAMO_PID=$!

# run worker
python3 -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
--page-size 16 \
--tp 1 \
--trust-remote-code \
Expand Down
8 changes: 4 additions & 4 deletions components/backends/sglang/launch/agg_router.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ DYNAMO_PID=$!

# run worker
python3 -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
--page-size 16 \
--tp 1 \
--trust-remote-code \
Expand All @@ -30,8 +30,8 @@ python3 -m dynamo.sglang \
WORKER_PID=$!

CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
--page-size 16 \
--tp 1 \
--trust-remote-code \
Expand Down
8 changes: 4 additions & 4 deletions components/backends/sglang/launch/disagg.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ DYNAMO_PID=$!

# run prefill worker
python3 -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
--page-size 16 \
--tp 1 \
--trust-remote-code \
Expand All @@ -32,8 +32,8 @@ PREFILL_PID=$!

# run decode worker
CUDA_VISIBLE_DEVICES=1 python3 -m dynamo.sglang \
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B \
--model-path Qwen/Qwen3-0.6B \
--served-model-name Qwen/Qwen3-0.6B \
--page-size 16 \
--tp 1 \
--trust-remote-code \
Expand Down
4 changes: 2 additions & 2 deletions components/backends/trtllm/deploy/agg.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,6 @@ spec:
args:
- >-
python3 -m dynamo.trtllm
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--model-path Qwen/Qwen3-0.6B
--served-model-name Qwen/Qwen3-0.6B
--extra-engine-args engine_configs/agg.yaml
4 changes: 2 additions & 2 deletions components/backends/trtllm/deploy/agg_router.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ spec:
args:
- >-
python3 -m dynamo.trtllm
--model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B
--model-path Qwen/Qwen3-0.6B
--served-model-name Qwen/Qwen3-0.6B
--extra-engine-args engine_configs/agg.yaml
--publish-events-and-metrics
4 changes: 2 additions & 2 deletions components/backends/trtllm/deploy/disagg.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ spec:
- /bin/sh
- -c
args:
- "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy decode_first"
- "python3 -m dynamo.trtllm --model-path Qwen/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy decode_first"
TRTLLMDecodeWorker:
dynamoNamespace: trtllm-disagg
envFromSecret: hf-token-secret
Expand All @@ -47,4 +47,4 @@ spec:
- /bin/sh
- -c
args:
- "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy decode_first"
- "python3 -m dynamo.trtllm --model-path Qwen/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --extra-engine-args engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy decode_first"
4 changes: 2 additions & 2 deletions components/backends/trtllm/deploy/disagg_router.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ spec:
- /bin/sh
- -c
args:
- "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy prefill_first --publish-events-and-metrics"
- "python3 -m dynamo.trtllm --model-path Qwen/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --extra-engine-args engine_configs/prefill.yaml --disaggregation-mode prefill --disaggregation-strategy prefill_first --publish-events-and-metrics"
TRTLLMDecodeWorker:
dynamoNamespace: trtllm-v1-disagg-router
envFromSecret: hf-token-secret
Expand All @@ -50,4 +50,4 @@ spec:
- /bin/sh
- -c
args:
- "python3 -m dynamo.trtllm --model-path deepseek-ai/DeepSeek-R1-Distill-Llama-8B --served-model-name deepseek-ai/DeepSeek-R1-Distill-Llama-8B --extra-engine-args engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy prefill_first"
- "python3 -m dynamo.trtllm --model-path Qwen/Qwen3-0.6B --served-model-name Qwen/Qwen3-0.6B --extra-engine-args engine_configs/decode.yaml --disaggregation-mode decode --disaggregation-strategy prefill_first"
4 changes: 2 additions & 2 deletions components/backends/trtllm/launch/agg.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
# SPDX-License-Identifier: Apache-2.0

# Environment variables with defaults
export MODEL_PATH=${MODEL_PATH:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"engine_configs/agg.yaml"}
export MODALITY=${MODALITY:-"text"}
# If you want to use multimodal, set MODALITY to "multimodal"
Expand Down
4 changes: 2 additions & 2 deletions components/backends/trtllm/launch/agg_router.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
# SPDX-License-Identifier: Apache-2.0

# Environment variables with defaults
export MODEL_PATH=${MODEL_PATH:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"engine_configs/agg.yaml"}

# Setup cleanup trap
Expand Down
4 changes: 2 additions & 2 deletions components/backends/trtllm/launch/disagg.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
# SPDX-License-Identifier: Apache-2.0

# Environment variables with defaults
export MODEL_PATH=${MODEL_PATH:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"decode_first"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/decode.yaml"}
Expand Down
4 changes: 2 additions & 2 deletions components/backends/trtllm/launch/disagg_router.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
# SPDX-License-Identifier: Apache-2.0

# Environment variables with defaults
export MODEL_PATH=${MODEL_PATH:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-0.6B"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-0.6B"}
export DISAGGREGATION_STRATEGY=${DISAGGREGATION_STRATEGY:-"prefill_first"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"engine_configs/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"engine_configs/decode.yaml"}
Expand Down
Loading