Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
feat: add crds for llm example, no planner
  • Loading branch information
hhzhang16 committed Jul 3, 2025
commit ec912faa33d5692ee6ba36bdacfc2717f9072264
92 changes: 92 additions & 0 deletions examples/llm/deploy/agg.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
---
# Aggregated (single-graph) LLM serving example: Frontend -> Processor -> VllmWorker.
# The Planner is present in the config but disabled ("no-operation": true).
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: llm-agg
spec:
  envs:
    # Serialized deployment config consumed by the dynamo runtime. Kept as one
    # single-quoted JSON string so YAML performs no reinterpretation of its
    # contents (booleans, numbers, nested quotes).
    - name: DYN_DEPLOYMENT_CONFIG
      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"router":"round-robin","router-num-threads":4,"common-configs":["model","block-size","max-model-len"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
  services:
    # HTTP entry point (port 8000 per the config above) for chat/completions.
    Frontend:
      dynamoNamespace: llm-agg
      componentType: main
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          # NOTE(review): a bare "&&" arg only chains commands if the image
          # entrypoint passes args through a shell — confirm vllm-runtime does.
          args:
            - cd
            - /workspace/examples/llm
            - "&&"
            - dynamo
            - serve
            - graphs.agg:Frontend
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Frontend
    # Request pre/post-processing stage between Frontend and the worker.
    Processor:
      dynamoNamespace: llm-agg
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          args:
            - cd
            - /workspace/examples/llm
            - "&&"
            - dynamo
            - serve
            - graphs.agg:Processor
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Processor
    # GPU inference worker; pulls model weights from Hugging Face, so the HF
    # token secret is injected via envFromSecret.
    # NOTE(review): componentType is omitted here while Frontend/Processor set
    # it — confirm the operator's default is the intended value.
    VllmWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: llm-agg
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          args:
            - cd
            - /workspace/examples/llm
            - "&&"
            - dynamo
            - serve
            - graphs.agg:VllmWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmWorker
119 changes: 119 additions & 0 deletions examples/llm/deploy/agg_router.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
---
# Aggregated LLM serving with KV-aware routing: Frontend -> Processor -> Router
# -> VllmWorker. Uses the "kv" router and the DynamoNixlConnector KV-transfer
# config; Planner is disabled ("no-operation": true).
# NOTE(review): metadata.name "agg-router" breaks the llm-* naming used by the
# sibling examples (llm-agg, llm-disagg) and by dynamoNamespace below
# (llm-agg-router) — consider renaming for consistency.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: agg-router
spec:
  envs:
    # Serialized deployment config consumed by the dynamo runtime. Kept as one
    # single-quoted JSON string so YAML performs no reinterpretation; the inner
    # kv-transfer-config is itself an escaped JSON string.
    - name: DYN_DEPLOYMENT_CONFIG
      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","router":"kv","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"common-configs":["model","block-size","max-model-len","router"]},"Router":{"min-workers":1,"softmax-sample":true,"common-configs":["model","block-size","router"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"tensor-parallel-size":1,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","router","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
  services:
    # HTTP entry point (port 8000 per the config above) for chat/completions.
    Frontend:
      dynamoNamespace: llm-agg-router
      componentType: main
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          # NOTE(review): a bare "&&" arg only chains commands if the image
          # entrypoint passes args through a shell — confirm vllm-runtime does.
          args:
            - cd
            - /workspace/examples/llm
            - "&&"
            - dynamo
            - serve
            - graphs.agg_router:Frontend
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Frontend
    # Request pre/post-processing stage between Frontend and the router.
    Processor:
      dynamoNamespace: llm-agg-router
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          args:
            - cd
            - /workspace/examples/llm
            - "&&"
            - dynamo
            - serve
            - graphs.agg_router:Processor
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Processor
    # KV-aware routing stage ("router":"kv" with softmax sampling per config).
    Router:
      dynamoNamespace: llm-agg-router
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          args:
            - cd
            - /workspace/examples/llm
            - "&&"
            - dynamo
            - serve
            - graphs.agg_router:Router
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Router
    # GPU inference worker; pulls model weights from Hugging Face, so the HF
    # token secret is injected via envFromSecret.
    # NOTE(review): componentType is omitted here while the other services set
    # it — confirm the operator's default is the intended value.
    VllmWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: llm-agg-router
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          args:
            - cd
            - /workspace/examples/llm
            - "&&"
            - dynamo
            - serve
            - graphs.agg_router:VllmWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmWorker
121 changes: 121 additions & 0 deletions examples/llm/deploy/disagg.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
---
# Disaggregated LLM serving example: prefill and decode run in separate worker
# services (PrefillWorker / VllmWorker) connected via the DynamoNixlConnector
# KV-transfer config; conditional disaggregation thresholds are set in the
# config below. Planner is disabled ("no-operation": true).
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: llm-disagg
spec:
  envs:
    # Serialized deployment config consumed by the dynamo runtime. Kept as one
    # single-quoted JSON string so YAML performs no reinterpretation; the inner
    # kv-transfer-config is itself an escaped JSON string.
    - name: DYN_DEPLOYMENT_CONFIG
      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"router":"round-robin","common-configs":["model","block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
  services:
    # HTTP entry point (port 8000 per the config above) for chat/completions.
    Frontend:
      dynamoNamespace: llm-disagg
      componentType: main
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          # NOTE(review): a bare "&&" arg only chains commands if the image
          # entrypoint passes args through a shell — confirm vllm-runtime does.
          args:
            - cd
            - /workspace/examples/llm
            - "&&"
            - dynamo
            - serve
            - graphs.disagg:Frontend
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Frontend
    # Request pre/post-processing stage with round-robin routing (per config).
    Processor:
      dynamoNamespace: llm-disagg
      componentType: worker
      replicas: 1
      resources:
        requests:
          cpu: "1"
          memory: "2Gi"
        limits:
          cpu: "1"
          memory: "2Gi"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          args:
            - cd
            - /workspace/examples/llm
            - "&&"
            - dynamo
            - serve
            - graphs.disagg:Processor
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - Processor
    # Decode-side GPU worker; offloads long prefills to PrefillWorker
    # ("remote-prefill"/"conditional-disagg" in the config above). HF token
    # secret injected for model download.
    # NOTE(review): componentType is omitted here and on PrefillWorker while
    # the other services set it — confirm the operator's default.
    VllmWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: llm-disagg
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          args:
            - cd
            - /workspace/examples/llm
            - "&&"
            - dynamo
            - serve
            - graphs.disagg:VllmWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - VllmWorker
    # Prefill-side GPU worker; receives remote prefill requests and transfers
    # KV cache back via the configured connector.
    PrefillWorker:
      envFromSecret: hf-token-secret
      dynamoNamespace: llm-disagg
      replicas: 1
      resources:
        requests:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
        limits:
          cpu: "10"
          memory: "20Gi"
          gpu: "1"
      extraPodSpec:
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
          args:
            - cd
            - /workspace/examples/llm
            - "&&"
            - dynamo
            - serve
            - graphs.disagg:PrefillWorker
            - --system-app-port
            - "5000"
            - --enable-system-app
            - --use-default-health-checks
            - --service-name
            - PrefillWorker
PrefillWorker:
envFromSecret: hf-token-secret
dynamoNamespace: llm-disagg
replicas: 1
resources:
requests:
cpu: "10"
memory: "20Gi"
gpu: "1"
limits:
cpu: "10"
memory: "20Gi"
gpu: "1"
extraPodSpec:
mainContainer:
image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
args:
- cd
- /workspace/examples/llm
- "&&"
- dynamo
- serve
- graphs.disagg:PrefillWorker
- --system-app-port
- "5000"
- --enable-system-app
- --use-default-health-checks
- --service-name
- PrefillWorker
Loading
Loading