From 7ac97b014f86a753fa30a14d77904316c6a832a5 Mon Sep 17 00:00:00 2001 From: mohammedabdulwahhab Date: Thu, 3 Jul 2025 10:46:32 -0700 Subject: [PATCH 01/14] fix: fix --- examples/vllm_v0/deploy/agg.yaml | 59 +++++++++ examples/vllm_v0/deploy/disagg.yaml | 88 ++++++++++++ examples/vllm_v0/deploy/disagg_planner.yaml | 140 ++++++++++++++++++++ 3 files changed, 287 insertions(+) create mode 100644 examples/vllm_v0/deploy/agg.yaml create mode 100644 examples/vllm_v0/deploy/disagg.yaml create mode 100644 examples/vllm_v0/deploy/disagg_planner.yaml diff --git a/examples/vllm_v0/deploy/agg.yaml b/examples/vllm_v0/deploy/agg.yaml new file mode 100644 index 0000000000..eb21051ea3 --- /dev/null +++ b/examples/vllm_v0/deploy/agg.yaml @@ -0,0 +1,59 @@ +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: agg +spec: + Frontend: + dynamoNamespace: inference + componentType: main + replicas: 1 + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + cpu: "1" + memory: "2Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/vllm_v0 + args: + - dynamo + - serve + - graphs.agg:Frontend + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - Frontend + - -f + - ./configs/agg.yaml + VllmWorker: + replicas: 1 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/vllm_v0 + args: + - dynamo + - serve + - graphs.agg:Frontend + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - VllmWorker + - -f + - ./configs/agg.yaml \ No newline at end of file diff --git a/examples/vllm_v0/deploy/disagg.yaml b/examples/vllm_v0/deploy/disagg.yaml new file mode 100644 index 0000000000..c5b546a78a --- /dev/null +++ b/examples/vllm_v0/deploy/disagg.yaml @@ -0,0 +1,88 @@ +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: disagg +spec: + Frontend: + dynamoNamespace: inference + componentType: main + replicas: 1 + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + cpu: "1" + memory: "2Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/vllm_v0 + args: + - dynamo + - serve + - graphs.disagg:Frontend + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - Frontend + - -f + - ./configs/disagg.yaml + + VllmWorker: + replicas: 1 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/vllm_v0 + args: + - dynamo + - serve + - graphs.disagg:VllmWorker + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - VllmWorker + - -f + - ./configs/disagg.yaml + + PrefillWorker: + replicas: 1 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/vllm_v0 + args: + - dynamo + - serve + - graphs.disagg:PrefillWorker + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - PrefillWorker + - -f + - ./configs/disagg.yaml \ No newline at end of file diff --git a/examples/vllm_v0/deploy/disagg_planner.yaml b/examples/vllm_v0/deploy/disagg_planner.yaml new file mode 100644 index 0000000000..f272418f6f --- /dev/null +++ b/examples/vllm_v0/deploy/disagg_planner.yaml @@ -0,0 +1,140 @@ +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: disagg-planner +spec: + Frontend: + dynamoNamespace: inference + componentType: main + replicas: 1 + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + cpu: "1" + memory: "2Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/vllm_v0 + args: + - dynamo + - serve + - graphs.disagg_planner:Frontend + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - Frontend + - -f + - ./configs/disagg_planner.yaml + + VllmWorker: + replicas: 1 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/vllm_v0 + args: + - dynamo + - serve + - graphs.disagg_planner:VllmWorker + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - VllmWorker + - -f + - ./configs/disagg_planner.yaml + + PrefillWorker: + replicas: 1 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/vllm_v0 + args: + - dynamo + - serve + - graphs.disagg_planner:PrefillWorker + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - PrefillWorker + - -f + - ./configs/disagg_planner.yaml + + Planner: + replicas: 1 + resources: + requests: + cpu: "1" + memory: "1Gi" + limits: + cpu: "1" + memory: "1Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/vllm_v0 + args: + - dynamo + - serve + - graphs.disagg_planner:Planner + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - Planner + - -f + - ./configs/disagg_planner.yaml + + Prometheus: + replicas: 1 + resources: + requests: + cpu: "500m" + memory: "500Mi" + limits: + cpu: "500m" + memory: "500Mi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/vllm_v0 + args: + - dynamo + - serve + - graphs.disagg_planner:Prometheus + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - Prometheus + - -f + - ./configs/disagg_planner.yaml \ No newline at end of file From 4279528d8f4ef635868211bd27d28a6c7fbbce99 Mon Sep 17 00:00:00 2001 From: mohammedabdulwahhab Date: Thu, 3 Jul 2025 10:57:43 -0700 Subject: [PATCH 02/14] fix: fix --- examples/llm/crd.yaml | 81 +++++++++++++++++++++ examples/vllm_v0/deploy/disagg.yaml | 4 +- examples/vllm_v0/deploy/disagg_planner.yaml | 12 +-- 3 files changed, 90 insertions(+), 7 deletions(-) create mode 100644 examples/llm/crd.yaml diff --git a/examples/llm/crd.yaml b/examples/llm/crd.yaml new file mode 100644 index 0000000000..965515c9de --- /dev/null +++ b/examples/llm/crd.yaml @@ -0,0 +1,81 @@ +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: agg +spec: + envs: + services: + Frontend: + dynamoNamespace: inference + componentType: main + replicas: 1 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 1000m + memory: 1000Mi + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + command: + - dynamo + - serve + - graphs.agg:Frontend + - -f + - ./configs/agg.yaml + Middle: + dynamoNamespace: inference + replicas: 1 + extraPodSpec: + mainContainer: + image: gitlab-master.nvidia.com:5005/aire/microservices/compoundai/dynamo-pipelines:dynamo.existing.444f19b8 + command: + - sh + - -c + args: + - cd src && uv run dynamo serve --system-app-port 5000 --enable-system-app --use-default-health-checks --service-name Middle hello_world:Frontend --Backend.ServiceArgs.dynamo.namespace=inference + Backend: + dynamoNamespace: inference + envs: + - name: SOME_ENV + value: ‘somevalue’ + replicas: 1 + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 1000m + memory: 1000Mi + extraPodSpec: + mainContainer: + image: gitlab-master.nvidia.com:5005/aire/microservices/compoundai/dynamo-pipelines:dynamo.existing.444f19b8 + command: + - sh + - -c + args: + - cd src && uv run dynamo serve --system-app-port 5000 --enable-system-app --use-default-health-checks --service-name Middle hello_world:Frontend --Backend.ServiceArgs.dynamo.namespace=inference + Planner: + dynamoNamespace: inference + envs: + - name: SOME_ENV + value: ‘somevalue’ + replicas: 1 + componentType: planner + resources: + requests: + cpu: 100m + memory: 100Mi + limits: + cpu: 1000m + memory: 1000Mi + extraPodSpec: + mainContainer: + image: gitlab-master.nvidia.com:5005/aire/microservices/compoundai/dynamo-pipelines:planner + command: + - sh + - -c + args: + - cd src && uv run dynamo serve --system-app-port 5000 --enable-system-app --service-name Planner graphs.disagg_router:Frontend --Planner.ServiceArgs.dynamo.namespace=inference --Planner.environment=kubernetes diff --git a/examples/vllm_v0/deploy/disagg.yaml b/examples/vllm_v0/deploy/disagg.yaml index c5b546a78a..92252f6d52 100644 --- a/examples/vllm_v0/deploy/disagg.yaml +++ b/examples/vllm_v0/deploy/disagg.yaml @@ -49,7 +49,7 @@ spec: args: - dynamo - serve - - graphs.disagg:VllmWorker + - graphs.disagg:Frontend - --system-app-port - "5000" - --enable-system-app @@ -77,7 +77,7 @@ spec: args: - dynamo - serve - - graphs.disagg:PrefillWorker + - graphs.disagg:Frontend - --system-app-port - "5000" - --enable-system-app diff --git a/examples/vllm_v0/deploy/disagg_planner.yaml b/examples/vllm_v0/deploy/disagg_planner.yaml index f272418f6f..649aa3f672 100644 --- a/examples/vllm_v0/deploy/disagg_planner.yaml +++ b/examples/vllm_v0/deploy/disagg_planner.yaml @@ -49,7 +49,7 @@ spec: args: - dynamo - serve - - graphs.disagg_planner:VllmWorker + - graphs.disagg_planner:Frontend - --system-app-port - "5000" - --enable-system-app @@ -77,7 +77,7 @@ spec: args: - dynamo - serve - - graphs.disagg_planner:PrefillWorker + - graphs.disagg_planner:Frontend - --system-app-port - "5000" - --enable-system-app @@ -89,6 +89,7 @@ spec: Planner: replicas: 1 + componentType: planner resources: requests: cpu: "1" @@ -103,7 +104,7 @@ spec: args: - dynamo - serve - - graphs.disagg_planner:Planner + - graphs.disagg_planner:Frontend - --system-app-port - "5000" - --enable-system-app @@ -112,6 +113,7 @@ spec: - Planner - -f - ./configs/disagg_planner.yaml + - --Planner.environment=kubernetes Prometheus: replicas: 1 @@ -129,7 +131,7 @@ spec: args: - dynamo - serve - - graphs.disagg_planner:Prometheus + - graphs.disagg_planner:Frontend - --system-app-port - "5000" - --enable-system-app @@ -137,4 +139,4 @@ spec: - --service-name - Prometheus - -f - - ./configs/disagg_planner.yaml \ No newline at end of file + - ./configs/disagg_planner.yaml \ No newline at end of file From 1a3e9ad9b77b9212925265fd84624916e62bf31b Mon Sep 17 00:00:00 2001 From: mohammedabdulwahhab Date: Thu, 3 Jul 2025 11:30:09 -0700 Subject: [PATCH 03/14] Delete examples/llm/crd.yaml Signed-off-by: mohammedabdulwahhab --- examples/llm/crd.yaml | 81 ------------------------------------------- 1 file changed, 81 deletions(-) delete mode 100644 examples/llm/crd.yaml diff --git a/examples/llm/crd.yaml b/examples/llm/crd.yaml deleted file mode 100644 index 965515c9de..0000000000 --- a/examples/llm/crd.yaml +++ /dev/null @@ -1,81 +0,0 @@ -apiVersion: nvidia.com/v1alpha1 -kind: DynamoGraphDeployment -metadata: - name: agg -spec: - envs: - services: - Frontend: - dynamoNamespace: inference - componentType: main - replicas: 1 - resources: - requests: - cpu: 100m - memory: 100Mi - limits: - cpu: 1000m - memory: 1000Mi - extraPodSpec: - mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 - command: - - dynamo - - serve - - graphs.agg:Frontend - - -f - - ./configs/agg.yaml - Middle: - dynamoNamespace: inference - replicas: 1 - extraPodSpec: - mainContainer: - image: gitlab-master.nvidia.com:5005/aire/microservices/compoundai/dynamo-pipelines:dynamo.existing.444f19b8 - command: - - sh - - -c - args: - - cd src && uv run dynamo serve --system-app-port 5000 --enable-system-app --use-default-health-checks --service-name Middle hello_world:Frontend --Backend.ServiceArgs.dynamo.namespace=inference - Backend: - dynamoNamespace: inference - envs: - - name: SOME_ENV - value: ‘somevalue’ - replicas: 1 - resources: - requests: - cpu: 100m - memory: 100Mi - limits: - cpu: 1000m - memory: 1000Mi - extraPodSpec: - mainContainer: - image: gitlab-master.nvidia.com:5005/aire/microservices/compoundai/dynamo-pipelines:dynamo.existing.444f19b8 - command: - - sh - - -c - args: - - cd src && uv run dynamo serve --system-app-port 5000 --enable-system-app --use-default-health-checks --service-name Middle hello_world:Frontend --Backend.ServiceArgs.dynamo.namespace=inference - Planner: - dynamoNamespace: inference - envs: - - name: SOME_ENV - value: ‘somevalue’ - replicas: 1 - componentType: planner - resources: - requests: - cpu: 100m - memory: 100Mi - limits: - cpu: 1000m - memory: 1000Mi - extraPodSpec: - mainContainer: - image: gitlab-master.nvidia.com:5005/aire/microservices/compoundai/dynamo-pipelines:planner - command: - - sh - - -c - args: - - cd src && uv run dynamo serve --system-app-port 5000 --enable-system-app --service-name Planner graphs.disagg_router:Frontend --Planner.ServiceArgs.dynamo.namespace=inference --Planner.environment=kubernetes From 93859d6c8021fcae5b8f3dcd26c9a6d72bb92987 Mon Sep 17 00:00:00 2001 From: mohammedabdulwahhab Date: Thu, 3 Jul 2025 13:44:26 -0700 Subject: [PATCH 04/14] fix: fix --- examples/vllm_v0/deploy/agg.yaml | 114 ++++++++++++++++--------------- 1 file changed, 60 insertions(+), 54 deletions(-) diff --git a/examples/vllm_v0/deploy/agg.yaml b/examples/vllm_v0/deploy/agg.yaml index eb21051ea3..7993c066aa 100644 --- a/examples/vllm_v0/deploy/agg.yaml +++ b/examples/vllm_v0/deploy/agg.yaml @@ -3,57 +3,63 @@ kind: DynamoGraphDeployment metadata: name: agg spec: - Frontend: - dynamoNamespace: inference - componentType: main - replicas: 1 - resources: - requests: - cpu: "1" - memory: "2Gi" - limits: - cpu: "1" - memory: "2Gi" - extraPodSpec: - mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 - workingDir: /workspace/examples/vllm_v0 - args: - - dynamo - - serve - - graphs.agg:Frontend - - --system-app-port - - "5000" - - --enable-system-app - - --use-default-health-checks - - --service-name - - Frontend - - -f - - ./configs/agg.yaml - VllmWorker: - replicas: 1 - resources: - requests: - cpu: "10" - memory: "20Gi" - gpu: "1" - limits: - cpu: "10" - memory: "20Gi" - gpu: "1" - extraPodSpec: - mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 - workingDir: /workspace/examples/vllm_v0 - args: - - dynamo - - serve - - graphs.agg:Frontend - - --system-app-port - - "5000" - - --enable-system-app - - --use-default-health-checks - - --service-name - - VllmWorker - - -f - - ./configs/agg.yaml \ No newline at end of file + envs: + - name: DYN_DEPLOYMENT_CONFIG + value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len"]}}' + services: + Frontend: + dynamoNamespace: dynamo + componentType: main + replicas: 1 + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + cpu: "1" + memory: "2Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + args: + - cd + - /workspace/examples/vllm_v0 + - "&&" + - dynamo + - serve + - graphs.agg:Frontend + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - Frontend + VllmWorker: + envFromSecret: hf-token-secret + dynamoNamespace: dynamo + replicas: 1 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + args: + - cd + - /workspace/examples/vllm_v0 + - "&&" + - dynamo + - serve + - graphs.agg:VllmWorker + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - VllmWorker \ No newline at end of file From ec912faa33d5692ee6ba36bdacfc2717f9072264 Mon Sep 17 00:00:00 2001 From: Hannah Zhang Date: Thu, 3 Jul 2025 15:31:14 -0700 Subject: [PATCH 05/14] feat: add crds for llm example, no planner --- examples/llm/deploy/agg.yaml | 92 +++++++++++++++ examples/llm/deploy/agg_router.yaml | 119 ++++++++++++++++++++ examples/llm/deploy/disagg.yaml | 121 ++++++++++++++++++++ examples/llm/deploy/disagg_router.yaml | 148 +++++++++++++++++++++++++ 4 files changed, 480 insertions(+) create mode 100644 examples/llm/deploy/agg.yaml create mode 100644 examples/llm/deploy/agg_router.yaml create mode 100644 examples/llm/deploy/disagg.yaml create mode 100644 examples/llm/deploy/disagg_router.yaml diff --git a/examples/llm/deploy/agg.yaml b/examples/llm/deploy/agg.yaml new file mode 100644 index 0000000000..a515d87d57 --- /dev/null +++ b/examples/llm/deploy/agg.yaml @@ -0,0 +1,92 @@ +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: llm-agg +spec: + envs: + - name: DYN_DEPLOYMENT_CONFIG + value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"router":"round-robin","router-num-threads":4,"common-configs":["model","block-size","max-model-len"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len"]},"Planner":{"environment":"kubernetes","no-operation":true}}' + services: + Frontend: + dynamoNamespace: llm-agg + componentType: main + replicas: 1 + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + cpu: "1" + memory: "2Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + args: + - cd + - /workspace/examples/llm + - "&&" + - dynamo + - serve + - graphs.agg:Frontend + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - Frontend + Processor: + dynamoNamespace: llm-agg + componentType: worker + replicas: 1 + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + cpu: "1" + memory: "2Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + args: + - cd + - /workspace/examples/llm + - "&&" + - dynamo + - serve + - graphs.agg:Processor + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - Processor + VllmWorker: + envFromSecret: hf-token-secret + dynamoNamespace: llm-agg + replicas: 1 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + args: + - cd + - /workspace/examples/llm + - "&&" + - dynamo + - serve + - graphs.agg:VllmWorker + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - VllmWorker diff --git a/examples/llm/deploy/agg_router.yaml b/examples/llm/deploy/agg_router.yaml new file mode 100644 index 0000000000..a2d5ceed65 --- /dev/null +++ b/examples/llm/deploy/agg_router.yaml @@ -0,0 +1,119 @@ +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: agg-router +spec: + envs: + - name: DYN_DEPLOYMENT_CONFIG + value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","router":"kv","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"common-configs":["model","block-size","max-model-len","router"]},"Router":{"min-workers":1,"softmax-sample":true,"common-configs":["model","block-size","router"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"tensor-parallel-size":1,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","router","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}' + services: + Frontend: + dynamoNamespace: llm-agg-router + componentType: main + replicas: 1 + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + cpu: "1" + memory: "2Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + args: + - cd + - /workspace/examples/llm + - "&&" + - dynamo + - serve + - graphs.agg_router:Frontend + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - Frontend + Processor: + dynamoNamespace: llm-agg-router + componentType: worker + replicas: 1 + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + cpu: "1" + memory: "2Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + args: + - cd + - /workspace/examples/llm + - "&&" + - dynamo + - serve + - graphs.agg_router:Processor + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - Processor + Router: + dynamoNamespace: llm-agg-router + componentType: worker + replicas: 1 + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + cpu: "1" + memory: "2Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + args: + - cd + - /workspace/examples/llm + - "&&" + - dynamo + - serve + - graphs.agg_router:Router + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - Router + VllmWorker: + envFromSecret: hf-token-secret + dynamoNamespace: llm-agg-router + replicas: 1 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + args: + - cd + - /workspace/examples/llm + - "&&" + - dynamo + - serve + - graphs.agg_router:VllmWorker + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - VllmWorker diff --git a/examples/llm/deploy/disagg.yaml b/examples/llm/deploy/disagg.yaml new file mode 100644 index 0000000000..c9f6aab17f --- /dev/null +++ b/examples/llm/deploy/disagg.yaml @@ -0,0 +1,121 @@ +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: llm-disagg +spec: + envs: + - name: DYN_DEPLOYMENT_CONFIG + value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"router":"round-robin","common-configs":["model","block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}' + services: + Frontend: + dynamoNamespace: llm-disagg + componentType: main + replicas: 1 + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + cpu: "1" + memory: "2Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + args: + - cd + - /workspace/examples/llm + - "&&" + - dynamo + - serve + - graphs.disagg:Frontend + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - Frontend + Processor: + dynamoNamespace: llm-disagg + componentType: worker + replicas: 1 + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + cpu: "1" + memory: "2Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + args: + - cd + - /workspace/examples/llm + - "&&" + - dynamo + - serve + - graphs.disagg:Processor + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - Processor + VllmWorker: + envFromSecret: hf-token-secret + dynamoNamespace: llm-disagg + replicas: 1 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + args: + - cd + - /workspace/examples/llm + - "&&" + - dynamo + - serve + - graphs.disagg:VllmWorker + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - VllmWorker + PrefillWorker: + envFromSecret: hf-token-secret + dynamoNamespace: llm-disagg + replicas: 1 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + args: + - cd + - /workspace/examples/llm + - "&&" + - dynamo + - serve + - graphs.disagg:PrefillWorker + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - PrefillWorker diff --git a/examples/llm/deploy/disagg_router.yaml b/examples/llm/deploy/disagg_router.yaml new file mode 100644 index 0000000000..ddc5a6c519 --- /dev/null +++ b/examples/llm/deploy/disagg_router.yaml @@ -0,0 +1,148 @@ +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: disagg-router +spec: + envs: + - name: DYN_DEPLOYMENT_CONFIG + value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"router":"kv","kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"common-configs":["model","block-size","max-model-len","router"]},"Router":{"min-workers":1,"common-configs":["model","block-size","router"]},"VllmWorker":{"max-num-batched-tokens":16384,"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"tensor-parallel-size":1,"enable-prefix-caching":true,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","router","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}' + services: + Frontend: + dynamoNamespace: llm-disagg-router + componentType: main + replicas: 1 + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + cpu: "1" + memory: "2Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + args: + - cd + - /workspace/examples/llm + - "&&" + - dynamo + - serve + - graphs.disagg_router:Frontend + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - Frontend + Processor: + dynamoNamespace: llm-disagg-router + componentType: worker + replicas: 1 + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + cpu: "1" + memory: "2Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + args: + - cd + - /workspace/examples/llm + - "&&" + - dynamo + - serve + - graphs.disagg_router:Processor + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - Processor + Router: + dynamoNamespace: llm-disagg-router + componentType: worker + replicas: 1 + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + cpu: "1" + memory: "2Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + args: + - cd + - /workspace/examples/llm + - "&&" + - dynamo + - serve + - graphs.disagg_router:Router + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - Router + VllmWorker: + envFromSecret: hf-token-secret + dynamoNamespace: llm-disagg-router + replicas: 1 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + args: + - cd + - /workspace/examples/llm + - "&&" + - dynamo + - serve + - graphs.disagg_router:VllmWorker + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - VllmWorker + PrefillWorker: + envFromSecret: hf-token-secret + dynamoNamespace: llm-disagg-router + replicas: 1 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + args: + - cd + - /workspace/examples/llm + - "&&" + - dynamo + - serve + - graphs.disagg_router:PrefillWorker + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - PrefillWorker From 17a054fefbdf72fb6b9f0ae1bfefe35f115726f3 Mon Sep 17 00:00:00 2001 From: mohammedabdulwahhab Date: Thu, 3 Jul 2025 15:34:02 -0700 Subject: [PATCH 06/14] fix: fix --- examples/vllm_v0/deploy/disagg.yaml | 172 ++++++------ examples/vllm_v0/deploy/disagg_planner.yaml | 276 ++++++++++---------- 2 files changed, 232 insertions(+), 216 deletions(-) diff --git a/examples/vllm_v0/deploy/disagg.yaml b/examples/vllm_v0/deploy/disagg.yaml index 92252f6d52..45c90eb2cd 100644 --- a/examples/vllm_v0/deploy/disagg.yaml +++ b/examples/vllm_v0/deploy/disagg.yaml @@ -3,86 +3,92 @@ kind: DynamoGraphDeployment metadata: name: disagg spec: - Frontend: - dynamoNamespace: inference - componentType: main - replicas: 1 - resources: - requests: - cpu: "1" - memory: "2Gi" - limits: - cpu: "1" - memory: "2Gi" - extraPodSpec: - mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 - workingDir: /workspace/examples/vllm_v0 - args: - - dynamo - - serve - - graphs.disagg:Frontend - - --system-app-port - - "5000" - - --enable-system-app - - --use-default-health-checks - - --service-name - - Frontend - - -f - - ./configs/disagg.yaml - - VllmWorker: - replicas: 1 - resources: - requests: - cpu: "10" - memory: "20Gi" - gpu: "1" - limits: - cpu: "10" - memory: "20Gi" - gpu: "1" - extraPodSpec: - mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 - workingDir: /workspace/examples/vllm_v0 - args: - - dynamo - - serve - - graphs.disagg:Frontend - - --system-app-port - - "5000" - - --enable-system-app - - --use-default-health-checks - - --service-name - - VllmWorker - - -f - - ./configs/disagg.yaml - - PrefillWorker: - replicas: 1 - resources: - requests: - cpu: "10" - memory: "20Gi" - gpu: "1" - limits: - cpu: "10" - memory: "20Gi" - gpu: "1" - extraPodSpec: - mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 - workingDir: /workspace/examples/vllm_v0 - args: - - dynamo - - serve - - graphs.disagg:Frontend - - --system-app-port - - "5000" - - --enable-system-app - - --use-default-health-checks - - --service-name - - PrefillWorker - - -f - - ./configs/disagg.yaml \ No newline at end of file + envs: + - name: DYN_DEPLOYMENT_CONFIG + value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]}}' + services: + Frontend: + dynamoNamespace: dynamo + componentType: main + replicas: 1 + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + cpu: "1" + memory: "2Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + args: + - cd + - /workspace/examples/vllm_v0 + - "&&" + - dynamo + - serve + - graphs.disagg:Frontend + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - Frontend + VllmWorker: + dynamoNamespace: dynamo + envFromSecret: hf-token-secret + replicas: 1 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + args: + - cd + - /workspace/examples/vllm_v0 + - "&&" + - dynamo + - serve + - graphs.disagg:VllmWorker + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - VllmWorker + PrefillWorker: + dynamoNamespace: dynamo + envFromSecret: hf-token-secret + replicas: 1 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + args: + - cd + - /workspace/examples/vllm_v0 + - "&&" + - dynamo + - serve + - graphs.disagg:PrefillWorker + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - PrefillWorker \ No newline at end of file diff --git a/examples/vllm_v0/deploy/disagg_planner.yaml b/examples/vllm_v0/deploy/disagg_planner.yaml index 649aa3f672..04a6708073 100644 --- a/examples/vllm_v0/deploy/disagg_planner.yaml +++ b/examples/vllm_v0/deploy/disagg_planner.yaml @@ -3,140 +3,150 @@ kind: DynamoGraphDeployment metadata: name: disagg-planner spec: - Frontend: - dynamoNamespace: inference - componentType: main - replicas: 1 - resources: - requests: - cpu: "1" - memory: "2Gi" - limits: - cpu: "1" - memory: "2Gi" - extraPodSpec: - mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 - workingDir: /workspace/examples/vllm_v0 - args: - - dynamo - - serve - - graphs.disagg_planner:Frontend - - --system-app-port - - "5000" - - --enable-system-app - - --use-default-health-checks - - --service-name - - Frontend - - -f - - ./configs/disagg_planner.yaml + envs: + - name: DYN_DEPLOYMENT_CONFIG + value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["localhost:8000"]}]}]},"Planner":{"adjustment-interval":180,"profile-results-dir":"/workspace/examples/profiling_results","isl":3000,"osl":150,"ttft":0.5,"itl":0.05,"load-predictor":"arima"}}' + services: + Frontend: + dynamoNamespace: dynamo + componentType: main + replicas: 1 + resources: + requests: + cpu: "2" + memory: "4Gi" + limits: + cpu: "2" + memory: "4Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + args: + - cd + - /workspace/examples/vllm_v0 + - "&&" + - dynamo + - serve + - graphs.disagg_planner:Frontend + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - Frontend - VllmWorker: - replicas: 1 - resources: - requests: - cpu: "10" - memory: "20Gi" - gpu: "1" - limits: - cpu: "10" - memory: "20Gi" - gpu: "1" - extraPodSpec: - mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 - workingDir: /workspace/examples/vllm_v0 - args: - - dynamo - - serve - - graphs.disagg_planner:Frontend - - --system-app-port - - "5000" - - --enable-system-app - - --use-default-health-checks - - --service-name - - VllmWorker - - -f - - ./configs/disagg_planner.yaml + VllmWorker: + dynamoNamespace: dynamo + envFromSecret: hf-token-secret + replicas: 1 + resources: + requests: + cpu: "20" + memory: "40Gi" + gpu: "2" + limits: + cpu: "20" + memory: "40Gi" + gpu: "2" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + args: + - cd + - /workspace/examples/vllm_v0 + - "&&" + - dynamo + - serve + - graphs.disagg_planner:Frontend + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - VllmWorker - PrefillWorker: - replicas: 1 - resources: - requests: - cpu: "10" - memory: "20Gi" - gpu: "1" - limits: - cpu: "10" - memory: "20Gi" - gpu: "1" - extraPodSpec: - mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 - workingDir: /workspace/examples/vllm_v0 - args: - - dynamo - - serve - - graphs.disagg_planner:Frontend - - --system-app-port - - "5000" - - --enable-system-app - - --use-default-health-checks - - --service-name - - PrefillWorker - - -f - - ./configs/disagg_planner.yaml + PrefillWorker: + dynamoNamespace: dynamo + envFromSecret: hf-token-secret + replicas: 1 + resources: + requests: + cpu: "20" + memory: "40Gi" + gpu: "2" + limits: + cpu: "20" + memory: "40Gi" + gpu: "2" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + args: + - cd + - /workspace/examples/vllm_v0 + - "&&" + - dynamo + - serve + - graphs.disagg_planner:Frontend + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - PrefillWorker - Planner: - replicas: 1 - componentType: planner - resources: - requests: - cpu: "1" - memory: "1Gi" - limits: - cpu: "1" - memory: "1Gi" - extraPodSpec: - mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 - workingDir: /workspace/examples/vllm_v0 - args: - - dynamo - - serve - - graphs.disagg_planner:Frontend - - --system-app-port - - "5000" - - --enable-system-app - - --use-default-health-checks - - --service-name - - Planner - - -f - - ./configs/disagg_planner.yaml - - --Planner.environment=kubernetes + Planner: + dynamoNamespace: dynamo + replicas: 1 + componentType: planner + resources: + requests: + cpu: "2" + memory: "2Gi" + limits: + cpu: "2" + memory: "2Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + args: + - cd + - /workspace/examples/vllm_v0 + - "&&" + - dynamo + - serve + - graphs.disagg_planner:Planner + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - Planner + - --Planner.environment=kubernetes - Prometheus: - replicas: 1 - resources: - requests: - cpu: "500m" - memory: "500Mi" - limits: - cpu: "500m" - memory: "500Mi" - extraPodSpec: - mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 - workingDir: /workspace/examples/vllm_v0 - args: - - dynamo - - serve - - graphs.disagg_planner:Frontend - - --system-app-port - - "5000" - - --enable-system-app - - --use-default-health-checks - - --service-name - - Prometheus - - -f - - ./configs/disagg_planner.yaml \ No newline at end of file + Prometheus: + dynamoNamespace: dynamo + replicas: 1 + resources: + requests: + cpu: "1000m" + memory: "1000Mi" + limits: + cpu: "1000m" + memory: "1000Mi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + args: + - cd + - /workspace/examples/vllm_v0 + - "&&" + - dynamo + - serve + - graphs.disagg_planner:Frontend + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - Prometheus \ No newline at end of file From 1a09de800d531f28a114c03bee12d0e60cc0825d Mon Sep 17 00:00:00 2001 From: Hannah Zhang Date: Thu, 3 Jul 2025 15:34:24 -0700 Subject: [PATCH 07/14] feat: update namespaces --- examples/vllm_v0/deploy/agg.yaml | 6 +++--- examples/vllm_v0/deploy/disagg.yaml | 8 ++++---- examples/vllm_v0/deploy/disagg_planner.yaml | 12 ++++++------ 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/vllm_v0/deploy/agg.yaml b/examples/vllm_v0/deploy/agg.yaml index 7993c066aa..984244821b 100644 --- a/examples/vllm_v0/deploy/agg.yaml +++ b/examples/vllm_v0/deploy/agg.yaml @@ -8,7 +8,7 @@ spec: value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len"]}}' services: Frontend: - dynamoNamespace: dynamo + dynamoNamespace: vllm-v0-agg componentType: main replicas: 1 resources: @@ -36,7 +36,7 @@ spec: - Frontend VllmWorker: envFromSecret: hf-token-secret - dynamoNamespace: dynamo + dynamoNamespace: vllm-v0-agg replicas: 1 resources: requests: @@ -62,4 +62,4 @@ spec: - --enable-system-app - --use-default-health-checks - --service-name - - VllmWorker \ No newline at end of file + - VllmWorker diff --git a/examples/vllm_v0/deploy/disagg.yaml b/examples/vllm_v0/deploy/disagg.yaml index 45c90eb2cd..eb018e076a 100644 --- a/examples/vllm_v0/deploy/disagg.yaml +++ b/examples/vllm_v0/deploy/disagg.yaml @@ -8,7 +8,7 @@ spec: value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]}}' services: Frontend: - dynamoNamespace: dynamo + dynamoNamespace: vllm-v0-disagg componentType: main replicas: 1 resources: @@ -35,7 +35,7 @@ spec: - --service-name - Frontend VllmWorker: - dynamoNamespace: dynamo + dynamoNamespace: vllm-v0-disagg envFromSecret: hf-token-secret replicas: 1 resources: @@ -64,7 +64,7 @@ spec: - --service-name - VllmWorker PrefillWorker: - dynamoNamespace: dynamo + dynamoNamespace: vllm-v0-disagg envFromSecret: hf-token-secret replicas: 1 resources: @@ -91,4 +91,4 @@ spec: - --enable-system-app - --use-default-health-checks - --service-name - - PrefillWorker \ No newline at end of file + - PrefillWorker diff --git a/examples/vllm_v0/deploy/disagg_planner.yaml b/examples/vllm_v0/deploy/disagg_planner.yaml index 04a6708073..174b98af8b 100644 --- a/examples/vllm_v0/deploy/disagg_planner.yaml +++ b/examples/vllm_v0/deploy/disagg_planner.yaml @@ -8,7 +8,7 @@ spec: value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["localhost:8000"]}]}]},"Planner":{"adjustment-interval":180,"profile-results-dir":"/workspace/examples/profiling_results","isl":3000,"osl":150,"ttft":0.5,"itl":0.05,"load-predictor":"arima"}}' services: Frontend: - dynamoNamespace: dynamo + dynamoNamespace: vllm-v0-disagg-planner componentType: main replicas: 1 resources: @@ -36,7 +36,7 @@ spec: - Frontend VllmWorker: - dynamoNamespace: dynamo + dynamoNamespace: vllm-v0-disagg-planner envFromSecret: hf-token-secret replicas: 1 resources: @@ -66,7 +66,7 @@ spec: - VllmWorker PrefillWorker: - dynamoNamespace: dynamo + dynamoNamespace: vllm-v0-disagg-planner envFromSecret: hf-token-secret replicas: 1 resources: @@ -96,7 +96,7 @@ spec: - PrefillWorker Planner: - dynamoNamespace: dynamo + dynamoNamespace: vllm-v0-disagg-planner replicas: 1 componentType: planner resources: @@ -125,7 +125,7 @@ spec: - --Planner.environment=kubernetes Prometheus: - dynamoNamespace: dynamo + dynamoNamespace: vllm-v0-disagg-planner replicas: 1 resources: requests: @@ -149,4 +149,4 @@ spec: - --enable-system-app - --use-default-health-checks - --service-name - - Prometheus \ No newline at end of file + - Prometheus From 063dddbedf49437fceffb3a4cf27f3e41ffd4715 Mon Sep 17 00:00:00 2001 From: Hannah Zhang Date: Thu, 3 Jul 2025 15:39:23 -0700 Subject: [PATCH 08/14] feat: add non-working planner to llm agg --- examples/llm/deploy/agg.yaml | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/examples/llm/deploy/agg.yaml b/examples/llm/deploy/agg.yaml index a515d87d57..8e5262e088 100644 --- a/examples/llm/deploy/agg.yaml +++ b/examples/llm/deploy/agg.yaml @@ -90,3 +90,32 @@ spec: - --use-default-health-checks - --service-name - VllmWorker + + Planner: + dynamoNamespace: llm-agg + replicas: 1 + componentType: planner + resources: + requests: + cpu: "2" + memory: "2Gi" + limits: + cpu: "2" + memory: "2Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + args: + - cd + - /workspace/examples/llm + - "&&" + - dynamo + - serve + - graphs.agg:Planner + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - Planner + - --Planner.environment=kubernetes From 84677d2650600fc1d1d8d0dfcee75574be860909 Mon Sep 17 00:00:00 2001 From: Hannah Zhang Date: Mon, 7 Jul 2025 10:22:56 -0700 Subject: [PATCH 09/14] feat: use workingDir instead of cd --- examples/llm/deploy/agg.yaml | 41 ++------------------------ examples/llm/deploy/agg_router.yaml | 16 +++------- examples/llm/deploy/disagg.yaml | 16 +++------- examples/llm/deploy/disagg_router.yaml | 20 ++++--------- 4 files changed, 16 insertions(+), 77 deletions(-) diff --git a/examples/llm/deploy/agg.yaml b/examples/llm/deploy/agg.yaml index 8e5262e088..bfc9620857 100644 --- a/examples/llm/deploy/agg.yaml +++ b/examples/llm/deploy/agg.yaml @@ -21,10 +21,8 @@ spec: extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/llm args: - - cd - - /workspace/examples/llm - - "&&" - dynamo - serve - graphs.agg:Frontend @@ -48,10 +46,8 @@ spec: extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/llm args: - - cd - - /workspace/examples/llm - - "&&" - dynamo - serve - graphs.agg:Processor @@ -77,10 +73,8 @@ spec: extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/llm args: - - cd - - /workspace/examples/llm - - "&&" - dynamo - serve - graphs.agg:VllmWorker @@ -90,32 +84,3 @@ spec: - --use-default-health-checks - --service-name - VllmWorker - - Planner: - dynamoNamespace: llm-agg - replicas: 1 - componentType: planner - resources: - requests: - cpu: "2" - memory: "2Gi" - limits: - cpu: "2" - memory: "2Gi" - extraPodSpec: - mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 - args: - - cd - - /workspace/examples/llm - - "&&" - - dynamo - - serve - - graphs.agg:Planner - - --system-app-port - - "5000" - - --enable-system-app - - --use-default-health-checks - - --service-name - - Planner - - --Planner.environment=kubernetes diff --git a/examples/llm/deploy/agg_router.yaml b/examples/llm/deploy/agg_router.yaml index a2d5ceed65..d5c726fe34 100644 --- a/examples/llm/deploy/agg_router.yaml +++ b/examples/llm/deploy/agg_router.yaml @@ -21,10 +21,8 @@ spec: extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/llm args: - - cd - - /workspace/examples/llm - - "&&" - dynamo - serve - graphs.agg_router:Frontend @@ -48,10 +46,8 @@ spec: extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/llm args: - - cd - - /workspace/examples/llm - - "&&" - dynamo - serve - graphs.agg_router:Processor @@ -75,10 +71,8 @@ spec: extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/llm args: - - cd - - /workspace/examples/llm - - "&&" - dynamo - serve - graphs.agg_router:Router @@ -104,10 +98,8 @@ spec: extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/llm args: - - cd - - /workspace/examples/llm - - "&&" - dynamo - serve - graphs.agg_router:VllmWorker diff --git a/examples/llm/deploy/disagg.yaml b/examples/llm/deploy/disagg.yaml index c9f6aab17f..f666b10918 100644 --- a/examples/llm/deploy/disagg.yaml +++ b/examples/llm/deploy/disagg.yaml @@ -21,10 +21,8 @@ spec: extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/llm args: - - cd - - /workspace/examples/llm - - "&&" - dynamo - serve - graphs.disagg:Frontend @@ -48,10 +46,8 @@ spec: extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/llm args: - - cd - - /workspace/examples/llm - - "&&" - dynamo - serve - graphs.disagg:Processor @@ -77,10 +73,8 @@ spec: extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/llm args: - - cd - - /workspace/examples/llm - - "&&" - dynamo - serve - graphs.disagg:VllmWorker @@ -106,10 +100,8 @@ spec: extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/llm args: - - cd - - /workspace/examples/llm - - "&&" - dynamo - serve - graphs.disagg:PrefillWorker diff --git a/examples/llm/deploy/disagg_router.yaml b/examples/llm/deploy/disagg_router.yaml index ddc5a6c519..dbd1dfe832 100644 --- a/examples/llm/deploy/disagg_router.yaml +++ b/examples/llm/deploy/disagg_router.yaml @@ -21,10 +21,8 @@ spec: extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/llm args: - - cd - - /workspace/examples/llm - - "&&" - dynamo - serve - graphs.disagg_router:Frontend @@ -48,10 +46,8 @@ spec: extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/llm args: - - cd - - /workspace/examples/llm - - "&&" - dynamo - serve - graphs.disagg_router:Processor @@ -75,10 +71,8 @@ spec: extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/llm args: - - cd - - /workspace/examples/llm - - "&&" - dynamo - serve - graphs.disagg_router:Router @@ -104,10 +98,8 @@ spec: extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/llm args: - - cd - - /workspace/examples/llm - - "&&" - dynamo - serve - graphs.disagg_router:VllmWorker @@ -133,10 +125,8 @@ spec: extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/llm args: - - cd - - /workspace/examples/llm - - "&&" - dynamo - serve - graphs.disagg_router:PrefillWorker From 7747765e0a9935b3ad0cb15a0d3f0545d7c9a1d0 Mon Sep 17 00:00:00 2001 From: Hannah Zhang Date: Mon, 7 Jul 2025 10:24:44 -0700 Subject: [PATCH 10/14] feat: use workingDir instead of cd --- examples/vllm_v0/deploy/agg.yaml | 8 ++------ examples/vllm_v0/deploy/disagg.yaml | 12 +++--------- examples/vllm_v0/deploy/disagg_planner.yaml | 20 +++++--------------- 3 files changed, 10 insertions(+), 30 deletions(-) diff --git a/examples/vllm_v0/deploy/agg.yaml b/examples/vllm_v0/deploy/agg.yaml index 984244821b..62e1325982 100644 --- a/examples/vllm_v0/deploy/agg.yaml +++ b/examples/vllm_v0/deploy/agg.yaml @@ -21,10 +21,8 @@ spec: extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/vllm_v0 args: - - cd - - /workspace/examples/vllm_v0 - - "&&" - dynamo - serve - graphs.agg:Frontend @@ -50,10 +48,8 @@ spec: extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/vllm_v0 args: - - cd - - /workspace/examples/vllm_v0 - - "&&" - dynamo - serve - graphs.agg:VllmWorker diff --git a/examples/vllm_v0/deploy/disagg.yaml b/examples/vllm_v0/deploy/disagg.yaml index eb018e076a..a7a2b4738a 100644 --- a/examples/vllm_v0/deploy/disagg.yaml +++ b/examples/vllm_v0/deploy/disagg.yaml @@ -21,10 +21,8 @@ spec: extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/vllm_v0 args: - - cd - - /workspace/examples/vllm_v0 - - "&&" - dynamo - serve - graphs.disagg:Frontend @@ -50,10 +48,8 @@ spec: extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/vllm_v0 args: - - cd - - /workspace/examples/vllm_v0 - - "&&" - dynamo - serve - graphs.disagg:VllmWorker @@ -79,10 +75,8 @@ spec: extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/vllm_v0 args: - - cd - - /workspace/examples/vllm_v0 - - "&&" - dynamo - serve - graphs.disagg:PrefillWorker diff --git a/examples/vllm_v0/deploy/disagg_planner.yaml b/examples/vllm_v0/deploy/disagg_planner.yaml index 174b98af8b..7cccf70782 100644 --- a/examples/vllm_v0/deploy/disagg_planner.yaml +++ b/examples/vllm_v0/deploy/disagg_planner.yaml @@ -21,10 +21,8 @@ spec: extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/vllm_v0 args: - - cd - - /workspace/examples/vllm_v0 - - "&&" - dynamo - serve - graphs.disagg_planner:Frontend @@ -51,10 +49,8 @@ spec: extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/vllm_v0 args: - - cd - - /workspace/examples/vllm_v0 - - "&&" - dynamo - serve - graphs.disagg_planner:Frontend @@ -81,10 +77,8 @@ spec: extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/vllm_v0 args: - - cd - - /workspace/examples/vllm_v0 - - "&&" - dynamo - serve - graphs.disagg_planner:Frontend @@ -109,10 +103,8 @@ spec: extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/vllm_v0 args: - - cd - - /workspace/examples/vllm_v0 - - "&&" - dynamo - serve - graphs.disagg_planner:Planner @@ -137,10 +129,8 @@ spec: extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + workingDir: /workspace/examples/vllm_v0 args: - - cd - - /workspace/examples/vllm_v0 - - "&&" - dynamo - serve - graphs.disagg_planner:Frontend From 9b99cd6c900cd6c7f5df4d090d32b197fa01302c Mon Sep 17 00:00:00 2001 From: Hannah Zhang Date: Mon, 7 Jul 2025 12:52:58 -0700 Subject: [PATCH 11/14] feat: coderabbit-requested changes, add copyright headers --- examples/llm/deploy/agg.yaml | 18 +++++++++++++++-- examples/llm/deploy/agg_router.yaml | 18 +++++++++++++++-- examples/llm/deploy/disagg.yaml | 22 +++++++++++++++++---- examples/llm/deploy/disagg_router.yaml | 22 +++++++++++++++++---- examples/vllm_v0/deploy/agg.yaml | 18 +++++++++++++++-- examples/vllm_v0/deploy/disagg.yaml | 22 +++++++++++++++++---- examples/vllm_v0/deploy/disagg_planner.yaml | 22 +++++++++++++++++---- 7 files changed, 120 insertions(+), 22 deletions(-) diff --git a/examples/llm/deploy/agg.yaml b/examples/llm/deploy/agg.yaml index bfc9620857..242c0e470f 100644 --- a/examples/llm/deploy/agg.yaml +++ b/examples/llm/deploy/agg.yaml @@ -1,3 +1,17 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeployment metadata: @@ -65,11 +79,11 @@ spec: requests: cpu: "10" memory: "20Gi" - gpu: "1" + nvidia.com/gpu: "1" limits: cpu: "10" memory: "20Gi" - gpu: "1" + nvidia.com/gpu: "1" extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 diff --git a/examples/llm/deploy/agg_router.yaml b/examples/llm/deploy/agg_router.yaml index d5c726fe34..94218ddb53 100644 --- a/examples/llm/deploy/agg_router.yaml +++ b/examples/llm/deploy/agg_router.yaml @@ -1,3 +1,17 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeployment metadata: @@ -90,11 +104,11 @@ spec: requests: cpu: "10" memory: "20Gi" - gpu: "1" + nvidia.com/gpu: "1" limits: cpu: "10" memory: "20Gi" - gpu: "1" + nvidia.com/gpu: "1" extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 diff --git a/examples/llm/deploy/disagg.yaml b/examples/llm/deploy/disagg.yaml index f666b10918..87a76b7b32 100644 --- a/examples/llm/deploy/disagg.yaml +++ b/examples/llm/deploy/disagg.yaml @@ -1,3 +1,17 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeployment metadata: @@ -65,11 +79,11 @@ spec: requests: cpu: "10" memory: "20Gi" - gpu: "1" + nvidia.com/gpu: "1" limits: cpu: "10" memory: "20Gi" - gpu: "1" + nvidia.com/gpu: "1" extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 @@ -92,11 +106,11 @@ spec: requests: cpu: "10" memory: "20Gi" - gpu: "1" + nvidia.com/gpu: "1" limits: cpu: "10" memory: "20Gi" - gpu: "1" + nvidia.com/gpu: "1" extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 diff --git a/examples/llm/deploy/disagg_router.yaml b/examples/llm/deploy/disagg_router.yaml index dbd1dfe832..c91b36aba5 100644 --- a/examples/llm/deploy/disagg_router.yaml +++ b/examples/llm/deploy/disagg_router.yaml @@ -1,3 +1,17 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeployment metadata: @@ -90,11 +104,11 @@ spec: requests: cpu: "10" memory: "20Gi" - gpu: "1" + nvidia.com/gpu: "1" limits: cpu: "10" memory: "20Gi" - gpu: "1" + nvidia.com/gpu: "1" extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 @@ -117,11 +131,11 @@ spec: requests: cpu: "10" memory: "20Gi" - gpu: "1" + nvidia.com/gpu: "1" limits: cpu: "10" memory: "20Gi" - gpu: "1" + nvidia.com/gpu: "1" extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 diff --git a/examples/vllm_v0/deploy/agg.yaml b/examples/vllm_v0/deploy/agg.yaml index 62e1325982..6c6b04e1ff 100644 --- a/examples/vllm_v0/deploy/agg.yaml +++ b/examples/vllm_v0/deploy/agg.yaml @@ -1,3 +1,17 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeployment metadata: @@ -40,11 +54,11 @@ spec: requests: cpu: "10" memory: "20Gi" - gpu: "1" + nvidia.com/gpu: "1" limits: cpu: "10" memory: "20Gi" - gpu: "1" + nvidia.com/gpu: "1" extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 diff --git a/examples/vllm_v0/deploy/disagg.yaml b/examples/vllm_v0/deploy/disagg.yaml index a7a2b4738a..22712e2527 100644 --- a/examples/vllm_v0/deploy/disagg.yaml +++ b/examples/vllm_v0/deploy/disagg.yaml @@ -1,3 +1,17 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeployment metadata: @@ -40,11 +54,11 @@ spec: requests: cpu: "10" memory: "20Gi" - gpu: "1" + nvidia.com/gpu: "1" limits: cpu: "10" memory: "20Gi" - gpu: "1" + nvidia.com/gpu: "1" extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 @@ -67,11 +81,11 @@ spec: requests: cpu: "10" memory: "20Gi" - gpu: "1" + nvidia.com/gpu: "1" limits: cpu: "10" memory: "20Gi" - gpu: "1" + nvidia.com/gpu: "1" extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 diff --git a/examples/vllm_v0/deploy/disagg_planner.yaml b/examples/vllm_v0/deploy/disagg_planner.yaml index 7cccf70782..de6ddf7029 100644 --- a/examples/vllm_v0/deploy/disagg_planner.yaml +++ b/examples/vllm_v0/deploy/disagg_planner.yaml @@ -1,3 +1,17 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeployment metadata: @@ -41,11 +55,11 @@ spec: requests: cpu: "20" memory: "40Gi" - gpu: "2" + nvidia.com/gpu: "2" limits: cpu: "20" memory: "40Gi" - gpu: "2" + nvidia.com/gpu: "2" extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 @@ -69,11 +83,11 @@ spec: requests: cpu: "20" memory: "40Gi" - gpu: "2" + nvidia.com/gpu: "2" limits: cpu: "20" memory: "40Gi" - gpu: "2" + nvidia.com/gpu: "2" extraPodSpec: mainContainer: image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 From f7a1f53449f4f8383b33025d82629e94e0a28239 Mon Sep 17 00:00:00 2001 From: Hannah Zhang Date: Mon, 7 Jul 2025 12:54:57 -0700 Subject: [PATCH 12/14] fix: disagg_planner crd serve --- examples/vllm_v0/deploy/disagg_planner.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/vllm_v0/deploy/disagg_planner.yaml b/examples/vllm_v0/deploy/disagg_planner.yaml index de6ddf7029..8386efc91a 100644 --- a/examples/vllm_v0/deploy/disagg_planner.yaml +++ b/examples/vllm_v0/deploy/disagg_planner.yaml @@ -67,7 +67,7 @@ spec: args: - dynamo - serve - - graphs.disagg_planner:Frontend + - graphs.disagg_planner:VllmWorker - --system-app-port - "5000" - --enable-system-app @@ -95,7 +95,7 @@ spec: args: - dynamo - serve - - graphs.disagg_planner:Frontend + - graphs.disagg_planner:PrefillWorker - --system-app-port - "5000" - --enable-system-app @@ -147,7 +147,7 @@ spec: args: - dynamo - serve - - graphs.disagg_planner:Frontend + - graphs.disagg_planner:Prometheus - --system-app-port - "5000" - --enable-system-app From cce31485e2db099b4fd5e3d704cc70ece17cc77d Mon Sep 17 00:00:00 2001 From: Hannah Zhang Date: Mon, 7 Jul 2025 15:56:32 -0700 Subject: [PATCH 13/14] feat: use latest instead of hardcoding vllm runtime image --- examples/llm/deploy/agg.yaml | 6 +++--- examples/llm/deploy/agg_router.yaml | 8 ++++---- examples/llm/deploy/disagg.yaml | 8 ++++---- examples/llm/deploy/disagg_router.yaml | 10 +++++----- examples/vllm_v0/deploy/agg.yaml | 4 ++-- examples/vllm_v0/deploy/disagg.yaml | 6 +++--- examples/vllm_v0/deploy/disagg_planner.yaml | 10 +++++----- 7 files changed, 26 insertions(+), 26 deletions(-) diff --git a/examples/llm/deploy/agg.yaml b/examples/llm/deploy/agg.yaml index 242c0e470f..c277e974c6 100644 --- a/examples/llm/deploy/agg.yaml +++ b/examples/llm/deploy/agg.yaml @@ -34,7 +34,7 @@ spec: memory: "2Gi" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest workingDir: /workspace/examples/llm args: - dynamo @@ -59,7 +59,7 @@ spec: memory: "2Gi" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest workingDir: /workspace/examples/llm args: - dynamo @@ -86,7 +86,7 @@ spec: nvidia.com/gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest workingDir: /workspace/examples/llm args: - dynamo diff --git a/examples/llm/deploy/agg_router.yaml b/examples/llm/deploy/agg_router.yaml index 94218ddb53..fa40fe2e31 100644 --- a/examples/llm/deploy/agg_router.yaml +++ b/examples/llm/deploy/agg_router.yaml @@ -34,7 +34,7 @@ spec: memory: "2Gi" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest workingDir: /workspace/examples/llm args: - dynamo @@ -59,7 +59,7 @@ spec: memory: "2Gi" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest workingDir: /workspace/examples/llm args: - dynamo @@ -84,7 +84,7 @@ spec: memory: "2Gi" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest workingDir: /workspace/examples/llm args: - dynamo @@ -111,7 +111,7 @@ spec: nvidia.com/gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest workingDir: /workspace/examples/llm args: - dynamo diff --git a/examples/llm/deploy/disagg.yaml b/examples/llm/deploy/disagg.yaml index 87a76b7b32..d64089f5a2 100644 --- a/examples/llm/deploy/disagg.yaml +++ b/examples/llm/deploy/disagg.yaml @@ -34,7 +34,7 @@ spec: memory: "2Gi" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest workingDir: /workspace/examples/llm args: - dynamo @@ -59,7 +59,7 @@ spec: memory: "2Gi" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest workingDir: /workspace/examples/llm args: - dynamo @@ -86,7 +86,7 @@ spec: nvidia.com/gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest workingDir: /workspace/examples/llm args: - dynamo @@ -113,7 +113,7 @@ spec: nvidia.com/gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest workingDir: /workspace/examples/llm args: - dynamo diff --git a/examples/llm/deploy/disagg_router.yaml b/examples/llm/deploy/disagg_router.yaml index c91b36aba5..152d09f7ed 100644 --- a/examples/llm/deploy/disagg_router.yaml +++ b/examples/llm/deploy/disagg_router.yaml @@ -34,7 +34,7 @@ spec: memory: "2Gi" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest workingDir: /workspace/examples/llm args: - dynamo @@ -59,7 +59,7 @@ spec: memory: "2Gi" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest workingDir: /workspace/examples/llm args: - dynamo @@ -84,7 +84,7 @@ spec: memory: "2Gi" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest workingDir: /workspace/examples/llm args: - dynamo @@ -111,7 +111,7 @@ spec: nvidia.com/gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest workingDir: /workspace/examples/llm args: - dynamo @@ -138,7 +138,7 @@ spec: nvidia.com/gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest workingDir: /workspace/examples/llm args: - dynamo diff --git a/examples/vllm_v0/deploy/agg.yaml b/examples/vllm_v0/deploy/agg.yaml index 6c6b04e1ff..45af6f3cee 100644 --- a/examples/vllm_v0/deploy/agg.yaml +++ b/examples/vllm_v0/deploy/agg.yaml @@ -34,7 +34,7 @@ spec: memory: "2Gi" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest workingDir: /workspace/examples/vllm_v0 args: - dynamo @@ -61,7 +61,7 @@ spec: nvidia.com/gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest workingDir: /workspace/examples/vllm_v0 args: - dynamo diff --git a/examples/vllm_v0/deploy/disagg.yaml b/examples/vllm_v0/deploy/disagg.yaml index 22712e2527..c4f41342e8 100644 --- a/examples/vllm_v0/deploy/disagg.yaml +++ b/examples/vllm_v0/deploy/disagg.yaml @@ -34,7 +34,7 @@ spec: memory: "2Gi" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest workingDir: /workspace/examples/vllm_v0 args: - dynamo @@ -61,7 +61,7 @@ spec: nvidia.com/gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest workingDir: /workspace/examples/vllm_v0 args: - dynamo @@ -88,7 +88,7 @@ spec: nvidia.com/gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest workingDir: /workspace/examples/vllm_v0 args: - dynamo diff --git a/examples/vllm_v0/deploy/disagg_planner.yaml b/examples/vllm_v0/deploy/disagg_planner.yaml index 8386efc91a..082e3cb9bc 100644 --- a/examples/vllm_v0/deploy/disagg_planner.yaml +++ b/examples/vllm_v0/deploy/disagg_planner.yaml @@ -34,7 +34,7 @@ spec: memory: "4Gi" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest workingDir: /workspace/examples/vllm_v0 args: - dynamo @@ -62,7 +62,7 @@ spec: nvidia.com/gpu: "2" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest workingDir: /workspace/examples/vllm_v0 args: - dynamo @@ -90,7 +90,7 @@ spec: nvidia.com/gpu: "2" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest workingDir: /workspace/examples/vllm_v0 args: - dynamo @@ -116,7 +116,7 @@ spec: memory: "2Gi" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest workingDir: /workspace/examples/vllm_v0 args: - dynamo @@ -142,7 +142,7 @@ spec: memory: "1000Mi" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest workingDir: /workspace/examples/vllm_v0 args: - dynamo From 0fd6d1b7447a84b98e1f5375937622d4fd632ad6 Mon Sep 17 00:00:00 2001 From: Hannah Zhang Date: Mon, 7 Jul 2025 16:04:35 -0700 Subject: [PATCH 14/14] feat: add crds for vllm v1 examples to prepare for 0.3.2 release --- examples/vllm_v1/deploy/agg.yaml | 100 +++++++++++ examples/vllm_v1/deploy/disagg.yaml | 127 ++++++++++++++ examples/vllm_v1/deploy/disagg_planner.yaml | 182 ++++++++++++++++++++ 3 files changed, 409 insertions(+) create mode 100644 examples/vllm_v1/deploy/agg.yaml create mode 100644 examples/vllm_v1/deploy/disagg.yaml create mode 100644 examples/vllm_v1/deploy/disagg_planner.yaml diff --git a/examples/vllm_v1/deploy/agg.yaml b/examples/vllm_v1/deploy/agg.yaml new file mode 100644 index 0000000000..08dd5e22fc --- /dev/null +++ b/examples/vllm_v1/deploy/agg.yaml @@ -0,0 +1,100 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: agg +spec: + envs: + - name: DYN_DEPLOYMENT_CONFIG + value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len"]}}' + services: + Frontend: + dynamoNamespace: vllm-v1-agg + componentType: main + replicas: 1 + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + cpu: "1" + memory: "2Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest + workingDir: /workspace/examples/vllm_v1 + args: + - dynamo + - serve + - graphs.agg:Frontend + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - Frontend + SimpleLoadBalancer: + envFromSecret: hf-token-secret + dynamoNamespace: vllm-v1-agg + replicas: 1 + resources: + requests: + cpu: "1" + memory: "20Gi" + limits: + cpu: "1" + memory: "20Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest + workingDir: /workspace/examples/vllm_v1 + args: + - dynamo + - serve + - graphs.agg:SimpleLoadBalancer + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - SimpleLoadBalancer + VllmDecodeWorker: + envFromSecret: hf-token-secret + dynamoNamespace: vllm-v1-agg + replicas: 1 + resources: + requests: + cpu: "10" + memory: "20Gi" + nvidia.com/gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + nvidia.com/gpu: "1" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest + workingDir: /workspace/examples/vllm_v1 + args: + - dynamo + - serve + - graphs.agg:VllmDecodeWorker + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - VllmDecodeWorker diff --git a/examples/vllm_v1/deploy/disagg.yaml b/examples/vllm_v1/deploy/disagg.yaml new file mode 100644 index 0000000000..a85459f07d --- /dev/null +++ b/examples/vllm_v1/deploy/disagg.yaml @@ -0,0 +1,127 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: disagg +spec: + envs: + - name: DYN_DEPLOYMENT_CONFIG + value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]}}' + services: + Frontend: + dynamoNamespace: vllm-v1-disagg + componentType: main + replicas: 1 + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + cpu: "1" + memory: "2Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest + workingDir: /workspace/examples/vllm_v1 + args: + - dynamo + - serve + - graphs.disagg:Frontend + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - Frontend + SimpleLoadBalancer: + envFromSecret: hf-token-secret + dynamoNamespace: vllm-v1-disagg + replicas: 1 + resources: + requests: + cpu: "1" + memory: "20Gi" + limits: + cpu: "1" + memory: "20Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest + workingDir: /workspace/examples/vllm_v1 + args: + - dynamo + - serve + - graphs.disagg:SimpleLoadBalancer + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - SimpleLoadBalancer + VllmDecodeWorker: + dynamoNamespace: vllm-v1-disagg + envFromSecret: hf-token-secret + replicas: 1 + resources: + requests: + cpu: "10" + memory: "20Gi" + nvidia.com/gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + nvidia.com/gpu: "1" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest + workingDir: /workspace/examples/vllm_v1 + args: + - dynamo + - serve + - graphs.disagg:VllmDecodeWorker + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - VllmDecodeWorker + VllmPrefillWorker: + dynamoNamespace: vllm-v1-disagg + envFromSecret: hf-token-secret + replicas: 1 + resources: + requests: + cpu: "10" + memory: "20Gi" + nvidia.com/gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + nvidia.com/gpu: "1" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest + workingDir: /workspace/examples/vllm_v1 + args: + - dynamo + - serve + - graphs.disagg:VllmPrefillWorker + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - VllmPrefillWorker diff --git a/examples/vllm_v1/deploy/disagg_planner.yaml b/examples/vllm_v1/deploy/disagg_planner.yaml new file mode 100644 index 0000000000..bf73f35e44 --- /dev/null +++ b/examples/vllm_v1/deploy/disagg_planner.yaml @@ -0,0 +1,182 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: disagg-planner +spec: + envs: + - name: DYN_DEPLOYMENT_CONFIG + value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["localhost:8000"]}]}]},"Planner":{"adjustment-interval":180,"profile-results-dir":"/workspace/examples/profiling_results","isl":3000,"osl":150,"ttft":0.5,"itl":0.05,"load-predictor":"arima"}}' + services: + Frontend: + dynamoNamespace: vllm-v1-disagg-planner + componentType: main + replicas: 1 + resources: + requests: + cpu: "2" + memory: "4Gi" + limits: + cpu: "2" + memory: "4Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest + workingDir: /workspace/examples/vllm_v1 + args: + - dynamo + - serve + - graphs.disagg_planner:Frontend + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - Frontend + + SimpleLoadBalancer: + envFromSecret: hf-token-secret + dynamoNamespace: vllm-v1-disagg-planner + replicas: 1 + resources: + requests: + cpu: "1" + memory: "20Gi" + limits: + cpu: "1" + memory: "20Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest + workingDir: /workspace/examples/vllm_v1 + args: + - dynamo + - serve + - graphs.disagg_planner:SimpleLoadBalancer + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - SimpleLoadBalancer + + VllmDecodeWorker: + dynamoNamespace: vllm-v1-disagg-planner + envFromSecret: hf-token-secret + replicas: 1 + resources: + requests: + cpu: "20" + memory: "40Gi" + nvidia.com/gpu: "2" + limits: + cpu: "20" + memory: "40Gi" + nvidia.com/gpu: "2" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest + workingDir: /workspace/examples/vllm_v1 + args: + - dynamo + - serve + - graphs.disagg_planner:VllmDecodeWorker + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - VllmDecodeWorker + + VllmPrefillWorker: + dynamoNamespace: vllm-v1-disagg-planner + envFromSecret: hf-token-secret + replicas: 1 + resources: + requests: + cpu: "20" + memory: "40Gi" + nvidia.com/gpu: "2" + limits: + cpu: "20" + memory: "40Gi" + nvidia.com/gpu: "2" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest + workingDir: /workspace/examples/vllm_v1 + args: + - dynamo + - serve + - graphs.disagg_planner:VllmPrefillWorker + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - VllmPrefillWorker + + Planner: + dynamoNamespace: vllm-v1-disagg-planner + replicas: 1 + componentType: planner + resources: + requests: + cpu: "2" + memory: "2Gi" + limits: + cpu: "2" + memory: "2Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest + workingDir: /workspace/examples/vllm_v1 + args: + - dynamo + - serve + - graphs.disagg_planner:Planner + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - Planner + - --Planner.environment=kubernetes + + Prometheus: + dynamoNamespace: vllm-v1-disagg-planner + replicas: 1 + resources: + requests: + cpu: "1000m" + memory: "1000Mi" + limits: + cpu: "1000m" + memory: "1000Mi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest + workingDir: /workspace/examples/vllm_v1 + args: + - dynamo + - serve + - graphs.disagg_planner:Prometheus + - --system-app-port + - "5000" + - --enable-system-app + - --use-default-health-checks + - --service-name + - Prometheus