feat: update k8s deploy yamls to use binary/python3

ai-dynamo · hhzhang16 · Jul 15, 2025 · Jul 11, 2025 · Jul 12, 2025 · Jul 14, 2025
commit 065cb2ad0e0f7065417b9917192b5ca44df4dfb2
diff --git a/examples/vllm_v1/deploy/agg.yaml b/examples/vllm_v1/deploy/agg.yaml
@@ -31,47 +31,13 @@ spec:
           memory: "2Gi"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2
           workingDir: /workspace/examples/vllm_v1
           args:
             - dynamo
-            - serve
-            - graphs.agg:Frontend
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - Frontend
-            - -f
-            - ./configs/agg.yaml
-    SimpleLoadBalancer:
-      envFromSecret: hf-token-secret
-      dynamoNamespace: vllm-v1-agg
-      replicas: 1
-      resources:
-        requests:
-          cpu: "1"
-          memory: "20Gi"
-        limits:
-          cpu: "1"
-          memory: "20Gi"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/vllm_v1
-          args:
-            - dynamo
-            - serve
-            - graphs.agg:SimpleLoadBalancer
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - SimpleLoadBalancer
-            - -f
-            - ./configs/agg.yaml
+            - run
+            - in=http
+            - out=dyn
     VllmDecodeWorker:
       envFromSecret: hf-token-secret
       dynamoNamespace: vllm-v1-agg
@@ -87,17 +53,11 @@ spec:
           gpu: "1"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2
           workingDir: /workspace/examples/vllm_v1
           args:
-            - dynamo
-            - serve
-            - graphs.agg:VllmDecodeWorker
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - VllmDecodeWorker
-            - -f
-            - ./configs/agg.yaml
+            - python3
+            - components/main.py
+            - --model
+            - Qwen/Qwen3-0.6B
+            - --enforce-eager
diff --git a/examples/vllm_v1/deploy/agg_router.yaml b/examples/vllm_v1/deploy/agg_router.yaml
@@ -0,0 +1,76 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Aggregated vLLM example with routing: one HTTP Frontend placed in front of
+# two VllmDecodeWorker replicas, all in the vllm-v1-agg Dynamo namespace.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  # NOTE(review): confirm this name does not clash with the deployment from
+  # agg.yaml if both examples are applied to the same Kubernetes namespace.
+  name: agg
+spec:
+  services:
+    # Ingress service: starts `dynamo run` with HTTP input and `dyn` output.
+    Frontend:
+      dynamoNamespace: vllm-v1-agg
+      componentType: main
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2
+          workingDir: /workspace/examples/vllm_v1
+          args:
+            - dynamo
+            - run
+            - in=http
+            - out=dyn
+            # Enable KV-aware routing across the worker replicas. Without
+            # this flag the Frontend args are identical to agg.yaml and the
+            # "router" example selects no routing mode of its own.
+            - --router-mode
+            - kv
+    # Worker service: runs components/main.py directly with python3.
+    # Two replicas, each requesting one GPU.
+    VllmDecodeWorker:
+      envFromSecret: hf-token-secret
+      dynamoNamespace: vllm-v1-agg
+      replicas: 2
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2
+          workingDir: /workspace/examples/vllm_v1
+          args:
+            - python3
+            - components/main.py
+            - --model
+            - Qwen/Qwen3-0.6B
+            - --enforce-eager
diff --git a/examples/vllm_v1/deploy/disagg.yaml b/examples/vllm_v1/deploy/disagg.yaml
@@ -31,47 +31,13 @@ spec:
           memory: "2Gi"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2
           workingDir: /workspace/examples/vllm_v1
           args:
             - dynamo
-            - serve
-            - graphs.disagg:Frontend
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - Frontend
-            - -f
-            - ./configs/disagg.yaml
-    SimpleLoadBalancer:
-      envFromSecret: hf-token-secret
-      dynamoNamespace: vllm-v1-disagg
-      replicas: 1
-      resources:
-        requests:
-          cpu: "1"
-          memory: "20Gi"
-        limits:
-          cpu: "1"
-          memory: "20Gi"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          workingDir: /workspace/examples/vllm_v1
-          args:
-            - dynamo
-            - serve
-            - graphs.disagg:SimpleLoadBalancer
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - SimpleLoadBalancer
-            - -f
-            - ./configs/disagg.yaml
+            - run
+            - in=http
+            - out=dyn
     VllmDecodeWorker:
       dynamoNamespace: vllm-v1-disagg
       envFromSecret: hf-token-secret
@@ -87,20 +53,14 @@ spec:
           gpu: "1"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2
           workingDir: /workspace/examples/vllm_v1
           args:
-            - dynamo
-            - serve
-            - graphs.disagg:VllmDecodeWorker
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - VllmDecodeWorker
-            - -f
-            - ./configs/disagg.yaml
+            - python3
+            - components/main.py
+            - --model
+            - Qwen/Qwen3-0.6B
+            - --enforce-eager
     VllmPrefillWorker:
       dynamoNamespace: vllm-v1-disagg
       envFromSecret: hf-token-secret
@@ -116,17 +76,12 @@ spec:
           gpu: "1"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2
           workingDir: /workspace/examples/vllm_v1
           args:
-            - dynamo
-            - serve
-            - graphs.disagg:VllmPrefillWorker
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - VllmPrefillWorker
-            - -f
-            - ./configs/disagg.yaml
+            - python3
+            - components/main.py
+            - --model
+            - Qwen/Qwen3-0.6B
+            - --enforce-eager
+            - --is-prefill-worker
diff --git a/examples/vllm_v1/deploy/disagg_router.yaml b/examples/vllm_v1/deploy/disagg_router.yaml
@@ -0,0 +1,102 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Disaggregated vLLM example with routing: one HTTP Frontend placed in front
+# of two VllmDecodeWorker replicas and one VllmPrefillWorker, all in the
+# vllm-v1-disagg Dynamo namespace.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  # NOTE(review): confirm this name does not clash with the deployment from
+  # disagg.yaml if both examples are applied to the same Kubernetes namespace.
+  name: disagg
+spec:
+  services:
+    # Ingress service: starts `dynamo run` with HTTP input and `dyn` output.
+    Frontend:
+      dynamoNamespace: vllm-v1-disagg
+      componentType: main
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2
+          workingDir: /workspace/examples/vllm_v1
+          args:
+            - dynamo
+            - run
+            - in=http
+            - out=dyn
+            # Enable KV-aware routing across the worker replicas. Without
+            # this flag the Frontend args are identical to disagg.yaml and the
+            # "router" example selects no routing mode of its own.
+            - --router-mode
+            - kv
+    # Decode workers: components/main.py run with python3, two replicas.
+    VllmDecodeWorker:
+      dynamoNamespace: vllm-v1-disagg
+      envFromSecret: hf-token-secret
+      replicas: 2
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2
+          workingDir: /workspace/examples/vllm_v1
+          args:
+            - python3
+            - components/main.py
+            - --model
+            - Qwen/Qwen3-0.6B
+            - --enforce-eager
+    # Prefill worker: same launch command as the decode worker, with
+    # --is-prefill-worker appended; a single replica.
+    VllmPrefillWorker:
+      dynamoNamespace: vllm-v1-disagg
+      envFromSecret: hf-token-secret
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2
+          workingDir: /workspace/examples/vllm_v1
+          args:
+            - python3
+            - components/main.py
+            - --model
+            - Qwen/Qwen3-0.6B
+            - --enforce-eager
+            - --is-prefill-worker