From 7ac97b014f86a753fa30a14d77904316c6a832a5 Mon Sep 17 00:00:00 2001
From: mohammedabdulwahhab <furkhan324@berkeley.edu>
Date: Thu, 3 Jul 2025 10:46:32 -0700
Subject: [PATCH 01/14] feat: add DynamoGraphDeployment manifests for vllm_v0 examples

---
 examples/vllm_v0/deploy/agg.yaml            |  59 +++++++++
 examples/vllm_v0/deploy/disagg.yaml         |  88 ++++++++++++
 examples/vllm_v0/deploy/disagg_planner.yaml | 140 ++++++++++++++++++++
 3 files changed, 287 insertions(+)
 create mode 100644 examples/vllm_v0/deploy/agg.yaml
 create mode 100644 examples/vllm_v0/deploy/disagg.yaml
 create mode 100644 examples/vllm_v0/deploy/disagg_planner.yaml

diff --git a/examples/vllm_v0/deploy/agg.yaml b/examples/vllm_v0/deploy/agg.yaml
new file mode 100644
index 0000000000..eb21051ea3
--- /dev/null
+++ b/examples/vllm_v0/deploy/agg.yaml
@@ -0,0 +1,59 @@
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: agg
+spec:
+  Frontend:
+    dynamoNamespace: inference
+    componentType: main
+    replicas: 1
+    resources:
+      requests:
+        cpu: "1"
+        memory: "2Gi"
+      limits:
+        cpu: "1"
+        memory: "2Gi"
+    extraPodSpec:
+      mainContainer:
+        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+        workingDir: /workspace/examples/vllm_v0
+        args:
+          - dynamo
+          - serve
+          - graphs.agg:Frontend
+          - --system-app-port
+          - "5000"
+          - --enable-system-app
+          - --use-default-health-checks
+          - --service-name
+          - Frontend
+          - -f
+          - ./configs/agg.yaml
+  VllmWorker:
+    replicas: 1
+    resources:
+      requests:
+        cpu: "10"
+        memory: "20Gi"
+        gpu: "1"
+      limits:
+        cpu: "10"
+        memory: "20Gi"
+        gpu: "1"
+    extraPodSpec:
+      mainContainer:
+        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+        workingDir: /workspace/examples/vllm_v0
+        args:
+          - dynamo
+          - serve
+          - graphs.agg:Frontend
+          - --system-app-port
+          - "5000"
+          - --enable-system-app
+          - --use-default-health-checks
+          - --service-name
+          - VllmWorker
+          - -f
+          - ./configs/agg.yaml
\ No newline at end of file
diff --git a/examples/vllm_v0/deploy/disagg.yaml b/examples/vllm_v0/deploy/disagg.yaml
new file mode 100644
index 0000000000..c5b546a78a
--- /dev/null
+++ b/examples/vllm_v0/deploy/disagg.yaml
@@ -0,0 +1,88 @@
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: disagg
+spec:
+  Frontend:
+    dynamoNamespace: inference
+    componentType: main
+    replicas: 1
+    resources:
+      requests:
+        cpu: "1"
+        memory: "2Gi"
+      limits:
+        cpu: "1"
+        memory: "2Gi"
+    extraPodSpec:
+      mainContainer:
+        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+        workingDir: /workspace/examples/vllm_v0
+        args:
+          - dynamo
+          - serve
+          - graphs.disagg:Frontend
+          - --system-app-port
+          - "5000"
+          - --enable-system-app
+          - --use-default-health-checks
+          - --service-name
+          - Frontend
+          - -f
+          - ./configs/disagg.yaml
+
+  VllmWorker:
+    replicas: 1
+    resources:
+      requests:
+        cpu: "10"
+        memory: "20Gi"
+        gpu: "1"
+      limits:
+        cpu: "10"
+        memory: "20Gi"
+        gpu: "1"
+    extraPodSpec:
+      mainContainer:
+        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+        workingDir: /workspace/examples/vllm_v0
+        args:
+          - dynamo
+          - serve
+          - graphs.disagg:VllmWorker
+          - --system-app-port
+          - "5000"
+          - --enable-system-app
+          - --use-default-health-checks
+          - --service-name
+          - VllmWorker
+          - -f
+          - ./configs/disagg.yaml
+
+  PrefillWorker:
+    replicas: 1
+    resources:
+      requests:
+        cpu: "10"
+        memory: "20Gi"
+        gpu: "1"
+      limits:
+        cpu: "10"
+        memory: "20Gi"
+        gpu: "1"
+    extraPodSpec:
+      mainContainer:
+        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+        workingDir: /workspace/examples/vllm_v0
+        args:
+          - dynamo
+          - serve
+          - graphs.disagg:PrefillWorker
+          - --system-app-port
+          - "5000"
+          - --enable-system-app
+          - --use-default-health-checks
+          - --service-name
+          - PrefillWorker
+          - -f
+          - ./configs/disagg.yaml 
\ No newline at end of file
diff --git a/examples/vllm_v0/deploy/disagg_planner.yaml b/examples/vllm_v0/deploy/disagg_planner.yaml
new file mode 100644
index 0000000000..f272418f6f
--- /dev/null
+++ b/examples/vllm_v0/deploy/disagg_planner.yaml
@@ -0,0 +1,140 @@
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: disagg-planner
+spec:
+  Frontend:
+    dynamoNamespace: inference
+    componentType: main
+    replicas: 1
+    resources:
+      requests:
+        cpu: "1"
+        memory: "2Gi"
+      limits:
+        cpu: "1"
+        memory: "2Gi"
+    extraPodSpec:
+      mainContainer:
+        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+        workingDir: /workspace/examples/vllm_v0
+        args:
+          - dynamo
+          - serve
+          - graphs.disagg_planner:Frontend
+          - --system-app-port
+          - "5000"
+          - --enable-system-app
+          - --use-default-health-checks
+          - --service-name
+          - Frontend
+          - -f
+          - ./configs/disagg_planner.yaml
+
+  VllmWorker:
+    replicas: 1
+    resources:
+      requests:
+        cpu: "10"
+        memory: "20Gi"
+        gpu: "1"
+      limits:
+        cpu: "10"
+        memory: "20Gi"
+        gpu: "1"
+    extraPodSpec:
+      mainContainer:
+        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+        workingDir: /workspace/examples/vllm_v0
+        args:
+          - dynamo
+          - serve
+          - graphs.disagg_planner:VllmWorker
+          - --system-app-port
+          - "5000"
+          - --enable-system-app
+          - --use-default-health-checks
+          - --service-name
+          - VllmWorker
+          - -f
+          - ./configs/disagg_planner.yaml
+
+  PrefillWorker:
+    replicas: 1
+    resources:
+      requests:
+        cpu: "10"
+        memory: "20Gi"
+        gpu: "1"
+      limits:
+        cpu: "10"
+        memory: "20Gi"
+        gpu: "1"
+    extraPodSpec:
+      mainContainer:
+        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+        workingDir: /workspace/examples/vllm_v0
+        args:
+          - dynamo
+          - serve
+          - graphs.disagg_planner:PrefillWorker
+          - --system-app-port
+          - "5000"
+          - --enable-system-app
+          - --use-default-health-checks
+          - --service-name
+          - PrefillWorker
+          - -f
+          - ./configs/disagg_planner.yaml
+
+  Planner:
+    replicas: 1
+    resources:
+      requests:
+        cpu: "1"
+        memory: "1Gi"
+      limits:
+        cpu: "1"
+        memory: "1Gi"
+    extraPodSpec:
+      mainContainer:
+        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+        workingDir: /workspace/examples/vllm_v0
+        args:
+          - dynamo
+          - serve
+          - graphs.disagg_planner:Planner
+          - --system-app-port
+          - "5000"
+          - --enable-system-app
+          - --use-default-health-checks
+          - --service-name
+          - Planner
+          - -f
+          - ./configs/disagg_planner.yaml
+
+  Prometheus:
+    replicas: 1
+    resources:
+      requests:
+        cpu: "500m"
+        memory: "500Mi"
+      limits:
+        cpu: "500m"
+        memory: "500Mi"
+    extraPodSpec:
+      mainContainer:
+        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+        workingDir: /workspace/examples/vllm_v0
+        args:
+          - dynamo
+          - serve
+          - graphs.disagg_planner:Prometheus
+          - --system-app-port
+          - "5000"
+          - --enable-system-app
+          - --use-default-health-checks
+          - --service-name
+          - Prometheus
+          - -f
+          - ./configs/disagg_planner.yaml 
\ No newline at end of file

From 4279528d8f4ef635868211bd27d28a6c7fbbce99 Mon Sep 17 00:00:00 2001
From: mohammedabdulwahhab <furkhan324@berkeley.edu>
Date: Thu, 3 Jul 2025 10:57:43 -0700
Subject: [PATCH 02/14] fix: serve vllm_v0 disagg services via Frontend graph entry, add llm crd.yaml

---
 examples/llm/crd.yaml                       | 81 +++++++++++++++++++++
 examples/vllm_v0/deploy/disagg.yaml         |  4 +-
 examples/vllm_v0/deploy/disagg_planner.yaml | 12 +--
 3 files changed, 90 insertions(+), 7 deletions(-)
 create mode 100644 examples/llm/crd.yaml

diff --git a/examples/llm/crd.yaml b/examples/llm/crd.yaml
new file mode 100644
index 0000000000..965515c9de
--- /dev/null
+++ b/examples/llm/crd.yaml
@@ -0,0 +1,81 @@
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+ name: agg
+spec:
+ envs:
+ services:
+   Frontend:
+     dynamoNamespace: inference
+     componentType: main
+     replicas: 1
+     resources:
+      requests:
+        cpu: 100m
+        memory: 100Mi
+      limits:
+        cpu: 1000m
+        memory: 1000Mi
+     extraPodSpec:
+       mainContainer:
+         image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+         command: 
+         - dynamo
+         - serve
+         - graphs.agg:Frontend
+         - -f
+         - ./configs/agg.yaml
+   Middle:
+     dynamoNamespace: inference
+     replicas: 1
+     extraPodSpec:
+       mainContainer:
+         image: gitlab-master.nvidia.com:5005/aire/microservices/compoundai/dynamo-pipelines:dynamo.existing.444f19b8
+         command: 
+         - sh
+         - -c
+         args:
+         - cd src && uv run dynamo serve --system-app-port 5000 --enable-system-app --use-default-health-checks --service-name Middle hello_world:Frontend --Backend.ServiceArgs.dynamo.namespace=inference
+   Backend:
+     dynamoNamespace: inference
+     envs: 
+     - name: SOME_ENV
+       value: ‘somevalue’  
+     replicas: 1
+     resources:
+      requests:
+        cpu: 100m
+        memory: 100Mi
+      limits:
+        cpu: 1000m
+        memory: 1000Mi
+     extraPodSpec:
+       mainContainer:
+         image: gitlab-master.nvidia.com:5005/aire/microservices/compoundai/dynamo-pipelines:dynamo.existing.444f19b8
+         command: 
+         - sh
+         - -c
+         args:
+         - cd src && uv run dynamo serve --system-app-port 5000 --enable-system-app --use-default-health-checks --service-name Middle hello_world:Frontend --Backend.ServiceArgs.dynamo.namespace=inference
+    Planner:
+     dynamoNamespace: inference
+     envs: 
+     - name: SOME_ENV
+       value: ‘somevalue’  
+     replicas: 1
+     componentType: planner
+     resources:
+      requests:
+        cpu: 100m
+        memory: 100Mi
+      limits:
+        cpu: 1000m
+        memory: 1000Mi
+     extraPodSpec:
+       mainContainer:
+         image: gitlab-master.nvidia.com:5005/aire/microservices/compoundai/dynamo-pipelines:planner
+         command: 
+         - sh
+         - -c
+         args:
+         - cd src && uv run dynamo serve --system-app-port 5000 --enable-system-app --service-name Planner graphs.disagg_router:Frontend --Planner.ServiceArgs.dynamo.namespace=inference --Planner.environment=kubernetes
diff --git a/examples/vllm_v0/deploy/disagg.yaml b/examples/vllm_v0/deploy/disagg.yaml
index c5b546a78a..92252f6d52 100644
--- a/examples/vllm_v0/deploy/disagg.yaml
+++ b/examples/vllm_v0/deploy/disagg.yaml
@@ -49,7 +49,7 @@ spec:
         args:
           - dynamo
           - serve
-          - graphs.disagg:VllmWorker
+          - graphs.disagg:Frontend
           - --system-app-port
           - "5000"
           - --enable-system-app
@@ -77,7 +77,7 @@ spec:
         args:
           - dynamo
           - serve
-          - graphs.disagg:PrefillWorker
+          - graphs.disagg:Frontend
           - --system-app-port
           - "5000"
           - --enable-system-app
diff --git a/examples/vllm_v0/deploy/disagg_planner.yaml b/examples/vllm_v0/deploy/disagg_planner.yaml
index f272418f6f..649aa3f672 100644
--- a/examples/vllm_v0/deploy/disagg_planner.yaml
+++ b/examples/vllm_v0/deploy/disagg_planner.yaml
@@ -49,7 +49,7 @@ spec:
         args:
           - dynamo
           - serve
-          - graphs.disagg_planner:VllmWorker
+          - graphs.disagg_planner:Frontend
           - --system-app-port
           - "5000"
           - --enable-system-app
@@ -77,7 +77,7 @@ spec:
         args:
           - dynamo
           - serve
-          - graphs.disagg_planner:PrefillWorker
+          - graphs.disagg_planner:Frontend
           - --system-app-port
           - "5000"
           - --enable-system-app
@@ -89,6 +89,7 @@ spec:
 
   Planner:
     replicas: 1
+    componentType: planner
     resources:
       requests:
         cpu: "1"
@@ -103,7 +104,7 @@ spec:
         args:
           - dynamo
           - serve
-          - graphs.disagg_planner:Planner
+          - graphs.disagg_planner:Frontend
           - --system-app-port
           - "5000"
           - --enable-system-app
@@ -112,6 +113,7 @@ spec:
           - Planner
           - -f
           - ./configs/disagg_planner.yaml
+          - --Planner.environment=kubernetes
 
   Prometheus:
     replicas: 1
@@ -129,7 +131,7 @@ spec:
         args:
           - dynamo
           - serve
-          - graphs.disagg_planner:Prometheus
+          - graphs.disagg_planner:Frontend
           - --system-app-port
           - "5000"
           - --enable-system-app
@@ -137,4 +139,4 @@ spec:
           - --service-name
           - Prometheus
           - -f
-          - ./configs/disagg_planner.yaml 
\ No newline at end of file
+          - ./configs/disagg_planner.yaml
\ No newline at end of file

From 1a3e9ad9b77b9212925265fd84624916e62bf31b Mon Sep 17 00:00:00 2001
From: mohammedabdulwahhab <furkhan324@berkeley.edu>
Date: Thu, 3 Jul 2025 11:30:09 -0700
Subject: [PATCH 03/14] Delete examples/llm/crd.yaml

Signed-off-by: mohammedabdulwahhab <furkhan324@berkeley.edu>
---
 examples/llm/crd.yaml | 81 -------------------------------------------
 1 file changed, 81 deletions(-)
 delete mode 100644 examples/llm/crd.yaml

diff --git a/examples/llm/crd.yaml b/examples/llm/crd.yaml
deleted file mode 100644
index 965515c9de..0000000000
--- a/examples/llm/crd.yaml
+++ /dev/null
@@ -1,81 +0,0 @@
-apiVersion: nvidia.com/v1alpha1
-kind: DynamoGraphDeployment
-metadata:
- name: agg
-spec:
- envs:
- services:
-   Frontend:
-     dynamoNamespace: inference
-     componentType: main
-     replicas: 1
-     resources:
-      requests:
-        cpu: 100m
-        memory: 100Mi
-      limits:
-        cpu: 1000m
-        memory: 1000Mi
-     extraPodSpec:
-       mainContainer:
-         image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-         command: 
-         - dynamo
-         - serve
-         - graphs.agg:Frontend
-         - -f
-         - ./configs/agg.yaml
-   Middle:
-     dynamoNamespace: inference
-     replicas: 1
-     extraPodSpec:
-       mainContainer:
-         image: gitlab-master.nvidia.com:5005/aire/microservices/compoundai/dynamo-pipelines:dynamo.existing.444f19b8
-         command: 
-         - sh
-         - -c
-         args:
-         - cd src && uv run dynamo serve --system-app-port 5000 --enable-system-app --use-default-health-checks --service-name Middle hello_world:Frontend --Backend.ServiceArgs.dynamo.namespace=inference
-   Backend:
-     dynamoNamespace: inference
-     envs: 
-     - name: SOME_ENV
-       value: ‘somevalue’  
-     replicas: 1
-     resources:
-      requests:
-        cpu: 100m
-        memory: 100Mi
-      limits:
-        cpu: 1000m
-        memory: 1000Mi
-     extraPodSpec:
-       mainContainer:
-         image: gitlab-master.nvidia.com:5005/aire/microservices/compoundai/dynamo-pipelines:dynamo.existing.444f19b8
-         command: 
-         - sh
-         - -c
-         args:
-         - cd src && uv run dynamo serve --system-app-port 5000 --enable-system-app --use-default-health-checks --service-name Middle hello_world:Frontend --Backend.ServiceArgs.dynamo.namespace=inference
-    Planner:
-     dynamoNamespace: inference
-     envs: 
-     - name: SOME_ENV
-       value: ‘somevalue’  
-     replicas: 1
-     componentType: planner
-     resources:
-      requests:
-        cpu: 100m
-        memory: 100Mi
-      limits:
-        cpu: 1000m
-        memory: 1000Mi
-     extraPodSpec:
-       mainContainer:
-         image: gitlab-master.nvidia.com:5005/aire/microservices/compoundai/dynamo-pipelines:planner
-         command: 
-         - sh
-         - -c
-         args:
-         - cd src && uv run dynamo serve --system-app-port 5000 --enable-system-app --service-name Planner graphs.disagg_router:Frontend --Planner.ServiceArgs.dynamo.namespace=inference --Planner.environment=kubernetes

From 93859d6c8021fcae5b8f3dcd26c9a6d72bb92987 Mon Sep 17 00:00:00 2001
From: mohammedabdulwahhab <furkhan324@berkeley.edu>
Date: Thu, 3 Jul 2025 13:44:26 -0700
Subject: [PATCH 04/14] fix: nest vllm_v0 agg.yaml services under spec.services, pass config via env

---
 examples/vllm_v0/deploy/agg.yaml | 114 ++++++++++++++++---------------
 1 file changed, 60 insertions(+), 54 deletions(-)

diff --git a/examples/vllm_v0/deploy/agg.yaml b/examples/vllm_v0/deploy/agg.yaml
index eb21051ea3..7993c066aa 100644
--- a/examples/vllm_v0/deploy/agg.yaml
+++ b/examples/vllm_v0/deploy/agg.yaml
@@ -3,57 +3,63 @@ kind: DynamoGraphDeployment
 metadata:
   name: agg
 spec:
-  Frontend:
-    dynamoNamespace: inference
-    componentType: main
-    replicas: 1
-    resources:
-      requests:
-        cpu: "1"
-        memory: "2Gi"
-      limits:
-        cpu: "1"
-        memory: "2Gi"
-    extraPodSpec:
-      mainContainer:
-        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-        workingDir: /workspace/examples/vllm_v0
-        args:
-          - dynamo
-          - serve
-          - graphs.agg:Frontend
-          - --system-app-port
-          - "5000"
-          - --enable-system-app
-          - --use-default-health-checks
-          - --service-name
-          - Frontend
-          - -f
-          - ./configs/agg.yaml
-  VllmWorker:
-    replicas: 1
-    resources:
-      requests:
-        cpu: "10"
-        memory: "20Gi"
-        gpu: "1"
-      limits:
-        cpu: "10"
-        memory: "20Gi"
-        gpu: "1"
-    extraPodSpec:
-      mainContainer:
-        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-        workingDir: /workspace/examples/vllm_v0
-        args:
-          - dynamo
-          - serve
-          - graphs.agg:Frontend
-          - --system-app-port
-          - "5000"
-          - --enable-system-app
-          - --use-default-health-checks
-          - --service-name
-          - VllmWorker
-          - -f
-          - ./configs/agg.yaml
\ No newline at end of file
+  envs:
+    - name: DYN_DEPLOYMENT_CONFIG
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len"]}}'
+  services:
+    Frontend:
+      dynamoNamespace: dynamo
+      componentType: main
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          args:
+            - cd
+            - /workspace/examples/vllm_v0
+            - "&&"
+            - dynamo
+            - serve
+            - graphs.agg:Frontend
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend
+    VllmWorker:
+      envFromSecret: hf-token-secret
+      dynamoNamespace: dynamo 
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          args:
+            - cd
+            - /workspace/examples/vllm_v0
+            - "&&"
+            - dynamo
+            - serve
+            - graphs.agg:VllmWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmWorker
\ No newline at end of file

From ec912faa33d5692ee6ba36bdacfc2717f9072264 Mon Sep 17 00:00:00 2001
From: Hannah Zhang <hannahz@nvidia.com>
Date: Thu, 3 Jul 2025 15:31:14 -0700
Subject: [PATCH 05/14] feat: add crds for llm example, no planner

---
 examples/llm/deploy/agg.yaml           |  92 +++++++++++++++
 examples/llm/deploy/agg_router.yaml    | 119 ++++++++++++++++++++
 examples/llm/deploy/disagg.yaml        | 121 ++++++++++++++++++++
 examples/llm/deploy/disagg_router.yaml | 148 +++++++++++++++++++++++++
 4 files changed, 480 insertions(+)
 create mode 100644 examples/llm/deploy/agg.yaml
 create mode 100644 examples/llm/deploy/agg_router.yaml
 create mode 100644 examples/llm/deploy/disagg.yaml
 create mode 100644 examples/llm/deploy/disagg_router.yaml

diff --git a/examples/llm/deploy/agg.yaml b/examples/llm/deploy/agg.yaml
new file mode 100644
index 0000000000..a515d87d57
--- /dev/null
+++ b/examples/llm/deploy/agg.yaml
@@ -0,0 +1,92 @@
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: llm-agg
+spec:
+  envs:
+    - name: DYN_DEPLOYMENT_CONFIG
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"router":"round-robin","router-num-threads":4,"common-configs":["model","block-size","max-model-len"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
+  services:
+    Frontend:
+      dynamoNamespace: llm-agg
+      componentType: main
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          args:
+            - cd
+            - /workspace/examples/llm
+            - "&&"
+            - dynamo
+            - serve
+            - graphs.agg:Frontend
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend
+    Processor:
+      dynamoNamespace: llm-agg
+      componentType: worker
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          args:
+            - cd
+            - /workspace/examples/llm
+            - "&&"
+            - dynamo
+            - serve
+            - graphs.agg:Processor
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Processor
+    VllmWorker:
+      envFromSecret: hf-token-secret
+      dynamoNamespace: llm-agg
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          args:
+            - cd
+            - /workspace/examples/llm
+            - "&&"
+            - dynamo
+            - serve
+            - graphs.agg:VllmWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmWorker
diff --git a/examples/llm/deploy/agg_router.yaml b/examples/llm/deploy/agg_router.yaml
new file mode 100644
index 0000000000..a2d5ceed65
--- /dev/null
+++ b/examples/llm/deploy/agg_router.yaml
@@ -0,0 +1,119 @@
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: agg-router
+spec:
+  envs:
+    - name: DYN_DEPLOYMENT_CONFIG
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","router":"kv","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"common-configs":["model","block-size","max-model-len","router"]},"Router":{"min-workers":1,"softmax-sample":true,"common-configs":["model","block-size","router"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"tensor-parallel-size":1,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","router","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
+  services:
+    Frontend:
+      dynamoNamespace: llm-agg-router
+      componentType: main
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          args:
+            - cd
+            - /workspace/examples/llm
+            - "&&"
+            - dynamo
+            - serve
+            - graphs.agg_router:Frontend
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend
+    Processor:
+      dynamoNamespace: llm-agg-router
+      componentType: worker
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          args:
+            - cd
+            - /workspace/examples/llm
+            - "&&"
+            - dynamo
+            - serve
+            - graphs.agg_router:Processor
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Processor
+    Router:
+      dynamoNamespace: llm-agg-router
+      componentType: worker
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          args:
+            - cd
+            - /workspace/examples/llm
+            - "&&"
+            - dynamo
+            - serve
+            - graphs.agg_router:Router
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Router
+    VllmWorker:
+      envFromSecret: hf-token-secret
+      dynamoNamespace: llm-agg-router
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          args:
+            - cd
+            - /workspace/examples/llm
+            - "&&"
+            - dynamo
+            - serve
+            - graphs.agg_router:VllmWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmWorker
diff --git a/examples/llm/deploy/disagg.yaml b/examples/llm/deploy/disagg.yaml
new file mode 100644
index 0000000000..c9f6aab17f
--- /dev/null
+++ b/examples/llm/deploy/disagg.yaml
@@ -0,0 +1,121 @@
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: llm-disagg
+spec:
+  envs:
+    - name: DYN_DEPLOYMENT_CONFIG
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"router":"round-robin","common-configs":["model","block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
+  services:
+    Frontend:
+      dynamoNamespace: llm-disagg
+      componentType: main
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          args:
+            - cd
+            - /workspace/examples/llm
+            - "&&"
+            - dynamo
+            - serve
+            - graphs.disagg:Frontend
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend
+    Processor:
+      dynamoNamespace: llm-disagg
+      componentType: worker
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          args:
+            - cd
+            - /workspace/examples/llm
+            - "&&"
+            - dynamo
+            - serve
+            - graphs.disagg:Processor
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Processor
+    VllmWorker:
+      envFromSecret: hf-token-secret
+      dynamoNamespace: llm-disagg
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          args:
+            - cd
+            - /workspace/examples/llm
+            - "&&"
+            - dynamo
+            - serve
+            - graphs.disagg:VllmWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmWorker
+    PrefillWorker:
+      envFromSecret: hf-token-secret
+      dynamoNamespace: llm-disagg
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          args:
+            - cd
+            - /workspace/examples/llm
+            - "&&"
+            - dynamo
+            - serve
+            - graphs.disagg:PrefillWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - PrefillWorker
diff --git a/examples/llm/deploy/disagg_router.yaml b/examples/llm/deploy/disagg_router.yaml
new file mode 100644
index 0000000000..ddc5a6c519
--- /dev/null
+++ b/examples/llm/deploy/disagg_router.yaml
@@ -0,0 +1,148 @@
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: disagg-router
+spec:
+  envs:
+    - name: DYN_DEPLOYMENT_CONFIG
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"router":"kv","kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"common-configs":["model","block-size","max-model-len","router"]},"Router":{"min-workers":1,"common-configs":["model","block-size","router"]},"VllmWorker":{"max-num-batched-tokens":16384,"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"tensor-parallel-size":1,"enable-prefix-caching":true,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","router","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
+  services:
+    Frontend:
+      dynamoNamespace: llm-disagg-router
+      componentType: main
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          args:
+            - cd
+            - /workspace/examples/llm
+            - "&&"
+            - dynamo
+            - serve
+            - graphs.disagg_router:Frontend
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend
+    Processor:
+      dynamoNamespace: llm-disagg-router
+      componentType: worker
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          args:
+            - cd
+            - /workspace/examples/llm
+            - "&&"
+            - dynamo
+            - serve
+            - graphs.disagg_router:Processor
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Processor
+    Router:
+      dynamoNamespace: llm-disagg-router
+      componentType: worker
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          args:
+            - cd
+            - /workspace/examples/llm
+            - "&&"
+            - dynamo
+            - serve
+            - graphs.disagg_router:Router
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Router
+    VllmWorker:
+      envFromSecret: hf-token-secret
+      dynamoNamespace: llm-disagg-router
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          args:
+            - cd
+            - /workspace/examples/llm
+            - "&&"
+            - dynamo
+            - serve
+            - graphs.disagg_router:VllmWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmWorker
+    PrefillWorker:
+      envFromSecret: hf-token-secret
+      dynamoNamespace: llm-disagg-router
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          args:
+            - cd
+            - /workspace/examples/llm
+            - "&&"
+            - dynamo
+            - serve
+            - graphs.disagg_router:PrefillWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - PrefillWorker

From 17a054fefbdf72fb6b9f0ae1bfefe35f115726f3 Mon Sep 17 00:00:00 2001
From: mohammedabdulwahhab <furkhan324@berkeley.edu>
Date: Thu, 3 Jul 2025 15:34:02 -0700
Subject: [PATCH 06/14] fix: move vllm_v0 deploy specs under spec.services with inline config

---
 examples/vllm_v0/deploy/disagg.yaml         | 172 ++++++------
 examples/vllm_v0/deploy/disagg_planner.yaml | 276 ++++++++++----------
 2 files changed, 232 insertions(+), 216 deletions(-)

diff --git a/examples/vllm_v0/deploy/disagg.yaml b/examples/vllm_v0/deploy/disagg.yaml
index 92252f6d52..45c90eb2cd 100644
--- a/examples/vllm_v0/deploy/disagg.yaml
+++ b/examples/vllm_v0/deploy/disagg.yaml
@@ -3,86 +3,92 @@ kind: DynamoGraphDeployment
 metadata:
   name: disagg
 spec:
-  Frontend:
-    dynamoNamespace: inference
-    componentType: main
-    replicas: 1
-    resources:
-      requests:
-        cpu: "1"
-        memory: "2Gi"
-      limits:
-        cpu: "1"
-        memory: "2Gi"
-    extraPodSpec:
-      mainContainer:
-        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-        workingDir: /workspace/examples/vllm_v0
-        args:
-          - dynamo
-          - serve
-          - graphs.disagg:Frontend
-          - --system-app-port
-          - "5000"
-          - --enable-system-app
-          - --use-default-health-checks
-          - --service-name
-          - Frontend
-          - -f
-          - ./configs/disagg.yaml
-
-  VllmWorker:
-    replicas: 1
-    resources:
-      requests:
-        cpu: "10"
-        memory: "20Gi"
-        gpu: "1"
-      limits:
-        cpu: "10"
-        memory: "20Gi"
-        gpu: "1"
-    extraPodSpec:
-      mainContainer:
-        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-        workingDir: /workspace/examples/vllm_v0
-        args:
-          - dynamo
-          - serve
-          - graphs.disagg:Frontend
-          - --system-app-port
-          - "5000"
-          - --enable-system-app
-          - --use-default-health-checks
-          - --service-name
-          - VllmWorker
-          - -f
-          - ./configs/disagg.yaml
-
-  PrefillWorker:
-    replicas: 1
-    resources:
-      requests:
-        cpu: "10"
-        memory: "20Gi"
-        gpu: "1"
-      limits:
-        cpu: "10"
-        memory: "20Gi"
-        gpu: "1"
-    extraPodSpec:
-      mainContainer:
-        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-        workingDir: /workspace/examples/vllm_v0
-        args:
-          - dynamo
-          - serve
-          - graphs.disagg:Frontend
-          - --system-app-port
-          - "5000"
-          - --enable-system-app
-          - --use-default-health-checks
-          - --service-name
-          - PrefillWorker
-          - -f
-          - ./configs/disagg.yaml 
\ No newline at end of file
+  envs:
+    - name: DYN_DEPLOYMENT_CONFIG
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]}}'
+  services:
+    Frontend:
+      dynamoNamespace: dynamo
+      componentType: main
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          args:
+            - cd
+            - /workspace/examples/vllm_v0
+            - "&&"
+            - dynamo
+            - serve
+            - graphs.disagg:Frontend
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend
+    VllmWorker:
+      dynamoNamespace: dynamo
+      envFromSecret: hf-token-secret
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          args:
+            - cd
+            - /workspace/examples/vllm_v0
+            - "&&"
+            - dynamo
+            - serve
+            - graphs.disagg:VllmWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmWorker
+    PrefillWorker:
+      dynamoNamespace: dynamo
+      envFromSecret: hf-token-secret
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          args:
+            - cd
+            - /workspace/examples/vllm_v0
+            - "&&"
+            - dynamo
+            - serve
+            - graphs.disagg:PrefillWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - PrefillWorker
\ No newline at end of file
diff --git a/examples/vllm_v0/deploy/disagg_planner.yaml b/examples/vllm_v0/deploy/disagg_planner.yaml
index 649aa3f672..04a6708073 100644
--- a/examples/vllm_v0/deploy/disagg_planner.yaml
+++ b/examples/vllm_v0/deploy/disagg_planner.yaml
@@ -3,140 +3,150 @@ kind: DynamoGraphDeployment
 metadata:
   name: disagg-planner
 spec:
-  Frontend:
-    dynamoNamespace: inference
-    componentType: main
-    replicas: 1
-    resources:
-      requests:
-        cpu: "1"
-        memory: "2Gi"
-      limits:
-        cpu: "1"
-        memory: "2Gi"
-    extraPodSpec:
-      mainContainer:
-        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-        workingDir: /workspace/examples/vllm_v0
-        args:
-          - dynamo
-          - serve
-          - graphs.disagg_planner:Frontend
-          - --system-app-port
-          - "5000"
-          - --enable-system-app
-          - --use-default-health-checks
-          - --service-name
-          - Frontend
-          - -f
-          - ./configs/disagg_planner.yaml
+  envs:
+    - name: DYN_DEPLOYMENT_CONFIG
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["localhost:8000"]}]}]},"Planner":{"adjustment-interval":180,"profile-results-dir":"/workspace/examples/profiling_results","isl":3000,"osl":150,"ttft":0.5,"itl":0.05,"load-predictor":"arima"}}'
+  services:
+    Frontend:
+      dynamoNamespace: dynamo
+      componentType: main
+      replicas: 1
+      resources:
+        requests:
+          cpu: "2"
+          memory: "4Gi"
+        limits:
+          cpu: "2"
+          memory: "4Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          args:
+            - cd
+            - /workspace/examples/vllm_v0
+            - "&&"
+            - dynamo
+            - serve
+            - graphs.disagg_planner:Frontend
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend
 
-  VllmWorker:
-    replicas: 1
-    resources:
-      requests:
-        cpu: "10"
-        memory: "20Gi"
-        gpu: "1"
-      limits:
-        cpu: "10"
-        memory: "20Gi"
-        gpu: "1"
-    extraPodSpec:
-      mainContainer:
-        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-        workingDir: /workspace/examples/vllm_v0
-        args:
-          - dynamo
-          - serve
-          - graphs.disagg_planner:Frontend
-          - --system-app-port
-          - "5000"
-          - --enable-system-app
-          - --use-default-health-checks
-          - --service-name
-          - VllmWorker
-          - -f
-          - ./configs/disagg_planner.yaml
+    VllmWorker:
+      dynamoNamespace: dynamo
+      envFromSecret: hf-token-secret
+      replicas: 1
+      resources:
+        requests:
+          cpu: "20"
+          memory: "40Gi"
+          gpu: "2"
+        limits:
+          cpu: "20"
+          memory: "40Gi"
+          gpu: "2"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          args:
+            - cd
+            - /workspace/examples/vllm_v0
+            - "&&"
+            - dynamo
+            - serve
+            - graphs.disagg_planner:VllmWorker  # serve target matches --service-name
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmWorker
 
-  PrefillWorker:
-    replicas: 1
-    resources:
-      requests:
-        cpu: "10"
-        memory: "20Gi"
-        gpu: "1"
-      limits:
-        cpu: "10"
-        memory: "20Gi"
-        gpu: "1"
-    extraPodSpec:
-      mainContainer:
-        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-        workingDir: /workspace/examples/vllm_v0
-        args:
-          - dynamo
-          - serve
-          - graphs.disagg_planner:Frontend
-          - --system-app-port
-          - "5000"
-          - --enable-system-app
-          - --use-default-health-checks
-          - --service-name
-          - PrefillWorker
-          - -f
-          - ./configs/disagg_planner.yaml
+    PrefillWorker:
+      dynamoNamespace: dynamo
+      envFromSecret: hf-token-secret
+      replicas: 1
+      resources:
+        requests:
+          cpu: "20"
+          memory: "40Gi"
+          gpu: "2"
+        limits:
+          cpu: "20"
+          memory: "40Gi"
+          gpu: "2"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          args:
+            - cd
+            - /workspace/examples/vllm_v0
+            - "&&"
+            - dynamo
+            - serve
+            - graphs.disagg_planner:PrefillWorker  # serve target matches --service-name
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - PrefillWorker
 
-  Planner:
-    replicas: 1
-    componentType: planner
-    resources:
-      requests:
-        cpu: "1"
-        memory: "1Gi"
-      limits:
-        cpu: "1"
-        memory: "1Gi"
-    extraPodSpec:
-      mainContainer:
-        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-        workingDir: /workspace/examples/vllm_v0
-        args:
-          - dynamo
-          - serve
-          - graphs.disagg_planner:Frontend
-          - --system-app-port
-          - "5000"
-          - --enable-system-app
-          - --use-default-health-checks
-          - --service-name
-          - Planner
-          - -f
-          - ./configs/disagg_planner.yaml
-          - --Planner.environment=kubernetes
+    Planner:
+      dynamoNamespace: dynamo
+      replicas: 1
+      componentType: planner
+      resources:
+        requests:
+          cpu: "2"
+          memory: "2Gi"
+        limits:
+          cpu: "2"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          args:
+            - cd
+            - /workspace/examples/vllm_v0
+            - "&&"
+            - dynamo
+            - serve
+            - graphs.disagg_planner:Planner
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Planner
+            - --Planner.environment=kubernetes
 
-  Prometheus:
-    replicas: 1
-    resources:
-      requests:
-        cpu: "500m"
-        memory: "500Mi"
-      limits:
-        cpu: "500m"
-        memory: "500Mi"
-    extraPodSpec:
-      mainContainer:
-        image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-        workingDir: /workspace/examples/vllm_v0
-        args:
-          - dynamo
-          - serve
-          - graphs.disagg_planner:Frontend
-          - --system-app-port
-          - "5000"
-          - --enable-system-app
-          - --use-default-health-checks
-          - --service-name
-          - Prometheus
-          - -f
-          - ./configs/disagg_planner.yaml
\ No newline at end of file
+    Prometheus:
+      dynamoNamespace: dynamo
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1000m"
+          memory: "1000Mi"
+        limits:
+          cpu: "1000m"
+          memory: "1000Mi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          args:
+            - cd
+            - /workspace/examples/vllm_v0
+            - "&&"
+            - dynamo
+            - serve
+            - graphs.disagg_planner:Prometheus  # serve target matches --service-name
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Prometheus
\ No newline at end of file

From 1a09de800d531f28a114c03bee12d0e60cc0825d Mon Sep 17 00:00:00 2001
From: Hannah Zhang <hannahz@nvidia.com>
Date: Thu, 3 Jul 2025 15:34:24 -0700
Subject: [PATCH 07/14] feat: update namespaces

---
 examples/vllm_v0/deploy/agg.yaml            |  6 +++---
 examples/vllm_v0/deploy/disagg.yaml         |  8 ++++----
 examples/vllm_v0/deploy/disagg_planner.yaml | 12 ++++++------
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/examples/vllm_v0/deploy/agg.yaml b/examples/vllm_v0/deploy/agg.yaml
index 7993c066aa..984244821b 100644
--- a/examples/vllm_v0/deploy/agg.yaml
+++ b/examples/vllm_v0/deploy/agg.yaml
@@ -8,7 +8,7 @@ spec:
       value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len"]}}'
   services:
     Frontend:
-      dynamoNamespace: dynamo
+      dynamoNamespace: vllm-v0-agg
       componentType: main
       replicas: 1
       resources:
@@ -36,7 +36,7 @@ spec:
             - Frontend
     VllmWorker:
       envFromSecret: hf-token-secret
-      dynamoNamespace: dynamo 
+      dynamoNamespace: vllm-v0-agg
       replicas: 1
       resources:
         requests:
@@ -62,4 +62,4 @@ spec:
             - --enable-system-app
             - --use-default-health-checks
             - --service-name
-            - VllmWorker
\ No newline at end of file
+            - VllmWorker
diff --git a/examples/vllm_v0/deploy/disagg.yaml b/examples/vllm_v0/deploy/disagg.yaml
index 45c90eb2cd..eb018e076a 100644
--- a/examples/vllm_v0/deploy/disagg.yaml
+++ b/examples/vllm_v0/deploy/disagg.yaml
@@ -8,7 +8,7 @@ spec:
       value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]}}'
   services:
     Frontend:
-      dynamoNamespace: dynamo
+      dynamoNamespace: vllm-v0-disagg
       componentType: main
       replicas: 1
       resources:
@@ -35,7 +35,7 @@ spec:
             - --service-name
             - Frontend
     VllmWorker:
-      dynamoNamespace: dynamo
+      dynamoNamespace: vllm-v0-disagg
       envFromSecret: hf-token-secret
       replicas: 1
       resources:
@@ -64,7 +64,7 @@ spec:
             - --service-name
             - VllmWorker
     PrefillWorker:
-      dynamoNamespace: dynamo
+      dynamoNamespace: vllm-v0-disagg
       envFromSecret: hf-token-secret
       replicas: 1
       resources:
@@ -91,4 +91,4 @@ spec:
             - --enable-system-app
             - --use-default-health-checks
             - --service-name
-            - PrefillWorker
\ No newline at end of file
+            - PrefillWorker
diff --git a/examples/vllm_v0/deploy/disagg_planner.yaml b/examples/vllm_v0/deploy/disagg_planner.yaml
index 04a6708073..174b98af8b 100644
--- a/examples/vllm_v0/deploy/disagg_planner.yaml
+++ b/examples/vllm_v0/deploy/disagg_planner.yaml
@@ -8,7 +8,7 @@ spec:
       value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["localhost:8000"]}]}]},"Planner":{"adjustment-interval":180,"profile-results-dir":"/workspace/examples/profiling_results","isl":3000,"osl":150,"ttft":0.5,"itl":0.05,"load-predictor":"arima"}}'
   services:
     Frontend:
-      dynamoNamespace: dynamo
+      dynamoNamespace: vllm-v0-disagg-planner
       componentType: main
       replicas: 1
       resources:
@@ -36,7 +36,7 @@ spec:
             - Frontend
 
     VllmWorker:
-      dynamoNamespace: dynamo
+      dynamoNamespace: vllm-v0-disagg-planner
       envFromSecret: hf-token-secret
       replicas: 1
       resources:
@@ -66,7 +66,7 @@ spec:
             - VllmWorker
 
     PrefillWorker:
-      dynamoNamespace: dynamo
+      dynamoNamespace: vllm-v0-disagg-planner
       envFromSecret: hf-token-secret
       replicas: 1
       resources:
@@ -96,7 +96,7 @@ spec:
             - PrefillWorker
 
     Planner:
-      dynamoNamespace: dynamo
+      dynamoNamespace: vllm-v0-disagg-planner
       replicas: 1
       componentType: planner
       resources:
@@ -125,7 +125,7 @@ spec:
             - --Planner.environment=kubernetes
 
     Prometheus:
-      dynamoNamespace: dynamo
+      dynamoNamespace: vllm-v0-disagg-planner
       replicas: 1
       resources:
         requests:
@@ -149,4 +149,4 @@ spec:
             - --enable-system-app
             - --use-default-health-checks
             - --service-name
-            - Prometheus
\ No newline at end of file
+            - Prometheus

From 063dddbedf49437fceffb3a4cf27f3e41ffd4715 Mon Sep 17 00:00:00 2001
From: Hannah Zhang <hannahz@nvidia.com>
Date: Thu, 3 Jul 2025 15:39:23 -0700
Subject: [PATCH 08/14] feat: add non-working planner to llm agg

---
 examples/llm/deploy/agg.yaml | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/examples/llm/deploy/agg.yaml b/examples/llm/deploy/agg.yaml
index a515d87d57..8e5262e088 100644
--- a/examples/llm/deploy/agg.yaml
+++ b/examples/llm/deploy/agg.yaml
@@ -90,3 +90,32 @@ spec:
             - --use-default-health-checks
             - --service-name
             - VllmWorker
+
+    Planner:
+      dynamoNamespace: llm-agg
+      replicas: 1
+      componentType: planner
+      resources:
+        requests:
+          cpu: "2"
+          memory: "2Gi"
+        limits:
+          cpu: "2"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          args:
+            - cd
+            - /workspace/examples/llm
+            - "&&"
+            - dynamo
+            - serve
+            - graphs.agg:Planner
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Planner
+            - --Planner.environment=kubernetes

From 84677d2650600fc1d1d8d0dfcee75574be860909 Mon Sep 17 00:00:00 2001
From: Hannah Zhang <hannahz@nvidia.com>
Date: Mon, 7 Jul 2025 10:22:56 -0700
Subject: [PATCH 09/14] feat: use workingDir instead of cd

---
 examples/llm/deploy/agg.yaml           | 41 ++------------------------
 examples/llm/deploy/agg_router.yaml    | 16 +++-------
 examples/llm/deploy/disagg.yaml        | 16 +++-------
 examples/llm/deploy/disagg_router.yaml | 20 ++++---------
 4 files changed, 16 insertions(+), 77 deletions(-)

diff --git a/examples/llm/deploy/agg.yaml b/examples/llm/deploy/agg.yaml
index 8e5262e088..bfc9620857 100644
--- a/examples/llm/deploy/agg.yaml
+++ b/examples/llm/deploy/agg.yaml
@@ -21,10 +21,8 @@ spec:
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/llm
           args:
-            - cd
-            - /workspace/examples/llm
-            - "&&"
             - dynamo
             - serve
             - graphs.agg:Frontend
@@ -48,10 +46,8 @@ spec:
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/llm
           args:
-            - cd
-            - /workspace/examples/llm
-            - "&&"
             - dynamo
             - serve
             - graphs.agg:Processor
@@ -77,10 +73,8 @@ spec:
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/llm
           args:
-            - cd
-            - /workspace/examples/llm
-            - "&&"
             - dynamo
             - serve
             - graphs.agg:VllmWorker
@@ -90,32 +84,3 @@ spec:
             - --use-default-health-checks
             - --service-name
             - VllmWorker
-
-    Planner:
-      dynamoNamespace: llm-agg
-      replicas: 1
-      componentType: planner
-      resources:
-        requests:
-          cpu: "2"
-          memory: "2Gi"
-        limits:
-          cpu: "2"
-          memory: "2Gi"
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
-          args:
-            - cd
-            - /workspace/examples/llm
-            - "&&"
-            - dynamo
-            - serve
-            - graphs.agg:Planner
-            - --system-app-port
-            - "5000"
-            - --enable-system-app
-            - --use-default-health-checks
-            - --service-name
-            - Planner
-            - --Planner.environment=kubernetes
diff --git a/examples/llm/deploy/agg_router.yaml b/examples/llm/deploy/agg_router.yaml
index a2d5ceed65..d5c726fe34 100644
--- a/examples/llm/deploy/agg_router.yaml
+++ b/examples/llm/deploy/agg_router.yaml
@@ -21,10 +21,8 @@ spec:
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/llm
           args:
-            - cd
-            - /workspace/examples/llm
-            - "&&"
             - dynamo
             - serve
             - graphs.agg_router:Frontend
@@ -48,10 +46,8 @@ spec:
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/llm
           args:
-            - cd
-            - /workspace/examples/llm
-            - "&&"
             - dynamo
             - serve
             - graphs.agg_router:Processor
@@ -75,10 +71,8 @@ spec:
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/llm
           args:
-            - cd
-            - /workspace/examples/llm
-            - "&&"
             - dynamo
             - serve
             - graphs.agg_router:Router
@@ -104,10 +98,8 @@ spec:
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/llm
           args:
-            - cd
-            - /workspace/examples/llm
-            - "&&"
             - dynamo
             - serve
             - graphs.agg_router:VllmWorker
diff --git a/examples/llm/deploy/disagg.yaml b/examples/llm/deploy/disagg.yaml
index c9f6aab17f..f666b10918 100644
--- a/examples/llm/deploy/disagg.yaml
+++ b/examples/llm/deploy/disagg.yaml
@@ -21,10 +21,8 @@ spec:
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/llm
           args:
-            - cd
-            - /workspace/examples/llm
-            - "&&"
             - dynamo
             - serve
             - graphs.disagg:Frontend
@@ -48,10 +46,8 @@ spec:
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/llm
           args:
-            - cd
-            - /workspace/examples/llm
-            - "&&"
             - dynamo
             - serve
             - graphs.disagg:Processor
@@ -77,10 +73,8 @@ spec:
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/llm
           args:
-            - cd
-            - /workspace/examples/llm
-            - "&&"
             - dynamo
             - serve
             - graphs.disagg:VllmWorker
@@ -106,10 +100,8 @@ spec:
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/llm
           args:
-            - cd
-            - /workspace/examples/llm
-            - "&&"
             - dynamo
             - serve
             - graphs.disagg:PrefillWorker
diff --git a/examples/llm/deploy/disagg_router.yaml b/examples/llm/deploy/disagg_router.yaml
index ddc5a6c519..dbd1dfe832 100644
--- a/examples/llm/deploy/disagg_router.yaml
+++ b/examples/llm/deploy/disagg_router.yaml
@@ -21,10 +21,8 @@ spec:
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/llm
           args:
-            - cd
-            - /workspace/examples/llm
-            - "&&"
             - dynamo
             - serve
             - graphs.disagg_router:Frontend
@@ -48,10 +46,8 @@ spec:
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/llm
           args:
-            - cd
-            - /workspace/examples/llm
-            - "&&"
             - dynamo
             - serve
             - graphs.disagg_router:Processor
@@ -75,10 +71,8 @@ spec:
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/llm
           args:
-            - cd
-            - /workspace/examples/llm
-            - "&&"
             - dynamo
             - serve
             - graphs.disagg_router:Router
@@ -104,10 +98,8 @@ spec:
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/llm
           args:
-            - cd
-            - /workspace/examples/llm
-            - "&&"
             - dynamo
             - serve
             - graphs.disagg_router:VllmWorker
@@ -133,10 +125,8 @@ spec:
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/llm
           args:
-            - cd
-            - /workspace/examples/llm
-            - "&&"
             - dynamo
             - serve
             - graphs.disagg_router:PrefillWorker

From 7747765e0a9935b3ad0cb15a0d3f0545d7c9a1d0 Mon Sep 17 00:00:00 2001
From: Hannah Zhang <hannahz@nvidia.com>
Date: Mon, 7 Jul 2025 10:24:44 -0700
Subject: [PATCH 10/14] feat: use workingDir instead of cd

---
 examples/vllm_v0/deploy/agg.yaml            |  8 ++------
 examples/vllm_v0/deploy/disagg.yaml         | 12 +++---------
 examples/vllm_v0/deploy/disagg_planner.yaml | 20 +++++---------------
 3 files changed, 10 insertions(+), 30 deletions(-)

diff --git a/examples/vllm_v0/deploy/agg.yaml b/examples/vllm_v0/deploy/agg.yaml
index 984244821b..62e1325982 100644
--- a/examples/vllm_v0/deploy/agg.yaml
+++ b/examples/vllm_v0/deploy/agg.yaml
@@ -21,10 +21,8 @@ spec:
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/vllm_v0
           args:
-            - cd
-            - /workspace/examples/vllm_v0
-            - "&&"
             - dynamo
             - serve
             - graphs.agg:Frontend
@@ -50,10 +48,8 @@ spec:
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/vllm_v0
           args:
-            - cd
-            - /workspace/examples/vllm_v0
-            - "&&"
             - dynamo
             - serve
             - graphs.agg:VllmWorker
diff --git a/examples/vllm_v0/deploy/disagg.yaml b/examples/vllm_v0/deploy/disagg.yaml
index eb018e076a..a7a2b4738a 100644
--- a/examples/vllm_v0/deploy/disagg.yaml
+++ b/examples/vllm_v0/deploy/disagg.yaml
@@ -21,10 +21,8 @@ spec:
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/vllm_v0
           args:
-            - cd
-            - /workspace/examples/vllm_v0
-            - "&&"
             - dynamo
             - serve
             - graphs.disagg:Frontend
@@ -50,10 +48,8 @@ spec:
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/vllm_v0
           args:
-            - cd
-            - /workspace/examples/vllm_v0
-            - "&&"
             - dynamo
             - serve
             - graphs.disagg:VllmWorker
@@ -79,10 +75,8 @@ spec:
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/vllm_v0
           args:
-            - cd
-            - /workspace/examples/vllm_v0
-            - "&&"
             - dynamo
             - serve
             - graphs.disagg:PrefillWorker
diff --git a/examples/vllm_v0/deploy/disagg_planner.yaml b/examples/vllm_v0/deploy/disagg_planner.yaml
index 174b98af8b..7cccf70782 100644
--- a/examples/vllm_v0/deploy/disagg_planner.yaml
+++ b/examples/vllm_v0/deploy/disagg_planner.yaml
@@ -21,10 +21,8 @@ spec:
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/vllm_v0
           args:
-            - cd
-            - /workspace/examples/vllm_v0
-            - "&&"
             - dynamo
             - serve
             - graphs.disagg_planner:Frontend
@@ -51,10 +49,8 @@ spec:
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/vllm_v0
           args:
-            - cd
-            - /workspace/examples/vllm_v0
-            - "&&"
             - dynamo
             - serve
             - graphs.disagg_planner:Frontend
@@ -81,10 +77,8 @@ spec:
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/vllm_v0
           args:
-            - cd
-            - /workspace/examples/vllm_v0
-            - "&&"
             - dynamo
             - serve
             - graphs.disagg_planner:Frontend
@@ -109,10 +103,8 @@ spec:
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/vllm_v0
           args:
-            - cd
-            - /workspace/examples/vllm_v0
-            - "&&"
             - dynamo
             - serve
             - graphs.disagg_planner:Planner
@@ -137,10 +129,8 @@ spec:
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          workingDir: /workspace/examples/vllm_v0
           args:
-            - cd
-            - /workspace/examples/vllm_v0
-            - "&&"
             - dynamo
             - serve
             - graphs.disagg_planner:Frontend

From 9b99cd6c900cd6c7f5df4d090d32b197fa01302c Mon Sep 17 00:00:00 2001
From: Hannah Zhang <hannahz@nvidia.com>
Date: Mon, 7 Jul 2025 12:52:58 -0700
Subject: [PATCH 11/14] feat: coderabbit-requested changes, add copyright
 headers

---
 examples/llm/deploy/agg.yaml                | 18 +++++++++++++++--
 examples/llm/deploy/agg_router.yaml         | 18 +++++++++++++++--
 examples/llm/deploy/disagg.yaml             | 22 +++++++++++++++++----
 examples/llm/deploy/disagg_router.yaml      | 22 +++++++++++++++++----
 examples/vllm_v0/deploy/agg.yaml            | 18 +++++++++++++++--
 examples/vllm_v0/deploy/disagg.yaml         | 22 +++++++++++++++++----
 examples/vllm_v0/deploy/disagg_planner.yaml | 22 +++++++++++++++++----
 7 files changed, 120 insertions(+), 22 deletions(-)

diff --git a/examples/llm/deploy/agg.yaml b/examples/llm/deploy/agg.yaml
index bfc9620857..242c0e470f 100644
--- a/examples/llm/deploy/agg.yaml
+++ b/examples/llm/deploy/agg.yaml
@@ -1,3 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 apiVersion: nvidia.com/v1alpha1
 kind: DynamoGraphDeployment
 metadata:
@@ -65,11 +79,11 @@ spec:
         requests:
           cpu: "10"
           memory: "20Gi"
-          gpu: "1"
+          nvidia.com/gpu: "1"
         limits:
           cpu: "10"
           memory: "20Gi"
-          gpu: "1"
+          nvidia.com/gpu: "1"
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
diff --git a/examples/llm/deploy/agg_router.yaml b/examples/llm/deploy/agg_router.yaml
index d5c726fe34..94218ddb53 100644
--- a/examples/llm/deploy/agg_router.yaml
+++ b/examples/llm/deploy/agg_router.yaml
@@ -1,3 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 apiVersion: nvidia.com/v1alpha1
 kind: DynamoGraphDeployment
 metadata:
@@ -90,11 +104,11 @@ spec:
         requests:
           cpu: "10"
           memory: "20Gi"
-          gpu: "1"
+          nvidia.com/gpu: "1"
         limits:
           cpu: "10"
           memory: "20Gi"
-          gpu: "1"
+          nvidia.com/gpu: "1"
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
diff --git a/examples/llm/deploy/disagg.yaml b/examples/llm/deploy/disagg.yaml
index f666b10918..87a76b7b32 100644
--- a/examples/llm/deploy/disagg.yaml
+++ b/examples/llm/deploy/disagg.yaml
@@ -1,3 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 apiVersion: nvidia.com/v1alpha1
 kind: DynamoGraphDeployment
 metadata:
@@ -65,11 +79,11 @@ spec:
         requests:
           cpu: "10"
           memory: "20Gi"
-          gpu: "1"
+          nvidia.com/gpu: "1"
         limits:
           cpu: "10"
           memory: "20Gi"
-          gpu: "1"
+          nvidia.com/gpu: "1"
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
@@ -92,11 +106,11 @@ spec:
         requests:
           cpu: "10"
           memory: "20Gi"
-          gpu: "1"
+          nvidia.com/gpu: "1"
         limits:
           cpu: "10"
           memory: "20Gi"
-          gpu: "1"
+          nvidia.com/gpu: "1"
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
diff --git a/examples/llm/deploy/disagg_router.yaml b/examples/llm/deploy/disagg_router.yaml
index dbd1dfe832..c91b36aba5 100644
--- a/examples/llm/deploy/disagg_router.yaml
+++ b/examples/llm/deploy/disagg_router.yaml
@@ -1,3 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 apiVersion: nvidia.com/v1alpha1
 kind: DynamoGraphDeployment
 metadata:
@@ -90,11 +104,11 @@ spec:
         requests:
           cpu: "10"
           memory: "20Gi"
-          gpu: "1"
+          nvidia.com/gpu: "1"
         limits:
           cpu: "10"
           memory: "20Gi"
-          gpu: "1"
+          nvidia.com/gpu: "1"
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
@@ -117,11 +131,11 @@ spec:
         requests:
           cpu: "10"
           memory: "20Gi"
-          gpu: "1"
+          nvidia.com/gpu: "1"
         limits:
           cpu: "10"
           memory: "20Gi"
-          gpu: "1"
+          nvidia.com/gpu: "1"
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
diff --git a/examples/vllm_v0/deploy/agg.yaml b/examples/vllm_v0/deploy/agg.yaml
index 62e1325982..6c6b04e1ff 100644
--- a/examples/vllm_v0/deploy/agg.yaml
+++ b/examples/vllm_v0/deploy/agg.yaml
@@ -1,3 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 apiVersion: nvidia.com/v1alpha1
 kind: DynamoGraphDeployment
 metadata:
@@ -40,11 +54,11 @@ spec:
         requests:
           cpu: "10"
           memory: "20Gi"
-          gpu: "1"
+          nvidia.com/gpu: "1"
         limits:
           cpu: "10"
           memory: "20Gi"
-          gpu: "1"
+          nvidia.com/gpu: "1"
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
diff --git a/examples/vllm_v0/deploy/disagg.yaml b/examples/vllm_v0/deploy/disagg.yaml
index a7a2b4738a..22712e2527 100644
--- a/examples/vllm_v0/deploy/disagg.yaml
+++ b/examples/vllm_v0/deploy/disagg.yaml
@@ -1,3 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 apiVersion: nvidia.com/v1alpha1
 kind: DynamoGraphDeployment
 metadata:
@@ -40,11 +54,11 @@ spec:
         requests:
           cpu: "10"
           memory: "20Gi"
-          gpu: "1"
+          nvidia.com/gpu: "1"
         limits:
           cpu: "10"
           memory: "20Gi"
-          gpu: "1"
+          nvidia.com/gpu: "1"
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
@@ -67,11 +81,11 @@ spec:
         requests:
           cpu: "10"
           memory: "20Gi"
-          gpu: "1"
+          nvidia.com/gpu: "1"
         limits:
           cpu: "10"
           memory: "20Gi"
-          gpu: "1"
+          nvidia.com/gpu: "1"
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
diff --git a/examples/vllm_v0/deploy/disagg_planner.yaml b/examples/vllm_v0/deploy/disagg_planner.yaml
index 7cccf70782..de6ddf7029 100644
--- a/examples/vllm_v0/deploy/disagg_planner.yaml
+++ b/examples/vllm_v0/deploy/disagg_planner.yaml
@@ -1,3 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 apiVersion: nvidia.com/v1alpha1
 kind: DynamoGraphDeployment
 metadata:
@@ -41,11 +55,11 @@ spec:
         requests:
           cpu: "20"
           memory: "40Gi"
-          gpu: "2"
+          nvidia.com/gpu: "2"
         limits:
           cpu: "20"
           memory: "40Gi"
-          gpu: "2"
+          nvidia.com/gpu: "2"
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
@@ -69,11 +83,11 @@ spec:
         requests:
           cpu: "20"
           memory: "40Gi"
-          gpu: "2"
+          nvidia.com/gpu: "2"
         limits:
           cpu: "20"
           memory: "40Gi"
-          gpu: "2"
+          nvidia.com/gpu: "2"
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1

From f7a1f53449f4f8383b33025d82629e94e0a28239 Mon Sep 17 00:00:00 2001
From: Hannah Zhang <hannahz@nvidia.com>
Date: Mon, 7 Jul 2025 12:54:57 -0700
Subject: [PATCH 12/14] fix: disagg_planner crd serve

---
 examples/vllm_v0/deploy/disagg_planner.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/vllm_v0/deploy/disagg_planner.yaml b/examples/vllm_v0/deploy/disagg_planner.yaml
index de6ddf7029..8386efc91a 100644
--- a/examples/vllm_v0/deploy/disagg_planner.yaml
+++ b/examples/vllm_v0/deploy/disagg_planner.yaml
@@ -67,7 +67,7 @@ spec:
           args:
             - dynamo
             - serve
-            - graphs.disagg_planner:Frontend
+            - graphs.disagg_planner:VllmWorker
             - --system-app-port
             - "5000"
             - --enable-system-app
@@ -95,7 +95,7 @@ spec:
           args:
             - dynamo
             - serve
-            - graphs.disagg_planner:Frontend
+            - graphs.disagg_planner:PrefillWorker
             - --system-app-port
             - "5000"
             - --enable-system-app
@@ -147,7 +147,7 @@ spec:
           args:
             - dynamo
             - serve
-            - graphs.disagg_planner:Frontend
+            - graphs.disagg_planner:Prometheus
             - --system-app-port
             - "5000"
             - --enable-system-app

From cce31485e2db099b4fd5e3d704cc70ece17cc77d Mon Sep 17 00:00:00 2001
From: Hannah Zhang <hannahz@nvidia.com>
Date: Mon, 7 Jul 2025 15:56:32 -0700
Subject: [PATCH 13/14] feat: use latest instead of hardcoding vllm runtime
 image

---
 examples/llm/deploy/agg.yaml                |  6 +++---
 examples/llm/deploy/agg_router.yaml         |  8 ++++----
 examples/llm/deploy/disagg.yaml             |  8 ++++----
 examples/llm/deploy/disagg_router.yaml      | 10 +++++-----
 examples/vllm_v0/deploy/agg.yaml            |  4 ++--
 examples/vllm_v0/deploy/disagg.yaml         |  6 +++---
 examples/vllm_v0/deploy/disagg_planner.yaml | 10 +++++-----
 7 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/examples/llm/deploy/agg.yaml b/examples/llm/deploy/agg.yaml
index 242c0e470f..c277e974c6 100644
--- a/examples/llm/deploy/agg.yaml
+++ b/examples/llm/deploy/agg.yaml
@@ -34,7 +34,7 @@ spec:
           memory: "2Gi"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
           workingDir: /workspace/examples/llm
           args:
             - dynamo
@@ -59,7 +59,7 @@ spec:
           memory: "2Gi"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
           workingDir: /workspace/examples/llm
           args:
             - dynamo
@@ -86,7 +86,7 @@ spec:
           nvidia.com/gpu: "1"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
           workingDir: /workspace/examples/llm
           args:
             - dynamo
diff --git a/examples/llm/deploy/agg_router.yaml b/examples/llm/deploy/agg_router.yaml
index 94218ddb53..fa40fe2e31 100644
--- a/examples/llm/deploy/agg_router.yaml
+++ b/examples/llm/deploy/agg_router.yaml
@@ -34,7 +34,7 @@ spec:
           memory: "2Gi"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
           workingDir: /workspace/examples/llm
           args:
             - dynamo
@@ -59,7 +59,7 @@ spec:
           memory: "2Gi"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
           workingDir: /workspace/examples/llm
           args:
             - dynamo
@@ -84,7 +84,7 @@ spec:
           memory: "2Gi"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
           workingDir: /workspace/examples/llm
           args:
             - dynamo
@@ -111,7 +111,7 @@ spec:
           nvidia.com/gpu: "1"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
           workingDir: /workspace/examples/llm
           args:
             - dynamo
diff --git a/examples/llm/deploy/disagg.yaml b/examples/llm/deploy/disagg.yaml
index 87a76b7b32..d64089f5a2 100644
--- a/examples/llm/deploy/disagg.yaml
+++ b/examples/llm/deploy/disagg.yaml
@@ -34,7 +34,7 @@ spec:
           memory: "2Gi"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
           workingDir: /workspace/examples/llm
           args:
             - dynamo
@@ -59,7 +59,7 @@ spec:
           memory: "2Gi"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
           workingDir: /workspace/examples/llm
           args:
             - dynamo
@@ -86,7 +86,7 @@ spec:
           nvidia.com/gpu: "1"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
           workingDir: /workspace/examples/llm
           args:
             - dynamo
@@ -113,7 +113,7 @@ spec:
           nvidia.com/gpu: "1"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
           workingDir: /workspace/examples/llm
           args:
             - dynamo
diff --git a/examples/llm/deploy/disagg_router.yaml b/examples/llm/deploy/disagg_router.yaml
index c91b36aba5..152d09f7ed 100644
--- a/examples/llm/deploy/disagg_router.yaml
+++ b/examples/llm/deploy/disagg_router.yaml
@@ -34,7 +34,7 @@ spec:
           memory: "2Gi"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
           workingDir: /workspace/examples/llm
           args:
             - dynamo
@@ -59,7 +59,7 @@ spec:
           memory: "2Gi"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
           workingDir: /workspace/examples/llm
           args:
             - dynamo
@@ -84,7 +84,7 @@ spec:
           memory: "2Gi"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
           workingDir: /workspace/examples/llm
           args:
             - dynamo
@@ -111,7 +111,7 @@ spec:
           nvidia.com/gpu: "1"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
           workingDir: /workspace/examples/llm
           args:
             - dynamo
@@ -138,7 +138,7 @@ spec:
           nvidia.com/gpu: "1"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
           workingDir: /workspace/examples/llm
           args:
             - dynamo
diff --git a/examples/vllm_v0/deploy/agg.yaml b/examples/vllm_v0/deploy/agg.yaml
index 6c6b04e1ff..45af6f3cee 100644
--- a/examples/vllm_v0/deploy/agg.yaml
+++ b/examples/vllm_v0/deploy/agg.yaml
@@ -34,7 +34,7 @@ spec:
           memory: "2Gi"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
           workingDir: /workspace/examples/vllm_v0
           args:
             - dynamo
@@ -61,7 +61,7 @@ spec:
           nvidia.com/gpu: "1"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
           workingDir: /workspace/examples/vllm_v0
           args:
             - dynamo
diff --git a/examples/vllm_v0/deploy/disagg.yaml b/examples/vllm_v0/deploy/disagg.yaml
index 22712e2527..c4f41342e8 100644
--- a/examples/vllm_v0/deploy/disagg.yaml
+++ b/examples/vllm_v0/deploy/disagg.yaml
@@ -34,7 +34,7 @@ spec:
           memory: "2Gi"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
           workingDir: /workspace/examples/vllm_v0
           args:
             - dynamo
@@ -61,7 +61,7 @@ spec:
           nvidia.com/gpu: "1"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
           workingDir: /workspace/examples/vllm_v0
           args:
             - dynamo
@@ -88,7 +88,7 @@ spec:
           nvidia.com/gpu: "1"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
           workingDir: /workspace/examples/vllm_v0
           args:
             - dynamo
diff --git a/examples/vllm_v0/deploy/disagg_planner.yaml b/examples/vllm_v0/deploy/disagg_planner.yaml
index 8386efc91a..082e3cb9bc 100644
--- a/examples/vllm_v0/deploy/disagg_planner.yaml
+++ b/examples/vllm_v0/deploy/disagg_planner.yaml
@@ -34,7 +34,7 @@ spec:
           memory: "4Gi"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
           workingDir: /workspace/examples/vllm_v0
           args:
             - dynamo
@@ -62,7 +62,7 @@ spec:
           nvidia.com/gpu: "2"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
           workingDir: /workspace/examples/vllm_v0
           args:
             - dynamo
@@ -90,7 +90,7 @@ spec:
           nvidia.com/gpu: "2"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
           workingDir: /workspace/examples/vllm_v0
           args:
             - dynamo
@@ -116,7 +116,7 @@ spec:
           memory: "2Gi"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
           workingDir: /workspace/examples/vllm_v0
           args:
             - dynamo
@@ -142,7 +142,7 @@ spec:
           memory: "1000Mi"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
           workingDir: /workspace/examples/vllm_v0
           args:
             - dynamo

From 0fd6d1b7447a84b98e1f5375937622d4fd632ad6 Mon Sep 17 00:00:00 2001
From: Hannah Zhang <hannahz@nvidia.com>
Date: Mon, 7 Jul 2025 16:04:35 -0700
Subject: [PATCH 14/14] feat: add crds for vllm v1 examples to prepare for
 0.3.2 release

---
 examples/vllm_v1/deploy/agg.yaml            | 100 +++++++++++
 examples/vllm_v1/deploy/disagg.yaml         | 127 ++++++++++++++
 examples/vllm_v1/deploy/disagg_planner.yaml | 182 ++++++++++++++++++++
 3 files changed, 409 insertions(+)
 create mode 100644 examples/vllm_v1/deploy/agg.yaml
 create mode 100644 examples/vllm_v1/deploy/disagg.yaml
 create mode 100644 examples/vllm_v1/deploy/disagg_planner.yaml

diff --git a/examples/vllm_v1/deploy/agg.yaml b/examples/vllm_v1/deploy/agg.yaml
new file mode 100644
index 0000000000..08dd5e22fc
--- /dev/null
+++ b/examples/vllm_v1/deploy/agg.yaml
@@ -0,0 +1,100 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment  # aggregated vLLM v1 example: Frontend, SimpleLoadBalancer and one VllmDecodeWorker
+metadata:
+  name: agg
+spec:
+  envs:  # declared at spec level, outside any single service; presumably applied to all services — confirm
+    - name: DYN_DEPLOYMENT_CONFIG  # value is one single-quoted JSON document with sections Common, Frontend, VllmWorker
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len"]}}'
+  services:  # NOTE(review): services are Frontend/SimpleLoadBalancer/VllmDecodeWorker in namespace vllm-v1-agg, but the JSON has a "VllmWorker" section and endpoint "dynamo.VllmWorker.generate" — confirm the names line up
+    Frontend:  # started below as graphs.agg:Frontend
+      dynamoNamespace: vllm-v1-agg  # same value on every service in this manifest
+      componentType: main  # the only service in this manifest that sets componentType
+      replicas: 1
+      resources:  # requests and limits are set to identical values
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest  # floating tag, not pinned to a release
+          workingDir: /workspace/examples/vllm_v1  # presumably where the graphs.agg module is resolved from — confirm
+          args:
+            - dynamo
+            - serve
+            - graphs.agg:Frontend
+            - --system-app-port  # separate from "port":8000 in the Frontend section of DYN_DEPLOYMENT_CONFIG
+            - "5000"  # quoted so YAML keeps the port as a string argument
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend  # matches this service's key under spec.services
+    SimpleLoadBalancer:  # started below as graphs.agg:SimpleLoadBalancer
+      envFromSecret: hf-token-secret  # secret name; presumably carries a Hugging Face token — confirm
+      dynamoNamespace: vllm-v1-agg  # same value on every service in this manifest
+      replicas: 1
+      resources:  # requests and limits are set to identical values
+        requests:
+          cpu: "1"
+          memory: "20Gi"  # NOTE(review): same memory as the GPU worker although no GPU is requested here — confirm sizing
+        limits:
+          cpu: "1"
+          memory: "20Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest  # floating tag, not pinned to a release
+          workingDir: /workspace/examples/vllm_v1
+          args:
+            - dynamo
+            - serve
+            - graphs.agg:SimpleLoadBalancer
+            - --system-app-port
+            - "5000"  # quoted so YAML keeps the port as a string argument
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - SimpleLoadBalancer  # matches this service's key under spec.services
+    VllmDecodeWorker:  # started below as graphs.agg:VllmDecodeWorker; the only service here that requests a GPU
+      envFromSecret: hf-token-secret  # secret name; presumably carries a Hugging Face token — confirm
+      dynamoNamespace: vllm-v1-agg  # same value on every service in this manifest
+      replicas: 1
+      resources:  # requests and limits are set to identical values
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"  # GPU count is written as a quoted string
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest  # floating tag, not pinned to a release
+          workingDir: /workspace/examples/vllm_v1
+          args:
+            - dynamo
+            - serve
+            - graphs.agg:VllmDecodeWorker
+            - --system-app-port
+            - "5000"  # quoted so YAML keeps the port as a string argument
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmDecodeWorker  # matches this service's key under spec.services
diff --git a/examples/vllm_v1/deploy/disagg.yaml b/examples/vllm_v1/deploy/disagg.yaml
new file mode 100644
index 0000000000..a85459f07d
--- /dev/null
+++ b/examples/vllm_v1/deploy/disagg.yaml
@@ -0,0 +1,127 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment  # disaggregated vLLM v1 example: Frontend, SimpleLoadBalancer, decode worker and prefill worker
+metadata:
+  name: disagg
+spec:
+  envs:  # declared at spec level, outside any single service; presumably applied to all services — confirm
+    - name: DYN_DEPLOYMENT_CONFIG  # single-quoted JSON (sections Common, Frontend, VllmWorker, PrefillWorker); the \" pairs stay literal in YAML, so kv-transfer-config is a JSON string holding nested JSON
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]}}'
+  services:  # NOTE(review): workers are named VllmDecodeWorker/VllmPrefillWorker in namespace vllm-v1-disagg, but the JSON sections are "VllmWorker"/"PrefillWorker" and the endpoint is "dynamo.VllmWorker.generate" — confirm the names line up
+    Frontend:  # started below as graphs.disagg:Frontend
+      dynamoNamespace: vllm-v1-disagg  # same value on every service in this manifest
+      componentType: main  # the only service in this manifest that sets componentType
+      replicas: 1
+      resources:  # requests and limits are set to identical values
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest  # floating tag, not pinned to a release
+          workingDir: /workspace/examples/vllm_v1  # presumably where the graphs.disagg module is resolved from — confirm
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg:Frontend
+            - --system-app-port  # separate from "port":8000 in the Frontend section of DYN_DEPLOYMENT_CONFIG
+            - "5000"  # quoted so YAML keeps the port as a string argument
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend  # matches this service's key under spec.services
+    SimpleLoadBalancer:  # started below as graphs.disagg:SimpleLoadBalancer
+      envFromSecret: hf-token-secret  # secret name; presumably carries a Hugging Face token — confirm
+      dynamoNamespace: vllm-v1-disagg  # same value on every service in this manifest
+      replicas: 1
+      resources:  # requests and limits are set to identical values
+        requests:
+          cpu: "1"
+          memory: "20Gi"  # NOTE(review): same memory as each GPU worker although no GPU is requested here — confirm sizing
+        limits:
+          cpu: "1"
+          memory: "20Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest  # floating tag, not pinned to a release
+          workingDir: /workspace/examples/vllm_v1
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg:SimpleLoadBalancer
+            - --system-app-port
+            - "5000"  # quoted so YAML keeps the port as a string argument
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - SimpleLoadBalancer  # matches this service's key under spec.services
+    VllmDecodeWorker:  # started below as graphs.disagg:VllmDecodeWorker
+      dynamoNamespace: vllm-v1-disagg  # same value on every service in this manifest
+      envFromSecret: hf-token-secret  # secret name; presumably carries a Hugging Face token — confirm
+      replicas: 1
+      resources:  # requests and limits are set to identical values; same sizing as VllmPrefillWorker
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"  # GPU count is written as a quoted string
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest  # floating tag, not pinned to a release
+          workingDir: /workspace/examples/vllm_v1
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg:VllmDecodeWorker
+            - --system-app-port
+            - "5000"  # quoted so YAML keeps the port as a string argument
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmDecodeWorker  # matches this service's key under spec.services
+    VllmPrefillWorker:  # started below as graphs.disagg:VllmPrefillWorker
+      dynamoNamespace: vllm-v1-disagg  # same value on every service in this manifest
+      envFromSecret: hf-token-secret  # secret name; presumably carries a Hugging Face token — confirm
+      replicas: 1
+      resources:  # requests and limits are set to identical values; same sizing as VllmDecodeWorker
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"  # GPU count is written as a quoted string
+        limits:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest  # floating tag, not pinned to a release
+          workingDir: /workspace/examples/vllm_v1
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg:VllmPrefillWorker
+            - --system-app-port
+            - "5000"  # quoted so YAML keeps the port as a string argument
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmPrefillWorker  # matches this service's key under spec.services
diff --git a/examples/vllm_v1/deploy/disagg_planner.yaml b/examples/vllm_v1/deploy/disagg_planner.yaml
new file mode 100644
index 0000000000..bf73f35e44
--- /dev/null
+++ b/examples/vllm_v1/deploy/disagg_planner.yaml
@@ -0,0 +1,182 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment  # disaggregated vLLM v1 example plus Planner and Prometheus services
+metadata:
+  name: disagg-planner
+spec:
+  envs:  # declared at spec level, outside any single service; presumably applied to all services — confirm
+    - name: DYN_DEPLOYMENT_CONFIG  # single-quoted JSON (sections Common, Frontend, VllmWorker, PrefillWorker, Prometheus, Planner); the \" pairs stay literal in YAML, so kv-transfer-config is a JSON string holding nested JSON
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["localhost:8000"]}]}]},"Planner":{"adjustment-interval":180,"profile-results-dir":"/workspace/examples/profiling_results","isl":3000,"osl":150,"ttft":0.5,"itl":0.05,"load-predictor":"arima"}}'
+  services:  # NOTE(review): workers are named VllmDecodeWorker/VllmPrefillWorker but the JSON sections are "VllmWorker"/"PrefillWorker"; the Prometheus "frontend" job scrapes localhost:8000 while Frontend is a separate service — confirm both
+    Frontend:  # started below as graphs.disagg_planner:Frontend
+      dynamoNamespace: vllm-v1-disagg-planner  # same value on every service in this manifest
+      componentType: main  # Planner is the only other service here that sets componentType
+      replicas: 1
+      resources:  # requests and limits are set to identical values
+        requests:
+          cpu: "2"
+          memory: "4Gi"
+        limits:
+          cpu: "2"
+          memory: "4Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest  # floating tag, not pinned to a release
+          workingDir: /workspace/examples/vllm_v1  # presumably where the graphs.disagg_planner module is resolved from — confirm
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_planner:Frontend
+            - --system-app-port  # separate from "port":8000 in the Frontend section of DYN_DEPLOYMENT_CONFIG
+            - "5000"  # quoted so YAML keeps the port as a string argument
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend  # matches this service's key under spec.services
+
+    SimpleLoadBalancer:  # started below as graphs.disagg_planner:SimpleLoadBalancer
+      envFromSecret: hf-token-secret  # secret name; presumably carries a Hugging Face token — confirm
+      dynamoNamespace: vllm-v1-disagg-planner  # same value on every service in this manifest
+      replicas: 1
+      resources:  # requests and limits are set to identical values
+        requests:
+          cpu: "1"
+          memory: "20Gi"  # NOTE(review): 20Gi for a 1-CPU service that requests no GPU — confirm sizing
+        limits:
+          cpu: "1"
+          memory: "20Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest  # floating tag, not pinned to a release
+          workingDir: /workspace/examples/vllm_v1
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_planner:SimpleLoadBalancer
+            - --system-app-port
+            - "5000"  # quoted so YAML keeps the port as a string argument
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - SimpleLoadBalancer  # matches this service's key under spec.services
+
+    VllmDecodeWorker:  # started below as graphs.disagg_planner:VllmDecodeWorker
+      dynamoNamespace: vllm-v1-disagg-planner  # same value on every service in this manifest
+      envFromSecret: hf-token-secret  # secret name; presumably carries a Hugging Face token — confirm
+      replicas: 1
+      resources:  # requests and limits are set to identical values; same sizing as VllmPrefillWorker
+        requests:
+          cpu: "20"
+          memory: "40Gi"
+          nvidia.com/gpu: "2"  # two GPUs; count is written as a quoted string
+        limits:
+          cpu: "20"
+          memory: "40Gi"
+          nvidia.com/gpu: "2"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest  # floating tag, not pinned to a release
+          workingDir: /workspace/examples/vllm_v1
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_planner:VllmDecodeWorker
+            - --system-app-port
+            - "5000"  # quoted so YAML keeps the port as a string argument
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmDecodeWorker  # matches this service's key under spec.services
+
+    VllmPrefillWorker:  # started below as graphs.disagg_planner:VllmPrefillWorker
+      dynamoNamespace: vllm-v1-disagg-planner  # same value on every service in this manifest
+      envFromSecret: hf-token-secret  # secret name; presumably carries a Hugging Face token — confirm
+      replicas: 1
+      resources:  # requests and limits are set to identical values; same sizing as VllmDecodeWorker
+        requests:
+          cpu: "20"
+          memory: "40Gi"
+          nvidia.com/gpu: "2"  # two GPUs; count is written as a quoted string
+        limits:
+          cpu: "20"
+          memory: "40Gi"
+          nvidia.com/gpu: "2"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest  # floating tag, not pinned to a release
+          workingDir: /workspace/examples/vllm_v1
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_planner:VllmPrefillWorker
+            - --system-app-port
+            - "5000"  # quoted so YAML keeps the port as a string argument
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmPrefillWorker  # matches this service's key under spec.services
+
+    Planner:  # started below as graphs.disagg_planner:Planner; settings come from the "Planner" section of DYN_DEPLOYMENT_CONFIG
+      dynamoNamespace: vllm-v1-disagg-planner  # same value on every service in this manifest
+      replicas: 1
+      componentType: planner  # Frontend is the only other service here that sets componentType
+      resources:  # requests and limits are set to identical values
+        requests:
+          cpu: "2"
+          memory: "2Gi"
+        limits:
+          cpu: "2"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest  # floating tag, not pinned to a release
+          workingDir: /workspace/examples/vllm_v1
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_planner:Planner
+            - --system-app-port
+            - "5000"  # quoted so YAML keeps the port as a string argument
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Planner  # matches this service's key under spec.services
+            - --Planner.environment=kubernetes  # extra option passed only by this service
+
+    Prometheus:  # started below as graphs.disagg_planner:Prometheus; scrape settings live in the "Prometheus" section of DYN_DEPLOYMENT_CONFIG
+      dynamoNamespace: vllm-v1-disagg-planner  # same value on every service in this manifest
+      replicas: 1
+      resources:  # requests and limits are set to identical values
+        requests:
+          cpu: "1000m"  # milli-CPU notation, unlike the whole-core values on the other services
+          memory: "1000Mi"  # Mi notation, unlike the Gi values on the other services
+        limits:
+          cpu: "1000m"
+          memory: "1000Mi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest  # same vllm-runtime image as the other services; floating tag, not pinned to a release
+          workingDir: /workspace/examples/vllm_v1
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_planner:Prometheus
+            - --system-app-port
+            - "5000"  # quoted so YAML keeps the port as a string argument
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Prometheus  # matches this service's key under spec.services