From 065cb2ad0e0f7065417b9917192b5ca44df4dfb2 Mon Sep 17 00:00:00 2001 From: Hannah Zhang Date: Fri, 11 Jul 2025 09:28:04 -0700 Subject: [PATCH 01/58] feat: update k8s deploy yamls to use binary/python3 --- examples/vllm_v1/deploy/agg.yaml | 60 +++------------ examples/vllm_v1/deploy/agg_router.yaml | 63 ++++++++++++++++ examples/vllm_v1/deploy/disagg.yaml | 79 +++++--------------- examples/vllm_v1/deploy/disagg_router.yaml | 87 ++++++++++++++++++++++ 4 files changed, 177 insertions(+), 112 deletions(-) create mode 100644 examples/vllm_v1/deploy/agg_router.yaml create mode 100644 examples/vllm_v1/deploy/disagg_router.yaml diff --git a/examples/vllm_v1/deploy/agg.yaml b/examples/vllm_v1/deploy/agg.yaml index 50b1ed8d22b..c3fda4060de 100644 --- a/examples/vllm_v1/deploy/agg.yaml +++ b/examples/vllm_v1/deploy/agg.yaml @@ -31,47 +31,13 @@ spec: memory: "2Gi" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 workingDir: /workspace/examples/vllm_v1 args: - dynamo - - serve - - graphs.agg:Frontend - - --system-app-port - - "5000" - - --enable-system-app - - --use-default-health-checks - - --service-name - - Frontend - - -f - - ./configs/agg.yaml - SimpleLoadBalancer: - envFromSecret: hf-token-secret - dynamoNamespace: vllm-v1-agg - replicas: 1 - resources: - requests: - cpu: "1" - memory: "20Gi" - limits: - cpu: "1" - memory: "20Gi" - extraPodSpec: - mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 - workingDir: /workspace/examples/vllm_v1 - args: - - dynamo - - serve - - graphs.agg:SimpleLoadBalancer - - --system-app-port - - "5000" - - --enable-system-app - - --use-default-health-checks - - --service-name - - SimpleLoadBalancer - - -f - - ./configs/agg.yaml + - run + - in=http + - out=dyn VllmDecodeWorker: envFromSecret: hf-token-secret dynamoNamespace: vllm-v1-agg @@ -87,17 +53,11 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 workingDir: /workspace/examples/vllm_v1 args: - - dynamo - - serve - - graphs.agg:VllmDecodeWorker - - --system-app-port - - "5000" - - --enable-system-app - - --use-default-health-checks - - --service-name - - VllmDecodeWorker - - -f - - ./configs/agg.yaml + - python3 + - components/main.py + - --model + - Qwen/Qwen3-0.6B + - --enforce-eager diff --git a/examples/vllm_v1/deploy/agg_router.yaml b/examples/vllm_v1/deploy/agg_router.yaml new file mode 100644 index 00000000000..2e1117fb398 --- /dev/null +++ b/examples/vllm_v1/deploy/agg_router.yaml @@ -0,0 +1,63 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: agg +spec: + services: + Frontend: + dynamoNamespace: vllm-v1-agg + componentType: main + replicas: 1 + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + cpu: "1" + memory: "2Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 + workingDir: /workspace/examples/vllm_v1 + args: + - dynamo + - run + - in=http + - out=dyn + VllmDecodeWorker: + envFromSecret: hf-token-secret + dynamoNamespace: vllm-v1-agg + replicas: 2 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 + workingDir: /workspace/examples/vllm_v1 + args: + - python3 + - components/main.py + - --model + - Qwen/Qwen3-0.6B + - --enforce-eager diff --git a/examples/vllm_v1/deploy/disagg.yaml b/examples/vllm_v1/deploy/disagg.yaml index 67377558d2b..6c93c7f49c9 100644 --- a/examples/vllm_v1/deploy/disagg.yaml +++ b/examples/vllm_v1/deploy/disagg.yaml @@ -31,47 +31,13 @@ spec: memory: "2Gi" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 workingDir: /workspace/examples/vllm_v1 args: - dynamo - - serve - - graphs.disagg:Frontend - - --system-app-port - - "5000" - - --enable-system-app - - --use-default-health-checks - - --service-name - - Frontend - - -f - - ./configs/disagg.yaml - SimpleLoadBalancer: - envFromSecret: hf-token-secret - dynamoNamespace: vllm-v1-disagg - replicas: 1 - resources: - requests: - cpu: "1" - memory: "20Gi" - limits: - cpu: "1" - memory: "20Gi" - extraPodSpec: - mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 - workingDir: /workspace/examples/vllm_v1 - args: - - dynamo - - serve - - graphs.disagg:SimpleLoadBalancer - - --system-app-port - - "5000" - - --enable-system-app - - --use-default-health-checks - - --service-name - - SimpleLoadBalancer - - -f - - ./configs/disagg.yaml + - run + - in=http + - out=dyn VllmDecodeWorker: dynamoNamespace: vllm-v1-disagg envFromSecret: hf-token-secret @@ -87,20 +53,14 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 workingDir: /workspace/examples/vllm_v1 args: - - dynamo - - serve - - graphs.disagg:VllmDecodeWorker - - --system-app-port - - "5000" - - --enable-system-app - - --use-default-health-checks - - --service-name - - VllmDecodeWorker - - -f - - ./configs/disagg.yaml + - python3 + - components/main.py + - --model + - Qwen/Qwen3-0.6B + - --enforce-eager VllmPrefillWorker: dynamoNamespace: vllm-v1-disagg envFromSecret: hf-token-secret @@ -116,17 +76,12 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.1 + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 workingDir: /workspace/examples/vllm_v1 args: - - dynamo - - serve - - graphs.disagg:VllmPrefillWorker - - --system-app-port - - "5000" - - --enable-system-app - - --use-default-health-checks - - --service-name - - VllmPrefillWorker - - -f - - ./configs/disagg.yaml + - python3 + - components/main.py + - --model + - Qwen/Qwen3-0.6B + - --enforce-eager + - --is-prefill-worker diff --git a/examples/vllm_v1/deploy/disagg_router.yaml b/examples/vllm_v1/deploy/disagg_router.yaml new file mode 100644 index 00000000000..09c55997df9 --- /dev/null +++ b/examples/vllm_v1/deploy/disagg_router.yaml @@ 
-0,0 +1,87 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +apiVersion: nvidia.com/v1alpha1 +kind: DynamoGraphDeployment +metadata: + name: disagg +spec: + services: + Frontend: + dynamoNamespace: vllm-v1-disagg + componentType: main + replicas: 1 + resources: + requests: + cpu: "1" + memory: "2Gi" + limits: + cpu: "1" + memory: "2Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 + workingDir: /workspace/examples/vllm_v1 + args: + - dynamo + - run + - in=http + - out=dyn + VllmDecodeWorker: + dynamoNamespace: vllm-v1-disagg + envFromSecret: hf-token-secret + replicas: 2 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 + workingDir: /workspace/examples/vllm_v1 + args: + - python3 + - components/main.py + - --model + - Qwen/Qwen3-0.6B + - --enforce-eager + VllmPrefillWorker: + dynamoNamespace: vllm-v1-disagg + envFromSecret: hf-token-secret + replicas: 1 + resources: + requests: + cpu: "10" + memory: "20Gi" + gpu: "1" + limits: + cpu: "10" + memory: "20Gi" + gpu: "1" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 + workingDir: /workspace/examples/vllm_v1 + args: + - python3 + - components/main.py + - --model + - Qwen/Qwen3-0.6B + - --enforce-eager + - --is-prefill-worker From aee478c0d42811d0025961de7d222780a6f988a6 Mon Sep 17 00:00:00 2001 From: hongkuan Date: Fri, 11 Jul 2025 15:09:55 -0700 Subject: [PATCH 02/58] config part working --- benchmarks/profiler/profile_sla.py | 29 +++- benchmarks/profiler/utils/config.py | 213 +++++++------------------- benchmarks/profiler/utils/defaults.py | 3 + 3 files changed, 78 insertions(+), 167 deletions(-) diff --git a/benchmarks/profiler/profile_sla.py b/benchmarks/profiler/profile_sla.py index 8bf0ad81611..7d1f632ce47 100644 --- a/benchmarks/profiler/profile_sla.py +++ b/benchmarks/profiler/profile_sla.py @@ -52,12 +52,12 @@ parser.add_argument( "--backend", type=str, - default="vllm_v0", - choices=["vllm_v0", "vllm_v1"], - help="backend type (currently only vllm is supported)", + default="vllm_v1", + choices=["vllm_v1"], + help="backend type, currently support [vllm_v1]", ) parser.add_argument( - "--config", type=str, required=True, help="Path to the dynamo config file" + "--config", type=str, required=True, help="Path to the DynamoGraphDeployment config file" ) parser.add_argument( "--example-dir", @@ -71,6 +71,18 @@ default="profiling_results", help="Path to the output results directory", ) + parser.add_argument( + "--min-num-gpus-per-engine", + type=int, + default=1, + help="minimum number of GPUs per engine", + ) + parser.add_argument( + "--max-num-gpus-per-engine", + type=int, + default=8, + help="maximum number of GPUs per engine", + ) parser.add_argument( "--isl", type=int, 
default=3000, help="target input sequence length" ) @@ -121,10 +133,11 @@ with open(args.config, "r") as f: config = yaml.safe_load(f) - # Get the number of available GPUs - available_gpus = get_available_gpu_count() - - profile_tp_size = [2**i for i in range(int(math.log2(available_gpus)) + 1)] + profile_tp_size = [ + 2**i + for i in range(int(math.log2(args.max_num_gpus_per_engine)) + 1) + if args.min_num_gpus_per_engine <= 2**i <= args.max_num_gpus_per_engine + ] logger.info(f"Profiling TP sizes: {profile_tp_size}") os.makedirs(args.output_dir, exist_ok=True) diff --git a/benchmarks/profiler/utils/config.py b/benchmarks/profiler/utils/config.py index 0703cc64b72..2cdb8db2e70 100644 --- a/benchmarks/profiler/utils/config.py +++ b/benchmarks/profiler/utils/config.py @@ -14,8 +14,10 @@ # limitations under the License. import logging +from copy import deepcopy from typing import Literal +from utils.defaults import DEFAULT_MODEL_NAME, DYNAMO_RUN_DEFAULT_PORT from dynamo.planner.defaults import WORKER_COMPONENT_NAMES logger = logging.getLogger(__name__) @@ -28,190 +30,84 @@ console_handler.setFormatter(formatter) logger.addHandler(console_handler) - -class VllmV0ConfigModifier: +class VllmV1ConfigModifier: @classmethod def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> dict: - config = config.copy() + config = deepcopy(config) # disable planner - if "Planner" in config: - config["Planner"]["no-operation"] = True + if "Planner" in config["spec"]["services"]: + del config["spec"]["services"]["Planner"] if target == "prefill": - if WORKER_COMPONENT_NAMES["vllm_v0"].prefill_worker in config: - # make PrefillWorker into VllmWorker - del config[WORKER_COMPONENT_NAMES["vllm_v0"].decode_worker] - config[WORKER_COMPONENT_NAMES["vllm_v0"].decode_worker] = config[ - WORKER_COMPONENT_NAMES["vllm_v0"].prefill_worker - ] - del config[WORKER_COMPONENT_NAMES["vllm_v0"].prefill_worker] - - # to profile prefill, we disable prefix caching - config[WORKER_COMPONENT_NAMES["vllm_v0"].decode_worker][ - "enable-prefix-caching" - ] = False - elif target == "decode": - if WORKER_COMPONENT_NAMES["vllm_v0"].prefill_worker in config: - del config[WORKER_COMPONENT_NAMES["vllm_v0"].prefill_worker] + # convert prefill worker into decode worker + config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker] = config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker] + del config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker] - # to profile prefill, we enable prefix caching to pass the prefill stage - config[WORKER_COMPONENT_NAMES["vllm_v0"].decode_worker][ - "enable-prefix-caching" - ] = True + args = config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker]["extraPodSpec"]["mainContainer"]["args"] - # set num workers to 1 - config[WORKER_COMPONENT_NAMES["vllm_v0"].decode_worker]["ServiceArgs"][ - "workers" - ] = 1 - - # set PP to 1 - if ( - "pipeline-parallel-size" - in config[WORKER_COMPONENT_NAMES["vllm_v0"].decode_worker] - and config[WORKER_COMPONENT_NAMES["vllm_v0"].decode_worker][ - "pipeline-parallel-size" - ] - > 1 - ): - logger.warning("Currently we only support TP, setting PP to 1") - config[WORKER_COMPONENT_NAMES["vllm_v0"].decode_worker][ - "pipeline-parallel-size" - ] = 1 - - # always local prefill - config[WORKER_COMPONENT_NAMES["vllm_v0"].decode_worker][ - "remote-prefill" - ] = False - config[WORKER_COMPONENT_NAMES["vllm_v0"].decode_worker][ - "conditional-disagg" - ] = False + # remove --is-prefill-worker 
flag + args.remove("--is-prefill-worker") - return config + # disable prefix caching + if "--enable-prefix-caching" in args: + args.remove("--enable-prefix-caching") + if "--no-enable-prefix-caching" not in args: + args.append("--no-enable-prefix-caching") - @classmethod - def set_config_tp_size(cls, config: dict, tp_size: int): - config[WORKER_COMPONENT_NAMES["vllm_v0"].decode_worker][ - "tensor-parallel-size" - ] = tp_size - config[WORKER_COMPONENT_NAMES["vllm_v0"].decode_worker]["ServiceArgs"][ - "resources" - ]["gpu"] = tp_size - return config - - @classmethod - def get_model_name(cls, config: dict) -> str: - if "Common" in config and "served_model_name" in config["Common"]: - return config["Common"]["served_model_name"] - else: - return config["Frontend"]["served_model_name"] - - @classmethod - def get_port(cls, config: dict) -> int: - if "Common" in config and "port" in config["Common"]: - return config["Common"]["port"] - else: - return config["Frontend"]["port"] - - @classmethod - def get_kv_cache_size_from_dynamo_log(cls, dynamo_log_fn: str) -> int: - try: - with open(dynamo_log_fn, "r") as f: - for line in f: - if "Maximum concurrency for" in line: - line = line.strip().split("Maximum concurrency for ")[1] - token_count = int(line.split(" tokens per request: ")[0]) - concurrency = float(line.split(" tokens per request: ")[1][:-1]) - - logger.info( - f"Found KV cache info: {token_count} x {concurrency} = {int(token_count * concurrency)}" - ) - return int(token_count * concurrency) - except Exception as e: - logger.warning( - f"Failed to parse KV cache size from line: {line}. Error: {e}" - ) - return 0 - - -class VllmV1ConfigModifier: - @classmethod - def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> dict: - config = config.copy() - - # disable planner - if "Planner" in config: - config["Planner"]["no-operation"] = True - - # turn-off disagg - config["SimpleLoadBalancer"]["enable_disagg"] = False - - if target == "prefill": - if WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker in config: - # make VllmPrefillWorker into VllmDecodeWorker - del config[WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker] - config[WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker] = config[ - WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker - ] - del config[WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker] - - # to profile prefill, we disable prefix caching - config[WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][ - "enable-prefix-caching" - ] = False elif target == "decode": - if WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker in config: - del config[WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker] + # delete prefill worker + del config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker] - # to profile prefill, we enable prefix caching to pass the prefill stage - config[WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][ - "enable-prefix-caching" - ] = True + args = config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker]["extraPodSpec"]["mainContainer"]["args"] + + # enable prefix caching + if "--enable-prefix-caching" not in args: + args.append("--enable-prefix-caching") + if "--no-enable-prefix-caching" in args: + args.remove("--no-enable-prefix-caching") # set num workers to 1 - config[WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker]["ServiceArgs"][ - "workers" - ] = 1 - - # set PP to 1 - if ( - "pipeline-parallel-size" - in config[WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker] - and 
config[WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][ - "pipeline-parallel-size" - ] - > 1 - ): - logger.warning("Currently we only support TP, setting PP to 1") - config[WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][ - "pipeline-parallel-size" - ] = 1 + decode_worker_config = config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker] + decode_worker_config["replicas"] = 1 return config @classmethod def set_config_tp_size(cls, config: dict, tp_size: int): - config[WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][ - "tensor-parallel-size" - ] = tp_size - config[WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker]["ServiceArgs"][ - "resources" - ]["gpu"] = tp_size + config = deepcopy(config) + + args = config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker]["extraPodSpec"]["mainContainer"]["args"] + + try: + idx = args.index("--tensor-parallel-size") + args[idx + 1] = str(tp_size) + except ValueError: + args.append("--tensor-parallel-size") + args.append(str(tp_size)) + return config @classmethod def get_model_name(cls, config: dict) -> str: - if "Common" in config and "served_model_name" in config["Common"]: - return config["Common"]["served_model_name"] - else: - return config["Frontend"]["served_model_name"] + worker_name = WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker + args = config["spec"]["services"][worker_name]["extraPodSpec"]["mainContainer"]["args"] + + for i, arg in enumerate(args): + if arg == "--model" and i + 1 < len(args): + return args[i + 1] + + logger.warning(f"Model name not found in configuration args, using default model name: {DEFAULT_MODEL_NAME}") + return DEFAULT_MODEL_NAME @classmethod def get_port(cls, config: dict) -> int: - if "Common" in config and "port" in config["Common"]: - return config["Common"]["port"] - else: - return config["Frontend"]["port"] + args = config["spec"]["services"]["Frontend"]["extraPodSpec"]["mainContainer"]["args"] + for arg in args: + if arg.startswith("port="): + return int(arg.split("=")[1]) + logger.warning(f"Port not found in configuration args, using default port: {DYNAMO_RUN_DEFAULT_PORT}") + return DYNAMO_RUN_DEFAULT_PORT @classmethod def get_kv_cache_size_from_dynamo_log(cls, dynamo_log_fn: str) -> int: @@ -237,6 +133,5 @@ def get_kv_cache_size_from_dynamo_log(cls, dynamo_log_fn: str) -> int: CONFIG_MODIFIERS = { - "vllm_v0": VllmV0ConfigModifier, "vllm_v1": VllmV1ConfigModifier, } diff --git a/benchmarks/profiler/utils/defaults.py b/benchmarks/profiler/utils/defaults.py index efbdbe07af0..21fd7b43f6f 100644 --- a/benchmarks/profiler/utils/defaults.py +++ b/benchmarks/profiler/utils/defaults.py @@ -29,3 +29,6 @@ 450, 500, ] + +DEFAULT_MODEL_NAME = "Qwen/Qwen3-0.6B" +DYNAMO_RUN_DEFAULT_PORT = 8080 \ No newline at end of file From 9455ad16b2ec34ba9d0c193dfaa25b5f7349be8c Mon Sep 17 00:00:00 2001 From: Hannah Zhang Date: Fri, 11 Jul 2025 23:32:02 -0700 Subject: [PATCH 03/58] feat: add component type worker and bump image --- examples/vllm_v1/deploy/agg.yaml | 7 ++++--- examples/vllm_v1/deploy/agg_router.yaml | 7 ++++--- examples/vllm_v1/deploy/disagg.yaml | 2 ++ examples/vllm_v1/deploy/disagg_router.yaml | 2 ++ 4 files changed, 12 insertions(+), 6 deletions(-) diff --git a/examples/vllm_v1/deploy/agg.yaml b/examples/vllm_v1/deploy/agg.yaml index c3fda4060de..c1305c98aec 100644 --- a/examples/vllm_v1/deploy/agg.yaml +++ b/examples/vllm_v1/deploy/agg.yaml @@ -15,7 +15,7 @@ apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeployment metadata: - name: agg + name: vllm-v1-agg spec: services: 
Frontend: @@ -31,7 +31,7 @@ spec: memory: "2Gi" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 + image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.3 workingDir: /workspace/examples/vllm_v1 args: - dynamo @@ -41,6 +41,7 @@ spec: VllmDecodeWorker: envFromSecret: hf-token-secret dynamoNamespace: vllm-v1-agg + componentType: worker replicas: 1 resources: requests: @@ -53,7 +54,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 + image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.3 workingDir: /workspace/examples/vllm_v1 args: - python3 diff --git a/examples/vllm_v1/deploy/agg_router.yaml b/examples/vllm_v1/deploy/agg_router.yaml index 2e1117fb398..76f81716865 100644 --- a/examples/vllm_v1/deploy/agg_router.yaml +++ b/examples/vllm_v1/deploy/agg_router.yaml @@ -15,7 +15,7 @@ apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeployment metadata: - name: agg + name: vllm-v1-agg-router spec: services: Frontend: @@ -31,7 +31,7 @@ spec: memory: "2Gi" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 + image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.0 workingDir: /workspace/examples/vllm_v1 args: - dynamo @@ -41,6 +41,7 @@ spec: VllmDecodeWorker: envFromSecret: hf-token-secret dynamoNamespace: vllm-v1-agg + componentType: worker replicas: 2 resources: requests: @@ -53,7 +54,7 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 + image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.0 workingDir: /workspace/examples/vllm_v1 args: - python3 diff --git a/examples/vllm_v1/deploy/disagg.yaml b/examples/vllm_v1/deploy/disagg.yaml index 6c93c7f49c9..65a4d48a363 100644 --- a/examples/vllm_v1/deploy/disagg.yaml +++ b/examples/vllm_v1/deploy/disagg.yaml @@ -41,6 +41,7 @@ spec: VllmDecodeWorker: dynamoNamespace: vllm-v1-disagg envFromSecret: hf-token-secret + componentType: worker replicas: 1 resources: requests: @@ -64,6 +65,7 @@ spec: VllmPrefillWorker: dynamoNamespace: vllm-v1-disagg envFromSecret: hf-token-secret + componentType: worker replicas: 1 resources: requests: diff --git a/examples/vllm_v1/deploy/disagg_router.yaml b/examples/vllm_v1/deploy/disagg_router.yaml index 09c55997df9..790541ea11f 100644 --- a/examples/vllm_v1/deploy/disagg_router.yaml +++ b/examples/vllm_v1/deploy/disagg_router.yaml @@ -41,6 +41,7 @@ spec: VllmDecodeWorker: dynamoNamespace: vllm-v1-disagg envFromSecret: hf-token-secret + componentType: worker replicas: 2 resources: requests: @@ -64,6 +65,7 @@ spec: VllmPrefillWorker: dynamoNamespace: vllm-v1-disagg envFromSecret: hf-token-secret + componentType: worker replicas: 1 resources: requests: From 7de97eff81c36ed41f625496928fbb7746e880d4 Mon Sep 17 00:00:00 2001 From: mohammedabdulwahhab Date: Mon, 14 Jul 2025 13:09:08 -0700 Subject: [PATCH 04/58] fix: using health checks exposed by dynamo-run --- examples/vllm/deploy/agg.yaml | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/examples/vllm/deploy/agg.yaml b/examples/vllm/deploy/agg.yaml index c1305c98aec..be9397f06a0 100644 --- a/examples/vllm/deploy/agg.yaml +++ b/examples/vllm/deploy/agg.yaml @@ -19,6 +19,24 @@ metadata: spec: services: Frontend: + livenessProbe: + httpGet: + path: /health + port: 8000 + initialDelaySeconds: 60 + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 10 + readinessProbe: + exec: + command: + - /bin/sh + - -c + - 'curl -s http://localhost:8000/health | jq -e ".status == 
\"healthy\""' + initialDelaySeconds: 60 + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 10 dynamoNamespace: vllm-v1-agg componentType: main replicas: 1 @@ -38,6 +56,8 @@ spec: - run - in=http - out=dyn + - --http-port + - "8000" VllmDecodeWorker: envFromSecret: hf-token-secret dynamoNamespace: vllm-v1-agg From 51835db6e2c6d0f9b38f933f42859dda4a2c0095 Mon Sep 17 00:00:00 2001 From: mohammedabdulwahhab Date: Mon, 14 Jul 2025 13:44:26 -0700 Subject: [PATCH 05/58] fix: check for message in logs --- examples/vllm/deploy/agg.yaml | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/examples/vllm/deploy/agg.yaml b/examples/vllm/deploy/agg.yaml index be9397f06a0..c498c265d43 100644 --- a/examples/vllm/deploy/agg.yaml +++ b/examples/vllm/deploy/agg.yaml @@ -60,6 +60,25 @@ spec: - "8000" VllmDecodeWorker: envFromSecret: hf-token-secret + livenessProbe: + exec: + command: + - /bin/sh + - -c + - "exit 0" + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 10 + readinessProbe: + exec: + command: + - /bin/sh + - -c + - 'grep "VllmWorker.*has been initialized" /tmp/vllm.log' + initialDelaySeconds: 60 + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 10 dynamoNamespace: vllm-v1-agg componentType: worker replicas: 1 @@ -77,8 +96,4 @@ spec: image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.3 workingDir: /workspace/examples/vllm_v1 args: - - python3 - - components/main.py - - --model - - Qwen/Qwen3-0.6B - - --enforce-eager + - "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log" From 34bc79cd48c1b5bb0beb39e502ff3fa2288cbc53 Mon Sep 17 00:00:00 2001 From: hongkuan Date: Mon, 14 Jul 2025 14:25:26 -0700 Subject: [PATCH 06/58] define apis --- benchmarks/profiler/profile_sla.py | 12 ++++++++++-- benchmarks/profiler/utils/k8s_utils.py | 7 +++++++ 2 files changed, 17 insertions(+), 2 deletions(-) create mode 100644 benchmarks/profiler/utils/k8s_utils.py diff --git a/benchmarks/profiler/profile_sla.py b/benchmarks/profiler/profile_sla.py index 7d1f632ce47..86916987c66 100644 --- a/benchmarks/profiler/profile_sla.py +++ b/benchmarks/profiler/profile_sla.py @@ -36,6 +36,7 @@ shutdown_deployment, wait_for_server_ready, ) +from utils.k8s_utils import deploy_dynamo_graph_deployment, shutdown_deployment logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -160,9 +161,16 @@ os.makedirs(work_dir, exist_ok=True) prefill_config_fn = f"{work_dir}/config.yaml" - dynamo_log_fn = f"{work_dir}/dynamo.log" with open(prefill_config_fn, "w") as f: yaml.dump(prefill_config, f) + + k8s_deployment = deploy_dynamo_graph_deployment( + config=prefill_config, + log_dir=f"{work_dir}/log", + model_name=model_name, + port=port, + timeout=600, # 10 minutes timeout waiting for server to be ready + ) # Start the dynamo serve process logger.info(f"Starting dynamo serve with TP size {tp_size}...") @@ -192,7 +200,7 @@ prefill_ttft.append(ttft) prefill_thpt_per_gpu.append(args.isl / ttft / tp_size * 1000) - shutdown_deployment(dynamo_process) + shutdown_deployment(k8s_deployment) # Plot the results as a 2D scatter plot if prefill_tp_size and prefill_ttft and prefill_thpt_per_gpu: diff --git a/benchmarks/profiler/utils/k8s_utils.py b/benchmarks/profiler/utils/k8s_utils.py new file mode 100644 index 00000000000..c4ed91b28ec --- /dev/null +++ b/benchmarks/profiler/utils/k8s_utils.py @@ -0,0 +1,7 @@ +def deploy_dynamo_graph_deployment(config, log_dir, model_name, port, timeout): + # TODO + return deployment_object + 
+def shutdown_deployment(deployment_object): + # TODO + pass \ No newline at end of file From 8c22d1411ba86ac46edfe604820a7eb525bfc8c0 Mon Sep 17 00:00:00 2001 From: hongkuan Date: Mon, 14 Jul 2025 14:45:59 -0700 Subject: [PATCH 07/58] update script --- benchmarks/profiler/profile_sla.py | 220 +++++++++++----------------- benchmarks/profiler/utils/config.py | 1 + 2 files changed, 88 insertions(+), 133 deletions(-) diff --git a/benchmarks/profiler/profile_sla.py b/benchmarks/profiler/profile_sla.py index 86916987c66..d1733c71a47 100644 --- a/benchmarks/profiler/profile_sla.py +++ b/benchmarks/profiler/profile_sla.py @@ -172,23 +172,6 @@ timeout=600, # 10 minutes timeout waiting for server to be ready ) - # Start the dynamo serve process - logger.info(f"Starting dynamo serve with TP size {tp_size}...") - dynamo_serve_cmd = get_dynamo_serve_cmd(prefill_config_fn) - with open(dynamo_log_fn, "w") as dynamo_log_f: - dynamo_process = subprocess.Popen( - dynamo_serve_cmd, - stdout=dynamo_log_f, - stderr=subprocess.STDOUT, - text=True, - cwd=args.example_dir, - preexec_fn=os.setsid, # Use process group for clean termination - ) - - if not wait_for_server_ready(model_name, port): - logger.error(f"Server did not become ready, skip profiling tp={tp_size}") - break - # run genai-perf genai_perf_artifact_dir = f"{work_dir}/gap_isl{args.isl}" gap_result = benchmark_prefill( @@ -230,28 +213,18 @@ os.makedirs(work_dir, exist_ok=True) decode_config_fn = f"{work_dir}/config.yaml" - dynamo_log_fn = f"{work_dir}/dynamo.log" with open(decode_config_fn, "w") as f: yaml.dump(decode_config, f) + + k8s_deployment = deploy_dynamo_graph_deployment( + config=decode_config, + log_dir=f"{work_dir}/log", + model_name=model_name, + port=port, + timeout=600, # 10 minutes timeout waiting for server to be ready + ) - # Start the dynamo serve process - logger.info(f"Starting dynamo serve with TP size {tp_size}...") - dynamo_serve_cmd = get_dynamo_serve_cmd(decode_config_fn) - with open(dynamo_log_fn, "w") as dynamo_log_f: - dynamo_process = subprocess.Popen( - dynamo_serve_cmd, - stdout=dynamo_log_f, - stderr=subprocess.STDOUT, - text=True, - cwd=args.example_dir, - preexec_fn=os.setsid, # Use process group for clean termination - ) - - if not wait_for_server_ready(model_name, port): - logger.error(f"Server did not become ready, skip profiling tp={tp_size}") - break - - max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log(dynamo_log_fn) + max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log(f"{work_dir}/log") max_concurrency = max_kv_tokens // (args.isl + args.osl) sweep_num_request = [ num for num in DECODE_NUM_REQUESTS_RANGE if num < max_concurrency @@ -283,7 +256,7 @@ decode_concurrency.append(num_request) decode_kv_cache_size.append(max_kv_tokens) - shutdown_deployment(dynamo_process) + shutdown_deployment(k8s_deployment) # Store partial results for plotting later decode_results.append((tp_size, engine_decode_itl, engine_decode_thpt_per_gpu)) @@ -364,44 +337,34 @@ os.makedirs(work_dir, exist_ok=True) prefill_config_fn = f"{work_dir}/config.yaml" - - dynamo_log_fn = f"{work_dir}/dynamo.log" with open(prefill_config_fn, "w") as f: yaml.dump(prefill_config, f) + + k8s_deployment = deploy_dynamo_graph_deployment( + config=prefill_config, + log_dir=f"{work_dir}/log", + model_name=model_name, + port=port, + timeout=600, # 10 minutes timeout waiting for server to be ready + ) - # Start the dynamo serve process - logger.info(f"Starting dynamo serve with TP size {tp_size}...") - dynamo_serve_cmd = 
get_dynamo_serve_cmd(prefill_config_fn) - with open(dynamo_log_fn, "w") as dynamo_log_f: - dynamo_process = subprocess.Popen( - dynamo_serve_cmd, - stdout=dynamo_log_f, - stderr=subprocess.STDOUT, - text=True, - cwd=args.example_dir, - preexec_fn=os.setsid, # Use process group for clean termination + for isl in range( + 100, + args.max_context_length, + (args.max_context_length - 100) // args.prefill_interpolation_granularity, + ): + # run genai-perf + genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}" + gap_result = benchmark_prefill( + isl, genai_perf_artifact_dir, model_name, port ) + if gap_result is not None: + ttft = gap_result["time_to_first_token"]["avg"] + prefill_isl.append(isl) + prefill_ttft.append(ttft) + prefill_thpt_per_gpu.append(isl / ttft / best_prefill_tp * 1000) - if not wait_for_server_ready(model_name, port): - logger.error(f"Server did not become ready, skip profiling tp={tp_size}") - else: - for isl in range( - 100, - args.max_context_length, - (args.max_context_length - 100) // args.prefill_interpolation_granularity, - ): - # run genai-perf - genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}" - gap_result = benchmark_prefill( - isl, genai_perf_artifact_dir, model_name, port - ) - if gap_result is not None: - ttft = gap_result["time_to_first_token"]["avg"] - prefill_isl.append(isl) - prefill_ttft.append(ttft) - prefill_thpt_per_gpu.append(isl / ttft / best_prefill_tp * 1000) - - shutdown_deployment(dynamo_process) + shutdown_deployment(k8s_deployment) # Interpolate prefill_ttft vs prefill_isl with quadratic function (y=ax^2+bx+c) if len(prefill_isl) > 2: @@ -443,73 +406,64 @@ os.makedirs(work_dir, exist_ok=True) decode_config_fn = f"{work_dir}/config.yaml" - dynamo_log_fn = f"{work_dir}/dynamo.log" with open(decode_config_fn, "w") as f: yaml.dump(decode_config, f) + + k8s_deployment = deploy_dynamo_graph_deployment( + config=decode_config, + log_dir=f"{work_dir}/log", + model_name=model_name, + port=port, + timeout=600, # 10 minutes timeout waiting for server to be ready + ) - # Start the dynamo serve process - logger.info(f"Starting dynamo serve with TP size {tp_size}...") - dynamo_serve_cmd = get_dynamo_serve_cmd(decode_config_fn) - with open(dynamo_log_fn, "w") as dynamo_log_f: - dynamo_process = subprocess.Popen( - dynamo_serve_cmd, - stdout=dynamo_log_f, - stderr=subprocess.STDOUT, - text=True, - cwd=args.example_dir, - preexec_fn=os.setsid, # Use process group for clean termination + max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log(f"{work_dir}/log") + + osl = 500 # not too large to reduce ITL variance, not too small to have stable measurement + for isl in range( + 100, + args.max_context_length - osl, + (args.max_context_length - osl) // args.decode_interpolation_granularity, + ): + max_concurrency = max_kv_tokens // (isl + osl) + sweep_num_request = list( + range( + 1, + max_concurrency, + max_concurrency // args.decode_interpolation_granularity, + ) ) - - if not wait_for_server_ready(model_name, port): - logger.error(f"Server did not become ready, skip profiling tp={tp_size}") - else: - max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log(dynamo_log_fn) - - osl = 500 # not too large to reduce ITL variance, not too small to have stable measurement - for isl in range( - 100, - args.max_context_length - osl, - (args.max_context_length - osl) // args.decode_interpolation_granularity, - ): - max_concurrency = max_kv_tokens // (isl + osl) - sweep_num_request = list( - range( - 1, - max_concurrency, - max_concurrency // 
args.decode_interpolation_granularity, - ) + for num_request in sweep_num_request: + genai_perf_artifact_dir = ( + f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}" ) - for num_request in sweep_num_request: - genai_perf_artifact_dir = ( - f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}" - ) - gap_result = benchmark_decode( - isl, osl, num_request, genai_perf_artifact_dir, model_name, port + gap_result = benchmark_decode( + isl, osl, num_request, genai_perf_artifact_dir, model_name, port + ) + if gap_result is not None: + itl = gap_result["inter_token_latency"]["avg"] + x_kv_usage.append((isl + osl / 2) * num_request / max_kv_tokens) + y_context_length.append(isl + osl / 2) + z_itl.append(itl) + z_thpt_per_gpu.append( + gap_result["output_token_throughput"]["avg"] / tp_size ) - if gap_result is not None: - itl = gap_result["inter_token_latency"]["avg"] - x_kv_usage.append((isl + osl / 2) * num_request / max_kv_tokens) - y_context_length.append(isl + osl / 2) - z_itl.append(itl) - z_thpt_per_gpu.append( - gap_result["output_token_throughput"]["avg"] / tp_size - ) - - shutdown_deployment(dynamo_process) - - # Save the data points to a .npz file - save_path = f"{work_dir}/raw_data.npz" - np.savez( - save_path, - x_kv_usage=np.array(x_kv_usage), - y_context_length=np.array(y_context_length), - z_itl=np.array(z_itl), - z_thpt_per_gpu=np.array(z_thpt_per_gpu), - max_kv_tokens=np.array([max_kv_tokens]), - ) - logger.info(f"Saved data points to {save_path}") - # Plot 3D surface - plot_decode_3d_surface( - x_kv_usage, y_context_length, z_itl, best_decode_tp, work_dir - ) + shutdown_deployment(k8s_deployment) + + # Save the data points to a .npz file + save_path = f"{work_dir}/raw_data.npz" + np.savez( + save_path, + x_kv_usage=np.array(x_kv_usage), + y_context_length=np.array(y_context_length), + z_itl=np.array(z_itl), + z_thpt_per_gpu=np.array(z_thpt_per_gpu), + max_kv_tokens=np.array([max_kv_tokens]), + ) + logger.info(f"Saved data points to {save_path}") + + # Plot 3D surface + plot_decode_3d_surface( + x_kv_usage, y_context_length, z_itl, best_decode_tp, work_dir + ) diff --git a/benchmarks/profiler/utils/config.py b/benchmarks/profiler/utils/config.py index 2cdb8db2e70..f4e59a28a60 100644 --- a/benchmarks/profiler/utils/config.py +++ b/benchmarks/profiler/utils/config.py @@ -111,6 +111,7 @@ def get_port(cls, config: dict) -> int: @classmethod def get_kv_cache_size_from_dynamo_log(cls, dynamo_log_fn: str) -> int: + # TODO try: with open(dynamo_log_fn, "r") as f: for line in f: From 9856dde5673f764d0a7d1096c4dc673e352b188b Mon Sep 17 00:00:00 2001 From: mohammedabdulwahhab Date: Mon, 14 Jul 2025 16:18:27 -0700 Subject: [PATCH 08/58] fix: add dynamodeployment lib --- .../profiler/utils/dynamo_deployment.py | 252 ++++++++++++++++++ 1 file changed, 252 insertions(+) create mode 100644 benchmarks/profiler/utils/dynamo_deployment.py diff --git a/benchmarks/profiler/utils/dynamo_deployment.py b/benchmarks/profiler/utils/dynamo_deployment.py new file mode 100644 index 00000000000..d9cc4c386c8 --- /dev/null +++ b/benchmarks/profiler/utils/dynamo_deployment.py @@ -0,0 +1,252 @@ +#!/usr/bin/env -S uv run --script +# +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "PyYAML", +# "aiofiles", +# "kubernetes-asyncio", +# ] +# /// + +import asyncio +import json +import os +import random +import yaml +from pathlib import Path +from typing import Dict, List, Optional, Union +import aiofiles +import kubernetes_asyncio as kubernetes +from kubernetes_asyncio import client, config +from contextlib 
import asynccontextmanager + +# Example chat completion request for testing deployments +EXAMPLE_CHAT_REQUEST = { + "model": "Qwen/Qwen3-0.6B", + "messages": [ + { + "role": "user", + "content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden." + } + ], + "stream": False, + "max_tokens": 30 +} + +class DynamoDeploymentClient: + def __init__(self, namespace: str, deployment_name: str = "vllm-v1-agg", base_log_dir: Optional[str] = None): + """ + Initialize the client with the namespace and deployment name. + + Args: + namespace: The Kubernetes namespace + deployment_name: Name of the deployment, defaults to vllm-v1-agg + base_log_dir: Base directory for storing logs, defaults to ./logs if not specified + """ + self.namespace = namespace + self.deployment_name = deployment_name + self.components = [] # Will store component names from CR + self.deployment_spec = None # Will store the full deployment spec + self.base_log_dir = Path(base_log_dir) if base_log_dir else Path("logs") + + async def _init_kubernetes(self): + """Initialize kubernetes client""" + await config.load_kube_config() + self.k8s_client = client.ApiClient() + self.custom_api = client.CustomObjectsApi(self.k8s_client) + self.core_api = client.CoreV1Api(self.k8s_client) + + async def create_deployment(self, deployment: Union[dict, str]): + """ + Create a DynamoGraphDeployment from either a dict or yaml file path. + + Args: + deployment: Either a dict containing the deployment spec or a path to a yaml file + """ + await self._init_kubernetes() + + if isinstance(deployment, str): + # Load from yaml file + async with aiofiles.open(deployment, 'r') as f: + content = await f.read() + self.deployment_spec = yaml.safe_load(content) + else: + self.deployment_spec = deployment + + # Extract component names + self.components = [svc.lower() for svc in self.deployment_spec['spec']['services'].keys()] + + # Ensure name and namespace are set correctly + self.deployment_spec['metadata']['name'] = self.deployment_name + self.deployment_spec['metadata']['namespace'] = self.namespace + + try: + await self.custom_api.create_namespaced_custom_object( + group="nvidia.com", + version="v1alpha1", + namespace=self.namespace, + plural="dynamographdeployments", + body=self.deployment_spec + ) + except kubernetes.client.rest.ApiException as e: + if e.status == 409: # Already exists + print(f"Deployment {self.deployment_name} already exists") + else: + raise + + async def wait_for_deployment_ready(self, timeout: int = 300): + """ + Wait for the custom resource to be ready. 
+ + Args: + timeout: Maximum time to wait in seconds + """ + start_time = asyncio.get_event_loop().time() + while (asyncio.get_event_loop().time() - start_time) < timeout: + try: + status = await self.custom_api.get_namespaced_custom_object_status( + group="nvidia.com", + version="v1alpha1", + namespace=self.namespace, + plural="dynamographdeployments", + name=self.deployment_name + ) + if status.get('status', {}).get('ready', False): + return True + except kubernetes.client.rest.ApiException: + pass + await asyncio.sleep(5) + raise TimeoutError("Deployment failed to become ready within timeout") + + @asynccontextmanager + async def port_forward(self, port: Optional[int] = None): + """ + Port forward the frontend service to local machine. + + Args: + port: Local port to use. If None, uses a random port. + + Yields: + The local port number being used + """ + if port is None: + port = random.randint(49152, 65535) + + service_name = f"{self.deployment_name}-frontend" + cmd = f"kubectl port-forward service/{service_name} {port}:8000 -n {self.namespace}" + + process = await asyncio.create_subprocess_shell( + cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE + ) + + try: + # Wait briefly to ensure port-forward is established + await asyncio.sleep(2) + yield port + finally: + process.terminate() + await process.wait() + + async def check_chat_completion(self): + """ + Test the deployment with a chat completion request. + """ + async with self.port_forward() as port: + url = f"http://localhost:{port}/v1/chat/completions" + + cmd = f"""curl -X POST {url} \\ + -H "Content-Type: application/json" \\ + -d '{json.dumps(EXAMPLE_CHAT_REQUEST)}'""" + + process = await asyncio.create_subprocess_shell( + cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE + ) + stdout, stderr = await process.communicate() + return stdout.decode() + + async def get_deployment_logs(self): + """ + Get logs from all pods in the deployment, organized by component. + """ + # Create logs directory + base_dir = self.base_log_dir / self.deployment_name + base_dir.mkdir(parents=True, exist_ok=True) + + for component in self.components: + component_dir = base_dir / component + component_dir.mkdir(exist_ok=True) + + # List pods for this component + label_selector = f"app={self.deployment_name}-{component}" + pods = await self.core_api.list_namespaced_pod( + namespace=self.namespace, + label_selector=label_selector + ) + + # Get logs for each pod + for i, pod in enumerate(pods.items): + try: + logs = await self.core_api.read_namespaced_pod_log( + name=pod.metadata.name, + namespace=self.namespace + ) + async with aiofiles.open(component_dir / f"replica_{i}.log", 'w') as f: + await f.write(logs) + except kubernetes.client.rest.ApiException as e: + print(f"Error getting logs for pod {pod.metadata.name}: {e}") + + async def delete_deployment(self): + """ + Delete the DynamoGraphDeployment CR. 
+ """ + try: + await self.custom_api.delete_namespaced_custom_object( + group="nvidia.com", + version="v1alpha1", + namespace=self.namespace, + plural="dynamographdeployments", + name=self.deployment_name + ) + except kubernetes.client.rest.ApiException as e: + if e.status != 404: # Ignore if already deleted + raise + +async def main(): + # Example usage with custom log directory + client = DynamoDeploymentClient( + namespace="default", + base_log_dir="/tmp/dynamo_logs" # Example custom log directory + ) + + try: + # Create deployment from yaml file + await client.create_deployment("examples/vllm/deploy/agg.yaml") + + # Wait for deployment to be ready + print("Waiting for deployment to be ready...") + await client.wait_for_deployment_ready() + print("Deployment is ready!") + + # Test chat completion + print("Testing chat completion...") + response = await client.check_chat_completion() + print(f"Chat completion response: {response}") + + # Get logs + print("Getting deployment logs...") + await client.get_deployment_logs() + print(f"Logs have been saved to {client.base_log_dir / client.deployment_name}!") + + finally: + # Cleanup + print("Cleaning up deployment...") + await client.delete_deployment() + print("Deployment deleted!") + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file From 61a215b86460685ec31f9252814bebe5b99c10b5 Mon Sep 17 00:00:00 2001 From: mohammedabdulwahhab Date: Mon, 14 Jul 2025 16:54:55 -0700 Subject: [PATCH 09/58] fix: working client lib --- .../profiler/utils/dynamo_deployment.py | 59 +++++++++++++++---- 1 file changed, 46 insertions(+), 13 deletions(-) diff --git a/benchmarks/profiler/utils/dynamo_deployment.py b/benchmarks/profiler/utils/dynamo_deployment.py index d9cc4c386c8..9d25eb07e6e 100644 --- a/benchmarks/profiler/utils/dynamo_deployment.py +++ b/benchmarks/profiler/utils/dynamo_deployment.py @@ -14,6 +14,7 @@ import os import random import yaml +import argparse from pathlib import Path from typing import Dict, List, Optional, Union import aiofiles @@ -103,6 +104,7 @@ async def wait_for_deployment_ready(self, timeout: int = 300): timeout: Maximum time to wait in seconds """ start_time = asyncio.get_event_loop().time() + # TODO: A little brittle, also should output intermediate status every so often. while (asyncio.get_event_loop().time() - start_time) < timeout: try: status = await self.custom_api.get_namespaced_custom_object_status( @@ -112,11 +114,30 @@ async def wait_for_deployment_ready(self, timeout: int = 300): plural="dynamographdeployments", name=self.deployment_name ) - if status.get('status', {}).get('ready', False): + # print(f"Current status: {status.get('status', {})}") + + # Check both conditions: + # 1. Ready condition is True + # 2. 
State is successful + status_obj = status.get('status', {}) + conditions = status_obj.get('conditions', []) + + ready_condition = False + for condition in conditions: + if (condition.get('type') == 'Ready' and + condition.get('status') == 'True'): + ready_condition = True + break + + state_successful = status_obj.get('state') == 'successful' + + if ready_condition and state_successful: + print("Deployment is ready: Ready condition is True and state is successful") return True + except kubernetes.client.rest.ApiException: pass - await asyncio.sleep(5) + await asyncio.sleep(20) raise TimeoutError("Deployment failed to become ready within timeout") @asynccontextmanager @@ -181,8 +202,10 @@ async def get_deployment_logs(self): component_dir = base_dir / component component_dir.mkdir(exist_ok=True) - # List pods for this component - label_selector = f"app={self.deployment_name}-{component}" + # List pods for this component using the selector label + # nvidia.com/selector: deployment-name-component + label_selector = f"nvidia.com/selector={self.deployment_name}-{component.lower()}" + pods = await self.core_api.list_namespaced_pod( namespace=self.namespace, label_selector=label_selector @@ -195,7 +218,7 @@ async def get_deployment_logs(self): name=pod.metadata.name, namespace=self.namespace ) - async with aiofiles.open(component_dir / f"replica_{i}.log", 'w') as f: + async with aiofiles.open(component_dir / f"{i}.log", 'w') as f: await f.write(logs) except kubernetes.client.rest.ApiException as e: print(f"Error getting logs for pod {pod.metadata.name}: {e}") @@ -217,25 +240,35 @@ async def delete_deployment(self): raise async def main(): - # Example usage with custom log directory + parser = argparse.ArgumentParser(description='Deploy and manage DynamoGraphDeployment CRDs') + parser.add_argument('--namespace', '-n', required=True, + help='Kubernetes namespace to deploy to (default: default)') + parser.add_argument('--yaml-file', '-f', required=True, + help='Path to the DynamoGraphDeployment YAML file') + parser.add_argument('--log-dir', '-l', default='/tmp/dynamo_logs', + help='Base directory for logs (default: /tmp/dynamo_logs)') + + args = parser.parse_args() + + # Example usage with parsed arguments client = DynamoDeploymentClient( - namespace="default", - base_log_dir="/tmp/dynamo_logs" # Example custom log directory + namespace=args.namespace, + base_log_dir=args.log_dir ) try: # Create deployment from yaml file - await client.create_deployment("examples/vllm/deploy/agg.yaml") + await client.create_deployment(args.yaml_file) # Wait for deployment to be ready print("Waiting for deployment to be ready...") await client.wait_for_deployment_ready() print("Deployment is ready!") - # Test chat completion - print("Testing chat completion...") - response = await client.check_chat_completion() - print(f"Chat completion response: {response}") + # # Test chat completion + # print("Testing chat completion...") + # response = await client.check_chat_completion() + # print(f"Chat completion response: {response}") # Get logs print("Getting deployment logs...") From 51413345ebf770749bab600c41cd3555608f95a9 Mon Sep 17 00:00:00 2001 From: mohammedabdulwahhab Date: Mon, 14 Jul 2025 16:55:30 -0700 Subject: [PATCH 10/58] fix: working client lib --- benchmarks/profiler/utils/dynamo_deployment.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/benchmarks/profiler/utils/dynamo_deployment.py b/benchmarks/profiler/utils/dynamo_deployment.py index 9d25eb07e6e..9bd9d408d00 100644 --- 
a/benchmarks/profiler/utils/dynamo_deployment.py +++ b/benchmarks/profiler/utils/dynamo_deployment.py @@ -281,5 +281,7 @@ async def main(): await client.delete_deployment() print("Deployment deleted!") +# run with: +# uv run benchmarks/profiler/utils/dynamo_deployment.py -n mo-dyn-cloud -f ./examples/vllm/deploy/agg.yaml -l ./client_logs if __name__ == "__main__": asyncio.run(main()) \ No newline at end of file From 8e25a299ed159668da66c287b3468640a9c71980 Mon Sep 17 00:00:00 2001 From: hongkuan Date: Mon, 14 Jul 2025 17:32:49 -0700 Subject: [PATCH 11/58] integrate with utils.dynamo_deployment --- benchmarks/profiler/profile_sla.py | 291 ++++++++++++++----------- benchmarks/profiler/utils/k8s_utils.py | 7 - 2 files changed, 159 insertions(+), 139 deletions(-) delete mode 100644 benchmarks/profiler/utils/k8s_utils.py diff --git a/benchmarks/profiler/profile_sla.py b/benchmarks/profiler/profile_sla.py index d1733c71a47..a5ba38e129a 100644 --- a/benchmarks/profiler/profile_sla.py +++ b/benchmarks/profiler/profile_sla.py @@ -17,7 +17,7 @@ import logging import math import os -import subprocess +import asyncio import numpy as np import yaml @@ -30,13 +30,7 @@ plot_prefill_interpolation, plot_prefill_performance, ) -from utils.utils import ( - get_available_gpu_count, - get_dynamo_serve_cmd, - shutdown_deployment, - wait_for_server_ready, -) -from utils.k8s_utils import deploy_dynamo_graph_deployment, shutdown_deployment +from utils.dynamo_deployment import DynamoDeploymentClient logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -50,6 +44,12 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() + parser.add_argument( + "--namespace", + type=str, + default="dynamo-sla-profiler", + help="Kubernetes namespace to deploy the DynamoGraphDeployment", + ) parser.add_argument( "--backend", type=str, @@ -144,7 +144,6 @@ os.makedirs(args.output_dir, exist_ok=True) model_name = config_modifier.get_model_name(config) - port = config_modifier.get_port(config) # first profile prefill prefill_tp_size = [] @@ -163,27 +162,34 @@ prefill_config_fn = f"{work_dir}/config.yaml" with open(prefill_config_fn, "w") as f: yaml.dump(prefill_config, f) - - k8s_deployment = deploy_dynamo_graph_deployment( - config=prefill_config, - log_dir=f"{work_dir}/log", - model_name=model_name, - port=port, - timeout=600, # 10 minutes timeout waiting for server to be ready - ) - - # run genai-perf - genai_perf_artifact_dir = f"{work_dir}/gap_isl{args.isl}" - gap_result = benchmark_prefill( - args.isl, genai_perf_artifact_dir, model_name, port - ) - if gap_result is not None: - ttft = gap_result["time_to_first_token"]["avg"] - prefill_tp_size.append(tp_size) - prefill_ttft.append(ttft) - prefill_thpt_per_gpu.append(args.isl / ttft / tp_size * 1000) + + with DynamoDeploymentClient(namespace=args.namespace, base_log_dir=work_dir) as client: + asyncio.run(client.create_deployment(prefill_config_fn)) + logger.info("Waiting for deployment to be ready...") + asyncio.run(client.wait_for_deployment_ready()) + logger.info("Deployment is ready") + + port = asyncio.run(client.port_forward()) + logger.info(f"Port forwarded to {port}") + + logger.info("Getting deployment logs...") + asyncio.run(client.get_deployment_logs()) + logger.info(f"Logs have been saved to {client.base_log_dir / client.deployment_name}") + + # run genai-perf + genai_perf_artifact_dir = f"{work_dir}/gap_isl{args.isl}" + gap_result = benchmark_prefill( + args.isl, genai_perf_artifact_dir, model_name, port + ) + if gap_result is not None: + ttft = 
gap_result["time_to_first_token"]["avg"] + prefill_tp_size.append(tp_size) + prefill_ttft.append(ttft) + prefill_thpt_per_gpu.append(args.isl / ttft / tp_size * 1000) - shutdown_deployment(k8s_deployment) + print("Cleaning up deployment...") + asyncio.run(client.delete_deployment()) + print("Deployment deleted") # Plot the results as a 2D scatter plot if prefill_tp_size and prefill_ttft and prefill_thpt_per_gpu: @@ -216,47 +222,54 @@ with open(decode_config_fn, "w") as f: yaml.dump(decode_config, f) - k8s_deployment = deploy_dynamo_graph_deployment( - config=decode_config, - log_dir=f"{work_dir}/log", - model_name=model_name, - port=port, - timeout=600, # 10 minutes timeout waiting for server to be ready - ) - - max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log(f"{work_dir}/log") - max_concurrency = max_kv_tokens // (args.isl + args.osl) - sweep_num_request = [ - num for num in DECODE_NUM_REQUESTS_RANGE if num < max_concurrency - ] - logger.info( - f"Sweeping num_request range based on maximum number of kv tokens: {sweep_num_request}" - ) - - engine_decode_itl = [] - engine_decode_thpt_per_gpu = [] - for num_request in sweep_num_request: - genai_perf_artifact_dir = f"{work_dir}/gap_request{num_request}_isl{args.isl}_osl{args.osl}_n{num_request}" - gap_result = benchmark_decode( - args.isl, - args.osl, - num_request, - genai_perf_artifact_dir, - model_name, - port, + with DynamoDeploymentClient(namespace=args.namespace, base_log_dir=work_dir) as client: + asyncio.run(client.create_deployment(decode_config_fn)) + logger.info("Waiting for deployment to be ready...") + asyncio.run(client.wait_for_deployment_ready()) + logger.info("Deployment is ready") + + port = asyncio.run(client.port_forward()) + logger.info(f"Port forwarded to {port}") + + logger.info("Getting deployment logs...") + asyncio.run(client.get_deployment_logs()) + logger.info(f"Logs have been saved to {client.base_log_dir / client.deployment_name}") + + max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log(f"{work_dir}/vllm-v1-agg/vllmdecodeworker/0.log") + max_concurrency = max_kv_tokens // (args.isl + args.osl) + sweep_num_request = [ + num for num in DECODE_NUM_REQUESTS_RANGE if num < max_concurrency + ] + logger.info( + f"Sweeping num_request range based on maximum number of kv tokens: {sweep_num_request}" ) - if gap_result is not None: - itl = gap_result["inter_token_latency"]["avg"] - thpt_per_gpu = gap_result["output_token_throughput"]["avg"] / tp_size - engine_decode_itl.append(itl) - engine_decode_thpt_per_gpu.append(thpt_per_gpu) - decode_tp_size.append(tp_size) - decode_itl.append(itl) - decode_thpt_per_gpu.append(thpt_per_gpu) - decode_concurrency.append(num_request) - decode_kv_cache_size.append(max_kv_tokens) - - shutdown_deployment(k8s_deployment) + + engine_decode_itl = [] + engine_decode_thpt_per_gpu = [] + for num_request in sweep_num_request: + genai_perf_artifact_dir = f"{work_dir}/gap_request{num_request}_isl{args.isl}_osl{args.osl}_n{num_request}" + gap_result = benchmark_decode( + args.isl, + args.osl, + num_request, + genai_perf_artifact_dir, + model_name, + port, + ) + if gap_result is not None: + itl = gap_result["inter_token_latency"]["avg"] + thpt_per_gpu = gap_result["output_token_throughput"]["avg"] / tp_size + engine_decode_itl.append(itl) + engine_decode_thpt_per_gpu.append(thpt_per_gpu) + decode_tp_size.append(tp_size) + decode_itl.append(itl) + decode_thpt_per_gpu.append(thpt_per_gpu) + decode_concurrency.append(num_request) + 
decode_kv_cache_size.append(max_kv_tokens) + + print("Cleaning up deployment...") + asyncio.run(client.delete_deployment()) + print("Deployment deleted") # Store partial results for plotting later decode_results.append((tp_size, engine_decode_itl, engine_decode_thpt_per_gpu)) @@ -340,31 +353,38 @@ with open(prefill_config_fn, "w") as f: yaml.dump(prefill_config, f) - k8s_deployment = deploy_dynamo_graph_deployment( - config=prefill_config, - log_dir=f"{work_dir}/log", - model_name=model_name, - port=port, - timeout=600, # 10 minutes timeout waiting for server to be ready - ) - - for isl in range( - 100, - args.max_context_length, - (args.max_context_length - 100) // args.prefill_interpolation_granularity, - ): - # run genai-perf - genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}" - gap_result = benchmark_prefill( - isl, genai_perf_artifact_dir, model_name, port - ) - if gap_result is not None: - ttft = gap_result["time_to_first_token"]["avg"] - prefill_isl.append(isl) - prefill_ttft.append(ttft) - prefill_thpt_per_gpu.append(isl / ttft / best_prefill_tp * 1000) + with DynamoDeploymentClient(namespace=args.namespace, base_log_dir=work_dir) as client: + asyncio.run(client.create_deployment(prefill_config_fn)) + logger.info("Waiting for deployment to be ready...") + asyncio.run(client.wait_for_deployment_ready()) + logger.info("Deployment is ready") + + port = asyncio.run(client.port_forward()) + logger.info(f"Port forwarded to {port}") + + logger.info("Getting deployment logs...") + asyncio.run(client.get_deployment_logs()) + logger.info(f"Logs have been saved to {client.base_log_dir / client.deployment_name}") + + for isl in range( + 100, + args.max_context_length, + (args.max_context_length - 100) // args.prefill_interpolation_granularity, + ): + # run genai-perf + genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}" + gap_result = benchmark_prefill( + isl, genai_perf_artifact_dir, model_name, port + ) + if gap_result is not None: + ttft = gap_result["time_to_first_token"]["avg"] + prefill_isl.append(isl) + prefill_ttft.append(ttft) + prefill_thpt_per_gpu.append(isl / ttft / best_prefill_tp * 1000) - shutdown_deployment(k8s_deployment) + print("Cleaning up deployment...") + asyncio.run(client.delete_deployment()) + print("Deployment deleted") # Interpolate prefill_ttft vs prefill_isl with quadratic function (y=ax^2+bx+c) if len(prefill_isl) > 2: @@ -409,47 +429,54 @@ with open(decode_config_fn, "w") as f: yaml.dump(decode_config, f) - k8s_deployment = deploy_dynamo_graph_deployment( - config=decode_config, - log_dir=f"{work_dir}/log", - model_name=model_name, - port=port, - timeout=600, # 10 minutes timeout waiting for server to be ready - ) - - max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log(f"{work_dir}/log") - - osl = 500 # not too large to reduce ITL variance, not too small to have stable measurement - for isl in range( - 100, - args.max_context_length - osl, - (args.max_context_length - osl) // args.decode_interpolation_granularity, - ): - max_concurrency = max_kv_tokens // (isl + osl) - sweep_num_request = list( - range( - 1, - max_concurrency, - max_concurrency // args.decode_interpolation_granularity, - ) - ) - for num_request in sweep_num_request: - genai_perf_artifact_dir = ( - f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}" - ) - gap_result = benchmark_decode( - isl, osl, num_request, genai_perf_artifact_dir, model_name, port + with DynamoDeploymentClient(namespace=args.namespace, base_log_dir=work_dir) as client: + 
asyncio.run(client.create_deployment(decode_config_fn)) + logger.info("Waiting for deployment to be ready...") + asyncio.run(client.wait_for_deployment_ready()) + logger.info("Deployment is ready") + + port = asyncio.run(client.port_forward()) + logger.info(f"Port forwarded to {port}") + + logger.info("Getting deployment logs...") + asyncio.run(client.get_deployment_logs()) + logger.info(f"Logs have been saved to {client.base_log_dir / client.deployment_name}") + + max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log(f"{work_dir}/vllm-v1-agg/vllmdecodeworker/0.log") + + osl = 500 # not too large to reduce ITL variance, not too small to have stable measurement + for isl in range( + 100, + args.max_context_length - osl, + (args.max_context_length - osl) // args.decode_interpolation_granularity, + ): + max_concurrency = max_kv_tokens // (isl + osl) + sweep_num_request = list( + range( + 1, + max_concurrency, + max_concurrency // args.decode_interpolation_granularity, + ) ) - if gap_result is not None: - itl = gap_result["inter_token_latency"]["avg"] - x_kv_usage.append((isl + osl / 2) * num_request / max_kv_tokens) - y_context_length.append(isl + osl / 2) - z_itl.append(itl) - z_thpt_per_gpu.append( - gap_result["output_token_throughput"]["avg"] / tp_size + for num_request in sweep_num_request: + genai_perf_artifact_dir = ( + f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}" ) - - shutdown_deployment(k8s_deployment) + gap_result = benchmark_decode( + isl, osl, num_request, genai_perf_artifact_dir, model_name, port + ) + if gap_result is not None: + itl = gap_result["inter_token_latency"]["avg"] + x_kv_usage.append((isl + osl / 2) * num_request / max_kv_tokens) + y_context_length.append(isl + osl / 2) + z_itl.append(itl) + z_thpt_per_gpu.append( + gap_result["output_token_throughput"]["avg"] / tp_size + ) + + print("Cleaning up deployment...") + asyncio.run(client.delete_deployment()) + print("Deployment deleted") # Save the data points to a .npz file save_path = f"{work_dir}/raw_data.npz" diff --git a/benchmarks/profiler/utils/k8s_utils.py b/benchmarks/profiler/utils/k8s_utils.py deleted file mode 100644 index c4ed91b28ec..00000000000 --- a/benchmarks/profiler/utils/k8s_utils.py +++ /dev/null @@ -1,7 +0,0 @@ -def deploy_dynamo_graph_deployment(config, log_dir, model_name, port, timeout): - # TODO - return deployment_object - -def shutdown_deployment(deployment_object): - # TODO - pass \ No newline at end of file From 1d8716404c9c4dd6eaac0c5339d69b3f9e678469 Mon Sep 17 00:00:00 2001 From: mohammedabdulwahhab Date: Mon, 14 Jul 2025 17:50:10 -0700 Subject: [PATCH 12/58] fix: port forward works --- .../profiler/utils/dynamo_deployment.py | 60 +++++++------------ 1 file changed, 22 insertions(+), 38 deletions(-) diff --git a/benchmarks/profiler/utils/dynamo_deployment.py b/benchmarks/profiler/utils/dynamo_deployment.py index 9bd9d408d00..ee53a59a2e0 100644 --- a/benchmarks/profiler/utils/dynamo_deployment.py +++ b/benchmarks/profiler/utils/dynamo_deployment.py @@ -6,6 +6,8 @@ # "PyYAML", # "aiofiles", # "kubernetes-asyncio", +# "kr8s", # added +# "httpx", # added # ] # /// @@ -21,6 +23,9 @@ import kubernetes_asyncio as kubernetes from kubernetes_asyncio import client, config from contextlib import asynccontextmanager +import httpx # added for HTTP requests +from kr8s.asyncio.objects import Service + # Example chat completion request for testing deployments EXAMPLE_CHAT_REQUEST = { @@ -143,52 +148,31 @@ async def wait_for_deployment_ready(self, timeout: int = 300): 
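# NOTE (editorial, not part of the patch): the hunk below swaps the
# `kubectl port-forward` subprocess for kr8s's in-process port forwarding.
# A minimal standalone sketch of that pattern follows; the service name,
# namespace, and ports are illustrative assumptions, not values from this repo.
import asyncio

from kr8s.asyncio.objects import Service


async def forward_example():
    # Look up the frontend Service and forward its HTTP port locally,
    # mirroring the start()/stop() usage in the diff below.
    service = await Service.get("vllm-v1-agg-frontend", namespace="dynamo-sla-profiler")
    pf = service.portforward(remote_port=8000, local_port=8080)
    await pf.start()  # forward runs in the background until stopped
    try:
        print("frontend reachable at http://localhost:8080")
        await asyncio.sleep(5)
    finally:
        await pf.stop()


asyncio.run(forward_example())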
@asynccontextmanager async def port_forward(self, port: Optional[int] = None): """ - Port forward the frontend service to local machine. - - Args: - port: Local port to use. If None, uses a random port. - - Yields: - The local port number being used + Forward the service's HTTP port to a local port. """ if port is None: port = random.randint(49152, 65535) - - service_name = f"{self.deployment_name}-frontend" - cmd = f"kubectl port-forward service/{service_name} {port}:8000 -n {self.namespace}" - - process = await asyncio.create_subprocess_shell( - cmd, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE - ) - + svc_name = f"{self.deployment_name}-frontend" + # Get the Service and forward its HTTP port (8000) + service = await Service.get(svc_name, namespace=self.namespace) + pf = service.portforward(remote_port=8000, local_port=port) + await pf.start() try: - # Wait briefly to ensure port-forward is established - await asyncio.sleep(2) yield port finally: - process.terminate() - await process.wait() + await pf.stop() async def check_chat_completion(self): """ - Test the deployment with a chat completion request. + Test the deployment with a chat completion request using httpx. """ async with self.port_forward() as port: url = f"http://localhost:{port}/v1/chat/completions" - - cmd = f"""curl -X POST {url} \\ - -H "Content-Type: application/json" \\ - -d '{json.dumps(EXAMPLE_CHAT_REQUEST)}'""" - - process = await asyncio.create_subprocess_shell( - cmd, - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE - ) - stdout, stderr = await process.communicate() - return stdout.decode() + async with httpx.AsyncClient() as client: + response = await client.post(url, json=EXAMPLE_CHAT_REQUEST) + response.raise_for_status() + return response.text + async def get_deployment_logs(self): """ @@ -265,10 +249,10 @@ async def main(): await client.wait_for_deployment_ready() print("Deployment is ready!") - # # Test chat completion - # print("Testing chat completion...") - # response = await client.check_chat_completion() - # print(f"Chat completion response: {response}") + # Test chat completion + print("Testing chat completion...") + response = await client.check_chat_completion() + print(f"Chat completion response: {response}") # Get logs print("Getting deployment logs...") From 65dec078170dd5670f7022a9c59a9762d40f54c6 Mon Sep 17 00:00:00 2001 From: hongkuan Date: Mon, 14 Jul 2025 17:53:36 -0700 Subject: [PATCH 13/58] pc --- benchmarks/profiler/profile_sla.py | 67 +++++--- benchmarks/profiler/utils/config.py | 52 ++++-- benchmarks/profiler/utils/defaults.py | 2 +- .../profiler/utils/dynamo_deployment.py | 160 ++++++++++-------- 4 files changed, 179 insertions(+), 102 deletions(-) diff --git a/benchmarks/profiler/profile_sla.py b/benchmarks/profiler/profile_sla.py index a5ba38e129a..f62436a7f1b 100644 --- a/benchmarks/profiler/profile_sla.py +++ b/benchmarks/profiler/profile_sla.py @@ -14,15 +14,16 @@ # limitations under the License. 
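# NOTE (editorial, not part of the patch): the previous patch rewrote
# check_chat_completion() around httpx. A self-contained sketch of that
# request against a locally forwarded frontend; the port and the "model"
# field are assumptions for illustration, not values from this repo.
import asyncio

import httpx

EXAMPLE_REQUEST = {
    "model": "Qwen/Qwen3-0.6B",  # assumed; substitute the served model name
    "messages": [{"role": "user", "content": "Hello!"}],
    "stream": False,
    "max_tokens": 30,
}


async def check_once(port: int = 8080) -> str:
    url = f"http://localhost:{port}/v1/chat/completions"
    async with httpx.AsyncClient() as client:
        response = await client.post(url, json=EXAMPLE_REQUEST, timeout=60.0)
        response.raise_for_status()  # surface HTTP errors instead of parsing them
        return response.text


print(asyncio.run(check_once()))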
import argparse +import asyncio import logging import math import os -import asyncio import numpy as np import yaml from utils.config import CONFIG_MODIFIERS from utils.defaults import DECODE_NUM_REQUESTS_RANGE +from utils.dynamo_deployment import DynamoDeploymentClient from utils.genai_perf import benchmark_decode, benchmark_prefill from utils.plot import ( plot_decode_3d_surface, @@ -30,7 +31,6 @@ plot_prefill_interpolation, plot_prefill_performance, ) -from utils.dynamo_deployment import DynamoDeploymentClient logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) @@ -43,7 +43,9 @@ logger.addHandler(console_handler) if __name__ == "__main__": - parser = argparse.ArgumentParser() + parser = argparse.ArgumentParser( + description="Profile the TTFT and ITL of the Prefill and Decode engine with different parallelization mapping. When profiling prefill we mock/fix decode,when profiling decode we mock/fix prefill." + ) parser.add_argument( "--namespace", type=str, @@ -58,7 +60,10 @@ help="backend type, currently support [vllm_v1]", ) parser.add_argument( - "--config", type=str, required=True, help="Path to the DynamoGraphDeployment config file" + "--config", + type=str, + required=True, + help="Path to the DynamoGraphDeployment config file", ) parser.add_argument( "--example-dir", @@ -162,8 +167,10 @@ prefill_config_fn = f"{work_dir}/config.yaml" with open(prefill_config_fn, "w") as f: yaml.dump(prefill_config, f) - - with DynamoDeploymentClient(namespace=args.namespace, base_log_dir=work_dir) as client: + + with DynamoDeploymentClient( + namespace=args.namespace, base_log_dir=work_dir + ) as client: asyncio.run(client.create_deployment(prefill_config_fn)) logger.info("Waiting for deployment to be ready...") asyncio.run(client.wait_for_deployment_ready()) @@ -174,7 +181,9 @@ logger.info("Getting deployment logs...") asyncio.run(client.get_deployment_logs()) - logger.info(f"Logs have been saved to {client.base_log_dir / client.deployment_name}") + logger.info( + f"Logs have been saved to {client.base_log_dir / client.deployment_name}" + ) # run genai-perf genai_perf_artifact_dir = f"{work_dir}/gap_isl{args.isl}" @@ -221,8 +230,10 @@ decode_config_fn = f"{work_dir}/config.yaml" with open(decode_config_fn, "w") as f: yaml.dump(decode_config, f) - - with DynamoDeploymentClient(namespace=args.namespace, base_log_dir=work_dir) as client: + + with DynamoDeploymentClient( + namespace=args.namespace, base_log_dir=work_dir + ) as client: asyncio.run(client.create_deployment(decode_config_fn)) logger.info("Waiting for deployment to be ready...") asyncio.run(client.wait_for_deployment_ready()) @@ -233,9 +244,13 @@ logger.info("Getting deployment logs...") asyncio.run(client.get_deployment_logs()) - logger.info(f"Logs have been saved to {client.base_log_dir / client.deployment_name}") + logger.info( + f"Logs have been saved to {client.base_log_dir / client.deployment_name}" + ) - max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log(f"{work_dir}/vllm-v1-agg/vllmdecodeworker/0.log") + max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log( + f"{work_dir}/vllm-v1-agg/vllmdecodeworker/0.log" + ) max_concurrency = max_kv_tokens // (args.isl + args.osl) sweep_num_request = [ num for num in DECODE_NUM_REQUESTS_RANGE if num < max_concurrency @@ -258,7 +273,9 @@ ) if gap_result is not None: itl = gap_result["inter_token_latency"]["avg"] - thpt_per_gpu = gap_result["output_token_throughput"]["avg"] / tp_size + thpt_per_gpu = ( + gap_result["output_token_throughput"]["avg"] / 
tp_size + ) engine_decode_itl.append(itl) engine_decode_thpt_per_gpu.append(thpt_per_gpu) decode_tp_size.append(tp_size) @@ -266,7 +283,7 @@ decode_thpt_per_gpu.append(thpt_per_gpu) decode_concurrency.append(num_request) decode_kv_cache_size.append(max_kv_tokens) - + print("Cleaning up deployment...") asyncio.run(client.delete_deployment()) print("Deployment deleted") @@ -352,8 +369,10 @@ prefill_config_fn = f"{work_dir}/config.yaml" with open(prefill_config_fn, "w") as f: yaml.dump(prefill_config, f) - - with DynamoDeploymentClient(namespace=args.namespace, base_log_dir=work_dir) as client: + + with DynamoDeploymentClient( + namespace=args.namespace, base_log_dir=work_dir + ) as client: asyncio.run(client.create_deployment(prefill_config_fn)) logger.info("Waiting for deployment to be ready...") asyncio.run(client.wait_for_deployment_ready()) @@ -364,7 +383,9 @@ logger.info("Getting deployment logs...") asyncio.run(client.get_deployment_logs()) - logger.info(f"Logs have been saved to {client.base_log_dir / client.deployment_name}") + logger.info( + f"Logs have been saved to {client.base_log_dir / client.deployment_name}" + ) for isl in range( 100, @@ -428,8 +449,10 @@ decode_config_fn = f"{work_dir}/config.yaml" with open(decode_config_fn, "w") as f: yaml.dump(decode_config, f) - - with DynamoDeploymentClient(namespace=args.namespace, base_log_dir=work_dir) as client: + + with DynamoDeploymentClient( + namespace=args.namespace, base_log_dir=work_dir + ) as client: asyncio.run(client.create_deployment(decode_config_fn)) logger.info("Waiting for deployment to be ready...") asyncio.run(client.wait_for_deployment_ready()) @@ -440,9 +463,13 @@ logger.info("Getting deployment logs...") asyncio.run(client.get_deployment_logs()) - logger.info(f"Logs have been saved to {client.base_log_dir / client.deployment_name}") + logger.info( + f"Logs have been saved to {client.base_log_dir / client.deployment_name}" + ) - max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log(f"{work_dir}/vllm-v1-agg/vllmdecodeworker/0.log") + max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log( + f"{work_dir}/vllm-v1-agg/vllmdecodeworker/0.log" + ) osl = 500 # not too large to reduce ITL variance, not too small to have stable measurement for isl in range( diff --git a/benchmarks/profiler/utils/config.py b/benchmarks/profiler/utils/config.py index f4e59a28a60..73b53c3d072 100644 --- a/benchmarks/profiler/utils/config.py +++ b/benchmarks/profiler/utils/config.py @@ -18,6 +18,7 @@ from typing import Literal from utils.defaults import DEFAULT_MODEL_NAME, DYNAMO_RUN_DEFAULT_PORT + from dynamo.planner.defaults import WORKER_COMPONENT_NAMES logger = logging.getLogger(__name__) @@ -30,6 +31,7 @@ console_handler.setFormatter(formatter) logger.addHandler(console_handler) + class VllmV1ConfigModifier: @classmethod def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> dict: @@ -41,10 +43,18 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d if target == "prefill": # convert prefill worker into decode worker - config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker] = config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker] - del config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker] - - args = config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker]["extraPodSpec"]["mainContainer"]["args"] + config["spec"]["services"][ + WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker + ] = 
config["spec"]["services"][ + WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker + ] + del config["spec"]["services"][ + WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker + ] + + args = config["spec"]["services"][ + WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker + ]["extraPodSpec"]["mainContainer"]["args"] # remove --is-prefill-worker flag args.remove("--is-prefill-worker") @@ -57,9 +67,13 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d elif target == "decode": # delete prefill worker - del config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker] + del config["spec"]["services"][ + WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker + ] - args = config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker]["extraPodSpec"]["mainContainer"]["args"] + args = config["spec"]["services"][ + WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker + ]["extraPodSpec"]["mainContainer"]["args"] # enable prefix caching if "--enable-prefix-caching" not in args: @@ -68,7 +82,9 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d args.remove("--no-enable-prefix-caching") # set num workers to 1 - decode_worker_config = config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker] + decode_worker_config = config["spec"]["services"][ + WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker + ] decode_worker_config["replicas"] = 1 return config @@ -77,7 +93,9 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d def set_config_tp_size(cls, config: dict, tp_size: int): config = deepcopy(config) - args = config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker]["extraPodSpec"]["mainContainer"]["args"] + args = config["spec"]["services"][ + WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker + ]["extraPodSpec"]["mainContainer"]["args"] try: idx = args.index("--tensor-parallel-size") @@ -91,22 +109,30 @@ def set_config_tp_size(cls, config: dict, tp_size: int): @classmethod def get_model_name(cls, config: dict) -> str: worker_name = WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker - args = config["spec"]["services"][worker_name]["extraPodSpec"]["mainContainer"]["args"] + args = config["spec"]["services"][worker_name]["extraPodSpec"]["mainContainer"][ + "args" + ] for i, arg in enumerate(args): if arg == "--model" and i + 1 < len(args): return args[i + 1] - - logger.warning(f"Model name not found in configuration args, using default model name: {DEFAULT_MODEL_NAME}") + + logger.warning( + f"Model name not found in configuration args, using default model name: {DEFAULT_MODEL_NAME}" + ) return DEFAULT_MODEL_NAME @classmethod def get_port(cls, config: dict) -> int: - args = config["spec"]["services"]["Frontend"]["extraPodSpec"]["mainContainer"]["args"] + args = config["spec"]["services"]["Frontend"]["extraPodSpec"]["mainContainer"][ + "args" + ] for arg in args: if arg.startswith("port="): return int(arg.split("=")[1]) - logger.warning(f"Port not found in configuration args, using default port: {DYNAMO_RUN_DEFAULT_PORT}") + logger.warning( + f"Port not found in configuration args, using default port: {DYNAMO_RUN_DEFAULT_PORT}" + ) return DYNAMO_RUN_DEFAULT_PORT @classmethod diff --git a/benchmarks/profiler/utils/defaults.py b/benchmarks/profiler/utils/defaults.py index 21fd7b43f6f..3084ca66075 100644 --- a/benchmarks/profiler/utils/defaults.py +++ b/benchmarks/profiler/utils/defaults.py @@ -31,4 +31,4 @@ ] DEFAULT_MODEL_NAME = "Qwen/Qwen3-0.6B" -DYNAMO_RUN_DEFAULT_PORT = 8080 \ No 
newline at end of file +DYNAMO_RUN_DEFAULT_PORT = 8080 diff --git a/benchmarks/profiler/utils/dynamo_deployment.py b/benchmarks/profiler/utils/dynamo_deployment.py index ee53a59a2e0..fd1f5419c1b 100644 --- a/benchmarks/profiler/utils/dynamo_deployment.py +++ b/benchmarks/profiler/utils/dynamo_deployment.py @@ -11,21 +11,19 @@ # ] # /// +import argparse import asyncio -import json -import os import random -import yaml -import argparse +from contextlib import asynccontextmanager from pathlib import Path -from typing import Dict, List, Optional, Union +from typing import Optional, Union + import aiofiles +import httpx # added for HTTP requests import kubernetes_asyncio as kubernetes -from kubernetes_asyncio import client, config -from contextlib import asynccontextmanager -import httpx # added for HTTP requests +import yaml from kr8s.asyncio.objects import Service - +from kubernetes_asyncio import client, config # Example chat completion request for testing deployments EXAMPLE_CHAT_REQUEST = { @@ -33,18 +31,24 @@ "messages": [ { "role": "user", - "content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden." + "content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden.", } ], "stream": False, - "max_tokens": 30 + "max_tokens": 30, } + class DynamoDeploymentClient: - def __init__(self, namespace: str, deployment_name: str = "vllm-v1-agg", base_log_dir: Optional[str] = None): + def __init__( + self, + namespace: str, + deployment_name: str = "vllm-v1-agg", + base_log_dir: Optional[str] = None, + ): """ Initialize the client with the namespace and deployment name. 
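# NOTE (editorial, not part of the patch): the class being reformatted in
# this hunk is driven as create -> wait -> collect logs -> port-forward ->
# delete. A condensed sketch of that lifecycle as profile_sla.py uses it;
# the namespace, log directory, and YAML path are placeholders.
import asyncio

from utils.dynamo_deployment import DynamoDeploymentClient


async def run_once():
    client = DynamoDeploymentClient(namespace="dynamo-sla-profiler", base_log_dir="logs")
    await client.create_deployment("config.yaml")  # path to a DynamoGraphDeployment CR
    try:
        await client.wait_for_deployment_ready(timeout=300)
        await client.get_deployment_logs()  # saved under logs/<deployment_name>/
        async with client.port_forward() as port:
            print(f"frontend available on localhost:{port}")
    finally:
        await client.delete_deployment()  # always clean up the CR


asyncio.run(run_once())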
- + Args: namespace: The Kubernetes namespace deployment_name: Name of the deployment, defaults to vllm-v1-agg @@ -55,7 +59,7 @@ def __init__(self, namespace: str, deployment_name: str = "vllm-v1-agg", base_lo self.components = [] # Will store component names from CR self.deployment_spec = None # Will store the full deployment spec self.base_log_dir = Path(base_log_dir) if base_log_dir else Path("logs") - + async def _init_kubernetes(self): """Initialize kubernetes client""" await config.load_kube_config() @@ -66,34 +70,36 @@ async def _init_kubernetes(self): async def create_deployment(self, deployment: Union[dict, str]): """ Create a DynamoGraphDeployment from either a dict or yaml file path. - + Args: deployment: Either a dict containing the deployment spec or a path to a yaml file """ await self._init_kubernetes() - + if isinstance(deployment, str): # Load from yaml file - async with aiofiles.open(deployment, 'r') as f: + async with aiofiles.open(deployment, "r") as f: content = await f.read() self.deployment_spec = yaml.safe_load(content) else: self.deployment_spec = deployment - + # Extract component names - self.components = [svc.lower() for svc in self.deployment_spec['spec']['services'].keys()] - + self.components = [ + svc.lower() for svc in self.deployment_spec["spec"]["services"].keys() + ] + # Ensure name and namespace are set correctly - self.deployment_spec['metadata']['name'] = self.deployment_name - self.deployment_spec['metadata']['namespace'] = self.namespace - + self.deployment_spec["metadata"]["name"] = self.deployment_name + self.deployment_spec["metadata"]["namespace"] = self.namespace + try: await self.custom_api.create_namespaced_custom_object( group="nvidia.com", version="v1alpha1", namespace=self.namespace, plural="dynamographdeployments", - body=self.deployment_spec + body=self.deployment_spec, ) except kubernetes.client.rest.ApiException as e: if e.status == 409: # Already exists @@ -104,7 +110,7 @@ async def create_deployment(self, deployment: Union[dict, str]): async def wait_for_deployment_ready(self, timeout: int = 300): """ Wait for the custom resource to be ready. - + Args: timeout: Maximum time to wait in seconds """ @@ -117,29 +123,33 @@ async def wait_for_deployment_ready(self, timeout: int = 300): version="v1alpha1", namespace=self.namespace, plural="dynamographdeployments", - name=self.deployment_name + name=self.deployment_name, ) # print(f"Current status: {status.get('status', {})}") - + # Check both conditions: # 1. Ready condition is True # 2. 
State is successful - status_obj = status.get('status', {}) - conditions = status_obj.get('conditions', []) - + status_obj = status.get("status", {}) + conditions = status_obj.get("conditions", []) + ready_condition = False for condition in conditions: - if (condition.get('type') == 'Ready' and - condition.get('status') == 'True'): + if ( + condition.get("type") == "Ready" + and condition.get("status") == "True" + ): ready_condition = True break - - state_successful = status_obj.get('state') == 'successful' - + + state_successful = status_obj.get("state") == "successful" + if ready_condition and state_successful: - print("Deployment is ready: Ready condition is True and state is successful") + print( + "Deployment is ready: Ready condition is True and state is successful" + ) return True - + except kubernetes.client.rest.ApiException: pass await asyncio.sleep(20) @@ -173,7 +183,6 @@ async def check_chat_completion(self): response.raise_for_status() return response.text - async def get_deployment_logs(self): """ Get logs from all pods in the deployment, organized by component. @@ -181,28 +190,28 @@ async def get_deployment_logs(self): # Create logs directory base_dir = self.base_log_dir / self.deployment_name base_dir.mkdir(parents=True, exist_ok=True) - + for component in self.components: component_dir = base_dir / component component_dir.mkdir(exist_ok=True) - + # List pods for this component using the selector label # nvidia.com/selector: deployment-name-component - label_selector = f"nvidia.com/selector={self.deployment_name}-{component.lower()}" - + label_selector = ( + f"nvidia.com/selector={self.deployment_name}-{component.lower()}" + ) + pods = await self.core_api.list_namespaced_pod( - namespace=self.namespace, - label_selector=label_selector + namespace=self.namespace, label_selector=label_selector ) - + # Get logs for each pod for i, pod in enumerate(pods.items): try: logs = await self.core_api.read_namespaced_pod_log( - name=pod.metadata.name, - namespace=self.namespace + name=pod.metadata.name, namespace=self.namespace ) - async with aiofiles.open(component_dir / f"{i}.log", 'w') as f: + async with aiofiles.open(component_dir / f"{i}.log", "w") as f: await f.write(logs) except kubernetes.client.rest.ApiException as e: print(f"Error getting logs for pod {pod.metadata.name}: {e}") @@ -217,55 +226,70 @@ async def delete_deployment(self): version="v1alpha1", namespace=self.namespace, plural="dynamographdeployments", - name=self.deployment_name + name=self.deployment_name, ) except kubernetes.client.rest.ApiException as e: if e.status != 404: # Ignore if already deleted raise + async def main(): - parser = argparse.ArgumentParser(description='Deploy and manage DynamoGraphDeployment CRDs') - parser.add_argument('--namespace', '-n', required=True, - help='Kubernetes namespace to deploy to (default: default)') - parser.add_argument('--yaml-file', '-f', required=True, - help='Path to the DynamoGraphDeployment YAML file') - parser.add_argument('--log-dir', '-l', default='/tmp/dynamo_logs', - help='Base directory for logs (default: /tmp/dynamo_logs)') - + parser = argparse.ArgumentParser( + description="Deploy and manage DynamoGraphDeployment CRDs" + ) + parser.add_argument( + "--namespace", + "-n", + required=True, + help="Kubernetes namespace to deploy to (default: default)", + ) + parser.add_argument( + "--yaml-file", + "-f", + required=True, + help="Path to the DynamoGraphDeployment YAML file", + ) + parser.add_argument( + "--log-dir", + "-l", + default="/tmp/dynamo_logs", + 
help="Base directory for logs (default: /tmp/dynamo_logs)", + ) + args = parser.parse_args() # Example usage with parsed arguments - client = DynamoDeploymentClient( - namespace=args.namespace, - base_log_dir=args.log_dir - ) - + client = DynamoDeploymentClient(namespace=args.namespace, base_log_dir=args.log_dir) + try: # Create deployment from yaml file await client.create_deployment(args.yaml_file) - + # Wait for deployment to be ready print("Waiting for deployment to be ready...") await client.wait_for_deployment_ready() print("Deployment is ready!") - + # Test chat completion print("Testing chat completion...") response = await client.check_chat_completion() print(f"Chat completion response: {response}") - + # Get logs print("Getting deployment logs...") await client.get_deployment_logs() - print(f"Logs have been saved to {client.base_log_dir / client.deployment_name}!") - + print( + f"Logs have been saved to {client.base_log_dir / client.deployment_name}!" + ) + finally: # Cleanup print("Cleaning up deployment...") await client.delete_deployment() print("Deployment deleted!") + # run with: # uv run benchmarks/profiler/utils/dynamo_deployment.py -n mo-dyn-cloud -f ./examples/vllm/deploy/agg.yaml -l ./client_logs if __name__ == "__main__": - asyncio.run(main()) \ No newline at end of file + asyncio.run(main()) From 0af209b0668edcc451955cdd7dc4507c077eeb06 Mon Sep 17 00:00:00 2001 From: hongkuan Date: Tue, 15 Jul 2025 10:03:11 -0700 Subject: [PATCH 14/58] add dep; bug fix --- benchmarks/profiler/profile_sla.py | 478 +++++++++--------- benchmarks/profiler/utils/config.py | 3 + .../profiler/utils/dynamo_deployment.py | 5 +- container/deps/requirements.txt | 3 + 4 files changed, 250 insertions(+), 239 deletions(-) diff --git a/benchmarks/profiler/profile_sla.py b/benchmarks/profiler/profile_sla.py index f62436a7f1b..d98aec726f7 100644 --- a/benchmarks/profiler/profile_sla.py +++ b/benchmarks/profiler/profile_sla.py @@ -42,86 +42,7 @@ console_handler.setFormatter(formatter) logger.addHandler(console_handler) -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Profile the TTFT and ITL of the Prefill and Decode engine with different parallelization mapping. When profiling prefill we mock/fix decode,when profiling decode we mock/fix prefill." 
- ) - parser.add_argument( - "--namespace", - type=str, - default="dynamo-sla-profiler", - help="Kubernetes namespace to deploy the DynamoGraphDeployment", - ) - parser.add_argument( - "--backend", - type=str, - default="vllm_v1", - choices=["vllm_v1"], - help="backend type, currently support [vllm_v1]", - ) - parser.add_argument( - "--config", - type=str, - required=True, - help="Path to the DynamoGraphDeployment config file", - ) - parser.add_argument( - "--example-dir", - type=str, - default=None, - help="path to the example directory, if not provided, will try to infer from config file location", - ) - parser.add_argument( - "--output-dir", - type=str, - default="profiling_results", - help="Path to the output results directory", - ) - parser.add_argument( - "--min-num-gpus-per-engine", - type=int, - default=1, - help="minimum number of GPUs per engine", - ) - parser.add_argument( - "--max-num-gpus-per-engine", - type=int, - default=8, - help="maximum number of GPUs per engine", - ) - parser.add_argument( - "--isl", type=int, default=3000, help="target input sequence length" - ) - parser.add_argument( - "--osl", type=int, default=500, help="target output sequence length" - ) - parser.add_argument( - "--ttft", type=int, default=50, help="target Time To First Token in ms" - ) - parser.add_argument( - "--itl", type=int, default=10, help="target Inter Token Latency in ms" - ) - # below are arguments used for interpolating TTFT and ITL under different ISL/OSL - parser.add_argument( - "--max-context-length", - type=int, - default=16384, - help="maximum context length supported by the served model", - ) - parser.add_argument( - "--prefill-interpolation-granularity", - type=int, - default=16, - help="how many samples to benchmark to interpolate TTFT under different ISL", - ) - parser.add_argument( - "--decode-interpolation-granularity", - type=int, - default=6, - help="how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length", - ) - args = parser.parse_args() - +async def run_profile(args): config_modifier = CONFIG_MODIFIERS[args.backend] if args.example_dir is None: @@ -168,37 +89,37 @@ with open(prefill_config_fn, "w") as f: yaml.dump(prefill_config, f) - with DynamoDeploymentClient( + client = DynamoDeploymentClient( namespace=args.namespace, base_log_dir=work_dir - ) as client: - asyncio.run(client.create_deployment(prefill_config_fn)) - logger.info("Waiting for deployment to be ready...") - asyncio.run(client.wait_for_deployment_ready()) - logger.info("Deployment is ready") - - port = asyncio.run(client.port_forward()) - logger.info(f"Port forwarded to {port}") - - logger.info("Getting deployment logs...") - asyncio.run(client.get_deployment_logs()) - logger.info( - f"Logs have been saved to {client.base_log_dir / client.deployment_name}" - ) + ) + await client.create_deployment(prefill_config_fn) + logger.info("Waiting for deployment to be ready...") + await client.wait_for_deployment_ready() + logger.info("Deployment is ready") - # run genai-perf - genai_perf_artifact_dir = f"{work_dir}/gap_isl{args.isl}" - gap_result = benchmark_prefill( - args.isl, genai_perf_artifact_dir, model_name, port - ) - if gap_result is not None: - ttft = gap_result["time_to_first_token"]["avg"] - prefill_tp_size.append(tp_size) - prefill_ttft.append(ttft) - prefill_thpt_per_gpu.append(args.isl / ttft / tp_size * 1000) + port = await client.port_forward() + logger.info(f"Port forwarded to {port}") + + logger.info("Getting deployment logs...") + await 
client.get_deployment_logs() + logger.info( + f"Logs have been saved to {client.base_log_dir / client.deployment_name}" + ) + + # run genai-perf + genai_perf_artifact_dir = f"{work_dir}/gap_isl{args.isl}" + gap_result = benchmark_prefill( + args.isl, genai_perf_artifact_dir, model_name, port + ) + if gap_result is not None: + ttft = gap_result["time_to_first_token"]["avg"] + prefill_tp_size.append(tp_size) + prefill_ttft.append(ttft) + prefill_thpt_per_gpu.append(args.isl / ttft / tp_size * 1000) - print("Cleaning up deployment...") - asyncio.run(client.delete_deployment()) - print("Deployment deleted") + print("Cleaning up deployment...") + await client.delete_deployment() + print("Deployment deleted") # Plot the results as a 2D scatter plot if prefill_tp_size and prefill_ttft and prefill_thpt_per_gpu: @@ -231,62 +152,62 @@ with open(decode_config_fn, "w") as f: yaml.dump(decode_config, f) - with DynamoDeploymentClient( + client = DynamoDeploymentClient( namespace=args.namespace, base_log_dir=work_dir - ) as client: - asyncio.run(client.create_deployment(decode_config_fn)) - logger.info("Waiting for deployment to be ready...") - asyncio.run(client.wait_for_deployment_ready()) - logger.info("Deployment is ready") - - port = asyncio.run(client.port_forward()) - logger.info(f"Port forwarded to {port}") - - logger.info("Getting deployment logs...") - asyncio.run(client.get_deployment_logs()) - logger.info( - f"Logs have been saved to {client.base_log_dir / client.deployment_name}" - ) + ) + await client.create_deployment(decode_config_fn) + logger.info("Waiting for deployment to be ready...") + await client.wait_for_deployment_ready() + logger.info("Deployment is ready") - max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log( - f"{work_dir}/vllm-v1-agg/vllmdecodeworker/0.log" - ) - max_concurrency = max_kv_tokens // (args.isl + args.osl) - sweep_num_request = [ - num for num in DECODE_NUM_REQUESTS_RANGE if num < max_concurrency - ] - logger.info( - f"Sweeping num_request range based on maximum number of kv tokens: {sweep_num_request}" - ) + port = await client.port_forward() + logger.info(f"Port forwarded to {port}") - engine_decode_itl = [] - engine_decode_thpt_per_gpu = [] - for num_request in sweep_num_request: - genai_perf_artifact_dir = f"{work_dir}/gap_request{num_request}_isl{args.isl}_osl{args.osl}_n{num_request}" - gap_result = benchmark_decode( - args.isl, - args.osl, - num_request, - genai_perf_artifact_dir, - model_name, - port, + logger.info("Getting deployment logs...") + await client.get_deployment_logs() + logger.info( + f"Logs have been saved to {client.base_log_dir / client.deployment_name}" + ) + + max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log( + f"{work_dir}/vllm-v1-agg/vllmdecodeworker/0.log" + ) + max_concurrency = max_kv_tokens // (args.isl + args.osl) + sweep_num_request = [ + num for num in DECODE_NUM_REQUESTS_RANGE if num < max_concurrency + ] + logger.info( + f"Sweeping num_request range based on maximum number of kv tokens: {sweep_num_request}" + ) + + engine_decode_itl = [] + engine_decode_thpt_per_gpu = [] + for num_request in sweep_num_request: + genai_perf_artifact_dir = f"{work_dir}/gap_request{num_request}_isl{args.isl}_osl{args.osl}_n{num_request}" + gap_result = benchmark_decode( + args.isl, + args.osl, + num_request, + genai_perf_artifact_dir, + model_name, + port, + ) + if gap_result is not None: + itl = gap_result["inter_token_latency"]["avg"] + thpt_per_gpu = ( + gap_result["output_token_throughput"]["avg"] / tp_size ) - 
if gap_result is not None: - itl = gap_result["inter_token_latency"]["avg"] - thpt_per_gpu = ( - gap_result["output_token_throughput"]["avg"] / tp_size - ) - engine_decode_itl.append(itl) - engine_decode_thpt_per_gpu.append(thpt_per_gpu) - decode_tp_size.append(tp_size) - decode_itl.append(itl) - decode_thpt_per_gpu.append(thpt_per_gpu) - decode_concurrency.append(num_request) - decode_kv_cache_size.append(max_kv_tokens) - - print("Cleaning up deployment...") - asyncio.run(client.delete_deployment()) - print("Deployment deleted") + engine_decode_itl.append(itl) + engine_decode_thpt_per_gpu.append(thpt_per_gpu) + decode_tp_size.append(tp_size) + decode_itl.append(itl) + decode_thpt_per_gpu.append(thpt_per_gpu) + decode_concurrency.append(num_request) + decode_kv_cache_size.append(max_kv_tokens) + + print("Cleaning up deployment...") + await client.delete_deployment() + print("Deployment deleted") # Store partial results for plotting later decode_results.append((tp_size, engine_decode_itl, engine_decode_thpt_per_gpu)) @@ -370,42 +291,42 @@ with open(prefill_config_fn, "w") as f: yaml.dump(prefill_config, f) - with DynamoDeploymentClient( + client = DynamoDeploymentClient( namespace=args.namespace, base_log_dir=work_dir - ) as client: - asyncio.run(client.create_deployment(prefill_config_fn)) - logger.info("Waiting for deployment to be ready...") - asyncio.run(client.wait_for_deployment_ready()) - logger.info("Deployment is ready") + ) + await client.create_deployment(prefill_config_fn) + logger.info("Waiting for deployment to be ready...") + await client.wait_for_deployment_ready() + logger.info("Deployment is ready") - port = asyncio.run(client.port_forward()) - logger.info(f"Port forwarded to {port}") + port = await client.port_forward() + logger.info(f"Port forwarded to {port}") - logger.info("Getting deployment logs...") - asyncio.run(client.get_deployment_logs()) - logger.info( - f"Logs have been saved to {client.base_log_dir / client.deployment_name}" - ) + logger.info("Getting deployment logs...") + await client.get_deployment_logs() + logger.info( + f"Logs have been saved to {client.base_log_dir / client.deployment_name}" + ) - for isl in range( - 100, - args.max_context_length, - (args.max_context_length - 100) // args.prefill_interpolation_granularity, - ): - # run genai-perf - genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}" - gap_result = benchmark_prefill( - isl, genai_perf_artifact_dir, model_name, port - ) - if gap_result is not None: - ttft = gap_result["time_to_first_token"]["avg"] - prefill_isl.append(isl) - prefill_ttft.append(ttft) - prefill_thpt_per_gpu.append(isl / ttft / best_prefill_tp * 1000) + for isl in range( + 100, + args.max_context_length, + (args.max_context_length - 100) // args.prefill_interpolation_granularity, + ): + # run genai-perf + genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}" + gap_result = benchmark_prefill( + isl, genai_perf_artifact_dir, model_name, port + ) + if gap_result is not None: + ttft = gap_result["time_to_first_token"]["avg"] + prefill_isl.append(isl) + prefill_ttft.append(ttft) + prefill_thpt_per_gpu.append(isl / ttft / best_prefill_tp * 1000) - print("Cleaning up deployment...") - asyncio.run(client.delete_deployment()) - print("Deployment deleted") + print("Cleaning up deployment...") + await client.delete_deployment() + print("Deployment deleted") # Interpolate prefill_ttft vs prefill_isl with quadratic function (y=ax^2+bx+c) if len(prefill_isl) > 2: @@ -450,60 +371,60 @@ with open(decode_config_fn, "w") as f: 
yaml.dump(decode_config, f) - with DynamoDeploymentClient( + client = DynamoDeploymentClient( namespace=args.namespace, base_log_dir=work_dir - ) as client: - asyncio.run(client.create_deployment(decode_config_fn)) - logger.info("Waiting for deployment to be ready...") - asyncio.run(client.wait_for_deployment_ready()) - logger.info("Deployment is ready") + ) + await client.create_deployment(decode_config_fn) + logger.info("Waiting for deployment to be ready...") + await client.wait_for_deployment_ready() + logger.info("Deployment is ready") - port = asyncio.run(client.port_forward()) - logger.info(f"Port forwarded to {port}") + port = await client.port_forward() + logger.info(f"Port forwarded to {port}") - logger.info("Getting deployment logs...") - asyncio.run(client.get_deployment_logs()) - logger.info( - f"Logs have been saved to {client.base_log_dir / client.deployment_name}" - ) + logger.info("Getting deployment logs...") + await client.get_deployment_logs() + logger.info( + f"Logs have been saved to {client.base_log_dir / client.deployment_name}" + ) - max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log( - f"{work_dir}/vllm-v1-agg/vllmdecodeworker/0.log" - ) + max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log( + f"{work_dir}/vllm-v1-agg/vllmdecodeworker/0.log" + ) - osl = 500 # not too large to reduce ITL variance, not too small to have stable measurement - for isl in range( - 100, - args.max_context_length - osl, - (args.max_context_length - osl) // args.decode_interpolation_granularity, - ): - max_concurrency = max_kv_tokens // (isl + osl) - sweep_num_request = list( - range( - 1, - max_concurrency, - max_concurrency // args.decode_interpolation_granularity, - ) + osl = 500 # not too large to reduce ITL variance, not too small to have stable measurement + for isl in range( + 100, + args.max_context_length - osl, + (args.max_context_length - osl) // args.decode_interpolation_granularity, + ): + max_concurrency = max_kv_tokens // (isl + osl) + sweep_num_request = list( + range( + 1, + max_concurrency, + max_concurrency // args.decode_interpolation_granularity, ) - for num_request in sweep_num_request: - genai_perf_artifact_dir = ( - f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}" - ) - gap_result = benchmark_decode( - isl, osl, num_request, genai_perf_artifact_dir, model_name, port + ) + for num_request in sweep_num_request: + genai_perf_artifact_dir = ( + f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}" + ) + gap_result = benchmark_decode( + isl, osl, num_request, genai_perf_artifact_dir, model_name, port + ) + if gap_result is not None: + itl = gap_result["inter_token_latency"]["avg"] + x_kv_usage.append((isl + osl / 2) * num_request / max_kv_tokens) + y_context_length.append(isl + osl / 2) + z_itl.append(itl) + z_thpt_per_gpu.append( + gap_result["output_token_throughput"]["avg"] / tp_size ) - if gap_result is not None: - itl = gap_result["inter_token_latency"]["avg"] - x_kv_usage.append((isl + osl / 2) * num_request / max_kv_tokens) - y_context_length.append(isl + osl / 2) - z_itl.append(itl) - z_thpt_per_gpu.append( - gap_result["output_token_throughput"]["avg"] / tp_size - ) - print("Cleaning up deployment...") - asyncio.run(client.delete_deployment()) - print("Deployment deleted") + print("Cleaning up deployment...") + await client.delete_deployment() + print("Deployment deleted") # Save the data points to a .npz file save_path = f"{work_dir}/raw_data.npz" @@ -521,3 +442,86 @@ plot_decode_3d_surface( x_kv_usage, y_context_length, z_itl, 
best_decode_tp, work_dir ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Profile the TTFT and ITL of the Prefill and Decode engine with different parallelization mapping. When profiling prefill we mock/fix decode,when profiling decode we mock/fix prefill." + ) + parser.add_argument( + "--namespace", + type=str, + default="dynamo-sla-profiler", + help="Kubernetes namespace to deploy the DynamoGraphDeployment", + ) + parser.add_argument( + "--backend", + type=str, + default="vllm_v1", + choices=["vllm_v1"], + help="backend type, currently support [vllm_v1]", + ) + parser.add_argument( + "--config", + type=str, + required=True, + help="Path to the DynamoGraphDeployment config file", + ) + parser.add_argument( + "--example-dir", + type=str, + default=None, + help="path to the example directory, if not provided, will try to infer from config file location", + ) + parser.add_argument( + "--output-dir", + type=str, + default="profiling_results", + help="Path to the output results directory", + ) + parser.add_argument( + "--min-num-gpus-per-engine", + type=int, + default=1, + help="minimum number of GPUs per engine", + ) + parser.add_argument( + "--max-num-gpus-per-engine", + type=int, + default=8, + help="maximum number of GPUs per engine", + ) + parser.add_argument( + "--isl", type=int, default=3000, help="target input sequence length" + ) + parser.add_argument( + "--osl", type=int, default=500, help="target output sequence length" + ) + parser.add_argument( + "--ttft", type=int, default=50, help="target Time To First Token in ms" + ) + parser.add_argument( + "--itl", type=int, default=10, help="target Inter Token Latency in ms" + ) + # below are arguments used for interpolating TTFT and ITL under different ISL/OSL + parser.add_argument( + "--max-context-length", + type=int, + default=16384, + help="maximum context length supported by the served model", + ) + parser.add_argument( + "--prefill-interpolation-granularity", + type=int, + default=16, + help="how many samples to benchmark to interpolate TTFT under different ISL", + ) + parser.add_argument( + "--decode-interpolation-granularity", + type=int, + default=6, + help="how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length", + ) + args = parser.parse_args() + + asyncio.run(run_profile(args)) \ No newline at end of file diff --git a/benchmarks/profiler/utils/config.py b/benchmarks/profiler/utils/config.py index 73b53c3d072..c574a6ea624 100644 --- a/benchmarks/profiler/utils/config.py +++ b/benchmarks/profiler/utils/config.py @@ -37,6 +37,9 @@ class VllmV1ConfigModifier: def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> dict: config = deepcopy(config) + # set metadata name + config["metadata"]["name"] = "vllm-v1-agg" + # disable planner if "Planner" in config["spec"]["services"]: del config["spec"]["services"]["Planner"] diff --git a/benchmarks/profiler/utils/dynamo_deployment.py b/benchmarks/profiler/utils/dynamo_deployment.py index fd1f5419c1b..fcfbd5d5235 100644 --- a/benchmarks/profiler/utils/dynamo_deployment.py +++ b/benchmarks/profiler/utils/dynamo_deployment.py @@ -14,6 +14,7 @@ import argparse import asyncio import random +import time from contextlib import asynccontextmanager from pathlib import Path from typing import Optional, Union @@ -114,9 +115,9 @@ async def wait_for_deployment_ready(self, timeout: int = 300): Args: timeout: Maximum time to wait in seconds """ - start_time = asyncio.get_event_loop().time() + 
start_time = time.time() # TODO: A little brittle, also should output intermediate status every so often. - while (asyncio.get_event_loop().time() - start_time) < timeout: + while (time.time() - start_time) < timeout: try: status = await self.custom_api.get_namespaced_custom_object_status( group="nvidia.com", diff --git a/container/deps/requirements.txt b/container/deps/requirements.txt index 6b7241257de..cd0d42ac852 100644 --- a/container/deps/requirements.txt +++ b/container/deps/requirements.txt @@ -45,3 +45,6 @@ tensorboardX==2.6.2.2 transformers types-PyYAML uvicorn +aiofiles +kubernetes_asyncio +kr8s \ No newline at end of file From 3f900ef6685b8c184d66fc40661cb4c2f8f15d60 Mon Sep 17 00:00:00 2001 From: hongkuan Date: Tue, 15 Jul 2025 11:00:19 -0700 Subject: [PATCH 15/58] staging, port forward not working --- benchmarks/profiler/profile_sla.py | 165 +++++++++--------- benchmarks/profiler/utils/config.py | 68 +++++++- .../profiler/utils/dynamo_deployment.py | 3 + 3 files changed, 146 insertions(+), 90 deletions(-) diff --git a/benchmarks/profiler/profile_sla.py b/benchmarks/profiler/profile_sla.py index d98aec726f7..73db2b2a2e1 100644 --- a/benchmarks/profiler/profile_sla.py +++ b/benchmarks/profiler/profile_sla.py @@ -90,16 +90,13 @@ async def run_profile(args): yaml.dump(prefill_config, f) client = DynamoDeploymentClient( - namespace=args.namespace, base_log_dir=work_dir + namespace=args.namespace, base_log_dir=work_dir, model_name=model_name ) await client.create_deployment(prefill_config_fn) logger.info("Waiting for deployment to be ready...") await client.wait_for_deployment_ready() logger.info("Deployment is ready") - port = await client.port_forward() - logger.info(f"Port forwarded to {port}") - logger.info("Getting deployment logs...") await client.get_deployment_logs() logger.info( @@ -107,15 +104,17 @@ async def run_profile(args): ) # run genai-perf - genai_perf_artifact_dir = f"{work_dir}/gap_isl{args.isl}" - gap_result = benchmark_prefill( - args.isl, genai_perf_artifact_dir, model_name, port - ) - if gap_result is not None: - ttft = gap_result["time_to_first_token"]["avg"] - prefill_tp_size.append(tp_size) - prefill_ttft.append(ttft) - prefill_thpt_per_gpu.append(args.isl / ttft / tp_size * 1000) + async with client.port_forward() as port: + import pdb; pdb.set_trace() + genai_perf_artifact_dir = f"{work_dir}/gap_isl{args.isl}" + gap_result = benchmark_prefill( + args.isl, genai_perf_artifact_dir, model_name, port + ) + if gap_result is not None: + ttft = gap_result["time_to_first_token"]["avg"] + prefill_tp_size.append(tp_size) + prefill_ttft.append(ttft) + prefill_thpt_per_gpu.append(args.isl / ttft / tp_size * 1000) print("Cleaning up deployment...") await client.delete_deployment() @@ -153,16 +152,13 @@ async def run_profile(args): yaml.dump(decode_config, f) client = DynamoDeploymentClient( - namespace=args.namespace, base_log_dir=work_dir + namespace=args.namespace, base_log_dir=work_dir, model_name=model_name ) await client.create_deployment(decode_config_fn) logger.info("Waiting for deployment to be ready...") await client.wait_for_deployment_ready() logger.info("Deployment is ready") - port = await client.port_forward() - logger.info(f"Port forwarded to {port}") - logger.info("Getting deployment logs...") await client.get_deployment_logs() logger.info( @@ -182,28 +178,29 @@ async def run_profile(args): engine_decode_itl = [] engine_decode_thpt_per_gpu = [] - for num_request in sweep_num_request: - genai_perf_artifact_dir = 
f"{work_dir}/gap_request{num_request}_isl{args.isl}_osl{args.osl}_n{num_request}" - gap_result = benchmark_decode( - args.isl, - args.osl, - num_request, - genai_perf_artifact_dir, - model_name, - port, - ) - if gap_result is not None: - itl = gap_result["inter_token_latency"]["avg"] - thpt_per_gpu = ( - gap_result["output_token_throughput"]["avg"] / tp_size + async with client.port_forward() as port: + for num_request in sweep_num_request: + genai_perf_artifact_dir = f"{work_dir}/gap_request{num_request}_isl{args.isl}_osl{args.osl}_n{num_request}" + gap_result = benchmark_decode( + args.isl, + args.osl, + num_request, + genai_perf_artifact_dir, + model_name, + port, ) - engine_decode_itl.append(itl) - engine_decode_thpt_per_gpu.append(thpt_per_gpu) - decode_tp_size.append(tp_size) - decode_itl.append(itl) - decode_thpt_per_gpu.append(thpt_per_gpu) - decode_concurrency.append(num_request) - decode_kv_cache_size.append(max_kv_tokens) + if gap_result is not None: + itl = gap_result["inter_token_latency"]["avg"] + thpt_per_gpu = ( + gap_result["output_token_throughput"]["avg"] / tp_size + ) + engine_decode_itl.append(itl) + engine_decode_thpt_per_gpu.append(thpt_per_gpu) + decode_tp_size.append(tp_size) + decode_itl.append(itl) + decode_thpt_per_gpu.append(thpt_per_gpu) + decode_concurrency.append(num_request) + decode_kv_cache_size.append(max_kv_tokens) print("Cleaning up deployment...") await client.delete_deployment() @@ -292,37 +289,35 @@ async def run_profile(args): yaml.dump(prefill_config, f) client = DynamoDeploymentClient( - namespace=args.namespace, base_log_dir=work_dir + namespace=args.namespace, base_log_dir=work_dir, model_name=model_name ) await client.create_deployment(prefill_config_fn) logger.info("Waiting for deployment to be ready...") await client.wait_for_deployment_ready() logger.info("Deployment is ready") - port = await client.port_forward() - logger.info(f"Port forwarded to {port}") - logger.info("Getting deployment logs...") await client.get_deployment_logs() logger.info( f"Logs have been saved to {client.base_log_dir / client.deployment_name}" ) - for isl in range( - 100, - args.max_context_length, - (args.max_context_length - 100) // args.prefill_interpolation_granularity, - ): - # run genai-perf - genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}" - gap_result = benchmark_prefill( - isl, genai_perf_artifact_dir, model_name, port - ) - if gap_result is not None: - ttft = gap_result["time_to_first_token"]["avg"] - prefill_isl.append(isl) - prefill_ttft.append(ttft) - prefill_thpt_per_gpu.append(isl / ttft / best_prefill_tp * 1000) + async with client.port_forward() as port: + for isl in range( + 100, + args.max_context_length, + (args.max_context_length - 100) // args.prefill_interpolation_granularity, + ): + # run genai-perf + genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}" + gap_result = benchmark_prefill( + isl, genai_perf_artifact_dir, model_name, port + ) + if gap_result is not None: + ttft = gap_result["time_to_first_token"]["avg"] + prefill_isl.append(isl) + prefill_ttft.append(ttft) + prefill_thpt_per_gpu.append(isl / ttft / best_prefill_tp * 1000) print("Cleaning up deployment...") await client.delete_deployment() @@ -379,9 +374,6 @@ async def run_profile(args): await client.wait_for_deployment_ready() logger.info("Deployment is ready") - port = await client.port_forward() - logger.info(f"Port forwarded to {port}") - logger.info("Getting deployment logs...") await client.get_deployment_logs() logger.info( @@ -393,34 +385,35 @@ async def 
run_profile(args): ) osl = 500 # not too large to reduce ITL variance, not too small to have stable measurement - for isl in range( - 100, - args.max_context_length - osl, - (args.max_context_length - osl) // args.decode_interpolation_granularity, - ): - max_concurrency = max_kv_tokens // (isl + osl) - sweep_num_request = list( - range( - 1, - max_concurrency, - max_concurrency // args.decode_interpolation_granularity, - ) - ) - for num_request in sweep_num_request: - genai_perf_artifact_dir = ( - f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}" - ) - gap_result = benchmark_decode( - isl, osl, num_request, genai_perf_artifact_dir, model_name, port + async with client.port_forward() as port: + for isl in range( + 100, + args.max_context_length - osl, + (args.max_context_length - osl) // args.decode_interpolation_granularity, + ): + max_concurrency = max_kv_tokens // (isl + osl) + sweep_num_request = list( + range( + 1, + max_concurrency, + max_concurrency // args.decode_interpolation_granularity, + ) ) - if gap_result is not None: - itl = gap_result["inter_token_latency"]["avg"] - x_kv_usage.append((isl + osl / 2) * num_request / max_kv_tokens) - y_context_length.append(isl + osl / 2) - z_itl.append(itl) - z_thpt_per_gpu.append( - gap_result["output_token_throughput"]["avg"] / tp_size + for num_request in sweep_num_request: + genai_perf_artifact_dir = ( + f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}" + ) + gap_result = benchmark_decode( + isl, osl, num_request, genai_perf_artifact_dir, model_name, port ) + if gap_result is not None: + itl = gap_result["inter_token_latency"]["avg"] + x_kv_usage.append((isl + osl / 2) * num_request / max_kv_tokens) + y_context_length.append(isl + osl / 2) + z_itl.append(itl) + z_thpt_per_gpu.append( + gap_result["output_token_throughput"]["avg"] / tp_size + ) print("Cleaning up deployment...") await client.delete_deployment() diff --git a/benchmarks/profiler/utils/config.py b/benchmarks/profiler/utils/config.py index c574a6ea624..ee5e0de9634 100644 --- a/benchmarks/profiler/utils/config.py +++ b/benchmarks/profiler/utils/config.py @@ -31,6 +31,44 @@ console_handler.setFormatter(formatter) logger.addHandler(console_handler) +def break_arguments(args: list[str]) -> list[str]: + ans = [] + if isinstance(args, str): + ans = args.split(" ") + else: + for arg in args: + ans.extend(arg.split(" ")) + return ans + +def join_arguments(args: list[str]) -> str: + return [" ".join(args)] + +def append_argument(args: list[str], to_append) -> list[str]: + idx = find_arg_index(args) + if isinstance(to_append, list): + args[idx:idx] = to_append + else: + args.insert(idx, to_append) + return args + +def find_arg_index(args: list[str]) -> int: + # find the correct index to insert an argument + idx = len(args) + + try: + new_idx = args.index("|") + idx = min(idx, new_idx) + except ValueError: + pass + + try: + new_idx = args.index("2>&1") + idx = min(idx, new_idx) + except ValueError: + pass + + return idx + class VllmV1ConfigModifier: @classmethod @@ -59,6 +97,8 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker ]["extraPodSpec"]["mainContainer"]["args"] + args = break_arguments(args) + # remove --is-prefill-worker flag args.remove("--is-prefill-worker") @@ -66,7 +106,11 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d if "--enable-prefix-caching" in args: args.remove("--enable-prefix-caching") if "--no-enable-prefix-caching" not in args: - 
args.append("--no-enable-prefix-caching") + args = append_argument(args, "--no-enable-prefix-caching") + + config["spec"]["services"][ + WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker + ]["extraPodSpec"]["mainContainer"]["args"] = join_arguments(args) elif target == "decode": # delete prefill worker @@ -78,12 +122,18 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker ]["extraPodSpec"]["mainContainer"]["args"] + args = break_arguments(args) + # enable prefix caching if "--enable-prefix-caching" not in args: - args.append("--enable-prefix-caching") + args = append_argument(args, "--enable-prefix-caching") if "--no-enable-prefix-caching" in args: args.remove("--no-enable-prefix-caching") + config["spec"]["services"][ + WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker + ]["extraPodSpec"]["mainContainer"]["args"] = join_arguments(args) + # set num workers to 1 decode_worker_config = config["spec"]["services"][ WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker @@ -96,16 +146,24 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d def set_config_tp_size(cls, config: dict, tp_size: int): config = deepcopy(config) + config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker]["resources"]["requests"]["gpu"] = str(tp_size) + config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker]["resources"]["limits"]["gpu"] = str(tp_size) + args = config["spec"]["services"][ WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker ]["extraPodSpec"]["mainContainer"]["args"] + args = break_arguments(args) + try: idx = args.index("--tensor-parallel-size") args[idx + 1] = str(tp_size) except ValueError: - args.append("--tensor-parallel-size") - args.append(str(tp_size)) + args = append_argument(args, ["--tensor-parallel-size", str(tp_size)]) + + config["spec"]["services"][ + WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker + ]["extraPodSpec"]["mainContainer"]["args"] = join_arguments(args) return config @@ -116,6 +174,7 @@ def get_model_name(cls, config: dict) -> str: "args" ] + args = break_arguments(args) for i, arg in enumerate(args): if arg == "--model" and i + 1 < len(args): return args[i + 1] @@ -130,6 +189,7 @@ def get_port(cls, config: dict) -> int: args = config["spec"]["services"]["Frontend"]["extraPodSpec"]["mainContainer"][ "args" ] + args = break_arguments(args) for arg in args: if arg.startswith("port="): return int(arg.split("=")[1]) diff --git a/benchmarks/profiler/utils/dynamo_deployment.py b/benchmarks/profiler/utils/dynamo_deployment.py index fcfbd5d5235..b8f7820f0cb 100644 --- a/benchmarks/profiler/utils/dynamo_deployment.py +++ b/benchmarks/profiler/utils/dynamo_deployment.py @@ -44,6 +44,7 @@ class DynamoDeploymentClient: def __init__( self, namespace: str, + model_name: str = "Qwen/Qwen3-0.6B", deployment_name: str = "vllm-v1-agg", base_log_dir: Optional[str] = None, ): @@ -57,6 +58,7 @@ def __init__( """ self.namespace = namespace self.deployment_name = deployment_name + self.model_name = model_name self.components = [] # Will store component names from CR self.deployment_spec = None # Will store the full deployment spec self.base_log_dir = Path(base_log_dir) if base_log_dir else Path("logs") @@ -177,6 +179,7 @@ async def check_chat_completion(self): """ Test the deployment with a chat completion request using httpx. 
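
The `break_arguments`/`join_arguments` pair above round-trips the container `args` between a single shell string and a token list by splitting on plain spaces. A quote-aware variant of the same idea, sketched here with `shlex` (the `_safe` helper names are illustrative, not part of the patch):

```python
import shlex

def break_arguments_safe(args) -> list[str]:
    # Split a single "cmd --flag value" string (or a list of such
    # fragments) into tokens, honoring quoted values like --name "a b".
    if isinstance(args, str):
        return shlex.split(args)
    tokens: list[str] = []
    for fragment in args:
        tokens.extend(shlex.split(fragment))
    return tokens

def join_arguments_safe(tokens: list[str]) -> list[str]:
    # Re-quote and rejoin into the single-element list form that the
    # deployment spec's mainContainer args expect.
    return [shlex.join(tokens)]
```

Note that `join_arguments` as written in the patch returns a one-element list, so `list[str]` describes its actual return type more precisely than the `str` annotation; the sketch mirrors that one-element shape.
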
""" + EXAMPLE_CHAT_REQUEST["model"] = self.model_name async with self.port_forward() as port: url = f"http://localhost:{port}/v1/chat/completions" async with httpx.AsyncClient() as client: From bd12d404aecc724c4aa92e2bdc92392e3016e735 Mon Sep 17 00:00:00 2001 From: hongkuan Date: Tue, 15 Jul 2025 13:10:56 -0700 Subject: [PATCH 16/58] stage --- benchmarks/profiler/profile_sla.py | 1 - benchmarks/profiler/utils/genai_perf.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/profiler/profile_sla.py b/benchmarks/profiler/profile_sla.py index 73db2b2a2e1..c17910ef281 100644 --- a/benchmarks/profiler/profile_sla.py +++ b/benchmarks/profiler/profile_sla.py @@ -105,7 +105,6 @@ async def run_profile(args): # run genai-perf async with client.port_forward() as port: - import pdb; pdb.set_trace() genai_perf_artifact_dir = f"{work_dir}/gap_isl{args.isl}" gap_result = benchmark_prefill( args.isl, genai_perf_artifact_dir, model_name, port diff --git a/benchmarks/profiler/utils/genai_perf.py b/benchmarks/profiler/utils/genai_perf.py index 1e8f2d1b2ef..b182fd5ff92 100644 --- a/benchmarks/profiler/utils/genai_perf.py +++ b/benchmarks/profiler/utils/genai_perf.py @@ -151,6 +151,7 @@ def benchmark_prefill(isl, genai_perf_artifact_dir, model_name, port): genai_perf_cmd = get_prefill_genai_perf_cmd( isl, genai_perf_artifact_dir, model=model_name, port=port ) + import pdb; pdb.set_trace() gap_process = subprocess.Popen( genai_perf_cmd, stdout=subprocess.PIPE, From 9971acfa8d32f573b1e49ccc59dba43a9ddb2fc0 Mon Sep 17 00:00:00 2001 From: mohammedabdulwahhab Date: Tue, 15 Jul 2025 17:35:48 -0700 Subject: [PATCH 17/58] fix: running script --- benchmarks/profiler/README.md | 10 ++++ benchmarks/profiler/profile_sla.py | 9 +-- .../profiler/utils/dynamo_deployment.py | 18 +++--- benchmarks/profiler/utils/genai_perf.py | 3 +- k8s.sh | 58 +++++++++++++++++++ 5 files changed, 84 insertions(+), 14 deletions(-) create mode 100644 benchmarks/profiler/README.md create mode 100755 k8s.sh diff --git a/benchmarks/profiler/README.md b/benchmarks/profiler/README.md new file mode 100644 index 00000000000..03a6c166faf --- /dev/null +++ b/benchmarks/profiler/README.md @@ -0,0 +1,10 @@ +# Profiler + +## Setup + +From within the dynamo container: +```bash +./k8s.sh # install binaries, auth into aks cluster +cd benchmarks/profiler +python -m profile_sla --config ../../examples/vllm/deploy/disagg.yaml --namespace mo-dyn-cloud # run the profiler +``` \ No newline at end of file diff --git a/benchmarks/profiler/profile_sla.py b/benchmarks/profiler/profile_sla.py index c17910ef281..78a5871273f 100644 --- a/benchmarks/profiler/profile_sla.py +++ b/benchmarks/profiler/profile_sla.py @@ -104,7 +104,7 @@ async def run_profile(args): ) # run genai-perf - async with client.port_forward() as port: + with client.port_forward() as port: genai_perf_artifact_dir = f"{work_dir}/gap_isl{args.isl}" gap_result = benchmark_prefill( args.isl, genai_perf_artifact_dir, model_name, port @@ -177,7 +177,8 @@ async def run_profile(args): engine_decode_itl = [] engine_decode_thpt_per_gpu = [] - async with client.port_forward() as port: + with client.port_forward() as port: + for num_request in sweep_num_request: genai_perf_artifact_dir = f"{work_dir}/gap_request{num_request}_isl{args.isl}_osl{args.osl}_n{num_request}" gap_result = benchmark_decode( @@ -301,7 +302,7 @@ async def run_profile(args): f"Logs have been saved to {client.base_log_dir / client.deployment_name}" ) - async with client.port_forward() as port: + with 
client.port_forward() as port: for isl in range( 100, args.max_context_length, @@ -384,7 +385,7 @@ async def run_profile(args): ) osl = 500 # not too large to reduce ITL variance, not too small to have stable measurement - async with client.port_forward() as port: + with client.port_forward() as port: for isl in range( 100, args.max_context_length - osl, diff --git a/benchmarks/profiler/utils/dynamo_deployment.py b/benchmarks/profiler/utils/dynamo_deployment.py index b8f7820f0cb..8fd67810052 100644 --- a/benchmarks/profiler/utils/dynamo_deployment.py +++ b/benchmarks/profiler/utils/dynamo_deployment.py @@ -15,7 +15,7 @@ import asyncio import random import time -from contextlib import asynccontextmanager +from contextlib import contextmanager from pathlib import Path from typing import Optional, Union @@ -23,7 +23,7 @@ import httpx # added for HTTP requests import kubernetes_asyncio as kubernetes import yaml -from kr8s.asyncio.objects import Service +from kr8s.objects import Service from kubernetes_asyncio import client, config # Example chat completion request for testing deployments @@ -158,8 +158,8 @@ async def wait_for_deployment_ready(self, timeout: int = 300): await asyncio.sleep(20) raise TimeoutError("Deployment failed to become ready within timeout") - @asynccontextmanager - async def port_forward(self, port: Optional[int] = None): + @contextmanager + def port_forward(self, port: Optional[int] = None): """ Forward the service's HTTP port to a local port. """ @@ -167,21 +167,21 @@ async def port_forward(self, port: Optional[int] = None): port = random.randint(49152, 65535) svc_name = f"{self.deployment_name}-frontend" # Get the Service and forward its HTTP port (8000) - service = await Service.get(svc_name, namespace=self.namespace) + service = Service.get(svc_name, namespace=self.namespace) pf = service.portforward(remote_port=8000, local_port=port) - await pf.start() + pf.start() try: yield port finally: - await pf.stop() + pf.stop() async def check_chat_completion(self): """ Test the deployment with a chat completion request using httpx. """ EXAMPLE_CHAT_REQUEST["model"] = self.model_name - async with self.port_forward() as port: - url = f"http://localhost:{port}/v1/chat/completions" + with self.port_forward() as port: + url = f"http://localhost:{port}/v1/chat/completions" async with httpx.AsyncClient() as client: response = await client.post(url, json=EXAMPLE_CHAT_REQUEST) response.raise_for_status() diff --git a/benchmarks/profiler/utils/genai_perf.py b/benchmarks/profiler/utils/genai_perf.py index b182fd5ff92..8416059afb5 100644 --- a/benchmarks/profiler/utils/genai_perf.py +++ b/benchmarks/profiler/utils/genai_perf.py @@ -151,7 +151,8 @@ def benchmark_prefill(isl, genai_perf_artifact_dir, model_name, port): genai_perf_cmd = get_prefill_genai_perf_cmd( isl, genai_perf_artifact_dir, model=model_name, port=port ) - import pdb; pdb.set_trace() + print(f"genai-perf cmd: {genai_perf_cmd}") + # import pdb; pdb.set_trace() gap_process = subprocess.Popen( genai_perf_cmd, stdout=subprocess.PIPE, diff --git a/k8s.sh b/k8s.sh new file mode 100755 index 00000000000..2c213fe5587 --- /dev/null +++ b/k8s.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash +set -euo pipefail + +# 1. Install Homebrew if missing +if ! 
command -v brew &> /dev/null; then + echo "Homebrew not found—installing prerequisites and Homebrew…" + # Install build-time prerequisites + apt-get update + apt-get install -y build-essential procps curl file git + + # Non-interactive Homebrew install + NONINTERACTIVE=1 \ + /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" + + # Load brew into this shell + eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)" +else + echo "Homebrew already installed, skipping." +fi + +# 2. Ensure brew is up-to-date +echo "Updating Homebrew…" +brew update + +# 3. Install Azure CLI +if ! command -v az &> /dev/null; then + echo "Installing Azure CLI (az)…" + brew install azure-cli +else + echo "Azure CLI already installed, skipping." +fi + +# 4. Install kubelogin +if ! command -v kubelogin &> /dev/null; then + echo "Installing kubelogin…" + brew install Azure/kubelogin/kubelogin +else + echo "kubelogin already installed, skipping." +fi + +# 5. Install kubectl +if ! command -v kubectl &> /dev/null; then + echo "Installing kubectl (kubernetes-cli)…" + brew install kubernetes-cli +else + echo "kubectl already installed, skipping." +fi + +echo "✅ All tools are installed and up-to-date." + +echo >> /root/.bashrc +echo 'eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)"' >> /root/.bashrc +eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)" + +az login +az aks get-credentials --resource-group rg-aks-dynamo-dev --name aks-dynamo-dev +kubelogin convert-kubeconfig -l azurecli +kubectl auth can-i create deployments From a5d8acab4cc8f77a22f418096148ba140487630e Mon Sep 17 00:00:00 2001 From: mohammedabdulwahhab Date: Tue, 15 Jul 2025 17:57:53 -0700 Subject: [PATCH 18/58] fix: fix --- examples/vllm/deploy/disagg.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/vllm/deploy/disagg.yaml b/examples/vllm/deploy/disagg.yaml index 31113f5245c..2cc866c7ae8 100644 --- a/examples/vllm/deploy/disagg.yaml +++ b/examples/vllm/deploy/disagg.yaml @@ -85,11 +85,11 @@ spec: resources: requests: cpu: "10" - memory: "20Gi" + memory: "40Gi" gpu: "1" limits: cpu: "10" - memory: "20Gi" + memory: "40Gi" gpu: "1" extraPodSpec: mainContainer: @@ -124,11 +124,11 @@ spec: resources: requests: cpu: "10" - memory: "20Gi" + memory: "40Gi" gpu: "1" limits: cpu: "10" - memory: "20Gi" + memory: "40Gi" gpu: "1" extraPodSpec: mainContainer: From f8f936341056bf3e0a77631db518e656b26bafda Mon Sep 17 00:00:00 2001 From: hongkuan Date: Wed, 16 Jul 2025 14:28:00 -0700 Subject: [PATCH 19/58] add logic to find a free port --- benchmarks/profiler/utils/dynamo_deployment.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/benchmarks/profiler/utils/dynamo_deployment.py b/benchmarks/profiler/utils/dynamo_deployment.py index 8fd67810052..f2d2b1530a6 100644 --- a/benchmarks/profiler/utils/dynamo_deployment.py +++ b/benchmarks/profiler/utils/dynamo_deployment.py @@ -15,6 +15,7 @@ import asyncio import random import time +import socket from contextlib import contextmanager from pathlib import Path from typing import Optional, Union @@ -164,7 +165,18 @@ def port_forward(self, port: Optional[int] = None): Forward the service's HTTP port to a local port. 
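
k8s.sh bootstraps Homebrew, az, kubelogin, and kubectl with skip-if-present checks. The same preflight can be done from Python before any deployment is created; a minimal sketch (the helper name is illustrative):

```python
import shutil

def missing_tools(tools=("az", "kubelogin", "kubectl")) -> list[str]:
    # Return the subset of required CLI tools not found on PATH,
    # so the profiler can fail fast instead of mid-run.
    return [tool for tool in tools if shutil.which(tool) is None]

missing = missing_tools()
if missing:
    raise SystemExit(f"missing required tools: {', '.join(missing)}")
```
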
""" if port is None: - port = random.randint(49152, 65535) + # Find a free port in the ephemeral port range + for _ in range(100): # Try up to 100 times + candidate_port = random.randint(49152, 65535) + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: + try: + s.bind(('localhost', candidate_port)) + port = candidate_port + break + except OSError: + continue # Port is in use, try another + if port is None: + raise RuntimeError("Could not find a free port after 100 attempts") svc_name = f"{self.deployment_name}-frontend" # Get the Service and forward its HTTP port (8000) service = Service.get(svc_name, namespace=self.namespace) From 8e292f6fad30350c0285e76082a17d4af9fc74aa Mon Sep 17 00:00:00 2001 From: hhzhang16 <54051230+hhzhang16@users.noreply.github.com> Date: Wed, 16 Jul 2025 17:59:25 -0700 Subject: [PATCH 20/58] feat: add Kubernetes service account configuration for SLA profiling jobs (#1975) --- .../profiler/utils/dynamo_deployment.py | 14 ++++++++--- examples/vllm/deploy/profile_sla_binding.yaml | 13 ++++++++++ examples/vllm/deploy/profile_sla_rbac.yaml | 25 +++++++++++++++++++ examples/vllm/deploy/profile_sla_sa.yaml | 7 ++++++ 4 files changed, 55 insertions(+), 4 deletions(-) mode change 100644 => 100755 benchmarks/profiler/utils/dynamo_deployment.py create mode 100644 examples/vllm/deploy/profile_sla_binding.yaml create mode 100644 examples/vllm/deploy/profile_sla_rbac.yaml create mode 100644 examples/vllm/deploy/profile_sla_sa.yaml diff --git a/benchmarks/profiler/utils/dynamo_deployment.py b/benchmarks/profiler/utils/dynamo_deployment.py old mode 100644 new mode 100755 index f2d2b1530a6..488b51f31bc --- a/benchmarks/profiler/utils/dynamo_deployment.py +++ b/benchmarks/profiler/utils/dynamo_deployment.py @@ -64,9 +64,15 @@ def __init__( self.deployment_spec = None # Will store the full deployment spec self.base_log_dir = Path(base_log_dir) if base_log_dir else Path("logs") - async def _init_kubernetes(self): + def _init_kubernetes(self): """Initialize kubernetes client""" - await config.load_kube_config() + try: + # Try in-cluster config first (for pods with service accounts) + config.load_incluster_config() + except Exception: + # Fallback to kube config file (for local development) + config.load_kube_config() + self.k8s_client = client.ApiClient() self.custom_api = client.CustomObjectsApi(self.k8s_client) self.core_api = client.CoreV1Api(self.k8s_client) @@ -78,7 +84,7 @@ async def create_deployment(self, deployment: Union[dict, str]): Args: deployment: Either a dict containing the deployment spec or a path to a yaml file """ - await self._init_kubernetes() + self._init_kubernetes() if isinstance(deployment, str): # Load from yaml file @@ -193,7 +199,7 @@ async def check_chat_completion(self): """ EXAMPLE_CHAT_REQUEST["model"] = self.model_name with self.port_forward() as port: - url = f"http://localhost:{port}/v1/chat/completions" + url = f"http://localhost:{port}/v1/chat/completions" async with httpx.AsyncClient() as client: response = await client.post(url, json=EXAMPLE_CHAT_REQUEST) response.raise_for_status() diff --git a/examples/vllm/deploy/profile_sla_binding.yaml b/examples/vllm/deploy/profile_sla_binding.yaml new file mode 100644 index 00000000000..f32a0f2f51a --- /dev/null +++ b/examples/vllm/deploy/profile_sla_binding.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: profile-sla-binding + namespace: ${NAMESPACE} +subjects: +- kind: ServiceAccount + name: profile-sla-sa + namespace: 
${NAMESPACE} +roleRef: + kind: Role + name: profile-sla-role + apiGroup: rbac.authorization.k8s.io diff --git a/examples/vllm/deploy/profile_sla_rbac.yaml b/examples/vllm/deploy/profile_sla_rbac.yaml new file mode 100644 index 00000000000..5cf2f6e5a82 --- /dev/null +++ b/examples/vllm/deploy/profile_sla_rbac.yaml @@ -0,0 +1,25 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: profile-sla-role + namespace: ${NAMESPACE} +rules: + # DynamoGraphDeployment custom resources + - apiGroups: ["nvidia.com"] + resources: ["dynamographdeployments"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + # Core resources needed for deployment management + - apiGroups: [""] + resources: ["pods", "services", "configmaps", "secrets"] + verbs: ["get", "list", "watch"] + - apiGroups: [""] + resources: ["pods/log"] + verbs: ["get", "list"] + # Apps resources + - apiGroups: ["apps"] + resources: ["deployments", "replicasets"] + verbs: ["get", "list", "watch"] + # For port forwarding + - apiGroups: [""] + resources: ["pods/portforward"] + verbs: ["create"] diff --git a/examples/vllm/deploy/profile_sla_sa.yaml b/examples/vllm/deploy/profile_sla_sa.yaml new file mode 100644 index 00000000000..6e6955e655d --- /dev/null +++ b/examples/vllm/deploy/profile_sla_sa.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: profile-sla-sa + namespace: ${NAMESPACE} +imagePullSecrets: + - name: nvcr-imagepullsecret From d62731f78aab6e1a9a68249053a1a48122bd3100 Mon Sep 17 00:00:00 2001 From: Hannah Zhang Date: Thu, 17 Jul 2025 15:37:51 -0700 Subject: [PATCH 21/58] feat: use service DNS for interfacing with deployments when profiling in k8s --- benchmarks/profiler/profile_sla.py | 35 ++++++------ .../profiler/utils/dynamo_deployment.py | 57 ++++++++++++++++--- benchmarks/profiler/utils/genai_perf.py | 32 +++++++---- 3 files changed, 90 insertions(+), 34 deletions(-) diff --git a/benchmarks/profiler/profile_sla.py b/benchmarks/profiler/profile_sla.py index 78a5871273f..955c317ffbd 100644 --- a/benchmarks/profiler/profile_sla.py +++ b/benchmarks/profiler/profile_sla.py @@ -42,6 +42,7 @@ console_handler.setFormatter(formatter) logger.addHandler(console_handler) + async def run_profile(args): config_modifier = CONFIG_MODIFIERS[args.backend] @@ -91,7 +92,7 @@ async def run_profile(args): client = DynamoDeploymentClient( namespace=args.namespace, base_log_dir=work_dir, model_name=model_name - ) + ) await client.create_deployment(prefill_config_fn) logger.info("Waiting for deployment to be ready...") await client.wait_for_deployment_ready() @@ -104,10 +105,10 @@ async def run_profile(args): ) # run genai-perf - with client.port_forward() as port: + with client.get_service_url_with_port_forward() as base_url: genai_perf_artifact_dir = f"{work_dir}/gap_isl{args.isl}" gap_result = benchmark_prefill( - args.isl, genai_perf_artifact_dir, model_name, port + args.isl, genai_perf_artifact_dir, model_name, base_url=base_url ) if gap_result is not None: ttft = gap_result["time_to_first_token"]["avg"] @@ -152,7 +153,7 @@ async def run_profile(args): client = DynamoDeploymentClient( namespace=args.namespace, base_log_dir=work_dir, model_name=model_name - ) + ) await client.create_deployment(decode_config_fn) logger.info("Waiting for deployment to be ready...") await client.wait_for_deployment_ready() @@ -177,8 +178,7 @@ async def run_profile(args): engine_decode_itl = [] engine_decode_thpt_per_gpu = [] - with client.port_forward() as port: - + with 
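
The new RBAC manifests carry `${NAMESPACE}` placeholders, so they must be rendered before `kubectl apply` (envsubst is the usual shell tool for this). A Python equivalent with `string.Template`, whose `${VAR}` syntax matches the manifests — a sketch, not a documented part of the repo:

```python
import os
from string import Template

def render_manifest(path: str, namespace: str) -> str:
    # Substitute ${NAMESPACE} (and any other ${VAR} placeholders);
    # Template.substitute raises KeyError if a placeholder is missing.
    with open(path) as f:
        return Template(f.read()).substitute(NAMESPACE=namespace)

rendered = render_manifest(
    "examples/vllm/deploy/profile_sla_sa.yaml",
    os.environ.get("NAMESPACE", "default"),
)
print(rendered)
```
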
client.get_service_url_with_port_forward() as base_url: for num_request in sweep_num_request: genai_perf_artifact_dir = f"{work_dir}/gap_request{num_request}_isl{args.isl}_osl{args.osl}_n{num_request}" gap_result = benchmark_decode( @@ -187,7 +187,7 @@ async def run_profile(args): num_request, genai_perf_artifact_dir, model_name, - port, + base_url=base_url, ) if gap_result is not None: itl = gap_result["inter_token_latency"]["avg"] @@ -290,7 +290,7 @@ async def run_profile(args): client = DynamoDeploymentClient( namespace=args.namespace, base_log_dir=work_dir, model_name=model_name - ) + ) await client.create_deployment(prefill_config_fn) logger.info("Waiting for deployment to be ready...") await client.wait_for_deployment_ready() @@ -302,7 +302,7 @@ async def run_profile(args): f"Logs have been saved to {client.base_log_dir / client.deployment_name}" ) - with client.port_forward() as port: + with client.get_service_url_with_port_forward() as base_url: for isl in range( 100, args.max_context_length, @@ -311,7 +311,7 @@ async def run_profile(args): # run genai-perf genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}" gap_result = benchmark_prefill( - isl, genai_perf_artifact_dir, model_name, port + isl, genai_perf_artifact_dir, model_name, base_url=base_url ) if gap_result is not None: ttft = gap_result["time_to_first_token"]["avg"] @@ -366,9 +366,7 @@ async def run_profile(args): with open(decode_config_fn, "w") as f: yaml.dump(decode_config, f) - client = DynamoDeploymentClient( - namespace=args.namespace, base_log_dir=work_dir - ) + client = DynamoDeploymentClient(namespace=args.namespace, base_log_dir=work_dir) await client.create_deployment(decode_config_fn) logger.info("Waiting for deployment to be ready...") await client.wait_for_deployment_ready() @@ -385,7 +383,7 @@ async def run_profile(args): ) osl = 500 # not too large to reduce ITL variance, not too small to have stable measurement - with client.port_forward() as port: + with client.get_service_url_with_port_forward() as base_url: for isl in range( 100, args.max_context_length - osl, @@ -404,7 +402,12 @@ async def run_profile(args): f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}" ) gap_result = benchmark_decode( - isl, osl, num_request, genai_perf_artifact_dir, model_name, port + isl, + osl, + num_request, + genai_perf_artifact_dir, + model_name, + base_url=base_url, ) if gap_result is not None: itl = gap_result["inter_token_latency"]["avg"] @@ -517,4 +520,4 @@ async def run_profile(args): ) args = parser.parse_args() - asyncio.run(run_profile(args)) \ No newline at end of file + asyncio.run(run_profile(args)) diff --git a/benchmarks/profiler/utils/dynamo_deployment.py b/benchmarks/profiler/utils/dynamo_deployment.py index 488b51f31bc..eb9c5d7fe26 100755 --- a/benchmarks/profiler/utils/dynamo_deployment.py +++ b/benchmarks/profiler/utils/dynamo_deployment.py @@ -13,9 +13,10 @@ import argparse import asyncio +import os import random -import time import socket +import time from contextlib import contextmanager from pathlib import Path from typing import Optional, Union @@ -77,6 +78,26 @@ def _init_kubernetes(self): self.custom_api = client.CustomObjectsApi(self.k8s_client) self.core_api = client.CoreV1Api(self.k8s_client) + def _is_running_in_kubernetes(self) -> bool: + """ + Detect if we're running inside a Kubernetes cluster by checking for the service account token. + """ + return os.path.exists("/var/run/secrets/kubernetes.io/serviceaccount/token") + + def _get_service_url(self) -> str: + """ + Get the service URL. 
Returns service DNS if running in Kubernetes, otherwise localhost with port forwarding. + """ + if self._is_running_in_kubernetes(): + # Use Kubernetes service DNS: service-name.namespace.svc.cluster.local:port + svc_name = f"{self.deployment_name}-frontend" + return f"http://{svc_name}.{self.namespace}.svc.cluster.local:8000" + else: + # For local development, we need to use port forwarding + raise RuntimeError( + "Port forwarding is required when not running in Kubernetes. Use get_service_url_with_port_forward() instead." + ) + async def create_deployment(self, deployment: Union[dict, str]): """ Create a DynamoGraphDeployment from either a dict or yaml file path. @@ -128,20 +149,22 @@ async def wait_for_deployment_ready(self, timeout: int = 300): # TODO: A little brittle, also should output intermediate status every so often. while (time.time() - start_time) < timeout: try: - status = await self.custom_api.get_namespaced_custom_object_status( + status = await self.custom_api.get_namespaced_custom_object( group="nvidia.com", version="v1alpha1", namespace=self.namespace, plural="dynamographdeployments", name=self.deployment_name, ) - # print(f"Current status: {status.get('status', {})}") - # Check both conditions: # 1. Ready condition is True # 2. State is successful status_obj = status.get("status", {}) conditions = status_obj.get("conditions", []) + current_state = status_obj.get("state", "unknown") + + print(f"Current deployment state: {current_state}") + print(f"Current conditions: {conditions}") ready_condition = False for condition in conditions: @@ -159,12 +182,32 @@ async def wait_for_deployment_ready(self, timeout: int = 300): "Deployment is ready: Ready condition is True and state is successful" ) return True + else: + print( + f"Deployment not ready yet - Ready condition: {ready_condition}, State successful: {state_successful}" + ) except kubernetes.client.rest.ApiException: pass await asyncio.sleep(20) raise TimeoutError("Deployment failed to become ready within timeout") + @contextmanager + def get_service_url_with_port_forward(self, port: Optional[int] = None): + """ + Get the service URL with automatic detection of environment. + + When running in Kubernetes: yields the service DNS URL directly (no port forwarding needed) + When running locally: sets up port forwarding and yields localhost URL + """ + if self._is_running_in_kubernetes(): + # No port forwarding needed, use service DNS directly + yield self._get_service_url() + else: + # Use port forwarding for local development - delegate to existing method + with self.port_forward(port) as forwarded_port: + yield f"http://localhost:{forwarded_port}" + @contextmanager def port_forward(self, port: Optional[int] = None): """ @@ -176,7 +219,7 @@ def port_forward(self, port: Optional[int] = None): candidate_port = random.randint(49152, 65535) with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: try: - s.bind(('localhost', candidate_port)) + s.bind(("localhost", candidate_port)) port = candidate_port break except OSError: @@ -198,8 +241,8 @@ async def check_chat_completion(self): Test the deployment with a chat completion request using httpx. 
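
`_is_running_in_kubernetes` keys off the mounted service-account token. A slightly broader heuristic, sketched below, also consults the `KUBERNETES_SERVICE_HOST` variable that the kubelet injects into every pod; both signals can be absent in unusual setups, so this stays a best-effort check:

```python
import os

SERVICE_ACCOUNT_TOKEN = "/var/run/secrets/kubernetes.io/serviceaccount/token"

def running_in_kubernetes() -> bool:
    # True if either the injected API-server env var or the mounted
    # service-account token is present.
    return (
        "KUBERNETES_SERVICE_HOST" in os.environ
        or os.path.exists(SERVICE_ACCOUNT_TOKEN)
    )
```
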
""" EXAMPLE_CHAT_REQUEST["model"] = self.model_name - with self.port_forward() as port: - url = f"http://localhost:{port}/v1/chat/completions" + with self.get_service_url_with_port_forward() as base_url: + url = f"{base_url}/v1/chat/completions" async with httpx.AsyncClient() as client: response = await client.post(url, json=EXAMPLE_CHAT_REQUEST) response.raise_for_status() diff --git a/benchmarks/profiler/utils/genai_perf.py b/benchmarks/profiler/utils/genai_perf.py index 8416059afb5..19aab8a9850 100644 --- a/benchmarks/profiler/utils/genai_perf.py +++ b/benchmarks/profiler/utils/genai_perf.py @@ -34,7 +34,7 @@ def _get_common_genai_perf_cmd( artifact_dir, seed=100, model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B", - port=8000, + base_url="http://localhost:8000", ): return [ "genai-perf", @@ -49,7 +49,7 @@ def _get_common_genai_perf_cmd( "/v1/chat/completions", "--streaming", "--url", - f"http://localhost:{port}", + base_url, "--extra-inputs", "ignore_eos:true", "--extra-inputs", @@ -69,13 +69,13 @@ def get_prefill_genai_perf_cmd( seed=100, model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B", osl=5, - port=8000, + base_url="http://localhost:8000", ): return _get_common_genai_perf_cmd( artifact_dir, seed, model, - port, + base_url, ) + [ "--synthetic-input-tokens-mean", str(isl), @@ -103,13 +103,13 @@ def get_decode_genai_perf_cmd( num_request, seed=100, model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B", - port=8000, + base_url="http://localhost:8000", ): return _get_common_genai_perf_cmd( artifact_dir, seed, model, - port, + base_url, ) + [ "--synthetic-input-tokens-mean", str(isl), @@ -146,10 +146,12 @@ def get_gap_result(artifact_dir: str) -> dict: return json.load(f) -def benchmark_prefill(isl, genai_perf_artifact_dir, model_name, port): +def benchmark_prefill( + isl, genai_perf_artifact_dir, model_name, base_url="http://localhost:8000" +): logger.info(f"Running genai-perf with isl {isl}") genai_perf_cmd = get_prefill_genai_perf_cmd( - isl, genai_perf_artifact_dir, model=model_name, port=port + isl, genai_perf_artifact_dir, model=model_name, base_url=base_url ) print(f"genai-perf cmd: {genai_perf_cmd}") # import pdb; pdb.set_trace() @@ -171,12 +173,20 @@ def benchmark_prefill(isl, genai_perf_artifact_dir, model_name, port): return None -def benchmark_decode(isl, osl, num_request, genai_perf_artifact_dir, model_name, port): +def benchmark_decode( + isl, + osl, + num_request, + genai_perf_artifact_dir, + model_name, + base_url="http://localhost:8000", +): logger.info(f"Profiling decode with num_request {num_request}...") # first warm-up the engine by pre-computing all prefill tokens # we use the same random seed to make sure the prompt is the same seed = random.randint(0, 1000000) + genai_perf_cmd = get_decode_genai_perf_cmd( isl, osl, @@ -184,7 +194,7 @@ def benchmark_decode(isl, osl, num_request, genai_perf_artifact_dir, model_name, num_request, seed=seed, model=model_name, - port=port, + base_url=base_url, ) gap_process = subprocess.Popen( genai_perf_cmd, @@ -201,7 +211,7 @@ def benchmark_decode(isl, osl, num_request, genai_perf_artifact_dir, model_name, num_request, seed=seed, model=model_name, - port=port, + base_url=base_url, ) gap_process = subprocess.Popen( genai_perf_cmd, From a1aea5ac6d778e90eb39c3b37713d3fc590dfa39 Mon Sep 17 00:00:00 2001 From: Hannah Zhang Date: Thu, 17 Jul 2025 15:45:21 -0700 Subject: [PATCH 22/58] Revert "feat: use service DNS for interfacing with deployments when profiling in k8s" This reverts commit d62731f78aab6e1a9a68249053a1a48122bd3100. 
--- benchmarks/profiler/profile_sla.py | 35 ++++++------ .../profiler/utils/dynamo_deployment.py | 57 +++---------------- benchmarks/profiler/utils/genai_perf.py | 32 ++++------- 3 files changed, 34 insertions(+), 90 deletions(-) diff --git a/benchmarks/profiler/profile_sla.py b/benchmarks/profiler/profile_sla.py index 955c317ffbd..78a5871273f 100644 --- a/benchmarks/profiler/profile_sla.py +++ b/benchmarks/profiler/profile_sla.py @@ -42,7 +42,6 @@ console_handler.setFormatter(formatter) logger.addHandler(console_handler) - async def run_profile(args): config_modifier = CONFIG_MODIFIERS[args.backend] @@ -92,7 +91,7 @@ async def run_profile(args): client = DynamoDeploymentClient( namespace=args.namespace, base_log_dir=work_dir, model_name=model_name - ) + ) await client.create_deployment(prefill_config_fn) logger.info("Waiting for deployment to be ready...") await client.wait_for_deployment_ready() @@ -105,10 +104,10 @@ async def run_profile(args): ) # run genai-perf - with client.get_service_url_with_port_forward() as base_url: + with client.port_forward() as port: genai_perf_artifact_dir = f"{work_dir}/gap_isl{args.isl}" gap_result = benchmark_prefill( - args.isl, genai_perf_artifact_dir, model_name, base_url=base_url + args.isl, genai_perf_artifact_dir, model_name, port ) if gap_result is not None: ttft = gap_result["time_to_first_token"]["avg"] @@ -153,7 +152,7 @@ async def run_profile(args): client = DynamoDeploymentClient( namespace=args.namespace, base_log_dir=work_dir, model_name=model_name - ) + ) await client.create_deployment(decode_config_fn) logger.info("Waiting for deployment to be ready...") await client.wait_for_deployment_ready() @@ -178,7 +177,8 @@ async def run_profile(args): engine_decode_itl = [] engine_decode_thpt_per_gpu = [] - with client.get_service_url_with_port_forward() as base_url: + with client.port_forward() as port: + for num_request in sweep_num_request: genai_perf_artifact_dir = f"{work_dir}/gap_request{num_request}_isl{args.isl}_osl{args.osl}_n{num_request}" gap_result = benchmark_decode( @@ -187,7 +187,7 @@ async def run_profile(args): num_request, genai_perf_artifact_dir, model_name, - base_url=base_url, + port, ) if gap_result is not None: itl = gap_result["inter_token_latency"]["avg"] @@ -290,7 +290,7 @@ async def run_profile(args): client = DynamoDeploymentClient( namespace=args.namespace, base_log_dir=work_dir, model_name=model_name - ) + ) await client.create_deployment(prefill_config_fn) logger.info("Waiting for deployment to be ready...") await client.wait_for_deployment_ready() @@ -302,7 +302,7 @@ async def run_profile(args): f"Logs have been saved to {client.base_log_dir / client.deployment_name}" ) - with client.get_service_url_with_port_forward() as base_url: + with client.port_forward() as port: for isl in range( 100, args.max_context_length, @@ -311,7 +311,7 @@ async def run_profile(args): # run genai-perf genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}" gap_result = benchmark_prefill( - isl, genai_perf_artifact_dir, model_name, base_url=base_url + isl, genai_perf_artifact_dir, model_name, port ) if gap_result is not None: ttft = gap_result["time_to_first_token"]["avg"] @@ -366,7 +366,9 @@ async def run_profile(args): with open(decode_config_fn, "w") as f: yaml.dump(decode_config, f) - client = DynamoDeploymentClient(namespace=args.namespace, base_log_dir=work_dir) + client = DynamoDeploymentClient( + namespace=args.namespace, base_log_dir=work_dir + ) await client.create_deployment(decode_config_fn) logger.info("Waiting for deployment 
to be ready...") await client.wait_for_deployment_ready() @@ -383,7 +385,7 @@ async def run_profile(args): ) osl = 500 # not too large to reduce ITL variance, not too small to have stable measurement - with client.get_service_url_with_port_forward() as base_url: + with client.port_forward() as port: for isl in range( 100, args.max_context_length - osl, @@ -402,12 +404,7 @@ async def run_profile(args): f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}" ) gap_result = benchmark_decode( - isl, - osl, - num_request, - genai_perf_artifact_dir, - model_name, - base_url=base_url, + isl, osl, num_request, genai_perf_artifact_dir, model_name, port ) if gap_result is not None: itl = gap_result["inter_token_latency"]["avg"] @@ -520,4 +517,4 @@ async def run_profile(args): ) args = parser.parse_args() - asyncio.run(run_profile(args)) + asyncio.run(run_profile(args)) \ No newline at end of file diff --git a/benchmarks/profiler/utils/dynamo_deployment.py b/benchmarks/profiler/utils/dynamo_deployment.py index eb9c5d7fe26..488b51f31bc 100755 --- a/benchmarks/profiler/utils/dynamo_deployment.py +++ b/benchmarks/profiler/utils/dynamo_deployment.py @@ -13,10 +13,9 @@ import argparse import asyncio -import os import random +import time import socket -import time from contextlib import contextmanager from pathlib import Path from typing import Optional, Union @@ -78,26 +77,6 @@ def _init_kubernetes(self): self.custom_api = client.CustomObjectsApi(self.k8s_client) self.core_api = client.CoreV1Api(self.k8s_client) - def _is_running_in_kubernetes(self) -> bool: - """ - Detect if we're running inside a Kubernetes cluster by checking for the service account token. - """ - return os.path.exists("/var/run/secrets/kubernetes.io/serviceaccount/token") - - def _get_service_url(self) -> str: - """ - Get the service URL. Returns service DNS if running in Kubernetes, otherwise localhost with port forwarding. - """ - if self._is_running_in_kubernetes(): - # Use Kubernetes service DNS: service-name.namespace.svc.cluster.local:port - svc_name = f"{self.deployment_name}-frontend" - return f"http://{svc_name}.{self.namespace}.svc.cluster.local:8000" - else: - # For local development, we need to use port forwarding - raise RuntimeError( - "Port forwarding is required when not running in Kubernetes. Use get_service_url_with_port_forward() instead." - ) - async def create_deployment(self, deployment: Union[dict, str]): """ Create a DynamoGraphDeployment from either a dict or yaml file path. @@ -149,22 +128,20 @@ async def wait_for_deployment_ready(self, timeout: int = 300): # TODO: A little brittle, also should output intermediate status every so often. while (time.time() - start_time) < timeout: try: - status = await self.custom_api.get_namespaced_custom_object( + status = await self.custom_api.get_namespaced_custom_object_status( group="nvidia.com", version="v1alpha1", namespace=self.namespace, plural="dynamographdeployments", name=self.deployment_name, ) + # print(f"Current status: {status.get('status', {})}") + # Check both conditions: # 1. Ready condition is True # 2. 
State is successful status_obj = status.get("status", {}) conditions = status_obj.get("conditions", []) - current_state = status_obj.get("state", "unknown") - - print(f"Current deployment state: {current_state}") - print(f"Current conditions: {conditions}") ready_condition = False for condition in conditions: @@ -182,32 +159,12 @@ async def wait_for_deployment_ready(self, timeout: int = 300): "Deployment is ready: Ready condition is True and state is successful" ) return True - else: - print( - f"Deployment not ready yet - Ready condition: {ready_condition}, State successful: {state_successful}" - ) except kubernetes.client.rest.ApiException: pass await asyncio.sleep(20) raise TimeoutError("Deployment failed to become ready within timeout") - @contextmanager - def get_service_url_with_port_forward(self, port: Optional[int] = None): - """ - Get the service URL with automatic detection of environment. - - When running in Kubernetes: yields the service DNS URL directly (no port forwarding needed) - When running locally: sets up port forwarding and yields localhost URL - """ - if self._is_running_in_kubernetes(): - # No port forwarding needed, use service DNS directly - yield self._get_service_url() - else: - # Use port forwarding for local development - delegate to existing method - with self.port_forward(port) as forwarded_port: - yield f"http://localhost:{forwarded_port}" - @contextmanager def port_forward(self, port: Optional[int] = None): """ @@ -219,7 +176,7 @@ def port_forward(self, port: Optional[int] = None): candidate_port = random.randint(49152, 65535) with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: try: - s.bind(("localhost", candidate_port)) + s.bind(('localhost', candidate_port)) port = candidate_port break except OSError: @@ -241,8 +198,8 @@ async def check_chat_completion(self): Test the deployment with a chat completion request using httpx. 
""" EXAMPLE_CHAT_REQUEST["model"] = self.model_name - with self.get_service_url_with_port_forward() as base_url: - url = f"{base_url}/v1/chat/completions" + with self.port_forward() as port: + url = f"http://localhost:{port}/v1/chat/completions" async with httpx.AsyncClient() as client: response = await client.post(url, json=EXAMPLE_CHAT_REQUEST) response.raise_for_status() diff --git a/benchmarks/profiler/utils/genai_perf.py b/benchmarks/profiler/utils/genai_perf.py index 19aab8a9850..8416059afb5 100644 --- a/benchmarks/profiler/utils/genai_perf.py +++ b/benchmarks/profiler/utils/genai_perf.py @@ -34,7 +34,7 @@ def _get_common_genai_perf_cmd( artifact_dir, seed=100, model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B", - base_url="http://localhost:8000", + port=8000, ): return [ "genai-perf", @@ -49,7 +49,7 @@ def _get_common_genai_perf_cmd( "/v1/chat/completions", "--streaming", "--url", - base_url, + f"http://localhost:{port}", "--extra-inputs", "ignore_eos:true", "--extra-inputs", @@ -69,13 +69,13 @@ def get_prefill_genai_perf_cmd( seed=100, model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B", osl=5, - base_url="http://localhost:8000", + port=8000, ): return _get_common_genai_perf_cmd( artifact_dir, seed, model, - base_url, + port, ) + [ "--synthetic-input-tokens-mean", str(isl), @@ -103,13 +103,13 @@ def get_decode_genai_perf_cmd( num_request, seed=100, model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B", - base_url="http://localhost:8000", + port=8000, ): return _get_common_genai_perf_cmd( artifact_dir, seed, model, - base_url, + port, ) + [ "--synthetic-input-tokens-mean", str(isl), @@ -146,12 +146,10 @@ def get_gap_result(artifact_dir: str) -> dict: return json.load(f) -def benchmark_prefill( - isl, genai_perf_artifact_dir, model_name, base_url="http://localhost:8000" -): +def benchmark_prefill(isl, genai_perf_artifact_dir, model_name, port): logger.info(f"Running genai-perf with isl {isl}") genai_perf_cmd = get_prefill_genai_perf_cmd( - isl, genai_perf_artifact_dir, model=model_name, base_url=base_url + isl, genai_perf_artifact_dir, model=model_name, port=port ) print(f"genai-perf cmd: {genai_perf_cmd}") # import pdb; pdb.set_trace() @@ -173,20 +171,12 @@ def benchmark_prefill( return None -def benchmark_decode( - isl, - osl, - num_request, - genai_perf_artifact_dir, - model_name, - base_url="http://localhost:8000", -): +def benchmark_decode(isl, osl, num_request, genai_perf_artifact_dir, model_name, port): logger.info(f"Profiling decode with num_request {num_request}...") # first warm-up the engine by pre-computing all prefill tokens # we use the same random seed to make sure the prompt is the same seed = random.randint(0, 1000000) - genai_perf_cmd = get_decode_genai_perf_cmd( isl, osl, @@ -194,7 +184,7 @@ def benchmark_decode( num_request, seed=seed, model=model_name, - base_url=base_url, + port=port, ) gap_process = subprocess.Popen( genai_perf_cmd, @@ -211,7 +201,7 @@ def benchmark_decode( num_request, seed=seed, model=model_name, - base_url=base_url, + port=port, ) gap_process = subprocess.Popen( genai_perf_cmd, From 06bfe3b343ac045ea9c688c4554bdec39c22a93a Mon Sep 17 00:00:00 2001 From: hhzhang16 <54051230+hhzhang16@users.noreply.github.com> Date: Fri, 18 Jul 2025 11:21:32 -0700 Subject: [PATCH 23/58] feat: use service DNS instead of port forwarding for K8s-deployed SLA profiler (#2004) --- benchmarks/profiler/profile_sla.py | 187 ++++++++++-------- benchmarks/profiler/utils/config.py | 34 ++-- .../profiler/utils/dynamo_deployment.py | 106 +++++----- 
benchmarks/profiler/utils/genai_perf.py | 32 +-- container/deps/requirements.txt | 6 +- examples/vllm/deploy/profile_sla_binding.yaml | 14 ++ examples/vllm/deploy/profile_sla_rbac.yaml | 34 ++-- examples/vllm/deploy/profile_sla_sa.yaml | 14 ++ k8s.sh | 15 ++ 9 files changed, 261 insertions(+), 181 deletions(-) mode change 100755 => 100644 benchmarks/profiler/utils/dynamo_deployment.py diff --git a/benchmarks/profiler/profile_sla.py b/benchmarks/profiler/profile_sla.py index 78a5871273f..1a5a287fd4b 100644 --- a/benchmarks/profiler/profile_sla.py +++ b/benchmarks/profiler/profile_sla.py @@ -42,6 +42,7 @@ console_handler.setFormatter(formatter) logger.addHandler(console_handler) + async def run_profile(args): config_modifier = CONFIG_MODIFIERS[args.backend] @@ -90,8 +91,11 @@ async def run_profile(args): yaml.dump(prefill_config, f) client = DynamoDeploymentClient( - namespace=args.namespace, base_log_dir=work_dir, model_name=model_name - ) + namespace=args.namespace, + base_log_dir=work_dir, + model_name=model_name, + service_name=args.service_name, + ) await client.create_deployment(prefill_config_fn) logger.info("Waiting for deployment to be ready...") await client.wait_for_deployment_ready() @@ -104,16 +108,16 @@ async def run_profile(args): ) # run genai-perf - with client.port_forward() as port: - genai_perf_artifact_dir = f"{work_dir}/gap_isl{args.isl}" - gap_result = benchmark_prefill( - args.isl, genai_perf_artifact_dir, model_name, port - ) - if gap_result is not None: - ttft = gap_result["time_to_first_token"]["avg"] - prefill_tp_size.append(tp_size) - prefill_ttft.append(ttft) - prefill_thpt_per_gpu.append(args.isl / ttft / tp_size * 1000) + base_url = client.get_service_url() + genai_perf_artifact_dir = f"{work_dir}/gap_isl{args.isl}" + gap_result = benchmark_prefill( + args.isl, genai_perf_artifact_dir, model_name, base_url=base_url + ) + if gap_result is not None: + ttft = gap_result["time_to_first_token"]["avg"] + prefill_tp_size.append(tp_size) + prefill_ttft.append(ttft) + prefill_thpt_per_gpu.append(args.isl / ttft / tp_size * 1000) print("Cleaning up deployment...") await client.delete_deployment() @@ -151,8 +155,11 @@ async def run_profile(args): yaml.dump(decode_config, f) client = DynamoDeploymentClient( - namespace=args.namespace, base_log_dir=work_dir, model_name=model_name - ) + namespace=args.namespace, + base_log_dir=work_dir, + model_name=model_name, + service_name=args.service_name, + ) await client.create_deployment(decode_config_fn) logger.info("Waiting for deployment to be ready...") await client.wait_for_deployment_ready() @@ -177,30 +184,27 @@ async def run_profile(args): engine_decode_itl = [] engine_decode_thpt_per_gpu = [] - with client.port_forward() as port: - - for num_request in sweep_num_request: - genai_perf_artifact_dir = f"{work_dir}/gap_request{num_request}_isl{args.isl}_osl{args.osl}_n{num_request}" - gap_result = benchmark_decode( - args.isl, - args.osl, - num_request, - genai_perf_artifact_dir, - model_name, - port, - ) - if gap_result is not None: - itl = gap_result["inter_token_latency"]["avg"] - thpt_per_gpu = ( - gap_result["output_token_throughput"]["avg"] / tp_size - ) - engine_decode_itl.append(itl) - engine_decode_thpt_per_gpu.append(thpt_per_gpu) - decode_tp_size.append(tp_size) - decode_itl.append(itl) - decode_thpt_per_gpu.append(thpt_per_gpu) - decode_concurrency.append(num_request) - decode_kv_cache_size.append(max_kv_tokens) + base_url = client.get_service_url() + for num_request in sweep_num_request: + genai_perf_artifact_dir = 
f"{work_dir}/gap_request{num_request}_isl{args.isl}_osl{args.osl}_n{num_request}" + gap_result = benchmark_decode( + args.isl, + args.osl, + num_request, + genai_perf_artifact_dir, + model_name, + base_url=base_url, + ) + if gap_result is not None: + itl = gap_result["inter_token_latency"]["avg"] + thpt_per_gpu = gap_result["output_token_throughput"]["avg"] / tp_size + engine_decode_itl.append(itl) + engine_decode_thpt_per_gpu.append(thpt_per_gpu) + decode_tp_size.append(tp_size) + decode_itl.append(itl) + decode_thpt_per_gpu.append(thpt_per_gpu) + decode_concurrency.append(num_request) + decode_kv_cache_size.append(max_kv_tokens) print("Cleaning up deployment...") await client.delete_deployment() @@ -289,8 +293,11 @@ async def run_profile(args): yaml.dump(prefill_config, f) client = DynamoDeploymentClient( - namespace=args.namespace, base_log_dir=work_dir, model_name=model_name - ) + namespace=args.namespace, + base_log_dir=work_dir, + model_name=model_name, + service_name=args.service_name, + ) await client.create_deployment(prefill_config_fn) logger.info("Waiting for deployment to be ready...") await client.wait_for_deployment_ready() @@ -302,22 +309,22 @@ async def run_profile(args): f"Logs have been saved to {client.base_log_dir / client.deployment_name}" ) - with client.port_forward() as port: - for isl in range( - 100, - args.max_context_length, - (args.max_context_length - 100) // args.prefill_interpolation_granularity, - ): - # run genai-perf - genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}" - gap_result = benchmark_prefill( - isl, genai_perf_artifact_dir, model_name, port - ) - if gap_result is not None: - ttft = gap_result["time_to_first_token"]["avg"] - prefill_isl.append(isl) - prefill_ttft.append(ttft) - prefill_thpt_per_gpu.append(isl / ttft / best_prefill_tp * 1000) + base_url = client.get_service_url() + for isl in range( + 100, + args.max_context_length, + (args.max_context_length - 100) // args.prefill_interpolation_granularity, + ): + # run genai-perf + genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}" + gap_result = benchmark_prefill( + isl, genai_perf_artifact_dir, model_name, base_url=base_url + ) + if gap_result is not None: + ttft = gap_result["time_to_first_token"]["avg"] + prefill_isl.append(isl) + prefill_ttft.append(ttft) + prefill_thpt_per_gpu.append(isl / ttft / best_prefill_tp * 1000) print("Cleaning up deployment...") await client.delete_deployment() @@ -367,8 +374,8 @@ async def run_profile(args): yaml.dump(decode_config, f) client = DynamoDeploymentClient( - namespace=args.namespace, base_log_dir=work_dir - ) + namespace=args.namespace, base_log_dir=work_dir, service_name=args.service_name + ) await client.create_deployment(decode_config_fn) logger.info("Waiting for deployment to be ready...") await client.wait_for_deployment_ready() @@ -385,35 +392,38 @@ async def run_profile(args): ) osl = 500 # not too large to reduce ITL variance, not too small to have stable measurement - with client.port_forward() as port: - for isl in range( - 100, - args.max_context_length - osl, - (args.max_context_length - osl) // args.decode_interpolation_granularity, - ): - max_concurrency = max_kv_tokens // (isl + osl) - sweep_num_request = list( - range( - 1, - max_concurrency, - max_concurrency // args.decode_interpolation_granularity, - ) + base_url = client.get_service_url() + for isl in range( + 100, + args.max_context_length - osl, + (args.max_context_length - osl) // args.decode_interpolation_granularity, + ): + max_concurrency = max_kv_tokens // (isl + osl) + 
sweep_num_request = list( + range( + 1, + max_concurrency, + max_concurrency // args.decode_interpolation_granularity, ) - for num_request in sweep_num_request: - genai_perf_artifact_dir = ( - f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}" - ) - gap_result = benchmark_decode( - isl, osl, num_request, genai_perf_artifact_dir, model_name, port + ) + for num_request in sweep_num_request: + genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}" + gap_result = benchmark_decode( + isl, + osl, + num_request, + genai_perf_artifact_dir, + model_name, + base_url=base_url, + ) + if gap_result is not None: + itl = gap_result["inter_token_latency"]["avg"] + x_kv_usage.append((isl + osl / 2) * num_request / max_kv_tokens) + y_context_length.append(isl + osl / 2) + z_itl.append(itl) + z_thpt_per_gpu.append( + gap_result["output_token_throughput"]["avg"] / tp_size ) - if gap_result is not None: - itl = gap_result["inter_token_latency"]["avg"] - x_kv_usage.append((isl + osl / 2) * num_request / max_kv_tokens) - y_context_length.append(isl + osl / 2) - z_itl.append(itl) - z_thpt_per_gpu.append( - gap_result["output_token_throughput"]["avg"] / tp_size - ) print("Cleaning up deployment...") await client.delete_deployment() @@ -515,6 +525,11 @@ async def run_profile(args): default=6, help="how many samples to benchmark to interpolate ITL under different active kv cache size and decode context length", ) + parser.add_argument( + "--service-name", + type=str, + help="Service name for port forwarding (default: {deployment_name}-frontend)", + ) args = parser.parse_args() - asyncio.run(run_profile(args)) \ No newline at end of file + asyncio.run(run_profile(args)) diff --git a/benchmarks/profiler/utils/config.py b/benchmarks/profiler/utils/config.py index ee5e0de9634..962013af133 100644 --- a/benchmarks/profiler/utils/config.py +++ b/benchmarks/profiler/utils/config.py @@ -31,6 +31,7 @@ console_handler.setFormatter(formatter) logger.addHandler(console_handler) + def break_arguments(args: list[str]) -> list[str]: ans = [] if isinstance(args, str): @@ -40,9 +41,11 @@ def break_arguments(args: list[str]) -> list[str]: ans.extend(arg.split(" ")) return ans + def join_arguments(args: list[str]) -> str: return [" ".join(args)] + def append_argument(args: list[str], to_append) -> list[str]: idx = find_arg_index(args) if isinstance(to_append, list): @@ -51,6 +54,7 @@ def append_argument(args: list[str], to_append) -> list[str]: args.insert(idx, to_append) return args + def find_arg_index(args: list[str]) -> int: # find the correct index to insert an argument idx = len(args) @@ -107,10 +111,10 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d args.remove("--enable-prefix-caching") if "--no-enable-prefix-caching" not in args: args = append_argument(args, "--no-enable-prefix-caching") - - config["spec"]["services"][ - WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker - ]["extraPodSpec"]["mainContainer"]["args"] = join_arguments(args) + + config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][ + "extraPodSpec" + ]["mainContainer"]["args"] = join_arguments(args) elif target == "decode": # delete prefill worker @@ -130,9 +134,9 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d if "--no-enable-prefix-caching" in args: args.remove("--no-enable-prefix-caching") - config["spec"]["services"][ - WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker - ]["extraPodSpec"]["mainContainer"]["args"] = join_arguments(args) + 
config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][ + "extraPodSpec" + ]["mainContainer"]["args"] = join_arguments(args) # set num workers to 1 decode_worker_config = config["spec"]["services"][ @@ -146,8 +150,12 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d def set_config_tp_size(cls, config: dict, tp_size: int): config = deepcopy(config) - config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker]["resources"]["requests"]["gpu"] = str(tp_size) - config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker]["resources"]["limits"]["gpu"] = str(tp_size) + config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][ + "resources" + ]["requests"]["gpu"] = str(tp_size) + config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][ + "resources" + ]["limits"]["gpu"] = str(tp_size) args = config["spec"]["services"][ WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker @@ -160,10 +168,10 @@ def set_config_tp_size(cls, config: dict, tp_size: int): args[idx + 1] = str(tp_size) except ValueError: args = append_argument(args, ["--tensor-parallel-size", str(tp_size)]) - - config["spec"]["services"][ - WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker - ]["extraPodSpec"]["mainContainer"]["args"] = join_arguments(args) + + config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][ + "extraPodSpec" + ]["mainContainer"]["args"] = join_arguments(args) return config diff --git a/benchmarks/profiler/utils/dynamo_deployment.py b/benchmarks/profiler/utils/dynamo_deployment.py old mode 100755 new mode 100644 index 488b51f31bc..0dfd5602308 --- a/benchmarks/profiler/utils/dynamo_deployment.py +++ b/benchmarks/profiler/utils/dynamo_deployment.py @@ -1,22 +1,21 @@ -#!/usr/bin/env -S uv run --script +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 # -# /// script -# requires-python = ">=3.10" -# dependencies = [ -# "PyYAML", -# "aiofiles", -# "kubernetes-asyncio", -# "kr8s", # added -# "httpx", # added -# ] -# /// +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import argparse import asyncio -import random -import time -import socket -from contextlib import contextmanager +import time from pathlib import Path from typing import Optional, Union @@ -24,7 +23,6 @@ import httpx # added for HTTP requests import kubernetes_asyncio as kubernetes import yaml -from kr8s.objects import Service from kubernetes_asyncio import client, config # Example chat completion request for testing deployments @@ -48,6 +46,7 @@ def __init__( model_name: str = "Qwen/Qwen3-0.6B", deployment_name: str = "vllm-v1-agg", base_log_dir: Optional[str] = None, + service_name: Optional[str] = None, ): """ Initialize the client with the namespace and deployment name. 
@@ -56,10 +55,12 @@ def __init__( namespace: The Kubernetes namespace deployment_name: Name of the deployment, defaults to vllm-v1-agg base_log_dir: Base directory for storing logs, defaults to ./logs if not specified + service_name: Service name for connecting to the service, defaults to {deployment_name}-frontend """ self.namespace = namespace self.deployment_name = deployment_name self.model_name = model_name + self.service_name = service_name or f"{deployment_name}-frontend" self.components = [] # Will store component names from CR self.deployment_spec = None # Will store the full deployment spec self.base_log_dir = Path(base_log_dir) if base_log_dir else Path("logs") @@ -77,6 +78,16 @@ def _init_kubernetes(self): self.custom_api = client.CustomObjectsApi(self.k8s_client) self.core_api = client.CoreV1Api(self.k8s_client) + def get_service_url(self) -> str: + """ + Get the service URL using Kubernetes service DNS. + """ + service_url = ( + f"http://{self.service_name}.{self.namespace}.svc.cluster.local:8000" + ) + print(f"Using service URL: {service_url}") + return service_url + async def create_deployment(self, deployment: Union[dict, str]): """ Create a DynamoGraphDeployment from either a dict or yaml file path. @@ -128,20 +139,22 @@ async def wait_for_deployment_ready(self, timeout: int = 300): # TODO: A little brittle, also should output intermediate status every so often. while (time.time() - start_time) < timeout: try: - status = await self.custom_api.get_namespaced_custom_object_status( + status = await self.custom_api.get_namespaced_custom_object( group="nvidia.com", version="v1alpha1", namespace=self.namespace, plural="dynamographdeployments", name=self.deployment_name, ) - # print(f"Current status: {status.get('status', {})}") - # Check both conditions: # 1. Ready condition is True # 2. State is successful status_obj = status.get("status", {}) conditions = status_obj.get("conditions", []) + current_state = status_obj.get("state", "unknown") + + print(f"Current deployment state: {current_state}") + print(f"Current conditions: {conditions}") ready_condition = False for condition in conditions: @@ -159,51 +172,27 @@ async def wait_for_deployment_ready(self, timeout: int = 300): "Deployment is ready: Ready condition is True and state is successful" ) return True + else: + print( + f"Deployment not ready yet - Ready condition: {ready_condition}, State successful: {state_successful}" + ) except kubernetes.client.rest.ApiException: pass await asyncio.sleep(20) raise TimeoutError("Deployment failed to become ready within timeout") - @contextmanager - def port_forward(self, port: Optional[int] = None): - """ - Forward the service's HTTP port to a local port. 
- """ - if port is None: - # Find a free port in the ephemeral port range - for _ in range(100): # Try up to 100 times - candidate_port = random.randint(49152, 65535) - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - try: - s.bind(('localhost', candidate_port)) - port = candidate_port - break - except OSError: - continue # Port is in use, try another - if port is None: - raise RuntimeError("Could not find a free port after 100 attempts") - svc_name = f"{self.deployment_name}-frontend" - # Get the Service and forward its HTTP port (8000) - service = Service.get(svc_name, namespace=self.namespace) - pf = service.portforward(remote_port=8000, local_port=port) - pf.start() - try: - yield port - finally: - pf.stop() - async def check_chat_completion(self): """ Test the deployment with a chat completion request using httpx. """ EXAMPLE_CHAT_REQUEST["model"] = self.model_name - with self.port_forward() as port: - url = f"http://localhost:{port}/v1/chat/completions" - async with httpx.AsyncClient() as client: - response = await client.post(url, json=EXAMPLE_CHAT_REQUEST) - response.raise_for_status() - return response.text + base_url = self.get_service_url() + url = f"{base_url}/v1/chat/completions" + async with httpx.AsyncClient() as client: + response = await client.post(url, json=EXAMPLE_CHAT_REQUEST) + response.raise_for_status() + return response.text async def get_deployment_logs(self): """ @@ -277,11 +266,20 @@ async def main(): default="/tmp/dynamo_logs", help="Base directory for logs (default: /tmp/dynamo_logs)", ) + parser.add_argument( + "--service-name", + "-s", + help="Service name for connecting to the service (default: {deployment_name}-frontend)", + ) args = parser.parse_args() # Example usage with parsed arguments - client = DynamoDeploymentClient(namespace=args.namespace, base_log_dir=args.log_dir) + client = DynamoDeploymentClient( + namespace=args.namespace, + base_log_dir=args.log_dir, + service_name=args.service_name, + ) try: # Create deployment from yaml file diff --git a/benchmarks/profiler/utils/genai_perf.py b/benchmarks/profiler/utils/genai_perf.py index 8416059afb5..19aab8a9850 100644 --- a/benchmarks/profiler/utils/genai_perf.py +++ b/benchmarks/profiler/utils/genai_perf.py @@ -34,7 +34,7 @@ def _get_common_genai_perf_cmd( artifact_dir, seed=100, model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B", - port=8000, + base_url="http://localhost:8000", ): return [ "genai-perf", @@ -49,7 +49,7 @@ def _get_common_genai_perf_cmd( "/v1/chat/completions", "--streaming", "--url", - f"http://localhost:{port}", + base_url, "--extra-inputs", "ignore_eos:true", "--extra-inputs", @@ -69,13 +69,13 @@ def get_prefill_genai_perf_cmd( seed=100, model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B", osl=5, - port=8000, + base_url="http://localhost:8000", ): return _get_common_genai_perf_cmd( artifact_dir, seed, model, - port, + base_url, ) + [ "--synthetic-input-tokens-mean", str(isl), @@ -103,13 +103,13 @@ def get_decode_genai_perf_cmd( num_request, seed=100, model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B", - port=8000, + base_url="http://localhost:8000", ): return _get_common_genai_perf_cmd( artifact_dir, seed, model, - port, + base_url, ) + [ "--synthetic-input-tokens-mean", str(isl), @@ -146,10 +146,12 @@ def get_gap_result(artifact_dir: str) -> dict: return json.load(f) -def benchmark_prefill(isl, genai_perf_artifact_dir, model_name, port): +def benchmark_prefill( + isl, genai_perf_artifact_dir, model_name, base_url="http://localhost:8000" +): logger.info(f"Running genai-perf 
with isl {isl}") genai_perf_cmd = get_prefill_genai_perf_cmd( - isl, genai_perf_artifact_dir, model=model_name, port=port + isl, genai_perf_artifact_dir, model=model_name, base_url=base_url ) print(f"genai-perf cmd: {genai_perf_cmd}") # import pdb; pdb.set_trace() @@ -171,12 +173,20 @@ def benchmark_prefill(isl, genai_perf_artifact_dir, model_name, port): return None -def benchmark_decode(isl, osl, num_request, genai_perf_artifact_dir, model_name, port): +def benchmark_decode( + isl, + osl, + num_request, + genai_perf_artifact_dir, + model_name, + base_url="http://localhost:8000", +): logger.info(f"Profiling decode with num_request {num_request}...") # first warm-up the engine by pre-computing all prefill tokens # we use the same random seed to make sure the prompt is the same seed = random.randint(0, 1000000) + genai_perf_cmd = get_decode_genai_perf_cmd( isl, osl, @@ -184,7 +194,7 @@ def benchmark_decode(isl, osl, num_request, genai_perf_artifact_dir, model_name, num_request, seed=seed, model=model_name, - port=port, + base_url=base_url, ) gap_process = subprocess.Popen( genai_perf_cmd, @@ -201,7 +211,7 @@ def benchmark_decode(isl, osl, num_request, genai_perf_artifact_dir, model_name, num_request, seed=seed, model=model_name, - port=port, + base_url=base_url, ) gap_process = subprocess.Popen( genai_perf_cmd, diff --git a/container/deps/requirements.txt b/container/deps/requirements.txt index cd0d42ac852..beb2551eec8 100644 --- a/container/deps/requirements.txt +++ b/container/deps/requirements.txt @@ -14,13 +14,16 @@ # limitations under the License. accelerate==1.6.0 +aiofiles av==15.0.0 fastapi==0.115.6 ftfy genai-perf==0.0.13 grpcio-tools==1.66.0 httpx +kr8s kubernetes==32.0.1 +kubernetes_asyncio matplotlib msgspec mypy @@ -45,6 +48,3 @@ tensorboardX==2.6.2.2 transformers types-PyYAML uvicorn -aiofiles -kubernetes_asyncio -kr8s \ No newline at end of file diff --git a/examples/vllm/deploy/profile_sla_binding.yaml b/examples/vllm/deploy/profile_sla_binding.yaml index f32a0f2f51a..6743dd4c52e 100644 --- a/examples/vllm/deploy/profile_sla_binding.yaml +++ b/examples/vllm/deploy/profile_sla_binding.yaml @@ -1,3 +1,17 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. apiVersion: rbac.authorization.k8s.io/v1 kind: RoleBinding metadata: diff --git a/examples/vllm/deploy/profile_sla_rbac.yaml b/examples/vllm/deploy/profile_sla_rbac.yaml index 5cf2f6e5a82..65494d9a389 100644 --- a/examples/vllm/deploy/profile_sla_rbac.yaml +++ b/examples/vllm/deploy/profile_sla_rbac.yaml @@ -1,25 +1,31 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. apiVersion: rbac.authorization.k8s.io/v1 kind: Role metadata: name: profile-sla-role namespace: ${NAMESPACE} rules: - # DynamoGraphDeployment custom resources + # DynamoGraphDeployment custom resources - needed for create/get/delete operations - apiGroups: ["nvidia.com"] resources: ["dynamographdeployments"] - verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] - # Core resources needed for deployment management + verbs: ["get", "create", "delete"] + # Pods - needed for listing pods by label selector and getting logs - apiGroups: [""] - resources: ["pods", "services", "configmaps", "secrets"] - verbs: ["get", "list", "watch"] + resources: ["pods"] + verbs: ["list"] - apiGroups: [""] resources: ["pods/log"] - verbs: ["get", "list"] - # Apps resources - - apiGroups: ["apps"] - resources: ["deployments", "replicasets"] - verbs: ["get", "list", "watch"] - # For port forwarding - - apiGroups: [""] - resources: ["pods/portforward"] - verbs: ["create"] + verbs: ["get"] diff --git a/examples/vllm/deploy/profile_sla_sa.yaml b/examples/vllm/deploy/profile_sla_sa.yaml index 6e6955e655d..e918a7d275c 100644 --- a/examples/vllm/deploy/profile_sla_sa.yaml +++ b/examples/vllm/deploy/profile_sla_sa.yaml @@ -1,3 +1,17 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. apiVersion: v1 kind: ServiceAccount metadata: diff --git a/k8s.sh b/k8s.sh index 2c213fe5587..08fb06e62ae 100755 --- a/k8s.sh +++ b/k8s.sh @@ -1,4 +1,19 @@ #!/usr/bin/env bash + +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. set -euo pipefail # 1. 
Install Homebrew if missing From ff96b9e61e3122d19b56fee29fc8b31b916aee43 Mon Sep 17 00:00:00 2001 From: hongkuan Date: Fri, 18 Jul 2025 15:07:51 -0700 Subject: [PATCH 24/58] add try-catch waiting for deployment --- benchmarks/profiler/profile_sla.py | 156 ++++++++++-------- .../dynamo/planner/kubernetes_connector.py | 18 ++ 2 files changed, 106 insertions(+), 68 deletions(-) diff --git a/benchmarks/profiler/profile_sla.py b/benchmarks/profiler/profile_sla.py index 1a5a287fd4b..279e65049ad 100644 --- a/benchmarks/profiler/profile_sla.py +++ b/benchmarks/profiler/profile_sla.py @@ -98,8 +98,12 @@ async def run_profile(args): ) await client.create_deployment(prefill_config_fn) logger.info("Waiting for deployment to be ready...") - await client.wait_for_deployment_ready() - logger.info("Deployment is ready") + try: + await client.wait_for_deployment_ready() + logger.info("Deployment is ready") + except TimeoutError: + logger.error("Deployment failed to become ready within timeout, skipping profiling") + continue logger.info("Getting deployment logs...") await client.get_deployment_logs() @@ -162,8 +166,12 @@ async def run_profile(args): ) await client.create_deployment(decode_config_fn) logger.info("Waiting for deployment to be ready...") - await client.wait_for_deployment_ready() - logger.info("Deployment is ready") + try: + await client.wait_for_deployment_ready() + logger.info("Deployment is ready") + except TimeoutError: + logger.error("Deployment failed to become ready within timeout, skipping profiling") + continue logger.info("Getting deployment logs...") await client.get_deployment_logs() @@ -300,31 +308,37 @@ async def run_profile(args): ) await client.create_deployment(prefill_config_fn) logger.info("Waiting for deployment to be ready...") - await client.wait_for_deployment_ready() - logger.info("Deployment is ready") - - logger.info("Getting deployment logs...") - await client.get_deployment_logs() - logger.info( - f"Logs have been saved to {client.base_log_dir / client.deployment_name}" - ) + try: + await client.wait_for_deployment_ready() + logger.info("Deployment is ready") + skip_profile = False + except TimeoutError: + logger.error("Deployment failed to become ready within timeout, skipping profiling") + skip_profile = True - base_url = client.get_service_url() - for isl in range( - 100, - args.max_context_length, - (args.max_context_length - 100) // args.prefill_interpolation_granularity, - ): - # run genai-perf - genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}" - gap_result = benchmark_prefill( - isl, genai_perf_artifact_dir, model_name, base_url=base_url + if not skip_profile: + logger.info("Getting deployment logs...") + await client.get_deployment_logs() + logger.info( + f"Logs have been saved to {client.base_log_dir / client.deployment_name}" ) - if gap_result is not None: - ttft = gap_result["time_to_first_token"]["avg"] - prefill_isl.append(isl) - prefill_ttft.append(ttft) - prefill_thpt_per_gpu.append(isl / ttft / best_prefill_tp * 1000) + + base_url = client.get_service_url() + for isl in range( + 100, + args.max_context_length, + (args.max_context_length - 100) // args.prefill_interpolation_granularity, + ): + # run genai-perf + genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}" + gap_result = benchmark_prefill( + isl, genai_perf_artifact_dir, model_name, base_url=base_url + ) + if gap_result is not None: + ttft = gap_result["time_to_first_token"]["avg"] + prefill_isl.append(isl) + prefill_ttft.append(ttft) + prefill_thpt_per_gpu.append(isl / ttft / 
best_prefill_tp * 1000) print("Cleaning up deployment...") await client.delete_deployment() @@ -378,52 +392,58 @@ async def run_profile(args): ) await client.create_deployment(decode_config_fn) logger.info("Waiting for deployment to be ready...") - await client.wait_for_deployment_ready() - logger.info("Deployment is ready") - - logger.info("Getting deployment logs...") - await client.get_deployment_logs() - logger.info( - f"Logs have been saved to {client.base_log_dir / client.deployment_name}" - ) + try: + await client.wait_for_deployment_ready() + logger.info("Deployment is ready") + skip_profile = False + except TimeoutError: + logger.error("Deployment failed to become ready within timeout, skipping profiling") + skip_profile = True - max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log( - f"{work_dir}/vllm-v1-agg/vllmdecodeworker/0.log" - ) + if not skip_profile: + logger.info("Getting deployment logs...") + await client.get_deployment_logs() + logger.info( + f"Logs have been saved to {client.base_log_dir / client.deployment_name}" + ) - osl = 500 # not too large to reduce ITL variance, not too small to have stable measurement - base_url = client.get_service_url() - for isl in range( - 100, - args.max_context_length - osl, - (args.max_context_length - osl) // args.decode_interpolation_granularity, - ): - max_concurrency = max_kv_tokens // (isl + osl) - sweep_num_request = list( - range( - 1, - max_concurrency, - max_concurrency // args.decode_interpolation_granularity, - ) + max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log( + f"{work_dir}/vllm-v1-agg/vllmdecodeworker/0.log" ) - for num_request in sweep_num_request: - genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}" - gap_result = benchmark_decode( - isl, - osl, - num_request, - genai_perf_artifact_dir, - model_name, - base_url=base_url, + + osl = 500 # not too large to reduce ITL variance, not too small to have stable measurement + base_url = client.get_service_url() + for isl in range( + 100, + args.max_context_length - osl, + (args.max_context_length - osl) // args.decode_interpolation_granularity, + ): + max_concurrency = max_kv_tokens // (isl + osl) + sweep_num_request = list( + range( + 1, + max_concurrency, + max_concurrency // args.decode_interpolation_granularity, + ) ) - if gap_result is not None: - itl = gap_result["inter_token_latency"]["avg"] - x_kv_usage.append((isl + osl / 2) * num_request / max_kv_tokens) - y_context_length.append(isl + osl / 2) - z_itl.append(itl) - z_thpt_per_gpu.append( - gap_result["output_token_throughput"]["avg"] / tp_size + for num_request in sweep_num_request: + genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}" + gap_result = benchmark_decode( + isl, + osl, + num_request, + genai_perf_artifact_dir, + model_name, + base_url=base_url, ) + if gap_result is not None: + itl = gap_result["inter_token_latency"]["avg"] + x_kv_usage.append((isl + osl / 2) * num_request / max_kv_tokens) + y_context_length.append(isl + osl / 2) + z_itl.append(itl) + z_thpt_per_gpu.append( + gap_result["output_token_throughput"]["avg"] / tp_size + ) print("Cleaning up deployment...") await client.delete_deployment() diff --git a/components/planner/src/dynamo/planner/kubernetes_connector.py b/components/planner/src/dynamo/planner/kubernetes_connector.py index 021943f94d1..e089d9a83fb 100644 --- a/components/planner/src/dynamo/planner/kubernetes_connector.py +++ b/components/planner/src/dynamo/planner/kubernetes_connector.py @@ -77,3 +77,21 
@@ def _get_current_replicas(self, deployment: dict, component_name: str) -> int:
     def _get_graph_deployment_name(self, deployment: dict) -> str:
         """Get the name of the graph deployment"""
         return deployment["metadata"]["name"]
+
+if __name__ == "__main__":
+    import argparse
+    import asyncio
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--namespace", type=str, default="dynamo")
+    parser.add_argument("--action", type=str, choices=["add", "remove"], required=True)
+    parser.add_argument("--component", type=str, default="planner")
+    parser.add_argument("--blocking", action="store_true")
+    args = parser.parse_args()
+    connector = KubernetesConnector(args.namespace)
+
+    if args.action == "add":
+        task = connector.add_component(args.component, args.blocking)
+    elif args.action == "remove":
+        task = connector.remove_component(args.component, args.blocking)
+    asyncio.run(task)
\ No newline at end of file

From d2b6b00e95fc3a179c10e1d48781c2aa29cd7644 Mon Sep 17 00:00:00 2001
From: hhzhang16 <54051230+hhzhang16@users.noreply.github.com>
Date: Mon, 21 Jul 2025 10:25:36 -0700
Subject: [PATCH 25/58] feat: clean up outlying DGDs upon SLA profiling
 failure (#2016)

---
 benchmarks/profiler/profile_sla.py          | 771 ++++++++++--------
 .../profiler/utils/dynamo_deployment.py     |  10 +-
 2 files changed, 424 insertions(+), 357 deletions(-)

diff --git a/benchmarks/profiler/profile_sla.py b/benchmarks/profiler/profile_sla.py
index 279e65049ad..458e3f1b1d2 100644
--- a/benchmarks/profiler/profile_sla.py
+++ b/benchmarks/profiler/profile_sla.py
@@ -43,47 +43,301 @@
 logger.addHandler(console_handler)
 
 
+async def cleanup_remaining_deployments(deployment_clients, namespace):
+    """Clean up any remaining tracked deployments, handling errors gracefully."""
+    if not deployment_clients:
+        logger.info("No deployments to clean up")
+        return
+
+    logger.info(f"Cleaning up {len(deployment_clients)} remaining deployments...")
+    for client in deployment_clients:
+        try:
+            logger.info(f"Attempting to delete deployment {client.deployment_name}...")
+            await client.delete_deployment()
+            logger.info(f"Successfully deleted deployment {client.deployment_name}")
+        except Exception as e:
+            # If deployment doesn't exist (404), that's fine - it was already cleaned up
+            if "404" in str(e) or "not found" in str(e).lower():
+                logger.info(f"Deployment {client.deployment_name} was already deleted")
+            else:
+                logger.error(
+                    f"Failed to delete deployment {client.deployment_name}: {e}"
+                )
+
+
 async def run_profile(args):
-    config_modifier = CONFIG_MODIFIERS[args.backend]
+    # List to track all created deployment clients for cleanup in case of failure
+    deployment_clients = []
+
+    try:
+        config_modifier = CONFIG_MODIFIERS[args.backend]
+
+        if args.example_dir is None:
+            logger.info(
+                "Example directory not provided, inferring from config file location..."
+ ) + try: + args.example_dir = os.path.dirname(os.path.dirname(args.config)) + except Exception: + logger.error( + "Failed to infer example directory, please provide explicitly using --example-dir " + ) + exit(1) + + with open(args.config, "r") as f: + config = yaml.safe_load(f) - if args.example_dir is None: + profile_tp_size = [ + 2**i + for i in range(int(math.log2(args.max_num_gpus_per_engine)) + 1) + if args.min_num_gpus_per_engine <= 2**i <= args.max_num_gpus_per_engine + ] + logger.info(f"Profiling TP sizes: {profile_tp_size}") + + os.makedirs(args.output_dir, exist_ok=True) + + model_name = config_modifier.get_model_name(config) + + # first profile prefill + prefill_tp_size = [] + prefill_ttft = [] + prefill_thpt_per_gpu = [] + logger.info("Profiling prefill...") + prefill_config = config_modifier.convert_config(config, "prefill") + for tp_size in profile_tp_size: + logger.info(f"Profiling prefill with TP size {tp_size}...") + prefill_config = config_modifier.set_config_tp_size(prefill_config, tp_size) + logger.info(f"Dynamo config: {prefill_config}") + + work_dir = f"{args.output_dir}/prefill_tp{tp_size}" + os.makedirs(work_dir, exist_ok=True) + + prefill_config_fn = f"{work_dir}/config.yaml" + with open(prefill_config_fn, "w") as f: + yaml.dump(prefill_config, f) + + client = DynamoDeploymentClient( + namespace=args.namespace, + base_log_dir=work_dir, + model_name=model_name, + service_name=args.service_name, + ) + deployment_clients.append(client) # Track for cleanup + await client.create_deployment(prefill_config_fn) + logger.info("Waiting for deployment to be ready...") + try: + await client.wait_for_deployment_ready() + logger.info("Deployment is ready") + except TimeoutError: + logger.error( + "Deployment failed to become ready within timeout, skipping profiling" + ) + continue + + logger.info("Getting deployment logs...") + await client.get_deployment_logs() + logger.info( + f"Logs have been saved to {client.base_log_dir / client.deployment_name}" + ) + + # run genai-perf + base_url = client.get_service_url() + genai_perf_artifact_dir = f"{work_dir}/gap_isl{args.isl}" + gap_result = benchmark_prefill( + args.isl, genai_perf_artifact_dir, model_name, base_url=base_url + ) + if gap_result is not None: + ttft = gap_result["time_to_first_token"]["avg"] + prefill_tp_size.append(tp_size) + prefill_ttft.append(ttft) + prefill_thpt_per_gpu.append(args.isl / ttft / tp_size * 1000) + + print("Cleaning up deployment...") + await client.delete_deployment() + deployment_clients.remove( + client + ) # Remove from cleanup list since it's deleted + print("Deployment deleted") + + # Plot the results as a 2D scatter plot + if prefill_tp_size and prefill_ttft and prefill_thpt_per_gpu: + plot_prefill_performance( + prefill_tp_size, + prefill_ttft, + prefill_thpt_per_gpu, + args.ttft, + args.output_dir, + ) + + # then profile decode + decode_tp_size = [] + decode_itl = [] + decode_thpt_per_gpu = [] + decode_concurrency = [] + decode_kv_cache_size = [] + decode_results = [] # Store partial results for plotting later + logger.info("Profiling decode...") + decode_config = config_modifier.convert_config(config, "decode") + for tp_size in profile_tp_size: + logger.info(f"Profiling decode with TP size {tp_size}...") + decode_config = config_modifier.set_config_tp_size(decode_config, tp_size) + logger.info(f"Dynamo config: {decode_config}") + + work_dir = f"{args.output_dir}/decode_tp{tp_size}" + os.makedirs(work_dir, exist_ok=True) + + decode_config_fn = f"{work_dir}/config.yaml" + with 
open(decode_config_fn, "w") as f:
+                yaml.dump(decode_config, f)
+
+            client = DynamoDeploymentClient(
+                namespace=args.namespace,
+                base_log_dir=work_dir,
+                model_name=model_name,
+                service_name=args.service_name,
+            )
+            deployment_clients.append(client)  # Track for cleanup
+            await client.create_deployment(decode_config_fn)
+            logger.info("Waiting for deployment to be ready...")
+            try:
+                await client.wait_for_deployment_ready()
+                logger.info("Deployment is ready")
+            except TimeoutError:
+                logger.error(
+                    "Deployment failed to become ready within timeout, skipping profiling"
+                )
+                continue
+
+            logger.info("Getting deployment logs...")
+            await client.get_deployment_logs()
+            logger.info(
+                f"Logs have been saved to {client.base_log_dir / client.deployment_name}"
+            )
+
+            max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log(
+                f"{work_dir}/vllm-v1-agg/vllmdecodeworker/0.log"
+            )
+            max_concurrency = max_kv_tokens // (args.isl + args.osl)
+            sweep_num_request = [
+                num for num in DECODE_NUM_REQUESTS_RANGE if num < max_concurrency
+            ]
+            logger.info(
+                f"Sweeping num_request range based on maximum number of kv tokens: {sweep_num_request}"
+            )
+
+            engine_decode_itl = []
+            engine_decode_thpt_per_gpu = []
+            base_url = client.get_service_url()
+            for num_request in sweep_num_request:
+                genai_perf_artifact_dir = f"{work_dir}/gap_request{num_request}_isl{args.isl}_osl{args.osl}_n{num_request}"
+                gap_result = benchmark_decode(
+                    args.isl,
+                    args.osl,
+                    num_request,
+                    genai_perf_artifact_dir,
+                    model_name,
+                    base_url=base_url,
+                )
+                if gap_result is not None:
+                    itl = gap_result["inter_token_latency"]["avg"]
+                    thpt_per_gpu = (
+                        gap_result["output_token_throughput"]["avg"] / tp_size
+                    )
+                    engine_decode_itl.append(itl)
+                    engine_decode_thpt_per_gpu.append(thpt_per_gpu)
+                    decode_tp_size.append(tp_size)
+                    decode_itl.append(itl)
+                    decode_thpt_per_gpu.append(thpt_per_gpu)
+                    decode_concurrency.append(num_request)
+                    decode_kv_cache_size.append(max_kv_tokens)
+
+            print("Cleaning up deployment...")
+            await client.delete_deployment()
+            deployment_clients.remove(
+                client
+            )  # Remove from cleanup list since it's deleted
+            print("Deployment deleted")
+
+            # Store partial results for plotting later
+            decode_results.append(
+                (tp_size, engine_decode_itl, engine_decode_thpt_per_gpu)
+            )
+
+        # Plot all decode results after profiling is complete
+        if decode_results:
+            plot_decode_performance(decode_results, args.itl, args.output_dir)
+
+        logger.info("Analyzing results and generating recommendations...")
+        # select best tp size for prefill
+        if min(prefill_ttft) > args.ttft:
+            logger.info(
+                "No TP size satisfies the TTFT requirement, please try a smaller model or a more powerful GPU SKU"
+            )
+            selected_prefill_idx = int(np.argmin(np.array(prefill_ttft)))
+        else:
+            valid_indices = [
+                i for i, ttft in enumerate(prefill_ttft) if ttft <= args.ttft
+            ]
+            # Among valid TP sizes, select the one with highest throughput per GPU
+            valid_thpts = [prefill_thpt_per_gpu[i] for i in valid_indices]
+            max_thpt_idx = valid_indices[int(np.argmax(valid_thpts))]
+            selected_prefill_idx = max_thpt_idx
+        logger.info(
+ f"Suggested prefill TP:{prefill_tp_size[selected_prefill_idx]} (TTFT {prefill_ttft[selected_prefill_idx]:.2f} ms, throughput {prefill_thpt_per_gpu[selected_prefill_idx]:.2f} tokens/s/GPU)" ) - try: - args.example_dir = os.path.dirname(os.path.dirname(args.config)) - except Exception: - logger.error( - "Failed to infer example directory, please provide explicitly using --example-dir " + + # scale up if estimated TTFT is 120% of target TTFT + prefill_queue_size_upper_bound = max( + 0.1, args.ttft * 1.2 / prefill_ttft[selected_prefill_idx] - 1 + ) + # scale down if estimated TTFT is 80% of target TTFT + prefill_queue_size_lower_bound = max( + 0.1, args.ttft * 0.8 / prefill_ttft[selected_prefill_idx] - 1 + ) + logger.info( + f"Suggested planner upper/lower bound for prefill queue size: {prefill_queue_size_upper_bound:.2f}/{prefill_queue_size_lower_bound:.2f}" + ) + + # select best tp size for decode + if min(decode_itl) > args.itl: + logger.info( + "No TP size satisfies the ITL requirement, please try a smaller model or a more powerful GPU SKU" ) - exit(1) - - with open(args.config, "r") as f: - config = yaml.safe_load(f) - - profile_tp_size = [ - 2**i - for i in range(int(math.log2(args.max_num_gpus_per_engine)) + 1) - if args.min_num_gpus_per_engine <= 2**i <= args.max_num_gpus_per_engine - ] - logger.info(f"Profiling TP sizes: {profile_tp_size}") - - os.makedirs(args.output_dir, exist_ok=True) - - model_name = config_modifier.get_model_name(config) - - # first profile prefill - prefill_tp_size = [] - prefill_ttft = [] - prefill_thpt_per_gpu = [] - logger.info("Profiling prefill...") - prefill_config = config_modifier.convert_config(config, "prefill") - for tp_size in profile_tp_size: - logger.info(f"Profiling prefill with TP size {tp_size}...") + selected_decode_idx = int(np.argmin(np.array(decode_itl))) + else: + valid_indices = [i for i, itl in enumerate(decode_itl) if itl <= args.itl] + # Among valid TP sizes, select the one with highest throughput per GPU + valid_thpts = [decode_thpt_per_gpu[i] for i in valid_indices] + max_thpt_idx = valid_indices[int(np.argmax(valid_thpts))] + selected_decode_idx = max_thpt_idx + logger.info( + f"Suggested decode TP:{decode_tp_size[selected_decode_idx]} (ITL {decode_itl[selected_decode_idx]:.2f} ms, throughput {decode_thpt_per_gpu[selected_decode_idx]:.2f} tokens/s/GPU)" + ) + + # calculate kv cache utlization for the selected TP and concurrency + selected_decode_kv_cache_utilization = ( + decode_concurrency[selected_decode_idx] + * (args.isl + args.osl / 2) + / decode_kv_cache_size[selected_decode_idx] + ) + # set a +- 20% range for the kv cache utilization + logger.info( + f"Suggested planner upper/lower bound for decode kv cache utilization: {min(1, selected_decode_kv_cache_utilization + 0.2):.2f}/{max(0.1, selected_decode_kv_cache_utilization - 0.2):.2f}" + ) + + # interpolate ISL - TTFT with best prefill TP + best_prefill_tp = prefill_tp_size[selected_prefill_idx] + prefill_isl = [] + prefill_ttft = [] + prefill_thpt_per_gpu = [] + logger.info( + f"Profiling prefill under best TP {best_prefill_tp} with different ISL..." 
+ ) + prefill_config = config_modifier.convert_config(config, "prefill") prefill_config = config_modifier.set_config_tp_size(prefill_config, tp_size) logger.info(f"Dynamo config: {prefill_config}") - work_dir = f"{args.output_dir}/prefill_tp{tp_size}" + work_dir = f"{args.output_dir}/selected_prefill_interpolation" os.makedirs(work_dir, exist_ok=True) prefill_config_fn = f"{work_dir}/config.yaml" @@ -96,62 +350,88 @@ async def run_profile(args): model_name=model_name, service_name=args.service_name, ) + deployment_clients.append(client) # Track for cleanup await client.create_deployment(prefill_config_fn) logger.info("Waiting for deployment to be ready...") try: await client.wait_for_deployment_ready() logger.info("Deployment is ready") + skip_profile = False except TimeoutError: - logger.error("Deployment failed to become ready within timeout, skipping profiling") - continue + logger.error( + "Deployment failed to become ready within timeout, skipping profiling" + ) + skip_profile = True - logger.info("Getting deployment logs...") - await client.get_deployment_logs() - logger.info( - f"Logs have been saved to {client.base_log_dir / client.deployment_name}" - ) + if not skip_profile: + logger.info("Getting deployment logs...") + await client.get_deployment_logs() + logger.info( + f"Logs have been saved to {client.base_log_dir / client.deployment_name}" + ) - # run genai-perf - base_url = client.get_service_url() - genai_perf_artifact_dir = f"{work_dir}/gap_isl{args.isl}" - gap_result = benchmark_prefill( - args.isl, genai_perf_artifact_dir, model_name, base_url=base_url - ) - if gap_result is not None: - ttft = gap_result["time_to_first_token"]["avg"] - prefill_tp_size.append(tp_size) - prefill_ttft.append(ttft) - prefill_thpt_per_gpu.append(args.isl / ttft / tp_size * 1000) + base_url = client.get_service_url() + for isl in range( + 100, + args.max_context_length, + (args.max_context_length - 100) + // args.prefill_interpolation_granularity, + ): + # run genai-perf + genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}" + gap_result = benchmark_prefill( + isl, genai_perf_artifact_dir, model_name, base_url=base_url + ) + if gap_result is not None: + ttft = gap_result["time_to_first_token"]["avg"] + prefill_isl.append(isl) + prefill_ttft.append(ttft) + prefill_thpt_per_gpu.append(isl / ttft / best_prefill_tp * 1000) print("Cleaning up deployment...") await client.delete_deployment() + deployment_clients.remove(client) # Remove from cleanup list since it's deleted print("Deployment deleted") - # Plot the results as a 2D scatter plot - if prefill_tp_size and prefill_ttft and prefill_thpt_per_gpu: - plot_prefill_performance( - prefill_tp_size, - prefill_ttft, - prefill_thpt_per_gpu, - args.ttft, - args.output_dir, - ) + # Interpolate prefill_ttft vs prefill_isl with quadratic function (y=ax^2+bx+c) + if len(prefill_isl) > 2: + logger.info("Interpolating prefill TTFT and throughput vs ISL...") + + # Convert to numpy arrays for easier manipulation + prefill_isl_np = np.array(prefill_isl) + prefill_ttft_np = np.array(prefill_ttft) + prefill_thpt_per_gpu_np = np.array(prefill_thpt_per_gpu) + + save_path = f"{work_dir}/raw_data.npz" + np.savez( + save_path, + prefill_isl=prefill_isl_np, + prefill_ttft=prefill_ttft_np, + prefill_thpt_per_gpu=prefill_thpt_per_gpu_np, + ) + + # Call the plotting function + plot_prefill_interpolation( + prefill_isl_np, prefill_ttft_np, prefill_thpt_per_gpu_np, work_dir + ) + else: + logger.warning( + "Not enough data points to perform interpolation (need at least 3 
points)" + ) - # then profile decode - decode_tp_size = [] - decode_itl = [] - decode_thpt_per_gpu = [] - decode_concurrency = [] - decode_kv_cache_size = [] - decode_results = [] # Store partial results for plotting later - logger.info("Profiling decode...") - decode_config = config_modifier.convert_config(config, "decode") - for tp_size in profile_tp_size: - logger.info(f"Profiling decode with TP size {tp_size}...") - decode_config = config_modifier.set_config_tp_size(decode_config, tp_size) + # interpolate ITL - Active_KV_Cache - Decode_Context_Length with best decode TP + x_kv_usage = [] + y_context_length = [] + z_itl = [] + z_thpt_per_gpu = [] + best_decode_tp = decode_tp_size[selected_decode_idx] + logger.info(f"Profiling decode with TP size {best_decode_tp}...") + decode_config = config_modifier.set_config_tp_size( + decode_config, best_decode_tp + ) logger.info(f"Dynamo config: {decode_config}") - work_dir = f"{args.output_dir}/decode_tp{tp_size}" + work_dir = f"{args.output_dir}/selected_decode_interpolation" os.makedirs(work_dir, exist_ok=True) decode_config_fn = f"{work_dir}/config.yaml" @@ -161,310 +441,93 @@ async def run_profile(args): client = DynamoDeploymentClient( namespace=args.namespace, base_log_dir=work_dir, - model_name=model_name, service_name=args.service_name, ) + deployment_clients.append(client) # Track for cleanup await client.create_deployment(decode_config_fn) logger.info("Waiting for deployment to be ready...") try: await client.wait_for_deployment_ready() logger.info("Deployment is ready") + skip_profile = False except TimeoutError: - logger.error("Deployment failed to become ready within timeout, skipping profiling") - continue - - logger.info("Getting deployment logs...") - await client.get_deployment_logs() - logger.info( - f"Logs have been saved to {client.base_log_dir / client.deployment_name}" - ) + logger.error( + "Deployment failed to become ready within timeout, skipping profiling" + ) + skip_profile = True - max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log( - f"{work_dir}/vllm-v1-agg/vllmdecodeworker/0.log" - ) - max_concurrency = max_kv_tokens // (args.isl + args.osl) - sweep_num_request = [ - num for num in DECODE_NUM_REQUESTS_RANGE if num < max_concurrency - ] - logger.info( - f"Sweeping num_request range based on maximum number of kv tokens: {sweep_num_request}" - ) + if not skip_profile: + logger.info("Getting deployment logs...") + await client.get_deployment_logs() + logger.info( + f"Logs have been saved to {client.base_log_dir / client.deployment_name}" + ) - engine_decode_itl = [] - engine_decode_thpt_per_gpu = [] - base_url = client.get_service_url() - for num_request in sweep_num_request: - genai_perf_artifact_dir = f"{work_dir}/gap_request{num_request}_isl{args.isl}_osl{args.osl}_n{num_request}" - gap_result = benchmark_decode( - args.isl, - args.osl, - num_request, - genai_perf_artifact_dir, - model_name, - base_url=base_url, + max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log( + f"{work_dir}/vllm-v1-agg/vllmdecodeworker/0.log" ) - if gap_result is not None: - itl = gap_result["inter_token_latency"]["avg"] - thpt_per_gpu = gap_result["output_token_throughput"]["avg"] / tp_size - engine_decode_itl.append(itl) - engine_decode_thpt_per_gpu.append(thpt_per_gpu) - decode_tp_size.append(tp_size) - decode_itl.append(itl) - decode_thpt_per_gpu.append(thpt_per_gpu) - decode_concurrency.append(num_request) - decode_kv_cache_size.append(max_kv_tokens) + + osl = 500 # not too large to reduce ITL variance, not too 
small to have stable measurement + base_url = client.get_service_url() + for isl in range( + 100, + args.max_context_length - osl, + (args.max_context_length - osl) + // args.decode_interpolation_granularity, + ): + max_concurrency = max_kv_tokens // (isl + osl) + sweep_num_request = list( + range( + 1, + max_concurrency, + max_concurrency // args.decode_interpolation_granularity, + ) + ) + for num_request in sweep_num_request: + genai_perf_artifact_dir = ( + f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}" + ) + gap_result = benchmark_decode( + isl, + osl, + num_request, + genai_perf_artifact_dir, + model_name, + base_url=base_url, + ) + if gap_result is not None: + itl = gap_result["inter_token_latency"]["avg"] + x_kv_usage.append((isl + osl / 2) * num_request / max_kv_tokens) + y_context_length.append(isl + osl / 2) + z_itl.append(itl) + z_thpt_per_gpu.append( + gap_result["output_token_throughput"]["avg"] / tp_size + ) print("Cleaning up deployment...") await client.delete_deployment() + deployment_clients.remove(client) # Remove from cleanup list since it's deleted print("Deployment deleted") - # Store partial results for plotting later - decode_results.append((tp_size, engine_decode_itl, engine_decode_thpt_per_gpu)) - - # Plot all decode results after profiling is complete - if decode_results: - plot_decode_performance(decode_results, args.itl, args.output_dir) - - logger.info("Analyzing results and generate recommendations...") - # select best tp size for prefill - if min(prefill_ttft) > args.ttft: - logger.info( - "No TP size satisfies the TTFT requirement, please try a smaller model or a more powerful GPU SKU" - ) - selected_prefill_idx = int(np.argmin(np.array(prefill_ttft))) - else: - valid_indices = [i for i, ttft in enumerate(prefill_ttft) if ttft <= args.ttft] - # Among valid TP sizes, select the one with highest throughput per GPU - valid_thpts = [prefill_thpt_per_gpu[i] for i in valid_indices] - max_thpt_idx = valid_indices[int(np.argmax(valid_thpts))] - selected_prefill_idx = max_thpt_idx - logger.info( - f"Suggested prefill TP:{prefill_tp_size[selected_prefill_idx]} (TTFT {prefill_ttft[selected_prefill_idx]:.2f} ms, throughput {prefill_thpt_per_gpu[selected_prefill_idx]:.2f} tokens/s/GPU)" - ) - - # scale up if estimated TTFT is 120% of target TTFT - prefill_queue_size_upper_bound = max( - 0.1, args.ttft * 1.2 / prefill_ttft[selected_prefill_idx] - 1 - ) - # scale down if estimated TTFT is 80% of target TTFT - prefill_queue_size_lower_bound = max( - 0.1, args.ttft * 0.8 / prefill_ttft[selected_prefill_idx] - 1 - ) - logger.info( - f"Suggested planner upper/lower bound for prefill queue size: {prefill_queue_size_upper_bound:.2f}/{prefill_queue_size_lower_bound:.2f}" - ) - - # select best tp size for decode - if min(decode_itl) > args.itl: - logger.info( - "No TP size satisfies the ITL requirement, please try a smaller model or a more powerful GPU SKU" - ) - selected_decode_idx = int(np.argmin(np.array(decode_itl))) - else: - valid_indices = [i for i, itl in enumerate(decode_itl) if itl <= args.itl] - # Among valid TP sizes, select the one with highest throughput per GPU - valid_thpts = [decode_thpt_per_gpu[i] for i in valid_indices] - max_thpt_idx = valid_indices[int(np.argmax(valid_thpts))] - selected_decode_idx = max_thpt_idx - logger.info( - f"Suggested decode TP:{decode_tp_size[selected_decode_idx]} (ITL {decode_itl[selected_decode_idx]:.2f} ms, throughput {decode_thpt_per_gpu[selected_decode_idx]:.2f} tokens/s/GPU)" - ) - - # calculate kv cache utlization for the 
selected TP and concurrency - selected_decode_kv_cache_utilization = ( - decode_concurrency[selected_decode_idx] - * (args.isl + args.osl / 2) - / decode_kv_cache_size[selected_decode_idx] - ) - # set a +- 20% range for the kv cache utilization - logger.info( - f"Suggested planner upper/lower bound for decode kv cache utilization: {min(1, selected_decode_kv_cache_utilization + 0.2):.2f}/{max(0.1, selected_decode_kv_cache_utilization - 0.2):.2f}" - ) - - # interpolate ISL - TTFT with best prefill TP - best_prefill_tp = prefill_tp_size[selected_prefill_idx] - prefill_isl = [] - prefill_ttft = [] - prefill_thpt_per_gpu = [] - logger.info( - f"Profiling prefill under best TP {best_prefill_tp} with different ISL..." - ) - prefill_config = config_modifier.convert_config(config, "prefill") - prefill_config = config_modifier.set_config_tp_size(prefill_config, tp_size) - logger.info(f"Dynamo config: {prefill_config}") - - work_dir = f"{args.output_dir}/selected_prefill_interpolation" - os.makedirs(work_dir, exist_ok=True) - - prefill_config_fn = f"{work_dir}/config.yaml" - with open(prefill_config_fn, "w") as f: - yaml.dump(prefill_config, f) - - client = DynamoDeploymentClient( - namespace=args.namespace, - base_log_dir=work_dir, - model_name=model_name, - service_name=args.service_name, - ) - await client.create_deployment(prefill_config_fn) - logger.info("Waiting for deployment to be ready...") - try: - await client.wait_for_deployment_ready() - logger.info("Deployment is ready") - skip_profile = False - except TimeoutError: - logger.error("Deployment failed to become ready within timeout, skipping profiling") - skip_profile = True - - if not skip_profile: - logger.info("Getting deployment logs...") - await client.get_deployment_logs() - logger.info( - f"Logs have been saved to {client.base_log_dir / client.deployment_name}" - ) - - base_url = client.get_service_url() - for isl in range( - 100, - args.max_context_length, - (args.max_context_length - 100) // args.prefill_interpolation_granularity, - ): - # run genai-perf - genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}" - gap_result = benchmark_prefill( - isl, genai_perf_artifact_dir, model_name, base_url=base_url - ) - if gap_result is not None: - ttft = gap_result["time_to_first_token"]["avg"] - prefill_isl.append(isl) - prefill_ttft.append(ttft) - prefill_thpt_per_gpu.append(isl / ttft / best_prefill_tp * 1000) - - print("Cleaning up deployment...") - await client.delete_deployment() - print("Deployment deleted") - - # Interpolate prefill_ttft vs prefill_isl with quadratic function (y=ax^2+bx+c) - if len(prefill_isl) > 2: - logger.info("Interpolating prefill TTFT and throughput vs ISL...") - - # Convert to numpy arrays for easier manipulation - prefill_isl_np = np.array(prefill_isl) - prefill_ttft_np = np.array(prefill_ttft) - prefill_thpt_per_gpu_np = np.array(prefill_thpt_per_gpu) - + # Save the data points to a .npz file save_path = f"{work_dir}/raw_data.npz" np.savez( save_path, - prefill_isl=prefill_isl_np, - prefill_ttft=prefill_ttft_np, - prefill_thpt_per_gpu=prefill_thpt_per_gpu_np, - ) - - # Call the plotting function - plot_prefill_interpolation( - prefill_isl_np, prefill_ttft_np, prefill_thpt_per_gpu_np, work_dir - ) - else: - logger.warning( - "Not enough data points to perform interpolation (need at least 3 points)" - ) - - # interpolate ITL - Active_KV_Cache - Decode_Context_Length with best decode TP - x_kv_usage = [] - y_context_length = [] - z_itl = [] - z_thpt_per_gpu = [] - best_decode_tp = 
decode_tp_size[selected_decode_idx] - logger.info(f"Profiling decode with TP size {best_decode_tp}...") - decode_config = config_modifier.set_config_tp_size(decode_config, best_decode_tp) - logger.info(f"Dynamo config: {decode_config}") - - work_dir = f"{args.output_dir}/selected_decode_interpolation" - os.makedirs(work_dir, exist_ok=True) - - decode_config_fn = f"{work_dir}/config.yaml" - with open(decode_config_fn, "w") as f: - yaml.dump(decode_config, f) - - client = DynamoDeploymentClient( - namespace=args.namespace, base_log_dir=work_dir, service_name=args.service_name - ) - await client.create_deployment(decode_config_fn) - logger.info("Waiting for deployment to be ready...") - try: - await client.wait_for_deployment_ready() - logger.info("Deployment is ready") - skip_profile = False - except TimeoutError: - logger.error("Deployment failed to become ready within timeout, skipping profiling") - skip_profile = True - - if not skip_profile: - logger.info("Getting deployment logs...") - await client.get_deployment_logs() - logger.info( - f"Logs have been saved to {client.base_log_dir / client.deployment_name}" + x_kv_usage=np.array(x_kv_usage), + y_context_length=np.array(y_context_length), + z_itl=np.array(z_itl), + z_thpt_per_gpu=np.array(z_thpt_per_gpu), + max_kv_tokens=np.array([max_kv_tokens]), ) + logger.info(f"Saved data points to {save_path}") - max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log( - f"{work_dir}/vllm-v1-agg/vllmdecodeworker/0.log" + # Plot 3D surface + plot_decode_3d_surface( + x_kv_usage, y_context_length, z_itl, best_decode_tp, work_dir ) - osl = 500 # not too large to reduce ITL variance, not too small to have stable measurement - base_url = client.get_service_url() - for isl in range( - 100, - args.max_context_length - osl, - (args.max_context_length - osl) // args.decode_interpolation_granularity, - ): - max_concurrency = max_kv_tokens // (isl + osl) - sweep_num_request = list( - range( - 1, - max_concurrency, - max_concurrency // args.decode_interpolation_granularity, - ) - ) - for num_request in sweep_num_request: - genai_perf_artifact_dir = f"{work_dir}/gap_isl{isl}_osl{osl}_n{num_request}" - gap_result = benchmark_decode( - isl, - osl, - num_request, - genai_perf_artifact_dir, - model_name, - base_url=base_url, - ) - if gap_result is not None: - itl = gap_result["inter_token_latency"]["avg"] - x_kv_usage.append((isl + osl / 2) * num_request / max_kv_tokens) - y_context_length.append(isl + osl / 2) - z_itl.append(itl) - z_thpt_per_gpu.append( - gap_result["output_token_throughput"]["avg"] / tp_size - ) - - print("Cleaning up deployment...") - await client.delete_deployment() - print("Deployment deleted") - - # Save the data points to a .npz file - save_path = f"{work_dir}/raw_data.npz" - np.savez( - save_path, - x_kv_usage=np.array(x_kv_usage), - y_context_length=np.array(y_context_length), - z_itl=np.array(z_itl), - z_thpt_per_gpu=np.array(z_thpt_per_gpu), - max_kv_tokens=np.array([max_kv_tokens]), - ) - logger.info(f"Saved data points to {save_path}") - - # Plot 3D surface - plot_decode_3d_surface( - x_kv_usage, y_context_length, z_itl, best_decode_tp, work_dir - ) + finally: + await cleanup_remaining_deployments(deployment_clients, args.namespace) if __name__ == "__main__": diff --git a/benchmarks/profiler/utils/dynamo_deployment.py b/benchmarks/profiler/utils/dynamo_deployment.py index 0dfd5602308..94ada76fab2 100644 --- a/benchmarks/profiler/utils/dynamo_deployment.py +++ b/benchmarks/profiler/utils/dynamo_deployment.py @@ -128,7 +128,7 
@@ async def create_deployment(self, deployment: Union[dict, str]):
         else:
             raise
 
-    async def wait_for_deployment_ready(self, timeout: int = 300):
+    async def wait_for_deployment_ready(self, timeout: int = 600):
         """
         Wait for the custom resource to be ready.
 
@@ -155,6 +155,7 @@ async def wait_for_deployment_ready(self, timeout: int = 300):
 
                 print(f"Current deployment state: {current_state}")
                 print(f"Current conditions: {conditions}")
+                print(f"Elapsed time: {time.time() - start_time:.1f}s / {timeout}s")
 
                 ready_condition = False
                 for condition in conditions:
@@ -177,8 +178,11 @@ async def wait_for_deployment_ready(self, timeout: int = 300):
                         f"Deployment not ready yet - Ready condition: {ready_condition}, State successful: {state_successful}"
                     )
 
-            except kubernetes.client.rest.ApiException:
-                pass
+            except kubernetes.client.rest.ApiException as e:
+                print(f"API Exception while checking deployment status: {e}")
+                print(f"Status code: {e.status}, Reason: {e.reason}")
+            except Exception as e:
+                print(f"Unexpected exception while checking deployment status: {e}")
 
             await asyncio.sleep(20)
 
         raise TimeoutError("Deployment failed to become ready within timeout")

From 450d37114c0c13d0e15323356fa43b9ac9be71a0 Mon Sep 17 00:00:00 2001
From: hongkuan
Date: Mon, 21 Jul 2025 17:12:34 -0700
Subject: [PATCH 26/58] add debug info

---
 examples/vllm/README.md | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/examples/vllm/README.md b/examples/vllm/README.md
index d3a0224a097..73807308061 100644
--- a/examples/vllm/README.md
+++ b/examples/vllm/README.md
@@ -150,6 +150,17 @@ cd ~/dynamo/examples/vllm/deploy
 kubectl apply -f disagg.yaml
 ```
 
+To change the `DYN_LOG` level, add the following to the YAML file:
+
+```yaml
+...
+spec:
+  envs:
+    - name: DYN_LOG
+      value: "debug" # or other log levels
+  ...
+```
+
 ### Testing the Deployment
 
 Send a test request to verify your deployment:

From 769c98e8c9632504821a48f489589f24407c0f89 Mon Sep 17 00:00:00 2001
From: hongkuan
Date: Tue, 22 Jul 2025 09:24:14 -0700
Subject: [PATCH 27/58] sla planner

---
 .../planner/src/dynamo/planner/defaults.py    |   2 +-
 .../src/dynamo/planner/local_connector.py     | 310 ------------------
 .../planner/src/dynamo/planner/planner_sla.py | 200 ++++++-----
 .../src/dynamo/planner/utils/planner_core.py  |   4 +-
 4 files changed, 124 insertions(+), 392 deletions(-)
 delete mode 100644 components/planner/src/dynamo/planner/local_connector.py

diff --git a/components/planner/src/dynamo/planner/defaults.py b/components/planner/src/dynamo/planner/defaults.py
index 60a6216f02a..7ee8d2f85b2 100644
--- a/components/planner/src/dynamo/planner/defaults.py
+++ b/components/planner/src/dynamo/planner/defaults.py
@@ -17,7 +17,7 @@
 # Source of truth for planner defaults
 class BasePlannerDefaults:
     namespace = "dynamo"
-    environment = "local"
+    environment = "kubernetes"
     backend = "vllm_v0"
     no_operation = False
     log_dir = None
diff --git a/components/planner/src/dynamo/planner/local_connector.py b/components/planner/src/dynamo/planner/local_connector.py
deleted file mode 100644
index e8654231ee8..00000000000
--- a/components/planner/src/dynamo/planner/local_connector.py
+++ /dev/null
@@ -1,310 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import asyncio -import json -import logging -import os -from pathlib import Path -from typing import Any, Dict, List - -import filelock - -from dynamo.planner.circusd import CircusController -from dynamo.planner.planner_connector import PlannerConnector -from dynamo.runtime import DistributedRuntime -from dynamo.runtime.logging import configure_dynamo_logging - -configure_dynamo_logging() -logger = logging.getLogger(__name__) - - -class LocalConnector(PlannerConnector): - def __init__(self, namespace: str, runtime: DistributedRuntime): - """ - Initialize LocalConnector and connect to CircusController. - - Args: - namespace: The Dynamo namespace - runtime: Optional DistributedRuntime instance - """ - self.namespace = namespace - self.runtime = runtime - self.state_file = Path.home() / ".dynamo" / "state" / f"{namespace}.json" - self.circus = CircusController.from_state_file(namespace) - self._lockfile = self.state_file.with_suffix(".lock") - self._file_lock = filelock.FileLock(self._lockfile) - self.worker_client: Any | None = None - self.prefill_client: Any | None = None - self.etcd_client: Any | None = None - - async def _load_state(self) -> Dict[str, Any]: - """Load state from state file. - - Returns: - State dictionary - """ - if not self.state_file.exists(): - raise FileNotFoundError(f"State file not found: {self.state_file}") - - with self._file_lock: - with open(self.state_file, "r") as f: - return json.load(f) - - async def _save_state(self, state: Dict[str, Any]) -> bool: - """Save state to state file. - - Args: - state: State dictionary to save - - Returns: - True if successful - """ - try: - with self._file_lock: - with open(self.state_file, "w") as f: - json.dump(state, f, indent=2) - return True - except Exception as e: - logger.error(f"Failed to save state: {e}") - return False - - async def _get_available_gpus(self) -> List[str]: - """Get list of unallocated GPU IDs. - - Returns: - List of available GPU IDs - """ - state = await self._load_state() - system_resources = state.get("environment", {}).get("SYSTEM_RESOURCES", {}) - all_gpus = set(str(gpu) for gpu in system_resources.get("gpu_info", [])) - - allocated_gpus: set[str] = set() - for component_info in state.get("components", {}).values(): - resources = component_info.get("resources", {}) - gpu_list = resources.get("allocated_gpus", []) - allocated_gpus.update(str(gpu) for gpu in gpu_list) - - logger.info(f"Allocated GPUs: {allocated_gpus}") - available = sorted(list(all_gpus - allocated_gpus)) - logger.info(f"Available GPUs: {available}") - return available - - async def add_component(self, component_name: str, blocking: bool = True) -> bool: - """ - Add a component. The steps are as follows: - - 1. Load state - 2. Find max suffix to create unique watcher name - 3. Build environment and command for watcher - 4. 
Block until component is running - - Args: - component_name: Name of the component - - Returns: - True if successful - """ - state = await self._load_state() - # Find max suffix - max_suffix = 0 - for watcher_name in state["components"].keys(): - if watcher_name.startswith(f"{self.namespace}_{component_name}_"): - suffix = int( - watcher_name.replace(f"{self.namespace}_{component_name}_", "") - ) - max_suffix = max(max_suffix, suffix) - - watcher_name = f"{self.namespace}_{component_name}_{max_suffix + 1}" - - if component_name not in [ - c.replace(f"{self.namespace}_", "") for c in state["components"] - ]: - raise ValueError( - f"Component {component_name} not found in state configuration" - ) - - # Get base command and config - component_info = state["components"][f"{self.namespace}_{component_name}"] - base_cmd = component_info["cmd"].split("--worker-env")[0].strip() - service_config = state["environment"].get("DYNAMO_SERVICE_CONFIG") - - # Build environment - watcher_env = os.environ.copy() - if component_name in ["VllmWorker", "PrefillWorker"]: - available_gpus = await self._get_available_gpus() - if not available_gpus: - raise ValueError("No GPUs available for allocation") - gpu_id = available_gpus[0] - watcher_env["CUDA_VISIBLE_DEVICES"] = gpu_id - - watcher_env["DYNAMO_SERVICE_CONFIG"] = service_config - - # Build worker env list and command - worker_env_list = [watcher_env] - worker_env_arg = json.dumps(worker_env_list) - # We add a custom component name to ensure that the lease is attached to this specific watcher - full_cmd = f"{base_cmd} --worker-env '{worker_env_arg}' --custom-component-name '{watcher_name}'" - - pre_add_endpoint_ids = await self._count_instance_ids(component_name) - logger.info(f"Pre-add endpoint IDs: {pre_add_endpoint_ids}") - - logger.info(f"Adding watcher {watcher_name}") - success = await self.circus.add_watcher( - name=watcher_name, cmd=full_cmd, env=watcher_env, singleton=True - ) - - if success: - resources = {} - if component_name in ["VllmWorker", "PrefillWorker"]: - resources["allocated_gpus"] = [gpu_id] - - state["components"][watcher_name] = { - "watcher_name": watcher_name, - "cmd": full_cmd, - "resources": resources, - } - await self._save_state(state) - logger.info( - f"Successfully created {watcher_name}. Waiting for worker to start..." - ) - - if blocking: - required_endpoint_ids = pre_add_endpoint_ids + 1 - while True: - current_endpoint_ids = await self._count_instance_ids(component_name) - if current_endpoint_ids == required_endpoint_ids: - break - logger.info( - f"Waiting for {component_name} to start. Current endpoint IDs: {current_endpoint_ids}, Required endpoint IDs: {required_endpoint_ids}" - ) - await asyncio.sleep(5) - - return success - - async def remove_component( - self, component_name: str, blocking: bool = True - ) -> bool: - """ - Remove a component. The initial components are not numbered so we simply remove their resources - and lease but keep the entry in order to use the cmd. This allows us to re-add the component - without having to re-specify the cmd.
For components that have been added, we remove their entry - - Args: - component_name: Name of the component - - Returns: - True if successful - """ - logger.info(f"Attempting to remove component {component_name}") - state = await self._load_state() - matching_components = {} - - base_name = f"{self.namespace}_{component_name}" - base_name_with_underscore = f"{base_name}_" - - for watcher_name in state["components"].keys(): - if watcher_name == base_name: - matching_components[0] = watcher_name - elif watcher_name.startswith(base_name_with_underscore): - suffix = int(watcher_name.replace(base_name_with_underscore, "")) - matching_components[suffix] = watcher_name - - if not matching_components: - logger.error(f"No matching components found for {component_name}") - return False - - highest_suffix = max(matching_components.keys()) - target_watcher = matching_components[highest_suffix] - logger.info(f"Removing watcher {target_watcher}") - - success = await self.circus.remove_watcher( - name=target_watcher, blocking=blocking - ) - if not blocking: - logger.info( - f"Circus remove_watcher for {target_watcher} {'succeeded' if success else 'failed'}" - ) - - if success: - if highest_suffix > 0: # Numbered watcher - remove entire entry - if target_watcher in state["components"]: - del state["components"][target_watcher] - else: # Base watcher - just clear resources and lease - if target_watcher in state["components"]: - state["components"][target_watcher]["resources"] = {} - state["components"][target_watcher]["lease"] = None - await self._save_state(state) - - return success - - async def _count_instance_ids(self, component_name: str) -> int: - """ - Count the instance IDs for the 'generate' endpoint of a given component. - - Args: - component_name: Name of the component - - Returns: - Number of endpoint IDs for a component - """ - if component_name == "VllmWorker": - if self.worker_client is None: - self.worker_client = ( - await self.runtime.namespace(self.namespace) - .component(component_name) - .endpoint("generate") - .client() - ) - worker_ids = self.worker_client.instance_ids() - return len(worker_ids) - elif component_name == "PrefillWorker": - if self.prefill_client is None: - self.prefill_client = ( - await self.runtime.namespace(self.namespace) - .component(component_name) - .endpoint("mock") - .client() - ) - prefill_ids = self.prefill_client.instance_ids() - return len(prefill_ids) - else: - raise ValueError(f"Component {component_name} not supported") - - async def _revoke_lease(self, lease_id: int) -> bool: - """ - Wrapper function around the etcd client to revoke a lease - - Args: - lease_id: Lease ID to revoke - - Returns: - True if successful - """ - if self.etcd_client is None: - self.etcd_client = self.runtime.etcd_client() # type: ignore - try: - await self.etcd_client.revoke_lease(lease_id) - logger.info(f"Revoked lease {lease_id}") - return True - except Exception as e: - logger.error(f"Failed to revoke lease {lease_id}: {e}") - return False - - def __del__(self): - """Cleanup circus controller connection on deletion.""" - if hasattr(self, "circus"): - self.circus.close() diff --git a/components/planner/src/dynamo/planner/planner_sla.py b/components/planner/src/dynamo/planner/planner_sla.py index c43ea52006b..cdbdc0adaa7 100644 --- a/components/planner/src/dynamo/planner/planner_sla.py +++ b/components/planner/src/dynamo/planner/planner_sla.py @@ -20,12 +20,9 @@ from pydantic import BaseModel from dynamo.planner.defaults import SLAPlannerDefaults +from dynamo.runtime 
import DistributedRuntime, dynamo_worker + from dynamo.planner.utils.planner_core import start_sla_planner -from dynamo.runtime.logging import configure_dynamo_logging -from dynamo.sdk import async_on_start, dynamo_context, endpoint, service -from dynamo.sdk.core.protocol.interface import ComponentType -from dynamo.sdk.lib.config import ServiceConfig -from dynamo.sdk.lib.image import DYNAMO_IMAGE logger = logging.getLogger(__name__) @@ -37,80 +34,127 @@ class RequestType(BaseModel): text: str +@dynamo_worker(static=False) +async def init_planner(runtime: DistributedRuntime, args): + + await asyncio.sleep(INIT_PLANNER_START_DELAY) + + await start_sla_planner(runtime, args) + + component = runtime.namespace(args.namespace).component("Planner") + await component.create_service() -@service( - dynamo={ - "namespace": "dynamo", - "component_type": ComponentType.PLANNER, - }, - resources={"cpu": "10", "memory": "20Gi"}, - workers=1, - image=DYNAMO_IMAGE, -) -class Planner: - def __init__(self): - configure_dynamo_logging(service_name="Planner") - logger.info("Starting planner") - self.runtime = dynamo_context["runtime"] - - config = ServiceConfig.get_instance() - - # Get namespace directly from dynamo_context as it contains the active namespace - self.namespace = dynamo_context["namespace"] - config_instance = config.get("Planner", {}) - - self.args = argparse.Namespace( - namespace=self.namespace, - environment=config_instance.get( - "environment", SLAPlannerDefaults.environment - ), - backend=config_instance.get("backend", SLAPlannerDefaults.backend), - no_operation=config_instance.get( - "no-operation", SLAPlannerDefaults.no_operation - ), - log_dir=config_instance.get("log-dir", SLAPlannerDefaults.log_dir), - adjustment_interval=config_instance.get( - "adjustment-interval", SLAPlannerDefaults.adjustment_interval - ), - max_gpu_budget=config_instance.get( - "max-gpu-budget", SLAPlannerDefaults.max_gpu_budget - ), - min_endpoint=config_instance.get( - "min-endpoint", SLAPlannerDefaults.min_endpoint - ), - decode_engine_num_gpu=config_instance.get( - "decode-engine-num-gpu", SLAPlannerDefaults.decode_engine_num_gpu - ), - prefill_engine_num_gpu=config_instance.get( - "prefill-engine-num-gpu", SLAPlannerDefaults.prefill_engine_num_gpu - ), - prometheus_endpoint=config_instance.get( - "prometheus-endpoint", SLAPlannerDefaults.prometheus_endpoint - ), - profile_results_dir=config_instance.get( - "profile-results-dir", SLAPlannerDefaults.profile_results_dir - ), - isl=config_instance.get("isl", SLAPlannerDefaults.isl), - osl=config_instance.get("osl", SLAPlannerDefaults.osl), - ttft=config_instance.get("ttft", SLAPlannerDefaults.ttft), - itl=config_instance.get("itl", SLAPlannerDefaults.itl), - load_predictor=config_instance.get( - "load-predictor", SLAPlannerDefaults.load_predictor - ), - load_prediction_window_size=config_instance.get( - "load-prediction-window-size", - SLAPlannerDefaults.load_prediction_window_size, - ), - ) - - @async_on_start - async def async_init(self): - await asyncio.sleep(INIT_PLANNER_START_DELAY) - logger.info("Calling start_planner") - await start_sla_planner(self.runtime, self.args) - logger.info("Planner started") - - @endpoint() async def generate(self, request: RequestType): """Dummy endpoint to satisfy that each component has an endpoint""" yield "mock endpoint" + + generate_endpoint = component.endpoint("generate") + await generate_endpoint.serve_endpoint(generate) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="SLA Planner") + 
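# Each flag below mirrors a field of SLAPlannerDefaults, so any flag left
+    # unset falls back to the corresponding default in defaults.py.
+    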
parser.add_argument( + "--namespace", + default=SLAPlannerDefaults.namespace, + help="Namespace for the planner" + ) + parser.add_argument( + "--environment", + default=SLAPlannerDefaults.environment, + help="Environment type" + ) + parser.add_argument( + "--backend", + default=SLAPlannerDefaults.backend, + help="Backend type" + ) + parser.add_argument( + "--no-operation", + action="store_true", + default=SLAPlannerDefaults.no_operation, + help="Enable no-operation mode" + ) + parser.add_argument( + "--log-dir", + default=SLAPlannerDefaults.log_dir, + help="Log directory path" + ) + parser.add_argument( + "--adjustment-interval", + type=int, + default=SLAPlannerDefaults.adjustment_interval, + help="Adjustment interval in seconds" + ) + parser.add_argument( + "--max-gpu-budget", + type=int, + default=SLAPlannerDefaults.max_gpu_budget, + help="Maximum GPU budget" + ) + parser.add_argument( + "--min-endpoint", + type=int, + default=SLAPlannerDefaults.min_endpoint, + help="Minimum number of endpoints" + ) + parser.add_argument( + "--decode-engine-num-gpu", + type=int, + default=SLAPlannerDefaults.decode_engine_num_gpu, + help="Number of GPUs for decode engine" + ) + parser.add_argument( + "--prefill-engine-num-gpu", + type=int, + default=SLAPlannerDefaults.prefill_engine_num_gpu, + help="Number of GPUs for prefill engine" + ) + parser.add_argument( + "--prometheus-endpoint", + default=SLAPlannerDefaults.prometheus_endpoint, + help="Prometheus endpoint URL" + ) + parser.add_argument( + "--profile-results-dir", + default=SLAPlannerDefaults.profile_results_dir, + help="Profile results directory" + ) + parser.add_argument( + "--isl", + type=int, + default=SLAPlannerDefaults.isl, + help="Input sequence length" + ) + parser.add_argument( + "--osl", + type=int, + default=SLAPlannerDefaults.osl, + help="Output sequence length" + ) + parser.add_argument( + "--ttft", + type=float, + default=SLAPlannerDefaults.ttft, + help="Time to first token" + ) + parser.add_argument( + "--itl", + type=float, + default=SLAPlannerDefaults.itl, + help="Inter-token latency" + ) + parser.add_argument( + "--load-predictor", + default=SLAPlannerDefaults.load_predictor, + help="Load predictor type" + ) + parser.add_argument( + "--load-prediction-window-size", + type=int, + default=SLAPlannerDefaults.load_prediction_window_size, + help="Load prediction window size" + ) + + args = parser.parse_args() + asyncio.run(init_planner(args)) diff --git a/components/planner/src/dynamo/planner/utils/planner_core.py b/components/planner/src/dynamo/planner/utils/planner_core.py index f6d4de5e063..1bbf5e1bd99 100644 --- a/components/planner/src/dynamo/planner/utils/planner_core.py +++ b/components/planner/src/dynamo/planner/utils/planner_core.py @@ -55,9 +55,7 @@ def __init__(self, runtime: DistributedRuntime, args: argparse.Namespace): self.namespace = args.namespace if not args.no_operation: - if args.environment == "local": - self.connector = LocalConnector(args.namespace, runtime) - elif args.environment == "kubernetes": + if args.environment == "kubernetes": self.connector = KubernetesConnector(args.namespace) else: raise ValueError(f"Invalid environment: {args.environment}") From e726d4316cef18138d69182ba14c4562a4edf32e Mon Sep 17 00:00:00 2001 From: hongkuan Date: Tue, 22 Jul 2025 09:25:39 -0700 Subject: [PATCH 28/58] add choices --- components/planner/src/dynamo/planner/defaults.py | 2 +- components/planner/src/dynamo/planner/planner_sla.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git 
a/components/planner/src/dynamo/planner/defaults.py b/components/planner/src/dynamo/planner/defaults.py index 7ee8d2f85b2..929da9ff004 100644 --- a/components/planner/src/dynamo/planner/defaults.py +++ b/components/planner/src/dynamo/planner/defaults.py @@ -18,7 +18,7 @@ class BasePlannerDefaults: namespace = "dynamo" environment = "kubernetes" - backend = "vllm_v0" + backend = "vllm_v1" no_operation = False log_dir = None adjustment_interval = 180 # in seconds diff --git a/components/planner/src/dynamo/planner/planner_sla.py b/components/planner/src/dynamo/planner/planner_sla.py index cdbdc0adaa7..c7f819b50b4 100644 --- a/components/planner/src/dynamo/planner/planner_sla.py +++ b/components/planner/src/dynamo/planner/planner_sla.py @@ -62,11 +62,13 @@ async def generate(self, request: RequestType): parser.add_argument( "--environment", default=SLAPlannerDefaults.environment, + choices=["kubernetes"], help="Environment type" ) parser.add_argument( "--backend", default=SLAPlannerDefaults.backend, + choices=["vllm_v1"], help="Backend type" ) parser.add_argument( From ff6c4910c2392cf1e7b66272681177b2159ffd42 Mon Sep 17 00:00:00 2001 From: Hannah Zhang Date: Tue, 22 Jul 2025 10:57:45 -0700 Subject: [PATCH 29/58] feat: vllm_v1 -> vllm and remove vllm_v0 from planner --- .../planner/src/dynamo/planner/defaults.py | 14 +--- .../planner/src/dynamo/planner/planner_sla.py | 84 ++++++++----------- 2 files changed, 39 insertions(+), 59 deletions(-) diff --git a/components/planner/src/dynamo/planner/defaults.py b/components/planner/src/dynamo/planner/defaults.py index 929da9ff004..7b4fffdef9b 100644 --- a/components/planner/src/dynamo/planner/defaults.py +++ b/components/planner/src/dynamo/planner/defaults.py @@ -18,7 +18,7 @@ class BasePlannerDefaults: namespace = "dynamo" environment = "kubernetes" - backend = "vllm_v1" + backend = "vllm" no_operation = False log_dir = None adjustment_interval = 180 # in seconds @@ -47,14 +47,7 @@ class SLAPlannerDefaults(BasePlannerDefaults): load_prediction_window_size = 50 # predict load using how many recent load samples -class VllmV0ComponentName: - prefill_worker = "PrefillWorker" - prefill_worker_endpoint = "mock" - decode_worker = "VllmWorker" - decode_worker_endpoint = "generate" - - -class VllmV1ComponentName: +class VllmComponentName: prefill_worker = "VllmPrefillWorker" prefill_worker_endpoint = "generate" decode_worker = "VllmDecodeWorker" @@ -62,6 +55,5 @@ class VllmV1ComponentName: WORKER_COMPONENT_NAMES = { - "vllm_v0": VllmV0ComponentName, - "vllm_v1": VllmV1ComponentName, + "vllm": VllmComponentName, } diff --git a/components/planner/src/dynamo/planner/planner_sla.py b/components/planner/src/dynamo/planner/planner_sla.py index c7f819b50b4..5874d69daa0 100644 --- a/components/planner/src/dynamo/planner/planner_sla.py +++ b/components/planner/src/dynamo/planner/planner_sla.py @@ -20,9 +20,8 @@ from pydantic import BaseModel from dynamo.planner.defaults import SLAPlannerDefaults -from dynamo.runtime import DistributedRuntime, dynamo_worker - from dynamo.planner.utils.planner_core import start_sla_planner +from dynamo.runtime import DistributedRuntime, dynamo_worker logger = logging.getLogger(__name__) @@ -34,9 +33,9 @@ class RequestType(BaseModel): text: str + @dynamo_worker(static=False) async def init_planner(runtime: DistributedRuntime, args): - await asyncio.sleep(INIT_PLANNER_START_DELAY) await start_sla_planner(runtime, args) @@ -55,108 +54,97 @@ async def generate(self, request: RequestType): if __name__ == "__main__": parser = 
argparse.ArgumentParser(description="SLA Planner") parser.add_argument( - "--namespace", + "--namespace", default=SLAPlannerDefaults.namespace, - help="Namespace for the planner" + help="Namespace for the planner", ) parser.add_argument( - "--environment", + "--environment", default=SLAPlannerDefaults.environment, choices=["kubernetes"], - help="Environment type" + help="Environment type", ) parser.add_argument( - "--backend", + "--backend", default=SLAPlannerDefaults.backend, - choices=["vllm_v1"], - help="Backend type" + choices=["vllm"], + help="Backend type", ) parser.add_argument( - "--no-operation", + "--no-operation", action="store_true", default=SLAPlannerDefaults.no_operation, - help="Enable no-operation mode" + help="Enable no-operation mode", ) parser.add_argument( - "--log-dir", - default=SLAPlannerDefaults.log_dir, - help="Log directory path" + "--log-dir", default=SLAPlannerDefaults.log_dir, help="Log directory path" ) parser.add_argument( - "--adjustment-interval", + "--adjustment-interval", type=int, default=SLAPlannerDefaults.adjustment_interval, - help="Adjustment interval in seconds" + help="Adjustment interval in seconds", ) parser.add_argument( - "--max-gpu-budget", + "--max-gpu-budget", type=int, default=SLAPlannerDefaults.max_gpu_budget, - help="Maximum GPU budget" + help="Maximum GPU budget", ) parser.add_argument( - "--min-endpoint", + "--min-endpoint", type=int, default=SLAPlannerDefaults.min_endpoint, - help="Minimum number of endpoints" + help="Minimum number of endpoints", ) parser.add_argument( - "--decode-engine-num-gpu", + "--decode-engine-num-gpu", type=int, default=SLAPlannerDefaults.decode_engine_num_gpu, - help="Number of GPUs for decode engine" + help="Number of GPUs for decode engine", ) parser.add_argument( - "--prefill-engine-num-gpu", + "--prefill-engine-num-gpu", type=int, default=SLAPlannerDefaults.prefill_engine_num_gpu, - help="Number of GPUs for prefill engine" + help="Number of GPUs for prefill engine", ) parser.add_argument( - "--prometheus-endpoint", + "--prometheus-endpoint", default=SLAPlannerDefaults.prometheus_endpoint, - help="Prometheus endpoint URL" + help="Prometheus endpoint URL", ) parser.add_argument( - "--profile-results-dir", + "--profile-results-dir", default=SLAPlannerDefaults.profile_results_dir, - help="Profile results directory" + help="Profile results directory", ) parser.add_argument( - "--isl", - type=int, - default=SLAPlannerDefaults.isl, - help="Input sequence length" + "--isl", type=int, default=SLAPlannerDefaults.isl, help="Input sequence length" ) parser.add_argument( - "--osl", - type=int, - default=SLAPlannerDefaults.osl, - help="Output sequence length" + "--osl", type=int, default=SLAPlannerDefaults.osl, help="Output sequence length" ) parser.add_argument( - "--ttft", + "--ttft", type=float, default=SLAPlannerDefaults.ttft, - help="Time to first token" + help="Time to first token", ) parser.add_argument( - "--itl", - type=float, - default=SLAPlannerDefaults.itl, - help="Inter-token latency" + "--itl", type=float, default=SLAPlannerDefaults.itl, help="Inter-token latency" ) parser.add_argument( - "--load-predictor", + "--load-predictor", default=SLAPlannerDefaults.load_predictor, - help="Load predictor type" + help="Load predictor type", ) parser.add_argument( - "--load-prediction-window-size", + "--load-prediction-window-size", type=int, default=SLAPlannerDefaults.load_prediction_window_size, - help="Load prediction window size" + help="Load prediction window size", ) - + args = parser.parse_args() 
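+    # init_planner is decorated with @dynamo_worker, which constructs the
+    # DistributedRuntime and injects it ahead of the parsed args at call time.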
asyncio.run(init_planner(args)) From 6ebfe73fdb404706d00b89b8167a2fd1351881e9 Mon Sep 17 00:00:00 2001 From: Hannah Zhang Date: Tue, 22 Jul 2025 13:05:59 -0700 Subject: [PATCH 30/58] feat: remove local connector from init --- components/planner/src/dynamo/planner/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/components/planner/src/dynamo/planner/__init__.py b/components/planner/src/dynamo/planner/__init__.py index 09b52a2f156..5bddaa17a29 100644 --- a/components/planner/src/dynamo/planner/__init__.py +++ b/components/planner/src/dynamo/planner/__init__.py @@ -15,7 +15,6 @@ __all__ = [ "CircusController", - "LocalConnector", "PlannerConnector", "KubernetesConnector", "LoadPlannerDefaults", @@ -26,5 +25,4 @@ from dynamo.planner.circusd import CircusController from dynamo.planner.defaults import LoadPlannerDefaults, SLAPlannerDefaults from dynamo.planner.kubernetes_connector import KubernetesConnector -from dynamo.planner.local_connector import LocalConnector from dynamo.planner.planner_connector import PlannerConnector From fb89fc2ae805529651f806274544d16b47a0b116 Mon Sep 17 00:00:00 2001 From: Hannah Zhang Date: Tue, 22 Jul 2025 14:11:55 -0700 Subject: [PATCH 31/58] feat: remove LocalConnector from core --- components/planner/src/dynamo/planner/utils/planner_core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/components/planner/src/dynamo/planner/utils/planner_core.py b/components/planner/src/dynamo/planner/utils/planner_core.py index 1bbf5e1bd99..cc724d22457 100644 --- a/components/planner/src/dynamo/planner/utils/planner_core.py +++ b/components/planner/src/dynamo/planner/utils/planner_core.py @@ -21,7 +21,7 @@ from dataclasses import dataclass from typing import Optional -from dynamo.planner import KubernetesConnector, LocalConnector +from dynamo.planner import KubernetesConnector from dynamo.planner.defaults import WORKER_COMPONENT_NAMES, SLAPlannerDefaults from dynamo.planner.utils.load_predictor import LOAD_PREDICTORS from dynamo.planner.utils.perf_interpolation import ( From 047cecbfbfc4df6bff46f502466fac6035e94272 Mon Sep 17 00:00:00 2001 From: Hannah Zhang Date: Tue, 22 Jul 2025 16:17:30 -0700 Subject: [PATCH 32/58] feat: rework prometheus file for planner deployment --- .../planner/src/dynamo/planner/prometheus.py | 72 ++++----- examples/vllm/deploy/disagg_planner.yaml | 139 +++++++++++++++--- 2 files changed, 156 insertions(+), 55 deletions(-) diff --git a/components/planner/src/dynamo/planner/prometheus.py b/components/planner/src/dynamo/planner/prometheus.py index fc0370e56eb..be583342ce1 100644 --- a/components/planner/src/dynamo/planner/prometheus.py +++ b/components/planner/src/dynamo/planner/prometheus.py @@ -13,55 +13,59 @@ # See the License for the specific language governing permissions and # limitations under the License. 
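+# Rewritten as a standalone dynamo_worker entrypoint: the "Prometheus" section
+# of the service config supplies the scrape configuration, and the worker
+# supervises a prometheus subprocess instead of running as an @service component.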
+import asyncio import logging import subprocess import tempfile import yaml -from dynamo.sdk import service +from dynamo.runtime import DistributedRuntime, dynamo_worker from dynamo.sdk.lib.config import ServiceConfig -from dynamo.sdk.lib.image import DYNAMO_IMAGE logger = logging.getLogger(__name__) -@service( - dynamo={ - "namespace": "dynamo", - }, - workers=1, - image=DYNAMO_IMAGE, -) -class Prometheus: - def __init__(self): - """Initialize Frontend service with HTTP server and model configuration.""" - self.config = ServiceConfig.get_parsed_config("Prometheus") - self.process = None +@dynamo_worker(static=False) +async def worker(runtime: DistributedRuntime): + """Initialize and run Prometheus server with Dynamo config.""" + config = ServiceConfig.get_parsed_config("Prometheus") - logger.info(f"Prometheus config: {self.config}") + logger.info(f"Prometheus config: {config}") - self.start_prometheus_server() + await start_prometheus_server(config) - def start_prometheus_server(self): - logger.info("Starting prometheus server...") - self.temp_file = tempfile.NamedTemporaryFile( - mode="w", suffix=".yml", delete=False - ) - yaml.dump(self.config, self.temp_file) - self.temp_file.close() - config_path = self.temp_file.name +async def start_prometheus_server(config): + logger.info("Starting prometheus server...") - cmd = [ - "prometheus", - f"--config.file={config_path}", - ] + temp_file = tempfile.NamedTemporaryFile(mode="w", suffix=".yml", delete=False) + yaml.dump(config, temp_file) + temp_file.close() + config_path = temp_file.name - logger.info(f"Prometheus cmd: {cmd}") + cmd = [ + "prometheus", + f"--config.file={config_path}", + ] - self.process = subprocess.Popen( - cmd, - stdout=None, - stderr=None, - ) + logger.info(f"Prometheus cmd: {cmd}") + + process = subprocess.Popen( + cmd, + stdout=None, + stderr=None, + ) + + # Keep the worker running + try: + while True: + await asyncio.sleep(1) + if process.poll() is not None: + logger.error("Prometheus process died") + break + except asyncio.CancelledError: + logger.info("Shutting down Prometheus...") + process.terminate() + process.wait() + raise diff --git a/examples/vllm/deploy/disagg_planner.yaml b/examples/vllm/deploy/disagg_planner.yaml index 13641d10e04..9fd672d337e 100644 --- a/examples/vllm/deploy/disagg_planner.yaml +++ b/examples/vllm/deploy/disagg_planner.yaml @@ -15,11 +15,11 @@ apiVersion: nvidia.com/v1alpha1 kind: DynamoGraphDeployment metadata: - name: vllm-v1-disagg-planner + name: vllm-disagg-planner spec: services: Frontend: - dynamoNamespace: vllm-v1-disagg-planner + dynamoNamespace: vllm-disagg-planner componentType: main replicas: 1 livenessProbe: @@ -42,14 +42,14 @@ spec: failureThreshold: 10 resources: requests: - cpu: "1" - memory: "2Gi" + cpu: "2" + memory: "4Gi" limits: - cpu: "1" - memory: "2Gi" + cpu: "2" + memory: "4Gi" extraPodSpec: mainContainer: - image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4 + image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.2 workingDir: /workspace/examples/vllm args: - dynamo @@ -58,8 +58,57 @@ spec: - out=dyn - --http-port - "8000" + Planner: + dynamoNamespace: vllm-disagg-planner + envFromSecret: hf-token-secret + componentType: worker + replicas: 1 + livenessProbe: + exec: + command: + - /bin/sh + - -c + - "exit 0" + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 10 + readinessProbe: + exec: + command: + - /bin/sh + - -c + - "exit 0" + initialDelaySeconds: 60 + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 10 + resources: + requests: 
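+        # the planner is a control loop only; it requests CPU/memory but no GPU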
+ cpu: "2" + memory: "4Gi" + limits: + cpu: "2" + memory: "4Gi" + pvc: + create: false + name: profiling-pvc + mountPoint: /workspace/profiling_results + extraPodSpec: + mainContainer: + image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.2 + workingDir: /workspace/components/planner/src/dynamo/planner + args: + - python + - -m + - planner_sla + - --namespace=vllm-disagg-planner + - --environment=kubernetes + - --backend=vllm + - --adjustment-interval=60 + - --profile-results-dir=/workspace/profiling_results + - --prometheus-endpoint=http://vllm-disagg-planner-prometheus.sla-planner-1.svc.cluster.local:9090 VllmDecodeWorker: - dynamoNamespace: vllm-v1-disagg-planner + dynamoNamespace: vllm-disagg-planner envFromSecret: hf-token-secret componentType: worker replicas: 1 @@ -84,21 +133,69 @@ spec: failureThreshold: 10 resources: requests: - cpu: "10" - memory: "20Gi" + cpu: "32" + memory: "40Gi" gpu: "1" limits: - cpu: "10" - memory: "20Gi" + cpu: "32" + memory: "40Gi" gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4 + image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.2 workingDir: /workspace/examples/vllm args: - - "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log" + - "python3 components/main.py --model deepseek-ai/DeepSeek-R1-Distill-Llama-8B --enforce-eager 2>&1 | tee /tmp/vllm.log" + Prometheus: + dynamoNamespace: vllm-disagg-planner + componentType: worker + replicas: 1 + config: + global: + scrape_interval: 10s + evaluation_interval: 10s + scrape_configs: + - job_name: 'vllm-frontend' + static_configs: + - targets: ['vllm-disagg-planner-frontend:8000'] + metrics_path: /metrics + scrape_interval: 5s + livenessProbe: + exec: + command: + - /bin/sh + - -c + - "exit 0" + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 10 + readinessProbe: + exec: + command: + - /bin/sh + - -c + - "exit 0" + initialDelaySeconds: 30 + periodSeconds: 60 + timeoutSeconds: 30 + failureThreshold: 10 + resources: + requests: + cpu: "200m" + memory: "512Mi" + limits: + cpu: "500m" + memory: "1Gi" + extraPodSpec: + mainContainer: + image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.2 + workingDir: /workspace + args: + - python + - -m + - dynamo.planner.prometheus VllmPrefillWorker: - dynamoNamespace: vllm-v1-disagg-planner + dynamoNamespace: vllm-disagg-planner envFromSecret: hf-token-secret componentType: worker replicas: 1 @@ -123,16 +220,16 @@ spec: failureThreshold: 10 resources: requests: - cpu: "10" - memory: "20Gi" + cpu: "32" + memory: "40Gi" gpu: "1" limits: - cpu: "10" - memory: "20Gi" + cpu: "32" + memory: "40Gi" gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidian/nim-llm-dev/vllm_v1-runtime:dep-216.4 + image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.2 workingDir: /workspace/examples/vllm args: - - "python3 components/main.py --model Qwen/Qwen3-0.6B --enforce-eager --is-prefill-worker 2>&1 | tee /tmp/vllm.log" + - "python3 components/main.py --model deepseek-ai/DeepSeek-R1-Distill-Llama-8B --enforce-eager --is-prefill-worker 2>&1 | tee /tmp/vllm.log" From 0f5082cfbf7ee83f20c6e117d2ea2c50d5cd7386 Mon Sep 17 00:00:00 2001 From: hongkuan Date: Wed, 23 Jul 2025 14:55:05 -0700 Subject: [PATCH 33/58] deprecate old docs --- components/planner/README.md | 111 +---------------------------------- 1 file changed, 1 insertion(+), 110 deletions(-) diff --git a/components/planner/README.md b/components/planner/README.md index 0c28aebddbb..e0409c366f1 100644 --- 
a/components/planner/README.md +++ b/components/planner/README.md @@ -15,113 +15,4 @@ See the License for the specific language governing permissions and limitations under the License. --> -# Planner - -The planner is a component that monitors the state of the system and makes adjustments to the number of workers to ensure that the system is running efficiently. It can dynamically scale prefill/decode workers up and down based on a variety of KV metrics. You can find documentation and benchmarking examples in the [planner docs](../../docs/guides/planner.md). - -## Usage - -After you've deployed a dynamo graph, you can start the planner with the following command: - -```bash -PYTHONPATH=/workspace/examples/llm python components/planner.py --namespace -``` - -## Backends - -1. `local` - uses circus to start/stop worker subprocesses -2. `kubernetes` - uses the kubernetes API to adjust replicas of the DynamoGraphDeployment resource, which automatically scales the corresponding worker pods up or down - -## Local Backend (LocalPlanner) - -The LocalPlanner is built on top of circus, which is what we use to manage component subprocesses when running dynamo serve. LocalPlanner allows the planner component to scale workers up and down based on system metrics. - -**Current limitations** -1. Single node only -2. Workers must be using only a single GPU -3. Your initial deployment must be replicas=1 for both prefill and decode - -We are working on addressing these as fast as possible. - -### Under the Hood - -Circus has a concept of an arbiter and a watcher: -- **Arbiter**: The supervisor process that manages all watchers -- **Watcher**: A process that encodes environment variables, command, name, and other information needed to run a component - -When a service is started, each worker process is spun up as a watcher. For example, when starting a VllmWorker, a watcher is created that looks like: - -```json -{ - "dynamo_VllmWorker": { - "watcher_name": "dynamo_VllmWorker", - "cmd": "/opt/dynamo/venv/bin/python3 -m dynamo.sdk.cli.serve_dynamo graphs.agg_router:Frontend --service-name VllmWorker --worker-id $(CIRCUS.WID) --worker-env [{\"CUDA_VISIBLE_DEVICES\": \"0\"}]", - "resources": { - "allocated_gpus": [ - 0 - ] - }, - "lease": 7587886183172559418 - } -} -``` - -The arbiter exposes an endpoint allowing messages to add/remove/change watchers. The LocalPlanner leverages this functionality to dynamically adjust worker counts. - -### Implementation - -The planner architecture is designed to be simple and extensible: -- An abstract class supports basic add/remove component operations -- This is implemented in `local_connector.py` -- Circus interaction logic is in `circusd.py`, which reads the statefile, connects to the endpoint, and provides add/remove functionality -- Planner starts an instance of `LocalConnector` and uses it to modify the deployment topology - -### Statefile - -The statefile maintains the current state of all running workers and is used by the LocalPlanner to track and modify the deployment. It's stored at `~/.dynamo/state/{namespace}.json` (or in the directory specified by `DYN_LOCAL_STATE_DIR`). The statefile is automatically created when you run dynamo serve and is cleaned up when the arbiter terminates. Each worker is identified as `{namespace}_{component_name}` with an optional numeric suffix for additional instances. 
- -#### Example: Adding and Removing Workers - -Starting with a single decode worker: -```json -{ - "dynamo_VllmWorker": {..., "resources":{...}} -} -``` - -After adding a worker: -```json -{ - "dynamo_VllmWorker": {..., "resources":{...}}, - "dynamo_VllmWorker_1": {..., "resources":{...}} -} -``` - -After removing a worker (removes the highest suffix): -```json -{ - "dynamo_VllmWorker": {..., "resources":{...}} -} -``` - -If scaled to zero, the initial entry is kept without resources to maintain configuration information: -```json -{ - "dynamo_VllmWorker": {...} -} -``` - -### Looking forward - -- Support for a multinode LocalPlanner -- Storing the statefile (and initial configurations) in ETCD using the new `EtcdKvCache`. - -### Testing - -For manual testing, you can use the controller_test.py file to add/remove components after you've run a serve command on a Dynamo pipeline where the planner is linked. - -## Kubernetes Backend - -The Kubernetes backend works by updating the replicas count of the DynamoGraphDeployment custom resource. When the planner determines that workers need to be scaled up or down based on workload metrics, it uses the Kubernetes API to patch the DynamoGraphDeployment resource specification, changing the replicas count for the appropriate worker component. The Kubernetes operator then reconciles this change by creating or terminating the necessary pods. This provides a seamless autoscaling experience in Kubernetes environments without requiring manual intervention. - -The Kubernetes backend will automatically be used by Planner when your pipeline is deployed using a DynamoGraphDeployment CR. By default, the planner will run in no-op mode, which means it will monitor metrics but not take scaling actions. To enable actual scaling, you should also specify `--Planner.no-operation=false`. +Please refer to [planner docs](../../docs/architecture/planner_intro.rst) for planner documentation.
\ No newline at end of file From 33371db3fc6f5968317f667974b7db35cd12420d Mon Sep 17 00:00:00 2001 From: Hannah Zhang Date: Thu, 24 Jul 2025 14:30:16 -0700 Subject: [PATCH 34/58] feat: update prometheus to work --- .../backends/vllm/deploy/disagg_planner.yaml | 99 ++++++++++--------- .../planner/src/dynamo/planner/prometheus.py | 7 ++ 2 files changed, 59 insertions(+), 47 deletions(-) diff --git a/components/backends/vllm/deploy/disagg_planner.yaml b/components/backends/vllm/deploy/disagg_planner.yaml index 8907eb0306f..7b2bbea27c0 100644 --- a/components/backends/vllm/deploy/disagg_planner.yaml +++ b/components/backends/vllm/deploy/disagg_planner.yaml @@ -6,6 +6,9 @@ kind: DynamoGraphDeployment metadata: name: vllm-disagg-planner spec: + envs: + - name: DYNAMO_SERVICE_CONFIG + value: '{"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["localhost:8000"]}]}]}}' services: Frontend: dynamoNamespace: vllm-disagg-planner @@ -38,7 +41,7 @@ spec: memory: "4Gi" extraPodSpec: mainContainer: - image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.2 + image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.5 workingDir: /workspace/examples/vllm args: - dynamo @@ -73,18 +76,18 @@ spec: failureThreshold: 10 resources: requests: - cpu: "2" - memory: "4Gi" + cpu: "1" + memory: "2Gi" limits: - cpu: "2" - memory: "4Gi" + cpu: "1" + memory: "2Gi" pvc: create: false name: profiling-pvc mountPoint: /workspace/profiling_results extraPodSpec: mainContainer: - image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.2 + image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.5 workingDir: /workspace/components/planner/src/dynamo/planner args: - python @@ -95,12 +98,14 @@ spec: - --backend=vllm - --adjustment-interval=60 - --profile-results-dir=/workspace/profiling_results - - --prometheus-endpoint=http://vllm-disagg-planner-prometheus.sla-planner-1.svc.cluster.local:9090 - VllmDecodeWorker: + - --prometheus-endpoint=http://vllm-disagg-planner-prometheus.hzhou-dynamo.svc.cluster.local:9090 + Prometheus: dynamoNamespace: vllm-disagg-planner - envFromSecret: hf-token-secret - componentType: worker + componentType: main replicas: 1 + envs: + - name: PYTHONPATH + value: "/workspace/components/planner/src" livenessProbe: exec: command: @@ -115,40 +120,36 @@ spec: command: - /bin/sh - -c - - 'grep "VllmWorker.*has been initialized" /tmp/vllm.log' - initialDelaySeconds: 60 + - "exit 0" + initialDelaySeconds: 30 periodSeconds: 60 timeoutSeconds: 30 failureThreshold: 10 resources: requests: - cpu: "32" - memory: "40Gi" - gpu: "1" + cpu: "1" + memory: "2Gi" limits: - cpu: "32" - memory: "40Gi" - gpu: "1" + cpu: "1" + memory: "2Gi" extraPodSpec: mainContainer: - image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.2 - workingDir: /workspace/examples/vllm + image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.5 + workingDir: /workspace/components/planner/src + # env: + # - name: DYNAMO_SERVICE_CONFIG + # value: '{"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["localhost:8000"]}]}]}}' + # - name: PYTHONPATH + # value: "/workspace/components/planner/src" args: - - "python3 -m dynamo.vllm --model deepseek-ai/DeepSeek-R1-Distill-Llama-8B --enforce-eager 2>&1 | tee /tmp/vllm.log" - Prometheus: + - python + - -m + - 
dynamo.planner.prometheus + VllmDecodeWorker: dynamoNamespace: vllm-disagg-planner + envFromSecret: hf-token-secret componentType: worker replicas: 1 - config: - global: - scrape_interval: 10s - evaluation_interval: 10s - scrape_configs: - - job_name: 'vllm-frontend' - static_configs: - - targets: ['vllm-disagg-planner-frontend:8000'] - metrics_path: /metrics - scrape_interval: 5s livenessProbe: exec: command: @@ -163,26 +164,28 @@ spec: command: - /bin/sh - -c - - "exit 0" - initialDelaySeconds: 30 + - 'grep "VllmWorker.*has been initialized" /tmp/vllm.log' + initialDelaySeconds: 60 periodSeconds: 60 timeoutSeconds: 30 failureThreshold: 10 resources: requests: - cpu: "200m" - memory: "512Mi" + cpu: "10" + memory: "40Gi" + gpu: "1" limits: - cpu: "500m" - memory: "1Gi" + cpu: "10" + memory: "40Gi" + gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.2 - workingDir: /workspace + image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.5 + workingDir: /workspace/examples/vllm args: - - python - - -m - - dynamo.planner.prometheus + - /bin/sh + - -c + - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log" VllmPrefillWorker: dynamoNamespace: vllm-disagg-planner envFromSecret: hf-token-secret @@ -209,16 +212,18 @@ spec: failureThreshold: 10 resources: requests: - cpu: "32" + cpu: "10" memory: "40Gi" gpu: "1" limits: - cpu: "32" + cpu: "10" memory: "40Gi" gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.2 + image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.5 workingDir: /workspace/examples/vllm args: - - "python3 -m dynamo.vllm --model deepseek-ai/DeepSeek-R1-Distill-Llama-8B --enforce-eager --is-prefill-worker 2>&1 | tee /tmp/vllm.log" + - /bin/sh + - -c + - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --is-prefill-worker 2>&1 | tee /tmp/vllm.log" diff --git a/components/planner/src/dynamo/planner/prometheus.py b/components/planner/src/dynamo/planner/prometheus.py index be583342ce1..0be1d379012 100644 --- a/components/planner/src/dynamo/planner/prometheus.py +++ b/components/planner/src/dynamo/planner/prometheus.py @@ -69,3 +69,10 @@ async def start_prometheus_server(config): process.terminate() process.wait() raise + + +if __name__ == "__main__": + # The dynamo_worker decorator handles runtime setup + import asyncio + + asyncio.run(worker()) From 60dd89d6ab30c404da48f50e45e5dad5cf3ba960 Mon Sep 17 00:00:00 2001 From: Hongkuan Zhou Date: Fri, 25 Jul 2025 09:29:46 -0700 Subject: [PATCH 35/58] feat: k8s connector scaling P/D in one call (#2103) Signed-off-by: Hongkuan Zhou Co-authored-by: hhzhang16 <54051230+hhzhang16@users.noreply.github.com> Co-authored-by: Hannah Zhang --- benchmarks/profiler/utils/config.py | 2 +- .../profiler/utils/dynamo_deployment.py | 15 +- components/planner/src/dynamo/planner/kube.py | 53 +++-- .../dynamo/planner/kubernetes_connector.py | 72 +++++-- .../src/dynamo/planner/utils/planner_core.py | 32 +-- components/planner/test/kube.py | 191 +++++++++++++----- .../planner/test/kubernetes_connector.py | 2 +- 7 files changed, 260 insertions(+), 107 deletions(-) diff --git a/benchmarks/profiler/utils/config.py b/benchmarks/profiler/utils/config.py index 962013af133..0f134e45a0c 100644 --- a/benchmarks/profiler/utils/config.py +++ b/benchmarks/profiler/utils/config.py @@ -42,7 +42,7 @@ def break_arguments(args: list[str]) -> list[str]: return ans -def join_arguments(args: list[str]) -> str: +def 
join_arguments(args: list[str]) -> list[str]: return [" ".join(args)] diff --git a/benchmarks/profiler/utils/dynamo_deployment.py b/benchmarks/profiler/utils/dynamo_deployment.py index 94ada76fab2..c40407ea395 100644 --- a/benchmarks/profiler/utils/dynamo_deployment.py +++ b/benchmarks/profiler/utils/dynamo_deployment.py @@ -17,9 +17,9 @@ import asyncio import time from pathlib import Path -from typing import Optional, Union +from typing import Any, Dict, List, Optional, Union -import aiofiles +import aiofiles # type: ignore[import-untyped] import httpx # added for HTTP requests import kubernetes_asyncio as kubernetes import yaml @@ -61,8 +61,10 @@ def __init__( self.deployment_name = deployment_name self.model_name = model_name self.service_name = service_name or f"{deployment_name}-frontend" - self.components = [] # Will store component names from CR - self.deployment_spec = None # Will store the full deployment spec + self.components: List[str] = [] # Will store component names from CR + self.deployment_spec: Optional[ + Dict[str, Any] + ] = None # Will store the full deployment spec self.base_log_dir = Path(base_log_dir) if base_log_dir else Path("logs") def _init_kubernetes(self): @@ -105,6 +107,11 @@ async def create_deployment(self, deployment: Union[dict, str]): else: self.deployment_spec = deployment + # Ensure deployment_spec is not None + assert ( + self.deployment_spec is not None + ), "deployment_spec should not be None after assignment" + # Extract component names self.components = [ svc.lower() for svc in self.deployment_spec["spec"]["services"].keys() diff --git a/components/planner/src/dynamo/planner/kube.py b/components/planner/src/dynamo/planner/kube.py index b368ffa20b1..127d0392e2a 100644 --- a/components/planner/src/dynamo/planner/kube.py +++ b/components/planner/src/dynamo/planner/kube.py @@ -17,15 +17,19 @@ from typing import Optional from kubernetes import client, config +from kubernetes.config.config_exception import ConfigException class KubernetesAPI: - def __init__(self): + def __init__(self, k8s_namespace: Optional[str] = None): # Load kubernetes configuration - config.load_incluster_config() # for in-cluster deployment + try: + config.load_incluster_config() # for in-cluster deployment + except ConfigException: + config.load_kube_config() # for out-of-cluster deployment self.custom_api = client.CustomObjectsApi() - self.current_namespace = self._get_current_namespace() + self.current_namespace = k8s_namespace or self._get_current_namespace() def _get_current_namespace(self) -> str: """Get the current namespace if running inside a k8s cluster""" @@ -38,6 +42,18 @@ def _get_current_namespace(self) -> str: # Fallback to 'default' if not running in k8s return "default" + def _get_graph_deployment_from_name( + self, graph_deployment_name: str + ) -> Optional[dict]: + """Get the graph deployment from the dynamo graph deployment name""" + return self.custom_api.get_namespaced_custom_object( + group="nvidia.com", + version="v1alpha1", + namespace=self.current_namespace, + plural="dynamographdeployments", + name=graph_deployment_name, + ) + async def get_graph_deployment( self, component_name: str, dynamo_namespace: str ) -> Optional[dict]: @@ -98,12 +114,8 @@ async def get_graph_deployment( if not graph_deployment_name: return None - graph_deployment = self.custom_api.get_namespaced_custom_object( - group="nvidia.com", - version="v1alpha1", - namespace=self.current_namespace, - plural="dynamographdeployments", - name=graph_deployment_name, + graph_deployment = 
self._get_graph_deployment_from_name( + graph_deployment_name ) return graph_deployment @@ -127,19 +139,36 @@ async def update_graph_replicas( body=patch, ) + async def is_deployment_ready(self, graph_deployment_name: str) -> bool: + """Check if a graph deployment is ready""" + + graph_deployment = self._get_graph_deployment_from_name(graph_deployment_name) + + if not graph_deployment: + raise ValueError(f"Graph deployment {graph_deployment_name} not found") + + conditions = graph_deployment.get("status", {}).get("conditions", []) + ready_condition = next( + (c for c in conditions if c.get("type") == "Ready"), None + ) + + return ready_condition is not None and ready_condition.get("status") == "True" + async def wait_for_graph_deployment_ready( self, graph_deployment_name: str, - max_attempts: int = 60, # default: 10 minutes total + max_attempts: int = 180, # default: 30 minutes total delay_seconds: int = 10, # default: check every 10 seconds ) -> None: """Wait for a graph deployment to be ready""" for attempt in range(max_attempts): await asyncio.sleep(delay_seconds) - graph_deployment = await self.get_graph_deployment( - graph_deployment_name, self.current_namespace + + graph_deployment = self._get_graph_deployment_from_name( + graph_deployment_name ) + if not graph_deployment: raise ValueError(f"Graph deployment {graph_deployment_name} not found") diff --git a/components/planner/src/dynamo/planner/kubernetes_connector.py b/components/planner/src/dynamo/planner/kubernetes_connector.py index e089d9a83fb..a70e71e8416 100644 --- a/components/planner/src/dynamo/planner/kubernetes_connector.py +++ b/components/planner/src/dynamo/planner/kubernetes_connector.py @@ -13,24 +13,33 @@ # See the License for the specific language governing permissions and # limitations under the License. 
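+# Connector used by the planner to scale workers: it resolves the owning
+# DynamoGraphDeployment and patches per-component replica counts through kube.py.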
-from .kube import KubernetesAPI -from .planner_connector import PlannerConnector +import logging +from typing import Optional + +from dynamo.planner.kube import KubernetesAPI +from dynamo.planner.planner_connector import PlannerConnector +from dynamo.runtime.logging import configure_dynamo_logging + +configure_dynamo_logging() +logger = logging.getLogger(__name__) class KubernetesConnector(PlannerConnector): - def __init__(self, namespace: str): - self.kube_api = KubernetesAPI() - self.namespace = namespace + def __init__(self, dynamo_namespace: str, k8s_namespace: Optional[str] = None): + self.kube_api = KubernetesAPI(k8s_namespace) + self.dynamo_namespace = dynamo_namespace async def add_component(self, component_name: str, blocking: bool = True): """Add a component by increasing its replica count by 1""" + deployment = await self.kube_api.get_graph_deployment( - component_name, self.namespace + component_name, self.dynamo_namespace ) if deployment is None: raise ValueError( - f"Graph not found for component {component_name} in dynamo namespace {self.namespace}" + f"Graph not found for component {component_name} in dynamo namespace {self.dynamo_namespace}" ) + # get current replicas or 1 if not found current_replicas = self._get_current_replicas(deployment, component_name) await self.kube_api.update_graph_replicas( @@ -45,13 +54,15 @@ async def add_component(self, component_name: str, blocking: bool = True): async def remove_component(self, component_name: str, blocking: bool = True): """Remove a component by decreasing its replica count by 1""" + deployment = await self.kube_api.get_graph_deployment( - component_name, self.namespace + component_name, self.dynamo_namespace ) if deployment is None: raise ValueError( - f"Graph {component_name} not found for namespace {self.namespace}" + f"Graph {component_name} not found for namespace {self.dynamo_namespace}" ) + # get current replicas or 1 if not found current_replicas = self._get_current_replicas(deployment, component_name) if current_replicas > 0: @@ -65,6 +76,39 @@ async def remove_component(self, component_name: str, blocking: bool = True): self._get_graph_deployment_name(deployment) ) + async def set_component_replicas( + self, target_replicas: dict[str, int], blocking: bool = True + ): + """Set the replicas for multiple components at once""" + + deployment = await self.kube_api.get_graph_deployment( + next(iter(target_replicas)), self.dynamo_namespace + ) + if deployment is None: + raise ValueError( + f"Graph {next(iter(target_replicas))} not found for namespace {self.dynamo_namespace}" + ) + + if not await self.kube_api.is_deployment_ready( + self._get_graph_deployment_name(deployment) + ): + logger.warning( + f"Deployment {self._get_graph_deployment_name(deployment)} is not ready, ignoring this scaling" + ) + return + + for component_name, replicas in target_replicas.items(): + await self.kube_api.update_graph_replicas( + self._get_graph_deployment_name(deployment), + component_name, + replicas, + ) + + if blocking: + await self.kube_api.wait_for_graph_deployment_ready( + self._get_graph_deployment_name(deployment) + ) + def _get_current_replicas(self, deployment: dict, component_name: str) -> int: """Get the current replicas for a component in a graph deployment""" return ( @@ -78,20 +122,22 @@ def _get_graph_deployment_name(self, deployment: dict) -> str: """Get the name of the graph deployment""" return deployment["metadata"]["name"] + if __name__ == "__main__": import argparse import asyncio - + parser = 
argparse.ArgumentParser() - parser.add_argument("--namespace", type=str, default="dynamo") + parser.add_argument("--dynamo_namespace", type=str, default="dynamo") + parser.add_argument("--k8s_namespace", type=str, default="default") parser.add_argument("--action", type=str, choices=["add", "remove"]) parser.add_argument("--component", type=str, default="planner") parser.add_argument("--blocking", action="store_true") args = parser.parse_args() - connector = KubernetesConnector(args.namespace) + connector = KubernetesConnector(args.dynamo_namespace, args.k8s_namespace) if args.action == "add": task = connector.add_component(args.component, args.blocking) elif args.action == "remove": task = connector.remove_component(args.component, args.blocking) - asyncio.run(task) \ No newline at end of file + asyncio.run(task) diff --git a/components/planner/src/dynamo/planner/utils/planner_core.py b/components/planner/src/dynamo/planner/utils/planner_core.py index cc724d22457..5b44b70f404 100644 --- a/components/planner/src/dynamo/planner/utils/planner_core.py +++ b/components/planner/src/dynamo/planner/utils/planner_core.py @@ -270,33 +270,11 @@ async def make_adjustments(self): return if not self.args.no_operation: - # scale up/down the number of prefill/decode non-blockingly - # TODO: add a check to avoid scaling before the previous scaling is completed - if next_num_p > len(self.p_endpoints): - for _ in range(next_num_p - len(self.p_endpoints)): - self.connector.add_component( - WORKER_COMPONENT_NAMES[self.args.backend].prefill_worker, - blocking=False, - ) - elif next_num_p < len(self.p_endpoints): - for _ in range(len(self.p_endpoints) - next_num_p): - self.connector.remove_component( - WORKER_COMPONENT_NAMES[self.args.backend].prefill_worker, - blocking=False, - ) - - if next_num_d > len(self.d_endpoints): - for _ in range(next_num_d - len(self.d_endpoints)): - self.connector.add_component( - WORKER_COMPONENT_NAMES[self.args.backend].decode_worker, - blocking=False, - ) - elif next_num_d < len(self.d_endpoints): - for _ in range(len(self.d_endpoints) - next_num_d): - self.connector.remove_component( - WORKER_COMPONENT_NAMES[self.args.backend].decode_worker, - blocking=False, - ) + target_replicas = { + WORKER_COMPONENT_NAMES[self.args.backend].prefill_worker: next_num_p, + WORKER_COMPONENT_NAMES[self.args.backend].decode_worker: next_num_d, + } + self.connector.set_component_replicas(target_replicas, blocking=False) async def run(self): """Main loop for the planner""" diff --git a/components/planner/test/kube.py b/components/planner/test/kube.py index 2a2a1243453..0f2f1bc04f4 100644 --- a/components/planner/test/kube.py +++ b/components/planner/test/kube.py @@ -14,7 +14,7 @@ # limitations under the License. 
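+# The tests below patch KubernetesAPI helpers on the instance with patch.object
+# rather than assigning AsyncMock attributes, matching the refactored API.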
from typing import Any, Dict -from unittest.mock import AsyncMock, MagicMock, patch +from unittest.mock import MagicMock, patch import pytest @@ -39,9 +39,45 @@ def k8s_api(mock_custom_api, mock_config): return KubernetesAPI() +@pytest.fixture +def k8s_api_with_namespace(mock_custom_api, mock_config): + return KubernetesAPI(k8s_namespace="test-namespace") + + +def test_kubernetes_api_init_with_namespace(mock_custom_api, mock_config): + """Test KubernetesAPI initialization with custom namespace""" + api = KubernetesAPI(k8s_namespace="custom-namespace") + assert api.current_namespace == "custom-namespace" + + +def test_kubernetes_api_init_without_namespace(mock_custom_api, mock_config): + """Test KubernetesAPI initialization without custom namespace""" + api = KubernetesAPI() + # Should use the default namespace logic + assert api.current_namespace == "default" + + +def test_get_graph_deployment_from_name(k8s_api, mock_custom_api): + """Test _get_graph_deployment_from_name method""" + mock_deployment = {"metadata": {"name": "test-deployment"}} + mock_custom_api.get_namespaced_custom_object.return_value = mock_deployment + + result = k8s_api._get_graph_deployment_from_name("test-deployment") + + assert result == mock_deployment + mock_custom_api.get_namespaced_custom_object.assert_called_once_with( + group="nvidia.com", + version="v1alpha1", + namespace=k8s_api.current_namespace, + plural="dynamographdeployments", + name="test-deployment", + ) + + @pytest.mark.asyncio -async def test_wait_for_graph_deployment_ready_success(k8s_api, mock_custom_api): - # Mock the get_graph_deployment response +async def test_is_deployment_ready_true(k8s_api, mock_custom_api): + """Test is_deployment_ready method when deployment is ready""" + # Mock the _get_graph_deployment_from_name response mock_deployment: Dict[str, Any] = { "status": { "conditions": [ @@ -49,22 +85,18 @@ async def test_wait_for_graph_deployment_ready_success(k8s_api, mock_custom_api) ] } } - k8s_api.get_graph_deployment = AsyncMock(return_value=mock_deployment) - - # Test with minimal attempts and delay for faster testing - await k8s_api.wait_for_graph_deployment_ready( - "test-deployment", max_attempts=2, delay_seconds=0.1 - ) - # Verify get_graph_deployment was called - k8s_api.get_graph_deployment.assert_called_once_with( - "test-deployment", k8s_api.current_namespace - ) + # Mock the method on the instance + with patch.object( + k8s_api, "_get_graph_deployment_from_name", return_value=mock_deployment + ): + result = await k8s_api.is_deployment_ready("test-deployment") + assert result is True @pytest.mark.asyncio -async def test_wait_for_graph_deployment_ready_timeout(k8s_api, mock_custom_api): - # Mock the get_graph_deployment response with not ready status +async def test_is_deployment_ready_false(k8s_api, mock_custom_api): + """Test is_deployment_ready method when deployment is not ready""" mock_deployment: Dict[str, Any] = { "status": { "conditions": [ @@ -76,54 +108,115 @@ async def test_wait_for_graph_deployment_ready_timeout(k8s_api, mock_custom_api) ] } } - k8s_api.get_graph_deployment = AsyncMock(return_value=mock_deployment) - # Test with minimal attempts and delay for faster testing - with pytest.raises(TimeoutError) as exc_info: - await k8s_api.wait_for_graph_deployment_ready( - "test-deployment", max_attempts=2, delay_seconds=0.1 - ) + # Mock the method on the instance + with patch.object( + k8s_api, "_get_graph_deployment_from_name", return_value=mock_deployment + ): + result = await 
k8s_api.is_deployment_ready("test-deployment") + assert result is False - assert "is not ready after" in str(exc_info.value) - assert k8s_api.get_graph_deployment.call_count == 2 + +@pytest.mark.asyncio +async def test_is_deployment_ready_not_found(k8s_api, mock_custom_api): + """Test is_deployment_ready method when deployment is not found""" + # Mock the method on the instance + with patch.object(k8s_api, "_get_graph_deployment_from_name", return_value=None): + with pytest.raises(ValueError) as exc_info: + await k8s_api.is_deployment_ready("test-deployment") + + assert "not found" in str(exc_info.value) @pytest.mark.asyncio -async def test_wait_for_graph_deployment_not_found(k8s_api, mock_custom_api): - # Mock the get_graph_deployment response to return None - k8s_api.get_graph_deployment = AsyncMock(return_value=None) +async def test_wait_for_graph_deployment_ready_success(k8s_api, mock_custom_api): + """Test wait_for_graph_deployment_ready when deployment becomes ready""" + # Mock the _get_graph_deployment_from_name response + mock_deployment: Dict[str, Any] = { + "status": { + "conditions": [ + {"type": "Ready", "status": "True", "message": "Deployment is ready"} + ] + } + } - # Test with minimal attempts and delay for faster testing - with pytest.raises(ValueError) as exc_info: + # Mock the method on the instance + with patch.object( + k8s_api, "_get_graph_deployment_from_name", return_value=mock_deployment + ): + # Test with minimal attempts and delay for faster testing await k8s_api.wait_for_graph_deployment_ready( "test-deployment", max_attempts=2, delay_seconds=0.1 ) - assert "not found" in str(exc_info.value) - assert k8s_api.get_graph_deployment.call_count == 1 + +@pytest.mark.asyncio +async def test_wait_for_graph_deployment_ready_timeout(k8s_api, mock_custom_api): + """Test wait_for_graph_deployment_ready when deployment times out""" + # Mock the _get_graph_deployment_from_name response with not ready status + mock_deployment: Dict[str, Any] = { + "status": { + "conditions": [ + { + "type": "Ready", + "status": "False", + "message": "Deployment is not ready", + } + ] + } + } + + # Mock the method on the instance + with patch.object( + k8s_api, "_get_graph_deployment_from_name", return_value=mock_deployment + ): + # Test with minimal attempts and delay for faster testing + with pytest.raises(TimeoutError) as exc_info: + await k8s_api.wait_for_graph_deployment_ready( + "test-deployment", max_attempts=2, delay_seconds=0.1 + ) + + assert "is not ready after" in str(exc_info.value) + + +@pytest.mark.asyncio +async def test_wait_for_graph_deployment_not_found(k8s_api, mock_custom_api): + """Test wait_for_graph_deployment_ready when deployment is not found""" + # Mock the _get_graph_deployment_from_name response to return None + with patch.object(k8s_api, "_get_graph_deployment_from_name", return_value=None): + # Test with minimal attempts and delay for faster testing + with pytest.raises(ValueError) as exc_info: + await k8s_api.wait_for_graph_deployment_ready( + "test-deployment", max_attempts=2, delay_seconds=0.1 + ) + + assert "not found" in str(exc_info.value) @pytest.mark.asyncio async def test_wait_for_graph_deployment_no_conditions(k8s_api, mock_custom_api): - # Mock the get_graph_deployment response with no conditions + """Test wait_for_graph_deployment_ready when deployment has no conditions""" + # Mock the _get_graph_deployment_from_name response with no conditions mock_deployment: Dict[str, Any] = {"status": {}} - k8s_api.get_graph_deployment = 
AsyncMock(return_value=mock_deployment) - # Test with minimal attempts and delay for faster testing - with pytest.raises(TimeoutError) as exc_info: - await k8s_api.wait_for_graph_deployment_ready( - "test-deployment", max_attempts=2, delay_seconds=0.1 - ) + with patch.object( + k8s_api, "_get_graph_deployment_from_name", return_value=mock_deployment + ): + # Test with minimal attempts and delay for faster testing + with pytest.raises(TimeoutError) as exc_info: + await k8s_api.wait_for_graph_deployment_ready( + "test-deployment", max_attempts=2, delay_seconds=0.1 + ) - assert "is not ready after" in str(exc_info.value) - assert k8s_api.get_graph_deployment.call_count == 2 + assert "is not ready after" in str(exc_info.value) @pytest.mark.asyncio async def test_wait_for_graph_deployment_ready_on_second_attempt( k8s_api, mock_custom_api ): - # Mock the get_graph_deployment response to return not ready first, then ready + """Test wait_for_graph_deployment_ready when deployment becomes ready on second attempt""" + # Mock the _get_graph_deployment_from_name response to return not ready first, then ready mock_deployment_not_ready: Dict[str, Any] = { "status": { "conditions": [ @@ -142,13 +235,13 @@ async def test_wait_for_graph_deployment_ready_on_second_attempt( ] } } - k8s_api.get_graph_deployment = AsyncMock( - side_effect=[mock_deployment_not_ready, mock_deployment_ready] - ) - - # Test with minimal attempts and delay for faster testing - await k8s_api.wait_for_graph_deployment_ready( - "test-deployment", max_attempts=2, delay_seconds=0.1 - ) - assert k8s_api.get_graph_deployment.call_count == 2 + with patch.object( + k8s_api, + "_get_graph_deployment_from_name", + side_effect=[mock_deployment_not_ready, mock_deployment_ready], + ): + # Test with minimal attempts and delay for faster testing + await k8s_api.wait_for_graph_deployment_ready( + "test-deployment", max_attempts=2, delay_seconds=0.1 + ) diff --git a/components/planner/test/kubernetes_connector.py b/components/planner/test/kubernetes_connector.py index 318cb40954d..c795248bc07 100644 --- a/components/planner/test/kubernetes_connector.py +++ b/components/planner/test/kubernetes_connector.py @@ -63,7 +63,7 @@ async def test_add_component_increases_replicas(kubernetes_connector, mock_kube_ # Assert mock_kube_api.get_graph_deployment.assert_called_once_with( - component_name, kubernetes_connector.namespace + component_name, kubernetes_connector.dynamo_namespace ) mock_kube_api.update_graph_replicas.assert_called_once_with( "test-graph", component_name, 2 From 41f1ca0d0c5056c6ff848ff2db62474362c6128a Mon Sep 17 00:00:00 2001 From: Hannah Zhang Date: Fri, 25 Jul 2025 11:50:35 -0700 Subject: [PATCH 36/58] fix: vllm_v1 -> vllm --- benchmarks/profiler/profile_sla.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/profiler/profile_sla.py b/benchmarks/profiler/profile_sla.py index 3ea2f0ceb95..cc4404956bd 100644 --- a/benchmarks/profiler/profile_sla.py +++ b/benchmarks/profiler/profile_sla.py @@ -589,9 +589,9 @@ async def run_profile(args): parser.add_argument( "--backend", type=str, - default="vllm_v1", - choices=["vllm_v1"], - help="backend type, currently support [vllm_v1]", + default="vllm", + choices=["vllm"], + help="backend type, currently support [vllm]", ) parser.add_argument( "--config", From 1584cd0176f6f3565a41a0dfd0673ec1ffbf7080 Mon Sep 17 00:00:00 2001 From: Hannah Zhang Date: Fri, 25 Jul 2025 11:52:18 -0700 Subject: [PATCH 37/58] feat: remove unneeded files --- 
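With the `--backend` rename in the previous patch, a profiler run now uses `vllm` instead of `vllm_v1`; a sketch assuming the flags and paths from the README that is removed below:

```bash
cd benchmarks/profiler
python -m profile_sla \
  --backend vllm \
  --config ../../examples/vllm/deploy/disagg.yaml \
  --namespace $NAMESPACE
```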
.../vllm/deploy/profile_sla_binding.yaml | 27 ---------------- .../vllm/deploy/profile_sla_rbac.yaml | 31 ------------------- .../backends/vllm/deploy/profile_sla_sa.yaml | 21 ------------- 3 files changed, 79 deletions(-) delete mode 100644 components/backends/vllm/deploy/profile_sla_binding.yaml delete mode 100644 components/backends/vllm/deploy/profile_sla_rbac.yaml delete mode 100644 components/backends/vllm/deploy/profile_sla_sa.yaml diff --git a/components/backends/vllm/deploy/profile_sla_binding.yaml b/components/backends/vllm/deploy/profile_sla_binding.yaml deleted file mode 100644 index 6743dd4c52e..00000000000 --- a/components/backends/vllm/deploy/profile_sla_binding.yaml +++ /dev/null @@ -1,27 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: profile-sla-binding - namespace: ${NAMESPACE} -subjects: -- kind: ServiceAccount - name: profile-sla-sa - namespace: ${NAMESPACE} -roleRef: - kind: Role - name: profile-sla-role - apiGroup: rbac.authorization.k8s.io diff --git a/components/backends/vllm/deploy/profile_sla_rbac.yaml b/components/backends/vllm/deploy/profile_sla_rbac.yaml deleted file mode 100644 index 65494d9a389..00000000000 --- a/components/backends/vllm/deploy/profile_sla_rbac.yaml +++ /dev/null @@ -1,31 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: profile-sla-role - namespace: ${NAMESPACE} -rules: - # DynamoGraphDeployment custom resources - needed for create/get/delete operations - - apiGroups: ["nvidia.com"] - resources: ["dynamographdeployments"] - verbs: ["get", "create", "delete"] - # Pods - needed for listing pods by label selector and getting logs - - apiGroups: [""] - resources: ["pods"] - verbs: ["list"] - - apiGroups: [""] - resources: ["pods/log"] - verbs: ["get"] diff --git a/components/backends/vllm/deploy/profile_sla_sa.yaml b/components/backends/vllm/deploy/profile_sla_sa.yaml deleted file mode 100644 index e918a7d275c..00000000000 --- a/components/backends/vllm/deploy/profile_sla_sa.yaml +++ /dev/null @@ -1,21 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -apiVersion: v1 -kind: ServiceAccount -metadata: - name: profile-sla-sa - namespace: ${NAMESPACE} -imagePullSecrets: - - name: nvcr-imagepullsecret From dd3f161e572aaa557cf8fa9de42b3b5e15505307 Mon Sep 17 00:00:00 2001 From: Hannah Zhang Date: Fri, 25 Jul 2025 11:52:34 -0700 Subject: [PATCH 38/58] docs: update docs --- benchmarks/profiler/README.md | 11 +---------- docs/architecture/sla_planner.md | 10 +++++----- 2 files changed, 6 insertions(+), 15 deletions(-) mode change 100644 => 120000 benchmarks/profiler/README.md diff --git a/benchmarks/profiler/README.md b/benchmarks/profiler/README.md deleted file mode 100644 index 03a6c166faf..00000000000 --- a/benchmarks/profiler/README.md +++ /dev/null @@ -1,10 +0,0 @@ -# Profiler - -## Setup - -From within the dynamo container: -```bash -./k8s.sh # install binaries, auth into aks cluster -cd benchmarks/profiler -python -m profile_sla --config ../../examples/vllm/deploy/disagg.yaml --namespace mo-dyn-cloud # run the profiler -``` \ No newline at end of file diff --git a/benchmarks/profiler/README.md b/benchmarks/profiler/README.md new file mode 120000 index 00000000000..30861f2786b --- /dev/null +++ b/benchmarks/profiler/README.md @@ -0,0 +1 @@ +../../docs/architecture/pre_deployment_profiling.md \ No newline at end of file diff --git a/docs/architecture/sla_planner.md b/docs/architecture/sla_planner.md index 1fc71373921..fb5876a0229 100644 --- a/docs/architecture/sla_planner.md +++ b/docs/architecture/sla_planner.md @@ -108,9 +108,9 @@ Finally, SLA planner applies the change by scaling up/down the number of prefill To deploy SLA-planner, ensure etcd and NATS are running first, then use the frontend that reports metrics at `/metrics` HTTP endpoint. You can also use your own frontend, but it must report number of requests, ISL, OSL, TTFT, ITL in the same format. -SLA-planner and prometheus server are provided as common components that can be directly imported from `dynamo` package. The following changes are needed: -- Add `Planner` and `Prometheus` components' dependency in `Frontend`. -- Link `Planner` and `Prometheus` in the graph. -- Add `Planner` and `Prometheus` configurations in the config file. +SLA-planner and prometheus server are provided as common components that can be directly imported from `dynamo` package. -The SLA planner integration with the new frontend + worker architecture is currently a work in progress. This documentation will be updated with the new deployment patterns and code examples once the SLA planner component has been fully adapted to the new workflow. 
\ No newline at end of file
+```bash
+cd components/backends/vllm/deploy
+kubectl apply -f disagg_planner.yaml -n ${NAMESPACE}
+```

From 97f3f88723d9714acefe439cdc178c89cde03aed Mon Sep 17 00:00:00 2001
From: Hannah Zhang
Date: Fri, 25 Jul 2025 14:55:04 -0700
Subject: [PATCH 39/58] fix: vllm config in profiler

---
 benchmarks/profiler/utils/config.py | 36 ++++++++++++++---------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/benchmarks/profiler/utils/config.py b/benchmarks/profiler/utils/config.py
index d1886167383..17ec95c6cc2 100644
--- a/benchmarks/profiler/utils/config.py
+++ b/benchmarks/profiler/utils/config.py
@@ -80,7 +80,7 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d
         config = deepcopy(config)
 
         # set metadata name
-        config["metadata"]["name"] = "vllm-v1-agg"
+        config["metadata"]["name"] = "vllm-agg"
 
         # disable planner
         if "Planner" in config["spec"]["services"]:
@@ -89,16 +89,16 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d
         if target == "prefill":
             # convert prefill worker into decode worker
             config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker
             ] = config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker
+                WORKER_COMPONENT_NAMES["vllm"].prefill_worker
             ]
             del config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker
+                WORKER_COMPONENT_NAMES["vllm"].prefill_worker
             ]
 
             args = config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker
             ]["extraPodSpec"]["mainContainer"]["args"]
 
             args = break_arguments(args)
@@ -112,18 +112,18 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d
             if "--no-enable-prefix-caching" not in args:
                 args = append_argument(args, "--no-enable-prefix-caching")
 
-            config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][
+            config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
                 "extraPodSpec"
             ]["mainContainer"]["args"] = join_arguments(args)
         elif target == "decode":
             # delete prefill worker
             del config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm_v1"].prefill_worker
+                WORKER_COMPONENT_NAMES["vllm"].prefill_worker
             ]
 
             args = config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker
             ]["extraPodSpec"]["mainContainer"]["args"]
 
             args = break_arguments(args)
@@ -134,13 +134,13 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d
             if "--no-enable-prefix-caching" in args:
                 args.remove("--no-enable-prefix-caching")
 
-            config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][
+            config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
                 "extraPodSpec"
             ]["mainContainer"]["args"] = join_arguments(args)
 
         # set num workers to 1
         decode_worker_config = config["spec"]["services"][
-            WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker
+            WORKER_COMPONENT_NAMES["vllm"].decode_worker
         ]
         decode_worker_config["replicas"] = 1
 
@@ -150,16 +150,16 @@ def set_config_tp_size(cls, config: dict, tp_size: int):
         config = deepcopy(config)
 
-        config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][
+        config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
            "resources"
         ]["requests"]["gpu"] = str(tp_size)
-        
config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][ + config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][ "resources" ]["limits"]["gpu"] = str(tp_size) - args = config["spec"]["services"][ - WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker - ]["extraPodSpec"]["mainContainer"]["args"] + args = config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][ + "extraPodSpec" + ]["mainContainer"]["args"] args = break_arguments(args) @@ -169,7 +169,7 @@ def set_config_tp_size(cls, config: dict, tp_size: int): except ValueError: args = append_argument(args, ["--tensor-parallel-size", str(tp_size)]) - config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker][ + config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][ "extraPodSpec" ]["mainContainer"]["args"] = join_arguments(args) @@ -177,7 +177,7 @@ def set_config_tp_size(cls, config: dict, tp_size: int): @classmethod def get_model_name(cls, config: dict) -> str: - worker_name = WORKER_COMPONENT_NAMES["vllm_v1"].decode_worker + worker_name = WORKER_COMPONENT_NAMES["vllm"].decode_worker args = config["spec"]["services"][worker_name]["extraPodSpec"]["mainContainer"][ "args" ] @@ -232,5 +232,5 @@ def get_kv_cache_size_from_dynamo_log(cls, dynamo_log_fn: str) -> int: CONFIG_MODIFIERS = { - "vllm_v1": VllmV1ConfigModifier, + "vllm": VllmV1ConfigModifier, } From 9b34ee9520e4e3423e40e9b07d4a2c5181055cec Mon Sep 17 00:00:00 2001 From: Hannah Zhang Date: Fri, 25 Jul 2025 15:41:20 -0700 Subject: [PATCH 40/58] feat: wip but tentatively working planner, with documentation --- .../backends/vllm/deploy/disagg_planner.yaml | 44 +++--- .../backends/vllm/src/dynamo/vllm/args.py | 64 ++++++--- .../planner/src/dynamo/planner/defaults.py | 4 +- .../planner/src/dynamo/planner/prometheus.py | 1 + .../src/dynamo/planner/utils/planner_core.py | 11 +- docs/architecture/sla_planner.md | 8 +- .../dynamo_deploy/sla_planner_deployment.md | 136 ++++++++++++++++++ 7 files changed, 221 insertions(+), 47 deletions(-) create mode 100644 docs/guides/dynamo_deploy/sla_planner_deployment.md diff --git a/components/backends/vllm/deploy/disagg_planner.yaml b/components/backends/vllm/deploy/disagg_planner.yaml index 5c2f2474c67..ff77cecb39d 100644 --- a/components/backends/vllm/deploy/disagg_planner.yaml +++ b/components/backends/vllm/deploy/disagg_planner.yaml @@ -8,7 +8,7 @@ metadata: spec: envs: - name: DYNAMO_SERVICE_CONFIG - value: '{"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["localhost:8000"]}]}]}}' + value: '{"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["vllm-disagg-planner-frontend:8000"]}]}]}}' services: Frontend: dynamoNamespace: vllm-disagg-planner @@ -34,22 +34,20 @@ spec: failureThreshold: 10 resources: requests: - cpu: "2" - memory: "4Gi" + cpu: "4" + memory: "16Gi" limits: - cpu: "2" - memory: "4Gi" + cpu: "4" + memory: "16Gi" extraPodSpec: mainContainer: - image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.5 - workingDir: /workspace/examples/vllm + image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.11 + workingDir: /workspace/components/backends/vllm + command: + - /bin/sh + - -c args: - - dynamo - - run - - in=http - - out=dyn - - --http-port - - "8000" + - "python3 -m 
dynamo.frontend --http-port 8000" Planner: dynamoNamespace: vllm-disagg-planner envFromSecret: hf-token-secret @@ -87,7 +85,7 @@ spec: mountPoint: /workspace/profiling_results extraPodSpec: mainContainer: - image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.5 + image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.11 workingDir: /workspace/components/planner/src/dynamo/planner args: - python @@ -106,6 +104,8 @@ spec: envs: - name: PYTHONPATH value: "/workspace/components/planner/src" + - name: DYNAMO_PORT + value: "9090" livenessProbe: exec: command: @@ -134,15 +134,13 @@ spec: memory: "2Gi" extraPodSpec: mainContainer: - image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17 + image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.11 workingDir: /workspace/components/backends/vllm command: - /bin/sh - -c args: - - python - - -m - - dynamo.planner.prometheus + - "python3 -m dynamo.planner.prometheus" VllmDecodeWorker: dynamoNamespace: vllm-disagg-planner envFromSecret: hf-token-secret @@ -178,15 +176,13 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17 + image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.11 workingDir: /workspace/components/backends/vllm command: - /bin/sh - -c args: - - /bin/sh - - -c - - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log" + - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --namespace vllm-disagg-planner 2>&1 | tee /tmp/vllm.log" VllmPrefillWorker: dynamoNamespace: vllm-disagg-planner envFromSecret: hf-token-secret @@ -222,10 +218,10 @@ spec: gpu: "1" extraPodSpec: mainContainer: - image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17 + image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.11 workingDir: /workspace/components/backends/vllm command: - /bin/sh - -c args: - - python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --is-prefill-worker 2>&1 | tee /tmp/vllm.log + - python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --is-prefill-worker --namespace vllm-disagg-planner 2>&1 | tee /tmp/vllm.log diff --git a/components/backends/vllm/src/dynamo/vllm/args.py b/components/backends/vllm/src/dynamo/vllm/args.py index 79f5d8e4b77..6da016a2ede 100644 --- a/components/backends/vllm/src/dynamo/vllm/args.py +++ b/components/backends/vllm/src/dynamo/vllm/args.py @@ -52,10 +52,16 @@ def parse_args() -> Config: default=DEFAULT_ENDPOINT, help=f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Default: {DEFAULT_ENDPOINT}", ) + parser.add_argument( + "--namespace", + type=str, + default="dynamo", + help="Dynamo namespace for this worker. Default: dynamo", + ) parser.add_argument( "--is-prefill-worker", action="store_true", - help="Enable prefill functionality for this worker. Currently overwrites the --endpoint to be a specially chosen dyn://dynamo.prefill.generate", + help="Enable prefill functionality for this worker. 
Uses the provided namespace to construct dyn://namespace.prefill.generate", ) parser = AsyncEngineArgs.add_cli_args(parser) @@ -80,7 +86,7 @@ def parse_args() -> Config: config.served_model_name = None if args.is_prefill_worker: - args.endpoint = "dyn://dynamo.prefill.generate" + args.endpoint = f"dyn://{args.namespace}.prefill.generate" endpoint_str = args.endpoint.replace("dyn://", "", 1) endpoint_parts = endpoint_str.split(".") @@ -127,6 +133,14 @@ async def allocate_and_reserve_port( """ node_name = socket.gethostname() + try: + node_ip = socket.gethostbyname(node_name) + except socket.gaierror: + # If hostname cannot be resolved, fall back to localhost + logger.warning( + f"Hostname '{node_name}' cannot be resolved, falling back to '127.0.0.1'" + ) + node_ip = "127.0.0.1" for attempt in range(1, max_attempts + 1): # Hold socket open just long enough to reserve in ETCD @@ -136,7 +150,7 @@ async def allocate_and_reserve_port( port = sock.getsockname()[1] # Reserve in ETCD while holding the socket - key = f"dyn://{namespace}/ports/{node_name}/{port}" + key = f"dyn://{namespace}/ports/{node_ip}/{port}" value = { "worker_id": worker_id, "reason": reason, @@ -238,23 +252,41 @@ def overwrite_args(config): raise ValueError(f"{key} not found in AsyncEngineArgs from vLLM.") -def set_side_channel_host_and_port(config: Config, hostname: Optional[str] = None): - """vLLM V1 NixlConnector creates a side channel to exchange metadata with other NIXL connectors. - This sets the port number for the side channel. +def get_host_ip() -> str: + """Get the IP address of the host. + This is needed for the side channel to work in multi-node deployments. """ - if hostname is None: - hostname = socket.gethostname() - # Test if hostname is usable by attempting to bind to it + try: + host_name = socket.gethostname() + except socket.error as e: + logger.warning(f"Failed to get hostname: {e}, falling back to '127.0.0.1'") + return "127.0.0.1" + else: try: + # Get the IP address of the hostname - this is needed for the side channel to work in multi-node deployments + host_ip = socket.gethostbyname(host_name) + # Test if the IP is actually usable by binding to it with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as test_socket: - test_socket.bind((hostname, 0)) - except (socket.error, socket.gaierror): - # If hostname is not usable, fall back to localhost + test_socket.bind((host_ip, 0)) + return host_ip + except socket.gaierror as e: logger.warning( - f"Hostname '{hostname}' is not usable, falling back to '127.0.0.1'" + f"Hostname '{host_name}' cannot be resolved: {e}, falling back to '127.0.0.1'" ) - hostname = "127.0.0.1" + return "127.0.0.1" + except socket.error as e: + # If hostname is not usable for binding, fall back to localhost + logger.warning( + f"Hostname '{host_name}' is not usable for binding: {e}, falling back to '127.0.0.1'" + ) + return "127.0.0.1" + - os.environ["VLLM_NIXL_SIDE_CHANNEL_HOST"] = hostname +def set_side_channel_host_and_port(config: Config): + """vLLM V1 NixlConnector creates a side channel to exchange metadata with other NIXL connectors. + This sets the port number for the side channel. 
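+    The host comes from get_host_ip() above so that NIXL peers on other
+    nodes can reach this worker; plain localhost would only work single-node.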
+ """ + host_ip = get_host_ip() + os.environ["VLLM_NIXL_SIDE_CHANNEL_HOST"] = host_ip os.environ["VLLM_NIXL_SIDE_CHANNEL_PORT"] = str(config.side_channel_port) - logger.debug(f"Set NIXL side channel to {hostname}:{config.side_channel_port}") + logger.debug(f"Set NIXL side channel to {host_ip}:{config.side_channel_port}") diff --git a/components/planner/src/dynamo/planner/defaults.py b/components/planner/src/dynamo/planner/defaults.py index 7b4fffdef9b..38019f8c1c7 100644 --- a/components/planner/src/dynamo/planner/defaults.py +++ b/components/planner/src/dynamo/planner/defaults.py @@ -48,9 +48,9 @@ class SLAPlannerDefaults(BasePlannerDefaults): class VllmComponentName: - prefill_worker = "VllmPrefillWorker" + prefill_worker = "prefill" prefill_worker_endpoint = "generate" - decode_worker = "VllmDecodeWorker" + decode_worker = "backend" decode_worker_endpoint = "generate" diff --git a/components/planner/src/dynamo/planner/prometheus.py b/components/planner/src/dynamo/planner/prometheus.py index 0be1d379012..95a0d0686cb 100644 --- a/components/planner/src/dynamo/planner/prometheus.py +++ b/components/planner/src/dynamo/planner/prometheus.py @@ -47,6 +47,7 @@ async def start_prometheus_server(config): cmd = [ "prometheus", f"--config.file={config_path}", + "--web.listen-address=0.0.0.0:9090", ] logger.info(f"Prometheus cmd: {cmd}") diff --git a/components/planner/src/dynamo/planner/utils/planner_core.py b/components/planner/src/dynamo/planner/utils/planner_core.py index 5b44b70f404..3a0d4de5935 100644 --- a/components/planner/src/dynamo/planner/utils/planner_core.py +++ b/components/planner/src/dynamo/planner/utils/planner_core.py @@ -222,7 +222,14 @@ async def make_adjustments(self): # compute how many replicas are needed for decode # 1. apply d_correction_factor to the ITL SLA - corrected_itl = self.args.itl / self.d_correction_factor + # Prevent divide by zero when d_correction_factor is 0 (no metrics yet) + if self.d_correction_factor <= 0: + logger.warning( + f"d_correction_factor is {self.d_correction_factor}, using default value of 1.0" + ) + corrected_itl = self.args.itl + else: + corrected_itl = self.args.itl / self.d_correction_factor # 2. reversely find out what is best throughput/gpu that can achieve corrected_itl under the predicted context length pred_decode_thpt_per_gpu = ( self.decode_interpolator.find_best_throughput_per_gpu( @@ -274,7 +281,7 @@ async def make_adjustments(self): WORKER_COMPONENT_NAMES[self.args.backend].prefill_worker: next_num_p, WORKER_COMPONENT_NAMES[self.args.backend].decode_worker: next_num_d, } - self.connector.set_component_replicas(target_replicas, blocking=False) + await self.connector.set_component_replicas(target_replicas, blocking=False) async def run(self): """Main loop for the planner""" diff --git a/docs/architecture/sla_planner.md b/docs/architecture/sla_planner.md index fb5876a0229..6e3ec836d27 100644 --- a/docs/architecture/sla_planner.md +++ b/docs/architecture/sla_planner.md @@ -106,11 +106,13 @@ Finally, SLA planner applies the change by scaling up/down the number of prefill ## Deploying -To deploy SLA-planner, ensure etcd and NATS are running first, then use the frontend that reports metrics at `/metrics` HTTP endpoint. You can also use your own frontend, but it must report number of requests, ISL, OSL, TTFT, ITL in the same format. - -SLA-planner and prometheus server are provided as common components that can be directly imported from `dynamo` package. 
+For detailed deployment instructions including setup, configuration, troubleshooting, and architecture overview, see the [SLA Planner Deployment Guide](../guides/dynamo_deploy/sla_planner_deployment.md).
+**Quick Start:**
 ```bash
 cd components/backends/vllm/deploy
 kubectl apply -f disagg_planner.yaml -n ${NAMESPACE}
 ```
+
+> [!NOTE]
+> The SLA planner requires a frontend that reports metrics at `/metrics` HTTP endpoint with number of requests, ISL, OSL, TTFT, ITL in the correct format. The VLLM frontend provides these metrics automatically.
diff --git a/docs/guides/dynamo_deploy/sla_planner_deployment.md b/docs/guides/dynamo_deploy/sla_planner_deployment.md
new file mode 100644
index 00000000000..a4d662143e2
--- /dev/null
+++ b/docs/guides/dynamo_deploy/sla_planner_deployment.md
@@ -0,0 +1,136 @@
+# SLA Planner Deployment Guide
+
+Quick deployment guide for the vLLM disaggregated planner with automatic scaling.
+
+## Architecture Overview
+
+**Components:**
+- **Frontend**: Serves requests and exposes `/metrics`
+- **Prometheus**: Scrapes frontend metrics every 5 seconds
+- **Planner**: Queries Prometheus and adjusts worker scaling every 60 seconds
+- **Workers**: VllmDecodeWorker and VllmPrefillWorker handle inference
+
+```mermaid
+flowchart LR
+    Frontend --"/metrics"--> Prometheus
+    Prometheus --"scrape (5s)"--> Prometheus
+    Planner --"query API"--> Prometheus
+    Planner --"scaling decisions"--> Workers["VllmPrefillWorker<br/>VllmDecodeWorker"]
+    Frontend -.->|"requests"| Workers
+```
+
+## Prerequisites
+- Kubernetes cluster with GPU nodes
+- `hf-token-secret` created in target namespace
+
+```bash
+export NAMESPACE=your-namespace
+```
+
+## 1. Deploy the System
+
+```bash
+# Apply the disaggregated planner deployment
+kubectl apply -f components/backends/vllm/deploy/disagg_planner.yaml -n $NAMESPACE
+
+# Check deployment status
+kubectl get pods -n $NAMESPACE
+```
+
+Expected pods (all should be `1/1 Running`):
+```
+vllm-disagg-planner-frontend-* 1/1 Running
+vllm-disagg-planner-prometheus-* 1/1 Running
+vllm-disagg-planner-planner-* 1/1 Running
+vllm-disagg-planner-vllmdecodeworker-* 1/1 Running
+vllm-disagg-planner-vllmprefillworker-* 1/1 Running
+```
+
+## 2. Apply Prometheus Port Fix (Required)
+
+Due to a current operator limitation, manually patch the Prometheus deployment:
+
+```bash
+# Fix container port
+kubectl patch deployment vllm-disagg-planner-prometheus -n $NAMESPACE \
+  --type='json' -p='[{"op": "replace", "path": "/spec/template/spec/containers/0/ports/0/containerPort", "value": 9090}]'
+
+# Fix environment variable
+kubectl patch deployment vllm-disagg-planner-prometheus -n $NAMESPACE \
+  --type='json' -p='[{"op": "replace", "path": "/spec/template/spec/containers/0/env/0/value", "value": "9090"}]'
+
+# Fix port name
+kubectl patch deployment vllm-disagg-planner-prometheus -n $NAMESPACE \
+  --type='json' -p='[{"op": "replace", "path": "/spec/template/spec/containers/0/ports/0/name", "value": "prometheus"}]'
+
+# Wait for rollout
+kubectl rollout status deployment/vllm-disagg-planner-prometheus -n $NAMESPACE
+```
+
+## 3. Test the System
+
+```bash
+# Port forward to frontend
+kubectl port-forward -n $NAMESPACE deployment/vllm-disagg-planner-frontend 8000:8000
+
+# Send a streaming request (required for full metrics)
+curl http://localhost:8000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "model": "Qwen/Qwen3-0.6B",
+    "messages": [
+    {
+        "role": "user",
+        "content": "In the heart of Eldoria, an ancient land of boundless magic and mysterious creatures, lies the long-forgotten city of Aeloria. Once a beacon of knowledge and power, Aeloria was buried beneath the shifting sands of time, lost to the world for centuries. You are an intrepid explorer, known for your unparalleled curiosity and courage, who has stumbled upon an ancient map hinting at ests that Aeloria holds a secret so profound that it has the potential to reshape the very fabric of reality. Your journey will take you through treacherous deserts, enchanted forests, and across perilous mountain ranges. Your Task: Character Background: Develop a detailed background for your character. Describe their motivations for seeking out Aeloria, their skills and weaknesses, and any personal connections to the ancient city or its legends. Are they driven by a quest for knowledge, a search for lost familt clue is hidden."
+    }
+    ],
+    "stream":true,
+    "max_tokens": 30
+  }' | jq
+```
+
+## 4. Monitor Scaling
+
+```bash
+# Check planner logs for scaling decisions
+kubectl logs -n $NAMESPACE deployment/vllm-disagg-planner-planner --tail=10
+
+# Expected successful output:
+# "Number of prefill workers: 1, number of decode workers: 1"
+# "Observed ttft: X.XXXs itl: X.XXXs" (after streaming requests)
+```
+
+## 5. Key Notes
+
+### Metrics Requirements
+- **Basic metrics** (request count): Available with any request type
+- **Latency metrics** (TTFT/ITL): Only available with `"stream": true` requests
+- **Scaling decisions**: Require sufficient request volume and streaming requests
+
+### Current Status
+✅ **Working**: All core functionality, worker discovery, Prometheus connectivity
+🔧 **Manual Fix Required**: Prometheus port configuration (until operator fix)
+📊 **Expected**: Some warnings until metrics accumulate from streaming requests
+
+## 6. Troubleshooting
+
+**Connection Issues:**
+```bash
+# Verify Prometheus is listening on 9090
+kubectl exec -n $NAMESPACE deployment/vllm-disagg-planner-prometheus -- netstat -tlnp | grep 9090
+
+# Test Prometheus API
+kubectl port-forward -n $NAMESPACE deployment/vllm-disagg-planner-prometheus 9090:9090
+curl "http://localhost:9090/api/v1/query?query=up"
+```
+
+**Missing Metrics:**
+```bash
+# Check frontend metrics
+kubectl port-forward -n $NAMESPACE deployment/vllm-disagg-planner-frontend 8000:8000
+curl http://localhost:8000/metrics | grep nv_llm_http_service
+```
+
+**Worker Issues:**
+- Large models can take 10+ minutes to initialize
+- Check worker logs: `kubectl logs -n $NAMESPACE deployment/vllm-disagg-planner-vllmdecodeworker`

From eb56dbb1e484052c778c8bba0a65fa23bc7c5b1e Mon Sep 17 00:00:00 2001
From: Hannah Zhang
Date: Fri, 25 Jul 2025 16:15:34 -0700
Subject: [PATCH 41/58] fix: use provided namespace for decode

---
 components/backends/vllm/src/dynamo/vllm/args.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/components/backends/vllm/src/dynamo/vllm/args.py b/components/backends/vllm/src/dynamo/vllm/args.py
index 6da016a2ede..2295cdd2acc 100644
--- a/components/backends/vllm/src/dynamo/vllm/args.py
+++ b/components/backends/vllm/src/dynamo/vllm/args.py
@@ -87,6 +87,9 @@ def parse_args() -> Config:
 
     if args.is_prefill_worker:
         args.endpoint = f"dyn://{args.namespace}.prefill.generate"
+    else:
+        # For decode workers, also use the provided namespace instead of hardcoded "dynamo"
+        args.endpoint = f"dyn://{args.namespace}.backend.generate"
 
     endpoint_str = args.endpoint.replace("dyn://", "", 1)
     endpoint_parts = endpoint_str.split(".")

From b68779d956b4c53094490cc874fb5b4550b44533 Mon Sep 17 00:00:00 2001
From: Hannah Zhang
Date: Fri, 25 Jul 2025 16:17:07 -0700
Subject: [PATCH 42/58] feat: use k8s deployment info instead of hardcoding prometheus endpoint

---
 .../backends/vllm/deploy/disagg_planner.yaml  |  3 +-
 .../planner/src/dynamo/planner/defaults.py    | 59 ++++++++++++++++++-
 2 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/components/backends/vllm/deploy/disagg_planner.yaml b/components/backends/vllm/deploy/disagg_planner.yaml
index ff77cecb39d..1ca017332c0 100644
--- a/components/backends/vllm/deploy/disagg_planner.yaml
+++ b/components/backends/vllm/deploy/disagg_planner.yaml
@@ -53,6 +53,7 @@ spec:
       envFromSecret: hf-token-secret
       componentType: worker
       replicas: 1
+
       livenessProbe:
         exec:
           command:
@@ -96,7 +97,7 @@ spec:
          - --backend=vllm
          - --adjustment-interval=60
          - --profile-results-dir=/workspace/profiling_results
-          - --prometheus-endpoint=http://vllm-disagg-planner-prometheus.hzhou-dynamo.svc.cluster.local:9090
+
     Prometheus:
       dynamoNamespace: vllm-disagg-planner
       componentType: main
diff --git a/components/planner/src/dynamo/planner/defaults.py b/components/planner/src/dynamo/planner/defaults.py
index 38019f8c1c7..60ba34054aa 100644
--- a/components/planner/src/dynamo/planner/defaults.py
+++ 
b/components/planner/src/dynamo/planner/defaults.py @@ -13,6 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os +import socket + # Source of truth for planner defaults class BasePlannerDefaults: @@ -36,8 +39,62 @@ class LoadPlannerDefaults(BasePlannerDefaults): prefill_queue_scale_down_threshold = 0.2 +def _get_dynamo_namespace_from_k8s() -> str: + """Get the dynamo namespace from current pod's Kubernetes labels""" + try: + from kubernetes import client + + from dynamo.planner.kube import KubernetesAPI + + k8s_api = KubernetesAPI() + v1 = client.CoreV1Api() + + # Get current pod name from hostname + hostname = socket.gethostname() + + # Get current pod to read its labels + pod = v1.read_namespaced_pod(name=hostname, namespace=k8s_api.current_namespace) + labels = pod.metadata.labels or {} + + # Extract dynamo namespace from labels + dynamo_namespace = labels.get("nvidia.com/dynamo-namespace") + if not dynamo_namespace: + raise RuntimeError( + "Failed to determine the dynamo namespace from Kubernetes pod labels" + ) + return dynamo_namespace + + except Exception as e: + raise RuntimeError( + "Failed to determine the dynamo namespace from Kubernetes pod labels" + ) from e + + +def _get_default_prometheus_endpoint(port: str): + """Compute default prometheus endpoint using Kubernetes service discovery""" + + # Try to get current namespace and deployment name from Kubernetes + try: + from dynamo.planner.kube import KubernetesAPI + + k8s_api = KubernetesAPI() + k8s_namespace = k8s_api.current_namespace + + if k8s_namespace and k8s_namespace != "default": + dynamo_namespace = _get_dynamo_namespace_from_k8s() + prometheus_service = f"{dynamo_namespace}-prometheus" + return ( + f"http://{prometheus_service}.{k8s_namespace}.svc.cluster.local:{port}" + ) + except Exception as e: + raise RuntimeError( + "Failed to determine the prometheus endpoint from Kubernetes service discovery" + ) from e + + class SLAPlannerDefaults(BasePlannerDefaults): - prometheus_endpoint = "http://localhost:9090" + port = os.environ.get("DYNAMO_PORT", "9090") + prometheus_endpoint = _get_default_prometheus_endpoint(port) profile_results_dir = "profiling_results" isl = 3000 # in number of tokens osl = 150 # in number of tokens From c8e394d502883ffd4ad6dcad98c51d1df918c250 Mon Sep 17 00:00:00 2001 From: Hannah Zhang Date: Fri, 25 Jul 2025 16:17:45 -0700 Subject: [PATCH 43/58] fix: if no requests have been made yet, don't try to access list --- .../src/dynamo/planner/utils/prometheus.py | 50 +++++++++++-------- 1 file changed, 30 insertions(+), 20 deletions(-) diff --git a/components/planner/src/dynamo/planner/utils/prometheus.py b/components/planner/src/dynamo/planner/utils/prometheus.py index 6d63863bcae..46982b4e092 100644 --- a/components/planner/src/dynamo/planner/utils/prometheus.py +++ b/components/planner/src/dynamo/planner/utils/prometheus.py @@ -29,33 +29,39 @@ def __init__(self, url: str): def get_avg_inter_token_latency(self, interval: str): try: - return float( - self.prom.custom_query( - query=f"increase(nv_llm_http_service_inter_token_latency_seconds_sum[{interval}])/increase(nv_llm_http_service_inter_token_latency_seconds_count[{interval}])", - )[0]["value"][1] + result = self.prom.custom_query( + query=f"increase(nv_llm_http_service_inter_token_latency_seconds_sum[{interval}])/increase(nv_llm_http_service_inter_token_latency_seconds_count[{interval}])", ) + if not result: + # No data available yet (no requests made) - return 0 silently + 
return 0 + return float(result[0]["value"][1]) except Exception as e: logger.error(f"Error getting avg inter token latency: {e}") return 0 def get_avg_time_to_first_token(self, interval: str): try: - return float( - self.prom.custom_query( - query=f"increase(nv_llm_http_service_time_to_first_token_seconds_sum[{interval}])/increase(nv_llm_http_service_time_to_first_token_seconds_count[{interval}])", - )[0]["value"][1] + result = self.prom.custom_query( + query=f"increase(nv_llm_http_service_time_to_first_token_seconds_sum[{interval}])/increase(nv_llm_http_service_time_to_first_token_seconds_count[{interval}])", ) + if not result: + # No data available yet (no requests made) - return 0 silently + return 0 + return float(result[0]["value"][1]) except Exception as e: logger.error(f"Error getting avg time to first token: {e}") return 0 def get_avg_request_duration(self, interval: str): try: - return float( - self.prom.custom_query( - query=f"increase(nv_llm_http_service_request_duration_seconds_sum[{interval}])/increase(nv_llm_http_service_request_duration_seconds_count[{interval}])", - )[0]["value"][1] + result = self.prom.custom_query( + query=f"increase(nv_llm_http_service_request_duration_seconds_sum[{interval}])/increase(nv_llm_http_service_request_duration_seconds_count[{interval}])", ) + if not result: + # No data available yet (no requests made) - return 0 silently + return 0 + return float(result[0]["value"][1]) except Exception as e: logger.error(f"Error getting avg request duration: {e}") return 0 @@ -76,22 +82,26 @@ def get_avg_request_count(self, interval: str): def get_avg_input_sequence_tokens(self, interval: str): try: - return float( - self.prom.custom_query( - query=f"increase(nv_llm_http_service_input_sequence_tokens_sum[{interval}])/increase(nv_llm_http_service_input_sequence_tokens_count[{interval}])", - )[0]["value"][1] + result = self.prom.custom_query( + query=f"increase(nv_llm_http_service_input_sequence_tokens_sum[{interval}])/increase(nv_llm_http_service_input_sequence_tokens_count[{interval}])", ) + if not result: + # No data available yet (no requests made) - return 0 silently + return 0 + return float(result[0]["value"][1]) except Exception as e: logger.error(f"Error getting avg input sequence tokens: {e}") return 0 def get_avg_output_sequence_tokens(self, interval: str): try: - return float( - self.prom.custom_query( - query=f"increase(nv_llm_http_service_output_sequence_tokens_sum[{interval}])/increase(nv_llm_http_service_output_sequence_tokens_count[{interval}])", - )[0]["value"][1] + result = self.prom.custom_query( + query=f"increase(nv_llm_http_service_output_sequence_tokens_sum[{interval}])/increase(nv_llm_http_service_output_sequence_tokens_count[{interval}])", ) + if not result: + # No data available yet (no requests made) - return 0 silently + return 0 + return float(result[0]["value"][1]) except Exception as e: logger.error(f"Error getting avg output sequence tokens: {e}") return 0 From 1bbfd8d6da791fc903834ad8209b8436d9b484b4 Mon Sep 17 00:00:00 2001 From: Hannah Zhang Date: Fri, 25 Jul 2025 16:18:21 -0700 Subject: [PATCH 44/58] feat: use SLAPLannerDefaults port --- components/planner/src/dynamo/planner/prometheus.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/components/planner/src/dynamo/planner/prometheus.py b/components/planner/src/dynamo/planner/prometheus.py index 95a0d0686cb..2b81112b7e9 100644 --- a/components/planner/src/dynamo/planner/prometheus.py +++ b/components/planner/src/dynamo/planner/prometheus.py @@ -20,6 
+20,7 @@ import yaml +from dynamo.planner.defaults import SLAPlannerDefaults from dynamo.runtime import DistributedRuntime, dynamo_worker from dynamo.sdk.lib.config import ServiceConfig @@ -44,10 +45,12 @@ async def start_prometheus_server(config): temp_file.close() config_path = temp_file.name + # Use port from SLAPlannerDefaults (which reads DYNAMO_PORT with fallback to 9090) + prometheus_port = SLAPlannerDefaults.port cmd = [ "prometheus", f"--config.file={config_path}", - "--web.listen-address=0.0.0.0:9090", + f"--web.listen-address=0.0.0.0:{prometheus_port}", ] logger.info(f"Prometheus cmd: {cmd}") From ba6b5c14506a36c546d81838a6258156d421da29 Mon Sep 17 00:00:00 2001 From: Hannah Zhang Date: Fri, 25 Jul 2025 16:27:01 -0700 Subject: [PATCH 45/58] docs: clean up sla planner deployment docs --- .../dynamo_deploy/sla_planner_deployment.md | 51 +++++-------------- 1 file changed, 12 insertions(+), 39 deletions(-) diff --git a/docs/guides/dynamo_deploy/sla_planner_deployment.md b/docs/guides/dynamo_deploy/sla_planner_deployment.md index a4d662143e2..6d4cafd0940 100644 --- a/docs/guides/dynamo_deploy/sla_planner_deployment.md +++ b/docs/guides/dynamo_deploy/sla_planner_deployment.md @@ -2,6 +2,9 @@ Quick deployment guide for the vLLM disaggregated planner with automatic scaling. +> [!NOTE] +> For high-level architecture and concepts, see [SLA-based Planner](../../architecture/sla_planner.md). + ## Architecture Overview **Components:** @@ -39,35 +42,14 @@ kubectl get pods -n $NAMESPACE Expected pods (all should be `1/1 Running`): ``` -vllm-disagg-planner-frontend-* 1/1 Running -vllm-disagg-planner-prometheus-* 1/1 Running -vllm-disagg-planner-planner-* 1/1 Running +vllm-disagg-planner-frontend-* 1/1 Running +vllm-disagg-planner-prometheus-* 1/1 Running +vllm-disagg-planner-planner-* 1/1 Running vllm-disagg-planner-vllmdecodeworker-* 1/1 Running vllm-disagg-planner-vllmprefillworker-* 1/1 Running ``` -## 2. Apply Prometheus Port Fix (Required) - -Due to a current operator limitation, manually patch the Prometheus deployment: - -```bash -# Fix container port -kubectl patch deployment vllm-disagg-planner-prometheus -n $NAMESPACE \ - --type='json' -p='[{"op": "replace", "path": "/spec/template/spec/containers/0/ports/0/containerPort", "value": 9090}]' - -# Fix environment variable -kubectl patch deployment vllm-disagg-planner-prometheus -n $NAMESPACE \ - --type='json' -p='[{"op": "replace", "path": "/spec/template/spec/containers/0/env/0/value", "value": "9090"}]' - -# Fix port name -kubectl patch deployment vllm-disagg-planner-prometheus -n $NAMESPACE \ - --type='json' -p='[{"op": "replace", "path": "/spec/template/spec/containers/0/ports/0/name", "value": "prometheus"}]' - -# Wait for rollout -kubectl rollout status deployment/vllm-disagg-planner-prometheus -n $NAMESPACE -``` - -## 3. Test the System +## 2. Test the System ```bash # Port forward to frontend @@ -89,7 +71,7 @@ curl http://localhost:8000/v1/chat/completions \ }' | jq ``` -## 4. Monitor Scaling +## 3. Monitor Scaling ```bash # Check planner logs for scaling decisions @@ -100,27 +82,17 @@ kubectl logs -n $NAMESPACE deployment/vllm-disagg-planner-planner --tail=10 # "Observed ttft: X.XXXs itl: X.XXXs" (after streaming requests) ``` -## 5. 
Key Notes
-
 ### Metrics Requirements
 - **Basic metrics** (request count): Available with any request type
 - **Latency metrics** (TTFT/ITL): Only available with `"stream": true` requests
 - **Scaling decisions**: Require sufficient request volume and streaming requests
-### Current Status
-✅ **Working**: All core functionality, worker discovery, Prometheus connectivity
-🔧 **Manual Fix Required**: Prometheus port configuration (until operator fix)
-📊 **Expected**: Some warnings until metrics accumulate from streaming requests
-
-## 6. Troubleshooting
+## 4. Troubleshooting
 
 **Connection Issues:**
 ```bash
-# Verify Prometheus is listening on 9090
-kubectl exec -n $NAMESPACE deployment/vllm-disagg-planner-prometheus -- netstat -tlnp | grep 9090
-
-# Test Prometheus API
-kubectl port-forward -n $NAMESPACE deployment/vllm-disagg-planner-prometheus 9090:9090
+# Verify Prometheus is accessible (default port 8000)
+kubectl port-forward -n $NAMESPACE deployment/vllm-disagg-planner-prometheus 9090:8000
 curl "http://localhost:9090/api/v1/query?query=up"
 ```
@@ -134,3 +106,4 @@ curl http://localhost:8000/metrics | grep nv_llm_http_service
 **Worker Issues:**
 - Large models can take 10+ minutes to initialize
 - Check worker logs: `kubectl logs -n $NAMESPACE deployment/vllm-disagg-planner-vllmdecodeworker`
+- Ensure GPU resources are available for workers

From e533ddae6fd545edc62ad28e94f1f47628cb1890 Mon Sep 17 00:00:00 2001
From: Hannah Zhang
Date: Fri, 25 Jul 2025 17:23:30 -0700
Subject: [PATCH 46/58] feat: use DYNAMO_NAMESPACE env var instead of --namespace arg

---
 .../backends/vllm/deploy/disagg_planner.yaml  | 21 +++---
 .../backends/vllm/src/dynamo/vllm/args.py     | 12 ++--
 .../planner/src/dynamo/planner/defaults.py    | 64 ++++---------------
 .../planner/src/dynamo/planner/planner_sla.py | 12 +---
 4 files changed, 27 insertions(+), 82 deletions(-)

diff --git a/components/backends/vllm/deploy/disagg_planner.yaml b/components/backends/vllm/deploy/disagg_planner.yaml
index 1ca017332c0..73bae3a20f2 100644
--- a/components/backends/vllm/deploy/disagg_planner.yaml
+++ b/components/backends/vllm/deploy/disagg_planner.yaml
@@ -9,6 +9,8 @@ spec:
   envs:
     - name: DYNAMO_SERVICE_CONFIG
       value: '{"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["vllm-disagg-planner-frontend:8000"]}]}]}}'
+    - name: DYNAMO_NAMESPACE
+      value: "vllm-disagg-planner"
   services:
     Frontend:
       dynamoNamespace: vllm-disagg-planner
@@ -41,7 +43,7 @@ spec:
           memory: "16Gi"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.11
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.13
           workingDir: /workspace/components/backends/vllm
           command:
             - /bin/sh
@@ -53,7 +55,6 @@ spec:
       envFromSecret: hf-token-secret
       componentType: worker
       replicas: 1
-
       livenessProbe:
         exec:
           command:
@@ -86,18 +87,16 @@ spec:
         mountPoint: /workspace/profiling_results
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.11
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.13
           workingDir: /workspace/components/planner/src/dynamo/planner
           args:
             - python
             - -m
             - planner_sla
-            - --namespace=vllm-disagg-planner
             - --environment=kubernetes
             - --backend=vllm
             - --adjustment-interval=60
             - --profile-results-dir=/workspace/profiling_results
-
     Prometheus:
       dynamoNamespace: vllm-disagg-planner
       componentType: main
@@ -105,8 +104,6 @@ spec:
       envs:
         - name: PYTHONPATH
           value: "/workspace/components/planner/src"
-        - name: DYNAMO_PORT
-          value: "9090"
       livenessProbe:
         exec:
           command:
@@ -135,7 +132,7 @@ spec:
           memory: "2Gi"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.11
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.13
           workingDir: /workspace/components/backends/vllm
           command:
             - /bin/sh
@@ -177,13 +174,13 @@ spec:
           gpu: "1"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.11
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.13
           workingDir: /workspace/components/backends/vllm
           command:
             - /bin/sh
             - -c
           args:
-            - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --namespace vllm-disagg-planner 2>&1 | tee /tmp/vllm.log"
+            - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B 2>&1 | tee /tmp/vllm.log"
     VllmPrefillWorker:
       dynamoNamespace: vllm-disagg-planner
       envFromSecret: hf-token-secret
@@ -219,10 +216,10 @@ spec:
           gpu: "1"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.11
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.13
           workingDir: /workspace/components/backends/vllm
           command:
             - /bin/sh
             - -c
           args:
-            - python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --is-prefill-worker --namespace vllm-disagg-planner 2>&1 | tee /tmp/vllm.log
+            - python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --is-prefill-worker 2>&1 | tee /tmp/vllm.log

diff --git a/components/backends/vllm/src/dynamo/vllm/args.py b/components/backends/vllm/src/dynamo/vllm/args.py
index 2295cdd2acc..d39131e5c88 100644
--- a/components/backends/vllm/src/dynamo/vllm/args.py
+++ b/components/backends/vllm/src/dynamo/vllm/args.py
@@ -52,12 +52,6 @@ def parse_args() -> Config:
         default=DEFAULT_ENDPOINT,
         help=f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Default: {DEFAULT_ENDPOINT}",
     )
-    parser.add_argument(
-        "--namespace",
-        type=str,
-        default="dynamo",
-        help="Dynamo namespace for this worker. Default: dynamo",
-    )
     parser.add_argument(
         "--is-prefill-worker",
         action="store_true",
@@ -85,11 +79,13 @@ def parse_args() -> Config:
     # This becomes an `Option` on the Rust side
     config.served_model_name = None
 
+    namespace = os.environ.get("DYNAMO_NAMESPACE", "dynamo")
+
     if args.is_prefill_worker:
-        args.endpoint = f"dyn://{args.namespace}.prefill.generate"
+        args.endpoint = f"dyn://{namespace}.prefill.generate"
     else:
         # For decode workers, also use the provided namespace instead of hardcoded "dynamo"
-        args.endpoint = f"dyn://{args.namespace}.backend.generate"
+        args.endpoint = f"dyn://{namespace}.backend.generate"
 
     endpoint_str = args.endpoint.replace("dyn://", "", 1)
     endpoint_parts = endpoint_str.split(".")

diff --git a/components/planner/src/dynamo/planner/defaults.py b/components/planner/src/dynamo/planner/defaults.py
index 60ba34054aa..b337966f56d 100644
--- a/components/planner/src/dynamo/planner/defaults.py
+++ b/components/planner/src/dynamo/planner/defaults.py
@@ -14,7 +14,8 @@
 # limitations under the License.
 
 import os
-import socket
+
+from dynamo.planner.kube import KubernetesAPI
 
 # Source of truth for planner defaults
 
@@ -39,62 +40,23 @@ class LoadPlannerDefaults(BasePlannerDefaults):
     prefill_queue_scale_down_threshold = 0.2
 
 
-def _get_dynamo_namespace_from_k8s() -> str:
-    """Get the dynamo namespace from current pod's Kubernetes labels"""
-    try:
-        from kubernetes import client
-
-        from dynamo.planner.kube import KubernetesAPI
-
-        k8s_api = KubernetesAPI()
-        v1 = client.CoreV1Api()
-
-        # Get current pod name from hostname
-        hostname = socket.gethostname()
-
-        # Get current pod to read its labels
-        pod = v1.read_namespaced_pod(name=hostname, namespace=k8s_api.current_namespace)
-        labels = pod.metadata.labels or {}
-
-        # Extract dynamo namespace from labels
-        dynamo_namespace = labels.get("nvidia.com/dynamo-namespace")
-        if not dynamo_namespace:
-            raise RuntimeError(
-                "Failed to determine the dynamo namespace from Kubernetes pod labels"
-            )
-        return dynamo_namespace
-
-    except Exception as e:
-        raise RuntimeError(
-            "Failed to determine the dynamo namespace from Kubernetes pod labels"
-        ) from e
-
-
-def _get_default_prometheus_endpoint(port: str):
-    """Compute default prometheus endpoint using Kubernetes service discovery"""
-
-    # Try to get current namespace and deployment name from Kubernetes
-    try:
-        from dynamo.planner.kube import KubernetesAPI
+def _get_default_prometheus_endpoint(port: str, namespace: str):
+    """Compute default prometheus endpoint using environment variables and Kubernetes service discovery"""
 
-        k8s_api = KubernetesAPI()
-        k8s_namespace = k8s_api.current_namespace
+    k8s_api = KubernetesAPI()
+    k8s_namespace = k8s_api.current_namespace
 
-        if k8s_namespace and k8s_namespace != "default":
-            dynamo_namespace = _get_dynamo_namespace_from_k8s()
-            prometheus_service = f"{dynamo_namespace}-prometheus"
-            return (
-                f"http://{prometheus_service}.{k8s_namespace}.svc.cluster.local:{port}"
-            )
-    except Exception as e:
-        raise RuntimeError(
-            "Failed to determine the prometheus endpoint from Kubernetes service discovery"
-        ) from e
+    if k8s_namespace and k8s_namespace != "default":
+        prometheus_service = f"{namespace}-prometheus"
+        return f"http://{prometheus_service}.{k8s_namespace}.svc.cluster.local:{port}"
+    else:
+        raise RuntimeError("Can't find a prometheus endpoint for the planner!")
 
 class SLAPlannerDefaults(BasePlannerDefaults):
     port = os.environ.get("DYNAMO_PORT", "9090")
-    prometheus_endpoint = _get_default_prometheus_endpoint(port)
+    namespace = os.environ.get("DYNAMO_NAMESPACE", "vllm-disagg-planner")
+    prometheus_endpoint = _get_default_prometheus_endpoint(port, namespace)
     profile_results_dir = "profiling_results"
     isl = 3000  # in number of tokens
     osl = 150  # in number of tokens

diff --git a/components/planner/src/dynamo/planner/planner_sla.py b/components/planner/src/dynamo/planner/planner_sla.py
index 5874d69daa0..65b546ba7b2 100644
--- a/components/planner/src/dynamo/planner/planner_sla.py
+++ b/components/planner/src/dynamo/planner/planner_sla.py
@@ -40,7 +40,7 @@ async def init_planner(runtime: DistributedRuntime, args):
 
     await start_sla_planner(runtime, args)
 
-    component = runtime.namespace(args.namespace).component("Planner")
+    component = runtime.namespace(SLAPlannerDefaults.namespace).component("Planner")
     await component.create_service()
 
     async def generate(self, request: RequestType):
@@ -53,11 +53,6 @@ async def generate(self, request: RequestType):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="SLA Planner")
-    parser.add_argument(
-        "--namespace",
-        default=SLAPlannerDefaults.namespace,
-        help="Namespace for the planner",
-    )
     parser.add_argument(
         "--environment",
         default=SLAPlannerDefaults.environment,
@@ -109,11 +104,6 @@ async def generate(self, request: RequestType):
         default=SLAPlannerDefaults.prefill_engine_num_gpu,
         help="Number of GPUs for prefill engine",
     )
-    parser.add_argument(
-        "--prometheus-endpoint",
-        default=SLAPlannerDefaults.prometheus_endpoint,
-        help="Prometheus endpoint URL",
-    )
     parser.add_argument(
         "--profile-results-dir",
         default=SLAPlannerDefaults.profile_results_dir,
From a548d74fd91496b6282398f6383e1028cf4d184c Mon Sep 17 00:00:00 2001
From: Hannah Zhang
Date: Sat, 26 Jul 2025 00:29:38 -0700
Subject: [PATCH 47/58] feat: fixes for working planner

---
 .../backends/vllm/deploy/disagg_planner.yaml  | 20 ++++++++++---------
 .../planner/src/dynamo/planner/defaults.py    |  2 +-
 .../planner/src/dynamo/planner/prometheus.py  |  1 -
 .../src/dynamo/planner/utils/planner_core.py  | 14 +++++--------
 4 files changed, 17 insertions(+), 20 deletions(-)

diff --git a/components/backends/vllm/deploy/disagg_planner.yaml b/components/backends/vllm/deploy/disagg_planner.yaml
index 73bae3a20f2..90b17452e13 100644
--- a/components/backends/vllm/deploy/disagg_planner.yaml
+++ b/components/backends/vllm/deploy/disagg_planner.yaml
@@ -9,6 +9,8 @@ spec:
   envs:
     - name: DYNAMO_SERVICE_CONFIG
       value: '{"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["vllm-disagg-planner-frontend:8000"]}]}]}}'
+    - name: DYNAMO_PORT
+      value: "8000"
     - name: DYNAMO_NAMESPACE
       value: "vllm-disagg-planner"
   services:
     Frontend:
       dynamoNamespace: vllm-disagg-planner
@@ -43,7 +45,7 @@ spec:
           memory: "16Gi"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.13
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.16
           workingDir: /workspace/components/backends/vllm
           command:
             - /bin/sh
@@ -87,7 +89,7 @@ spec:
         mountPoint: /workspace/profiling_results
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.13
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.16
           workingDir: /workspace/components/planner/src/dynamo/planner
           args:
             - python
@@ -132,18 +134,18 @@ spec:
           memory: "2Gi"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.13
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.16
           workingDir: /workspace/components/backends/vllm
           command:
             - /bin/sh
             - -c
           args:
             - "python3 -m dynamo.planner.prometheus"
-    VllmDecodeWorker:
+    backend:
       dynamoNamespace: vllm-disagg-planner
       envFromSecret: hf-token-secret
       componentType: worker
-      replicas: 1
+      replicas: 2
       livenessProbe:
         exec:
           command:
@@ -174,18 +176,18 @@ spec:
           gpu: "1"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.13
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.16
           workingDir: /workspace/components/backends/vllm
           command:
             - /bin/sh
             - -c
           args:
             - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B 2>&1 | tee /tmp/vllm.log"
-    VllmPrefillWorker:
+    prefill:
       dynamoNamespace: vllm-disagg-planner
       envFromSecret: hf-token-secret
       componentType: worker
-      replicas: 1
+      replicas: 2
       livenessProbe:
         exec:
           command:
@@ -216,7 +218,7 @@ spec:
           gpu: "1"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.13
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.16
           workingDir: /workspace/components/backends/vllm
           command:
             - /bin/sh

diff --git a/components/planner/src/dynamo/planner/defaults.py b/components/planner/src/dynamo/planner/defaults.py
index b337966f56d..e0f68b2ab70 100644
--- a/components/planner/src/dynamo/planner/defaults.py
+++ b/components/planner/src/dynamo/planner/defaults.py
@@ -54,7 +54,7 @@ def _get_default_prometheus_endpoint(port: str, namespace: str):
 
 class SLAPlannerDefaults(BasePlannerDefaults):
-    port = os.environ.get("DYNAMO_PORT", "9090")
+    port = os.environ.get("DYNAMO_PORT", "8000")
     namespace = os.environ.get("DYNAMO_NAMESPACE", "vllm-disagg-planner")
     prometheus_endpoint = _get_default_prometheus_endpoint(port, namespace)
     profile_results_dir = "profiling_results"

diff --git a/components/planner/src/dynamo/planner/prometheus.py b/components/planner/src/dynamo/planner/prometheus.py
index 2b81112b7e9..dc0a5d96f71 100644
--- a/components/planner/src/dynamo/planner/prometheus.py
+++ b/components/planner/src/dynamo/planner/prometheus.py
@@ -45,7 +45,6 @@ async def start_prometheus_server(config):
         temp_file.close()
         config_path = temp_file.name
 
-    # Use port from SLAPlannerDefaults (which reads DYNAMO_PORT with fallback to 9090)
     prometheus_port = SLAPlannerDefaults.port
     cmd = [
         "prometheus",

diff --git a/components/planner/src/dynamo/planner/utils/planner_core.py b/components/planner/src/dynamo/planner/utils/planner_core.py
index 3a0d4de5935..40b8d9676d9 100644
--- a/components/planner/src/dynamo/planner/utils/planner_core.py
+++ b/components/planner/src/dynamo/planner/utils/planner_core.py
@@ -52,15 +52,17 @@ class Planner:
     def __init__(self, runtime: DistributedRuntime, args: argparse.Namespace):
         self.runtime = runtime
         self.args = args
-        self.namespace = args.namespace
+        self.namespace = SLAPlannerDefaults.namespace
 
         if not args.no_operation:
             if args.environment == "kubernetes":
-                self.connector = KubernetesConnector(args.namespace)
+                self.connector = KubernetesConnector(self.namespace)
             else:
                 raise ValueError(f"Invalid environment: {args.environment}")
 
-        self.prometheus_api_client = PrometheusAPIClient(args.prometheus_endpoint)
+        self.prometheus_api_client = PrometheusAPIClient(
+            SLAPlannerDefaults.prometheus_endpoint
+        )
 
         self.num_req_predictor = LOAD_PREDICTORS[args.load_predictor](
             window_size=args.load_prediction_window_size,
@@ -312,12 +314,6 @@ async def start_sla_planner(runtime: DistributedRuntime, args: argparse.Namespac
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     # Common planner arguments
-    parser.add_argument(
-        "--namespace",
-        type=str,
-        default=SLAPlannerDefaults.namespace,
-        help="Namespace planner will look at",
-    )
     parser.add_argument(
         "--environment",
         type=str,
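Patch 47 pins the planner to the in-cluster Prometheus service and standardizes the port on 8000 via `DYNAMO_PORT`. A sketch of the address `defaults.py` now resolves, assuming the `{dynamo_namespace}-prometheus` service naming from the diff (the standalone function is illustrative):

```python
import os

def default_prometheus_endpoint(k8s_namespace: str) -> str:
    # Mirrors SLAPlannerDefaults: DYNAMO_PORT defaults to "8000",
    # DYNAMO_NAMESPACE to "vllm-disagg-planner".
    port = os.environ.get("DYNAMO_PORT", "8000")
    dynamo_namespace = os.environ.get("DYNAMO_NAMESPACE", "vllm-disagg-planner")
    return f"http://{dynamo_namespace}-prometheus.{k8s_namespace}.svc.cluster.local:{port}"

# default_prometheus_endpoint("team-a") ->
#   "http://vllm-disagg-planner-prometheus.team-a.svc.cluster.local:8000"
```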
From f6af0d537f7348e47e658909e42f0ce114cfcd85 Mon Sep 17 00:00:00 2001
From: Hannah Zhang
Date: Sat, 26 Jul 2025 00:34:22 -0700
Subject: [PATCH 48/58] feat: skip adjustments if no traffic

---
 .../planner/src/dynamo/planner/utils/planner_core.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/components/planner/src/dynamo/planner/utils/planner_core.py b/components/planner/src/dynamo/planner/utils/planner_core.py
index 40b8d9676d9..51aafa56268 100644
--- a/components/planner/src/dynamo/planner/utils/planner_core.py
+++ b/components/planner/src/dynamo/planner/utils/planner_core.py
@@ -167,6 +167,18 @@ def observe_metrics(self):
 
     async def make_adjustments(self):
         try:
+            # Check if metrics are valid (not NaN) - skip adjustment if no traffic
+            if (
+                math.isnan(self.last_metrics.ttft)
+                or math.isnan(self.last_metrics.itl)
+                or math.isnan(self.last_metrics.isl)
+                or math.isnan(self.last_metrics.osl)
+            ):
+                logger.info(
+                    "Metrics contain NaN values (no active requests), skipping adjustment"
+                )
+                return
+
             self.p_endpoints, self.d_endpoints = await self.get_workers_info()
             logger.info(
                 f"Number of prefill workers: {len(self.p_endpoints)}, number of decode workers: {len(self.d_endpoints)}"
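The guard added in patch 48 exists because the planner's averages come from PromQL expressions of the form `increase(sum)/increase(count)`, which evaluate to 0/0 when no requests arrived in the window. A small illustration of why `math.isnan` is the right test, in plain Python with illustrative names:

```python
import math

def window_average(sum_delta: float, count_delta: float) -> float:
    # increase(metric_sum[w]) / increase(metric_count[w]) in PromQL yields
    # NaN for an idle window; emulate that here (Python itself would raise).
    return sum_delta / count_delta if count_delta else float("nan")

ttft = window_average(0.0, 0.0)             # no traffic in the window
assert math.isnan(ttft)                     # NaN is not equal even to itself,
assert not (ttft > 0) and not (ttft <= 0)   # and every comparison is False
```

Without the early return, NaN would flow into the load predictors and poison every downstream scaling computation.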
From 445fe7455101f432ed383cbb81274fd59b4c70f6 Mon Sep 17 00:00:00 2001
From: Hannah Zhang
Date: Sat, 26 Jul 2025 00:41:19 -0700
Subject: [PATCH 49/58] docs: doc updates for planner deployment

---
 .../dynamo_deploy/sla_planner_deployment.md | 28 +++++++++++--------
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/docs/guides/dynamo_deploy/sla_planner_deployment.md b/docs/guides/dynamo_deploy/sla_planner_deployment.md
index 6d4cafd0940..d18d5d9f5aa 100644
--- a/docs/guides/dynamo_deploy/sla_planner_deployment.md
+++ b/docs/guides/dynamo_deploy/sla_planner_deployment.md
@@ -11,14 +11,14 @@ Quick deployment guide for the vLLM disaggregated planner with automatic scaling
 - **Frontend**: Serves requests and exposes `/metrics`
 - **Prometheus**: Scrapes frontend metrics every 5 seconds
 - **Planner**: Queries Prometheus and adjusts worker scaling every 60 seconds
-- **Workers**: VllmDecodeWorker and VllmPrefillWorker handle inference
+- **Workers**: prefill and backend workers handle inference
 
 ```mermaid
 flowchart LR
   Frontend --"/metrics"--> Prometheus
-  Prometheus --"scrape (5s)"--> Prometheus
+  Prometheus --"scrape"--> Prometheus
   Planner --"query API"--> Prometheus
-  Planner --"scaling decisions"--> Workers["VllmPrefillWorker<br/>VllmDecodeWorker"]
+  Planner --"scaling decisions"--> Workers["prefill<br/>backend"]
   Frontend -.->|"requests"| Workers
 ```
 
 ## Prerequisites
 - Kubernetes cluster with GPU nodes
 - `hf-token-secret` created in target namespace
@@ -45,12 +45,14 @@ Expected pods (all should be `1/1 Running`):
 vllm-disagg-planner-frontend-*          1/1 Running
 vllm-disagg-planner-prometheus-*        1/1 Running
 vllm-disagg-planner-planner-*           1/1 Running
-vllm-disagg-planner-vllmdecodeworker-*  1/1 Running
-vllm-disagg-planner-vllmprefillworker-* 1/1 Running
+vllm-disagg-planner-backend-*           1/1 Running
+vllm-disagg-planner-prefill-*           1/1 Running
 ```
 
 ## 2. Test the System
 
+**Important:** Streaming requests (`"stream": true`) are required for the planner to collect latency metrics and make scaling decisions. Non-streaming requests will produce successful inference outputs but won't provide the necessary telemetry for automatic scaling.
+
 ```bash
 # Port forward to frontend
 kubectl port-forward -n $NAMESPACE deployment/vllm-disagg-planner-frontend 8000:8000
@@ -68,7 +70,7 @@ curl http://localhost:8000/v1/chat/completions \
     ],
     "stream":true,
     "max_tokens": 30
-  }' | jq
+  }'
 ```
 
 ## 3. Monitor Scaling
 
@@ -77,9 +79,11 @@
 # Check planner logs for scaling decisions
 kubectl logs -n $NAMESPACE deployment/vllm-disagg-planner-planner --tail=10
 
-# Expected successful output:
+# Expected successful output (after streaming requests):
+# New adjustment interval started!
+# Observed num_req: X.XXX isl: X.XXX osl: X.XXX
+# "Observed ttft: X.XXXs itl: X.XXXs"
 # "Number of prefill workers: 1, number of decode workers: 1"
-# "Observed ttft: X.XXXs itl: X.XXXs" (after streaming requests)
 ```
 
 ### Metrics Requirements
@@ -91,9 +95,9 @@
 
 **Connection Issues:**
 ```bash
-# Verify Prometheus is accessible (default port 8000)
-kubectl port-forward -n $NAMESPACE deployment/vllm-disagg-planner-prometheus 9090:8000
-curl "http://localhost:9090/api/v1/query?query=up"
+# Verify Prometheus is accessible (runs on port 8000)
+kubectl port-forward -n $NAMESPACE deployment/vllm-disagg-planner-prometheus 8000:8000
+curl "http://localhost:8000/api/v1/query?query=up"
 ```
 
 **Missing Metrics:**
@@ -105,5 +109,5 @@
 
 **Worker Issues:**
 - Large models can take 10+ minutes to initialize
-- Check worker logs: `kubectl logs -n $NAMESPACE deployment/vllm-disagg-planner-vllmdecodeworker`
+- Check worker logs: `kubectl logs -n $NAMESPACE deployment/vllm-disagg-planner-backend`
 - Ensure GPU resources are available for workers
From bab714c52d6ce3bf37309b9c58e9033ceb8a4068 Mon Sep 17 00:00:00 2001
From: Hannah Zhang
Date: Sat, 26 Jul 2025 00:46:45 -0700
Subject: [PATCH 50/58] feat: delete k8s.sh

---
 k8s.sh | 73 ----------------------------------------------------------
 1 file changed, 73 deletions(-)
 delete mode 100755 k8s.sh

diff --git a/k8s.sh b/k8s.sh
deleted file mode 100755
index 08fb06e62ae..00000000000
--- a/k8s.sh
+++ /dev/null
@@ -1,73 +0,0 @@
-#!/usr/bin/env bash
-
-# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-set -euo pipefail
-
-# 1. Install Homebrew if missing
-if ! command -v brew &> /dev/null; then
-  echo "Homebrew not found—installing prerequisites and Homebrew…"
-  # Install build-time prerequisites
-  apt-get update
-  apt-get install -y build-essential procps curl file git
-
-  # Non-interactive Homebrew install
-  NONINTERACTIVE=1 \
-    /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
-
-  # Load brew into this shell
-  eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)"
-else
-  echo "Homebrew already installed, skipping."
-fi
-
-# 2. Ensure brew is up-to-date
-echo "Updating Homebrew…"
-brew update
-
-# 3. Install Azure CLI
-if ! command -v az &> /dev/null; then
-  echo "Installing Azure CLI (az)…"
-  brew install azure-cli
-else
-  echo "Azure CLI already installed, skipping."
-fi
-
-# 4. Install kubelogin
-if ! command -v kubelogin &> /dev/null; then
-  echo "Installing kubelogin…"
-  brew install Azure/kubelogin/kubelogin
-else
-  echo "kubelogin already installed, skipping."
-fi
-
-# 5. Install kubectl
-if ! command -v kubectl &> /dev/null; then
-  echo "Installing kubectl (kubernetes-cli)…"
-  brew install kubernetes-cli
-else
-  echo "kubectl already installed, skipping."
-fi
-
-echo "✅ All tools are installed and up-to-date."
-
-echo >> /root/.bashrc
-echo 'eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)"' >> /root/.bashrc
-eval "$(/home/linuxbrew/.linuxbrew/bin/brew shellenv)"
-
-az login
-az aks get-credentials --resource-group rg-aks-dynamo-dev --name aks-dynamo-dev
-kubelogin convert-kubeconfig -l azurecli
-kubectl auth can-i create deployments

From a29c397139a06cec23403cc03ffff8576c3a8357 Mon Sep 17 00:00:00 2001
From: Hannah Zhang
Date: Sat, 26 Jul 2025 08:20:32 -0700
Subject: [PATCH 51/58] docs: slight doc modification

---
 components/backends/vllm/deploy/disagg_planner.yaml | 12 ++++++------
 docs/guides/dynamo_deploy/sla_planner_deployment.md |  5 +++--
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/components/backends/vllm/deploy/disagg_planner.yaml b/components/backends/vllm/deploy/disagg_planner.yaml
index 90b17452e13..ce941c4d7f4 100644
--- a/components/backends/vllm/deploy/disagg_planner.yaml
+++ b/components/backends/vllm/deploy/disagg_planner.yaml
@@ -45,7 +45,7 @@ spec:
           memory: "16Gi"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.16
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
           workingDir: /workspace/components/backends/vllm
           command:
             - /bin/sh
@@ -55,7 +55,7 @@ spec:
     Planner:
       dynamoNamespace: vllm-disagg-planner
       envFromSecret: hf-token-secret
-      componentType: worker
+      componentType: planner
       replicas: 1
       livenessProbe:
         exec:
@@ -89,7 +89,7 @@ spec:
         mountPoint: /workspace/profiling_results
      extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.16
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
           workingDir: /workspace/components/planner/src/dynamo/planner
           args:
             - python
@@ -134,7 +134,7 @@ spec:
           memory: "2Gi"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.16
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
           workingDir: /workspace/components/backends/vllm
           command:
             - /bin/sh
@@ -176,7 +176,7 @@ spec:
           gpu: "1"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.16
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
           workingDir: /workspace/components/backends/vllm
           command:
             - /bin/sh
@@ -218,7 +218,7 @@ spec:
           gpu: "1"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.16
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
           workingDir: /workspace/components/backends/vllm
           command:
             - /bin/sh

diff --git a/docs/guides/dynamo_deploy/sla_planner_deployment.md b/docs/guides/dynamo_deploy/sla_planner_deployment.md
index d18d5d9f5aa..e79e25a8e05 100644
--- a/docs/guides/dynamo_deploy/sla_planner_deployment.md
+++ b/docs/guides/dynamo_deploy/sla_planner_deployment.md
@@ -80,10 +80,11 @@ curl http://localhost:8000/v1/chat/completions \
 kubectl logs -n $NAMESPACE deployment/vllm-disagg-planner-planner --tail=10
 
 # Expected successful output (after streaming requests):
+
 # New adjustment interval started!
 # Observed num_req: X.XXX isl: X.XXX osl: X.XXX
-# "Observed ttft: X.XXXs itl: X.XXXs"
-# "Number of prefill workers: 1, number of decode workers: 1"
+# Observed ttft: X.XXXs itl: X.XXXs
+# Number of prefill workers: 1, number of decode workers: 1
 ```

From daa3c4e2fd46d866575b688cce99a67b9192d8e6 Mon Sep 17 00:00:00 2001
From: hongkuan
Date: Sat, 26 Jul 2025 09:53:00 -0700
Subject: [PATCH 52/58] update resources

---
 .../backends/vllm/deploy/disagg_planner.yaml | 36 +++++++++----------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/components/backends/vllm/deploy/disagg_planner.yaml b/components/backends/vllm/deploy/disagg_planner.yaml
index ce941c4d7f4..1e92b370a56 100644
--- a/components/backends/vllm/deploy/disagg_planner.yaml
+++ b/components/backends/vllm/deploy/disagg_planner.yaml
@@ -38,11 +38,11 @@ spec:
           failureThreshold: 10
       resources:
         requests:
-          cpu: "4"
-          memory: "16Gi"
+          cpu: "32"
+          memory: "10Gi"
         limits:
-          cpu: "4"
-          memory: "16Gi"
+          cpu: "32"
+          memory: "10Gi"
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
@@ -78,11 +78,11 @@ spec:
           failureThreshold: 10
       resources:
         requests:
-          cpu: "1"
-          memory: "2Gi"
+          cpu: "10"
+          memory: "10Gi"
         limits:
-          cpu: "1"
-          memory: "2Gi"
+          cpu: "10"
+          memory: "10Gi"
       pvc:
         create: false
         name: profiling-pvc
@@ -127,11 +127,11 @@ spec:
           failureThreshold: 10
       resources:
         requests:
-          cpu: "1"
-          memory: "2Gi"
+          cpu: "10"
+          memory: "10Gi"
         limits:
-          cpu: "1"
-          memory: "2Gi"
+          cpu: "10"
+          memory: "10Gi"
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
@@ -141,7 +141,7 @@ spec:
             - -c
           args:
             - "python3 -m dynamo.planner.prometheus"
-    backend:
+    VllmDecodeWorker:
       dynamoNamespace: vllm-disagg-planner
       envFromSecret: hf-token-secret
       componentType: worker
@@ -167,11 +167,11 @@ spec:
           failureThreshold: 10
       resources:
         requests:
-          cpu: "10"
+          cpu: "32"
           memory: "40Gi"
           gpu: "1"
         limits:
-          cpu: "10"
+          cpu: "32"
           memory: "40Gi"
           gpu: "1"
       extraPodSpec:
         mainContainer:
@@ -183,7 +183,7 @@ spec:
            - -c
           args:
             - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B 2>&1 | tee /tmp/vllm.log"
-    prefill:
+    VllmPrefillWorker:
       dynamoNamespace: vllm-disagg-planner
       envFromSecret: hf-token-secret
       componentType: worker
@@ -209,11 +209,11 @@ spec:
           failureThreshold: 10
       resources:
         requests:
-          cpu: "10"
+          cpu: "32"
           memory: "40Gi"
           gpu: "1"
         limits:
-          cpu: "10"
+          cpu: "32"
           memory: "40Gi"
           gpu: "1"
       extraPodSpec:

From 539ff3eb24f73617940de78994578a7c64902b7c Mon Sep 17 00:00:00 2001
From: hongkuan
Date: Sat, 26 Jul 2025 10:05:42 -0700
Subject: [PATCH 53/58] update readme

---
 components/backends/vllm/README.md                  | 3 +++
 docs/architecture/pre_deployment_profiling.md       | 2 +-
 docs/architecture/sla_planner.md                    | 4 ++--
 docs/guides/dynamo_deploy/sla_planner_deployment.md | 2 ++
 4 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/components/backends/vllm/README.md b/components/backends/vllm/README.md
index 53f5d4d8531..3ea9b9d0f76 100644
--- a/components/backends/vllm/README.md
+++ b/components/backends/vllm/README.md
@@ -112,6 +112,7 @@ For Kubernetes deployment, YAML manifests are provided in the `deploy/` director
 - `agg_router.yaml` - Aggregated serving with KV routing
 - `disagg.yaml` - Disaggregated serving
 - `disagg_router.yaml` - Disaggregated serving with KV routing
+- `disagg_planner.yaml` - Disaggregated serving with [SLA Planner](../../../docs/architecture/sla_planner.md). See [SLA Planner Deployment Guide](../../../docs/guides/dynamo_deploy/sla_planner_deployment.md) for more details.
 
 #### Prerequisites
 
@@ -124,6 +125,8 @@ For Kubernetes deployment, YAML manifests are provided in the `deploy/` director
   # Update the image references in the YAML files
   ```
 
+- **Pre-Deployment Profiling (if Using SLA Planner)**: Follow the [pre-deployment profiling guide](../../../docs/architecture/pre_deployment_profiling.md) to run pre-deployment profiling. The results will be saved to the `profiling-pvc` PVC and queried by the SLA Planner.
+
 - **Port Forwarding**: After deployment, forward the frontend service to access the API:
   ```bash
   kubectl port-forward deployment/vllm-v1-disagg-frontend- 8080:8000

diff --git a/docs/architecture/pre_deployment_profiling.md b/docs/architecture/pre_deployment_profiling.md
index d0c745b4422..ea2ddaab2c2 100644
--- a/docs/architecture/pre_deployment_profiling.md
+++ b/docs/architecture/pre_deployment_profiling.md
@@ -29,7 +29,7 @@ The script will recommend the best TP size for prefill and decode, as well as th
 2025-05-16 15:20:24 - __main__ - INFO - Suggested planner upper/lower bound for decode kv cache utilization: 0.20/0.10
 ```
 
-After finding the best TP size for prefill and decode, the script will then interpolate the TTFT with ISL and ITL with active KV cache and decode context length. This is to provide a more accurate estimation of the performance when ISL and OSL changes and will be used in the sla-planner. The results will be saved to `/_tp_interpolation`.
+After finding the best TP size for prefill and decode, the script will then interpolate the TTFT with ISL and ITL with active KV cache and decode context length. This is to provide a more accurate estimation of the performance when ISL and OSL changes and will be used in the sla-planner. The results will be saved to `/_tp_interpolation`. Please change the prefill and decode TP size in the config file to match the best TP sizes obtained from the profiling script.
 
 ### Prefill Interpolation Data

diff --git a/docs/architecture/sla_planner.md b/docs/architecture/sla_planner.md
index 6e3ec836d27..3c2fcf428cf 100644
--- a/docs/architecture/sla_planner.md
+++ b/docs/architecture/sla_planner.md
@@ -8,7 +8,7 @@ The SLA (Service Level Agreement)-based planner is an intelligent autoscaling sy
 > Currently, SLA-based planner only supports disaggregated setup.
 
 > [!WARNING]
-> Bare metal deployment with local connector is deprecated. The only option to deploy SLA-based planner is via k8s. We will update the examples in this document soon.
+> Bare metal deployment with local connector is deprecated. Please deploy the SLA planner in k8s.
 
 ## Features
 
@@ -115,4 +115,4 @@ kubectl apply -f disagg_planner.yaml -n {$NAMESPACE}
 ```
 
 > [!NOTE]
-> The SLA planner requires a frontend that reports metrics at `/metrics` HTTP endpoint with number of requests, ISL, OSL, TTFT, ITL in the correct format. The VLLM frontend provides these metrics automatically.
+> The SLA planner requires a frontend that reports metrics at `/metrics` HTTP endpoint with number of requests, ISL, OSL, TTFT, ITL in the correct format. The dynamo frontend provides these metrics automatically.

diff --git a/docs/guides/dynamo_deploy/sla_planner_deployment.md b/docs/guides/dynamo_deploy/sla_planner_deployment.md
index e79e25a8e05..04af6b1d515 100644
--- a/docs/guides/dynamo_deploy/sla_planner_deployment.md
+++ b/docs/guides/dynamo_deploy/sla_planner_deployment.md
@@ -25,6 +25,8 @@ flowchart LR
 ## Prerequisites
 - Kubernetes cluster with GPU nodes
 - `hf-token-secret` created in target namespace
+- [Pre-Deployment Profiling](../../architecture/pre_deployment_profiling.md) results saved to `profiling-pvc` PVC.
+- Prefill and decode worker uses the best parallelization mapping suggested by the pre-deployment profiling script.
 
 ```bash
 export NAMESPACE=your-namespace
From f99412853c010b528ff38689078e79543a049981 Mon Sep 17 00:00:00 2001
From: Hannah Zhang
Date: Mon, 28 Jul 2025 09:09:33 -0700
Subject: [PATCH 54/58] feat: address coderabbit MR comments

---
 components/backends/vllm/README.md            | 11 -----
 .../backends/vllm/deploy/disagg_planner.yaml  | 38 ++++++++--------
 .../planner/src/dynamo/planner/defaults.py    | 11 ++---
 components/planner/src/dynamo/planner/kube.py | 23 +++++-----
 .../dynamo/planner/kubernetes_connector.py    | 43 ++++++++++++++++---
 .../planner/src/dynamo/planner/planner_sla.py |  2 +-
 6 files changed, 73 insertions(+), 55 deletions(-)

diff --git a/components/backends/vllm/README.md b/components/backends/vllm/README.md
index 3ea9b9d0f76..d93753087b6 100644
--- a/components/backends/vllm/README.md
+++ b/components/backends/vllm/README.md
@@ -154,17 +154,6 @@ spec:
   ...
 ```
 
-To change `DYN_LOG` level, edit the yaml file by adding
-
-```yaml
-...
-spec:
-  envs:
-    - name: DYN_LOG
-      value: "debug" # or other log levels
-  ...
-```
-
 ### Testing the Deployment
 
 Send a test request to verify your deployment:

diff --git a/components/backends/vllm/deploy/disagg_planner.yaml b/components/backends/vllm/deploy/disagg_planner.yaml
index 1e92b370a56..fab9dfdc7d7 100644
--- a/components/backends/vllm/deploy/disagg_planner.yaml
+++ b/components/backends/vllm/deploy/disagg_planner.yaml
@@ -78,14 +78,14 @@ spec:
           failureThreshold: 10
       resources:
         requests:
-          cpu: "10"
-          memory: "10Gi"
+          cpu: "2"
+          memory: "2Gi"
         limits:
-          cpu: "10"
-          memory: "10Gi"
+          cpu: "2"
+          memory: "2Gi"
       pvc:
         create: false
-        name: profiling-pvc
+        name: profiling-pvc # Must be pre-created before deployment and SLA profiler must have been run
         mountPoint: /workspace/profiling_results
       extraPodSpec:
@@ -127,11 +127,11 @@ spec:
           failureThreshold: 10
       resources:
         requests:
-          cpu: "10"
-          memory: "10Gi"
+          cpu: "2"
+          memory: "2Gi"
         limits:
-          cpu: "10"
-          memory: "10Gi"
+          cpu: "2"
+          memory: "2Gi"
       extraPodSpec:
         mainContainer:
           image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.17
@@ -141,7 +141,7 @@ spec:
             - -c
           args:
             - "python3 -m dynamo.planner.prometheus"
-    VllmDecodeWorker:
+    backend:
       dynamoNamespace: vllm-disagg-planner
       envFromSecret: hf-token-secret
       componentType: worker
@@ -167,12 +167,12 @@ spec:
           failureThreshold: 10
       resources:
         requests:
-          cpu: "32"
-          memory: "40Gi"
+          cpu: "8"
+          memory: "16Gi"
           gpu: "1"
         limits:
-          cpu: "32"
-          memory: "40Gi"
+          cpu: "8"
+          memory: "16Gi"
           gpu: "1"
       extraPodSpec:
         mainContainer:
@@ -183,7 +183,7 @@ spec:
             - -c
           args:
             - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B 2>&1 | tee /tmp/vllm.log"
-    VllmPrefillWorker:
+    prefill:
       dynamoNamespace: vllm-disagg-planner
       envFromSecret: hf-token-secret
       componentType: worker
@@ -209,12 +209,12 @@ spec:
           failureThreshold: 10
       resources:
         requests:
-          cpu: "32"
-          memory: "40Gi"
+          cpu: "8"
+          memory: "16Gi"
           gpu: "1"
         limits:
-          cpu: "32"
-          memory: "40Gi"
+          cpu: "8"
+          memory: "16Gi"
           gpu: "1"
       extraPodSpec:
         mainContainer:

diff --git a/components/planner/src/dynamo/planner/defaults.py b/components/planner/src/dynamo/planner/defaults.py
index e0f68b2ab70..7b866fef805 100644
--- a/components/planner/src/dynamo/planner/defaults.py
+++ b/components/planner/src/dynamo/planner/defaults.py
@@ -15,7 +15,7 @@
 
 import os
 
-from dynamo.planner.kube import KubernetesAPI
+from dynamo.planner.kube import get_current_k8s_namespace
 
 # Source of truth for planner defaults
 
@@ -43,14 +43,15 @@ class LoadPlannerDefaults(BasePlannerDefaults):
 def _get_default_prometheus_endpoint(port: str, namespace: str):
     """Compute default prometheus endpoint using environment variables and Kubernetes service discovery"""
 
-    k8s_api = KubernetesAPI()
-    k8s_namespace = k8s_api.current_namespace
-
+    k8s_namespace = get_current_k8s_namespace()
     if k8s_namespace and k8s_namespace != "default":
         prometheus_service = f"{namespace}-prometheus"
         return f"http://{prometheus_service}.{k8s_namespace}.svc.cluster.local:{port}"
     else:
-        raise RuntimeError("Can't find a prometheus endpoint for the planner!")
+        raise RuntimeError(
+            f"Cannot determine Prometheus endpoint. Running in namespace '{k8s_namespace}'. "
+            "Ensure the planner is deployed in a Kubernetes cluster with proper namespace configuration."
+        )

diff --git a/components/planner/src/dynamo/planner/kube.py b/components/planner/src/dynamo/planner/kube.py
index 127d0392e2a..7b9e846bd57 100644
--- a/components/planner/src/dynamo/planner/kube.py
+++ b/components/planner/src/dynamo/planner/kube.py
@@ -20,6 +20,16 @@
 from kubernetes.config.config_exception import ConfigException
 
 
+def get_current_k8s_namespace() -> str:
+    """Get the current namespace if running inside a k8s cluster"""
+    try:
+        with open("/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r") as f:
+            return f.read().strip()
+    except FileNotFoundError:
+        # Fallback to 'default' if not running in k8s
+        return "default"
+
+
 class KubernetesAPI:
     def __init__(self, k8s_namespace: Optional[str] = None):
         # Load kubernetes configuration
@@ -29,18 +39,7 @@ def __init__(self, k8s_namespace: Optional[str] = None):
             config.load_kube_config()  # for out-of-cluster deployment
 
         self.custom_api = client.CustomObjectsApi()
-        self.current_namespace = k8s_namespace or self._get_current_namespace()
-
-    def _get_current_namespace(self) -> str:
-        """Get the current namespace if running inside a k8s cluster"""
-        try:
-            with open(
-                "/var/run/secrets/kubernetes.io/serviceaccount/namespace", "r"
-            ) as f:
-                return f.read().strip()
-        except FileNotFoundError:
-            # Fallback to 'default' if not running in k8s
-            return "default"
+        self.current_namespace = k8s_namespace or get_current_k8s_namespace()

diff --git a/components/planner/src/dynamo/planner/kubernetes_connector.py b/components/planner/src/dynamo/planner/kubernetes_connector.py
index a70e71e8416..9a9933507b8 100644
--- a/components/planner/src/dynamo/planner/kubernetes_connector.py
+++ b/components/planner/src/dynamo/planner/kubernetes_connector.py
@@ -76,19 +76,48 @@ async def remove_component(self, component_name: str, blocking: bool = True):
                 self._get_graph_deployment_name(deployment)
             )
 
-    async def set_component_replicas(
-        self, target_replicas: dict[str, int], blocking: bool = True
-    ):
-        """Set the replicas for multiple components at once"""
-
+    async def _validate_components_same_deployment(
+        self, target_replicas: dict[str, int]
+    ) -> dict:
+        """
+        Validate that all target components belong to the same DynamoGraphDeployment.
+        """
+        if not target_replicas:
+            raise ValueError("target_replicas cannot be empty")
+
+        # Get deployment for first component
+        first_component = next(iter(target_replicas))
         deployment = await self.kube_api.get_graph_deployment(
-            next(iter(target_replicas)), self.dynamo_namespace
+            first_component, self.dynamo_namespace
         )
         if deployment is None:
             raise ValueError(
-                f"Graph {next(iter(target_replicas))} not found for namespace {self.dynamo_namespace}"
+                f"Component {first_component} not found in namespace {self.dynamo_namespace}"
+            )
+
+        # Validate that all components belong to the same DGD
+        graph_name = deployment["metadata"]["name"]
+        for component in target_replicas:
+            comp_deployment = await self.kube_api.get_graph_deployment(
+                component, self.dynamo_namespace
             )
+            if comp_deployment is None:
+                raise ValueError(
+                    f"Component {component} not found in namespace {self.dynamo_namespace}"
+                )
+            if comp_deployment["metadata"]["name"] != graph_name:
+                raise ValueError(
+                    f"Component {component} belongs to graph '{comp_deployment['metadata']['name']}' "
+                    f"but expected graph '{graph_name}'. All components must belong to the same GraphDeployment."
+                )
+        return deployment
+
+    async def set_component_replicas(
+        self, target_replicas: dict[str, int], blocking: bool = True
+    ):
+        """Set the replicas for multiple components at once"""
+        deployment = await self._validate_components_same_deployment(target_replicas)
         if not await self.kube_api.is_deployment_ready(
             self._get_graph_deployment_name(deployment)
         ):

diff --git a/components/planner/src/dynamo/planner/planner_sla.py b/components/planner/src/dynamo/planner/planner_sla.py
index 65b546ba7b2..788c65862d2 100644
--- a/components/planner/src/dynamo/planner/planner_sla.py
+++ b/components/planner/src/dynamo/planner/planner_sla.py
@@ -43,7 +43,7 @@ async def init_planner(runtime: DistributedRuntime, args):
     component = runtime.namespace(SLAPlannerDefaults.namespace).component("Planner")
     await component.create_service()
 
-    async def generate(self, request: RequestType):
+    async def generate(request: RequestType):
         """Dummy endpoint to satisfy that each component has an endpoint"""
         yield "mock endpoint"
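The validation helper added here changes `set_component_replicas` from trusting its callers to failing fast: a target map that mixes components from two different graphs now raises before anything is scaled. A hedged usage sketch (component names and the surrounding wiring are illustrative; the connector API is the one defined above):

```python
import asyncio

from dynamo.planner.kubernetes_connector import KubernetesConnector

async def scale_both() -> None:
    connector = KubernetesConnector("vllm-disagg-planner")
    # Both keys must resolve to the same DynamoGraphDeployment; otherwise
    # _validate_components_same_deployment raises ValueError up front,
    # instead of scaling one component and then failing on the other.
    await connector.set_component_replicas({"prefill": 2, "backend": 2})

asyncio.run(scale_both())
```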
From f601f88498017dd94c41ee746810a9f68a080cdf Mon Sep 17 00:00:00 2001
From: hongkuan
Date: Mon, 28 Jul 2025 10:02:26 -0700
Subject: [PATCH 55/58] fix pytest

---
 components/planner/src/dynamo/planner/defaults.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/components/planner/src/dynamo/planner/defaults.py b/components/planner/src/dynamo/planner/defaults.py
index 7b866fef805..2eae047167f 100644
--- a/components/planner/src/dynamo/planner/defaults.py
+++ b/components/planner/src/dynamo/planner/defaults.py
@@ -13,9 +13,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import logging
 import os
 
 from dynamo.planner.kube import get_current_k8s_namespace
+from dynamo.runtime.logging import configure_dynamo_logging
+
+configure_dynamo_logging()
+logger = logging.getLogger(__name__)
 
 # Source of truth for planner defaults
 
@@ -48,10 +53,11 @@ def _get_default_prometheus_endpoint(port: str, namespace: str):
         prometheus_service = f"{namespace}-prometheus"
         return f"http://{prometheus_service}.{k8s_namespace}.svc.cluster.local:{port}"
     else:
-        raise RuntimeError(
+        logger.warning(
             f"Cannot determine Prometheus endpoint. Running in namespace '{k8s_namespace}'. "
             "Ensure the planner is deployed in a Kubernetes cluster with proper namespace configuration."
         )
+        return f"{namespace}-prometheus"

From 69d64dcfa2958c3e4301308c8d20b9c88a8e1fd1 Mon Sep 17 00:00:00 2001
From: hongkuan
Date: Mon, 28 Jul 2025 11:02:47 -0700
Subject: [PATCH 56/58] mypy

---
 .../planner/src/dynamo/planner/utils/planner_core.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/components/planner/src/dynamo/planner/utils/planner_core.py b/components/planner/src/dynamo/planner/utils/planner_core.py
index 51aafa56268..7ba3e6f4ef5 100644
--- a/components/planner/src/dynamo/planner/utils/planner_core.py
+++ b/components/planner/src/dynamo/planner/utils/planner_core.py
@@ -167,15 +167,19 @@ def observe_metrics(self):
 
     async def make_adjustments(self):
         try:
-            # Check if metrics are valid (not NaN) - skip adjustment if no traffic
+            # Check if metrics are valid (not None and not NaN) - skip adjustment if no traffic
             if (
-                math.isnan(self.last_metrics.ttft)
+                self.last_metrics.ttft is None
+                or self.last_metrics.itl is None
+                or self.last_metrics.isl is None
+                or self.last_metrics.osl is None
+                or math.isnan(self.last_metrics.ttft)
                 or math.isnan(self.last_metrics.itl)
                 or math.isnan(self.last_metrics.isl)
                 or math.isnan(self.last_metrics.osl)
             ):
                 logger.info(
-                    "Metrics contain NaN values (no active requests), skipping adjustment"
+                    "Metrics contain None or NaN values (no active requests), skipping adjustment"
                 )
                 return
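Patches 55 and 56 are both CI fixes: `SLAPlannerDefaults` computes its Prometheus endpoint at class-definition time, so before patch 55 merely importing `dynamo.planner.defaults` outside a cluster raised `RuntimeError` and broke pytest collection, while patch 56 satisfies mypy's checks on the Optional metric fields. A hedged sketch of the kind of test the import-time fallback unblocks (assumes a non-cluster environment where `get_current_k8s_namespace()` returns "default"; the test itself is illustrative):

```python
def test_defaults_importable_outside_cluster():
    # Import no longer raises; it logs a warning and falls back to the
    # bare "<namespace>-prometheus" service name introduced in patch 55.
    from dynamo.planner.defaults import SLAPlannerDefaults

    assert SLAPlannerDefaults.prometheus_endpoint == "vllm-disagg-planner-prometheus"
```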
From 1fdf3e9a8d980eea10cd24308a55b4d951752f53 Mon Sep 17 00:00:00 2001
From: Hannah Zhang
Date: Mon, 28 Jul 2025 11:56:03 -0700
Subject: [PATCH 57/58] feat: addressing MR comments

---
 .../src/dynamo/planner/utils/planner_core.py | 29 ++++--
 .../src/dynamo/planner/utils/prometheus.py   | 95 +++++++++----------
 2 files changed, 65 insertions(+), 59 deletions(-)

diff --git a/components/planner/src/dynamo/planner/utils/planner_core.py b/components/planner/src/dynamo/planner/utils/planner_core.py
index 7ba3e6f4ef5..c93d4722dc4 100644
--- a/components/planner/src/dynamo/planner/utils/planner_core.py
+++ b/components/planner/src/dynamo/planner/utils/planner_core.py
@@ -47,6 +47,24 @@ class Metrics:
     p_load: Optional[float] = None
     d_load: Optional[float] = None
 
+    def is_valid(self) -> bool:
+        """
+        Check if all metrics are valid (not None and not NaN).
+
+        Returns:
+            True if all metrics are valid, False otherwise
+        """
+        return (
+            self.ttft is not None
+            and self.itl is not None
+            and self.isl is not None
+            and self.osl is not None
+            and not math.isnan(self.ttft)
+            and not math.isnan(self.itl)
+            and not math.isnan(self.isl)
+            and not math.isnan(self.osl)
+        )
+
 
 class Planner:
     def __init__(self, runtime: DistributedRuntime, args: argparse.Namespace):
@@ -168,16 +186,7 @@ def observe_metrics(self):
 
     async def make_adjustments(self):
         try:
             # Check if metrics are valid (not None and not NaN) - skip adjustment if no traffic
-            if (
-                self.last_metrics.ttft is None
-                or self.last_metrics.itl is None
-                or self.last_metrics.isl is None
-                or self.last_metrics.osl is None
-                or math.isnan(self.last_metrics.ttft)
-                or math.isnan(self.last_metrics.itl)
-                or math.isnan(self.last_metrics.isl)
-                or math.isnan(self.last_metrics.osl)
-            ):
+            if not self.last_metrics.is_valid():
                 logger.info(
                     "Metrics contain None or NaN values (no active requests), skipping adjustment"
                 )
                 return

diff --git a/components/planner/src/dynamo/planner/utils/prometheus.py b/components/planner/src/dynamo/planner/utils/prometheus.py
index 46982b4e092..586c3efacc9 100644
--- a/components/planner/src/dynamo/planner/utils/prometheus.py
+++ b/components/planner/src/dynamo/planner/utils/prometheus.py
@@ -27,46 +27,55 @@ class PrometheusAPIClient:
     def __init__(self, url: str):
         self.prom = PrometheusConnect(url=url, disable_ssl=True)
 
-    def get_avg_inter_token_latency(self, interval: str):
+    def _get_average_metric(
+        self, metric_name: str, interval: str, operation_name: str
+    ) -> float:
+        """
+        Helper method to get average metrics using the pattern:
+        increase(metric_sum[interval])/increase(metric_count[interval])
+
+        Args:
+            metric_name: Base metric name (e.g., 'nv_llm_http_service_inter_token_latency_seconds')
+            interval: Time interval for the query (e.g., '60s')
+            operation_name: Human-readable name for error logging
+
+        Returns:
+            Average metric value or 0 if no data/error
+        """
         try:
-            result = self.prom.custom_query(
-                query=f"increase(nv_llm_http_service_inter_token_latency_seconds_sum[{interval}])/increase(nv_llm_http_service_inter_token_latency_seconds_count[{interval}])",
-            )
+            query = f"increase({metric_name}_sum[{interval}])/increase({metric_name}_count[{interval}])"
+            result = self.prom.custom_query(query=query)
             if not result:
                 # No data available yet (no requests made) - return 0 silently
                 return 0
             return float(result[0]["value"][1])
         except Exception as e:
-            logger.error(f"Error getting avg inter token latency: {e}")
+            logger.error(f"Error getting {operation_name}: {e}")
             return 0
 
+    def get_avg_inter_token_latency(self, interval: str):
+        return self._get_average_metric(
+            "nv_llm_http_service_inter_token_latency_seconds",
+            interval,
+            "avg inter token latency",
+        )
+
     def get_avg_time_to_first_token(self, interval: str):
-        try:
-            result = self.prom.custom_query(
-                query=f"increase(nv_llm_http_service_time_to_first_token_seconds_sum[{interval}])/increase(nv_llm_http_service_time_to_first_token_seconds_count[{interval}])",
-            )
-            if not result:
-                # No data available yet (no requests made) - return 0 silently
-                return 0
-            return float(result[0]["value"][1])
-        except Exception as e:
-            logger.error(f"Error getting avg time to first token: {e}")
-            return 0
+        return self._get_average_metric(
+            "nv_llm_http_service_time_to_first_token_seconds",
+            interval,
+            "avg time to first token",
+        )
 
     def get_avg_request_duration(self, interval: str):
-        try:
-            result = self.prom.custom_query(
-                query=f"increase(nv_llm_http_service_request_duration_seconds_sum[{interval}])/increase(nv_llm_http_service_request_duration_seconds_count[{interval}])",
-            )
-            if not result:
-                # No data available yet (no requests made) - return 0 silently
-                return 0
-            return float(result[0]["value"][1])
-        except Exception as e:
-            logger.error(f"Error getting avg request duration: {e}")
-            return 0
+        return self._get_average_metric(
+            "nv_llm_http_service_request_duration_seconds",
+            interval,
+            "avg request duration",
+        )
 
     def get_avg_request_count(self, interval: str):
+        # This function follows a different query pattern than the other metrics
         try:
             raw_res = self.prom.custom_query(
                 query=f"increase(nv_llm_http_service_requests_total[{interval}])"
@@ -81,27 +90,15 @@ def get_avg_request_count(self, interval: str):
         return 0
 
     def get_avg_input_sequence_tokens(self, interval: str):
-        try:
-            result = self.prom.custom_query(
-                query=f"increase(nv_llm_http_service_input_sequence_tokens_sum[{interval}])/increase(nv_llm_http_service_input_sequence_tokens_count[{interval}])",
-            )
-            if not result:
-                # No data available yet (no requests made) - return 0 silently
-                return 0
-            return float(result[0]["value"][1])
-        except Exception as e:
-            logger.error(f"Error getting avg input sequence tokens: {e}")
-            return 0
+        return self._get_average_metric(
+            "nv_llm_http_service_input_sequence_tokens",
+            interval,
+            "avg input sequence tokens",
+        )
 
     def get_avg_output_sequence_tokens(self, interval: str):
-        try:
-            result = self.prom.custom_query(
-                query=f"increase(nv_llm_http_service_output_sequence_tokens_sum[{interval}])/increase(nv_llm_http_service_output_sequence_tokens_count[{interval}])",
-            )
-            if not result:
-                # No data available yet (no requests made) - return 0 silently
-                return 0
-            return float(result[0]["value"][1])
-        except Exception as e:
-            logger.error(f"Error getting avg output sequence tokens: {e}")
-            return 0
+        return self._get_average_metric(
+            "nv_llm_http_service_output_sequence_tokens",
+            interval,
+            "avg output sequence tokens",
+        )

From bf58bd01823ff00a4f5252b6ee856163cdadb04a Mon Sep 17 00:00:00 2001
From: Hannah Zhang
Date: Mon, 28 Jul 2025 11:56:33 -0700
Subject: [PATCH 58/58] feat: addressing MR comments

---
 .../planner/src/dynamo/planner/utils/planner_core.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/components/planner/src/dynamo/planner/utils/planner_core.py b/components/planner/src/dynamo/planner/utils/planner_core.py
index c93d4722dc4..ffe532d216b 100644
--- a/components/planner/src/dynamo/planner/utils/planner_core.py
+++ b/components/planner/src/dynamo/planner/utils/planner_core.py
@@ -48,12 +48,7 @@ class Metrics:
     d_load: Optional[float] = None
 
     def is_valid(self) -> bool:
-        """
-        Check if all metrics are valid (not None and not NaN).
-
-        Returns:
-            True if all metrics are valid, False otherwise
-        """
+        """Check if all metrics are valid (not None and not NaN)."""
         return (
             self.ttft is not None
             and self.itl is not None
@@ -185,7 +180,7 @@ def observe_metrics(self):
 
     async def make_adjustments(self):
         try:
-            # Check if metrics are valid (not None and not NaN) - skip adjustment if no traffic
+            # Skip adjustment if no traffic
             if not self.last_metrics.is_valid():
                 logger.info(
                     "Metrics contain None or NaN values (no active requests), skipping adjustment"