Merged
25 commits
b00cc28
add sub component type
tmonty12 Sep 19, 2025
433c71e
add testing
tmonty12 Sep 19, 2025
11f9e59
planner using prometheus env var endpoint
tmonty12 Sep 22, 2025
8eb0665
planner uses subComponentType and testingt
tmonty12 Sep 23, 2025
ea84c64
use deployment validation error
tmonty12 Sep 23, 2025
0a71b68
add back copyright header
tmonty12 Sep 23, 2025
3ec215d
backwards compatibility with framework component name
tmonty12 Sep 23, 2025
411c02f
update pre deployment profiling to use subComponentType
tmonty12 Sep 24, 2025
c59dd08
small comments
tmonty12 Sep 24, 2025
8e5dfda
update planner manifest to remove prometheus svc and use subComponent…
tmonty12 Sep 24, 2025
69bc1a9
update sla planner deployment docs
tmonty12 Sep 24, 2025
7dc6696
fix doc link
tmonty12 Sep 24, 2025
752b4ac
update profiler config and fix ci
tmonty12 Sep 24, 2025
0761925
small fixes
tmonty12 Sep 24, 2025
51782ff
more small fixes
tmonty12 Sep 24, 2025
3cc6677
revert changes to profiler - will do so in follow on PR
tmonty12 Sep 24, 2025
88b4181
args not optional
tmonty12 Sep 24, 2025
4da6983
small docs update
tmonty12 Sep 24, 2025
20504a2
properly parse prometheus metrics
tmonty12 Sep 26, 2025
3661f5d
fix ci
tmonty12 Sep 26, 2025
74e054c
fix virtual_connector
tmonty12 Sep 26, 2025
823327d
fix mypy
tmonty12 Sep 26, 2025
2c85f2d
remove prometheus server
tmonty12 Sep 26, 2025
0b54bca
pc
tedzhouhk Sep 26, 2025
15d524e
add subComponentType, remove prometheus installation, remove service …
tmonty12 Sep 28, 2025
37 changes: 2 additions & 35 deletions components/backends/sglang/deploy/disagg_planner.yaml
@@ -61,45 +61,11 @@ spec:
           --backend=sglang
           --adjustment-interval=60
           --profile-results-dir=/data/profiling_results
-    Prometheus: # NOTE: this is set on Prometheus to ensure a service is created for the Prometheus component. This is a workaround and should be managed differently.
-      dynamoNamespace: dynamo
-      componentType: frontend
-      replicas: 1
-      envs:
-        - name: PYTHONPATH
-          value: "/workspace/components/planner/src"
-      livenessProbe:
-        exec:
-          command:
-            - /bin/sh
-            - -c
-            - "exit 0"
-        periodSeconds: 60
-        timeoutSeconds: 30
-        failureThreshold: 10
-      readinessProbe:
-        exec:
-          command:
-            - /bin/sh
-            - -c
-            - "exit 0"
-        initialDelaySeconds: 30
-        periodSeconds: 60
-        timeoutSeconds: 30
-        failureThreshold: 10
-      extraPodSpec:
-        mainContainer:
-          image: my-registry/sglang-runtime:my-tag
-          workingDir: /workspace/components/backends/sglang
-          command:
-            - /bin/sh
-            - -c
-          args:
-            - "python3 -m dynamo.planner.prometheus"
     decode:
       dynamoNamespace: dynamo
       envFromSecret: hf-token-secret
       componentType: worker
+      subComponentType: decode
       replicas: 2
       resources:
         limits:
@@ -131,6 +97,7 @@ spec:
       dynamoNamespace: dynamo
       envFromSecret: hf-token-secret
       componentType: worker
+      subComponentType: prefill
       replicas: 2
       resources:
         limits:
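The same swap — dropping the dummy `Prometheus` frontend service and tagging workers with `subComponentType` — is applied to the trtllm and vllm manifests below. As a rough illustration of what the new field enables, here is a minimal Python sketch (not the repository's actual connector code; the enum shape and the service-dict layout are assumptions) of how a planner could look up prefill and decode workers by role instead of by hard-coded service name:

```python
from enum import Enum


class SubComponentType(str, Enum):
    # Values mirror the subComponentType fields added in the manifests above.
    PREFILL = "prefill"
    DECODE = "decode"


def find_worker_service(services: dict, role: SubComponentType) -> str:
    """Return the first worker service tagged with the given subComponentType.

    `services` is assumed to be the DGD's spec.services mapping as plain dicts.
    """
    for name, spec in services.items():
        if (
            spec.get("componentType") == "worker"
            and spec.get("subComponentType") == role.value
        ):
            return name
    raise LookupError(f"no worker service with subComponentType={role.value!r}")
```

With the sglang manifest above, `find_worker_service(services, SubComponentType.DECODE)` would resolve to the `decode` service no matter what the service is named, which is what makes the planner backend-agnostic.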
42 changes: 2 additions & 40 deletions components/backends/trtllm/deploy/disagg_planner.yaml
@@ -41,9 +41,6 @@ spec:
       envFromSecret: hf-token-secret
       componentType: planner
       replicas: 1
-      envs:
-        - name: PROMETHEUS_PORT
-          value: "8000"
       livenessProbe:
         exec:
           command:
@@ -84,47 +81,11 @@
         - --adjustment-interval=60
         - --profile-results-dir=/data/profiling_results
         - --prometheus-port=9085
-    Prometheus: # NOTE: this is set on Prometheus to ensure a service is created for the Prometheus component. This is a workaround and should be managed differently.
-      dynamoNamespace: trtllm-disagg-planner
-      componentType: frontend
-      replicas: 1
-      envs:
-        - name: PYTHONPATH
-          value: "/workspace/components/planner/src"
-        - name: PROMETHEUS_PORT
-          value: "8000"
-      livenessProbe:
-        exec:
-          command:
-            - /bin/sh
-            - -c
-            - "exit 0"
-        periodSeconds: 60
-        timeoutSeconds: 30
-        failureThreshold: 10
-      readinessProbe:
-        exec:
-          command:
-            - /bin/sh
-            - -c
-            - "exit 0"
-        initialDelaySeconds: 30
-        periodSeconds: 60
-        timeoutSeconds: 30
-        failureThreshold: 10
-      extraPodSpec:
-        mainContainer:
-          image: my-registry/trtllm-runtime:my-tag
-          workingDir: /workspace/components/backends/trtllm
-          command:
-            - python3
-          args:
-            - -m
-            - dynamo.planner.prometheus
     TRTLLMDecodeWorker:
       dynamoNamespace: trtllm-disagg-planner
       envFromSecret: hf-token-secret
       componentType: worker
+      subComponentType: decode
       replicas: 1
       livenessProbe:
         httpGet:
@@ -173,6 +134,7 @@ spec:
       dynamoNamespace: trtllm-disagg-planner
       envFromSecret: hf-token-secret
       componentType: worker
+      subComponentType: prefill
       replicas: 1
       resources:
         limits:
39 changes: 2 additions & 37 deletions components/backends/vllm/deploy/disagg_planner.yaml
@@ -11,8 +11,6 @@ spec:
       value: '{"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["vllm-disagg-planner-frontend:8000"]}]}]}}'
     - name: DYNAMO_NAMESPACE
       value: "vllm-disagg-planner"
-    - name: PROMETHEUS_PORT
-      value: "8000"
   services:
     Frontend:
       dynamoNamespace: vllm-disagg-planner
@@ -63,45 +61,11 @@
           --backend=vllm
           --adjustment-interval=60
           --profile-results-dir=/data/profiling_results
-    Prometheus: # NOTE: this is set on Prometheus to ensure a service is created for the Prometheus component. This is a workaround and should be managed differently.
-      dynamoNamespace: vllm-disagg-planner
-      componentType: frontend
-      replicas: 1
-      envs:
-        - name: PYTHONPATH
-          value: "/workspace/components/planner/src"
-      livenessProbe:
-        exec:
-          command:
-            - /bin/sh
-            - -c
-            - "exit 0"
-        periodSeconds: 60
-        timeoutSeconds: 30
-        failureThreshold: 10
-      readinessProbe:
-        exec:
-          command:
-            - /bin/sh
-            - -c
-            - "exit 0"
-        initialDelaySeconds: 30
-        periodSeconds: 60
-        timeoutSeconds: 30
-        failureThreshold: 10
-      extraPodSpec:
-        mainContainer:
-          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:my-tag
-          workingDir: /workspace/components/backends/vllm
-          command:
-            - /bin/sh
-            - -c
-          args:
-            - "python3 -m dynamo.planner.prometheus"
     VllmDecodeWorker:
       dynamoNamespace: vllm-disagg-planner
       envFromSecret: hf-token-secret
       componentType: worker
+      subComponentType: decode
       replicas: 2
       resources:
         limits:
@@ -127,6 +91,7 @@ spec:
       dynamoNamespace: vllm-disagg-planner
       envFromSecret: hf-token-secret
       componentType: worker
+      subComponentType: prefill
       replicas: 2
       resources:
         limits:
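For reference, the scrape configuration the vllm planner receives through an env var (first hunk above) is plain JSON. A quick parse illustrates its shape; this is only a restatement of the value visible in the diff, not an API of the planner:

```python
import json

raw = (
    '{"Prometheus":{"global":{"scrape_interval":"5s"},'
    '"scrape_configs":[{"job_name":"prometheus","static_configs":'
    '[{"targets":["localhost:9090"]}]},{"job_name":"frontend",'
    '"static_configs":[{"targets":["vllm-disagg-planner-frontend:8000"]}]}]}}'
)

cfg = json.loads(raw)["Prometheus"]
print(cfg["global"]["scrape_interval"])                 # -> 5s
print([job["job_name"] for job in cfg["scrape_configs"]])  # -> ['prometheus', 'frontend']
```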
8 changes: 7 additions & 1 deletion components/planner/src/dynamo/planner/__init__.py
@@ -8,11 +8,17 @@
     "LoadPlannerDefaults",
     "SLAPlannerDefaults",
     "ServiceConfig",
+    "TargetReplica",
+    "SubComponentType",
 ]
 # Import the classes
 from dynamo.planner.config import ServiceConfig
 from dynamo.planner.defaults import LoadPlannerDefaults, SLAPlannerDefaults
-from dynamo.planner.kubernetes_connector import KubernetesConnector
+from dynamo.planner.kubernetes_connector import (
+    KubernetesConnector,
+    SubComponentType,
+    TargetReplica,
+)
 from dynamo.planner.planner_connector import PlannerConnector
 from dynamo.planner.virtual_connector import VirtualConnector
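After this change, downstream code can pull the new names straight from the package root. A one-line sketch (the import path is what the diff adds; anything done with the names afterwards is an assumption about their shape):

```python
# SubComponentType and TargetReplica are re-exported alongside the connector,
# so callers no longer need to reach into the kubernetes_connector submodule.
from dynamo.planner import KubernetesConnector, SubComponentType, TargetReplica
```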
4 changes: 4 additions & 0 deletions components/planner/src/dynamo/planner/defaults.py
@@ -56,6 +56,10 @@ class LoadPlannerDefaults(BasePlannerDefaults):
 
 def _get_default_prometheus_endpoint(port: str, namespace: str):
     """Compute default prometheus endpoint using environment variables and Kubernetes service discovery"""
+    prometheus_endpoint = os.environ.get("PROMETHEUS_ENDPOINT", "").strip()
+    if prometheus_endpoint:
+        logger.debug("Using PROMETHEUS_ENDPOINT override: %s", prometheus_endpoint)
+        return prometheus_endpoint
 
     k8s_namespace = get_current_k8s_namespace()
     if k8s_namespace and k8s_namespace != "default":
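A small sketch of the resolution order this hunk introduces: an explicit `PROMETHEUS_ENDPOINT` wins, otherwise the endpoint is derived from the port and the pod's Kubernetes namespace as before. The override URL below is made up, and calling the private helper directly is for illustration only:

```python
import os

from dynamo.planner.defaults import _get_default_prometheus_endpoint

# With PROMETHEUS_ENDPOINT set, the function short-circuits and returns the
# value verbatim -- useful for pointing the planner at an existing Prometheus
# instead of a bundled one (which these manifests no longer ship).
os.environ["PROMETHEUS_ENDPOINT"] = "http://prometheus.monitoring.svc.cluster.local:9090"
assert (
    _get_default_prometheus_endpoint(port="9085", namespace="dynamo")
    == "http://prometheus.monitoring.svc.cluster.local:9090"
)

# Without the override, service discovery kicks in as before.
os.environ.pop("PROMETHEUS_ENDPOINT", None)
```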
65 changes: 24 additions & 41 deletions components/planner/src/dynamo/planner/kube.py
@@ -14,12 +14,18 @@
 # limitations under the License.
 
 import asyncio
-import os
+import logging
 from typing import Optional
 
 from kubernetes import client, config
 from kubernetes.config.config_exception import ConfigException
 
+from dynamo.planner.utils.exceptions import DynamoGraphDeploymentNotFoundError
+from dynamo.runtime.logging import configure_dynamo_logging
+
+configure_dynamo_logging()
+logger = logging.getLogger(__name__)
+
 
 def get_current_k8s_namespace() -> str:
     """Get the current namespace if running inside a k8s cluster"""
@@ -42,9 +48,7 @@ def __init__(self, k8s_namespace: Optional[str] = None):
         self.custom_api = client.CustomObjectsApi()
         self.current_namespace = k8s_namespace or get_current_k8s_namespace()
 
-    def _get_graph_deployment_from_name(
-        self, graph_deployment_name: str
-    ) -> Optional[dict]:
+    def _get_graph_deployment_from_name(self, graph_deployment_name: str) -> dict:
         """Get the graph deployment from the dynamo graph deployment name"""
         return self.custom_api.get_namespaced_custom_object(
             group="nvidia.com",
@@ -54,38 +58,27 @@ def _get_graph_deployment_from_name(self, graph_deployment_name: str) -> dict:
             name=graph_deployment_name,
         )
 
-    async def get_parent_graph_deployment(self) -> Optional[dict]:
+    def get_graph_deployment(self, graph_deployment_name: str) -> dict:
         """
-        Get the parent DynamoGraphDeployment using environment variable.
-
-        Uses DYN_PARENT_DGD_K8S_NAME environment variable and assumes the DGD
-        is in the same namespace as this component (self.current_namespace).
+        Get the parent DynamoGraphDeployment
 
         Returns:
-            The DynamoGraphDeployment object or None if env var is not set
-        """
-        dgd_name = os.getenv("DYN_PARENT_DGD_K8S_NAME")
-
-        if not dgd_name:
-            return None
+            The DynamoGraphDeployment object
+
+        Raises:
+            DynamoGraphDeploymentNotFoundError: If the parent graph deployment is not found
+        """
         try:
-            return self._get_graph_deployment_from_name(dgd_name)
+            return self._get_graph_deployment_from_name(graph_deployment_name)
         except client.ApiException as e:
             if e.status == 404:
-                return None
+                raise DynamoGraphDeploymentNotFoundError(
+                    deployment_name=graph_deployment_name,
+                    namespace=self.current_namespace,
+                )
             raise
 
-    async def get_graph_deployment(self) -> Optional[dict]:
-        """
-        Get the parent DynamoGraphDeployment using environment variable.
-
-        Returns:
-            The DynamoGraphDeployment object or None if env var is not set
-        """
-        return await self.get_parent_graph_deployment()
-
-    async def update_graph_replicas(
+    def update_graph_replicas(
         self, graph_deployment_name: str, component_name: str, replicas: int
     ) -> None:
         """Update the replicas count for a component in a DynamoGraphDeployment"""
@@ -99,15 +92,10 @@ async def update_graph_replicas(
             body=patch,
         )
 
-    async def is_deployment_ready(self, graph_deployment_name: str) -> bool:
+    def is_deployment_ready(self, deployment: dict) -> bool:
         """Check if a graph deployment is ready"""
 
-        graph_deployment = self._get_graph_deployment_from_name(graph_deployment_name)
-
-        if not graph_deployment:
-            raise ValueError(f"Graph deployment {graph_deployment_name} not found")
-
-        conditions = graph_deployment.get("status", {}).get("conditions", [])
+        conditions = deployment.get("status", {}).get("conditions", [])
         ready_condition = next(
             (c for c in conditions if c.get("type") == "Ready"), None
         )
@@ -125,12 +113,7 @@ async def wait_for_graph_deployment_ready(
         for attempt in range(max_attempts):
            await asyncio.sleep(delay_seconds)
 
-            graph_deployment = self._get_graph_deployment_from_name(
-                graph_deployment_name
-            )
-
-            if not graph_deployment:
-                raise ValueError(f"Graph deployment {graph_deployment_name} not found")
+            graph_deployment = self.get_graph_deployment(graph_deployment_name)
 
             conditions = graph_deployment.get("status", {}).get("conditions", [])
             ready_condition = next(
@@ -140,7 +123,7 @@
             if ready_condition and ready_condition.get("status") == "True":
                 return  # Deployment is ready
 
-            print(
+            logger.info(
                 f"[Attempt {attempt + 1}/{max_attempts}] "
                 f"(status: {ready_condition.get('status') if ready_condition else 'N/A'}, "
                 f"message: {ready_condition.get('message') if ready_condition else 'no condition found'})"
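Taken together, the kube.py changes make the helpers synchronous, have `is_deployment_ready` accept the already-fetched object (avoiding a second API round-trip), and turn the "deployment missing" case into a typed exception instead of a silent `None`. A sketch of the resulting call pattern — the class name `KubernetesAPI` is assumed from context, and the deployment and service names are placeholders:

```python
from dynamo.planner.kube import KubernetesAPI  # class name assumed
from dynamo.planner.utils.exceptions import DynamoGraphDeploymentNotFoundError

api = KubernetesAPI(k8s_namespace="dynamo")

try:
    # Raises DynamoGraphDeploymentNotFoundError on a 404 instead of returning None.
    dgd = api.get_graph_deployment("vllm-disagg-planner")
except DynamoGraphDeploymentNotFoundError:
    raise SystemExit("parent DynamoGraphDeployment not found")

# Readiness is now checked on the fetched object, not re-fetched by name.
if api.is_deployment_ready(dgd):
    api.update_graph_replicas("vllm-disagg-planner", "VllmDecodeWorker", replicas=3)
```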