Merged
Changes from 1 commit
77 commits
065cb2a
feat: update k8s deploy yamls to use binary/python3
hhzhang16 Jul 11, 2025
aee478c
config part working
tedzhouhk Jul 11, 2025
9455ad1
feat: add component type worker and bump image
hhzhang16 Jul 12, 2025
f3dd01a
fix: merge conflicts
mohammedabdulwahhab Jul 14, 2025
7de97ef
fix: using health checks exposed by dynamo-run
mohammedabdulwahhab Jul 14, 2025
16fd7f2
Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dep-2…
hhzhang16 Jul 14, 2025
3a29913
Merge branch 'hannahz/dep-216-create-deploy-crds-for-vllm_v1-example'…
hhzhang16 Jul 14, 2025
51835db
fix: check for message in logs
mohammedabdulwahhab Jul 14, 2025
39b377f
Merge branch 'hannahz/dep-216-create-deploy-crds-for-vllm_v1-example'…
mohammedabdulwahhab Jul 14, 2025
dddb45f
Merge branch 'hannahz/dep-216-create-deploy-crds-for-vllm_v1-example'…
tedzhouhk Jul 14, 2025
34bc79c
define apis
tedzhouhk Jul 14, 2025
8c22d14
update script
tedzhouhk Jul 14, 2025
9856dde
fix: add dynamodeployment lib
mohammedabdulwahhab Jul 14, 2025
61a215b
fix: working client lib
mohammedabdulwahhab Jul 14, 2025
5141334
fix: working client lib
mohammedabdulwahhab Jul 14, 2025
8e25a29
integrate with utils.dynamo_deployment
tedzhouhk Jul 15, 2025
1d87164
fix: port forward works
mohammedabdulwahhab Jul 15, 2025
aaf4544
Merge branch 'hzhou/profile_vllmv1_k8s' of https://github.com/ai-dyna…
mohammedabdulwahhab Jul 15, 2025
65dec07
pc
tedzhouhk Jul 15, 2025
0af209b
add dep; bug fix
tedzhouhk Jul 15, 2025
918733a
Merge branch 'main' of https://github.com/ai-dynamo/dynamo into hzhou…
tedzhouhk Jul 15, 2025
3f900ef
staging, port forward not working
tedzhouhk Jul 15, 2025
bd12d40
stage
tedzhouhk Jul 15, 2025
7ac43a9
Merge branch 'main' of https://github.com/ai-dynamo/dynamo into hzhou…
mohammedabdulwahhab Jul 15, 2025
9971acf
fix: running script
mohammedabdulwahhab Jul 16, 2025
a5d8aca
fix: fix
mohammedabdulwahhab Jul 16, 2025
7b1d99a
Merge branch 'main' of https://github.com/ai-dynamo/dynamo into hzhou…
tedzhouhk Jul 16, 2025
f8f9363
add logic to find a free port
tedzhouhk Jul 16, 2025
8e292f6
feat: add Kubernetes service account configuration for SLA profiling …
hhzhang16 Jul 17, 2025
d62731f
feat: use service DNS for interfacing with deployments when profiling…
hhzhang16 Jul 17, 2025
a1aea5a
Revert "feat: use service DNS for interfacing with deployments when p…
hhzhang16 Jul 17, 2025
06bfe3b
feat: use service DNS instead of port forwarding for K8s-deployed SLA…
hhzhang16 Jul 18, 2025
ff96b9e
add try-catch waiting for deployment
tedzhouhk Jul 18, 2025
5419885
Merge branch 'main' of https://github.com/ai-dynamo/dynamo into hzhou…
tedzhouhk Jul 21, 2025
d2b6b00
feat: clean up outlying DGDs upon SLA profiling failure (#2016)
hhzhang16 Jul 21, 2025
450d371
add debug info
tedzhouhk Jul 22, 2025
d8ffe1a
Merge branch 'hzhou/profile_vllmv1_k8s' of https://github.com/ai-dyna…
tedzhouhk Jul 22, 2025
769c98e
sla planner
tedzhouhk Jul 22, 2025
e726d43
add choices
tedzhouhk Jul 22, 2025
3663c5c
Merge branch 'main' of github.com:ai-dynamo/dynamo into hzhou/sla-pla…
hhzhang16 Jul 22, 2025
ff6c491
feat: vllm_v1 -> vllm and remove vllm_v0 from planner
hhzhang16 Jul 22, 2025
6ebfe73
feat: remove local connector from init
hhzhang16 Jul 22, 2025
fb89fc2
feat: remove LocalConnector from core
hhzhang16 Jul 22, 2025
047cecb
feat: rework prometheus file for planner deployment
hhzhang16 Jul 22, 2025
894f2e7
Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dep-2…
hhzhang16 Jul 23, 2025
cd268ca
Merge branch 'main' of github.com:ai-dynamo/dynamo into hzhou/sla-pla…
hhzhang16 Jul 23, 2025
0f5082c
deprecate old docs
tedzhouhk Jul 23, 2025
9751e65
Merge branch 'main' of https://github.com/ai-dynamo/dynamo into hzhou…
tedzhouhk Jul 23, 2025
c33713f
Merge branch 'hzhou/sla-planner-ux-refac' of github.com:ai-dynamo/dyn…
hhzhang16 Jul 24, 2025
33371db
feat: update prometheus to work
hhzhang16 Jul 24, 2025
60dd89d
feat: k8s connector scaling P/D in one call (#2103)
tedzhouhk Jul 25, 2025
61a5e9a
Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dep-2…
hhzhang16 Jul 25, 2025
41f1ca0
fix: vllm_v1 -> vllm
hhzhang16 Jul 25, 2025
1584cd0
feat: remove unneeded files
hhzhang16 Jul 25, 2025
dd3f161
docs: update docs
hhzhang16 Jul 25, 2025
97f3f88
fix: vllm config in profiler
hhzhang16 Jul 25, 2025
9b34ee9
feat: wip but tentatively working planner, with documentation
hhzhang16 Jul 25, 2025
eb56dbb
fi: use provided namespace for decode
hhzhang16 Jul 25, 2025
b68779d
feat: use k8s deployment info instead of hardcoding prometheus endpoint
hhzhang16 Jul 25, 2025
c8e394d
fix: if no requests have been made yet, don't try to access list
hhzhang16 Jul 25, 2025
1bbfd8d
feat: use SLAPLannerDefaults port
hhzhang16 Jul 25, 2025
ba6b5c1
docs: clean up sla planner deployment docs
hhzhang16 Jul 25, 2025
e533dda
feat: use DYNAMO_NAMESPACE env var instead of --namespace arg
hhzhang16 Jul 26, 2025
a548d74
feat: fixes for working planner
hhzhang16 Jul 26, 2025
f6af0d5
feat: skip adjustments if no traffic
hhzhang16 Jul 26, 2025
445fe74
docs: doc updates for planner deployment
hhzhang16 Jul 26, 2025
f0999da
Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dep-2…
hhzhang16 Jul 26, 2025
bab714c
feat: delete k8s.sh
hhzhang16 Jul 26, 2025
a29c397
docs: slight doc modification
hhzhang16 Jul 26, 2025
daa3c4e
update resources
tedzhouhk Jul 26, 2025
539ff3e
update readme
tedzhouhk Jul 26, 2025
f994128
feat: address coderabbit MR comments
hhzhang16 Jul 28, 2025
d44b042
Merge branch 'main' of github.com:ai-dynamo/dynamo into hannahz/dep-2…
hhzhang16 Jul 28, 2025
f601f88
fix pytest
tedzhouhk Jul 28, 2025
69d64dc
mypy
tedzhouhk Jul 28, 2025
1fdf3e9
feat: addressing MR comments
hhzhang16 Jul 28, 2025
bf58bd0
feat: addressing MR comments
hhzhang16 Jul 28, 2025
feat: wip but tentatively working planner, with documentation
hhzhang16 committed Jul 25, 2025
commit 9b34ee9520e4e3423e40e9b07d4a2c5181055cec
44 changes: 20 additions & 24 deletions components/backends/vllm/deploy/disagg_planner.yaml
@@ -8,7 +8,7 @@ metadata:
 spec:
   envs:
     - name: DYNAMO_SERVICE_CONFIG
-      value: '{"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["localhost:8000"]}]}]}}'
+      value: '{"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["vllm-disagg-planner-frontend:8000"]}]}]}}'
   services:
     Frontend:
       dynamoNamespace: vllm-disagg-planner
@@ -34,22 +34,20 @@ spec:
         failureThreshold: 10
       resources:
         requests:
-          cpu: "2"
-          memory: "4Gi"
+          cpu: "4"
+          memory: "16Gi"
         limits:
-          cpu: "2"
-          memory: "4Gi"
+          cpu: "4"
+          memory: "16Gi"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.5
-          workingDir: /workspace/examples/vllm
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.11
+          workingDir: /workspace/components/backends/vllm
           command:
             - /bin/sh
             - -c
           args:
-            - dynamo
-            - run
-            - in=http
-            - out=dyn
-            - --http-port
-            - "8000"
+            - "python3 -m dynamo.frontend --http-port 8000"
     Planner:
       dynamoNamespace: vllm-disagg-planner
       envFromSecret: hf-token-secret
@@ -87,7 +85,7 @@ spec:
           mountPoint: /workspace/profiling_results
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.5
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.11
           workingDir: /workspace/components/planner/src/dynamo/planner
           args:
             - python
@@ -106,6 +104,8 @@ spec:
       envs:
         - name: PYTHONPATH
           value: "/workspace/components/planner/src"
+        - name: DYNAMO_PORT
+          value: "9090"
       livenessProbe:
         exec:
           command:
@@ -134,15 +134,13 @@ spec:
           memory: "2Gi"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.11
           workingDir: /workspace/components/backends/vllm
           command:
             - /bin/sh
             - -c
           args:
-            - python
-            - -m
-            - dynamo.planner.prometheus
+            - "python3 -m dynamo.planner.prometheus"
     VllmDecodeWorker:
       dynamoNamespace: vllm-disagg-planner
       envFromSecret: hf-token-secret
@@ -178,15 +176,13 @@ spec:
           gpu: "1"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.11
           workingDir: /workspace/components/backends/vllm
           command:
             - /bin/sh
             - -c
           args:
-            - /bin/sh
-            - -c
-            - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager 2>&1 | tee /tmp/vllm.log"
+            - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --namespace vllm-disagg-planner 2>&1 | tee /tmp/vllm.log"
     VllmPrefillWorker:
       dynamoNamespace: vllm-disagg-planner
       envFromSecret: hf-token-secret
@@ -222,10 +218,10 @@ spec:
           gpu: "1"
       extraPodSpec:
         mainContainer:
-          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-233.17
+          image: nvcr.io/nvidian/nim-llm-dev/vllm-runtime:dep-253.11
           workingDir: /workspace/components/backends/vllm
           command:
             - /bin/sh
             - -c
           args:
-            - python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --enforce-eager --is-prefill-worker 2>&1 | tee /tmp/vllm.log
+            - python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B --is-prefill-worker --namespace vllm-disagg-planner 2>&1 | tee /tmp/vllm.log
64 changes: 48 additions & 16 deletions components/backends/vllm/src/dynamo/vllm/args.py
@@ -52,10 +52,16 @@ def parse_args() -> Config:
         default=DEFAULT_ENDPOINT,
         help=f"Dynamo endpoint string in 'dyn://namespace.component.endpoint' format. Default: {DEFAULT_ENDPOINT}",
     )
+    parser.add_argument(
+        "--namespace",
+        type=str,
+        default="dynamo",
+        help="Dynamo namespace for this worker. Default: dynamo",
+    )
     parser.add_argument(
         "--is-prefill-worker",
         action="store_true",
-        help="Enable prefill functionality for this worker. Currently overwrites the --endpoint to be a specially chosen dyn://dynamo.prefill.generate",
+        help="Enable prefill functionality for this worker. Uses the provided namespace to construct dyn://namespace.prefill.generate",
     )

     parser = AsyncEngineArgs.add_cli_args(parser)
@@ -80,7 +86,7 @@ def parse_args() -> Config:
         config.served_model_name = None

     if args.is_prefill_worker:
-        args.endpoint = "dyn://dynamo.prefill.generate"
+        args.endpoint = f"dyn://{args.namespace}.prefill.generate"

     endpoint_str = args.endpoint.replace("dyn://", "", 1)
     endpoint_parts = endpoint_str.split(".")
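The endpoint rewrite in this hunk relies on the `dyn://namespace.component.endpoint` format that `parse_args` goes on to split. A minimal sketch of that parsing, with a hypothetical helper name (the real code does this inline):

```python
def parse_dyn_endpoint(endpoint: str) -> tuple[str, str, str]:
    """Split a 'dyn://namespace.component.endpoint' string into its parts.

    Hypothetical helper mirroring the inline parsing in parse_args().
    """
    endpoint_str = endpoint.replace("dyn://", "", 1)
    parts = endpoint_str.split(".")
    if len(parts) != 3:
        raise ValueError(
            f"Invalid endpoint format: '{endpoint}'. "
            "Expected 'dyn://namespace.component.endpoint'"
        )
    namespace, component, endpoint_name = parts
    return namespace, component, endpoint_name

# A prefill worker started with --namespace vllm-disagg-planner resolves to:
print(parse_dyn_endpoint("dyn://vllm-disagg-planner.prefill.generate"))
# → ('vllm-disagg-planner', 'prefill', 'generate')
```

This is why `--is-prefill-worker` can now honor `--namespace`: the namespace is just the first dotted segment of the endpoint string.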
@@ -127,6 +133,14 @@ async def allocate_and_reserve_port(
     """

     node_name = socket.gethostname()
+    try:
+        node_ip = socket.gethostbyname(node_name)
+    except socket.gaierror:
+        # If hostname cannot be resolved, fall back to localhost
+        logger.warning(
+            f"Hostname '{node_name}' cannot be resolved, falling back to '127.0.0.1'"
+        )
+        node_ip = "127.0.0.1"

     for attempt in range(1, max_attempts + 1):
         # Hold socket open just long enough to reserve in ETCD
@@ -136,7 +150,7 @@ async def allocate_and_reserve_port(
             port = sock.getsockname()[1]

             # Reserve in ETCD while holding the socket
-            key = f"dyn://{namespace}/ports/{node_name}/{port}"
+            key = f"dyn://{namespace}/ports/{node_ip}/{port}"
             value = {
                 "worker_id": worker_id,
                 "reason": reason,
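The reservation key above is written while the socket is still bound, so the port cannot be grabbed by another process in between. The port-allocation half can be sketched standalone; the ETCD write under `dyn://{namespace}/ports/{node_ip}/{port}` is assumed and omitted here:

```python
import socket

def find_free_port() -> int:
    """Ask the OS for an ephemeral port by binding to port 0.

    Sketch of the allocation step in allocate_and_reserve_port(); the real
    code keeps the socket open while it records the reservation in ETCD,
    whereas this sketch releases the port on context exit.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.bind(("", 0))  # port 0 tells the kernel to pick a free port
        return sock.getsockname()[1]

port = find_free_port()
assert 0 < port < 65536
```

Holding the socket open during the reservation is the design point: closing it before the ETCD write would reopen the race the reservation is meant to prevent.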
@@ -238,23 +252,41 @@ def overwrite_args(config):
         raise ValueError(f"{key} not found in AsyncEngineArgs from vLLM.")


-def set_side_channel_host_and_port(config: Config, hostname: Optional[str] = None):
-    """vLLM V1 NixlConnector creates a side channel to exchange metadata with other NIXL connectors.
-    This sets the port number for the side channel.
+def get_host_ip() -> str:
+    """Get the IP address of the host.
+    This is needed for the side channel to work in multi-node deployments.
     """
-    if hostname is None:
-        hostname = socket.gethostname()
-        # Test if hostname is usable by attempting to bind to it
-        try:
+    try:
+        host_name = socket.gethostname()
+    except socket.error as e:
+        logger.warning(f"Failed to get hostname: {e}, falling back to '127.0.0.1'")
+        return "127.0.0.1"
+    else:
+        try:
+            # Get the IP address of the hostname - this is needed for the side channel to work in multi-node deployments
+            host_ip = socket.gethostbyname(host_name)
+            # Test if the IP is actually usable by binding to it
             with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as test_socket:
-                test_socket.bind((hostname, 0))
-        except (socket.error, socket.gaierror):
-            # If hostname is not usable, fall back to localhost
+                test_socket.bind((host_ip, 0))
+            return host_ip
+        except socket.gaierror as e:
             logger.warning(
-                f"Hostname '{hostname}' is not usable, falling back to '127.0.0.1'"
+                f"Hostname '{host_name}' cannot be resolved: {e}, falling back to '127.0.0.1'"
             )
-            hostname = "127.0.0.1"
+            return "127.0.0.1"
+        except socket.error as e:
+            # If hostname is not usable for binding, fall back to localhost
+            logger.warning(
+                f"Hostname '{host_name}' is not usable for binding: {e}, falling back to '127.0.0.1'"
+            )
+            return "127.0.0.1"


-    os.environ["VLLM_NIXL_SIDE_CHANNEL_HOST"] = hostname
+def set_side_channel_host_and_port(config: Config):
+    """vLLM V1 NixlConnector creates a side channel to exchange metadata with other NIXL connectors.
+    This sets the port number for the side channel.
+    """
+    host_ip = get_host_ip()
+    os.environ["VLLM_NIXL_SIDE_CHANNEL_HOST"] = host_ip
     os.environ["VLLM_NIXL_SIDE_CHANNEL_PORT"] = str(config.side_channel_port)
-    logger.debug(f"Set NIXL side channel to {hostname}:{config.side_channel_port}")
+    logger.debug(f"Set NIXL side channel to {host_ip}:{config.side_channel_port}")
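The new `get_host_ip` fallback chain — hostname lookup, DNS resolution, then a bind test — can be exercised standalone. A sketch under the assumption that a plain `logging` logger stands in for the module's logger:

```python
import logging
import socket

logger = logging.getLogger(__name__)

def get_host_ip() -> str:
    """Resolve this host's IP, falling back to loopback if the hostname
    cannot be obtained, cannot be resolved, or the resolved IP is unusable."""
    try:
        host_name = socket.gethostname()
    except socket.error as e:
        logger.warning(f"Failed to get hostname: {e}, falling back to '127.0.0.1'")
        return "127.0.0.1"
    try:
        host_ip = socket.gethostbyname(host_name)
        # Bind test: a resolvable name can still map to an address this
        # host cannot actually listen on (e.g. a stale /etc/hosts entry).
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as test_socket:
            test_socket.bind((host_ip, 0))
        return host_ip
    except (socket.gaierror, socket.error) as e:
        logger.warning(
            f"Hostname '{host_name}' is not usable: {e}, falling back to '127.0.0.1'"
        )
        return "127.0.0.1"

ip = get_host_ip()
```

Returning a bindable IP rather than the bare hostname is what makes the NIXL side channel reachable across nodes, where a peer cannot resolve another pod's hostname.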
4 changes: 2 additions & 2 deletions components/planner/src/dynamo/planner/defaults.py
@@ -48,9 +48,9 @@ class SLAPlannerDefaults(BasePlannerDefaults):


 class VllmComponentName:
-    prefill_worker = "VllmPrefillWorker"
+    prefill_worker = "prefill"
     prefill_worker_endpoint = "generate"
-    decode_worker = "VllmDecodeWorker"
+    decode_worker = "backend"
     decode_worker_endpoint = "generate"


1 change: 1 addition & 0 deletions components/planner/src/dynamo/planner/prometheus.py
@@ -47,6 +47,7 @@ async def start_prometheus_server(config):
     cmd = [
         "prometheus",
         f"--config.file={config_path}",
+        "--web.listen-address=0.0.0.0:9090",
     ]

     logger.info(f"Prometheus cmd: {cmd}")
11 changes: 9 additions & 2 deletions components/planner/src/dynamo/planner/utils/planner_core.py
@@ -222,7 +222,14 @@ async def make_adjustments(self):

         # compute how many replicas are needed for decode
         # 1. apply d_correction_factor to the ITL SLA
-        corrected_itl = self.args.itl / self.d_correction_factor
+        # Prevent divide by zero when d_correction_factor is 0 (no metrics yet)
+        if self.d_correction_factor <= 0:
+            logger.warning(
+                f"d_correction_factor is {self.d_correction_factor}, using default value of 1.0"
+            )
+            corrected_itl = self.args.itl
+        else:
+            corrected_itl = self.args.itl / self.d_correction_factor
         # 2. reversely find out what is best throughput/gpu that can achieve corrected_itl under the predicted context length
         pred_decode_thpt_per_gpu = (
             self.decode_interpolator.find_best_throughput_per_gpu(
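The divide-by-zero guard above reduces to a small pure function; a sketch with names taken from the surrounding code (the extracted helper itself is hypothetical):

```python
def corrected_itl(itl_sla: float, d_correction_factor: float) -> float:
    """Apply the decode correction factor to the ITL SLA, treating a
    non-positive factor (no metrics observed yet) as 1.0 so the raw SLA
    passes through unchanged."""
    if d_correction_factor <= 0:
        return itl_sla
    return itl_sla / d_correction_factor

assert corrected_itl(10.0, 0.0) == 10.0  # no traffic yet -> SLA unchanged
assert corrected_itl(10.0, 2.0) == 5.0   # factor > 1 tightens the ITL target
```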
@@ -274,7 +281,7 @@ async def make_adjustments(self):
             WORKER_COMPONENT_NAMES[self.args.backend].prefill_worker: next_num_p,
             WORKER_COMPONENT_NAMES[self.args.backend].decode_worker: next_num_d,
         }
-        self.connector.set_component_replicas(target_replicas, blocking=False)
+        await self.connector.set_component_replicas(target_replicas, blocking=False)

     async def run(self):
         """Main loop for the planner"""
8 changes: 5 additions & 3 deletions docs/architecture/sla_planner.md
@@ -106,11 +106,13 @@ Finally, SLA planner applies the change by scaling up/down the number of prefill

 ## Deploying

-To deploy SLA-planner, ensure etcd and NATS are running first, then use the frontend that reports metrics at `/metrics` HTTP endpoint. You can also use your own frontend, but it must report number of requests, ISL, OSL, TTFT, ITL in the same format.
-
-SLA-planner and prometheus server are provided as common components that can be directly imported from `dynamo` package.
+For detailed deployment instructions including setup, configuration, troubleshooting, and architecture overview, see the [SLA Planner Deployment Guide](../guides/dynamo_deploy/sla_planner_deployment.md).

+**Quick Start:**
 ```bash
 cd components/backends/vllm/deploy
 kubectl apply -f disagg_planner.yaml -n {$NAMESPACE}
 ```
+
+> [!NOTE]
+> The SLA planner requires a frontend that reports metrics at `/metrics` HTTP endpoint with number of requests, ISL, OSL, TTFT, ITL in the correct format. The VLLM frontend provides these metrics automatically.