From 992adfbfbf1611335e69803fd0005df3efe4c87d Mon Sep 17 00:00:00 2001
From: Alec <35311602+alec-flowers@users.noreply.github.com>
Date: Wed, 30 Jul 2025 14:07:21 -0700
Subject: [PATCH 1/9] fix: add better port logic (#2175) (#2192)

---
 .../backends/vllm/src/dynamo/vllm/args.py     | 195 +++++-------
 .../backends/vllm/src/dynamo/vllm/ports.py    | 290 ++++++++++++++++++
 2 files changed, 364 insertions(+), 121 deletions(-)
 create mode 100644 components/backends/vllm/src/dynamo/vllm/ports.py

diff --git a/components/backends/vllm/src/dynamo/vllm/args.py b/components/backends/vllm/src/dynamo/vllm/args.py
index b86649f06b..889405f6af 100644
--- a/components/backends/vllm/src/dynamo/vllm/args.py
+++ b/components/backends/vllm/src/dynamo/vllm/args.py
@@ -2,13 +2,9 @@
 # SPDX-License-Identifier: Apache-2.0
 
 
-import asyncio
-import json
 import logging
 import os
-import socket
 import sys
-import time
 from typing import Optional
 
 from vllm.config import KVTransferConfig
@@ -16,9 +12,20 @@
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.utils import FlexibleArgumentParser
 
+from .ports import (
+    DEFAULT_DYNAMO_PORT_MAX,
+    DEFAULT_DYNAMO_PORT_MIN,
+    DynamoPortRange,
+    EtcdContext,
+    PortAllocationRequest,
+    PortMetadata,
+    allocate_and_reserve_port,
+    allocate_and_reserve_port_block,
+    get_host_ip,
+)
+
 logger = logging.getLogger(__name__)
 
-# Only used if you run it manually from the command line
 DEFAULT_ENDPOINT = "dyn://dynamo.backend.generate"
 DEFAULT_MODEL = "Qwen/Qwen3-0.6B"
 
@@ -34,6 +41,7 @@ class Config:
     migration_limit: int = 0
     kv_port: Optional[int] = None
     side_channel_port: Optional[int] = None
+    port_range: DynamoPortRange
 
     # mirror vLLM
     model: str
@@ -64,6 +72,18 @@ def parse_args() -> Config:
         default=0,
         help="Maximum number of times a request may be migrated to a different engine worker. The number may be overridden by the engine.",
     )
+    parser.add_argument(
+        "--dynamo-port-min",
+        type=int,
+        default=DEFAULT_DYNAMO_PORT_MIN,
+        help=f"Minimum port number for Dynamo services (default: {DEFAULT_DYNAMO_PORT_MIN}). Must be in registered ports range (1024-49151).",
+    )
+    parser.add_argument(
+        "--dynamo-port-max",
+        type=int,
+        default=DEFAULT_DYNAMO_PORT_MAX,
+        help=f"Maximum port number for Dynamo services (default: {DEFAULT_DYNAMO_PORT_MAX}). Must be in registered ports range (1024-49151).",
+    )
 
     parser = AsyncEngineArgs.add_cli_args(parser)
     args = parser.parse_args()
@@ -110,6 +130,9 @@ def parse_args() -> Config:
     config.engine_args = engine_args
     config.is_prefill_worker = args.is_prefill_worker
     config.migration_limit = args.migration_limit
+    config.port_range = DynamoPortRange(
+        min=args.dynamo_port_min, max=args.dynamo_port_max
+    )
 
     if config.engine_args.block_size is None:
         config.engine_args.block_size = 16
@@ -120,106 +143,66 @@ def parse_args() -> Config:
     return config
 
 
-async def allocate_and_reserve_port(
-    namespace,
-    etcd_client,
-    worker_id: str,
-    reason: str,
-    max_attempts: int = 100,
-) -> int:
-    """
-    Get an OS-assigned port and atomically reserve it in ETCD.
-    Retries until successful or max_attempts reached.
-
-    Args:
-        max_attempts: Maximum number of ports to try (default: 100)
-
-    Raises:
-        RuntimeError: If unable to reserve a port within max_attempts
-        OSError: If unable to create sockets (system resource issues)
-    """
-
-    node_name = socket.gethostname()
-    try:
-        node_ip = socket.gethostbyname(node_name)
-    except socket.gaierror:
-        # If hostname cannot be resolved, fall back to localhost
-        logger.warning(
-            f"Hostname '{node_name}' cannot be resolved, falling back to '127.0.0.1'"
-        )
-        node_ip = "127.0.0.1"
-
-    for attempt in range(1, max_attempts + 1):
-        # Hold socket open just long enough to reserve in ETCD
-        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
-            sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
-            sock.bind(("", 0))
-            port = sock.getsockname()[1]
-
-            # Reserve in ETCD while holding the socket
-            key = f"dyn://{namespace}/ports/{node_ip}/{port}"
-            value = {
-                "worker_id": worker_id,
-                "reason": reason,
-                "reserved_at": time.time(),
-                "pid": os.getpid(),
-            }
-
-            try:
-                await etcd_client.kv_create(
-                    key=key,
-                    value=json.dumps(value).encode(),
-                    lease_id=etcd_client.primary_lease_id(),
-                )
-                logger.debug(f"Reserved OS-assigned port {port} for {worker_id}")
-                return port
-
-            except Exception as e:
-                logger.debug(
-                    f"Port {port} on {node_name} was already reserved (attempt {attempt}): {e}"
-                )
-
-        if attempt < max_attempts:
-            await asyncio.sleep(0.01)
-
-    raise RuntimeError(
-        f"Failed to allocate and reserve a port after {max_attempts} attempts"
-    )
-
-
 async def configure_ports_with_etcd(config: Config, etcd_client):
     """Configure all settings that require ETCD, including port allocation and vLLM overrides."""
 
-    # First, allocate ports
+    etcd_context = EtcdContext(client=etcd_client, namespace=config.namespace)
+
     dp_rank = config.engine_args.data_parallel_rank or 0
     worker_id = f"vllm-{config.component}-dp{dp_rank}"
 
     # Allocate KV events port
-    kv_port = await allocate_and_reserve_port(
-        namespace=config.namespace,
-        etcd_client=etcd_client,
-        worker_id=f"{worker_id}",
-        reason="zmq_kv_event_port",
+    if config.engine_args.enable_prefix_caching:
+        kv_metadata = PortMetadata(worker_id=worker_id, reason="zmq_kv_event_port")
+        kv_port = await allocate_and_reserve_port(
+            etcd_context=etcd_context,
+            metadata=kv_metadata,
+            port_range=config.port_range,
+        )
+        config.kv_port = kv_port
+        logger.info(f"Allocated ZMQ KV events port: {kv_port} (worker_id={worker_id})")
+
+    # Allocate side channel ports
+    # https://github.com/vllm-project/vllm/blob/releases/v0.10.0/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py#L372
+    # NIXL calculates ports as: base_port + (dp_rank * tp_size) + tp_rank
+    # For dp_rank, we need to reserve tp_size consecutive ports
+    tp_size = config.engine_args.tensor_parallel_size or 1
+
+    # The first port for this dp_rank will be at: base_port + (dp_rank * tp_size)
+    # We need to allocate tp_size consecutive ports starting from there
+    nixl_metadata = PortMetadata(worker_id=worker_id, reason="nixl_side_channel_port")
+    nixl_request = PortAllocationRequest(
+        etcd_context=etcd_context,
+        metadata=nixl_metadata,
+        port_range=config.port_range,
+        block_size=tp_size,
     )
+    allocated_ports = await allocate_and_reserve_port_block(nixl_request)
+    first_port_for_dp_rank = allocated_ports[0]
+
+    # Calculate the base port that NIXL expects
+    # base_port = first_port_for_dp_rank - (dp_rank * tp_size)
+    nixl_offset = dp_rank * tp_size
+    base_side_channel_port = first_port_for_dp_rank - nixl_offset
+
+    if base_side_channel_port < 0:
+        raise ValueError(
+            f"NIXL base port calculation resulted in negative port: "
+            f"first_allocated_port={first_port_for_dp_rank}, offset={nixl_offset}, "
+            f"base_port={base_side_channel_port}. Current range: {config.port_range.min}-{config.port_range.max}. "
+            f"Consider using a higher port range."
+        )
 
-    # Allocate side channel port
-    side_channel_port = await allocate_and_reserve_port(
-        namespace=config.namespace,
-        etcd_client=etcd_client,
-        worker_id=f"{worker_id}",
-        reason="nixl_side_channel_port",
-    )
+    config.side_channel_port = base_side_channel_port
 
-    # Update config with allocated ports
-    config.kv_port = kv_port
-    config.side_channel_port = side_channel_port
+    logger.info(
+        f"Allocated NIXL side channel ports: base={base_side_channel_port}, "
+        f"allocated_ports={allocated_ports} (worker_id={worker_id}, dp_rank={dp_rank}, tp_size={tp_size})"
+    )
 
 
 def overwrite_args(config):
     """Set vLLM defaults for Dynamo."""
-    assert (
-        config.kv_port is not None
-    ), "Must set the kv_port, use configure_ports_with_etcd"
     assert (
         config.side_channel_port is not None
     ), "Must set the kv_port, use configure_ports_with_etcd"
@@ -263,36 +246,6 @@ def overwrite_args(config):
             raise ValueError(f"{key} not found in AsyncEngineArgs from vLLM.")
 
 
-def get_host_ip() -> str:
-    """Get the IP address of the host.
-    This is needed for the side channel to work in multi-node deployments.
-    """
-    try:
-        host_name = socket.gethostname()
-    except socket.error as e:
-        logger.warning(f"Failed to get hostname: {e}, falling back to '127.0.0.1'")
-        return "127.0.0.1"
-    else:
-        try:
-            # Get the IP address of the hostname - this is needed for the side channel to work in multi-node deployments
-            host_ip = socket.gethostbyname(host_name)
-            # Test if the IP is actually usable by binding to it
-            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as test_socket:
-                test_socket.bind((host_ip, 0))
-            return host_ip
-        except socket.gaierror as e:
-            logger.warning(
-                f"Hostname '{host_name}' cannot be resolved: {e}, falling back to '127.0.0.1'"
-            )
-            return "127.0.0.1"
-        except socket.error as e:
-            # If hostname is not usable for binding, fall back to localhost
-            logger.warning(
-                f"Hostname '{host_name}' is not usable for binding: {e}, falling back to '127.0.0.1'"
-            )
-            return "127.0.0.1"
-
-
 def set_side_channel_host_and_port(config: Config):
     """vLLM V1 NixlConnector creates a side channel to exchange metadata with other NIXL connectors.
     This sets the port number for the side channel.
diff --git a/components/backends/vllm/src/dynamo/vllm/ports.py b/components/backends/vllm/src/dynamo/vllm/ports.py
new file mode 100644
index 0000000000..19fdde7279
--- /dev/null
+++ b/components/backends/vllm/src/dynamo/vllm/ports.py
@@ -0,0 +1,290 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Port allocation and management utilities for Dynamo services."""
+
+import asyncio
+import json
+import logging
+import os
+import random
+import socket
+import time
+from contextlib import contextmanager
+from dataclasses import dataclass, field
+
+from dynamo.runtime import EtcdKvCache
+
+logger = logging.getLogger(__name__)
+
+# Default port range in the registered ports section
+DEFAULT_DYNAMO_PORT_MIN = 20000
+DEFAULT_DYNAMO_PORT_MAX = 30000
+
+
+@dataclass
+class DynamoPortRange:
+    """Port range configuration for Dynamo services"""
+
+    min: int
+    max: int
+
+    def __post_init__(self):
+        if self.min < 1024 or self.max > 49151:
+            raise ValueError(
+                f"Port range {self.min}-{self.max} is outside registered ports range (1024-49151)"
+            )
+        if self.min >= self.max:
+            raise ValueError(
+                f"Invalid port range: min ({self.min}) must be less than max ({self.max})"
+            )
+
+
+@dataclass
+class EtcdContext:
+    """Context for ETCD operations"""
+
+    client: EtcdKvCache  # etcd client instance
+    namespace: str  # Namespace for keys (used in key prefix)
+
+    def make_port_key(self, port: int) -> str:
+        """Generate ETCD key for a port reservation"""
+        node_ip = get_host_ip()
+        return f"dyn://{self.namespace}/ports/{node_ip}/{port}"
+
+
+@dataclass
+class PortMetadata:
+    """Metadata to store with port reservations in ETCD"""
+
+    worker_id: str  # Worker identifier (e.g., "vllm-backend-dp0")
+    reason: str  # Purpose of the port (e.g., "nixl_side_channel_port")
+    block_info: dict = field(default_factory=dict)  # Optional block allocation info
+
+    def to_etcd_value(self) -> dict:
+        """Convert to dictionary for ETCD storage"""
+        value = {
+            "worker_id": self.worker_id,
+            "reason": self.reason,
+            "reserved_at": time.time(),
+            "pid": os.getpid(),
+        }
+        if self.block_info:
+            value.update(self.block_info)
+        return value
+
+
+@dataclass
+class PortAllocationRequest:
+    """Parameters for port allocation"""
+
+    etcd_context: EtcdContext
+    metadata: PortMetadata
+    port_range: DynamoPortRange
+    block_size: int = 1
+    max_attempts: int = 100
+
+
+@contextmanager
+def hold_ports(ports: int | list[int]):
+    """Context manager to hold port binding(s).
+
+    Binds a TCP socket to every requested port and keeps them open until the block
+    exits. A failed bind raises OSError; every socket created so far is still closed.
+
+    Args:
+        ports: Single port number or list of port numbers to hold
+    """
+    if isinstance(ports, int):
+        ports = [ports]
+
+    sockets = []
+    try:
+        for port in ports:
+            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+            sockets.append(sock)  # track before bind() so a failed bind cannot leak it
+            sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+            sock.bind(("", port))
+
+        yield
+
+    finally:
+        for sock in sockets:
+            sock.close()
+
+
+def check_port_available(port: int) -> bool:
+    """Check if a specific port is available for binding."""
+    try:
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
+            sock.bind(("", port))
+            return True
+    except OSError:
+        return False
+
+
+async def reserve_port_in_etcd(
+    etcd_context: EtcdContext,
+    port: int,
+    metadata: PortMetadata,
+) -> None:
+    """Reserve a single port in ETCD."""
+    key = etcd_context.make_port_key(port)
+    value = metadata.to_etcd_value()
+
+    await etcd_context.client.kv_create(
+        key=key,
+        value=json.dumps(value).encode(),
+        lease_id=etcd_context.client.primary_lease_id(),
+    )
+
+
+async def allocate_and_reserve_port_block(request: PortAllocationRequest) -> list[int]:
+    """
+    Allocate a contiguous block of ports from the specified range and reserve each port in ETCD (one key per port; the block as a whole is not reserved atomically).
+    Returns a list of all allocated ports in order.
+
+    This function uses a context manager to hold port bindings while reserving in ETCD,
+    preventing race conditions between multiple processes.
+
+    Args:
+        request: PortAllocationRequest containing all allocation parameters
+
+    Returns:
+        list[int]: List of all allocated ports in ascending order
+
+    Raises:
+        RuntimeError: If unable to reserve a port block within max_attempts (bind and ETCD errors are logged and retried, not raised)
+        ValueError: If the port range is too small to hold a block of block_size ports
+    """
+    # Create a list of valid starting ports (must have room for the entire block)
+    max_start_port = request.port_range.max - request.block_size + 1
+    if max_start_port < request.port_range.min:
+        raise ValueError(
+            f"Port range {request.port_range.min}-{request.port_range.max} is too small for block size {request.block_size}"
+        )
+
+    available_start_ports = list(range(request.port_range.min, max_start_port + 1))
+    random.shuffle(available_start_ports)
+
+    actual_max_attempts = min(len(available_start_ports), request.max_attempts)
+
+    for attempt in range(1, actual_max_attempts + 1):
+        start_port = available_start_ports[attempt - 1]
+        ports_to_reserve = list(range(start_port, start_port + request.block_size))
+
+        try:
+            # Bind every port in the block; if any bind fails, the ports already bound are released
+            with hold_ports(ports_to_reserve):
+                logger.debug(
+                    f"Successfully bound to ports {ports_to_reserve}, now reserving in ETCD"
+                )
+
+                # We have exclusive access to these ports, now reserve them in ETCD
+                for i, port in enumerate(ports_to_reserve):
+                    port_metadata = PortMetadata(
+                        worker_id=f"{request.metadata.worker_id}-{i}"
+                        if request.block_size > 1
+                        else request.metadata.worker_id,
+                        reason=request.metadata.reason,
+                        block_info={
+                            "block_index": i,
+                            "block_size": request.block_size,
+                            "block_start": start_port,
+                        }
+                        if request.block_size > 1
+                        else {},
+                    )
+
+                    await reserve_port_in_etcd(
+                        etcd_context=request.etcd_context,
+                        port=port,
+                        metadata=port_metadata,
+                    )
+
+                logger.debug(
+                    f"Reserved port block {ports_to_reserve} from range {request.port_range.min}-{request.port_range.max} "
+                    f"for {request.metadata.worker_id} (block_size={request.block_size})"
+                )
+                return ports_to_reserve
+
+        except OSError as e:
+            logger.debug(
+                f"Failed to bind to port block starting at {start_port} (attempt {attempt}): {e}"
+            )
+        except Exception as e:
+            logger.debug(
+                f"Failed to reserve port block starting at {start_port} in ETCD (attempt {attempt}): {e}"
+            )
+
+        if attempt < actual_max_attempts:
+            await asyncio.sleep(0.01)
+
+    raise RuntimeError(
+        f"Failed to allocate and reserve a port block of size {request.block_size} from range "
+        f"{request.port_range.min}-{request.port_range.max} after {actual_max_attempts} attempts"
+    )
+
+
+async def allocate_and_reserve_port(
+    etcd_context: EtcdContext,
+    metadata: PortMetadata,
+    port_range: DynamoPortRange,
+    max_attempts: int = 100,
+) -> int:
+    """
+    Allocate a port from the specified range and atomically reserve it in ETCD.
+    This is a convenience wrapper around allocate_and_reserve_port_block with block_size=1.
+
+    Args:
+        etcd_context: ETCD context for operations
+        metadata: Port metadata for ETCD storage
+        port_range: DynamoPortRange object specifying min and max ports to try
+        max_attempts: Maximum number of ports to try (default: 100)
+
+    Returns:
+        int: The allocated port number
+
+    Raises:
+        RuntimeError: If unable to reserve a port within max_attempts
+            (socket bind and ETCD errors on individual ports are logged and retried, not raised)
+    """
+    request = PortAllocationRequest(
+        etcd_context=etcd_context,
+        metadata=metadata,
+        port_range=port_range,
+        block_size=1,
+        max_attempts=max_attempts,
+    )
+    allocated_ports = await allocate_and_reserve_port_block(request)
+    return allocated_ports[0]  # Return the single allocated port
+
+
+def get_host_ip() -> str:
+    """Get the IP address of the host.
+    This is needed for the side channel to work in multi-node deployments.
+    """
+    try:
+        host_name = socket.gethostname()
+    except socket.error as e:
+        logger.warning(f"Failed to get hostname: {e}, falling back to '127.0.0.1'")
+        return "127.0.0.1"
+    else:
+        try:
+            # Get the IP address of the hostname - this is needed for the side channel to work in multi-node deployments
+            host_ip = socket.gethostbyname(host_name)
+            # Test if the IP is actually usable by binding to it
+            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as test_socket:
+                test_socket.bind((host_ip, 0))
+            return host_ip
+        except socket.gaierror as e:
+            logger.warning(
+                f"Hostname '{host_name}' cannot be resolved: {e}, falling back to '127.0.0.1'"
+            )
+            return "127.0.0.1"
+        except socket.error as e:
+            # If hostname is not usable for binding, fall back to localhost
+            logger.warning(
+                f"Hostname '{host_name}' is not usable for binding: {e}, falling back to '127.0.0.1'"
+            )
+            return "127.0.0.1"

From 9a93f11b1dde903767d57cd2a222951e433f720e Mon Sep 17 00:00:00 2001
From: ishandhanani <82981111+ishandhanani@users.noreply.github.com>
Date: Wed, 30 Jul 2025 14:42:49 -0700
Subject: [PATCH 2/9] chore: fix install (#2191)

Co-authored-by: Anant Sharma <anants@nvidia.com>
Co-authored-by: Ishan Dhanani <idhanani@nvidia.com>
---
 README.md                          | 7 ++++++-
 container/Dockerfile.sglang        | 9 ++++-----
 container/Dockerfile.sglang-wideep | 2 +-
 3 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 759a9187d8..6a8fa61890 100644
--- a/README.md
+++ b/README.md
@@ -167,10 +167,15 @@ To specify which GPUs to use set environment variable `CUDA_VISIBLE_DEVICES`.
 
 ## SGLang
 
+
 ```
-# Install libnuma
+# Install libnuma-dev
 apt install -y libnuma-dev
 
+# Install flashinfer-python pre-release (required by sglang for optimized inference)
+uv pip install "flashinfer-python==0.2.9rc2" --prerelease=allow
+
+# Install ai-dynamo with sglang support
 uv pip install ai-dynamo[sglang]
 ```
 
diff --git a/container/Dockerfile.sglang b/container/Dockerfile.sglang
index 8557684096..620cad6a3e 100644
--- a/container/Dockerfile.sglang
+++ b/container/Dockerfile.sglang
@@ -378,8 +378,6 @@ RUN --mount=type=bind,source=./container/launch_message.txt,target=/workspace/la
     sed '/^#\s/d' /workspace/launch_message.txt > ~/.launch_screen && \
     echo "cat ~/.launch_screen" >> ~/.bashrc
 
-ENV PYTHONPATH=/workspace/dynamo/components/planner/src:/workspace/examples/sglang/utils:$PYTHONPATH
-
 ########################################
 ########## Development Image ###########
 ########################################
@@ -446,7 +444,10 @@ RUN apt-get update && \
 COPY --from=ci_minimum /workspace/target/release/metrics /usr/local/bin/metrics
 COPY --from=wheel_builder /workspace/dist/*.whl wheelhouse/
 COPY --from=base /workspace/wheels/nixl/*.whl wheelhouse/
-RUN uv pip install ai-dynamo[sglang] --find-links wheelhouse
+
+# Install flashinfer-python pre-release version separately, then install ai-dynamo with sglang support
+RUN uv pip install "flashinfer-python==0.2.9rc2" --prerelease=allow && \
+    uv pip install "ai-dynamo[sglang]" --find-links wheelhouse
 
 # Copy launch banner
 RUN --mount=type=bind,source=./container/launch_message.txt,target=/workspace/launch_message.txt \
@@ -466,7 +467,5 @@ RUN uv pip install /workspace/benchmarks
 # Copy attribution files
 COPY ATTRIBUTION* LICENSE /workspace/
 
-ENV PYTHONPATH=/workspace/examples/sglang/utils:$PYTHONPATH
-
 ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
 CMD []
diff --git a/container/Dockerfile.sglang-wideep b/container/Dockerfile.sglang-wideep
index 0bbcb3af23..c8746ceb9d 100644
--- a/container/Dockerfile.sglang-wideep
+++ b/container/Dockerfile.sglang-wideep
@@ -121,7 +121,7 @@ ENV CARGO_BUILD_JOBS=${CARGO_BUILD_JOBS:-16}
 RUN cargo build --release
 
 RUN cd lib/bindings/python && pip install --break-system-packages -e . && cd ../../..
-RUN pip install --break-system-packages -e .
+RUN pip install --break-system-packages .
 
 RUN wget --tries=3 --waitretry=5 https://github.com/nats-io/nats-server/releases/download/v2.10.28/nats-server-v2.10.28-${ARCH}.deb && \
     dpkg -i nats-server-v2.10.28-${ARCH}.deb && rm nats-server-v2.10.28-${ARCH}.deb

From 5d4b3ca07aeee3b6837e59f721f727b6f4198289 Mon Sep 17 00:00:00 2001
From: athreesh <anish.maddipoti@utexas.edu>
Date: Wed, 30 Jul 2025 15:48:43 -0700
Subject: [PATCH 3/9] changes to address QA issues

---
 README.md                            | 38 ++++++++++++++--------------
 components/backends/sglang/README.md | 12 ++++-----
 components/backends/trtllm/README.md | 12 ++++-----
 components/backends/vllm/README.md   | 12 ++++-----
 examples/README.md                   | 20 +++++++--------
 5 files changed, 47 insertions(+), 47 deletions(-)

diff --git a/README.md b/README.md
index 6a8fa61890..e56f0c62d6 100644
--- a/README.md
+++ b/README.md
@@ -21,12 +21,30 @@ limitations under the License.
 [![Discord](https://dcbadge.limes.pink/api/server/D92uqZRjCZ?style=flat)](https://discord.gg/D92uqZRjCZ)
 [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/ai-dynamo/dynamo)
 
-| **[Roadmap](https://github.com/ai-dynamo/dynamo/issues/762)** | **[Documentation](https://docs.nvidia.com/dynamo/latest/index.html)** | **[Examples](https://github.com/ai-dynamo/dynamo/tree/main/examples)** | **[Design Proposals](https://github.com/ai-dynamo/enhancements)** |
+| **[Roadmap](https://github.com/ai-dynamo/dynamo/issues/762)** | **[Documentation](https://docs.nvidia.com/dynamo/latest/index.html)** | **[Support Matrix](docs/support_matrix.md)** | **[Examples](https://github.com/ai-dynamo/dynamo/tree/main/examples)** | **[Design Proposals](https://github.com/ai-dynamo/enhancements)** |
 
 # NVIDIA Dynamo
 
 High-throughput, low-latency inference framework designed for serving generative AI and reasoning models in multi-node distributed environments.
 
+## Framework Support Matrix
+
+| Feature | vLLM | SGLang | TensorRT-LLM |
+|---------|----------------------|----------------------------|----------------------------------------|
+| [**Disaggregated Serving**](/docs/architecture/disagg_serving.md) | ✅ | ✅ | ✅ |
+| [**Conditional Disaggregation**](/docs/architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | 🚧 | 🚧 |
+| [**KV-Aware Routing**](/docs/architecture/kv_cache_routing.md) | ✅ | ✅ | ✅ |
+| [**SLA-Based Planner**](/docs/architecture/sla_planner.md) | ✅ | 🚧 | 🚧 |
+| [**Load Based Planner**](/docs/architecture/load_planner.md) | ✅ | 🚧 | 🚧 |
+| [**KVBM**](/docs/architecture/kvbm_architecture.md) | 🚧 | 🚧 | 🚧 |
+
+To learn more about each framework and its capabilities, check out each framework's README and deploy them with Dynamo!
+- **[vLLM](components/backends/vllm/README.md)**
+- **[SGLang](components/backends/sglang/README.md)**
+- **[TensorRT-LLM](components/backends/trtllm/README.md)**
+
+Built in Rust for performance and in Python for extensibility, Dynamo is fully open-source and driven by a transparent, OSS (Open Source Software) first development approach.
+
 ## The Era of Multi-GPU, Multi-Node
 
 <p align="center">
@@ -47,24 +65,6 @@ Dynamo is designed to be inference engine agnostic (supports TRT-LLM, vLLM, SGLa
   <img src="./docs/images/frontpage-architecture.png" alt="Dynamo architecture" width="600" />
 </p>
 
-## Framework Support Matrix
-
-| Feature | vLLM | SGLang | TensorRT-LLM |
-|---------|----------------------|----------------------------|----------------------------------------|
-| [**Disaggregated Serving**](/docs/architecture/disagg_serving.md) | ✅ | ✅ | ✅ |
-| [**Conditional Disaggregation**](/docs/architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | 🚧 | 🚧 |
-| [**KV-Aware Routing**](/docs/architecture/kv_cache_routing.md) | ✅ | ✅ | ✅ |
-| [**SLA-Based Planner**](/docs/architecture/sla_planner.md) | ✅ | 🚧 | 🚧 |
-| [**Load Based Planner**](/docs/architecture/load_planner.md) | ✅ | 🚧 | 🚧 |
-| [**KVBM**](/docs/architecture/kvbm_architecture.md) | 🚧 | 🚧 | 🚧 |
-
-To learn more about each framework and their capabilities, check out each framework's README!
-- **[vLLM](components/backends/vllm/README.md)**
-- **[SGLang](components/backends/sglang/README.md)**
-- **[TensorRT-LLM](components/backends/trtllm/README.md)**
-
-Built in Rust for performance and in Python for extensibility, Dynamo is fully open-source and driven by a transparent, OSS (Open Source Software) first development approach.
-
 # Installation
 
 The following examples require a few system level packages.
diff --git a/components/backends/sglang/README.md b/components/backends/sglang/README.md
index ffb58e76a0..e1d71516d5 100644
--- a/components/backends/sglang/README.md
+++ b/components/backends/sglang/README.md
@@ -34,12 +34,12 @@ git checkout $(git describe --tags $(git rev-list --tags --max-count=1))
 
 | Feature | SGLang | Notes |
 |---------|--------|-------|
-| [**Disaggregated Serving**](../../docs/architecture/disagg_serving.md) | ✅ |  |
-| [**Conditional Disaggregation**](../../docs/architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | WIP [PR](https://github.com/sgl-project/sglang/pull/7730) |
-| [**KV-Aware Routing**](../../docs/architecture/kv_cache_routing.md) | ✅ |  |
-| [**SLA-Based Planner**](../../docs/architecture/sla_planner.md) | ❌ | Planned |
-| [**Load Based Planner**](../../docs/architecture/load_planner.md) | ❌ | Planned |
-| [**KVBM**](../../docs/architecture/kvbm_architecture.md) | ❌ | Planned |
+| [**Disaggregated Serving**](../../../docs/architecture/disagg_serving.md) | ✅ |  |
+| [**Conditional Disaggregation**](../../../docs/architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | WIP [PR](https://github.com/sgl-project/sglang/pull/7730) |
+| [**KV-Aware Routing**](../../../docs/architecture/kv_cache_routing.md) | ✅ |  |
+| [**SLA-Based Planner**](../../../docs/architecture/sla_planner.md) | ❌ | Planned |
+| [**Load Based Planner**](../../../docs/architecture/load_planner.md) | ❌ | Planned |
+| [**KVBM**](../../../docs/architecture/kvbm_architecture.md) | ❌ | Planned |
 
 ### Large Scale P/D and WideEP Features
 
diff --git a/components/backends/trtllm/README.md b/components/backends/trtllm/README.md
index 3a5b495dce..b525bab329 100644
--- a/components/backends/trtllm/README.md
+++ b/components/backends/trtllm/README.md
@@ -49,12 +49,12 @@ git checkout $(git describe --tags $(git rev-list --tags --max-count=1))
 
 | Feature | TensorRT-LLM | Notes |
 |---------|--------------|-------|
-| [**Disaggregated Serving**](../../docs/architecture/disagg_serving.md) | ✅ |  |
-| [**Conditional Disaggregation**](../../docs/architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | Not supported yet |
-| [**KV-Aware Routing**](../../docs/architecture/kv_cache_routing.md) | ✅ |  |
-| [**SLA-Based Planner**](../../docs/architecture/sla_planner.md) | 🚧 | Planned |
-| [**Load Based Planner**](../../docs/architecture/load_planner.md) | 🚧 | Planned |
-| [**KVBM**](../../docs/architecture/kvbm_architecture.md) | 🚧 | Planned |
+| [**Disaggregated Serving**](../../../docs/architecture/disagg_serving.md) | ✅ |  |
+| [**Conditional Disaggregation**](../../../docs/architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | Not supported yet |
+| [**KV-Aware Routing**](../../../docs/architecture/kv_cache_routing.md) | ✅ |  |
+| [**SLA-Based Planner**](../../../docs/architecture/sla_planner.md) | 🚧 | Planned |
+| [**Load Based Planner**](../../../docs/architecture/load_planner.md) | 🚧 | Planned |
+| [**KVBM**](../../../docs/architecture/kvbm_architecture.md) | 🚧 | Planned |
 
 ### Large Scale P/D and WideEP Features
 
diff --git a/components/backends/vllm/README.md b/components/backends/vllm/README.md
index f20b9bb9d0..6ff95160bb 100644
--- a/components/backends/vllm/README.md
+++ b/components/backends/vllm/README.md
@@ -35,12 +35,12 @@ git checkout $(git describe --tags $(git rev-list --tags --max-count=1))
 
 | Feature | vLLM | Notes |
 |---------|------|-------|
-| [**Disaggregated Serving**](../../docs/architecture/disagg_serving.md) | ✅ |  |
-| [**Conditional Disaggregation**](../../docs/architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | WIP |
-| [**KV-Aware Routing**](../../docs/architecture/kv_cache_routing.md) | ✅ |  |
-| [**SLA-Based Planner**](../../docs/architecture/sla_planner.md) | ✅ |  |
-| [**Load Based Planner**](../../docs/architecture/load_planner.md) | 🚧 | WIP |
-| [**KVBM**](../../docs/architecture/kvbm_architecture.md) | 🚧 | WIP |
+| [**Disaggregated Serving**](../../../docs/architecture/disagg_serving.md) | ✅ |  |
+| [**Conditional Disaggregation**](../../../docs/architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | WIP |
+| [**KV-Aware Routing**](../../../docs/architecture/kv_cache_routing.md) | ✅ |  |
+| [**SLA-Based Planner**](../../../docs/architecture/sla_planner.md) | ✅ |  |
+| [**Load Based Planner**](../../../docs/architecture/load_planner.md) | 🚧 | WIP |
+| [**KVBM**](../../../docs/architecture/kvbm_architecture.md) | 🚧 | WIP |
 
 ### Large Scale P/D and WideEP Features
 
diff --git a/examples/README.md b/examples/README.md
index 13fdfe5ad2..225cf13dba 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -22,6 +22,15 @@ This directory contains practical examples demonstrating how to deploy and use D
 > **Want to see a specific example?**
 > Open a [GitHub issue](https://github.com/ai-dynamo/dynamo/issues) to request an example you'd like to see, or [open a pull request](https://github.com/ai-dynamo/dynamo/pulls) if you'd like to contribute your own!
 
+## Framework Support
+
+The /examples directory shows how Dynamo broadly works using major inference engines.
+
+If you want to see advanced, framework-specific deployment patterns and best practices, check out the [Components Workflows](../components/backends/) directory:
+- **[vLLM](../components/backends/vllm/)** – vLLM-specific deployment and configuration
+- **[SGLang](../components/backends/sglang/)** – SGLang integration examples and workflows
+- **[TensorRT-LLM](../components/backends/trtllm/)** – TensorRT-LLM workflows and optimizations
+
 ## Basics & Tutorials
 
 Learn fundamental Dynamo concepts through these introductory examples:
@@ -67,13 +76,4 @@ Before running any examples, ensure you have:
 - **Docker & Docker Compose** - For containerized services
 - **CUDA-compatible GPU** - For LLM inference (except hello_world, which is non-GPU aware)
 - **Python 3.9++** - For client scripts and utilities
-- **Kubernetes cluster** - For any cloud deployment/K8s examples
-
-## Framework Support
-
-These examples show how Dynamo broadly works using major inference engines.
-
-If you want to see advanced, framework-specific deployment patterns and best practices, check out the [Components Workflows](../components/backends/) directory:
-- **[vLLM](../components/backends/vllm/)** – vLLM-specific deployment and configuration
-- **[SGLang](../components/backends/sglang/)** – SGLang integration examples and workflows
-- **[TensorRT-LLM](../components/backends/trtllm/)** – TensorRT-LLM workflows and optimizations
\ No newline at end of file
+- **Kubernetes cluster** - For any cloud deployment/K8s examples
\ No newline at end of file

From aa42f9dc3708e0797347501c96f02b2e9fe46c07 Mon Sep 17 00:00:00 2001
From: athreesh <anish.maddipoti@utexas.edu>
Date: Wed, 30 Jul 2025 15:54:08 -0700
Subject: [PATCH 4/9] trigger PR refresh


From dbb1eaf64e67f036a0ef2e88ea74b8b2a07334c8 Mon Sep 17 00:00:00 2001
From: athreesh <anish.maddipoti@utexas.edu>
Date: Wed, 30 Jul 2025 17:41:06 -0700
Subject: [PATCH 5/9] Revert Dockerfile.sglang changes to match main

Remove the specific flashinfer-python installation and revert to the
standard ai-dynamo[sglang] --pre installation from main branch.
---
 container/Dockerfile.sglang | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/container/Dockerfile.sglang b/container/Dockerfile.sglang
index 620cad6a3e..90692150a0 100644
--- a/container/Dockerfile.sglang
+++ b/container/Dockerfile.sglang
@@ -444,10 +444,7 @@ RUN apt-get update && \
 COPY --from=ci_minimum /workspace/target/release/metrics /usr/local/bin/metrics
 COPY --from=wheel_builder /workspace/dist/*.whl wheelhouse/
 COPY --from=base /workspace/wheels/nixl/*.whl wheelhouse/
-
-# Install flashinfer-python pre-release version separately, then install ai-dynamo with sglang support
-RUN uv pip install "flashinfer-python==0.2.9rc2" --prerelease=allow && \
-    uv pip install "ai-dynamo[sglang]" --find-links wheelhouse
+RUN uv pip install "ai-dynamo[sglang]" --pre --find-links wheelhouse
 
 # Copy launch banner
 RUN --mount=type=bind,source=./container/launch_message.txt,target=/workspace/launch_message.txt \

From c8bd069a71d690b0c4bac352894635ff9191afa2 Mon Sep 17 00:00:00 2001
From: athreesh <anish.maddipoti@utexas.edu>
Date: Wed, 30 Jul 2025 17:43:29 -0700
Subject: [PATCH 6/9] same dockerfile as one in main

---
 container/Dockerfile.sglang | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/container/Dockerfile.sglang b/container/Dockerfile.sglang
index 90692150a0..a510272fb8 100644
--- a/container/Dockerfile.sglang
+++ b/container/Dockerfile.sglang
@@ -378,6 +378,8 @@ RUN --mount=type=bind,source=./container/launch_message.txt,target=/workspace/la
     sed '/^#\s/d' /workspace/launch_message.txt > ~/.launch_screen && \
     echo "cat ~/.launch_screen" >> ~/.bashrc
 
+ENV PYTHONPATH=/workspace/dynamo/components/planner/src:/workspace/examples/sglang/utils:$PYTHONPATH
+
 ########################################
 ########## Development Image ###########
 ########################################
@@ -464,5 +466,7 @@ RUN uv pip install /workspace/benchmarks
 # Copy attribution files
 COPY ATTRIBUTION* LICENSE /workspace/
 
+ENV PYTHONPATH=/workspace/examples/sglang/utils:$PYTHONPATH
+
 ENTRYPOINT ["/opt/nvidia/nvidia_entrypoint.sh"]
-CMD []
+CMD []
\ No newline at end of file

From d292048b5fe1e378d456051650071927b63ac6b4 Mon Sep 17 00:00:00 2001
From: athreesh <anish.maddipoti@utexas.edu>
Date: Mon, 4 Aug 2025 20:58:21 -0700
Subject: [PATCH 7/9] added containers & helm charts

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index e56f0c62d6..359585264a 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ limitations under the License.
 [![Discord](https://dcbadge.limes.pink/api/server/D92uqZRjCZ?style=flat)](https://discord.gg/D92uqZRjCZ)
 [![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/ai-dynamo/dynamo)
 
-| **[Roadmap](https://github.com/ai-dynamo/dynamo/issues/762)** | **[Documentation](https://docs.nvidia.com/dynamo/latest/index.html)** | **[Support Matrix](docs/support_matrix.md)** | **[Examples](https://github.com/ai-dynamo/dynamo/tree/main/examples)** | **[Design Proposals](https://github.com/ai-dynamo/enhancements)** |
+| **[Roadmap](https://github.com/ai-dynamo/dynamo/issues/762)** | **[Documentation](https://docs.nvidia.com/dynamo/latest/index.html)** | **[Support Matrix](docs/support_matrix.md)** | **[Examples](https://github.com/ai-dynamo/dynamo/tree/main/examples)** | **[Design Proposals](https://github.com/ai-dynamo/enhancements)** | **[Containers & Helm Charts](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/ai-dynamo/collections/ai-dynamos)** |
 
 # NVIDIA Dynamo
 

From fcc114d607db73305a4303b1e03b59a0e0504d92 Mon Sep 17 00:00:00 2001
From: athreesh <anish.maddipoti@utexas.edu>
Date: Mon, 4 Aug 2025 21:03:56 -0700
Subject: [PATCH 8/9] fix merge conflict + edit examples readme

---
 examples/README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/examples/README.md b/examples/README.md
index 225cf13dba..a2d917157d 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -17,20 +17,20 @@ limitations under the License.
 
 # Dynamo Examples
 
-This directory contains practical examples demonstrating how to deploy and use Dynamo for distributed LLM inference. Each example includes setup instructions, configuration files, and explanations to help you understand different deployment patterns and use cases.
-
-> **Want to see a specific example?**
-> Open a [GitHub issue](https://github.com/ai-dynamo/dynamo/issues) to request an example you'd like to see, or [open a pull request](https://github.com/ai-dynamo/dynamo/pulls) if you'd like to contribute your own!
-
 ## Framework Support
 
-The /examples directory shows how Dynamo broadly works using major inference engines.
+The /examples directory shows how Dynamo broadly works using various inference engines.
 
 If you want to see advanced, framework-specific deployment patterns and best practices, check out the [Components Workflows](../components/backends/) directory:
 - **[vLLM](../components/backends/vllm/)** – vLLM-specific deployment and configuration
 - **[SGLang](../components/backends/sglang/)** – SGLang integration examples and workflows
 - **[TensorRT-LLM](../components/backends/trtllm/)** – TensorRT-LLM workflows and optimizations
 
+This directory contains practical examples & tutorials demonstrating how to deploy and use Dynamo for distributed LLM inference. Each example includes setup instructions, configuration files, and explanations to help you understand different deployment patterns and use cases.
+
+> **Want to see a specific example?**
+> Open a [GitHub issue](https://github.com/ai-dynamo/dynamo/issues) to request an example you'd like to see, or [open a pull request](https://github.com/ai-dynamo/dynamo/pulls) if you'd like to contribute your own!
+
 ## Basics & Tutorials
 
 Learn fundamental Dynamo concepts through these introductory examples:

From 3e7186d3b52e9e3a43acd5086ded8ce1796c461e Mon Sep 17 00:00:00 2001
From: athreesh <anish.maddipoti@utexas.edu>
Date: Mon, 4 Aug 2025 21:14:51 -0700
Subject: [PATCH 9/9] adding a readme for benchmarks folder

---
 benchmarks/llm/README.md | 67 +++++++++++++++++++++++++++++++---------
 1 file changed, 52 insertions(+), 15 deletions(-)

diff --git a/benchmarks/llm/README.md b/benchmarks/llm/README.md
index e0cb8e976d..da55a2a0ea 100644
--- a/benchmarks/llm/README.md
+++ b/benchmarks/llm/README.md
@@ -1,15 +1,52 @@
-<!--
-SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-SPDX-License-Identifier: Apache-2.0
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
--->
-
-[../../examples/llm/benchmarks/README.md](../../examples/llm/benchmarks/README.md)
+# LLM Benchmarking Tools
+
+This directory contains tools for benchmarking LLM inference performance in Dynamo deployments.
+
+## Overview
+
+The benchmarking suite includes:
+- **`perf.sh`** - Automated performance benchmarking script using GenAI-Perf
+- **`plot_pareto.py`** - Results analysis and Pareto efficiency visualization
+- **`nginx.conf`** - Load balancer configuration for multi-backend setups
+
+## Key Parameters
+
+| Parameter | Description | Default |
+|-----------|-------------|---------|
+| `--tensor-parallelism, --tp` | Tensor parallelism for aggregated mode | 0 |
+| `--data-parallelism, --dp` | Data parallelism for aggregated mode | 0 |
+| `--prefill-tp` | Prefill tensor parallelism for disaggregated mode | 0 |
+| `--prefill-dp` | Prefill data parallelism for disaggregated mode | 0 |
+| `--decode-tp` | Decode tensor parallelism for disaggregated mode | 0 |
+| `--decode-dp` | Decode data parallelism for disaggregated mode | 0 |
+| `--model` | HuggingFace model ID | `neuralmagic/DeepSeek-R1-Distill-Llama-70B-FP8-dynamic` |
+| `--url` | Target inference endpoint | `http://localhost:8000` |
+| `--concurrency` | Comma-separated concurrency levels | `1,2,4,8,16,32,64,128,256` |
+| `--isl` | Input sequence length | 3000 |
+| `--osl` | Output sequence length | 150 |
+| `--mode` | Serving mode (`aggregated` or `disaggregated`) | `aggregated` |
+
+
+## Best Practices
+
+1. **Warm up services** before benchmarking to ensure stable performance
+2. **Match parallelism settings** to your actual deployment configuration
+3. **Run multiple benchmark iterations** for statistical confidence
+4. **Monitor resource utilization** during benchmarks to identify bottlenecks
+5. **Compare configurations** using Pareto plots to find optimal settings
+
+## Requirements
+
+- GenAI-Perf tool installed and available in PATH
+- Python 3.7+ with matplotlib, pandas, seaborn, numpy
+- nginx (for load balancing scenarios)
+- Access to target LLM inference service
+
+## Troubleshooting
+
+- Ensure the target URL is accessible before running benchmarks
+- Verify model names match those available in your deployment
+- Check that parallelism settings align with your hardware configuration
+- Monitor system resources to avoid resource contention during benchmarks
+
+