ai-dynamo · tedzhouhk · Sep 10, 2025 · Sep 5, 2025 · Sep 8, 2025 · Sep 9, 2025
diff --git a/README.md b/README.md
@@ -59,7 +59,7 @@ Dynamo is designed to be inference engine agnostic (supports TRT-LLM, vLLM, SGLa
 | [**Conditional Disaggregation**](/docs/architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | 🚧 | 🚧 |
 | [**KV-Aware Routing**](/docs/architecture/kv_cache_routing.md) | ✅ | ✅ | ✅ |
 | [**Load Based Planner**](/docs/architecture/load_planner.md) | 🚧 | 🚧 | 🚧 |
-| [**SLA-Based Planner**](/docs/architecture/sla_planner.md) | ✅ | ✅ | 🚧 |
+| [**SLA-Based Planner**](/docs/architecture/sla_planner.md) | ✅ | ✅ | ✅ |
 | [**KVBM**](/docs/architecture/kvbm_architecture.md) | ✅ | 🚧 | ✅ |
 
 To learn more about each framework and their capabilities, check out each framework's README!

@@ -548,8 +548,8 @@ async def run_profile(args):
         "--backend",
         type=str,
         default="vllm",
-        choices=["vllm", "sglang"],
-        help="backend type, currently support [vllm, sglang]",
+        choices=["vllm", "sglang", "trtllm"],
+        help="backend type, currently support [vllm, sglang, trtllm]",
     )
     parser.add_argument(
         "--config",

diff --git a/components/backends/trtllm/README.md b/components/backends/trtllm/README.md
@@ -55,7 +55,7 @@ git checkout $(git describe --tags $(git rev-list --tags --max-count=1))
 | [**Disaggregated Serving**](../../../docs/architecture/disagg_serving.md) | ✅ |  |
 | [**Conditional Disaggregation**](../../../docs/architecture/disagg_serving.md#conditional-disaggregation) | 🚧 | Not supported yet |
 | [**KV-Aware Routing**](../../../docs/architecture/kv_cache_routing.md) | ✅ |  |
-| [**SLA-Based Planner**](../../../docs/architecture/sla_planner.md) | 🚧 | Planned |
+| [**SLA-Based Planner**](../../../docs/architecture/sla_planner.md) | ✅ |  |
 | [**Load Based Planner**](../../../docs/architecture/load_planner.md) | 🚧 | Planned |
 | [**KVBM**](../../../docs/architecture/kvbm_architecture.md) | 🚧 | Planned |
 

diff --git a/components/backends/trtllm/deploy/README.md b/components/backends/trtllm/deploy/README.md
@@ -42,6 +42,19 @@ Aggregated deployment with custom configuration.
 - `Frontend`: OpenAI-compatible API server (with kv router mode disabled)
 - `TRTLLMWorker`: Single worker handling both prefill and decode with custom configuration mounted from the configmap
 
+### 6. **Disaggregated Planner Deployment** (`disagg_planner.yaml`)
+Advanced disaggregated deployment with SLA-based automatic scaling.
+
+**Architecture:**
+- `Frontend`: HTTP API server coordinating between workers
+- `Planner`: SLA-based planner that monitors performance and scales workers automatically
+- `Prometheus`: Metrics collection and monitoring
+- `TRTLLMDecodeWorker`: Specialized decode-only worker
+- `TRTLLMPrefillWorker`: Specialized prefill-only worker
+
+> [!NOTE]
+> This deployment requires pre-deployment profiling to be completed first. See [Pre-Deployment Profiling](../../../../docs/benchmarks/pre_deployment_profiling.md) for detailed instructions.
+
 ## CRD Structure
 
 All templates use the **DynamoGraphDeployment** CRD:

diff --git a/components/backends/trtllm/deploy/disagg_planner.yaml b/components/backends/trtllm/deploy/disagg_planner.yaml
@@ -0,0 +1,205 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: trtllm-disagg-planner
+spec:
+  envs:
+    - name: DYNAMO_SERVICE_CONFIG
+      value: '{"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:8000"]}]},{"job_name":"frontend","static_configs":[{"targets":["trtllm-disagg-planner-frontend:8000"]}]}]}}'
+    - name: DYNAMO_NAMESPACE
+      value: "trtllm-disagg-planner"
+  services:
+    Frontend:
+      dynamoNamespace: trtllm-disagg-planner
+      componentType: frontend
+      replicas: 1
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidian/dynamo-dev/dynamo-trtllm-runtime:hzhou-0909-03
+          workingDir: /workspace/components/backends/trtllm
+          command:
+            - python3
+          args:
+            - -m
+            - dynamo.frontend
+            - --http-port
+            - "8000"
+            - --kv-cache-block-size
+            - "128"
+            - --router-mode
+            - kv
+            - --kv-overlap-score-weight
+            - "0.0"
+            - --router-temperature
+            - "0.0"
+            - --no-kv-events
+    Planner:
+      dynamoNamespace: trtllm-disagg-planner
+      envFromSecret: hf-token-secret
+      componentType: planner
+      replicas: 1
+      envs:
+        - name: PROMETHEUS_PORT
+          value: "8000"
+      livenessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      readinessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
+        initialDelaySeconds: 60
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      pvc:
+        create: false
+        name: dynamo-pvc # Must be pre-created before deployment, and must already contain the results of an SLA profiler run
+        mountPoint: /workspace/profiling_results
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidian/dynamo-dev/dynamo-trtllm-runtime:hzhou-0909-03
+          workingDir: /workspace/components/planner/src/dynamo/planner
+          ports:
+            - name: metrics
+              containerPort: 9085
+          command:
+            - python3
+          args:
+            - -m
+            - planner_sla
+            - --environment=kubernetes
+            - --backend=trtllm
+            - --adjustment-interval=60
+            - --profile-results-dir=/workspace/profiling_results
+            - --prometheus-port=9085
+    Prometheus: # NOTE: componentType is set to "frontend" on Prometheus to ensure a service is created for the Prometheus component. This is a workaround and should be managed differently.
+      dynamoNamespace: trtllm-disagg-planner
+      componentType: frontend
+      replicas: 1
+      envs:
+        - name: PYTHONPATH
+          value: "/workspace/components/planner/src"
+        - name: PROMETHEUS_PORT
+          value: "8000"
+      livenessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      readinessProbe:
+        exec:
+          command:
+            - /bin/sh
+            - -c
+            - "exit 0"
+        initialDelaySeconds: 30
+        periodSeconds: 60
+        timeoutSeconds: 30
+        failureThreshold: 10
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidian/dynamo-dev/dynamo-trtllm-runtime:hzhou-0909-03
+          workingDir: /workspace/components/backends/trtllm
+          command:
+            - python3
+          args:
+            - -m
+            - dynamo.planner.prometheus
+    TRTLLMDecodeWorker:
+      dynamoNamespace: trtllm-disagg-planner
+      envFromSecret: hf-token-secret
+      componentType: worker
+      replicas: 1
+      livenessProbe:
+        httpGet:
+          path: /live
+          port: 9090
+        periodSeconds: 5
+        timeoutSeconds: 30
+        failureThreshold: 1
+      readinessProbe:
+        httpGet:
+          path: /health
+          port: 9090
+        periodSeconds: 10
+        timeoutSeconds: 30
+        failureThreshold: 60
+      resources:
+        limits:
+          gpu: "1"
+      extraPodSpec:
+        terminationGracePeriodSeconds: 600
+        mainContainer:
+          startupProbe:
+            httpGet:
+              path: /health
+              port: 9090
+            periodSeconds: 10
+            failureThreshold: 60
+          image: nvcr.io/nvidian/dynamo-dev/dynamo-trtllm-runtime:hzhou-0909-03
+          workingDir: /workspace/components/backends/trtllm
+          command:
+            - python3
+          args:
+            - -m
+            - dynamo.trtllm
+            - --model-path
+            - Qwen/Qwen3-0.6B
+            - --served-model-name
+            - Qwen/Qwen3-0.6B
+            - --extra-engine-args
+            - engine_configs/decode.yaml
+            - --disaggregation-mode
+            - decode
+            - --disaggregation-strategy
+            - decode_first
+    TRTLLMPrefillWorker:
+      dynamoNamespace: trtllm-disagg-planner
+      envFromSecret: hf-token-secret
+      componentType: worker
+      replicas: 1
+      resources:
+        limits:
+          gpu: "1"
+      extraPodSpec:
+        terminationGracePeriodSeconds: 600
+        mainContainer:
+          startupProbe:
+            httpGet:
+              path: /health
+              port: 9090
+            periodSeconds: 10
+            failureThreshold: 60
+          image: nvcr.io/nvidian/dynamo-dev/dynamo-trtllm-runtime:hzhou-0909-03
+          workingDir: /workspace/components/backends/trtllm
+          command:
+            - python3
+          args:
+            - -m
+            - dynamo.trtllm
+            - --model-path
+            - Qwen/Qwen3-0.6B
+            - --served-model-name
+            - Qwen/Qwen3-0.6B
+            - --extra-engine-args
+            - engine_configs/prefill.yaml
+            - --disaggregation-mode
+            - prefill
+            - --disaggregation-strategy
+            - decode_first
@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import asyncio
+import json
 import logging
 import os
 import signal
@@ -22,6 +23,7 @@
 from transformers import AutoConfig
 
 import dynamo.nixl_connect as nixl_connect
+from benchmarks.profiler.utils.config import deep_update
 from dynamo.llm import ModelInput, ModelRuntimeConfig, ModelType, register_llm
 from dynamo.runtime import DistributedRuntime, dynamo_worker
 from dynamo.runtime.logging import configure_dynamo_logging
@@ -192,6 +194,17 @@ async def init(runtime: DistributedRuntime, config: Config):
     if config.extra_engine_args != "":
         # TODO: Support extra engine args from json file as well.
         arg_map = update_llm_args_with_extra_options(arg_map, config.extra_engine_args)
+
+    # Apply override_engine_args if provided
+    if config.override_engine_args != "":
+        try:
+            overrides = json.loads(config.override_engine_args)
+            logging.info(f"Applying engine arg overrides: {overrides}")
+
+            deep_update(arg_map, overrides)
+        except json.JSONDecodeError as e:
+            logging.error(f"Failed to parse override_engine_args as JSON: {e}")
+            sys.exit(1)
     if config.publish_events_and_metrics:
         # 'event_buffer_max_size' is required to enable TRTLLM to publish kv cache events.
         kv_cache_config = None

@@ -46,6 +46,7 @@ def __init__(self) -> None:
         self.max_beam_width: int = BuildConfig.max_beam_width
         self.free_gpu_memory_fraction: Optional[float] = None
         self.extra_engine_args: str = ""
+        self.override_engine_args: str = ""
         self.publish_events_and_metrics: bool = False
         self.disaggregation_mode: DisaggregationMode = DEFAULT_DISAGGREGATION_MODE
         self.disaggregation_strategy: DisaggregationStrategy = (
@@ -77,6 +78,7 @@ def __str__(self) -> str:
             f"max_beam_width={self.max_beam_width}, "
             f"free_gpu_memory_fraction={self.free_gpu_memory_fraction}, "
             f"extra_engine_args={self.extra_engine_args}, "
+            f"override_engine_args={self.override_engine_args}, "
             f"migration_limit={self.migration_limit}, "
             f"publish_events_and_metrics={self.publish_events_and_metrics}, "
             f"disaggregation_mode={self.disaggregation_mode}, "
@@ -217,6 +219,12 @@ def cmd_line_args():
         default="",
         help="Path to a YAML file containing additional keyword arguments to pass to the TRTLLM engine.",
     )
+    parser.add_argument(
+        "--override-engine-args",
+        type=str,
+        default="",
+        help='JSON string to override specific engine arguments from the YAML file. Example: \'{"tensor_parallel_size": 2, "kv_cache_config": {"enable_block_reuse": false}}\'',
+    )
     parser.add_argument(
         "--publish-events-and-metrics",
         action="store_true",
@@ -352,6 +360,7 @@ def cmd_line_args():
     config.kv_block_size = args.kv_block_size
     config.migration_limit = args.migration_limit
     config.extra_engine_args = args.extra_engine_args
+    config.override_engine_args = args.override_engine_args
     config.publish_events_and_metrics = args.publish_events_and_metrics
     config.modality = args.modality
 

@@ -101,7 +101,22 @@ class SGLangComponentName:
     decode_worker_endpoint = "generate"
 
 
+class TrtllmComponentName:
+    # Note: Planner only supports DECODE_FIRST strategy in TRT-LLM:
+    # - Decode worker is the first worker (tensorrt_llm)
+    # - Prefill worker is the next worker (tensorrt_llm_next)
+    prefill_worker_k8s_name = "TRTLLMPrefillWorker"
+    prefill_worker_component_name = (
+        "tensorrt_llm_next"  # Prefill is "next" with DECODE_FIRST
+    )
+    prefill_worker_endpoint = "generate"
+    decode_worker_k8s_name = "TRTLLMDecodeWorker"
+    decode_worker_component_name = "tensorrt_llm"  # Decode is "first" with DECODE_FIRST
+    decode_worker_endpoint = "generate"
+
+
 WORKER_COMPONENT_NAMES = {
     "vllm": VllmComponentName,
     "sglang": SGLangComponentName,
+    "trtllm": TrtllmComponentName,
 }
@@ -39,7 +39,7 @@ def create_sla_planner_parser() -> argparse.ArgumentParser:
     parser.add_argument(
         "--backend",
         default=SLAPlannerDefaults.backend,
-        choices=["vllm", "sglang"],
+        choices=["vllm", "sglang", "trtllm"],
         help="Backend type",
     )
     parser.add_argument(

@@ -237,6 +237,7 @@ COPY components/ /workspace/components/
 COPY tests /workspace/tests
 COPY benchmarks /workspace/benchmarks
 COPY examples /workspace/examples
+COPY deploy /workspace/deploy
 RUN uv pip install /workspace/benchmarks
 
 # Copy benchmarks, backends and tests for CI

@@ -322,6 +322,10 @@ RUN . /opt/dynamo/venv/bin/activate && \
 RUN pip install dist/ai_dynamo_runtime*cp312*.whl  && \
     pip install dist/ai_dynamo*any.whl
 
+# Install common dependencies including aiofiles
+RUN --mount=type=bind,source=./container/deps/requirements.txt,target=/tmp/requirements.txt \
+    pip install --requirement /tmp/requirements.txt
+
 ENV DYNAMO_HOME=/workspace
 # Copy launch banner
 RUN --mount=type=bind,source=./container/launch_message.txt,target=/workspace/launch_message.txt \

@@ -24,7 +24,7 @@
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
-import aiofiles  # type: ignore[import-untyped]
+import aiofiles
 import httpx  # added for HTTP requests
 import kubernetes_asyncio as kubernetes
 import yaml

diff --git a/docs/architecture/planner_intro.rst b/docs/architecture/planner_intro.rst
@@ -44,7 +44,7 @@ Key features include:
      - ✅
      - vLLM
    * -
-     - ❌
+     - ✅
      - TensorRT-LLM
    * -
      - ❌
-Original file line number
+Diff line change
@@ Expand Up / @@ -44,7 +44,7 @@ Key features include: @@
          - ✅
          - vLLM
        * -
-         - ❌
+         - ✅
          - TensorRT-LLM
        * -
          - ❌
@@ Expand Down @@