-
Notifications
You must be signed in to change notification settings - Fork 894
Expand file tree
/
Copy pathdisagg_planner.yaml
More file actions
129 lines (128 loc) · 3.61 KB
/
disagg_planner.yaml
File metadata and controls
129 lines (128 loc) · 3.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# DynamoGraphDeployment "trtllm-disagg-planner": one frontend, one planner, and
# separate decode / prefill `dynamo.trtllm` workers, all sharing the same
# dynamoNamespace.
#
# The image reference `my-registry/trtllm-runtime:my-tag` looks like a
# placeholder -- presumably it must be replaced with a real registry/tag
# before applying; confirm against the deployment guide.
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: trtllm-disagg-planner
spec:
  # The PVC is only referenced here (create: false); it is not created by this
  # manifest.
  pvcs:
    - name: dynamo-pvc
      create: false
  services:
    # Runs `python3 -m dynamo.frontend` listening on HTTP port 8000 with the
    # router in `kv` mode. Numeric flag values are quoted so they remain
    # strings in the args list.
    Frontend:
      dynamoNamespace: trtllm-disagg-planner
      componentType: frontend
      replicas: 1
      extraPodSpec:
        mainContainer:
          image: my-registry/trtllm-runtime:my-tag
          workingDir: /workspace/examples/backends/trtllm
          command:
            - python3
          args:
            - -m
            - dynamo.frontend
            - --http-port
            - "8000"
            - --kv-cache-block-size
            - "128"
            - --router-mode
            - kv
            - --kv-overlap-score-weight
            - "0.0"
            - --router-temperature
            - "0.0"
            - --no-kv-events

    # Runs `python3 -m planner_sla` for the trtllm backend. It reads profiling
    # results from /data, which is where dynamo-pvc is mounted below.
    Planner:
      dynamoNamespace: trtllm-disagg-planner
      envFromSecret: hf-token-secret
      componentType: planner
      replicas: 1
      volumeMounts:
        # The PVC must be pre-created before deployment, and the SLA profiler
        # must already have been run (its results are read from this mount).
        - name: dynamo-pvc
          mountPoint: /data
      extraPodSpec:
        mainContainer:
          image: my-registry/trtllm-runtime:my-tag
          workingDir: /workspace/components/src/dynamo/planner
          ports:
            # Same port number as --prometheus-port in args below.
            - name: metrics
              containerPort: 9085
          command:
            - python3
          args:
            - -m
            - planner_sla
            - --environment=kubernetes
            - --backend=trtllm
            - --adjustment-interval=60
            - --profile-results-dir=/data
            - --prometheus-port=9085

    # Decode-side worker: `dynamo.trtllm` started with
    # `--disaggregation-mode decode`, serving Qwen/Qwen3-0.6B on one GPU.
    TRTLLMDecodeWorker:
      dynamoNamespace: trtllm-disagg-planner
      envFromSecret: hf-token-secret
      componentType: worker
      subComponentType: decode
      replicas: 1
      # NOTE(review): failureThreshold is 1, so under standard Kubernetes
      # probe semantics a single failed /live check would restart the
      # container -- confirm this is intended.
      livenessProbe:
        httpGet:
          path: /live
          port: 9090
        periodSeconds: 5
        timeoutSeconds: 30
        failureThreshold: 1
      readinessProbe:
        httpGet:
          path: /health
          port: 9090
        periodSeconds: 10
        timeoutSeconds: 30
        failureThreshold: 60
      resources:
        limits:
          gpu: "1"
      extraPodSpec:
        terminationGracePeriodSeconds: 600
        mainContainer:
          image: my-registry/trtllm-runtime:my-tag
          workingDir: /workspace/
          command:
            - python3
          args:
            - -m
            - dynamo.trtllm
            - --model-path
            - Qwen/Qwen3-0.6B
            - --served-model-name
            - Qwen/Qwen3-0.6B
            # Relative path; resolved against workingDir (/workspace/).
            - --extra-engine-args
            - ./examples/backends/trtllm/engine_configs/qwen3/decode.yaml
            - --disaggregation-mode
            - decode

    # Prefill-side worker: same image, model and GPU limit as the decode
    # worker, started with `--disaggregation-mode prefill`.
    # NOTE(review): unlike TRTLLMDecodeWorker, no liveness/readiness probes are
    # declared here -- presumably defaults apply; confirm.
    TRTLLMPrefillWorker:
      dynamoNamespace: trtllm-disagg-planner
      envFromSecret: hf-token-secret
      componentType: worker
      subComponentType: prefill
      replicas: 1
      resources:
        limits:
          gpu: "1"
      extraPodSpec:
        terminationGracePeriodSeconds: 600
        mainContainer:
          image: my-registry/trtllm-runtime:my-tag
          workingDir: /workspace/
          command:
            - python3
          args:
            - -m
            - dynamo.trtllm
            - --model-path
            - Qwen/Qwen3-0.6B
            - --served-model-name
            - Qwen/Qwen3-0.6B
            # Relative path; resolved against workingDir (/workspace/).
            - --extra-engine-args
            - ./examples/backends/trtllm/engine_configs/qwen3/prefill.yaml
            - --disaggregation-mode
            - prefill