diff --git a/examples/llm/deploy/agg.yaml b/examples/llm/deploy/agg.yaml
new file mode 100644
index 0000000000..c277e974c6
--- /dev/null
+++ b/examples/llm/deploy/agg.yaml
@@ -0,0 +1,100 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment  # services defined below: Frontend, Processor, VllmWorker
+metadata:
+  name: llm-agg
+spec:
+  envs:
+    - name: DYN_DEPLOYMENT_CONFIG  # single-quoted JSON with per-service settings
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"router":"round-robin","router-num-threads":4,"common-configs":["model","block-size","max-model-len"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
+  services:
+    Frontend:
+      componentType: main
+      dynamoNamespace: llm-agg
+      replicas: 1
+      resources:
+        requests:
+          cpu: '1'
+          memory: '2Gi'
+        limits:  # identical to requests
+          cpu: '1'
+          memory: '2Gi'
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/llm
+          args:
+            - dynamo
+            - serve
+            - graphs.agg:Frontend
+            - --system-app-port
+            - '5000'  # quoted so the argument stays a string
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend
+    Processor:
+      componentType: worker
+      dynamoNamespace: llm-agg
+      replicas: 1
+      resources:
+        requests:
+          cpu: '1'
+          memory: '2Gi'
+        limits:  # identical to requests
+          cpu: '1'
+          memory: '2Gi'
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/llm
+          args:
+            - dynamo
+            - serve
+            - graphs.agg:Processor
+            - --system-app-port
+            - '5000'
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Processor
+    VllmWorker:
+      dynamoNamespace: llm-agg
+      envFromSecret: hf-token-secret  # NOTE(review): secret must exist in the target namespace - confirm
+      replicas: 1
+      resources:
+        requests:
+          cpu: '10'
+          memory: '20Gi'
+          nvidia.com/gpu: '1'
+        limits:  # identical to requests
+          cpu: '10'
+          memory: '20Gi'
+          nvidia.com/gpu: '1'
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/llm
+          args:
+            - dynamo
+            - serve
+            - graphs.agg:VllmWorker
+            - --system-app-port
+            - '5000'
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmWorker
diff --git a/examples/llm/deploy/agg_router.yaml b/examples/llm/deploy/agg_router.yaml
new file mode 100644
index 0000000000..fa40fe2e31
--- /dev/null
+++ b/examples/llm/deploy/agg_router.yaml
@@ -0,0 +1,125 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment  # services defined below: Frontend, Processor, Router, VllmWorker
+metadata:
+  name: agg-router
+spec:
+  envs:
+    - name: DYN_DEPLOYMENT_CONFIG  # single-quoted JSON with per-service settings
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","router":"kv","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"common-configs":["model","block-size","max-model-len","router"]},"Router":{"min-workers":1,"softmax-sample":true,"common-configs":["model","block-size","router"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"tensor-parallel-size":1,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","router","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
+  services:
+    Frontend:
+      componentType: main
+      dynamoNamespace: llm-agg-router
+      replicas: 1
+      resources:
+        requests:
+          cpu: '1'
+          memory: '2Gi'
+        limits:  # identical to requests
+          cpu: '1'
+          memory: '2Gi'
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/llm
+          args:
+            - dynamo
+            - serve
+            - graphs.agg_router:Frontend
+            - --system-app-port
+            - '5000'  # quoted so the argument stays a string
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend
+    Processor:
+      componentType: worker
+      dynamoNamespace: llm-agg-router
+      replicas: 1
+      resources:
+        requests:
+          cpu: '1'
+          memory: '2Gi'
+        limits:  # identical to requests
+          cpu: '1'
+          memory: '2Gi'
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/llm
+          args:
+            - dynamo
+            - serve
+            - graphs.agg_router:Processor
+            - --system-app-port
+            - '5000'
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Processor
+    Router:
+      componentType: worker
+      dynamoNamespace: llm-agg-router
+      replicas: 1
+      resources:
+        requests:
+          cpu: '1'
+          memory: '2Gi'
+        limits:  # identical to requests
+          cpu: '1'
+          memory: '2Gi'
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/llm
+          args:
+            - dynamo
+            - serve
+            - graphs.agg_router:Router
+            - --system-app-port
+            - '5000'
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Router
+    VllmWorker:
+      dynamoNamespace: llm-agg-router
+      envFromSecret: hf-token-secret  # NOTE(review): secret must exist in the target namespace - confirm
+      replicas: 1
+      resources:
+        requests:
+          cpu: '10'
+          memory: '20Gi'
+          nvidia.com/gpu: '1'
+        limits:  # identical to requests
+          cpu: '10'
+          memory: '20Gi'
+          nvidia.com/gpu: '1'
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/llm
+          args:
+            - dynamo
+            - serve
+            - graphs.agg_router:VllmWorker
+            - --system-app-port
+            - '5000'
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmWorker
diff --git a/examples/llm/deploy/disagg.yaml b/examples/llm/deploy/disagg.yaml
new file mode 100644
index 0000000000..d64089f5a2
--- /dev/null
+++ b/examples/llm/deploy/disagg.yaml
@@ -0,0 +1,127 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment  # services defined below: Frontend, Processor, VllmWorker, PrefillWorker
+metadata:
+  name: llm-disagg
+spec:
+  envs:
+    - name: DYN_DEPLOYMENT_CONFIG  # single-quoted JSON with per-service settings
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"router":"round-robin","common-configs":["model","block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
+  services:
+    Frontend:
+      componentType: main
+      dynamoNamespace: llm-disagg
+      replicas: 1
+      resources:
+        requests:
+          cpu: '1'
+          memory: '2Gi'
+        limits:  # identical to requests
+          cpu: '1'
+          memory: '2Gi'
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/llm
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg:Frontend
+            - --system-app-port
+            - '5000'  # quoted so the argument stays a string
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend
+    Processor:
+      componentType: worker
+      dynamoNamespace: llm-disagg
+      replicas: 1
+      resources:
+        requests:
+          cpu: '1'
+          memory: '2Gi'
+        limits:  # identical to requests
+          cpu: '1'
+          memory: '2Gi'
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/llm
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg:Processor
+            - --system-app-port
+            - '5000'
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Processor
+    VllmWorker:
+      dynamoNamespace: llm-disagg
+      envFromSecret: hf-token-secret  # NOTE(review): secret must exist in the target namespace - confirm
+      replicas: 1
+      resources:
+        requests:
+          cpu: '10'
+          memory: '20Gi'
+          nvidia.com/gpu: '1'
+        limits:  # identical to requests
+          cpu: '10'
+          memory: '20Gi'
+          nvidia.com/gpu: '1'
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/llm
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg:VllmWorker
+            - --system-app-port
+            - '5000'
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmWorker
+    PrefillWorker:
+      dynamoNamespace: llm-disagg
+      envFromSecret: hf-token-secret
+      replicas: 1
+      resources:
+        requests:
+          cpu: '10'
+          memory: '20Gi'
+          nvidia.com/gpu: '1'
+        limits:  # identical to requests
+          cpu: '10'
+          memory: '20Gi'
+          nvidia.com/gpu: '1'
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/llm
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg:PrefillWorker
+            - --system-app-port
+            - '5000'
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - PrefillWorker
diff --git a/examples/llm/deploy/disagg_router.yaml b/examples/llm/deploy/disagg_router.yaml
new file mode 100644
index 0000000000..152d09f7ed
--- /dev/null
+++ b/examples/llm/deploy/disagg_router.yaml
@@ -0,0 +1,152 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment  # services defined below: Frontend, Processor, Router, VllmWorker, PrefillWorker
+metadata:
+  name: disagg-router
+spec:
+  envs:
+    - name: DYN_DEPLOYMENT_CONFIG  # single-quoted JSON with per-service settings
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"router":"kv","kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.Processor.chat/completions","port":8000},"Processor":{"common-configs":["model","block-size","max-model-len","router"]},"Router":{"min-workers":1,"common-configs":["model","block-size","router"]},"VllmWorker":{"max-num-batched-tokens":16384,"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"tensor-parallel-size":1,"enable-prefix-caching":true,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","router","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"ServiceArgs":{"workers":1,"resources":{"gpu":"1"}},"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Planner":{"environment":"kubernetes","no-operation":true}}'
+  services:
+    Frontend:
+      componentType: main
+      dynamoNamespace: llm-disagg-router
+      replicas: 1
+      resources:
+        requests:
+          cpu: '1'
+          memory: '2Gi'
+        limits:  # identical to requests
+          cpu: '1'
+          memory: '2Gi'
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/llm
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_router:Frontend
+            - --system-app-port
+            - '5000'  # quoted so the argument stays a string
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend
+    Processor:
+      componentType: worker
+      dynamoNamespace: llm-disagg-router
+      replicas: 1
+      resources:
+        requests:
+          cpu: '1'
+          memory: '2Gi'
+        limits:  # identical to requests
+          cpu: '1'
+          memory: '2Gi'
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/llm
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_router:Processor
+            - --system-app-port
+            - '5000'
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Processor
+    Router:
+      componentType: worker
+      dynamoNamespace: llm-disagg-router
+      replicas: 1
+      resources:
+        requests:
+          cpu: '1'
+          memory: '2Gi'
+        limits:  # identical to requests
+          cpu: '1'
+          memory: '2Gi'
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/llm
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_router:Router
+            - --system-app-port
+            - '5000'
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Router
+    VllmWorker:
+      dynamoNamespace: llm-disagg-router
+      envFromSecret: hf-token-secret  # NOTE(review): secret must exist in the target namespace - confirm
+      replicas: 1
+      resources:
+        requests:
+          cpu: '10'
+          memory: '20Gi'
+          nvidia.com/gpu: '1'
+        limits:  # identical to requests
+          cpu: '10'
+          memory: '20Gi'
+          nvidia.com/gpu: '1'
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/llm
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_router:VllmWorker
+            - --system-app-port
+            - '5000'
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmWorker
+    PrefillWorker:
+      dynamoNamespace: llm-disagg-router
+      envFromSecret: hf-token-secret
+      replicas: 1
+      resources:
+        requests:
+          cpu: '10'
+          memory: '20Gi'
+          nvidia.com/gpu: '1'
+        limits:  # identical to requests
+          cpu: '10'
+          memory: '20Gi'
+          nvidia.com/gpu: '1'
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/llm
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_router:PrefillWorker
+            - --system-app-port
+            - '5000'
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - PrefillWorker
diff --git a/examples/vllm_v0/deploy/agg.yaml b/examples/vllm_v0/deploy/agg.yaml
new file mode 100644
index 0000000000..45af6f3cee
--- /dev/null
+++ b/examples/vllm_v0/deploy/agg.yaml
@@ -0,0 +1,75 @@
+#
SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment  # services defined below: Frontend, VllmWorker
+metadata:
+  name: vllm-v0-agg  # same as dynamoNamespace so the resource name is unique across the example manifests
+spec:
+  envs:
+    - name: DYN_DEPLOYMENT_CONFIG  # single-quoted JSON with per-service settings
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len"]}}'
+  services:
+    Frontend:
+      dynamoNamespace: vllm-v0-agg
+      componentType: main
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:  # identical to requests
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v0
+          args:
+            - dynamo
+            - serve
+            - graphs.agg:Frontend
+            - --system-app-port
+            - "5000"  # quoted so the argument stays a string
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend
+    VllmWorker:
+      envFromSecret: hf-token-secret  # NOTE(review): secret must exist in the target namespace - confirm
+      dynamoNamespace: vllm-v0-agg
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"
+        limits:  # identical to requests
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v0
+          args:
+            - dynamo
+            - serve
+            - graphs.agg:VllmWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmWorker
diff --git a/examples/vllm_v0/deploy/disagg.yaml b/examples/vllm_v0/deploy/disagg.yaml
new file mode 100644
index 0000000000..c4f41342e8
--- /dev/null
+++ b/examples/vllm_v0/deploy/disagg.yaml
@@ -0,0 +1,102 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment  # services defined below: Frontend, VllmWorker, PrefillWorker
+metadata:
+  name: vllm-v0-disagg  # same as dynamoNamespace so the resource name is unique across the example manifests
+spec:
+  envs:
+    - name: DYN_DEPLOYMENT_CONFIG  # single-quoted JSON with per-service settings
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]}}'
+  services:
+    Frontend:
+      dynamoNamespace: vllm-v0-disagg
+      componentType: main
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:  # identical to requests
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v0
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg:Frontend
+            - --system-app-port
+            - "5000"  # quoted so the argument stays a string
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend
+    VllmWorker:
+      dynamoNamespace: vllm-v0-disagg
+      envFromSecret: hf-token-secret  # NOTE(review): secret must exist in the target namespace - confirm
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"
+        limits:  # identical to requests
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v0
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg:VllmWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmWorker
+    PrefillWorker:
+      dynamoNamespace: vllm-v0-disagg
+      envFromSecret: hf-token-secret
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"
+        limits:  # identical to requests
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v0
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg:PrefillWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - PrefillWorker
diff --git a/examples/vllm_v0/deploy/disagg_planner.yaml b/examples/vllm_v0/deploy/disagg_planner.yaml
new file mode 100644
index 0000000000..082e3cb9bc
--- /dev/null
+++ b/examples/vllm_v0/deploy/disagg_planner.yaml
@@ -0,0 +1,156 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment  # services defined below: Frontend, VllmWorker, PrefillWorker, Planner, Prometheus
+metadata:
+  name: vllm-v0-disagg-planner  # same as dynamoNamespace so the resource name is unique across the example manifests
+spec:
+  envs:
+    - name: DYN_DEPLOYMENT_CONFIG  # single-quoted JSON with per-service settings
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["localhost:8000"]}]}]},"Planner":{"adjustment-interval":180,"profile-results-dir":"/workspace/examples/profiling_results","isl":3000,"osl":150,"ttft":0.5,"itl":0.05,"load-predictor":"arima"}}'
+  services:
+    Frontend:
+      dynamoNamespace: vllm-v0-disagg-planner
+      componentType: main
+      replicas: 1
+      resources:
+        requests:
+          cpu: "2"
+          memory: "4Gi"
+        limits:  # identical to requests
+          cpu: "2"
+          memory: "4Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v0
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_planner:Frontend
+            - --system-app-port
+            - "5000"  # quoted so the argument stays a string
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend
+
+    VllmWorker:
+      dynamoNamespace: vllm-v0-disagg-planner
+      envFromSecret: hf-token-secret  # NOTE(review): secret must exist in the target namespace - confirm
+      replicas: 1
+      resources:
+        requests:
+          cpu: "20"
+          memory: "40Gi"
+          nvidia.com/gpu: "2"
+        limits:  # identical to requests
+          cpu: "20"
+          memory: "40Gi"
+          nvidia.com/gpu: "2"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v0
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_planner:VllmWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmWorker
+
+    PrefillWorker:
+      dynamoNamespace: vllm-v0-disagg-planner
+      envFromSecret: hf-token-secret
+      replicas: 1
+      resources:
+        requests:
+          cpu: "20"
+          memory: "40Gi"
+          nvidia.com/gpu: "2"
+        limits:  # identical to requests
+          cpu: "20"
+          memory: "40Gi"
+          nvidia.com/gpu: "2"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v0
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_planner:PrefillWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - PrefillWorker
+
+    Planner:
+      dynamoNamespace: vllm-v0-disagg-planner
+      replicas: 1
+      componentType: planner
+      resources:
+        requests:
+          cpu: "2"
+          memory: "2Gi"
+        limits:  # identical to requests
+          cpu: "2"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v0
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_planner:Planner
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Planner
+            - --Planner.environment=kubernetes
+
+    Prometheus:
+      dynamoNamespace: vllm-v0-disagg-planner
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1000m"
+          memory: "1000Mi"
+        limits:  # identical to requests
+          cpu: "1000m"
+          memory: "1000Mi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v0
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_planner:Prometheus
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Prometheus
diff --git a/examples/vllm_v1/deploy/agg.yaml b/examples/vllm_v1/deploy/agg.yaml
new file
mode 100644
index 0000000000..08dd5e22fc
--- /dev/null
+++ b/examples/vllm_v1/deploy/agg.yaml
@@ -0,0 +1,100 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment  # services defined below: Frontend, SimpleLoadBalancer, VllmDecodeWorker
+metadata:
+  name: vllm-v1-agg  # same as dynamoNamespace so the resource name is unique across the example manifests
+spec:
+  envs:
+    - name: DYN_DEPLOYMENT_CONFIG  # NOTE(review): JSON keys/endpoint say "VllmWorker" but services here are SimpleLoadBalancer/VllmDecodeWorker - confirm
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"enforce-eager":true,"max-num-batched-tokens":16384,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len"]}}'
+  services:
+    Frontend:
+      dynamoNamespace: vllm-v1-agg
+      componentType: main
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:  # identical to requests
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v1
+          args:
+            - dynamo
+            - serve
+            - graphs.agg:Frontend
+            - --system-app-port
+            - "5000"  # quoted so the argument stays a string
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend
+    SimpleLoadBalancer:
+      envFromSecret: hf-token-secret  # NOTE(review): secret must exist in the target namespace - confirm
+      dynamoNamespace: vllm-v1-agg
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "20Gi"  # NOTE(review): same memory as the GPU worker for a 1-CPU service - confirm intended
+        limits:  # identical to requests
+          cpu: "1"
+          memory: "20Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v1
+          args:
+            - dynamo
+            - serve
+            - graphs.agg:SimpleLoadBalancer
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - SimpleLoadBalancer
+    VllmDecodeWorker:
+      envFromSecret: hf-token-secret
+      dynamoNamespace: vllm-v1-agg
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"
+        limits:  # identical to requests
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v1
+          args:
+            - dynamo
+            - serve
+            - graphs.agg:VllmDecodeWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmDecodeWorker
diff --git a/examples/vllm_v1/deploy/disagg.yaml b/examples/vllm_v1/deploy/disagg.yaml
new file mode 100644
index 0000000000..a85459f07d
--- /dev/null
+++ b/examples/vllm_v1/deploy/disagg.yaml
@@ -0,0 +1,127 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment  # services defined below: Frontend, SimpleLoadBalancer, VllmDecodeWorker, VllmPrefillWorker
+metadata:
+  name: vllm-v1-disagg  # same as dynamoNamespace so the resource name is unique across the example manifests
+spec:
+  envs:
+    - name: DYN_DEPLOYMENT_CONFIG  # NOTE(review): JSON keys say "VllmWorker"/"PrefillWorker" but services here are VllmDecodeWorker/VllmPrefillWorker - confirm
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]}}'
+  services:
+    Frontend:
+      dynamoNamespace: vllm-v1-disagg
+      componentType: main
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "2Gi"
+        limits:  # identical to requests
+          cpu: "1"
+          memory: "2Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v1
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg:Frontend
+            - --system-app-port
+            - "5000"  # quoted so the argument stays a string
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend
+    SimpleLoadBalancer:
+      envFromSecret: hf-token-secret  # NOTE(review): secret must exist in the target namespace - confirm
+      dynamoNamespace: vllm-v1-disagg
+      replicas: 1
+      resources:
+        requests:
+          cpu: "1"
+          memory: "20Gi"  # NOTE(review): same memory as the GPU workers for a 1-CPU service - confirm intended
+        limits:  # identical to requests
+          cpu: "1"
+          memory: "20Gi"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v1
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg:SimpleLoadBalancer
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - SimpleLoadBalancer
+    VllmDecodeWorker:
+      dynamoNamespace: vllm-v1-disagg
+      envFromSecret: hf-token-secret
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"
+        limits:  # identical to requests
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v1
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg:VllmDecodeWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmDecodeWorker
+    VllmPrefillWorker:
+      dynamoNamespace: vllm-v1-disagg
+      envFromSecret: hf-token-secret
+      replicas: 1
+      resources:
+        requests:
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"
+        limits:  # identical to requests
+          cpu: "10"
+          memory: "20Gi"
+          nvidia.com/gpu: "1"
+      extraPodSpec:
+        mainContainer:
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v1
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg:VllmPrefillWorker
+            - --system-app-port
+            - "5000"
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmPrefillWorker
diff --git a/examples/vllm_v1/deploy/disagg_planner.yaml b/examples/vllm_v1/deploy/disagg_planner.yaml
new file mode 100644
index 0000000000..bf73f35e44
--- /dev/null
+++ b/examples/vllm_v1/deploy/disagg_planner.yaml
@@ -0,0 +1,182 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+apiVersion: nvidia.com/v1alpha1
+kind: DynamoGraphDeployment
+metadata:
+  name: disagg-planner
+spec:
+  envs:
+    - name: DYN_DEPLOYMENT_CONFIG  # JSON object; NOTE(review): its sections are named VllmWorker/PrefillWorker while the services below are VllmDecodeWorker/VllmPrefillWorker, and the Prometheus scrape targets are localhost:9090 / localhost:8000 - confirm both are intended
+      value: '{"Common":{"model":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","block-size":64,"max-model-len":16384,"kv-transfer-config":"{\"kv_connector\":\"DynamoNixlConnector\"}"},"Frontend":{"served_model_name":"deepseek-ai/DeepSeek-R1-Distill-Llama-8B","endpoint":"dynamo.VllmWorker.generate","port":8000,"router":"round-robin","common-configs":["block-size"]},"VllmWorker":{"remote-prefill":true,"conditional-disagg":true,"max-local-prefill-length":10,"max-prefill-queue-size":2,"enable-prefix-caching":true,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"PrefillWorker":{"max-num-batched-tokens":16384,"common-configs":["model","block-size","max-model-len","kv-transfer-config"]},"Prometheus":{"global":{"scrape_interval":"5s"},"scrape_configs":[{"job_name":"prometheus","static_configs":[{"targets":["localhost:9090"]}]},{"job_name":"frontend","static_configs":[{"targets":["localhost:8000"]}]}]},"Planner":{"adjustment-interval":180,"profile-results-dir":"/workspace/examples/profiling_results","isl":3000,"osl":150,"ttft":0.5,"itl":0.05,"load-predictor":"arima"}}'
+  services:  # keys inside every service are kept in alphabetical order
+    Frontend:
+      componentType: main
+      dynamoNamespace: vllm-v1-disagg-planner
+      extraPodSpec:
+        mainContainer:
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_planner:Frontend
+            - --system-app-port
+            - '5000'  # value of --system-app-port; quoted so it stays a string
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Frontend  # value of --service-name
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v1
+      replicas: 1
+      resources:
+        limits:
+          cpu: '2'
+          memory: '4Gi'
+        requests:  # same values as limits
+          cpu: '2'
+          memory: '4Gi'
+
+    SimpleLoadBalancer:
+      dynamoNamespace: vllm-v1-disagg-planner
+      envFromSecret: hf-token-secret  # presumably the Hugging Face token - verify against the cluster secret
+      extraPodSpec:
+        mainContainer:
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_planner:SimpleLoadBalancer
+            - --system-app-port
+            - '5000'  # value of --system-app-port
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - SimpleLoadBalancer  # value of --service-name
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v1
+      replicas: 1
+      resources:  # no GPU is requested for this service
+        limits:
+          cpu: '1'
+          memory: '20Gi'
+        requests:  # same values as limits
+          cpu: '1'
+          memory: '20Gi'
+
+    VllmDecodeWorker:
+      dynamoNamespace: vllm-v1-disagg-planner
+      envFromSecret: hf-token-secret
+      extraPodSpec:
+        mainContainer:
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_planner:VllmDecodeWorker
+            - --system-app-port
+            - '5000'  # value of --system-app-port
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmDecodeWorker  # value of --service-name
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v1
+      replicas: 1
+      resources:
+        limits:
+          cpu: '20'
+          memory: '40Gi'
+          nvidia.com/gpu: '2'  # two GPUs for the decode worker
+        requests:  # same values as limits
+          cpu: '20'
+          memory: '40Gi'
+          nvidia.com/gpu: '2'
+
+    VllmPrefillWorker:
+      dynamoNamespace: vllm-v1-disagg-planner
+      envFromSecret: hf-token-secret
+      extraPodSpec:
+        mainContainer:
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_planner:VllmPrefillWorker
+            - --system-app-port
+            - '5000'  # value of --system-app-port
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - VllmPrefillWorker  # value of --service-name
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v1
+      replicas: 1
+      resources:
+        limits:
+          cpu: '20'
+          memory: '40Gi'
+          nvidia.com/gpu: '2'  # two GPUs for the prefill worker
+        requests:  # same values as limits
+          cpu: '20'
+          memory: '40Gi'
+          nvidia.com/gpu: '2'
+
+    Planner:
+      componentType: planner
+      dynamoNamespace: vllm-v1-disagg-planner
+      extraPodSpec:
+        mainContainer:
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_planner:Planner
+            - --system-app-port
+            - '5000'  # value of --system-app-port
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Planner  # value of --service-name
+            - --Planner.environment=kubernetes  # passed on the command line; the JSON "Planner" section does not set "environment"
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v1
+      replicas: 1
+      resources:
+        limits:
+          cpu: '2'
+          memory: '2Gi'
+        requests:  # same values as limits
+          cpu: '2'
+          memory: '2Gi'
+
+    Prometheus:
+      dynamoNamespace: vllm-v1-disagg-planner
+      extraPodSpec:
+        mainContainer:
+          args:
+            - dynamo
+            - serve
+            - graphs.disagg_planner:Prometheus
+            - --system-app-port
+            - '5000'  # value of --system-app-port
+            - --enable-system-app
+            - --use-default-health-checks
+            - --service-name
+            - Prometheus  # value of --service-name
+          image: nvcr.io/nvidia/ai-dynamo/vllm-runtime:latest
+          workingDir: /workspace/examples/vllm_v1
+      replicas: 1
+      resources:
+        limits:
+          cpu: '1000m'
+          memory: '1000Mi'
+        requests:  # same values as limits
+          cpu: '1000m'
+          memory: '1000Mi'