diff --git a/benchmarks/profiler/deploy/profile_sla_job.yaml b/benchmarks/profiler/deploy/profile_sla_job.yaml
index 948512ab21..cebe63a590 100644
--- a/benchmarks/profiler/deploy/profile_sla_job.yaml
+++ b/benchmarks/profiler/deploy/profile_sla_job.yaml
@@ -14,11 +14,8 @@ spec:
         image: ${DOCKER_IMAGE}
         resources:
           requests:
-            cpu: "1"
-            memory: "2Gi"
-          limits:
-            cpu: "2"
-            memory: "4Gi"
+            cpu: "16"
+            memory: "10Gi"
         env:
         - name: HUGGING_FACE_HUB_TOKEN
           valueFrom:
@@ -37,6 +34,18 @@ spec:
        - /workspace/profiling_results
        - --namespace
        - ${NAMESPACE}
+        - --min-num-gpus-per-engine
+        - "1"
+        - --max-num-gpus-per-engine
+        - "8"
+        - --isl
+        - "3000"
+        - --osl
+        - "150"
+        - --ttft
+        - "200"
+        - --itl
+        - "20"
        volumeMounts:
        - name: output-volume
          mountPath: /workspace/profiling_results
diff --git a/benchmarks/profiler/utils/config.py b/benchmarks/profiler/utils/config.py
index 17ec95c6cc..9c6ba0660e 100644
--- a/benchmarks/profiler/utils/config.py
+++ b/benchmarks/profiler/utils/config.py
@@ -89,16 +89,16 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d
         if target == "prefill":
             # convert prefill worker into decode worker
             config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].decode_worker
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
             ] = config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].prefill_worker
+                WORKER_COMPONENT_NAMES["vllm"].prefill_worker_k8s_name
             ]
             del config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].prefill_worker
+                WORKER_COMPONENT_NAMES["vllm"].prefill_worker_k8s_name
             ]

             args = config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].decode_worker
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
             ]["extraPodSpec"]["mainContainer"]["args"]

             args = break_arguments(args)
@@ -112,18 +112,18 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d
             if "--no-enable-prefix-caching" not in args:
                 args = append_argument(args, "--no-enable-prefix-caching")

-            config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
-                "extraPodSpec"
-            ]["mainContainer"]["args"] = join_arguments(args)
+            config["spec"]["services"][
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
+            ]["extraPodSpec"]["mainContainer"]["args"] = join_arguments(args)

         elif target == "decode":
             # delete prefill worker
             del config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].prefill_worker
+                WORKER_COMPONENT_NAMES["vllm"].prefill_worker_k8s_name
             ]

             args = config["spec"]["services"][
-                WORKER_COMPONENT_NAMES["vllm"].decode_worker
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
             ]["extraPodSpec"]["mainContainer"]["args"]

             args = break_arguments(args)
@@ -134,13 +134,13 @@ def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> d
             if "--no-enable-prefix-caching" in args:
                 args.remove("--no-enable-prefix-caching")

-            config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
-                "extraPodSpec"
-            ]["mainContainer"]["args"] = join_arguments(args)
+            config["spec"]["services"][
+                WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
+            ]["extraPodSpec"]["mainContainer"]["args"] = join_arguments(args)

         # set num workers to 1
         decode_worker_config = config["spec"]["services"][
-            WORKER_COMPONENT_NAMES["vllm"].decode_worker
+            WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
         ]
         decode_worker_config["replicas"] = 1

@@ -150,16 +150,16 @@ def set_config_tp_size(cls, config: dict, tp_size: int):
         config = deepcopy(config)

-        config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
-            "resources"
-        ]["requests"]["gpu"] = str(tp_size)
-        config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
-            "resources"
-        ]["limits"]["gpu"] = str(tp_size)
+        config["spec"]["services"][
+            WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
+        ]["resources"]["requests"]["gpu"] = str(tp_size)
+        config["spec"]["services"][
+            WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
+        ]["resources"]["limits"]["gpu"] = str(tp_size)

-        args = config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
-            "extraPodSpec"
-        ]["mainContainer"]["args"]
+        args = config["spec"]["services"][
+            WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
+        ]["extraPodSpec"]["mainContainer"]["args"]

         args = break_arguments(args)
@@ -169,15 +169,15 @@ def set_config_tp_size(cls, config: dict, tp_size: int):
         except ValueError:
             args = append_argument(args, ["--tensor-parallel-size", str(tp_size)])

-        config["spec"]["services"][WORKER_COMPONENT_NAMES["vllm"].decode_worker][
-            "extraPodSpec"
-        ]["mainContainer"]["args"] = join_arguments(args)
+        config["spec"]["services"][
+            WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
+        ]["extraPodSpec"]["mainContainer"]["args"] = join_arguments(args)

         return config

     @classmethod
     def get_model_name(cls, config: dict) -> str:
-        worker_name = WORKER_COMPONENT_NAMES["vllm"].decode_worker
+        worker_name = WORKER_COMPONENT_NAMES["vllm"].decode_worker_k8s_name
         args = config["spec"]["services"][worker_name]["extraPodSpec"]["mainContainer"][
             "args"
         ]
diff --git a/components/backends/vllm/deploy/disagg_planner.yaml b/components/backends/vllm/deploy/disagg_planner.yaml
index 6d974ecc38..8746ffedcc 100644
--- a/components/backends/vllm/deploy/disagg_planner.yaml
+++ b/components/backends/vllm/deploy/disagg_planner.yaml
@@ -141,7 +141,7 @@ spec:
            - -c
          args:
            - "python3 -m dynamo.planner.prometheus"
-    backend:
+    VllmDecodeWorker:
      dynamoNamespace: vllm-disagg-planner
      envFromSecret: hf-token-secret
      componentType: worker
@@ -191,7 +191,7 @@ spec:
            - -c
          args:
            - "python3 -m dynamo.vllm --model Qwen/Qwen3-0.6B 2>&1 | tee /tmp/vllm.log"
-    prefill:
+    VllmPrefillWorker:
      dynamoNamespace: vllm-disagg-planner
      envFromSecret: hf-token-secret
      componentType: worker
diff --git a/components/planner/src/dynamo/planner/defaults.py b/components/planner/src/dynamo/planner/defaults.py
index 2eae047167..f461fcb39d 100644
--- a/components/planner/src/dynamo/planner/defaults.py
+++ b/components/planner/src/dynamo/planner/defaults.py
@@ -74,9 +74,11 @@ class SLAPlannerDefaults(BasePlannerDefaults):


 class VllmComponentName:
-    prefill_worker = "prefill"
+    prefill_worker_k8s_name = "VllmPrefillWorker"
+    prefill_worker_component_name = "prefill"
     prefill_worker_endpoint = "generate"
-    decode_worker = "backend"
+    decode_worker_k8s_name = "VllmDecodeWorker"
+    decode_worker_component_name = "backend"
     decode_worker_endpoint = "generate"

diff --git a/components/planner/src/dynamo/planner/utils/planner_core.py b/components/planner/src/dynamo/planner/utils/planner_core.py
index ffe532d216..085d5a6298 100644
--- a/components/planner/src/dynamo/planner/utils/planner_core.py
+++ b/components/planner/src/dynamo/planner/utils/planner_core.py
@@ -106,7 +106,11 @@ async def get_workers_info(self):
         if self.prefill_client is None:
             self.prefill_client = (
                 await self.runtime.namespace(self.namespace)
-                .component(WORKER_COMPONENT_NAMES[self.args.backend].prefill_worker)
+                .component(
+                    WORKER_COMPONENT_NAMES[
+                        self.args.backend
+                    ].prefill_worker_component_name
+                )
                 .endpoint(
                     WORKER_COMPONENT_NAMES[
                         self.args.backend
@@ -127,7 +131,11 @@ async def get_workers_info(self):
         if self.workers_client is None:
             self.workers_client = (
                 await self.runtime.namespace(self.namespace)
-                .component(WORKER_COMPONENT_NAMES[self.args.backend].decode_worker)
+                .component(
+                    WORKER_COMPONENT_NAMES[
+                        self.args.backend
+                    ].decode_worker_component_name
+                )
                 .endpoint(
                     WORKER_COMPONENT_NAMES[self.args.backend].decode_worker_endpoint
                 )
@@ -300,8 +308,12 @@ async def make_adjustments(self):

         if not self.args.no_operation:
             target_replicas = {
-                WORKER_COMPONENT_NAMES[self.args.backend].prefill_worker: next_num_p,
-                WORKER_COMPONENT_NAMES[self.args.backend].decode_worker: next_num_d,
+                WORKER_COMPONENT_NAMES[
+                    self.args.backend
+                ].prefill_worker_k8s_name: next_num_p,
+                WORKER_COMPONENT_NAMES[
+                    self.args.backend
+                ].decode_worker_k8s_name: next_num_d,
             }

             await self.connector.set_component_replicas(target_replicas, blocking=False)
diff --git a/docs/architecture/pre_deployment_profiling.md b/docs/architecture/pre_deployment_profiling.md
index ea2ddaab2c..2fdfbf301d 100644
--- a/docs/architecture/pre_deployment_profiling.md
+++ b/docs/architecture/pre_deployment_profiling.md
@@ -82,25 +82,47 @@ kubectl create secret docker-registry nvcr-imagepullsecret \
 # in the project's root folder
 ./container/build.sh --framework VLLM
 # Tag and push to your container registry
+export DOCKER_IMAGE=nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 # or your own dynamo image
+# NOTE: DGD_CONFIG_FILE is pointing to the location of the config file inside DOCKER_IMAGE
+# Modify this yaml to profile different models
+export DGD_CONFIG_FILE=/workspace/components/backends/vllm/deploy/disagg.yaml # or your own disagg config file
 ```

 Replace the `image` within `profile_sla_job.yaml` with the tag of the image you pushed.

-**Step 2: Run profiling (required)**
+**Step 2: Set SLA target**
+
+Edit `$DYNAMO_HOME/benchmarks/profiler/deploy/profile_sla_job.yaml` to set the target ISL, OSL, TTFT, and ITL.
+
+```yaml
+spec:
+  template:
+    spec:
+      containers:
+        - name: profile-sla
+          args:
+            - --isl
+            - "3000" # average ISL is 3000 tokens
+            - --osl
+            - "150" # average OSL is 150 tokens
+            - --ttft
+            - "200" # target TTFT is 200ms
+            - --itl
+            - "20" # target ITL is 20ms
+```
+
+**Step 3: Run profiling (required)**
+
 ```bash
 cd $DYNAMO_HOME/benchmarks/profiler/deploy
 envsubst < profiling_pvc.yaml | kubectl apply -f -
 envsubst < profile_sla_sa.yaml | kubectl apply -f -
 envsubst < profile_sla_rbac.yaml | kubectl apply -f -
 envsubst < profile_sla_binding.yaml | kubectl apply -f -
-
-export DOCKER_IMAGE=nvcr.io/nvidia/ai-dynamo/vllm-runtime:0.3.2 # or your own image
-# NOTE: DGD_CONFIG_FILE is pointing to the location of the config file inside DOCKER_IMAGE
-export DGD_CONFIG_FILE=/workspace/components/backends/vllm/deploy/disagg.yaml # or your own disagg config file
 envsubst < profile_sla_job.yaml | kubectl apply -f -
 ```

-**Step 3: Wait for profiling to complete**
+**Step 4: Wait for profiling to complete**
 ```bash
 kubectl get jobs -n $NAMESPACE
 kubectl logs job/profile-sla -n $NAMESPACE
@@ -129,13 +151,14 @@ The profiling results are stored in a PVC named `profiling-pvc`. To access the r
 
 1. **Create a temporary pod to access the PVC:**
    ```bash
-   kubectl run temp-access --image=alpine:latest --rm -it --restart=Never \
-     --overrides='{"spec":{"containers":[{"name":"temp-access","image":"alpine:latest","command":["sh"],"volumeMounts":[{"name":"results","mountPath":"/workspace/profiling_results"}]}],"volumes":[{"name":"results","persistentVolumeClaim":{"claimName":"profiling-pvc"}}]}}' \
+   kubectl run temp-access --image=alpine:latest --restart=Never \
+     --overrides='{"spec":{"containers":[{"name":"temp-access","image":"alpine:latest","command":["tail","-f","/dev/null"],"volumeMounts":[{"name":"results","mountPath":"/workspace/profiling_results"}]}],"volumes":[{"name":"results","persistentVolumeClaim":{"claimName":"profiling-pvc"}}]}}' \
      -n $NAMESPACE
    ```

 2. **Inside the temporary pod, navigate to the results directory:**
    ```bash
+   kubectl exec -it temp-access -n $NAMESPACE -- sh
    cd /workspace/profiling_results
    ls -la
    ```
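
Note on the naming split introduced in `defaults.py`: the `*_k8s_name` values (`VllmPrefillWorker`, `VllmDecodeWorker`) are the service keys in the DynamoGraphDeployment and are used wherever the profiler or planner edits `spec.services` or sets replica counts, while the `*_component_name` values (`prefill`, `backend`) are the names the workers register under in the Dynamo runtime and are used for endpoint discovery. A minimal sketch of the intended usage follows; the `VllmComponentName` fields mirror the patch, but the two helper functions are hypothetical illustrations, not APIs from this repository.

```python
# Sketch only: shows which of the two name kinds goes where.
# VllmComponentName mirrors components/planner/src/dynamo/planner/defaults.py;
# scaling_targets() and discovery_target() are hypothetical helpers for illustration.


class VllmComponentName:
    prefill_worker_k8s_name = "VllmPrefillWorker"  # service key in the DynamoGraphDeployment
    prefill_worker_component_name = "prefill"      # name the worker registers in the runtime
    prefill_worker_endpoint = "generate"
    decode_worker_k8s_name = "VllmDecodeWorker"
    decode_worker_component_name = "backend"
    decode_worker_endpoint = "generate"


WORKER_COMPONENT_NAMES = {"vllm": VllmComponentName}


def scaling_targets(backend: str, next_num_p: int, next_num_d: int) -> dict:
    """Replica targets are keyed by the k8s service names, as in make_adjustments()."""
    names = WORKER_COMPONENT_NAMES[backend]
    return {
        names.prefill_worker_k8s_name: next_num_p,
        names.decode_worker_k8s_name: next_num_d,
    }


def discovery_target(backend: str) -> tuple:
    """Endpoint discovery uses the runtime component name, as in get_workers_info()."""
    names = WORKER_COMPONENT_NAMES[backend]
    return names.decode_worker_component_name, names.decode_worker_endpoint


if __name__ == "__main__":
    print(scaling_targets("vllm", 2, 4))  # {'VllmPrefillWorker': 2, 'VllmDecodeWorker': 4}
    print(discovery_target("vllm"))       # ('backend', 'generate')
```

One usage note on the docs change: because the temporary PVC-access pod is now started without `--rm -it` (it runs `tail -f /dev/null` and is entered via `kubectl exec`), it keeps running after use and should be removed manually, for example with `kubectl delete pod temp-access -n $NAMESPACE`.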