Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
591 changes: 47 additions & 544 deletions benchmarks/profiler/profile_sla.py

Large diffs are not rendered by default.

242 changes: 242 additions & 0 deletions benchmarks/profiler/utils/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,242 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import copy
import logging
from typing import Literal

from dynamo.planner.defaults import WORKER_COMPONENT_NAMES

# Module-level logger: INFO and above, emitted through a dedicated stream
# handler using a "<timestamp> - <logger name> - <level> - <message>" layout.
formatter = logging.Formatter(
    fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

console_handler = logging.StreamHandler()
console_handler.setFormatter(formatter)
console_handler.setLevel(logging.INFO)

logger = logging.getLogger(__name__)
logger.addHandler(console_handler)
logger.setLevel(logging.INFO)


class VllmV0ConfigModifier:
    """Adapt a vLLM v0 deployment config so a single worker can be profiled.

    The class is stateless; every method is a classmethod operating on a
    config dict keyed by component name.
    """

    # Key into WORKER_COMPONENT_NAMES for this backend.
    _BACKEND = "vllm_v0"
    # Fragments of the log line that reports KV cache capacity. The parser
    # expects "<marker><token count><separator><concurrency><one trailing char>".
    _KV_LINE_MARKER = "Maximum concurrency for "
    _KV_LINE_SEPARATOR = " tokens per request: "

    @classmethod
    def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> dict:
        """Return a copy of ``config`` set up to profile one ``target`` stage.

        The result has the planner set to no-operation (when a ``Planner``
        section exists), a single worker section stored under the decode-worker
        name with ``workers`` set to 1, pipeline parallelism forced to 1, and
        remote prefill / conditional disaggregation turned off.

        Args:
            config: Deployment config keyed by component name. Not modified.
            target: ``"prefill"`` moves the prefill worker's settings (if any)
                under the decode-worker name and disables prefix caching;
                ``"decode"`` drops the prefill worker and enables prefix
                caching.

        Returns:
            A new config dict that shares no nested objects with ``config``.

        Raises:
            KeyError: If the resulting worker section or its ``ServiceArgs``
                entry is missing.
        """
        # A shallow copy would share the nested per-component dicts with the
        # caller, so every edit below would leak into the input config.
        config = copy.deepcopy(config)

        names = WORKER_COMPONENT_NAMES[cls._BACKEND]
        prefill_name = names.prefill_worker
        decode_name = names.decode_worker

        # disable planner
        if "Planner" in config:
            config["Planner"]["no-operation"] = True

        if target == "prefill":
            if prefill_name in config:
                # make PrefillWorker into VllmWorker; pop() tolerates configs
                # that only define the prefill worker
                config.pop(decode_name, None)
                config[decode_name] = config.pop(prefill_name)

            # to profile prefill, we disable prefix caching
            config[decode_name]["enable-prefix-caching"] = False
        elif target == "decode":
            config.pop(prefill_name, None)

            # to profile decode, we enable prefix caching to pass the prefill stage
            config[decode_name]["enable-prefix-caching"] = True

        worker = config[decode_name]

        # set num workers to 1
        worker["ServiceArgs"]["workers"] = 1

        # set PP to 1
        if worker.get("pipeline-parallel-size", 1) > 1:
            logger.warning("Currently we only support TP, setting PP to 1")
            worker["pipeline-parallel-size"] = 1

        # always local prefill
        worker["remote-prefill"] = False
        worker["conditional-disagg"] = False

        return config

    @classmethod
    def set_config_tp_size(cls, config: dict, tp_size: int) -> dict:
        """Set the worker's tensor-parallel size and GPU count to ``tp_size``.

        ``config`` is modified in place and also returned for convenience.

        Raises:
            KeyError: If the worker section, ``ServiceArgs`` or ``resources``
                is missing.
        """
        worker = config[WORKER_COMPONENT_NAMES[cls._BACKEND].decode_worker]
        worker["tensor-parallel-size"] = tp_size
        worker["ServiceArgs"]["resources"]["gpu"] = tp_size
        return config

    @classmethod
    def get_model_name(cls, config: dict) -> str:
        """Return ``served_model_name``, preferring ``Common`` over ``Frontend``."""
        if "Common" in config and "served_model_name" in config["Common"]:
            return config["Common"]["served_model_name"]
        return config["Frontend"]["served_model_name"]

    @classmethod
    def get_port(cls, config: dict) -> int:
        """Return ``port``, preferring ``Common`` over ``Frontend``."""
        if "Common" in config and "port" in config["Common"]:
            return config["Common"]["port"]
        return config["Frontend"]["port"]

    @classmethod
    def _parse_max_concurrency(cls, line: str) -> tuple:
        """Extract ``(token_count, concurrency)`` from a KV-capacity log line.

        Raises:
            IndexError: If the marker or separator is absent.
            ValueError: If either number cannot be parsed.
        """
        tail = line.strip().split(cls._KV_LINE_MARKER)[1]
        fields = tail.split(cls._KV_LINE_SEPARATOR)
        # Thousands separators ("16,384") would otherwise make int() fail.
        token_count = int(fields[0].replace(",", ""))
        # Drop the single trailing character after the number
        # (presumably an "x" multiplier suffix -- confirm against the log).
        concurrency = float(fields[1][:-1])
        return token_count, concurrency

    @classmethod
    def get_kv_cache_size_from_dynamo_log(cls, dynamo_log_fn: str) -> int:
        """Return the KV cache size in tokens reported in a dynamo log file.

        The first line containing the "Maximum concurrency for" marker is
        parsed and ``int(token_count * concurrency)`` is returned. This is
        best-effort: an unreadable file, an unparsable line, or no matching
        line at all yields 0 (with a warning logged for the first two cases).
        """
        # Pre-bound so the warning below is safe even when open() itself fails.
        raw_line = ""
        try:
            with open(dynamo_log_fn, "r") as f:
                for raw_line in f:
                    if cls._KV_LINE_MARKER not in raw_line:
                        continue
                    token_count, concurrency = cls._parse_max_concurrency(raw_line)
                    kv_cache_size = int(token_count * concurrency)
                    logger.info(
                        "Found KV cache info: %s x %s = %s",
                        token_count,
                        concurrency,
                        kv_cache_size,
                    )
                    return kv_cache_size
        except (OSError, ValueError, IndexError) as e:
            logger.warning(
                "Failed to parse KV cache size from line: %s. Error: %s",
                raw_line.strip(),
                e,
            )
        return 0


class VllmV1ConfigModifier:
    """Adapt a vLLM v1 deployment config so a single worker can be profiled.

    The class is stateless; every method is a classmethod operating on a
    config dict keyed by component name.
    """

    # Key into WORKER_COMPONENT_NAMES for this backend.
    _BACKEND = "vllm_v1"
    # Fragments of the log line that reports KV cache capacity. The parser
    # expects "<marker><token count><separator><concurrency><one trailing char>".
    _KV_LINE_MARKER = "Maximum concurrency for "
    _KV_LINE_SEPARATOR = " tokens per request: "

    @classmethod
    def convert_config(cls, config: dict, target: Literal["prefill", "decode"]) -> dict:
        """Return a copy of ``config`` set up to profile one ``target`` stage.

        The result has the planner set to no-operation (when a ``Planner``
        section exists), disaggregation switched off on ``SimpleLoadBalancer``,
        a single worker section stored under the decode-worker name with
        ``workers`` set to 1, and pipeline parallelism forced to 1.

        Args:
            config: Deployment config keyed by component name. Not modified.
            target: ``"prefill"`` moves the prefill worker's settings (if any)
                under the decode-worker name and disables prefix caching;
                ``"decode"`` drops the prefill worker and enables prefix
                caching.

        Returns:
            A new config dict that shares no nested objects with ``config``.

        Raises:
            KeyError: If ``SimpleLoadBalancer``, the resulting worker section,
                or its ``ServiceArgs`` entry is missing.
        """
        # A shallow copy would share the nested per-component dicts with the
        # caller, so every edit below would leak into the input config.
        config = copy.deepcopy(config)

        names = WORKER_COMPONENT_NAMES[cls._BACKEND]
        prefill_name = names.prefill_worker
        decode_name = names.decode_worker

        # disable planner
        if "Planner" in config:
            config["Planner"]["no-operation"] = True

        # turn-off disagg
        config["SimpleLoadBalancer"]["enable_disagg"] = False

        if target == "prefill":
            if prefill_name in config:
                # make VllmPrefillWorker into VllmDecodeWorker; pop() tolerates
                # configs that only define the prefill worker
                config.pop(decode_name, None)
                config[decode_name] = config.pop(prefill_name)

            # to profile prefill, we disable prefix caching
            config[decode_name]["enable-prefix-caching"] = False
        elif target == "decode":
            config.pop(prefill_name, None)

            # to profile decode, we enable prefix caching to pass the prefill stage
            config[decode_name]["enable-prefix-caching"] = True

        worker = config[decode_name]

        # set num workers to 1
        worker["ServiceArgs"]["workers"] = 1

        # set PP to 1
        if worker.get("pipeline-parallel-size", 1) > 1:
            logger.warning("Currently we only support TP, setting PP to 1")
            worker["pipeline-parallel-size"] = 1

        return config

    @classmethod
    def set_config_tp_size(cls, config: dict, tp_size: int) -> dict:
        """Set the worker's tensor-parallel size and GPU count to ``tp_size``.

        ``config`` is modified in place and also returned for convenience.

        Raises:
            KeyError: If the worker section, ``ServiceArgs`` or ``resources``
                is missing.
        """
        worker = config[WORKER_COMPONENT_NAMES[cls._BACKEND].decode_worker]
        worker["tensor-parallel-size"] = tp_size
        worker["ServiceArgs"]["resources"]["gpu"] = tp_size
        return config

    @classmethod
    def get_model_name(cls, config: dict) -> str:
        """Return ``served_model_name``, preferring ``Common`` over ``Frontend``."""
        if "Common" in config and "served_model_name" in config["Common"]:
            return config["Common"]["served_model_name"]
        return config["Frontend"]["served_model_name"]

    @classmethod
    def get_port(cls, config: dict) -> int:
        """Return ``port``, preferring ``Common`` over ``Frontend``."""
        if "Common" in config and "port" in config["Common"]:
            return config["Common"]["port"]
        return config["Frontend"]["port"]

    @classmethod
    def _parse_max_concurrency(cls, line: str) -> tuple:
        """Extract ``(token_count, concurrency)`` from a KV-capacity log line.

        Raises:
            IndexError: If the marker or separator is absent.
            ValueError: If either number cannot be parsed.
        """
        tail = line.strip().split(cls._KV_LINE_MARKER)[1]
        fields = tail.split(cls._KV_LINE_SEPARATOR)
        # The token count may carry thousands separators ("16,384").
        token_count = int(fields[0].replace(",", ""))
        # Drop the single trailing character after the number
        # (presumably an "x" multiplier suffix -- confirm against the log).
        concurrency = float(fields[1][:-1])
        return token_count, concurrency

    @classmethod
    def get_kv_cache_size_from_dynamo_log(cls, dynamo_log_fn: str) -> int:
        """Return the KV cache size in tokens reported in a dynamo log file.

        The first line containing the "Maximum concurrency for" marker is
        parsed and ``int(token_count * concurrency)`` is returned. This is
        best-effort: an unreadable file, an unparsable line, or no matching
        line at all yields 0 (with a warning logged for the first two cases).
        """
        # Pre-bound so the warning below is safe even when open() itself fails.
        raw_line = ""
        try:
            with open(dynamo_log_fn, "r") as f:
                for raw_line in f:
                    if cls._KV_LINE_MARKER not in raw_line:
                        continue
                    token_count, concurrency = cls._parse_max_concurrency(raw_line)
                    kv_cache_size = int(token_count * concurrency)
                    logger.info(
                        "Found KV cache info: %s x %s = %s",
                        token_count,
                        concurrency,
                        kv_cache_size,
                    )
                    return kv_cache_size
        except (OSError, ValueError, IndexError) as e:
            logger.warning(
                "Failed to parse KV cache size from line: %s. Error: %s",
                raw_line.strip(),
                e,
            )
        return 0


# Registry mapping a backend identifier to the config-modifier class that
# handles deployment configs for that backend.
CONFIG_MODIFIERS = {
"vllm_v0": VllmV0ConfigModifier,
"vllm_v1": VllmV1ConfigModifier,
}
31 changes: 31 additions & 0 deletions benchmarks/profiler/utils/defaults.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Request counts for the decode sweep: a few small values, then every
# multiple of 50 from 100 through 500 inclusive.
# NOTE(review): presumably the concurrency levels tried when profiling decode;
# confirm against the profiler that consumes this list.
DECODE_NUM_REQUESTS_RANGE = [1, 5, 10, 25, 50, *range(100, 501, 50)]
Loading
Loading