Skip to content
Prev Previous commit
Use perf_counter() instead + formatting
Signed-off-by: Zhongxuan Wang <[email protected]>
  • Loading branch information
zhongxuanwang-nv committed Dec 9, 2025
commit 133893ac5cd4fb0dcccd2f383b4962b5d8792a5c
25 changes: 14 additions & 11 deletions components/src/dynamo/vllm/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -633,10 +633,10 @@ async def generate_tokens(
out = {"token_ids": output.token_ids[num_output_tokens_so_far:]}
if output.finish_reason:
out["finish_reason"] = output.finish_reason
out["completion_usage"] = (
BaseWorkerHandler._build_completion_usage(
request_output=res,
)
out[
"completion_usage"
] = BaseWorkerHandler._build_completion_usage(
request_output=res,
)
# Log completion with LoRA info (debug level to avoid log spam)
if lora_request:
Expand Down Expand Up @@ -805,7 +805,9 @@ async def generate(self, request, context):
):
# Capture first token timing
if include_timing and not first_token_sent:
first_token_time = time.time()
first_token_time = decode_start_seconds + (
time.perf_counter() - decode_start_perf_counter
)
timing_metrics["decode_first_token_seconds"] = first_token_time
# In aggregated mode, prefill finishes when first token is generated
if prefill_result is None:
Expand Down Expand Up @@ -966,14 +968,15 @@ async def generate(self, request, context):
disaggregated_params: Optional[Dict[str, Any]] = {}

if res.kv_transfer_params:
disaggregated_params["kv_transfer_params"] = (
res.kv_transfer_params
)
disaggregated_params[
"kv_transfer_params"
] = res.kv_transfer_params

if include_timing and timing_metrics:
timing_metrics["prefill_end_seconds"] = (
prefill_start_seconds
+ (time.perf_counter() - prefill_start_perf_counter)
timing_metrics[
"prefill_end_seconds"
] = prefill_start_seconds + (
time.perf_counter() - prefill_start_perf_counter
)
disaggregated_params["timing_metrics"] = timing_metrics

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -227,9 +227,9 @@ async def generate(self, request: vLLMMultimodalRequest, context):
# Update the prompt token IDs in the decode request to the ones in the
# prefill response, which have the image template filled in, so that
# the decode worker fetches the correct number of KV blocks.
decode_request.engine_prompt["prompt_token_ids"] = (
prefill_response.prompt_token_ids
)
decode_request.engine_prompt[
"prompt_token_ids"
] = prefill_response.prompt_token_ids
logger.debug(
f"Prefill response kv_transfer_params: {prefill_response.kv_transfer_params}"
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@ class TestShouldIncludeTimingMetrics:

def test_returns_true_with_multiple_observability_fields(self):
"""Timing metrics should be included when explicitly requested."""
request = {"observability_fields": ["worker_id", "timing_metrics", "other_field"]}
request = {
"observability_fields": ["worker_id", "timing_metrics", "other_field"]
}
assert _request_contains_timing_metrics(request) is True

def test_returns_false_when_observability_fields_is_none(self):
Expand Down
Loading