-
Notifications
You must be signed in to change notification settings - Fork 738
feat: timing metrics reporting through nvext for vLLM backend
#4661
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
zhongxuanwang-nv
wants to merge
14
commits into
ai-dynamo:main
Choose a base branch
from
zhongxuanwang-nv:timing_zxw
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from 9 commits
Commits
Show all changes
14 commits
Select commit
Hold shift + click to select a range
08d432b
Initial push
zhongxuanwang-nv ded8aa8
Fixes
zhongxuanwang-nv 8588689
Small issue fixes
zhongxuanwang-nv 75ea426
black formatting
zhongxuanwang-nv 2166fa7
Rust formats
zhongxuanwang-nv 2dab081
Merge branch 'main' into timing_zxw
zhongxuanwang-nv 58fdc64
Rust formatting
zhongxuanwang-nv 0f1d019
Fix tests
zhongxuanwang-nv 4f437c1
Fix
zhongxuanwang-nv b023688
Fixes
zhongxuanwang-nv 956a435
Fix formatting
zhongxuanwang-nv bee9b19
Incorporating a round of feedback
zhongxuanwang-nv 32f0528
Merge branch 'main' into timing_zxw
zhongxuanwang-nv 133893a
Use perf_counter() instead + formatting
zhongxuanwang-nv File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Some comments aren't visible on the classic Files Changed page.
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
208 changes: 208 additions & 0 deletions
208
components/src/dynamo/vllm/tests/test_vllm_extra_fields.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,208 @@ | ||
| # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
|
|
||
| """Unit tests for extra_fields handling in vLLM handlers.""" | ||
|
|
||
| import asyncio | ||
| import warnings | ||
| from unittest.mock import AsyncMock, MagicMock, patch | ||
|
|
||
| import pytest | ||
|
|
||
| # Filter Pydantic deprecation warning before importing handlers | ||
| warnings.filterwarnings( | ||
| "ignore", | ||
| message=".*json_encoders.*is deprecated.*", | ||
| category=DeprecationWarning, | ||
| ) | ||
|
|
||
| from dynamo.vllm.handlers import ( # noqa: E402 | ||
| DecodeWorkerHandler, | ||
| PrefillWorkerHandler, | ||
| _should_include_timing_metrics, | ||
| ) | ||
|
|
||
|
|
||
| pytestmark = [ | ||
| pytest.mark.unit, | ||
| pytest.mark.vllm, | ||
| pytest.mark.gpu_1, | ||
| pytest.mark.pre_merge, | ||
| ] | ||
|
|
||
|
|
||
class TestShouldIncludeTimingMetrics:
    """Checks for the ``_should_include_timing_metrics`` request helper."""

    def test_returns_true_with_multiple_extra_fields(self):
        """``timing_metrics`` listed among other extra fields yields True."""
        requested_fields = ["worker_id", "timing_metrics", "other_field"]
        included = _should_include_timing_metrics({"extra_fields": requested_fields})
        assert included is True

    def test_returns_false_when_extra_fields_is_none(self):
        """An ``extra_fields`` value of None yields False."""
        included = _should_include_timing_metrics({"extra_fields": None})
        assert included is False

    def test_returns_false_when_extra_fields_missing(self):
        """A request without an ``extra_fields`` key yields False."""
        empty_request: dict[str, list[str]] = {}
        included = _should_include_timing_metrics(empty_request)
        assert included is False
|
|
||
|
|
||
| def make_mock_request_output( | ||
| token_ids: list[int], | ||
| finish_reason: str | None = None, | ||
| prompt_token_ids: list[int] | None = None, | ||
| ): | ||
| """Create a mock vLLM RequestOutput.""" | ||
| output = MagicMock() | ||
| output.token_ids = token_ids | ||
| output.finish_reason = finish_reason | ||
| output.stop_reason = None | ||
|
|
||
| request_output = MagicMock() | ||
| request_output.outputs = [output] | ||
| request_output.prompt_token_ids = prompt_token_ids or [1, 2, 3] | ||
| request_output.num_cached_tokens = 0 | ||
| request_output.kv_transfer_params = None | ||
| return request_output | ||
|
|
||
|
|
||
def create_mock_handler(handler_class: type):
    """Instantiate ``handler_class`` with mocked collaborators.

    The runtime, component and engine are ``MagicMock`` objects, the default
    sampling params are an empty dict and ``model_max_len`` is 4096.
    ``VllmEngineMonitor`` and ``ImageLoader`` are patched in
    ``dynamo.vllm.handlers`` for the duration of the constructor call.
    """
    sampling_defaults: dict[str, str] = {}
    init_kwargs = {
        "runtime": MagicMock(),
        "component": MagicMock(),
        "engine": MagicMock(),
        "default_sampling_params": sampling_defaults,
        "model_max_len": 4096,
    }

    # Both patches only need to be active while the handler is constructed.
    with (
        patch("dynamo.vllm.handlers.VllmEngineMonitor"),
        patch("dynamo.vllm.handlers.ImageLoader"),
    ):
        return handler_class(**init_kwargs)
|
|
||
|
|
||
def create_mock_context(request_id: str = "test-request-123"):
    """Build a mock request context whose ``id()`` returns ``request_id``.

    ``async_killed_or_stopped`` is an ``AsyncMock`` that raises
    ``asyncio.CancelledError`` when awaited, so awaiting it never yields a
    kill/stop result.
    """
    ctx = MagicMock()
    ctx.configure_mock(**{"id.return_value": request_id})
    # Awaiting this raises CancelledError right away; it does not block.
    # NOTE(review): presumably the handler's abort monitoring treats that as
    # a cancelled wait rather than an abort signal - confirm in handlers.
    ctx.async_killed_or_stopped = AsyncMock(side_effect=asyncio.CancelledError)
    return ctx
|
|
||
|
|
||
class TestDecodeWorkerHandlerTiming:
    """Timing-metrics tests for ``DecodeWorkerHandler.generate`` with a mocked engine."""

    @pytest.mark.asyncio
    async def test_no_timing_metrics_when_not_requested(self):
        """Without ``extra_fields``, the last chunk carries no timing data."""
        decode_handler = create_mock_handler(DecodeWorkerHandler)
        ctx = create_mock_context()
        engine_output = make_mock_request_output([100], finish_reason="stop")

        async def fake_generate(*args, **kwargs):
            # Stand-in for the engine client: emits one finished output.
            yield engine_output

        decode_handler.engine_client.generate = fake_generate

        request = {
            "token_ids": [1, 2, 3],
            "sampling_options": {},
            "stop_conditions": {},
        }
        chunks = [chunk async for chunk in decode_handler.generate(request, ctx)]

        # Either there are no disaggregated params at all, or they exist
        # without a timing_metrics entry.
        params = chunks[-1].get("disaggregated_params")
        assert params is None or params.get("timing_metrics") is None

    @pytest.mark.asyncio
    async def test_disaggregated_mode_preserves_frontend_timestamp(self):
        """The request's own ``request_received_seconds`` wins over the prefill result's."""
        decode_handler = create_mock_handler(DecodeWorkerHandler)
        ctx = create_mock_context()
        engine_output = make_mock_request_output([100], finish_reason="stop")

        async def fake_generate(*args, **kwargs):
            # Stand-in for the engine client: emits one finished output.
            yield engine_output

        decode_handler.engine_client.generate = fake_generate

        # Timing reported by the prefill stage; its request_received_seconds
        # deliberately differs from the top-level value in the request.
        prefill_timing = {
            "request_received_seconds": 999.0,
            "prefill_start_seconds": 1001.0,
            "prefill_end_seconds": 1002.0,
        }
        request = {
            "token_ids": [1, 2, 3],
            "sampling_options": {},
            "stop_conditions": {},
            "extra_fields": ["timing_metrics"],
            "request_received_seconds": 1000.0,
            "prefill_result": {
                "disaggregated_params": {"timing_metrics": prefill_timing}
            },
        }
        chunks = [chunk async for chunk in decode_handler.generate(request, ctx)]

        timing = chunks[-1]["disaggregated_params"]["timing_metrics"]

        # The top-level (frontend) timestamp must be kept.
        assert timing["request_received_seconds"] == 1000.0
        # Prefill start/end must be carried over from the prefill result.
        assert timing["prefill_start_seconds"] == 1001.0
        assert timing["prefill_end_seconds"] == 1002.0
|
|
||
|
|
||
class TestPrefillWorkerHandlerTiming:
    """Timing-metrics tests for ``PrefillWorkerHandler.generate`` with a mocked engine."""

    @pytest.mark.asyncio
    async def test_timing_metrics_included_in_prefill_output(self):
        """Requesting ``timing_metrics`` puts timing data in the last prefill chunk."""
        prefill_handler = create_mock_handler(PrefillWorkerHandler)
        ctx = create_mock_context()

        engine_output = make_mock_request_output([100])
        engine_output.kv_transfer_params = {"some": "params"}

        async def fake_generate(*args, **kwargs):
            # Stand-in for the engine client: emits the single prefill output.
            yield engine_output

        prefill_handler.engine_client.generate = fake_generate

        request = {
            "token_ids": [1, 2, 3],
            "sampling_options": {},
            "stop_conditions": {},
            "extra_fields": ["timing_metrics"],
            "request_received_seconds": 1000.0,
        }
        chunks = [chunk async for chunk in prefill_handler.generate(request, ctx)]

        timing = chunks[-1]["disaggregated_params"]["timing_metrics"]

        assert timing["request_received_seconds"] == 1000.0
        # Only presence is checked for the prefill timestamps, not their values.
        for expected_key in ("prefill_start_seconds", "prefill_end_seconds"):
            assert expected_key in timing
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.