-
Notifications
You must be signed in to change notification settings - Fork 750
feat: router supporting intra-worker dp routing #1285
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
86c79ba
6bee243
dab052c
be6900e
25e1291
34e5c5b
2cef74c
10d3326
4483c68
263c12d
65ea6b5
a2ef896
e80d66c
7a733bd
1bddc8e
ee283cc
183a8fe
be7f951
e1011d8
5bf4fae
d6ded6c
61b94ac
9335efe
931b837
2a72271
eb7bb10
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
- Loading branch information
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,100 @@ | ||
| # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
|
|
||
| # Work In Progress. This is not usable currently | ||
|
|
||
| import asyncio | ||
| import logging | ||
| import os | ||
| import signal | ||
| import socket | ||
| from typing import Optional | ||
|
|
||
| from utils.args import parse_vllm_args | ||
| from vllm import run_headless | ||
| from vllm.distributed.kv_events import KVEventsConfig | ||
|
|
||
| from dynamo.sdk import service | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
||
| BLOCK_SIZE = 16 | ||
|
|
||
|
|
||
| @service( | ||
| dynamo={ | ||
| "enabled": True, | ||
| "namespace": "dynamo", | ||
| }, | ||
| resources={"gpu": 1, "cpu": "10", "memory": "20Gi"}, | ||
| workers=1, | ||
| ) | ||
| class VllmHeadlessWorker: | ||
| def __init__(self): | ||
| class_name = self.__class__.__name__ | ||
| self.engine_args = parse_vllm_args(class_name, "") | ||
| self.engine_args.kv_events_config = KVEventsConfig( | ||
| enable_kv_cache_events=True, publisher="zmq" | ||
| ) | ||
| if not self.engine_args.block_size: | ||
| logger.info(f"block_size not set, default to {BLOCK_SIZE}") | ||
| self.engine_args.block_size = BLOCK_SIZE | ||
|
|
||
| os.environ["VLLM_NO_USAGE_STATS"] = "1" # Avoid internal HTTP requests | ||
|
|
||
| model_config = self.engine_args.create_model_config() | ||
| self.default_sampling_params = model_config.get_diff_sampling_param() | ||
|
|
||
| self.kv_publishers = [] | ||
|
|
||
| signal.signal(signal.SIGTERM, self.shutdown_vllm_engine) | ||
| signal.signal(signal.SIGINT, self.shutdown_vllm_engine) | ||
|
|
||
| self.set_side_channel_host_and_port() | ||
|
|
||
| async def async_init(self): | ||
| run_headless(self.engine_args) | ||
|
|
||
| def shutdown_vllm_engine(self, signum, frame): | ||
| """Shutdown the background loop""" | ||
| logger.info(f"Received signal {signum}, shutting down") | ||
| loop = asyncio.get_event_loop() | ||
| try: | ||
| self.engine_client.shutdown() | ||
| for publisher in self.kv_publishers: | ||
| publisher.shutdown() | ||
| logger.info("VllmWorker shutdown complete") | ||
| except Exception as e: | ||
| logger.error(f"Error during shutdown: {e}") | ||
| finally: | ||
| loop.stop() | ||
|
|
||
| def set_side_channel_host_and_port( | ||
| self, hostname: Optional[str] = None, port: Optional[int] = None | ||
| ): | ||
| """vLLM V1 NixlConnector creates a side channel to exchange metadata with other NIXL connectors. | ||
| This sets the port number for the side channel. | ||
| """ | ||
| if hostname is None: | ||
| hostname = socket.gethostname() | ||
| if port is None: | ||
| with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: | ||
| s.bind(("", 0)) # Bind to a free port provided by the host. | ||
| port = s.getsockname()[1] # Get the port number assigned. | ||
| logger.debug("Setting VLLM_NIXL_SIDE_CHANNEL_HOST to %s", hostname) | ||
| os.environ["VLLM_NIXL_SIDE_CHANNEL_HOST"] = hostname | ||
| logger.debug("Setting VLLM_NIXL_SIDE_CHANNEL_PORT to %s", port) | ||
| os.environ["VLLM_NIXL_SIDE_CHANNEL_PORT"] = str(port) |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,8 +1,4 @@ | ||
| # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # you may not use this file except in compliance with the License.More actions | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
|
|
@@ -14,7 +10,7 @@ | |
| # limitations under the License. | ||
| Common: | ||
| model: Qwen/Qwen3-0.6B | ||
| data-parallel-size: 2 | ||
|
|
||
| block-size: 16 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why do we need to set that? |
||
| max-model-len: 16384 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why do we need to set that? |
||
| served_model_name: Qwen/Qwen3-0.6B | ||
|
|
@@ -27,9 +23,14 @@ VllmDecodeWorker: | |
| enforce-eager: true | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why do we need to set that?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah sorry. I think we pushed the changes to our config file for our local dev. We were hijacking your simple load balancer to do kv routing 😆, but those changes were not pushed. For now, we are still working on cleaning the python bits up. |
||
| max-num-batched-tokens: 16384 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why do we need to set that? |
||
| enable-prefix-caching: true | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is enabled by default in V1 |
||
| data-parallel-address: 127.0.0.1 | ||
| data-parallel-rpc-port: 62300 | ||
| data-parallel-size: 2 | ||
| data-parallel-size-local: 1 | ||
| # api-server-count: 2 | ||
|
|
||
| ServiceArgs: | ||
| workers: 1 # 2 workers | ||
| resources: | ||
| gpu: 2 # 2 dp ranks | ||
| common-configs: [model, served_model_name, block-size, data-parallel-size, max-model-len] | ||
|
|
||
| gpu: 1 # 2 dp ranks | ||
| common-configs: [model, served_model_name, block-size, max-model-len] | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
V1 and prefix caching are enabled by default