26 commits (showing changes from 1 commit)
86c79ba - some prelim cleanups (PeaBrane, May 30, 2025)
6bee243 - router can route to dp ranks (PeaBrane, May 30, 2025)
dab052c - make the bunny hoppy (PeaBrane, May 30, 2025)
be6900e - Merge remote-tracking branch 'origin/main' into rupei/router-general (PeaBrane, May 30, 2025)
25e1291 - Merge remote-tracking branch 'origin/main' into rupei/router-general (PeaBrane, May 30, 2025)
34e5c5b - new struct combining worker_id with dp_rank, dirty commit, breaks bin… (PeaBrane, May 30, 2025)
2cef74c - binding works (PeaBrane, May 30, 2025)
10d3326 - dummy c binding note (PeaBrane, May 30, 2025)
4483c68 - add_class WorkerWithDpRank (PeaBrane, May 30, 2025)
263c12d - renames + comments + fmt (PeaBrane, May 31, 2025)
65ea6b5 - allow suffix for dp_rank identification (PeaBrane, Jun 3, 2025)
a2ef896 - WIP: fix fn dp_rank, add TODO's (alec-flowers, Jun 3, 2025)
e80d66c - refactor: fix bugs, kv publishing working (alec-flowers, Jun 3, 2025)
7a733bd - fix panicing metric thread issue (alec-flowers, Jun 4, 2025)
1bddc8e - remove verbose log (alec-flowers, Jun 4, 2025)
ee283cc - update v1 worker (alec-flowers, Jun 4, 2025)
183a8fe - put dp_rank in PreprocessedRequest (PeaBrane, Jun 4, 2025)
be7f951 - new agg config (PeaBrane, Jun 4, 2025)
e1011d8 - updated comments (PeaBrane, Jun 4, 2025)
5bf4fae - update v1 example (alec-flowers, Jun 4, 2025)
d6ded6c - final touches for it working with dp (alec-flowers, Jun 4, 2025)
61b94ac - Merge branch 'main' into rupei/router-general (alec-flowers, Jun 4, 2025)
9335efe - fix cost function trace (PeaBrane, Jun 4, 2025)
931b837 - fmt (PeaBrane, Jun 4, 2025)
2a72271 - Merge branch 'main' into rupei/router-general (PeaBrane, Jun 4, 2025)
eb7bb10 - WIP document current work steps (alec-flowers, Jun 5, 2025)
Commit d6ded6ca98c5ac539f7cdd0661df57e74d339bbe - final touches for it working with dp
alec-flowers committed Jun 4, 2025
examples/vllm_v1/components/worker.py (7 changes: 6 additions & 1 deletion)
@@ -25,7 +25,7 @@
 from utils.args import parse_vllm_args
 from utils.protocol import PreprocessedRequest
 from vllm.config import VllmConfig
-from vllm.distributed.kv_events import ZmqEventPublisher
+from vllm.distributed.kv_events import KVEventsConfig, ZmqEventPublisher
 from vllm.inputs import TokensPrompt
 from vllm.sampling_params import SamplingParams
 from vllm.usage.usage_lib import UsageContext
@@ -103,10 +103,15 @@ class VllmBaseWorker:
     def __init__(self):
         class_name = self.__class__.__name__
         self.engine_args = parse_vllm_args(class_name, "")
+        self.engine_args.kv_events_config = KVEventsConfig(
+            enable_kv_cache_events=True, publisher="zmq"
+        )
         if not self.engine_args.block_size:
             logger.info(f"block_size not set, default to {BLOCK_SIZE}")
             self.engine_args.block_size = BLOCK_SIZE

+        os.environ["VLLM_NO_USAGE_STATS"] = "1"  # Avoid internal HTTP requests
+
         model_config = self.engine_args.create_model_config()
         self.default_sampling_params = model_config.get_diff_sampling_param()

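The KVEventsConfig change above turns on vLLM's ZMQ-based KV cache event publishing, which is what lets the router track block residency per dp rank. For orientation, here is a minimal pyzmq subscriber sketch; the endpoint and the frame layout are assumptions on my part, so check your vLLM version's ZmqEventPublisher for the actual defaults and wire format.

# Minimal sketch of consuming KV cache events over ZMQ (not the project's
# router code). Assumed: publisher endpoint tcp://127.0.0.1:5557 and
# multipart frames of (topic, payload); verify against ZmqEventPublisher.
import zmq

def listen_for_kv_events(endpoint: str = "tcp://127.0.0.1:5557") -> None:
    ctx = zmq.Context.instance()
    sock = ctx.socket(zmq.SUB)
    sock.connect(endpoint)
    sock.setsockopt_string(zmq.SUBSCRIBE, "")  # subscribe to every topic
    while True:
        frames = sock.recv_multipart()  # blocks until an event batch arrives
        print(f"got {len(frames)} frames; payload is {len(frames[-1])} bytes")

if __name__ == "__main__":
    listen_for_kv_events()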
examples/vllm_v1/configs/agg.yaml (5 changes: 2 additions & 3 deletions)
@@ -15,8 +15,7 @@
 Common:
   model: Qwen/Qwen3-0.6B
   data-parallel-size: 2
-  router: kv
-  block-size: 64
+  block-size: 16
   max-model-len: 16384
   served_model_name: Qwen/Qwen3-0.6B

Review comment (Contributor) on "block-size: 16": why do we need to set that?
Review comment (Contributor) on "max-model-len: 16384": why do we need to set that?

@@ -29,7 +28,7 @@ VllmDecodeWorker:
   max-num-batched-tokens: 16384
   enable-prefix-caching: true
   ServiceArgs:
-    workers: 2 # 2 workers
+    workers: 1 # 2 workers
     resources:
       gpu: 2 # 2 dp ranks
   common-configs: [model, served_model_name, block-size, data-parallel-size, max-model-len]

Review comment (Contributor) on "max-num-batched-tokens: 16384": why do we need to set that?
Review comment (Contributor) on "enable-prefix-caching: true": It is enabled by default in V1.
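The net effect of this config is one worker process driving two data-parallel ranks on two GPUs, with the router addressing each (worker_id, dp_rank) pair individually (see the WorkerWithDpRank commits above). A small illustration of how the routable targets multiply, using hypothetical ids:

# Hypothetical illustration only: data parallelism multiplies routing targets.
# With ServiceArgs.workers: 1 and Common.data-parallel-size: 2, the router
# sees two independently routable (worker_id, dp_rank) pairs.
from itertools import product

worker_ids = [0]       # ServiceArgs.workers: 1
dp_ranks = range(2)    # Common.data-parallel-size: 2

targets = list(product(worker_ids, dp_ranks))
print(targets)         # [(0, 0), (0, 1)]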
examples/vllm_v1/utils/args.py (3 changes: 3 additions & 0 deletions)
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+
 # TODO: rename to avoid ambiguity with vllm package
 from vllm.engine.arg_utils import AsyncEngineArgs
 from vllm.utils import FlexibleArgumentParser
@@ -23,6 +24,7 @@
 def parse_vllm_args(service_name, prefix) -> AsyncEngineArgs:
     config = ServiceConfig.get_instance()
     vllm_args = config.as_args(service_name, prefix=prefix)
+
     parser = FlexibleArgumentParser()
     parser.add_argument(
         "--enable-disagg", action="store_true", help="Enable disaggregation"
@@ -31,4 +33,5 @@ def parse_vllm_args(service_name, prefix) -> AsyncEngineArgs:
     args = parser.parse_args(vllm_args)
     engine_args = AsyncEngineArgs.from_cli_args(args)
     engine_args.enable_disagg = args.enable_disagg
+
     return engine_args
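For context, parse_vllm_args layers one custom flag on top of vLLM's own CLI surface. Below is a standalone sketch of the same pattern; the argument list is hypothetical (ServiceConfig.as_args normally supplies those strings), and it assumes the collapsed part of the file registers vLLM's flags via AsyncEngineArgs.add_cli_args, as is typical.

# Standalone sketch of the parse_vllm_args pattern; the arg list below is
# hypothetical (normally produced by ServiceConfig.as_args).
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.utils import FlexibleArgumentParser

parser = FlexibleArgumentParser()
parser.add_argument(
    "--enable-disagg", action="store_true", help="Enable disaggregation"
)
parser = AsyncEngineArgs.add_cli_args(parser)  # assumed: registers vLLM's flags

args = parser.parse_args(
    ["--model", "Qwen/Qwen3-0.6B", "--block-size", "16", "--enable-disagg"]
)
engine_args = AsyncEngineArgs.from_cli_args(args)
engine_args.enable_disagg = args.enable_disagg  # carry the custom flag along
print(engine_args.model, engine_args.block_size, engine_args.enable_disagg)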