Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions components/backends/sglang/src/dynamo/sglang/health_check.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""
sglang-specific health check configuration.
This module defines the default health check payload for sglang backends.
"""

from dynamo.health_check import HealthCheckPayload


class SglangHealthCheckPayload(HealthCheckPayload):
    """
    Health check payload preconfigured for sglang backends.

    Supplies the sglang default request. Environment override handling is
    not implemented here; it is left to the ``HealthCheckPayload`` base class.
    """

    def __init__(self):
        """
        Build the sglang default payload, then run the base initializer.
        """
        # The handler expects token_ids, stop_conditions, and sampling_options.
        # The request is kept minimal so the health check completes quickly.
        stop_conditions = {
            "max_tokens": 1,  # generate exactly one token
            "stop": None,
            "stop_token_ids_hidden": None,
            "min_tokens": 0,
            "ignore_eos": False,
        }
        sampling_options = {
            "n": 1,
            "temperature": 0.0,
            "top_p": 1.0,
            "top_k": -1,
            "seed": None,
            "use_beam_search": False,
        }

        # NOTE(review): default_payload is assigned before super().__init__();
        # presumably the base initializer reads it -- keep this ordering.
        self.default_payload = {
            "token_ids": [1],  # a single input token keeps processing minimal
            "stop_conditions": stop_conditions,
            "sampling_options": sampling_options,
            "eos_token_ids": [],
            "annotations": [],
        }
        super().__init__()
9 changes: 9 additions & 0 deletions components/backends/sglang/src/dynamo/sglang/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from dynamo.runtime import DistributedRuntime, dynamo_worker
from dynamo.runtime.logging import configure_dynamo_logging
from dynamo.sglang.args import Config, DisaggregationMode, parse_args
from dynamo.sglang.health_check import SglangHealthCheckPayload
from dynamo.sglang.publisher import setup_sgl_metrics
from dynamo.sglang.register import register_llm_with_runtime_config
from dynamo.sglang.request_handlers import DecodeWorkerHandler, PrefillWorkerHandler
Expand Down Expand Up @@ -112,6 +113,9 @@ async def register_model():
ready_event.set()
logging.info("Model registration succeeded; processing queued requests")

# Get health check payload (checks env var and falls back to sglang default)
health_check_payload = SglangHealthCheckPayload().to_dict()

try:
# Start endpoint immediately and register model concurrently
# Requests queue until ready_event is set
Expand All @@ -120,6 +124,7 @@ async def register_model():
handler.generate,
graceful_shutdown=True,
metrics_labels=metrics_labels,
health_check_payload=health_check_payload,
),
register_model(),
)
Expand Down Expand Up @@ -150,11 +155,15 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):

handler = PrefillWorkerHandler(component, engine, config)

# Get health check payload (checks env var and falls back to sglang default)
health_check_payload = SglangHealthCheckPayload().to_dict()

tasks = [
generate_endpoint.serve_endpoint(
handler.generate,
graceful_shutdown=True,
metrics_labels=[("model", server_args.served_model_name)],
health_check_payload=health_check_payload,
)
]

Expand Down
Loading