ai-dynamo · tzulingk · Sep 19, 2025 · Sep 15, 2025 · Sep 18, 2025
@@ -0,0 +1,46 @@
+#  SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#  SPDX-License-Identifier: Apache-2.0
+
+"""
+sglang-specific health check configuration.
+
+This module defines the default health check payload for sglang backends.
+"""
+
+from dynamo.health_check import HealthCheckPayload
+
+
+class SglangHealthCheckPayload(HealthCheckPayload):
+    """
+    Health check payload preconfigured for sglang backends.
+
+    Supplies the sglang defaults; environment override support is inherited from the base class.
+    """
+
+    def __init__(self):
+        """Build the sglang default payload, then run the base-class initializer."""
+        # Cap generation at a single token so the request completes quickly.
+        stop_conditions = {
+            "max_tokens": 1,
+            "stop": None,
+            "stop_token_ids_hidden": None,
+            "min_tokens": 0,
+            "ignore_eos": False,
+        }
+        sampling_options = {
+            "n": 1,
+            "temperature": 0.0,
+            "top_p": 1.0,
+            "top_k": -1,
+            "seed": None,
+            "use_beam_search": False,
+        }
+        # The handler expects token_ids, stop_conditions, and sampling_options.
+        self.default_payload = {
+            "token_ids": [1],  # one-token prompt keeps processing minimal
+            "stop_conditions": stop_conditions,
+            "sampling_options": sampling_options,
+            "eos_token_ids": [],
+            "annotations": [],
+        }
+        super().__init__()
@@ -15,6 +15,7 @@
 from dynamo.runtime import DistributedRuntime, dynamo_worker
 from dynamo.runtime.logging import configure_dynamo_logging
 from dynamo.sglang.args import Config, DisaggregationMode, parse_args
+from dynamo.sglang.health_check import SglangHealthCheckPayload
 from dynamo.sglang.publisher import setup_sgl_metrics
 from dynamo.sglang.register import register_llm_with_runtime_config
 from dynamo.sglang.request_handlers import DecodeWorkerHandler, PrefillWorkerHandler
@@ -112,6 +113,9 @@ async def register_model():
         ready_event.set()
         logging.info("Model registration succeeded; processing queued requests")
 
+    # Get health check payload (checks env var and falls back to sglang default)
+    health_check_payload = SglangHealthCheckPayload().to_dict()
+
     try:
         # Start endpoint immediately and register model concurrently
         # Requests queue until ready_event is set
@@ -120,6 +124,7 @@ async def register_model():
                 handler.generate,
                 graceful_shutdown=True,
                 metrics_labels=metrics_labels,
+                health_check_payload=health_check_payload,
             ),
             register_model(),
         )
@@ -150,11 +155,15 @@ async def init_prefill(runtime: DistributedRuntime, config: Config):
 
     handler = PrefillWorkerHandler(component, engine, config)
 
+    # Get health check payload (checks env var and falls back to sglang default)
+    health_check_payload = SglangHealthCheckPayload().to_dict()
+
     tasks = [
         generate_endpoint.serve_endpoint(
             handler.generate,
             graceful_shutdown=True,
             metrics_labels=[("model", server_args.served_model_name)],
+            health_check_payload=health_check_payload,
         )
     ]