Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 21 additions & 5 deletions components/backends/trtllm/src/dynamo/trtllm/health_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,27 @@ def __init__(self):
"""
Initialize TRT-LLM health check payload with TRT-LLM-specific defaults.
"""
- # Set TRT-LLM default payload - minimal request that completes quickly
+ # Set TensorRT-LLM default payload - minimal request that completes quickly
+ # The handler expects token_ids, stop_conditions, and sampling_options
self.default_payload = {
- "messages": [{"role": "user", "content": "1"}],
- "max_tokens": 1,
- "temperature": 0.0,
- "stream": False,
+ "token_ids": [1], # Single token for minimal processing
+ "stop_conditions": {
+ "max_tokens": 1, # Generate only 1 token
+ "stop": None,
+ "stop_token_ids": None,
+ "include_stop_str_in_output": False,
+ "ignore_eos": False,
+ "min_tokens": 0,
+ },
+ "sampling_options": {
+ "temperature": 0.0,
+ "top_p": 1.0,
+ "top_k": 1,
+ "beam_width": 1,
+ "repetition_penalty": 1.0,
+ "presence_penalty": 0.0,
+ "frequency_penalty": 0.0,
+ "seed": None,
+ },
}
super().__init__()
12 changes: 10 additions & 2 deletions components/backends/trtllm/src/dynamo/trtllm/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from dynamo.runtime import DistributedRuntime, dynamo_worker
from dynamo.runtime.logging import configure_dynamo_logging
from dynamo.trtllm.engine import TensorRTLLMEngine, get_llm_engine
+ from dynamo.trtllm.health_check import TrtllmHealthCheckPayload
from dynamo.trtllm.multimodal_processor import MultimodalRequestProcessor
from dynamo.trtllm.publisher import get_publisher
from dynamo.trtllm.request_handlers.handlers import (
Expand Down Expand Up @@ -316,6 +317,9 @@ async def init(runtime: DistributedRuntime, config: Config):
runtime_config=runtime_config,
)

+ # Get health check payload (checks env var and falls back to TensorRT-LLM default)
+ health_check_payload = TrtllmHealthCheckPayload().to_dict()

if config.publish_events_and_metrics and is_first_worker(config):
# Initialize and pass in the publisher to the request handler to
# publish events and metrics.
Expand All @@ -334,11 +338,15 @@ async def init(runtime: DistributedRuntime, config: Config):
handler_config.publisher = publisher
handler = RequestHandlerFactory().get_request_handler(handler_config)
await endpoint.serve_endpoint(
- handler.generate, metrics_labels=metrics_labels
+ handler.generate,
+ metrics_labels=metrics_labels,
+ health_check_payload=health_check_payload,
)
else:
handler = RequestHandlerFactory().get_request_handler(handler_config)
- await endpoint.serve_endpoint(handler.generate)
+ await endpoint.serve_endpoint(
+ handler.generate, health_check_payload=health_check_payload
+ )


def main():
Expand Down
Loading