diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h
index c528581ad9e..85c9a3ac942 100644
--- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h
+++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h
@@ -100,6 +100,7 @@ class GenericLlmRequest
         RequestIdType, TensorPtr&, BeamTokens const&, TStream const&, std::optional)>;
     using RequestPtr = std::shared_ptr;
     using MillisecondsType = std::chrono::milliseconds;
+    using TimePoint = std::chrono::time_point;
     using CacheSaltIDType = runtime::CacheSaltIDType;

     GenericLlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, std::shared_ptr const& inputTokens,
@@ -138,7 +139,7 @@ class GenericLlmRequest
         std::optional languageAdapterUid = std::nullopt,
         std::optional allottedTimeMs = std::nullopt,
         std::optional const& contextPhaseParams = std::nullopt,
-        std::optional cacheSaltID = std::nullopt)
+        std::optional cacheSaltID = std::nullopt, std::optional arrivalTime = std::nullopt)
         : mRequestId(requestId)
         , mPromptLen(inputTokens->size())
         , mMaxNewTokens(maxNewTokens)
@@ -202,7 +203,7 @@ class GenericLlmRequest
             mState = LlmRequestState::kENCODER_INIT;
         }

-        initialize(*inputTokens, returnLogProbs);
+        initialize(*inputTokens, returnLogProbs, arrivalTime);
     }

     GenericLlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, VecTokens const& inputTokens,
@@ -2054,7 +2055,8 @@ class GenericLlmRequest
     std::optional mCacheSaltID{std::nullopt};

 private:
-    void initialize(VecTokens const& inputTokens, bool outputLogProbs)
+    void initialize(
+        VecTokens const& inputTokens, bool outputLogProbs, std::optional arrivalTime = std::nullopt)
     {
         if (mLlmRequestType == LlmRequestType::LLMREQUEST_TYPE_GENERATION_ONLY)
         {
@@ -2148,7 +2150,7 @@ class GenericLlmRequest

         if (mReturnPerfMetrics)
         {
-            mPerfMetrics.timingMetrics.arrivalTime = std::chrono::steady_clock::now();
+            mPerfMetrics.timingMetrics.arrivalTime = arrivalTime.value_or(std::chrono::steady_clock::now());
         }
         mStartTime = std::chrono::steady_clock::now();
     }
@@ -2197,61 +2199,9 @@ class LlmRequest : public GenericLlmRequest
     using TokenExtraIdType = Base::TokenExtraIdType;
     using VecTokenExtraIds = Base::VecTokenExtraIds;

-    // 49 parameters, 49 parameters in Base class constructor
-    LlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, std::shared_ptr inputTokens,
-        runtime::SamplingConfig const& samplingConfig, bool isStreaming, std::optional endId = std::nullopt,
-        std::optional padId = std::nullopt, std::optional embeddingBias = std::nullopt,
-        std::optional badWordsList = std::nullopt, std::optional stopWordsList = std::nullopt,
-        std::optional>> positionIds = std::nullopt,
-        std::optional promptEmbeddingTable = std::nullopt,
-        std::optional promptVocabSize = std::nullopt,
-        std::optional>>> multimodalHashes = std::nullopt,
-        std::optional>> multimodalPositions = std::nullopt,
-        std::optional>> multimodalLengths = std::nullopt,
-        std::optional multimodalEmbedding = std::nullopt,
-        std::optional mropeRotaryCosSin = std::nullopt,
-        std::optional mropePositionDeltas = std::nullopt,
-        std::optional loraTaskId = std::nullopt, std::optional loraWeights = std::nullopt,
-        std::optional loraConfig = std::nullopt,
-        std::optional lookaheadConfig = std::nullopt,
-        std::optional kvCacheRetentionConfig = std::nullopt,
-        bool returnLogProbs = false, bool returnContextLogits = false, bool returnGenerationLogits = false,
-        std::optional> const& draftTokens = std::nullopt,
-        std::optional draftLogits = std::nullopt, bool excludeInputFromOutput = false,
-        std::optional logitsPostProcessor = std::nullopt,
-        bool applyLogitsPostProcessorBatched = false,
-        std::optional> encoderInputTokens = std::nullopt, bool returnEncoderOutput = false,
-        std::optional clientId = std::nullopt,
-        executor::PriorityType priority = executor::Request::kDefaultPriority,
-        std::optional encoderInputFeatures = std::nullopt,
-        std::optional encoderOutputLength = std::nullopt,
-        std::optional crossAttentionMask = std::nullopt,
-        LlmRequestType llmRequestType = LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION,
-        std::optional> inputTokenExtraIds = std::nullopt,
-        SizeType32 numReturnSequences = 1, std::optional eagleConfig = std::nullopt,
-        std::optional skipCrossAttnBlocks = std::nullopt, bool returnPerfMetrics = false,
-        std::optional guidedDecodingParams = std::nullopt,
-        std::optional languageAdapterUid = std::nullopt,
-        std::optional allottedTimeMs = std::nullopt,
-        std::optional const& contextPhaseParams = std::nullopt,
-        std::optional cacheSaltID = std::nullopt)
-        : Base(requestId, maxNewTokens, std::move(inputTokens), samplingConfig, isStreaming, endId, padId,
-            std::move(embeddingBias), std::move(badWordsList), std::move(stopWordsList), std::move(positionIds),
-            std::move(promptEmbeddingTable), promptVocabSize, std::move(multimodalHashes),
-            std::move(multimodalPositions), std::move(multimodalLengths), std::move(multimodalEmbedding),
-            std::move(mropeRotaryCosSin), mropePositionDeltas, loraTaskId, std::move(loraWeights),
-            std::move(loraConfig), std::move(lookaheadConfig), std::move(kvCacheRetentionConfig), returnLogProbs,
-            returnContextLogits, returnGenerationLogits, std::move(draftTokens), std::move(draftLogits),
-            excludeInputFromOutput, std::move(logitsPostProcessor), applyLogitsPostProcessorBatched,
-            std::move(encoderInputTokens), returnEncoderOutput, clientId, priority, std::move(encoderInputFeatures),
-            std::move(encoderOutputLength), std::move(crossAttentionMask), llmRequestType,
-            std::move(inputTokenExtraIds), numReturnSequences, std::move(eagleConfig), std::move(skipCrossAttnBlocks),
-            returnPerfMetrics, std::move(guidedDecodingParams), languageAdapterUid, allottedTimeMs, contextPhaseParams,
-            cacheSaltID)
-    {
-    }
-
-    // 49 parameters, 49 parameters in Base class constructor
+    // inherit constructors
+    using Base::Base;
+
     LlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, std::vector inputTokens,
         runtime::SamplingConfig const& samplingConfig, bool isStreaming, std::optional endId = std::nullopt,
         std::optional padId = std::nullopt, std::optional embeddingBias = std::nullopt,
@@ -2286,7 +2236,7 @@ class LlmRequest : public GenericLlmRequest
         std::optional languageAdapterUid = std::nullopt,
         std::optional allottedTimeMs = std::nullopt,
         std::optional const& contextPhaseParams = std::nullopt,
-        std::optional cacheSaltID = std::nullopt)
+        std::optional cacheSaltID = std::nullopt, std::optional arrivalTime = std::nullopt)
         : Base(requestId, maxNewTokens, std::make_shared>(std::move(inputTokens)),
             samplingConfig, isStreaming, endId, padId, std::move(embeddingBias), std::move(badWordsList),
             std::move(stopWordsList),
@@ -2316,37 +2266,8 @@ class LlmRequest : public GenericLlmRequest
             inputTokenExtraIds ? std::make_optional(std::make_shared(std::move(*inputTokenExtraIds)))
                                : std::optional>(std::nullopt),
             numReturnSequences, std::move(eagleConfig), skipCrossAttnBlocks, returnPerfMetrics,
-            std::move(guidedDecodingParams), languageAdapterUid, allottedTimeMs, contextPhaseParams, cacheSaltID)
-    {
-    }
-
-    // 32 parameters, 32 parameters in Base class constructor
-    LlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, VecTokens const& inputTokens,
-        runtime::SamplingConfig const& samplingConfig, bool isStreaming, std::optional endId = std::nullopt,
-        std::optional padId = std::nullopt, std::optional embeddingBias = std::nullopt,
-        std::optional badWordsList = std::nullopt, std::optional stopWordsList = std::nullopt,
-        std::optional>> positionIds = std::nullopt,
-        std::optional promptEmbeddingTable = std::nullopt,
-        std::optional promptVocabSize = std::nullopt,
-        std::optional loraTaskId = std::nullopt, std::optional loraWeights = std::nullopt,
-        std::optional loraConfig = std::nullopt,
-        std::optional lookaheadConfig = std::nullopt, bool returnLogProbs = false,
-        bool returnContextLogits = false, bool returnGenerationLogits = false,
-        std::optional draftTokens = std::nullopt, std::optional draftLogits = std::nullopt,
-        bool excludeInputFromOutput = false, std::optional logitsPostProcessor = std::nullopt,
-        bool applyLogitsPostProcessorBatched = false, std::optional encoderInputTokens = std::nullopt,
-        bool returnEncoderOutput = false, std::optional clientId = std::nullopt,
-        executor::PriorityType priority = executor::Request::kDefaultPriority, SizeType32 numReturnSequences = 1,
-        std::optional languageAdapterUid = std::nullopt,
-        std::optional const& contextPhaseParams = std::nullopt,
-        std::optional cacheSaltID = std::nullopt)
-        : Base(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, endId, padId,
-            std::move(embeddingBias), std::move(badWordsList), std::move(stopWordsList), std::move(positionIds),
-            std::move(promptEmbeddingTable), promptVocabSize, loraTaskId, std::move(loraWeights), std::move(loraConfig),
-            lookaheadConfig, returnLogProbs, returnContextLogits, returnGenerationLogits, std::move(draftTokens),
-            std::move(draftLogits), excludeInputFromOutput, std::move(logitsPostProcessor),
-            applyLogitsPostProcessorBatched, std::move(encoderInputTokens), returnEncoderOutput, clientId, priority,
-            numReturnSequences, languageAdapterUid, contextPhaseParams, cacheSaltID)
+            std::move(guidedDecodingParams), languageAdapterUid, allottedTimeMs, contextPhaseParams, cacheSaltID,
+            arrivalTime)
     {
     }

diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
index 94f15939f02..e0325b51c8a 100644
--- a/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
+++ b/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
@@ -32,6 +32,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -289,7 +290,8 @@ void initBindings(nb::module_& m)
                std::optional language_adapter_uid,
                std::optional allotted_time_ms,
                std::optional context_phase_params,
-                std::optional cache_salt_id)
+                std::optional cache_salt_id,
+                std::optional arrival_time)
            {
                auto makeOptionalTensor = [](std::optional const& atTensor, bool unsqueeze = false)
                {
@@ -329,8 +331,8 @@ void initBindings(nb::module_& m)
                    encoder_input_tokens, return_encoder_output, client_id, priority, encoder_input_features_tensor_ptr,
                    encoder_output_length, cross_attention_mask_tensor_ptr, llm_request_type, input_token_extra_ids,
                    num_return_sequences, eagle_config, skip_cross_attn_blocks_tensor_ptr, return_perf_metrics,
-                    guided_decoding_params, language_adapter_uid, allotted_time_ms, context_phase_params,
-                    cache_salt_id};
+                    guided_decoding_params, language_adapter_uid, allotted_time_ms, context_phase_params, cache_salt_id,
+                    arrival_time};
            },
            nb::arg("request_id"), nb::arg("max_new_tokens"), nb::arg("input_tokens"), nb::arg("sampling_config"),
            nb::arg("is_streaming"), nb::arg("end_id") = std::nullopt, nb::arg("pad_id") = std::nullopt,
@@ -355,7 +357,8 @@ void initBindings(nb::module_& m)
            nb::arg("eagle_config") = std::nullopt, nb::arg("skip_cross_attn_blocks") = std::nullopt,
            nb::arg("return_perf_metrics") = false, nb::arg("guided_decoding_params") = std::nullopt,
            nb::arg("language_adapter_uid") = std::nullopt, nb::arg("allotted_time_ms") = std::nullopt,
-            nb::arg("context_phase_params") = std::nullopt, nb::arg("cache_salt_id") = std::nullopt)
+            nb::arg("context_phase_params") = std::nullopt, nb::arg("cache_salt_id") = std::nullopt,
+            nb::arg("arrival_time") = std::nullopt)
        .def("check_token_id_range", &tb::LlmRequest::checkTokenIdRange, nb::arg("vocab_size"))
        .def(nb::init())
        .def("validate", &tb::LlmRequest::validate, nb::arg("max_input_len"), nb::arg("max_seq_len"),
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.cpp
index 46bfa0de64a..07d630cb3b2 100644
--- a/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.cpp
+++ b/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.cpp
@@ -126,6 +126,7 @@ std::shared_ptr LlmRequest::toTrtLlm() const
        mLanguageAdapterUid,                   //
        mAllottedTimeMs,                       //
        mContextPhaseParams,                   //
-        mCacheSaltID                           //
+        mCacheSaltID,                          //
+        mPerfMetrics.timingMetrics.arrivalTime //
    );
 }
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.h b/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.h
index b3d6f04aef8..4ea47fdcc8c 100644
--- a/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.h
+++ b/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.h
@@ -85,7 +85,8 @@ class LlmRequest : public tb::GenericLlmRequest
        std::optional languageAdapterUid = std::nullopt,
        std::optional allottedTimeMs = std::nullopt,
        std::optional const& contextPhaseParams = std::nullopt,
-        std::optional cacheSaltID = std::nullopt)
+        std::optional cacheSaltID = std::nullopt,
+        std::optional arrivalTime = std::nullopt)
        : Base(requestId,                                      //
            maxNewTokens,                                      //
            std::make_shared>(std::move(inputTokens)),         //
@@ -147,7 +148,8 @@ class LlmRequest : public tb::GenericLlmRequest
            languageAdapterUid, //
            allottedTimeMs,     //
            contextPhaseParams, //
-            cacheSaltID         //
+            cacheSaltID,        //
+            arrivalTime         //
        )
    {
    }
diff --git a/cpp/tensorrt_llm/nanobind/bindings.cpp b/cpp/tensorrt_llm/nanobind/bindings.cpp
index 21a5d4fbddd..b652adb341b 100644
--- a/cpp/tensorrt_llm/nanobind/bindings.cpp
+++ b/cpp/tensorrt_llm/nanobind/bindings.cpp
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -511,4 +512,6 @@ NB_MODULE(TRTLLM_NB_MODULE, m)
     m.def("ipc_nvls_allocate", &tr::ipcNvlsAllocate, nb::rv_policy::reference);
     m.def("ipc_nvls_free", &tr::ipcNvlsFree);
     m.def("ipc_nvls_supported", &tr::ipcNvlsSupported);
+
+    m.def("steady_clock_now", []() { return std::chrono::steady_clock::now(); });
 }
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp
index dffe8ad1977..9bcd22e39e4 100644
--- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp
+++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp
@@ -295,7 +295,8 @@ void initBindings(pybind11::module_& m)
                std::optional language_adapter_uid,
                std::optional allotted_time_ms,
                std::optional context_phase_params,
-                std::optional cache_salt_id)
+                std::optional cache_salt_id,
+                std::optional arrival_time)
            {
                auto makeOptionalTensor = [](std::optional const& atTensor, bool unsqueeze = false)
                {
@@ -336,7 +337,7 @@ void initBindings(pybind11::module_& m)
                    encoder_input_features_tensor_ptr, encoder_output_length, cross_attention_mask_tensor_ptr,
                    llm_request_type, input_token_extra_ids, num_return_sequences, eagle_config,
                    skip_cross_attn_blocks_tensor_ptr, return_perf_metrics, guided_decoding_params,
-                    language_adapter_uid, allotted_time_ms, context_phase_params, cache_salt_id};
+                    language_adapter_uid, allotted_time_ms, context_phase_params, cache_salt_id, arrival_time};
            }),
            py::arg("request_id"), py::arg("max_new_tokens"), py::arg("input_tokens"), py::arg("sampling_config"),
            py::arg("is_streaming"), py::arg("end_id") = std::nullopt, py::arg("pad_id") = std::nullopt,
@@ -362,7 +363,8 @@ void initBindings(pybind11::module_& m)
            py::arg("eagle_config") = std::nullopt, py::arg("skip_cross_attn_blocks") = std::nullopt,
            py::arg("return_perf_metrics") = false, py::arg("guided_decoding_params") = std::nullopt,
            py::arg("language_adapter_uid") = std::nullopt, py::arg("allotted_time_ms") = std::nullopt,
-            py::arg("context_phase_params") = std::nullopt, py::arg("cache_salt_id") = std::nullopt)
+            py::arg("context_phase_params") = std::nullopt, py::arg("cache_salt_id") = std::nullopt,
+            py::arg("arrival_time") = std::nullopt)
        .def("check_token_id_range", &tb::LlmRequest::checkTokenIdRange, py::arg("vocab_size"))
        .def(py::init())
        .def("validate", &tb::LlmRequest::validate, py::arg("max_input_len"), py::arg("max_seq_len"),
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.cpp
index 9b5c4bc1298..bcc9d4bf13f 100644
--- a/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.cpp
+++ b/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.cpp
@@ -125,6 +125,7 @@ std::shared_ptr LlmRequest::toTrtLlm() const
        mLanguageAdapterUid,                   //
        mAllottedTimeMs,                       //
        mContextPhaseParams,                   //
-        mCacheSaltID                           //
+        mCacheSaltID,                          //
+        mPerfMetrics.timingMetrics.arrivalTime //
    );
 }
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.h b/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.h
index 8d004cb304f..b43fb8dd073 100644
--- a/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.h
+++ b/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.h
@@ -85,7 +85,8 @@ class LlmRequest : public tb::GenericLlmRequest
        std::optional languageAdapterUid = std::nullopt,
        std::optional allottedTimeMs = std::nullopt,
        std::optional const& contextPhaseParams = std::nullopt,
-        std::optional cacheSaltID = std::nullopt)
+        std::optional cacheSaltID = std::nullopt,
+        std::optional arrivalTime = std::nullopt)
        : Base(requestId,                                      //
            maxNewTokens,                                      //
            std::make_shared>(std::move(inputTokens)),         //
@@ -147,7 +148,8 @@ class LlmRequest : public tb::GenericLlmRequest
            languageAdapterUid, //
            allottedTimeMs,     //
            contextPhaseParams, //
-            cacheSaltID         //
+            cacheSaltID,        //
+            arrivalTime         //
        )
    {
    }
diff --git a/cpp/tensorrt_llm/pybind/bindings.cpp b/cpp/tensorrt_llm/pybind/bindings.cpp
index 2b35fbf6946..e6f28548c20 100644
--- a/cpp/tensorrt_llm/pybind/bindings.cpp
+++ b/cpp/tensorrt_llm/pybind/bindings.cpp
@@ -16,6 +16,7 @@
  */

 #include
+#include
 #include
 #include
 #include
@@ -498,4 +499,6 @@ PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m)
     m.def("ipc_nvls_allocate", &tr::ipcNvlsAllocate, py::return_value_policy::reference);
     m.def("ipc_nvls_free", &tr::ipcNvlsFree);
     m.def("ipc_nvls_supported", &tr::ipcNvlsSupported);
+
+    m.def("steady_clock_now", []() { return std::chrono::steady_clock::now(); });
 }
diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py
index c6915fbd663..3d21238ee86 100644
--- a/tensorrt_llm/_torch/pyexecutor/llm_request.py
+++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py
@@ -562,6 +562,7 @@ def executor_request_to_llm_request(
         llm_request_type=llm_request_type,
         context_phase_params=executor_request.context_phase_params,
         cache_salt_id=executor_request.cache_salt_id,
+        arrival_time=getattr(executor_request, "py_arrival_time", None),
         py_multimodal_data=getattr(executor_request, "py_multimodal_data", None))

     if child_req_ids:
diff --git a/tensorrt_llm/executor/executor.py b/tensorrt_llm/executor/executor.py
index c9d55a7cfc1..10a90457586 100644
--- a/tensorrt_llm/executor/executor.py
+++ b/tensorrt_llm/executor/executor.py
@@ -125,6 +125,7 @@ def generate_async(
         multimodal_params: Optional[MultimodalParams] = None,
         scheduling_params: Optional[SchedulingParams] = None,
         cache_salt_id: Optional[int] = None,
+        arrival_time: Optional[float] = None,
     ) -> GenerationResult:
         """Generate output for the given prompt token ids in the asynchronous mode.
         Asynchronous generation accepts single prompt only.
@@ -149,7 +150,8 @@ def generate_async(
             disaggregated_params=disaggregated_params,
             multimodal_params=multimodal_params,
             scheduling_params=scheduling_params,
-            cache_salt_id=cache_salt_id)
+            cache_salt_id=cache_salt_id,
+            arrival_time=arrival_time)
         result = self.submit(request)
         # release memory in time
         if hasattr(request, "multimodal_params"):
diff --git a/tensorrt_llm/executor/request.py b/tensorrt_llm/executor/request.py
index 1030e57f091..6d4b1be1b62 100644
--- a/tensorrt_llm/executor/request.py
+++ b/tensorrt_llm/executor/request.py
@@ -98,6 +98,7 @@ def __init__(
         multimodal_params: Optional[MultimodalParams] = None,
         scheduling_params: Optional[SchedulingParams] = None,
         cache_salt_id: Optional[int] = None,
+        arrival_time: Optional[float] = None,
     ):
         if isinstance(prompt_token_ids, list):
             self.prompt_token_ids = prompt_token_ids
@@ -124,6 +125,7 @@ def __init__(
         self.disaggregated_params = disaggregated_params
         self.scheduling_params = scheduling_params
         self.cache_salt_id = cache_salt_id
+        self.arrival_time = arrival_time

     def set_id(self, id):
         assert self.id is None, f"Request ID is already set: {self.id}"
diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py
index b636985c137..4a7f49ea14e 100644
--- a/tensorrt_llm/executor/worker.py
+++ b/tensorrt_llm/executor/worker.py
@@ -593,6 +593,9 @@ def _deduce_max_tokens(request: GenerationRequest,
         if self._is_pytorch_backend and request.scheduling_params is not None:
             executor_request.py_scheduling_params = request.scheduling_params

+        if request.arrival_time is not None:
+            executor_request.py_arrival_time = request.arrival_time
+
         if request.query_token_ids is not None:
             # pytorch star attention workflow
             # a workaround to avoid public interface update
diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py
index b4c7b4cfbe0..b2665b587ec 100644
--- a/tensorrt_llm/llmapi/llm.py
+++ b/tensorrt_llm/llmapi/llm.py
@@ -18,6 +18,7 @@

 from .._utils import nvtx_range_debug
 from ..bindings import executor as tllm
+from ..bindings import steady_clock_now
 from ..builder import EngineConfig
 from ..disaggregated_params import DisaggregatedParams
 from ..executor import (DetokenizedGenerationResultBase, GenerationExecutor,
@@ -350,6 +351,9 @@ def generate_async(
         if self._executor is None or self._executor.is_shutdown():
             raise RuntimeError("LLM is shutting down")

+        arrival_time = steady_clock_now(
+        ) if self.args.return_perf_metrics else None
+
         sampling_params = self._prepare_sampling_params(sampling_params)
         cache_salt_id = get_cache_salt_id(
             cache_salt) if cache_salt is not None else None
@@ -448,6 +452,7 @@ def generate_async(
             multimodal_params=multimodal_params,
             scheduling_params=scheduling_params,
             cache_salt_id=cache_salt_id,
+            arrival_time=arrival_time,
         )

         return RequestOutput._from_generation_result(result, prompt,
diff --git a/tensorrt_llm/serve/openai_server.py b/tensorrt_llm/serve/openai_server.py
index 18dbac44c37..910f39dc1e6 100644
--- a/tensorrt_llm/serve/openai_server.py
+++ b/tensorrt_llm/serve/openai_server.py
@@ -427,9 +427,6 @@ async def create_chat_response(
             vocab_size=self.tokenizer.tokenizer.vocab_size,
             gather_generation_logits=self.llm.args.gather_generation_logits,
             backend=self.llm.args.backend)
-        # TODO: better way to enable metrics
-        if len(os.getenv("TRTLLM_KVCACHE_TIME_OUTPUT_PATH", "")) > 0:
-            sampling_params.return_perf_metrics = True

         postproc_args = ChatPostprocArgs.from_request(request)
         disaggregated_params = to_llm_disaggregated_params(request.disaggregated_params)
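
Usage sketch (not part of the patch): with these changes, generate_async() in llm.py stamps each request with steady_clock_now() on entry whenever return_perf_metrics is enabled, and the timestamp is threaded through GenerationRequest.arrival_time, executor_request.py_arrival_time, and the new LlmRequest arrival_time argument, so the reported arrival time reflects when the request reached the Python API rather than when the C++ LlmRequest was constructed. The snippet below only illustrates the intended effect; the model path and the request_perf_metrics attribute path on the result object are assumptions, not part of this diff.

# Illustration only -- not part of the patch. The model path and the
# request_perf_metrics attribute path on the result are assumptions.
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.bindings import steady_clock_now  # binding added by this diff

# self.args.return_perf_metrics gates the arrival-time capture in the llm.py hunk
# above; passing it as an LLM constructor argument is assumed here.
llm = LLM(model="/path/to/model", return_perf_metrics=True)

sampling_params = SamplingParams(max_tokens=8)
sampling_params.return_perf_metrics = True  # same flag the removed openai_server.py code set

t_before = steady_clock_now()  # same steady clock the C++ timing metrics use
output = llm.generate("Hello, world", sampling_params)

# Hypothetical attribute path: the reported arrival time should now fall at or after
# t_before, since it is captured on generate_async() entry and forwarded to the
# C++ LlmRequest instead of being taken when that object is constructed.
metrics = output.outputs[0].request_perf_metrics
print(t_before, metrics.timing_metrics.arrival_time)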