diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h
index c528581ad9e..85c9a3ac942 100644
--- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h
+++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h
@@ -100,6 +100,7 @@ class GenericLlmRequest
         RequestIdType, TensorPtr&, BeamTokens const&, TStream const&, std::optional)>;
     using RequestPtr = std::shared_ptr;
     using MillisecondsType = std::chrono::milliseconds;
+    using TimePoint = std::chrono::time_point;
     using CacheSaltIDType = runtime::CacheSaltIDType;

     GenericLlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, std::shared_ptr const& inputTokens,
@@ -138,7 +139,7 @@ class GenericLlmRequest
         std::optional languageAdapterUid = std::nullopt,
         std::optional allottedTimeMs = std::nullopt,
         std::optional const& contextPhaseParams = std::nullopt,
-        std::optional cacheSaltID = std::nullopt)
+        std::optional cacheSaltID = std::nullopt, std::optional arrivalTime = std::nullopt)
         : mRequestId(requestId)
         , mPromptLen(inputTokens->size())
         , mMaxNewTokens(maxNewTokens)
@@ -202,7 +203,7 @@ class GenericLlmRequest
             mState = LlmRequestState::kENCODER_INIT;
         }

-        initialize(*inputTokens, returnLogProbs);
+        initialize(*inputTokens, returnLogProbs, arrivalTime);
     }

     GenericLlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, VecTokens const& inputTokens,
@@ -2054,7 +2055,8 @@ class GenericLlmRequest
     std::optional mCacheSaltID{std::nullopt};

 private:
-    void initialize(VecTokens const& inputTokens, bool outputLogProbs)
+    void initialize(
+        VecTokens const& inputTokens, bool outputLogProbs, std::optional arrivalTime = std::nullopt)
     {
         if (mLlmRequestType == LlmRequestType::LLMREQUEST_TYPE_GENERATION_ONLY)
         {
@@ -2148,7 +2150,7 @@ class GenericLlmRequest

         if (mReturnPerfMetrics)
         {
-            mPerfMetrics.timingMetrics.arrivalTime = std::chrono::steady_clock::now();
+            mPerfMetrics.timingMetrics.arrivalTime = arrivalTime.value_or(std::chrono::steady_clock::now());
         }
         mStartTime = std::chrono::steady_clock::now();
     }
@@ -2197,61 +2199,9 @@ class LlmRequest : public GenericLlmRequest
     using TokenExtraIdType = Base::TokenExtraIdType;
     using VecTokenExtraIds = Base::VecTokenExtraIds;

-    // 49 parameters, 49 parameters in Base class constructor
-    LlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, std::shared_ptr inputTokens,
-        runtime::SamplingConfig const& samplingConfig, bool isStreaming, std::optional endId = std::nullopt,
-        std::optional padId = std::nullopt, std::optional embeddingBias = std::nullopt,
-        std::optional badWordsList = std::nullopt, std::optional stopWordsList = std::nullopt,
-        std::optional>> positionIds = std::nullopt,
-        std::optional promptEmbeddingTable = std::nullopt,
-        std::optional promptVocabSize = std::nullopt,
-        std::optional>>> multimodalHashes = std::nullopt,
-        std::optional>> multimodalPositions = std::nullopt,
-        std::optional>> multimodalLengths = std::nullopt,
-        std::optional multimodalEmbedding = std::nullopt,
-        std::optional mropeRotaryCosSin = std::nullopt,
-        std::optional mropePositionDeltas = std::nullopt,
-        std::optional loraTaskId = std::nullopt, std::optional loraWeights = std::nullopt,
-        std::optional loraConfig = std::nullopt,
-        std::optional lookaheadConfig = std::nullopt,
-        std::optional kvCacheRetentionConfig = std::nullopt,
-        bool returnLogProbs = false, bool returnContextLogits = false, bool returnGenerationLogits = false,
-        std::optional> const& draftTokens = std::nullopt,
-        std::optional draftLogits = std::nullopt, bool excludeInputFromOutput = false,
-        std::optional logitsPostProcessor = std::nullopt,
-        bool applyLogitsPostProcessorBatched = false,
-        std::optional> encoderInputTokens = std::nullopt, bool returnEncoderOutput = false,
-        std::optional clientId = std::nullopt,
-        executor::PriorityType priority = executor::Request::kDefaultPriority,
-        std::optional encoderInputFeatures = std::nullopt,
-        std::optional encoderOutputLength = std::nullopt,
-        std::optional crossAttentionMask = std::nullopt,
-        LlmRequestType llmRequestType = LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION,
-        std::optional> inputTokenExtraIds = std::nullopt,
-        SizeType32 numReturnSequences = 1, std::optional eagleConfig = std::nullopt,
-        std::optional skipCrossAttnBlocks = std::nullopt, bool returnPerfMetrics = false,
-        std::optional guidedDecodingParams = std::nullopt,
-        std::optional languageAdapterUid = std::nullopt,
-        std::optional allottedTimeMs = std::nullopt,
-        std::optional const& contextPhaseParams = std::nullopt,
-        std::optional cacheSaltID = std::nullopt)
-        : Base(requestId, maxNewTokens, std::move(inputTokens), samplingConfig, isStreaming, endId, padId,
-            std::move(embeddingBias), std::move(badWordsList), std::move(stopWordsList), std::move(positionIds),
-            std::move(promptEmbeddingTable), promptVocabSize, std::move(multimodalHashes),
-            std::move(multimodalPositions), std::move(multimodalLengths), std::move(multimodalEmbedding),
-            std::move(mropeRotaryCosSin), mropePositionDeltas, loraTaskId, std::move(loraWeights),
-            std::move(loraConfig), std::move(lookaheadConfig), std::move(kvCacheRetentionConfig), returnLogProbs,
-            returnContextLogits, returnGenerationLogits, std::move(draftTokens), std::move(draftLogits),
-            excludeInputFromOutput, std::move(logitsPostProcessor), applyLogitsPostProcessorBatched,
-            std::move(encoderInputTokens), returnEncoderOutput, clientId, priority, std::move(encoderInputFeatures),
-            std::move(encoderOutputLength), std::move(crossAttentionMask), llmRequestType,
-            std::move(inputTokenExtraIds), numReturnSequences, std::move(eagleConfig), std::move(skipCrossAttnBlocks),
-            returnPerfMetrics, std::move(guidedDecodingParams), languageAdapterUid, allottedTimeMs, contextPhaseParams,
-            cacheSaltID)
-    {
-    }
-
-    // 49 parameters, 49 parameters in Base class constructor
+    // inherit constructors
+    using Base::Base;
+
     LlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, std::vector inputTokens,
         runtime::SamplingConfig const& samplingConfig, bool isStreaming, std::optional endId = std::nullopt,
         std::optional padId = std::nullopt, std::optional embeddingBias = std::nullopt,
@@ -2286,7 +2236,7 @@ class LlmRequest : public GenericLlmRequest
         std::optional languageAdapterUid = std::nullopt,
         std::optional allottedTimeMs = std::nullopt,
         std::optional const& contextPhaseParams = std::nullopt,
-        std::optional cacheSaltID = std::nullopt)
+        std::optional cacheSaltID = std::nullopt, std::optional arrivalTime = std::nullopt)
         : Base(requestId, maxNewTokens, std::make_shared>(std::move(inputTokens)),
             samplingConfig, isStreaming, endId, padId, std::move(embeddingBias), std::move(badWordsList),
             std::move(stopWordsList),
@@ -2316,37 +2266,8 @@ class LlmRequest : public GenericLlmRequest
             inputTokenExtraIds ? std::make_optional(std::make_shared(std::move(*inputTokenExtraIds)))
                                : std::optional>(std::nullopt),
             numReturnSequences, std::move(eagleConfig), skipCrossAttnBlocks, returnPerfMetrics,
-            std::move(guidedDecodingParams), languageAdapterUid, allottedTimeMs, contextPhaseParams, cacheSaltID)
-    {
-    }
-
-    // 32 parameters, 32 parameters in Base class constructor
-    LlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, VecTokens const& inputTokens,
-        runtime::SamplingConfig const& samplingConfig, bool isStreaming, std::optional endId = std::nullopt,
-        std::optional padId = std::nullopt, std::optional embeddingBias = std::nullopt,
-        std::optional badWordsList = std::nullopt, std::optional stopWordsList = std::nullopt,
-        std::optional>> positionIds = std::nullopt,
-        std::optional promptEmbeddingTable = std::nullopt,
-        std::optional promptVocabSize = std::nullopt,
-        std::optional loraTaskId = std::nullopt, std::optional loraWeights = std::nullopt,
-        std::optional loraConfig = std::nullopt,
-        std::optional lookaheadConfig = std::nullopt, bool returnLogProbs = false,
-        bool returnContextLogits = false, bool returnGenerationLogits = false,
-        std::optional draftTokens = std::nullopt, std::optional draftLogits = std::nullopt,
-        bool excludeInputFromOutput = false, std::optional logitsPostProcessor = std::nullopt,
-        bool applyLogitsPostProcessorBatched = false, std::optional encoderInputTokens = std::nullopt,
-        bool returnEncoderOutput = false, std::optional clientId = std::nullopt,
-        executor::PriorityType priority = executor::Request::kDefaultPriority, SizeType32 numReturnSequences = 1,
-        std::optional languageAdapterUid = std::nullopt,
-        std::optional const& contextPhaseParams = std::nullopt,
-        std::optional cacheSaltID = std::nullopt)
-        : Base(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, endId, padId,
-            std::move(embeddingBias), std::move(badWordsList), std::move(stopWordsList), std::move(positionIds),
-            std::move(promptEmbeddingTable), promptVocabSize, loraTaskId, std::move(loraWeights), std::move(loraConfig),
-            lookaheadConfig, returnLogProbs, returnContextLogits, returnGenerationLogits, std::move(draftTokens),
-            std::move(draftLogits), excludeInputFromOutput, std::move(logitsPostProcessor),
-            applyLogitsPostProcessorBatched, std::move(encoderInputTokens), returnEncoderOutput, clientId, priority,
-            numReturnSequences, languageAdapterUid, contextPhaseParams, cacheSaltID)
+            std::move(guidedDecodingParams), languageAdapterUid, allottedTimeMs, contextPhaseParams, cacheSaltID,
+            arrivalTime)
     {
     }

diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
index 94f15939f02..e0325b51c8a 100644
--- a/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
+++ b/cpp/tensorrt_llm/nanobind/batch_manager/bindings.cpp
@@ -32,6 +32,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -289,7 +290,8 @@ void initBindings(nb::module_& m)
                std::optional language_adapter_uid,
                std::optional allotted_time_ms,
                std::optional context_phase_params,
-                std::optional cache_salt_id)
+                std::optional cache_salt_id,
+                std::optional arrival_time)
            {
                auto makeOptionalTensor = [](std::optional const& atTensor, bool unsqueeze = false)
                {
@@ -329,8 +331,8 @@ void initBindings(nb::module_& m)
                    encoder_input_tokens, return_encoder_output, client_id, priority, encoder_input_features_tensor_ptr,
                    encoder_output_length, cross_attention_mask_tensor_ptr, llm_request_type, input_token_extra_ids,
                    num_return_sequences, eagle_config, skip_cross_attn_blocks_tensor_ptr, return_perf_metrics,
-                    guided_decoding_params, language_adapter_uid, allotted_time_ms, context_phase_params,
-                    cache_salt_id};
+                    guided_decoding_params, language_adapter_uid, allotted_time_ms, context_phase_params, cache_salt_id,
+                    arrival_time};
            },
            nb::arg("request_id"), nb::arg("max_new_tokens"), nb::arg("input_tokens"), nb::arg("sampling_config"),
            nb::arg("is_streaming"), nb::arg("end_id") = std::nullopt, nb::arg("pad_id") = std::nullopt,
@@ -355,7 +357,8 @@ void initBindings(nb::module_& m)
            nb::arg("eagle_config") = std::nullopt, nb::arg("skip_cross_attn_blocks") = std::nullopt,
            nb::arg("return_perf_metrics") = false, nb::arg("guided_decoding_params") = std::nullopt,
            nb::arg("language_adapter_uid") = std::nullopt, nb::arg("allotted_time_ms") = std::nullopt,
-            nb::arg("context_phase_params") = std::nullopt, nb::arg("cache_salt_id") = std::nullopt)
+            nb::arg("context_phase_params") = std::nullopt, nb::arg("cache_salt_id") = std::nullopt,
+            nb::arg("arrival_time") = std::nullopt)
        .def("check_token_id_range", &tb::LlmRequest::checkTokenIdRange, nb::arg("vocab_size"))
        .def(nb::init())
        .def("validate", &tb::LlmRequest::validate, nb::arg("max_input_len"), nb::arg("max_seq_len"),
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.cpp
index 46bfa0de64a..07d630cb3b2 100644
--- a/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.cpp
+++ b/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.cpp
@@ -126,6 +126,7 @@ std::shared_ptr LlmRequest::toTrtLlm() const
        mLanguageAdapterUid,                   //
        mAllottedTimeMs,                       //
        mContextPhaseParams,                   //
-        mCacheSaltID                           //
+        mCacheSaltID,                          //
+        mPerfMetrics.timingMetrics.arrivalTime //
    );
 }
diff --git a/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.h b/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.h
index b3d6f04aef8..4ea47fdcc8c 100644
--- a/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.h
+++ b/cpp/tensorrt_llm/nanobind/batch_manager/llmRequest.h
@@ -85,7 +85,8 @@ class LlmRequest : public tb::GenericLlmRequest
        std::optional languageAdapterUid = std::nullopt,
        std::optional allottedTimeMs = std::nullopt,
        std::optional const& contextPhaseParams = std::nullopt,
-        std::optional cacheSaltID = std::nullopt)
+        std::optional cacheSaltID = std::nullopt,
+        std::optional arrivalTime = std::nullopt)
        : Base(requestId,                                      //
            maxNewTokens,                                      //
            std::make_shared>(std::move(inputTokens)),         //
@@ -147,7 +148,8 @@ class LlmRequest : public tb::GenericLlmRequest
            languageAdapterUid, //
            allottedTimeMs,     //
            contextPhaseParams, //
-            cacheSaltID         //
+            cacheSaltID,        //
+            arrivalTime         //
        )
    {
    }
diff --git a/cpp/tensorrt_llm/nanobind/bindings.cpp b/cpp/tensorrt_llm/nanobind/bindings.cpp
index 21a5d4fbddd..b652adb341b 100644
--- a/cpp/tensorrt_llm/nanobind/bindings.cpp
+++ b/cpp/tensorrt_llm/nanobind/bindings.cpp
@@ -19,6 +19,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -511,4 +512,6 @@ NB_MODULE(TRTLLM_NB_MODULE, m)
     m.def("ipc_nvls_allocate", &tr::ipcNvlsAllocate, nb::rv_policy::reference);
     m.def("ipc_nvls_free", &tr::ipcNvlsFree);
     m.def("ipc_nvls_supported", &tr::ipcNvlsSupported);
+
+    m.def("steady_clock_now", []() { return std::chrono::steady_clock::now(); });
 }
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp
index dffe8ad1977..9bcd22e39e4 100644
--- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp
+++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp
@@ -295,7 +295,8 @@ void initBindings(pybind11::module_& m)
                std::optional language_adapter_uid,
                std::optional allotted_time_ms,
                std::optional context_phase_params,
-                std::optional cache_salt_id)
+                std::optional cache_salt_id,
+                std::optional arrival_time)
            {
                auto makeOptionalTensor = [](std::optional const& atTensor, bool unsqueeze = false)
                {
@@ -336,7 +337,7 @@ void initBindings(pybind11::module_& m)
                    encoder_input_features_tensor_ptr, encoder_output_length, cross_attention_mask_tensor_ptr,
                    llm_request_type, input_token_extra_ids, num_return_sequences, eagle_config,
                    skip_cross_attn_blocks_tensor_ptr, return_perf_metrics, guided_decoding_params,
-                    language_adapter_uid, allotted_time_ms, context_phase_params, cache_salt_id};
+                    language_adapter_uid, allotted_time_ms, context_phase_params, cache_salt_id, arrival_time};
            }),
            py::arg("request_id"), py::arg("max_new_tokens"), py::arg("input_tokens"), py::arg("sampling_config"),
            py::arg("is_streaming"), py::arg("end_id") = std::nullopt, py::arg("pad_id") = std::nullopt,
@@ -362,7 +363,8 @@ void initBindings(pybind11::module_& m)
            py::arg("eagle_config") = std::nullopt, py::arg("skip_cross_attn_blocks") = std::nullopt,
            py::arg("return_perf_metrics") = false, py::arg("guided_decoding_params") = std::nullopt,
            py::arg("language_adapter_uid") = std::nullopt, py::arg("allotted_time_ms") = std::nullopt,
-            py::arg("context_phase_params") = std::nullopt, py::arg("cache_salt_id") = std::nullopt)
+            py::arg("context_phase_params") = std::nullopt, py::arg("cache_salt_id") = std::nullopt,
+            py::arg("arrival_time") = std::nullopt)
        .def("check_token_id_range", &tb::LlmRequest::checkTokenIdRange, py::arg("vocab_size"))
        .def(py::init())
        .def("validate", &tb::LlmRequest::validate, py::arg("max_input_len"), py::arg("max_seq_len"),
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.cpp b/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.cpp
index 9b5c4bc1298..bcc9d4bf13f 100644
--- a/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.cpp
+++ b/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.cpp
@@ -125,6 +125,7 @@ std::shared_ptr LlmRequest::toTrtLlm() const
        mLanguageAdapterUid,                   //
        mAllottedTimeMs,                       //
        mContextPhaseParams,                   //
-        mCacheSaltID                           //
+        mCacheSaltID,                          //
+        mPerfMetrics.timingMetrics.arrivalTime //
    );
 }
diff --git a/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.h b/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.h
index 8d004cb304f..b43fb8dd073 100644
--- a/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.h
+++ b/cpp/tensorrt_llm/pybind/batch_manager/llmRequest.h
@@ -85,7 +85,8 @@ class LlmRequest : public tb::GenericLlmRequest
        std::optional languageAdapterUid = std::nullopt,
        std::optional allottedTimeMs = std::nullopt,
        std::optional const& contextPhaseParams = std::nullopt,
-        std::optional cacheSaltID = std::nullopt)
+        std::optional cacheSaltID = std::nullopt,
+        std::optional arrivalTime = std::nullopt)
        : Base(requestId,                                      //
            maxNewTokens,                                      //
            std::make_shared>(std::move(inputTokens)),         //
@@ -147,7 +148,8 @@ class LlmRequest : public tb::GenericLlmRequest
            languageAdapterUid, //
            allottedTimeMs,     //
            contextPhaseParams, //
-            cacheSaltID         //
+            cacheSaltID,        //
+            arrivalTime         //
        )
    {
    }
diff --git a/cpp/tensorrt_llm/pybind/bindings.cpp b/cpp/tensorrt_llm/pybind/bindings.cpp
index 2b35fbf6946..e6f28548c20 100644
--- a/cpp/tensorrt_llm/pybind/bindings.cpp
+++ b/cpp/tensorrt_llm/pybind/bindings.cpp
@@ -16,6 +16,7 @@
  */

 #include
+#include
 #include
 #include
 #include
@@ -498,4 +499,6 @@ PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m)
     m.def("ipc_nvls_allocate", &tr::ipcNvlsAllocate, py::return_value_policy::reference);
     m.def("ipc_nvls_free", &tr::ipcNvlsFree);
     m.def("ipc_nvls_supported", &tr::ipcNvlsSupported);
+
+    m.def("steady_clock_now", []() { return std::chrono::steady_clock::now(); });
 }
diff --git a/tensorrt_llm/_torch/pyexecutor/llm_request.py b/tensorrt_llm/_torch/pyexecutor/llm_request.py
index c6915fbd663..3d21238ee86 100644
--- a/tensorrt_llm/_torch/pyexecutor/llm_request.py
+++ b/tensorrt_llm/_torch/pyexecutor/llm_request.py
@@ -562,6 +562,7 @@ def executor_request_to_llm_request(
         llm_request_type=llm_request_type,
         context_phase_params=executor_request.context_phase_params,
         cache_salt_id=executor_request.cache_salt_id,
+        arrival_time=getattr(executor_request, "py_arrival_time", None),
         py_multimodal_data=getattr(executor_request, "py_multimodal_data", None))

     if child_req_ids:
diff --git a/tensorrt_llm/executor/executor.py b/tensorrt_llm/executor/executor.py
index c9d55a7cfc1..10a90457586 100644
--- a/tensorrt_llm/executor/executor.py
+++ b/tensorrt_llm/executor/executor.py
@@ -125,6 +125,7 @@ def generate_async(
         multimodal_params: Optional[MultimodalParams] = None,
         scheduling_params: Optional[SchedulingParams] = None,
         cache_salt_id: Optional[int] = None,
+        arrival_time: Optional[float] = None,
     ) -> GenerationResult:
         """Generate output for the given prompt token ids in the asynchronous mode.
         Asynchronous generation accepts single prompt only.
@@ -149,7 +150,8 @@ def generate_async(
             disaggregated_params=disaggregated_params,
             multimodal_params=multimodal_params,
             scheduling_params=scheduling_params,
-            cache_salt_id=cache_salt_id)
+            cache_salt_id=cache_salt_id,
+            arrival_time=arrival_time)
         result = self.submit(request)
         # release memory in time
         if hasattr(request, "multimodal_params"):
diff --git a/tensorrt_llm/executor/request.py b/tensorrt_llm/executor/request.py
index 1030e57f091..6d4b1be1b62 100644
--- a/tensorrt_llm/executor/request.py
+++ b/tensorrt_llm/executor/request.py
@@ -98,6 +98,7 @@ def __init__(
         multimodal_params: Optional[MultimodalParams] = None,
         scheduling_params: Optional[SchedulingParams] = None,
         cache_salt_id: Optional[int] = None,
+        arrival_time: Optional[float] = None,
     ):
         if isinstance(prompt_token_ids, list):
             self.prompt_token_ids = prompt_token_ids
@@ -124,6 +125,7 @@ def __init__(
         self.disaggregated_params = disaggregated_params
         self.scheduling_params = scheduling_params
         self.cache_salt_id = cache_salt_id
+        self.arrival_time = arrival_time

     def set_id(self, id):
         assert self.id is None, f"Request ID is already set: {self.id}"
diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py
index b636985c137..4a7f49ea14e 100644
--- a/tensorrt_llm/executor/worker.py
+++ b/tensorrt_llm/executor/worker.py
@@ -593,6 +593,9 @@ def _deduce_max_tokens(request: GenerationRequest,
         if self._is_pytorch_backend and request.scheduling_params is not None:
             executor_request.py_scheduling_params = request.scheduling_params

+        if request.arrival_time is not None:
+            executor_request.py_arrival_time = request.arrival_time
+
         if request.query_token_ids is not None:
             # pytorch star attention workflow
             # a workaround to avoid public interface update
diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py
index b4c7b4cfbe0..b2665b587ec 100644
--- a/tensorrt_llm/llmapi/llm.py
+++ b/tensorrt_llm/llmapi/llm.py
@@ -18,6 +18,7 @@

 from .._utils import nvtx_range_debug
 from ..bindings import executor as tllm
+from ..bindings import steady_clock_now
 from ..builder import EngineConfig
 from ..disaggregated_params import DisaggregatedParams
 from ..executor import (DetokenizedGenerationResultBase, GenerationExecutor,
@@ -350,6 +351,9 @@ def generate_async(
         if self._executor is None or self._executor.is_shutdown():
             raise RuntimeError("LLM is shutting down")

+        arrival_time = steady_clock_now(
+        ) if self.args.return_perf_metrics else None
+
         sampling_params = self._prepare_sampling_params(sampling_params)
         cache_salt_id = get_cache_salt_id(
             cache_salt) if cache_salt is not None else None
@@ -448,6 +452,7 @@ def generate_async(
             multimodal_params=multimodal_params,
             scheduling_params=scheduling_params,
             cache_salt_id=cache_salt_id,
+            arrival_time=arrival_time,
         )

         return RequestOutput._from_generation_result(result, prompt,
diff --git a/tensorrt_llm/serve/openai_server.py b/tensorrt_llm/serve/openai_server.py
index 18dbac44c37..910f39dc1e6 100644
--- a/tensorrt_llm/serve/openai_server.py
+++ b/tensorrt_llm/serve/openai_server.py
@@ -427,9 +427,6 @@ async def create_chat_response(
             vocab_size=self.tokenizer.tokenizer.vocab_size,
             gather_generation_logits=self.llm.args.gather_generation_logits,
             backend=self.llm.args.backend)
-        # TODO: better way to enable metrics
-        if len(os.getenv("TRTLLM_KVCACHE_TIME_OUTPUT_PATH", "")) > 0:
-            sampling_params.return_perf_metrics = True

         postproc_args = ChatPostprocArgs.from_request(request)
         disaggregated_params = to_llm_disaggregated_params(request.disaggregated_params)
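
Usage sketch (not part of the patch): with these changes, generate_async() in llm.py stamps each request with steady_clock_now() on entry whenever return_perf_metrics is enabled, and the timestamp is threaded through GenerationRequest.arrival_time, executor_request.py_arrival_time, and the new LlmRequest arrival_time argument, so the reported arrival time reflects when the request reached the Python API rather than when the C++ LlmRequest was constructed. The snippet below only illustrates the intended effect; the model path and the request_perf_metrics attribute path on the result object are assumptions, not part of this diff.

# Illustration only -- not part of the patch. The model path and the
# request_perf_metrics attribute path on the result are assumptions.
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.bindings import steady_clock_now  # binding added by this diff

# self.args.return_perf_metrics gates the arrival-time capture in the llm.py hunk
# above; passing it as an LLM constructor argument is assumed here.
llm = LLM(model="/path/to/model", return_perf_metrics=True)

sampling_params = SamplingParams(max_tokens=8)
sampling_params.return_perf_metrics = True  # same flag the removed openai_server.py code set

t_before = steady_clock_now()  # same steady clock the C++ timing metrics use
output = llm.generate("Hello, world", sampling_params)

# Hypothetical attribute path: the reported arrival time should now fall at or after
# t_before, since it is captured on generate_async() entry and forwarded to the
# C++ LlmRequest instead of being taken when that object is constructed.
metrics = output.outputs[0].request_perf_metrics
print(t_before, metrics.timing_metrics.arrival_time)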