remove further options that were never used
gedoensmax committed Sep 22, 2025
commit f92a8c3fff3492c36679adb0cdc8359d069f23f8
@@ -751,29 +751,6 @@ NvExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId devi
}
}

NvExecutionProvider::PerThreadContext::~PerThreadContext() {
trt_context_map_.clear();
}

void NvExecutionProvider::PerThreadContext::ResetTensorRTContext(std::string fused_node) {
auto it = trt_context_map_.find(fused_node);
if (it != trt_context_map_.end()) {
trt_context_map_[fused_node].reset();
}
}

bool NvExecutionProvider::PerThreadContext::UpdateTensorRTContext(std::string fused_node, tensorrt_ptr::unique_pointer_exec_ctx context) {
if (!context) {
context = tensorrt_ptr::unique_pointer_exec_ctx();
}
trt_context_map_[fused_node] = std::move(context);

if (trt_context_map_[fused_node]) {
return true;
}
return false;
}

void NvExecutionProvider::PerThreadContext::DeleteCapturedGraph(CudaGraphAnnotation_t cuda_graph_annotation_id) {
graph_id_to_run_count_.erase(cuda_graph_annotation_id);
cuda_graph_.Reset();
@@ -854,24 +831,6 @@ void NvExecutionProvider::PerThreadContext::IncrementRegularRunCountBeforeGraphC
graph_id_to_run_count_[cuda_graph_annotation_id]++;
}

bool NvExecutionProvider::PerThreadContext::IsTensorRTContextInMap(std::string fused_node) {
auto it = trt_context_map_.find(fused_node);
if (it != trt_context_map_.end()) {
return true;
}
return false;
}

nvinfer1::IExecutionContext& NvExecutionProvider::PerThreadContext::GetTensorRTContext(std::string fused_node) {
auto it = trt_context_map_.find(fused_node);
if (it != trt_context_map_.end()) {
return *(it->second.get()); // dereference shared pointer
}
auto context = tensorrt_ptr::unique_pointer_exec_ctx();
trt_context_map_[fused_node] = std::move(context);
return *(trt_context_map_[fused_node].get()); // dereference shared pointer
}

void NvExecutionProvider::ReleasePerThreadContext() const {
const auto& per_thread_context_cache = PerThreadContextCache();

@@ -1015,13 +974,6 @@ NvExecutionProvider::NvExecutionProvider(const NvExecutionProviderInfo& info)
ep_context_file_path_ = info.ep_context_file_path;
ep_context_embed_mode_ = info.ep_context_embed_mode;
enable_engine_cache_for_ep_context_model();
cache_prefix_ = info.engine_cache_prefix;
// use a more global cache if given
engine_decryption_enable_ = info.engine_decryption_enable;
if (engine_decryption_enable_) {
engine_decryption_lib_path_ = info.engine_decryption_lib_path;
}
force_sequential_engine_build_ = info.force_sequential_engine_build;
sparsity_enable_ = info.sparsity_enable;
auxiliary_streams_ = info.auxiliary_streams;
profile_min_shapes = info.profile_min_shapes;
@@ -1119,20 +1071,6 @@ NvExecutionProvider::NvExecutionProvider(const NvExecutionProviderInfo& info)
cache_path_ = GetPathOrParentPathOfCtxModel(ep_context_file_path_).append(cache_path_).string();
}

if (engine_decryption_enable_) {
LIBTYPE handle = OPENLIB(engine_decryption_lib_path_.c_str());
if (handle == nullptr) {
ORT_THROW_IF_ERROR(ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"NvTensorRTRTX EP could not open shared library from " + engine_decryption_lib_path_));
}
engine_decryption_ = (int (*)(const char*, char*, size_t*))LIBFUNC(handle, "decrypt");
engine_encryption_ = (int (*)(const char*, char*, size_t))LIBFUNC(handle, "encrypt");
if (engine_decryption_ == nullptr) {
ORT_THROW_IF_ERROR(ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"NvTensorRTRTX EP could not find decryption function in shared library from " + engine_decryption_lib_path_));
}
}

// cuda graph:
// cudaStreamSynchronize() is not allowed in cuda graph capture.
//
@@ -1162,16 +1100,12 @@ NvExecutionProvider::NvExecutionProvider(const NvExecutionProviderInfo& info)
<< ", nv_dump_subgraphs: " << dump_subgraphs_
<< ", nv_weight_stripped_engine_enable: " << weight_stripped_engine_enable_
<< ", nv_onnx_model_folder_path: " << onnx_model_folder_path_
<< ", nv_engine_decryption_enable: " << engine_decryption_enable_
<< ", nv_engine_decryption_lib_path: " << engine_decryption_lib_path_
<< ", nv_force_sequential_engine_build: " << force_sequential_engine_build_
<< ", nv_sparsity_enable: " << sparsity_enable_
<< ", nv_auxiliary_streams: " << auxiliary_streams_
<< ", enable_cuda_graph: " << cuda_graph_enable_
<< ", nv_dump_ep_context_model: " << dump_ep_context_model_
<< ", nv_ep_context_file_path: " << ep_context_file_path_
<< ", nv_ep_context_embed_mode: " << ep_context_embed_mode_
<< ", nv_cache_prefix: " << cache_prefix_
<< ", nv_onnx_model_bytestream_size_: " << onnx_model_bytestream_size_
<< ", nv_onnx_external_bytestream_size_: " << onnx_external_data_bytestream_size_
<< ", nv_use_external_data_initializer_: " << use_external_data_initializer_
@@ -2815,16 +2749,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr
if (dump_ep_context_model_) {
// "ep_cache_context" node attribute should be a relative path to context model directory

std::string cache_path = "";
// Customize cache prefix if assigned
if (!cache_prefix_.empty()) {
// Generate cache suffix in case user would like to customize cache prefix
cache_path = GetCachePath(cache_path_, cache_prefix_) + fused_node.Name() + ".engine";
;
} else {
cache_path = GetCachePath(cache_path_, fused_node.Name()) + ".engine";
;
}
std::string cache_path = GetCachePath(cache_path_, fused_node.Name()) + ".engine";
// NV TRT EP per default generates hardware compatible engines for any RTX device with compute capability > 80
std::string compute_capability_hw_compat = "80+";
if (!ep_context_model_) {
@@ -2919,9 +2844,8 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr
input_shape_ranges_[context->node_name], &tensorrt_mu_,
engine_cache_enable_, cache_path_,
runtime_.get(), profiles_[context->node_name],
engine_decryption_enable_, engine_decryption_, engine_encryption_,
detailed_build_log_, sparsity_enable_,
auxiliary_streams_, cuda_graph_enable_, is_dynamic_shape_context, cache_prefix_};
auxiliary_streams_, cuda_graph_enable_, is_dynamic_shape_context};
*state = p.release();
return 0;
};
@@ -3471,8 +3395,8 @@ void NvExecutionProvider::RegisterStreamHandlers(IStreamCommandHandleRegistry& s
true /* release_cpu_buffer_on_cuda_stream */,
stream_,
external_stream_ /* use_existing_stream */,
external_cudnn_handle_,
external_cublas_handle_,
nullptr,
nullptr,
{});
}

56 changes: 2 additions & 54 deletions onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.h
@@ -228,16 +228,11 @@ struct TensorrtFuncState {
std::string engine_cache_path;
nvinfer1::IRuntime* runtime = nullptr;
std::vector<nvinfer1::IOptimizationProfile*> profiles;
bool engine_decryption_enable = false;
int (*engine_decryption)(const char*, char*, size_t*) = nullptr;
int (*engine_encryption)(const char*, char*, size_t) = nullptr;
bool detailed_build_log = false;
bool sparsity_enable = false;
int auxiliary_streams = -1;
bool cuda_graph_enable = 0;
bool is_dynamic_shape = false;
std::string cache_prefix;
std::string cache_suffix;
// runtime parameters
std::vector<IAllocatorUniquePtr<void>> scratch_buffers;
std::vector<TensorParams> input_tensors;
@@ -289,14 +284,6 @@ class NvExecutionProvider : public IExecutionProvider {
// explicit NvExecutionProvider(const ProviderOptions& provider_options_map, const ConfigOptions* config_options);
virtual ~NvExecutionProvider();

cublasHandle_t PerThreadDefaultCublasHandle() {
return GetPerThreadContext().CublasHandle();
}

cudnnHandle_t PerThreadDefaultCudnnHandle() {
return GetPerThreadContext().CudnnHandle();
}

virtual std::shared_ptr<KernelRegistry> GetKernelRegistry() const override;
std::unique_ptr<IDataTransfer> GetDataTransfer() const override;

@@ -351,7 +338,6 @@ class NvExecutionProvider : public IExecutionProvider {
size_t min_subgraph_size_ = 1;
size_t max_workspace_size_ = 0;
size_t max_shared_mem_size_ = 0;
bool force_sequential_engine_build_ = false;
bool dump_subgraphs_ = false;
bool engine_cache_enable_ = false;
bool weight_stripped_engine_enable_ = false;
@@ -364,21 +350,17 @@ class NvExecutionProvider : public IExecutionProvider {
size_t onnx_external_data_bytestream_size_ = 0;
bool sparsity_enable_ = false;
int auxiliary_streams_ = -1;
std::string cache_path_, engine_decryption_lib_path_;
std::string cache_path_;
std::unique_ptr<nvinfer1::IRuntime> runtime_ = nullptr;
std::mutex tensorrt_mu_;
int device_id_;
std::string compute_capability_;
size_t max_ctx_mem_size_ = 0;
mutable char model_path_[4096] = {}; // Reserved for max path length
bool engine_decryption_enable_ = false;
int (*engine_decryption_)(const char*, char*, size_t*) = nullptr;
int (*engine_encryption_)(const char*, char*, size_t) = nullptr;
bool detailed_build_log_ = false;
bool cuda_graph_enable_ = false;
bool multi_profile_enable_ = false;
std::filesystem::path runtime_cache_;
std::string cache_prefix_;
std::string op_types_to_exclude_;
int nv_profile_index_ = 0;
std::unique_ptr<onnxruntime::Model> ep_context_model_;
@@ -420,10 +402,6 @@ class NvExecutionProvider : public IExecutionProvider {
std::unordered_map<std::string, std::vector<nvinfer1::IOptimizationProfile*>> profiles_;
std::unordered_map<std::string, DDSOutputAllocatorMap> dds_output_allocator_maps_;

// for external stream, we need to create its cudnn/cublass handle before cuda EP enable cuda graph capture
cudnnHandle_t external_cudnn_handle_ = nullptr;
cublasHandle_t external_cublas_handle_ = nullptr;

// Call cudaStreamSynchronize() after TRT enqueueV3()
mutable bool sync_stream_after_enqueue_ = true;

@@ -434,20 +412,7 @@ class NvExecutionProvider : public IExecutionProvider {
class PerThreadContext final {
public:
PerThreadContext(OrtDevice::DeviceId device_id, bool has_user_compute_stream, cudaStream_t stream);
~PerThreadContext();

cublasHandle_t CublasHandle() const {
return external_cublas_handle_;
}

cudnnHandle_t CudnnHandle() const {
return external_cudnn_handle_;
}

bool IsTensorRTContextInMap(std::string fused_node);
nvinfer1::IExecutionContext& GetTensorRTContext(std::string fused_node);
bool UpdateTensorRTContext(std::string fused_node, tensorrt_ptr::unique_pointer_exec_ctx context);
void ResetTensorRTContext(std::string fused_node);
~PerThreadContext() = default;

// CUDA Graph management
void SetCudaGraphStream(cudaStream_t stream) { cuda_graph_.SetStream(stream); }
@@ -465,23 +430,6 @@ class NvExecutionProvider : public IExecutionProvider {
void DeleteCapturedGraph(CudaGraphAnnotation_t cuda_graph_annotation_id);

private:
cudnnHandle_t external_cudnn_handle_ = nullptr;
cublasHandle_t external_cublas_handle_ = nullptr;

// Maintaining execution context on a per thread basis is suggested by TRT doc.
// Also, for enqueueV2() in execution context, to perform inference concurrently in multiple streams, use one execution context per stream.
// ORT multi-streams feature uses one stream for one thread, therefore maintaining execution context on a per thread basis is necessary for TRT EP,
// otherwise it may result in undefined behavior or synchronization issues.
//
// See more details here:
// https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading
// https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_execution_context.html#a63cd95430852038ce864e17c670e0b36
std::unordered_map<std::string, tensorrt_ptr::unique_pointer_exec_ctx> trt_context_map_;

// The profile shape ranges for the engine that the execution context maintained by the PerThreadContext is built with.
// TRT EP needs this info to determine whether to rebuild the execution context.
std::unordered_map<std::string, ShapeRangesMap> input_shape_ranges_;

// Cuda graph with multi threads will be supported in the future, so cuda_graph_ is put under PerThreadContext.
// ORT TRT only supports CUDA graph when whole model is supported by TRT, so simply maintaining a CUDAGraph instance is enough (no need to maintain one CUDAGraph instance per TRT subgraph)
CUDAGraph cuda_graph_;
@@ -26,17 +26,13 @@ struct NvExecutionProviderInfo {
size_t max_workspace_size{0};
size_t max_shared_mem_size{0};
bool dump_subgraphs{false};
std::string engine_cache_path{""};
bool weight_stripped_engine_enable{false};
std::string onnx_model_folder_path{""};
const void* onnx_bytestream{nullptr};
size_t onnx_bytestream_size{0};
bool use_external_data_initializer{false};
const void* external_data_bytestream{nullptr};
size_t external_data_bytestream_size{0};
bool engine_decryption_enable{false};
std::string engine_decryption_lib_path{""};
bool force_sequential_engine_build{false};
std::string runtime_cache_path{""};
bool detailed_build_log{false};
bool sparsity_enable{false};
@@ -49,7 +45,6 @@ struct NvExecutionProviderInfo {
bool dump_ep_context_model{false};
std::string ep_context_file_path{""};
int ep_context_embed_mode{0};
std::string engine_cache_prefix{""};
std::string op_types_to_exclude{""};

static NvExecutionProviderInfo FromProviderOptions(const ProviderOptions& options,