remove further options that were never used
gedoensmax committed Sep 22, 2025
commit f92a8c3fff3492c36679adb0cdc8359d069f23f8
@@ -751,29 +751,6 @@ NvExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId devi
}
}

NvExecutionProvider::PerThreadContext::~PerThreadContext() {
trt_context_map_.clear();
}

void NvExecutionProvider::PerThreadContext::ResetTensorRTContext(std::string fused_node) {
auto it = trt_context_map_.find(fused_node);
if (it != trt_context_map_.end()) {
trt_context_map_[fused_node].reset();
}
}

bool NvExecutionProvider::PerThreadContext::UpdateTensorRTContext(std::string fused_node, tensorrt_ptr::unique_pointer_exec_ctx context) {
if (!context) {
context = tensorrt_ptr::unique_pointer_exec_ctx();
}
trt_context_map_[fused_node] = std::move(context);

if (trt_context_map_[fused_node]) {
return true;
}
return false;
}

void NvExecutionProvider::PerThreadContext::DeleteCapturedGraph(CudaGraphAnnotation_t cuda_graph_annotation_id) {
graph_id_to_run_count_.erase(cuda_graph_annotation_id);
cuda_graph_.Reset();
@@ -854,24 +831,6 @@ void NvExecutionProvider::PerThreadContext::IncrementRegularRunCountBeforeGraphC
graph_id_to_run_count_[cuda_graph_annotation_id]++;
}

bool NvExecutionProvider::PerThreadContext::IsTensorRTContextInMap(std::string fused_node) {
auto it = trt_context_map_.find(fused_node);
if (it != trt_context_map_.end()) {
return true;
}
return false;
}

nvinfer1::IExecutionContext& NvExecutionProvider::PerThreadContext::GetTensorRTContext(std::string fused_node) {
auto it = trt_context_map_.find(fused_node);
if (it != trt_context_map_.end()) {
return *(it->second.get()); // dereference shared pointer
}
auto context = tensorrt_ptr::unique_pointer_exec_ctx();
trt_context_map_[fused_node] = std::move(context);
return *(trt_context_map_[fused_node].get()); // dereference shared pointer
}

void NvExecutionProvider::ReleasePerThreadContext() const {
const auto& per_thread_context_cache = PerThreadContextCache();

@@ -1015,13 +974,6 @@ NvExecutionProvider::NvExecutionProvider(const NvExecutionProviderInfo& info)
ep_context_file_path_ = info.ep_context_file_path;
ep_context_embed_mode_ = info.ep_context_embed_mode;
enable_engine_cache_for_ep_context_model();
cache_prefix_ = info.engine_cache_prefix;
// use a more global cache if given
engine_decryption_enable_ = info.engine_decryption_enable;
if (engine_decryption_enable_) {
engine_decryption_lib_path_ = info.engine_decryption_lib_path;
}
force_sequential_engine_build_ = info.force_sequential_engine_build;
sparsity_enable_ = info.sparsity_enable;
auxiliary_streams_ = info.auxiliary_streams;
profile_min_shapes = info.profile_min_shapes;
@@ -1119,20 +1071,6 @@ NvExecutionProvider::NvExecutionProvider(const NvExecutionProviderInfo& info)
cache_path_ = GetPathOrParentPathOfCtxModel(ep_context_file_path_).append(cache_path_).string();
}

if (engine_decryption_enable_) {
LIBTYPE handle = OPENLIB(engine_decryption_lib_path_.c_str());
if (handle == nullptr) {
ORT_THROW_IF_ERROR(ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"NvTensorRTRTX EP could not open shared library from " + engine_decryption_lib_path_));
}
engine_decryption_ = (int (*)(const char*, char*, size_t*))LIBFUNC(handle, "decrypt");
engine_encryption_ = (int (*)(const char*, char*, size_t))LIBFUNC(handle, "encrypt");
if (engine_decryption_ == nullptr) {
ORT_THROW_IF_ERROR(ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL,
"NvTensorRTRTX EP could not find decryption function in shared library from " + engine_decryption_lib_path_));
}
}

// cuda graph:
// cudaStreamSynchronize() is not allowed in cuda graph capture.
//
@@ -1162,16 +1100,12 @@ NvExecutionProvider::NvExecutionProvider(const NvExecutionProviderInfo& info)
<< ", nv_dump_subgraphs: " << dump_subgraphs_
<< ", nv_weight_stripped_engine_enable: " << weight_stripped_engine_enable_
<< ", nv_onnx_model_folder_path: " << onnx_model_folder_path_
<< ", nv_engine_decryption_enable: " << engine_decryption_enable_
<< ", nv_engine_decryption_lib_path: " << engine_decryption_lib_path_
<< ", nv_force_sequential_engine_build: " << force_sequential_engine_build_
<< ", nv_sparsity_enable: " << sparsity_enable_
<< ", nv_auxiliary_streams: " << auxiliary_streams_
<< ", enable_cuda_graph: " << cuda_graph_enable_
<< ", nv_dump_ep_context_model: " << dump_ep_context_model_
<< ", nv_ep_context_file_path: " << ep_context_file_path_
<< ", nv_ep_context_embed_mode: " << ep_context_embed_mode_
<< ", nv_cache_prefix: " << cache_prefix_
<< ", nv_onnx_model_bytestream_size_: " << onnx_model_bytestream_size_
<< ", nv_onnx_external_bytestream_size_: " << onnx_external_data_bytestream_size_
<< ", nv_use_external_data_initializer_: " << use_external_data_initializer_
@@ -2815,16 +2749,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr
if (dump_ep_context_model_) {
// "ep_cache_context" node attribute should be a relative path to context model directory

std::string cache_path = "";
// Customize cache prefix if assigned
if (!cache_prefix_.empty()) {
// Generate cache suffix in case user would like to customize cache prefix
cache_path = GetCachePath(cache_path_, cache_prefix_) + fused_node.Name() + ".engine";
;
} else {
cache_path = GetCachePath(cache_path_, fused_node.Name()) + ".engine";
;
}
std::string cache_path = GetCachePath(cache_path_, fused_node.Name()) + ".engine";
// NV TRT EP per default generates hardware compatible engines for any RTX device with compute capability > 80
std::string compute_capability_hw_compat = "80+";
if (!ep_context_model_) {
@@ -2919,9 +2844,8 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr
input_shape_ranges_[context->node_name], &tensorrt_mu_,
engine_cache_enable_, cache_path_,
runtime_.get(), profiles_[context->node_name],
engine_decryption_enable_, engine_decryption_, engine_encryption_,
detailed_build_log_, sparsity_enable_,
auxiliary_streams_, cuda_graph_enable_, is_dynamic_shape_context, cache_prefix_};
auxiliary_streams_, cuda_graph_enable_, is_dynamic_shape_context};
*state = p.release();
return 0;
};
@@ -3471,8 +3395,8 @@ void NvExecutionProvider::RegisterStreamHandlers(IStreamCommandHandleRegistry& s
true /* release_cpu_buffer_on_cuda_stream */,
stream_,
external_stream_ /* use_existing_stream */,
external_cudnn_handle_,
external_cublas_handle_,
nullptr,
nullptr,
{});
}

56 changes: 2 additions & 54 deletions onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.h
@@ -228,16 +228,11 @@ struct TensorrtFuncState {
std::string engine_cache_path;
nvinfer1::IRuntime* runtime = nullptr;
std::vector<nvinfer1::IOptimizationProfile*> profiles;
bool engine_decryption_enable = false;
int (*engine_decryption)(const char*, char*, size_t*) = nullptr;
int (*engine_encryption)(const char*, char*, size_t) = nullptr;
bool detailed_build_log = false;
bool sparsity_enable = false;
int auxiliary_streams = -1;
bool cuda_graph_enable = 0;
bool is_dynamic_shape = false;
std::string cache_prefix;
std::string cache_suffix;
// runtime parameters
std::vector<IAllocatorUniquePtr<void>> scratch_buffers;
std::vector<TensorParams> input_tensors;
@@ -289,14 +284,6 @@ class NvExecutionProvider : public IExecutionProvider {
// explicit NvExecutionProvider(const ProviderOptions& provider_options_map, const ConfigOptions* config_options);
virtual ~NvExecutionProvider();

cublasHandle_t PerThreadDefaultCublasHandle() {
return GetPerThreadContext().CublasHandle();
}

cudnnHandle_t PerThreadDefaultCudnnHandle() {
return GetPerThreadContext().CudnnHandle();
}

virtual std::shared_ptr<KernelRegistry> GetKernelRegistry() const override;
std::unique_ptr<IDataTransfer> GetDataTransfer() const override;

@@ -351,7 +338,6 @@ class NvExecutionProvider : public IExecutionProvider {
size_t min_subgraph_size_ = 1;
size_t max_workspace_size_ = 0;
size_t max_shared_mem_size_ = 0;
bool force_sequential_engine_build_ = false;
bool dump_subgraphs_ = false;
bool engine_cache_enable_ = false;
bool weight_stripped_engine_enable_ = false;
@@ -364,21 +350,17 @@ class NvExecutionProvider : public IExecutionProvider {
size_t onnx_external_data_bytestream_size_ = 0;
bool sparsity_enable_ = false;
int auxiliary_streams_ = -1;
std::string cache_path_, engine_decryption_lib_path_;
std::string cache_path_;
std::unique_ptr<nvinfer1::IRuntime> runtime_ = nullptr;
std::mutex tensorrt_mu_;
int device_id_;
std::string compute_capability_;
size_t max_ctx_mem_size_ = 0;
mutable char model_path_[4096] = {}; // Reserved for max path length
bool engine_decryption_enable_ = false;
int (*engine_decryption_)(const char*, char*, size_t*) = nullptr;
int (*engine_encryption_)(const char*, char*, size_t) = nullptr;
bool detailed_build_log_ = false;
bool cuda_graph_enable_ = false;
bool multi_profile_enable_ = false;
std::filesystem::path runtime_cache_;
std::string cache_prefix_;
std::string op_types_to_exclude_;
int nv_profile_index_ = 0;
std::unique_ptr<onnxruntime::Model> ep_context_model_;
@@ -420,10 +402,6 @@ class NvExecutionProvider : public IExecutionProvider {
std::unordered_map<std::string, std::vector<nvinfer1::IOptimizationProfile*>> profiles_;
std::unordered_map<std::string, DDSOutputAllocatorMap> dds_output_allocator_maps_;

// for external stream, we need to create its cudnn/cublass handle before cuda EP enable cuda graph capture
cudnnHandle_t external_cudnn_handle_ = nullptr;
cublasHandle_t external_cublas_handle_ = nullptr;

// Call cudaStreamSynchronize() after TRT enqueueV3()
mutable bool sync_stream_after_enqueue_ = true;

@@ -434,20 +412,7 @@ class NvExecutionProvider : public IExecutionProvider {
class PerThreadContext final {
public:
PerThreadContext(OrtDevice::DeviceId device_id, bool has_user_compute_stream, cudaStream_t stream);
~PerThreadContext();

cublasHandle_t CublasHandle() const {
return external_cublas_handle_;
}

cudnnHandle_t CudnnHandle() const {
return external_cudnn_handle_;
}

bool IsTensorRTContextInMap(std::string fused_node);
nvinfer1::IExecutionContext& GetTensorRTContext(std::string fused_node);
bool UpdateTensorRTContext(std::string fused_node, tensorrt_ptr::unique_pointer_exec_ctx context);
void ResetTensorRTContext(std::string fused_node);
~PerThreadContext() = default;

// CUDA Graph management
void SetCudaGraphStream(cudaStream_t stream) { cuda_graph_.SetStream(stream); }
@@ -465,23 +430,6 @@ class NvExecutionProvider : public IExecutionProvider {
void DeleteCapturedGraph(CudaGraphAnnotation_t cuda_graph_annotation_id);

private:
cudnnHandle_t external_cudnn_handle_ = nullptr;
cublasHandle_t external_cublas_handle_ = nullptr;

// Maintaining execution context on a per thread basis is suggested by TRT doc.
// Also, for enqueueV2() in execution context, to perform inference concurrently in multiple streams, use one execution context per stream.
// ORT multi-streams feature uses one stream for one thread, therefore maintaining execution context on a per thread basis is necessary for TRT EP,
// otherwise it may result in undefined behavior or synchronization issues.
//
// See more details here:
// https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading
// https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_execution_context.html#a63cd95430852038ce864e17c670e0b36
std::unordered_map<std::string, tensorrt_ptr::unique_pointer_exec_ctx> trt_context_map_;

// The profile shape ranges for the engine that the execution context maintained by the PerThreadContext is built with.
// TRT EP needs this info to determine whether to rebuild the execution context.
std::unordered_map<std::string, ShapeRangesMap> input_shape_ranges_;

// Cuda graph with multi threads will be supported in the future, so cuda_graph_ is put under PerThreadContext.
// ORT TRT only supports CUDA graph when whole model is supported by TRT, so simply maintaining a CUDAGraph instance is enough (no need to maintain one CUDAGraph instance per TRT subgraph)
CUDAGraph cuda_graph_;
@@ -26,17 +26,13 @@ struct NvExecutionProviderInfo {
size_t max_workspace_size{0};
size_t max_shared_mem_size{0};
bool dump_subgraphs{false};
std::string engine_cache_path{""};
bool weight_stripped_engine_enable{false};
std::string onnx_model_folder_path{""};
const void* onnx_bytestream{nullptr};
size_t onnx_bytestream_size{0};
bool use_external_data_initializer{false};
const void* external_data_bytestream{nullptr};
size_t external_data_bytestream_size{0};
bool engine_decryption_enable{false};
std::string engine_decryption_lib_path{""};
bool force_sequential_engine_build{false};
std::string runtime_cache_path{""};
bool detailed_build_log{false};
bool sparsity_enable{false};
@@ -49,7 +45,6 @@ struct NvExecutionProviderInfo {
bool dump_ep_context_model{false};
std::string ep_context_file_path{""};
int ep_context_embed_mode{0};
std::string engine_cache_prefix{""};
std::string op_types_to_exclude{""};

static NvExecutionProviderInfo FromProviderOptions(const ProviderOptions& options,