diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index a1e9d06b133fd..6aad71e40b2a8 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -1226,6 +1226,12 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) ${onnxruntime_perf_test_src_patterns} ) onnxruntime_add_executable(onnxruntime_perf_test ${onnxruntime_perf_test_src} ${ONNXRUNTIME_ROOT}/core/platform/path_lib.cc) + + # ABSL_FLAGS_STRIP_NAMES is set to 1 by default to disable flag registration when building for Android, iPhone, and "embedded devices". + # See the issue: https://github.com/abseil/abseil-cpp/issues/1875 + # We set it to 0 for all builds to be able to use ABSL flags for onnxruntime_perf_test. + target_compile_definitions(onnxruntime_perf_test PRIVATE ABSL_FLAGS_STRIP_NAMES=0) + if(MSVC) target_compile_options(onnxruntime_perf_test PRIVATE "$<$:SHELL:--compiler-options /utf-8>" "$<$>:/utf-8>") diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs index 8cca2b42e987a..3c92400715740 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs @@ -368,6 +368,88 @@ public struct OrtApi public IntPtr EpDevice_Device; public IntPtr GetEpApi; public IntPtr GetTensorSizeInBytes; + + public IntPtr AllocatorGetStats; + + public IntPtr CreateMemoryInfo_V2; + public IntPtr MemoryInfoGetDeviceMemType; + public IntPtr MemoryInfoGetVendorId; + + public IntPtr ValueInfo_GetValueProducer; + public IntPtr ValueInfo_GetValueNumConsumers; + public IntPtr ValueInfo_GetValueConsumers; + public IntPtr ValueInfo_GetInitializerValue; + public IntPtr ValueInfo_GetExternalInitializerInfo; + public IntPtr ValueInfo_IsRequiredGraphInput; + public IntPtr ValueInfo_IsOptionalGraphInput; + public IntPtr ValueInfo_IsGraphOutput; + public IntPtr ValueInfo_IsConstantInitializer; + public IntPtr ValueInfo_IsFromOuterScope; + public IntPtr Graph_GetName; + public IntPtr Graph_GetModelPath; + public IntPtr Graph_GetOnnxIRVersion; + public IntPtr Graph_GetNumOperatorSets; + public IntPtr Graph_GetOperatorSets; + public IntPtr Graph_GetNumInputs; + public IntPtr Graph_GetInputs; + public IntPtr Graph_GetNumOutputs; + public IntPtr Graph_GetOutputs; + public IntPtr Graph_GetNumInitializers; + public IntPtr Graph_GetInitializers; + public IntPtr Graph_GetNumNodes; + public IntPtr Graph_GetNodes; + public IntPtr Graph_GetParentNode; + public IntPtr Graph_GetGraphView; + public IntPtr Node_GetId; + public IntPtr Node_GetName; + public IntPtr Node_GetOperatorType; + public IntPtr Node_GetDomain; + public IntPtr Node_GetSinceVersion; + public IntPtr Node_GetNumInputs; + public IntPtr Node_GetInputs; + public IntPtr Node_GetNumOutputs; + public IntPtr Node_GetOutputs; + public IntPtr Node_GetNumImplicitInputs; + public IntPtr Node_GetImplicitInputs; + public IntPtr Node_GetNumAttributes; + public IntPtr Node_GetAttributes; + public IntPtr Node_GetAttributeByName; + public IntPtr Node_GetTensorAttributeAsOrtValue; + public IntPtr OpAttr_GetType; + public IntPtr OpAttr_GetName; + public IntPtr Node_GetNumSubgraphs; + public IntPtr Node_GetSubgraphs; + public IntPtr Node_GetGraph; + public IntPtr Node_GetEpName; + public IntPtr ReleaseExternalInitializerInfo; + public IntPtr ExternalInitializerInfo_GetFilePath; + public IntPtr ExternalInitializerInfo_GetFileOffset; + public IntPtr ExternalInitializerInfo_GetByteSize; + + public 
IntPtr GetRunConfigEntry; + + public IntPtr EpDevice_MemoryInfo; + + public IntPtr CreateSharedAllocator; + public IntPtr GetSharedAllocator; + public IntPtr ReleaseSharedAllocator; + + public IntPtr GetTensorData; + + public IntPtr GetSessionOptionsConfigEntries; + + public IntPtr SessionGetMemoryInfoForInputs; + public IntPtr SessionGetMemoryInfoForOutputs; + public IntPtr SessionGetEpDeviceForInputs; + + public IntPtr CreateSyncStreamForEpDevice; + public IntPtr SyncStream_GetHandle; + public IntPtr ReleaseSyncStream; + + public IntPtr CopyTensors; + + public IntPtr Graph_GetModelMetadata; + public IntPtr GetModelCompatibilityForEpDevices; } internal static class NativeMethods @@ -704,6 +786,10 @@ static NativeMethods() (DSessionOptionsSetEpSelectionPolicyDelegate)Marshal.GetDelegateForFunctionPointer( api_.SessionOptionsSetEpSelectionPolicyDelegate, typeof(DSessionOptionsSetEpSelectionPolicyDelegate)); + + OrtGetModelCompatibilityForEpDevices = (DOrtGetModelCompatibilityForEpDevices)Marshal.GetDelegateForFunctionPointer( + api_.GetModelCompatibilityForEpDevices, + typeof(DOrtGetModelCompatibilityForEpDevices)); } internal class NativeLib @@ -2456,6 +2542,18 @@ public delegate void DOrtRemoveKeyValuePair(IntPtr /* OrtKeyValuePairs* */ kvps, public static DOrtGetEpDevices OrtGetEpDevices; + /// + /// Validate compiled model compatibility for the provided EP devices. + /// + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr /* OrtStatus* */ DOrtGetModelCompatibilityForEpDevices( + IntPtr[] /* const OrtEpDevice* const* */ ep_devices, + UIntPtr /* size_t */ num_ep_devices, + byte[] /* const char* */ compatibility_info, + out int /* OrtCompiledModelCompatibility */ out_status); + + public static DOrtGetModelCompatibilityForEpDevices OrtGetModelCompatibilityForEpDevices; + /// /// Add execution provider devices to the session options. /// Priority is based on the order of the OrtEpDevice instances. Highest priority first. diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/OrtEnv.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/OrtEnv.shared.cs index 5c70808b82be1..052d5899b52c0 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/OrtEnv.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/OrtEnv.shared.cs @@ -7,6 +7,21 @@ namespace Microsoft.ML.OnnxRuntime { + /// + /// Represents the compatibility status of a pre-compiled model with one or more execution provider devices. + /// + /// + /// This enum is used to determine whether a pre-compiled model can be used with specific execution providers + /// and devices, or if recompilation is needed. + /// + public enum OrtCompiledModelCompatibility + { + EP_NOT_APPLICABLE = 0, + EP_SUPPORTED_OPTIMAL = 1, + EP_SUPPORTED_PREFER_RECOMPILATION = 2, + EP_UNSUPPORTED = 3, + } + /// /// Delegate for logging function callback. /// Supply your function and register it with the environment to receive logging callbacks via @@ -361,6 +376,31 @@ public string[] GetAvailableProviders() } } + /// + /// Validate a compiled model's compatibility information for one or more EP devices. + /// + /// The list of EP devices to validate against. + /// The compatibility string from the precompiled model to validate. 
+        /// OrtCompiledModelCompatibility enum value denoting the compatibility status
+        public OrtCompiledModelCompatibility GetModelCompatibilityForEpDevices(
+            IReadOnlyList<OrtEpDevice> epDevices, string compatibilityInfo)
+        {
+            if (epDevices == null || epDevices.Count == 0)
+                throw new ArgumentException("epDevices must be non-empty", nameof(epDevices));
+
+            var devicePtrs = new IntPtr[epDevices.Count];
+            for (int i = 0; i < epDevices.Count; ++i)
+            {
+                devicePtrs[i] = epDevices[i].Handle;
+            }
+
+            var infoUtf8 = NativeOnnxValueHelper.StringToZeroTerminatedUtf8(compatibilityInfo);
+            NativeApiStatus.VerifySuccess(
+                NativeMethods.OrtGetModelCompatibilityForEpDevices(
+                    devicePtrs, (UIntPtr)devicePtrs.Length, infoUtf8, out int status));
+            return (OrtCompiledModelCompatibility)status;
+        }
+
         ///
         /// Get/Set log level property of OrtEnv instance
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/EpCompatibilityTests.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/EpCompatibilityTests.cs
new file mode 100644
index 0000000000000..103fe5bc10106
--- /dev/null
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/EpCompatibilityTests.cs
@@ -0,0 +1,49 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+// not supported on mobile platforms
+#if !(ANDROID || IOS)
+
+namespace Microsoft.ML.OnnxRuntime.Tests;
+
+using System;
+using System.Linq;
+using Xunit;
+using System.Collections.Generic;
+
+public class EpCompatibilityTests
+{
+    private readonly OrtEnv ortEnvInstance = OrtEnv.Instance();
+
+    private IReadOnlyList<OrtEpDevice> GetDevices()
+    {
+        var epDevices = ortEnvInstance.GetEpDevices();
+        Assert.NotNull(epDevices);
+        Assert.NotEmpty(epDevices);
+        return epDevices;
+    }
+
+    [Fact]
+    public void GetEpCompatibility_InvalidArgs()
+    {
+        Assert.Throws<ArgumentException>(() => ortEnvInstance.GetModelCompatibilityForEpDevices(null, "info"));
+        Assert.Throws<ArgumentException>(() => ortEnvInstance.GetModelCompatibilityForEpDevices(new List<OrtEpDevice>(), "info"));
+    }
+
+    [Fact]
+    public void GetEpCompatibility_SingleDeviceCpuProvider()
+    {
+        var devices = GetDevices();
+        var someInfo = "arbitrary-compat-string";
+
+        // Use CPU device
+        var cpu = devices.First(d => d.EpName == "CPUExecutionProvider");
+        Assert.NotNull(cpu);
+        var selected = new List<OrtEpDevice> { cpu };
+        var status = ortEnvInstance.GetModelCompatibilityForEpDevices(selected, someInfo);
+
+        // CPU defaults to not applicable in this scenario
+        Assert.Equal(OrtCompiledModelCompatibility.EP_NOT_APPLICABLE, status);
+    }
+}
+#endif
diff --git a/include/onnxruntime/core/graph/model_saving_options.h b/include/onnxruntime/core/graph/model_saving_options.h
index 6c041ec96a035..06c1b1ac6475f 100644
--- a/include/onnxruntime/core/graph/model_saving_options.h
+++ b/include/onnxruntime/core/graph/model_saving_options.h
@@ -9,36 +9,30 @@ class PrepackedWeightsForGraph;
 // These options affect how the model initializers are written to the external file.
 // This includes options to align external initializer offset.
-// For models running on CPU, ORT will try to use mmap to load external
-// initializers. To use mmap, external initializer need to be offset aligned.
+// ORT will try to use mmap to load external initializers.
+//
 // ORT saves external initializers into single data file, each initializer is
 // accessed with offset(start position of initializer) and length(byte length of
-// initializer) of the data file. To use mmap, each offset need to be aligned
-// which means offset need to divisible by allocation granularity(64KB for
-// windows and 4K for other OSes). With align_offset to true, ORT will align
-// offset for large initializer when save ONNX model with external data file.
+// initializer) of the data file. With align_offset set to true, ORT will align
+// the offset of large initializers (larger than align_threshold)
+// when saving an ONNX model with an external data file. Offsets are aligned
+// to the on_disk_alignment value.
 struct ModelSavingOptions {
   explicit ModelSavingOptions(size_t size_threshold)
       : initializer_size_threshold(size_threshold) {}
   // Minimal initializer size in bytes to be externalized on disk
   size_t initializer_size_threshold;
-  // Offset will always be page aligned and allocation granularity aligned for
-  // mmap support. This is done by padding previous tensor data with zeros
-  // keeping same length.
+  // Offset will always be aligned for mmap support.
+  // This is done by padding the previous tensor data with zeros while keeping the same length.
   bool align_offset = false;
   // Alignment threshold for size of data.
   // Having a low threshold will waste file space for small initializers.
   // Only when tensor's data size is > the page_align_threshold it will be force
   // aligned. Default to 1MB.
   int64_t align_threshold = 1048576;
-  // The allocation Granularity for mmap() support.
-  // Typically 64KB for Windows & 4KB for other OSes. Default to 64KB.
-#ifdef _WIN32
-  int64_t allocation_granularity = 65536;
-#else
-  int64_t allocation_granularity = 4096;
-#endif
+  // Alignment factor for tensors larger than align_threshold. Defaults to 4 KB.
+  int64_t on_disk_alignment = 4096;
   // Force embed all external initializer into the Onnx file
   // Used for EPContext model generation while some nodes fallback on CPU which has external data dependency
   bool force_embed_external_ini = false;
diff --git a/include/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_options.h b/include/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_options.h
index dc27204017caa..a32f465e44adf 100644
--- a/include/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_options.h
+++ b/include/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_options.h
@@ -31,7 +31,7 @@ constexpr const char* kDetailedBuildLog = "nv_detailed_build_log";
 constexpr const char* kProfilesMinShapes = "nv_profile_min_shapes";
 constexpr const char* kProfilesMaxShapes = "nv_profile_max_shapes";
 constexpr const char* kProfilesOptShapes = "nv_profile_opt_shapes";
-constexpr const char* kCudaGraphEnable = "nv_cuda_graph_enable";
+constexpr const char* kCudaGraphEnable = "enable_cuda_graph";
 constexpr const char* kMultiProfileEnable = "nv_multi_profile_enable";
 constexpr const char* kUseExternalDataInitializer = "nv_use_external_data_initializer";
diff --git a/include/onnxruntime/core/providers/utils/ort_graph_to_proto.h b/include/onnxruntime/core/providers/utils/ort_graph_to_proto.h
index 21aa797ce16eb..28ce4439fdc7e 100644
--- a/include/onnxruntime/core/providers/utils/ort_graph_to_proto.h
+++ b/include/onnxruntime/core/providers/utils/ort_graph_to_proto.h
@@ -232,7 +232,7 @@ static Ort::Status GetOrtValueInfoTensorTypeShape(const OrtValueInfo& ort_value_
                                                   /*out*/ std::vector<int64_t>& dims,
                                                   /*out*/ std::vector<std::string>& symbolic_dims);
 static Ort::Status OrtValueInfoToProto(const OrtValueInfo& ort_value_info, onnx::ValueInfoProto& value_info_proto);
-static Ort::Status OrtOpAttrToProto(const OrtNode& ort_node, const OrtOpAttr& ort_attr, onnx::AttributeProto& attr_proto);
+static Ort::Status OrtOpAttrToProto(const OrtOpAttr& ort_attr, onnx::AttributeProto& attr_proto);
 
 Ort::Status OrtGraphToProto(const OrtGraph&
ort_graph, onnx::GraphProto& graph_proto, @@ -379,7 +379,7 @@ Ort::Status OrtGraphToProto(const OrtGraph& ort_graph, } onnx::AttributeProto* attr_proto = node_proto->add_attribute(); - ORT_EP_UTILS_CXX_RETURN_IF_ERROR(OrtOpAttrToProto(*ort_node, *ort_attr, *attr_proto)); + ORT_EP_UTILS_CXX_RETURN_IF_ERROR(OrtOpAttrToProto(*ort_attr, *attr_proto)); } } @@ -652,7 +652,7 @@ static Ort::Status OrtValueInfoToProto(const OrtValueInfo& ort_value_info, return Ort::Status{nullptr}; } -static Ort::Status OrtOpAttrToProto(const OrtNode& ort_node, const OrtOpAttr& ort_attr, onnx::AttributeProto& attr_proto) { +static Ort::Status OrtOpAttrToProto(const OrtOpAttr& ort_attr, onnx::AttributeProto& attr_proto) { const OrtApi& ort_api = Ort::GetApi(); const char* attr_name = nullptr; @@ -766,7 +766,7 @@ static Ort::Status OrtOpAttrToProto(const OrtNode& ort_node, const OrtOpAttr& or // TensorProto as an attribute value doesn't require a name. OrtValue* ort_value = nullptr; - ORT_EP_UTILS_C_RETURN_IF_ERROR(ort_api.Node_GetTensorAttributeAsOrtValue(&ort_node, &ort_attr, &ort_value)); + ORT_EP_UTILS_C_RETURN_IF_ERROR(ort_api.OpAttr_GetTensorAttributeAsOrtValue(&ort_attr, &ort_value)); Ort::Value tensor(ort_value); diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index bedeeb972c3a7..72e8a3ca1103c 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -902,6 +902,16 @@ typedef void (*RunAsyncCallbackFn)(void* user_data, OrtValue** outputs, size_t n * * \nosubgrouping */ +/* + * Public enum for compiled model compatibility across EPs. + */ +typedef enum OrtCompiledModelCompatibility { + OrtCompiledModelCompatibility_EP_NOT_APPLICABLE = 0, + OrtCompiledModelCompatibility_EP_SUPPORTED_OPTIMAL, + OrtCompiledModelCompatibility_EP_SUPPORTED_PREFER_RECOMPILATION, + OrtCompiledModelCompatibility_EP_UNSUPPORTED, +} OrtCompiledModelCompatibility; + struct OrtApi { /// \name OrtStatus /// @{ @@ -6069,7 +6079,6 @@ struct OrtApi { /** \brief Get the OrtNode's 'TENSOR' attribute as an OrtValue. * - * \param[in] node The OrtNode instance. * \param[in] attribute The OrtOpAttr instance. * \param[out] attr_tensor If successful, contains the 'TENSOR' attribute as a newly created OrtValue. Must be freed with OrtApi::ReleaseValue. @@ -6078,7 +6087,7 @@ struct OrtApi { * * \since Version 1.23. */ - ORT_API2_STATUS(Node_GetTensorAttributeAsOrtValue, _In_ const OrtNode* node, _In_ const OrtOpAttr* attribute, + ORT_API2_STATUS(OpAttr_GetTensorAttributeAsOrtValue, _In_ const OrtOpAttr* attribute, _Outptr_result_maybenull_ OrtValue** attr_tensor); /** \brief Get the attribute type as OrtOpAttrType from an OrtOpAttr. @@ -6480,6 +6489,24 @@ struct OrtApi { * \since Version 1.23. */ ORT_API2_STATUS(Graph_GetModelMetadata, _In_ const OrtGraph* graph, _Outptr_ OrtModelMetadata** out); + + /** \brief Validate a compiled model's compatibility information for one or more EP devices. + * + * \param[in] ep_devices The EP devices to validate against (e.g., from GetEpDevices). + * All devices must belong to the same execution provider. + * \param[in] num_ep_devices The number of EP devices provided. + * \param[in] compatibility_info The compatibility info string produced when the model was compiled. + * \param[out] out_status The resulting compatibility status for the EP devices. + * + * \snippet{doc} snippets.dox OrtStatus Return Value + * + * \since Version 1.23. 
+ */ + ORT_API2_STATUS(GetModelCompatibilityForEpDevices, + _In_reads_(num_ep_devices) const OrtEpDevice* const* ep_devices, + _In_ size_t num_ep_devices, + _In_ const char* compatibility_info, + _Out_ OrtCompiledModelCompatibility* out_status); }; /* diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h index 2f4fd36c8115f..981c70ab8b954 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h @@ -725,9 +725,7 @@ using AllocatedStringPtr = std::unique_ptr; * constructors to construct an instance of a Status object from exceptions. */ struct Status : detail::Base { - using Base = detail::Base; - using Base::Base; - + Status() = default; // Same as with std::nullptr_t. But can be used in re-sizable containers and represent success. explicit Status(std::nullptr_t) noexcept {} ///< Create an empty object, must be assigned a valid one to be used explicit Status(OrtStatus* status) noexcept; ///< Takes ownership of OrtStatus instance returned from the C API. explicit Status(const Exception&); ///< Creates status instance out of exception @@ -1015,6 +1013,16 @@ struct EpDevice : detail::EpDeviceImpl { ConstKeyValuePairs ep_metadata = {}, ConstKeyValuePairs ep_options = {}); }; +/** \brief Validate a compiled model's compatibility for one or more EP devices. + * + * Throws on error. Returns the resulting compatibility status. + * /// \param ep_devices The EP devices to check compatibility against. + * /// \param compatibility_info The compatibility string from the precompiled model to validate. + */ +OrtCompiledModelCompatibility GetModelCompatibilityForEpDevices( + const std::vector& ep_devices, + const char* compatibility_info); + /** \brief The Env (Environment) * * The Env holds the logging state used by all other objects. 
diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h index 73200d8852223..05c86ae4e0c58 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h @@ -823,7 +823,7 @@ inline Status Env::CopyTensors(const std::vector& src_tensors, return Status("Source and destination tensor vectors must have the same size", ORT_INVALID_ARGUMENT); } if (src_tensors.empty()) { - return Status(); + return Status(nullptr); } const OrtValue* const* src_tensors_ptr = reinterpret_cast(src_tensors.data()); @@ -859,6 +859,26 @@ inline void CustomOpDomain::Add(const OrtCustomOp* op) { ThrowOnError(GetApi().CustomOpDomain_Add(p_, op)); } +inline OrtCompiledModelCompatibility GetModelCompatibilityForEpDevices( + const std::vector& ep_devices, + const char* compatibility_info) { + if (ep_devices.empty()) { + ORT_CXX_API_THROW("ep_devices is empty", ORT_INVALID_ARGUMENT); + } + + std::vector ptrs; + ptrs.reserve(ep_devices.size()); + for (const auto& d : ep_devices) ptrs.push_back(d); + + OrtCompiledModelCompatibility status = OrtCompiledModelCompatibility_EP_NOT_APPLICABLE; + ThrowOnError(GetApi().GetModelCompatibilityForEpDevices( + reinterpret_cast(ptrs.data()), + ptrs.size(), + compatibility_info, + &status)); + return status; +} + inline LoraAdapter LoraAdapter::CreateLoraAdapter(const std::basic_string& adapter_path, OrtAllocator* allocator) { OrtLoraAdapter* p; diff --git a/include/onnxruntime/core/session/onnxruntime_ep_c_api.h b/include/onnxruntime/core/session/onnxruntime_ep_c_api.h index 620cb5fcf13cc..975f6b453a88d 100644 --- a/include/onnxruntime/core/session/onnxruntime_ep_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_ep_c_api.h @@ -482,18 +482,6 @@ typedef enum OrtEpDataLayout { OrtEpDataLayout_Default = OrtEpDataLayout_NCHW, } OrtEpDataLayout; -/** - * \brief Enumeration describing the compatibility state of a compiled model relative to an execution provider. - * - * \since Version 1.23. - */ -typedef enum OrtCompiledModelCompatibility { - OrtCompiledModelCompatibility_EP_NOT_APPLICABLE = 0, - OrtCompiledModelCompatibility_EP_SUPPORTED_OPTIMAL, - OrtCompiledModelCompatibility_EP_SUPPORTED_PREFER_RECOMPILATION, - OrtCompiledModelCompatibility_EP_UNSUPPORTED, -} OrtCompiledModelCompatibility; - /** * \brief The OrtEp struct provides functions to implement for an execution provider. * \since Version 1.22. @@ -901,20 +889,28 @@ struct OrtEpFactory { */ ORT_API_T(const char*, GetVersion, _In_ const OrtEpFactory* this_ptr); - /** \brief Validate the compatibility of a compiled model with the execution provider. + /** \brief Validate the compatibility of a compiled model with the execution provider factory for one or more devices. + * + * Given a compatibility info string produced during model compilation, the EP factory should determine whether the + * compiled model is compatible with the EP factory when targeting the provided hardware devices. All devices provided + * must belong to the same execution provider instance that this factory creates. * - * This function validates if a model produced with the supplied compatibility info string is supported by the underlying EP. - * The EP should check if a compiled model is compatible with the EP and set the model_compatibility parameter accordingly. 
+ * The EP factory implementation should consider the set of devices (e.g., multi-adapter or multi-GPU scenarios) when + * evaluating compatibility and set `model_compatibility` accordingly. * * \param[in] this_ptr The OrtEpFactory instance. - * \param[in] compatibility_info The compatibility information string that will be used - * \param[out] model_compatibility OrtCompiledModelCompatibility enum value describing the compatibility of the model with the EP. + * \param[in] devices Array of OrtHardwareDevice pointers that the EP would run on. All must map to this EP. + * \param[in] num_devices Number of entries in `devices`. + * \param[in] compatibility_info The compatibility information string produced when the model was compiled. + * \param[out] model_compatibility OrtCompiledModelCompatibility value describing the compatibility of the model with the EP. * * \snippet{doc} snippets.dox OrtStatus Return Value * * \since Version 1.23. */ ORT_API2_STATUS(ValidateCompiledModelCompatibilityInfo, _In_ OrtEpFactory* this_ptr, + _In_reads_(num_devices) const OrtHardwareDevice* const* devices, + _In_ size_t num_devices, _In_ const char* compatibility_info, _Out_ OrtCompiledModelCompatibility* model_compatibility); diff --git a/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h b/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h index 0d5117709c18a..bfa450f4287f8 100644 --- a/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h +++ b/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h @@ -280,6 +280,18 @@ class GQAAttentionBase { output, static_cast(present_buffer_sequence_length), nullptr); } + // Pre-allocate buffer for attention mask to avoid allocating it for every processed token + float* attention_bias_thread_fp32 = nullptr; + if (attention_bias_thread != nullptr) { + if constexpr (!std::is_same_v) { + static_assert(std::is_same_v && std::is_same_v); + + size_t bytes = attention_total_seqlen * sizeof(float); + attention_bias_thread_fp32 = static_cast(allocator->Alloc(bytes)); + } + } + BufferUniquePtr scratch_buffer(attention_bias_thread_fp32, BufferDeleter(allocator)); + // compute Softmax U* output_softmax = output; for (size_t seq = 0; seq < sequence_length; seq++) { @@ -316,9 +328,6 @@ class GQAAttentionBase { static_cast(window_size)); } else { static_assert(std::is_same_v && std::is_same_v); - size_t bytes = window_size * sizeof(float); - auto attention_bias_thread_fp32 = static_cast(allocator->Alloc(bytes)); - BufferUniquePtr scratch_buffer(attention_bias_thread_fp32, BufferDeleter(allocator)); MlasConvertHalfToFloatBuffer(attention_bias_thread + start_offset, attention_bias_thread_fp32, window_size); ApplyAttentionBias(output_softmax + start_offset, attention_bias_thread_fp32, static_cast(window_size)); diff --git a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc index 85a2cbaea0e44..36a6f70cc69d9 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc @@ -200,6 +200,19 @@ class DynamicQuantizeMatMul final : public MatMulIntegerToFloatBase { can_use_dynamic_quant_mlas_ = (!b_quantization_might_be_asymmetric && b_scale_available); + // Kleidi dynamic path requires strictly positive, finite scales. + // Disable if any invalid scale is detected. 
+  if (can_use_dynamic_quant_mlas_) {
+    const auto bs = b_scale_tensor->DataAsSpan<float>();
+    const bool has_invalid =
+        std::any_of(bs.begin(), bs.end(),
+                    [](float s) { return !std::isfinite(s) || s <= 0.0f; });
+
+    if (has_invalid) {
+      can_use_dynamic_quant_mlas_ = false;
+    }
+  }
+
   // Currently, MlasDynamicQGemmBatch() and associated functions require SME or else they are no-ops.
   // We check that here too before attempting to use them.
   if (!CPUIDInfo::GetCPUIDInfo().HasArm_SME()) {
@@ -379,7 +392,7 @@ Status DynamicQuantizeMatMul::Compute(OpKernelContext* ctx) const {
   if (y->Shape().Size() == 0)
     return Status::OK();
 
-  auto a_data = static_cast(ctx->Input(IN_A)->DataRaw());
+  const float* a_data = ctx->Input<Tensor>(IN_A)->Data<float>();
   auto* y_data = y->MutableData<float>();
 
   // batch gemm
@@ -393,7 +406,7 @@ Status DynamicQuantizeMatMul::Compute(OpKernelContext* ctx) const {
   for (size_t gemm_idx = 0; gemm_idx < num_gemms; gemm_idx++) {
     auto& params = gemm_data_vec[gemm_idx];
-    params.A = reinterpret_cast(a_data + helper.LeftOffsets()[gemm_idx]);
+    params.A = a_data + helper.LeftOffsets()[gemm_idx];
     params.lda = gemm_shape.K;
     params.PackedB = packed_b_.get();
     params.C = y_data + helper.OutputOffsets()[gemm_idx];
diff --git a/onnxruntime/core/framework/tensor_external_data_info.cc b/onnxruntime/core/framework/tensor_external_data_info.cc
index 971851db62437..d7f5b23d56c70 100644
--- a/onnxruntime/core/framework/tensor_external_data_info.cc
+++ b/onnxruntime/core/framework/tensor_external_data_info.cc
@@ -107,7 +107,7 @@ void ExternalDataInfo::SetExternalLocationToProto(const std::filesystem::path& e
 std::ostream& ExternalDataInfo::WritePrepackedToFileAndAddToProto(
     const PrepackedWeightsForGraph& prepacked_for_graph,
     const InlinedHashSet<std::string>& blob_keys, bool align,
-    int64_t align_threshold, int64_t allocation_granularity,
+    int64_t align_threshold, int64_t on_disk_alignment,
     std::ostream& os, int64_t& external_offset, ::ONNX_NAMESPACE::TensorProto& proto) {
   size_t key_count = 0;
   for (const auto& key : blob_keys) {
@@ -120,7 +120,7 @@ std::ostream& ExternalDataInfo::WritePrepackedToFileAndAddToProto(
       const auto size_in_bytes = prepacked_weights->buffer_sizes_[i];
       if (align && static_cast<int64_t>(size_in_bytes) > align_threshold) {
         // return early on error
-        if (!AlignAndPad(os, allocation_granularity, external_offset)) {
+        if (!AlignAndPad(os, on_disk_alignment, external_offset)) {
           return os;
         }
       }
diff --git a/onnxruntime/core/framework/tensor_external_data_info.h b/onnxruntime/core/framework/tensor_external_data_info.h
index 2de1e01f381ec..784b3f352a78e 100644
--- a/onnxruntime/core/framework/tensor_external_data_info.h
+++ b/onnxruntime/core/framework/tensor_external_data_info.h
@@ -41,15 +41,13 @@ class ExternalDataInfo {
       size_t tensor_bytes_size,
       ::ONNX_NAMESPACE::TensorProto& proto);
 
-  // Pads the output with zeros according to the specified allocation_granularity
+  // Pads the output with zeros according to the specified alignment_factor
   // It updates external_offset for alignment.
// need to do padding before write actual tensor data as we do offset alignment at the begin of - // large tensors (offset need to be page aligned and allocation granularity aligned) like below: + // large tensors (offset need to be page aligned) like below: // \242\2557\256\023.\031&0000000000000000\332)k+\253\246\342\246(&\006!\347\232\374\236\325\026\032+\36XXXX // |<---smaller tensor---->|<---padding--->|<------------------large tensor----------------------------->| - static std::ostream& AlignAndPad(std::ostream& stream, int64_t allocation_granularity, int64_t& external_offset) { - // Align to the larger of the page size or the allocation granularity - int64_t alignment_factor = std::max(static_cast(4096), allocation_granularity); + static std::ostream& AlignAndPad(std::ostream& stream, int64_t alignment_factor, int64_t& external_offset) { // Align to the next page or alloc granularity boundary SafeInt safe_external_offset = external_offset; int64_t new_external_offset = ((safe_external_offset + alignment_factor - 1) / alignment_factor) * @@ -66,7 +64,7 @@ class ExternalDataInfo { static std::ostream& WritePrepackedToFileAndAddToProto( const PrepackedWeightsForGraph& prepacked_for_graph, const InlinedHashSet& blob_keys, - bool align, int64_t align_threshold, int64_t allocation_granularity, + bool align, int64_t align_threshold, int64_t on_disk_alignment, std::ostream& os, int64_t& external_offset, ::ONNX_NAMESPACE::TensorProto& proto); diff --git a/onnxruntime/core/graph/abi_graph_types.h b/onnxruntime/core/graph/abi_graph_types.h index b99c22edb36c8..2ef7c4a9091f3 100644 --- a/onnxruntime/core/graph/abi_graph_types.h +++ b/onnxruntime/core/graph/abi_graph_types.h @@ -252,16 +252,6 @@ struct OrtNode { /// A status indicating success or an error. virtual onnxruntime::Status GetAttributes(gsl::span attrs) const = 0; - /// - /// Gets the node's 'TENSOR' attribute as an OrtValue. - /// - /// Node's 'TENSOR' attribute. - /// Output parameter is set to a newly created OrtValue containing the 'TENSOR' attribute value, - /// only if the attribute is of type 'TENSOR' - /// A status indicating success or an error. - virtual onnxruntime::Status GetTensorAttributeAsOrtValue(const OrtOpAttr* attr, - OrtValue*& value) const = 0; - /// /// Gets the number of node subgraphs. /// diff --git a/onnxruntime/core/graph/ep_api_types.cc b/onnxruntime/core/graph/ep_api_types.cc index 759a2998ace3a..0d9b93631ee8a 100644 --- a/onnxruntime/core/graph/ep_api_types.cc +++ b/onnxruntime/core/graph/ep_api_types.cc @@ -249,32 +249,6 @@ Status EpNode::GetAttributes(gsl::span dst) const { return Status::OK(); } -Status EpNode::GetTensorAttributeAsOrtValue(const OrtOpAttr* attribute, OrtValue*& result) const { - const auto* attr_proto = reinterpret_cast(attribute); - - if (attr_proto->type() != onnx::AttributeProto::TENSOR) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "This OrtOpAttr instance is not a 'TENSOR' attribute"); - } - - const auto& graph_viewer = ep_graph_->GetGraphViewer(); - const auto& tensor_proto = attr_proto->t(); - - // Check that TensorProto is valid. - ORT_ENFORCE(utils::HasDataType(tensor_proto), "Tensor proto doesn't have data type."); - ORT_ENFORCE(ONNX_NAMESPACE::TensorProto::DataType_IsValid(tensor_proto.data_type()), "Tensor proto has invalid data type."); - ORT_ENFORCE(!utils::HasExternalData(tensor_proto), - "Tensor proto with external data for value attribute is not supported."); - - // Initialize OrtValue for tensor attribute. 
- auto tensor_attribute_value = std::make_unique(); - AllocatorPtr tensor_attribute_allocator = CPUAllocator::DefaultInstance(); - ORT_RETURN_IF_ERROR(utils::TensorProtoToOrtValue(Env::Default(), graph_viewer.ModelPath(), tensor_proto, - tensor_attribute_allocator, *tensor_attribute_value)); - - result = tensor_attribute_value.release(); - return Status::OK(); -} - Status EpNode::GetNumSubgraphs(size_t& num_subgraphs) const { num_subgraphs = subgraphs_.size(); return Status::OK(); diff --git a/onnxruntime/core/graph/ep_api_types.h b/onnxruntime/core/graph/ep_api_types.h index 7f22e265129f7..e003f02a79a2d 100644 --- a/onnxruntime/core/graph/ep_api_types.h +++ b/onnxruntime/core/graph/ep_api_types.h @@ -183,9 +183,6 @@ struct EpNode : public OrtNode { // Gets the node's attributes. Status GetAttributes(gsl::span attrs) const override; - Status GetTensorAttributeAsOrtValue(const OrtOpAttr* attribute, - OrtValue*& attr_tensor) const override; - // Gets the number of subgraphs contained by this node. Status GetNumSubgraphs(size_t& num_subgraphs) const override; diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index e4f8cd6df678e..0a228176175eb 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -4536,14 +4536,14 @@ Status Graph::AddExternalInitializersToGraphProtoImpl( continue; } - // update external_offset for alignment + // update external_offset for alignment (if enabled) // need to do padding before write actual tensor data as we do offset alignment at the begin of - // large tensors (offset need to be page aligned and allocation granularity aligned) like below: + // large tensors (offset need to be page aligned) like below: // \242\2557\256\023.\031&0000000000000000\332)k+\253\246\342\246(&\006!\347\232\374\236\325\026\032+\36XXXX // |<---smaller tensor---->|<---padding--->|<------------------large tensor----------------------------->| if (model_saving_options.align_offset && static_cast(tensor_bytes_size) > model_saving_options.align_threshold) { - ORT_RETURN_IF_NOT(ExternalDataInfo::AlignAndPad(external_stream, model_saving_options.allocation_granularity, + ORT_RETURN_IF_NOT(ExternalDataInfo::AlignAndPad(external_stream, model_saving_options.on_disk_alignment, external_offset), "Failed writing external data to: ", model_external_file_path); } @@ -4576,7 +4576,7 @@ Status Graph::AddExternalInitializersToGraphProtoImpl( auto& os = ExternalDataInfo::WritePrepackedToFileAndAddToProto( *prepacked_weights_for_graph_, blob_keys_to_external_data, model_saving_options.align_offset, model_saving_options.align_threshold, - model_saving_options.allocation_granularity, + model_saving_options.on_disk_alignment, external_stream, external_offset, *output_proto); ORT_RETURN_IF_NOT(os.good(), "Failed to write pre-packed blobs to external file"); } diff --git a/onnxruntime/core/graph/model_editor_api_types.h b/onnxruntime/core/graph/model_editor_api_types.h index e7ffcbc7e4c90..2c0f6d6174303 100644 --- a/onnxruntime/core/graph/model_editor_api_types.h +++ b/onnxruntime/core/graph/model_editor_api_types.h @@ -138,11 +138,6 @@ struct ModelEditorNode : public OrtNode { "OrtModelEditorApi does not support getting attribute OrtOpAttr for OrtNode"); } - Status GetTensorAttributeAsOrtValue(const OrtOpAttr* /*attribute*/, OrtValue*& /*attr_tensor*/) const override { - return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, - "OrtModelEditorApi does not support getting 'TENSOR' attribute for OrtNode"); - } - Status GetNumSubgraphs(size_t& 
/*num_subgraphs*/) const override { return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "OrtModelEditorApi does not support getting the subgraphs for OrtNode"); diff --git a/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp b/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp index caa445b71e2a5..c579ff1542eb9 100644 --- a/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp +++ b/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp @@ -153,28 +153,23 @@ ArmKleidiAI::MlasGemmBatch( MLAS_THREADPOOL* ThreadPool ) { - if(TransA == CblasTrans) - { - return false; + if (M == 0 || N == 0) { + return true; } - if (TransA == CblasNoTrans && K == 0) { - if (Data->beta != 1.0f) { + + if (Data->alpha == 0.0f || K == 0) { + if (Data->beta == 0.0f) { + for (size_t i = 0; i < M; ++i) { + std::fill_n(Data->C + i * Data->ldc, N, 0.0f); + } + } else if (Data->beta != 1.0f) { for (size_t i = 0; i < M; ++i) { for (size_t j = 0; j < N; ++j) { Data->C[i * Data->ldc + j] *= Data->beta; } } } - } - if (Data->beta == 0.0f){ - std::fill_n(Data->C, M * Data->ldc, 0.0f); - } - //Fallback in the case of unsupported cases - if (M == 0 || N == 0 || K == 0 || - TransA != CblasNoTrans || - (TransB != CblasNoTrans && !Data[0].BIsPacked)) - { - return false; + return true; } if (TransA == CblasNoTrans) { @@ -185,11 +180,9 @@ ArmKleidiAI::MlasGemmBatch( auto m_step = kai_get_m_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa(); auto n_step = kai_get_n_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa(); - if (M < m_step || N < n_step) { - if (GetMlasPlatform().MlasGemmBatchOverride != ArmKleidiAI::MlasGemmBatch){ - //Fallback to MLAS - return false; - } + if (M < m_step && N < n_step && !Data->BIsPacked) { + // Fallback to MLAS + return false; } std::vector KaiPackedData; @@ -316,7 +309,7 @@ ArmKleidiAI::MlasGemmBatch( float* dst_tile = reinterpret_cast(CTile); // quick copy of data in cases where we are not scaling or accumulating anything - // with bounds checking on tile sizing to ensure the data fits in the memory block + // with bounds checking on tile sizing to ensure the data fits in the memory block bool can_memcpy = ( Data[BIdx].alpha == 1.0f && Data[BIdx].beta == 0.0f && @@ -328,21 +321,37 @@ ArmKleidiAI::MlasGemmBatch( if (can_memcpy) { std::memcpy(dst_tile, temp_tile, TileSizeM * TileSizeN * sizeof(float)); - }else { - // apply alpha scaling and beta to output files - for (size_t i = 0; i < TileSizeM; ++i) { - for (size_t j = 0; j < TileSizeN; ++j) { - const size_t idx = i * TileSizeN + j; - const size_t dst_idx = i * Data[BIdx].ldc + j; - - float ab = temp_tile[idx]; - float c_orig = dst_tile[dst_idx]; + return; + } - dst_tile[dst_idx] = Data[BIdx].alpha * ab + Data[BIdx].beta * c_orig; + float alpha = Data[BIdx].alpha; + float beta = Data[BIdx].beta; + size_t ldc = Data[BIdx].ldc; + + for (size_t i = 0; i < TileSizeM; ++i) { + for (size_t j = 0; j < TileSizeN; ++j) { + const size_t temp_idx = i * TileSizeN + j; + const size_t dst_idx = i * ldc + j; + + float ab = temp_tile[temp_idx]; + float c_orig = dst_tile[dst_idx]; + + if (alpha == 1.0f && beta == 0.0f) { + dst_tile[dst_idx] = ab; + } else if (alpha == 1.0f) { + dst_tile[dst_idx] = ab + beta * c_orig; + } else if (beta == 0.0f) { + dst_tile[dst_idx] = alpha * ab; + } else { + dst_tile[dst_idx] = alpha * ab + beta * c_orig; } } } + return; }); + return true; + } + else { + return false; } - return true; } diff --git a/onnxruntime/core/optimizer/qdq_transformer/weight_bias_quantization.cc 
b/onnxruntime/core/optimizer/qdq_transformer/weight_bias_quantization.cc
index 8caa67f266266..4efaec325292a 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/weight_bias_quantization.cc
+++ b/onnxruntime/core/optimizer/qdq_transformer/weight_bias_quantization.cc
@@ -13,6 +13,39 @@ namespace onnxruntime {
+/**
+ * Checks whether the output path from a given node leads to a QuantizeLinear op, optionally through a
+ * non-branching ReLU or Clip op in between. See also: NodeGroupSelector::GetQDQSelection() in qdq_selectors.cc.
+ *
+ * @param node The starting node to check the output path from.
+ * @param graph The graph containing the nodes.
+ *
+ * @return true if the path exists, false otherwise.
+ */
+static bool IsNoBranchPathToQuantizeLinear(const Node& node, const Graph& graph) {
+  const Node* current = &node;
+  while (true) {
+    // Conv / ConvTranspose / Gemm produce a single output
+    if (current->OutputDefs().size() != 1) {
+      return false;
+    }
+    const std::vector<const Node*>& consumers = graph.GetConsumerNodes(current->OutputDefs()[0]->Name());
+    // Branching or no consumer: not eligible
+    if (consumers.size() != 1) {
+      return false;
+    }
+    const Node* consumer = consumers[0];
+    if (consumer->OpType() == QDQ::QOpName) {
+      return true;
+    }
+    // Allow ReLU or Clip, see also: NodeGroupSelector::GetQDQSelection() in qdq_selectors.cc.
+    if (consumer->OpType() != "Relu" && consumer->OpType() != "Clip") {
+      return false;
+    }
+    current = consumer;
+  }
+}
+
 Status WeightBiasQuantization::ApplyImpl(Graph& graph, bool& modified, int graph_level,
                                          const logging::Logger& logger) const {
   const GraphViewer graph_viewer{graph};
@@ -43,11 +76,8 @@ Status WeightBiasQuantization::ApplyImpl(Graph& graph, bool& modified, int graph
       continue;
     }
 
-    // Require that the node's output is consumed by a single QuantizeLinear node.
-    // Otherwise, if only the inputs are quantized, but not the output, then this node group would not
-    // be considered a QDQ node unit anyway.
-    std::vector<const Node*> children_nodes = graph.GetConsumerNodes(node.OutputDefs()[0]->Name());
-    if (children_nodes.size() != 1 || children_nodes[0]->OpType() != QDQ::QOpName) {
+    // Check if the output path leads to QuantizeLinear, optionally with a ReLU or Clip op in between.
+    if (!IsNoBranchPathToQuantizeLinear(node, graph)) {
      continue;
    }
diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc
index 36c6b54a1fce0..aa237fc6441b2 100644
--- a/onnxruntime/core/platform/windows/env.cc
+++ b/onnxruntime/core/platform/windows/env.cc
@@ -29,6 +29,7 @@ limitations under the License.
#include #include "core/common/logging/logging.h" #include "core/common/narrow.h" +#include "core/common/safeint.h" #include "core/common/span_utils.h" #include "core/platform/env.h" #include "core/platform/scoped_resource.h" @@ -439,30 +440,28 @@ Status WindowsEnv::MapFileIntoMemory(_In_z_ const ORTCHAR_T* file_path, SYSTEM_INFO sysinfo; GetSystemInfo(&sysinfo); - static const DWORD page_size = sysinfo.dwPageSize; static const DWORD allocation_granularity = sysinfo.dwAllocationGranularity; - const FileOffsetType offset_to_page = offset % static_cast(page_size); - const size_t mapped_length = length + static_cast(offset_to_page); - const FileOffsetType mapped_offset = offset - offset_to_page; - if (mapped_offset % allocation_granularity != 0) { - const auto error_code = GetLastError(); - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, - "mapped offset must be a multiple of the allocation granularity", - " , mapped_offset = ", mapped_offset, - " , allocation_granularity = ", allocation_granularity, - " , errcode = ", error_code, - " - ", std::system_category().message(error_code)); - } + const FileOffsetType offset_to_granularity = offset % static_cast(allocation_granularity); + const SIZE_T mapped_length = SafeInt(offset_to_granularity) + length; + const FileOffsetType mapped_offset = offset - offset_to_granularity; + assert((mapped_offset % allocation_granularity) == 0); void* const mapped_base = MapViewOfFile(file_mapping_handle.get(), FILE_MAP_READ, static_cast((mapped_offset >> 32) & 0xFFFFFFFF), static_cast(mapped_offset & 0xFFFFFFFF), mapped_length); - GSL_SUPPRESS(r.11) + + if (mapped_base == nullptr) { + const auto error_code = GetLastError(); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "MapViewOfFile ", ToUTF8String(Basename(file_path)), + " fail, errcode = ", error_code, + " - ", std::system_category().message(error_code)); + } mapped_memory = - MappedMemoryPtr{reinterpret_cast(mapped_base) + offset_to_page, + MappedMemoryPtr{reinterpret_cast(mapped_base) + offset_to_granularity, [mapped_base](void*) { UnmapFile(mapped_base); }}; diff --git a/onnxruntime/core/providers/cuda/cuda_graph.cc b/onnxruntime/core/providers/cuda/cuda_graph.cc index 8353c654681fc..88e58aec70550 100644 --- a/onnxruntime/core/providers/cuda/cuda_graph.cc +++ b/onnxruntime/core/providers/cuda/cuda_graph.cc @@ -72,7 +72,7 @@ void CUDAGraphManager::CaptureEnd(CudaGraphAnnotation_t cuda_graph_annotation_id cuda_graph_set_.Put(cuda_graph_annotation_id, graph_exec); } -Status CUDAGraphManager::Replay(CudaGraphAnnotation_t cuda_graph_annotation_id) { +Status CUDAGraphManager::Replay(CudaGraphAnnotation_t cuda_graph_annotation_id, bool sync_status_flag) { // Although this function is not thread safe, the lock is not needed here because // CUDA EP maintains a separate cuda graph per thread LOGS_DEFAULT(INFO) << "Replaying CUDA graph on stream " << stream_ << " with cuda_graph_annotation_id " @@ -81,7 +81,9 @@ Status CUDAGraphManager::Replay(CudaGraphAnnotation_t cuda_graph_annotation_id) cudaGraphExec_t graph_exec = cuda_graph_set_.Get(cuda_graph_annotation_id); CUDA_RETURN_IF_ERROR(cudaGraphLaunch(graph_exec, stream_)); - CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream_)); + if (sync_status_flag) { + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream_)); + } return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/cuda_graph.h b/onnxruntime/core/providers/cuda/cuda_graph.h index 064b526e604bc..6b61a66671de4 100644 --- a/onnxruntime/core/providers/cuda/cuda_graph.h +++ 
b/onnxruntime/core/providers/cuda/cuda_graph.h @@ -38,7 +38,7 @@ struct CUDAGraphManager { void SetStream(cudaStream_t stream); void CaptureBegin(CudaGraphAnnotation_t cuda_graph_annotation_id); void CaptureEnd(CudaGraphAnnotation_t cuda_graph_annotation_id); - Status Replay(CudaGraphAnnotation_t cuda_graph_annotation_id); + Status Replay(CudaGraphAnnotation_t cuda_graph_annotation_id, bool sync_status_flag = true); void Reset(); diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc index 451be69c81cfb..93b673f2df5bd 100644 --- a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc @@ -19,7 +19,7 @@ #include "nv_data_transfer.h" #include "onnx_ctx_model_helper.h" #include "core/providers/cuda/shared_inc/cuda_call.h" -#include "core/providers/cuda/math/unary_elementwise_ops_impl.h" +#include "core/providers/cuda/cuda_graph.h" #include "core/session/allocator_adapters.h" #include "cuda_runtime_api.h" #include "core/common/parse_string.h" @@ -84,40 +84,6 @@ struct ShutdownProtobuf { namespace onnxruntime { -namespace cuda { -template <> -void Impl_Cast( - cudaStream_t stream, - const int64_t* input_data, int32_t* output_data, - size_t count) { - return g_host->cuda__Impl_Cast(static_cast(stream), input_data, output_data, count); -} - -template <> -void Impl_Cast( - cudaStream_t stream, - const int32_t* input_data, int64_t* output_data, - size_t count) { - return g_host->cuda__Impl_Cast(static_cast(stream), input_data, output_data, count); -} - -template <> -void Impl_Cast( - cudaStream_t stream, - const double* input_data, float* output_data, - size_t count) { - return g_host->cuda__Impl_Cast(static_cast(stream), input_data, output_data, count); -} - -template <> -void Impl_Cast( - cudaStream_t stream, - const float* input_data, double* output_data, - size_t count) { - return g_host->cuda__Impl_Cast(static_cast(stream), input_data, output_data, count); -} -} // namespace cuda - void* OutputAllocator::reallocateOutputAsync(char const* /*tensorName*/, void* /*currentMemory*/, uint64_t size, uint64_t /*alignment*/, cudaStream_t /*stream*/) noexcept { // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr @@ -255,7 +221,8 @@ bool ApplyProfileShapesFromProviderOptions(std::vector>>& profile_min_shapes, std::unordered_map>>& profile_max_shapes, std::unordered_map>>& profile_opt_shapes, - ShapeRangesMap& input_explicit_shape_ranges) { + ShapeRangesMap& input_explicit_shape_ranges, + bool& cuda_graph_flag) { if (trt_profiles.size() == 0) { LOGS_DEFAULT(WARNING) << "[NvTensorRTRTX EP] Number of optimization profiles should be greater than 0, but it's 0."; return false; @@ -282,6 +249,10 @@ bool ApplyProfileShapesFromProviderOptions(std::vectorisShapeTensor()) { + if (cuda_graph_flag) { + LOGS_DEFAULT(WARNING) << "[NvTensorRTRTX EP] Shape tensor detected on input '" << input->getName() << "'. Disabling CUDA Graph."; + cuda_graph_flag = false; + } int shape_size = nb_dims == 0 ? 
1 : static_cast(profile_min_shapes[input_name][i].size()); std::vector shapes_min(shape_size), shapes_opt(shape_size), shapes_max(shape_size); @@ -366,51 +337,19 @@ bool ApplyProfileShapesFromProviderOptions(std::vector(); \ - skip_input_binding_allowed = false; \ - if (input_tensor_ptr != nullptr && elem_cnt > 0) { \ - scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, elem_cnt * sizeof(DstT))); \ - data = scratch_buffers.back().get(); \ - cuda::Impl_Cast(stream, input_tensor_ptr, reinterpret_cast(data), elem_cnt); \ - } else { \ - scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, 1)); \ - data = scratch_buffers.back().get(); \ - } \ - break; \ - } - #define CASE_GET_OUTPUT_TENSOR(DATA_TYPE, SrcT) \ case DATA_TYPE: { \ auto output_tensor_ptr = output_tensor.GetTensorMutableData(); \ data_ptr = output_tensor_ptr; \ if (output_tensor_ptr != nullptr && elem_cnt > 0) { \ - buffers[output_name] = output_tensor_ptr; \ + buffer = output_tensor_ptr; \ } else { \ scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, 1)); \ - buffers[output_name] = scratch_buffers.back().get(); \ + buffer = scratch_buffers.back().get(); \ } \ break; \ } -#define CASE_GET_CAST_OUTPUT_TENSOR(DATA_TYPE, SrcT, DstT) \ - case DATA_TYPE: { \ - auto output_tensor_ptr = output_tensor.GetTensorMutableData(); \ - data_ptr = output_tensor_ptr; \ - skip_output_binding_allowed = false; \ - if (output_tensor_ptr != nullptr && elem_cnt > 0) { \ - scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, elem_cnt * sizeof(DstT))); \ - buffers[output_name] = scratch_buffers.back().get(); \ - output_dim_sizes[i] = static_cast(elem_cnt); \ - } else { \ - scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, 1)); \ - buffers[output_name] = scratch_buffers.back().get(); \ - output_dim_sizes[i] = 1; \ - } \ - break; \ - } - #define CASE_COPY_TENSOR(DATA_TYPE, DstT) \ case DATA_TYPE: { \ auto output_tensor_ptr = output_tensor.GetTensorMutableData(); \ @@ -420,15 +359,6 @@ bool ApplyProfileShapesFromProviderOptions(std::vector(); \ - if (output_tensor_ptr != nullptr && elem_cnt > 0) { \ - cuda::Impl_Cast(stream, reinterpret_cast(allocator->getBuffer()), reinterpret_cast(output_tensor_ptr), elem_cnt); \ - } \ - break; \ - } - /* * Set Nv executio context input. 
* @@ -551,7 +481,6 @@ Status BindContextInput(Ort::KernelContext& ctx, CASE_GET_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8, uint8_t) CASE_GET_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, int32_t) CASE_GET_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, int64_t) - CASE_GET_CAST_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE, double, float) default: { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "NvTensorRTRTX EP input onnx tensor data type: " + std::to_string(tensor_type) + " not supported."); @@ -576,8 +505,6 @@ Status BindContextInput(Ort::KernelContext& ctx, * param output_type - Data type of the output * param i - Output iteration index * param output_tensors - Output iteration index to output's ORT value - * param output_dim_sizes - Output iteration index to the multiplocation of its shape's dimensions - * param dds_output_set - DDS output set * param dds_output_allocator_map - DDS output to its allocator * param scratch_buffer - The allocation buffer created by TRT EP * param allocator - ORT allocator @@ -589,16 +516,11 @@ Status BindContextOutput(Ort::KernelContext& ctx, const char* output_name, size_t output_index, size_t output_type, - size_t i, - std::unordered_map& output_tensors, - std::unordered_map& output_dim_sizes, DDSOutputAllocatorMap& dds_output_allocator_map, std::vector>& scratch_buffers, OrtAllocator* alloc, - std::unordered_map& buffers, nvinfer1::Dims& dims, - void*& data_ptr, - bool& skip_output_binding_allowed) { + void*& data_ptr) { // Get output shape dims = trt_context->getTensorShape(output_name); int nb_dims = dims.nbDims; @@ -628,10 +550,11 @@ Status BindContextOutput(Ort::KernelContext& ctx, data_ptr = nullptr; // Set data_ptr to nullptr for DDS output binding. } } else { - output_tensors[i] = ctx.GetOutput(output_index, dims.d, nb_dims); - auto& output_tensor = output_tensors[i]; + auto output_tensor = ctx.GetOutput(output_index, dims.d, nb_dims); const auto elem_cnt = output_tensor.GetTensorTypeAndShapeInfo().GetElementCount(); + void* buffer = nullptr; + switch (output_type) { // below macros set data_ptr and skip_output_binding_allowed variables CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, float) @@ -642,13 +565,12 @@ Status BindContextOutput(Ort::KernelContext& ctx, CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8, uint8_t) CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, int32_t) CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, int64_t) - CASE_GET_CAST_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE, double, float) default: { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "NvTensorRTRTX EP output tensor data type: " + std::to_string(output_type) + " not supported."); } } - trt_context->setTensorAddress(output_name, buffers[output_name]); + trt_context->setTensorAddress(output_name, buffer); } return Status::OK(); @@ -705,7 +627,6 @@ Status BindKernelOutput(Ort::KernelContext& ctx, CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8, uint8_t) CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, int32_t) CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, int64_t) - CASE_CAST_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE, float, double) default: { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "NvTensorRTRTX EP output tensor data type: " + std::to_string(output_type) + " not supported."); @@ -715,10 +636,10 @@ Status BindKernelOutput(Ort::KernelContext& ctx, } NvExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId device_id, bool has_user_compute_stream, cudaStream_t 
stream) {
-  // TODO: figure out if PerThreadContext is used at all. If not, just clean it up.
+  // Only set the device when the user has provided a compute stream.
   if (has_user_compute_stream) {
     CUDA_CALL_THROW(cudaSetDevice(device_id));
-    (void)(stream);
+    (void)stream;
   }
 }
@@ -745,6 +666,86 @@ bool NvExecutionProvider::PerThreadContext::UpdateTensorRTContext(std::string fu
   return false;
 }
 
+void NvExecutionProvider::PerThreadContext::DeleteCapturedGraph(CudaGraphAnnotation_t cuda_graph_annotation_id) {
+  graph_id_to_run_count_.erase(cuda_graph_annotation_id);
+  cuda_graph_.Reset();
+}
+
+void NvExecutionProvider::PerThreadContext::ResetWarmupRuns(CudaGraphAnnotation_t cuda_graph_annotation_id) {
+  if (graph_id_to_run_count_.find(cuda_graph_annotation_id) == graph_id_to_run_count_.end()) {
+    return;
+  }
+  graph_id_to_run_count_[cuda_graph_annotation_id] = 0;
+}
+
+bool NvExecutionProvider::PerThreadContext::IsGraphCaptureAllowed(CudaGraphAnnotation_t cuda_graph_annotation_id) const {
+  if (!IsGraphCaptureAllowedOnRun(cuda_graph_annotation_id)) {
+    return false;
+  }
+
+  // Safe access to map - return false if key doesn't exist yet
+  auto it = graph_id_to_run_count_.find(cuda_graph_annotation_id);
+  if (it == graph_id_to_run_count_.end()) {
+    return false;  // Entry doesn't exist yet, not ready for capture
+  }
+
+  bool allowed = it->second >= min_num_runs_before_cuda_graph_capture_;
+  if (allowed) {
+    LOGS_DEFAULT(VERBOSE) << "NvTensorRTRTX EP Graph capture allowed for ID: " << cuda_graph_annotation_id
+                          << ", run count: " << it->second;
+  }
+  return allowed;
+}
+
+bool NvExecutionProvider::PerThreadContext::IsGraphCaptureAllowedOnRun(CudaGraphAnnotation_t cuda_graph_annotation_id) const {
+  return cuda_graph_.IsGraphCaptureAllowedOnRun(cuda_graph_annotation_id);
+}
+
+CudaGraphAnnotation_t NvExecutionProvider::PerThreadContext::GetCudaGraphAnnotationId(const onnxruntime::RunOptions& run_options) const {
+  // Read the CUDA graph annotation id from the run options.
+  auto graph_annotation_str = run_options.GetConfigOptions().GetConfigEntry(kOrtRunOptionsConfigCudaGraphAnnotation);
+  CudaGraphAnnotation_t cuda_graph_annotation_id = kCudaGraphAnnotationDefault;
+
+  // Minimal parsing for now; this can be hardened to match the CUDA EP.
+  if (graph_annotation_str.has_value() && !graph_annotation_str->empty()) {
+    if (!TryParseStringWithClassicLocale(*graph_annotation_str, cuda_graph_annotation_id)) {
+      LOGS_DEFAULT(WARNING) << "[NvTensorRTRTX EP] Failed to parse cuda graph annotation id: "
+                            << *graph_annotation_str << ", using default: " << kCudaGraphAnnotationDefault;
+      cuda_graph_annotation_id = kCudaGraphAnnotationDefault;
+    }
+  }
+  return cuda_graph_annotation_id;
+}
+
+void NvExecutionProvider::PerThreadContext::SetCurrentGraphAnnotationId(CudaGraphAnnotation_t cuda_graph_annotation_id) {
+  current_graph_annotation_id_ = cuda_graph_annotation_id;
+}
+
+CudaGraphAnnotation_t NvExecutionProvider::PerThreadContext::GetCurrentGraphAnnotationId() const {
+  return current_graph_annotation_id_;
+}
+
+void NvExecutionProvider::PerThreadContext::CaptureBegin(CudaGraphAnnotation_t cuda_graph_annotation_id) {
+  cuda_graph_.Reset();
+  cuda_graph_.CaptureBegin(cuda_graph_annotation_id);
+}
+
+void NvExecutionProvider::PerThreadContext::CaptureEnd(CudaGraphAnnotation_t cuda_graph_annotation_id) {
+  cuda_graph_.CaptureEnd(cuda_graph_annotation_id);
+}
+
+bool NvExecutionProvider::PerThreadContext::IsGraphCaptured(CudaGraphAnnotation_t cuda_graph_annotation_id) const {
+  return cuda_graph_.IsGraphCaptured(cuda_graph_annotation_id);
+} + +Status NvExecutionProvider::PerThreadContext::ReplayGraph(CudaGraphAnnotation_t cuda_graph_annotation_id, bool sync_status_flag) { + return cuda_graph_.Replay(cuda_graph_annotation_id, sync_status_flag); +} + +void NvExecutionProvider::PerThreadContext::IncrementRegularRunCountBeforeGraphCapture(CudaGraphAnnotation_t cuda_graph_annotation_id) { + graph_id_to_run_count_[cuda_graph_annotation_id]++; +} + bool NvExecutionProvider::PerThreadContext::IsTensorRTContextInMap(std::string fused_node) { auto it = trt_context_map_.find(fused_node); if (it != trt_context_map_.end()) { @@ -846,6 +847,12 @@ NvExecutionProvider::NvExecutionProvider(const NvExecutionProviderInfo& info) if (info.has_user_compute_stream) { external_stream_ = true; stream_ = static_cast(info.user_compute_stream); + } else if (cuda_graph_enable_) { + external_stream_ = false; + CUDA_CALL_THROW(cudaStreamCreate(&stream_)); + } else { + external_stream_ = false; + stream_ = nullptr; // Will be created in compute function } std::string profile_min_shapes, profile_max_shapes, profile_opt_shapes; @@ -1010,7 +1017,7 @@ NvExecutionProvider::NvExecutionProvider(const NvExecutionProviderInfo& info) // external stream: // If user provides "external" cuda stream, only this cuda stream will be used even if multiple threads are running InferenceSession.Run() concurrently. // So, no need to synchronize different streams after enqueueV3. - if (cuda_graph_enable_ || external_stream_) { + if (external_stream_) { sync_stream_after_enqueue_ = false; } @@ -1038,7 +1045,7 @@ NvExecutionProvider::NvExecutionProvider(const NvExecutionProviderInfo& info) << ", nv_force_sequential_engine_build: " << force_sequential_engine_build_ << ", nv_sparsity_enable: " << sparsity_enable_ << ", nv_auxiliary_streams: " << auxiliary_streams_ - << ", nv_cuda_graph_enable: " << cuda_graph_enable_ + << ", enable_cuda_graph: " << cuda_graph_enable_ << ", nv_dump_ep_context_model: " << dump_ep_context_model_ << ", nv_ep_context_file_path: " << ep_context_file_path_ << ", nv_ep_context_embed_mode: " << ep_context_embed_mode_ @@ -1060,7 +1067,7 @@ NvExecutionProvider::~NvExecutionProvider() { } } - if (!external_stream_ && stream_) { + if (!external_stream_ && stream_ != nullptr) { ORT_IGNORE_RETURN_VALUE(CUDA_CALL(cudaStreamDestroy(stream_))); } ReleaseTensorRTCustomOpDomainList(info_.custom_op_domain_list); @@ -1072,41 +1079,82 @@ NvExecutionProvider::~NvExecutionProvider() { } } +void NvExecutionProvider::HandleCudaGraphStart(cudaStream_t stream, bool require_io_binding, + CudaGraphAnnotation_t cuda_graph_annotation_id, bool& graph_replay_on_this_run, bool& should_start_capture) { + graph_replay_on_this_run = false; + should_start_capture = false; + + // Case 1: CUDA Graph capture is enabled AND IO binding is required. + // In this case, we force graph re-capture by resetting warmup runs. + // If a graph for this annotation ID already exists, delete it before proceeding. 
+ if (require_io_binding && cuda_graph_enable_) { + GetPerThreadContext().ResetWarmupRuns(cuda_graph_annotation_id); + + if (GetPerThreadContext().IsGraphCaptured(cuda_graph_annotation_id)) { + LOGS_DEFAULT(WARNING) << "[NvTensorRTRTX EP] Graph already captured and required_io_binding is true, resetting warmup runs and deleting graph"; + GetPerThreadContext().DeleteCapturedGraph(cuda_graph_annotation_id); + } + // Case 2: CUDA Graph capture is enabled AND IO binding is NOT required + } else if (cuda_graph_enable_ && !require_io_binding) { + // If the graph is not yet captured, increment the regular run counter + if (cuda_graph_annotation_id != kCudaGraphAnnotationSkip && + !GetPerThreadContext().IsGraphCaptured(cuda_graph_annotation_id)) { + GetPerThreadContext().IncrementRegularRunCountBeforeGraphCapture(cuda_graph_annotation_id); + } + + // If capture is allowed and graph not already captured, + // set the stream and begin capture + if (!GetPerThreadContext().IsGraphCaptured(cuda_graph_annotation_id) && + GetPerThreadContext().IsGraphCaptureAllowed(cuda_graph_annotation_id)) { + GetPerThreadContext().SetCudaGraphStream(stream); + GetPerThreadContext().CaptureBegin(cuda_graph_annotation_id); + should_start_capture = true; + } + + // If a graph is already captured for this ID, mark it for replay in this run. + if (GetPerThreadContext().IsGraphCaptured(cuda_graph_annotation_id)) { + graph_replay_on_this_run = true; + } + } +} + bool NvExecutionProvider::IsGraphCaptureEnabled() const { return cuda_graph_enable_; } -bool NvExecutionProvider::IsGraphCaptureAllowed() const { - return regular_run_count_before_graph_capture_ >= min_num_runs_before_cuda_graph_capture_; +bool NvExecutionProvider::IsGraphCaptured(int graph_annotation_id) const { + // This is hardcoded to always return false because we are not allowing the ORT framework to have the CUDA graph control. + (void)graph_annotation_id; + return false; } -void NvExecutionProvider::CaptureBegin(int) { - cuda_graph_.Reset(); - cuda_graph_.CaptureBegin(0); +Status NvExecutionProvider::ReplayGraph(int graph_annotation_id) { + // This is hardcoded to always return OK because we are not allowing the ORT framework to have the CUDA graph control. + (void)graph_annotation_id; + return Status::OK(); } -void NvExecutionProvider::CaptureEnd(int) { - cuda_graph_.CaptureEnd(0); - is_graph_captured_ = true; -} +Status NvExecutionProvider::OnRunStart(const onnxruntime::RunOptions& run_options) { + if (cuda_graph_enable_) { + CudaGraphAnnotation_t cuda_graph_annotation_id = GetPerThreadContext().GetCudaGraphAnnotationId(run_options); + GetPerThreadContext().SetCurrentGraphAnnotationId(cuda_graph_annotation_id); + } -bool NvExecutionProvider::IsGraphCaptured(int) const { - return is_graph_captured_; + if (multi_profile_enable_ == true) { + auto graph_annotation_str = + run_options.GetConfigOptions().GetConfigEntry(nv::run_option_names::kProfileIndex); + TryParseStringWithClassicLocale(*graph_annotation_str, nv_profile_index_); + } + return Status::OK(); } -Status NvExecutionProvider::ReplayGraph(int) { - ORT_ENFORCE(IsGraphCaptured(0)); - // Please note that CUDAGraph::Replay() is not thread safe. - // ORT TRT calls ReplayGraph() in compute_func() where synchronization is enforced due to lock_guard(), - // therefore calling CUDAGraph::Replay() here is guaranteed to be thread safe. 
- return cuda_graph_.Replay(0); -} +Status NvExecutionProvider::OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& run_options) { + (void)run_options; -void NvExecutionProvider::IncrementRegularRunCountBeforeGraphCapture() { - // Please note that this function is not thread safe. - // ORT TRT calls this function in compute_func() where synchronization is enforced due to lock_guard(), - // therefore following increment is guaranteed to be thread safe. - ++regular_run_count_before_graph_capture_; + if (sync_stream && external_stream_) { + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream_)); + } + return Status::OK(); } std::vector NvExecutionProvider::CreatePreferredAllocators() { @@ -1133,22 +1181,6 @@ std::unique_ptr NvExecutionProvider::GetDataTransfer() const { return std::make_unique(); } -Status NvExecutionProvider::OnRunStart(const onnxruntime::RunOptions& run_options) { - if (multi_profile_enable_ == true) { - auto graph_annotation_str = - run_options.GetConfigOptions().GetConfigEntry(nv::run_option_names::kProfileIndex); - TryParseStringWithClassicLocale(*graph_annotation_str, nv_profile_index_); - } - return Status::OK(); -} - -Status NvExecutionProvider::OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& /*run_options*/) { - if (sync_stream && external_stream_) { - CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream_)); - } - return Status::OK(); -} - // Get the pointer to the IBuilder instance. // Note: This function is not thread safe. Calls to this function from different threads must be serialized // even though it doesn't make sense to have multiple threads initializing the same inference session. @@ -2519,7 +2551,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr profile_opt_shapes_.find(input_name) != profile_opt_shapes_.end() && profile_max_shapes_.find(input_name) != profile_max_shapes_.end(); if (has_explicit_profile && tensor_has_profile) { - apply_profile = ApplyProfileShapesFromProviderOptions(trt_profiles, input, profile_min_shapes_, profile_max_shapes_, profile_opt_shapes_, input_explicit_shape_ranges); + apply_profile = ApplyProfileShapesFromProviderOptions(trt_profiles, input, profile_min_shapes_, profile_max_shapes_, profile_opt_shapes_, input_explicit_shape_ranges, cuda_graph_enable_); } else { LOGS_DEFAULT(INFO) << "[NvTensorRTRTX EP] Creating implicit profile for tensor " << input_name; profile_min_shapes_[input_name] = std::vector>{{}}; @@ -2546,7 +2578,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr profile_max_shapes_[input_name][0][idx_dim] = dim_value; } } - apply_profile = ApplyProfileShapesFromProviderOptions(trt_profiles, input, profile_min_shapes_, profile_max_shapes_, profile_opt_shapes_, input_explicit_shape_ranges); + apply_profile = ApplyProfileShapesFromProviderOptions(trt_profiles, input, profile_min_shapes_, profile_max_shapes_, profile_opt_shapes_, input_explicit_shape_ranges, cuda_graph_enable_); } if (!apply_profile) { std::ostringstream msg; @@ -2600,6 +2632,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr // Otherwise engine will be handled at inference time. 
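// ---------------------------------------------------------------------------------------------
// Caller-side sketch (not part of this change): selecting a CUDA graph per run. OnRunStart()
// above reads the annotation id from the run-option key kOrtRunOptionsConfigCudaGraphAnnotation
// ("gpu_graph_id"); an id equal to kCudaGraphAnnotationSkip bypasses capture (see
// HandleCudaGraphStart()). The session is assumed to already be created with this EP and its
// CUDA-graph option enabled; names and values below are illustrative only.
// ---------------------------------------------------------------------------------------------
#include <onnxruntime_cxx_api.h>
#include <onnxruntime_run_options_config_keys.h>

void run_with_graph_annotation(Ort::Session& session,
                               const char* const* input_names, const Ort::Value* inputs, size_t num_inputs,
                               const char* const* output_names, size_t num_outputs) {
  Ort::RunOptions run_options;
  // "1" keys the per-thread graph with annotation id 1; GetCudaGraphAnnotationId() parses it
  // with TryParseStringWithClassicLocale().
  run_options.AddConfigEntry(kOrtRunOptionsConfigCudaGraphAnnotation, "1");
  // The first runs with this id are warm-up runs; once the per-annotation run count reaches the
  // capture threshold the EP captures the graph, and later runs with the same id replay it.
  auto outputs = session.Run(run_options, input_names, inputs, num_inputs, output_names, num_outputs);
  (void)outputs;
}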
std::unique_ptr trt_engine; std::unique_ptr trt_context; + std::unique_ptr trt_runtime_config; // Generate file name for dumping ep context model if (dump_ep_context_model_ && ctx_model_path_.empty()) { @@ -2622,6 +2655,13 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "NvTensorRTRTX EP failed to deserialize engine for fused node: " + fused_node.Name()); } + + trt_runtime_config = std::unique_ptr(trt_engine->createRuntimeConfig()); + if (trt_runtime_config && cuda_graph_enable_) { + trt_runtime_config->setDynamicShapesKernelSpecializationStrategy(nvinfer1::DynamicShapesKernelSpecializationStrategy::kEAGER); + } + trt_runtime_config->setExecutionContextAllocationStrategy(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED); + if (detailed_build_log_) { auto engine_build_stop = std::chrono::steady_clock::now(); LOGS_DEFAULT(INFO) << "TensorRT engine build for " << fused_node.Name() << " took: " << std::chrono::duration_cast(engine_build_stop - engine_build_start).count() << "ms" << std::endl; @@ -2681,7 +2721,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr // Build context // Note: Creating an execution context from an engine is thread safe per TRT doc // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading - trt_context = std::unique_ptr(trt_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED)); + trt_context = std::unique_ptr(trt_engine->createExecutionContext(trt_runtime_config.get())); if (!trt_context) { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "NvTensorRTRTX EP could not build execution context for fused node: " + fused_node.Name()); @@ -2712,7 +2752,6 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr } // Save TRT engine, other TRT objects and input/output info to map - parsers_.emplace(fused_node.Name(), std::move(trt_parser)); engines_.emplace(fused_node.Name(), std::move(trt_engine)); contexts_.emplace(fused_node.Name(), std::move(trt_context)); networks_.emplace(fused_node.Name(), std::move(trt_network)); @@ -2728,7 +2767,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr compute_info.create_state_func = [=](ComputeContext* context, FunctionState* state) { std::unique_ptr p = std::make_unique(); *p = {context->allocate_func, context->release_func, context->allocator_handle, context->node_name, builder_.get(), - &parsers_[context->node_name], &engines_[context->node_name], &contexts_[context->node_name], + &engines_[context->node_name], &contexts_[context->node_name], &networks_[context->node_name], input_info_[context->node_name], output_info_[context->node_name], input_shape_ranges_[context->node_name], &tensorrt_mu_, engine_cache_enable_, cache_path_, @@ -2766,7 +2805,6 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr auto trt_engine = trt_state->engine->get(); auto trt_context = trt_state->context->get(); auto trt_profiles = trt_state->profiles; - int num_outputs = static_cast(output_indexes.size()); std::unordered_set input_names; if (alloc_ == nullptr) { @@ -2777,9 +2815,17 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr } OrtAllocator* alloc = alloc_; - void* cuda_stream; - Ort::ThrowOnError(api->KernelContext_GetGPUComputeStream(context, &cuda_stream)); - cudaStream_t stream = static_cast(cuda_stream); + cudaStream_t stream; + if (stream_ 
!= nullptr) { + // Use our existing stream (either user's or our early-created) + stream = stream_; + } else { + // Create stream now (lazy creation case) + void* cuda_stream; + Ort::ThrowOnError(api->KernelContext_GetGPUComputeStream(context, &cuda_stream)); + stream = static_cast(cuda_stream); + stream_ = stream; + } if (multi_profile_enable_ == true) { if (!trt_context->setOptimizationProfileAsync(nv_profile_index_, stream)) @@ -2833,16 +2879,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr /* * Set output shapes and bind output buffers */ - std::unordered_map buffers; - buffers.reserve(num_outputs); - using OutputOrtValue = Ort::UnownedValue; - std::unordered_map output_tensors; - output_tensors.reserve(num_outputs); - std::unordered_map output_dim_sizes; - output_dim_sizes.reserve(num_outputs); - if (require_io_binding) { - bool skip_output_binding_allowed = true; for (size_t i = 0, end = output_binding_names.size(); i < end; ++i) { char const* output_name = output_binding_names[i]; @@ -2861,16 +2898,14 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr nvinfer1::Dims dims; void* data_ptr = nullptr; - Status status = BindContextOutput(ctx, trt_context, output_name, output_index, output_type, i, output_tensors, output_dim_sizes, - dds_output_allocator_map, scratch_buffers, alloc, buffers, dims, data_ptr, skip_output_binding_allowed); + Status status = BindContextOutput(ctx, trt_context, output_name, output_index, output_type, + dds_output_allocator_map, scratch_buffers, alloc, dims, data_ptr); if (status != Status::OK()) { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); } trt_state->output_tensors[output_index] = TensorParams{data_ptr, dims}; } - - trt_state->skip_io_binding_allowed = trt_state->skip_io_binding_allowed | skip_output_binding_allowed; } // Set execution context memory @@ -2886,18 +2921,23 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr trt_context->setDeviceMemoryV2(trt_state->context_memory.get(), mem_size); } } - // Start CUDA graph capture. - // Note: The reason we don't put graph capture in OnRunStart() like CUDA EP does is because - // current ORT TRT doesn't get cuda stream until compute time and graph capture requires cuda stream. 
- if (cuda_graph_enable_ && IsGraphCaptureAllowed() && !IsGraphCaptured(0)) { - LOGS_DEFAULT(INFO) << "Capturing the cuda graph for this model"; - cuda_graph_.SetStream(stream); - CaptureBegin(0); - } - // Run TRT inference - if (!trt_context->enqueueV3(stream)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "NvTensorRTRTX EP execution context enqueue failed."); + // Start CUDA graph capture with the correct stream + // Note: We need to set the stream and start capture here because this is where we have access to the actual compute stream + // Get the graph annotation ID that was stored during OnRunStart + CudaGraphAnnotation_t cuda_graph_annotation_id = GetPerThreadContext().GetCurrentGraphAnnotationId(); + bool graph_replay_on_this_run = false; + bool should_start_capture = false; + + HandleCudaGraphStart(stream, require_io_binding, cuda_graph_annotation_id, + graph_replay_on_this_run, should_start_capture); + + if (!graph_replay_on_this_run) { + if (!trt_context->enqueueV3(stream)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "NvTensorRTRTX EP execution context enqueue failed."); + } + } else { + ORT_RETURN_IF_ERROR(GetPerThreadContext().ReplayGraph(cuda_graph_annotation_id, sync_stream_after_enqueue_)); } /* @@ -2914,10 +2954,15 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr * Therefore, TRT EP needs to call cudaStreamSynchronize() which means to wait until stream has completed all operations to prevent the concurrent issue mentioned above. * However, if cuda graph is enabled, TRT EP won't call cudaStreamSynchronize() since it's not allowed during graph capture. */ + + if (cuda_graph_enable_ && should_start_capture) { + GetPerThreadContext().CaptureEnd(cuda_graph_annotation_id); + ORT_RETURN_IF_ERROR(GetPerThreadContext().ReplayGraph(cuda_graph_annotation_id, sync_stream_after_enqueue_)); + } + if (sync_stream_after_enqueue_) { CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream)); } - // Assign TRT output back to ORT output // (1) Bind TRT DDS output to ORT kernel context output. (It needs to wait until enqueueV3 is finished) // (2) Cast TRT INT32 output to ORT INT64 output or TRT double output to float output @@ -2940,29 +2985,6 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr if (status != Status::OK()) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, status.ErrorMessage()); } - } else { - auto& output_tensor = output_tensors[i]; - if (output_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE) { - auto output_tensor_ptr = output_tensor.GetTensorMutableData(); - if (output_tensor_ptr != nullptr) { - cuda::Impl_Cast(stream, reinterpret_cast(buffers[output_name]), output_tensor_ptr, output_dim_sizes[i]); - } - } - } - } - - // End CUDA graph capture. - // Note: One reason we don't put end of graph capture in OnRunEnd() like CUDA EP does is because of cuda stream mentioned in graph capture - // above, another reason is because OnRunEnd() is not synchronized with OnRunStart() and ExecuteGraph() per inference_session.cc. - // It's safe to start/end CUDA graph capture in compute_func() here since cuda graph object is maintained by a per thread basis. - if (cuda_graph_enable_ && !IsGraphCaptured(0)) { - if (IsGraphCaptureAllowed()) { - CaptureEnd(0); - // CUDA work issued to a capturing stream doesn't actually run on the GPU, - // so run the captured graph here to actually execute the work. 
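// ---------------------------------------------------------------------------------------------
// Standalone illustration (not part of this change) of the stream-capture behaviour the removed
// comment above describes: work enqueued on a capturing stream is only recorded, not executed,
// which is why CaptureEnd() is immediately followed by ReplayGraph() in the new code path.
// Minimal sketch assuming the CUDA 12.x three-argument cudaGraphInstantiate(); a memset stands
// in for the enqueueV3() call.
// ---------------------------------------------------------------------------------------------
#include <cuda_runtime.h>
#include <cstddef>
#include <cstdio>
#include <cstdlib>

#define CUDA_OK(expr)                                                            \
  do {                                                                           \
    cudaError_t err_ = (expr);                                                    \
    if (err_ != cudaSuccess) {                                                    \
      std::fprintf(stderr, "%s failed: %s\n", #expr, cudaGetErrorString(err_));   \
      std::exit(1);                                                               \
    }                                                                             \
  } while (0)

void capture_and_replay_once(void* device_buffer, size_t bytes) {
  cudaStream_t stream;
  CUDA_OK(cudaStreamCreate(&stream));

  CUDA_OK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal));
  CUDA_OK(cudaMemsetAsync(device_buffer, 0, bytes, stream));  // recorded into the graph only

  cudaGraph_t graph;
  CUDA_OK(cudaStreamEndCapture(stream, &graph));

  cudaGraphExec_t graph_exec;
  CUDA_OK(cudaGraphInstantiate(&graph_exec, graph, 0));  // CUDA 12.x signature

  CUDA_OK(cudaGraphLaunch(graph_exec, stream));  // only now does the memset actually execute
  CUDA_OK(cudaStreamSynchronize(stream));

  CUDA_OK(cudaGraphExecDestroy(graph_exec));
  CUDA_OK(cudaGraphDestroy(graph));
  CUDA_OK(cudaStreamDestroy(stream));
}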
- ORT_RETURN_IF_ERROR(ReplayGraph(0)); - } else { - IncrementRegularRunCountBeforeGraphCapture(); } } @@ -3086,7 +3108,6 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra auto& dds_output_allocator_map = this->dds_output_allocator_maps_[fused_node_name]; auto trt_engine = trt_state->engine->get(); auto trt_context = trt_state->context->get(); - int num_outputs = static_cast(output_indexes.size()); std::unordered_map> shape_tensor_values; // This map holds "shape tensor -> shape values" for the shape tensor input across this inference run std::unordered_map> shape_tensor_values_int64; // same as above but for int64 shape tensor input @@ -3098,9 +3119,16 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra } OrtAllocator* alloc = alloc_; - void* cuda_stream; - Ort::ThrowOnError(api->KernelContext_GetGPUComputeStream(context, &cuda_stream)); - cudaStream_t stream = static_cast(cuda_stream); + cudaStream_t stream; + if (stream_ != nullptr) { + // Use our existing stream (either user's or our early-created) + stream = stream_; + } else { + // Create stream now (lazy creation case) + void* cuda_stream; + Ort::ThrowOnError(api->KernelContext_GetGPUComputeStream(context, &cuda_stream)); + stream = static_cast(cuda_stream); + } // Check before using trt_engine if (trt_engine == nullptr) { @@ -3149,16 +3177,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra /* * Set output shapes and bind output buffers */ - std::unordered_map buffers; - buffers.reserve(num_outputs); - using OutputOrtValue = Ort::UnownedValue; - std::unordered_map output_tensors; - output_tensors.reserve(num_outputs); - std::unordered_map output_dim_sizes; - output_dim_sizes.reserve(num_outputs); - if (require_io_binding) { - bool skip_output_binding_allowed = true; for (size_t i = 0, end = output_binding_names.size(); i < end; ++i) { char const* output_name = output_binding_names[i]; @@ -3177,16 +3196,14 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra nvinfer1::Dims dims; void* data_ptr = nullptr; - Status status = BindContextOutput(ctx, trt_context, output_name, output_index, output_type, i, output_tensors, output_dim_sizes, - dds_output_allocator_map, scratch_buffers, alloc, buffers, dims, data_ptr, skip_output_binding_allowed); + Status status = BindContextOutput(ctx, trt_context, output_name, output_index, output_type, + dds_output_allocator_map, scratch_buffers, alloc, dims, data_ptr); if (status != Status::OK()) { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); } trt_state->output_tensors[output_index] = TensorParams{data_ptr, dims}; } - - trt_state->skip_io_binding_allowed = trt_state->skip_io_binding_allowed | skip_output_binding_allowed; } // Set execution context memory @@ -3203,18 +3220,23 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra trt_context->setDeviceMemoryV2(trt_state->context_memory.get(), mem_size); } } - // Start CUDA graph capture. - // Note: The reason we don't put graph capture in OnRunStart() like CUDA EP does is because - // current ORT TRT doesn't get cuda stream until compute time and graph capture requires cuda stream. 
- if (cuda_graph_enable_ && IsGraphCaptureAllowed() && !IsGraphCaptured(0)) { - LOGS_DEFAULT(INFO) << "Capturing the cuda graph for this model"; - cuda_graph_.SetStream(stream); - CaptureBegin(0); - } - // Run TRT inference - if (!trt_context->enqueueV3(stream)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "NvTensorRTRTX EP execution context enqueue failed."); + // Start CUDA graph capture with the correct stream + // Note: We need to set the stream and start capture here because this is where we have access to the actual compute stream + // Get the graph annotation ID that was stored during OnRunStart + CudaGraphAnnotation_t cuda_graph_annotation_id = GetPerThreadContext().GetCurrentGraphAnnotationId(); + bool graph_replay_on_this_run = false; + bool should_start_capture = false; + + HandleCudaGraphStart(stream, require_io_binding, cuda_graph_annotation_id, + graph_replay_on_this_run, should_start_capture); + + if (!graph_replay_on_this_run) { + if (!trt_context->enqueueV3(stream)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "NvTensorRTRTX EP execution context enqueue failed."); + } + } else { + ORT_RETURN_IF_ERROR(GetPerThreadContext().ReplayGraph(cuda_graph_annotation_id, sync_stream_after_enqueue_)); } /* @@ -3231,10 +3253,15 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra * Therefore, TRT EP needs to call cudaStreamSynchronize() which means to wait until stream has completed all operations to prevent the concurrent issue mentioned above. * However, if cuda graph is enabled, TRT EP won't call cudaStreamSynchronize() since it's not allowed during graph capture. */ + + if (cuda_graph_enable_ && should_start_capture) { + GetPerThreadContext().CaptureEnd(cuda_graph_annotation_id); + ORT_RETURN_IF_ERROR(GetPerThreadContext().ReplayGraph(cuda_graph_annotation_id, sync_stream_after_enqueue_)); + } + if (sync_stream_after_enqueue_) { CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream)); } - // Assign TRT output back to ORT output // (1) Bind TRT DDS output to ORT kernel context output. (It needs to wait until enqueueV3 is finished) // (2) Cast TRT INT32 output to ORT INT64 output or TRT double output to float output @@ -3257,29 +3284,6 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra if (status != Status::OK()) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, status.ErrorMessage()); } - } else { - auto& output_tensor = output_tensors[i]; - if (output_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE) { - auto output_tensor_ptr = output_tensor.GetTensorMutableData(); - if (output_tensor_ptr != nullptr) { - cuda::Impl_Cast(stream, reinterpret_cast(buffers[output_name]), output_tensor_ptr, output_dim_sizes[i]); - } - } - } - } - - // End CUDA graph capture. - // Note: One reason we don't put end of graph capture in OnRunEnd() like CUDA EP does is because of cuda stream mentioned in graph capture - // above, another reason is because OnRunEnd() is not synchronized with OnRunStart() and ExecuteGraph() per inference_session.cc. - // It's safe to start/end CUDA graph capture in compute_func() here since cuda graph object is maintained by a per thread basis. - if (cuda_graph_enable_ && !IsGraphCaptured(0)) { - if (IsGraphCaptureAllowed()) { - CaptureEnd(0); - // CUDA work issued to a capturing stream doesn't actually run on the GPU, - // so run the captured graph here to actually execute the work. 
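// ---------------------------------------------------------------------------------------------
// Caller-side sketch (not part of this change): producing the EPContext model that the
// precompiled-engine path above consumes on a later load. The config keys are the standard ones
// from onnxruntime_session_options_config_keys.h; the EP registration step and file names are
// illustrative assumptions.
// ---------------------------------------------------------------------------------------------
#include <onnxruntime_cxx_api.h>
#include <onnxruntime_session_options_config_keys.h>

void dump_ep_context_model(Ort::Env& env, Ort::SessionOptions& so /* already has this EP registered */) {
  so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1");                 // "ep.context_enable"
  so.AddConfigEntry(kOrtSessionOptionEpContextFilePath, "model_ctx.onnx");  // "ep.context_file_path"
  so.AddConfigEntry(kOrtSessionOptionEpContextEmbedMode, "0");              // keep the engine beside the model
  // Building this session compiles the TRT engine and writes model_ctx.onnx; sessions created
  // from model_ctx.onnx afterwards take CreateNodeComputeInfoFromPrecompiledEngine().
  Ort::Session session(env, ORT_TSTR("model.onnx"), so);  // hypothetical source model
  (void)session;
}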
- ORT_RETURN_IF_ERROR(ReplayGraph(0)); - } else { - IncrementRegularRunCountBeforeGraphCapture(); } } diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.h b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.h index e3dd38eb837ff..9e5fd03756f02 100644 --- a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.h +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.h @@ -12,7 +12,7 @@ typedef void* cublasHandle_t; typedef void* cudnnStatus_t; #endif #include "core/providers/nv_tensorrt_rtx/nv_includes.h" - +#include "core/session/onnxruntime_run_options_config_keys.h" #include #include "core/providers/cuda/cuda_graph.h" #include "nv_execution_provider_info.h" @@ -195,7 +195,6 @@ struct TensorrtFuncState { AllocatorHandle allocator = nullptr; std::string fused_node_name; nvinfer1::IBuilder* builder; - tensorrt_ptr::unique_pointer* parser = nullptr; std::unique_ptr* engine = nullptr; std::unique_ptr* context = nullptr; std::unique_ptr* network = nullptr; @@ -305,9 +304,11 @@ class NvExecutionProvider : public IExecutionProvider { std::vector CreatePreferredAllocators() override; + // CUDA Graph support bool IsGraphCaptureEnabled() const override; bool IsGraphCaptured(int graph_annotation_id) const override; Status ReplayGraph(int graph_annotation_id) override; + void HandleCudaGraphStart(cudaStream_t stream, bool require_io_binding, CudaGraphAnnotation_t cuda_graph_annotation_id, bool& graph_replay_on_this_run, bool& should_start_capture); static common::Status RefitEngine(std::string onnx_model_filename, std::string& onnx_model_folder_path, @@ -384,7 +385,6 @@ class NvExecutionProvider : public IExecutionProvider { // In general, TensorRT objects are not thread safe; accesses to an object from different threads must be serialized by the client. // But there are still some thread safe operations, please see here https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading // For those non thread safe operations, TRT EP uses (1) lock_guard or (2) PerThreadContext to make sure synchronization. - std::unordered_map> parsers_; std::unordered_map> engines_; std::unordered_map> contexts_; std::unordered_map> builders_; @@ -405,15 +405,6 @@ class NvExecutionProvider : public IExecutionProvider { // Call cudaStreamSynchronize() after TRT enqueueV3() mutable bool sync_stream_after_enqueue_ = true; - CUDAGraph cuda_graph_; - bool is_graph_captured_ = false; - int regular_run_count_before_graph_capture_ = 0; - // There is chance (currently only happens in CUDA EP) that the second regular run allocates GPU memory for causes like: - // (1) memory pattern is enabled. (2) arena allocation for stream. - // Since no GPU memory allocation is allowed during graph capturing, we need at least two regular runs - // to allocate enough memory in Arena before graph capturing. - const int min_num_runs_before_cuda_graph_capture_ = 1; // required min regular runs before graph capture for the necessary memory allocations. - // [Note] We don't use PerThreadContext for now since it has issue with multithreading // // TRT or CUDA objects that must be maintained on a per thread basis will be put under this PerThreadContext data structure. 
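// ---------------------------------------------------------------------------------------------
// Reviewer-oriented distillation (not part of this change) of the per-annotation warm-up gate the
// header changes below introduce (graph_id_to_run_count_ + min_num_runs_before_cuda_graph_capture_).
// Only the counting logic is modelled; the real implementation lives in PerThreadContext.
// ---------------------------------------------------------------------------------------------
#include <unordered_map>

using CudaGraphAnnotationId = int;  // stand-in for CudaGraphAnnotation_t

class WarmupGate {
 public:
  explicit WarmupGate(int min_runs_before_capture) : min_runs_(min_runs_before_capture) {}

  // Called once per regular (non-replayed) run of a given annotation id.
  void RecordRegularRun(CudaGraphAnnotationId id) { ++run_count_[id]; }

  // Capture is allowed only after enough regular runs for that id; an id with no entry yet is
  // not ready, mirroring IsGraphCaptureAllowed() above.
  bool CaptureAllowed(CudaGraphAnnotationId id) const {
    auto it = run_count_.find(id);
    return it != run_count_.end() && it->second >= min_runs_;
  }

  // Forcing re-capture (e.g. when IO binding changes) resets the counter, as ResetWarmupRuns() does.
  void Reset(CudaGraphAnnotationId id) { run_count_[id] = 0; }

 private:
  int min_runs_;
  std::unordered_map<CudaGraphAnnotationId, int> run_count_;
};

// With min_runs == 2: two RecordRegularRun(1) calls make CaptureAllowed(1) true, while
// CaptureAllowed(7) stays false until id 7 has seen its own warm-up runs.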
@@ -436,14 +427,20 @@ class NvExecutionProvider : public IExecutionProvider { bool UpdateTensorRTContext(std::string fused_node, std::unique_ptr context); void ResetTensorRTContext(std::string fused_node); - void InitCUDAGraph(); - void SetGraphStream(cudaStream_t stream); - bool IsGraphCaptureAllowed() const; - void CaptureBegin(int graph_annotation_id); - void CaptureEnd(int graph_annotation_id); - bool IsGraphCaptured(int graph_annotation_id) const; - Status ReplayGraph(int graph_annotation_id); - void IncrementRegularRunCountBeforeGraphCapture(); + // CUDA Graph management + void SetCudaGraphStream(cudaStream_t stream) { cuda_graph_.SetStream(stream); } + bool IsGraphCaptureAllowed(CudaGraphAnnotation_t cuda_graph_annotation_id) const; + bool IsGraphCaptureAllowedOnRun(CudaGraphAnnotation_t cuda_graph_annotation_id) const; + CudaGraphAnnotation_t GetCudaGraphAnnotationId(const onnxruntime::RunOptions& run_options) const; + void SetCurrentGraphAnnotationId(CudaGraphAnnotation_t cuda_graph_annotation_id); + CudaGraphAnnotation_t GetCurrentGraphAnnotationId() const; + void CaptureBegin(CudaGraphAnnotation_t cuda_graph_annotation_id); + void CaptureEnd(CudaGraphAnnotation_t cuda_graph_annotation_id); + bool IsGraphCaptured(CudaGraphAnnotation_t cuda_graph_annotation_id) const; + Status ReplayGraph(CudaGraphAnnotation_t cuda_graph_annotation_id, bool sync_status_flag); + void IncrementRegularRunCountBeforeGraphCapture(CudaGraphAnnotation_t cuda_graph_annotation_id); + void ResetWarmupRuns(CudaGraphAnnotation_t cuda_graph_annotation_id); + void DeleteCapturedGraph(CudaGraphAnnotation_t cuda_graph_annotation_id); private: cudnnHandle_t external_cudnn_handle_ = nullptr; @@ -466,13 +463,18 @@ class NvExecutionProvider : public IExecutionProvider { // Cuda graph with multi threads will be supported in the future, so cuda_graph_ is put under PerThreadContext. // ORT TRT only supports CUDA graph when whole model is supported by TRT, so simply maintaining a CUDAGraph instance is enough (no need to maintain one CUDAGraph instance per TRT subgraph) CUDAGraph cuda_graph_; + // Map of graph id to regular_run_count_before_graph_capture + std::unordered_map graph_id_to_run_count_; bool is_graph_captured_ = false; int regular_run_count_before_graph_capture_ = 0; + // Current graph annotation ID for this run + CudaGraphAnnotation_t current_graph_annotation_id_ = kCudaGraphAnnotationDefault; // There is chance (currently only happens in CUDA EP) that the second regular run allocates GPU memory for causes like: // (1) memory pattern is enabled. (2) arena allocation for stream. // Since no GPU memory allocation is allowed during graph capturing, we need at least two regular runs // to allocate enough memory in Arena before graph capturing. - const int min_num_runs_before_cuda_graph_capture_ = 1; // required min regular runs before graph capture for the necessary memory allocations. + const int min_num_runs_before_cuda_graph_capture_ = 2; // required min regular runs before graph capture for the necessary memory allocations. 
+ // https://github.com/NVIDIA/TensorRT/blob/main/samples/common/sampleInference.cpp#L1258-L1291 Based on the trtexec code }; using PerThreadContextMap = std::unordered_map>; @@ -606,11 +608,6 @@ class NvExecutionProvider : public IExecutionProvider { std::unordered_map& output_map, std::vector& node_compute_funcs); - bool IsGraphCaptureAllowed() const; - void CaptureBegin(int graph_annotation_id); - void CaptureEnd(int graph_annotation_id); - void IncrementRegularRunCountBeforeGraphCapture(); - /** * Get the pointer to the IBuilder instance. * This function only creates the instance at the first time it's being called." diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index be59b1ae07020..68d15bdfdcee0 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -90,7 +90,12 @@ BackendManager::BackendManager(SessionContext& session_context, "[OpenVINO-EP] Bounded dynamic model execution using provider option reshape_input is not supported for OVEP EPContext model"; ORT_THROW(exception_str); } - model_stream = ep_ctx_handle_.GetModelBlobStream(session_context_.so_context_file_path, subgraph); + if (subgraph_context_.is_ep_ctx_ovir_encapsulated) { + model_stream = ep_ctx_handle_.GetModelBlobStream(session_context_.onnx_model_path_name.replace_extension("xml").string(), subgraph); + } else { + model_stream = ep_ctx_handle_.GetModelBlobStream(session_context_.so_context_file_path, subgraph); + } + } else { model_proto = GetModelProtoFromFusedNode(fused_node, subgraph, logger); } @@ -236,7 +241,9 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie std::ofstream blob_file(blob_filename, std::ios::out | std::ios::trunc | std::ios::binary); if (!blob_file) { - ORT_THROW("Unable to open file for epctx model dump."); + std::ostringstream err_msg; + err_msg << "Unable to open file for epctx model dump: " << blob_filename; + ORT_THROW(err_msg.str()); } compiled_model.export_model(blob_file); model_blob_str = blob_filename.filename().string(); @@ -375,6 +382,56 @@ static bool IsQDQGraph(const onnxruntime::GraphViewer& graph_viewer) { return false; } +static bool IsModelBF16(const onnxruntime::GraphViewer& graph_viewer) { + const auto& node_indices = graph_viewer.GetNodesInTopologicalOrder(); + for (std::size_t i = 0; i < node_indices.size(); i++) { + gsl::not_null node(graph_viewer.GetNode(node_indices[i])); + for (auto& output : node->OutputDefs()) { + if (output->ToProto().type().tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16) + return true; + } + } + return false; +} + +static bool Is16BitTensor(const onnxruntime::NodeArg* node_arg) { + const auto* type_proto = node_arg ? 
node_arg->TypeAsProto() : nullptr; + return type_proto && type_proto->has_tensor_type() && + (type_proto->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_UINT16 || + type_proto->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_INT16); +} + +// Check to see if the graph has Q/DQ nodes with int16 or uint16 quantization +static bool IsQDQGraphWithUint16OrInt16(const onnxruntime::GraphViewer& graph_viewer) { + std::unordered_set qdq_ops = {"QuantizeLinear", "DequantizeLinear"}; + const auto& node_indices = graph_viewer.GetNodesInTopologicalOrder(); + + for (size_t i = 0; i < node_indices.size(); i++) { + gsl::not_null node(graph_viewer.GetNode(node_indices[i])); + + if (qdq_ops.find(node->OpType()) != qdq_ops.end()) { + const auto& input_defs = node->InputDefs(); + + if (node->OpType() == "DequantizeLinear") { + // DequantizeLinear: [quantized_input, scale, zero_point] -> [float_output] + // Check quantized input tensor and optional zero point + if (Is16BitTensor(input_defs.empty() ? nullptr : input_defs[0]) || + (input_defs.size() >= 3 && Is16BitTensor(input_defs[2]))) { + return true; + } + } else if (node->OpType() == "QuantizeLinear") { + // QuantizeLinear: [float_input, scale, zero_point] -> [quantized_output] + const auto& output_defs = node->OutputDefs(); + if (Is16BitTensor(output_defs.empty() ? nullptr : output_defs[0]) || + (input_defs.size() >= 3 && Is16BitTensor(input_defs[2]))) { + return true; + } + } + } + } + return false; +} + static void DumpOpenVINOEPModel([[maybe_unused]] const std::filesystem::path& onnx_model_path_name, [[maybe_unused]] ONNX_NAMESPACE::ModelProto* model_proto, [[maybe_unused]] const onnxruntime::Node& fused_node) { @@ -433,6 +490,10 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, } #endif + // Check if the graph is QDQ and has int16 or uint16 quantization + // If so, we will apply the QDQ scales fix transformation (for GPU device only) + bool is_qdq_graph_uint16_or_int16 = IsQDQGraphWithUint16OrInt16(subgraph); + const auto& onnx_model_path_name = subgraph.ModelPath(); // QDQ stripping enabled only for the NPU and experimentally on the GPU if ((session_context_.device_type.find("NPU") != std::string::npos) && @@ -446,7 +507,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); return model_proto; } else if ((session_context_.device_type.find("GPU") != std::string::npos) && - enable_ovep_qdq_optimizer) { + is_qdq_graph_uint16_or_int16) { // Create a copy of the model std::unique_ptr model; Status status = qdq_scales_fix::Transform(subgraph, logger, model); @@ -456,6 +517,16 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); return model_proto; + } else if (IsModelBF16(subgraph)) { + LOGS_DEFAULT(INFO) << "[OpenVINO-EP] OVEP bfloat16->float16 optimization pass is enabled"; + std::unique_ptr model; + Status status = bfloat16_fix::Transform(subgraph, logger, model); + auto model_proto = model->ToProto(); + model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + print_model_proto_duration(); + DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); + ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); + return model_proto; } else { LOGS_DEFAULT(INFO) << "[OpenVINO-EP] OVEP QDQ optimization pass is disabled"; auto model = 
subgraph.CreateModel(logger); diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index 73fbe9a0fa76f..7027861f0c4dc 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -150,6 +150,11 @@ CreateOVModel(std::string&& model, LOGS_DEFAULT(INFO) << log_tag << "Reshaping the ov tensor to specified shape"; ov_model->reshape(session_context.reshape); } + + if (!session_context.layout.empty()) { + LOGS_DEFAULT(INFO) << log_tag << "Setting the ov tensor layout to specified layout"; + ov_model = Set_Layout(ov_model, session_context.layout); + } // Check for Constant Folding if ((session_context.device_type != "NPU") && !session_context.is_wholly_supported_graph) { ov::pass::ConstantFolding pass_const_obj; @@ -199,6 +204,41 @@ GetOutputTensor(Ort::KernelContext& context, return context.GetOutput(index, output_shape); } +std::shared_ptr Set_Layout(std::shared_ptr ov_model, const layout_t& layout) { + ov::preprocess::PrePostProcessor preproc(ov_model); + + const auto& inputs = ov_model->inputs(); + const auto& outputs = ov_model->outputs(); + + auto find_tensor_index = [](const std::vector>& tensors, const std::string& name) -> std::optional { + for (size_t i = 0; i < tensors.size(); ++i) { + const auto& tensor = tensors[i]; + if (tensor.get_any_name() == name || tensor.get_tensor().get_names().count(name) > 0) { + return i; + } + } + return std::nullopt; + }; + + for (const auto& [tensor_name, layout_value] : layout) { + bool tensor_found = false; + + if (auto input_idx = find_tensor_index(inputs, tensor_name)) { + preproc.input(*input_idx).tensor().set_layout(layout_value); + tensor_found = true; + } else if (auto output_idx = find_tensor_index(outputs, tensor_name)) { + preproc.output(*output_idx).tensor().set_layout(layout_value); + tensor_found = true; + } + + if (!tensor_found) { + LOGS_DEFAULT(WARNING) << "Tensor '" << tensor_name << "' not found in model inputs or outputs"; + } + } + + return preproc.build(); +} + int GetFirstAvailableDevice(SessionContext& session_context) { int i = 0; // Get the first available VAD-M device and set the device to busy diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index 15145df651fa2..27f791c7a5bd1 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -79,6 +79,8 @@ int GetFirstAvailableDevice(SessionContext& session_context); void FillOutputsWithConstantData(std::shared_ptr node, Ort::UnownedValue& out_tensor); +std::shared_ptr Set_Layout(std::shared_ptr ov_model, const layout_t& layout); + template void FillOutputHelper(Ort::UnownedValue& out_tensor, std::shared_ptr node); diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index 6efd866d47c3c..2f174110dd31b 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -59,7 +59,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr }; // If the EPContext node with OVIR Encapsulation, then create // an executable network from EP_CACHE_CONTEXT using read_model() & compile_model() - exe_network_ = OVCore::Get()->ImportEPCtxOVIREncapsulation(*model_stream, + exe_network_ = OVCore::Get()->ImportEPCtxOVIREncapsulation(*model_stream->stream_, hw_target, 
device_config, enable_causallm, @@ -98,6 +98,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr !subgraph_context_.has_dynamic_input_shape && !session_context_.so_context_enable && session_context_.reshape.empty() && + session_context_.layout.empty() && !enable_causallm && !eligible_for_cpu_fallback && auto_unified_compile); @@ -213,101 +214,29 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { if (!session_context_.load_config.empty()) { const std::map& target_config = session_context_.load_config; - if ((session_context_.device_type.find("NPU") != std::string::npos) && session_context_.enable_causallm) { - if (target_config.find("NPU") != target_config.end()) { - auto npu_genai_config = target_config.at("NPU"); - CausalLMConfig().ApplyConfig(npu_genai_config, device_config); - } else { - LOGS_DEFAULT(WARNING) << "ORT GenAI CausalLMConfig Configuration not found."; - } - } + // Extract device names from device string and apply their configs + // Examples: "GPU" -> ["GPU"], "AUTO:GPU.0,CPU" -> ["AUTO", "GPU", "CPU"] + auto apply_device_config = [&](std::string_view device) { + if (device.empty()) return; - if (session_context_.device_type.find("NPU") != std::string::npos) { - auto npuw_config = target_config.at("NPU"); - - // Check if "NPU_USE_NPUW" exists and is set to "YES" - auto npu_use_npuw_it = npuw_config.find("NPU_USE_NPUW"); - if (npu_use_npuw_it != npuw_config.end() && - npu_use_npuw_it->second.is() && - npu_use_npuw_it->second.as() == "YES") { - // Only add NPUW-related keys if NPU_USE_NPUW is "YES" - for (const auto& [key, value] : npuw_config) { - if (key.find("NPUW") != std::string::npos) { - if (!value.is()) { - LOGS_DEFAULT(ERROR) << "Invalid value type for key: " << key; - continue; - } - device_config[key] = value; - } - } - } else { - // Check if there are any "NPUW" keys and log a warning - if (std::any_of(npuw_config.begin(), npuw_config.end(), - [&](const auto& pair) { return pair.first.find("NPUW") != std::string::npos; })) { - LOGS_DEFAULT(WARNING) << "Skipping NPUW-related configurations as NPU_USE_NPUW is not set to 'YES'."; - } - } - } - auto find_device_type_mode = [&](const std::string& device_type) -> std::string { - std::string device_mode = ""; - auto delimiter_pos = device_type.find(':'); - if (delimiter_pos != std::string::npos) { - std::stringstream str_stream(device_type.substr(0, delimiter_pos)); - std::getline(str_stream, device_mode, ','); - } - return device_mode; - }; - - // Parse device types like "AUTO:CPU,GPU" and extract individual devices - auto parse_individual_devices = [&](const std::string& device_type) -> std::vector { - std::vector devices; - auto delimiter_pos = device_type.find(':'); - if (delimiter_pos != std::string::npos) { - std::stringstream str_stream(device_type.substr(delimiter_pos + 1)); - std::string device; - while (std::getline(str_stream, device, ',')) { - devices.emplace_back(device); - } - } else { - devices.emplace_back(device_type); - } - return devices; - }; + // Remove device index: "GPU.0" -> "GPU" + auto base_device = device.substr(0, device.find('.')); - // Set properties, Validation will be handled by OpenVINO Core - auto set_target_properties = [&](const std::string& device, const ov::AnyMap& config_options) { - for (const auto& [key, value] : config_options) { - if ((key.find("NPUW") != std::string::npos) || - ((device_config.find(key) != device_config.end()) && session_context_.enable_causallm)) { - continue; + if (auto config_it = target_config.find(std::string(base_device)); 
config_it != target_config.end()) { + for (const auto& [key, value] : config_it->second) { + device_config[key] = value; } - OVCore::Get()->core.set_property(device, ov::AnyMap{{key, value}}); } }; - // Check if the device type is AUTO, HETERO, or MULTI - if (session_context_.device_type.find("AUTO") == 0 || - session_context_.device_type.find("HETERO") == 0 || - session_context_.device_type.find("MULTI") == 0) { - //// Parse to get the device mode (e.g., "AUTO:CPU,GPU" -> "AUTO") - std::unordered_set supported_mode = {"AUTO", "HETERO", "MULTI"}; - auto device_mode = find_device_type_mode(session_context_.device_type); - ORT_ENFORCE(supported_mode.find(device_mode) != supported_mode.end(), " Invalid device mode is passed : ", session_context_.device_type); - // Parse individual devices (e.g., "AUTO:CPU,GPU" -> ["CPU", "GPU"]) - auto individual_devices = parse_individual_devices(session_context_.device_type); - if (!device_mode.empty()) individual_devices.emplace_back(device_mode); - - // Set properties only for individual devices (e.g., "CPU", "GPU") - for (const std::string& device : individual_devices) { - if (target_config.count(device)) { - // Set properties for the device - set_target_properties(device, target_config.at(device)); + // Parse device string by splitting on ':' and ',' delimiters + const auto& device_str = session_context_.device_type; + for (size_t start = 0, pos = 0; pos <= device_str.size(); ++pos) { + if (pos == device_str.size() || device_str[pos] == ':' || device_str[pos] == ',') { + if (pos > start) { + apply_device_config(std::string_view(device_str).substr(start, pos - start)); } - } - } else { - if (target_config.count(session_context_.device_type)) { - set_target_properties(session_context_.device_type, - target_config.at(session_context_.device_type)); + start = pos + 1; } } } diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 6a2b375d733f9..07b09899ac214 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -70,6 +70,7 @@ class SharedContext : public WeakSingleton { using config_t = std::map; using reshape_t = std::map; +using layout_t = std::map; struct ProviderInfo { std::string device_type{""}; // [device_type]: Overrides the accelerator hardware type and @@ -88,6 +89,7 @@ struct ProviderInfo { // (GPU) feature. If blob files are already present, // it will be directly loaded. reshape_t reshape{}; // Used for reshaping the ov input tensor shape at runtime. + layout_t layout{}; // Used for specifying the ov input/output tensor layout at runtime. std::string model_priority{"DEFAULT"}; // High-level OpenVINO model priority hint // Defines what model should be provided with more performant // bounded resource first @@ -110,7 +112,7 @@ struct ProviderInfo { const ConfigOptions* config_options{NULL}; const std::unordered_set valid_provider_keys = {"device_type", "device_id", "device_luid", "cache_dir", "precision", "load_config", "context", "num_of_threads", "model_priority", "num_streams", "enable_opencl_throttling", "enable_qdq_optimizer", - "enable_causallm", "disable_dynamic_shapes", "reshape_input"}; + "enable_causallm", "disable_dynamic_shapes", "reshape_input", "layout"}; }; // Holds context applicable to the entire EP instance. 
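// ---------------------------------------------------------------------------------------------
// Caller-side sketch (not part of this change) for the new "layout" provider option registered in
// contexts.h above. The option format comes from OpenVINOParserUtils::ParseLayout later in this
// diff: comma-separated "tensor_name[DIMS]" entries, where DIMS is a set of unique layout letters
// and '?' for unknown dimensions. Tensor names, model path and device are illustrative.
// ---------------------------------------------------------------------------------------------
#include <onnxruntime_cxx_api.h>
#include <string>
#include <unordered_map>

int main() {
  Ort::Env env;
  Ort::SessionOptions so;

  std::unordered_map<std::string, std::string> ov_options{
      {"device_type", "GPU"},
      // "input_1" is declared NCHW; "data" gets a 3-D layout with an unknown leading dimension.
      {"layout", "input_1[NCHW],data[?HW]"},
  };
  so.AppendExecutionProvider_OpenVINO_V2(ov_options);

  Ort::Session session(env, ORT_TSTR("model.onnx"), so);  // hypothetical model
  // At compile time Set_Layout() applies these layouts via ov::preprocess::PrePostProcessor.
  return 0;
}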
diff --git a/onnxruntime/core/providers/openvino/ibackend.h b/onnxruntime/core/providers/openvino/ibackend.h index ec38425f602eb..365a4625815d6 100644 --- a/onnxruntime/core/providers/openvino/ibackend.h +++ b/onnxruntime/core/providers/openvino/ibackend.h @@ -19,7 +19,7 @@ class IBackend { virtual ~IBackend() = default; virtual void RewindKVCache(size_t index) {} }; -using ptr_stream_t = std::unique_ptr; +using ptr_stream_t = std::unique_ptr; class BackendFactory { public: static std::shared_ptr diff --git a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc index 9e70756a254aa..051a39bd4f205 100644 --- a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc @@ -100,7 +100,8 @@ Status EPCtxHandler::AddOVEPCtxNodeToGraph(const GraphViewer& graph_viewer, return Status::OK(); } -std::unique_ptr EPCtxHandler::GetModelBlobStream(const std::filesystem::path& so_context_file_path, const GraphViewer& graph_viewer) const { +std::unique_ptr +EPCtxHandler::GetModelBlobStream(const std::filesystem::path& so_context_file_path, const GraphViewer& graph_viewer) const { auto first_index = *graph_viewer.GetNodesInTopologicalOrder().begin(); auto node = graph_viewer.GetNode(first_index); ORT_ENFORCE(node != nullptr); @@ -113,10 +114,11 @@ std::unique_ptr EPCtxHandler::GetModelBlobStream(const std::filesy bool embed_mode = static_cast(attrs.at(EMBED_MODE).i()); std::unique_ptr result; + std::filesystem::path blob_filepath{}; if (embed_mode) { result.reset((std::istream*)new std::istringstream(ep_cache_context)); } else { - auto blob_filepath = so_context_file_path; + blob_filepath = so_context_file_path; if (blob_filepath.empty() && !graph_viewer.ModelPath().empty()) { blob_filepath = graph_viewer.ModelPath(); } @@ -126,16 +128,18 @@ std::unique_ptr EPCtxHandler::GetModelBlobStream(const std::filesy } bool isXML = backend_utils::IsModelStreamXML(*result); + std::filesystem::path native_blob_path{}; if (!isXML) { // If the model stream is not an XML (i.e. precompiled blob), the OpenVINO SDK version that it was // exported with must match the version that is currently running. 
+ native_blob_path = std::move(blob_filepath); ORT_ENFORCE((attrs.count(EP_SDK_VER) == 1) && (attrs.at(EP_SDK_VER).s() == openvino_sdk_version_), "EPCtx blob was exported / is compatible with OpenVINO SDK version " + attrs.at(EP_SDK_VER).s() + ", but OpenVINO SDK version currently in use is " + openvino_sdk_version_); } LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Read blob from EPContext Node"; - return result; + return std::make_unique(std::move(result), native_blob_path); } bool EPCtxHandler::CheckForOVEPCtxNodeInGraph(const GraphViewer& graph_viewer) const { diff --git a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h index b9ddb40a7a233..f207f5014ca1f 100644 --- a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h +++ b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h @@ -12,6 +12,12 @@ namespace onnxruntime { namespace openvino_ep { +struct ModelBlobWrapper { + ModelBlobWrapper(std::unique_ptr stream, const std::filesystem::path& native_blob_path) : stream_(std::move(stream)), maybe_native_blob_path_(native_blob_path) {} + std::unique_ptr stream_; + std::filesystem::path maybe_native_blob_path_; +}; + // Utilities to handle EPContext node export and parsing of an EPContext node // to create the compiled_model object to infer on static const char EPCONTEXT_OP[] = "EPContext"; @@ -31,7 +37,7 @@ class EPCtxHandler { const std::string& graph_name, const bool embed_mode, std::string&& model_blob_str) const; - std::unique_ptr GetModelBlobStream(const std::filesystem::path& so_context_file_path, const GraphViewer& graph_viewer) const; + std::unique_ptr GetModelBlobStream(const std::filesystem::path& so_context_file_path, const GraphViewer& graph_viewer) const; InlinedVector GetEPCtxNodes() const; bool CheckEPCacheContextAttribute(const GraphViewer& graph_viewer, const std::string& target_attr_extn) const; diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 1b19517b07363..a0fa885cbfc38 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -94,18 +94,23 @@ common::Status OpenVINOExecutionProvider::Compile( auto& logger = *GetLogger(); Status status = Status::OK(); + bool is_epctx_model = false; if (!fused_nodes.empty()) { // Assume these properties are constant for all the model subgraphs, otherwise move to SubGraphContext const auto& graph_body_viewer_0 = fused_nodes[0].filtered_graph.get(); session_context_.onnx_model_path_name = graph_body_viewer_0.ModelPath().string(); session_context_.onnx_opset_version = graph_body_viewer_0.DomainToVersionMap().at(kOnnxDomain); + + // OVIR wrapped in epctx should be treated as source but this code does not + // This corner case is not in use and will be addressed in a future commit + is_epctx_model = ep_ctx_handle_.CheckForOVEPCtxNodeInGraph(graph_body_viewer_0); } // The block below is executed during EP context model inference auto& metadata = shared_context_->shared_weights.metadata; // Metadata object in memory if (session_context_.so_share_ep_contexts && - !session_context_.so_context_enable && + is_epctx_model && metadata.empty()) { fs::path context_model_file_path = session_context_.so_context_file_path; if (context_model_file_path.empty()) { diff --git a/onnxruntime/core/providers/openvino/openvino_parser_utils.cc 
b/onnxruntime/core/providers/openvino/openvino_parser_utils.cc index 21fc7f935da23..a290fea73e0e8 100644 --- a/onnxruntime/core/providers/openvino/openvino_parser_utils.cc +++ b/onnxruntime/core/providers/openvino/openvino_parser_utils.cc @@ -236,5 +236,79 @@ ov::Dimension OpenVINOParserUtils::ParseDimensionRange(const std::string& range_ return ov::Dimension(range_start, range_end); } +layout_t OpenVINOParserUtils::ParseLayout(const std::string& layout_definition) { + layout_t parsed_layout_map; + + // Return empty map for empty input + if (layout_definition.empty()) { + ORT_THROW("Empty layout definition provided in layout parameter"); + } + + // Regular expression for parsing layout definitions + const std::regex layout_pattern(R"(([^\[\],]+)\s*\[(.*?)\])"); // e.g. "input_1[NC],data[CHW]" + + // Find all tensor layout definitions using regex + auto layout_begin = std::sregex_iterator( + layout_definition.begin(), + layout_definition.end(), + layout_pattern); + auto layout_end = std::sregex_iterator(); + + // If no matches found, throw error + if (layout_begin == layout_end) { + ORT_THROW("Invalid layout definition format: " + layout_definition); + } + + // Process each tensor definition + for (std::sregex_iterator i = std::move(layout_begin); i != layout_end; ++i) { + std::smatch layout_match = *i; + + // Extract tensor name and trim whitespace + std::string tensor_name = layout_match[1].str(); // Group 1: tensor name e.g. "input_1" + tensor_name = TrimWhitespace(tensor_name); + + if (tensor_name.empty()) { + ORT_THROW("Empty tensor name provided in layout parameter"); + } + + // Extract dimensions string + std::string dimensions_str = layout_match[2].str(); // Group 2: dimensions string [e.g. "NC", "CHW"] + + if (!Check_Valid_Layout(dimensions_str, tensor_name)) { + ORT_THROW("Invalid dimensions string provided in layout parameter"); + } + + // Store parsed shape in result map + parsed_layout_map[tensor_name] = ov::Layout(dimensions_str); + } + + return parsed_layout_map; +} + +bool OpenVINOParserUtils::Check_Valid_Layout(const std::string& layout_str, const std::string& tensor_name) { + // Check if the layout string is empty + if (layout_str.empty()) { + return false; + } + + std::unordered_set seen_alphabets; + for (char c : layout_str) { + if (std::isalpha(c)) { + char upper_c = static_cast(std::toupper(c)); // Convert to uppercase for case-insensitive comparison + if (seen_alphabets.find(upper_c) != seen_alphabets.end()) { + ORT_THROW("Repeated Dim '" + std::string(1, c) + + "' found in layout dimensions for tensor '" + tensor_name + "'"); + } + seen_alphabets.insert(upper_c); + } else if (c != '?') { + // Only '?' 
is allowed as non-alphabetic character + ORT_THROW("Invalid character '" + std::string(1, c) + + "' found in layout dimensions for tensor '" + tensor_name + "'"); + } + } + + return true; +} + } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/openvino_parser_utils.h b/onnxruntime/core/providers/openvino/openvino_parser_utils.h index e6aa0e0a46a3b..a0936d627df40 100644 --- a/onnxruntime/core/providers/openvino/openvino_parser_utils.h +++ b/onnxruntime/core/providers/openvino/openvino_parser_utils.h @@ -18,8 +18,10 @@ class OpenVINOParserUtils { std::string& device_type, const std::string& option_name); static reshape_t ParseInputShape(const std::string& reshape_input_definition); + static layout_t ParseLayout(const std::string& layout_definition); static std::string TrimWhitespace(const std::string& str); static ov::Dimension ParseDimensionRange(const std::string& range_str, const std::string& tensor_name); + static bool Check_Valid_Layout(const std::string& layout_str, const std::string& tensor_name); }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 9dba8623031d0..1a10d9849d5cc 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -171,7 +171,7 @@ std::string ParseDeviceType(std::shared_ptr ov_core, const ProviderOptio if (!device_mode.empty()) { selected_device = device_mode + ":" + ov_luid_devices; for (const auto& dev_str : devices_to_check) { - const auto default_dev = split(dev_str, '.')[0]; + const std::string default_dev = split(dev_str, '.')[0]; if (ov_luid_devices.find(default_dev) == std::string::npos) selected_device = selected_device + "," + dev_str; @@ -230,6 +230,10 @@ static void ParseProviderInfo(const ProviderOptions& provider_options, pi.reshape = OpenVINOParserUtils::ParseInputShape(provider_options.at("reshape_input")); } + if (provider_options.contains("layout")) { + pi.layout = OpenVINOParserUtils::ParseLayout(provider_options.at("layout")); + } + if (provider_options.contains("load_config")) { auto parse_config = [&](const std::string& config_str) -> std::map { // If the config string is empty, return an empty map and skip processing @@ -526,7 +530,7 @@ struct OpenVINO_Provider : Provider { std::string ov_device_string; if (is_meta_device_factory) { // Build up a meta device string based on the devices that are passed in. E.g. 
AUTO:NPU,GPU.0,CPU - ov_device_string = ov_meta_device_type; + ov_device_string = std::move(ov_meta_device_type); ov_device_string += ":"; } @@ -539,7 +543,7 @@ struct OpenVINO_Provider : Provider { prepend_comma = true; } - provider_options["device_type"] = ov_device_string; + provider_options["device_type"] = std::move(ov_device_string); // Parse provider info with the device type ProviderInfo pi; diff --git a/onnxruntime/core/providers/openvino/ov_factory.cc b/onnxruntime/core/providers/openvino/ov_factory.cc index 8860405338409..2853cc17726ab 100644 --- a/onnxruntime/core/providers/openvino/ov_factory.cc +++ b/onnxruntime/core/providers/openvino/ov_factory.cc @@ -105,7 +105,7 @@ OrtStatus* OpenVINOEpPluginFactory::GetSupportedDevices(const OrtHardwareDevice* std::string ov_device_name; auto get_gpu_device_id = [&](const std::string& ov_device) { try { - auto device_id_str = ov_core_->get_property(ov_device, "GPU_DEVICE_ID").as(); + const std::string device_id_str = ov_core_->get_property(ov_device, "GPU_DEVICE_ID").as(); return static_cast(std::stoul(device_id_str, nullptr, 0)); } catch (ov::Exception&) { return 0u; // If we can't get the GPU_DEVICE_ID info, we won't have a device ID. diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 2d29df8eb4197..899845d4890cf 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -11,6 +11,7 @@ #include "core/providers/openvino/backend_utils.h" #include "core/providers/openvino/backends/basic_backend.h" #include "core/providers/openvino/ov_stateful_patch_utils.h" +#include "core/providers/openvino/onnx_ctx_model_helper.h" namespace onnxruntime { namespace openvino_ep { @@ -191,14 +192,23 @@ OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, "Exception while Loading Network for graph {}", name); } -OVExeNetwork OVCore::ImportModel(std::istream& model_stream, +OVExeNetwork OVCore::ImportModel(ModelBlobWrapper& model_blob, std::string hw_target, const ov::AnyMap& device_config, std::string name) { return OvExceptionBoundary([&]() { ov::CompiledModel obj; - obj = core.import_model(model_stream, hw_target, device_config); +#if (OPENVINO_VERSION_MAJOR > 2025 || (OPENVINO_VERSION_MAJOR == 2025 && OPENVINO_VERSION_MINOR >= 3)) + if (!model_blob.maybe_native_blob_path_.empty()) { + obj = core.import_model(ov::read_tensor_data(model_blob.maybe_native_blob_path_), hw_target, device_config); + } else { + obj = core.import_model(*model_blob.stream_, hw_target, device_config); + } +#else + obj = core.import_model(*model_blob.stream_, hw_target, device_config); +#endif OVExeNetwork exe(obj, hw_target); + #ifndef NDEBUG printDebugInfo(exe.Get()); #endif diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index 6d1db4366410b..38ea883078e85 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -26,6 +26,7 @@ namespace openvino_ep { class OVCore; class OVInferRequest; class OVExeNetwork; +struct ModelBlobWrapper; typedef ov::Tensor OVTensor; typedef ov::ProfilingInfo OVProfilingInfo; @@ -82,7 +83,7 @@ struct OVCore : WeakSingleton { ov::AnyMap& device_config, const std::string& name); // OV Interface for Import model Stream - OVExeNetwork ImportModel(std::istream& model_stream, + OVExeNetwork ImportModel(ModelBlobWrapper& model_blob, std::string hw_target, const ov::AnyMap& 
device_config, std::string name); @@ -126,29 +127,16 @@ class OVInferRequest { OVTensorPtr GetTensor(const std::string& name); std::string GetInputTensorName(uint32_t index); - // Set tensor described param_info and ort_ptr. Overrides shape in param_info with shape_override. Call infer req tensor if ort_ptr is last set. + // Set tensor call infer req tensor if ort_ptr differs from last set ptr. void SetTensor(const std::string& name, const ov::element::Type& type, const ov::Shape& shape, void* ort_ptr) { auto& cached_binding = bindings_cache_[name]; - if (cached_binding.ort_ptr != ort_ptr) { - auto tensor_ptr = std::make_shared(type, shape, const_cast(ort_ptr)); - SetTensor(name, tensor_ptr); - cached_binding = {tensor_ptr, ort_ptr}; - } else if (ort_ptr == nullptr) { - // a null ort_ptr is expected for a tensor that has 0 elements. - // for example, a tensor of shape=[1, 8, 0, 64], which is valid. - // So, we check to see if at least one shape entry is 0. - auto contains_zero = [](const ov::Shape& shape) { - for (auto& s : shape) - if (s == 0) return true; - return false; - }; - if (contains_zero(shape)) { - // if there are zero elements (i.e. at least one shape entry is 0), - // then create and set the tensor anyway. - auto tensor_ptr = std::make_shared(type, shape); - SetTensor(name, tensor_ptr); - cached_binding = {tensor_ptr, ort_ptr}; - } + if (cached_binding.ort_ptr != ort_ptr || + !cached_binding.tensor_ptr || + cached_binding.tensor_ptr->get_shape() != shape) { + cached_binding.tensor_ptr.reset(); + auto ov_tensor = std::make_shared(type, shape, const_cast(ort_ptr)); + ovInfReq.set_tensor(name, *ov_tensor); + cached_binding = {std::move(ov_tensor), ort_ptr}; } } diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.cc b/onnxruntime/core/providers/openvino/ov_versions/capability.cc index 2309ff3de751b..1893700cab09c 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/capability.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/capability.cc @@ -166,17 +166,28 @@ std::vector> GetCapability::Execute() { auto connected_clusters = GetConnectedClusters(graph_viewer_, ng_clusters); int no_of_clusters = 0; - + size_t cluster_index = 0; + size_t total_clusters = connected_clusters.size(); for (auto this_cluster : connected_clusters) { - // If subgraph has less then three, graph is considered trivial unless its an epctx cluster - if (this_cluster.size() < 3) { - bool is_epctx_node = false; - for (auto node_idx : this_cluster) { - if (graph_viewer_.GetNode(node_idx)->OpType() == "EPContext") - is_epctx_node = true; + bool omit_subgraph = false; + + if (this_cluster.size() == 1) { + // check next cluster + auto index = this_cluster.at(0); + size_t j = cluster_index; + if (graph_viewer_.GetNode(index)->OpType() == "EPContext") { + omit_subgraph = false; + } else if (j < total_clusters - 1) { + bool append_node = false; + while (j < total_clusters && !append_node) { + j = j + 1; + append_node = AddTrivialClusterToNextClusterIfConnected(graph_viewer_, index, connected_clusters[j]); + } + if (append_node) { + connected_clusters[j].emplace_back(index); + } + omit_subgraph = true; } - if (!is_epctx_node) - continue; } std::vector cluster_graph_inputs, cluster_inputs, cluster_outputs; @@ -188,7 +199,6 @@ std::vector> GetCapability::Execute() { cluster_inputs, cluster_outputs); - bool omit_subgraph = false; // Omitting zero dim subgraphs for (auto index : this_cluster) { const Node* node = graph_viewer_.GetNode(index); @@ -217,15 +227,17 @@ std::vector> 
GetCapability::Execute() { } } } - if (omit_subgraph) - continue; /* In scenarios, when there are no inputs or all inputs being initializers, ConstantFolding optimization in onnxruntime pre-computes the value.*/ - if (!cluster_inputs.empty()) { - AppendClusterToSubGraph(this_cluster, cluster_inputs, cluster_outputs, result); - no_of_clusters++; + if (!omit_subgraph) { + if (!cluster_inputs.empty()) { + AppendClusterToSubGraph(this_cluster, cluster_inputs, cluster_outputs, result); + no_of_clusters++; + } } + + cluster_index = cluster_index + 1; } LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Supported subgraphs on OpenVINO: " << no_of_clusters; } diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc index 17e69ad080b90..f848b89ed10c8 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc @@ -121,6 +121,7 @@ std::vector supported_op_mode = { {"DepthToSpace", V_2020_4, {"CPU", "GPU"}}, {"DequantizeLinear", V_2021_4, {"CPU", "GPU"}}, {"DequantizeLinear", V_2024_4, {"NPU"}}, + {"DynamicQuantizeLinear", V_2025_2, {"CPU", "GPU"}}, {"DynamicQuantizeMatMul", V_2025_0, {"CPU", "GPU"}}, {"Div", V_2020_4, {"CPU", "GPU"}}, {"Dropout", V_2020_4, {"CPU", "GPU"}}, @@ -172,6 +173,7 @@ std::vector supported_op_mode = { {"LSTM", V_2020_4, {"CPU", "GPU"}}, {"MatMul", V_2020_4, {"CPU", "GPU"}}, {"MatMulInteger", V_2022_1, {"CPU"}}, + {"MatMulInteger", V_2025_2, {"GPU"}}, {"MatMulNBits", V_2024_5, {"CPU", "GPU"}}, {"Max", V_2020_4, {"CPU", "GPU"}}, {"MaxPool", V_2020_4, {"CPU", "GPU"}}, @@ -191,7 +193,7 @@ std::vector supported_op_mode = { {"Pad", V_2020_4, {"CPU", "GPU"}}, {"Pow", V_2020_4, {"CPU", "GPU"}}, {"PRelu", V_2020_4, {"CPU", "GPU"}}, - {"QLinearMatMul", V_2022_3, {"CPU"}}, + // {"QLinearMatMul", V_2022_3, {"CPU"}}, {"QuantizeLinear", V_2021_4, {"CPU", "GPU"}}, {"QuickGelu", V_2025_0, {"CPU", "GPU"}}, {"RNN", V_2023_1, {"CPU", "GPU"}}, @@ -361,6 +363,7 @@ void DataOps::populate_op_mode_supported() { no_dimension_supported_.push_back({"Clip", V_2022_1, {"All"}}); no_dimension_supported_.push_back({"Div", V_2020_4, {"All"}}); no_dimension_supported_.push_back({"DequantizeLinear", V_2021_4, {"All"}}); + no_dimension_supported_.push_back({"DynamicQuantizeLinear", V_2025_2, {"All"}}); no_dimension_supported_.push_back({"Equal", V_2022_1, {"CPU"}}); no_dimension_supported_.push_back({"Equal", V_2023_0, {"GPU"}}); no_dimension_supported_.push_back({"Expand", V_2023_3, {"CPU"}}); @@ -374,6 +377,7 @@ void DataOps::populate_op_mode_supported() { no_dimension_supported_.push_back({"Loop", V_2021_4, {"All"}}); no_dimension_supported_.push_back({"Max", V_2024_4, {"All"}}); no_dimension_supported_.push_back({"Min", V_2020_4, {"All"}}); + no_dimension_supported_.push_back({"MatMulInteger", V_2025_2, {"All"}}); no_dimension_supported_.push_back({"Mul", V_2020_4, {"All"}}); no_dimension_supported_.push_back({"Neg", V_2023_0, {"CPU", "GPU"}}); no_dimension_supported_.push_back({"Pow", V_2023_0, {"CPU", "GPU"}}); @@ -555,8 +559,13 @@ bool DataOps::type_is_supported(const NodeArg* node_arg, bool is_initializer) { return false; } + auto dtype = type_proto->tensor_type().elem_type(); + // Enable bfloat16 -> float16 on-the-fly conversion + if (dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_BFLOAT16 || + dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT16 || + dtype == 
ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT16) + return true; if (is_initializer) { - auto dtype = type_proto->tensor_type().elem_type(); for (auto const& var : supported_types_initializer_) { if ((var.first <= version_id_) && (var.second == dtype)) { @@ -571,8 +580,6 @@ bool DataOps::type_is_supported(const NodeArg* node_arg, bool is_initializer) { #endif return false; } else { - auto dtype = type_proto->tensor_type().elem_type(); - if (device_id_.find("HETERO") != std::string::npos || device_id_.find("MULTI") != std::string::npos || device_id_.find("AUTO") != std::string::npos) { for (auto const& var : supported_types_npu_) { @@ -609,9 +616,6 @@ bool DataOps::type_is_supported(const NodeArg* node_arg, bool is_initializer) { (var.second == dtype)) { return true; } - // experimentally for GPU and qdq stripping mode allow int16 types - if (npu_qdq_optimizer_enabled_ && (dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT16 || dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT16)) - return true; } #ifndef NDEBUG if (openvino_ep::backend_utils::IsDebugEnabled()) { diff --git a/onnxruntime/core/providers/openvino/ov_versions/utils.cc b/onnxruntime/core/providers/openvino/ov_versions/utils.cc index f924fa0c8205c..791341218913f 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/utils.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/utils.cc @@ -153,6 +153,24 @@ GetConnectedClusters(const GraphViewer& graph_viewer, const std::vector& search_cluster) { + for (auto index : search_cluster) { + auto curr_node = graph_viewer.GetNode(index); + for (auto node = curr_node->InputNodesBegin(); node != curr_node->InputNodesEnd(); ++node) { + if ((*node).Index() == curr_node_index) + return true; + } + + for (auto node = curr_node->OutputNodesBegin(); node != curr_node->OutputNodesEnd(); ++node) { + if ((*node).Index() == curr_node_index) + return true; + } + } + return false; +} + void GetInputsOutputsOfCluster(const GraphViewer& graph_viewer, const std::vector& cluster, const std::unordered_set& ng_required_initializers, diff --git a/onnxruntime/core/providers/openvino/ov_versions/utils.h b/onnxruntime/core/providers/openvino/ov_versions/utils.h index 34aa762ba9b67..bdad047a422c1 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/utils.h +++ b/onnxruntime/core/providers/openvino/ov_versions/utils.h @@ -40,6 +40,10 @@ void IdentifyConnectedNodes( std::vector> GetConnectedClusters(const GraphViewer& graph_viewer, const std::vector>& clusters); +bool AddTrivialClusterToNextClusterIfConnected(const GraphViewer& graph_viewer, + const NodeIndex index, + const std::vector& search_cluster); + void GetInputsOutputsOfCluster(const GraphViewer& graph_viewer, const std::vector& cluster, const std::unordered_set& ng_required_initializers, diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp index d159930d52845..3a39152b5d17d 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp @@ -3,6 +3,8 @@ #include "qdq_scales_fix.h" #include "core/providers/openvino/ov_protobuf_utils.h" +#include "core/framework/ort_value.h" +#include "core/framework/float16.h" #include #include @@ -903,22 +905,11 @@ Status copy_model(const GraphViewer& src_graph_viewer, } for (auto& [name, tensor_proto] : 
src_graph.GetAllInitializedTensors()) { - dst_graph.AddInitializedTensor(*tensor_proto); - } - - for (auto node_arg : src_graph.GetInputsIncludingInitializers()) { - auto check_inputs = [node_arg](auto input_node_arg) { - return input_node_arg->Name() == node_arg->Name(); - }; - if (std::find_if(dst_graph_inputs.begin(), dst_graph_inputs.end(), check_inputs) != dst_graph_inputs.end()) - continue; - - auto src_tensor_proto = src_graph.GetConstantInitializer(node_arg->Name(), true); - if (src_tensor_proto) { - auto dst_tensor_proto = onnx::TensorProto::Create(); - dst_tensor_proto->copy_from(src_tensor_proto); - dst_graph.AddInitializedTensor(*dst_tensor_proto); - } + auto ort_value = OrtValue(); + if (src_graph.GetOrtValueInitializer(name, ort_value)) + ORT_RETURN_IF_ERROR(dst_graph.AddInitializedOrtValue(*tensor_proto, ort_value)); + else + dst_graph.AddInitializedTensor(*tensor_proto); } ORT_RETURN_IF_ERROR(dst_graph.Resolve()); @@ -940,5 +931,54 @@ Status Transform(const GraphViewer& src_graph_viewer, return status; } } // namespace qdq_scales_fix + +namespace bfloat16_fix { +void replace_bf16_with_fp16(qdq_scales_fix::CustomGraph& gen_graph) { + for (auto& const_node : gen_graph.original_graph.Nodes()) { + auto node = const_cast(const_node); + if (node->OpType() == "Cast") { + for (auto& [name, const_attribute] : node->GetAttributes()) { + auto& attribute = const_cast(const_attribute); + if (name == "to" && attribute.type() == ONNX_NAMESPACE::AttributeProto_AttributeType_INT) + if (attribute.i() == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16) + attribute.set_i(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16); + } + } + for (auto& output : node->OutputDefs()) { + auto& output_proto = const_cast(output->ToProto().type()); + if (output_proto.mutable_tensor_type()->elem_type() == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16) + output_proto.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16); + } + } + + const auto& init_set = gen_graph.original_graph.GetAllInitializedTensors(); + for (auto& [key, const_tensor_proto] : init_set) { + auto tensor_proto = const_cast(const_tensor_proto); + auto dt = tensor_proto->data_type(); + if (dt == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16) { + auto raw_data = tensor_proto->has_raw_data() ? 
reinterpret_cast(tensor_proto->mutable_raw_data()->data()) : nullptr; + if (raw_data) { + tensor_proto->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16); + std::int64_t size = 1; + for (int i = 0; i < tensor_proto->dims_size(); ++i) + size *= tensor_proto->dims()[i]; + for (std::int64_t i = 0; i < size; ++i) { + raw_data[i] = onnxruntime::MLFloat16(onnxruntime::BFloat16::FromBits(raw_data[i])).val; + } + } + } + } +} + +Status Transform(const GraphViewer& src_graph_viewer, + const logging::Logger& logger, + /*out*/ std::unique_ptr& model) { + auto status = qdq_scales_fix::copy_model(src_graph_viewer, logger, model); + auto g = qdq_scales_fix::generate_graph_from_onnx(model->MainGraph()); + + replace_bf16_with_fp16(g); + return status; +} +} // namespace bfloat16_fix } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.h b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.h index c54c531e1bd40..2182850d96c43 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.h +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.h @@ -15,5 +15,10 @@ Status Transform(const GraphViewer& src_graph, const logging::Logger& logger, /*out*/ std::unique_ptr& model); } +namespace bfloat16_fix { +Status Transform(const GraphViewer& src_graph, + const logging::Logger& logger, + /*out*/ std::unique_ptr& model); +} } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index 24e8892622175..e010851f22e50 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -677,6 +677,27 @@ static void AddInitializerAsInput(onnxruntime::Graph& dst_graph, } } +// To check if the input parameters of a DQ or Q node are quantization parameters +// Scale and Zero point parameters are quantization parameters +static bool IsQuantizationParameter(const std::string& initializer_name, + const onnxruntime::GraphViewer& src_graph) { + // Check if this initializer is used as scale or zero_point in any DQ/Q node + for (auto& node_idx : src_graph.GetNodesInTopologicalOrder()) { + const auto* node = src_graph.GetNode(node_idx); + if (node->OpType() == "DequantizeLinear" || node->OpType() == "QuantizeLinear") { + const auto& input_defs = node->InputDefs(); + // Check if this initializer is used as scale (input 1) or zero_point (input 2) + if (input_defs.size() >= 2 && input_defs[1]->Name() == initializer_name) { + return true; // This is a scale parameter + } + if (input_defs.size() >= 3 && input_defs[2]->Name() == initializer_name) { + return true; // This is a zero_point parameter + } + } + } + return false; +} + // Creates a new model without the DQ/Q operators in the src graph. 
Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, const logging::Logger& logger, @@ -845,10 +866,20 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, if (!init_with_data && utils::HasExternalData(initializer_tensor) && enable_ovep_weight_sharing) { - insert_metadata(initializer_tensor); + // Only convert to input if it's not a quantization parameter + bool is_quant_param = IsQuantizationParameter(name, src_graph); + + if (!is_quant_param) { + // This is actual weight data - so to convert to input for weight sharing + insert_metadata(initializer_tensor); + AddInitializerAsInput(dst_graph, accumulated_inputs, src_graph, name); + } else { + // This is a quantization parameter - keep as initializer even if external - // Add initializer with external data as input - AddInitializerAsInput(dst_graph, accumulated_inputs, src_graph, name); + if (initializers_to_keep.count(name) > 0) { + dst_graph.AddInitializedTensor(initializer_tensor); + } + } } else { // Add as an initialized tensor if it does not have external data if (initializers_to_keep.count(name) > 0) { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h index e910afcbcf6c6..dbdb2d828f039 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h @@ -236,7 +236,7 @@ class BaseOpBuilder : public IOpBuilder { } // Onnx Pads is [x1_begin, x2_begin, x1_end, x2_end], QNN requires [x1_begin, x1_end, x2_begin, x2_end] - void ReArranagePads(std::vector& pads) const { + void ReArrangePads(std::vector& pads) const { auto pads_size = pads.size(); auto middle_pos = pads_size / 2; std::vector first_half(pads.begin(), pads.begin() + middle_pos); diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc index b80d9db5d3560..dba4fbdbe0872 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc @@ -24,7 +24,6 @@ static Status GetOnnxConvType(const std::string& onnx_op_type, OnnxConvType& con } else { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Unsupported ONNX convolution op type: ", onnx_op_type.c_str()); } - return Status::OK(); } @@ -171,7 +170,7 @@ Status ConvOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, return ProcessConv2D3DInputs(qnn_model_wrapper, node_unit, logger, input_names, do_op_validation); } - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN Conv only supports 3D(rank 5), 2D (rank 4) or 1D (rank 3) inputs."); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN Conv only supports 3D (rank 5), 2D (rank 4) or 1D (rank 3) inputs."); } Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrapper, @@ -713,7 +712,7 @@ Status ConvOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wra } } - ReArranagePads(pads); + ReArrangePads(pads); uint32_t pad_size = narrow(pads.size() / 2); QnnParamWrapper pad_amount_paramwrapper(node_unit.Index(), node_unit.Name(), QNN_OP_CONV_2D_PARAM_PAD_AMOUNT, {pad_size, 2}, std::move(pads)); diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc index 404d3c402c21e..d2b1434c1c896 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc +++ 
b/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc @@ -193,7 +193,7 @@ Status PadOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrap [](int64_t item) { return SafeInt(item); }); // Onnx format is begin_0, begin_1, ..., end_0, end_1, ... // Qnn format is begin_0, end_0, begin_1, end_1, ... - ReArranagePads(pad_amount); + ReArrangePads(pad_amount); std::vector input_shape; ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(inputs[0].node_arg, input_shape), "Cannot get shape of input 0."); diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc index 78b16ed784049..78ab047a560a7 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc @@ -199,7 +199,7 @@ Status PoolOpBuilder::SetCommonPoolParams(const NodeAttrHelper& node_helper, } } } - ReArranagePads(pad_amount); + ReArrangePads(pad_amount); // Param: rounding_mode. rounding_mode = node_helper.Get("ceil_mode", rounding_mode); diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc index e1a74b9e35370..ee5f52289d779 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc @@ -158,7 +158,7 @@ bool QnnModelWrapper::CreateQnnInputOutputTensors(const std::string& qnn_node_na return false; } - // During graph patitioning, we only need to do op validation, it's not required to create Qnn graph tensor + // During graph partitioning, we only need to do op validation, it's not required to create Qnn graph tensor // We only need to create the Qnn graph tensor during Compile to create Qnn graph if (!do_op_validation) { std::string error_string; diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index 1f491bc788870..f3e2a8ce7ba7b 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -3036,7 +3036,7 @@ ORT_API_STATUS_IMPL(OrtApis::Node_GetAttributeByName, _In_ const OrtNode* node, API_IMPL_END } -ORT_API_STATUS_IMPL(OrtApis::Node_GetTensorAttributeAsOrtValue, _In_ const OrtNode* node, _In_ const OrtOpAttr* attribute, _Outptr_result_maybenull_ OrtValue** attr_tensor) { +ORT_API_STATUS_IMPL(OrtApis::OpAttr_GetTensorAttributeAsOrtValue, _In_ const OrtOpAttr* attribute, _Outptr_result_maybenull_ OrtValue** attr_tensor) { API_IMPL_BEGIN if (attr_tensor == nullptr) { return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "attr_tensor argument is null"); @@ -3045,7 +3045,39 @@ ORT_API_STATUS_IMPL(OrtApis::Node_GetTensorAttributeAsOrtValue, _In_ const OrtNo return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "attribute argument is null"); } - ORT_API_RETURN_IF_STATUS_NOT_OK(node->GetTensorAttributeAsOrtValue(attribute, *attr_tensor)); + const auto* attr_proto = reinterpret_cast(attribute); + + if (attr_proto->type() != onnx::AttributeProto::TENSOR) { + return OrtApis::CreateStatus(OrtErrorCode::ORT_INVALID_ARGUMENT, "This OrtOpAttr instance is not a 'TENSOR' attribute"); + } + + const auto& tensor_proto = attr_proto->t(); + + // Check that TensorProto is valid. 
+ if (!utils::HasDataType(tensor_proto)) { + return OrtApis::CreateStatus(OrtErrorCode::ORT_INVALID_ARGUMENT, "Tensor proto doesn't have data type."); + } + + if (!ONNX_NAMESPACE::TensorProto::DataType_IsValid(tensor_proto.data_type())) { + return OrtApis::CreateStatus(OrtErrorCode::ORT_INVALID_ARGUMENT, "Tensor proto has invalid data type."); + } + + if (utils::HasExternalData(tensor_proto)) { + return OrtApis::CreateStatus(OrtErrorCode::ORT_INVALID_ARGUMENT, + "Tensor proto with external data for value attribute is not supported."); + } + + // Initialize OrtValue for tensor attribute. + auto tensor_attribute_value = std::make_unique(); + AllocatorPtr tensor_attribute_allocator = CPUAllocator::DefaultInstance(); + // The tensor in the 'Tensor' attribute's TensorProto is stored inline, not in an external file. + // Therefore, the 'model_path' passed to TensorProtoToOrtValue() may be an empty path. + std::filesystem::path model_path; + ORT_API_RETURN_IF_STATUS_NOT_OK(utils::TensorProtoToOrtValue(Env::Default(), model_path, tensor_proto, + tensor_attribute_allocator, *tensor_attribute_value)); + + *attr_tensor = tensor_attribute_value.release(); + return nullptr; API_IMPL_END } @@ -3423,25 +3455,86 @@ ORT_API_STATUS_IMPL(OrtApis::CopyTensors, _In_ const OrtEnv* env, API_IMPL_END } +// Validate compiled model compatibility info for specific EP device(s) +ORT_API_STATUS_IMPL(OrtApis::GetModelCompatibilityForEpDevices, + _In_reads_(num_ep_devices) const OrtEpDevice* const* ep_devices, + _In_ size_t num_ep_devices, + _In_ const char* compatibility_info, + _Out_ OrtCompiledModelCompatibility* out_status) { + API_IMPL_BEGIN + if (ep_devices == nullptr || num_ep_devices == 0 || compatibility_info == nullptr || out_status == nullptr) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "Invalid argument provided to GetModelCompatibilityForEpDevices."); + } + + // Validate inputs and ensure all devices belong to the same EP/factory + const OrtEpFactory* first_factory = nullptr; + for (size_t i = 0; i < num_ep_devices; ++i) { + if (ep_devices[i] == nullptr) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "ep_devices contains a null entry."); + } + const OrtEpFactory* f = ep_devices[i]->GetMutableFactory(); + if (i == 0) { + first_factory = f; + } else if (f != first_factory) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "All ep_devices must be from the same execution provider."); + } + } + + OrtCompiledModelCompatibility status = OrtCompiledModelCompatibility_EP_NOT_APPLICABLE; + OrtStatus* ort_status = nullptr; + OrtEpFactory* factory = ep_devices[0]->GetMutableFactory(); + if (factory && factory->ValidateCompiledModelCompatibilityInfo) { + // collect hardware devices corresponding to the ep_devices + InlinedVector hardware_devices; + hardware_devices.reserve(num_ep_devices); + for (size_t i = 0; i < num_ep_devices; ++i) { + hardware_devices.push_back(ep_devices[i]->device); + } + ort_status = factory->ValidateCompiledModelCompatibilityInfo(factory, + hardware_devices.data(), + hardware_devices.size(), + compatibility_info, + &status); + } + if (ort_status != nullptr) { + return ToOrtStatus(ToStatusAndRelease(ort_status)); + } + + *out_status = status; + return nullptr; + API_IMPL_END +} + #else // defined(ORT_MINIMAL_BUILD) ORT_API_STATUS_IMPL(OrtApis::RegisterExecutionProviderLibrary, _In_ OrtEnv* /*env*/, _In_ const char* /*registration_name*/, const ORTCHAR_T* /*path*/) { API_IMPL_BEGIN - return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "This API in not supported in a minimal 
build."); + return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "RegisterExecutionProviderLibrary is not supported in a minimal build."); API_IMPL_END } ORT_API_STATUS_IMPL(OrtApis::UnregisterExecutionProviderLibrary, _In_ OrtEnv* /*env*/, _In_ const char* /*registration_name*/) { API_IMPL_BEGIN - return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "This API in not supported in a minimal build."); + return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "UnregisterExecutionProviderLibrary is not supported in a minimal build."); API_IMPL_END } ORT_API_STATUS_IMPL(OrtApis::GetEpDevices, _In_ const OrtEnv* /*env*/, _Outptr_ const OrtEpDevice* const** /*ep_devices*/, _Out_ size_t* /*num_ep_devices*/) { API_IMPL_BEGIN - return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "This API in not supported in a minimal build."); + return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "GetEpDevices is not supported in a minimal build."); + API_IMPL_END +} + +// Minimal build stub for GetModelCompatibilityForEpDevices to satisfy symbol references from the API table +ORT_API_STATUS_IMPL(OrtApis::GetModelCompatibilityForEpDevices, + _In_reads_(num_ep_devices) const OrtEpDevice* const* /*ep_devices*/, + _In_ size_t /*num_ep_devices*/, + _In_ const char* /*compatibility_info*/, + _Out_ OrtCompiledModelCompatibility* /*out_status*/) { + API_IMPL_BEGIN + return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "GetModelCompatibilityForEpDevices is not supported in a minimal build."); API_IMPL_END } @@ -3453,7 +3546,7 @@ ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider_V2, _In_ OrtS _In_reads_(num_op_options) const char* const* /*ep_option_vals*/, size_t /*num_ep_options*/) { API_IMPL_BEGIN - return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "This API in not supported in a minimal build."); + return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "SessionOptionsAppendExecutionProvider_V2 is not supported in a minimal build."); API_IMPL_END } @@ -3466,7 +3559,7 @@ ORT_API_STATUS_IMPL(OrtApis::SessionGetEpDeviceForInputs, _In_ const OrtSession* _Out_writes_(num_values) const OrtEpDevice** /*inputs_ep_devices*/, _In_ size_t /*num_values*/) { API_IMPL_BEGIN - return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "This API in not supported in a minimal build."); + return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "SessionGetEpDeviceForInputs is not supported in a minimal build."); API_IMPL_END } @@ -3474,7 +3567,7 @@ ORT_API_STATUS_IMPL(OrtApis::CreateSyncStreamForEpDevice, _In_ const OrtEpDevice _In_opt_ const OrtKeyValuePairs* /*stream_options*/, _Outptr_ OrtSyncStream** /*ort_stream*/) { API_IMPL_BEGIN - return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "This API in not supported in a minimal build."); + return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "CreateSyncStreamForEpDevice is not supported in a minimal build."); API_IMPL_END } @@ -3493,7 +3586,7 @@ ORT_API_STATUS_IMPL(OrtApis::CopyTensors, _In_ const OrtEnv* /*env*/, _In_opt_ OrtSyncStream* /*stream*/, _In_ size_t /*num_tensors*/) { API_IMPL_BEGIN - return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "This API in not supported in a minimal build."); + return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "CopyTensors is not supported in a minimal build."); API_IMPL_END } @@ -4073,7 +4166,7 @@ static constexpr OrtApi ort_api_1_to_23 = { &OrtApis::Node_GetNumAttributes, &OrtApis::Node_GetAttributes, &OrtApis::Node_GetAttributeByName, - &OrtApis::Node_GetTensorAttributeAsOrtValue, + &OrtApis::OpAttr_GetTensorAttributeAsOrtValue, &OrtApis::OpAttr_GetType, 
&OrtApis::OpAttr_GetName, &OrtApis::Node_GetNumSubgraphs, @@ -4108,6 +4201,7 @@ static constexpr OrtApi ort_api_1_to_23 = { &OrtApis::CopyTensors, &OrtApis::Graph_GetModelMetadata, + &OrtApis::GetModelCompatibilityForEpDevices, }; // OrtApiBase can never change as there is no way to know what version of OrtApiBase is returned by OrtGetApiBase. diff --git a/onnxruntime/core/session/ort_apis.h b/onnxruntime/core/session/ort_apis.h index b3b0036c68247..6dc4cf9d195cc 100644 --- a/onnxruntime/core/session/ort_apis.h +++ b/onnxruntime/core/session/ort_apis.h @@ -636,6 +636,13 @@ ORT_API_STATUS_IMPL(ValueInfo_IsFromOuterScope, _In_ const OrtValueInfo* value_i // OrtGraph ORT_API_STATUS_IMPL(Graph_GetName, _In_ const OrtGraph* graph, _Outptr_ const char** graph_name); ORT_API_STATUS_IMPL(Graph_GetModelMetadata, _In_ const OrtGraph* graph, _Outptr_ OrtModelMetadata** out); + +// EP Compatibility Info APIs +ORT_API_STATUS_IMPL(GetModelCompatibilityForEpDevices, + _In_reads_(num_ep_devices) const OrtEpDevice* const* ep_devices, + _In_ size_t num_ep_devices, + _In_ const char* compatibility_info, + _Out_ OrtCompiledModelCompatibility* out_status); ORT_API_STATUS_IMPL(Graph_GetModelPath, _In_ const OrtGraph* graph, _Outptr_ const ORTCHAR_T** model_path); ORT_API_STATUS_IMPL(Graph_GetOnnxIRVersion, _In_ const OrtGraph* graph, _Out_ int64_t* onnx_ir_version); ORT_API_STATUS_IMPL(Graph_GetNumOperatorSets, _In_ const OrtGraph* graph, _Out_ size_t* num_operator_sets); @@ -680,7 +687,7 @@ ORT_API_STATUS_IMPL(Node_GetAttributes, _In_ const OrtNode* node, _Out_writes_(num_attributes) const OrtOpAttr** attributes, _In_ size_t num_attributes); ORT_API_STATUS_IMPL(Node_GetAttributeByName, _In_ const OrtNode* node, _In_ const char* attribute_name, _Outptr_result_maybenull_ const OrtOpAttr** attribute); -ORT_API_STATUS_IMPL(Node_GetTensorAttributeAsOrtValue, _In_ const OrtNode* node, _In_ const OrtOpAttr* attribute, +ORT_API_STATUS_IMPL(OpAttr_GetTensorAttributeAsOrtValue, _In_ const OrtOpAttr* attribute, _Outptr_result_maybenull_ OrtValue** attr_tensor); ORT_API_STATUS_IMPL(OpAttr_GetType, _In_ const OrtOpAttr* attribute, _Out_ OrtOpAttrType* type); ORT_API_STATUS_IMPL(OpAttr_GetName, _In_ const OrtOpAttr* attribute, _Outptr_ const char** name); diff --git a/onnxruntime/core/session/plugin_ep/ep_factory_internal.h b/onnxruntime/core/session/plugin_ep/ep_factory_internal.h index 23e5e95af2903..093bfce462d32 100644 --- a/onnxruntime/core/session/plugin_ep/ep_factory_internal.h +++ b/onnxruntime/core/session/plugin_ep/ep_factory_internal.h @@ -80,9 +80,11 @@ class EpFactoryInternal : public OrtEpFactory { return impl_->CreateSyncStreamForDevice(memory_device, stream_options, stream); } - OrtStatus* ValidateCompiledModelCompatibilityInfo(_In_ const char* compatibility_info, + OrtStatus* ValidateCompiledModelCompatibilityInfo(_In_reads_(num_devices) const OrtHardwareDevice* const* devices, + _In_ size_t num_devices, + _In_ const char* compatibility_info, _Out_ OrtCompiledModelCompatibility* model_compatibility) noexcept { - return impl_->ValidateCompiledModelCompatibilityInfo(compatibility_info, model_compatibility); + return impl_->ValidateCompiledModelCompatibilityInfo(devices, num_devices, compatibility_info, model_compatibility); } // Function ORT calls to release an EP instance. 
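Note on the signature change above: the factory-level ValidateCompiledModelCompatibilityInfo now receives the hardware devices being validated alongside the compatibility string. As a rough illustration of what a plugin-side implementation of the updated callback might look like, here is a minimal C++ sketch; MyEpFactory, its expected_compatibility_info member, MyValidateCompiledModelCompatibilityInfo, and the include path are assumptions introduced only for this example — only the parameter list mirrors the signature introduced in this change.

#include <string>
#include "core/session/onnxruntime_c_api.h"  // assumed include path for the OrtEpFactory / OrtHardwareDevice declarations

// Hypothetical plugin EP factory; a real factory also fills in the other OrtEpFactory callbacks.
struct MyEpFactory : OrtEpFactory {
  std::string expected_compatibility_info;  // illustrative: string emitted when the model was compiled
};

static OrtStatus* ORT_API_CALL MyValidateCompiledModelCompatibilityInfo(
    OrtEpFactory* this_ptr,
    const OrtHardwareDevice* const* devices,
    size_t num_devices,
    const char* compatibility_info,
    OrtCompiledModelCompatibility* model_compatibility) noexcept {
  auto* factory = static_cast<MyEpFactory*>(this_ptr);

  if (devices == nullptr || num_devices == 0 || compatibility_info == nullptr) {
    // Nothing to validate against: report that the check does not apply.
    *model_compatibility = OrtCompiledModelCompatibility_EP_NOT_APPLICABLE;
    return nullptr;
  }

  // A real EP would inspect `devices` (e.g. vendor/device identity) and parse the
  // compatibility string it produced at compile time; this sketch only compares strings.
  *model_compatibility = (factory->expected_compatibility_info == compatibility_info)
                             ? OrtCompiledModelCompatibility_EP_SUPPORTED_OPTIMAL
                             : OrtCompiledModelCompatibility_EP_SUPPORTED_PREFER_RECOMPILATION;
  return nullptr;
}

Passing the devices explicitly is what allows GetModelCompatibilityForEpDevices to validate a compatibility string against the caller-selected subset of devices rather than against whatever the factory happens to enumerate.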
diff --git a/onnxruntime/core/session/plugin_ep/ep_factory_internal_impl.h b/onnxruntime/core/session/plugin_ep/ep_factory_internal_impl.h index 6c55730d83979..f29154d19c53c 100644 --- a/onnxruntime/core/session/plugin_ep/ep_factory_internal_impl.h +++ b/onnxruntime/core/session/plugin_ep/ep_factory_internal_impl.h @@ -62,8 +62,13 @@ class EpFactoryInternalImpl { return false; } - virtual OrtStatus* ValidateCompiledModelCompatibilityInfo(_In_ const char* compatibility_info, - _Out_ OrtCompiledModelCompatibility* model_compatibility) noexcept { + virtual OrtStatus* ValidateCompiledModelCompatibilityInfo( + _In_reads_(num_devices) const OrtHardwareDevice* const* devices, + _In_ size_t num_devices, + _In_ const char* compatibility_info, + _Out_ OrtCompiledModelCompatibility* model_compatibility) noexcept { + ORT_UNUSED_PARAMETER(devices); + ORT_UNUSED_PARAMETER(num_devices); ORT_UNUSED_PARAMETER(compatibility_info); // Default implementation: mark as not applicable *model_compatibility = OrtCompiledModelCompatibility_EP_NOT_APPLICABLE; diff --git a/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc b/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc index 3bfca62a4d011..c8829423fbe26 100644 --- a/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc +++ b/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc @@ -668,8 +668,15 @@ Status PluginExecutionProvider::ValidateCompiledModelCompatibilityInfo(const std // Plugin EP did not provide an implementation of this function, so we call a default implementation. return Base::ValidateCompiledModelCompatibilityInfo(compatibility_info, model_compatibility); } - // Delegate to the EP factory's validation method + // Delegate to the EP factory's validation method, passing hardware devices derived from our ep_devices_ + std::vector hardware_devices; + hardware_devices.reserve(ep_devices_.size()); + for (const auto* ep_device : ep_devices_) { + hardware_devices.push_back(ep_device->device); + } ORT_RETURN_IF_ERROR(ToStatusAndRelease(ep_factory_.ValidateCompiledModelCompatibilityInfo(&ep_factory_, + hardware_devices.data(), + hardware_devices.size(), compatibility_info.c_str(), &model_compatibility))); return Status::OK(); diff --git a/onnxruntime/core/session/plugin_ep/forward_to_factory_impl.h b/onnxruntime/core/session/plugin_ep/forward_to_factory_impl.h index 29793b503c9d1..2cceb1d08d536 100644 --- a/onnxruntime/core/session/plugin_ep/forward_to_factory_impl.h +++ b/onnxruntime/core/session/plugin_ep/forward_to_factory_impl.h @@ -46,9 +46,12 @@ struct ForwardToFactoryImpl { } static OrtStatus* ORT_API_CALL ValidateCompiledModelCompatibilityInfo(OrtEpFactory* this_ptr, + _In_reads_(num_devices) const OrtHardwareDevice* const* devices, + size_t num_devices, const char* compatibility_info, OrtCompiledModelCompatibility* model_compatibility) noexcept { - return static_cast(this_ptr)->ValidateCompiledModelCompatibilityInfo(compatibility_info, model_compatibility); + return static_cast(this_ptr)->ValidateCompiledModelCompatibilityInfo(devices, num_devices, + compatibility_info, model_compatibility); } static OrtStatus* ORT_API_CALL CreateAllocator(_In_ OrtEpFactory* this_ptr, diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index 24554560b4dde..eb06a65ad5330 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -1575,6 +1575,17 @@ void addGlobalMethods(py::module& m) { 
R"pbdoc(Get the list of available OrtEpDevice instances.)pbdoc", py::return_value_policy::reference); + m.def( + "get_model_compatibility_for_ep_devices", + [](const std::vector& ep_devices, + const std::string& compatibility_info) -> OrtCompiledModelCompatibility { + OrtCompiledModelCompatibility status = OrtCompiledModelCompatibility_EP_NOT_APPLICABLE; + Ort::ThrowOnError(Ort::GetApi().GetModelCompatibilityForEpDevices( + ep_devices.data(), ep_devices.size(), compatibility_info.c_str(), &status)); + return status; + }, + R"pbdoc("Validate a compiled model's compatibility information for one or more EP devices.)pbdoc"); + #if defined(USE_OPENVINO) || defined(USE_OPENVINO_PROVIDER_INTERFACE) m.def( "get_available_openvino_device_ids", []() -> std::vector { @@ -1759,6 +1770,12 @@ void addObjectMethods(py::module& m, ExecutionProviderRegistrationFn ep_registra .value("PRIORITY_BASED", ExecutionOrder::PRIORITY_BASED) .value("MEMORY_EFFICIENT", ExecutionOrder::MEMORY_EFFICIENT); + py::enum_(m, "OrtCompiledModelCompatibility") + .value("EP_NOT_APPLICABLE", OrtCompiledModelCompatibility_EP_NOT_APPLICABLE) + .value("EP_SUPPORTED_OPTIMAL", OrtCompiledModelCompatibility_EP_SUPPORTED_OPTIMAL) + .value("EP_SUPPORTED_PREFER_RECOMPILATION", OrtCompiledModelCompatibility_EP_SUPPORTED_PREFER_RECOMPILATION) + .value("EP_UNSUPPORTED", OrtCompiledModelCompatibility_EP_UNSUPPORTED); + py::enum_(m, "OrtAllocatorType") .value("INVALID", OrtInvalidAllocator) .value("ORT_DEVICE_ALLOCATOR", OrtDeviceAllocator) @@ -1782,7 +1799,7 @@ void addObjectMethods(py::module& m, ExecutionProviderRegistrationFn ep_registra type = OrtDevice::GPU; vendor = OrtDevice::VendorIds::MICROSOFT; } else if (type == OrtDevice::GPU) { -#if USE_CUDA +#if USE_CUDA || USE_NV || USE_NV_PROVIDER_INTERFACE || USE_CUDA_PROVIDER_INTERFACE vendor = OrtDevice::VendorIds::NVIDIA; #elif USE_ROCM || USE_MIGRAPHX vendor = OrtDevice::VendorIds::AMD; diff --git a/onnxruntime/test/framework/ep_compatibility_test.cc b/onnxruntime/test/framework/ep_compatibility_test.cc index be97cf2620881..a8a83fbe5ceb6 100644 --- a/onnxruntime/test/framework/ep_compatibility_test.cc +++ b/onnxruntime/test/framework/ep_compatibility_test.cc @@ -15,6 +15,7 @@ #include "core/session/onnxruntime_ep_device_ep_metadata_keys.h" #include "core/session/utils.h" #include "core/session/onnxruntime_c_api.h" +#include "core/session/onnxruntime_cxx_api.h" #include "core/session/abi_session_options_impl.h" #include "core/framework/error_code_helper.h" #include "dummy_provider.h" @@ -408,3 +409,122 @@ TEST_F(EpCompatibilityTest, TestSessionOptionConfiguration) { EXPECT_TRUE(has_config); EXPECT_EQ(config_value, "0"); } + +// ----------------------------- +// C API unit tests +// ----------------------------- + +namespace { + +// Helper to create an OrtEnv and fetch a CPU EP device pointer via the C API. +// Returns a pair of (env, cpu_device). Caller releases env via api->ReleaseEnv. 
+static std::pair CreateEnvAndGetCpuEpDevice(const OrtApi* api) { + OrtEnv* env = nullptr; + EXPECT_EQ(nullptr, api->CreateEnv(ORT_LOGGING_LEVEL_WARNING, "EpCompatCapiTest", &env)); + EXPECT_NE(env, nullptr); + + const OrtEpDevice* const* devices = nullptr; + size_t num_devices = 0; + EXPECT_EQ(nullptr, api->GetEpDevices(env, &devices, &num_devices)); + EXPECT_GT(num_devices, 0u); + + const OrtEpDevice* cpu_device = nullptr; + for (size_t i = 0; i < num_devices; ++i) { + const char* name = api->EpDevice_EpName(devices[i]); + if (name && std::string(name) == "CPUExecutionProvider") { + cpu_device = devices[i]; + break; + } + } + + // Fallback: just pick the first device if CPU wasn't found (environment-dependent builds). + if (!cpu_device && num_devices > 0) { + cpu_device = devices[0]; + } + + EXPECT_NE(cpu_device, nullptr); + return {env, cpu_device}; +} + +} // namespace + +TEST(EpCompatibilityCapiTest, InvalidArguments) { + const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION); + ASSERT_NE(api, nullptr); + + OrtCompiledModelCompatibility out_status = OrtCompiledModelCompatibility_EP_NOT_APPLICABLE; + + // ep_devices == nullptr + OrtStatus* st = api->GetModelCompatibilityForEpDevices(nullptr, 0, "info", &out_status); + ASSERT_NE(st, nullptr); + EXPECT_EQ(api->GetErrorCode(st), ORT_INVALID_ARGUMENT); + api->ReleaseStatus(st); + + // Prepare a valid device + auto [env, device] = CreateEnvAndGetCpuEpDevice(api); + ASSERT_NE(env, nullptr); + ASSERT_NE(device, nullptr); + + // compatibility_info == nullptr + const OrtEpDevice* devices1[] = {device}; + st = api->GetModelCompatibilityForEpDevices(devices1, 1, nullptr, &out_status); + ASSERT_NE(st, nullptr); + EXPECT_EQ(api->GetErrorCode(st), ORT_INVALID_ARGUMENT); + api->ReleaseStatus(st); + + // out_status == nullptr + st = api->GetModelCompatibilityForEpDevices(devices1, 1, "some-info", nullptr); + ASSERT_NE(st, nullptr); + EXPECT_EQ(api->GetErrorCode(st), ORT_INVALID_ARGUMENT); + api->ReleaseStatus(st); + + api->ReleaseEnv(env); +} + +TEST(EpCompatibilityCapiTest, CpuEpReturnsNotApplicableIfNoValidation) { + const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION); + ASSERT_NE(api, nullptr); + + auto [env, device] = CreateEnvAndGetCpuEpDevice(api); + ASSERT_NE(env, nullptr); + ASSERT_NE(device, nullptr); + + OrtCompiledModelCompatibility out_status = static_cast(-1); + const OrtEpDevice* devices2[] = {device}; + OrtStatus* st = api->GetModelCompatibilityForEpDevices(devices2, 1, "arbitrary-compat-string", &out_status); + ASSERT_EQ(st, nullptr) << (st ? api->GetErrorMessage(st) : ""); + + // For providers that don't implement validation, API should return EP_NOT_APPLICABLE. + EXPECT_EQ(out_status, OrtCompiledModelCompatibility_EP_NOT_APPLICABLE); + api->ReleaseStatus(st); + + api->ReleaseEnv(env); +} + +// ----------------------------- +// C++ API unit tests +// ----------------------------- + +TEST(EpCompatibilityCxxApiTest, SingleDeviceCpuProvider) { + Ort::Env env{ORT_LOGGING_LEVEL_WARNING, "EpCompatCxx"}; + auto devices = env.GetEpDevices(); + ASSERT_FALSE(devices.empty()); + + std::vector selected; + for (const auto& d : devices) { + if (std::string{d.EpName()} == "CPUExecutionProvider") { + selected.push_back(d); + break; + } + } + + ASSERT_FALSE(selected.empty()); + + // Pick a status that the CPU EP would never return to ensure the value is set correctly. 
+ OrtCompiledModelCompatibility status = OrtCompiledModelCompatibility_EP_SUPPORTED_PREFER_RECOMPILATION; + ASSERT_NO_FATAL_FAILURE({ + status = Ort::GetModelCompatibilityForEpDevices(selected, "arbitrary-compat-string"); + }); + + ASSERT_TRUE(status == OrtCompiledModelCompatibility_EP_NOT_APPLICABLE); +} \ No newline at end of file diff --git a/onnxruntime/test/framework/save_model_with_external_initializers.cc b/onnxruntime/test/framework/save_model_with_external_initializers.cc index 98874874d50e9..e70d870ef6988 100644 --- a/onnxruntime/test/framework/save_model_with_external_initializers.cc +++ b/onnxruntime/test/framework/save_model_with_external_initializers.cc @@ -84,7 +84,7 @@ Status LoadSaveAndCompareModel(const std::filesystem::path& input_onnx, size_t tensor_offset; std::stringstream stream(entry.value()); stream >> tensor_offset; - ORT_RETURN_IF_NOT(tensor_offset % model_saving_options.allocation_granularity == 0, + ORT_RETURN_IF_NOT(tensor_offset % model_saving_options.on_disk_alignment == 0, "tensor offset not align"); } } diff --git a/onnxruntime/test/optimizer/qdq_transformer_test.cc b/onnxruntime/test/optimizer/qdq_transformer_test.cc index 98640bb2f6b4c..f626a1704f7a1 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc @@ -5370,8 +5370,59 @@ TEST(QDQTransformerTests, WeightBiasQuantization_Conv_Weight_Bias) { #endif } +// Tests that the WeightBiasQuantization optimizer still processes nodes that contain a type-preserving no +// branch ReLU op to QuantizeLinear e.g., Q -> DQ -> Conv (w/ float weight initializer) -> ReLU -> Q -> DQ +TEST(QDQTransformerTests, WeightBiasQuantization_ConvWithReLU) { + auto test_case = [](bool use_contrib_qdq) { + auto build_test_case = [&](ModelTestBuilder& builder) { + NodeArg* input_fp32 = builder.MakeInput({1, 1, 4, 4}, -1.0f, 1.0f); + NodeArg* weight_fp32 = builder.MakeInitializer({2, 1, 3, 3}, -1.0f, 1.0f); + NodeArg* input_q = builder.MakeIntermediate(); + NodeArg* input_dq = builder.MakeIntermediate(); + NodeArg* conv_fp32 = builder.MakeIntermediate(); + NodeArg* relu_fp32 = builder.MakeIntermediate(); + NodeArg* relu_q = builder.MakeIntermediate(); + NodeArg* relu_dq = builder.MakeOutput(); + builder.AddQuantizeLinearNode(input_fp32, 0.18f, static_cast(127), input_q, use_contrib_qdq); + builder.AddDequantizeLinearNode(input_q, 0.18f, static_cast(127), input_dq, use_contrib_qdq); + auto& conv_node = builder.AddNode("Conv", {input_dq, weight_fp32}, {conv_fp32}); + conv_node.AddAttribute("dilations", std::vector{1, 1}); + conv_node.AddAttribute("kernel_shape", std::vector{3, 3}); + conv_node.AddAttribute("strides", std::vector{1, 1}); + conv_node.AddAttribute("group", static_cast(1)); + conv_node.AddAttribute("pads", std::vector{0, 0, 0, 0}); + builder.AddNode("Relu", {conv_fp32}, {relu_fp32}); + builder.AddQuantizeLinearNode(relu_fp32, 0.69f, static_cast(127), relu_q, use_contrib_qdq); + builder.AddDequantizeLinearNode(relu_q, 0.69f, static_cast(127), relu_dq, use_contrib_qdq); + }; + + // Conv's weights should be quantized and folded, one additional Q/DQ pair inserted for weight + auto check_transformed_graph = [](InferenceSessionWrapper& session) { + auto op_to_count = CountOpsInGraph(session.GetGraph()); + EXPECT_EQ(op_to_count["QuantizeLinear"] + op_to_count["com.microsoft.QuantizeLinear"], 2 + 1); + EXPECT_EQ(op_to_count["DequantizeLinear"] + op_to_count["com.microsoft.DequantizeLinear"], 2 + 1); + EXPECT_EQ(op_to_count["Conv"], 1); + EXPECT_EQ(op_to_count["Relu"], 
1); + }; + + TransformerTester(build_test_case, + check_transformed_graph, + TransformerLevel::Default, + TransformerLevel::Level1, + /*opset_version=*/20, + /*per_sample_tolerance=*/0.01, + /*relative_per_sample_tolerance=*/0.01, + /*transformer=*/std::make_unique()); + }; + + test_case(false); +#if !defined(DISABLE_CONTRIB_OPS) + test_case(true); +#endif +} + // Tests that the WeightBiasQuantization optimizer does not process nodes that do not -// already have an output that is consumed by a single QuantizeLinear node. +// already have an output that is consumed by a valid path to QuantizeLinear node. TEST(QDQTransformerTests, WeightBiasQuantization_SkipIfOutputNotQuantized) { auto test_case = [](bool add_final_reshape) { auto build_test_case = [&](ModelTestBuilder& builder) { diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index a22375320edae..46958843872d7 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -66,7 +66,9 @@ ABSL_FLAG(std::string, i, "", " [OpenVINO only] [num_of_threads]: Overrides the accelerator hardware type and precision with these values at runtime.\n" " [OpenVINO only] [cache_dir]: Explicitly specify the path to dump and load the blobs(Model caching) or cl_cache (Kernel Caching) files feature. If blob files are already present, it will be directly loaded.\n" " [OpenVINO only] [enable_opencl_throttling]: Enables OpenCL queue throttling for GPU device(Reduces the CPU Utilization while using GPU) \n" - " [Example] [For OpenVINO EP] -e openvino -i \"device_type|CPU num_of_threads|5 enable_opencl_throttling|true cache_dir|\"\"\"\n" + " [OpenVINO only] [reshape_input]: Sets model input shapes with support for bounded dynamic dimensions using 'min..max' syntax (e.g., [1..10,3,224,224]) \n" + " [OpenVINO only] [layout]: Specifies the layout for inputs/outputs to interpret tensor dimensions correctly. \n" + " [Example] [For OpenVINO EP] -e openvino -i \"device_type|CPU num_of_threads|5 enable_opencl_throttling|true reshape_input|[1,3,60,60..100] layout|[NCHW] cache_dir|\"\"\"\n" "\n" " [QNN only] [backend_type]: QNN backend type. E.g., 'cpu', 'htp'. Mutually exclusive with 'backend_path'.\n" " [QNN only] [backend_path]: QNN backend path. E.g., '/folderpath/libQnnHtp.so', '/winfolderpath/QnnHtp.dll'. Mutually exclusive with 'backend_type'.\n" diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index f1a40b1da8651..1ba3078efdb1a 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -863,12 +863,14 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); ov_options[key] = value; } else if (key == "reshape_input") { ov_options[key] = value; + } else if (key == "layout") { + ov_options[key] = value; } else { ORT_THROW( "[ERROR] [OpenVINO] wrong key type entered. Choose from the following runtime key options that are available for OpenVINO." 
" ['device_type', 'device_id', 'num_of_threads', 'load_config', 'cache_dir', 'num_streams', " "'enable_opencl_throttling', 'disable_dynamic_shapes', 'enable_qdq_optimizer'," - " 'enable_causallm', 'model_priority'] \n"); + " 'enable_causallm', 'reshape_input', 'layout', 'model_priority'] \n"); } } session_options.AppendExecutionProvider_OpenVINO_V2(ov_options); diff --git a/onnxruntime/test/platform/file_io_test.cc b/onnxruntime/test/platform/file_io_test.cc index ccc703716844f..e6f3c4dd8b89e 100644 --- a/onnxruntime/test/platform/file_io_test.cc +++ b/onnxruntime/test/platform/file_io_test.cc @@ -17,7 +17,7 @@ #include #include "gtest/gtest.h" - +#include "asserts.h" #include "core/common/span_utils.h" #include "test/util/include/file_util.h" @@ -157,7 +157,6 @@ TEST(FileIoTest, MapFileIntoMemory) { SYSTEM_INFO sysinfo; GetSystemInfo(&sysinfo); static const auto page_size = sysinfo.dwPageSize; - static const auto allocation_granularity = sysinfo.dwAllocationGranularity; ASSERT_GT(page_size, static_cast(0)); TempFilePath tmp(ORT_TSTR("map_file_test_")); @@ -167,21 +166,10 @@ TEST(FileIoTest, MapFileIntoMemory) { const auto offsets_and_lengths = GenerateValidOffsetLengthPairs( 0, expected_data.size(), page_size / 10); - for (const auto& offset_and_length : offsets_and_lengths) { - const auto offset = offset_and_length.first; - const auto length = offset_and_length.second; - - // The offset must be a multiple of the allocation granularity - if (offset % allocation_granularity != 0) { - continue; - } - + for (const auto& [offset, length] : offsets_and_lengths) { Env::MappedMemoryPtr mapped_memory{}; - auto status = Env::Default().MapFileIntoMemory( - tmp.path.c_str(), offset, length, mapped_memory); - ASSERT_TRUE(status.IsOK()) - << "MapFileIntoMemory failed for offset " << offset << " and length " << length - << " with error: " << status.ErrorMessage(); + ASSERT_STATUS_OK(Env::Default().MapFileIntoMemory( + tmp.path.c_str(), offset, length, mapped_memory)); auto mapped_span = gsl::make_span(mapped_memory.get(), length); @@ -190,20 +178,11 @@ TEST(FileIoTest, MapFileIntoMemory) { ASSERT_TRUE(SpanEq(mapped_span, expected_data_span)); } - { - Env::MappedMemoryPtr mapped_memory{}; - - // invalid - offset is not a multiple of the allocation granularity - ASSERT_FALSE(Env::Default().MapFileIntoMemory( - tmp.path.c_str(), allocation_granularity * 3 / 2, page_size / 10, mapped_memory) - .IsOK()); - } - { Env::MappedMemoryPtr mapped_memory{}; // invalid - negative offset - ASSERT_FALSE(Env::Default().MapFileIntoMemory(tmp.path.c_str(), -1, 0, mapped_memory).IsOK()); + ASSERT_STATUS_NOT_OK(Env::Default().MapFileIntoMemory(tmp.path.c_str(), -1, 0, mapped_memory)); } } #endif diff --git a/onnxruntime/test/providers/cpu/controlflow/loop_test.cc b/onnxruntime/test/providers/cpu/controlflow/loop_test.cc index a5fd37361a255..dc50a75873034 100644 --- a/onnxruntime/test/providers/cpu/controlflow/loop_test.cc +++ b/onnxruntime/test/providers/cpu/controlflow/loop_test.cc @@ -688,7 +688,7 @@ TEST(Loop, SubgraphTypeOverride) { Graph::ResolveOptions options; options.override_types = true; test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kTensorrtExecutionProvider}, &session_run_options, nullptr, + {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}, &session_run_options, nullptr, ExecutionMode::ORT_SEQUENTIAL, options); } @@ -1162,7 +1162,7 @@ TEST(Loop, SequenceAsLoopCarriedDependency) { test.AddSeqOutput("loop_var_0_final", seq_output); // Disable TensorRT on unsupported data type BOOL - 
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider});
 }

 #if !defined(DISABLE_OPTIONAL_TYPE)
diff --git a/onnxruntime/test/providers/cpu/tensor/dynamic_quantize_linear_test.cc b/onnxruntime/test/providers/cpu/tensor/dynamic_quantize_linear_test.cc
index f4d8cad90a714..1a71da6d95135 100644
--- a/onnxruntime/test/providers/cpu/tensor/dynamic_quantize_linear_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/dynamic_quantize_linear_test.cc
@@ -11,7 +11,8 @@ namespace test {
 // range = [-ve, +ve]
 TEST(QuantizeLinearOpTest, DynamicQuantizeLinear) {
   // TODO: Unskip when fixed #41968513
-  if (DefaultDmlExecutionProvider().get() != nullptr) {
+  if (DefaultDmlExecutionProvider().get() != nullptr ||
+      DefaultOpenVINOExecutionProvider().get() != nullptr) {
     GTEST_SKIP() << "Skipping because of the following error: Expected equality of these values: 26 and 25";
   }
diff --git a/onnxruntime/test/providers/openvino/openvino_ep_bfloat16_pass_test.cc b/onnxruntime/test/providers/openvino/openvino_ep_bfloat16_pass_test.cc
new file mode 100644
index 0000000000000..fc90563a61bb1
--- /dev/null
+++ b/onnxruntime/test/providers/openvino/openvino_ep_bfloat16_pass_test.cc
@@ -0,0 +1,116 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include
+#include
+#include
+
+#include "core/session/onnxruntime_cxx_api.h"
+#include "core/framework/float16.h"
+
+#include "test/util/include/test/test_environment.h"
+#include "test/optimizer/qdq_test_utils.h"
+
+#include "gtest/gtest.h"
+#include "gmock/gmock.h"
+
+using namespace ONNX_NAMESPACE;
+using namespace onnxruntime::logging;
+
+extern std::unique_ptr<Ort::Env> ort_env;
+
+class OVEP_BF16_Tests : public ::testing::TestWithParam<std::string> {};
+
+namespace detail {
+auto ConstructModel() {
+  using namespace onnxruntime;
+  using namespace test;
+
+  std::unordered_map<std::string, int> domain_to_version;
+  domain_to_version[kOnnxDomain] = 19;
+  Model model("Bfloat16Tester", true, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(),
+              domain_to_version, {}, DefaultLoggingManager().DefaultLogger());
+
+  Graph& graph = model.MainGraph();
+  ModelTestBuilder builder(graph);
+  auto dim = 4;
+  std::vector<float> input_data(dim, 1.0f);
+  auto* input = builder.MakeInput<float>({dim}, input_data);
+  builder.graph_.SetInputs({input});
+
+  auto* cast_to_bf16 = builder.MakeIntermediate();
+  Node& cast_node = builder.AddNode("Cast", {input}, {cast_to_bf16}, "");
+  cast_node.AddAttribute("to", static_cast<int64_t>(ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16));
+
+  std::vector<onnxruntime::BFloat16> weight_data(dim * dim);
+  for (std::size_t i = 0; i < weight_data.size(); ++i)
+    weight_data[i] = onnxruntime::BFloat16(static_cast<float>(i % dim) / dim);
+  auto* weights = builder.MakeInitializer({dim, dim}, weight_data);
+
+  auto* matmul_out = builder.MakeIntermediate();
+  builder.AddNode("MatMul", {cast_to_bf16, weights}, {matmul_out});
+
+  std::vector<onnxruntime::BFloat16> weight_data_2(dim * dim);
+  for (std::size_t i = 0; i < weight_data_2.size(); ++i)
+    weight_data_2[i] = onnxruntime::BFloat16(static_cast<float>(i % dim) / dim);
+  auto* weights_2 = builder.MakeInitializer({dim, dim}, weight_data_2);
+
+  auto* matmul_out_2 = builder.MakeIntermediate();
+  builder.AddNode("MatMul", {matmul_out, weights_2}, {matmul_out_2});
+
+  auto* output = builder.MakeOutput();
+  Node& cast2_node = builder.AddNode("Cast", {matmul_out_2}, {output});
+  cast2_node.AddAttribute("to", static_cast<int64_t>(ONNX_NAMESPACE::TensorProto_DataType_FLOAT));
+
+  builder.SetGraphOutputs();
+  auto st = model.MainGraph().Resolve();
+  if (st != Status::OK())
+    throw std::runtime_error(st.ErrorMessage());
+  return model;
+}
+
+auto ProbeDevice(const std::string& device) {
+  static std::map<std::string, bool> is_present;
+  if (is_present.find(device) == is_present.end()) {
+    Ort::SessionOptions sessionOptions;
+    std::unordered_map<std::string, std::string> ov_options;
+    ov_options["device_type"] = device;
+    try {
+      sessionOptions.AppendExecutionProvider_OpenVINO_V2(ov_options);
+      is_present[device] = true;
+    } catch (...) {
+      is_present[device] = false;
+    }
+  }
+  return is_present[device];
+}
+} // namespace detail
+
+namespace onnxruntime {
+namespace test {
+
+TEST_P(OVEP_BF16_Tests, TestModelConversion) {
+  Ort::SessionOptions sessionOptions;
+  std::unordered_map<std::string, std::string> ov_options;
+  const auto& device = GetParam();
+  if (!::detail::ProbeDevice(device))
+    GTEST_SKIP() << device + " is not available on this machine";
+
+  ov_options["device_type"] = device;
+  auto model = ::detail::ConstructModel();
+  sessionOptions.AppendExecutionProvider_OpenVINO_V2(ov_options);
+
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  auto model_data_span = AsByteSpan(model_data.data(), model_data.size());
+  try {
+    Ort::Session session(*ort_env, model_data_span.data(), model_data_span.size(), sessionOptions);
+  } catch (...) {
+    FAIL();
+  }
+}
+INSTANTIATE_TEST_SUITE_P(OVEP_Tests,
+                         OVEP_BF16_Tests,
+                         ::testing::Values("CPU", "GPU", "NPU"));
+} // namespace test
+} // namespace onnxruntime
diff --git a/onnxruntime/test/python/onnxruntime_test_python_ep_compatibility.py b/onnxruntime/test/python/onnxruntime_test_python_ep_compatibility.py
new file mode 100644
index 0000000000000..8e69fdf088103
--- /dev/null
+++ b/onnxruntime/test/python/onnxruntime_test_python_ep_compatibility.py
@@ -0,0 +1,46 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import os
+import platform
+import sys
+import unittest
+
+from onnxruntime.capi.onnxruntime_pybind11_state import (
+    OrtCompiledModelCompatibility,
+    get_ep_devices,
+    get_model_compatibility_for_ep_devices,
+)
+
+# handle change from python 3.8 and on where loading a dll from the current directory needs to be explicitly allowed.
+if platform.system() == "Windows" and sys.version_info.major >= 3 and sys.version_info.minor >= 8:  # noqa: YTT204
+    os.add_dll_directory(os.getcwd())
+
+
+class TestEpCompatibility(unittest.TestCase):
+    def test_invalid_args(self):
+        # empty devices
+        with self.assertRaises(RuntimeError):
+            get_model_compatibility_for_ep_devices([], "info")
+        # None compatibility info should raise TypeError before native call
+        with self.assertRaises(TypeError):
+            get_model_compatibility_for_ep_devices(get_ep_devices(), None)  # type: ignore[arg-type]
+
+    def test_basic_smoke(self):
+        devices = list(get_ep_devices())
+        if not devices:
+            self.skipTest("No EP devices available in this build")
+
+        # Always select CPUExecutionProvider; skip if not present.
+        cpu_devices = [d for d in devices if getattr(d, "ep_name", None) == "CPUExecutionProvider"]
+        if not cpu_devices:
+            self.skipTest("CPUExecutionProvider not available in this build")
+        selected = [cpu_devices[0]]
+
+        # API requires all devices belong to the same EP; we pass only one.
+        status = get_model_compatibility_for_ep_devices(selected, "arbitrary-compat-string")
+        self.assertEqual(status, OrtCompiledModelCompatibility.EP_NOT_APPLICABLE)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py b/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py
new file mode 100644
index 0000000000000..d5c80a4a1f4ba
--- /dev/null
+++ b/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py
@@ -0,0 +1,468 @@
+# Copyright (c) NVIDIA Corporation. All rights reserved.
+# Licensed under the MIT License.
+from __future__ import annotations
+
+import sys
+import unittest
+from collections.abc import Sequence
+
+import numpy as np
+import torch
+from autoep_helper import AutoEpTestCase
+from helper import get_name
+from numpy.testing import assert_almost_equal
+from onnx import TensorProto, helper
+from onnx.defs import onnx_opset_version
+
+import onnxruntime as onnxrt
+from onnxruntime.capi._pybind_state import OrtDevice as C_OrtDevice
+from onnxruntime.capi._pybind_state import OrtValue as C_OrtValue
+from onnxruntime.capi._pybind_state import OrtValueVector, SessionIOBinding
+
+
+class TestNvTensorRTRTXAutoEP(AutoEpTestCase):
+    """
+    Test suite for the NvTensorRTRTX Execution Provider.
+
+    This class contains tests for registering the NvTensorRTRTX EP,
+    selecting it using different policies, and running inference with various
+    I/O binding configurations.
+    """
+
+    ep_lib_path = "onnxruntime_providers_nv_tensorrt_rtx.dll"
+    ep_name = "NvTensorRTRTXExecutionProvider"
+
+    def setUp(self):
+        if sys.platform != "win32":
+            self.skipTest("Skipping test because device discovery is only supported on Windows")
+        self.register_execution_provider_library(self.ep_name, self.ep_lib_path)
+
+    def tearDown(self):
+        self.unregister_execution_provider_library(self.ep_name)
+
+    def _create_ortvalue_input_on_gpu(self, device):
+        return onnxrt.OrtValue.ortvalue_from_numpy(
+            np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32), device, 0
+        )
+
+    def _create_ortvalue_alternate_input_on_gpu(self, device):
+        return onnxrt.OrtValue.ortvalue_from_numpy(
+            np.array([[2.0, 4.0], [6.0, 8.0], [10.0, 12.0]], dtype=np.float32),
+            device,
+            0,
+        )
+
+    def _create_uninitialized_ortvalue_input_on_gpu(self, device):
+        return onnxrt.OrtValue.ortvalue_from_shape_and_type([3, 2], np.float32, device, 0)
+
+    def _create_numpy_input(self):
+        return np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32)
+
+    def _create_expected_output(self):
+        return np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32)
+
+    def _create_expected_output_alternate(self):
+        return np.array([[2.0, 8.0], [18.0, 32.0], [50.0, 72.0]], dtype=np.float32)
+
+    def torch_to_onnx_type(self, torch_dtype):
+        if torch_dtype == torch.float32:
+            return TensorProto.FLOAT
+        elif torch_dtype == torch.float16:
+            return TensorProto.FLOAT16
+        elif torch_dtype == torch.bfloat16:
+            return TensorProto.BFLOAT16
+        elif torch_dtype == torch.int8:
+            return TensorProto.INT8
+        elif torch_dtype == torch.int32:
+            return TensorProto.INT32
+        elif torch_dtype == torch.int64:
+            return TensorProto.INT64
+        else:
+            raise TypeError(f"Unsupported dtype: {torch_dtype}")
+
+    def test_nv_tensorrt_rtx_ep_register_and_inference(self):
+        """
+        Test registration of NvTensorRTRTX EP, adding its OrtDevice to the SessionOptions, and running inference.
+ """ + ep_devices = onnxrt.get_ep_devices() + nv_tensorrt_rtx_ep_device = next((d for d in ep_devices if d.ep_name == self.ep_name), None) + self.assertIsNotNone(nv_tensorrt_rtx_ep_device) + self.assertEqual(nv_tensorrt_rtx_ep_device.ep_vendor, "NVIDIA") + + hw_device = nv_tensorrt_rtx_ep_device.device + self.assertEqual(hw_device.type, onnxrt.OrtHardwareDeviceType.GPU) + + # Run sample model and check output + sess = onnxrt.InferenceSession(get_name("mul_1.onnx")) + + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + input_name = sess.get_inputs()[0].name + res = sess.run([], {input_name: x}) + output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) + np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + + def test_nv_tensorrt_rtx_ep_prefer_gpu_and_inference(self): + """ + Test selecting NvTensorRTRTX EP via the PREFER_GPU policy and running inference. + """ + # Set a policy to prefer GPU. NvTensorRTRTX should be selected. + sess_options = onnxrt.SessionOptions() + sess_options.set_provider_selection_policy(onnxrt.OrtExecutionProviderDevicePolicy.PREFER_GPU) + self.assertTrue(sess_options.has_providers()) + + # Run sample model and check output + sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), sess_options=sess_options) + + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + input_name = sess.get_inputs()[0].name + res = sess.run([], {input_name: x}) + output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) + np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + + def test_nv_tensorrt_rtx_ep_selection_delegate_and_inference(self): + """ + Test selecting NvTensorRTRTX EP via the custom EP selection delegate function and then run inference. + """ + + # User's custom EP selection function. + def my_delegate( + ep_devices: Sequence[onnxrt.OrtEpDevice], + model_metadata: dict[str, str], + runtime_metadata: dict[str, str], + max_selections: int, + ) -> Sequence[onnxrt.OrtEpDevice]: + self.assertGreater(len(model_metadata), 0) + self.assertGreaterEqual(len(ep_devices), 1) + self.assertGreaterEqual(max_selections, 2) + + nv_tensorrt_rtx_ep_device = next((d for d in ep_devices if d.ep_name == self.ep_name), None) + self.assertIsNotNone(nv_tensorrt_rtx_ep_device) + + # Select the NvTensorRTRTX device + return [nv_tensorrt_rtx_ep_device] + + sess_options = onnxrt.SessionOptions() + sess_options.set_provider_selection_policy_delegate(my_delegate) + self.assertTrue(sess_options.has_providers()) + + # Run sample model and check output + sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), sess_options=sess_options) + + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + input_name = sess.get_inputs()[0].name + res = sess.run([], {input_name: x}) + output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) + np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + + def test_bind_input_only(self): + """ + Test I/O binding with input data only. + """ + # Set a policy to prefer GPU. NvTensorRTRTX should be selected. 
+ sess_options = onnxrt.SessionOptions() + sess_options.set_provider_selection_policy(onnxrt.OrtExecutionProviderDevicePolicy.PREFER_GPU) + self.assertTrue(sess_options.has_providers()) + + input = self._create_ortvalue_input_on_gpu("cuda") + + session = onnxrt.InferenceSession(get_name("mul_1.onnx"), sess_options=sess_options) + io_binding = session.io_binding() + + # Bind input to the GPU + io_binding.bind_input("X", "cuda", 0, np.float32, [3, 2], input.data_ptr()) + + # Sync if different streams + io_binding.synchronize_inputs() + + # Bind output to CPU + io_binding.bind_output("Y") + + # Invoke Run + session.run_with_iobinding(io_binding) + + # Sync if different streams + io_binding.synchronize_outputs() + + # Get outputs over to CPU (the outputs which were bound to the GPU will get copied over to the host + # here) + ort_output = io_binding.copy_outputs_to_cpu()[0] + + # Validate results + self.assertTrue(np.array_equal(self._create_expected_output(), ort_output)) + + def test_bind_input_and_bind_output_with_ortvalues(self): + """ + Test I/O binding with OrtValues for both input and output. + """ + # Set a policy to prefer GPU. NvTensorRTRTX EP should be selected. + sess_options = onnxrt.SessionOptions() + sess_options.set_provider_selection_policy(onnxrt.OrtExecutionProviderDevicePolicy.PREFER_GPU) + self.assertTrue(sess_options.has_providers()) + + session = onnxrt.InferenceSession(get_name("mul_1.onnx"), sess_options=sess_options) + io_binding = session.io_binding() + + # Bind ortvalue as input + input_ortvalue = self._create_ortvalue_input_on_gpu("cuda") + io_binding.bind_ortvalue_input("X", input_ortvalue) + + # Bind ortvalue as output + output_ortvalue = self._create_uninitialized_ortvalue_input_on_gpu("cuda") + io_binding.bind_ortvalue_output("Y", output_ortvalue) + + # Sync if different streams + io_binding.synchronize_inputs() + + # Invoke Run + session.run_with_iobinding(io_binding) + + # Sync if different streams + io_binding.synchronize_outputs() + + # Inspect contents of output_ortvalue and make sure that it has the right contents + self.assertTrue(np.array_equal(self._create_expected_output(), output_ortvalue.numpy())) + + # Bind another ortvalue as input + input_ortvalue_2 = self._create_ortvalue_alternate_input_on_gpu("cuda") + io_binding.bind_ortvalue_input("X", input_ortvalue_2) + + # Sync if different streams + io_binding.synchronize_inputs() + + # Invoke Run + session.run_with_iobinding(io_binding) + + # Sync if different streams + io_binding.synchronize_outputs() + + # Inspect contents of output_ortvalue and make sure that it has the right contents + self.assertTrue(np.array_equal(self._create_expected_output_alternate(), output_ortvalue.numpy())) + + def test_bind_input_and_non_preallocated_output(self): + """ + Test I/O binding with non-preallocated output. 
+ """ + sess_options = onnxrt.SessionOptions() + sess_options.set_provider_selection_policy(onnxrt.OrtExecutionProviderDevicePolicy.PREFER_GPU) + self.assertTrue(sess_options.has_providers()) + + session = onnxrt.InferenceSession(get_name("mul_1.onnx"), sess_options=sess_options) + io_binding = session.io_binding() + + input = self._create_ortvalue_input_on_gpu("cuda") + + # Bind input to the GPU + io_binding.bind_input("X", "cuda", 0, np.float32, [3, 2], input.data_ptr()) + + # Bind output to the GPU + io_binding.bind_output("Y", "cuda") + + # Sync if different streams + io_binding.synchronize_inputs() + + # Invoke Run + session.run_with_iobinding(io_binding) + + # Sync if different streams + io_binding.synchronize_outputs() + + # This call returns an OrtValue which has data allocated by ORT on the GPU + ort_outputs = io_binding.get_outputs() + self.assertEqual(len(ort_outputs), 1) + self.assertEqual(ort_outputs[0].device_name(), "cuda") + # Validate results (by copying results to CPU by creating a Numpy object) + self.assertTrue(np.array_equal(self._create_expected_output(), ort_outputs[0].numpy())) + + # We should be able to repeat the above process as many times as we want - try once more + ort_outputs = io_binding.get_outputs() + self.assertEqual(len(ort_outputs), 1) + self.assertEqual(ort_outputs[0].device_name(), "cuda") + # Validate results (by copying results to CPU by creating a Numpy object) + self.assertTrue(np.array_equal(self._create_expected_output(), ort_outputs[0].numpy())) + + input = self._create_ortvalue_alternate_input_on_gpu("cuda") + + # Change the bound input and validate the results in the same bound OrtValue + # Bind alternate input to the GPU + io_binding.bind_input( + "X", + "cuda", + 0, + np.float32, + [3, 2], + input.data_ptr(), + ) + + # Sync if different streams + io_binding.synchronize_inputs() + + # Invoke Run + session.run_with_iobinding(io_binding) + + # Sync if different streams + io_binding.synchronize_outputs() + + # This call returns an OrtValue which has data allocated by ORT on the GPU + ort_outputs = io_binding.get_outputs() + self.assertEqual(len(ort_outputs), 1) + self.assertEqual(ort_outputs[0].device_name(), "cuda") + # Validate results (by copying results to CPU by creating a Numpy object) + self.assertTrue(np.array_equal(self._create_expected_output_alternate(), ort_outputs[0].numpy())) + + def test_bind_input_and_preallocated_output(self): + """ + Test I/O binding with preallocated output. 
+ """ + sess_options = onnxrt.SessionOptions() + sess_options.set_provider_selection_policy(onnxrt.OrtExecutionProviderDevicePolicy.PREFER_GPU) + self.assertTrue(sess_options.has_providers()) + + input = self._create_ortvalue_input_on_gpu("cuda") + + session = onnxrt.InferenceSession(get_name("mul_1.onnx"), sess_options=sess_options) + io_binding = session.io_binding() + + # Bind input to the GPU + io_binding.bind_input("X", "cuda", 0, np.float32, [3, 2], input.data_ptr()) + + # Bind output to the GPU + output = self._create_uninitialized_ortvalue_input_on_gpu("cuda") + io_binding.bind_output("Y", "cuda", 0, np.float32, [3, 2], output.data_ptr()) + + # Sync if different streams + io_binding.synchronize_inputs() + + # Invoke Run + session.run_with_iobinding(io_binding) + + # Sync if different streams + io_binding.synchronize_outputs() + + # Get outputs over to CPU (the outputs which were bound to the GPU will get copied over to the host + # here) + ort_output_vals = io_binding.copy_outputs_to_cpu()[0] + # Validate results + self.assertTrue(np.array_equal(self._create_expected_output(), ort_output_vals)) + + # Validate if ORT actually wrote to pre-allocated buffer by copying the allocated buffer + # to the host and validating its contents + ort_output_vals_in_cpu = output.numpy() + # Validate results + self.assertTrue(np.array_equal(self._create_expected_output(), ort_output_vals_in_cpu)) + + def test_bind_input_types(self): + """ + Test I/O binding with various input data types. + """ + sess_options = onnxrt.SessionOptions() + sess_options.set_provider_selection_policy(onnxrt.OrtExecutionProviderDevicePolicy.PREFER_GPU) + self.assertTrue(sess_options.has_providers()) + opset = onnx_opset_version() + device = C_OrtDevice(C_OrtDevice.cuda(), C_OrtDevice.default_memory(), 0) + + for dtype in [ + np.float32, + # np.float64, + np.int32, + # np.uint32, + np.int64, + # np.uint64, + # np.int16, + # np.uint16, + # np.int8, + np.uint8, + np.float16, + np.bool_, + ]: + with self.subTest(dtype=dtype, inner_device=str(device)): + x = np.arange(8).reshape((-1, 2)).astype(dtype) + proto_dtype = helper.np_dtype_to_tensor_dtype(x.dtype) + + X = helper.make_tensor_value_info("X", proto_dtype, [None, x.shape[1]]) # noqa: N806 + Y = helper.make_tensor_value_info("Y", proto_dtype, [None, x.shape[1]]) # noqa: N806 + + # inference + node_add = helper.make_node("Identity", ["X"], ["Y"]) + + # graph + graph_def = helper.make_graph([node_add], "lr", [X], [Y], []) + model_def = helper.make_model( + graph_def, + producer_name="dummy", + ir_version=7, + producer_version="0", + opset_imports=[helper.make_operatorsetid("", opset)], + ) + + sess = onnxrt.InferenceSession(model_def.SerializeToString(), sess_options=sess_options) + + bind = SessionIOBinding(sess._sess) + ort_value = C_OrtValue.ortvalue_from_numpy(x, device) + bind.bind_ortvalue_input("X", ort_value) + bind.bind_output("Y", device) + sess._sess.run_with_iobinding(bind, None) + ortvaluevector = bind.get_outputs() + self.assertIsInstance(ortvaluevector, OrtValueVector) + ortvalue = bind.get_outputs()[0] + y = ortvalue.numpy() + assert_almost_equal(x, y) + + bind = SessionIOBinding(sess._sess) + bind.bind_input("X", device, dtype, x.shape, ort_value.data_ptr()) + bind.bind_output("Y", device) + sess._sess.run_with_iobinding(bind, None) + ortvalue = bind.get_outputs()[0] + y = ortvalue.numpy() + assert_almost_equal(x, y) + + def test_bind_onnx_types_from_torch(self): + """ + Test I/O binding with various input data types. 
+ """ + sess_options = onnxrt.SessionOptions() + sess_options.set_provider_selection_policy(onnxrt.OrtExecutionProviderDevicePolicy.PREFER_GPU) + self.assertTrue(sess_options.has_providers()) + opset = onnx_opset_version() + + for dtype in [ + torch.float32, + torch.float16, + torch.bfloat16, + torch.int32, + torch.int64, + ]: + with self.subTest(dtype=dtype): + proto_dtype = self.torch_to_onnx_type(dtype) + + x_ = helper.make_tensor_value_info("X", proto_dtype, [None]) + y_ = helper.make_tensor_value_info("Y", proto_dtype, [None]) + node_add = helper.make_node("Identity", ["X"], ["Y"]) + graph_def = helper.make_graph([node_add], "lr", [x_], [y_], []) + model_def = helper.make_model( + graph_def, + producer_name="dummy", + ir_version=10, + producer_version="0", + opset_imports=[helper.make_operatorsetid("", opset)], + ) + sess = onnxrt.InferenceSession(model_def.SerializeToString(), sess_options=sess_options) + + dev = "cuda" if torch.cuda.is_available() else "cpu" + device = ( + C_OrtDevice(C_OrtDevice.cuda(), C_OrtDevice.default_memory(), 0) + if dev == "cuda" + else C_OrtDevice(C_OrtDevice.cpu(), C_OrtDevice.default_memory(), 0) + ) + + x = torch.arange(8, dtype=dtype, device=dev) + y = torch.empty(8, dtype=dtype, device=dev) + + bind = SessionIOBinding(sess._sess) + bind.bind_input("X", device, proto_dtype, x.shape, x.data_ptr()) + bind.bind_output("Y", device, proto_dtype, y.shape, y.data_ptr()) + sess._sess.run_with_iobinding(bind, None) + self.assertTrue(torch.equal(x, y)) + + +if __name__ == "__main__": + unittest.main(verbosity=1) diff --git a/tools/ci_build/github/windows/extract_nuget_files.ps1 b/tools/ci_build/github/windows/extract_nuget_files.ps1 index ff8f63a85b97a..20d6c1f2b63a5 100644 --- a/tools/ci_build/github/windows/extract_nuget_files.ps1 +++ b/tools/ci_build/github/windows/extract_nuget_files.ps1 @@ -1,105 +1,119 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -# This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline +# This file is used by Zip-Nuget-Java Packaging Pipeline -# Re-construct a build directory that contains binaries from all the different platforms we're including -# in the native ORT nuget package +# Define the directory for NuGet artifacts. $nuget_artifacts_dir = "$Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo\nuget-artifacts" -New-Item -Path $nuget_artifacts_dir -ItemType directory +# Create the directory if it doesn't exist. +New-Item -Path $nuget_artifacts_dir -ItemType directory -ErrorAction SilentlyContinue ## .zip files -# unzip directly -# exclude the iOS xcframework as we need to leave that zipped up to preserve symlinks -Get-ChildItem -Path $Env:BUILD_BINARIESDIRECTORY\nuget-artifact\* -Include *.zip -Exclude onnxruntime_ios_xcframework.*.zip | +# Unzip files directly, excluding the iOS xcframework to preserve its symlinks. +Get-ChildItem -Path "$Env:BUILD_BINARIESDIRECTORY\nuget-artifact\*" -Include *.zip -Exclude onnxruntime_ios_xcframework.*.zip | Foreach-Object { - $cmd = "7z.exe x $($_.FullName) -y -o$nuget_artifacts_dir" - Write-Output $cmd - Invoke-Expression -Command $cmd + # The -snld20 flag is used to bypass security checks for creating symbolic links (added in 7-Zip 25.01). + $arguments = "x", "$($_.FullName)", "-y", "-o$nuget_artifacts_dir", "-snld20" + Write-Output "Executing: 7z.exe $arguments" + # Directly call 7z.exe using the call operator '&' + & 7z.exe $arguments + # Check the exit code of the last command. 
A non-zero code indicates an error. + if ($LASTEXITCODE -ne 0) { + throw "Error extracting '$($_.FullName)'. Exit code: $LASTEXITCODE" + } } ## .tgz files -# first extract the tar file from the tgz -Get-ChildItem $Env:BUILD_BINARIESDIRECTORY\nuget-artifact -Filter *.tgz | +# First, extract the .tar file from the .tgz archive. +Get-ChildItem "$Env:BUILD_BINARIESDIRECTORY\nuget-artifact" -Filter *.tgz | Foreach-Object { - $cmd = "7z.exe x $($_.FullName) -y -o$Env:BUILD_BINARIESDIRECTORY\nuget-artifact" - Write-Output $cmd - Invoke-Expression -Command $cmd + # The -snld20 flag is used to bypass security checks for creating symbolic links (added in 7-Zip 25.01). + $arguments = "x", "$($_.FullName)", "-y", "-o$Env:BUILD_BINARIESDIRECTORY\nuget-artifact", "-snld20" + Write-Output "Executing: 7z.exe $arguments" + & 7z.exe $arguments + if ($LASTEXITCODE -ne 0) { + throw "Error extracting '$($_.FullName)'. Exit code: $LASTEXITCODE" + } } -# now extract the actual folder structure from the tar file to the build dir -Get-ChildItem $Env:BUILD_BINARIESDIRECTORY\nuget-artifact -Filter *.tar | +# Now, extract the contents from the .tar file into the final directory. +Get-ChildItem "$Env:BUILD_BINARIESDIRECTORY\nuget-artifact" -Filter *.tar | Foreach-Object { - $cmd = "7z.exe x $($_.FullName) -y -o$nuget_artifacts_dir" - Write-Output $cmd - Invoke-Expression -Command $cmd + # The -snld20 flag is used to bypass security checks for creating symbolic links (added in 7-Zip 25.01). + $arguments = "x", "$($_.FullName)", "-y", "-o$nuget_artifacts_dir", "-snld20" + Write-Output "Executing: 7z.exe $arguments" + & 7z.exe $arguments + if ($LASTEXITCODE -ne 0) { + throw "Error extracting '$($_.FullName)'. Exit code: $LASTEXITCODE" + } } -# process iOS xcframework -$xcframeworks = Get-ChildItem $Env:BUILD_BINARIESDIRECTORY\nuget-artifact -Filter onnxruntime_ios_xcframework.*.zip +# Process iOS xcframework +$xcframeworks = Get-ChildItem "$Env:BUILD_BINARIESDIRECTORY\nuget-artifact" -Filter onnxruntime_ios_xcframework.*.zip if ($xcframeworks.Count -eq 1) { - $xcframework = $xcframeworks[0] - $target_dir = "$nuget_artifacts_dir\onnxruntime-ios-xcframework" - # remove version info from filename and use required filename format - $target_file = "$target_dir\onnxruntime.xcframework.zip" - New-Item -Path $target_dir -ItemType directory + $xcframework = $xcframeworks[0] + $target_dir = "$nuget_artifacts_dir\onnxruntime-ios-xcframework" + # Use the required filename format, removing version info. + $target_file = "$target_dir\onnxruntime.xcframework.zip" + New-Item -Path $target_dir -ItemType directory -ErrorAction SilentlyContinue - Write-Output "Copy-Item $($xcframework.FullName) $target_file" - Copy-Item $xcframework.FullName $target_file + Write-Output "Copying $($xcframework.FullName) to $target_file" + Copy-Item $xcframework.FullName $target_file } elseif ($xcframeworks.Count -gt 1) { - Write-Error "Expected at most one onnxruntime_ios_xcframework*.zip file but got: [$xcframeworks]" + Write-Error "Expected at most one onnxruntime_ios_xcframework*.zip file but got: [$xcframeworks]" } - -# copy android AAR. -# for full build of onnxruntime Android AAR, there should only be one .aar file -# called onnxruntime-android-x.y.z.aar or onnxruntime-training-android-x.y.z.aar but sanity check that -$aars = Get-ChildItem $Env:BUILD_BINARIESDIRECTORY\nuget-artifact -Filter *.aar +# Copy Android AAR file. +# There should only be one .aar file for a full build. 
+$aars = Get-ChildItem "$Env:BUILD_BINARIESDIRECTORY\nuget-artifact" -Filter *.aar if ($aars.Count -eq 1) { - $aar = $aars[0] - $aar_prefix = "onnxruntime" - if ($aar -like "onnxruntime-training*") { - $aar_prefix = "onnxruntime-training" - } - $target_dir = "$nuget_artifacts_dir\$aar_prefix-android-aar" - $target_file = "$target_dir\onnxruntime.aar" # remove '-mobile' and version info from filename - New-Item -Path $target_dir -ItemType directory + $aar = $aars[0] + $aar_prefix = "onnxruntime" + if ($aar.Name -like "onnxruntime-training*") { + $aar_prefix = "onnxruntime-training" + } + $target_dir = "$nuget_artifacts_dir\$aar_prefix-android-aar" + # Remove version info from the filename for consistency. + $target_file = "$target_dir\onnxruntime.aar" + New-Item -Path $target_dir -ItemType directory -ErrorAction SilentlyContinue - Write-Output "Copy-Item $($aar.FullName) $target_file" - Copy-Item $aar.FullName $target_file + Write-Output "Copying $($aar.FullName) to $target_file" + Copy-Item $aar.FullName $target_file } elseif ($aars.Count -gt 1) { - Write-Error "Expected at most one Android .aar file but got: [$aars]" + Write-Error "Expected at most one Android .aar file but got: [$aars]" } -# Check whether this is a training pipeline -$is_training_pipeline = $false -if (Test-Path -Path $nuget_artifacts_dir\onnxruntime-training-win-x64-*) { - $is_training_pipeline = $true - Write-Output "onnxruntime-training-win-x64-* dir exists. This is a training pipeline." +# Check if this is a training pipeline by looking for a specific directory. +$is_training_pipeline = Test-Path -Path "$nuget_artifacts_dir\onnxruntime-training-win-x64-*" +if ($is_training_pipeline) { + Write-Output "onnxruntime-training-win-x64-* dir exists. This is a training pipeline." } -# Copy onnxruntime and protoc binaries to the binaries dir as these are required -# by Microsoft.ML.OnnxRuntime.Tests.NetCoreApp +# Copy onnxruntime and protoc binaries required by tests. +$destinationDir = "$Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo" if ($is_training_pipeline) { - Copy-Item -Path $nuget_artifacts_dir\onnxruntime-training-win-x64-*\lib\* -Destination $Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo + Copy-Item -Path "$nuget_artifacts_dir\onnxruntime-training-win-x64-*\lib\*" -Destination $destinationDir -Recurse } else { - Copy-Item -Path $nuget_artifacts_dir\onnxruntime-win-x64-*\lib\* -Destination $Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo + Copy-Item -Path "$nuget_artifacts_dir\onnxruntime-win-x64-*\lib\*" -Destination $destinationDir -Recurse } -"Get-ChildItem -Directory -Path $nuget_artifacts_dir\onnxruntime-*" -$ort_dirs = Get-ChildItem -Directory -Path $nuget_artifacts_dir\onnxruntime-* -foreach ($ort_dir in $ort_dirs) -{ - # remove the last '-xxx' segment from the dir name. typically that's the architecture. - $dirname = Split-Path -Path $ort_dir -Leaf - $dirname = $dirname.SubString(0,$dirname.LastIndexOf('-')) - Write-Output "Renaming $ort_dir to $dirname" - Rename-Item -Path $ort_dir -NewName $nuget_artifacts_dir\$dirname +# Rename directories to remove the architecture-specific suffix. +Write-Output "Renaming onnxruntime directories..." +Get-ChildItem -Directory -Path "$nuget_artifacts_dir\onnxruntime-*" | ForEach-Object { + $dirname = $_.Name + # Find the last hyphen and remove the suffix. 
+ $lastHyphenIndex = $dirname.LastIndexOf('-') + if ($lastHyphenIndex -gt -1) { + $newName = $dirname.Substring(0, $lastHyphenIndex) + $newPath = Join-Path -Path $_.Parent.FullName -ChildPath $newName + Write-Output "Renaming '$($_.FullName)' to '$newPath'" + Rename-Item -Path $_.FullName -NewName $newName + } } -# List artifacts -"Post copy artifacts" -Get-ChildItem -Recurse $nuget_artifacts_dir\ +# List the final artifacts. +Write-Output "Post-copy artifacts:" +Get-ChildItem -Recurse $nuget_artifacts_dir \ No newline at end of file diff --git a/tools/ci_build/github/windows/extract_nuget_files_gpu.ps1 b/tools/ci_build/github/windows/extract_nuget_files_gpu.ps1 index 01a8eebe75df2..29946dcb73f8a 100644 --- a/tools/ci_build/github/windows/extract_nuget_files_gpu.ps1 +++ b/tools/ci_build/github/windows/extract_nuget_files_gpu.ps1 @@ -2,47 +2,81 @@ # Licensed under the MIT License. # This file is used by Zip-Nuget-Java Packaging Pipeline -New-Item -Path $Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo\nuget-artifacts -ItemType directory +# Define the directory for NuGet artifacts. +$nuget_artifacts_dir = "$Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo\nuget-artifacts" +# Create the directory if it doesn't exist. +New-Item -Path $nuget_artifacts_dir -ItemType directory -ErrorAction SilentlyContinue -Get-ChildItem $Env:BUILD_BINARIESDIRECTORY\nuget-artifact -Filter *.zip | +## .zip files +Get-ChildItem "$Env:BUILD_BINARIESDIRECTORY\nuget-artifact" -Filter *.zip | Foreach-Object { - $cmd = "7z.exe x $($_.FullName) -y -o$Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo\nuget-artifacts" - Write-Output $cmd - Invoke-Expression -Command $cmd + # The -snld20 flag is used to bypass security checks for creating symbolic links (added in 7-Zip 25.01). + $arguments = "x", "$($_.FullName)", "-y", "-o$nuget_artifacts_dir", "-snld20" + Write-Output "Executing: 7z.exe $arguments" + & 7z.exe $arguments + if ($LASTEXITCODE -ne 0) { + throw "Error extracting '$($_.FullName)'. Exit code: $LASTEXITCODE" + } } -Get-ChildItem $Env:BUILD_BINARIESDIRECTORY\nuget-artifact -Filter *.tgz | +## .tgz files +Get-ChildItem "$Env:BUILD_BINARIESDIRECTORY\nuget-artifact" -Filter *.tgz | Foreach-Object { - $cmd = "7z.exe x $($_.FullName) -y -o$Env:BUILD_BINARIESDIRECTORY\nuget-artifact" # *.tar will be created after *.tgz is extracted - Write-Output $cmd - Invoke-Expression -Command $cmd + # The -snld20 flag is used to bypass security checks for creating symbolic links (added in 7-Zip 25.01). + # *.tar will be created after *.tgz is extracted + $arguments = "x", "$($_.FullName)", "-y", "-o$Env:BUILD_BINARIESDIRECTORY\nuget-artifact", "-snld20" + Write-Output "Executing: 7z.exe $arguments" + & 7z.exe $arguments + if ($LASTEXITCODE -ne 0) { + throw "Error extracting '$($_.FullName)'. Exit code: $LASTEXITCODE" + } } -Get-ChildItem $Env:BUILD_BINARIESDIRECTORY\nuget-artifact -Filter *.tar | +## .tar files +Get-ChildItem "$Env:BUILD_BINARIESDIRECTORY\nuget-artifact" -Filter *.tar | Foreach-Object { - $cmd = "7z.exe x $($_.FullName) -y -o$Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo\nuget-artifacts" - Write-Output $cmd - Invoke-Expression -Command $cmd + # The -snld20 flag is used to bypass security checks for creating symbolic links (added in 7-Zip 25.01). + $arguments = "x", "$($_.FullName)", "-y", "-o$nuget_artifacts_dir", "-snld20" + Write-Output "Executing: 7z.exe $arguments" + & 7z.exe $arguments + if ($LASTEXITCODE -ne 0) { + throw "Error extracting '$($_.FullName)'. 
Exit code: $LASTEXITCODE" + } } +# Create directory for protobuf build dependencies. +New-Item -Path "$Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\_deps\protobuf-build\RelWithDebInfo" -ItemType directory -ErrorAction SilentlyContinue -New-Item -Path $Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\_deps\protobuf-build\RelWithDebInfo -ItemType directory - -Copy-Item -Path $Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo\nuget-artifacts\onnxruntime-win-x64-cuda-*\lib\* -Destination $Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo +# Copy CUDA libraries. +Copy-Item -Path "$Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo\nuget-artifacts\onnxruntime-win-x64-cuda-*\lib\*" -Destination "$Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo" +# Install protoc via dotnet. $protocInstallDir = "$Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\_deps\protobuf-build" dotnet new console dotnet add package Google.Protobuf.Tools --version 3.21.12 --package-directory $protocInstallDir +if ($LASTEXITCODE -ne 0) { + throw "Error adding Google.Protobuf.Tools package. Exit code: $LASTEXITCODE" +} + +# Find and copy the protoc executable. $protocDir = Get-ChildItem -Path $protocInstallDir -Recurse -Filter "protoc.exe" | Select-Object -ExpandProperty DirectoryName -First 1 -Write-Output $protocDir -Copy-Item -Path $protocDir -Destination $Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\_deps\protobuf-build\RelWithDebInfo - -$ort_dirs = Get-ChildItem -Path $Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo\nuget-artifacts\onnxruntime-* -Directory -foreach ($ort_dir in $ort_dirs) -{ - $dirname = Split-Path -Path $ort_dir -Leaf - $dirname = $dirname.SubString(0,$dirname.LastIndexOf('-')) - Write-Output "Renaming $ort_dir to $dirname" - Rename-Item -Path $ort_dir -NewName $Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo\nuget-artifacts\$dirname +if ($protocDir) { + Write-Output "Found protoc directory: $protocDir" + Copy-Item -Path $protocDir -Destination "$Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\_deps\protobuf-build\RelWithDebInfo" +} +else { + Write-Error "Could not find protoc.exe in $protocInstallDir" } +# Rename onnxruntime directories to a generic format. +$ort_dirs = Get-ChildItem -Path "$Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo\nuget-artifacts\onnxruntime-*" -Directory +foreach ($ort_dir in $ort_dirs) { + $dirname = Split-Path -Path $ort_dir -Leaf + $lastHyphenIndex = $dirname.LastIndexOf('-') + if ($lastHyphenIndex -gt -1) { + $newName = $dirname.Substring(0, $lastHyphenIndex) + $newPath = Join-Path -Path $ort_dir.Parent.FullName -ChildPath $newName + Write-Output "Renaming '$($ort_dir.FullName)' to '$newPath'" + Rename-Item -Path $ort_dir.FullName -NewName $newName + } +}
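
As a quick illustration of the Python surface exercised by onnxruntime_test_python_ep_compatibility.py above, the sketch below queries the available EP devices and validates a stored compatibility string for one of them. This is a minimal sketch only, not part of the change: it imports from the internal pybind module exactly as the test does, the "stored-compat-info" string is a placeholder for whatever was saved alongside a compiled model, and the status returned for other EPs or devices may differ.

# Minimal sketch (assumptions noted above): check a stored compatibility string
# against the first CPUExecutionProvider device, mirroring the new unit test.
from onnxruntime.capi.onnxruntime_pybind11_state import (
    OrtCompiledModelCompatibility,
    get_ep_devices,
    get_model_compatibility_for_ep_devices,
)

cpu_devices = [d for d in get_ep_devices() if getattr(d, "ep_name", None) == "CPUExecutionProvider"]
if cpu_devices:
    # An empty device list raises RuntimeError, so guard on availability first.
    status = get_model_compatibility_for_ep_devices(cpu_devices[:1], "stored-compat-info")
    if status == OrtCompiledModelCompatibility.EP_NOT_APPLICABLE:
        # The selected EP does not use compiled-model compatibility information.
        print("Compatibility check not applicable for this EP")
    else:
        print(f"Compatibility status: {status}")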