diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index a1e9d06b133fd..6aad71e40b2a8 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -1226,6 +1226,12 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) ${onnxruntime_perf_test_src_patterns} ) onnxruntime_add_executable(onnxruntime_perf_test ${onnxruntime_perf_test_src} ${ONNXRUNTIME_ROOT}/core/platform/path_lib.cc) + + # ABSL_FLAGS_STRIP_NAMES is set to 1 by default to disable flag registration when building for Android, iPhone, and "embedded devices". + # See the issue: https://github.com/abseil/abseil-cpp/issues/1875 + # We set it to 0 for all builds to be able to use ABSL flags for onnxruntime_perf_test. + target_compile_definitions(onnxruntime_perf_test PRIVATE ABSL_FLAGS_STRIP_NAMES=0) + if(MSVC) target_compile_options(onnxruntime_perf_test PRIVATE "$<$:SHELL:--compiler-options /utf-8>" "$<$>:/utf-8>") diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs index 8cca2b42e987a..3c92400715740 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs @@ -368,6 +368,88 @@ public struct OrtApi public IntPtr EpDevice_Device; public IntPtr GetEpApi; public IntPtr GetTensorSizeInBytes; + + public IntPtr AllocatorGetStats; + + public IntPtr CreateMemoryInfo_V2; + public IntPtr MemoryInfoGetDeviceMemType; + public IntPtr MemoryInfoGetVendorId; + + public IntPtr ValueInfo_GetValueProducer; + public IntPtr ValueInfo_GetValueNumConsumers; + public IntPtr ValueInfo_GetValueConsumers; + public IntPtr ValueInfo_GetInitializerValue; + public IntPtr ValueInfo_GetExternalInitializerInfo; + public IntPtr ValueInfo_IsRequiredGraphInput; + public IntPtr ValueInfo_IsOptionalGraphInput; + public IntPtr ValueInfo_IsGraphOutput; + public IntPtr ValueInfo_IsConstantInitializer; + public IntPtr ValueInfo_IsFromOuterScope; + public IntPtr Graph_GetName; + public IntPtr Graph_GetModelPath; + public IntPtr Graph_GetOnnxIRVersion; + public IntPtr Graph_GetNumOperatorSets; + public IntPtr Graph_GetOperatorSets; + public IntPtr Graph_GetNumInputs; + public IntPtr Graph_GetInputs; + public IntPtr Graph_GetNumOutputs; + public IntPtr Graph_GetOutputs; + public IntPtr Graph_GetNumInitializers; + public IntPtr Graph_GetInitializers; + public IntPtr Graph_GetNumNodes; + public IntPtr Graph_GetNodes; + public IntPtr Graph_GetParentNode; + public IntPtr Graph_GetGraphView; + public IntPtr Node_GetId; + public IntPtr Node_GetName; + public IntPtr Node_GetOperatorType; + public IntPtr Node_GetDomain; + public IntPtr Node_GetSinceVersion; + public IntPtr Node_GetNumInputs; + public IntPtr Node_GetInputs; + public IntPtr Node_GetNumOutputs; + public IntPtr Node_GetOutputs; + public IntPtr Node_GetNumImplicitInputs; + public IntPtr Node_GetImplicitInputs; + public IntPtr Node_GetNumAttributes; + public IntPtr Node_GetAttributes; + public IntPtr Node_GetAttributeByName; + public IntPtr Node_GetTensorAttributeAsOrtValue; + public IntPtr OpAttr_GetType; + public IntPtr OpAttr_GetName; + public IntPtr Node_GetNumSubgraphs; + public IntPtr Node_GetSubgraphs; + public IntPtr Node_GetGraph; + public IntPtr Node_GetEpName; + public IntPtr ReleaseExternalInitializerInfo; + public IntPtr ExternalInitializerInfo_GetFilePath; + public IntPtr ExternalInitializerInfo_GetFileOffset; + public IntPtr ExternalInitializerInfo_GetByteSize; + + public 
IntPtr GetRunConfigEntry; + + public IntPtr EpDevice_MemoryInfo; + + public IntPtr CreateSharedAllocator; + public IntPtr GetSharedAllocator; + public IntPtr ReleaseSharedAllocator; + + public IntPtr GetTensorData; + + public IntPtr GetSessionOptionsConfigEntries; + + public IntPtr SessionGetMemoryInfoForInputs; + public IntPtr SessionGetMemoryInfoForOutputs; + public IntPtr SessionGetEpDeviceForInputs; + + public IntPtr CreateSyncStreamForEpDevice; + public IntPtr SyncStream_GetHandle; + public IntPtr ReleaseSyncStream; + + public IntPtr CopyTensors; + + public IntPtr Graph_GetModelMetadata; + public IntPtr GetModelCompatibilityForEpDevices; } internal static class NativeMethods @@ -704,6 +786,10 @@ static NativeMethods() (DSessionOptionsSetEpSelectionPolicyDelegate)Marshal.GetDelegateForFunctionPointer( api_.SessionOptionsSetEpSelectionPolicyDelegate, typeof(DSessionOptionsSetEpSelectionPolicyDelegate)); + + OrtGetModelCompatibilityForEpDevices = (DOrtGetModelCompatibilityForEpDevices)Marshal.GetDelegateForFunctionPointer( + api_.GetModelCompatibilityForEpDevices, + typeof(DOrtGetModelCompatibilityForEpDevices)); } internal class NativeLib @@ -2456,6 +2542,18 @@ public delegate void DOrtRemoveKeyValuePair(IntPtr /* OrtKeyValuePairs* */ kvps, public static DOrtGetEpDevices OrtGetEpDevices; + /// + /// Validate compiled model compatibility for the provided EP devices. + /// + [UnmanagedFunctionPointer(CallingConvention.Winapi)] + public delegate IntPtr /* OrtStatus* */ DOrtGetModelCompatibilityForEpDevices( + IntPtr[] /* const OrtEpDevice* const* */ ep_devices, + UIntPtr /* size_t */ num_ep_devices, + byte[] /* const char* */ compatibility_info, + out int /* OrtCompiledModelCompatibility */ out_status); + + public static DOrtGetModelCompatibilityForEpDevices OrtGetModelCompatibilityForEpDevices; + /// /// Add execution provider devices to the session options. /// Priority is based on the order of the OrtEpDevice instances. Highest priority first. diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/OrtEnv.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/OrtEnv.shared.cs index 5c70808b82be1..052d5899b52c0 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/OrtEnv.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/OrtEnv.shared.cs @@ -7,6 +7,21 @@ namespace Microsoft.ML.OnnxRuntime { + /// + /// Represents the compatibility status of a pre-compiled model with one or more execution provider devices. + /// + /// + /// This enum is used to determine whether a pre-compiled model can be used with specific execution providers + /// and devices, or if recompilation is needed. + /// + public enum OrtCompiledModelCompatibility + { + EP_NOT_APPLICABLE = 0, + EP_SUPPORTED_OPTIMAL = 1, + EP_SUPPORTED_PREFER_RECOMPILATION = 2, + EP_UNSUPPORTED = 3, + } + /// /// Delegate for logging function callback. /// Supply your function and register it with the environment to receive logging callbacks via @@ -361,6 +376,31 @@ public string[] GetAvailableProviders() } } + /// + /// Validate a compiled model's compatibility information for one or more EP devices. + /// + /// The list of EP devices to validate against. + /// The compatibility string from the precompiled model to validate. 
+        /// OrtCompiledModelCompatibility enum value denoting the compatibility status
+        public OrtCompiledModelCompatibility GetModelCompatibilityForEpDevices(
+            IReadOnlyList<OrtEpDevice> epDevices, string compatibilityInfo)
+        {
+            if (epDevices == null || epDevices.Count == 0)
+                throw new ArgumentException("epDevices must be non-empty", nameof(epDevices));
+
+            var devicePtrs = new IntPtr[epDevices.Count];
+            for (int i = 0; i < epDevices.Count; ++i)
+            {
+                devicePtrs[i] = epDevices[i].Handle;
+            }
+
+            var infoUtf8 = NativeOnnxValueHelper.StringToZeroTerminatedUtf8(compatibilityInfo);
+            NativeApiStatus.VerifySuccess(
+                NativeMethods.OrtGetModelCompatibilityForEpDevices(
+                    devicePtrs, (UIntPtr)devicePtrs.Length, infoUtf8, out int status));
+            return (OrtCompiledModelCompatibility)status;
+        }
+
         ///
         /// Get/Set log level property of OrtEnv instance
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/EpCompatibilityTests.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/EpCompatibilityTests.cs
new file mode 100644
index 0000000000000..103fe5bc10106
--- /dev/null
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/EpCompatibilityTests.cs
@@ -0,0 +1,49 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+// not supported on mobile platforms
+#if !(ANDROID || IOS)
+
+namespace Microsoft.ML.OnnxRuntime.Tests;
+
+using System;
+using System.Linq;
+using Xunit;
+using System.Collections.Generic;
+
+public class EpCompatibilityTests
+{
+    private readonly OrtEnv ortEnvInstance = OrtEnv.Instance();
+
+    private IReadOnlyList<OrtEpDevice> GetDevices()
+    {
+        var epDevices = ortEnvInstance.GetEpDevices();
+        Assert.NotNull(epDevices);
+        Assert.NotEmpty(epDevices);
+        return epDevices;
+    }
+
+    [Fact]
+    public void GetEpCompatibility_InvalidArgs()
+    {
+        Assert.Throws<ArgumentException>(() => ortEnvInstance.GetModelCompatibilityForEpDevices(null, "info"));
+        Assert.Throws<ArgumentException>(() => ortEnvInstance.GetModelCompatibilityForEpDevices(new List<OrtEpDevice>(), "info"));
+    }
+
+    [Fact]
+    public void GetEpCompatibility_SingleDeviceCpuProvider()
+    {
+        var devices = GetDevices();
+        var someInfo = "arbitrary-compat-string";
+
+        // Use CPU device
+        var cpu = devices.First(d => d.EpName == "CPUExecutionProvider");
+        Assert.NotNull(cpu);
+        var selected = new List<OrtEpDevice> { cpu };
+        var status = ortEnvInstance.GetModelCompatibilityForEpDevices(selected, someInfo);
+
+        // CPU defaults to not applicable in this scenario
+        Assert.Equal(OrtCompiledModelCompatibility.EP_NOT_APPLICABLE, status);
+    }
+}
+#endif
diff --git a/include/onnxruntime/core/graph/model_saving_options.h b/include/onnxruntime/core/graph/model_saving_options.h
index 6c041ec96a035..06c1b1ac6475f 100644
--- a/include/onnxruntime/core/graph/model_saving_options.h
+++ b/include/onnxruntime/core/graph/model_saving_options.h
@@ -9,36 +9,30 @@ class PrepackedWeightsForGraph;
 // These options affect how the model initializers are written to the external file.
 // This includes options to align external initializer offset.
-// For models running on CPU, ORT will try to use mmap to load external
-// initializers. To use mmap, external initializer need to be offset aligned.
+// ORT will try to use mmap to load external initializers.
+//
 // ORT saves external initializers into single data file, each initializer is
 // accessed with offset(start position of initializer) and length(byte length of
-// initializer) of the data file. To use mmap, each offset need to be aligned
-// which means offset need to divisible by allocation granularity(64KB for
-// windows and 4K for other OSes). With align_offset to true, ORT will align
-// offset for large initializer when save ONNX model with external data file.
+// initializer) of the data file. With align_offset set to true, ORT will align
+// the offset of large initializers (larger than align_threshold)
+// when saving an ONNX model with an external data file. Offsets are aligned
+// to the on_disk_alignment value.
 struct ModelSavingOptions {
   explicit ModelSavingOptions(size_t size_threshold)
       : initializer_size_threshold(size_threshold) {}
   // Minimal initializer size in bytes to be externalized on disk
   size_t initializer_size_threshold;
-  // Offset will always be page aligned and allocation granularity aligned for
-  // mmap support. This is done by padding previous tensor data with zeros
-  // keeping same length.
+  // Offset will always be aligned for mmap support.
+  // This is done by padding the previous tensor data with zeros while keeping the same length.
   bool align_offset = false;
   // Alignment threshold for size of data.
   // Having a low threshold will waste file space for small initializers.
   // Only when tensor's data size is > the page_align_threshold it will be force
   // aligned. Default to 1MB.
   int64_t align_threshold = 1048576;
-  // The allocation Granularity for mmap() support.
-  // Typically 64KB for Windows & 4KB for other OSes. Default to 64KB.
-#ifdef _WIN32
-  int64_t allocation_granularity = 65536;
-#else
-  int64_t allocation_granularity = 4096;
-#endif
+  // Alignment factor for tensors larger than align_threshold. Defaults to 4 KB.
+  int64_t on_disk_alignment = 4096;
   // Force embed all external initializer into the Onnx file
   // Used for EPContext model generation while some nodes fallback on CPU which has external data dependency
   bool force_embed_external_ini = false;
diff --git a/include/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_options.h b/include/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_options.h
index dc27204017caa..a32f465e44adf 100644
--- a/include/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_options.h
+++ b/include/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_options.h
@@ -31,7 +31,7 @@ constexpr const char* kDetailedBuildLog = "nv_detailed_build_log";
 constexpr const char* kProfilesMinShapes = "nv_profile_min_shapes";
 constexpr const char* kProfilesMaxShapes = "nv_profile_max_shapes";
 constexpr const char* kProfilesOptShapes = "nv_profile_opt_shapes";
-constexpr const char* kCudaGraphEnable = "nv_cuda_graph_enable";
+constexpr const char* kCudaGraphEnable = "enable_cuda_graph";
 constexpr const char* kMultiProfileEnable = "nv_multi_profile_enable";
 constexpr const char* kUseExternalDataInitializer = "nv_use_external_data_initializer";
diff --git a/include/onnxruntime/core/providers/utils/ort_graph_to_proto.h b/include/onnxruntime/core/providers/utils/ort_graph_to_proto.h
index 21aa797ce16eb..28ce4439fdc7e 100644
--- a/include/onnxruntime/core/providers/utils/ort_graph_to_proto.h
+++ b/include/onnxruntime/core/providers/utils/ort_graph_to_proto.h
@@ -232,7 +232,7 @@ static Ort::Status GetOrtValueInfoTensorTypeShape(const OrtValueInfo& ort_value_
                                                   /*out*/ std::vector<int64_t>& dims,
                                                   /*out*/ std::vector<std::string>& symbolic_dims);
 static Ort::Status OrtValueInfoToProto(const OrtValueInfo& ort_value_info, onnx::ValueInfoProto& value_info_proto);
-static Ort::Status OrtOpAttrToProto(const OrtNode& ort_node, const OrtOpAttr& ort_attr, onnx::AttributeProto& attr_proto);
+static Ort::Status OrtOpAttrToProto(const OrtOpAttr& ort_attr, onnx::AttributeProto& attr_proto);
 
 Ort::Status OrtGraphToProto(const OrtGraph&
ort_graph, onnx::GraphProto& graph_proto, @@ -379,7 +379,7 @@ Ort::Status OrtGraphToProto(const OrtGraph& ort_graph, } onnx::AttributeProto* attr_proto = node_proto->add_attribute(); - ORT_EP_UTILS_CXX_RETURN_IF_ERROR(OrtOpAttrToProto(*ort_node, *ort_attr, *attr_proto)); + ORT_EP_UTILS_CXX_RETURN_IF_ERROR(OrtOpAttrToProto(*ort_attr, *attr_proto)); } } @@ -652,7 +652,7 @@ static Ort::Status OrtValueInfoToProto(const OrtValueInfo& ort_value_info, return Ort::Status{nullptr}; } -static Ort::Status OrtOpAttrToProto(const OrtNode& ort_node, const OrtOpAttr& ort_attr, onnx::AttributeProto& attr_proto) { +static Ort::Status OrtOpAttrToProto(const OrtOpAttr& ort_attr, onnx::AttributeProto& attr_proto) { const OrtApi& ort_api = Ort::GetApi(); const char* attr_name = nullptr; @@ -766,7 +766,7 @@ static Ort::Status OrtOpAttrToProto(const OrtNode& ort_node, const OrtOpAttr& or // TensorProto as an attribute value doesn't require a name. OrtValue* ort_value = nullptr; - ORT_EP_UTILS_C_RETURN_IF_ERROR(ort_api.Node_GetTensorAttributeAsOrtValue(&ort_node, &ort_attr, &ort_value)); + ORT_EP_UTILS_C_RETURN_IF_ERROR(ort_api.OpAttr_GetTensorAttributeAsOrtValue(&ort_attr, &ort_value)); Ort::Value tensor(ort_value); diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index bedeeb972c3a7..72e8a3ca1103c 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -902,6 +902,16 @@ typedef void (*RunAsyncCallbackFn)(void* user_data, OrtValue** outputs, size_t n * * \nosubgrouping */ +/* + * Public enum for compiled model compatibility across EPs. + */ +typedef enum OrtCompiledModelCompatibility { + OrtCompiledModelCompatibility_EP_NOT_APPLICABLE = 0, + OrtCompiledModelCompatibility_EP_SUPPORTED_OPTIMAL, + OrtCompiledModelCompatibility_EP_SUPPORTED_PREFER_RECOMPILATION, + OrtCompiledModelCompatibility_EP_UNSUPPORTED, +} OrtCompiledModelCompatibility; + struct OrtApi { /// \name OrtStatus /// @{ @@ -6069,7 +6079,6 @@ struct OrtApi { /** \brief Get the OrtNode's 'TENSOR' attribute as an OrtValue. * - * \param[in] node The OrtNode instance. * \param[in] attribute The OrtOpAttr instance. * \param[out] attr_tensor If successful, contains the 'TENSOR' attribute as a newly created OrtValue. Must be freed with OrtApi::ReleaseValue. @@ -6078,7 +6087,7 @@ struct OrtApi { * * \since Version 1.23. */ - ORT_API2_STATUS(Node_GetTensorAttributeAsOrtValue, _In_ const OrtNode* node, _In_ const OrtOpAttr* attribute, + ORT_API2_STATUS(OpAttr_GetTensorAttributeAsOrtValue, _In_ const OrtOpAttr* attribute, _Outptr_result_maybenull_ OrtValue** attr_tensor); /** \brief Get the attribute type as OrtOpAttrType from an OrtOpAttr. @@ -6480,6 +6489,24 @@ struct OrtApi { * \since Version 1.23. */ ORT_API2_STATUS(Graph_GetModelMetadata, _In_ const OrtGraph* graph, _Outptr_ OrtModelMetadata** out); + + /** \brief Validate a compiled model's compatibility information for one or more EP devices. + * + * \param[in] ep_devices The EP devices to validate against (e.g., from GetEpDevices). + * All devices must belong to the same execution provider. + * \param[in] num_ep_devices The number of EP devices provided. + * \param[in] compatibility_info The compatibility info string produced when the model was compiled. + * \param[out] out_status The resulting compatibility status for the EP devices. + * + * \snippet{doc} snippets.dox OrtStatus Return Value + * + * \since Version 1.23. 
+ */ + ORT_API2_STATUS(GetModelCompatibilityForEpDevices, + _In_reads_(num_ep_devices) const OrtEpDevice* const* ep_devices, + _In_ size_t num_ep_devices, + _In_ const char* compatibility_info, + _Out_ OrtCompiledModelCompatibility* out_status); }; /* diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h index 2f4fd36c8115f..981c70ab8b954 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h @@ -725,9 +725,7 @@ using AllocatedStringPtr = std::unique_ptr; * constructors to construct an instance of a Status object from exceptions. */ struct Status : detail::Base { - using Base = detail::Base; - using Base::Base; - + Status() = default; // Same as with std::nullptr_t. But can be used in re-sizable containers and represent success. explicit Status(std::nullptr_t) noexcept {} ///< Create an empty object, must be assigned a valid one to be used explicit Status(OrtStatus* status) noexcept; ///< Takes ownership of OrtStatus instance returned from the C API. explicit Status(const Exception&); ///< Creates status instance out of exception @@ -1015,6 +1013,16 @@ struct EpDevice : detail::EpDeviceImpl { ConstKeyValuePairs ep_metadata = {}, ConstKeyValuePairs ep_options = {}); }; +/** \brief Validate a compiled model's compatibility for one or more EP devices. + * + * Throws on error. Returns the resulting compatibility status. + * /// \param ep_devices The EP devices to check compatibility against. + * /// \param compatibility_info The compatibility string from the precompiled model to validate. + */ +OrtCompiledModelCompatibility GetModelCompatibilityForEpDevices( + const std::vector& ep_devices, + const char* compatibility_info); + /** \brief The Env (Environment) * * The Env holds the logging state used by all other objects. 
diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h index 73200d8852223..05c86ae4e0c58 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h @@ -823,7 +823,7 @@ inline Status Env::CopyTensors(const std::vector& src_tensors, return Status("Source and destination tensor vectors must have the same size", ORT_INVALID_ARGUMENT); } if (src_tensors.empty()) { - return Status(); + return Status(nullptr); } const OrtValue* const* src_tensors_ptr = reinterpret_cast(src_tensors.data()); @@ -859,6 +859,26 @@ inline void CustomOpDomain::Add(const OrtCustomOp* op) { ThrowOnError(GetApi().CustomOpDomain_Add(p_, op)); } +inline OrtCompiledModelCompatibility GetModelCompatibilityForEpDevices( + const std::vector& ep_devices, + const char* compatibility_info) { + if (ep_devices.empty()) { + ORT_CXX_API_THROW("ep_devices is empty", ORT_INVALID_ARGUMENT); + } + + std::vector ptrs; + ptrs.reserve(ep_devices.size()); + for (const auto& d : ep_devices) ptrs.push_back(d); + + OrtCompiledModelCompatibility status = OrtCompiledModelCompatibility_EP_NOT_APPLICABLE; + ThrowOnError(GetApi().GetModelCompatibilityForEpDevices( + reinterpret_cast(ptrs.data()), + ptrs.size(), + compatibility_info, + &status)); + return status; +} + inline LoraAdapter LoraAdapter::CreateLoraAdapter(const std::basic_string& adapter_path, OrtAllocator* allocator) { OrtLoraAdapter* p; diff --git a/include/onnxruntime/core/session/onnxruntime_ep_c_api.h b/include/onnxruntime/core/session/onnxruntime_ep_c_api.h index 620cb5fcf13cc..975f6b453a88d 100644 --- a/include/onnxruntime/core/session/onnxruntime_ep_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_ep_c_api.h @@ -482,18 +482,6 @@ typedef enum OrtEpDataLayout { OrtEpDataLayout_Default = OrtEpDataLayout_NCHW, } OrtEpDataLayout; -/** - * \brief Enumeration describing the compatibility state of a compiled model relative to an execution provider. - * - * \since Version 1.23. - */ -typedef enum OrtCompiledModelCompatibility { - OrtCompiledModelCompatibility_EP_NOT_APPLICABLE = 0, - OrtCompiledModelCompatibility_EP_SUPPORTED_OPTIMAL, - OrtCompiledModelCompatibility_EP_SUPPORTED_PREFER_RECOMPILATION, - OrtCompiledModelCompatibility_EP_UNSUPPORTED, -} OrtCompiledModelCompatibility; - /** * \brief The OrtEp struct provides functions to implement for an execution provider. * \since Version 1.22. @@ -901,20 +889,28 @@ struct OrtEpFactory { */ ORT_API_T(const char*, GetVersion, _In_ const OrtEpFactory* this_ptr); - /** \brief Validate the compatibility of a compiled model with the execution provider. + /** \brief Validate the compatibility of a compiled model with the execution provider factory for one or more devices. + * + * Given a compatibility info string produced during model compilation, the EP factory should determine whether the + * compiled model is compatible with the EP factory when targeting the provided hardware devices. All devices provided + * must belong to the same execution provider instance that this factory creates. * - * This function validates if a model produced with the supplied compatibility info string is supported by the underlying EP. - * The EP should check if a compiled model is compatible with the EP and set the model_compatibility parameter accordingly. 
+ * The EP factory implementation should consider the set of devices (e.g., multi-adapter or multi-GPU scenarios) when + * evaluating compatibility and set `model_compatibility` accordingly. * * \param[in] this_ptr The OrtEpFactory instance. - * \param[in] compatibility_info The compatibility information string that will be used - * \param[out] model_compatibility OrtCompiledModelCompatibility enum value describing the compatibility of the model with the EP. + * \param[in] devices Array of OrtHardwareDevice pointers that the EP would run on. All must map to this EP. + * \param[in] num_devices Number of entries in `devices`. + * \param[in] compatibility_info The compatibility information string produced when the model was compiled. + * \param[out] model_compatibility OrtCompiledModelCompatibility value describing the compatibility of the model with the EP. * * \snippet{doc} snippets.dox OrtStatus Return Value * * \since Version 1.23. */ ORT_API2_STATUS(ValidateCompiledModelCompatibilityInfo, _In_ OrtEpFactory* this_ptr, + _In_reads_(num_devices) const OrtHardwareDevice* const* devices, + _In_ size_t num_devices, _In_ const char* compatibility_info, _Out_ OrtCompiledModelCompatibility* model_compatibility); diff --git a/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h b/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h index 0d5117709c18a..bfa450f4287f8 100644 --- a/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h +++ b/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h @@ -280,6 +280,18 @@ class GQAAttentionBase { output, static_cast(present_buffer_sequence_length), nullptr); } + // Pre-allocate buffer for attention mask to avoid allocating it for every processed token + float* attention_bias_thread_fp32 = nullptr; + if (attention_bias_thread != nullptr) { + if constexpr (!std::is_same_v) { + static_assert(std::is_same_v && std::is_same_v); + + size_t bytes = attention_total_seqlen * sizeof(float); + attention_bias_thread_fp32 = static_cast(allocator->Alloc(bytes)); + } + } + BufferUniquePtr scratch_buffer(attention_bias_thread_fp32, BufferDeleter(allocator)); + // compute Softmax U* output_softmax = output; for (size_t seq = 0; seq < sequence_length; seq++) { @@ -316,9 +328,6 @@ class GQAAttentionBase { static_cast(window_size)); } else { static_assert(std::is_same_v && std::is_same_v); - size_t bytes = window_size * sizeof(float); - auto attention_bias_thread_fp32 = static_cast(allocator->Alloc(bytes)); - BufferUniquePtr scratch_buffer(attention_bias_thread_fp32, BufferDeleter(allocator)); MlasConvertHalfToFloatBuffer(attention_bias_thread + start_offset, attention_bias_thread_fp32, window_size); ApplyAttentionBias(output_softmax + start_offset, attention_bias_thread_fp32, static_cast(window_size)); diff --git a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc index 85a2cbaea0e44..36a6f70cc69d9 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc @@ -200,6 +200,19 @@ class DynamicQuantizeMatMul final : public MatMulIntegerToFloatBase { can_use_dynamic_quant_mlas_ = (!b_quantization_might_be_asymmetric && b_scale_available); + // Kleidi dynamic path requires strictly positive, finite scales. + // Disable if any invalid scale is detected. 
+  if (can_use_dynamic_quant_mlas_) {
+    const auto bs = b_scale_tensor->DataAsSpan<float>();
+    const bool has_invalid =
+        std::any_of(bs.begin(), bs.end(),
+                    [](float s) { return !std::isfinite(s) || s <= 0.0f; });
+
+    if (has_invalid) {
+      can_use_dynamic_quant_mlas_ = false;
+    }
+  }
+
   // Currently, MlasDynamicQGemmBatch() and associated functions require SME or else they are no-ops.
   // We check that here too before attempting to use them.
   if (!CPUIDInfo::GetCPUIDInfo().HasArm_SME()) {
@@ -379,7 +392,7 @@ Status DynamicQuantizeMatMul::Compute(OpKernelContext* ctx) const {
   if (y->Shape().Size() == 0)
     return Status::OK();
 
-  auto a_data = static_cast(ctx->Input(IN_A)->DataRaw());
+  const float* a_data = ctx->Input<Tensor>(IN_A)->Data<float>();
   auto* y_data = y->MutableData<float>();
 
   // batch gemm
@@ -393,7 +406,7 @@ Status DynamicQuantizeMatMul::Compute(OpKernelContext* ctx) const {
   for (size_t gemm_idx = 0; gemm_idx < num_gemms; gemm_idx++) {
     auto& params = gemm_data_vec[gemm_idx];
-    params.A = reinterpret_cast(a_data + helper.LeftOffsets()[gemm_idx]);
+    params.A = a_data + helper.LeftOffsets()[gemm_idx];
     params.lda = gemm_shape.K;
     params.PackedB = packed_b_.get();
     params.C = y_data + helper.OutputOffsets()[gemm_idx];
diff --git a/onnxruntime/core/framework/tensor_external_data_info.cc b/onnxruntime/core/framework/tensor_external_data_info.cc
index 971851db62437..d7f5b23d56c70 100644
--- a/onnxruntime/core/framework/tensor_external_data_info.cc
+++ b/onnxruntime/core/framework/tensor_external_data_info.cc
@@ -107,7 +107,7 @@ void ExternalDataInfo::SetExternalLocationToProto(const std::filesystem::path& e
 std::ostream& ExternalDataInfo::WritePrepackedToFileAndAddToProto(
     const PrepackedWeightsForGraph& prepacked_for_graph,
     const InlinedHashSet<std::string>& blob_keys, bool align,
-    int64_t align_threshold, int64_t allocation_granularity,
+    int64_t align_threshold, int64_t on_disk_alignment,
     std::ostream& os, int64_t& external_offset, ::ONNX_NAMESPACE::TensorProto& proto) {
   size_t key_count = 0;
   for (const auto& key : blob_keys) {
@@ -120,7 +120,7 @@ std::ostream& ExternalDataInfo::WritePrepackedToFileAndAddToProto(
       const auto size_in_bytes = prepacked_weights->buffer_sizes_[i];
       if (align && static_cast<int64_t>(size_in_bytes) > align_threshold) {
         // return early on error
-        if (!AlignAndPad(os, allocation_granularity, external_offset)) {
+        if (!AlignAndPad(os, on_disk_alignment, external_offset)) {
           return os;
         }
       }
diff --git a/onnxruntime/core/framework/tensor_external_data_info.h b/onnxruntime/core/framework/tensor_external_data_info.h
index 2de1e01f381ec..784b3f352a78e 100644
--- a/onnxruntime/core/framework/tensor_external_data_info.h
+++ b/onnxruntime/core/framework/tensor_external_data_info.h
@@ -41,15 +41,13 @@ class ExternalDataInfo {
       size_t tensor_bytes_size,
       ::ONNX_NAMESPACE::TensorProto& proto);
 
-  // Pads the output with zeros according to the specified allocation_granularity
+  // Pads the output with zeros according to the specified alignment_factor
   // It updates external_offset for alignment.
// need to do padding before write actual tensor data as we do offset alignment at the begin of - // large tensors (offset need to be page aligned and allocation granularity aligned) like below: + // large tensors (offset need to be page aligned) like below: // \242\2557\256\023.\031&0000000000000000\332)k+\253\246\342\246(&\006!\347\232\374\236\325\026\032+\36XXXX // |<---smaller tensor---->|<---padding--->|<------------------large tensor----------------------------->| - static std::ostream& AlignAndPad(std::ostream& stream, int64_t allocation_granularity, int64_t& external_offset) { - // Align to the larger of the page size or the allocation granularity - int64_t alignment_factor = std::max(static_cast(4096), allocation_granularity); + static std::ostream& AlignAndPad(std::ostream& stream, int64_t alignment_factor, int64_t& external_offset) { // Align to the next page or alloc granularity boundary SafeInt safe_external_offset = external_offset; int64_t new_external_offset = ((safe_external_offset + alignment_factor - 1) / alignment_factor) * @@ -66,7 +64,7 @@ class ExternalDataInfo { static std::ostream& WritePrepackedToFileAndAddToProto( const PrepackedWeightsForGraph& prepacked_for_graph, const InlinedHashSet& blob_keys, - bool align, int64_t align_threshold, int64_t allocation_granularity, + bool align, int64_t align_threshold, int64_t on_disk_alignment, std::ostream& os, int64_t& external_offset, ::ONNX_NAMESPACE::TensorProto& proto); diff --git a/onnxruntime/core/graph/abi_graph_types.h b/onnxruntime/core/graph/abi_graph_types.h index b99c22edb36c8..2ef7c4a9091f3 100644 --- a/onnxruntime/core/graph/abi_graph_types.h +++ b/onnxruntime/core/graph/abi_graph_types.h @@ -252,16 +252,6 @@ struct OrtNode { /// A status indicating success or an error. virtual onnxruntime::Status GetAttributes(gsl::span attrs) const = 0; - /// - /// Gets the node's 'TENSOR' attribute as an OrtValue. - /// - /// Node's 'TENSOR' attribute. - /// Output parameter is set to a newly created OrtValue containing the 'TENSOR' attribute value, - /// only if the attribute is of type 'TENSOR' - /// A status indicating success or an error. - virtual onnxruntime::Status GetTensorAttributeAsOrtValue(const OrtOpAttr* attr, - OrtValue*& value) const = 0; - /// /// Gets the number of node subgraphs. /// diff --git a/onnxruntime/core/graph/ep_api_types.cc b/onnxruntime/core/graph/ep_api_types.cc index 759a2998ace3a..0d9b93631ee8a 100644 --- a/onnxruntime/core/graph/ep_api_types.cc +++ b/onnxruntime/core/graph/ep_api_types.cc @@ -249,32 +249,6 @@ Status EpNode::GetAttributes(gsl::span dst) const { return Status::OK(); } -Status EpNode::GetTensorAttributeAsOrtValue(const OrtOpAttr* attribute, OrtValue*& result) const { - const auto* attr_proto = reinterpret_cast(attribute); - - if (attr_proto->type() != onnx::AttributeProto::TENSOR) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "This OrtOpAttr instance is not a 'TENSOR' attribute"); - } - - const auto& graph_viewer = ep_graph_->GetGraphViewer(); - const auto& tensor_proto = attr_proto->t(); - - // Check that TensorProto is valid. - ORT_ENFORCE(utils::HasDataType(tensor_proto), "Tensor proto doesn't have data type."); - ORT_ENFORCE(ONNX_NAMESPACE::TensorProto::DataType_IsValid(tensor_proto.data_type()), "Tensor proto has invalid data type."); - ORT_ENFORCE(!utils::HasExternalData(tensor_proto), - "Tensor proto with external data for value attribute is not supported."); - - // Initialize OrtValue for tensor attribute. 
- auto tensor_attribute_value = std::make_unique(); - AllocatorPtr tensor_attribute_allocator = CPUAllocator::DefaultInstance(); - ORT_RETURN_IF_ERROR(utils::TensorProtoToOrtValue(Env::Default(), graph_viewer.ModelPath(), tensor_proto, - tensor_attribute_allocator, *tensor_attribute_value)); - - result = tensor_attribute_value.release(); - return Status::OK(); -} - Status EpNode::GetNumSubgraphs(size_t& num_subgraphs) const { num_subgraphs = subgraphs_.size(); return Status::OK(); diff --git a/onnxruntime/core/graph/ep_api_types.h b/onnxruntime/core/graph/ep_api_types.h index 7f22e265129f7..e003f02a79a2d 100644 --- a/onnxruntime/core/graph/ep_api_types.h +++ b/onnxruntime/core/graph/ep_api_types.h @@ -183,9 +183,6 @@ struct EpNode : public OrtNode { // Gets the node's attributes. Status GetAttributes(gsl::span attrs) const override; - Status GetTensorAttributeAsOrtValue(const OrtOpAttr* attribute, - OrtValue*& attr_tensor) const override; - // Gets the number of subgraphs contained by this node. Status GetNumSubgraphs(size_t& num_subgraphs) const override; diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index e4f8cd6df678e..0a228176175eb 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -4536,14 +4536,14 @@ Status Graph::AddExternalInitializersToGraphProtoImpl( continue; } - // update external_offset for alignment + // update external_offset for alignment (if enabled) // need to do padding before write actual tensor data as we do offset alignment at the begin of - // large tensors (offset need to be page aligned and allocation granularity aligned) like below: + // large tensors (offset need to be page aligned) like below: // \242\2557\256\023.\031&0000000000000000\332)k+\253\246\342\246(&\006!\347\232\374\236\325\026\032+\36XXXX // |<---smaller tensor---->|<---padding--->|<------------------large tensor----------------------------->| if (model_saving_options.align_offset && static_cast(tensor_bytes_size) > model_saving_options.align_threshold) { - ORT_RETURN_IF_NOT(ExternalDataInfo::AlignAndPad(external_stream, model_saving_options.allocation_granularity, + ORT_RETURN_IF_NOT(ExternalDataInfo::AlignAndPad(external_stream, model_saving_options.on_disk_alignment, external_offset), "Failed writing external data to: ", model_external_file_path); } @@ -4576,7 +4576,7 @@ Status Graph::AddExternalInitializersToGraphProtoImpl( auto& os = ExternalDataInfo::WritePrepackedToFileAndAddToProto( *prepacked_weights_for_graph_, blob_keys_to_external_data, model_saving_options.align_offset, model_saving_options.align_threshold, - model_saving_options.allocation_granularity, + model_saving_options.on_disk_alignment, external_stream, external_offset, *output_proto); ORT_RETURN_IF_NOT(os.good(), "Failed to write pre-packed blobs to external file"); } diff --git a/onnxruntime/core/graph/model_editor_api_types.h b/onnxruntime/core/graph/model_editor_api_types.h index e7ffcbc7e4c90..2c0f6d6174303 100644 --- a/onnxruntime/core/graph/model_editor_api_types.h +++ b/onnxruntime/core/graph/model_editor_api_types.h @@ -138,11 +138,6 @@ struct ModelEditorNode : public OrtNode { "OrtModelEditorApi does not support getting attribute OrtOpAttr for OrtNode"); } - Status GetTensorAttributeAsOrtValue(const OrtOpAttr* /*attribute*/, OrtValue*& /*attr_tensor*/) const override { - return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, - "OrtModelEditorApi does not support getting 'TENSOR' attribute for OrtNode"); - } - Status GetNumSubgraphs(size_t& 
/*num_subgraphs*/) const override { return ORT_MAKE_STATUS(ONNXRUNTIME, NOT_IMPLEMENTED, "OrtModelEditorApi does not support getting the subgraphs for OrtNode"); diff --git a/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp b/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp index caa445b71e2a5..c579ff1542eb9 100644 --- a/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp +++ b/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp @@ -153,28 +153,23 @@ ArmKleidiAI::MlasGemmBatch( MLAS_THREADPOOL* ThreadPool ) { - if(TransA == CblasTrans) - { - return false; + if (M == 0 || N == 0) { + return true; } - if (TransA == CblasNoTrans && K == 0) { - if (Data->beta != 1.0f) { + + if (Data->alpha == 0.0f || K == 0) { + if (Data->beta == 0.0f) { + for (size_t i = 0; i < M; ++i) { + std::fill_n(Data->C + i * Data->ldc, N, 0.0f); + } + } else if (Data->beta != 1.0f) { for (size_t i = 0; i < M; ++i) { for (size_t j = 0; j < N; ++j) { Data->C[i * Data->ldc + j] *= Data->beta; } } } - } - if (Data->beta == 0.0f){ - std::fill_n(Data->C, M * Data->ldc, 0.0f); - } - //Fallback in the case of unsupported cases - if (M == 0 || N == 0 || K == 0 || - TransA != CblasNoTrans || - (TransB != CblasNoTrans && !Data[0].BIsPacked)) - { - return false; + return true; } if (TransA == CblasNoTrans) { @@ -185,11 +180,9 @@ ArmKleidiAI::MlasGemmBatch( auto m_step = kai_get_m_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa(); auto n_step = kai_get_n_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa(); - if (M < m_step || N < n_step) { - if (GetMlasPlatform().MlasGemmBatchOverride != ArmKleidiAI::MlasGemmBatch){ - //Fallback to MLAS - return false; - } + if (M < m_step && N < n_step && !Data->BIsPacked) { + // Fallback to MLAS + return false; } std::vector KaiPackedData; @@ -316,7 +309,7 @@ ArmKleidiAI::MlasGemmBatch( float* dst_tile = reinterpret_cast(CTile); // quick copy of data in cases where we are not scaling or accumulating anything - // with bounds checking on tile sizing to ensure the data fits in the memory block + // with bounds checking on tile sizing to ensure the data fits in the memory block bool can_memcpy = ( Data[BIdx].alpha == 1.0f && Data[BIdx].beta == 0.0f && @@ -328,21 +321,37 @@ ArmKleidiAI::MlasGemmBatch( if (can_memcpy) { std::memcpy(dst_tile, temp_tile, TileSizeM * TileSizeN * sizeof(float)); - }else { - // apply alpha scaling and beta to output files - for (size_t i = 0; i < TileSizeM; ++i) { - for (size_t j = 0; j < TileSizeN; ++j) { - const size_t idx = i * TileSizeN + j; - const size_t dst_idx = i * Data[BIdx].ldc + j; - - float ab = temp_tile[idx]; - float c_orig = dst_tile[dst_idx]; + return; + } - dst_tile[dst_idx] = Data[BIdx].alpha * ab + Data[BIdx].beta * c_orig; + float alpha = Data[BIdx].alpha; + float beta = Data[BIdx].beta; + size_t ldc = Data[BIdx].ldc; + + for (size_t i = 0; i < TileSizeM; ++i) { + for (size_t j = 0; j < TileSizeN; ++j) { + const size_t temp_idx = i * TileSizeN + j; + const size_t dst_idx = i * ldc + j; + + float ab = temp_tile[temp_idx]; + float c_orig = dst_tile[dst_idx]; + + if (alpha == 1.0f && beta == 0.0f) { + dst_tile[dst_idx] = ab; + } else if (alpha == 1.0f) { + dst_tile[dst_idx] = ab + beta * c_orig; + } else if (beta == 0.0f) { + dst_tile[dst_idx] = alpha * ab; + } else { + dst_tile[dst_idx] = alpha * ab + beta * c_orig; } } } + return; }); + return true; + } + else { + return false; } - return true; } diff --git a/onnxruntime/core/optimizer/qdq_transformer/weight_bias_quantization.cc 
b/onnxruntime/core/optimizer/qdq_transformer/weight_bias_quantization.cc
index 8caa67f266266..4efaec325292a 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/weight_bias_quantization.cc
+++ b/onnxruntime/core/optimizer/qdq_transformer/weight_bias_quantization.cc
@@ -13,6 +13,39 @@ namespace onnxruntime {
+/**
+ * Checks whether the output path from a given node leads to a QuantizeLinear op, optionally through a
+ * non-branching ReLU or Clip op in between. See also: NodeGroupSelector::GetQDQSelection() in qdq_selectors.cc.
+ *
+ * @param node The starting node to check the output path from.
+ * @param graph The graph containing the nodes.
+ *
+ * @return true if the path exists, false otherwise.
+ */
+static bool IsNoBranchPathToQuantizeLinear(const Node& node, const Graph& graph) {
+  const Node* current = &node;
+  while (true) {
+    // Conv / ConvTranspose / Gemm produce a single output
+    if (current->OutputDefs().size() != 1) {
+      return false;
+    }
+    const std::vector<const Node*>& consumers = graph.GetConsumerNodes(current->OutputDefs()[0]->Name());
+    // Branching or no consumer: not eligible
+    if (consumers.size() != 1) {
+      return false;
+    }
+    const Node* consumer = consumers[0];
+    if (consumer->OpType() == QDQ::QOpName) {
+      return true;
+    }
+    // Allow ReLU or Clip, see also: NodeGroupSelector::GetQDQSelection() in qdq_selectors.cc.
+    if (consumer->OpType() != "Relu" && consumer->OpType() != "Clip") {
+      return false;
+    }
+    current = consumer;
+  }
+}
+
 Status WeightBiasQuantization::ApplyImpl(Graph& graph, bool& modified, int graph_level,
                                          const logging::Logger& logger) const {
   const GraphViewer graph_viewer{graph};
@@ -43,11 +76,8 @@ Status WeightBiasQuantization::ApplyImpl(Graph& graph, bool& modified, int graph
       continue;
     }
 
-    // Require that the node's output is consumed by a single QuantizeLinear node.
-    // Otherwise, if only the inputs are quantized, but not the output, then this node group would not
-    // be considered a QDQ node unit anyway.
-    std::vector<const Node*> children_nodes = graph.GetConsumerNodes(node.OutputDefs()[0]->Name());
-    if (children_nodes.size() != 1 || children_nodes[0]->OpType() != QDQ::QOpName) {
+    // Check if the output path leads to QuantizeLinear, optionally with a ReLU or Clip op in between.
+    if (!IsNoBranchPathToQuantizeLinear(node, graph)) {
      continue;
    }
diff --git a/onnxruntime/core/platform/windows/env.cc b/onnxruntime/core/platform/windows/env.cc
index 36c6b54a1fce0..aa237fc6441b2 100644
--- a/onnxruntime/core/platform/windows/env.cc
+++ b/onnxruntime/core/platform/windows/env.cc
@@ -29,6 +29,7 @@ limitations under the License.
#include #include "core/common/logging/logging.h" #include "core/common/narrow.h" +#include "core/common/safeint.h" #include "core/common/span_utils.h" #include "core/platform/env.h" #include "core/platform/scoped_resource.h" @@ -439,30 +440,28 @@ Status WindowsEnv::MapFileIntoMemory(_In_z_ const ORTCHAR_T* file_path, SYSTEM_INFO sysinfo; GetSystemInfo(&sysinfo); - static const DWORD page_size = sysinfo.dwPageSize; static const DWORD allocation_granularity = sysinfo.dwAllocationGranularity; - const FileOffsetType offset_to_page = offset % static_cast(page_size); - const size_t mapped_length = length + static_cast(offset_to_page); - const FileOffsetType mapped_offset = offset - offset_to_page; - if (mapped_offset % allocation_granularity != 0) { - const auto error_code = GetLastError(); - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, - "mapped offset must be a multiple of the allocation granularity", - " , mapped_offset = ", mapped_offset, - " , allocation_granularity = ", allocation_granularity, - " , errcode = ", error_code, - " - ", std::system_category().message(error_code)); - } + const FileOffsetType offset_to_granularity = offset % static_cast(allocation_granularity); + const SIZE_T mapped_length = SafeInt(offset_to_granularity) + length; + const FileOffsetType mapped_offset = offset - offset_to_granularity; + assert((mapped_offset % allocation_granularity) == 0); void* const mapped_base = MapViewOfFile(file_mapping_handle.get(), FILE_MAP_READ, static_cast((mapped_offset >> 32) & 0xFFFFFFFF), static_cast(mapped_offset & 0xFFFFFFFF), mapped_length); - GSL_SUPPRESS(r.11) + + if (mapped_base == nullptr) { + const auto error_code = GetLastError(); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "MapViewOfFile ", ToUTF8String(Basename(file_path)), + " fail, errcode = ", error_code, + " - ", std::system_category().message(error_code)); + } mapped_memory = - MappedMemoryPtr{reinterpret_cast(mapped_base) + offset_to_page, + MappedMemoryPtr{reinterpret_cast(mapped_base) + offset_to_granularity, [mapped_base](void*) { UnmapFile(mapped_base); }}; diff --git a/onnxruntime/core/providers/cuda/cuda_graph.cc b/onnxruntime/core/providers/cuda/cuda_graph.cc index 8353c654681fc..88e58aec70550 100644 --- a/onnxruntime/core/providers/cuda/cuda_graph.cc +++ b/onnxruntime/core/providers/cuda/cuda_graph.cc @@ -72,7 +72,7 @@ void CUDAGraphManager::CaptureEnd(CudaGraphAnnotation_t cuda_graph_annotation_id cuda_graph_set_.Put(cuda_graph_annotation_id, graph_exec); } -Status CUDAGraphManager::Replay(CudaGraphAnnotation_t cuda_graph_annotation_id) { +Status CUDAGraphManager::Replay(CudaGraphAnnotation_t cuda_graph_annotation_id, bool sync_status_flag) { // Although this function is not thread safe, the lock is not needed here because // CUDA EP maintains a separate cuda graph per thread LOGS_DEFAULT(INFO) << "Replaying CUDA graph on stream " << stream_ << " with cuda_graph_annotation_id " @@ -81,7 +81,9 @@ Status CUDAGraphManager::Replay(CudaGraphAnnotation_t cuda_graph_annotation_id) cudaGraphExec_t graph_exec = cuda_graph_set_.Get(cuda_graph_annotation_id); CUDA_RETURN_IF_ERROR(cudaGraphLaunch(graph_exec, stream_)); - CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream_)); + if (sync_status_flag) { + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream_)); + } return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/cuda_graph.h b/onnxruntime/core/providers/cuda/cuda_graph.h index 064b526e604bc..6b61a66671de4 100644 --- a/onnxruntime/core/providers/cuda/cuda_graph.h +++ 
b/onnxruntime/core/providers/cuda/cuda_graph.h @@ -38,7 +38,7 @@ struct CUDAGraphManager { void SetStream(cudaStream_t stream); void CaptureBegin(CudaGraphAnnotation_t cuda_graph_annotation_id); void CaptureEnd(CudaGraphAnnotation_t cuda_graph_annotation_id); - Status Replay(CudaGraphAnnotation_t cuda_graph_annotation_id); + Status Replay(CudaGraphAnnotation_t cuda_graph_annotation_id, bool sync_status_flag = true); void Reset(); diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc index 451be69c81cfb..93b673f2df5bd 100644 --- a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.cc @@ -19,7 +19,7 @@ #include "nv_data_transfer.h" #include "onnx_ctx_model_helper.h" #include "core/providers/cuda/shared_inc/cuda_call.h" -#include "core/providers/cuda/math/unary_elementwise_ops_impl.h" +#include "core/providers/cuda/cuda_graph.h" #include "core/session/allocator_adapters.h" #include "cuda_runtime_api.h" #include "core/common/parse_string.h" @@ -84,40 +84,6 @@ struct ShutdownProtobuf { namespace onnxruntime { -namespace cuda { -template <> -void Impl_Cast( - cudaStream_t stream, - const int64_t* input_data, int32_t* output_data, - size_t count) { - return g_host->cuda__Impl_Cast(static_cast(stream), input_data, output_data, count); -} - -template <> -void Impl_Cast( - cudaStream_t stream, - const int32_t* input_data, int64_t* output_data, - size_t count) { - return g_host->cuda__Impl_Cast(static_cast(stream), input_data, output_data, count); -} - -template <> -void Impl_Cast( - cudaStream_t stream, - const double* input_data, float* output_data, - size_t count) { - return g_host->cuda__Impl_Cast(static_cast(stream), input_data, output_data, count); -} - -template <> -void Impl_Cast( - cudaStream_t stream, - const float* input_data, double* output_data, - size_t count) { - return g_host->cuda__Impl_Cast(static_cast(stream), input_data, output_data, count); -} -} // namespace cuda - void* OutputAllocator::reallocateOutputAsync(char const* /*tensorName*/, void* /*currentMemory*/, uint64_t size, uint64_t /*alignment*/, cudaStream_t /*stream*/) noexcept { // Some memory allocators return nullptr when allocating zero bytes, but TensorRT requires a non-null ptr @@ -255,7 +221,8 @@ bool ApplyProfileShapesFromProviderOptions(std::vector>>& profile_min_shapes, std::unordered_map>>& profile_max_shapes, std::unordered_map>>& profile_opt_shapes, - ShapeRangesMap& input_explicit_shape_ranges) { + ShapeRangesMap& input_explicit_shape_ranges, + bool& cuda_graph_flag) { if (trt_profiles.size() == 0) { LOGS_DEFAULT(WARNING) << "[NvTensorRTRTX EP] Number of optimization profiles should be greater than 0, but it's 0."; return false; @@ -282,6 +249,10 @@ bool ApplyProfileShapesFromProviderOptions(std::vectorisShapeTensor()) { + if (cuda_graph_flag) { + LOGS_DEFAULT(WARNING) << "[NvTensorRTRTX EP] Shape tensor detected on input '" << input->getName() << "'. Disabling CUDA Graph."; + cuda_graph_flag = false; + } int shape_size = nb_dims == 0 ? 
1 : static_cast(profile_min_shapes[input_name][i].size()); std::vector shapes_min(shape_size), shapes_opt(shape_size), shapes_max(shape_size); @@ -366,51 +337,19 @@ bool ApplyProfileShapesFromProviderOptions(std::vector(); \ - skip_input_binding_allowed = false; \ - if (input_tensor_ptr != nullptr && elem_cnt > 0) { \ - scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, elem_cnt * sizeof(DstT))); \ - data = scratch_buffers.back().get(); \ - cuda::Impl_Cast(stream, input_tensor_ptr, reinterpret_cast(data), elem_cnt); \ - } else { \ - scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, 1)); \ - data = scratch_buffers.back().get(); \ - } \ - break; \ - } - #define CASE_GET_OUTPUT_TENSOR(DATA_TYPE, SrcT) \ case DATA_TYPE: { \ auto output_tensor_ptr = output_tensor.GetTensorMutableData(); \ data_ptr = output_tensor_ptr; \ if (output_tensor_ptr != nullptr && elem_cnt > 0) { \ - buffers[output_name] = output_tensor_ptr; \ + buffer = output_tensor_ptr; \ } else { \ scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, 1)); \ - buffers[output_name] = scratch_buffers.back().get(); \ + buffer = scratch_buffers.back().get(); \ } \ break; \ } -#define CASE_GET_CAST_OUTPUT_TENSOR(DATA_TYPE, SrcT, DstT) \ - case DATA_TYPE: { \ - auto output_tensor_ptr = output_tensor.GetTensorMutableData(); \ - data_ptr = output_tensor_ptr; \ - skip_output_binding_allowed = false; \ - if (output_tensor_ptr != nullptr && elem_cnt > 0) { \ - scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, elem_cnt * sizeof(DstT))); \ - buffers[output_name] = scratch_buffers.back().get(); \ - output_dim_sizes[i] = static_cast(elem_cnt); \ - } else { \ - scratch_buffers.push_back(IAllocator::MakeUniquePtrFromOrtAllocator(alloc, 1)); \ - buffers[output_name] = scratch_buffers.back().get(); \ - output_dim_sizes[i] = 1; \ - } \ - break; \ - } - #define CASE_COPY_TENSOR(DATA_TYPE, DstT) \ case DATA_TYPE: { \ auto output_tensor_ptr = output_tensor.GetTensorMutableData(); \ @@ -420,15 +359,6 @@ bool ApplyProfileShapesFromProviderOptions(std::vector(); \ - if (output_tensor_ptr != nullptr && elem_cnt > 0) { \ - cuda::Impl_Cast(stream, reinterpret_cast(allocator->getBuffer()), reinterpret_cast(output_tensor_ptr), elem_cnt); \ - } \ - break; \ - } - /* * Set Nv executio context input. 
* @@ -551,7 +481,6 @@ Status BindContextInput(Ort::KernelContext& ctx, CASE_GET_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8, uint8_t) CASE_GET_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, int32_t) CASE_GET_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, int64_t) - CASE_GET_CAST_INPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE, double, float) default: { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "NvTensorRTRTX EP input onnx tensor data type: " + std::to_string(tensor_type) + " not supported."); @@ -576,8 +505,6 @@ Status BindContextInput(Ort::KernelContext& ctx, * param output_type - Data type of the output * param i - Output iteration index * param output_tensors - Output iteration index to output's ORT value - * param output_dim_sizes - Output iteration index to the multiplocation of its shape's dimensions - * param dds_output_set - DDS output set * param dds_output_allocator_map - DDS output to its allocator * param scratch_buffer - The allocation buffer created by TRT EP * param allocator - ORT allocator @@ -589,16 +516,11 @@ Status BindContextOutput(Ort::KernelContext& ctx, const char* output_name, size_t output_index, size_t output_type, - size_t i, - std::unordered_map& output_tensors, - std::unordered_map& output_dim_sizes, DDSOutputAllocatorMap& dds_output_allocator_map, std::vector>& scratch_buffers, OrtAllocator* alloc, - std::unordered_map& buffers, nvinfer1::Dims& dims, - void*& data_ptr, - bool& skip_output_binding_allowed) { + void*& data_ptr) { // Get output shape dims = trt_context->getTensorShape(output_name); int nb_dims = dims.nbDims; @@ -628,10 +550,11 @@ Status BindContextOutput(Ort::KernelContext& ctx, data_ptr = nullptr; // Set data_ptr to nullptr for DDS output binding. } } else { - output_tensors[i] = ctx.GetOutput(output_index, dims.d, nb_dims); - auto& output_tensor = output_tensors[i]; + auto output_tensor = ctx.GetOutput(output_index, dims.d, nb_dims); const auto elem_cnt = output_tensor.GetTensorTypeAndShapeInfo().GetElementCount(); + void* buffer = nullptr; + switch (output_type) { // below macros set data_ptr and skip_output_binding_allowed variables CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, float) @@ -642,13 +565,12 @@ Status BindContextOutput(Ort::KernelContext& ctx, CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8, uint8_t) CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, int32_t) CASE_GET_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, int64_t) - CASE_GET_CAST_OUTPUT_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE, double, float) default: { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "NvTensorRTRTX EP output tensor data type: " + std::to_string(output_type) + " not supported."); } } - trt_context->setTensorAddress(output_name, buffers[output_name]); + trt_context->setTensorAddress(output_name, buffer); } return Status::OK(); @@ -705,7 +627,6 @@ Status BindKernelOutput(Ort::KernelContext& ctx, CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8, uint8_t) CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32, int32_t) CASE_COPY_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64, int64_t) - CASE_CAST_TENSOR(ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE, float, double) default: { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "NvTensorRTRTX EP output tensor data type: " + std::to_string(output_type) + " not supported."); @@ -715,10 +636,10 @@ Status BindKernelOutput(Ort::KernelContext& ctx, } NvExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId device_id, bool has_user_compute_stream, cudaStream_t 
stream) {
-  // TODO: figure out if PerThreadContext is used at all. If not, just clean it up.
+  // Only set the device when the user has provided a compute stream.
   if (has_user_compute_stream) {
     CUDA_CALL_THROW(cudaSetDevice(device_id));
-    (void)(stream);
+    (void)stream;
   }
 }
@@ -745,6 +666,86 @@ bool NvExecutionProvider::PerThreadContext::UpdateTensorRTContext(std::string fu
   return false;
 }
 
+void NvExecutionProvider::PerThreadContext::DeleteCapturedGraph(CudaGraphAnnotation_t cuda_graph_annotation_id) {
+  graph_id_to_run_count_.erase(cuda_graph_annotation_id);
+  cuda_graph_.Reset();
+}
+
+void NvExecutionProvider::PerThreadContext::ResetWarmupRuns(CudaGraphAnnotation_t cuda_graph_annotation_id) {
+  if (graph_id_to_run_count_.find(cuda_graph_annotation_id) == graph_id_to_run_count_.end()) {
+    return;
+  }
+  graph_id_to_run_count_[cuda_graph_annotation_id] = 0;
+}
+
+bool NvExecutionProvider::PerThreadContext::IsGraphCaptureAllowed(CudaGraphAnnotation_t cuda_graph_annotation_id) const {
+  if (!IsGraphCaptureAllowedOnRun(cuda_graph_annotation_id)) {
+    return false;
+  }
+
+  // Safe access to map - return false if key doesn't exist yet
+  auto it = graph_id_to_run_count_.find(cuda_graph_annotation_id);
+  if (it == graph_id_to_run_count_.end()) {
+    return false;  // Entry doesn't exist yet, not ready for capture
+  }
+
+  bool allowed = it->second >= min_num_runs_before_cuda_graph_capture_;
+  if (allowed) {
+    LOGS_DEFAULT(VERBOSE) << "NvTensorRTRTX EP Graph capture allowed for ID: " << cuda_graph_annotation_id
+                          << ", run count: " << it->second;
+  }
+  return allowed;
+}
+
+bool NvExecutionProvider::PerThreadContext::IsGraphCaptureAllowedOnRun(CudaGraphAnnotation_t cuda_graph_annotation_id) const {
+  return cuda_graph_.IsGraphCaptureAllowedOnRun(cuda_graph_annotation_id);
+}
+
+CudaGraphAnnotation_t NvExecutionProvider::PerThreadContext::GetCudaGraphAnnotationId(const onnxruntime::RunOptions& run_options) const {
+  // Read the CUDA graph annotation id from the run options.
+  auto graph_annotation_str = run_options.GetConfigOptions().GetConfigEntry(kOrtRunOptionsConfigCudaGraphAnnotation);
+  CudaGraphAnnotation_t cuda_graph_annotation_id = kCudaGraphAnnotationDefault;
+
+  // Minimal parsing for now; this can be hardened to match the CUDA EP.
+  if (graph_annotation_str.has_value() && !graph_annotation_str->empty()) {
+    if (!TryParseStringWithClassicLocale(*graph_annotation_str, cuda_graph_annotation_id)) {
+      LOGS_DEFAULT(WARNING) << "[NvTensorRTRTX EP] Failed to parse cuda graph annotation id: "
+                            << *graph_annotation_str << ", using default: " << kCudaGraphAnnotationDefault;
+      cuda_graph_annotation_id = kCudaGraphAnnotationDefault;
+    }
+  }
+  return cuda_graph_annotation_id;
+}
+
+void NvExecutionProvider::PerThreadContext::SetCurrentGraphAnnotationId(CudaGraphAnnotation_t cuda_graph_annotation_id) {
+  current_graph_annotation_id_ = cuda_graph_annotation_id;
+}
+
+CudaGraphAnnotation_t NvExecutionProvider::PerThreadContext::GetCurrentGraphAnnotationId() const {
+  return current_graph_annotation_id_;
+}
+
+void NvExecutionProvider::PerThreadContext::CaptureBegin(CudaGraphAnnotation_t cuda_graph_annotation_id) {
+  cuda_graph_.Reset();
+  cuda_graph_.CaptureBegin(cuda_graph_annotation_id);
+}
+
+void NvExecutionProvider::PerThreadContext::CaptureEnd(CudaGraphAnnotation_t cuda_graph_annotation_id) {
+  cuda_graph_.CaptureEnd(cuda_graph_annotation_id);
+}
+
+bool NvExecutionProvider::PerThreadContext::IsGraphCaptured(CudaGraphAnnotation_t cuda_graph_annotation_id) const {
+  return cuda_graph_.IsGraphCaptured(cuda_graph_annotation_id);
+} + +Status NvExecutionProvider::PerThreadContext::ReplayGraph(CudaGraphAnnotation_t cuda_graph_annotation_id, bool sync_status_flag) { + return cuda_graph_.Replay(cuda_graph_annotation_id, sync_status_flag); +} + +void NvExecutionProvider::PerThreadContext::IncrementRegularRunCountBeforeGraphCapture(CudaGraphAnnotation_t cuda_graph_annotation_id) { + graph_id_to_run_count_[cuda_graph_annotation_id]++; +} + bool NvExecutionProvider::PerThreadContext::IsTensorRTContextInMap(std::string fused_node) { auto it = trt_context_map_.find(fused_node); if (it != trt_context_map_.end()) { @@ -846,6 +847,12 @@ NvExecutionProvider::NvExecutionProvider(const NvExecutionProviderInfo& info) if (info.has_user_compute_stream) { external_stream_ = true; stream_ = static_cast(info.user_compute_stream); + } else if (cuda_graph_enable_) { + external_stream_ = false; + CUDA_CALL_THROW(cudaStreamCreate(&stream_)); + } else { + external_stream_ = false; + stream_ = nullptr; // Will be created in compute function } std::string profile_min_shapes, profile_max_shapes, profile_opt_shapes; @@ -1010,7 +1017,7 @@ NvExecutionProvider::NvExecutionProvider(const NvExecutionProviderInfo& info) // external stream: // If user provides "external" cuda stream, only this cuda stream will be used even if multiple threads are running InferenceSession.Run() concurrently. // So, no need to synchronize different streams after enqueueV3. - if (cuda_graph_enable_ || external_stream_) { + if (external_stream_) { sync_stream_after_enqueue_ = false; } @@ -1038,7 +1045,7 @@ NvExecutionProvider::NvExecutionProvider(const NvExecutionProviderInfo& info) << ", nv_force_sequential_engine_build: " << force_sequential_engine_build_ << ", nv_sparsity_enable: " << sparsity_enable_ << ", nv_auxiliary_streams: " << auxiliary_streams_ - << ", nv_cuda_graph_enable: " << cuda_graph_enable_ + << ", enable_cuda_graph: " << cuda_graph_enable_ << ", nv_dump_ep_context_model: " << dump_ep_context_model_ << ", nv_ep_context_file_path: " << ep_context_file_path_ << ", nv_ep_context_embed_mode: " << ep_context_embed_mode_ @@ -1060,7 +1067,7 @@ NvExecutionProvider::~NvExecutionProvider() { } } - if (!external_stream_ && stream_) { + if (!external_stream_ && stream_ != nullptr) { ORT_IGNORE_RETURN_VALUE(CUDA_CALL(cudaStreamDestroy(stream_))); } ReleaseTensorRTCustomOpDomainList(info_.custom_op_domain_list); @@ -1072,41 +1079,82 @@ NvExecutionProvider::~NvExecutionProvider() { } } +void NvExecutionProvider::HandleCudaGraphStart(cudaStream_t stream, bool require_io_binding, + CudaGraphAnnotation_t cuda_graph_annotation_id, bool& graph_replay_on_this_run, bool& should_start_capture) { + graph_replay_on_this_run = false; + should_start_capture = false; + + // Case 1: CUDA Graph capture is enabled AND IO binding is required. + // In this case, we force graph re-capture by resetting warmup runs. + // If a graph for this annotation ID already exists, delete it before proceeding. 
+ if (require_io_binding && cuda_graph_enable_) { + GetPerThreadContext().ResetWarmupRuns(cuda_graph_annotation_id); + + if (GetPerThreadContext().IsGraphCaptured(cuda_graph_annotation_id)) { + LOGS_DEFAULT(WARNING) << "[NvTensorRTRTX EP] Graph already captured and required_io_binding is true, resetting warmup runs and deleting graph"; + GetPerThreadContext().DeleteCapturedGraph(cuda_graph_annotation_id); + } + // Case 2: CUDA Graph capture is enabled AND IO binding is NOT required + } else if (cuda_graph_enable_ && !require_io_binding) { + // If the graph is not yet captured, increment the regular run counter + if (cuda_graph_annotation_id != kCudaGraphAnnotationSkip && + !GetPerThreadContext().IsGraphCaptured(cuda_graph_annotation_id)) { + GetPerThreadContext().IncrementRegularRunCountBeforeGraphCapture(cuda_graph_annotation_id); + } + + // If capture is allowed and graph not already captured, + // set the stream and begin capture + if (!GetPerThreadContext().IsGraphCaptured(cuda_graph_annotation_id) && + GetPerThreadContext().IsGraphCaptureAllowed(cuda_graph_annotation_id)) { + GetPerThreadContext().SetCudaGraphStream(stream); + GetPerThreadContext().CaptureBegin(cuda_graph_annotation_id); + should_start_capture = true; + } + + // If a graph is already captured for this ID, mark it for replay in this run. + if (GetPerThreadContext().IsGraphCaptured(cuda_graph_annotation_id)) { + graph_replay_on_this_run = true; + } + } +} + bool NvExecutionProvider::IsGraphCaptureEnabled() const { return cuda_graph_enable_; } -bool NvExecutionProvider::IsGraphCaptureAllowed() const { - return regular_run_count_before_graph_capture_ >= min_num_runs_before_cuda_graph_capture_; +bool NvExecutionProvider::IsGraphCaptured(int graph_annotation_id) const { + // This is hardcoded to always return false because we are not allowing the ORT framework to have the CUDA graph control. + (void)graph_annotation_id; + return false; } -void NvExecutionProvider::CaptureBegin(int) { - cuda_graph_.Reset(); - cuda_graph_.CaptureBegin(0); +Status NvExecutionProvider::ReplayGraph(int graph_annotation_id) { + // This is hardcoded to always return OK because we are not allowing the ORT framework to have the CUDA graph control. + (void)graph_annotation_id; + return Status::OK(); } -void NvExecutionProvider::CaptureEnd(int) { - cuda_graph_.CaptureEnd(0); - is_graph_captured_ = true; -} +Status NvExecutionProvider::OnRunStart(const onnxruntime::RunOptions& run_options) { + if (cuda_graph_enable_) { + CudaGraphAnnotation_t cuda_graph_annotation_id = GetPerThreadContext().GetCudaGraphAnnotationId(run_options); + GetPerThreadContext().SetCurrentGraphAnnotationId(cuda_graph_annotation_id); + } -bool NvExecutionProvider::IsGraphCaptured(int) const { - return is_graph_captured_; + if (multi_profile_enable_ == true) { + auto graph_annotation_str = + run_options.GetConfigOptions().GetConfigEntry(nv::run_option_names::kProfileIndex); + TryParseStringWithClassicLocale(*graph_annotation_str, nv_profile_index_); + } + return Status::OK(); } -Status NvExecutionProvider::ReplayGraph(int) { - ORT_ENFORCE(IsGraphCaptured(0)); - // Please note that CUDAGraph::Replay() is not thread safe. - // ORT TRT calls ReplayGraph() in compute_func() where synchronization is enforced due to lock_guard(), - // therefore calling CUDAGraph::Replay() here is guaranteed to be thread safe. 
- return cuda_graph_.Replay(0); -} +Status NvExecutionProvider::OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& run_options) { + (void)run_options; -void NvExecutionProvider::IncrementRegularRunCountBeforeGraphCapture() { - // Please note that this function is not thread safe. - // ORT TRT calls this function in compute_func() where synchronization is enforced due to lock_guard(), - // therefore following increment is guaranteed to be thread safe. - ++regular_run_count_before_graph_capture_; + if (sync_stream && external_stream_) { + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream_)); + } + return Status::OK(); } std::vector NvExecutionProvider::CreatePreferredAllocators() { @@ -1133,22 +1181,6 @@ std::unique_ptr NvExecutionProvider::GetDataTransfer() const { return std::make_unique(); } -Status NvExecutionProvider::OnRunStart(const onnxruntime::RunOptions& run_options) { - if (multi_profile_enable_ == true) { - auto graph_annotation_str = - run_options.GetConfigOptions().GetConfigEntry(nv::run_option_names::kProfileIndex); - TryParseStringWithClassicLocale(*graph_annotation_str, nv_profile_index_); - } - return Status::OK(); -} - -Status NvExecutionProvider::OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& /*run_options*/) { - if (sync_stream && external_stream_) { - CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream_)); - } - return Status::OK(); -} - // Get the pointer to the IBuilder instance. // Note: This function is not thread safe. Calls to this function from different threads must be serialized // even though it doesn't make sense to have multiple threads initializing the same inference session. @@ -2519,7 +2551,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr profile_opt_shapes_.find(input_name) != profile_opt_shapes_.end() && profile_max_shapes_.find(input_name) != profile_max_shapes_.end(); if (has_explicit_profile && tensor_has_profile) { - apply_profile = ApplyProfileShapesFromProviderOptions(trt_profiles, input, profile_min_shapes_, profile_max_shapes_, profile_opt_shapes_, input_explicit_shape_ranges); + apply_profile = ApplyProfileShapesFromProviderOptions(trt_profiles, input, profile_min_shapes_, profile_max_shapes_, profile_opt_shapes_, input_explicit_shape_ranges, cuda_graph_enable_); } else { LOGS_DEFAULT(INFO) << "[NvTensorRTRTX EP] Creating implicit profile for tensor " << input_name; profile_min_shapes_[input_name] = std::vector>{{}}; @@ -2546,7 +2578,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr profile_max_shapes_[input_name][0][idx_dim] = dim_value; } } - apply_profile = ApplyProfileShapesFromProviderOptions(trt_profiles, input, profile_min_shapes_, profile_max_shapes_, profile_opt_shapes_, input_explicit_shape_ranges); + apply_profile = ApplyProfileShapesFromProviderOptions(trt_profiles, input, profile_min_shapes_, profile_max_shapes_, profile_opt_shapes_, input_explicit_shape_ranges, cuda_graph_enable_); } if (!apply_profile) { std::ostringstream msg; @@ -2600,6 +2632,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr // Otherwise engine will be handled at inference time. 
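// ---------------------------------------------------------------------------------------------
// Caller-side sketch (not part of this change): selecting a CUDA graph per run. OnRunStart()
// above reads the annotation id from the run-option key kOrtRunOptionsConfigCudaGraphAnnotation
// ("gpu_graph_id"); an id equal to kCudaGraphAnnotationSkip bypasses capture (see
// HandleCudaGraphStart()). The session is assumed to already be created with this EP and its
// CUDA-graph option enabled; names and values below are illustrative only.
// ---------------------------------------------------------------------------------------------
#include <onnxruntime_cxx_api.h>
#include <onnxruntime_run_options_config_keys.h>

void run_with_graph_annotation(Ort::Session& session,
                               const char* const* input_names, const Ort::Value* inputs, size_t num_inputs,
                               const char* const* output_names, size_t num_outputs) {
  Ort::RunOptions run_options;
  // "1" keys the per-thread graph with annotation id 1; GetCudaGraphAnnotationId() parses it
  // with TryParseStringWithClassicLocale().
  run_options.AddConfigEntry(kOrtRunOptionsConfigCudaGraphAnnotation, "1");
  // The first runs with this id are warm-up runs; once the per-annotation run count reaches the
  // capture threshold the EP captures the graph, and later runs with the same id replay it.
  auto outputs = session.Run(run_options, input_names, inputs, num_inputs, output_names, num_outputs);
  (void)outputs;
}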
std::unique_ptr trt_engine; std::unique_ptr trt_context; + std::unique_ptr trt_runtime_config; // Generate file name for dumping ep context model if (dump_ep_context_model_ && ctx_model_path_.empty()) { @@ -2622,6 +2655,13 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "NvTensorRTRTX EP failed to deserialize engine for fused node: " + fused_node.Name()); } + + trt_runtime_config = std::unique_ptr(trt_engine->createRuntimeConfig()); + if (trt_runtime_config && cuda_graph_enable_) { + trt_runtime_config->setDynamicShapesKernelSpecializationStrategy(nvinfer1::DynamicShapesKernelSpecializationStrategy::kEAGER); + } + trt_runtime_config->setExecutionContextAllocationStrategy(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED); + if (detailed_build_log_) { auto engine_build_stop = std::chrono::steady_clock::now(); LOGS_DEFAULT(INFO) << "TensorRT engine build for " << fused_node.Name() << " took: " << std::chrono::duration_cast(engine_build_stop - engine_build_start).count() << "ms" << std::endl; @@ -2681,7 +2721,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr // Build context // Note: Creating an execution context from an engine is thread safe per TRT doc // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading - trt_context = std::unique_ptr(trt_engine->createExecutionContext(nvinfer1::ExecutionContextAllocationStrategy::kUSER_MANAGED)); + trt_context = std::unique_ptr(trt_engine->createExecutionContext(trt_runtime_config.get())); if (!trt_context) { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "NvTensorRTRTX EP could not build execution context for fused node: " + fused_node.Name()); @@ -2712,7 +2752,6 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr } // Save TRT engine, other TRT objects and input/output info to map - parsers_.emplace(fused_node.Name(), std::move(trt_parser)); engines_.emplace(fused_node.Name(), std::move(trt_engine)); contexts_.emplace(fused_node.Name(), std::move(trt_context)); networks_.emplace(fused_node.Name(), std::move(trt_network)); @@ -2728,7 +2767,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr compute_info.create_state_func = [=](ComputeContext* context, FunctionState* state) { std::unique_ptr p = std::make_unique(); *p = {context->allocate_func, context->release_func, context->allocator_handle, context->node_name, builder_.get(), - &parsers_[context->node_name], &engines_[context->node_name], &contexts_[context->node_name], + &engines_[context->node_name], &contexts_[context->node_name], &networks_[context->node_name], input_info_[context->node_name], output_info_[context->node_name], input_shape_ranges_[context->node_name], &tensorrt_mu_, engine_cache_enable_, cache_path_, @@ -2766,7 +2805,6 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr auto trt_engine = trt_state->engine->get(); auto trt_context = trt_state->context->get(); auto trt_profiles = trt_state->profiles; - int num_outputs = static_cast(output_indexes.size()); std::unordered_set input_names; if (alloc_ == nullptr) { @@ -2777,9 +2815,17 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr } OrtAllocator* alloc = alloc_; - void* cuda_stream; - Ort::ThrowOnError(api->KernelContext_GetGPUComputeStream(context, &cuda_stream)); - cudaStream_t stream = static_cast(cuda_stream); + cudaStream_t stream; + if (stream_ 
!= nullptr) { + // Use our existing stream (either user's or our early-created) + stream = stream_; + } else { + // Create stream now (lazy creation case) + void* cuda_stream; + Ort::ThrowOnError(api->KernelContext_GetGPUComputeStream(context, &cuda_stream)); + stream = static_cast(cuda_stream); + stream_ = stream; + } if (multi_profile_enable_ == true) { if (!trt_context->setOptimizationProfileAsync(nv_profile_index_, stream)) @@ -2833,16 +2879,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr /* * Set output shapes and bind output buffers */ - std::unordered_map buffers; - buffers.reserve(num_outputs); - using OutputOrtValue = Ort::UnownedValue; - std::unordered_map output_tensors; - output_tensors.reserve(num_outputs); - std::unordered_map output_dim_sizes; - output_dim_sizes.reserve(num_outputs); - if (require_io_binding) { - bool skip_output_binding_allowed = true; for (size_t i = 0, end = output_binding_names.size(); i < end; ++i) { char const* output_name = output_binding_names[i]; @@ -2861,16 +2898,14 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr nvinfer1::Dims dims; void* data_ptr = nullptr; - Status status = BindContextOutput(ctx, trt_context, output_name, output_index, output_type, i, output_tensors, output_dim_sizes, - dds_output_allocator_map, scratch_buffers, alloc, buffers, dims, data_ptr, skip_output_binding_allowed); + Status status = BindContextOutput(ctx, trt_context, output_name, output_index, output_type, + dds_output_allocator_map, scratch_buffers, alloc, dims, data_ptr); if (status != Status::OK()) { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); } trt_state->output_tensors[output_index] = TensorParams{data_ptr, dims}; } - - trt_state->skip_io_binding_allowed = trt_state->skip_io_binding_allowed | skip_output_binding_allowed; } // Set execution context memory @@ -2886,18 +2921,23 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr trt_context->setDeviceMemoryV2(trt_state->context_memory.get(), mem_size); } } - // Start CUDA graph capture. - // Note: The reason we don't put graph capture in OnRunStart() like CUDA EP does is because - // current ORT TRT doesn't get cuda stream until compute time and graph capture requires cuda stream. 
- if (cuda_graph_enable_ && IsGraphCaptureAllowed() && !IsGraphCaptured(0)) { - LOGS_DEFAULT(INFO) << "Capturing the cuda graph for this model"; - cuda_graph_.SetStream(stream); - CaptureBegin(0); - } - // Run TRT inference - if (!trt_context->enqueueV3(stream)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "NvTensorRTRTX EP execution context enqueue failed."); + // Start CUDA graph capture with the correct stream + // Note: We need to set the stream and start capture here because this is where we have access to the actual compute stream + // Get the graph annotation ID that was stored during OnRunStart + CudaGraphAnnotation_t cuda_graph_annotation_id = GetPerThreadContext().GetCurrentGraphAnnotationId(); + bool graph_replay_on_this_run = false; + bool should_start_capture = false; + + HandleCudaGraphStart(stream, require_io_binding, cuda_graph_annotation_id, + graph_replay_on_this_run, should_start_capture); + + if (!graph_replay_on_this_run) { + if (!trt_context->enqueueV3(stream)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "NvTensorRTRTX EP execution context enqueue failed."); + } + } else { + ORT_RETURN_IF_ERROR(GetPerThreadContext().ReplayGraph(cuda_graph_annotation_id, sync_stream_after_enqueue_)); } /* @@ -2914,10 +2954,15 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr * Therefore, TRT EP needs to call cudaStreamSynchronize() which means to wait until stream has completed all operations to prevent the concurrent issue mentioned above. * However, if cuda graph is enabled, TRT EP won't call cudaStreamSynchronize() since it's not allowed during graph capture. */ + + if (cuda_graph_enable_ && should_start_capture) { + GetPerThreadContext().CaptureEnd(cuda_graph_annotation_id); + ORT_RETURN_IF_ERROR(GetPerThreadContext().ReplayGraph(cuda_graph_annotation_id, sync_stream_after_enqueue_)); + } + if (sync_stream_after_enqueue_) { CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream)); } - // Assign TRT output back to ORT output // (1) Bind TRT DDS output to ORT kernel context output. (It needs to wait until enqueueV3 is finished) // (2) Cast TRT INT32 output to ORT INT64 output or TRT double output to float output @@ -2940,29 +2985,6 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphViewer& gr if (status != Status::OK()) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, status.ErrorMessage()); } - } else { - auto& output_tensor = output_tensors[i]; - if (output_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE) { - auto output_tensor_ptr = output_tensor.GetTensorMutableData(); - if (output_tensor_ptr != nullptr) { - cuda::Impl_Cast(stream, reinterpret_cast(buffers[output_name]), output_tensor_ptr, output_dim_sizes[i]); - } - } - } - } - - // End CUDA graph capture. - // Note: One reason we don't put end of graph capture in OnRunEnd() like CUDA EP does is because of cuda stream mentioned in graph capture - // above, another reason is because OnRunEnd() is not synchronized with OnRunStart() and ExecuteGraph() per inference_session.cc. - // It's safe to start/end CUDA graph capture in compute_func() here since cuda graph object is maintained by a per thread basis. - if (cuda_graph_enable_ && !IsGraphCaptured(0)) { - if (IsGraphCaptureAllowed()) { - CaptureEnd(0); - // CUDA work issued to a capturing stream doesn't actually run on the GPU, - // so run the captured graph here to actually execute the work. 
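// ---------------------------------------------------------------------------------------------
// Standalone illustration (not part of this change) of the stream-capture behaviour the removed
// comment above describes: work enqueued on a capturing stream is only recorded, not executed,
// which is why CaptureEnd() is immediately followed by ReplayGraph() in the new code path.
// Minimal sketch assuming the CUDA 12.x three-argument cudaGraphInstantiate(); a memset stands
// in for the enqueueV3() call.
// ---------------------------------------------------------------------------------------------
#include <cuda_runtime.h>
#include <cstddef>
#include <cstdio>
#include <cstdlib>

#define CUDA_OK(expr)                                                            \
  do {                                                                           \
    cudaError_t err_ = (expr);                                                    \
    if (err_ != cudaSuccess) {                                                    \
      std::fprintf(stderr, "%s failed: %s\n", #expr, cudaGetErrorString(err_));   \
      std::exit(1);                                                               \
    }                                                                             \
  } while (0)

void capture_and_replay_once(void* device_buffer, size_t bytes) {
  cudaStream_t stream;
  CUDA_OK(cudaStreamCreate(&stream));

  CUDA_OK(cudaStreamBeginCapture(stream, cudaStreamCaptureModeThreadLocal));
  CUDA_OK(cudaMemsetAsync(device_buffer, 0, bytes, stream));  // recorded into the graph only

  cudaGraph_t graph;
  CUDA_OK(cudaStreamEndCapture(stream, &graph));

  cudaGraphExec_t graph_exec;
  CUDA_OK(cudaGraphInstantiate(&graph_exec, graph, 0));  // CUDA 12.x signature

  CUDA_OK(cudaGraphLaunch(graph_exec, stream));  // only now does the memset actually execute
  CUDA_OK(cudaStreamSynchronize(stream));

  CUDA_OK(cudaGraphExecDestroy(graph_exec));
  CUDA_OK(cudaGraphDestroy(graph));
  CUDA_OK(cudaStreamDestroy(stream));
}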
- ORT_RETURN_IF_ERROR(ReplayGraph(0)); - } else { - IncrementRegularRunCountBeforeGraphCapture(); } } @@ -3086,7 +3108,6 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra auto& dds_output_allocator_map = this->dds_output_allocator_maps_[fused_node_name]; auto trt_engine = trt_state->engine->get(); auto trt_context = trt_state->context->get(); - int num_outputs = static_cast(output_indexes.size()); std::unordered_map> shape_tensor_values; // This map holds "shape tensor -> shape values" for the shape tensor input across this inference run std::unordered_map> shape_tensor_values_int64; // same as above but for int64 shape tensor input @@ -3098,9 +3119,16 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra } OrtAllocator* alloc = alloc_; - void* cuda_stream; - Ort::ThrowOnError(api->KernelContext_GetGPUComputeStream(context, &cuda_stream)); - cudaStream_t stream = static_cast(cuda_stream); + cudaStream_t stream; + if (stream_ != nullptr) { + // Use our existing stream (either user's or our early-created) + stream = stream_; + } else { + // Create stream now (lazy creation case) + void* cuda_stream; + Ort::ThrowOnError(api->KernelContext_GetGPUComputeStream(context, &cuda_stream)); + stream = static_cast(cuda_stream); + } // Check before using trt_engine if (trt_engine == nullptr) { @@ -3149,16 +3177,7 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra /* * Set output shapes and bind output buffers */ - std::unordered_map buffers; - buffers.reserve(num_outputs); - using OutputOrtValue = Ort::UnownedValue; - std::unordered_map output_tensors; - output_tensors.reserve(num_outputs); - std::unordered_map output_dim_sizes; - output_dim_sizes.reserve(num_outputs); - if (require_io_binding) { - bool skip_output_binding_allowed = true; for (size_t i = 0, end = output_binding_names.size(); i < end; ++i) { char const* output_name = output_binding_names[i]; @@ -3177,16 +3196,14 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra nvinfer1::Dims dims; void* data_ptr = nullptr; - Status status = BindContextOutput(ctx, trt_context, output_name, output_index, output_type, i, output_tensors, output_dim_sizes, - dds_output_allocator_map, scratch_buffers, alloc, buffers, dims, data_ptr, skip_output_binding_allowed); + Status status = BindContextOutput(ctx, trt_context, output_name, output_index, output_type, + dds_output_allocator_map, scratch_buffers, alloc, dims, data_ptr); if (status != Status::OK()) { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); } trt_state->output_tensors[output_index] = TensorParams{data_ptr, dims}; } - - trt_state->skip_io_binding_allowed = trt_state->skip_io_binding_allowed | skip_output_binding_allowed; } // Set execution context memory @@ -3203,18 +3220,23 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra trt_context->setDeviceMemoryV2(trt_state->context_memory.get(), mem_size); } } - // Start CUDA graph capture. - // Note: The reason we don't put graph capture in OnRunStart() like CUDA EP does is because - // current ORT TRT doesn't get cuda stream until compute time and graph capture requires cuda stream. 
- if (cuda_graph_enable_ && IsGraphCaptureAllowed() && !IsGraphCaptured(0)) { - LOGS_DEFAULT(INFO) << "Capturing the cuda graph for this model"; - cuda_graph_.SetStream(stream); - CaptureBegin(0); - } - // Run TRT inference - if (!trt_context->enqueueV3(stream)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "NvTensorRTRTX EP execution context enqueue failed."); + // Start CUDA graph capture with the correct stream + // Note: We need to set the stream and start capture here because this is where we have access to the actual compute stream + // Get the graph annotation ID that was stored during OnRunStart + CudaGraphAnnotation_t cuda_graph_annotation_id = GetPerThreadContext().GetCurrentGraphAnnotationId(); + bool graph_replay_on_this_run = false; + bool should_start_capture = false; + + HandleCudaGraphStart(stream, require_io_binding, cuda_graph_annotation_id, + graph_replay_on_this_run, should_start_capture); + + if (!graph_replay_on_this_run) { + if (!trt_context->enqueueV3(stream)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "NvTensorRTRTX EP execution context enqueue failed."); + } + } else { + ORT_RETURN_IF_ERROR(GetPerThreadContext().ReplayGraph(cuda_graph_annotation_id, sync_stream_after_enqueue_)); } /* @@ -3231,10 +3253,15 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra * Therefore, TRT EP needs to call cudaStreamSynchronize() which means to wait until stream has completed all operations to prevent the concurrent issue mentioned above. * However, if cuda graph is enabled, TRT EP won't call cudaStreamSynchronize() since it's not allowed during graph capture. */ + + if (cuda_graph_enable_ && should_start_capture) { + GetPerThreadContext().CaptureEnd(cuda_graph_annotation_id); + ORT_RETURN_IF_ERROR(GetPerThreadContext().ReplayGraph(cuda_graph_annotation_id, sync_stream_after_enqueue_)); + } + if (sync_stream_after_enqueue_) { CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream)); } - // Assign TRT output back to ORT output // (1) Bind TRT DDS output to ORT kernel context output. (It needs to wait until enqueueV3 is finished) // (2) Cast TRT INT32 output to ORT INT64 output or TRT double output to float output @@ -3257,29 +3284,6 @@ Status NvExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(const Gra if (status != Status::OK()) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, status.ErrorMessage()); } - } else { - auto& output_tensor = output_tensors[i]; - if (output_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE) { - auto output_tensor_ptr = output_tensor.GetTensorMutableData(); - if (output_tensor_ptr != nullptr) { - cuda::Impl_Cast(stream, reinterpret_cast(buffers[output_name]), output_tensor_ptr, output_dim_sizes[i]); - } - } - } - } - - // End CUDA graph capture. - // Note: One reason we don't put end of graph capture in OnRunEnd() like CUDA EP does is because of cuda stream mentioned in graph capture - // above, another reason is because OnRunEnd() is not synchronized with OnRunStart() and ExecuteGraph() per inference_session.cc. - // It's safe to start/end CUDA graph capture in compute_func() here since cuda graph object is maintained by a per thread basis. - if (cuda_graph_enable_ && !IsGraphCaptured(0)) { - if (IsGraphCaptureAllowed()) { - CaptureEnd(0); - // CUDA work issued to a capturing stream doesn't actually run on the GPU, - // so run the captured graph here to actually execute the work. 
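// ---------------------------------------------------------------------------------------------
// Caller-side sketch (not part of this change): producing the EPContext model that the
// precompiled-engine path above consumes on a later load. The config keys are the standard ones
// from onnxruntime_session_options_config_keys.h; the EP registration step and file names are
// illustrative assumptions.
// ---------------------------------------------------------------------------------------------
#include <onnxruntime_cxx_api.h>
#include <onnxruntime_session_options_config_keys.h>

void dump_ep_context_model(Ort::Env& env, Ort::SessionOptions& so /* already has this EP registered */) {
  so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1");                 // "ep.context_enable"
  so.AddConfigEntry(kOrtSessionOptionEpContextFilePath, "model_ctx.onnx");  // "ep.context_file_path"
  so.AddConfigEntry(kOrtSessionOptionEpContextEmbedMode, "0");              // keep the engine beside the model
  // Building this session compiles the TRT engine and writes model_ctx.onnx; sessions created
  // from model_ctx.onnx afterwards take CreateNodeComputeInfoFromPrecompiledEngine().
  Ort::Session session(env, ORT_TSTR("model.onnx"), so);  // hypothetical source model
  (void)session;
}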
- ORT_RETURN_IF_ERROR(ReplayGraph(0)); - } else { - IncrementRegularRunCountBeforeGraphCapture(); } } diff --git a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.h b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.h index e3dd38eb837ff..9e5fd03756f02 100644 --- a/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.h +++ b/onnxruntime/core/providers/nv_tensorrt_rtx/nv_execution_provider.h @@ -12,7 +12,7 @@ typedef void* cublasHandle_t; typedef void* cudnnStatus_t; #endif #include "core/providers/nv_tensorrt_rtx/nv_includes.h" - +#include "core/session/onnxruntime_run_options_config_keys.h" #include #include "core/providers/cuda/cuda_graph.h" #include "nv_execution_provider_info.h" @@ -195,7 +195,6 @@ struct TensorrtFuncState { AllocatorHandle allocator = nullptr; std::string fused_node_name; nvinfer1::IBuilder* builder; - tensorrt_ptr::unique_pointer* parser = nullptr; std::unique_ptr* engine = nullptr; std::unique_ptr* context = nullptr; std::unique_ptr* network = nullptr; @@ -305,9 +304,11 @@ class NvExecutionProvider : public IExecutionProvider { std::vector CreatePreferredAllocators() override; + // CUDA Graph support bool IsGraphCaptureEnabled() const override; bool IsGraphCaptured(int graph_annotation_id) const override; Status ReplayGraph(int graph_annotation_id) override; + void HandleCudaGraphStart(cudaStream_t stream, bool require_io_binding, CudaGraphAnnotation_t cuda_graph_annotation_id, bool& graph_replay_on_this_run, bool& should_start_capture); static common::Status RefitEngine(std::string onnx_model_filename, std::string& onnx_model_folder_path, @@ -384,7 +385,6 @@ class NvExecutionProvider : public IExecutionProvider { // In general, TensorRT objects are not thread safe; accesses to an object from different threads must be serialized by the client. // But there are still some thread safe operations, please see here https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading // For those non thread safe operations, TRT EP uses (1) lock_guard or (2) PerThreadContext to make sure synchronization. - std::unordered_map> parsers_; std::unordered_map> engines_; std::unordered_map> contexts_; std::unordered_map> builders_; @@ -405,15 +405,6 @@ class NvExecutionProvider : public IExecutionProvider { // Call cudaStreamSynchronize() after TRT enqueueV3() mutable bool sync_stream_after_enqueue_ = true; - CUDAGraph cuda_graph_; - bool is_graph_captured_ = false; - int regular_run_count_before_graph_capture_ = 0; - // There is chance (currently only happens in CUDA EP) that the second regular run allocates GPU memory for causes like: - // (1) memory pattern is enabled. (2) arena allocation for stream. - // Since no GPU memory allocation is allowed during graph capturing, we need at least two regular runs - // to allocate enough memory in Arena before graph capturing. - const int min_num_runs_before_cuda_graph_capture_ = 1; // required min regular runs before graph capture for the necessary memory allocations. - // [Note] We don't use PerThreadContext for now since it has issue with multithreading // // TRT or CUDA objects that must be maintained on a per thread basis will be put under this PerThreadContext data structure. 
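// ---------------------------------------------------------------------------------------------
// Reviewer-oriented distillation (not part of this change) of the per-annotation warm-up gate the
// header changes below introduce (graph_id_to_run_count_ + min_num_runs_before_cuda_graph_capture_).
// Only the counting logic is modelled; the real implementation lives in PerThreadContext.
// ---------------------------------------------------------------------------------------------
#include <unordered_map>

using CudaGraphAnnotationId = int;  // stand-in for CudaGraphAnnotation_t

class WarmupGate {
 public:
  explicit WarmupGate(int min_runs_before_capture) : min_runs_(min_runs_before_capture) {}

  // Called once per regular (non-replayed) run of a given annotation id.
  void RecordRegularRun(CudaGraphAnnotationId id) { ++run_count_[id]; }

  // Capture is allowed only after enough regular runs for that id; an id with no entry yet is
  // not ready, mirroring IsGraphCaptureAllowed() above.
  bool CaptureAllowed(CudaGraphAnnotationId id) const {
    auto it = run_count_.find(id);
    return it != run_count_.end() && it->second >= min_runs_;
  }

  // Forcing re-capture (e.g. when IO binding changes) resets the counter, as ResetWarmupRuns() does.
  void Reset(CudaGraphAnnotationId id) { run_count_[id] = 0; }

 private:
  int min_runs_;
  std::unordered_map<CudaGraphAnnotationId, int> run_count_;
};

// With min_runs == 2: two RecordRegularRun(1) calls make CaptureAllowed(1) true, while
// CaptureAllowed(7) stays false until id 7 has seen its own warm-up runs.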
@@ -436,14 +427,20 @@ class NvExecutionProvider : public IExecutionProvider { bool UpdateTensorRTContext(std::string fused_node, std::unique_ptr context); void ResetTensorRTContext(std::string fused_node); - void InitCUDAGraph(); - void SetGraphStream(cudaStream_t stream); - bool IsGraphCaptureAllowed() const; - void CaptureBegin(int graph_annotation_id); - void CaptureEnd(int graph_annotation_id); - bool IsGraphCaptured(int graph_annotation_id) const; - Status ReplayGraph(int graph_annotation_id); - void IncrementRegularRunCountBeforeGraphCapture(); + // CUDA Graph management + void SetCudaGraphStream(cudaStream_t stream) { cuda_graph_.SetStream(stream); } + bool IsGraphCaptureAllowed(CudaGraphAnnotation_t cuda_graph_annotation_id) const; + bool IsGraphCaptureAllowedOnRun(CudaGraphAnnotation_t cuda_graph_annotation_id) const; + CudaGraphAnnotation_t GetCudaGraphAnnotationId(const onnxruntime::RunOptions& run_options) const; + void SetCurrentGraphAnnotationId(CudaGraphAnnotation_t cuda_graph_annotation_id); + CudaGraphAnnotation_t GetCurrentGraphAnnotationId() const; + void CaptureBegin(CudaGraphAnnotation_t cuda_graph_annotation_id); + void CaptureEnd(CudaGraphAnnotation_t cuda_graph_annotation_id); + bool IsGraphCaptured(CudaGraphAnnotation_t cuda_graph_annotation_id) const; + Status ReplayGraph(CudaGraphAnnotation_t cuda_graph_annotation_id, bool sync_status_flag); + void IncrementRegularRunCountBeforeGraphCapture(CudaGraphAnnotation_t cuda_graph_annotation_id); + void ResetWarmupRuns(CudaGraphAnnotation_t cuda_graph_annotation_id); + void DeleteCapturedGraph(CudaGraphAnnotation_t cuda_graph_annotation_id); private: cudnnHandle_t external_cudnn_handle_ = nullptr; @@ -466,13 +463,18 @@ class NvExecutionProvider : public IExecutionProvider { // Cuda graph with multi threads will be supported in the future, so cuda_graph_ is put under PerThreadContext. // ORT TRT only supports CUDA graph when whole model is supported by TRT, so simply maintaining a CUDAGraph instance is enough (no need to maintain one CUDAGraph instance per TRT subgraph) CUDAGraph cuda_graph_; + // Map of graph id to regular_run_count_before_graph_capture + std::unordered_map graph_id_to_run_count_; bool is_graph_captured_ = false; int regular_run_count_before_graph_capture_ = 0; + // Current graph annotation ID for this run + CudaGraphAnnotation_t current_graph_annotation_id_ = kCudaGraphAnnotationDefault; // There is chance (currently only happens in CUDA EP) that the second regular run allocates GPU memory for causes like: // (1) memory pattern is enabled. (2) arena allocation for stream. // Since no GPU memory allocation is allowed during graph capturing, we need at least two regular runs // to allocate enough memory in Arena before graph capturing. - const int min_num_runs_before_cuda_graph_capture_ = 1; // required min regular runs before graph capture for the necessary memory allocations. + const int min_num_runs_before_cuda_graph_capture_ = 2; // required min regular runs before graph capture for the necessary memory allocations. 
+ // https://github.com/NVIDIA/TensorRT/blob/main/samples/common/sampleInference.cpp#L1258-L1291 Based on the trtexec code }; using PerThreadContextMap = std::unordered_map>; @@ -606,11 +608,6 @@ class NvExecutionProvider : public IExecutionProvider { std::unordered_map& output_map, std::vector& node_compute_funcs); - bool IsGraphCaptureAllowed() const; - void CaptureBegin(int graph_annotation_id); - void CaptureEnd(int graph_annotation_id); - void IncrementRegularRunCountBeforeGraphCapture(); - /** * Get the pointer to the IBuilder instance. * This function only creates the instance at the first time it's being called." diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index be59b1ae07020..68d15bdfdcee0 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -90,7 +90,12 @@ BackendManager::BackendManager(SessionContext& session_context, "[OpenVINO-EP] Bounded dynamic model execution using provider option reshape_input is not supported for OVEP EPContext model"; ORT_THROW(exception_str); } - model_stream = ep_ctx_handle_.GetModelBlobStream(session_context_.so_context_file_path, subgraph); + if (subgraph_context_.is_ep_ctx_ovir_encapsulated) { + model_stream = ep_ctx_handle_.GetModelBlobStream(session_context_.onnx_model_path_name.replace_extension("xml").string(), subgraph); + } else { + model_stream = ep_ctx_handle_.GetModelBlobStream(session_context_.so_context_file_path, subgraph); + } + } else { model_proto = GetModelProtoFromFusedNode(fused_node, subgraph, logger); } @@ -236,7 +241,9 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie std::ofstream blob_file(blob_filename, std::ios::out | std::ios::trunc | std::ios::binary); if (!blob_file) { - ORT_THROW("Unable to open file for epctx model dump."); + std::ostringstream err_msg; + err_msg << "Unable to open file for epctx model dump: " << blob_filename; + ORT_THROW(err_msg.str()); } compiled_model.export_model(blob_file); model_blob_str = blob_filename.filename().string(); @@ -375,6 +382,56 @@ static bool IsQDQGraph(const onnxruntime::GraphViewer& graph_viewer) { return false; } +static bool IsModelBF16(const onnxruntime::GraphViewer& graph_viewer) { + const auto& node_indices = graph_viewer.GetNodesInTopologicalOrder(); + for (std::size_t i = 0; i < node_indices.size(); i++) { + gsl::not_null node(graph_viewer.GetNode(node_indices[i])); + for (auto& output : node->OutputDefs()) { + if (output->ToProto().type().tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16) + return true; + } + } + return false; +} + +static bool Is16BitTensor(const onnxruntime::NodeArg* node_arg) { + const auto* type_proto = node_arg ? 
node_arg->TypeAsProto() : nullptr; + return type_proto && type_proto->has_tensor_type() && + (type_proto->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_UINT16 || + type_proto->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_INT16); +} + +// Check to see if the graph has Q/DQ nodes with int16 or uint16 quantization +static bool IsQDQGraphWithUint16OrInt16(const onnxruntime::GraphViewer& graph_viewer) { + std::unordered_set qdq_ops = {"QuantizeLinear", "DequantizeLinear"}; + const auto& node_indices = graph_viewer.GetNodesInTopologicalOrder(); + + for (size_t i = 0; i < node_indices.size(); i++) { + gsl::not_null node(graph_viewer.GetNode(node_indices[i])); + + if (qdq_ops.find(node->OpType()) != qdq_ops.end()) { + const auto& input_defs = node->InputDefs(); + + if (node->OpType() == "DequantizeLinear") { + // DequantizeLinear: [quantized_input, scale, zero_point] -> [float_output] + // Check quantized input tensor and optional zero point + if (Is16BitTensor(input_defs.empty() ? nullptr : input_defs[0]) || + (input_defs.size() >= 3 && Is16BitTensor(input_defs[2]))) { + return true; + } + } else if (node->OpType() == "QuantizeLinear") { + // QuantizeLinear: [float_input, scale, zero_point] -> [quantized_output] + const auto& output_defs = node->OutputDefs(); + if (Is16BitTensor(output_defs.empty() ? nullptr : output_defs[0]) || + (input_defs.size() >= 3 && Is16BitTensor(input_defs[2]))) { + return true; + } + } + } + } + return false; +} + static void DumpOpenVINOEPModel([[maybe_unused]] const std::filesystem::path& onnx_model_path_name, [[maybe_unused]] ONNX_NAMESPACE::ModelProto* model_proto, [[maybe_unused]] const onnxruntime::Node& fused_node) { @@ -433,6 +490,10 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, } #endif + // Check if the graph is QDQ and has int16 or uint16 quantization + // If so, we will apply the QDQ scales fix transformation (for GPU device only) + bool is_qdq_graph_uint16_or_int16 = IsQDQGraphWithUint16OrInt16(subgraph); + const auto& onnx_model_path_name = subgraph.ModelPath(); // QDQ stripping enabled only for the NPU and experimentally on the GPU if ((session_context_.device_type.find("NPU") != std::string::npos) && @@ -446,7 +507,7 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); return model_proto; } else if ((session_context_.device_type.find("GPU") != std::string::npos) && - enable_ovep_qdq_optimizer) { + is_qdq_graph_uint16_or_int16) { // Create a copy of the model std::unique_ptr model; Status status = qdq_scales_fix::Transform(subgraph, logger, model); @@ -456,6 +517,16 @@ BackendManager::GetModelProtoFromFusedNode(const onnxruntime::Node& fused_node, DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); return model_proto; + } else if (IsModelBF16(subgraph)) { + LOGS_DEFAULT(INFO) << "[OpenVINO-EP] OVEP bfloat16->float16 optimization pass is enabled"; + std::unique_ptr model; + Status status = bfloat16_fix::Transform(subgraph, logger, model); + auto model_proto = model->ToProto(); + model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + print_model_proto_duration(); + DumpOpenVINOEPModel(onnx_model_path_name, model_proto.get(), fused_node); + ORT_ENFORCE(status.IsOK(), status.ErrorMessage()); + return model_proto; } else { LOGS_DEFAULT(INFO) << "[OpenVINO-EP] OVEP QDQ optimization pass is disabled"; auto model = 
subgraph.CreateModel(logger); diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index 73fbe9a0fa76f..7027861f0c4dc 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -150,6 +150,11 @@ CreateOVModel(std::string&& model, LOGS_DEFAULT(INFO) << log_tag << "Reshaping the ov tensor to specified shape"; ov_model->reshape(session_context.reshape); } + + if (!session_context.layout.empty()) { + LOGS_DEFAULT(INFO) << log_tag << "Setting the ov tensor layout to specified layout"; + ov_model = Set_Layout(ov_model, session_context.layout); + } // Check for Constant Folding if ((session_context.device_type != "NPU") && !session_context.is_wholly_supported_graph) { ov::pass::ConstantFolding pass_const_obj; @@ -199,6 +204,41 @@ GetOutputTensor(Ort::KernelContext& context, return context.GetOutput(index, output_shape); } +std::shared_ptr Set_Layout(std::shared_ptr ov_model, const layout_t& layout) { + ov::preprocess::PrePostProcessor preproc(ov_model); + + const auto& inputs = ov_model->inputs(); + const auto& outputs = ov_model->outputs(); + + auto find_tensor_index = [](const std::vector>& tensors, const std::string& name) -> std::optional { + for (size_t i = 0; i < tensors.size(); ++i) { + const auto& tensor = tensors[i]; + if (tensor.get_any_name() == name || tensor.get_tensor().get_names().count(name) > 0) { + return i; + } + } + return std::nullopt; + }; + + for (const auto& [tensor_name, layout_value] : layout) { + bool tensor_found = false; + + if (auto input_idx = find_tensor_index(inputs, tensor_name)) { + preproc.input(*input_idx).tensor().set_layout(layout_value); + tensor_found = true; + } else if (auto output_idx = find_tensor_index(outputs, tensor_name)) { + preproc.output(*output_idx).tensor().set_layout(layout_value); + tensor_found = true; + } + + if (!tensor_found) { + LOGS_DEFAULT(WARNING) << "Tensor '" << tensor_name << "' not found in model inputs or outputs"; + } + } + + return preproc.build(); +} + int GetFirstAvailableDevice(SessionContext& session_context) { int i = 0; // Get the first available VAD-M device and set the device to busy diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index 15145df651fa2..27f791c7a5bd1 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -79,6 +79,8 @@ int GetFirstAvailableDevice(SessionContext& session_context); void FillOutputsWithConstantData(std::shared_ptr node, Ort::UnownedValue& out_tensor); +std::shared_ptr Set_Layout(std::shared_ptr ov_model, const layout_t& layout); + template void FillOutputHelper(Ort::UnownedValue& out_tensor, std::shared_ptr node); diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index 6efd866d47c3c..2f174110dd31b 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -59,7 +59,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr }; // If the EPContext node with OVIR Encapsulation, then create // an executable network from EP_CACHE_CONTEXT using read_model() & compile_model() - exe_network_ = OVCore::Get()->ImportEPCtxOVIREncapsulation(*model_stream, + exe_network_ = OVCore::Get()->ImportEPCtxOVIREncapsulation(*model_stream->stream_, hw_target, 
device_config, enable_causallm, @@ -98,6 +98,7 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr !subgraph_context_.has_dynamic_input_shape && !session_context_.so_context_enable && session_context_.reshape.empty() && + session_context_.layout.empty() && !enable_causallm && !eligible_for_cpu_fallback && auto_unified_compile); @@ -213,101 +214,29 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { if (!session_context_.load_config.empty()) { const std::map& target_config = session_context_.load_config; - if ((session_context_.device_type.find("NPU") != std::string::npos) && session_context_.enable_causallm) { - if (target_config.find("NPU") != target_config.end()) { - auto npu_genai_config = target_config.at("NPU"); - CausalLMConfig().ApplyConfig(npu_genai_config, device_config); - } else { - LOGS_DEFAULT(WARNING) << "ORT GenAI CausalLMConfig Configuration not found."; - } - } + // Extract device names from device string and apply their configs + // Examples: "GPU" -> ["GPU"], "AUTO:GPU.0,CPU" -> ["AUTO", "GPU", "CPU"] + auto apply_device_config = [&](std::string_view device) { + if (device.empty()) return; - if (session_context_.device_type.find("NPU") != std::string::npos) { - auto npuw_config = target_config.at("NPU"); - - // Check if "NPU_USE_NPUW" exists and is set to "YES" - auto npu_use_npuw_it = npuw_config.find("NPU_USE_NPUW"); - if (npu_use_npuw_it != npuw_config.end() && - npu_use_npuw_it->second.is() && - npu_use_npuw_it->second.as() == "YES") { - // Only add NPUW-related keys if NPU_USE_NPUW is "YES" - for (const auto& [key, value] : npuw_config) { - if (key.find("NPUW") != std::string::npos) { - if (!value.is()) { - LOGS_DEFAULT(ERROR) << "Invalid value type for key: " << key; - continue; - } - device_config[key] = value; - } - } - } else { - // Check if there are any "NPUW" keys and log a warning - if (std::any_of(npuw_config.begin(), npuw_config.end(), - [&](const auto& pair) { return pair.first.find("NPUW") != std::string::npos; })) { - LOGS_DEFAULT(WARNING) << "Skipping NPUW-related configurations as NPU_USE_NPUW is not set to 'YES'."; - } - } - } - auto find_device_type_mode = [&](const std::string& device_type) -> std::string { - std::string device_mode = ""; - auto delimiter_pos = device_type.find(':'); - if (delimiter_pos != std::string::npos) { - std::stringstream str_stream(device_type.substr(0, delimiter_pos)); - std::getline(str_stream, device_mode, ','); - } - return device_mode; - }; - - // Parse device types like "AUTO:CPU,GPU" and extract individual devices - auto parse_individual_devices = [&](const std::string& device_type) -> std::vector { - std::vector devices; - auto delimiter_pos = device_type.find(':'); - if (delimiter_pos != std::string::npos) { - std::stringstream str_stream(device_type.substr(delimiter_pos + 1)); - std::string device; - while (std::getline(str_stream, device, ',')) { - devices.emplace_back(device); - } - } else { - devices.emplace_back(device_type); - } - return devices; - }; + // Remove device index: "GPU.0" -> "GPU" + auto base_device = device.substr(0, device.find('.')); - // Set properties, Validation will be handled by OpenVINO Core - auto set_target_properties = [&](const std::string& device, const ov::AnyMap& config_options) { - for (const auto& [key, value] : config_options) { - if ((key.find("NPUW") != std::string::npos) || - ((device_config.find(key) != device_config.end()) && session_context_.enable_causallm)) { - continue; + if (auto config_it = target_config.find(std::string(base_device)); 
config_it != target_config.end()) { + for (const auto& [key, value] : config_it->second) { + device_config[key] = value; } - OVCore::Get()->core.set_property(device, ov::AnyMap{{key, value}}); } }; - // Check if the device type is AUTO, HETERO, or MULTI - if (session_context_.device_type.find("AUTO") == 0 || - session_context_.device_type.find("HETERO") == 0 || - session_context_.device_type.find("MULTI") == 0) { - //// Parse to get the device mode (e.g., "AUTO:CPU,GPU" -> "AUTO") - std::unordered_set supported_mode = {"AUTO", "HETERO", "MULTI"}; - auto device_mode = find_device_type_mode(session_context_.device_type); - ORT_ENFORCE(supported_mode.find(device_mode) != supported_mode.end(), " Invalid device mode is passed : ", session_context_.device_type); - // Parse individual devices (e.g., "AUTO:CPU,GPU" -> ["CPU", "GPU"]) - auto individual_devices = parse_individual_devices(session_context_.device_type); - if (!device_mode.empty()) individual_devices.emplace_back(device_mode); - - // Set properties only for individual devices (e.g., "CPU", "GPU") - for (const std::string& device : individual_devices) { - if (target_config.count(device)) { - // Set properties for the device - set_target_properties(device, target_config.at(device)); + // Parse device string by splitting on ':' and ',' delimiters + const auto& device_str = session_context_.device_type; + for (size_t start = 0, pos = 0; pos <= device_str.size(); ++pos) { + if (pos == device_str.size() || device_str[pos] == ':' || device_str[pos] == ',') { + if (pos > start) { + apply_device_config(std::string_view(device_str).substr(start, pos - start)); } - } - } else { - if (target_config.count(session_context_.device_type)) { - set_target_properties(session_context_.device_type, - target_config.at(session_context_.device_type)); + start = pos + 1; } } } diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 6a2b375d733f9..07b09899ac214 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -70,6 +70,7 @@ class SharedContext : public WeakSingleton { using config_t = std::map; using reshape_t = std::map; +using layout_t = std::map; struct ProviderInfo { std::string device_type{""}; // [device_type]: Overrides the accelerator hardware type and @@ -88,6 +89,7 @@ struct ProviderInfo { // (GPU) feature. If blob files are already present, // it will be directly loaded. reshape_t reshape{}; // Used for reshaping the ov input tensor shape at runtime. + layout_t layout{}; // Used for specifying the ov input/output tensor layout at runtime. std::string model_priority{"DEFAULT"}; // High-level OpenVINO model priority hint // Defines what model should be provided with more performant // bounded resource first @@ -110,7 +112,7 @@ struct ProviderInfo { const ConfigOptions* config_options{NULL}; const std::unordered_set valid_provider_keys = {"device_type", "device_id", "device_luid", "cache_dir", "precision", "load_config", "context", "num_of_threads", "model_priority", "num_streams", "enable_opencl_throttling", "enable_qdq_optimizer", - "enable_causallm", "disable_dynamic_shapes", "reshape_input"}; + "enable_causallm", "disable_dynamic_shapes", "reshape_input", "layout"}; }; // Holds context applicable to the entire EP instance. 
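// ---------------------------------------------------------------------------------------------
// Caller-side sketch (not part of this change) for the new "layout" provider option registered in
// contexts.h above. The option format comes from OpenVINOParserUtils::ParseLayout later in this
// diff: comma-separated "tensor_name[DIMS]" entries, where DIMS is a set of unique layout letters
// and '?' for unknown dimensions. Tensor names, model path and device are illustrative.
// ---------------------------------------------------------------------------------------------
#include <onnxruntime_cxx_api.h>
#include <string>
#include <unordered_map>

int main() {
  Ort::Env env;
  Ort::SessionOptions so;

  std::unordered_map<std::string, std::string> ov_options{
      {"device_type", "GPU"},
      // "input_1" is declared NCHW; "data" gets a 3-D layout with an unknown leading dimension.
      {"layout", "input_1[NCHW],data[?HW]"},
  };
  so.AppendExecutionProvider_OpenVINO_V2(ov_options);

  Ort::Session session(env, ORT_TSTR("model.onnx"), so);  // hypothetical model
  // At compile time Set_Layout() applies these layouts via ov::preprocess::PrePostProcessor.
  return 0;
}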
diff --git a/onnxruntime/core/providers/openvino/ibackend.h b/onnxruntime/core/providers/openvino/ibackend.h index ec38425f602eb..365a4625815d6 100644 --- a/onnxruntime/core/providers/openvino/ibackend.h +++ b/onnxruntime/core/providers/openvino/ibackend.h @@ -19,7 +19,7 @@ class IBackend { virtual ~IBackend() = default; virtual void RewindKVCache(size_t index) {} }; -using ptr_stream_t = std::unique_ptr; +using ptr_stream_t = std::unique_ptr; class BackendFactory { public: static std::shared_ptr diff --git a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc index 9e70756a254aa..051a39bd4f205 100644 --- a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc @@ -100,7 +100,8 @@ Status EPCtxHandler::AddOVEPCtxNodeToGraph(const GraphViewer& graph_viewer, return Status::OK(); } -std::unique_ptr EPCtxHandler::GetModelBlobStream(const std::filesystem::path& so_context_file_path, const GraphViewer& graph_viewer) const { +std::unique_ptr +EPCtxHandler::GetModelBlobStream(const std::filesystem::path& so_context_file_path, const GraphViewer& graph_viewer) const { auto first_index = *graph_viewer.GetNodesInTopologicalOrder().begin(); auto node = graph_viewer.GetNode(first_index); ORT_ENFORCE(node != nullptr); @@ -113,10 +114,11 @@ std::unique_ptr EPCtxHandler::GetModelBlobStream(const std::filesy bool embed_mode = static_cast(attrs.at(EMBED_MODE).i()); std::unique_ptr result; + std::filesystem::path blob_filepath{}; if (embed_mode) { result.reset((std::istream*)new std::istringstream(ep_cache_context)); } else { - auto blob_filepath = so_context_file_path; + blob_filepath = so_context_file_path; if (blob_filepath.empty() && !graph_viewer.ModelPath().empty()) { blob_filepath = graph_viewer.ModelPath(); } @@ -126,16 +128,18 @@ std::unique_ptr EPCtxHandler::GetModelBlobStream(const std::filesy } bool isXML = backend_utils::IsModelStreamXML(*result); + std::filesystem::path native_blob_path{}; if (!isXML) { // If the model stream is not an XML (i.e. precompiled blob), the OpenVINO SDK version that it was // exported with must match the version that is currently running. 
+ native_blob_path = std::move(blob_filepath); ORT_ENFORCE((attrs.count(EP_SDK_VER) == 1) && (attrs.at(EP_SDK_VER).s() == openvino_sdk_version_), "EPCtx blob was exported / is compatible with OpenVINO SDK version " + attrs.at(EP_SDK_VER).s() + ", but OpenVINO SDK version currently in use is " + openvino_sdk_version_); } LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Read blob from EPContext Node"; - return result; + return std::make_unique(std::move(result), native_blob_path); } bool EPCtxHandler::CheckForOVEPCtxNodeInGraph(const GraphViewer& graph_viewer) const { diff --git a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h index b9ddb40a7a233..f207f5014ca1f 100644 --- a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h +++ b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h @@ -12,6 +12,12 @@ namespace onnxruntime { namespace openvino_ep { +struct ModelBlobWrapper { + ModelBlobWrapper(std::unique_ptr stream, const std::filesystem::path& native_blob_path) : stream_(std::move(stream)), maybe_native_blob_path_(native_blob_path) {} + std::unique_ptr stream_; + std::filesystem::path maybe_native_blob_path_; +}; + // Utilities to handle EPContext node export and parsing of an EPContext node // to create the compiled_model object to infer on static const char EPCONTEXT_OP[] = "EPContext"; @@ -31,7 +37,7 @@ class EPCtxHandler { const std::string& graph_name, const bool embed_mode, std::string&& model_blob_str) const; - std::unique_ptr GetModelBlobStream(const std::filesystem::path& so_context_file_path, const GraphViewer& graph_viewer) const; + std::unique_ptr GetModelBlobStream(const std::filesystem::path& so_context_file_path, const GraphViewer& graph_viewer) const; InlinedVector GetEPCtxNodes() const; bool CheckEPCacheContextAttribute(const GraphViewer& graph_viewer, const std::string& target_attr_extn) const; diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 1b19517b07363..a0fa885cbfc38 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -94,18 +94,23 @@ common::Status OpenVINOExecutionProvider::Compile( auto& logger = *GetLogger(); Status status = Status::OK(); + bool is_epctx_model = false; if (!fused_nodes.empty()) { // Assume these properties are constant for all the model subgraphs, otherwise move to SubGraphContext const auto& graph_body_viewer_0 = fused_nodes[0].filtered_graph.get(); session_context_.onnx_model_path_name = graph_body_viewer_0.ModelPath().string(); session_context_.onnx_opset_version = graph_body_viewer_0.DomainToVersionMap().at(kOnnxDomain); + + // OVIR wrapped in epctx should be treated as source but this code does not + // This corner case is not in use and will be addressed in a future commit + is_epctx_model = ep_ctx_handle_.CheckForOVEPCtxNodeInGraph(graph_body_viewer_0); } // The block below is executed during EP context model inference auto& metadata = shared_context_->shared_weights.metadata; // Metadata object in memory if (session_context_.so_share_ep_contexts && - !session_context_.so_context_enable && + is_epctx_model && metadata.empty()) { fs::path context_model_file_path = session_context_.so_context_file_path; if (context_model_file_path.empty()) { diff --git a/onnxruntime/core/providers/openvino/openvino_parser_utils.cc 
b/onnxruntime/core/providers/openvino/openvino_parser_utils.cc index 21fc7f935da23..a290fea73e0e8 100644 --- a/onnxruntime/core/providers/openvino/openvino_parser_utils.cc +++ b/onnxruntime/core/providers/openvino/openvino_parser_utils.cc @@ -236,5 +236,79 @@ ov::Dimension OpenVINOParserUtils::ParseDimensionRange(const std::string& range_ return ov::Dimension(range_start, range_end); } +layout_t OpenVINOParserUtils::ParseLayout(const std::string& layout_definition) { + layout_t parsed_layout_map; + + // Return empty map for empty input + if (layout_definition.empty()) { + ORT_THROW("Empty layout definition provided in layout parameter"); + } + + // Regular expression for parsing layout definitions + const std::regex layout_pattern(R"(([^\[\],]+)\s*\[(.*?)\])"); // e.g. "input_1[NC],data[CHW]" + + // Find all tensor layout definitions using regex + auto layout_begin = std::sregex_iterator( + layout_definition.begin(), + layout_definition.end(), + layout_pattern); + auto layout_end = std::sregex_iterator(); + + // If no matches found, throw error + if (layout_begin == layout_end) { + ORT_THROW("Invalid layout definition format: " + layout_definition); + } + + // Process each tensor definition + for (std::sregex_iterator i = std::move(layout_begin); i != layout_end; ++i) { + std::smatch layout_match = *i; + + // Extract tensor name and trim whitespace + std::string tensor_name = layout_match[1].str(); // Group 1: tensor name e.g. "input_1" + tensor_name = TrimWhitespace(tensor_name); + + if (tensor_name.empty()) { + ORT_THROW("Empty tensor name provided in layout parameter"); + } + + // Extract dimensions string + std::string dimensions_str = layout_match[2].str(); // Group 2: dimensions string [e.g. "NC", "CHW"] + + if (!Check_Valid_Layout(dimensions_str, tensor_name)) { + ORT_THROW("Invalid dimensions string provided in layout parameter"); + } + + // Store parsed shape in result map + parsed_layout_map[tensor_name] = ov::Layout(dimensions_str); + } + + return parsed_layout_map; +} + +bool OpenVINOParserUtils::Check_Valid_Layout(const std::string& layout_str, const std::string& tensor_name) { + // Check if the layout string is empty + if (layout_str.empty()) { + return false; + } + + std::unordered_set seen_alphabets; + for (char c : layout_str) { + if (std::isalpha(c)) { + char upper_c = static_cast(std::toupper(c)); // Convert to uppercase for case-insensitive comparison + if (seen_alphabets.find(upper_c) != seen_alphabets.end()) { + ORT_THROW("Repeated Dim '" + std::string(1, c) + + "' found in layout dimensions for tensor '" + tensor_name + "'"); + } + seen_alphabets.insert(upper_c); + } else if (c != '?') { + // Only '?' 
is allowed as non-alphabetic character + ORT_THROW("Invalid character '" + std::string(1, c) + + "' found in layout dimensions for tensor '" + tensor_name + "'"); + } + } + + return true; +} + } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/openvino_parser_utils.h b/onnxruntime/core/providers/openvino/openvino_parser_utils.h index e6aa0e0a46a3b..a0936d627df40 100644 --- a/onnxruntime/core/providers/openvino/openvino_parser_utils.h +++ b/onnxruntime/core/providers/openvino/openvino_parser_utils.h @@ -18,8 +18,10 @@ class OpenVINOParserUtils { std::string& device_type, const std::string& option_name); static reshape_t ParseInputShape(const std::string& reshape_input_definition); + static layout_t ParseLayout(const std::string& layout_definition); static std::string TrimWhitespace(const std::string& str); static ov::Dimension ParseDimensionRange(const std::string& range_str, const std::string& tensor_name); + static bool Check_Valid_Layout(const std::string& layout_str, const std::string& tensor_name); }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 9dba8623031d0..1a10d9849d5cc 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -171,7 +171,7 @@ std::string ParseDeviceType(std::shared_ptr ov_core, const ProviderOptio if (!device_mode.empty()) { selected_device = device_mode + ":" + ov_luid_devices; for (const auto& dev_str : devices_to_check) { - const auto default_dev = split(dev_str, '.')[0]; + const std::string default_dev = split(dev_str, '.')[0]; if (ov_luid_devices.find(default_dev) == std::string::npos) selected_device = selected_device + "," + dev_str; @@ -230,6 +230,10 @@ static void ParseProviderInfo(const ProviderOptions& provider_options, pi.reshape = OpenVINOParserUtils::ParseInputShape(provider_options.at("reshape_input")); } + if (provider_options.contains("layout")) { + pi.layout = OpenVINOParserUtils::ParseLayout(provider_options.at("layout")); + } + if (provider_options.contains("load_config")) { auto parse_config = [&](const std::string& config_str) -> std::map { // If the config string is empty, return an empty map and skip processing @@ -526,7 +530,7 @@ struct OpenVINO_Provider : Provider { std::string ov_device_string; if (is_meta_device_factory) { // Build up a meta device string based on the devices that are passed in. E.g. 
AUTO:NPU,GPU.0,CPU - ov_device_string = ov_meta_device_type; + ov_device_string = std::move(ov_meta_device_type); ov_device_string += ":"; } @@ -539,7 +543,7 @@ struct OpenVINO_Provider : Provider { prepend_comma = true; } - provider_options["device_type"] = ov_device_string; + provider_options["device_type"] = std::move(ov_device_string); // Parse provider info with the device type ProviderInfo pi; diff --git a/onnxruntime/core/providers/openvino/ov_factory.cc b/onnxruntime/core/providers/openvino/ov_factory.cc index 8860405338409..2853cc17726ab 100644 --- a/onnxruntime/core/providers/openvino/ov_factory.cc +++ b/onnxruntime/core/providers/openvino/ov_factory.cc @@ -105,7 +105,7 @@ OrtStatus* OpenVINOEpPluginFactory::GetSupportedDevices(const OrtHardwareDevice* std::string ov_device_name; auto get_gpu_device_id = [&](const std::string& ov_device) { try { - auto device_id_str = ov_core_->get_property(ov_device, "GPU_DEVICE_ID").as(); + const std::string device_id_str = ov_core_->get_property(ov_device, "GPU_DEVICE_ID").as(); return static_cast(std::stoul(device_id_str, nullptr, 0)); } catch (ov::Exception&) { return 0u; // If we can't get the GPU_DEVICE_ID info, we won't have a device ID. diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 2d29df8eb4197..899845d4890cf 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -11,6 +11,7 @@ #include "core/providers/openvino/backend_utils.h" #include "core/providers/openvino/backends/basic_backend.h" #include "core/providers/openvino/ov_stateful_patch_utils.h" +#include "core/providers/openvino/onnx_ctx_model_helper.h" namespace onnxruntime { namespace openvino_ep { @@ -191,14 +192,23 @@ OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, "Exception while Loading Network for graph {}", name); } -OVExeNetwork OVCore::ImportModel(std::istream& model_stream, +OVExeNetwork OVCore::ImportModel(ModelBlobWrapper& model_blob, std::string hw_target, const ov::AnyMap& device_config, std::string name) { return OvExceptionBoundary([&]() { ov::CompiledModel obj; - obj = core.import_model(model_stream, hw_target, device_config); +#if (OPENVINO_VERSION_MAJOR > 2025 || (OPENVINO_VERSION_MAJOR == 2025 && OPENVINO_VERSION_MINOR >= 3)) + if (!model_blob.maybe_native_blob_path_.empty()) { + obj = core.import_model(ov::read_tensor_data(model_blob.maybe_native_blob_path_), hw_target, device_config); + } else { + obj = core.import_model(*model_blob.stream_, hw_target, device_config); + } +#else + obj = core.import_model(*model_blob.stream_, hw_target, device_config); +#endif OVExeNetwork exe(obj, hw_target); + #ifndef NDEBUG printDebugInfo(exe.Get()); #endif diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index 6d1db4366410b..38ea883078e85 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -26,6 +26,7 @@ namespace openvino_ep { class OVCore; class OVInferRequest; class OVExeNetwork; +struct ModelBlobWrapper; typedef ov::Tensor OVTensor; typedef ov::ProfilingInfo OVProfilingInfo; @@ -82,7 +83,7 @@ struct OVCore : WeakSingleton { ov::AnyMap& device_config, const std::string& name); // OV Interface for Import model Stream - OVExeNetwork ImportModel(std::istream& model_stream, + OVExeNetwork ImportModel(ModelBlobWrapper& model_blob, std::string hw_target, const ov::AnyMap& 
device_config, std::string name); @@ -126,29 +127,16 @@ class OVInferRequest { OVTensorPtr GetTensor(const std::string& name); std::string GetInputTensorName(uint32_t index); - // Set tensor described param_info and ort_ptr. Overrides shape in param_info with shape_override. Call infer req tensor if ort_ptr is last set. + // Set tensor call infer req tensor if ort_ptr differs from last set ptr. void SetTensor(const std::string& name, const ov::element::Type& type, const ov::Shape& shape, void* ort_ptr) { auto& cached_binding = bindings_cache_[name]; - if (cached_binding.ort_ptr != ort_ptr) { - auto tensor_ptr = std::make_shared(type, shape, const_cast(ort_ptr)); - SetTensor(name, tensor_ptr); - cached_binding = {tensor_ptr, ort_ptr}; - } else if (ort_ptr == nullptr) { - // a null ort_ptr is expected for a tensor that has 0 elements. - // for example, a tensor of shape=[1, 8, 0, 64], which is valid. - // So, we check to see if at least one shape entry is 0. - auto contains_zero = [](const ov::Shape& shape) { - for (auto& s : shape) - if (s == 0) return true; - return false; - }; - if (contains_zero(shape)) { - // if there are zero elements (i.e. at least one shape entry is 0), - // then create and set the tensor anyway. - auto tensor_ptr = std::make_shared(type, shape); - SetTensor(name, tensor_ptr); - cached_binding = {tensor_ptr, ort_ptr}; - } + if (cached_binding.ort_ptr != ort_ptr || + !cached_binding.tensor_ptr || + cached_binding.tensor_ptr->get_shape() != shape) { + cached_binding.tensor_ptr.reset(); + auto ov_tensor = std::make_shared(type, shape, const_cast(ort_ptr)); + ovInfReq.set_tensor(name, *ov_tensor); + cached_binding = {std::move(ov_tensor), ort_ptr}; } } diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.cc b/onnxruntime/core/providers/openvino/ov_versions/capability.cc index 2309ff3de751b..1893700cab09c 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/capability.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/capability.cc @@ -166,17 +166,28 @@ std::vector> GetCapability::Execute() { auto connected_clusters = GetConnectedClusters(graph_viewer_, ng_clusters); int no_of_clusters = 0; - + size_t cluster_index = 0; + size_t total_clusters = connected_clusters.size(); for (auto this_cluster : connected_clusters) { - // If subgraph has less then three, graph is considered trivial unless its an epctx cluster - if (this_cluster.size() < 3) { - bool is_epctx_node = false; - for (auto node_idx : this_cluster) { - if (graph_viewer_.GetNode(node_idx)->OpType() == "EPContext") - is_epctx_node = true; + bool omit_subgraph = false; + + if (this_cluster.size() == 1) { + // check next cluster + auto index = this_cluster.at(0); + size_t j = cluster_index; + if (graph_viewer_.GetNode(index)->OpType() == "EPContext") { + omit_subgraph = false; + } else if (j < total_clusters - 1) { + bool append_node = false; + while (j < total_clusters && !append_node) { + j = j + 1; + append_node = AddTrivialClusterToNextClusterIfConnected(graph_viewer_, index, connected_clusters[j]); + } + if (append_node) { + connected_clusters[j].emplace_back(index); + } + omit_subgraph = true; } - if (!is_epctx_node) - continue; } std::vector cluster_graph_inputs, cluster_inputs, cluster_outputs; @@ -188,7 +199,6 @@ std::vector> GetCapability::Execute() { cluster_inputs, cluster_outputs); - bool omit_subgraph = false; // Omitting zero dim subgraphs for (auto index : this_cluster) { const Node* node = graph_viewer_.GetNode(index); @@ -217,15 +227,17 @@ std::vector> 
GetCapability::Execute() { } } } - if (omit_subgraph) - continue; /* In scenarios, when there are no inputs or all inputs being initializers, ConstantFolding optimization in onnxruntime pre-computes the value.*/ - if (!cluster_inputs.empty()) { - AppendClusterToSubGraph(this_cluster, cluster_inputs, cluster_outputs, result); - no_of_clusters++; + if (!omit_subgraph) { + if (!cluster_inputs.empty()) { + AppendClusterToSubGraph(this_cluster, cluster_inputs, cluster_outputs, result); + no_of_clusters++; + } } + + cluster_index = cluster_index + 1; } LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Supported subgraphs on OpenVINO: " << no_of_clusters; } diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc index 17e69ad080b90..f848b89ed10c8 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc @@ -121,6 +121,7 @@ std::vector supported_op_mode = { {"DepthToSpace", V_2020_4, {"CPU", "GPU"}}, {"DequantizeLinear", V_2021_4, {"CPU", "GPU"}}, {"DequantizeLinear", V_2024_4, {"NPU"}}, + {"DynamicQuantizeLinear", V_2025_2, {"CPU", "GPU"}}, {"DynamicQuantizeMatMul", V_2025_0, {"CPU", "GPU"}}, {"Div", V_2020_4, {"CPU", "GPU"}}, {"Dropout", V_2020_4, {"CPU", "GPU"}}, @@ -172,6 +173,7 @@ std::vector supported_op_mode = { {"LSTM", V_2020_4, {"CPU", "GPU"}}, {"MatMul", V_2020_4, {"CPU", "GPU"}}, {"MatMulInteger", V_2022_1, {"CPU"}}, + {"MatMulInteger", V_2025_2, {"GPU"}}, {"MatMulNBits", V_2024_5, {"CPU", "GPU"}}, {"Max", V_2020_4, {"CPU", "GPU"}}, {"MaxPool", V_2020_4, {"CPU", "GPU"}}, @@ -191,7 +193,7 @@ std::vector supported_op_mode = { {"Pad", V_2020_4, {"CPU", "GPU"}}, {"Pow", V_2020_4, {"CPU", "GPU"}}, {"PRelu", V_2020_4, {"CPU", "GPU"}}, - {"QLinearMatMul", V_2022_3, {"CPU"}}, + // {"QLinearMatMul", V_2022_3, {"CPU"}}, {"QuantizeLinear", V_2021_4, {"CPU", "GPU"}}, {"QuickGelu", V_2025_0, {"CPU", "GPU"}}, {"RNN", V_2023_1, {"CPU", "GPU"}}, @@ -361,6 +363,7 @@ void DataOps::populate_op_mode_supported() { no_dimension_supported_.push_back({"Clip", V_2022_1, {"All"}}); no_dimension_supported_.push_back({"Div", V_2020_4, {"All"}}); no_dimension_supported_.push_back({"DequantizeLinear", V_2021_4, {"All"}}); + no_dimension_supported_.push_back({"DynamicQuantizeLinear", V_2025_2, {"All"}}); no_dimension_supported_.push_back({"Equal", V_2022_1, {"CPU"}}); no_dimension_supported_.push_back({"Equal", V_2023_0, {"GPU"}}); no_dimension_supported_.push_back({"Expand", V_2023_3, {"CPU"}}); @@ -374,6 +377,7 @@ void DataOps::populate_op_mode_supported() { no_dimension_supported_.push_back({"Loop", V_2021_4, {"All"}}); no_dimension_supported_.push_back({"Max", V_2024_4, {"All"}}); no_dimension_supported_.push_back({"Min", V_2020_4, {"All"}}); + no_dimension_supported_.push_back({"MatMulInteger", V_2025_2, {"All"}}); no_dimension_supported_.push_back({"Mul", V_2020_4, {"All"}}); no_dimension_supported_.push_back({"Neg", V_2023_0, {"CPU", "GPU"}}); no_dimension_supported_.push_back({"Pow", V_2023_0, {"CPU", "GPU"}}); @@ -555,8 +559,13 @@ bool DataOps::type_is_supported(const NodeArg* node_arg, bool is_initializer) { return false; } + auto dtype = type_proto->tensor_type().elem_type(); + // Enable bfloat16 -> float16 on-the-fly conversion + if (dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_BFLOAT16 || + dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT16 || + dtype == 
ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT16) + return true; if (is_initializer) { - auto dtype = type_proto->tensor_type().elem_type(); for (auto const& var : supported_types_initializer_) { if ((var.first <= version_id_) && (var.second == dtype)) { @@ -571,8 +580,6 @@ bool DataOps::type_is_supported(const NodeArg* node_arg, bool is_initializer) { #endif return false; } else { - auto dtype = type_proto->tensor_type().elem_type(); - if (device_id_.find("HETERO") != std::string::npos || device_id_.find("MULTI") != std::string::npos || device_id_.find("AUTO") != std::string::npos) { for (auto const& var : supported_types_npu_) { @@ -609,9 +616,6 @@ bool DataOps::type_is_supported(const NodeArg* node_arg, bool is_initializer) { (var.second == dtype)) { return true; } - // experimentally for GPU and qdq stripping mode allow int16 types - if (npu_qdq_optimizer_enabled_ && (dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT16 || dtype == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT16)) - return true; } #ifndef NDEBUG if (openvino_ep::backend_utils::IsDebugEnabled()) { diff --git a/onnxruntime/core/providers/openvino/ov_versions/utils.cc b/onnxruntime/core/providers/openvino/ov_versions/utils.cc index f924fa0c8205c..791341218913f 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/utils.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/utils.cc @@ -153,6 +153,24 @@ GetConnectedClusters(const GraphViewer& graph_viewer, const std::vector& search_cluster) { + for (auto index : search_cluster) { + auto curr_node = graph_viewer.GetNode(index); + for (auto node = curr_node->InputNodesBegin(); node != curr_node->InputNodesEnd(); ++node) { + if ((*node).Index() == curr_node_index) + return true; + } + + for (auto node = curr_node->OutputNodesBegin(); node != curr_node->OutputNodesEnd(); ++node) { + if ((*node).Index() == curr_node_index) + return true; + } + } + return false; +} + void GetInputsOutputsOfCluster(const GraphViewer& graph_viewer, const std::vector& cluster, const std::unordered_set& ng_required_initializers, diff --git a/onnxruntime/core/providers/openvino/ov_versions/utils.h b/onnxruntime/core/providers/openvino/ov_versions/utils.h index 34aa762ba9b67..bdad047a422c1 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/utils.h +++ b/onnxruntime/core/providers/openvino/ov_versions/utils.h @@ -40,6 +40,10 @@ void IdentifyConnectedNodes( std::vector> GetConnectedClusters(const GraphViewer& graph_viewer, const std::vector>& clusters); +bool AddTrivialClusterToNextClusterIfConnected(const GraphViewer& graph_viewer, + const NodeIndex index, + const std::vector& search_cluster); + void GetInputsOutputsOfCluster(const GraphViewer& graph_viewer, const std::vector& cluster, const std::unordered_set& ng_required_initializers, diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp index d159930d52845..3a39152b5d17d 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.cpp @@ -3,6 +3,8 @@ #include "qdq_scales_fix.h" #include "core/providers/openvino/ov_protobuf_utils.h" +#include "core/framework/ort_value.h" +#include "core/framework/float16.h" #include #include @@ -903,22 +905,11 @@ Status copy_model(const GraphViewer& src_graph_viewer, } for (auto& [name, tensor_proto] : 
src_graph.GetAllInitializedTensors()) { - dst_graph.AddInitializedTensor(*tensor_proto); - } - - for (auto node_arg : src_graph.GetInputsIncludingInitializers()) { - auto check_inputs = [node_arg](auto input_node_arg) { - return input_node_arg->Name() == node_arg->Name(); - }; - if (std::find_if(dst_graph_inputs.begin(), dst_graph_inputs.end(), check_inputs) != dst_graph_inputs.end()) - continue; - - auto src_tensor_proto = src_graph.GetConstantInitializer(node_arg->Name(), true); - if (src_tensor_proto) { - auto dst_tensor_proto = onnx::TensorProto::Create(); - dst_tensor_proto->copy_from(src_tensor_proto); - dst_graph.AddInitializedTensor(*dst_tensor_proto); - } + auto ort_value = OrtValue(); + if (src_graph.GetOrtValueInitializer(name, ort_value)) + ORT_RETURN_IF_ERROR(dst_graph.AddInitializedOrtValue(*tensor_proto, ort_value)); + else + dst_graph.AddInitializedTensor(*tensor_proto); } ORT_RETURN_IF_ERROR(dst_graph.Resolve()); @@ -940,5 +931,54 @@ Status Transform(const GraphViewer& src_graph_viewer, return status; } } // namespace qdq_scales_fix + +namespace bfloat16_fix { +void replace_bf16_with_fp16(qdq_scales_fix::CustomGraph& gen_graph) { + for (auto& const_node : gen_graph.original_graph.Nodes()) { + auto node = const_cast(const_node); + if (node->OpType() == "Cast") { + for (auto& [name, const_attribute] : node->GetAttributes()) { + auto& attribute = const_cast(const_attribute); + if (name == "to" && attribute.type() == ONNX_NAMESPACE::AttributeProto_AttributeType_INT) + if (attribute.i() == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16) + attribute.set_i(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16); + } + } + for (auto& output : node->OutputDefs()) { + auto& output_proto = const_cast(output->ToProto().type()); + if (output_proto.mutable_tensor_type()->elem_type() == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16) + output_proto.mutable_tensor_type()->set_elem_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16); + } + } + + const auto& init_set = gen_graph.original_graph.GetAllInitializedTensors(); + for (auto& [key, const_tensor_proto] : init_set) { + auto tensor_proto = const_cast(const_tensor_proto); + auto dt = tensor_proto->data_type(); + if (dt == ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16) { + auto raw_data = tensor_proto->has_raw_data() ? 
reinterpret_cast(tensor_proto->mutable_raw_data()->data()) : nullptr; + if (raw_data) { + tensor_proto->set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16); + std::int64_t size = 1; + for (int i = 0; i < tensor_proto->dims_size(); ++i) + size *= tensor_proto->dims()[i]; + for (std::int64_t i = 0; i < size; ++i) { + raw_data[i] = onnxruntime::MLFloat16(onnxruntime::BFloat16::FromBits(raw_data[i])).val; + } + } + } + } +} + +Status Transform(const GraphViewer& src_graph_viewer, + const logging::Logger& logger, + /*out*/ std::unique_ptr& model) { + auto status = qdq_scales_fix::copy_model(src_graph_viewer, logger, model); + auto g = qdq_scales_fix::generate_graph_from_onnx(model->MainGraph()); + + replace_bf16_with_fp16(g); + return status; +} +} // namespace bfloat16_fix } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.h b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.h index c54c531e1bd40..2182850d96c43 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.h +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_scales_fix.h @@ -15,5 +15,10 @@ Status Transform(const GraphViewer& src_graph, const logging::Logger& logger, /*out*/ std::unique_ptr& model); } +namespace bfloat16_fix { +Status Transform(const GraphViewer& src_graph, + const logging::Logger& logger, + /*out*/ std::unique_ptr& model); +} } // namespace openvino_ep } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index 24e8892622175..e010851f22e50 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -677,6 +677,27 @@ static void AddInitializerAsInput(onnxruntime::Graph& dst_graph, } } +// To check if the input parameters of a DQ or Q node are quantization parameters +// Scale and Zero point parameters are quantization parameters +static bool IsQuantizationParameter(const std::string& initializer_name, + const onnxruntime::GraphViewer& src_graph) { + // Check if this initializer is used as scale or zero_point in any DQ/Q node + for (auto& node_idx : src_graph.GetNodesInTopologicalOrder()) { + const auto* node = src_graph.GetNode(node_idx); + if (node->OpType() == "DequantizeLinear" || node->OpType() == "QuantizeLinear") { + const auto& input_defs = node->InputDefs(); + // Check if this initializer is used as scale (input 1) or zero_point (input 2) + if (input_defs.size() >= 2 && input_defs[1]->Name() == initializer_name) { + return true; // This is a scale parameter + } + if (input_defs.size() >= 3 && input_defs[2]->Name() == initializer_name) { + return true; // This is a zero_point parameter + } + } + } + return false; +} + // Creates a new model without the DQ/Q operators in the src graph. 
Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, const logging::Logger& logger, @@ -845,10 +866,20 @@ Status CreateModelWithStrippedQDQNodes(const GraphViewer& src_graph, if (!init_with_data && utils::HasExternalData(initializer_tensor) && enable_ovep_weight_sharing) { - insert_metadata(initializer_tensor); + // Only convert to input if it's not a quantization parameter + bool is_quant_param = IsQuantizationParameter(name, src_graph); + + if (!is_quant_param) { + // This is actual weight data - so to convert to input for weight sharing + insert_metadata(initializer_tensor); + AddInitializerAsInput(dst_graph, accumulated_inputs, src_graph, name); + } else { + // This is a quantization parameter - keep as initializer even if external - // Add initializer with external data as input - AddInitializerAsInput(dst_graph, accumulated_inputs, src_graph, name); + if (initializers_to_keep.count(name) > 0) { + dst_graph.AddInitializedTensor(initializer_tensor); + } + } } else { // Add as an initialized tensor if it does not have external data if (initializers_to_keep.count(name) > 0) { diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h index e910afcbcf6c6..dbdb2d828f039 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h @@ -236,7 +236,7 @@ class BaseOpBuilder : public IOpBuilder { } // Onnx Pads is [x1_begin, x2_begin, x1_end, x2_end], QNN requires [x1_begin, x1_end, x2_begin, x2_end] - void ReArranagePads(std::vector& pads) const { + void ReArrangePads(std::vector& pads) const { auto pads_size = pads.size(); auto middle_pos = pads_size / 2; std::vector first_half(pads.begin(), pads.begin() + middle_pos); diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc index b80d9db5d3560..dba4fbdbe0872 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/conv_op_builder.cc @@ -24,7 +24,6 @@ static Status GetOnnxConvType(const std::string& onnx_op_type, OnnxConvType& con } else { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN EP: Unsupported ONNX convolution op type: ", onnx_op_type.c_str()); } - return Status::OK(); } @@ -171,7 +170,7 @@ Status ConvOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, return ProcessConv2D3DInputs(qnn_model_wrapper, node_unit, logger, input_names, do_op_validation); } - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN Conv only supports 3D(rank 5), 2D (rank 4) or 1D (rank 3) inputs."); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "QNN Conv only supports 3D (rank 5), 2D (rank 4) or 1D (rank 3) inputs."); } Status ConvOpBuilder::ProcessConv2D3DInputs(QnnModelWrapper& qnn_model_wrapper, @@ -713,7 +712,7 @@ Status ConvOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wra } } - ReArranagePads(pads); + ReArrangePads(pads); uint32_t pad_size = narrow(pads.size() / 2); QnnParamWrapper pad_amount_paramwrapper(node_unit.Index(), node_unit.Name(), QNN_OP_CONV_2D_PARAM_PAD_AMOUNT, {pad_size, 2}, std::move(pads)); diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc index 404d3c402c21e..d2b1434c1c896 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc +++ 
b/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc @@ -193,7 +193,7 @@ Status PadOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrap [](int64_t item) { return SafeInt(item); }); // Onnx format is begin_0, begin_1, ..., end_0, end_1, ... // Qnn format is begin_0, end_0, begin_1, end_1, ... - ReArranagePads(pad_amount); + ReArrangePads(pad_amount); std::vector input_shape; ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(inputs[0].node_arg, input_shape), "Cannot get shape of input 0."); diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc index 78b16ed784049..78ab047a560a7 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/pool_op_builder.cc @@ -199,7 +199,7 @@ Status PoolOpBuilder::SetCommonPoolParams(const NodeAttrHelper& node_helper, } } } - ReArranagePads(pad_amount); + ReArrangePads(pad_amount); // Param: rounding_mode. rounding_mode = node_helper.Get("ceil_mode", rounding_mode); diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc index e1a74b9e35370..ee5f52289d779 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc @@ -158,7 +158,7 @@ bool QnnModelWrapper::CreateQnnInputOutputTensors(const std::string& qnn_node_na return false; } - // During graph patitioning, we only need to do op validation, it's not required to create Qnn graph tensor + // During graph partitioning, we only need to do op validation, it's not required to create Qnn graph tensor // We only need to create the Qnn graph tensor during Compile to create Qnn graph if (!do_op_validation) { std::string error_string; diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc index 1f491bc788870..f3e2a8ce7ba7b 100644 --- a/onnxruntime/core/session/onnxruntime_c_api.cc +++ b/onnxruntime/core/session/onnxruntime_c_api.cc @@ -3036,7 +3036,7 @@ ORT_API_STATUS_IMPL(OrtApis::Node_GetAttributeByName, _In_ const OrtNode* node, API_IMPL_END } -ORT_API_STATUS_IMPL(OrtApis::Node_GetTensorAttributeAsOrtValue, _In_ const OrtNode* node, _In_ const OrtOpAttr* attribute, _Outptr_result_maybenull_ OrtValue** attr_tensor) { +ORT_API_STATUS_IMPL(OrtApis::OpAttr_GetTensorAttributeAsOrtValue, _In_ const OrtOpAttr* attribute, _Outptr_result_maybenull_ OrtValue** attr_tensor) { API_IMPL_BEGIN if (attr_tensor == nullptr) { return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "attr_tensor argument is null"); @@ -3045,7 +3045,39 @@ ORT_API_STATUS_IMPL(OrtApis::Node_GetTensorAttributeAsOrtValue, _In_ const OrtNo return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "attribute argument is null"); } - ORT_API_RETURN_IF_STATUS_NOT_OK(node->GetTensorAttributeAsOrtValue(attribute, *attr_tensor)); + const auto* attr_proto = reinterpret_cast(attribute); + + if (attr_proto->type() != onnx::AttributeProto::TENSOR) { + return OrtApis::CreateStatus(OrtErrorCode::ORT_INVALID_ARGUMENT, "This OrtOpAttr instance is not a 'TENSOR' attribute"); + } + + const auto& tensor_proto = attr_proto->t(); + + // Check that TensorProto is valid. 
+ if (!utils::HasDataType(tensor_proto)) { + return OrtApis::CreateStatus(OrtErrorCode::ORT_INVALID_ARGUMENT, "Tensor proto doesn't have data type."); + } + + if (!ONNX_NAMESPACE::TensorProto::DataType_IsValid(tensor_proto.data_type())) { + return OrtApis::CreateStatus(OrtErrorCode::ORT_INVALID_ARGUMENT, "Tensor proto has invalid data type."); + } + + if (utils::HasExternalData(tensor_proto)) { + return OrtApis::CreateStatus(OrtErrorCode::ORT_INVALID_ARGUMENT, + "Tensor proto with external data for value attribute is not supported."); + } + + // Initialize OrtValue for tensor attribute. + auto tensor_attribute_value = std::make_unique(); + AllocatorPtr tensor_attribute_allocator = CPUAllocator::DefaultInstance(); + // The tensor in the 'Tensor' attribute's TensorProto is stored inline, not in an external file. + // Therefore, the 'model_path' passed to TensorProtoToOrtValue() may be an empty path. + std::filesystem::path model_path; + ORT_API_RETURN_IF_STATUS_NOT_OK(utils::TensorProtoToOrtValue(Env::Default(), model_path, tensor_proto, + tensor_attribute_allocator, *tensor_attribute_value)); + + *attr_tensor = tensor_attribute_value.release(); + return nullptr; API_IMPL_END } @@ -3423,25 +3455,86 @@ ORT_API_STATUS_IMPL(OrtApis::CopyTensors, _In_ const OrtEnv* env, API_IMPL_END } +// Validate compiled model compatibility info for specific EP device(s) +ORT_API_STATUS_IMPL(OrtApis::GetModelCompatibilityForEpDevices, + _In_reads_(num_ep_devices) const OrtEpDevice* const* ep_devices, + _In_ size_t num_ep_devices, + _In_ const char* compatibility_info, + _Out_ OrtCompiledModelCompatibility* out_status) { + API_IMPL_BEGIN + if (ep_devices == nullptr || num_ep_devices == 0 || compatibility_info == nullptr || out_status == nullptr) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "Invalid argument provided to GetModelCompatibilityForEpDevices."); + } + + // Validate inputs and ensure all devices belong to the same EP/factory + const OrtEpFactory* first_factory = nullptr; + for (size_t i = 0; i < num_ep_devices; ++i) { + if (ep_devices[i] == nullptr) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "ep_devices contains a null entry."); + } + const OrtEpFactory* f = ep_devices[i]->GetMutableFactory(); + if (i == 0) { + first_factory = f; + } else if (f != first_factory) { + return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "All ep_devices must be from the same execution provider."); + } + } + + OrtCompiledModelCompatibility status = OrtCompiledModelCompatibility_EP_NOT_APPLICABLE; + OrtStatus* ort_status = nullptr; + OrtEpFactory* factory = ep_devices[0]->GetMutableFactory(); + if (factory && factory->ValidateCompiledModelCompatibilityInfo) { + // collect hardware devices corresponding to the ep_devices + InlinedVector hardware_devices; + hardware_devices.reserve(num_ep_devices); + for (size_t i = 0; i < num_ep_devices; ++i) { + hardware_devices.push_back(ep_devices[i]->device); + } + ort_status = factory->ValidateCompiledModelCompatibilityInfo(factory, + hardware_devices.data(), + hardware_devices.size(), + compatibility_info, + &status); + } + if (ort_status != nullptr) { + return ToOrtStatus(ToStatusAndRelease(ort_status)); + } + + *out_status = status; + return nullptr; + API_IMPL_END +} + #else // defined(ORT_MINIMAL_BUILD) ORT_API_STATUS_IMPL(OrtApis::RegisterExecutionProviderLibrary, _In_ OrtEnv* /*env*/, _In_ const char* /*registration_name*/, const ORTCHAR_T* /*path*/) { API_IMPL_BEGIN - return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "This API in not supported in a minimal 
build."); + return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "RegisterExecutionProviderLibrary is not supported in a minimal build."); API_IMPL_END } ORT_API_STATUS_IMPL(OrtApis::UnregisterExecutionProviderLibrary, _In_ OrtEnv* /*env*/, _In_ const char* /*registration_name*/) { API_IMPL_BEGIN - return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "This API in not supported in a minimal build."); + return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "UnregisterExecutionProviderLibrary is not supported in a minimal build."); API_IMPL_END } ORT_API_STATUS_IMPL(OrtApis::GetEpDevices, _In_ const OrtEnv* /*env*/, _Outptr_ const OrtEpDevice* const** /*ep_devices*/, _Out_ size_t* /*num_ep_devices*/) { API_IMPL_BEGIN - return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "This API in not supported in a minimal build."); + return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "GetEpDevices is not supported in a minimal build."); + API_IMPL_END +} + +// Minimal build stub for GetModelCompatibilityForEpDevices to satisfy symbol references from the API table +ORT_API_STATUS_IMPL(OrtApis::GetModelCompatibilityForEpDevices, + _In_reads_(num_ep_devices) const OrtEpDevice* const* /*ep_devices*/, + _In_ size_t /*num_ep_devices*/, + _In_ const char* /*compatibility_info*/, + _Out_ OrtCompiledModelCompatibility* /*out_status*/) { + API_IMPL_BEGIN + return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "GetModelCompatibilityForEpDevices is not supported in a minimal build."); API_IMPL_END } @@ -3453,7 +3546,7 @@ ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider_V2, _In_ OrtS _In_reads_(num_op_options) const char* const* /*ep_option_vals*/, size_t /*num_ep_options*/) { API_IMPL_BEGIN - return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "This API in not supported in a minimal build."); + return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "SessionOptionsAppendExecutionProvider_V2 is not supported in a minimal build."); API_IMPL_END } @@ -3466,7 +3559,7 @@ ORT_API_STATUS_IMPL(OrtApis::SessionGetEpDeviceForInputs, _In_ const OrtSession* _Out_writes_(num_values) const OrtEpDevice** /*inputs_ep_devices*/, _In_ size_t /*num_values*/) { API_IMPL_BEGIN - return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "This API in not supported in a minimal build."); + return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "SessionGetEpDeviceForInputs is not supported in a minimal build."); API_IMPL_END } @@ -3474,7 +3567,7 @@ ORT_API_STATUS_IMPL(OrtApis::CreateSyncStreamForEpDevice, _In_ const OrtEpDevice _In_opt_ const OrtKeyValuePairs* /*stream_options*/, _Outptr_ OrtSyncStream** /*ort_stream*/) { API_IMPL_BEGIN - return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "This API in not supported in a minimal build."); + return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "CreateSyncStreamForEpDevice is not supported in a minimal build."); API_IMPL_END } @@ -3493,7 +3586,7 @@ ORT_API_STATUS_IMPL(OrtApis::CopyTensors, _In_ const OrtEnv* /*env*/, _In_opt_ OrtSyncStream* /*stream*/, _In_ size_t /*num_tensors*/) { API_IMPL_BEGIN - return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "This API in not supported in a minimal build."); + return OrtApis::CreateStatus(ORT_NOT_IMPLEMENTED, "CopyTensors is not supported in a minimal build."); API_IMPL_END } @@ -4073,7 +4166,7 @@ static constexpr OrtApi ort_api_1_to_23 = { &OrtApis::Node_GetNumAttributes, &OrtApis::Node_GetAttributes, &OrtApis::Node_GetAttributeByName, - &OrtApis::Node_GetTensorAttributeAsOrtValue, + &OrtApis::OpAttr_GetTensorAttributeAsOrtValue, &OrtApis::OpAttr_GetType, 
&OrtApis::OpAttr_GetName, &OrtApis::Node_GetNumSubgraphs, @@ -4108,6 +4201,7 @@ static constexpr OrtApi ort_api_1_to_23 = { &OrtApis::CopyTensors, &OrtApis::Graph_GetModelMetadata, + &OrtApis::GetModelCompatibilityForEpDevices, }; // OrtApiBase can never change as there is no way to know what version of OrtApiBase is returned by OrtGetApiBase. diff --git a/onnxruntime/core/session/ort_apis.h b/onnxruntime/core/session/ort_apis.h index b3b0036c68247..6dc4cf9d195cc 100644 --- a/onnxruntime/core/session/ort_apis.h +++ b/onnxruntime/core/session/ort_apis.h @@ -636,6 +636,13 @@ ORT_API_STATUS_IMPL(ValueInfo_IsFromOuterScope, _In_ const OrtValueInfo* value_i // OrtGraph ORT_API_STATUS_IMPL(Graph_GetName, _In_ const OrtGraph* graph, _Outptr_ const char** graph_name); ORT_API_STATUS_IMPL(Graph_GetModelMetadata, _In_ const OrtGraph* graph, _Outptr_ OrtModelMetadata** out); + +// EP Compatibility Info APIs +ORT_API_STATUS_IMPL(GetModelCompatibilityForEpDevices, + _In_reads_(num_ep_devices) const OrtEpDevice* const* ep_devices, + _In_ size_t num_ep_devices, + _In_ const char* compatibility_info, + _Out_ OrtCompiledModelCompatibility* out_status); ORT_API_STATUS_IMPL(Graph_GetModelPath, _In_ const OrtGraph* graph, _Outptr_ const ORTCHAR_T** model_path); ORT_API_STATUS_IMPL(Graph_GetOnnxIRVersion, _In_ const OrtGraph* graph, _Out_ int64_t* onnx_ir_version); ORT_API_STATUS_IMPL(Graph_GetNumOperatorSets, _In_ const OrtGraph* graph, _Out_ size_t* num_operator_sets); @@ -680,7 +687,7 @@ ORT_API_STATUS_IMPL(Node_GetAttributes, _In_ const OrtNode* node, _Out_writes_(num_attributes) const OrtOpAttr** attributes, _In_ size_t num_attributes); ORT_API_STATUS_IMPL(Node_GetAttributeByName, _In_ const OrtNode* node, _In_ const char* attribute_name, _Outptr_result_maybenull_ const OrtOpAttr** attribute); -ORT_API_STATUS_IMPL(Node_GetTensorAttributeAsOrtValue, _In_ const OrtNode* node, _In_ const OrtOpAttr* attribute, +ORT_API_STATUS_IMPL(OpAttr_GetTensorAttributeAsOrtValue, _In_ const OrtOpAttr* attribute, _Outptr_result_maybenull_ OrtValue** attr_tensor); ORT_API_STATUS_IMPL(OpAttr_GetType, _In_ const OrtOpAttr* attribute, _Out_ OrtOpAttrType* type); ORT_API_STATUS_IMPL(OpAttr_GetName, _In_ const OrtOpAttr* attribute, _Outptr_ const char** name); diff --git a/onnxruntime/core/session/plugin_ep/ep_factory_internal.h b/onnxruntime/core/session/plugin_ep/ep_factory_internal.h index 23e5e95af2903..093bfce462d32 100644 --- a/onnxruntime/core/session/plugin_ep/ep_factory_internal.h +++ b/onnxruntime/core/session/plugin_ep/ep_factory_internal.h @@ -80,9 +80,11 @@ class EpFactoryInternal : public OrtEpFactory { return impl_->CreateSyncStreamForDevice(memory_device, stream_options, stream); } - OrtStatus* ValidateCompiledModelCompatibilityInfo(_In_ const char* compatibility_info, + OrtStatus* ValidateCompiledModelCompatibilityInfo(_In_reads_(num_devices) const OrtHardwareDevice* const* devices, + _In_ size_t num_devices, + _In_ const char* compatibility_info, _Out_ OrtCompiledModelCompatibility* model_compatibility) noexcept { - return impl_->ValidateCompiledModelCompatibilityInfo(compatibility_info, model_compatibility); + return impl_->ValidateCompiledModelCompatibilityInfo(devices, num_devices, compatibility_info, model_compatibility); } // Function ORT calls to release an EP instance. 
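Note on the signature change above: the factory-level ValidateCompiledModelCompatibilityInfo now receives the hardware devices being validated alongside the compatibility string. As a rough illustration of what a plugin-side implementation of the updated callback might look like, here is a minimal C++ sketch; MyEpFactory, its expected_compatibility_info member, MyValidateCompiledModelCompatibilityInfo, and the include path are assumptions introduced only for this example — only the parameter list mirrors the signature introduced in this change.

#include <string>
#include "core/session/onnxruntime_c_api.h"  // assumed include path for the OrtEpFactory / OrtHardwareDevice declarations

// Hypothetical plugin EP factory; a real factory also fills in the other OrtEpFactory callbacks.
struct MyEpFactory : OrtEpFactory {
  std::string expected_compatibility_info;  // illustrative: string emitted when the model was compiled
};

static OrtStatus* ORT_API_CALL MyValidateCompiledModelCompatibilityInfo(
    OrtEpFactory* this_ptr,
    const OrtHardwareDevice* const* devices,
    size_t num_devices,
    const char* compatibility_info,
    OrtCompiledModelCompatibility* model_compatibility) noexcept {
  auto* factory = static_cast<MyEpFactory*>(this_ptr);

  if (devices == nullptr || num_devices == 0 || compatibility_info == nullptr) {
    // Nothing to validate against: report that the check does not apply.
    *model_compatibility = OrtCompiledModelCompatibility_EP_NOT_APPLICABLE;
    return nullptr;
  }

  // A real EP would inspect `devices` (e.g. vendor/device identity) and parse the
  // compatibility string it produced at compile time; this sketch only compares strings.
  *model_compatibility = (factory->expected_compatibility_info == compatibility_info)
                             ? OrtCompiledModelCompatibility_EP_SUPPORTED_OPTIMAL
                             : OrtCompiledModelCompatibility_EP_SUPPORTED_PREFER_RECOMPILATION;
  return nullptr;
}

Passing the devices explicitly is what allows GetModelCompatibilityForEpDevices to validate a compatibility string against the caller-selected subset of devices rather than against whatever the factory happens to enumerate.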
diff --git a/onnxruntime/core/session/plugin_ep/ep_factory_internal_impl.h b/onnxruntime/core/session/plugin_ep/ep_factory_internal_impl.h index 6c55730d83979..f29154d19c53c 100644 --- a/onnxruntime/core/session/plugin_ep/ep_factory_internal_impl.h +++ b/onnxruntime/core/session/plugin_ep/ep_factory_internal_impl.h @@ -62,8 +62,13 @@ class EpFactoryInternalImpl { return false; } - virtual OrtStatus* ValidateCompiledModelCompatibilityInfo(_In_ const char* compatibility_info, - _Out_ OrtCompiledModelCompatibility* model_compatibility) noexcept { + virtual OrtStatus* ValidateCompiledModelCompatibilityInfo( + _In_reads_(num_devices) const OrtHardwareDevice* const* devices, + _In_ size_t num_devices, + _In_ const char* compatibility_info, + _Out_ OrtCompiledModelCompatibility* model_compatibility) noexcept { + ORT_UNUSED_PARAMETER(devices); + ORT_UNUSED_PARAMETER(num_devices); ORT_UNUSED_PARAMETER(compatibility_info); // Default implementation: mark as not applicable *model_compatibility = OrtCompiledModelCompatibility_EP_NOT_APPLICABLE; diff --git a/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc b/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc index 3bfca62a4d011..c8829423fbe26 100644 --- a/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc +++ b/onnxruntime/core/session/plugin_ep/ep_plugin_provider_interfaces.cc @@ -668,8 +668,15 @@ Status PluginExecutionProvider::ValidateCompiledModelCompatibilityInfo(const std // Plugin EP did not provide an implementation of this function, so we call a default implementation. return Base::ValidateCompiledModelCompatibilityInfo(compatibility_info, model_compatibility); } - // Delegate to the EP factory's validation method + // Delegate to the EP factory's validation method, passing hardware devices derived from our ep_devices_ + std::vector hardware_devices; + hardware_devices.reserve(ep_devices_.size()); + for (const auto* ep_device : ep_devices_) { + hardware_devices.push_back(ep_device->device); + } ORT_RETURN_IF_ERROR(ToStatusAndRelease(ep_factory_.ValidateCompiledModelCompatibilityInfo(&ep_factory_, + hardware_devices.data(), + hardware_devices.size(), compatibility_info.c_str(), &model_compatibility))); return Status::OK(); diff --git a/onnxruntime/core/session/plugin_ep/forward_to_factory_impl.h b/onnxruntime/core/session/plugin_ep/forward_to_factory_impl.h index 29793b503c9d1..2cceb1d08d536 100644 --- a/onnxruntime/core/session/plugin_ep/forward_to_factory_impl.h +++ b/onnxruntime/core/session/plugin_ep/forward_to_factory_impl.h @@ -46,9 +46,12 @@ struct ForwardToFactoryImpl { } static OrtStatus* ORT_API_CALL ValidateCompiledModelCompatibilityInfo(OrtEpFactory* this_ptr, + _In_reads_(num_devices) const OrtHardwareDevice* const* devices, + size_t num_devices, const char* compatibility_info, OrtCompiledModelCompatibility* model_compatibility) noexcept { - return static_cast(this_ptr)->ValidateCompiledModelCompatibilityInfo(compatibility_info, model_compatibility); + return static_cast(this_ptr)->ValidateCompiledModelCompatibilityInfo(devices, num_devices, + compatibility_info, model_compatibility); } static OrtStatus* ORT_API_CALL CreateAllocator(_In_ OrtEpFactory* this_ptr, diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index 24554560b4dde..eb06a65ad5330 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -1575,6 +1575,17 @@ void addGlobalMethods(py::module& m) { 
R"pbdoc(Get the list of available OrtEpDevice instances.)pbdoc", py::return_value_policy::reference); + m.def( + "get_model_compatibility_for_ep_devices", + [](const std::vector& ep_devices, + const std::string& compatibility_info) -> OrtCompiledModelCompatibility { + OrtCompiledModelCompatibility status = OrtCompiledModelCompatibility_EP_NOT_APPLICABLE; + Ort::ThrowOnError(Ort::GetApi().GetModelCompatibilityForEpDevices( + ep_devices.data(), ep_devices.size(), compatibility_info.c_str(), &status)); + return status; + }, + R"pbdoc("Validate a compiled model's compatibility information for one or more EP devices.)pbdoc"); + #if defined(USE_OPENVINO) || defined(USE_OPENVINO_PROVIDER_INTERFACE) m.def( "get_available_openvino_device_ids", []() -> std::vector { @@ -1759,6 +1770,12 @@ void addObjectMethods(py::module& m, ExecutionProviderRegistrationFn ep_registra .value("PRIORITY_BASED", ExecutionOrder::PRIORITY_BASED) .value("MEMORY_EFFICIENT", ExecutionOrder::MEMORY_EFFICIENT); + py::enum_(m, "OrtCompiledModelCompatibility") + .value("EP_NOT_APPLICABLE", OrtCompiledModelCompatibility_EP_NOT_APPLICABLE) + .value("EP_SUPPORTED_OPTIMAL", OrtCompiledModelCompatibility_EP_SUPPORTED_OPTIMAL) + .value("EP_SUPPORTED_PREFER_RECOMPILATION", OrtCompiledModelCompatibility_EP_SUPPORTED_PREFER_RECOMPILATION) + .value("EP_UNSUPPORTED", OrtCompiledModelCompatibility_EP_UNSUPPORTED); + py::enum_(m, "OrtAllocatorType") .value("INVALID", OrtInvalidAllocator) .value("ORT_DEVICE_ALLOCATOR", OrtDeviceAllocator) @@ -1782,7 +1799,7 @@ void addObjectMethods(py::module& m, ExecutionProviderRegistrationFn ep_registra type = OrtDevice::GPU; vendor = OrtDevice::VendorIds::MICROSOFT; } else if (type == OrtDevice::GPU) { -#if USE_CUDA +#if USE_CUDA || USE_NV || USE_NV_PROVIDER_INTERFACE || USE_CUDA_PROVIDER_INTERFACE vendor = OrtDevice::VendorIds::NVIDIA; #elif USE_ROCM || USE_MIGRAPHX vendor = OrtDevice::VendorIds::AMD; diff --git a/onnxruntime/test/framework/ep_compatibility_test.cc b/onnxruntime/test/framework/ep_compatibility_test.cc index be97cf2620881..a8a83fbe5ceb6 100644 --- a/onnxruntime/test/framework/ep_compatibility_test.cc +++ b/onnxruntime/test/framework/ep_compatibility_test.cc @@ -15,6 +15,7 @@ #include "core/session/onnxruntime_ep_device_ep_metadata_keys.h" #include "core/session/utils.h" #include "core/session/onnxruntime_c_api.h" +#include "core/session/onnxruntime_cxx_api.h" #include "core/session/abi_session_options_impl.h" #include "core/framework/error_code_helper.h" #include "dummy_provider.h" @@ -408,3 +409,122 @@ TEST_F(EpCompatibilityTest, TestSessionOptionConfiguration) { EXPECT_TRUE(has_config); EXPECT_EQ(config_value, "0"); } + +// ----------------------------- +// C API unit tests +// ----------------------------- + +namespace { + +// Helper to create an OrtEnv and fetch a CPU EP device pointer via the C API. +// Returns a pair of (env, cpu_device). Caller releases env via api->ReleaseEnv. 
+static std::pair CreateEnvAndGetCpuEpDevice(const OrtApi* api) { + OrtEnv* env = nullptr; + EXPECT_EQ(nullptr, api->CreateEnv(ORT_LOGGING_LEVEL_WARNING, "EpCompatCapiTest", &env)); + EXPECT_NE(env, nullptr); + + const OrtEpDevice* const* devices = nullptr; + size_t num_devices = 0; + EXPECT_EQ(nullptr, api->GetEpDevices(env, &devices, &num_devices)); + EXPECT_GT(num_devices, 0u); + + const OrtEpDevice* cpu_device = nullptr; + for (size_t i = 0; i < num_devices; ++i) { + const char* name = api->EpDevice_EpName(devices[i]); + if (name && std::string(name) == "CPUExecutionProvider") { + cpu_device = devices[i]; + break; + } + } + + // Fallback: just pick the first device if CPU wasn't found (environment-dependent builds). + if (!cpu_device && num_devices > 0) { + cpu_device = devices[0]; + } + + EXPECT_NE(cpu_device, nullptr); + return {env, cpu_device}; +} + +} // namespace + +TEST(EpCompatibilityCapiTest, InvalidArguments) { + const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION); + ASSERT_NE(api, nullptr); + + OrtCompiledModelCompatibility out_status = OrtCompiledModelCompatibility_EP_NOT_APPLICABLE; + + // ep_devices == nullptr + OrtStatus* st = api->GetModelCompatibilityForEpDevices(nullptr, 0, "info", &out_status); + ASSERT_NE(st, nullptr); + EXPECT_EQ(api->GetErrorCode(st), ORT_INVALID_ARGUMENT); + api->ReleaseStatus(st); + + // Prepare a valid device + auto [env, device] = CreateEnvAndGetCpuEpDevice(api); + ASSERT_NE(env, nullptr); + ASSERT_NE(device, nullptr); + + // compatibility_info == nullptr + const OrtEpDevice* devices1[] = {device}; + st = api->GetModelCompatibilityForEpDevices(devices1, 1, nullptr, &out_status); + ASSERT_NE(st, nullptr); + EXPECT_EQ(api->GetErrorCode(st), ORT_INVALID_ARGUMENT); + api->ReleaseStatus(st); + + // out_status == nullptr + st = api->GetModelCompatibilityForEpDevices(devices1, 1, "some-info", nullptr); + ASSERT_NE(st, nullptr); + EXPECT_EQ(api->GetErrorCode(st), ORT_INVALID_ARGUMENT); + api->ReleaseStatus(st); + + api->ReleaseEnv(env); +} + +TEST(EpCompatibilityCapiTest, CpuEpReturnsNotApplicableIfNoValidation) { + const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION); + ASSERT_NE(api, nullptr); + + auto [env, device] = CreateEnvAndGetCpuEpDevice(api); + ASSERT_NE(env, nullptr); + ASSERT_NE(device, nullptr); + + OrtCompiledModelCompatibility out_status = static_cast(-1); + const OrtEpDevice* devices2[] = {device}; + OrtStatus* st = api->GetModelCompatibilityForEpDevices(devices2, 1, "arbitrary-compat-string", &out_status); + ASSERT_EQ(st, nullptr) << (st ? api->GetErrorMessage(st) : ""); + + // For providers that don't implement validation, API should return EP_NOT_APPLICABLE. + EXPECT_EQ(out_status, OrtCompiledModelCompatibility_EP_NOT_APPLICABLE); + api->ReleaseStatus(st); + + api->ReleaseEnv(env); +} + +// ----------------------------- +// C++ API unit tests +// ----------------------------- + +TEST(EpCompatibilityCxxApiTest, SingleDeviceCpuProvider) { + Ort::Env env{ORT_LOGGING_LEVEL_WARNING, "EpCompatCxx"}; + auto devices = env.GetEpDevices(); + ASSERT_FALSE(devices.empty()); + + std::vector selected; + for (const auto& d : devices) { + if (std::string{d.EpName()} == "CPUExecutionProvider") { + selected.push_back(d); + break; + } + } + + ASSERT_FALSE(selected.empty()); + + // Pick a status that the CPU EP would never return to ensure the value is set correctly. 
+ OrtCompiledModelCompatibility status = OrtCompiledModelCompatibility_EP_SUPPORTED_PREFER_RECOMPILATION; + ASSERT_NO_FATAL_FAILURE({ + status = Ort::GetModelCompatibilityForEpDevices(selected, "arbitrary-compat-string"); + }); + + ASSERT_TRUE(status == OrtCompiledModelCompatibility_EP_NOT_APPLICABLE); +} \ No newline at end of file diff --git a/onnxruntime/test/framework/save_model_with_external_initializers.cc b/onnxruntime/test/framework/save_model_with_external_initializers.cc index 98874874d50e9..e70d870ef6988 100644 --- a/onnxruntime/test/framework/save_model_with_external_initializers.cc +++ b/onnxruntime/test/framework/save_model_with_external_initializers.cc @@ -84,7 +84,7 @@ Status LoadSaveAndCompareModel(const std::filesystem::path& input_onnx, size_t tensor_offset; std::stringstream stream(entry.value()); stream >> tensor_offset; - ORT_RETURN_IF_NOT(tensor_offset % model_saving_options.allocation_granularity == 0, + ORT_RETURN_IF_NOT(tensor_offset % model_saving_options.on_disk_alignment == 0, "tensor offset not align"); } } diff --git a/onnxruntime/test/optimizer/qdq_transformer_test.cc b/onnxruntime/test/optimizer/qdq_transformer_test.cc index 98640bb2f6b4c..f626a1704f7a1 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc @@ -5370,8 +5370,59 @@ TEST(QDQTransformerTests, WeightBiasQuantization_Conv_Weight_Bias) { #endif } +// Tests that the WeightBiasQuantization optimizer still processes nodes that contain a type-preserving no +// branch ReLU op to QuantizeLinear e.g., Q -> DQ -> Conv (w/ float weight initializer) -> ReLU -> Q -> DQ +TEST(QDQTransformerTests, WeightBiasQuantization_ConvWithReLU) { + auto test_case = [](bool use_contrib_qdq) { + auto build_test_case = [&](ModelTestBuilder& builder) { + NodeArg* input_fp32 = builder.MakeInput({1, 1, 4, 4}, -1.0f, 1.0f); + NodeArg* weight_fp32 = builder.MakeInitializer({2, 1, 3, 3}, -1.0f, 1.0f); + NodeArg* input_q = builder.MakeIntermediate(); + NodeArg* input_dq = builder.MakeIntermediate(); + NodeArg* conv_fp32 = builder.MakeIntermediate(); + NodeArg* relu_fp32 = builder.MakeIntermediate(); + NodeArg* relu_q = builder.MakeIntermediate(); + NodeArg* relu_dq = builder.MakeOutput(); + builder.AddQuantizeLinearNode(input_fp32, 0.18f, static_cast(127), input_q, use_contrib_qdq); + builder.AddDequantizeLinearNode(input_q, 0.18f, static_cast(127), input_dq, use_contrib_qdq); + auto& conv_node = builder.AddNode("Conv", {input_dq, weight_fp32}, {conv_fp32}); + conv_node.AddAttribute("dilations", std::vector{1, 1}); + conv_node.AddAttribute("kernel_shape", std::vector{3, 3}); + conv_node.AddAttribute("strides", std::vector{1, 1}); + conv_node.AddAttribute("group", static_cast(1)); + conv_node.AddAttribute("pads", std::vector{0, 0, 0, 0}); + builder.AddNode("Relu", {conv_fp32}, {relu_fp32}); + builder.AddQuantizeLinearNode(relu_fp32, 0.69f, static_cast(127), relu_q, use_contrib_qdq); + builder.AddDequantizeLinearNode(relu_q, 0.69f, static_cast(127), relu_dq, use_contrib_qdq); + }; + + // Conv's weights should be quantized and folded, one additional Q/DQ pair inserted for weight + auto check_transformed_graph = [](InferenceSessionWrapper& session) { + auto op_to_count = CountOpsInGraph(session.GetGraph()); + EXPECT_EQ(op_to_count["QuantizeLinear"] + op_to_count["com.microsoft.QuantizeLinear"], 2 + 1); + EXPECT_EQ(op_to_count["DequantizeLinear"] + op_to_count["com.microsoft.DequantizeLinear"], 2 + 1); + EXPECT_EQ(op_to_count["Conv"], 1); + EXPECT_EQ(op_to_count["Relu"], 
1); + }; + + TransformerTester(build_test_case, + check_transformed_graph, + TransformerLevel::Default, + TransformerLevel::Level1, + /*opset_version=*/20, + /*per_sample_tolerance=*/0.01, + /*relative_per_sample_tolerance=*/0.01, + /*transformer=*/std::make_unique()); + }; + + test_case(false); +#if !defined(DISABLE_CONTRIB_OPS) + test_case(true); +#endif +} + // Tests that the WeightBiasQuantization optimizer does not process nodes that do not -// already have an output that is consumed by a single QuantizeLinear node. +// already have an output that is consumed by a valid path to QuantizeLinear node. TEST(QDQTransformerTests, WeightBiasQuantization_SkipIfOutputNotQuantized) { auto test_case = [](bool add_final_reshape) { auto build_test_case = [&](ModelTestBuilder& builder) { diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index a22375320edae..46958843872d7 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -66,7 +66,9 @@ ABSL_FLAG(std::string, i, "", " [OpenVINO only] [num_of_threads]: Overrides the accelerator hardware type and precision with these values at runtime.\n" " [OpenVINO only] [cache_dir]: Explicitly specify the path to dump and load the blobs(Model caching) or cl_cache (Kernel Caching) files feature. If blob files are already present, it will be directly loaded.\n" " [OpenVINO only] [enable_opencl_throttling]: Enables OpenCL queue throttling for GPU device(Reduces the CPU Utilization while using GPU) \n" - " [Example] [For OpenVINO EP] -e openvino -i \"device_type|CPU num_of_threads|5 enable_opencl_throttling|true cache_dir|\"\"\"\n" + " [OpenVINO only] [reshape_input]: Sets model input shapes with support for bounded dynamic dimensions using 'min..max' syntax (e.g., [1..10,3,224,224]) \n" + " [OpenVINO only] [layout]: Specifies the layout for inputs/outputs to interpret tensor dimensions correctly. \n" + " [Example] [For OpenVINO EP] -e openvino -i \"device_type|CPU num_of_threads|5 enable_opencl_throttling|true reshape_input|[1,3,60,60..100] layout|[NCHW] cache_dir|\"\"\"\n" "\n" " [QNN only] [backend_type]: QNN backend type. E.g., 'cpu', 'htp'. Mutually exclusive with 'backend_path'.\n" " [QNN only] [backend_path]: QNN backend path. E.g., '/folderpath/libQnnHtp.so', '/winfolderpath/QnnHtp.dll'. Mutually exclusive with 'backend_type'.\n" diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index f1a40b1da8651..1ba3078efdb1a 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -863,12 +863,14 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); ov_options[key] = value; } else if (key == "reshape_input") { ov_options[key] = value; + } else if (key == "layout") { + ov_options[key] = value; } else { ORT_THROW( "[ERROR] [OpenVINO] wrong key type entered. Choose from the following runtime key options that are available for OpenVINO." 
" ['device_type', 'device_id', 'num_of_threads', 'load_config', 'cache_dir', 'num_streams', " "'enable_opencl_throttling', 'disable_dynamic_shapes', 'enable_qdq_optimizer'," - " 'enable_causallm', 'model_priority'] \n"); + " 'enable_causallm', 'reshape_input', 'layout', 'model_priority'] \n"); } } session_options.AppendExecutionProvider_OpenVINO_V2(ov_options); diff --git a/onnxruntime/test/platform/file_io_test.cc b/onnxruntime/test/platform/file_io_test.cc index ccc703716844f..e6f3c4dd8b89e 100644 --- a/onnxruntime/test/platform/file_io_test.cc +++ b/onnxruntime/test/platform/file_io_test.cc @@ -17,7 +17,7 @@ #include #include "gtest/gtest.h" - +#include "asserts.h" #include "core/common/span_utils.h" #include "test/util/include/file_util.h" @@ -157,7 +157,6 @@ TEST(FileIoTest, MapFileIntoMemory) { SYSTEM_INFO sysinfo; GetSystemInfo(&sysinfo); static const auto page_size = sysinfo.dwPageSize; - static const auto allocation_granularity = sysinfo.dwAllocationGranularity; ASSERT_GT(page_size, static_cast(0)); TempFilePath tmp(ORT_TSTR("map_file_test_")); @@ -167,21 +166,10 @@ TEST(FileIoTest, MapFileIntoMemory) { const auto offsets_and_lengths = GenerateValidOffsetLengthPairs( 0, expected_data.size(), page_size / 10); - for (const auto& offset_and_length : offsets_and_lengths) { - const auto offset = offset_and_length.first; - const auto length = offset_and_length.second; - - // The offset must be a multiple of the allocation granularity - if (offset % allocation_granularity != 0) { - continue; - } - + for (const auto& [offset, length] : offsets_and_lengths) { Env::MappedMemoryPtr mapped_memory{}; - auto status = Env::Default().MapFileIntoMemory( - tmp.path.c_str(), offset, length, mapped_memory); - ASSERT_TRUE(status.IsOK()) - << "MapFileIntoMemory failed for offset " << offset << " and length " << length - << " with error: " << status.ErrorMessage(); + ASSERT_STATUS_OK(Env::Default().MapFileIntoMemory( + tmp.path.c_str(), offset, length, mapped_memory)); auto mapped_span = gsl::make_span(mapped_memory.get(), length); @@ -190,20 +178,11 @@ TEST(FileIoTest, MapFileIntoMemory) { ASSERT_TRUE(SpanEq(mapped_span, expected_data_span)); } - { - Env::MappedMemoryPtr mapped_memory{}; - - // invalid - offset is not a multiple of the allocation granularity - ASSERT_FALSE(Env::Default().MapFileIntoMemory( - tmp.path.c_str(), allocation_granularity * 3 / 2, page_size / 10, mapped_memory) - .IsOK()); - } - { Env::MappedMemoryPtr mapped_memory{}; // invalid - negative offset - ASSERT_FALSE(Env::Default().MapFileIntoMemory(tmp.path.c_str(), -1, 0, mapped_memory).IsOK()); + ASSERT_STATUS_NOT_OK(Env::Default().MapFileIntoMemory(tmp.path.c_str(), -1, 0, mapped_memory)); } } #endif diff --git a/onnxruntime/test/providers/cpu/controlflow/loop_test.cc b/onnxruntime/test/providers/cpu/controlflow/loop_test.cc index a5fd37361a255..dc50a75873034 100644 --- a/onnxruntime/test/providers/cpu/controlflow/loop_test.cc +++ b/onnxruntime/test/providers/cpu/controlflow/loop_test.cc @@ -688,7 +688,7 @@ TEST(Loop, SubgraphTypeOverride) { Graph::ResolveOptions options; options.override_types = true; test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kTensorrtExecutionProvider}, &session_run_options, nullptr, + {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}, &session_run_options, nullptr, ExecutionMode::ORT_SEQUENTIAL, options); } @@ -1162,7 +1162,7 @@ TEST(Loop, SequenceAsLoopCarriedDependency) { test.AddSeqOutput("loop_var_0_final", seq_output); // Disable TensorRT on unsupported data type BOOL - 
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider});
 }

 #if !defined(DISABLE_OPTIONAL_TYPE)
diff --git a/onnxruntime/test/providers/cpu/tensor/dynamic_quantize_linear_test.cc b/onnxruntime/test/providers/cpu/tensor/dynamic_quantize_linear_test.cc
index f4d8cad90a714..1a71da6d95135 100644
--- a/onnxruntime/test/providers/cpu/tensor/dynamic_quantize_linear_test.cc
+++ b/onnxruntime/test/providers/cpu/tensor/dynamic_quantize_linear_test.cc
@@ -11,7 +11,8 @@ namespace test {
 // range = [-ve, +ve]
 TEST(QuantizeLinearOpTest, DynamicQuantizeLinear) {
   // TODO: Unskip when fixed #41968513
-  if (DefaultDmlExecutionProvider().get() != nullptr) {
+  if (DefaultDmlExecutionProvider().get() != nullptr ||
+      DefaultOpenVINOExecutionProvider().get() != nullptr) {
     GTEST_SKIP() << "Skipping because of the following error: Expected equality of these values: 26 and 25";
   }
diff --git a/onnxruntime/test/providers/openvino/openvino_ep_bfloat16_pass_test.cc b/onnxruntime/test/providers/openvino/openvino_ep_bfloat16_pass_test.cc
new file mode 100644
index 0000000000000..fc90563a61bb1
--- /dev/null
+++ b/onnxruntime/test/providers/openvino/openvino_ep_bfloat16_pass_test.cc
@@ -0,0 +1,116 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include
+#include
+#include
+
+#include "core/session/onnxruntime_cxx_api.h"
+#include "core/framework/float16.h"
+
+#include "test/util/include/test/test_environment.h"
+#include "test/optimizer/qdq_test_utils.h"
+
+#include "gtest/gtest.h"
+#include "gmock/gmock.h"
+
+using namespace ONNX_NAMESPACE;
+using namespace onnxruntime::logging;
+
+extern std::unique_ptr<Ort::Env> ort_env;
+
+class OVEP_BF16_Tests : public ::testing::TestWithParam<std::string> {};
+
+namespace detail {
+auto ConstructModel() {
+  using namespace onnxruntime;
+  using namespace test;
+
+  std::unordered_map<std::string, int> domain_to_version;
+  domain_to_version[kOnnxDomain] = 19;
+  Model model("Bfloat16Tester", true, ModelMetaData(), PathString(), IOnnxRuntimeOpSchemaRegistryList(),
+              domain_to_version, {}, DefaultLoggingManager().DefaultLogger());
+
+  Graph& graph = model.MainGraph();
+  ModelTestBuilder builder(graph);
+  auto dim = 4;
+  std::vector<float> input_data(dim, 1.0f);
+  auto* input = builder.MakeInput<float>({dim}, input_data);
+  builder.graph_.SetInputs({input});
+
+  auto* cast_to_bf16 = builder.MakeIntermediate();
+  Node& cast_node = builder.AddNode("Cast", {input}, {cast_to_bf16}, "");
+  cast_node.AddAttribute("to", static_cast<int64_t>(ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16));
+
+  std::vector<onnxruntime::BFloat16> weight_data(dim * dim);
+  for (std::size_t i = 0; i < weight_data.size(); ++i)
+    weight_data[i] = onnxruntime::BFloat16(static_cast<float>(i % dim) / dim);
+  auto* weights = builder.MakeInitializer({dim, dim}, weight_data);
+
+  auto* matmul_out = builder.MakeIntermediate();
+  builder.AddNode("MatMul", {cast_to_bf16, weights}, {matmul_out});
+
+  std::vector<onnxruntime::BFloat16> weight_data_2(dim * dim);
+  for (std::size_t i = 0; i < weight_data_2.size(); ++i)
+    weight_data_2[i] = onnxruntime::BFloat16(static_cast<float>(i % dim) / dim);
+  auto* weights_2 = builder.MakeInitializer({dim, dim}, weight_data_2);
+
+  auto* matmul_out_2 = builder.MakeIntermediate();
+  builder.AddNode("MatMul", {matmul_out, weights_2}, {matmul_out_2});
+
+  auto* output = builder.MakeOutput();
+  Node& cast2_node = builder.AddNode("Cast", {matmul_out_2}, {output});
+  cast2_node.AddAttribute("to", static_cast<int64_t>(ONNX_NAMESPACE::TensorProto_DataType_FLOAT));
+
+  builder.SetGraphOutputs();
+  auto st = model.MainGraph().Resolve();
+  if (st != Status::OK())
+    throw std::runtime_error(st.ErrorMessage());
+  return model;
+}
+
+auto ProbeDevice(const std::string& device) {
+  static std::map<std::string, bool> is_present;
+  if (is_present.find(device) == is_present.end()) {
+    Ort::SessionOptions sessionOptions;
+    std::unordered_map<std::string, std::string> ov_options;
+    ov_options["device_type"] = device;
+    try {
+      sessionOptions.AppendExecutionProvider_OpenVINO_V2(ov_options);
+      is_present[device] = true;
+    } catch (...) {
+      is_present[device] = false;
+    }
+  }
+  return is_present[device];
+}
+} // namespace detail
+
+namespace onnxruntime {
+namespace test {
+
+TEST_P(OVEP_BF16_Tests, TestModelConversion) {
+  Ort::SessionOptions sessionOptions;
+  std::unordered_map<std::string, std::string> ov_options;
+  const auto& device = GetParam();
+  if (!::detail::ProbeDevice(device))
+    GTEST_SKIP() << device + " is not available on this machine";
+
+  ov_options["device_type"] = device;
+  auto model = ::detail::ConstructModel();
+  sessionOptions.AppendExecutionProvider_OpenVINO_V2(ov_options);
+
+  std::string model_data;
+  model.ToProto().SerializeToString(&model_data);
+  auto model_data_span = AsByteSpan(model_data.data(), model_data.size());
+  try {
+    Ort::Session session(*ort_env, model_data_span.data(), model_data_span.size(), sessionOptions);
+  } catch (...) {
+    FAIL();
+  }
+}
+INSTANTIATE_TEST_SUITE_P(OVEP_Tests,
+                         OVEP_BF16_Tests,
+                         ::testing::Values("CPU", "GPU", "NPU"));
+} // namespace test
+} // namespace onnxruntime
diff --git a/onnxruntime/test/python/onnxruntime_test_python_ep_compatibility.py b/onnxruntime/test/python/onnxruntime_test_python_ep_compatibility.py
new file mode 100644
index 0000000000000..8e69fdf088103
--- /dev/null
+++ b/onnxruntime/test/python/onnxruntime_test_python_ep_compatibility.py
@@ -0,0 +1,46 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import os
+import platform
+import sys
+import unittest
+
+from onnxruntime.capi.onnxruntime_pybind11_state import (
+    OrtCompiledModelCompatibility,
+    get_ep_devices,
+    get_model_compatibility_for_ep_devices,
+)
+
+# handle change from python 3.8 and on where loading a dll from the current directory needs to be explicitly allowed.
+if platform.system() == "Windows" and sys.version_info.major >= 3 and sys.version_info.minor >= 8:  # noqa: YTT204
+    os.add_dll_directory(os.getcwd())
+
+
+class TestEpCompatibility(unittest.TestCase):
+    def test_invalid_args(self):
+        # empty devices
+        with self.assertRaises(RuntimeError):
+            get_model_compatibility_for_ep_devices([], "info")
+        # None compatibility info should raise TypeError before native call
+        with self.assertRaises(TypeError):
+            get_model_compatibility_for_ep_devices(get_ep_devices(), None)  # type: ignore[arg-type]
+
+    def test_basic_smoke(self):
+        devices = list(get_ep_devices())
+        if not devices:
+            self.skipTest("No EP devices available in this build")
+
+        # Always select CPUExecutionProvider; skip if not present.
+        cpu_devices = [d for d in devices if getattr(d, "ep_name", None) == "CPUExecutionProvider"]
+        if not cpu_devices:
+            self.skipTest("CPUExecutionProvider not available in this build")
+        selected = [cpu_devices[0]]
+
+        # API requires all devices belong to the same EP; we pass only one.
+        status = get_model_compatibility_for_ep_devices(selected, "arbitrary-compat-string")
+        self.assertEqual(status, OrtCompiledModelCompatibility.EP_NOT_APPLICABLE)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py b/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py
new file mode 100644
index 0000000000000..d5c80a4a1f4ba
--- /dev/null
+++ b/onnxruntime/test/python/onnxruntime_test_python_nv_tensorrt_rtx_ep_tests.py
@@ -0,0 +1,468 @@
+# Copyright (c) NVIDIA Corporation. All rights reserved.
+# Licensed under the MIT License.
+from __future__ import annotations
+
+import sys
+import unittest
+from collections.abc import Sequence
+
+import numpy as np
+import torch
+from autoep_helper import AutoEpTestCase
+from helper import get_name
+from numpy.testing import assert_almost_equal
+from onnx import TensorProto, helper
+from onnx.defs import onnx_opset_version
+
+import onnxruntime as onnxrt
+from onnxruntime.capi._pybind_state import OrtDevice as C_OrtDevice
+from onnxruntime.capi._pybind_state import OrtValue as C_OrtValue
+from onnxruntime.capi._pybind_state import OrtValueVector, SessionIOBinding
+
+
+class TestNvTensorRTRTXAutoEP(AutoEpTestCase):
+    """
+    Test suite for the NvTensorRTRTX Execution Provider.
+
+    This class contains tests for registering the NvTensorRTRTX EP,
+    selecting it using different policies, and running inference with various
+    I/O binding configurations.
+    """
+
+    ep_lib_path = "onnxruntime_providers_nv_tensorrt_rtx.dll"
+    ep_name = "NvTensorRTRTXExecutionProvider"
+
+    def setUp(self):
+        if sys.platform != "win32":
+            self.skipTest("Skipping test because device discovery is only supported on Windows")
+        self.register_execution_provider_library(self.ep_name, self.ep_lib_path)
+
+    def tearDown(self):
+        self.unregister_execution_provider_library(self.ep_name)
+
+    def _create_ortvalue_input_on_gpu(self, device):
+        return onnxrt.OrtValue.ortvalue_from_numpy(
+            np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32), device, 0
+        )
+
+    def _create_ortvalue_alternate_input_on_gpu(self, device):
+        return onnxrt.OrtValue.ortvalue_from_numpy(
+            np.array([[2.0, 4.0], [6.0, 8.0], [10.0, 12.0]], dtype=np.float32),
+            device,
+            0,
+        )
+
+    def _create_uninitialized_ortvalue_input_on_gpu(self, device):
+        return onnxrt.OrtValue.ortvalue_from_shape_and_type([3, 2], np.float32, device, 0)
+
+    def _create_numpy_input(self):
+        return np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32)
+
+    def _create_expected_output(self):
+        return np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32)
+
+    def _create_expected_output_alternate(self):
+        return np.array([[2.0, 8.0], [18.0, 32.0], [50.0, 72.0]], dtype=np.float32)
+
+    def torch_to_onnx_type(self, torch_dtype):
+        if torch_dtype == torch.float32:
+            return TensorProto.FLOAT
+        elif torch_dtype == torch.float16:
+            return TensorProto.FLOAT16
+        elif torch_dtype == torch.bfloat16:
+            return TensorProto.BFLOAT16
+        elif torch_dtype == torch.int8:
+            return TensorProto.INT8
+        elif torch_dtype == torch.int32:
+            return TensorProto.INT32
+        elif torch_dtype == torch.int64:
+            return TensorProto.INT64
+        else:
+            raise TypeError(f"Unsupported dtype: {torch_dtype}")
+
+    def test_nv_tensorrt_rtx_ep_register_and_inference(self):
+        """
+        Test registration of NvTensorRTRTX EP, adding its OrtDevice to the SessionOptions, and running inference.
+ """ + ep_devices = onnxrt.get_ep_devices() + nv_tensorrt_rtx_ep_device = next((d for d in ep_devices if d.ep_name == self.ep_name), None) + self.assertIsNotNone(nv_tensorrt_rtx_ep_device) + self.assertEqual(nv_tensorrt_rtx_ep_device.ep_vendor, "NVIDIA") + + hw_device = nv_tensorrt_rtx_ep_device.device + self.assertEqual(hw_device.type, onnxrt.OrtHardwareDeviceType.GPU) + + # Run sample model and check output + sess = onnxrt.InferenceSession(get_name("mul_1.onnx")) + + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + input_name = sess.get_inputs()[0].name + res = sess.run([], {input_name: x}) + output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) + np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + + def test_nv_tensorrt_rtx_ep_prefer_gpu_and_inference(self): + """ + Test selecting NvTensorRTRTX EP via the PREFER_GPU policy and running inference. + """ + # Set a policy to prefer GPU. NvTensorRTRTX should be selected. + sess_options = onnxrt.SessionOptions() + sess_options.set_provider_selection_policy(onnxrt.OrtExecutionProviderDevicePolicy.PREFER_GPU) + self.assertTrue(sess_options.has_providers()) + + # Run sample model and check output + sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), sess_options=sess_options) + + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + input_name = sess.get_inputs()[0].name + res = sess.run([], {input_name: x}) + output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) + np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + + def test_nv_tensorrt_rtx_ep_selection_delegate_and_inference(self): + """ + Test selecting NvTensorRTRTX EP via the custom EP selection delegate function and then run inference. + """ + + # User's custom EP selection function. + def my_delegate( + ep_devices: Sequence[onnxrt.OrtEpDevice], + model_metadata: dict[str, str], + runtime_metadata: dict[str, str], + max_selections: int, + ) -> Sequence[onnxrt.OrtEpDevice]: + self.assertGreater(len(model_metadata), 0) + self.assertGreaterEqual(len(ep_devices), 1) + self.assertGreaterEqual(max_selections, 2) + + nv_tensorrt_rtx_ep_device = next((d for d in ep_devices if d.ep_name == self.ep_name), None) + self.assertIsNotNone(nv_tensorrt_rtx_ep_device) + + # Select the NvTensorRTRTX device + return [nv_tensorrt_rtx_ep_device] + + sess_options = onnxrt.SessionOptions() + sess_options.set_provider_selection_policy_delegate(my_delegate) + self.assertTrue(sess_options.has_providers()) + + # Run sample model and check output + sess = onnxrt.InferenceSession(get_name("mul_1.onnx"), sess_options=sess_options) + + x = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]], dtype=np.float32) + input_name = sess.get_inputs()[0].name + res = sess.run([], {input_name: x}) + output_expected = np.array([[1.0, 4.0], [9.0, 16.0], [25.0, 36.0]], dtype=np.float32) + np.testing.assert_allclose(output_expected, res[0], rtol=1e-05, atol=1e-08) + + def test_bind_input_only(self): + """ + Test I/O binding with input data only. + """ + # Set a policy to prefer GPU. NvTensorRTRTX should be selected. 
+ sess_options = onnxrt.SessionOptions() + sess_options.set_provider_selection_policy(onnxrt.OrtExecutionProviderDevicePolicy.PREFER_GPU) + self.assertTrue(sess_options.has_providers()) + + input = self._create_ortvalue_input_on_gpu("cuda") + + session = onnxrt.InferenceSession(get_name("mul_1.onnx"), sess_options=sess_options) + io_binding = session.io_binding() + + # Bind input to the GPU + io_binding.bind_input("X", "cuda", 0, np.float32, [3, 2], input.data_ptr()) + + # Sync if different streams + io_binding.synchronize_inputs() + + # Bind output to CPU + io_binding.bind_output("Y") + + # Invoke Run + session.run_with_iobinding(io_binding) + + # Sync if different streams + io_binding.synchronize_outputs() + + # Get outputs over to CPU (the outputs which were bound to the GPU will get copied over to the host + # here) + ort_output = io_binding.copy_outputs_to_cpu()[0] + + # Validate results + self.assertTrue(np.array_equal(self._create_expected_output(), ort_output)) + + def test_bind_input_and_bind_output_with_ortvalues(self): + """ + Test I/O binding with OrtValues for both input and output. + """ + # Set a policy to prefer GPU. NvTensorRTRTX EP should be selected. + sess_options = onnxrt.SessionOptions() + sess_options.set_provider_selection_policy(onnxrt.OrtExecutionProviderDevicePolicy.PREFER_GPU) + self.assertTrue(sess_options.has_providers()) + + session = onnxrt.InferenceSession(get_name("mul_1.onnx"), sess_options=sess_options) + io_binding = session.io_binding() + + # Bind ortvalue as input + input_ortvalue = self._create_ortvalue_input_on_gpu("cuda") + io_binding.bind_ortvalue_input("X", input_ortvalue) + + # Bind ortvalue as output + output_ortvalue = self._create_uninitialized_ortvalue_input_on_gpu("cuda") + io_binding.bind_ortvalue_output("Y", output_ortvalue) + + # Sync if different streams + io_binding.synchronize_inputs() + + # Invoke Run + session.run_with_iobinding(io_binding) + + # Sync if different streams + io_binding.synchronize_outputs() + + # Inspect contents of output_ortvalue and make sure that it has the right contents + self.assertTrue(np.array_equal(self._create_expected_output(), output_ortvalue.numpy())) + + # Bind another ortvalue as input + input_ortvalue_2 = self._create_ortvalue_alternate_input_on_gpu("cuda") + io_binding.bind_ortvalue_input("X", input_ortvalue_2) + + # Sync if different streams + io_binding.synchronize_inputs() + + # Invoke Run + session.run_with_iobinding(io_binding) + + # Sync if different streams + io_binding.synchronize_outputs() + + # Inspect contents of output_ortvalue and make sure that it has the right contents + self.assertTrue(np.array_equal(self._create_expected_output_alternate(), output_ortvalue.numpy())) + + def test_bind_input_and_non_preallocated_output(self): + """ + Test I/O binding with non-preallocated output. 
+ """ + sess_options = onnxrt.SessionOptions() + sess_options.set_provider_selection_policy(onnxrt.OrtExecutionProviderDevicePolicy.PREFER_GPU) + self.assertTrue(sess_options.has_providers()) + + session = onnxrt.InferenceSession(get_name("mul_1.onnx"), sess_options=sess_options) + io_binding = session.io_binding() + + input = self._create_ortvalue_input_on_gpu("cuda") + + # Bind input to the GPU + io_binding.bind_input("X", "cuda", 0, np.float32, [3, 2], input.data_ptr()) + + # Bind output to the GPU + io_binding.bind_output("Y", "cuda") + + # Sync if different streams + io_binding.synchronize_inputs() + + # Invoke Run + session.run_with_iobinding(io_binding) + + # Sync if different streams + io_binding.synchronize_outputs() + + # This call returns an OrtValue which has data allocated by ORT on the GPU + ort_outputs = io_binding.get_outputs() + self.assertEqual(len(ort_outputs), 1) + self.assertEqual(ort_outputs[0].device_name(), "cuda") + # Validate results (by copying results to CPU by creating a Numpy object) + self.assertTrue(np.array_equal(self._create_expected_output(), ort_outputs[0].numpy())) + + # We should be able to repeat the above process as many times as we want - try once more + ort_outputs = io_binding.get_outputs() + self.assertEqual(len(ort_outputs), 1) + self.assertEqual(ort_outputs[0].device_name(), "cuda") + # Validate results (by copying results to CPU by creating a Numpy object) + self.assertTrue(np.array_equal(self._create_expected_output(), ort_outputs[0].numpy())) + + input = self._create_ortvalue_alternate_input_on_gpu("cuda") + + # Change the bound input and validate the results in the same bound OrtValue + # Bind alternate input to the GPU + io_binding.bind_input( + "X", + "cuda", + 0, + np.float32, + [3, 2], + input.data_ptr(), + ) + + # Sync if different streams + io_binding.synchronize_inputs() + + # Invoke Run + session.run_with_iobinding(io_binding) + + # Sync if different streams + io_binding.synchronize_outputs() + + # This call returns an OrtValue which has data allocated by ORT on the GPU + ort_outputs = io_binding.get_outputs() + self.assertEqual(len(ort_outputs), 1) + self.assertEqual(ort_outputs[0].device_name(), "cuda") + # Validate results (by copying results to CPU by creating a Numpy object) + self.assertTrue(np.array_equal(self._create_expected_output_alternate(), ort_outputs[0].numpy())) + + def test_bind_input_and_preallocated_output(self): + """ + Test I/O binding with preallocated output. 
+ """ + sess_options = onnxrt.SessionOptions() + sess_options.set_provider_selection_policy(onnxrt.OrtExecutionProviderDevicePolicy.PREFER_GPU) + self.assertTrue(sess_options.has_providers()) + + input = self._create_ortvalue_input_on_gpu("cuda") + + session = onnxrt.InferenceSession(get_name("mul_1.onnx"), sess_options=sess_options) + io_binding = session.io_binding() + + # Bind input to the GPU + io_binding.bind_input("X", "cuda", 0, np.float32, [3, 2], input.data_ptr()) + + # Bind output to the GPU + output = self._create_uninitialized_ortvalue_input_on_gpu("cuda") + io_binding.bind_output("Y", "cuda", 0, np.float32, [3, 2], output.data_ptr()) + + # Sync if different streams + io_binding.synchronize_inputs() + + # Invoke Run + session.run_with_iobinding(io_binding) + + # Sync if different streams + io_binding.synchronize_outputs() + + # Get outputs over to CPU (the outputs which were bound to the GPU will get copied over to the host + # here) + ort_output_vals = io_binding.copy_outputs_to_cpu()[0] + # Validate results + self.assertTrue(np.array_equal(self._create_expected_output(), ort_output_vals)) + + # Validate if ORT actually wrote to pre-allocated buffer by copying the allocated buffer + # to the host and validating its contents + ort_output_vals_in_cpu = output.numpy() + # Validate results + self.assertTrue(np.array_equal(self._create_expected_output(), ort_output_vals_in_cpu)) + + def test_bind_input_types(self): + """ + Test I/O binding with various input data types. + """ + sess_options = onnxrt.SessionOptions() + sess_options.set_provider_selection_policy(onnxrt.OrtExecutionProviderDevicePolicy.PREFER_GPU) + self.assertTrue(sess_options.has_providers()) + opset = onnx_opset_version() + device = C_OrtDevice(C_OrtDevice.cuda(), C_OrtDevice.default_memory(), 0) + + for dtype in [ + np.float32, + # np.float64, + np.int32, + # np.uint32, + np.int64, + # np.uint64, + # np.int16, + # np.uint16, + # np.int8, + np.uint8, + np.float16, + np.bool_, + ]: + with self.subTest(dtype=dtype, inner_device=str(device)): + x = np.arange(8).reshape((-1, 2)).astype(dtype) + proto_dtype = helper.np_dtype_to_tensor_dtype(x.dtype) + + X = helper.make_tensor_value_info("X", proto_dtype, [None, x.shape[1]]) # noqa: N806 + Y = helper.make_tensor_value_info("Y", proto_dtype, [None, x.shape[1]]) # noqa: N806 + + # inference + node_add = helper.make_node("Identity", ["X"], ["Y"]) + + # graph + graph_def = helper.make_graph([node_add], "lr", [X], [Y], []) + model_def = helper.make_model( + graph_def, + producer_name="dummy", + ir_version=7, + producer_version="0", + opset_imports=[helper.make_operatorsetid("", opset)], + ) + + sess = onnxrt.InferenceSession(model_def.SerializeToString(), sess_options=sess_options) + + bind = SessionIOBinding(sess._sess) + ort_value = C_OrtValue.ortvalue_from_numpy(x, device) + bind.bind_ortvalue_input("X", ort_value) + bind.bind_output("Y", device) + sess._sess.run_with_iobinding(bind, None) + ortvaluevector = bind.get_outputs() + self.assertIsInstance(ortvaluevector, OrtValueVector) + ortvalue = bind.get_outputs()[0] + y = ortvalue.numpy() + assert_almost_equal(x, y) + + bind = SessionIOBinding(sess._sess) + bind.bind_input("X", device, dtype, x.shape, ort_value.data_ptr()) + bind.bind_output("Y", device) + sess._sess.run_with_iobinding(bind, None) + ortvalue = bind.get_outputs()[0] + y = ortvalue.numpy() + assert_almost_equal(x, y) + + def test_bind_onnx_types_from_torch(self): + """ + Test I/O binding with various input data types. 
+ """ + sess_options = onnxrt.SessionOptions() + sess_options.set_provider_selection_policy(onnxrt.OrtExecutionProviderDevicePolicy.PREFER_GPU) + self.assertTrue(sess_options.has_providers()) + opset = onnx_opset_version() + + for dtype in [ + torch.float32, + torch.float16, + torch.bfloat16, + torch.int32, + torch.int64, + ]: + with self.subTest(dtype=dtype): + proto_dtype = self.torch_to_onnx_type(dtype) + + x_ = helper.make_tensor_value_info("X", proto_dtype, [None]) + y_ = helper.make_tensor_value_info("Y", proto_dtype, [None]) + node_add = helper.make_node("Identity", ["X"], ["Y"]) + graph_def = helper.make_graph([node_add], "lr", [x_], [y_], []) + model_def = helper.make_model( + graph_def, + producer_name="dummy", + ir_version=10, + producer_version="0", + opset_imports=[helper.make_operatorsetid("", opset)], + ) + sess = onnxrt.InferenceSession(model_def.SerializeToString(), sess_options=sess_options) + + dev = "cuda" if torch.cuda.is_available() else "cpu" + device = ( + C_OrtDevice(C_OrtDevice.cuda(), C_OrtDevice.default_memory(), 0) + if dev == "cuda" + else C_OrtDevice(C_OrtDevice.cpu(), C_OrtDevice.default_memory(), 0) + ) + + x = torch.arange(8, dtype=dtype, device=dev) + y = torch.empty(8, dtype=dtype, device=dev) + + bind = SessionIOBinding(sess._sess) + bind.bind_input("X", device, proto_dtype, x.shape, x.data_ptr()) + bind.bind_output("Y", device, proto_dtype, y.shape, y.data_ptr()) + sess._sess.run_with_iobinding(bind, None) + self.assertTrue(torch.equal(x, y)) + + +if __name__ == "__main__": + unittest.main(verbosity=1) diff --git a/tools/ci_build/github/windows/extract_nuget_files.ps1 b/tools/ci_build/github/windows/extract_nuget_files.ps1 index ff8f63a85b97a..20d6c1f2b63a5 100644 --- a/tools/ci_build/github/windows/extract_nuget_files.ps1 +++ b/tools/ci_build/github/windows/extract_nuget_files.ps1 @@ -1,105 +1,119 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. -# This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline +# This file is used by Zip-Nuget-Java Packaging Pipeline -# Re-construct a build directory that contains binaries from all the different platforms we're including -# in the native ORT nuget package +# Define the directory for NuGet artifacts. $nuget_artifacts_dir = "$Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo\nuget-artifacts" -New-Item -Path $nuget_artifacts_dir -ItemType directory +# Create the directory if it doesn't exist. +New-Item -Path $nuget_artifacts_dir -ItemType directory -ErrorAction SilentlyContinue ## .zip files -# unzip directly -# exclude the iOS xcframework as we need to leave that zipped up to preserve symlinks -Get-ChildItem -Path $Env:BUILD_BINARIESDIRECTORY\nuget-artifact\* -Include *.zip -Exclude onnxruntime_ios_xcframework.*.zip | +# Unzip files directly, excluding the iOS xcframework to preserve its symlinks. +Get-ChildItem -Path "$Env:BUILD_BINARIESDIRECTORY\nuget-artifact\*" -Include *.zip -Exclude onnxruntime_ios_xcframework.*.zip | Foreach-Object { - $cmd = "7z.exe x $($_.FullName) -y -o$nuget_artifacts_dir" - Write-Output $cmd - Invoke-Expression -Command $cmd + # The -snld20 flag is used to bypass security checks for creating symbolic links (added in 7-Zip 25.01). + $arguments = "x", "$($_.FullName)", "-y", "-o$nuget_artifacts_dir", "-snld20" + Write-Output "Executing: 7z.exe $arguments" + # Directly call 7z.exe using the call operator '&' + & 7z.exe $arguments + # Check the exit code of the last command. 
A non-zero code indicates an error. + if ($LASTEXITCODE -ne 0) { + throw "Error extracting '$($_.FullName)'. Exit code: $LASTEXITCODE" + } } ## .tgz files -# first extract the tar file from the tgz -Get-ChildItem $Env:BUILD_BINARIESDIRECTORY\nuget-artifact -Filter *.tgz | +# First, extract the .tar file from the .tgz archive. +Get-ChildItem "$Env:BUILD_BINARIESDIRECTORY\nuget-artifact" -Filter *.tgz | Foreach-Object { - $cmd = "7z.exe x $($_.FullName) -y -o$Env:BUILD_BINARIESDIRECTORY\nuget-artifact" - Write-Output $cmd - Invoke-Expression -Command $cmd + # The -snld20 flag is used to bypass security checks for creating symbolic links (added in 7-Zip 25.01). + $arguments = "x", "$($_.FullName)", "-y", "-o$Env:BUILD_BINARIESDIRECTORY\nuget-artifact", "-snld20" + Write-Output "Executing: 7z.exe $arguments" + & 7z.exe $arguments + if ($LASTEXITCODE -ne 0) { + throw "Error extracting '$($_.FullName)'. Exit code: $LASTEXITCODE" + } } -# now extract the actual folder structure from the tar file to the build dir -Get-ChildItem $Env:BUILD_BINARIESDIRECTORY\nuget-artifact -Filter *.tar | +# Now, extract the contents from the .tar file into the final directory. +Get-ChildItem "$Env:BUILD_BINARIESDIRECTORY\nuget-artifact" -Filter *.tar | Foreach-Object { - $cmd = "7z.exe x $($_.FullName) -y -o$nuget_artifacts_dir" - Write-Output $cmd - Invoke-Expression -Command $cmd + # The -snld20 flag is used to bypass security checks for creating symbolic links (added in 7-Zip 25.01). + $arguments = "x", "$($_.FullName)", "-y", "-o$nuget_artifacts_dir", "-snld20" + Write-Output "Executing: 7z.exe $arguments" + & 7z.exe $arguments + if ($LASTEXITCODE -ne 0) { + throw "Error extracting '$($_.FullName)'. Exit code: $LASTEXITCODE" + } } -# process iOS xcframework -$xcframeworks = Get-ChildItem $Env:BUILD_BINARIESDIRECTORY\nuget-artifact -Filter onnxruntime_ios_xcframework.*.zip +# Process iOS xcframework +$xcframeworks = Get-ChildItem "$Env:BUILD_BINARIESDIRECTORY\nuget-artifact" -Filter onnxruntime_ios_xcframework.*.zip if ($xcframeworks.Count -eq 1) { - $xcframework = $xcframeworks[0] - $target_dir = "$nuget_artifacts_dir\onnxruntime-ios-xcframework" - # remove version info from filename and use required filename format - $target_file = "$target_dir\onnxruntime.xcframework.zip" - New-Item -Path $target_dir -ItemType directory + $xcframework = $xcframeworks[0] + $target_dir = "$nuget_artifacts_dir\onnxruntime-ios-xcframework" + # Use the required filename format, removing version info. + $target_file = "$target_dir\onnxruntime.xcframework.zip" + New-Item -Path $target_dir -ItemType directory -ErrorAction SilentlyContinue - Write-Output "Copy-Item $($xcframework.FullName) $target_file" - Copy-Item $xcframework.FullName $target_file + Write-Output "Copying $($xcframework.FullName) to $target_file" + Copy-Item $xcframework.FullName $target_file } elseif ($xcframeworks.Count -gt 1) { - Write-Error "Expected at most one onnxruntime_ios_xcframework*.zip file but got: [$xcframeworks]" + Write-Error "Expected at most one onnxruntime_ios_xcframework*.zip file but got: [$xcframeworks]" } - -# copy android AAR. -# for full build of onnxruntime Android AAR, there should only be one .aar file -# called onnxruntime-android-x.y.z.aar or onnxruntime-training-android-x.y.z.aar but sanity check that -$aars = Get-ChildItem $Env:BUILD_BINARIESDIRECTORY\nuget-artifact -Filter *.aar +# Copy Android AAR file. +# There should only be one .aar file for a full build. 
+$aars = Get-ChildItem "$Env:BUILD_BINARIESDIRECTORY\nuget-artifact" -Filter *.aar if ($aars.Count -eq 1) { - $aar = $aars[0] - $aar_prefix = "onnxruntime" - if ($aar -like "onnxruntime-training*") { - $aar_prefix = "onnxruntime-training" - } - $target_dir = "$nuget_artifacts_dir\$aar_prefix-android-aar" - $target_file = "$target_dir\onnxruntime.aar" # remove '-mobile' and version info from filename - New-Item -Path $target_dir -ItemType directory + $aar = $aars[0] + $aar_prefix = "onnxruntime" + if ($aar.Name -like "onnxruntime-training*") { + $aar_prefix = "onnxruntime-training" + } + $target_dir = "$nuget_artifacts_dir\$aar_prefix-android-aar" + # Remove version info from the filename for consistency. + $target_file = "$target_dir\onnxruntime.aar" + New-Item -Path $target_dir -ItemType directory -ErrorAction SilentlyContinue - Write-Output "Copy-Item $($aar.FullName) $target_file" - Copy-Item $aar.FullName $target_file + Write-Output "Copying $($aar.FullName) to $target_file" + Copy-Item $aar.FullName $target_file } elseif ($aars.Count -gt 1) { - Write-Error "Expected at most one Android .aar file but got: [$aars]" + Write-Error "Expected at most one Android .aar file but got: [$aars]" } -# Check whether this is a training pipeline -$is_training_pipeline = $false -if (Test-Path -Path $nuget_artifacts_dir\onnxruntime-training-win-x64-*) { - $is_training_pipeline = $true - Write-Output "onnxruntime-training-win-x64-* dir exists. This is a training pipeline." +# Check if this is a training pipeline by looking for a specific directory. +$is_training_pipeline = Test-Path -Path "$nuget_artifacts_dir\onnxruntime-training-win-x64-*" +if ($is_training_pipeline) { + Write-Output "onnxruntime-training-win-x64-* dir exists. This is a training pipeline." } -# Copy onnxruntime and protoc binaries to the binaries dir as these are required -# by Microsoft.ML.OnnxRuntime.Tests.NetCoreApp +# Copy onnxruntime and protoc binaries required by tests. +$destinationDir = "$Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo" if ($is_training_pipeline) { - Copy-Item -Path $nuget_artifacts_dir\onnxruntime-training-win-x64-*\lib\* -Destination $Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo + Copy-Item -Path "$nuget_artifacts_dir\onnxruntime-training-win-x64-*\lib\*" -Destination $destinationDir -Recurse } else { - Copy-Item -Path $nuget_artifacts_dir\onnxruntime-win-x64-*\lib\* -Destination $Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo + Copy-Item -Path "$nuget_artifacts_dir\onnxruntime-win-x64-*\lib\*" -Destination $destinationDir -Recurse } -"Get-ChildItem -Directory -Path $nuget_artifacts_dir\onnxruntime-*" -$ort_dirs = Get-ChildItem -Directory -Path $nuget_artifacts_dir\onnxruntime-* -foreach ($ort_dir in $ort_dirs) -{ - # remove the last '-xxx' segment from the dir name. typically that's the architecture. - $dirname = Split-Path -Path $ort_dir -Leaf - $dirname = $dirname.SubString(0,$dirname.LastIndexOf('-')) - Write-Output "Renaming $ort_dir to $dirname" - Rename-Item -Path $ort_dir -NewName $nuget_artifacts_dir\$dirname +# Rename directories to remove the architecture-specific suffix. +Write-Output "Renaming onnxruntime directories..." +Get-ChildItem -Directory -Path "$nuget_artifacts_dir\onnxruntime-*" | ForEach-Object { + $dirname = $_.Name + # Find the last hyphen and remove the suffix. 
+ $lastHyphenIndex = $dirname.LastIndexOf('-') + if ($lastHyphenIndex -gt -1) { + $newName = $dirname.Substring(0, $lastHyphenIndex) + $newPath = Join-Path -Path $_.Parent.FullName -ChildPath $newName + Write-Output "Renaming '$($_.FullName)' to '$newPath'" + Rename-Item -Path $_.FullName -NewName $newName + } } -# List artifacts -"Post copy artifacts" -Get-ChildItem -Recurse $nuget_artifacts_dir\ +# List the final artifacts. +Write-Output "Post-copy artifacts:" +Get-ChildItem -Recurse $nuget_artifacts_dir \ No newline at end of file diff --git a/tools/ci_build/github/windows/extract_nuget_files_gpu.ps1 b/tools/ci_build/github/windows/extract_nuget_files_gpu.ps1 index 01a8eebe75df2..29946dcb73f8a 100644 --- a/tools/ci_build/github/windows/extract_nuget_files_gpu.ps1 +++ b/tools/ci_build/github/windows/extract_nuget_files_gpu.ps1 @@ -2,47 +2,81 @@ # Licensed under the MIT License. # This file is used by Zip-Nuget-Java Packaging Pipeline -New-Item -Path $Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo\nuget-artifacts -ItemType directory +# Define the directory for NuGet artifacts. +$nuget_artifacts_dir = "$Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo\nuget-artifacts" +# Create the directory if it doesn't exist. +New-Item -Path $nuget_artifacts_dir -ItemType directory -ErrorAction SilentlyContinue -Get-ChildItem $Env:BUILD_BINARIESDIRECTORY\nuget-artifact -Filter *.zip | +## .zip files +Get-ChildItem "$Env:BUILD_BINARIESDIRECTORY\nuget-artifact" -Filter *.zip | Foreach-Object { - $cmd = "7z.exe x $($_.FullName) -y -o$Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo\nuget-artifacts" - Write-Output $cmd - Invoke-Expression -Command $cmd + # The -snld20 flag is used to bypass security checks for creating symbolic links (added in 7-Zip 25.01). + $arguments = "x", "$($_.FullName)", "-y", "-o$nuget_artifacts_dir", "-snld20" + Write-Output "Executing: 7z.exe $arguments" + & 7z.exe $arguments + if ($LASTEXITCODE -ne 0) { + throw "Error extracting '$($_.FullName)'. Exit code: $LASTEXITCODE" + } } -Get-ChildItem $Env:BUILD_BINARIESDIRECTORY\nuget-artifact -Filter *.tgz | +## .tgz files +Get-ChildItem "$Env:BUILD_BINARIESDIRECTORY\nuget-artifact" -Filter *.tgz | Foreach-Object { - $cmd = "7z.exe x $($_.FullName) -y -o$Env:BUILD_BINARIESDIRECTORY\nuget-artifact" # *.tar will be created after *.tgz is extracted - Write-Output $cmd - Invoke-Expression -Command $cmd + # The -snld20 flag is used to bypass security checks for creating symbolic links (added in 7-Zip 25.01). + # *.tar will be created after *.tgz is extracted + $arguments = "x", "$($_.FullName)", "-y", "-o$Env:BUILD_BINARIESDIRECTORY\nuget-artifact", "-snld20" + Write-Output "Executing: 7z.exe $arguments" + & 7z.exe $arguments + if ($LASTEXITCODE -ne 0) { + throw "Error extracting '$($_.FullName)'. Exit code: $LASTEXITCODE" + } } -Get-ChildItem $Env:BUILD_BINARIESDIRECTORY\nuget-artifact -Filter *.tar | +## .tar files +Get-ChildItem "$Env:BUILD_BINARIESDIRECTORY\nuget-artifact" -Filter *.tar | Foreach-Object { - $cmd = "7z.exe x $($_.FullName) -y -o$Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo\nuget-artifacts" - Write-Output $cmd - Invoke-Expression -Command $cmd + # The -snld20 flag is used to bypass security checks for creating symbolic links (added in 7-Zip 25.01). + $arguments = "x", "$($_.FullName)", "-y", "-o$nuget_artifacts_dir", "-snld20" + Write-Output "Executing: 7z.exe $arguments" + & 7z.exe $arguments + if ($LASTEXITCODE -ne 0) { + throw "Error extracting '$($_.FullName)'. 
Exit code: $LASTEXITCODE" + } } +# Create directory for protobuf build dependencies. +New-Item -Path "$Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\_deps\protobuf-build\RelWithDebInfo" -ItemType directory -ErrorAction SilentlyContinue -New-Item -Path $Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\_deps\protobuf-build\RelWithDebInfo -ItemType directory - -Copy-Item -Path $Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo\nuget-artifacts\onnxruntime-win-x64-cuda-*\lib\* -Destination $Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo +# Copy CUDA libraries. +Copy-Item -Path "$Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo\nuget-artifacts\onnxruntime-win-x64-cuda-*\lib\*" -Destination "$Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo" +# Install protoc via dotnet. $protocInstallDir = "$Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\_deps\protobuf-build" dotnet new console dotnet add package Google.Protobuf.Tools --version 3.21.12 --package-directory $protocInstallDir +if ($LASTEXITCODE -ne 0) { + throw "Error adding Google.Protobuf.Tools package. Exit code: $LASTEXITCODE" +} + +# Find and copy the protoc executable. $protocDir = Get-ChildItem -Path $protocInstallDir -Recurse -Filter "protoc.exe" | Select-Object -ExpandProperty DirectoryName -First 1 -Write-Output $protocDir -Copy-Item -Path $protocDir -Destination $Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\_deps\protobuf-build\RelWithDebInfo - -$ort_dirs = Get-ChildItem -Path $Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo\nuget-artifacts\onnxruntime-* -Directory -foreach ($ort_dir in $ort_dirs) -{ - $dirname = Split-Path -Path $ort_dir -Leaf - $dirname = $dirname.SubString(0,$dirname.LastIndexOf('-')) - Write-Output "Renaming $ort_dir to $dirname" - Rename-Item -Path $ort_dir -NewName $Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo\nuget-artifacts\$dirname +if ($protocDir) { + Write-Output "Found protoc directory: $protocDir" + Copy-Item -Path $protocDir -Destination "$Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\_deps\protobuf-build\RelWithDebInfo" +} +else { + Write-Error "Could not find protoc.exe in $protocInstallDir" } +# Rename onnxruntime directories to a generic format. +$ort_dirs = Get-ChildItem -Path "$Env:BUILD_BINARIESDIRECTORY\RelWithDebInfo\RelWithDebInfo\nuget-artifacts\onnxruntime-*" -Directory +foreach ($ort_dir in $ort_dirs) { + $dirname = Split-Path -Path $ort_dir -Leaf + $lastHyphenIndex = $dirname.LastIndexOf('-') + if ($lastHyphenIndex -gt -1) { + $newName = $dirname.Substring(0, $lastHyphenIndex) + $newPath = Join-Path -Path $ort_dir.Parent.FullName -ChildPath $newName + Write-Output "Renaming '$($ort_dir.FullName)' to '$newPath'" + Rename-Item -Path $ort_dir.FullName -NewName $newName + } +}
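
As a quick illustration of the Python surface exercised by onnxruntime_test_python_ep_compatibility.py above, the sketch below queries the available EP devices and validates a stored compatibility string for one of them. This is a minimal sketch only, not part of the change: it imports from the internal pybind module exactly as the test does, the "stored-compat-info" string is a placeholder for whatever was saved alongside a compiled model, and the status returned for other EPs or devices may differ.

# Minimal sketch (assumptions noted above): check a stored compatibility string
# against the first CPUExecutionProvider device, mirroring the new unit test.
from onnxruntime.capi.onnxruntime_pybind11_state import (
    OrtCompiledModelCompatibility,
    get_ep_devices,
    get_model_compatibility_for_ep_devices,
)

cpu_devices = [d for d in get_ep_devices() if getattr(d, "ep_name", None) == "CPUExecutionProvider"]
if cpu_devices:
    # An empty device list raises RuntimeError, so guard on availability first.
    status = get_model_compatibility_for_ep_devices(cpu_devices[:1], "stored-compat-info")
    if status == OrtCompiledModelCompatibility.EP_NOT_APPLICABLE:
        # The selected EP does not use compiled-model compatibility information.
        print("Compatibility check not applicable for this EP")
    else:
        print(f"Compatibility status: {status}")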