Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion include/onnxruntime/core/graph/graph.h
Original file line number Diff line number Diff line change
Expand Up @@ -740,7 +740,7 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi
common::Status InjectExternalInitializedTensors(const InlinedHashMap<std::string, OrtValue>& external_initializers);

/** This function takes externally provided files in memory for initializers with external
* data and replaces graph initializers with its content.
* data and replaces main graph initializers with its content.
*/
common::Status InjectExternalInitializersFromFilesInMemory(
const InlinedHashMap<PathString, std::pair<char*, size_t>>& external_initializer_files);
Expand Down
6 changes: 4 additions & 2 deletions onnxruntime/core/framework/session_state_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,7 @@ common::Status SaveInitializedTensors(
// TODO: if the tensor need be copied, does it have enough room?
ORT_RETURN_IF_ERROR(planner.GetPreallocatedBuffer(ort_value_index, name, memory_buffer, alloc));

// ??? Should we ignore this session option if the EP is explictly providing the read only allocator?
// ??? Should we ignore this session option if the EP is explicitly providing the read only allocator?
// bool have_readonly_initializer_allocator = alloc->Info().alloc_type == OrtReadOnlyAllocator;
const bool use_device_allocator_for_initializers =
session_options.config_options.GetConfigOrDefault(
Expand All @@ -402,9 +402,11 @@ common::Status SaveInitializedTensors(
std::move(tensor), ort_value));
}
} else {
// if in memory we were expecting to find it above.
ORT_ENFORCE(!utils::HasExternalDataInMemory(tensor_proto));

// We need to deserialize the tensor proto into an OrtValue
// using the preallocated buffer or allocator.

Status st = DeserializeTensorProto(env, graph_loc, tensor_proto,
(memory_buffer.has_value()) ? &*memory_buffer : nullptr,
alloc, default_cpu_alloc, ort_value, data_transfer_mgr,
Expand Down
110 changes: 52 additions & 58 deletions onnxruntime/core/graph/graph.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1226,26 +1226,6 @@ Graph::Graph(const Model& owning_model,
ArgNameToTypeMap name_to_type_map;
const auto& model_path = ModelPath();

// If the tensor proto data is large enough, externalize it and replace with a tensor_proto
// with external data reference pointing to an OrtValue, otherwise do nothing.
auto put_data_maybe_in_memory = [this, &model_path](ONNX_NAMESPACE::TensorProto& tensor_proto) {
size_t size_in_bytes = 0;
ORT_THROW_IF_ERROR(utils::GetSizeInBytesFromTensorProto<0>(tensor_proto, &size_in_bytes));
if (size_in_bytes > utils::kSmallTensorExternalDataThreshold) {
OrtValue ort_value;
ORT_THROW_IF_ERROR(utils::TensorProtoToOrtValue(Env::Default(), model_path, tensor_proto,
CPUAllocator::DefaultInstance(), ort_value));
constexpr const bool use_tensor_buffer_true = true;
auto tensor_proto_to_add = utils::TensorToTensorProto(ort_value.Get<Tensor>(), tensor_proto.name(),
use_tensor_buffer_true);
assert(ort_value.IsAllocated());
auto ins_result = ortvalue_initializers_.insert_or_assign(tensor_proto_to_add.name(), std::move(ort_value));
ORT_ENFORCE(ins_result.second, "Unexpected duplicate insert or assign OrtValue for tensor: ", tensor_proto_to_add.name(),
" in the initializer list.");
tensor_proto = std::move(tensor_proto_to_add);
}
};

// Process 'Constant' nodes
// Put the 'TensorProto' stored in the 'Constant' nodes attribute into the graphs initializer list
for (auto& node : graph_proto_->node()) {
Expand All @@ -1265,8 +1245,6 @@ Graph::Graph(const Model& owning_model,
}
}

put_data_maybe_in_memory(*tensor);

// Ensure initializers are also graph inputs.
if (ir_version_ < 4) {
TypeProto t{utils::TypeProtoFromTensorProto(*tensor)};
Expand Down Expand Up @@ -1343,22 +1321,7 @@ Graph::Graph(const Model& owning_model,
}

// Copy initial tensors to a map.
for (int i = 0, lim = graph_proto_->initializer_size(); i < lim; ++i) {
auto& tensor = *graph_proto_->mutable_initializer(i);
// If data is on disk, it will be loaded either by optimizers
// or during session state finalization.
// If data is already in memory, do nothing.
if (!utils::HasExternalData(tensor)) {
const bool is_sparse = sparse_tensor_names_.count(tensor.name());
if (is_sparse) {
sparse_tensor_names_.erase(tensor.name());
}
put_data_maybe_in_memory(tensor);
if (is_sparse) {
sparse_tensor_names_.emplace(tensor.name());
}
}

for (auto& tensor : graph_proto_->initializer()) {
auto p = name_to_initial_tensor_.emplace(tensor.name(), &tensor);
if (!p.second) {
LOGS(logger_, WARNING) << "Duplicate initializer (dense, sparse or ConstantNode): '" << tensor.name()
Expand Down Expand Up @@ -3420,7 +3383,32 @@ Status Graph::Resolve(const ResolveOptions& options) {

ORT_RETURN_IF_ERROR(ForThisAndAllSubgraphs(all_subgraphs, finalize_func));

return Status::OK();
auto put_weights_maybe_in_memory_func = [&](Graph& graph) -> Status {
// if we have any initializers that are not in memory, put them there.
const auto& model_path = graph.ModelPath();
auto& graph_proto = *graph.graph_proto_;
for (int i = 0, lim = graph_proto.initializer_size(); i < lim; ++i) {
auto& tensor_proto = *graph_proto.mutable_initializer(i);
if (utils::HasExternalData(tensor_proto)) {
continue; // ignore data on disk, that will be loaded either by EP or at session_state finalize
}

size_t size_in_bytes = 0;
ORT_RETURN_IF_ERROR(utils::GetSizeInBytesFromTensorProto<0>(tensor_proto, &size_in_bytes));
if (size_in_bytes > utils::kSmallTensorExternalDataThreshold) {
OrtValue ort_value;
ORT_RETURN_IF_ERROR(utils::TensorProtoToOrtValue(Env::Default(), model_path, tensor_proto,
CPUAllocator::DefaultInstance(), ort_value));
constexpr const bool use_tensor_buffer_true = true;
auto tensor_proto_to_add = utils::TensorToTensorProto(ort_value.Get<Tensor>(), tensor_proto.name(),
use_tensor_buffer_true);
ORT_RETURN_IF_ERROR(graph.ReplaceInitializedTensor(tensor_proto_to_add, ort_value));
}
}
return Status::OK();
};

return ForThisAndAllSubgraphs(all_subgraphs, put_weights_maybe_in_memory_func);
}

void Graph::SetName(const std::string& name) {
Expand Down Expand Up @@ -3659,6 +3647,15 @@ Status Graph::ReplaceInitializedTensorImpl(ONNX_NAMESPACE::TensorProto new_initi
ORT_RETURN_IF_NOT(old_initializer.data_type() == new_initializer.data_type(),
"Replacement tensor's data type does not match.");

bool is_sparse = false;
{
auto sparse_tensor_it = sparse_tensor_names_.find(initializer_name);
if (sparse_tensor_it != sparse_tensor_names_.end()) {
sparse_tensor_names_.erase(sparse_tensor_it);
is_sparse = true;
}
}

auto& mutable_initializers = *(graph_proto_->mutable_initializer());
// use cheaper pointer comparison to find old entry
auto existing_entry = std::find(mutable_initializers.pointer_begin(), mutable_initializers.pointer_end(),
Expand All @@ -3675,6 +3672,9 @@ Status Graph::ReplaceInitializedTensorImpl(ONNX_NAMESPACE::TensorProto new_initi
}

**existing_entry = std::move(new_initializer);
if (is_sparse) {
sparse_tensor_names_.insert((**existing_entry).name());
}

return Status::OK();
}
Expand Down Expand Up @@ -3720,7 +3720,7 @@ Status Graph::InjectExternalInitializedTensors(const InlinedHashMap<std::string,
Status Graph::InjectExternalInitializersFromFilesInMemory(
const InlinedHashMap<PathString, std::pair<char*, size_t>>& external_initializer_files) {
for (const auto& [tensor_name, tensor_proto] : name_to_initial_tensor_) {
if (tensor_proto->data_location() == TensorProto_DataLocation_EXTERNAL) {
if (utils::HasExternalDataInFile(*tensor_proto)) {
std::unique_ptr<ExternalDataInfo> external_data_info;
ORT_RETURN_IF_ERROR(ExternalDataInfo::Create(tensor_proto->external_data(), external_data_info));

Expand All @@ -3729,25 +3729,27 @@ Status Graph::InjectExternalInitializersFromFilesInMemory(
const size_t external_data_length = external_data_info->GetLength();
SafeInt<size_t> tensor_byte_size;
ORT_RETURN_IF_ERROR(utils::GetSizeInBytesFromTensorProto<0>(*tensor_proto, &tensor_byte_size));

ORT_RETURN_IF_NOT(external_data_length == 0 || external_data_length == tensor_byte_size,
"TensorProto: ", tensor_name, " external data size mismatch. Computed size: ",
*&tensor_byte_size, ", external_data.length: ", external_data_length);

SafeInt<FileOffsetType> end_of_read(file_offset);
end_of_read += tensor_byte_size;

auto external_file_pos = external_initializer_files.find(external_file);
ORT_RETURN_IF(external_file_pos == external_initializer_files.end(),
auto user_provided_entry = external_initializer_files.find(external_file);
ORT_RETURN_IF(user_provided_entry == external_initializer_files.end(),
"External file: ", ORT_TSTR_CONVERT_TO_PRINTABLE_STRING(external_file),
" not found from the table user provided.");
auto external_file_length = external_file_pos->second.second;

ORT_RETURN_IF(file_offset < 0 || end_of_read > narrow<FileOffsetType>(external_file_length),
auto user_provided_length = user_provided_entry->second.second;

ORT_RETURN_IF(file_offset < 0 || end_of_read > narrow<FileOffsetType>(user_provided_length),
"External initializer: ", tensor_name,
" offset: ", file_offset, " size to read: ", external_data_length,
" given file_length: ", external_file_length, " are out of bounds or can not be read in full.");
char* external_file_buffer = static_cast<char*>(external_file_pos->second.first);
char* tensor_buffer = external_file_buffer + file_offset;
" given file_length: ", user_provided_length, " are out of bounds or can not be read in full.");
char* user_provided_file_buffer = static_cast<char*>(user_provided_entry->second.first);
char* user_provided_tensor_buffer = user_provided_file_buffer + file_offset;

const auto& old_initializer = *(tensor_proto);
auto& mutable_initializers = *(graph_proto_->mutable_initializer());
Expand All @@ -3762,19 +3764,11 @@ Status Graph::InjectExternalInitializersFromFilesInMemory(
const DataTypeImpl* const type =
DataTypeImpl::TensorTypeFromONNXEnum(old_initializer.data_type())->GetElementType();
TensorShape tensor_shape = utils::GetTensorShapeFromTensorProto(old_initializer);
auto tensor = Tensor(type, tensor_shape, tensor_buffer,
auto tensor = Tensor(type, tensor_shape, user_provided_tensor_buffer,
OrtMemoryInfo(CPU, OrtAllocatorType::OrtDeviceAllocator));

constexpr const bool use_tensor_buffer_true = true;
auto new_tensor_proto = utils::TensorToTensorProto(tensor, tensor_name, use_tensor_buffer_true);
// Implied that external data is in memory
const bool has_external_data_in_memory = utils::HasExternalData(new_tensor_proto);

OrtValue ort_value;
if (has_external_data_in_memory) {
Tensor::InitOrtValue(std::move(tensor), ort_value);
}
ortvalue_initializers_.insert_or_assign(tensor_name, std::move(ort_value));
constexpr const bool use_tensor_buffer_false = false;
auto new_tensor_proto = utils::TensorToTensorProto(tensor, tensor_name, use_tensor_buffer_false);
**existing_entry = std::move(new_tensor_proto);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,45 +7,6 @@
namespace onnxruntime {
namespace nnapi {

namespace {
// Returns true if any input of `node_unit` (the tensor itself, or — for quantized
// inputs — its scale / zero-point) is an initializer whose TensorProto has
// data_location == EXTERNAL. The NNAPI builder uses this to reject such nodes,
// since initializers with external data are not supported here; a VERBOSE log
// line records which initializer triggered the rejection.
bool HasExternalInitializer(const InitializedTensorSet& initializers, const NodeUnit& node_unit) {
// Predicate: does this NodeArg name an initializer with external data location?
const auto is_ext_initializer =
[&](const NodeArg& node_arg) {
const auto& input_name(node_arg.Name());
// Not an initializer at all (e.g. a dynamic input) -> cannot be external.
const auto initializer = initializers.find(input_name);
if (initializer == initializers.end())
return false;

const auto& tensor = *initializer->second;
if (tensor.has_data_location() &&
tensor.data_location() == ONNX_NAMESPACE::TensorProto_DataLocation_EXTERNAL) {
LOGS_DEFAULT(VERBOSE) << "Initializer [" << input_name
<< "] with external data location are not currently supported";
return true;
}

return false;
};

// Check every input of the node unit, including quantization parameters.
const auto& inputs = node_unit.Inputs();
for (const auto& input : inputs) {
if (is_ext_initializer(input.node_arg))
return true;

// Non-quantized input: no scale/zero-point to inspect.
if (!input.quant_param)
continue;

if (is_ext_initializer(input.quant_param->scale))
return true;

// zero_point is optional within quant_param; check only when present.
if (input.quant_param->zero_point && is_ext_initializer(*input.quant_param->zero_point))
return true;
}

return false;
}
}  // namespace

// Add operator related

Status BaseOpBuilder::AddToModelBuilder(ModelBuilder& model_builder, const NodeUnit& node_unit) const {
Expand Down Expand Up @@ -86,10 +47,6 @@ bool BaseOpBuilder::IsOpSupported(const GraphViewer& graph_viewer, const NodeUni
if (!HasSupportedInputOutputs(graph_viewer, node_unit, params))
return false;

// We do not support external initializers for now
if (HasExternalInitializer(graph_viewer.GetAllInitializedTensors(), node_unit))
return false;

if (!HasSupportedOpSet(node_unit))
return false;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ Status ModelBuilder::RegisterInitializers() {
auto [index, size, padded_size] = initializers[i++];
const uint8_t* src = nullptr;
// TensorProto_DataType_UINT8 or TensorProto_DataType_FLOAT:
Initializer unpacked_tensor(tensor, graph_viewer_.ModelPath());
Initializer unpacked_tensor(graph_viewer_.GetGraph(), tensor, graph_viewer_.ModelPath());
size_t size_in_bytes = unpacked_tensor.DataAsByteSpan().size();
ORT_RETURN_IF_NOT(size == size_in_bytes,
"initializer tensor: ", tensor.name(), "'s size: ",
Expand Down
10 changes: 5 additions & 5 deletions orttraining/orttraining/test/optimizer/graph_transform_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1235,7 +1235,7 @@ TEST_F(GraphTransformationTests, Conv1dReplacement_TakeEffect) {
auto out_channel = 64;
auto* data_arg = builder.MakeInput<float>({{batch_size, in_channel, in_length}});

auto* weight_arg = builder.MakeInitializer<float>({out_channel, in_channel / group, 1}, {-1.0f, 1.0f});
auto* weight_arg = builder.MakeInitializer<float>({out_channel, in_channel / group, 1}, -1.0f, 1.0f);
auto* conv_output = builder.MakeOutput();

auto& conv_node = builder.AddNode("Conv", {data_arg, weight_arg}, {conv_output});
Expand Down Expand Up @@ -1280,8 +1280,8 @@ TEST_F(GraphTransformationTests, Conv1dReplacement_NoTakeEffect1) {
auto out_channel = 64;
auto* data_arg = builder.MakeInput<float>({{batch_size, in_channel, in_length}});

auto* weight_arg = builder.MakeInitializer<float>({out_channel, in_channel, 1}, {-1.0f, 1.0f});
auto* bias_arg = builder.MakeInitializer<float>({out_channel}, {-1.0f, 1.0f});
auto* weight_arg = builder.MakeInitializer<float>({out_channel, in_channel, 1}, -1.0f, 1.0f);
auto* bias_arg = builder.MakeInitializer<float>({out_channel}, -1.0f, 1.0f);
auto* conv_output = builder.MakeOutput();

auto& conv_node = builder.AddNode("Conv", {data_arg, weight_arg, bias_arg}, {conv_output});
Expand Down Expand Up @@ -1314,7 +1314,7 @@ TEST_F(GraphTransformationTests, Conv1dReplacement_NoTakeEffect2) {
auto out_channel = 64;
auto* data_arg = builder.MakeInput<float>({{batch_size, in_channel, in_length}});

auto* weight_arg = builder.MakeInitializer<float>({out_channel, in_channel, 1}, {-1.0f, 1.0f});
auto* weight_arg = builder.MakeInitializer<float>({out_channel, in_channel, 1}, -1.0f, 1.0f);
auto* conv_output = builder.MakeOutput();

auto& conv_node = builder.AddNode("Conv", {data_arg, weight_arg}, {conv_output});
Expand Down Expand Up @@ -1347,7 +1347,7 @@ TEST_F(GraphTransformationTests, Conv1dReplacement_NoTakeEffect3) {
auto out_channel = 64;
auto* data_arg = builder.MakeInput<float>({{batch_size, in_channel, in_length}});

auto* weight_arg = builder.MakeInitializer<float>({out_channel, in_channel, 1}, {-1.0f, 1.0f});
auto* weight_arg = builder.MakeInitializer<float>({out_channel, in_channel, 1}, -1.0f, 1.0f);
auto* conv_output = builder.MakeOutput();

auto& conv_node = builder.AddNode("Conv", {data_arg, weight_arg}, {conv_output});
Expand Down
Loading