Fix bug in allocation planner while planning location for initializers (#9306)

2026-07-08 17:17:15 +00:00 · 2021-10-07 19:05:07 -07:00 · 2021-10-07 19:05:07 -07:00 · 5f5f28bf14
commit 5f5f28bf14
parent 68601fc296
1 changed files with 25 additions and 7 deletions
--- a/onnxruntime/core/framework/allocation_planner.cc
+++ b/onnxruntime/core/framework/allocation_planner.cc
@ -698,6 +698,16 @@ class PlannerImpl {
        }

        auto wt_index = Index(def_name);
+        // TODO: Identify error cases where-in an initializer is used on different
+        // devices within the same graph level.
+        // If we ever encounter that, it means that there is a severe bug in Memcpy
+        // transformer and the model will crash while running. The Memcpy transformer
+        // is supposed to duplicate initializers being used on different devices within
+        // the same graph level and hence we should never see an initializer being used
+        // on different devices here.
+        // The same initializer being used on different devices across graph levels
+        // (subgraphs) is okay and utils::CopyInputsAcrossDevices() will take it to
+        // the right device before subgraph execution.
        locations[wt_index].emplace_back(
            GetLocationForNodeInput(node_input_index, node, kernel_create_info_map));
      }
@ -730,6 +740,20 @@ class PlannerImpl {
  }

  Status GeneratePlanForWeights() {
+    // TODO: Move away from usage of vector of `OrtMemoryInfo`s per weight (initializer)
+    // We do not need to maintain a vector of locations that a weight is used in.
+    // We only need to know the location of its first usage because:
+    // (1) If the initializer is used in the graph level it is introduced in, then it can
+    // only be used on one device as the Memcpy transformer will duplicate the initializer
+    // (with a different name) in case it is used on multiple devices.
+    // If the initializer is also additionally used in one of the subgraphs, we rely
+    // on the utils::CopyInputsAcrossDevices() to copy it over to the appropriate device
+    // before the subgraphs are executed.
+    // (2) If the initializer is NOT used in the level it is introduced in and only used
+    // in subgraphs, even then knowing its first usage location is enough as it can't be
+    // used on different devices within the same graph level (see (1) for reason), and for
+    // nested subgraphs, we can rely on the utils::CopyInputsAcrossDevices() to copy it
+    // over to the appropriate device before the subgraphs are executed.
    std::vector<std::vector<OrtMemoryInfo>> locations(plan_.allocation_plan.size());

    GeneratePlanForWeightsHelper(graph_viewer_, graph_viewer_.GetAllInitializedTensors(),
@ -739,6 +763,7 @@ class PlannerImpl {
      const std::vector<OrtMemoryInfo>& loc = locations[i];
      if (loc.empty()) continue;
      plan_.allocation_plan[i].alloc_kind = AllocKind::kAllocateStatically;
+      // The planned location for an initializer is the location of its first usage.
      plan_.allocation_plan[i].location = loc[0];
 #if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE)
      size_t max_pc = plan_.execution_plan.size();
@ -748,13 +773,6 @@ class PlannerImpl {
      plan_.allocation_plan[i].value_type = utils::GetMLDataType(*node_arg);
      plan_.allocation_plan[i].life_interval = std::pair<size_t, size_t>(0, max_pc);
 #endif
-      for (size_t j = 0; j != loc.size(); ++j) {
-        if (loc[j] != loc[0]) {
-          // set the location to CPU
-          plan_.allocation_plan[i].location = execution_providers_.GetDefaultCpuMemoryInfo();
-          break;
-        }
-      }
    }
    return Status::OK();
  }