diff --git a/onnxruntime/core/framework/allocation_planner.cc b/onnxruntime/core/framework/allocation_planner.cc index 1db4f9aa0c..5dd4be5522 100644 --- a/onnxruntime/core/framework/allocation_planner.cc +++ b/onnxruntime/core/framework/allocation_planner.cc @@ -698,6 +698,16 @@ class PlannerImpl { } auto wt_index = Index(def_name); + // TODO: Identify error cases where-in an initializer is used on different + // devices within the same graph level. + // If we ever encounter that, it means that there is a severe bug in Memcpy + // transformer and the model will crash while running. The Memcpy transformer + // is supposed to duplicate initializers being used on different devices within + // the same graph level and hence we should never see an initializer being used + // on different devices here. + // The same initializer being used on different devices across graph levels + // (subgraphs) is okay and utils::CopyInputsAcrossDevices() will take it to + // the right device before subgraph execution. locations[wt_index].emplace_back( GetLocationForNodeInput(node_input_index, node, kernel_create_info_map)); } @@ -730,6 +740,20 @@ class PlannerImpl { } Status GeneratePlanForWeights() { + // TODO: Move away from usage of vector of `OrtMemoryInfo`s per weight (initializer) + // We do not need to maintain a vector of locations that a weight is used in. + // We only need to know the location of its first usage because: + // (1) If the initializer is used in the graph level it is introduced in, then it can + // only be used on one device as the Memcpy transformer will duplicate the initializer + // (with a different name) in case it is used on multiple devices. + // If the initializer is also additionally used in one of the subgraphs, we rely + // on the utils::CopyInputsAcrossDevices() to copy it over to the appropriate device + // before the subgraphs are executed. + // (2) If the initializer is NOT used in the level it is introduced in and only used + // in subgraphs, even then knowing its first usage location is enough as it can't be + // used on different devices within the same graph level (see (1) for reason), and for + // nested subgraphs, we can rely on the utils::CopyInputsAcrossDevices() to copy it + // over to the appropriate device before the subgraphs are executed. std::vector> locations(plan_.allocation_plan.size()); GeneratePlanForWeightsHelper(graph_viewer_, graph_viewer_.GetAllInitializedTensors(), @@ -739,6 +763,7 @@ class PlannerImpl { const std::vector& loc = locations[i]; if (loc.empty()) continue; plan_.allocation_plan[i].alloc_kind = AllocKind::kAllocateStatically; + // The planned location for an initializer is the location of its first usage. plan_.allocation_plan[i].location = loc[0]; #if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE) size_t max_pc = plan_.execution_plan.size(); @@ -748,13 +773,6 @@ class PlannerImpl { plan_.allocation_plan[i].value_type = utils::GetMLDataType(*node_arg); plan_.allocation_plan[i].life_interval = std::pair(0, max_pc); #endif - for (size_t j = 0; j != loc.size(); ++j) { - if (loc[j] != loc[0]) { - // set the location to CPU - plan_.allocation_plan[i].location = execution_providers_.GetDefaultCpuMemoryInfo(); - break; - } - } } return Status::OK(); }