diff --git a/onnxruntime/core/framework/allocation_planner.cc b/onnxruntime/core/framework/allocation_planner.cc
index 1db4f9aa0c..5dd4be5522 100644
--- a/onnxruntime/core/framework/allocation_planner.cc
+++ b/onnxruntime/core/framework/allocation_planner.cc
@@ -698,6 +698,16 @@ class PlannerImpl {
         }
 
         auto wt_index = Index(def_name);
+        // TODO: Identify error cases where-in an initializer is used on different
+        // devices within the same graph level.
+        // If we ever encounter that, it means that there is a severe bug in Memcpy
+        // transformer and the model will crash while running. The Memcpy transformer
+        // is supposed to duplicate initializers being used on different devices within
+        // the same graph level and hence we should never see an initializer being used
+        // on different devices here.
+        // The same initializer being used on different devices across graph levels
+        // (subgraphs) is okay and utils::CopyInputsAcrossDevices() will take it to
+        // the right device before subgraph execution.
         locations[wt_index].emplace_back(
             GetLocationForNodeInput(node_input_index, node, kernel_create_info_map));
       }
@@ -730,6 +740,20 @@ class PlannerImpl {
   }
 
   Status GeneratePlanForWeights() {
+    // TODO: Move away from usage of vector of `OrtMemoryInfo`s per weight (initializer)
+    // We do not need to maintain a vector of locations that a weight is used in.
+    // We only need to know the location of its first usage because:
+    // (1) If the initializer is used in the graph level it is introduced in, then it can
+    // only be used on one device as the Memcpy transformer will duplicate the initializer
+    // (with a different name) in case it is used on multiple devices.
+    // If the initializer is also additionally used in one of the subgraphs, we rely
+    // on the utils::CopyInputsAcrossDevices() to copy it over to the appropriate device
+    // before the subgraphs are executed.
+    // (2) If the initializer is NOT used in the level it is introduced in and only used
+    // in subgraphs, even then knowing its first usage location is enough as it can't be
+    // used on different devices within the same graph level (see (1) for reason), and for
+    // nested subgraphs, we can rely on the utils::CopyInputsAcrossDevices() to copy it
+    // over to the appropriate device before the subgraphs are executed.
     std::vector<std::vector<OrtMemoryInfo>> locations(plan_.allocation_plan.size());
 
     GeneratePlanForWeightsHelper(graph_viewer_, graph_viewer_.GetAllInitializedTensors(),
@@ -739,6 +763,7 @@ class PlannerImpl {
       const std::vector<OrtMemoryInfo>& loc = locations[i];
       if (loc.empty()) continue;
       plan_.allocation_plan[i].alloc_kind = AllocKind::kAllocateStatically;
+      // The planned location for an initializer is the location of its first usage.
       plan_.allocation_plan[i].location = loc[0];
 #if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE)
       size_t max_pc = plan_.execution_plan.size();
@@ -748,13 +773,6 @@ class PlannerImpl {
       plan_.allocation_plan[i].value_type = utils::GetMLDataType(*node_arg);
       plan_.allocation_plan[i].life_interval = std::pair<size_t, size_t>(0, max_pc);
 #endif
-      for (size_t j = 0; j != loc.size(); ++j) {
-        if (loc[j] != loc[0]) {
-          // set the location to CPU
-          plan_.allocation_plan[i].location = execution_providers_.GetDefaultCpuMemoryInfo();
-          break;
-        }
-      }
     }
     return Status::OK();
   }