diff --git a/onnxruntime/core/framework/execution_frame.cc b/onnxruntime/core/framework/execution_frame.cc
index 298f84268c..5efc666780 100644
--- a/onnxruntime/core/framework/execution_frame.cc
+++ b/onnxruntime/core/framework/execution_frame.cc
@@ -389,13 +389,6 @@ static Status AllocateSparseTensor(MLValue& mlvalue, const DataTypeImpl& ml_type
 // This method is not thread safe!
 Status ExecutionFrame::AllocateAsPerAllocationPlan(OrtValue& ort_value, int ort_value_index, const TensorShape* shape,
                                                    size_t nnz) {
-  // if there is a custom allocator for this ort_value_index, call it to do the allocation
-  auto custom_alloc_entry = custom_allocators_.find(ort_value_index);
-  if (custom_alloc_entry != custom_allocators_.cend()) {
-    ORT_ENFORCE(shape, "We don't expect custom allocators for non-tensor types, so a shape is mandatory here.");
-    return (custom_alloc_entry->second)(*shape, ort_value);
-  }
-
   const SequentialExecutionPlan* p_seq_exec_plan = session_state_.GetExecutionPlan();
   const auto& alloc_plan = p_seq_exec_plan->allocation_plan;
   ORT_ENFORCE(ort_value_index >= 0 && static_cast<size_t>(ort_value_index) < alloc_plan.size());
@@ -409,6 +402,17 @@ Status ExecutionFrame::AllocateAsPerAllocationPlan(OrtValue& ort_value, int ort_
         "Tried to allocate without valid type information, ort_value index=" + std::to_string(ort_value_index));
   }
 
+  // if there is a custom allocator for this ort_value_index, call it to do the allocation
+  auto custom_alloc_entry = custom_allocators_.find(ort_value_index);
+  if (custom_alloc_entry != custom_allocators_.cend()) {
+    ORT_ENFORCE(shape, "We don't expect custom allocators for non-tensor types, so a shape is mandatory here.");
+    bool allocated = false;
+    // see if custom allocator can handle allocation
+    auto status = (custom_alloc_entry->second)(*shape, alloc_info, ort_value, allocated);
+    if (allocated || !status.IsOK())
+      return status;
+  }
+
   if (ml_type->IsTensorType()) {
     ORT_ENFORCE(shape, "Allocation of tensor types requires a shape.");
 
diff --git a/onnxruntime/core/framework/iexecutor.h b/onnxruntime/core/framework/iexecutor.h
index c06b878c8b..e6cd1589b7 100644
--- a/onnxruntime/core/framework/iexecutor.h
+++ b/onnxruntime/core/framework/iexecutor.h
@@ -20,7 +20,7 @@ class Logger;
 
 class IExecutor {
  public:
-  using CustomAllocator = std::function<Status(const TensorShape&, OrtValue&)>;
+  using CustomAllocator = std::function<Status(const TensorShape&, const OrtMemoryInfo&, OrtValue&, bool& allocated)>;
 
   virtual ~IExecutor() = default;
 
diff --git a/onnxruntime/core/providers/cpu/controlflow/if.cc b/onnxruntime/core/providers/cpu/controlflow/if.cc
index a92f6e3b38..56c04ef13a 100644
--- a/onnxruntime/core/providers/cpu/controlflow/if.cc
+++ b/onnxruntime/core/providers/cpu/controlflow/if.cc
@@ -49,18 +49,18 @@ ONNX_OPERATOR_SET_SCHEMA(
 ONNX_CPU_OPERATOR_VERSIONED_KERNEL(If,
                                    1, 10,
                                    KernelDefBuilder()
-                                     .TypeConstraint("B", DataTypeImpl::GetTensorType<bool>())
-                                     .TypeConstraint("V", DataTypeImpl::AllTensorTypes()),
+                                       .TypeConstraint("B", DataTypeImpl::GetTensorType<bool>())
+                                       .TypeConstraint("V", DataTypeImpl::AllTensorTypes()),
                                    If);
 
 // output shape rules requiring the output shapes of the 'THEN' and 'ELSE'
 // branches to be the same were relaxed in opset-11
 ONNX_CPU_OPERATOR_KERNEL(If,
-                        11,
-                        KernelDefBuilder()
-                            .TypeConstraint("B", DataTypeImpl::GetTensorType<bool>())
-                            .TypeConstraint("V", DataTypeImpl::AllTensorTypes()),
-                        If);
+                         11,
+                         KernelDefBuilder()
+                             .TypeConstraint("B", DataTypeImpl::GetTensorType<bool>())
+                             .TypeConstraint("V", DataTypeImpl::AllTensorTypes()),
+                         If);
 
 struct If::Info {
   Info(const onnxruntime::Node& node, const GraphViewer& subgraph_in) : subgraph(subgraph_in) {
@@ -301,14 +301,27 @@ Status IfImpl::Execute(const FeedsFetchesManager& ffm) {
     if (outputs_[i].first == AllocationType::Delayed) {
       // functor to forward the allocation request from the subgraph to the If node's context so that the
       // allocation plan for the If node's output is used.
-      fetch_allocators[i] = [this, i](const TensorShape& shape, OrtValue& ort_value) {
-        // allocate
+      fetch_allocators[i] = [this, i, &fetches](const TensorShape& shape, const OrtMemoryInfo& location,
+                                                OrtValue& ort_value, bool& allocated) {
+        // for now we only allocate on CPU as currently all 'If' outputs are on CPU.
+        // if that does not match the required device we don't update the provided OrtValue and return false for
+        // 'allocated'. the execution frame will allocate a buffer on the required device, and the fetches copy
+        // logic in utils::ExecuteSubgraph will handle moving it to CPU (and into the tensor we allocated here)
         auto* tensor = context_.Output(i, shape);
+        if (!tensor)
+          return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to create output tensor for If output ", i);
 
-        if (!tensor) return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to create output tensor for If output ", i);
+        const OrtValue& value = *context_.GetOutputMLValue(i);
+
+        if (tensor->Location().device == location.device) {
+          // return OrtValue for allocated tensor
+          ort_value = value;
+          allocated = true;
+        } else {
+          // put the allocated value into fetches so the copy logic in utils::ExecuteGraphImpl can use it
+          fetches[i] = value;
+        }
 
-        // return OrtValue for allocated tensor
-        ort_value = *context_.GetOutputMLValue(i);
         return Status::OK();
       };
     }
diff --git a/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc b/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc
index e3b2f2ce80..d61f68edee 100644
--- a/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc
+++ b/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc
@@ -231,14 +231,34 @@ Status IterateSequence(OpKernelContextInternal& context, const SessionState& ses
           auto& ort_value = *iterator;
           fetches.push_back(ort_value);
         } else {
+          // need a dummy empty entry in fetches so the order matches the output names
+          size_t i = fetches.size();
+          fetches.emplace_back();
+
           // use a custom allocator that will forward the allocation request to the Scan context
           // and add the sequence length dimension. this avoids using a temporary value for the first output
-          fetch_allocators[output] = [&iterator](const TensorShape& shape, OrtValue& ort_value) {
-            return iterator.AllocateSubgraphOutput(shape, ort_value);
-          };
+          fetch_allocators[output] = [i, &iterator, &fetches](const TensorShape& shape, const OrtMemoryInfo& location,
+                                                              OrtValue& ort_value, bool& allocated) {
+            auto status = iterator.AllocateFinalOutput(shape);
+            ORT_RETURN_IF_ERROR(status);
 
-          // also need a dummy empty entry in fetches so the order matches the output names
-          fetches.emplace_back();
+            const OrtValue& value = *iterator;
+
+            // for now we only allocate on CPU as currently all 'Scan' outputs are on CPU.
+            // if that does not match the required device we don't update the provided OrtValue and return false for
+            // 'allocated'. the execution frame will allocate a buffer on the required device, and the fetches copy
+            // logic in utils::ExecuteSubgraph will handle moving it to CPU (and into the tensor we allocated here)
+            if (value.Get<Tensor>().Location().device == location.device) {
+              // update OrtValue with a current slice from the iterator.
+              ort_value = value;
+              allocated = true;
+            } else {
+              // put the allocated value into fetches so the copy logic in utils::ExecuteGraphImpl can use it
+              fetches[i] = value;
+            }
+
+            return Status::OK();
+          };
         }
       }
     }
@@ -268,8 +288,8 @@ Status IterateSequence(OpKernelContextInternal& context, const SessionState& ses
 
 OrtValue AllocateTensorInMLValue(const MLDataType data_type, const TensorShape& shape, AllocatorPtr& allocator) {
   auto new_tensor = onnxruntime::make_unique<Tensor>(data_type,
-                                             shape,
-                                             allocator);
+                                                     shape,
+                                                     allocator);
 
   return OrtValue{new_tensor.release(), DataTypeImpl::GetType<Tensor>(),
                   DataTypeImpl::GetType<Tensor>()->GetDeleteFunc()};
@@ -432,7 +452,7 @@ Status OutputIterator::Initialize() {
     status = AllocateFinalBuffer();
     ORT_RETURN_IF_ERROR(status);
   } else {
-    // delay until the first subgraph execution calls AllocateSubgraphOutput.
+    // delay until the first subgraph execution calls AllocateFinalOutput.
   }
 
   return Status::OK();
@@ -493,7 +513,7 @@ Status OutputIterator::AllocateFinalBuffer() {
   return Status::OK();
 }
 
-Status OutputIterator::AllocateSubgraphOutput(const TensorShape& shape, OrtValue& ort_value) {
+Status OutputIterator::AllocateFinalOutput(const TensorShape& shape) {
   ORT_ENFORCE(!is_concrete_shape_, "If shape was concrete we shouldn't be using a custom allocator");
 
   // update the final shape now that we can fill in the symbolic dimension with an actual value
@@ -504,16 +524,13 @@ Status OutputIterator::AllocateSubgraphOutput(const TensorShape& shape, OrtValue
   status = AllocateFinalBuffer();
   ORT_RETURN_IF_ERROR(status);
 
-  // get OrtValue from operator*()
-  ort_value = **this;
-
   return Status::OK();
 }
 
 OrtValue& OutputIterator::operator*() {
   ORT_ENFORCE(cur_iteration_ < num_iterations_);
   ORT_ENFORCE(is_concrete_shape_,
-              "Expected AllocateSubgraphOutput to have been called to before we read the OrtValue from the iterator.");
+              "Expected AllocateFinalOutput to have been called to before we read the OrtValue from the iterator.");
 
   // for v8 both outputs and loop state vars use slicers. for v9 only outputs do
   if (is_v8_ || !is_loop_state_var_)
@@ -525,7 +542,7 @@ OrtValue& OutputIterator::operator*() {
 OutputIterator& OutputIterator::operator++() {
   if (cur_iteration_ < num_iterations_) {
     ORT_ENFORCE(is_concrete_shape_,
-                "Expected AllocateSubgraphOutput to have been called to before we increment the iterator");
+                "Expected AllocateFinalOutput to have been called to before we increment the iterator");
 
     ++cur_iteration_;
 
diff --git a/onnxruntime/core/providers/cpu/controlflow/scan_utils.h b/onnxruntime/core/providers/cpu/controlflow/scan_utils.h
index f5e3f8afa1..04176e551d 100644
--- a/onnxruntime/core/providers/cpu/controlflow/scan_utils.h
+++ b/onnxruntime/core/providers/cpu/controlflow/scan_utils.h
@@ -119,9 +119,8 @@ class OutputIterator {
 
   // custom fetch allocator that can be used when the final shape is not concrete.
   // when the subgraph requests the allocation of the subgraph output, we forward the request to this instance,
-  // allocate the overall output (taking into account the sequence length dimension),
-  // and use a slicer to return the chunk for the subgraph output for this iteration.
-  Status AllocateSubgraphOutput(const TensorShape& shape, OrtValue& ort_value);
+  // and allocate the overall output (taking into account the sequence length dimension)
+  Status AllocateFinalOutput(const TensorShape& shape);
 
   // set the output for the current iteration to zeros. used for short sequence lengths
   void ZeroOutCurrent() {
diff --git a/onnxruntime/test/providers/cpu/controlflow/if_test.cc b/onnxruntime/test/providers/cpu/controlflow/if_test.cc
index 408e2fa855..7ccc15dbf1 100644
--- a/onnxruntime/test/providers/cpu/controlflow/if_test.cc
+++ b/onnxruntime/test/providers/cpu/controlflow/if_test.cc
@@ -23,7 +23,7 @@ struct RunOptions {
   bool include_dim_values_in_subgraph = true;
   bool mixed_execution_providers = false;
 };
-}
+}  // namespace
 
 static const ONNX_NAMESPACE::GraphProto CreateSubgraph(bool then_branch, const RunOptions& options);
 
@@ -44,8 +44,8 @@ static const ONNX_NAMESPACE::GraphProto CreateSubgraph(bool then_branch, const R
 
 class IfOpTester : public OpTester {
  public:
-  IfOpTester(const RunOptions& options, int opset_version = 10) : 
-      OpTester("If", opset_version), options_{options}, opset_version_(opset_version) {
+  IfOpTester(const RunOptions& options, int opset_version = 10)
+      : OpTester("If", opset_version), options_{options}, opset_version_(opset_version) {
   }
 
  protected:
@@ -81,7 +81,7 @@ class IfOpTester : public OpTester {
         attr_proto.set_type(AttributeProto_AttributeType_INTS);
 
         auto* split_attribute = attr_proto.mutable_ints();
-        *split_attribute->Add() = 1; // split "unevenly" to create different shapes across the "then" and "else" branches
+        *split_attribute->Add() = 1;  // split "unevenly" to create different shapes across the "then" and "else" branches
         *split_attribute->Add() = 2;
 
         split_node.AddAttribute("split", attr_proto);
@@ -114,7 +114,7 @@ class IfOpTester : public OpTester {
   int opset_version_;
 };
 
-   /* Subgraphs looks like this. All inputs come from outer scope so we just
+/* Subgraphs looks like this. All inputs come from outer scope so we just
    create a NodeArg with the input name. The numbers in [] are the values the tests are expected to produce
    as output from each node.
 
@@ -296,6 +296,15 @@ TEST(If, MixedExecutionProviders) {
   options.mixed_execution_providers = true;
   RunTest(true, options);
 }
+
+TEST(If, MixedExecutionProvidersNoShapeInSubgraph) {
+  RunOptions options{};
+  options.mixed_execution_providers = true;
+  options.include_dim_values_in_main_graph = true;
+  options.symbolic_dim_value_in_main_graph = 0;
+  options.include_dim_values_in_subgraph = false;
+  RunTest(true, options);
+}
 #endif  // USE_CUDA
 
 TEST(If, SymbolicShapeInMainGraph_NoShapeInSubgraph_True) {
diff --git a/onnxruntime/test/providers/cpu/controlflow/scan_test.cc b/onnxruntime/test/providers/cpu/controlflow/scan_test.cc
index c38656a8cf..10d350f478 100644
--- a/onnxruntime/test/providers/cpu/controlflow/scan_test.cc
+++ b/onnxruntime/test/providers/cpu/controlflow/scan_test.cc
@@ -310,7 +310,7 @@ static void RunTest_v8(const std::string test_name, int64_t batch_size, int64_t
   test.AddOutput<float>("scan_output_2", output_shape, output_2);
   test.AddOutput<float>("scan_output_3", output_shape, output_3);
 
-  test.Run(expect_result, failure_message, {kTensorrtExecutionProvider});// Disable TensorRT because its parser failed
+  test.Run(expect_result, failure_message, {kTensorrtExecutionProvider});  // Disable TensorRT because its parser failed
 }
 
 static void RunTest_v9(const std::string test_name, int64_t sequence_len, int64_t input_size,
@@ -397,7 +397,7 @@ static void RunTest_v9(const std::string test_name, int64_t sequence_len, int64_
   test.AddOutput<float>("scan_output_3", calculate_output_shape(3), output_3);
 
   if (options.mixed_execution_providers) {
-    // we want the CUDA provider to be first, and the CPU provider second. all except the Scannode should run on
+    // we want the CUDA provider to be first, and the CPU provider second. all except the Scan node should run on
     // CUDA given that, which creates the scenario where we need to copy to/from CPU to execute the Scan node correctly.
     std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
     execution_providers.push_back(DefaultCudaExecutionProvider());
@@ -405,7 +405,7 @@ static void RunTest_v9(const std::string test_name, int64_t sequence_len, int64_
 
     test.Run(expect_result, failure_message, {kTensorrtExecutionProvider}, nullptr, &execution_providers);
   } else {
-    test.Run(expect_result, failure_message, {kTensorrtExecutionProvider});// Disable TensorRT because its parser failed
+    test.Run(expect_result, failure_message, {kTensorrtExecutionProvider});  // Disable TensorRT because its parser failed
   }
 }
 
@@ -889,7 +889,7 @@ TEST(Scan9, TransposeOutputDim2) {
   test.AddInput<float>("scan_input_1", input_shape, {1.0, 2.0});
   test.AddOutput<float>("scan_output_1", output_shape, {1.0, 2.0});
 
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});// Disable TensorRT on supported data types
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // Disable TensorRT on supported data types
 }
 
 static void InvalidInput(bool is_v8) {
@@ -1081,14 +1081,14 @@ void MixedTypeInputs(bool is_v8) {
   test.AddOutput<float>("scan_output_1", seq_shape, {0.0, 1.0, 2.0});
   test.AddOutput<int64_t>("scan_output_2", seq_shape, {0, 1, 2});
 
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});// Disable TensorRT on unsupported data types
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // Disable TensorRT on unsupported data types
 }
 
 TEST_8_AND_9(MixedTypeInputs);
 
 // create a subgraph that will have unknown dimensions in both the loop state variable and output
 // after shape inferencing.
-void UnknownDimInSubgraphOutput(bool is_v8) {
+void UnknownDimInSubgraphOutput(bool is_v8, bool mixed_execution_providers = false) {
   Model model("ScanBody");
   auto& graph = model.MainGraph();
 
@@ -1146,7 +1146,17 @@ void UnknownDimInSubgraphOutput(bool is_v8) {
   test.AddOutput<float>("final_state_1", state_shape, {3.0});
   test.AddOutput<float>("scan_output_1", seq_shape, {0.0, 1.0, 2.0});
 
-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//Disable TensorRT on unknown dimension tests
+  if (mixed_execution_providers) {
+    // we want the CUDA provider to be first, and the CPU provider second. all except the Scan node should run on
+    // CUDA given that, which creates the scenario where we need to copy to/from CPU to execute the Scan node correctly.
+    std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+    execution_providers.push_back(DefaultCudaExecutionProvider());
+    execution_providers.push_back(DefaultCpuExecutionProvider());
+
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}, nullptr, &execution_providers);
+  } else {
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //Disable TensorRT on unknown dimension tests
+  }
 }
 
 TEST_8_AND_9(UnknownDimInSubgraphOutput);
@@ -1159,6 +1169,12 @@ TEST(Scan, MixedExecutionProviders) {
 
   ShortSequenceOneInBatchOneLoopStateVar(options);
 }
+
+TEST(Scan, MixedExecutionProvidersUnknownDimInSubgraphOutput) {
+  UnknownDimInSubgraphOutput(/*is_v8*/ true, /*mixed_execution_providers*/ true);
+  UnknownDimInSubgraphOutput(/*is_v8*/ false, /*mixed_execution_providers*/ true);
+}
+
 #endif
 
 }  // namespace test