Fix bug with delayed allocation of If and Scan outputs. (#2024)

* Fix bug with delayed allocation of If and Scan outputs. If the subgraph is producing output on a non-CPU device the delayed allocation was incorrectly providing a CPU allocated tensor. Check for the required location, and update 'fetches' instead if a device copy is needed. The utils::ExecuteGraph logic will handle the device copy in this case.
2026-07-09 17:28:58 +00:00 · 2019-10-11 19:49:21 +10:00 · 2019-10-11 19:49:21 +10:00 · ffb94fd170
commit ffb94fd170
parent ca1b88c069
7 changed files with 107 additions and 49 deletions
--- a/onnxruntime/core/framework/execution_frame.cc
+++ b/onnxruntime/core/framework/execution_frame.cc
@ -389,13 +389,6 @@ static Status AllocateSparseTensor(MLValue& mlvalue, const DataTypeImpl& ml_type
 // This method is not thread safe!
 Status ExecutionFrame::AllocateAsPerAllocationPlan(OrtValue& ort_value, int ort_value_index, const TensorShape* shape,
                                                   size_t nnz) {
-  // if there is a custom allocator for this ort_value_index, call it to do the allocation
-  auto custom_alloc_entry = custom_allocators_.find(ort_value_index);
-  if (custom_alloc_entry != custom_allocators_.cend()) {
-    ORT_ENFORCE(shape, "We don't expect custom allocators for non-tensor types, so a shape is mandatory here.");
-    return (custom_alloc_entry->second)(*shape, ort_value);
-  }
-
  const SequentialExecutionPlan* p_seq_exec_plan = session_state_.GetExecutionPlan();
  const auto& alloc_plan = p_seq_exec_plan->allocation_plan;
  ORT_ENFORCE(ort_value_index >= 0 && static_cast<size_t>(ort_value_index) < alloc_plan.size());
@ -409,6 +402,17 @@ Status ExecutionFrame::AllocateAsPerAllocationPlan(OrtValue& ort_value, int ort_
        "Tried to allocate without valid type information, ort_value index=" + std::to_string(ort_value_index));
  }

+  // if there is a custom allocator for this ort_value_index, call it to do the allocation
+  auto custom_alloc_entry = custom_allocators_.find(ort_value_index);
+  if (custom_alloc_entry != custom_allocators_.cend()) {
+    ORT_ENFORCE(shape, "We don't expect custom allocators for non-tensor types, so a shape is mandatory here.");
+    bool allocated = false;
+    // see if custom allocator can handle allocation
+    auto status = (custom_alloc_entry->second)(*shape, alloc_info, ort_value, allocated);
+    if (allocated || !status.IsOK())
+      return status;
+  }
+
  if (ml_type->IsTensorType()) {
    ORT_ENFORCE(shape, "Allocation of tensor types requires a shape.");

--- a/onnxruntime/core/framework/iexecutor.h
+++ b/onnxruntime/core/framework/iexecutor.h
@ -20,7 +20,7 @@ class Logger;

 class IExecutor {
 public:
-  using CustomAllocator = std::function<Status(const TensorShape&, OrtValue&)>;
+  using CustomAllocator = std::function<Status(const TensorShape&, const OrtMemoryInfo&, OrtValue&, bool& allocated)>;

  virtual ~IExecutor() = default;

--- a/onnxruntime/core/providers/cpu/controlflow/if.cc
+++ b/onnxruntime/core/providers/cpu/controlflow/if.cc
@ -49,18 +49,18 @@ ONNX_OPERATOR_SET_SCHEMA(
 ONNX_CPU_OPERATOR_VERSIONED_KERNEL(If,
                                   1, 10,
                                   KernelDefBuilder()
-                                     .TypeConstraint("B", DataTypeImpl::GetTensorType<bool>())
-                                     .TypeConstraint("V", DataTypeImpl::AllTensorTypes()),
+                                       .TypeConstraint("B", DataTypeImpl::GetTensorType<bool>())
+                                       .TypeConstraint("V", DataTypeImpl::AllTensorTypes()),
                                   If);

 // output shape rules requiring the output shapes of the 'THEN' and 'ELSE'
 // branches to be the same were relaxed in opset-11
 ONNX_CPU_OPERATOR_KERNEL(If,
-                        11,
-                        KernelDefBuilder()
-                            .TypeConstraint("B", DataTypeImpl::GetTensorType<bool>())
-                            .TypeConstraint("V", DataTypeImpl::AllTensorTypes()),
-                        If);
+                         11,
+                         KernelDefBuilder()
+                             .TypeConstraint("B", DataTypeImpl::GetTensorType<bool>())
+                             .TypeConstraint("V", DataTypeImpl::AllTensorTypes()),
+                         If);

 struct If::Info {
  Info(const onnxruntime::Node& node, const GraphViewer& subgraph_in) : subgraph(subgraph_in) {
@ -301,14 +301,27 @@ Status IfImpl::Execute(const FeedsFetchesManager& ffm) {
    if (outputs_[i].first == AllocationType::Delayed) {
      // functor to forward the allocation request from the subgraph to the If node's context so that the
      // allocation plan for the If node's output is used.
-      fetch_allocators[i] = [this, i](const TensorShape& shape, OrtValue& ort_value) {
-        // allocate
+      fetch_allocators[i] = [this, i, &fetches](const TensorShape& shape, const OrtMemoryInfo& location,
+                                                OrtValue& ort_value, bool& allocated) {
+        // for now we only allocate on CPU as currently all 'If' outputs are on CPU.
+        // if that does not match the required device we don't update the provided OrtValue and return false for
+        // 'allocated'. the execution frame will allocate a buffer on the required device, and the fetches copy
+        // logic in utils::ExecuteSubgraph will handle moving it to CPU (and into the tensor we allocated here)
        auto* tensor = context_.Output(i, shape);
+        if (!tensor)
+          return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to create output tensor for If output ", i);

-        if (!tensor) return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to create output tensor for If output ", i);
+        const OrtValue& value = *context_.GetOutputMLValue(i);
+
+        if (tensor->Location().device == location.device) {
+          // return OrtValue for allocated tensor
+          ort_value = value;
+          allocated = true;
+        } else {
+          // put the allocated value into fetches so the copy logic in utils::ExecuteGraphImpl can use it
+          fetches[i] = value;
+        }

-        // return OrtValue for allocated tensor
-        ort_value = *context_.GetOutputMLValue(i);
        return Status::OK();
      };
    }
--- a/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc
+++ b/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc
@ -231,14 +231,34 @@ Status IterateSequence(OpKernelContextInternal& context, const SessionState& ses
          auto& ort_value = *iterator;
          fetches.push_back(ort_value);
        } else {
+          // need a dummy empty entry in fetches so the order matches the output names
+          size_t i = fetches.size();
+          fetches.emplace_back();
+
          // use a custom allocator that will forward the allocation request to the Scan context
          // and add the sequence length dimension. this avoids using a temporary value for the first output
-          fetch_allocators[output] = [&iterator](const TensorShape& shape, OrtValue& ort_value) {
-            return iterator.AllocateSubgraphOutput(shape, ort_value);
-          };
+          fetch_allocators[output] = [i, &iterator, &fetches](const TensorShape& shape, const OrtMemoryInfo& location,
+                                                              OrtValue& ort_value, bool& allocated) {
+            auto status = iterator.AllocateFinalOutput(shape);
+            ORT_RETURN_IF_ERROR(status);

-          // also need a dummy empty entry in fetches so the order matches the output names
-          fetches.emplace_back();
+            const OrtValue& value = *iterator;
+
+            // for now we only allocate on CPU as currently all 'Scan' outputs are on CPU.
+            // if that does not match the required device we don't update the provided OrtValue and return false for
+            // 'allocated'. the execution frame will allocate a buffer on the required device, and the fetches copy
+            // logic in utils::ExecuteSubgraph will handle moving it to CPU (and into the tensor we allocated here)
+            if (value.Get<Tensor>().Location().device == location.device) {
+              // update OrtValue with a current slice from the iterator.
+              ort_value = value;
+              allocated = true;
+            } else {
+              // put the allocated value into fetches so the copy logic in utils::ExecuteGraphImpl can use it
+              fetches[i] = value;
+            }
+
+            return Status::OK();
+          };
        }
      }
    }
@ -268,8 +288,8 @@ Status IterateSequence(OpKernelContextInternal& context, const SessionState& ses

 OrtValue AllocateTensorInMLValue(const MLDataType data_type, const TensorShape& shape, AllocatorPtr& allocator) {
  auto new_tensor = onnxruntime::make_unique<Tensor>(data_type,
-                                             shape,
-                                             allocator);
+                                                     shape,
+                                                     allocator);

  return OrtValue{new_tensor.release(), DataTypeImpl::GetType<Tensor>(),
                  DataTypeImpl::GetType<Tensor>()->GetDeleteFunc()};
@ -432,7 +452,7 @@ Status OutputIterator::Initialize() {
    status = AllocateFinalBuffer();
    ORT_RETURN_IF_ERROR(status);
  } else {
-    // delay until the first subgraph execution calls AllocateSubgraphOutput.
+    // delay until the first subgraph execution calls AllocateFinalOutput.
  }

  return Status::OK();
@ -493,7 +513,7 @@ Status OutputIterator::AllocateFinalBuffer() {
  return Status::OK();
 }

-Status OutputIterator::AllocateSubgraphOutput(const TensorShape& shape, OrtValue& ort_value) {
+Status OutputIterator::AllocateFinalOutput(const TensorShape& shape) {
  ORT_ENFORCE(!is_concrete_shape_, "If shape was concrete we shouldn't be using a custom allocator");

  // update the final shape now that we can fill in the symbolic dimension with an actual value
@ -504,16 +524,13 @@ Status OutputIterator::AllocateSubgraphOutput(const TensorShape& shape, OrtValue
  status = AllocateFinalBuffer();
  ORT_RETURN_IF_ERROR(status);

-  // get OrtValue from operator*()
-  ort_value = **this;
-
  return Status::OK();
 }

 OrtValue& OutputIterator::operator*() {
  ORT_ENFORCE(cur_iteration_ < num_iterations_);
  ORT_ENFORCE(is_concrete_shape_,
-              "Expected AllocateSubgraphOutput to have been called to before we read the OrtValue from the iterator.");
+              "Expected AllocateFinalOutput to have been called to before we read the OrtValue from the iterator.");

  // for v8 both outputs and loop state vars use slicers. for v9 only outputs do
  if (is_v8_ || !is_loop_state_var_)
@ -525,7 +542,7 @@ OrtValue& OutputIterator::operator*() {
 OutputIterator& OutputIterator::operator++() {
  if (cur_iteration_ < num_iterations_) {
    ORT_ENFORCE(is_concrete_shape_,
-                "Expected AllocateSubgraphOutput to have been called to before we increment the iterator");
+                "Expected AllocateFinalOutput to have been called to before we increment the iterator");

    ++cur_iteration_;

--- a/onnxruntime/core/providers/cpu/controlflow/scan_utils.h
+++ b/onnxruntime/core/providers/cpu/controlflow/scan_utils.h
@ -119,9 +119,8 @@ class OutputIterator {

  // custom fetch allocator that can be used when the final shape is not concrete.
  // when the subgraph requests the allocation of the subgraph output, we forward the request to this instance,
-  // allocate the overall output (taking into account the sequence length dimension),
-  // and use a slicer to return the chunk for the subgraph output for this iteration.
-  Status AllocateSubgraphOutput(const TensorShape& shape, OrtValue& ort_value);
+  // and allocate the overall output (taking into account the sequence length dimension)
+  Status AllocateFinalOutput(const TensorShape& shape);

  // set the output for the current iteration to zeros. used for short sequence lengths
  void ZeroOutCurrent() {
--- a/onnxruntime/test/providers/cpu/controlflow/if_test.cc
+++ b/onnxruntime/test/providers/cpu/controlflow/if_test.cc
@ -23,7 +23,7 @@ struct RunOptions {
  bool include_dim_values_in_subgraph = true;
  bool mixed_execution_providers = false;
 };
-}
+}  // namespace

 static const ONNX_NAMESPACE::GraphProto CreateSubgraph(bool then_branch, const RunOptions& options);

@ -44,8 +44,8 @@ static const ONNX_NAMESPACE::GraphProto CreateSubgraph(bool then_branch, const R

 class IfOpTester : public OpTester {
 public:
-  IfOpTester(const RunOptions& options, int opset_version = 10) : 
-      OpTester("If", opset_version), options_{options}, opset_version_(opset_version) {
+  IfOpTester(const RunOptions& options, int opset_version = 10)
+      : OpTester("If", opset_version), options_{options}, opset_version_(opset_version) {
  }

 protected:
@ -81,7 +81,7 @@ class IfOpTester : public OpTester {
        attr_proto.set_type(AttributeProto_AttributeType_INTS);

        auto* split_attribute = attr_proto.mutable_ints();
-        *split_attribute->Add() = 1; // split "unevenly" to create different shapes across the "then" and "else" branches
+        *split_attribute->Add() = 1;  // split "unevenly" to create different shapes across the "then" and "else" branches
        *split_attribute->Add() = 2;

        split_node.AddAttribute("split", attr_proto);
@ -114,7 +114,7 @@ class IfOpTester : public OpTester {
  int opset_version_;
 };

-   /* Subgraphs looks like this. All inputs come from outer scope so we just
+/* Subgraphs looks like this. All inputs come from outer scope so we just
   create a NodeArg with the input name. The numbers in [] are the values the tests are expected to produce
   as output from each node.

@ -296,6 +296,15 @@ TEST(If, MixedExecutionProviders) {
  options.mixed_execution_providers = true;
  RunTest(true, options);
 }
+
+TEST(If, MixedExecutionProvidersNoShapeInSubgraph) {
+  RunOptions options{};
+  options.mixed_execution_providers = true;
+  options.include_dim_values_in_main_graph = true;
+  options.symbolic_dim_value_in_main_graph = 0;
+  options.include_dim_values_in_subgraph = false;
+  RunTest(true, options);
+}
 #endif  // USE_CUDA

 TEST(If, SymbolicShapeInMainGraph_NoShapeInSubgraph_True) {
--- a/onnxruntime/test/providers/cpu/controlflow/scan_test.cc
+++ b/onnxruntime/test/providers/cpu/controlflow/scan_test.cc
@ -310,7 +310,7 @@ static void RunTest_v8(const std::string test_name, int64_t batch_size, int64_t
  test.AddOutput<float>("scan_output_2", output_shape, output_2);
  test.AddOutput<float>("scan_output_3", output_shape, output_3);

-  test.Run(expect_result, failure_message, {kTensorrtExecutionProvider});// Disable TensorRT because its parser failed
+  test.Run(expect_result, failure_message, {kTensorrtExecutionProvider});  // Disable TensorRT because its parser failed
 }

 static void RunTest_v9(const std::string test_name, int64_t sequence_len, int64_t input_size,
@ -397,7 +397,7 @@ static void RunTest_v9(const std::string test_name, int64_t sequence_len, int64_
  test.AddOutput<float>("scan_output_3", calculate_output_shape(3), output_3);

  if (options.mixed_execution_providers) {
-    // we want the CUDA provider to be first, and the CPU provider second. all except the Scannode should run on
+    // we want the CUDA provider to be first, and the CPU provider second. all except the Scan node should run on
    // CUDA given that, which creates the scenario where we need to copy to/from CPU to execute the Scan node correctly.
    std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
    execution_providers.push_back(DefaultCudaExecutionProvider());
@ -405,7 +405,7 @@ static void RunTest_v9(const std::string test_name, int64_t sequence_len, int64_

    test.Run(expect_result, failure_message, {kTensorrtExecutionProvider}, nullptr, &execution_providers);
  } else {
-    test.Run(expect_result, failure_message, {kTensorrtExecutionProvider});// Disable TensorRT because its parser failed
+    test.Run(expect_result, failure_message, {kTensorrtExecutionProvider});  // Disable TensorRT because its parser failed
  }
 }

@ -889,7 +889,7 @@ TEST(Scan9, TransposeOutputDim2) {
  test.AddInput<float>("scan_input_1", input_shape, {1.0, 2.0});
  test.AddOutput<float>("scan_output_1", output_shape, {1.0, 2.0});

-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});// Disable TensorRT on supported data types
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // Disable TensorRT on supported data types
 }

 static void InvalidInput(bool is_v8) {
@ -1081,14 +1081,14 @@ void MixedTypeInputs(bool is_v8) {
  test.AddOutput<float>("scan_output_1", seq_shape, {0.0, 1.0, 2.0});
  test.AddOutput<int64_t>("scan_output_2", seq_shape, {0, 1, 2});

-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});// Disable TensorRT on unsupported data types
+  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  // Disable TensorRT on unsupported data types
 }

 TEST_8_AND_9(MixedTypeInputs);

 // create a subgraph that will have unknown dimensions in both the loop state variable and output
 // after shape inferencing.
-void UnknownDimInSubgraphOutput(bool is_v8) {
+void UnknownDimInSubgraphOutput(bool is_v8, bool mixed_execution_providers = false) {
  Model model("ScanBody");
  auto& graph = model.MainGraph();

@ -1146,7 +1146,17 @@ void UnknownDimInSubgraphOutput(bool is_v8) {
  test.AddOutput<float>("final_state_1", state_shape, {3.0});
  test.AddOutput<float>("scan_output_1", seq_shape, {0.0, 1.0, 2.0});

-  test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//Disable TensorRT on unknown dimension tests
+  if (mixed_execution_providers) {
+    // we want the CUDA provider to be first, and the CPU provider second. all except the Scan node should run on
+    // CUDA given that, which creates the scenario where we need to copy to/from CPU to execute the Scan node correctly.
+    std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
+    execution_providers.push_back(DefaultCudaExecutionProvider());
+    execution_providers.push_back(DefaultCpuExecutionProvider());
+
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}, nullptr, &execution_providers);
+  } else {
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});  //Disable TensorRT on unknown dimension tests
+  }
 }

 TEST_8_AND_9(UnknownDimInSubgraphOutput);
@ -1159,6 +1169,12 @@ TEST(Scan, MixedExecutionProviders) {

  ShortSequenceOneInBatchOneLoopStateVar(options);
 }
+
+TEST(Scan, MixedExecutionProvidersUnknownDimInSubgraphOutput) {
+  UnknownDimInSubgraphOutput(/*is_v8*/ true, /*mixed_execution_providers*/ true);
+  UnknownDimInSubgraphOutput(/*is_v8*/ false, /*mixed_execution_providers*/ true);
+}
+
 #endif

 }  // namespace test