diff --git a/onnxruntime/core/framework/execution_frame.cc b/onnxruntime/core/framework/execution_frame.cc index 298f84268c..5efc666780 100644 --- a/onnxruntime/core/framework/execution_frame.cc +++ b/onnxruntime/core/framework/execution_frame.cc @@ -389,13 +389,6 @@ static Status AllocateSparseTensor(MLValue& mlvalue, const DataTypeImpl& ml_type // This method is not thread safe! Status ExecutionFrame::AllocateAsPerAllocationPlan(OrtValue& ort_value, int ort_value_index, const TensorShape* shape, size_t nnz) { - // if there is a custom allocator for this ort_value_index, call it to do the allocation - auto custom_alloc_entry = custom_allocators_.find(ort_value_index); - if (custom_alloc_entry != custom_allocators_.cend()) { - ORT_ENFORCE(shape, "We don't expect custom allocators for non-tensor types, so a shape is mandatory here."); - return (custom_alloc_entry->second)(*shape, ort_value); - } - const SequentialExecutionPlan* p_seq_exec_plan = session_state_.GetExecutionPlan(); const auto& alloc_plan = p_seq_exec_plan->allocation_plan; ORT_ENFORCE(ort_value_index >= 0 && static_cast(ort_value_index) < alloc_plan.size()); @@ -409,6 +402,17 @@ Status ExecutionFrame::AllocateAsPerAllocationPlan(OrtValue& ort_value, int ort_ "Tried to allocate without valid type information, ort_value index=" + std::to_string(ort_value_index)); } + // if there is a custom allocator for this ort_value_index, call it to do the allocation + auto custom_alloc_entry = custom_allocators_.find(ort_value_index); + if (custom_alloc_entry != custom_allocators_.cend()) { + ORT_ENFORCE(shape, "We don't expect custom allocators for non-tensor types, so a shape is mandatory here."); + bool allocated = false; + // see if custom allocator can handle allocation + auto status = (custom_alloc_entry->second)(*shape, alloc_info, ort_value, allocated); + if (allocated || !status.IsOK()) + return status; + } + if (ml_type->IsTensorType()) { ORT_ENFORCE(shape, "Allocation of tensor types requires a shape."); diff --git a/onnxruntime/core/framework/iexecutor.h b/onnxruntime/core/framework/iexecutor.h index c06b878c8b..e6cd1589b7 100644 --- a/onnxruntime/core/framework/iexecutor.h +++ b/onnxruntime/core/framework/iexecutor.h @@ -20,7 +20,7 @@ class Logger; class IExecutor { public: - using CustomAllocator = std::function; + using CustomAllocator = std::function; virtual ~IExecutor() = default; diff --git a/onnxruntime/core/providers/cpu/controlflow/if.cc b/onnxruntime/core/providers/cpu/controlflow/if.cc index a92f6e3b38..56c04ef13a 100644 --- a/onnxruntime/core/providers/cpu/controlflow/if.cc +++ b/onnxruntime/core/providers/cpu/controlflow/if.cc @@ -49,18 +49,18 @@ ONNX_OPERATOR_SET_SCHEMA( ONNX_CPU_OPERATOR_VERSIONED_KERNEL(If, 1, 10, KernelDefBuilder() - .TypeConstraint("B", DataTypeImpl::GetTensorType()) - .TypeConstraint("V", DataTypeImpl::AllTensorTypes()), + .TypeConstraint("B", DataTypeImpl::GetTensorType()) + .TypeConstraint("V", DataTypeImpl::AllTensorTypes()), If); // output shape rules requiring the output shapes of the 'THEN' and 'ELSE' // branches to be the same were relaxed in opset-11 ONNX_CPU_OPERATOR_KERNEL(If, - 11, - KernelDefBuilder() - .TypeConstraint("B", DataTypeImpl::GetTensorType()) - .TypeConstraint("V", DataTypeImpl::AllTensorTypes()), - If); + 11, + KernelDefBuilder() + .TypeConstraint("B", DataTypeImpl::GetTensorType()) + .TypeConstraint("V", DataTypeImpl::AllTensorTypes()), + If); struct If::Info { Info(const onnxruntime::Node& node, const GraphViewer& subgraph_in) : subgraph(subgraph_in) { @@ -301,14 +301,27 @@ Status IfImpl::Execute(const FeedsFetchesManager& ffm) { if (outputs_[i].first == AllocationType::Delayed) { // functor to forward the allocation request from the subgraph to the If node's context so that the // allocation plan for the If node's output is used. - fetch_allocators[i] = [this, i](const TensorShape& shape, OrtValue& ort_value) { - // allocate + fetch_allocators[i] = [this, i, &fetches](const TensorShape& shape, const OrtMemoryInfo& location, + OrtValue& ort_value, bool& allocated) { + // for now we only allocate on CPU as currently all 'If' outputs are on CPU. + // if that does not match the required device we don't update the provided OrtValue and return false for + // 'allocated'. the execution frame will allocate a buffer on the required device, and the fetches copy + // logic in utils::ExecuteSubgraph will handle moving it to CPU (and into the tensor we allocated here) auto* tensor = context_.Output(i, shape); + if (!tensor) + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to create output tensor for If output ", i); - if (!tensor) return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to create output tensor for If output ", i); + const OrtValue& value = *context_.GetOutputMLValue(i); + + if (tensor->Location().device == location.device) { + // return OrtValue for allocated tensor + ort_value = value; + allocated = true; + } else { + // put the allocated value into fetches so the copy logic in utils::ExecuteGraphImpl can use it + fetches[i] = value; + } - // return OrtValue for allocated tensor - ort_value = *context_.GetOutputMLValue(i); return Status::OK(); }; } diff --git a/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc b/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc index e3b2f2ce80..d61f68edee 100644 --- a/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc +++ b/onnxruntime/core/providers/cpu/controlflow/scan_utils.cc @@ -231,14 +231,34 @@ Status IterateSequence(OpKernelContextInternal& context, const SessionState& ses auto& ort_value = *iterator; fetches.push_back(ort_value); } else { + // need a dummy empty entry in fetches so the order matches the output names + size_t i = fetches.size(); + fetches.emplace_back(); + // use a custom allocator that will forward the allocation request to the Scan context // and add the sequence length dimension. this avoids using a temporary value for the first output - fetch_allocators[output] = [&iterator](const TensorShape& shape, OrtValue& ort_value) { - return iterator.AllocateSubgraphOutput(shape, ort_value); - }; + fetch_allocators[output] = [i, &iterator, &fetches](const TensorShape& shape, const OrtMemoryInfo& location, + OrtValue& ort_value, bool& allocated) { + auto status = iterator.AllocateFinalOutput(shape); + ORT_RETURN_IF_ERROR(status); - // also need a dummy empty entry in fetches so the order matches the output names - fetches.emplace_back(); + const OrtValue& value = *iterator; + + // for now we only allocate on CPU as currently all 'Scan' outputs are on CPU. + // if that does not match the required device we don't update the provided OrtValue and return false for + // 'allocated'. the execution frame will allocate a buffer on the required device, and the fetches copy + // logic in utils::ExecuteSubgraph will handle moving it to CPU (and into the tensor we allocated here) + if (value.Get().Location().device == location.device) { + // update OrtValue with a current slice from the iterator. + ort_value = value; + allocated = true; + } else { + // put the allocated value into fetches so the copy logic in utils::ExecuteGraphImpl can use it + fetches[i] = value; + } + + return Status::OK(); + }; } } } @@ -268,8 +288,8 @@ Status IterateSequence(OpKernelContextInternal& context, const SessionState& ses OrtValue AllocateTensorInMLValue(const MLDataType data_type, const TensorShape& shape, AllocatorPtr& allocator) { auto new_tensor = onnxruntime::make_unique(data_type, - shape, - allocator); + shape, + allocator); return OrtValue{new_tensor.release(), DataTypeImpl::GetType(), DataTypeImpl::GetType()->GetDeleteFunc()}; @@ -432,7 +452,7 @@ Status OutputIterator::Initialize() { status = AllocateFinalBuffer(); ORT_RETURN_IF_ERROR(status); } else { - // delay until the first subgraph execution calls AllocateSubgraphOutput. + // delay until the first subgraph execution calls AllocateFinalOutput. } return Status::OK(); @@ -493,7 +513,7 @@ Status OutputIterator::AllocateFinalBuffer() { return Status::OK(); } -Status OutputIterator::AllocateSubgraphOutput(const TensorShape& shape, OrtValue& ort_value) { +Status OutputIterator::AllocateFinalOutput(const TensorShape& shape) { ORT_ENFORCE(!is_concrete_shape_, "If shape was concrete we shouldn't be using a custom allocator"); // update the final shape now that we can fill in the symbolic dimension with an actual value @@ -504,16 +524,13 @@ Status OutputIterator::AllocateSubgraphOutput(const TensorShape& shape, OrtValue status = AllocateFinalBuffer(); ORT_RETURN_IF_ERROR(status); - // get OrtValue from operator*() - ort_value = **this; - return Status::OK(); } OrtValue& OutputIterator::operator*() { ORT_ENFORCE(cur_iteration_ < num_iterations_); ORT_ENFORCE(is_concrete_shape_, - "Expected AllocateSubgraphOutput to have been called to before we read the OrtValue from the iterator."); + "Expected AllocateFinalOutput to have been called to before we read the OrtValue from the iterator."); // for v8 both outputs and loop state vars use slicers. for v9 only outputs do if (is_v8_ || !is_loop_state_var_) @@ -525,7 +542,7 @@ OrtValue& OutputIterator::operator*() { OutputIterator& OutputIterator::operator++() { if (cur_iteration_ < num_iterations_) { ORT_ENFORCE(is_concrete_shape_, - "Expected AllocateSubgraphOutput to have been called to before we increment the iterator"); + "Expected AllocateFinalOutput to have been called to before we increment the iterator"); ++cur_iteration_; diff --git a/onnxruntime/core/providers/cpu/controlflow/scan_utils.h b/onnxruntime/core/providers/cpu/controlflow/scan_utils.h index f5e3f8afa1..04176e551d 100644 --- a/onnxruntime/core/providers/cpu/controlflow/scan_utils.h +++ b/onnxruntime/core/providers/cpu/controlflow/scan_utils.h @@ -119,9 +119,8 @@ class OutputIterator { // custom fetch allocator that can be used when the final shape is not concrete. // when the subgraph requests the allocation of the subgraph output, we forward the request to this instance, - // allocate the overall output (taking into account the sequence length dimension), - // and use a slicer to return the chunk for the subgraph output for this iteration. - Status AllocateSubgraphOutput(const TensorShape& shape, OrtValue& ort_value); + // and allocate the overall output (taking into account the sequence length dimension) + Status AllocateFinalOutput(const TensorShape& shape); // set the output for the current iteration to zeros. used for short sequence lengths void ZeroOutCurrent() { diff --git a/onnxruntime/test/providers/cpu/controlflow/if_test.cc b/onnxruntime/test/providers/cpu/controlflow/if_test.cc index 408e2fa855..7ccc15dbf1 100644 --- a/onnxruntime/test/providers/cpu/controlflow/if_test.cc +++ b/onnxruntime/test/providers/cpu/controlflow/if_test.cc @@ -23,7 +23,7 @@ struct RunOptions { bool include_dim_values_in_subgraph = true; bool mixed_execution_providers = false; }; -} +} // namespace static const ONNX_NAMESPACE::GraphProto CreateSubgraph(bool then_branch, const RunOptions& options); @@ -44,8 +44,8 @@ static const ONNX_NAMESPACE::GraphProto CreateSubgraph(bool then_branch, const R class IfOpTester : public OpTester { public: - IfOpTester(const RunOptions& options, int opset_version = 10) : - OpTester("If", opset_version), options_{options}, opset_version_(opset_version) { + IfOpTester(const RunOptions& options, int opset_version = 10) + : OpTester("If", opset_version), options_{options}, opset_version_(opset_version) { } protected: @@ -81,7 +81,7 @@ class IfOpTester : public OpTester { attr_proto.set_type(AttributeProto_AttributeType_INTS); auto* split_attribute = attr_proto.mutable_ints(); - *split_attribute->Add() = 1; // split "unevenly" to create different shapes across the "then" and "else" branches + *split_attribute->Add() = 1; // split "unevenly" to create different shapes across the "then" and "else" branches *split_attribute->Add() = 2; split_node.AddAttribute("split", attr_proto); @@ -114,7 +114,7 @@ class IfOpTester : public OpTester { int opset_version_; }; - /* Subgraphs looks like this. All inputs come from outer scope so we just +/* Subgraphs looks like this. All inputs come from outer scope so we just create a NodeArg with the input name. The numbers in [] are the values the tests are expected to produce as output from each node. @@ -296,6 +296,15 @@ TEST(If, MixedExecutionProviders) { options.mixed_execution_providers = true; RunTest(true, options); } + +TEST(If, MixedExecutionProvidersNoShapeInSubgraph) { + RunOptions options{}; + options.mixed_execution_providers = true; + options.include_dim_values_in_main_graph = true; + options.symbolic_dim_value_in_main_graph = 0; + options.include_dim_values_in_subgraph = false; + RunTest(true, options); +} #endif // USE_CUDA TEST(If, SymbolicShapeInMainGraph_NoShapeInSubgraph_True) { diff --git a/onnxruntime/test/providers/cpu/controlflow/scan_test.cc b/onnxruntime/test/providers/cpu/controlflow/scan_test.cc index c38656a8cf..10d350f478 100644 --- a/onnxruntime/test/providers/cpu/controlflow/scan_test.cc +++ b/onnxruntime/test/providers/cpu/controlflow/scan_test.cc @@ -310,7 +310,7 @@ static void RunTest_v8(const std::string test_name, int64_t batch_size, int64_t test.AddOutput("scan_output_2", output_shape, output_2); test.AddOutput("scan_output_3", output_shape, output_3); - test.Run(expect_result, failure_message, {kTensorrtExecutionProvider});// Disable TensorRT because its parser failed + test.Run(expect_result, failure_message, {kTensorrtExecutionProvider}); // Disable TensorRT because its parser failed } static void RunTest_v9(const std::string test_name, int64_t sequence_len, int64_t input_size, @@ -397,7 +397,7 @@ static void RunTest_v9(const std::string test_name, int64_t sequence_len, int64_ test.AddOutput("scan_output_3", calculate_output_shape(3), output_3); if (options.mixed_execution_providers) { - // we want the CUDA provider to be first, and the CPU provider second. all except the Scannode should run on + // we want the CUDA provider to be first, and the CPU provider second. all except the Scan node should run on // CUDA given that, which creates the scenario where we need to copy to/from CPU to execute the Scan node correctly. std::vector> execution_providers; execution_providers.push_back(DefaultCudaExecutionProvider()); @@ -405,7 +405,7 @@ static void RunTest_v9(const std::string test_name, int64_t sequence_len, int64_ test.Run(expect_result, failure_message, {kTensorrtExecutionProvider}, nullptr, &execution_providers); } else { - test.Run(expect_result, failure_message, {kTensorrtExecutionProvider});// Disable TensorRT because its parser failed + test.Run(expect_result, failure_message, {kTensorrtExecutionProvider}); // Disable TensorRT because its parser failed } } @@ -889,7 +889,7 @@ TEST(Scan9, TransposeOutputDim2) { test.AddInput("scan_input_1", input_shape, {1.0, 2.0}); test.AddOutput("scan_output_1", output_shape, {1.0, 2.0}); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});// Disable TensorRT on supported data types + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // Disable TensorRT on supported data types } static void InvalidInput(bool is_v8) { @@ -1081,14 +1081,14 @@ void MixedTypeInputs(bool is_v8) { test.AddOutput("scan_output_1", seq_shape, {0.0, 1.0, 2.0}); test.AddOutput("scan_output_2", seq_shape, {0, 1, 2}); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});// Disable TensorRT on unsupported data types + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // Disable TensorRT on unsupported data types } TEST_8_AND_9(MixedTypeInputs); // create a subgraph that will have unknown dimensions in both the loop state variable and output // after shape inferencing. -void UnknownDimInSubgraphOutput(bool is_v8) { +void UnknownDimInSubgraphOutput(bool is_v8, bool mixed_execution_providers = false) { Model model("ScanBody"); auto& graph = model.MainGraph(); @@ -1146,7 +1146,17 @@ void UnknownDimInSubgraphOutput(bool is_v8) { test.AddOutput("final_state_1", state_shape, {3.0}); test.AddOutput("scan_output_1", seq_shape, {0.0, 1.0, 2.0}); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//Disable TensorRT on unknown dimension tests + if (mixed_execution_providers) { + // we want the CUDA provider to be first, and the CPU provider second. all except the Scan node should run on + // CUDA given that, which creates the scenario where we need to copy to/from CPU to execute the Scan node correctly. + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + execution_providers.push_back(DefaultCpuExecutionProvider()); + + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}, nullptr, &execution_providers); + } else { + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //Disable TensorRT on unknown dimension tests + } } TEST_8_AND_9(UnknownDimInSubgraphOutput); @@ -1159,6 +1169,12 @@ TEST(Scan, MixedExecutionProviders) { ShortSequenceOneInBatchOneLoopStateVar(options); } + +TEST(Scan, MixedExecutionProvidersUnknownDimInSubgraphOutput) { + UnknownDimInSubgraphOutput(/*is_v8*/ true, /*mixed_execution_providers*/ true); + UnknownDimInSubgraphOutput(/*is_v8*/ false, /*mixed_execution_providers*/ true); +} + #endif } // namespace test