mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-22 22:01:08 +00:00
Fix bug with delayed allocation of If and Scan outputs. (#2024)
* Fix bug with delayed allocation of If and Scan outputs. If the subgraph is producing output on a non-CPU device the delayed allocation was incorrectly providing a CPU allocated tensor. Check for the required location, and update 'fetches' instead if a device copy is needed. The utils::ExecuteGraph logic will handle the device copy in this case.
This commit is contained in:
parent
ca1b88c069
commit
ffb94fd170
7 changed files with 107 additions and 49 deletions
|
|
@ -389,13 +389,6 @@ static Status AllocateSparseTensor(MLValue& mlvalue, const DataTypeImpl& ml_type
|
|||
// This method is not thread safe!
|
||||
Status ExecutionFrame::AllocateAsPerAllocationPlan(OrtValue& ort_value, int ort_value_index, const TensorShape* shape,
|
||||
size_t nnz) {
|
||||
// if there is a custom allocator for this ort_value_index, call it to do the allocation
|
||||
auto custom_alloc_entry = custom_allocators_.find(ort_value_index);
|
||||
if (custom_alloc_entry != custom_allocators_.cend()) {
|
||||
ORT_ENFORCE(shape, "We don't expect custom allocators for non-tensor types, so a shape is mandatory here.");
|
||||
return (custom_alloc_entry->second)(*shape, ort_value);
|
||||
}
|
||||
|
||||
const SequentialExecutionPlan* p_seq_exec_plan = session_state_.GetExecutionPlan();
|
||||
const auto& alloc_plan = p_seq_exec_plan->allocation_plan;
|
||||
ORT_ENFORCE(ort_value_index >= 0 && static_cast<size_t>(ort_value_index) < alloc_plan.size());
|
||||
|
|
@ -409,6 +402,17 @@ Status ExecutionFrame::AllocateAsPerAllocationPlan(OrtValue& ort_value, int ort_
|
|||
"Tried to allocate without valid type information, ort_value index=" + std::to_string(ort_value_index));
|
||||
}
|
||||
|
||||
// if there is a custom allocator for this ort_value_index, call it to do the allocation
|
||||
auto custom_alloc_entry = custom_allocators_.find(ort_value_index);
|
||||
if (custom_alloc_entry != custom_allocators_.cend()) {
|
||||
ORT_ENFORCE(shape, "We don't expect custom allocators for non-tensor types, so a shape is mandatory here.");
|
||||
bool allocated = false;
|
||||
// see if custom allocator can handle allocation
|
||||
auto status = (custom_alloc_entry->second)(*shape, alloc_info, ort_value, allocated);
|
||||
if (allocated || !status.IsOK())
|
||||
return status;
|
||||
}
|
||||
|
||||
if (ml_type->IsTensorType()) {
|
||||
ORT_ENFORCE(shape, "Allocation of tensor types requires a shape.");
|
||||
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ class Logger;
|
|||
|
||||
class IExecutor {
|
||||
public:
|
||||
using CustomAllocator = std::function<Status(const TensorShape&, OrtValue&)>;
|
||||
using CustomAllocator = std::function<Status(const TensorShape&, const OrtMemoryInfo&, OrtValue&, bool& allocated)>;
|
||||
|
||||
virtual ~IExecutor() = default;
|
||||
|
||||
|
|
|
|||
|
|
@ -49,18 +49,18 @@ ONNX_OPERATOR_SET_SCHEMA(
|
|||
ONNX_CPU_OPERATOR_VERSIONED_KERNEL(If,
|
||||
1, 10,
|
||||
KernelDefBuilder()
|
||||
.TypeConstraint("B", DataTypeImpl::GetTensorType<bool>())
|
||||
.TypeConstraint("V", DataTypeImpl::AllTensorTypes()),
|
||||
.TypeConstraint("B", DataTypeImpl::GetTensorType<bool>())
|
||||
.TypeConstraint("V", DataTypeImpl::AllTensorTypes()),
|
||||
If);
|
||||
|
||||
// output shape rules requiring the output shapes of the 'THEN' and 'ELSE'
|
||||
// branches to be the same were relaxed in opset-11
|
||||
ONNX_CPU_OPERATOR_KERNEL(If,
|
||||
11,
|
||||
KernelDefBuilder()
|
||||
.TypeConstraint("B", DataTypeImpl::GetTensorType<bool>())
|
||||
.TypeConstraint("V", DataTypeImpl::AllTensorTypes()),
|
||||
If);
|
||||
11,
|
||||
KernelDefBuilder()
|
||||
.TypeConstraint("B", DataTypeImpl::GetTensorType<bool>())
|
||||
.TypeConstraint("V", DataTypeImpl::AllTensorTypes()),
|
||||
If);
|
||||
|
||||
struct If::Info {
|
||||
Info(const onnxruntime::Node& node, const GraphViewer& subgraph_in) : subgraph(subgraph_in) {
|
||||
|
|
@ -301,14 +301,27 @@ Status IfImpl::Execute(const FeedsFetchesManager& ffm) {
|
|||
if (outputs_[i].first == AllocationType::Delayed) {
|
||||
// functor to forward the allocation request from the subgraph to the If node's context so that the
|
||||
// allocation plan for the If node's output is used.
|
||||
fetch_allocators[i] = [this, i](const TensorShape& shape, OrtValue& ort_value) {
|
||||
// allocate
|
||||
fetch_allocators[i] = [this, i, &fetches](const TensorShape& shape, const OrtMemoryInfo& location,
|
||||
OrtValue& ort_value, bool& allocated) {
|
||||
// for now we only allocate on CPU as currently all 'If' outputs are on CPU.
|
||||
// if that does not match the required device we don't update the provided OrtValue and return false for
|
||||
// 'allocated'. the execution frame will allocate a buffer on the required device, and the fetches copy
|
||||
// logic in utils::ExecuteSubgraph will handle moving it to CPU (and into the tensor we allocated here)
|
||||
auto* tensor = context_.Output(i, shape);
|
||||
if (!tensor)
|
||||
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to create output tensor for If output ", i);
|
||||
|
||||
if (!tensor) return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to create output tensor for If output ", i);
|
||||
const OrtValue& value = *context_.GetOutputMLValue(i);
|
||||
|
||||
if (tensor->Location().device == location.device) {
|
||||
// return OrtValue for allocated tensor
|
||||
ort_value = value;
|
||||
allocated = true;
|
||||
} else {
|
||||
// put the allocated value into fetches so the copy logic in utils::ExecuteGraphImpl can use it
|
||||
fetches[i] = value;
|
||||
}
|
||||
|
||||
// return OrtValue for allocated tensor
|
||||
ort_value = *context_.GetOutputMLValue(i);
|
||||
return Status::OK();
|
||||
};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -231,14 +231,34 @@ Status IterateSequence(OpKernelContextInternal& context, const SessionState& ses
|
|||
auto& ort_value = *iterator;
|
||||
fetches.push_back(ort_value);
|
||||
} else {
|
||||
// need a dummy empty entry in fetches so the order matches the output names
|
||||
size_t i = fetches.size();
|
||||
fetches.emplace_back();
|
||||
|
||||
// use a custom allocator that will forward the allocation request to the Scan context
|
||||
// and add the sequence length dimension. this avoids using a temporary value for the first output
|
||||
fetch_allocators[output] = [&iterator](const TensorShape& shape, OrtValue& ort_value) {
|
||||
return iterator.AllocateSubgraphOutput(shape, ort_value);
|
||||
};
|
||||
fetch_allocators[output] = [i, &iterator, &fetches](const TensorShape& shape, const OrtMemoryInfo& location,
|
||||
OrtValue& ort_value, bool& allocated) {
|
||||
auto status = iterator.AllocateFinalOutput(shape);
|
||||
ORT_RETURN_IF_ERROR(status);
|
||||
|
||||
// also need a dummy empty entry in fetches so the order matches the output names
|
||||
fetches.emplace_back();
|
||||
const OrtValue& value = *iterator;
|
||||
|
||||
// for now we only allocate on CPU as currently all 'Scan' outputs are on CPU.
|
||||
// if that does not match the required device we don't update the provided OrtValue and return false for
|
||||
// 'allocated'. the execution frame will allocate a buffer on the required device, and the fetches copy
|
||||
// logic in utils::ExecuteSubgraph will handle moving it to CPU (and into the tensor we allocated here)
|
||||
if (value.Get<Tensor>().Location().device == location.device) {
|
||||
// update OrtValue with a current slice from the iterator.
|
||||
ort_value = value;
|
||||
allocated = true;
|
||||
} else {
|
||||
// put the allocated value into fetches so the copy logic in utils::ExecuteGraphImpl can use it
|
||||
fetches[i] = value;
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -268,8 +288,8 @@ Status IterateSequence(OpKernelContextInternal& context, const SessionState& ses
|
|||
|
||||
OrtValue AllocateTensorInMLValue(const MLDataType data_type, const TensorShape& shape, AllocatorPtr& allocator) {
|
||||
auto new_tensor = onnxruntime::make_unique<Tensor>(data_type,
|
||||
shape,
|
||||
allocator);
|
||||
shape,
|
||||
allocator);
|
||||
|
||||
return OrtValue{new_tensor.release(), DataTypeImpl::GetType<Tensor>(),
|
||||
DataTypeImpl::GetType<Tensor>()->GetDeleteFunc()};
|
||||
|
|
@ -432,7 +452,7 @@ Status OutputIterator::Initialize() {
|
|||
status = AllocateFinalBuffer();
|
||||
ORT_RETURN_IF_ERROR(status);
|
||||
} else {
|
||||
// delay until the first subgraph execution calls AllocateSubgraphOutput.
|
||||
// delay until the first subgraph execution calls AllocateFinalOutput.
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
|
|
@ -493,7 +513,7 @@ Status OutputIterator::AllocateFinalBuffer() {
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
Status OutputIterator::AllocateSubgraphOutput(const TensorShape& shape, OrtValue& ort_value) {
|
||||
Status OutputIterator::AllocateFinalOutput(const TensorShape& shape) {
|
||||
ORT_ENFORCE(!is_concrete_shape_, "If shape was concrete we shouldn't be using a custom allocator");
|
||||
|
||||
// update the final shape now that we can fill in the symbolic dimension with an actual value
|
||||
|
|
@ -504,16 +524,13 @@ Status OutputIterator::AllocateSubgraphOutput(const TensorShape& shape, OrtValue
|
|||
status = AllocateFinalBuffer();
|
||||
ORT_RETURN_IF_ERROR(status);
|
||||
|
||||
// get OrtValue from operator*()
|
||||
ort_value = **this;
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
OrtValue& OutputIterator::operator*() {
|
||||
ORT_ENFORCE(cur_iteration_ < num_iterations_);
|
||||
ORT_ENFORCE(is_concrete_shape_,
|
||||
"Expected AllocateSubgraphOutput to have been called to before we read the OrtValue from the iterator.");
|
||||
"Expected AllocateFinalOutput to have been called to before we read the OrtValue from the iterator.");
|
||||
|
||||
// for v8 both outputs and loop state vars use slicers. for v9 only outputs do
|
||||
if (is_v8_ || !is_loop_state_var_)
|
||||
|
|
@ -525,7 +542,7 @@ OrtValue& OutputIterator::operator*() {
|
|||
OutputIterator& OutputIterator::operator++() {
|
||||
if (cur_iteration_ < num_iterations_) {
|
||||
ORT_ENFORCE(is_concrete_shape_,
|
||||
"Expected AllocateSubgraphOutput to have been called to before we increment the iterator");
|
||||
"Expected AllocateFinalOutput to have been called to before we increment the iterator");
|
||||
|
||||
++cur_iteration_;
|
||||
|
||||
|
|
|
|||
|
|
@ -119,9 +119,8 @@ class OutputIterator {
|
|||
|
||||
// custom fetch allocator that can be used when the final shape is not concrete.
|
||||
// when the subgraph requests the allocation of the subgraph output, we forward the request to this instance,
|
||||
// allocate the overall output (taking into account the sequence length dimension),
|
||||
// and use a slicer to return the chunk for the subgraph output for this iteration.
|
||||
Status AllocateSubgraphOutput(const TensorShape& shape, OrtValue& ort_value);
|
||||
// and allocate the overall output (taking into account the sequence length dimension)
|
||||
Status AllocateFinalOutput(const TensorShape& shape);
|
||||
|
||||
// set the output for the current iteration to zeros. used for short sequence lengths
|
||||
void ZeroOutCurrent() {
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@ struct RunOptions {
|
|||
bool include_dim_values_in_subgraph = true;
|
||||
bool mixed_execution_providers = false;
|
||||
};
|
||||
}
|
||||
} // namespace
|
||||
|
||||
static const ONNX_NAMESPACE::GraphProto CreateSubgraph(bool then_branch, const RunOptions& options);
|
||||
|
||||
|
|
@ -44,8 +44,8 @@ static const ONNX_NAMESPACE::GraphProto CreateSubgraph(bool then_branch, const R
|
|||
|
||||
class IfOpTester : public OpTester {
|
||||
public:
|
||||
IfOpTester(const RunOptions& options, int opset_version = 10) :
|
||||
OpTester("If", opset_version), options_{options}, opset_version_(opset_version) {
|
||||
IfOpTester(const RunOptions& options, int opset_version = 10)
|
||||
: OpTester("If", opset_version), options_{options}, opset_version_(opset_version) {
|
||||
}
|
||||
|
||||
protected:
|
||||
|
|
@ -81,7 +81,7 @@ class IfOpTester : public OpTester {
|
|||
attr_proto.set_type(AttributeProto_AttributeType_INTS);
|
||||
|
||||
auto* split_attribute = attr_proto.mutable_ints();
|
||||
*split_attribute->Add() = 1; // split "unevenly" to create different shapes across the "then" and "else" branches
|
||||
*split_attribute->Add() = 1; // split "unevenly" to create different shapes across the "then" and "else" branches
|
||||
*split_attribute->Add() = 2;
|
||||
|
||||
split_node.AddAttribute("split", attr_proto);
|
||||
|
|
@ -114,7 +114,7 @@ class IfOpTester : public OpTester {
|
|||
int opset_version_;
|
||||
};
|
||||
|
||||
/* Subgraphs looks like this. All inputs come from outer scope so we just
|
||||
/* Subgraphs looks like this. All inputs come from outer scope so we just
|
||||
create a NodeArg with the input name. The numbers in [] are the values the tests are expected to produce
|
||||
as output from each node.
|
||||
|
||||
|
|
@ -296,6 +296,15 @@ TEST(If, MixedExecutionProviders) {
|
|||
options.mixed_execution_providers = true;
|
||||
RunTest(true, options);
|
||||
}
|
||||
|
||||
TEST(If, MixedExecutionProvidersNoShapeInSubgraph) {
|
||||
RunOptions options{};
|
||||
options.mixed_execution_providers = true;
|
||||
options.include_dim_values_in_main_graph = true;
|
||||
options.symbolic_dim_value_in_main_graph = 0;
|
||||
options.include_dim_values_in_subgraph = false;
|
||||
RunTest(true, options);
|
||||
}
|
||||
#endif // USE_CUDA
|
||||
|
||||
TEST(If, SymbolicShapeInMainGraph_NoShapeInSubgraph_True) {
|
||||
|
|
|
|||
|
|
@ -310,7 +310,7 @@ static void RunTest_v8(const std::string test_name, int64_t batch_size, int64_t
|
|||
test.AddOutput<float>("scan_output_2", output_shape, output_2);
|
||||
test.AddOutput<float>("scan_output_3", output_shape, output_3);
|
||||
|
||||
test.Run(expect_result, failure_message, {kTensorrtExecutionProvider});// Disable TensorRT because its parser failed
|
||||
test.Run(expect_result, failure_message, {kTensorrtExecutionProvider}); // Disable TensorRT because its parser failed
|
||||
}
|
||||
|
||||
static void RunTest_v9(const std::string test_name, int64_t sequence_len, int64_t input_size,
|
||||
|
|
@ -397,7 +397,7 @@ static void RunTest_v9(const std::string test_name, int64_t sequence_len, int64_
|
|||
test.AddOutput<float>("scan_output_3", calculate_output_shape(3), output_3);
|
||||
|
||||
if (options.mixed_execution_providers) {
|
||||
// we want the CUDA provider to be first, and the CPU provider second. all except the Scannode should run on
|
||||
// we want the CUDA provider to be first, and the CPU provider second. all except the Scan node should run on
|
||||
// CUDA given that, which creates the scenario where we need to copy to/from CPU to execute the Scan node correctly.
|
||||
std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
|
||||
execution_providers.push_back(DefaultCudaExecutionProvider());
|
||||
|
|
@ -405,7 +405,7 @@ static void RunTest_v9(const std::string test_name, int64_t sequence_len, int64_
|
|||
|
||||
test.Run(expect_result, failure_message, {kTensorrtExecutionProvider}, nullptr, &execution_providers);
|
||||
} else {
|
||||
test.Run(expect_result, failure_message, {kTensorrtExecutionProvider});// Disable TensorRT because its parser failed
|
||||
test.Run(expect_result, failure_message, {kTensorrtExecutionProvider}); // Disable TensorRT because its parser failed
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -889,7 +889,7 @@ TEST(Scan9, TransposeOutputDim2) {
|
|||
test.AddInput<float>("scan_input_1", input_shape, {1.0, 2.0});
|
||||
test.AddOutput<float>("scan_output_1", output_shape, {1.0, 2.0});
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});// Disable TensorRT on supported data types
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // Disable TensorRT on supported data types
|
||||
}
|
||||
|
||||
static void InvalidInput(bool is_v8) {
|
||||
|
|
@ -1081,14 +1081,14 @@ void MixedTypeInputs(bool is_v8) {
|
|||
test.AddOutput<float>("scan_output_1", seq_shape, {0.0, 1.0, 2.0});
|
||||
test.AddOutput<int64_t>("scan_output_2", seq_shape, {0, 1, 2});
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});// Disable TensorRT on unsupported data types
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // Disable TensorRT on unsupported data types
|
||||
}
|
||||
|
||||
TEST_8_AND_9(MixedTypeInputs);
|
||||
|
||||
// create a subgraph that will have unknown dimensions in both the loop state variable and output
|
||||
// after shape inferencing.
|
||||
void UnknownDimInSubgraphOutput(bool is_v8) {
|
||||
void UnknownDimInSubgraphOutput(bool is_v8, bool mixed_execution_providers = false) {
|
||||
Model model("ScanBody");
|
||||
auto& graph = model.MainGraph();
|
||||
|
||||
|
|
@ -1146,7 +1146,17 @@ void UnknownDimInSubgraphOutput(bool is_v8) {
|
|||
test.AddOutput<float>("final_state_1", state_shape, {3.0});
|
||||
test.AddOutput<float>("scan_output_1", seq_shape, {0.0, 1.0, 2.0});
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//Disable TensorRT on unknown dimension tests
|
||||
if (mixed_execution_providers) {
|
||||
// we want the CUDA provider to be first, and the CPU provider second. all except the Scan node should run on
|
||||
// CUDA given that, which creates the scenario where we need to copy to/from CPU to execute the Scan node correctly.
|
||||
std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
|
||||
execution_providers.push_back(DefaultCudaExecutionProvider());
|
||||
execution_providers.push_back(DefaultCpuExecutionProvider());
|
||||
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}, nullptr, &execution_providers);
|
||||
} else {
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //Disable TensorRT on unknown dimension tests
|
||||
}
|
||||
}
|
||||
|
||||
TEST_8_AND_9(UnknownDimInSubgraphOutput);
|
||||
|
|
@ -1159,6 +1169,12 @@ TEST(Scan, MixedExecutionProviders) {
|
|||
|
||||
ShortSequenceOneInBatchOneLoopStateVar(options);
|
||||
}
|
||||
|
||||
TEST(Scan, MixedExecutionProvidersUnknownDimInSubgraphOutput) {
|
||||
UnknownDimInSubgraphOutput(/*is_v8*/ true, /*mixed_execution_providers*/ true);
|
||||
UnknownDimInSubgraphOutput(/*is_v8*/ false, /*mixed_execution_providers*/ true);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
} // namespace test
|
||||
|
|
|
|||
Loading…
Reference in a new issue