Fix bug with delayed allocation of If and Scan outputs. (#2024)

* Fix bug with delayed allocation of If and Scan outputs.
If the subgraph is producing output on a non-CPU device the delayed allocation was incorrectly providing a CPU allocated tensor.
Check for the required location, and update 'fetches' instead if a device copy is needed.
The utils::ExecuteGraph logic will handle the device copy in this case.
This commit is contained in:
Scott McKay 2019-10-11 19:49:21 +10:00 committed by GitHub
parent ca1b88c069
commit ffb94fd170
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 107 additions and 49 deletions

View file

@ -389,13 +389,6 @@ static Status AllocateSparseTensor(MLValue& mlvalue, const DataTypeImpl& ml_type
// This method is not thread safe!
Status ExecutionFrame::AllocateAsPerAllocationPlan(OrtValue& ort_value, int ort_value_index, const TensorShape* shape,
size_t nnz) {
// if there is a custom allocator for this ort_value_index, call it to do the allocation
auto custom_alloc_entry = custom_allocators_.find(ort_value_index);
if (custom_alloc_entry != custom_allocators_.cend()) {
ORT_ENFORCE(shape, "We don't expect custom allocators for non-tensor types, so a shape is mandatory here.");
return (custom_alloc_entry->second)(*shape, ort_value);
}
const SequentialExecutionPlan* p_seq_exec_plan = session_state_.GetExecutionPlan();
const auto& alloc_plan = p_seq_exec_plan->allocation_plan;
ORT_ENFORCE(ort_value_index >= 0 && static_cast<size_t>(ort_value_index) < alloc_plan.size());
@ -409,6 +402,17 @@ Status ExecutionFrame::AllocateAsPerAllocationPlan(OrtValue& ort_value, int ort_
"Tried to allocate without valid type information, ort_value index=" + std::to_string(ort_value_index));
}
// if there is a custom allocator for this ort_value_index, call it to do the allocation
auto custom_alloc_entry = custom_allocators_.find(ort_value_index);
if (custom_alloc_entry != custom_allocators_.cend()) {
ORT_ENFORCE(shape, "We don't expect custom allocators for non-tensor types, so a shape is mandatory here.");
bool allocated = false;
// see if custom allocator can handle allocation
auto status = (custom_alloc_entry->second)(*shape, alloc_info, ort_value, allocated);
if (allocated || !status.IsOK())
return status;
}
if (ml_type->IsTensorType()) {
ORT_ENFORCE(shape, "Allocation of tensor types requires a shape.");

View file

@ -20,7 +20,7 @@ class Logger;
class IExecutor {
public:
using CustomAllocator = std::function<Status(const TensorShape&, OrtValue&)>;
using CustomAllocator = std::function<Status(const TensorShape&, const OrtMemoryInfo&, OrtValue&, bool& allocated)>;
virtual ~IExecutor() = default;

View file

@ -49,18 +49,18 @@ ONNX_OPERATOR_SET_SCHEMA(
ONNX_CPU_OPERATOR_VERSIONED_KERNEL(If,
1, 10,
KernelDefBuilder()
.TypeConstraint("B", DataTypeImpl::GetTensorType<bool>())
.TypeConstraint("V", DataTypeImpl::AllTensorTypes()),
.TypeConstraint("B", DataTypeImpl::GetTensorType<bool>())
.TypeConstraint("V", DataTypeImpl::AllTensorTypes()),
If);
// output shape rules requiring the output shapes of the 'THEN' and 'ELSE'
// branches to be the same were relaxed in opset-11
ONNX_CPU_OPERATOR_KERNEL(If,
11,
KernelDefBuilder()
.TypeConstraint("B", DataTypeImpl::GetTensorType<bool>())
.TypeConstraint("V", DataTypeImpl::AllTensorTypes()),
If);
11,
KernelDefBuilder()
.TypeConstraint("B", DataTypeImpl::GetTensorType<bool>())
.TypeConstraint("V", DataTypeImpl::AllTensorTypes()),
If);
struct If::Info {
Info(const onnxruntime::Node& node, const GraphViewer& subgraph_in) : subgraph(subgraph_in) {
@ -301,14 +301,27 @@ Status IfImpl::Execute(const FeedsFetchesManager& ffm) {
if (outputs_[i].first == AllocationType::Delayed) {
// functor to forward the allocation request from the subgraph to the If node's context so that the
// allocation plan for the If node's output is used.
fetch_allocators[i] = [this, i](const TensorShape& shape, OrtValue& ort_value) {
// allocate
fetch_allocators[i] = [this, i, &fetches](const TensorShape& shape, const OrtMemoryInfo& location,
OrtValue& ort_value, bool& allocated) {
// for now we only allocate on CPU as currently all 'If' outputs are on CPU.
// if that does not match the required device we don't update the provided OrtValue and return false for
// 'allocated'. the execution frame will allocate a buffer on the required device, and the fetches copy
// logic in utils::ExecuteSubgraph will handle moving it to CPU (and into the tensor we allocated here)
auto* tensor = context_.Output(i, shape);
if (!tensor)
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to create output tensor for If output ", i);
if (!tensor) return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to create output tensor for If output ", i);
const OrtValue& value = *context_.GetOutputMLValue(i);
if (tensor->Location().device == location.device) {
// return OrtValue for allocated tensor
ort_value = value;
allocated = true;
} else {
// put the allocated value into fetches so the copy logic in utils::ExecuteGraphImpl can use it
fetches[i] = value;
}
// return OrtValue for allocated tensor
ort_value = *context_.GetOutputMLValue(i);
return Status::OK();
};
}

View file

@ -231,14 +231,34 @@ Status IterateSequence(OpKernelContextInternal& context, const SessionState& ses
auto& ort_value = *iterator;
fetches.push_back(ort_value);
} else {
// need a dummy empty entry in fetches so the order matches the output names
size_t i = fetches.size();
fetches.emplace_back();
// use a custom allocator that will forward the allocation request to the Scan context
// and add the sequence length dimension. this avoids using a temporary value for the first output
fetch_allocators[output] = [&iterator](const TensorShape& shape, OrtValue& ort_value) {
return iterator.AllocateSubgraphOutput(shape, ort_value);
};
fetch_allocators[output] = [i, &iterator, &fetches](const TensorShape& shape, const OrtMemoryInfo& location,
OrtValue& ort_value, bool& allocated) {
auto status = iterator.AllocateFinalOutput(shape);
ORT_RETURN_IF_ERROR(status);
// also need a dummy empty entry in fetches so the order matches the output names
fetches.emplace_back();
const OrtValue& value = *iterator;
// for now we only allocate on CPU as currently all 'Scan' outputs are on CPU.
// if that does not match the required device we don't update the provided OrtValue and return false for
// 'allocated'. the execution frame will allocate a buffer on the required device, and the fetches copy
// logic in utils::ExecuteSubgraph will handle moving it to CPU (and into the tensor we allocated here)
if (value.Get<Tensor>().Location().device == location.device) {
// update OrtValue with a current slice from the iterator.
ort_value = value;
allocated = true;
} else {
// put the allocated value into fetches so the copy logic in utils::ExecuteGraphImpl can use it
fetches[i] = value;
}
return Status::OK();
};
}
}
}
@ -268,8 +288,8 @@ Status IterateSequence(OpKernelContextInternal& context, const SessionState& ses
OrtValue AllocateTensorInMLValue(const MLDataType data_type, const TensorShape& shape, AllocatorPtr& allocator) {
auto new_tensor = onnxruntime::make_unique<Tensor>(data_type,
shape,
allocator);
shape,
allocator);
return OrtValue{new_tensor.release(), DataTypeImpl::GetType<Tensor>(),
DataTypeImpl::GetType<Tensor>()->GetDeleteFunc()};
@ -432,7 +452,7 @@ Status OutputIterator::Initialize() {
status = AllocateFinalBuffer();
ORT_RETURN_IF_ERROR(status);
} else {
// delay until the first subgraph execution calls AllocateSubgraphOutput.
// delay until the first subgraph execution calls AllocateFinalOutput.
}
return Status::OK();
@ -493,7 +513,7 @@ Status OutputIterator::AllocateFinalBuffer() {
return Status::OK();
}
Status OutputIterator::AllocateSubgraphOutput(const TensorShape& shape, OrtValue& ort_value) {
Status OutputIterator::AllocateFinalOutput(const TensorShape& shape) {
ORT_ENFORCE(!is_concrete_shape_, "If shape was concrete we shouldn't be using a custom allocator");
// update the final shape now that we can fill in the symbolic dimension with an actual value
@ -504,16 +524,13 @@ Status OutputIterator::AllocateSubgraphOutput(const TensorShape& shape, OrtValue
status = AllocateFinalBuffer();
ORT_RETURN_IF_ERROR(status);
// get OrtValue from operator*()
ort_value = **this;
return Status::OK();
}
OrtValue& OutputIterator::operator*() {
ORT_ENFORCE(cur_iteration_ < num_iterations_);
ORT_ENFORCE(is_concrete_shape_,
"Expected AllocateSubgraphOutput to have been called to before we read the OrtValue from the iterator.");
"Expected AllocateFinalOutput to have been called to before we read the OrtValue from the iterator.");
// for v8 both outputs and loop state vars use slicers. for v9 only outputs do
if (is_v8_ || !is_loop_state_var_)
@ -525,7 +542,7 @@ OrtValue& OutputIterator::operator*() {
OutputIterator& OutputIterator::operator++() {
if (cur_iteration_ < num_iterations_) {
ORT_ENFORCE(is_concrete_shape_,
"Expected AllocateSubgraphOutput to have been called to before we increment the iterator");
"Expected AllocateFinalOutput to have been called to before we increment the iterator");
++cur_iteration_;

View file

@ -119,9 +119,8 @@ class OutputIterator {
// custom fetch allocator that can be used when the final shape is not concrete.
// when the subgraph requests the allocation of the subgraph output, we forward the request to this instance,
// allocate the overall output (taking into account the sequence length dimension),
// and use a slicer to return the chunk for the subgraph output for this iteration.
Status AllocateSubgraphOutput(const TensorShape& shape, OrtValue& ort_value);
// and allocate the overall output (taking into account the sequence length dimension)
Status AllocateFinalOutput(const TensorShape& shape);
// set the output for the current iteration to zeros. used for short sequence lengths
void ZeroOutCurrent() {

View file

@ -23,7 +23,7 @@ struct RunOptions {
bool include_dim_values_in_subgraph = true;
bool mixed_execution_providers = false;
};
}
} // namespace
static const ONNX_NAMESPACE::GraphProto CreateSubgraph(bool then_branch, const RunOptions& options);
@ -44,8 +44,8 @@ static const ONNX_NAMESPACE::GraphProto CreateSubgraph(bool then_branch, const R
class IfOpTester : public OpTester {
public:
IfOpTester(const RunOptions& options, int opset_version = 10) :
OpTester("If", opset_version), options_{options}, opset_version_(opset_version) {
IfOpTester(const RunOptions& options, int opset_version = 10)
: OpTester("If", opset_version), options_{options}, opset_version_(opset_version) {
}
protected:
@ -81,7 +81,7 @@ class IfOpTester : public OpTester {
attr_proto.set_type(AttributeProto_AttributeType_INTS);
auto* split_attribute = attr_proto.mutable_ints();
*split_attribute->Add() = 1; // split "unevenly" to create different shapes across the "then" and "else" branches
*split_attribute->Add() = 1; // split "unevenly" to create different shapes across the "then" and "else" branches
*split_attribute->Add() = 2;
split_node.AddAttribute("split", attr_proto);
@ -114,7 +114,7 @@ class IfOpTester : public OpTester {
int opset_version_;
};
/* Subgraphs looks like this. All inputs come from outer scope so we just
/* Subgraphs looks like this. All inputs come from outer scope so we just
create a NodeArg with the input name. The numbers in [] are the values the tests are expected to produce
as output from each node.
@ -296,6 +296,15 @@ TEST(If, MixedExecutionProviders) {
options.mixed_execution_providers = true;
RunTest(true, options);
}
TEST(If, MixedExecutionProvidersNoShapeInSubgraph) {
RunOptions options{};
options.mixed_execution_providers = true;
options.include_dim_values_in_main_graph = true;
options.symbolic_dim_value_in_main_graph = 0;
options.include_dim_values_in_subgraph = false;
RunTest(true, options);
}
#endif // USE_CUDA
TEST(If, SymbolicShapeInMainGraph_NoShapeInSubgraph_True) {

View file

@ -310,7 +310,7 @@ static void RunTest_v8(const std::string test_name, int64_t batch_size, int64_t
test.AddOutput<float>("scan_output_2", output_shape, output_2);
test.AddOutput<float>("scan_output_3", output_shape, output_3);
test.Run(expect_result, failure_message, {kTensorrtExecutionProvider});// Disable TensorRT because its parser failed
test.Run(expect_result, failure_message, {kTensorrtExecutionProvider}); // Disable TensorRT because its parser failed
}
static void RunTest_v9(const std::string test_name, int64_t sequence_len, int64_t input_size,
@ -397,7 +397,7 @@ static void RunTest_v9(const std::string test_name, int64_t sequence_len, int64_
test.AddOutput<float>("scan_output_3", calculate_output_shape(3), output_3);
if (options.mixed_execution_providers) {
// we want the CUDA provider to be first, and the CPU provider second. all except the Scannode should run on
// we want the CUDA provider to be first, and the CPU provider second. all except the Scan node should run on
// CUDA given that, which creates the scenario where we need to copy to/from CPU to execute the Scan node correctly.
std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
execution_providers.push_back(DefaultCudaExecutionProvider());
@ -405,7 +405,7 @@ static void RunTest_v9(const std::string test_name, int64_t sequence_len, int64_
test.Run(expect_result, failure_message, {kTensorrtExecutionProvider}, nullptr, &execution_providers);
} else {
test.Run(expect_result, failure_message, {kTensorrtExecutionProvider});// Disable TensorRT because its parser failed
test.Run(expect_result, failure_message, {kTensorrtExecutionProvider}); // Disable TensorRT because its parser failed
}
}
@ -889,7 +889,7 @@ TEST(Scan9, TransposeOutputDim2) {
test.AddInput<float>("scan_input_1", input_shape, {1.0, 2.0});
test.AddOutput<float>("scan_output_1", output_shape, {1.0, 2.0});
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});// Disable TensorRT on supported data types
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // Disable TensorRT on supported data types
}
static void InvalidInput(bool is_v8) {
@ -1081,14 +1081,14 @@ void MixedTypeInputs(bool is_v8) {
test.AddOutput<float>("scan_output_1", seq_shape, {0.0, 1.0, 2.0});
test.AddOutput<int64_t>("scan_output_2", seq_shape, {0, 1, 2});
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});// Disable TensorRT on unsupported data types
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // Disable TensorRT on unsupported data types
}
TEST_8_AND_9(MixedTypeInputs);
// create a subgraph that will have unknown dimensions in both the loop state variable and output
// after shape inferencing.
void UnknownDimInSubgraphOutput(bool is_v8) {
void UnknownDimInSubgraphOutput(bool is_v8, bool mixed_execution_providers = false) {
Model model("ScanBody");
auto& graph = model.MainGraph();
@ -1146,7 +1146,17 @@ void UnknownDimInSubgraphOutput(bool is_v8) {
test.AddOutput<float>("final_state_1", state_shape, {3.0});
test.AddOutput<float>("scan_output_1", seq_shape, {0.0, 1.0, 2.0});
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider});//Disable TensorRT on unknown dimension tests
if (mixed_execution_providers) {
// we want the CUDA provider to be first, and the CPU provider second. all except the Scan node should run on
// CUDA given that, which creates the scenario where we need to copy to/from CPU to execute the Scan node correctly.
std::vector<std::unique_ptr<IExecutionProvider>> execution_providers;
execution_providers.push_back(DefaultCudaExecutionProvider());
execution_providers.push_back(DefaultCpuExecutionProvider());
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}, nullptr, &execution_providers);
} else {
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); //Disable TensorRT on unknown dimension tests
}
}
TEST_8_AND_9(UnknownDimInSubgraphOutput);
@ -1159,6 +1169,12 @@ TEST(Scan, MixedExecutionProviders) {
ShortSequenceOneInBatchOneLoopStateVar(options);
}
TEST(Scan, MixedExecutionProvidersUnknownDimInSubgraphOutput) {
UnknownDimInSubgraphOutput(/*is_v8*/ true, /*mixed_execution_providers*/ true);
UnknownDimInSubgraphOutput(/*is_v8*/ false, /*mixed_execution_providers*/ true);
}
#endif
} // namespace test