Fix bug in MemcpyToHost (#10816)

This commit is contained in:
Hariharan Seshadri 2022-03-10 07:02:27 -08:00 committed by GitHub
parent 9853eaa14f
commit e80ff63274
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 75 additions and 14 deletions

View file

@ -130,6 +130,12 @@ class OpKernelContext {
*/
Status GetTempSpaceAllocator(AllocatorPtr* output) const ORT_MUST_USE_RESULT;
/**
Return the allocator associated with the CPU EP with memtype of OrtMemTypeDefault.
@remarks Use SafeInt when calculating the size of memory to allocate using AllocatorPtr->Alloc.
*/
Status GetTempSpaceCPUAllocator(AllocatorPtr* output) const ORT_MUST_USE_RESULT;
/**
Return the fence of current node's input.
@param index The index of the input.

View file

@ -94,6 +94,18 @@ Status OpKernelContext::GetTempSpaceAllocator(AllocatorPtr* output) const {
return Status::OK();
}
// Fetches the CPU allocator through the execution frame and stores it in
// `*output`.
// @param output Receives the allocator returned by the execution frame.
// @return Status::OK() when a non-null allocator was obtained; otherwise a
//         FAIL status carrying the message "CPU allocator not found".
Status OpKernelContext::GetTempSpaceCPUAllocator(AllocatorPtr* output) const {
  // The allocator lookup in SessionState (reached via ExecutionFrame) does
  // not key on OrtAllocatorType, so the allocator type picked here is
  // arbitrary - any OrtAllocatorType would resolve the same way.
  const OrtMemoryInfo cpu_memory_info(CPU, OrtAllocatorType::OrtArenaAllocator);
  *output = execution_frame_->GetAllocator(cpu_memory_info);

  if (*output) {
    return Status::OK();
  }

  return Status(common::ONNXRUNTIME, common::FAIL, "CPU allocator not found");
}
MLDataType OpKernelContext::InputType(int index) const {
int input_arg_index = GetInputArgIndex(index);
const OrtValue* p_ml_value = execution_frame_->GetNodeInputOrOutputMLValue(input_arg_index);

View file

@ -49,10 +49,25 @@ class Memcpy final : public OpKernel {
auto X_dtype = X->DataType();
Y->SetType(X_dtype);
AllocatorPtr alloc;
auto status = ctx->GetTempSpaceAllocator(&alloc);
if (!status.IsOK()) {
return Status(common::ONNXRUNTIME, common::FAIL,
"Memcpy cuda: unable to get an allocator.");
// If we are copying contents to CUDA, the allocator to use
// to allocate the buffers of the new tensors in the sequence
// can be temp space allocator associated with the CUDA EP
if (Node().OpType() == "MemcpyFromHost") {
auto status = ctx->GetTempSpaceAllocator(&alloc);
if (!status.IsOK()) {
return Status(common::ONNXRUNTIME, common::FAIL,
"Memcpy cuda: unable to get an allocator.");
}
} else {
// If we are copying contents to CPU (op type is "MemcpyToHost"),
// the allocator to use to allocate the buffers of the new tensors
// in the sequence will be the allocator from the CPU EP
auto status = ctx->GetTempSpaceCPUAllocator(&alloc);
if (!status.IsOK()) {
return Status(common::ONNXRUNTIME, common::FAIL,
"Memcpy cuda: unable to get the CPU allocator.");
}
}
auto X_size = X->Size();
for (size_t i = 0; i < X_size; ++i) {
@ -164,7 +179,7 @@ bool CUDAExecutionProvider::PerThreadContext::IsGraphCaptureAllowed() const {
return regular_run_count_before_graph_capture_ >= min_num_runs_before_cuda_graph_capture_;
}
void CUDAExecutionProvider::PerThreadContext::CaptureBegin() {
void CUDAExecutionProvider::PerThreadContext::CaptureBegin() {
// Reset cuda_graph_ first, then start a capture on it; the two calls are
// intentionally kept in this order.
cuda_graph_.Reset();
cuda_graph_.CaptureBegin();
}

View file

@ -694,6 +694,7 @@ struct ProviderHost {
virtual int OpKernelContext__InputCount(const OpKernelContext* p) = 0;
virtual int OpKernelContext__OutputCount(const OpKernelContext* p) = 0;
virtual Status OpKernelContext__GetTempSpaceAllocator(const OpKernelContext* p, AllocatorPtr* output) = 0;
virtual Status OpKernelContext__GetTempSpaceCPUAllocator(const OpKernelContext* p, AllocatorPtr* output) = 0;
virtual bool OpKernelContext__GetUseDeterministicCompute(const OpKernelContext* p) = 0;
virtual bool OpKernelContext__TryGetInferredOutputShape(const OpKernelContext* p, int index, TensorShape& shape) = 0;
virtual bool OpKernelContext__TryGetInferredInputShape(const OpKernelContext* p, int index, TensorShape& shape) = 0;

View file

@ -738,6 +738,8 @@ struct OpKernelContext final {
Status GetTempSpaceAllocator(AllocatorPtr* output) const { return g_host->OpKernelContext__GetTempSpaceAllocator(this, output); }
// Forwards to g_host->OpKernelContext__GetTempSpaceCPUAllocator, passing this
// context and the caller's output pointer, and returns the resulting Status.
Status GetTempSpaceCPUAllocator(AllocatorPtr* output) const {
  return g_host->OpKernelContext__GetTempSpaceCPUAllocator(this, output);
}
bool GetUseDeterministicCompute() const { return g_host->OpKernelContext__GetUseDeterministicCompute(this); }
bool TryGetInferredOutputShape(int index, TensorShape& shape) const { return g_host->OpKernelContext__TryGetInferredOutputShape(this, index, shape); }
@ -809,7 +811,7 @@ struct OpKernelInfo final {
return GetAttrs<T>(name, tmp).IsOK() ? tmp : default_value;
}
template<typename T>
template <typename T>
Status GetAttrsAsSpan(const std::string& name, gsl::span<const T>& out) const;
Status GetAttrs(const std::string& name, TensorShapeVector& out) const;
@ -863,8 +865,6 @@ inline TensorShapeVector OpKernelInfo::GetAttrsOrDefault(const std::string& name
return GetAttrs(name, tmp).IsOK() ? tmp : default_value;
}
class SessionState {
public:
const DataTransferManager& GetDataTransferMgr() const noexcept { return g_host->SessionState__GetDataTransferMgr(this); }

View file

@ -722,7 +722,7 @@ struct ProviderHostImpl : ProviderHost {
Status Graph__Resolve(Graph* p) override { return p->Resolve(); }
void Graph__AddInitializedTensor(Graph* p, const ONNX_NAMESPACE::TensorProto& tensor) override { p->AddInitializedTensor(tensor); }
Node& Graph__AddNode(Graph* p, const std::string& name, const std::string& op_type, const std::string& description, const gsl::span<NodeArg* const> & input_args, const gsl::span<NodeArg* const>& output_args, const NodeAttributes* attributes, const std::string& domain) override {
Node& Graph__AddNode(Graph* p, const std::string& name, const std::string& op_type, const std::string& description, const gsl::span<NodeArg* const>& input_args, const gsl::span<NodeArg* const>& output_args, const NodeAttributes* attributes, const std::string& domain) override {
return p->AddNode(name, op_type, description, input_args, output_args, attributes, domain);
}
@ -787,6 +787,7 @@ struct ProviderHostImpl : ProviderHost {
int OpKernelContext__InputCount(const OpKernelContext* p) override { return p->InputCount(); }
int OpKernelContext__OutputCount(const OpKernelContext* p) override { return p->OutputCount(); }
Status OpKernelContext__GetTempSpaceAllocator(const OpKernelContext* p, AllocatorPtr* output) override { return p->GetTempSpaceAllocator(output); }
// Host-side implementation: delegates to p->GetTempSpaceCPUAllocator(output)
// and returns its Status unchanged.
Status OpKernelContext__GetTempSpaceCPUAllocator(const OpKernelContext* p, AllocatorPtr* output) override {
  return p->GetTempSpaceCPUAllocator(output);
}
bool OpKernelContext__GetUseDeterministicCompute(const OpKernelContext* p) override { return p->GetUseDeterministicCompute(); }
bool OpKernelContext__TryGetInferredOutputShape(const OpKernelContext* p, int index, TensorShape& shape) override { return p->TryGetInferredOutputShape(index, shape); }
bool OpKernelContext__TryGetInferredInputShape(const OpKernelContext* p, int index, TensorShape& shape) override { return p->TryGetInferredInputShape(index, shape); }

View file

@ -168,6 +168,9 @@ static constexpr PATH_TYPE MATMUL_MODEL_URI = TSTR("testdata/matmul_1.onnx");
#ifndef ORT_NO_RTTI
static constexpr PATH_TYPE SEQUENCE_MODEL_URI = TSTR("testdata/sequence_length.onnx");
#endif
#ifdef USE_CUDA
static constexpr PATH_TYPE SEQUENCE_MODEL_URI_2 = TSTR("testdata/optional_sequence_tensor.onnx");
#endif
static constexpr PATH_TYPE CUSTOM_OP_MODEL_URI = TSTR("testdata/foo_1.onnx");
static constexpr PATH_TYPE CUSTOM_OP_LIBRARY_TEST_MODEL_URI = TSTR("testdata/custom_op_library/custom_op_test.onnx");
static constexpr PATH_TYPE OVERRIDABLE_INITIALIZER_MODEL_URI = TSTR("testdata/overridable_initializer.onnx");
@ -777,7 +780,7 @@ TEST(CApiTest, test_custom_op_library) {
#elif defined(__APPLE__)
lib_name = "libcustom_op_library.dylib";
#else
lib_name = "./libcustom_op_library.so";
lib_name = "./libcustom_op_library.so";
#endif
void* library_handle = nullptr;
@ -1144,14 +1147,13 @@ TEST(CApiTest, cuda_graph) {
std::vector<const char*> keys{"enable_cuda_graph"};
std::vector<const char*> values{"1"};
ASSERT_TRUE(api.UpdateCUDAProviderOptions(
rel_cuda_options.get(), keys.data(), values.data(), 1) == nullptr);
rel_cuda_options.get(), keys.data(), values.data(), 1) == nullptr);
Ort::SessionOptions session_options;
ASSERT_TRUE(api.SessionOptionsAppendExecutionProvider_CUDA_V2(
static_cast<OrtSessionOptions*>(session_options),
rel_cuda_options.get()) == nullptr);
static_cast<OrtSessionOptions*>(session_options),
rel_cuda_options.get()) == nullptr);
// Create IoBinding for inputs and outputs.
struct CudaMemoryDeleter {
explicit CudaMemoryDeleter(const Ort::Allocator* alloc) {
@ -2100,4 +2102,28 @@ TEST(CApiTest, GitHubIssue10179) {
}
}
}
// Loads SEQUENCE_MODEL_URI_2 in a session with the CUDA execution provider
// (device 0) appended, feeds a single boolean input named "cond", and requests
// the output named "sequence". The test only verifies that the run completes.
TEST(CApiTest, TestCudaMemcpyToHostWithSequenceTensors) {
  // Session with the CUDA execution provider registered.
  Ort::SessionOptions session_options{};
  Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0));
  Ort::Session session{*ort_env, SEQUENCE_MODEL_URI_2, session_options};

  // Input tensor: one bool element backed by `cond_value`, created with an
  // empty dimension list (dimension count of 0).
  Ort::MemoryInfo cpu_info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault);
  bool cond_value[] = {false};
  std::vector<int64_t> cond_dims{};

  std::vector<Ort::Value> inputs;
  inputs.emplace_back(
      Ort::Value::CreateTensor<bool>(cpu_info, cond_value, 1U, cond_dims.data(), 0));

  std::vector<const char*> input_names{"cond"};
  const char* output_names[] = {"sequence"};

  std::vector<Ort::Value> outputs = session.Run(Ort::RunOptions{nullptr},
                                                input_names.data(), inputs.data(), inputs.size(),
                                                output_names, countof(output_names));

  // The output contents are deliberately not inspected: the point of the test
  // is that the model runs without crashing.
}
#endif

Binary file not shown.