mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-26 22:35:43 +00:00
Fix bug in MemcpyToHost (#10816)
This commit is contained in:
parent
9853eaa14f
commit
e80ff63274
8 changed files with 75 additions and 14 deletions
|
|
@ -130,6 +130,12 @@ class OpKernelContext {
|
|||
*/
|
||||
Status GetTempSpaceAllocator(AllocatorPtr* output) const ORT_MUST_USE_RESULT;
|
||||
|
||||
/**
|
||||
Return the allocator associated with the CPU EP with memtype of OrtMemTypeDefault.
|
||||
@remarks Use SafeInt when calculating the size of memory to allocate using AllocatorPtr->Alloc.
@param output Receives the allocator looked up through the execution frame.
@return Status::OK() on success; a FAIL status with the message "CPU allocator not found"
        when the lookup produces an empty allocator.
|
||||
*/
|
||||
Status GetTempSpaceCPUAllocator(AllocatorPtr* output) const ORT_MUST_USE_RESULT;
|
||||
|
||||
/**
|
||||
Return the fence of the current node's input.
|
||||
@param index The index of the input.
|
||||
|
|
|
|||
|
|
@ -94,6 +94,18 @@ Status OpKernelContext::GetTempSpaceAllocator(AllocatorPtr* output) const {
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
// Looks up the CPU allocator through the execution frame and stores it in *output.
// Returns Status::OK() when an allocator was obtained, otherwise a
// common::ONNXRUNTIME / common::FAIL status with the message "CPU allocator not found".
// `output` is dereferenced unconditionally, so callers must pass a valid pointer.
Status OpKernelContext::GetTempSpaceCPUAllocator(AllocatorPtr* output) const {
|
||||
// While looking up the allocator from SessionState
|
||||
// (which is called via ExecutionFrame), the allocator lookup
|
||||
// logic doesn't key on OrtAllocatorType, so any OrtAllocatorType
|
||||
// is good here.
|
||||
*output = execution_frame_->GetAllocator(
|
||||
OrtMemoryInfo(CPU, OrtAllocatorType::OrtArenaAllocator));
|
||||
// An empty result means no allocator matched the requested memory info;
// report that as a failure rather than returning OK with an unusable allocator.
if (!*output)
|
||||
return Status(common::ONNXRUNTIME, common::FAIL, "CPU allocator not found");
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
MLDataType OpKernelContext::InputType(int index) const {
|
||||
int input_arg_index = GetInputArgIndex(index);
|
||||
const OrtValue* p_ml_value = execution_frame_->GetNodeInputOrOutputMLValue(input_arg_index);
|
||||
|
|
|
|||
|
|
@ -49,10 +49,25 @@ class Memcpy final : public OpKernel {
|
|||
auto X_dtype = X->DataType();
|
||||
Y->SetType(X_dtype);
|
||||
AllocatorPtr alloc;
|
||||
auto status = ctx->GetTempSpaceAllocator(&alloc);
|
||||
if (!status.IsOK()) {
|
||||
return Status(common::ONNXRUNTIME, common::FAIL,
|
||||
"Memcpy cuda: unable to get an allocator.");
|
||||
|
||||
// If we are copying contents to CUDA, the allocator to use
|
||||
// to allocate the buffers of the new tensors in the sequence
|
||||
// can be the temp space allocator associated with the CUDA EP
|
||||
if (Node().OpType() == "MemcpyFromHost") {
|
||||
auto status = ctx->GetTempSpaceAllocator(&alloc);
|
||||
if (!status.IsOK()) {
|
||||
return Status(common::ONNXRUNTIME, common::FAIL,
|
||||
"Memcpy cuda: unable to get an allocator.");
|
||||
}
|
||||
} else {
|
||||
// If we are copying contents to CPU (op type is "MemcpyToHost"),
|
||||
// the allocator to use to allocate the buffers of the new tensors
|
||||
// in the sequence will be the allocator from the CPU EP
|
||||
auto status = ctx->GetTempSpaceCPUAllocator(&alloc);
|
||||
if (!status.IsOK()) {
|
||||
return Status(common::ONNXRUNTIME, common::FAIL,
|
||||
"Memcpy cuda: unable to get the CPU allocator.");
|
||||
}
|
||||
}
|
||||
auto X_size = X->Size();
|
||||
for (size_t i = 0; i < X_size; ++i) {
|
||||
|
|
@ -164,7 +179,7 @@ bool CUDAExecutionProvider::PerThreadContext::IsGraphCaptureAllowed() const {
|
|||
return regular_run_count_before_graph_capture_ >= min_num_runs_before_cuda_graph_capture_;
|
||||
}
|
||||
|
||||
// Starts a capture on this context's cuda_graph_: the graph object is reset
// first, then CaptureBegin() is invoked on it.
// NOTE(review): the signature line appears twice in this view - presumably the
// old and new sides of a whitespace-only diff line; confirm against the real file.
void CUDAExecutionProvider::PerThreadContext::CaptureBegin() {
|
||||
void CUDAExecutionProvider::PerThreadContext::CaptureBegin() {
|
||||
cuda_graph_.Reset();
|
||||
cuda_graph_.CaptureBegin();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -694,6 +694,7 @@ struct ProviderHost {
|
|||
// OpKernelContext entry points of the provider host interface. Every one is
// pure virtual and receives the context explicitly as `p`.
virtual int OpKernelContext__InputCount(const OpKernelContext* p) = 0;
|
||||
virtual int OpKernelContext__OutputCount(const OpKernelContext* p) = 0;
|
||||
virtual Status OpKernelContext__GetTempSpaceAllocator(const OpKernelContext* p, AllocatorPtr* output) = 0;
|
||||
// CPU-allocator counterpart of the entry point above; the allocator is returned via `output`.
virtual Status OpKernelContext__GetTempSpaceCPUAllocator(const OpKernelContext* p, AllocatorPtr* output) = 0;
|
||||
virtual bool OpKernelContext__GetUseDeterministicCompute(const OpKernelContext* p) = 0;
|
||||
virtual bool OpKernelContext__TryGetInferredOutputShape(const OpKernelContext* p, int index, TensorShape& shape) = 0;
|
||||
virtual bool OpKernelContext__TryGetInferredInputShape(const OpKernelContext* p, int index, TensorShape& shape) = 0;
|
||||
|
|
|
|||
|
|
@ -738,6 +738,8 @@ struct OpKernelContext final {
|
|||
|
||||
// Thin forwarders: each method passes `this` plus its own arguments to the
// matching g_host->OpKernelContext__* entry point and returns that result unchanged.
Status GetTempSpaceAllocator(AllocatorPtr* output) const { return g_host->OpKernelContext__GetTempSpaceAllocator(this, output); }
|
||||
|
||||
// Forwards to the host's GetTempSpaceCPUAllocator entry point; the allocator is returned via `output`.
Status GetTempSpaceCPUAllocator(AllocatorPtr* output) const { return g_host->OpKernelContext__GetTempSpaceCPUAllocator(this, output); }
|
||||
|
||||
bool GetUseDeterministicCompute() const { return g_host->OpKernelContext__GetUseDeterministicCompute(this); }
|
||||
|
||||
bool TryGetInferredOutputShape(int index, TensorShape& shape) const { return g_host->OpKernelContext__TryGetInferredOutputShape(this, index, shape); }
|
||||
|
|
@ -809,7 +811,7 @@ struct OpKernelInfo final {
|
|||
return GetAttrs<T>(name, tmp).IsOK() ? tmp : default_value;
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
template <typename T>
|
||||
Status GetAttrsAsSpan(const std::string& name, gsl::span<const T>& out) const;
|
||||
|
||||
Status GetAttrs(const std::string& name, TensorShapeVector& out) const;
|
||||
|
|
@ -863,8 +865,6 @@ inline TensorShapeVector OpKernelInfo::GetAttrsOrDefault(const std::string& name
|
|||
return GetAttrs(name, tmp).IsOK() ? tmp : default_value;
|
||||
}
|
||||
|
||||
|
||||
|
||||
class SessionState {
|
||||
public:
|
||||
const DataTransferManager& GetDataTransferMgr() const noexcept { return g_host->SessionState__GetDataTransferMgr(this); }
|
||||
|
|
|
|||
|
|
@ -722,7 +722,7 @@ struct ProviderHostImpl : ProviderHost {
|
|||
|
||||
// Graph overrides: each forwards to the same-named member on `p` with the
// arguments unchanged.
Status Graph__Resolve(Graph* p) override { return p->Resolve(); }
|
||||
void Graph__AddInitializedTensor(Graph* p, const ONNX_NAMESPACE::TensorProto& tensor) override { p->AddInitializedTensor(tensor); }
|
||||
// NOTE(review): the Graph__AddNode signature appears twice below - presumably the
// old and new sides of a whitespace-only diff line; confirm against the real file.
Node& Graph__AddNode(Graph* p, const std::string& name, const std::string& op_type, const std::string& description, const gsl::span<NodeArg* const> & input_args, const gsl::span<NodeArg* const>& output_args, const NodeAttributes* attributes, const std::string& domain) override {
|
||||
Node& Graph__AddNode(Graph* p, const std::string& name, const std::string& op_type, const std::string& description, const gsl::span<NodeArg* const>& input_args, const gsl::span<NodeArg* const>& output_args, const NodeAttributes* attributes, const std::string& domain) override {
|
||||
return p->AddNode(name, op_type, description, input_args, output_args, attributes, domain);
|
||||
}
|
||||
|
||||
|
|
@ -787,6 +787,7 @@ struct ProviderHostImpl : ProviderHost {
|
|||
// OpKernelContext overrides: each forwards to the same-named member function
// on `p` and returns its result.
int OpKernelContext__InputCount(const OpKernelContext* p) override { return p->InputCount(); }
|
||||
int OpKernelContext__OutputCount(const OpKernelContext* p) override { return p->OutputCount(); }
|
||||
Status OpKernelContext__GetTempSpaceAllocator(const OpKernelContext* p, AllocatorPtr* output) override { return p->GetTempSpaceAllocator(output); }
|
||||
// Forwards to p->GetTempSpaceCPUAllocator; the allocator is returned via `output`.
Status OpKernelContext__GetTempSpaceCPUAllocator(const OpKernelContext* p, AllocatorPtr* output) override { return p->GetTempSpaceCPUAllocator(output); }
|
||||
bool OpKernelContext__GetUseDeterministicCompute(const OpKernelContext* p) override { return p->GetUseDeterministicCompute(); }
|
||||
bool OpKernelContext__TryGetInferredOutputShape(const OpKernelContext* p, int index, TensorShape& shape) override { return p->TryGetInferredOutputShape(index, shape); }
|
||||
bool OpKernelContext__TryGetInferredInputShape(const OpKernelContext* p, int index, TensorShape& shape) override { return p->TryGetInferredInputShape(index, shape); }
|
||||
|
|
|
|||
|
|
@ -168,6 +168,9 @@ static constexpr PATH_TYPE MATMUL_MODEL_URI = TSTR("testdata/matmul_1.onnx");
|
|||
// Paths of model files under testdata/ used by the tests in this file.
// SEQUENCE_MODEL_URI is only declared when ORT_NO_RTTI is not defined.
#ifndef ORT_NO_RTTI
|
||||
static constexpr PATH_TYPE SEQUENCE_MODEL_URI = TSTR("testdata/sequence_length.onnx");
|
||||
#endif
|
||||
// Only declared for builds with USE_CUDA; loaded by
// TestCudaMemcpyToHostWithSequenceTensors.
#ifdef USE_CUDA
|
||||
static constexpr PATH_TYPE SEQUENCE_MODEL_URI_2 = TSTR("testdata/optional_sequence_tensor.onnx");
|
||||
#endif
|
||||
static constexpr PATH_TYPE CUSTOM_OP_MODEL_URI = TSTR("testdata/foo_1.onnx");
|
||||
static constexpr PATH_TYPE CUSTOM_OP_LIBRARY_TEST_MODEL_URI = TSTR("testdata/custom_op_library/custom_op_test.onnx");
|
||||
static constexpr PATH_TYPE OVERRIDABLE_INITIALIZER_MODEL_URI = TSTR("testdata/overridable_initializer.onnx");
|
||||
|
|
@ -777,7 +780,7 @@ TEST(CApiTest, test_custom_op_library) {
|
|||
#elif defined(__APPLE__)
|
||||
lib_name = "libcustom_op_library.dylib";
|
||||
#else
|
||||
lib_name = "./libcustom_op_library.so";
|
||||
lib_name = "./libcustom_op_library.so";
|
||||
#endif
|
||||
|
||||
void* library_handle = nullptr;
|
||||
|
|
@ -1144,14 +1147,13 @@ TEST(CApiTest, cuda_graph) {
|
|||
std::vector<const char*> keys{"enable_cuda_graph"};
|
||||
std::vector<const char*> values{"1"};
|
||||
ASSERT_TRUE(api.UpdateCUDAProviderOptions(
|
||||
rel_cuda_options.get(), keys.data(), values.data(), 1) == nullptr);
|
||||
rel_cuda_options.get(), keys.data(), values.data(), 1) == nullptr);
|
||||
|
||||
Ort::SessionOptions session_options;
|
||||
ASSERT_TRUE(api.SessionOptionsAppendExecutionProvider_CUDA_V2(
|
||||
static_cast<OrtSessionOptions*>(session_options),
|
||||
rel_cuda_options.get()) == nullptr);
|
||||
static_cast<OrtSessionOptions*>(session_options),
|
||||
rel_cuda_options.get()) == nullptr);
|
||||
|
||||
|
||||
// Create IoBinding for inputs and outputs.
|
||||
struct CudaMemoryDeleter {
|
||||
explicit CudaMemoryDeleter(const Ort::Allocator* alloc) {
|
||||
|
|
@ -2100,4 +2102,28 @@ TEST(CApiTest, GitHubIssue10179) {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Smoke test: loads SEQUENCE_MODEL_URI_2 into a session with the CUDA execution
// provider appended (device 0), feeds one bool input named "cond" and requests
// the output named "sequence". The outputs are not inspected; completing
// session.Run without crashing is the whole check.
TEST(CApiTest, TestCudaMemcpyToHostWithSequenceTensors) {
|
||||
const auto* model_path = SEQUENCE_MODEL_URI_2;
|
||||
Ort::SessionOptions session_options{};
|
||||
Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0));
|
||||
Ort::Session session{*ort_env, model_path, session_options};
|
||||
|
||||
// The input tensor wraps caller-owned CPU memory (input_data below).
Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault);
|
||||
|
||||
std::vector<Ort::Value> ort_inputs;
|
||||
std::vector<const char*> input_names{"cond"};
|
||||
bool input_data[] = {false};
|
||||
std::vector<int64_t> input_dims{};
|
||||
// One element with an empty dims vector and a dimension count of 0 -
// presumably a scalar tensor; confirm against Ort::Value::CreateTensor.
ort_inputs.emplace_back(Ort::Value::CreateTensor<bool>(info, input_data, 1U, input_dims.data(), 0));
|
||||
const char* output_names[] = {"sequence"};
|
||||
|
||||
std::vector<Ort::Value> ort_outputs = session.Run(Ort::RunOptions{nullptr}, input_names.data(),
|
||||
ort_inputs.data(), ort_inputs.size(),
|
||||
output_names, countof(output_names));
|
||||
|
||||
// There is no need to check the contents of the output, we are just checking to see if the
|
||||
// model runs without crashing
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
|
|||
BIN
onnxruntime/test/testdata/optional_sequence_tensor.onnx
vendored
Normal file
BIN
onnxruntime/test/testdata/optional_sequence_tensor.onnx
vendored
Normal file
Binary file not shown.
Loading…
Reference in a new issue