diff --git a/include/onnxruntime/core/framework/op_kernel_context.h b/include/onnxruntime/core/framework/op_kernel_context.h index 5083e59766..df3363a5eb 100644 --- a/include/onnxruntime/core/framework/op_kernel_context.h +++ b/include/onnxruntime/core/framework/op_kernel_context.h @@ -130,6 +130,12 @@ class OpKernelContext { */ Status GetTempSpaceAllocator(AllocatorPtr* output) const ORT_MUST_USE_RESULT; + /** + Return the allocator associated with the CPU EP with memtype of OrtMemTypeDefault. + @remarks Use SafeInt when calculating the size of memory to allocate using AllocatorPtr->Alloc. + */ + Status GetTempSpaceCPUAllocator(AllocatorPtr* output) const ORT_MUST_USE_RESULT; + /** Return the fence of current node's input. @param index The index of the input. diff --git a/onnxruntime/core/framework/op_kernel.cc b/onnxruntime/core/framework/op_kernel.cc index 61bf9c4471..f2f63c947e 100644 --- a/onnxruntime/core/framework/op_kernel.cc +++ b/onnxruntime/core/framework/op_kernel.cc @@ -94,6 +94,18 @@ Status OpKernelContext::GetTempSpaceAllocator(AllocatorPtr* output) const { return Status::OK(); } +Status OpKernelContext::GetTempSpaceCPUAllocator(AllocatorPtr* output) const { + // While looking up the allocator from SessionState + // (which is called via ExecutionFrame), the allocator lookup + // logic doesn't key on OrtAllocatorType, so any OrtAllocatorType + // is good here. + *output = execution_frame_->GetAllocator( + OrtMemoryInfo(CPU, OrtAllocatorType::OrtArenaAllocator)); + if (!*output) + return Status(common::ONNXRUNTIME, common::FAIL, "CPU allocator not found"); + return Status::OK(); +} + MLDataType OpKernelContext::InputType(int index) const { int input_arg_index = GetInputArgIndex(index); const OrtValue* p_ml_value = execution_frame_->GetNodeInputOrOutputMLValue(input_arg_index); diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 099e6c633b..f2743c9c94 100755 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -49,10 +49,25 @@ class Memcpy final : public OpKernel { auto X_dtype = X->DataType(); Y->SetType(X_dtype); AllocatorPtr alloc; - auto status = ctx->GetTempSpaceAllocator(&alloc); - if (!status.IsOK()) { - return Status(common::ONNXRUNTIME, common::FAIL, - "Memcpy cuda: unable to get an allocator."); + + // If we are copying contents to CUDA, the allocator to use + // to allocate the buffers of the new tensors in the sequence + // can be temp space allocator associated with the CUDA EP + if (Node().OpType() == "MemcpyFromHost") { + auto status = ctx->GetTempSpaceAllocator(&alloc); + if (!status.IsOK()) { + return Status(common::ONNXRUNTIME, common::FAIL, + "Memcpy cuda: unable to get an allocator."); + } + } else { + // If we are copying contents to CPU (op type is "MemcpyToHost"), + // the allocator to use to allocate the buffers of the new tensors + // in the sequence will be the allocator from the CPU EP + auto status = ctx->GetTempSpaceCPUAllocator(&alloc); + if (!status.IsOK()) { + return Status(common::ONNXRUNTIME, common::FAIL, + "Memcpy cuda: unable to get the CPU allocator."); + } } auto X_size = X->Size(); for (size_t i = 0; i < X_size; ++i) { @@ -164,7 +179,7 @@ bool CUDAExecutionProvider::PerThreadContext::IsGraphCaptureAllowed() const { return regular_run_count_before_graph_capture_ >= min_num_runs_before_cuda_graph_capture_; } -void CUDAExecutionProvider::PerThreadContext::CaptureBegin() { +void CUDAExecutionProvider::PerThreadContext::CaptureBegin() { cuda_graph_.Reset(); cuda_graph_.CaptureBegin(); } diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index 4e0438ae47..72dc98ef53 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -694,6 +694,7 @@ struct ProviderHost { virtual int OpKernelContext__InputCount(const OpKernelContext* p) = 0; virtual int OpKernelContext__OutputCount(const OpKernelContext* p) = 0; virtual Status OpKernelContext__GetTempSpaceAllocator(const OpKernelContext* p, AllocatorPtr* output) = 0; + virtual Status OpKernelContext__GetTempSpaceCPUAllocator(const OpKernelContext* p, AllocatorPtr* output) = 0; virtual bool OpKernelContext__GetUseDeterministicCompute(const OpKernelContext* p) = 0; virtual bool OpKernelContext__TryGetInferredOutputShape(const OpKernelContext* p, int index, TensorShape& shape) = 0; virtual bool OpKernelContext__TryGetInferredInputShape(const OpKernelContext* p, int index, TensorShape& shape) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index ab7c3fbed2..59398e18bf 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -738,6 +738,8 @@ struct OpKernelContext final { Status GetTempSpaceAllocator(AllocatorPtr* output) const { return g_host->OpKernelContext__GetTempSpaceAllocator(this, output); } + Status GetTempSpaceCPUAllocator(AllocatorPtr* output) const { return g_host->OpKernelContext__GetTempSpaceCPUAllocator(this, output); } + bool GetUseDeterministicCompute() const { return g_host->OpKernelContext__GetUseDeterministicCompute(this); } bool TryGetInferredOutputShape(int index, TensorShape& shape) const { return g_host->OpKernelContext__TryGetInferredOutputShape(this, index, shape); } @@ -809,7 +811,7 @@ struct OpKernelInfo final { return GetAttrs(name, tmp).IsOK() ? tmp : default_value; } - template + template Status GetAttrsAsSpan(const std::string& name, gsl::span& out) const; Status GetAttrs(const std::string& name, TensorShapeVector& out) const; @@ -863,8 +865,6 @@ inline TensorShapeVector OpKernelInfo::GetAttrsOrDefault(const std::string& name return GetAttrs(name, tmp).IsOK() ? tmp : default_value; } - - class SessionState { public: const DataTransferManager& GetDataTransferMgr() const noexcept { return g_host->SessionState__GetDataTransferMgr(this); } diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 868b1247bd..f55b56209d 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -722,7 +722,7 @@ struct ProviderHostImpl : ProviderHost { Status Graph__Resolve(Graph* p) override { return p->Resolve(); } void Graph__AddInitializedTensor(Graph* p, const ONNX_NAMESPACE::TensorProto& tensor) override { p->AddInitializedTensor(tensor); } - Node& Graph__AddNode(Graph* p, const std::string& name, const std::string& op_type, const std::string& description, const gsl::span & input_args, const gsl::span& output_args, const NodeAttributes* attributes, const std::string& domain) override { + Node& Graph__AddNode(Graph* p, const std::string& name, const std::string& op_type, const std::string& description, const gsl::span& input_args, const gsl::span& output_args, const NodeAttributes* attributes, const std::string& domain) override { return p->AddNode(name, op_type, description, input_args, output_args, attributes, domain); } @@ -787,6 +787,7 @@ struct ProviderHostImpl : ProviderHost { int OpKernelContext__InputCount(const OpKernelContext* p) override { return p->InputCount(); } int OpKernelContext__OutputCount(const OpKernelContext* p) override { return p->OutputCount(); } Status OpKernelContext__GetTempSpaceAllocator(const OpKernelContext* p, AllocatorPtr* output) override { return p->GetTempSpaceAllocator(output); } + Status OpKernelContext__GetTempSpaceCPUAllocator(const OpKernelContext* p, AllocatorPtr* output) override { return p->GetTempSpaceCPUAllocator(output); } bool OpKernelContext__GetUseDeterministicCompute(const OpKernelContext* p) override { return p->GetUseDeterministicCompute(); } bool OpKernelContext__TryGetInferredOutputShape(const OpKernelContext* p, int index, TensorShape& shape) override { return p->TryGetInferredOutputShape(index, shape); } bool OpKernelContext__TryGetInferredInputShape(const OpKernelContext* p, int index, TensorShape& shape) override { return p->TryGetInferredInputShape(index, shape); } diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index c1050741ca..00879dc4a3 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -168,6 +168,9 @@ static constexpr PATH_TYPE MATMUL_MODEL_URI = TSTR("testdata/matmul_1.onnx"); #ifndef ORT_NO_RTTI static constexpr PATH_TYPE SEQUENCE_MODEL_URI = TSTR("testdata/sequence_length.onnx"); #endif +#ifdef USE_CUDA +static constexpr PATH_TYPE SEQUENCE_MODEL_URI_2 = TSTR("testdata/optional_sequence_tensor.onnx"); +#endif static constexpr PATH_TYPE CUSTOM_OP_MODEL_URI = TSTR("testdata/foo_1.onnx"); static constexpr PATH_TYPE CUSTOM_OP_LIBRARY_TEST_MODEL_URI = TSTR("testdata/custom_op_library/custom_op_test.onnx"); static constexpr PATH_TYPE OVERRIDABLE_INITIALIZER_MODEL_URI = TSTR("testdata/overridable_initializer.onnx"); @@ -777,7 +780,7 @@ TEST(CApiTest, test_custom_op_library) { #elif defined(__APPLE__) lib_name = "libcustom_op_library.dylib"; #else -lib_name = "./libcustom_op_library.so"; + lib_name = "./libcustom_op_library.so"; #endif void* library_handle = nullptr; @@ -1144,14 +1147,13 @@ TEST(CApiTest, cuda_graph) { std::vector keys{"enable_cuda_graph"}; std::vector values{"1"}; ASSERT_TRUE(api.UpdateCUDAProviderOptions( - rel_cuda_options.get(), keys.data(), values.data(), 1) == nullptr); + rel_cuda_options.get(), keys.data(), values.data(), 1) == nullptr); Ort::SessionOptions session_options; ASSERT_TRUE(api.SessionOptionsAppendExecutionProvider_CUDA_V2( - static_cast(session_options), - rel_cuda_options.get()) == nullptr); + static_cast(session_options), + rel_cuda_options.get()) == nullptr); - // Create IoBinding for inputs and outputs. struct CudaMemoryDeleter { explicit CudaMemoryDeleter(const Ort::Allocator* alloc) { @@ -2100,4 +2102,28 @@ TEST(CApiTest, GitHubIssue10179) { } } } + +TEST(CApiTest, TestCudaMemcpyToHostWithSequenceTensors) { + const auto* model_path = SEQUENCE_MODEL_URI_2; + Ort::SessionOptions session_options{}; + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); + Ort::Session session{*ort_env, model_path, session_options}; + + Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault); + + std::vector ort_inputs; + std::vector input_names{"cond"}; + bool input_data[] = {false}; + std::vector input_dims{}; + ort_inputs.emplace_back(Ort::Value::CreateTensor(info, input_data, 1U, input_dims.data(), 0)); + const char* output_names[] = {"sequence"}; + + std::vector ort_outputs = session.Run(Ort::RunOptions{nullptr}, input_names.data(), + ort_inputs.data(), ort_inputs.size(), + output_names, countof(output_names)); + + // There is no need to check the contents of the output, we are just checking to see if the + // model runs without crashing +} + #endif diff --git a/onnxruntime/test/testdata/optional_sequence_tensor.onnx b/onnxruntime/test/testdata/optional_sequence_tensor.onnx new file mode 100644 index 0000000000..bbac0ae9f2 Binary files /dev/null and b/onnxruntime/test/testdata/optional_sequence_tensor.onnx differ