Fix bug in MemcpyToHost (#10816)

2026-07-10 17:37:14 +00:00 · 2022-03-10 07:02:27 -08:00 · 2022-03-10 07:02:27 -08:00 · e80ff63274
commit e80ff63274
parent 9853eaa14f
8 changed files with 75 additions and 14 deletions
--- a/include/onnxruntime/core/framework/op_kernel_context.h
+++ b/include/onnxruntime/core/framework/op_kernel_context.h
@ -130,6 +130,12 @@ class OpKernelContext {
   */
  Status GetTempSpaceAllocator(AllocatorPtr* output) const ORT_MUST_USE_RESULT;

+  /**
+   Return the allocator associated with the CPU EP with memtype of OrtMemTypeDefault.
+   @remarks Use SafeInt when calculating the size of memory to allocate using AllocatorPtr->Alloc.
+   */
+  Status GetTempSpaceCPUAllocator(AllocatorPtr* output) const ORT_MUST_USE_RESULT;
+
  /**
  Return the fence of current node's input.
  @param index The index of the input.
--- a/onnxruntime/core/framework/op_kernel.cc
+++ b/onnxruntime/core/framework/op_kernel.cc
@ -94,6 +94,18 @@ Status OpKernelContext::GetTempSpaceAllocator(AllocatorPtr* output) const {
  return Status::OK();
 }

+Status OpKernelContext::GetTempSpaceCPUAllocator(AllocatorPtr* output) const {
+  // The allocator is looked up in SessionState (reached here through the
+  // ExecutionFrame). That lookup does not key on OrtAllocatorType, so the
+  // OrtArenaAllocator value passed below is arbitrary; any OrtAllocatorType
+  // would select the same CPU allocator.
+  *output = execution_frame_->GetAllocator(
+      OrtMemoryInfo(CPU, OrtAllocatorType::OrtArenaAllocator));
+  if (!*output)
+    return Status(common::ONNXRUNTIME, common::FAIL, "CPU allocator not found");
+  return Status::OK();
+}
+
 MLDataType OpKernelContext::InputType(int index) const {
  int input_arg_index = GetInputArgIndex(index);
  const OrtValue* p_ml_value = execution_frame_->GetNodeInputOrOutputMLValue(input_arg_index);
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@ -49,10 +49,25 @@ class Memcpy final : public OpKernel {
      auto X_dtype = X->DataType();
      Y->SetType(X_dtype);
      AllocatorPtr alloc;
-      auto status = ctx->GetTempSpaceAllocator(&alloc);
-      if (!status.IsOK()) {
-        return Status(common::ONNXRUNTIME, common::FAIL,
-                      "Memcpy cuda: unable to get an allocator.");
+
+      // If we are copying contents to CUDA, the allocator used
+      // to allocate the buffers of the new tensors in the sequence
+      // can be the temp space allocator associated with the CUDA EP.
+      if (Node().OpType() == "MemcpyFromHost") {
+        auto status = ctx->GetTempSpaceAllocator(&alloc);
+        if (!status.IsOK()) {
+          return Status(common::ONNXRUNTIME, common::FAIL,
+                        "Memcpy cuda: unable to get an allocator.");
+        }
+      } else {
+        // If we are copying contents to CPU (op type is "MemcpyToHost"),
+        // the allocator to use to allocate the buffers of the new tensors
+        // in the sequence will be the allocator from the CPU EP
+        auto status = ctx->GetTempSpaceCPUAllocator(&alloc);
+        if (!status.IsOK()) {
+          return Status(common::ONNXRUNTIME, common::FAIL,
+                        "Memcpy cuda: unable to get the CPU allocator.");
+        }
      }
      auto X_size = X->Size();
      for (size_t i = 0; i < X_size; ++i) {
@ -164,7 +179,7 @@ bool CUDAExecutionProvider::PerThreadContext::IsGraphCaptureAllowed() const {
  return regular_run_count_before_graph_capture_ >= min_num_runs_before_cuda_graph_capture_;
 }

-void CUDAExecutionProvider::PerThreadContext::CaptureBegin()  {
+void CUDAExecutionProvider::PerThreadContext::CaptureBegin() {
  cuda_graph_.Reset();
  cuda_graph_.CaptureBegin();
 }
--- a/onnxruntime/core/providers/shared_library/provider_interfaces.h
+++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h
@ -694,6 +694,7 @@ struct ProviderHost {
  virtual int OpKernelContext__InputCount(const OpKernelContext* p) = 0;
  virtual int OpKernelContext__OutputCount(const OpKernelContext* p) = 0;
  virtual Status OpKernelContext__GetTempSpaceAllocator(const OpKernelContext* p, AllocatorPtr* output) = 0;
+  virtual Status OpKernelContext__GetTempSpaceCPUAllocator(const OpKernelContext* p, AllocatorPtr* output) = 0;
  virtual bool OpKernelContext__GetUseDeterministicCompute(const OpKernelContext* p) = 0;
  virtual bool OpKernelContext__TryGetInferredOutputShape(const OpKernelContext* p, int index, TensorShape& shape) = 0;
  virtual bool OpKernelContext__TryGetInferredInputShape(const OpKernelContext* p, int index, TensorShape& shape) = 0;
--- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
+++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h
@ -738,6 +738,8 @@ struct OpKernelContext final {

  Status GetTempSpaceAllocator(AllocatorPtr* output) const { return g_host->OpKernelContext__GetTempSpaceAllocator(this, output); }

+  Status GetTempSpaceCPUAllocator(AllocatorPtr* output) const { return g_host->OpKernelContext__GetTempSpaceCPUAllocator(this, output); }
+
  bool GetUseDeterministicCompute() const { return g_host->OpKernelContext__GetUseDeterministicCompute(this); }

  bool TryGetInferredOutputShape(int index, TensorShape& shape) const { return g_host->OpKernelContext__TryGetInferredOutputShape(this, index, shape); }
@ -809,7 +811,7 @@ struct OpKernelInfo final {
    return GetAttrs<T>(name, tmp).IsOK() ? tmp : default_value;
  }

-  template<typename T>
+  template <typename T>
  Status GetAttrsAsSpan(const std::string& name, gsl::span<const T>& out) const;

  Status GetAttrs(const std::string& name, TensorShapeVector& out) const;
@ -863,8 +865,6 @@ inline TensorShapeVector OpKernelInfo::GetAttrsOrDefault(const std::string& name
  return GetAttrs(name, tmp).IsOK() ? tmp : default_value;
 }

-
-
 class SessionState {
 public:
  const DataTransferManager& GetDataTransferMgr() const noexcept { return g_host->SessionState__GetDataTransferMgr(this); }
--- a/onnxruntime/core/session/provider_bridge_ort.cc
+++ b/onnxruntime/core/session/provider_bridge_ort.cc
@ -722,7 +722,7 @@ struct ProviderHostImpl : ProviderHost {

  Status Graph__Resolve(Graph* p) override { return p->Resolve(); }
  void Graph__AddInitializedTensor(Graph* p, const ONNX_NAMESPACE::TensorProto& tensor) override { p->AddInitializedTensor(tensor); }
-  Node& Graph__AddNode(Graph* p, const std::string& name, const std::string& op_type, const std::string& description, const gsl::span<NodeArg* const> & input_args, const gsl::span<NodeArg* const>& output_args, const NodeAttributes* attributes, const std::string& domain) override {
+  Node& Graph__AddNode(Graph* p, const std::string& name, const std::string& op_type, const std::string& description, const gsl::span<NodeArg* const>& input_args, const gsl::span<NodeArg* const>& output_args, const NodeAttributes* attributes, const std::string& domain) override {
    return p->AddNode(name, op_type, description, input_args, output_args, attributes, domain);
  }

@ -787,6 +787,7 @@ struct ProviderHostImpl : ProviderHost {
  int OpKernelContext__InputCount(const OpKernelContext* p) override { return p->InputCount(); }
  int OpKernelContext__OutputCount(const OpKernelContext* p) override { return p->OutputCount(); }
  Status OpKernelContext__GetTempSpaceAllocator(const OpKernelContext* p, AllocatorPtr* output) override { return p->GetTempSpaceAllocator(output); }
+  Status OpKernelContext__GetTempSpaceCPUAllocator(const OpKernelContext* p, AllocatorPtr* output) override { return p->GetTempSpaceCPUAllocator(output); }
  bool OpKernelContext__GetUseDeterministicCompute(const OpKernelContext* p) override { return p->GetUseDeterministicCompute(); }
  bool OpKernelContext__TryGetInferredOutputShape(const OpKernelContext* p, int index, TensorShape& shape) override { return p->TryGetInferredOutputShape(index, shape); }
  bool OpKernelContext__TryGetInferredInputShape(const OpKernelContext* p, int index, TensorShape& shape) override { return p->TryGetInferredInputShape(index, shape); }
--- a/onnxruntime/test/shared_lib/test_inference.cc
+++ b/onnxruntime/test/shared_lib/test_inference.cc
@ -168,6 +168,9 @@ static constexpr PATH_TYPE MATMUL_MODEL_URI = TSTR("testdata/matmul_1.onnx");
 #ifndef ORT_NO_RTTI
 static constexpr PATH_TYPE SEQUENCE_MODEL_URI = TSTR("testdata/sequence_length.onnx");
 #endif
+#ifdef USE_CUDA
+static constexpr PATH_TYPE SEQUENCE_MODEL_URI_2 = TSTR("testdata/optional_sequence_tensor.onnx");
+#endif
 static constexpr PATH_TYPE CUSTOM_OP_MODEL_URI = TSTR("testdata/foo_1.onnx");
 static constexpr PATH_TYPE CUSTOM_OP_LIBRARY_TEST_MODEL_URI = TSTR("testdata/custom_op_library/custom_op_test.onnx");
 static constexpr PATH_TYPE OVERRIDABLE_INITIALIZER_MODEL_URI = TSTR("testdata/overridable_initializer.onnx");
@ -777,7 +780,7 @@ TEST(CApiTest, test_custom_op_library) {
 #elif defined(__APPLE__)
  lib_name = "libcustom_op_library.dylib";
 #else
-lib_name = "./libcustom_op_library.so";
+  lib_name = "./libcustom_op_library.so";
 #endif

  void* library_handle = nullptr;
@ -1144,14 +1147,13 @@ TEST(CApiTest, cuda_graph) {
  std::vector<const char*> keys{"enable_cuda_graph"};
  std::vector<const char*> values{"1"};
  ASSERT_TRUE(api.UpdateCUDAProviderOptions(
-    rel_cuda_options.get(), keys.data(), values.data(), 1) == nullptr);
+                  rel_cuda_options.get(), keys.data(), values.data(), 1) == nullptr);

  Ort::SessionOptions session_options;
  ASSERT_TRUE(api.SessionOptionsAppendExecutionProvider_CUDA_V2(
-    static_cast<OrtSessionOptions*>(session_options),
-    rel_cuda_options.get()) == nullptr);
+                  static_cast<OrtSessionOptions*>(session_options),
+                  rel_cuda_options.get()) == nullptr);

-  
  // Create IoBinding for inputs and outputs.
  struct CudaMemoryDeleter {
    explicit CudaMemoryDeleter(const Ort::Allocator* alloc) {
@ -2100,4 +2102,28 @@ TEST(CApiTest, GitHubIssue10179) {
    }
  }
 }
+
+TEST(CApiTest, TestCudaMemcpyToHostWithSequenceTensors) {
+  const auto* model_path = SEQUENCE_MODEL_URI_2;
+  Ort::SessionOptions session_options{};
+  Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0));
+  Ort::Session session{*ort_env, model_path, session_options};
+
+  Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault);
+
+  std::vector<Ort::Value> ort_inputs;
+  std::vector<const char*> input_names{"cond"};
+  bool input_data[] = {false};
+  std::vector<int64_t> input_dims{};
+  ort_inputs.emplace_back(Ort::Value::CreateTensor<bool>(info, input_data, 1U, input_dims.data(), 0));
+  const char* output_names[] = {"sequence"};
+
+  std::vector<Ort::Value> ort_outputs = session.Run(Ort::RunOptions{nullptr}, input_names.data(),
+                                                    ort_inputs.data(), ort_inputs.size(),
+                                                    output_names, countof(output_names));
+
+  // The output contents are deliberately not inspected: this test only checks
+  // that the model runs with the CUDA EP appended above without crashing.
+}
+
 #endif
--- a/onnxruntime/test/testdata/optional_sequence_tensor.onnx
+++ b/onnxruntime/test/testdata/optional_sequence_tensor.onnx