From 3e71e8bd7e60b11eb4b32afacf852089090d2572 Mon Sep 17 00:00:00 2001
From: Changming Sun <chasun@microsoft.com>
Date: Fri, 30 Oct 2020 13:58:33 -0700
Subject: [PATCH] Revert "[CUDA EP] remove per-thread allocator (#5415)"
 (#5647)

This reverts commit b4869926d338daaa1011a76e22f3e059ca79cf04 because it broke our multiple GPU test pipeline.
---
 .../providers/cuda/cuda_execution_provider.cc | 28 ++++++++-
 .../providers/cuda/cuda_execution_provider.h  | 61 +++++++++++--------
 2 files changed, 63 insertions(+), 26 deletions(-)
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
index 0f8caa7331..650ae97874 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -57,10 +57,23 @@ ONNX_OPERATOR_KERNEL_EX(
 
 }  // namespace cuda
 
-CUDAExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId device_id) {
+CUDAExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId device_id, size_t cuda_mem_limit, ArenaExtendStrategy arena_extend_strategy) {
   CUDA_CALL_THROW(cudaSetDevice(device_id));
   CUBLAS_CALL_THROW(cublasCreate(&cublas_handle_));
   CUDNN_CALL_THROW(cudnnCreate(&cudnn_handle_));
+
+  AllocatorCreationInfo default_memory_info(
+      [](OrtDevice::DeviceId id) {
+        return onnxruntime::make_unique<CUDAAllocator>(id, CUDA);
+      },
+      device_id,
+      true,
+      {cuda_mem_limit,
+       static_cast<int>(arena_extend_strategy),
+       -1, -1});
+
+  // CUDA malloc/free is expensive so always use an arena
+  allocator_ = CreateAllocator(default_memory_info);
 }
 
 CUDAExecutionProvider::PerThreadContext::~PerThreadContext() {
@@ -202,7 +215,7 @@ CUDAExecutionProvider::PerThreadContext& CUDAExecutionProvider::GetPerThreadCont
 
     // get or create a context
     if (context_state_.retired_context_pool.empty()) {
-      context = std::make_shared<PerThreadContext>(device_id_);
+      context = std::make_shared<PerThreadContext>(device_id_, cuda_mem_limit_, arena_extend_strategy_);
     } else {
       context = context_state_.retired_context_pool.back();
       context_state_.retired_context_pool.pop_back();
@@ -238,6 +251,17 @@ void CUDAExecutionProvider::ReleasePerThreadContext() const {
   per_thread_context_cache->erase(cached_context_it);
 }
 
+AllocatorPtr CUDAExecutionProvider::GetAllocator(int id, OrtMemType mem_type) const {
+  // Pinned memory allocator is shared between threads, but CUDA memory allocator is per-thread or it may cause result changes
+  // A hypothesis is that arena allocator is not aligned with CUDA output cache, and data from different kernel writes may
+  // cause cacheline to contain dirty data.
+  if (mem_type == OrtMemTypeDefault) {
+    return GetPerThreadContext().GetAllocator();
+  } else {
+    return IExecutionProvider::GetAllocator(id, mem_type);
+  }
+}
+
 Status CUDAExecutionProvider::Sync() const {
   CUDA_RETURN_IF_ERROR(cudaDeviceSynchronize());
   return Status::OK();
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.h b/onnxruntime/core/providers/cuda/cuda_execution_provider.h
index f2cea31bc7..2f915477af 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.h
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.h
@@ -34,6 +34,8 @@ class CUDAExecutionProvider : public IExecutionProvider {
   explicit CUDAExecutionProvider(const CUDAExecutionProviderInfo& info);
   virtual ~CUDAExecutionProvider();
 
+  AllocatorPtr GetAllocator(int id, OrtMemType mem_type) const override;
+
   Status Sync() const override;
 
   Status OnRunStart() override;
@@ -55,24 +57,7 @@ class CUDAExecutionProvider : public IExecutionProvider {
 
   template <typename T>
   const T* GetConstOnes(size_t count) {
-    if (std::is_same<T, float>::value) {
-      if (!constant_ones_float_) {
-        constant_ones_float_ = cuda::CreateConstantOnes<float>();
-      }
-      return reinterpret_cast<const T*>(constant_ones_float_->GetBuffer(count));
-    } else if (std::is_same<T, double>::value) {
-      if (!constant_ones_double_) {
-        constant_ones_double_ = cuda::CreateConstantOnes<double>();
-      }
-      return reinterpret_cast<const T*>(constant_ones_double_->GetBuffer(count));
-    } else if (std::is_same<T, half>::value) {
-      if (!constant_ones_half_) {
-        constant_ones_half_ = cuda::CreateConstantOnes<half>();
-      }
-      return reinterpret_cast<const T*>(constant_ones_half_->GetBuffer(count));
-    } else {
-      return nullptr;
-    }
+    return GetPerThreadContext().template GetConstOnes<T>(count);
   }
 
   void AddDeferredReleaseCPUPtr(void* p);
@@ -97,7 +82,7 @@ class CUDAExecutionProvider : public IExecutionProvider {
   int GetCudnnConvAlgo() const { return cudnn_conv_algo_; }
   void UpdateProviderOptionsInfo();
 
- private:
+private:
   OrtDevice::DeviceId device_id_;
   cudaDeviceProp device_prop_;
   size_t cuda_mem_limit_;
@@ -113,13 +98,9 @@ class CUDAExecutionProvider : public IExecutionProvider {
   std::unordered_map<cudaEvent_t, DeferredReleaseCPUPtrs> deferred_release_cpu_ptr_;
   OrtMutex deferred_release_cpu_ptr_mutex_;
 
-  std::unique_ptr<cuda::IConstantBuffer<float>> constant_ones_float_;
-  std::unique_ptr<cuda::IConstantBuffer<double>> constant_ones_double_;
-  std::unique_ptr<cuda::IConstantBuffer<half>> constant_ones_half_;
-
   class PerThreadContext final {
    public:
-    PerThreadContext(OrtDevice::DeviceId device_id);
+    PerThreadContext(OrtDevice::DeviceId device_id, size_t cuda_mem_limit, ArenaExtendStrategy arena_extend_strategy);
     ~PerThreadContext();
 
     cublasHandle_t CublasHandle() const {
@@ -134,6 +115,32 @@ class CUDAExecutionProvider : public IExecutionProvider {
       return current_deferred_release_event_;
     }
 
+    template <typename T>
+    const T* GetConstOnes(size_t count) {
+      if (std::is_same<T, float>::value) {
+        if (!constant_ones_float_) {
+          constant_ones_float_ = cuda::CreateConstantOnes<float>();
+        }
+        return reinterpret_cast<const T*>(constant_ones_float_->GetBuffer(count));
+      } else if (std::is_same<T, double>::value) {
+        if (!constant_ones_double_) {
+          constant_ones_double_ = cuda::CreateConstantOnes<double>();
+        }
+        return reinterpret_cast<const T*>(constant_ones_double_->GetBuffer(count));
+      } else if (std::is_same<T, half>::value) {
+        if (!constant_ones_half_) {
+          constant_ones_half_ = cuda::CreateConstantOnes<half>();
+        }
+        return reinterpret_cast<const T*>(constant_ones_half_->GetBuffer(count));
+      } else {
+        return nullptr;
+      }
+    }
+
+    AllocatorPtr GetAllocator() const {
+      return allocator_;
+    }
+
    private:
     cublasHandle_t cublas_handle_ = nullptr;
     cudnnHandle_t cudnn_handle_ = nullptr;
@@ -142,6 +149,12 @@ class CUDAExecutionProvider : public IExecutionProvider {
     // note that cudaEvent will be assigned at OnRunEnd() when PerThreadContext destory
     // so the ownership is passed to deferred_release_cpu_ptr_
     cudaEvent_t current_deferred_release_event_ = nullptr;
+
+    std::unique_ptr<cuda::IConstantBuffer<float>> constant_ones_float_;
+    std::unique_ptr<cuda::IConstantBuffer<double>> constant_ones_double_;
+    std::unique_ptr<cuda::IConstantBuffer<half>> constant_ones_half_;
+
+    AllocatorPtr allocator_;
   };
 
   using PerThreadContextMap = std::unordered_map<const CUDAExecutionProvider*, std::weak_ptr<PerThreadContext>>;