From 3e71e8bd7e60b11eb4b32afacf852089090d2572 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Fri, 30 Oct 2020 13:58:33 -0700 Subject: [PATCH] Revert "[CUDA EP] remove per-thread allocator (#5415)" (#5647) This reverts commit b4869926d338daaa1011a76e22f3e059ca79cf04 because it broke our multiple GPU test pipeline. --- .../providers/cuda/cuda_execution_provider.cc | 28 ++++++++- .../providers/cuda/cuda_execution_provider.h | 61 +++++++++++-------- 2 files changed, 63 insertions(+), 26 deletions(-) diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 0f8caa7331..650ae97874 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -57,10 +57,23 @@ ONNX_OPERATOR_KERNEL_EX( } // namespace cuda -CUDAExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId device_id) { +CUDAExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId device_id, size_t cuda_mem_limit, ArenaExtendStrategy arena_extend_strategy) { CUDA_CALL_THROW(cudaSetDevice(device_id)); CUBLAS_CALL_THROW(cublasCreate(&cublas_handle_)); CUDNN_CALL_THROW(cudnnCreate(&cudnn_handle_)); + + AllocatorCreationInfo default_memory_info( + [](OrtDevice::DeviceId id) { + return onnxruntime::make_unique(id, CUDA); + }, + device_id, + true, + {cuda_mem_limit, + static_cast(arena_extend_strategy), + -1, -1}); + + // CUDA malloc/free is expensive so always use an arena + allocator_ = CreateAllocator(default_memory_info); } CUDAExecutionProvider::PerThreadContext::~PerThreadContext() { @@ -202,7 +215,7 @@ CUDAExecutionProvider::PerThreadContext& CUDAExecutionProvider::GetPerThreadCont // get or create a context if (context_state_.retired_context_pool.empty()) { - context = std::make_shared(device_id_); + context = std::make_shared(device_id_, cuda_mem_limit_, arena_extend_strategy_); } else { context = context_state_.retired_context_pool.back(); context_state_.retired_context_pool.pop_back(); @@ -238,6 +251,17 @@ void CUDAExecutionProvider::ReleasePerThreadContext() const { per_thread_context_cache->erase(cached_context_it); } +AllocatorPtr CUDAExecutionProvider::GetAllocator(int id, OrtMemType mem_type) const { + // Pinned memory allocator is shared between threads, but CUDA memory allocator is per-thread or it may cause result changes + // A hypothesis is that arena allocator is not aligned with CUDA output cache, and data from different kernel writes may + // cause cacheline to contain dirty data. + if (mem_type == OrtMemTypeDefault) { + return GetPerThreadContext().GetAllocator(); + } else { + return IExecutionProvider::GetAllocator(id, mem_type); + } +} + Status CUDAExecutionProvider::Sync() const { CUDA_RETURN_IF_ERROR(cudaDeviceSynchronize()); return Status::OK(); diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.h b/onnxruntime/core/providers/cuda/cuda_execution_provider.h index f2cea31bc7..2f915477af 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.h +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.h @@ -34,6 +34,8 @@ class CUDAExecutionProvider : public IExecutionProvider { explicit CUDAExecutionProvider(const CUDAExecutionProviderInfo& info); virtual ~CUDAExecutionProvider(); + AllocatorPtr GetAllocator(int id, OrtMemType mem_type) const override; + Status Sync() const override; Status OnRunStart() override; @@ -55,24 +57,7 @@ class CUDAExecutionProvider : public IExecutionProvider { template const T* GetConstOnes(size_t count) { - if (std::is_same::value) { - if (!constant_ones_float_) { - constant_ones_float_ = cuda::CreateConstantOnes(); - } - return reinterpret_cast(constant_ones_float_->GetBuffer(count)); - } else if (std::is_same::value) { - if (!constant_ones_double_) { - constant_ones_double_ = cuda::CreateConstantOnes(); - } - return reinterpret_cast(constant_ones_double_->GetBuffer(count)); - } else if (std::is_same::value) { - if (!constant_ones_half_) { - constant_ones_half_ = cuda::CreateConstantOnes(); - } - return reinterpret_cast(constant_ones_half_->GetBuffer(count)); - } else { - return nullptr; - } + return GetPerThreadContext().template GetConstOnes(count); } void AddDeferredReleaseCPUPtr(void* p); @@ -97,7 +82,7 @@ class CUDAExecutionProvider : public IExecutionProvider { int GetCudnnConvAlgo() const { return cudnn_conv_algo_; } void UpdateProviderOptionsInfo(); - private: +private: OrtDevice::DeviceId device_id_; cudaDeviceProp device_prop_; size_t cuda_mem_limit_; @@ -113,13 +98,9 @@ class CUDAExecutionProvider : public IExecutionProvider { std::unordered_map deferred_release_cpu_ptr_; OrtMutex deferred_release_cpu_ptr_mutex_; - std::unique_ptr> constant_ones_float_; - std::unique_ptr> constant_ones_double_; - std::unique_ptr> constant_ones_half_; - class PerThreadContext final { public: - PerThreadContext(OrtDevice::DeviceId device_id); + PerThreadContext(OrtDevice::DeviceId device_id, size_t cuda_mem_limit, ArenaExtendStrategy arena_extend_strategy); ~PerThreadContext(); cublasHandle_t CublasHandle() const { @@ -134,6 +115,32 @@ class CUDAExecutionProvider : public IExecutionProvider { return current_deferred_release_event_; } + template + const T* GetConstOnes(size_t count) { + if (std::is_same::value) { + if (!constant_ones_float_) { + constant_ones_float_ = cuda::CreateConstantOnes(); + } + return reinterpret_cast(constant_ones_float_->GetBuffer(count)); + } else if (std::is_same::value) { + if (!constant_ones_double_) { + constant_ones_double_ = cuda::CreateConstantOnes(); + } + return reinterpret_cast(constant_ones_double_->GetBuffer(count)); + } else if (std::is_same::value) { + if (!constant_ones_half_) { + constant_ones_half_ = cuda::CreateConstantOnes(); + } + return reinterpret_cast(constant_ones_half_->GetBuffer(count)); + } else { + return nullptr; + } + } + + AllocatorPtr GetAllocator() const { + return allocator_; + } + private: cublasHandle_t cublas_handle_ = nullptr; cudnnHandle_t cudnn_handle_ = nullptr; @@ -142,6 +149,12 @@ class CUDAExecutionProvider : public IExecutionProvider { // note that cudaEvent will be assigned at OnRunEnd() when PerThreadContext destory // so the ownership is passed to deferred_release_cpu_ptr_ cudaEvent_t current_deferred_release_event_ = nullptr; + + std::unique_ptr> constant_ones_float_; + std::unique_ptr> constant_ones_double_; + std::unique_ptr> constant_ones_half_; + + AllocatorPtr allocator_; }; using PerThreadContextMap = std::unordered_map>;