diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 650ae97874..0f8caa7331 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -57,23 +57,10 @@ ONNX_OPERATOR_KERNEL_EX( } // namespace cuda -CUDAExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId device_id, size_t cuda_mem_limit, ArenaExtendStrategy arena_extend_strategy) { +CUDAExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId device_id) { CUDA_CALL_THROW(cudaSetDevice(device_id)); CUBLAS_CALL_THROW(cublasCreate(&cublas_handle_)); CUDNN_CALL_THROW(cudnnCreate(&cudnn_handle_)); - - AllocatorCreationInfo default_memory_info( - [](OrtDevice::DeviceId id) { - return onnxruntime::make_unique(id, CUDA); - }, - device_id, - true, - {cuda_mem_limit, - static_cast(arena_extend_strategy), - -1, -1}); - - // CUDA malloc/free is expensive so always use an arena - allocator_ = CreateAllocator(default_memory_info); } CUDAExecutionProvider::PerThreadContext::~PerThreadContext() { @@ -215,7 +202,7 @@ CUDAExecutionProvider::PerThreadContext& CUDAExecutionProvider::GetPerThreadCont // get or create a context if (context_state_.retired_context_pool.empty()) { - context = std::make_shared(device_id_, cuda_mem_limit_, arena_extend_strategy_); + context = std::make_shared(device_id_); } else { context = context_state_.retired_context_pool.back(); context_state_.retired_context_pool.pop_back(); @@ -251,17 +238,6 @@ void CUDAExecutionProvider::ReleasePerThreadContext() const { per_thread_context_cache->erase(cached_context_it); } -AllocatorPtr CUDAExecutionProvider::GetAllocator(int id, OrtMemType mem_type) const { - // Pinned memory allocator is shared between threads, but CUDA memory allocator is per-thread or it may cause result changes - // A hypothesis is that arena allocator is not aligned with CUDA output cache, and data from different kernel writes may - // cause cacheline to contain dirty data. - if (mem_type == OrtMemTypeDefault) { - return GetPerThreadContext().GetAllocator(); - } else { - return IExecutionProvider::GetAllocator(id, mem_type); - } -} - Status CUDAExecutionProvider::Sync() const { CUDA_RETURN_IF_ERROR(cudaDeviceSynchronize()); return Status::OK(); diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.h b/onnxruntime/core/providers/cuda/cuda_execution_provider.h index 2f915477af..f2cea31bc7 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.h +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.h @@ -34,8 +34,6 @@ class CUDAExecutionProvider : public IExecutionProvider { explicit CUDAExecutionProvider(const CUDAExecutionProviderInfo& info); virtual ~CUDAExecutionProvider(); - AllocatorPtr GetAllocator(int id, OrtMemType mem_type) const override; - Status Sync() const override; Status OnRunStart() override; @@ -57,7 +55,24 @@ class CUDAExecutionProvider : public IExecutionProvider { template const T* GetConstOnes(size_t count) { - return GetPerThreadContext().template GetConstOnes(count); + if (std::is_same::value) { + if (!constant_ones_float_) { + constant_ones_float_ = cuda::CreateConstantOnes(); + } + return reinterpret_cast(constant_ones_float_->GetBuffer(count)); + } else if (std::is_same::value) { + if (!constant_ones_double_) { + constant_ones_double_ = cuda::CreateConstantOnes(); + } + return reinterpret_cast(constant_ones_double_->GetBuffer(count)); + } else if (std::is_same::value) { + if (!constant_ones_half_) { + constant_ones_half_ = cuda::CreateConstantOnes(); + } + return reinterpret_cast(constant_ones_half_->GetBuffer(count)); + } else { + return nullptr; + } } void AddDeferredReleaseCPUPtr(void* p); @@ -82,7 +97,7 @@ class CUDAExecutionProvider : public IExecutionProvider { int GetCudnnConvAlgo() const { return cudnn_conv_algo_; } void UpdateProviderOptionsInfo(); -private: + private: OrtDevice::DeviceId device_id_; cudaDeviceProp device_prop_; size_t cuda_mem_limit_; @@ -98,9 +113,13 @@ private: std::unordered_map deferred_release_cpu_ptr_; OrtMutex deferred_release_cpu_ptr_mutex_; + std::unique_ptr> constant_ones_float_; + std::unique_ptr> constant_ones_double_; + std::unique_ptr> constant_ones_half_; + class PerThreadContext final { public: - PerThreadContext(OrtDevice::DeviceId device_id, size_t cuda_mem_limit, ArenaExtendStrategy arena_extend_strategy); + PerThreadContext(OrtDevice::DeviceId device_id); ~PerThreadContext(); cublasHandle_t CublasHandle() const { @@ -115,32 +134,6 @@ private: return current_deferred_release_event_; } - template - const T* GetConstOnes(size_t count) { - if (std::is_same::value) { - if (!constant_ones_float_) { - constant_ones_float_ = cuda::CreateConstantOnes(); - } - return reinterpret_cast(constant_ones_float_->GetBuffer(count)); - } else if (std::is_same::value) { - if (!constant_ones_double_) { - constant_ones_double_ = cuda::CreateConstantOnes(); - } - return reinterpret_cast(constant_ones_double_->GetBuffer(count)); - } else if (std::is_same::value) { - if (!constant_ones_half_) { - constant_ones_half_ = cuda::CreateConstantOnes(); - } - return reinterpret_cast(constant_ones_half_->GetBuffer(count)); - } else { - return nullptr; - } - } - - AllocatorPtr GetAllocator() const { - return allocator_; - } - private: cublasHandle_t cublas_handle_ = nullptr; cudnnHandle_t cudnn_handle_ = nullptr; @@ -149,12 +142,6 @@ private: // note that cudaEvent will be assigned at OnRunEnd() when PerThreadContext destory // so the ownership is passed to deferred_release_cpu_ptr_ cudaEvent_t current_deferred_release_event_ = nullptr; - - std::unique_ptr> constant_ones_float_; - std::unique_ptr> constant_ones_double_; - std::unique_ptr> constant_ones_half_; - - AllocatorPtr allocator_; }; using PerThreadContextMap = std::unordered_map>;