diff --git a/onnxruntime/core/providers/cuda/nn/conv.cc b/onnxruntime/core/providers/cuda/nn/conv.cc index 644bbe4d2d..00f5e0bb54 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.cc +++ b/onnxruntime/core/providers/cuda/nn/conv.cc @@ -33,53 +33,6 @@ REGISTER_KERNEL_TYPED(float) REGISTER_KERNEL_TYPED(double) REGISTER_KERNEL_TYPED(MLFloat16) -template -const cudnnConvolutionFwdAlgo_t Conv::kAllAlgos[] = { - CUDNN_CONVOLUTION_FWD_ALGO_GEMM, - CUDNN_CONVOLUTION_FWD_ALGO_FFT, - CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING, - CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, - CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, - CUDNN_CONVOLUTION_FWD_ALGO_DIRECT, - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD, - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED, -}; - -cudnnStatus_t getWorkspaceSize(const CudnnConvState& s, - cudnnConvolutionFwdAlgo_t algo, size_t* sz) { - return cudnnGetConvolutionForwardWorkspaceSize( - s.handle, - s.x_tensor, - s.w_desc, - s.conv_desc, - s.y_tensor, - algo, - sz); -} - -template -size_t getMaxWorkspaceSize(const CudnnConvState& s, - const algo_t* algo, int n_algo) { - size_t max_ws_size = 0; - - // TODO: get maximum available size from memory areana - - size_t free, total; - CUDA_CALL_THROW(cudaMemGetInfo(&free, &total)); - // Assuming 10% of fragmentation - free = static_cast(static_cast(free) * 0.9); - - for (int i = 0; i < n_algo; i++) { - cudnnStatus_t err; - size_t sz; - err = getWorkspaceSize(s, algo[i], &sz); - if (CUDNN_STATUS_SUCCESS != err || sz == 0 || sz < max_ws_size || sz > free) - continue; - max_ws_size = sz; - } - return max_ws_size; -} - Status SliceOutUnwantedOutputSection(cudaStream_t stream, const void* input_data, const std::vector& input_dims, void* output_data, @@ -260,12 +213,9 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) const ORT_ENFORCE(cudnn_conv_algo > -1 && cudnn_conv_algo < 3, "cudnn_conv_algo should be 0, 1 or 2, but got ", cudnn_conv_algo); switch (cudnn_conv_algo) { case 0: { - static constexpr int num_algos = CUDNN_CONVOLUTION_FWD_ALGO_COUNT; - size_t max_ws_size = getMaxWorkspaceSize(s_, kAllAlgos, num_algos); - IAllocatorUniquePtr algo_search_workspace = GetScratchBuffer(max_ws_size); - + IAllocatorUniquePtr algo_search_workspace = GetScratchBuffer(AlgoSearchWorkspaceSize); CUDNN_RETURN_IF_ERROR(cudnnFindConvolutionForwardAlgorithmEx( - s_.handle, + CudnnHandle(), s_.x_tensor, s_.x_data, s_.w_desc, @@ -277,12 +227,12 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) const &algo_count, // returnedAlgoCount &perf, algo_search_workspace.get(), - max_ws_size)); + AlgoSearchWorkspaceSize)); break; } case 1: CUDNN_RETURN_IF_ERROR(cudnnGetConvolutionForwardAlgorithm_v7( - s_.handle, + CudnnHandle(), s_.x_tensor, s_.w_desc, s_.conv_desc, @@ -294,7 +244,14 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) const default: perf.algo = kDefaultConvAlgo; - CUDNN_RETURN_IF_ERROR(getWorkspaceSize(s_, perf.algo, &perf.memory)); + CUDNN_RETURN_IF_ERROR(cudnnGetConvolutionForwardWorkspaceSize( + CudnnHandle(), + s_.x_tensor, + s_.w_desc, + s_.conv_desc, + s_.y_tensor, + perf.algo, + &perf.memory)); if (std::is_same::value) { perf.mathType = CUDNN_TENSOR_OP_MATH; } else { @@ -333,7 +290,7 @@ Status Conv::ComputeInternal(OpKernelContext* context) const { const auto alpha = Consts::One; const auto beta = Consts::Zero; IAllocatorUniquePtr workspace = GetWorkSpace(); - CUDNN_RETURN_IF_ERROR(cudnnConvolutionForward(s_.handle, + CUDNN_RETURN_IF_ERROR(cudnnConvolutionForward(CudnnHandle(), &alpha, s_.x_tensor, s_.x_data, @@ -347,7 +304,7 @@ Status Conv::ComputeInternal(OpKernelContext* context) const { s_.y_tensor, s_.y_data)); if (nullptr != s_.b_data) { - CUDNN_RETURN_IF_ERROR(cudnnAddTensor(s_.handle, &alpha, s_.b_tensor, s_.b_data, + CUDNN_RETURN_IF_ERROR(cudnnAddTensor(CudnnHandle(), &alpha, s_.b_tensor, s_.b_data, &alpha, s_.y_tensor, s_.y_data)); } // To deal with asymmetric padding, we may have over-padded on one or both sides of the spatial dimensions diff --git a/onnxruntime/core/providers/cuda/nn/conv.h b/onnxruntime/core/providers/cuda/nn/conv.h index ab6647941b..5abaa1f595 100644 --- a/onnxruntime/core/providers/cuda/nn/conv.h +++ b/onnxruntime/core/providers/cuda/nn/conv.h @@ -111,8 +111,6 @@ constexpr size_t MAX_CACHED_ALGO_PERF_RESULTS = 10000; template struct CudnnConvState { - cudnnHandle_t handle; - // if x/w dims changed, update algo and cudnnTensors std::vector last_x_dims; std::vector last_w_dims; @@ -175,8 +173,6 @@ class Conv : public CudaKernel { Conv(const OpKernelInfo& info) : CudaKernel(info), conv_attrs_(info) { auto pads_size = conv_attrs_.pads.size(); ORT_ENFORCE(pads_size % 2 == 0); - - s_.handle = CudnnHandle(); } Status ComputeInternal(OpKernelContext* context) const override; @@ -190,7 +186,6 @@ class Conv : public CudaKernel { ConvAttributes conv_attrs_; mutable CudnnConvState s_; constexpr static auto kDefaultConvAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; - static const cudnnConvolutionFwdAlgo_t kAllAlgos[]; }; Status SliceOutUnwantedOutputSection(cudaStream_t stream,