Revert the cuda algo finding change as this causes a significant memory bloat. (#8181)

* Revert the cuda algo finding change as this causes a significant memory bloat.

* Address PR comment
This commit is contained in:
Pranav Sharma 2021-06-28 22:49:36 -07:00 committed by GitHub
parent 83be3759bc
commit 9ec0fd6a1c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 14 additions and 62 deletions

View file

@ -33,53 +33,6 @@ REGISTER_KERNEL_TYPED(float)
REGISTER_KERNEL_TYPED(double)
REGISTER_KERNEL_TYPED(MLFloat16)
template <typename T>
const cudnnConvolutionFwdAlgo_t Conv<T>::kAllAlgos[] = {
CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
CUDNN_CONVOLUTION_FWD_ALGO_FFT,
CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING,
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM,
CUDNN_CONVOLUTION_FWD_ALGO_DIRECT,
CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED,
};
cudnnStatus_t getWorkspaceSize(const CudnnConvState<cudnnConvolutionFwdAlgoPerf_t>& s,
cudnnConvolutionFwdAlgo_t algo, size_t* sz) {
return cudnnGetConvolutionForwardWorkspaceSize(
s.handle,
s.x_tensor,
s.w_desc,
s.conv_desc,
s.y_tensor,
algo,
sz);
}
template <typename algo_t>
size_t getMaxWorkspaceSize(const CudnnConvState<cudnnConvolutionFwdAlgoPerf_t>& s,
const algo_t* algo, int n_algo) {
size_t max_ws_size = 0;
// TODO: get maximum available size from memory areana
size_t free, total;
CUDA_CALL_THROW(cudaMemGetInfo(&free, &total));
// Assuming 10% of fragmentation
free = static_cast<size_t>(static_cast<double>(free) * 0.9);
for (int i = 0; i < n_algo; i++) {
cudnnStatus_t err;
size_t sz;
err = getWorkspaceSize(s, algo[i], &sz);
if (CUDNN_STATUS_SUCCESS != err || sz == 0 || sz < max_ws_size || sz > free)
continue;
max_ws_size = sz;
}
return max_ws_size;
}
Status SliceOutUnwantedOutputSection(cudaStream_t stream,
const void* input_data, const std::vector<int64_t>& input_dims,
void* output_data,
@ -260,12 +213,9 @@ Status Conv<T>::UpdateState(OpKernelContext* context, bool bias_expected) const
ORT_ENFORCE(cudnn_conv_algo > -1 && cudnn_conv_algo < 3, "cudnn_conv_algo should be 0, 1 or 2, but got ", cudnn_conv_algo);
switch (cudnn_conv_algo) {
case 0: {
static constexpr int num_algos = CUDNN_CONVOLUTION_FWD_ALGO_COUNT;
size_t max_ws_size = getMaxWorkspaceSize(s_, kAllAlgos, num_algos);
IAllocatorUniquePtr<void> algo_search_workspace = GetScratchBuffer<void>(max_ws_size);
IAllocatorUniquePtr<void> algo_search_workspace = GetScratchBuffer<void>(AlgoSearchWorkspaceSize);
CUDNN_RETURN_IF_ERROR(cudnnFindConvolutionForwardAlgorithmEx(
s_.handle,
CudnnHandle(),
s_.x_tensor,
s_.x_data,
s_.w_desc,
@ -277,12 +227,12 @@ Status Conv<T>::UpdateState(OpKernelContext* context, bool bias_expected) const
&algo_count, // returnedAlgoCount
&perf,
algo_search_workspace.get(),
max_ws_size));
AlgoSearchWorkspaceSize));
break;
}
case 1:
CUDNN_RETURN_IF_ERROR(cudnnGetConvolutionForwardAlgorithm_v7(
s_.handle,
CudnnHandle(),
s_.x_tensor,
s_.w_desc,
s_.conv_desc,
@ -294,7 +244,14 @@ Status Conv<T>::UpdateState(OpKernelContext* context, bool bias_expected) const
default:
perf.algo = kDefaultConvAlgo;
CUDNN_RETURN_IF_ERROR(getWorkspaceSize(s_, perf.algo, &perf.memory));
CUDNN_RETURN_IF_ERROR(cudnnGetConvolutionForwardWorkspaceSize(
CudnnHandle(),
s_.x_tensor,
s_.w_desc,
s_.conv_desc,
s_.y_tensor,
perf.algo,
&perf.memory));
if (std::is_same<T, MLFloat16>::value) {
perf.mathType = CUDNN_TENSOR_OP_MATH;
} else {
@ -333,7 +290,7 @@ Status Conv<T>::ComputeInternal(OpKernelContext* context) const {
const auto alpha = Consts<CudaT>::One;
const auto beta = Consts<CudaT>::Zero;
IAllocatorUniquePtr<void> workspace = GetWorkSpace();
CUDNN_RETURN_IF_ERROR(cudnnConvolutionForward(s_.handle,
CUDNN_RETURN_IF_ERROR(cudnnConvolutionForward(CudnnHandle(),
&alpha,
s_.x_tensor,
s_.x_data,
@ -347,7 +304,7 @@ Status Conv<T>::ComputeInternal(OpKernelContext* context) const {
s_.y_tensor,
s_.y_data));
if (nullptr != s_.b_data) {
CUDNN_RETURN_IF_ERROR(cudnnAddTensor(s_.handle, &alpha, s_.b_tensor, s_.b_data,
CUDNN_RETURN_IF_ERROR(cudnnAddTensor(CudnnHandle(), &alpha, s_.b_tensor, s_.b_data,
&alpha, s_.y_tensor, s_.y_data));
}
// To deal with asymmetric padding, we may have over-padded on one or both sides of the spatial dimensions

View file

@ -111,8 +111,6 @@ constexpr size_t MAX_CACHED_ALGO_PERF_RESULTS = 10000;
template <typename AlgoPerfType>
struct CudnnConvState {
cudnnHandle_t handle;
// if x/w dims changed, update algo and cudnnTensors
std::vector<int64_t> last_x_dims;
std::vector<int64_t> last_w_dims;
@ -175,8 +173,6 @@ class Conv : public CudaKernel {
Conv(const OpKernelInfo& info) : CudaKernel(info), conv_attrs_(info) {
auto pads_size = conv_attrs_.pads.size();
ORT_ENFORCE(pads_size % 2 == 0);
s_.handle = CudnnHandle();
}
Status ComputeInternal(OpKernelContext* context) const override;
@ -190,7 +186,6 @@ class Conv : public CudaKernel {
ConvAttributes conv_attrs_;
mutable CudnnConvState<cudnnConvolutionFwdAlgoPerf_t> s_;
constexpr static auto kDefaultConvAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
static const cudnnConvolutionFwdAlgo_t kAllAlgos[];
};
Status SliceOutUnwantedOutputSection(cudaStream_t stream,