mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-26 03:00:54 +00:00
Revert the cuda algo finding change as this causes a significant memory bloat. (#8181)
* Revert the cuda algo finding change as this causes a significant memory bloat. * Address PR comment
This commit is contained in:
parent
83be3759bc
commit
9ec0fd6a1c
2 changed files with 14 additions and 62 deletions
|
|
@ -33,53 +33,6 @@ REGISTER_KERNEL_TYPED(float)
|
|||
REGISTER_KERNEL_TYPED(double)
|
||||
REGISTER_KERNEL_TYPED(MLFloat16)
|
||||
|
||||
template <typename T>
|
||||
const cudnnConvolutionFwdAlgo_t Conv<T>::kAllAlgos[] = {
|
||||
CUDNN_CONVOLUTION_FWD_ALGO_GEMM,
|
||||
CUDNN_CONVOLUTION_FWD_ALGO_FFT,
|
||||
CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING,
|
||||
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM,
|
||||
CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM,
|
||||
CUDNN_CONVOLUTION_FWD_ALGO_DIRECT,
|
||||
CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD,
|
||||
CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED,
|
||||
};
|
||||
|
||||
cudnnStatus_t getWorkspaceSize(const CudnnConvState<cudnnConvolutionFwdAlgoPerf_t>& s,
|
||||
cudnnConvolutionFwdAlgo_t algo, size_t* sz) {
|
||||
return cudnnGetConvolutionForwardWorkspaceSize(
|
||||
s.handle,
|
||||
s.x_tensor,
|
||||
s.w_desc,
|
||||
s.conv_desc,
|
||||
s.y_tensor,
|
||||
algo,
|
||||
sz);
|
||||
}
|
||||
|
||||
template <typename algo_t>
|
||||
size_t getMaxWorkspaceSize(const CudnnConvState<cudnnConvolutionFwdAlgoPerf_t>& s,
|
||||
const algo_t* algo, int n_algo) {
|
||||
size_t max_ws_size = 0;
|
||||
|
||||
// TODO: get maximum available size from memory areana
|
||||
|
||||
size_t free, total;
|
||||
CUDA_CALL_THROW(cudaMemGetInfo(&free, &total));
|
||||
// Assuming 10% of fragmentation
|
||||
free = static_cast<size_t>(static_cast<double>(free) * 0.9);
|
||||
|
||||
for (int i = 0; i < n_algo; i++) {
|
||||
cudnnStatus_t err;
|
||||
size_t sz;
|
||||
err = getWorkspaceSize(s, algo[i], &sz);
|
||||
if (CUDNN_STATUS_SUCCESS != err || sz == 0 || sz < max_ws_size || sz > free)
|
||||
continue;
|
||||
max_ws_size = sz;
|
||||
}
|
||||
return max_ws_size;
|
||||
}
|
||||
|
||||
Status SliceOutUnwantedOutputSection(cudaStream_t stream,
|
||||
const void* input_data, const std::vector<int64_t>& input_dims,
|
||||
void* output_data,
|
||||
|
|
@ -260,12 +213,9 @@ Status Conv<T>::UpdateState(OpKernelContext* context, bool bias_expected) const
|
|||
ORT_ENFORCE(cudnn_conv_algo > -1 && cudnn_conv_algo < 3, "cudnn_conv_algo should be 0, 1 or 2, but got ", cudnn_conv_algo);
|
||||
switch (cudnn_conv_algo) {
|
||||
case 0: {
|
||||
static constexpr int num_algos = CUDNN_CONVOLUTION_FWD_ALGO_COUNT;
|
||||
size_t max_ws_size = getMaxWorkspaceSize(s_, kAllAlgos, num_algos);
|
||||
IAllocatorUniquePtr<void> algo_search_workspace = GetScratchBuffer<void>(max_ws_size);
|
||||
|
||||
IAllocatorUniquePtr<void> algo_search_workspace = GetScratchBuffer<void>(AlgoSearchWorkspaceSize);
|
||||
CUDNN_RETURN_IF_ERROR(cudnnFindConvolutionForwardAlgorithmEx(
|
||||
s_.handle,
|
||||
CudnnHandle(),
|
||||
s_.x_tensor,
|
||||
s_.x_data,
|
||||
s_.w_desc,
|
||||
|
|
@ -277,12 +227,12 @@ Status Conv<T>::UpdateState(OpKernelContext* context, bool bias_expected) const
|
|||
&algo_count, // returnedAlgoCount
|
||||
&perf,
|
||||
algo_search_workspace.get(),
|
||||
max_ws_size));
|
||||
AlgoSearchWorkspaceSize));
|
||||
break;
|
||||
}
|
||||
case 1:
|
||||
CUDNN_RETURN_IF_ERROR(cudnnGetConvolutionForwardAlgorithm_v7(
|
||||
s_.handle,
|
||||
CudnnHandle(),
|
||||
s_.x_tensor,
|
||||
s_.w_desc,
|
||||
s_.conv_desc,
|
||||
|
|
@ -294,7 +244,14 @@ Status Conv<T>::UpdateState(OpKernelContext* context, bool bias_expected) const
|
|||
|
||||
default:
|
||||
perf.algo = kDefaultConvAlgo;
|
||||
CUDNN_RETURN_IF_ERROR(getWorkspaceSize(s_, perf.algo, &perf.memory));
|
||||
CUDNN_RETURN_IF_ERROR(cudnnGetConvolutionForwardWorkspaceSize(
|
||||
CudnnHandle(),
|
||||
s_.x_tensor,
|
||||
s_.w_desc,
|
||||
s_.conv_desc,
|
||||
s_.y_tensor,
|
||||
perf.algo,
|
||||
&perf.memory));
|
||||
if (std::is_same<T, MLFloat16>::value) {
|
||||
perf.mathType = CUDNN_TENSOR_OP_MATH;
|
||||
} else {
|
||||
|
|
@ -333,7 +290,7 @@ Status Conv<T>::ComputeInternal(OpKernelContext* context) const {
|
|||
const auto alpha = Consts<CudaT>::One;
|
||||
const auto beta = Consts<CudaT>::Zero;
|
||||
IAllocatorUniquePtr<void> workspace = GetWorkSpace();
|
||||
CUDNN_RETURN_IF_ERROR(cudnnConvolutionForward(s_.handle,
|
||||
CUDNN_RETURN_IF_ERROR(cudnnConvolutionForward(CudnnHandle(),
|
||||
&alpha,
|
||||
s_.x_tensor,
|
||||
s_.x_data,
|
||||
|
|
@ -347,7 +304,7 @@ Status Conv<T>::ComputeInternal(OpKernelContext* context) const {
|
|||
s_.y_tensor,
|
||||
s_.y_data));
|
||||
if (nullptr != s_.b_data) {
|
||||
CUDNN_RETURN_IF_ERROR(cudnnAddTensor(s_.handle, &alpha, s_.b_tensor, s_.b_data,
|
||||
CUDNN_RETURN_IF_ERROR(cudnnAddTensor(CudnnHandle(), &alpha, s_.b_tensor, s_.b_data,
|
||||
&alpha, s_.y_tensor, s_.y_data));
|
||||
}
|
||||
// To deal with asymmetric padding, we may have over-padded on one or both sides of the spatial dimensions
|
||||
|
|
|
|||
|
|
@ -111,8 +111,6 @@ constexpr size_t MAX_CACHED_ALGO_PERF_RESULTS = 10000;
|
|||
|
||||
template <typename AlgoPerfType>
|
||||
struct CudnnConvState {
|
||||
cudnnHandle_t handle;
|
||||
|
||||
// if x/w dims changed, update algo and cudnnTensors
|
||||
std::vector<int64_t> last_x_dims;
|
||||
std::vector<int64_t> last_w_dims;
|
||||
|
|
@ -175,8 +173,6 @@ class Conv : public CudaKernel {
|
|||
Conv(const OpKernelInfo& info) : CudaKernel(info), conv_attrs_(info) {
|
||||
auto pads_size = conv_attrs_.pads.size();
|
||||
ORT_ENFORCE(pads_size % 2 == 0);
|
||||
|
||||
s_.handle = CudnnHandle();
|
||||
}
|
||||
|
||||
Status ComputeInternal(OpKernelContext* context) const override;
|
||||
|
|
@ -190,7 +186,6 @@ class Conv : public CudaKernel {
|
|||
ConvAttributes conv_attrs_;
|
||||
mutable CudnnConvState<cudnnConvolutionFwdAlgoPerf_t> s_;
|
||||
constexpr static auto kDefaultConvAlgo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
|
||||
static const cudnnConvolutionFwdAlgo_t kAllAlgos[];
|
||||
};
|
||||
|
||||
Status SliceOutUnwantedOutputSection(cudaStream_t stream,
|
||||
|
|
|
|||
Loading…
Reference in a new issue