Minor optimization in CUDA Reduction ops (#4353)

This commit is contained in:
Hariharan Seshadri 2020-06-28 01:14:28 -07:00 committed by GitHub
parent 274e6b4153
commit 012aaa6491
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@ -376,7 +376,6 @@ static Status PrepareForReduce(const Tensor* X,
return Status::OK();
}
// `input_shape_override` is the input shape for compute purposes (if provided)
// `input_shape_override` is the input shape for compute purposes (if provided)
template <typename T, cudnnReduceTensorIndices_t ReduceTensorIndices>
static Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata,
@ -395,16 +394,16 @@ static Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& in
int64_t rank = prepare_reduce_metadata.rank;
int64_t stride = prepare_reduce_metadata.stride;
// This reduction keeps adding values to this buffer. If a non-zero value, say 1000, is already there, the sum will start from 1000.
// Therefore, zeroing out the memory is required.
CUDA_RETURN_IF_ERROR(cudaMemset(output.MutableDataRaw(), 0, output.SizeInBytes()));
// special case when there is a dim value of 0 in the shape.
if (input_count == 0) {
assert(output.Shape().Size() == 0);
return Status::OK();
}
// This reduction keeps adding values to this buffer. If a non-zero value, say 1000, is already there, the sum will start from 1000.
// Therefore, zeroing out the memory is required.
CUDA_RETURN_IF_ERROR(cudaMemset(output.MutableDataRaw(), 0, output.SizeInBytes()));
IAllocatorUniquePtr<float> temp_X;
cudnnDataType_t cudnn_type_X = CudnnTensor::GetDataType<CudaT>();
@ -629,10 +628,6 @@ Status ReduceKernel<true>::ComputeImpl<int32_t, CUDNN_REDUCE_TENSOR_NO_INDICES>(
std::vector<int64_t>& input_dims_cudnn = prepare_reduce_metadata.input_dims_cudnn;
std::vector<int64_t>& output_dims_cudnn = prepare_reduce_metadata.output_dims_cudnn;
// This reduction keeps adding values to this buffer. If a non-zero value, say 1000, is already there, the sum will start from 1000.
// Therefore, zeroing out the memory is required.
CUDA_RETURN_IF_ERROR(cudaMemset(Y->MutableDataRaw(), 0, Y->SizeInBytes()));
// special case when there is a dim value of 0 in the shape.
if (input_count == 0) {
assert(Y->Shape().Size() == 0);
@ -647,6 +642,10 @@ Status ReduceKernel<true>::ComputeImpl<int32_t, CUDNN_REDUCE_TENSOR_NO_INDICES>(
return Status::OK();
}
// This reduction keeps adding values to this buffer. If a non-zero value, say 1000, is already there, the sum will start from 1000.
// Therefore, zeroing out the memory is required.
CUDA_RETURN_IF_ERROR(cudaMemset(Y->MutableDataRaw(), 0, Y->SizeInBytes()));
size_t indices_bytes = 0;
size_t workspace_bytes = 0;
CudnnTensor input_tensor;
@ -706,10 +705,6 @@ Status ReduceKernel<true>::ComputeImpl<int8_t, CUDNN_REDUCE_TENSOR_NO_INDICES>(O
std::vector<int64_t>& input_dims_cudnn = prepare_reduce_metadata.input_dims_cudnn;
std::vector<int64_t>& output_dims_cudnn = prepare_reduce_metadata.output_dims_cudnn;
// This reduction keeps adding values to this buffer. If a non-zero value, say 1000, is already there, the sum will start from 1000.
// Therefore, zeroing out the memory is required.
CUDA_RETURN_IF_ERROR(cudaMemset(Y->MutableDataRaw(), 0, Y->SizeInBytes()));
// special case when there is a dim value of 0 in the shape.
if (input_count == 0) {
assert(Y->Shape().Size() == 0);
@ -726,6 +721,10 @@ Status ReduceKernel<true>::ComputeImpl<int8_t, CUDNN_REDUCE_TENSOR_NO_INDICES>(O
return Status::OK();
}
// This reduction keeps adding values to this buffer. If a non-zero value, say 1000, is already there, the sum will start from 1000.
// Therefore, zeroing out the memory is required.
CUDA_RETURN_IF_ERROR(cudaMemset(Y->MutableDataRaw(), 0, Y->SizeInBytes()));
size_t indices_bytes = 0;
size_t workspace_bytes = 0;
CudnnTensor input_tensor;
@ -785,10 +784,6 @@ Status ReduceKernel<true>::ComputeImpl<uint8_t, CUDNN_REDUCE_TENSOR_NO_INDICES>(
std::vector<int64_t>& input_dims_cudnn = prepare_reduce_metadata.input_dims_cudnn;
std::vector<int64_t>& output_dims_cudnn = prepare_reduce_metadata.output_dims_cudnn;
// This reduction keeps adding values to this buffer. If a non-zero value, say 1000, is already there, the sum will start from 1000.
// Therefore, zeroing out the memory is required.
CUDA_RETURN_IF_ERROR(cudaMemset(Y->MutableDataRaw(), 0, Y->SizeInBytes()));
// special case when there is a dim value of 0 in the shape.
if (input_count == 0) {
assert(Y->Shape().Size() == 0);
@ -805,6 +800,10 @@ Status ReduceKernel<true>::ComputeImpl<uint8_t, CUDNN_REDUCE_TENSOR_NO_INDICES>(
return Status::OK();
}
// This reduction keeps adding values to this buffer. If a non-zero value, say 1000, is already there, the sum will start from 1000.
// Therefore, zeroing out the memory is required.
CUDA_RETURN_IF_ERROR(cudaMemset(Y->MutableDataRaw(), 0, Y->SizeInBytes()));
size_t indices_bytes = 0;
size_t workspace_bytes = 0;
CudnnTensor input_tensor;