mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-21 21:52:11 +00:00
Minor optimization in CUDA Reduction ops (#4353)
This commit is contained in:
parent
274e6b4153
commit
012aaa6491
1 changed file with 16 additions and 17 deletions
|
|
@ -376,7 +376,6 @@ static Status PrepareForReduce(const Tensor* X,
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
// `input_shape_override` is the input shape for compute purposes (if provided)
|
||||
// `input_shape_override` is the input shape for compute purposes (if provided)
|
||||
template <typename T, cudnnReduceTensorIndices_t ReduceTensorIndices>
|
||||
static Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata,
|
||||
|
|
@ -395,16 +394,16 @@ static Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& in
|
|||
int64_t rank = prepare_reduce_metadata.rank;
|
||||
int64_t stride = prepare_reduce_metadata.stride;
|
||||
|
||||
// This reduction keeps adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000.
|
||||
// Therefore zeroing out the memory is required
|
||||
CUDA_RETURN_IF_ERROR(cudaMemset(output.MutableDataRaw(), 0, output.SizeInBytes()));
|
||||
|
||||
// special case when there is a dim value of 0 in the shape.
|
||||
if (input_count == 0) {
|
||||
assert(output.Shape().Size() == 0);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// This reduction keeps adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000.
|
||||
// Therefore zeroing out the memory is required
|
||||
CUDA_RETURN_IF_ERROR(cudaMemset(output.MutableDataRaw(), 0, output.SizeInBytes()));
|
||||
|
||||
IAllocatorUniquePtr<float> temp_X;
|
||||
cudnnDataType_t cudnn_type_X = CudnnTensor::GetDataType<CudaT>();
|
||||
|
||||
|
|
@ -629,10 +628,6 @@ Status ReduceKernel<true>::ComputeImpl<int32_t, CUDNN_REDUCE_TENSOR_NO_INDICES>(
|
|||
std::vector<int64_t>& input_dims_cudnn = prepare_reduce_metadata.input_dims_cudnn;
|
||||
std::vector<int64_t>& output_dims_cudnn = prepare_reduce_metadata.output_dims_cudnn;
|
||||
|
||||
// This reduction keeps adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000.
|
||||
// Therefore zeroing out the memory is required
|
||||
CUDA_RETURN_IF_ERROR(cudaMemset(Y->MutableDataRaw(), 0, Y->SizeInBytes()));
|
||||
|
||||
// special case when there is a dim value of 0 in the shape.
|
||||
if (input_count == 0) {
|
||||
assert(Y->Shape().Size() == 0);
|
||||
|
|
@ -647,6 +642,10 @@ Status ReduceKernel<true>::ComputeImpl<int32_t, CUDNN_REDUCE_TENSOR_NO_INDICES>(
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
// This reduction keeps adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000.
|
||||
// Therefore zeroing out the memory is required
|
||||
CUDA_RETURN_IF_ERROR(cudaMemset(Y->MutableDataRaw(), 0, Y->SizeInBytes()));
|
||||
|
||||
size_t indices_bytes = 0;
|
||||
size_t workspace_bytes = 0;
|
||||
CudnnTensor input_tensor;
|
||||
|
|
@ -706,10 +705,6 @@ Status ReduceKernel<true>::ComputeImpl<int8_t, CUDNN_REDUCE_TENSOR_NO_INDICES>(O
|
|||
std::vector<int64_t>& input_dims_cudnn = prepare_reduce_metadata.input_dims_cudnn;
|
||||
std::vector<int64_t>& output_dims_cudnn = prepare_reduce_metadata.output_dims_cudnn;
|
||||
|
||||
// This reduction keeps adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000.
|
||||
// Therefore zeroing out the memory is required
|
||||
CUDA_RETURN_IF_ERROR(cudaMemset(Y->MutableDataRaw(), 0, Y->SizeInBytes()));
|
||||
|
||||
// special case when there is a dim value of 0 in the shape.
|
||||
if (input_count == 0) {
|
||||
assert(Y->Shape().Size() == 0);
|
||||
|
|
@ -726,6 +721,10 @@ Status ReduceKernel<true>::ComputeImpl<int8_t, CUDNN_REDUCE_TENSOR_NO_INDICES>(O
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
// This reduction keeps adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000.
|
||||
// Therefore zeroing out the memory is required
|
||||
CUDA_RETURN_IF_ERROR(cudaMemset(Y->MutableDataRaw(), 0, Y->SizeInBytes()));
|
||||
|
||||
size_t indices_bytes = 0;
|
||||
size_t workspace_bytes = 0;
|
||||
CudnnTensor input_tensor;
|
||||
|
|
@ -785,10 +784,6 @@ Status ReduceKernel<true>::ComputeImpl<uint8_t, CUDNN_REDUCE_TENSOR_NO_INDICES>(
|
|||
std::vector<int64_t>& input_dims_cudnn = prepare_reduce_metadata.input_dims_cudnn;
|
||||
std::vector<int64_t>& output_dims_cudnn = prepare_reduce_metadata.output_dims_cudnn;
|
||||
|
||||
// This reduction keeps adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000.
|
||||
// Therefore zeroing out the memory is required
|
||||
CUDA_RETURN_IF_ERROR(cudaMemset(Y->MutableDataRaw(), 0, Y->SizeInBytes()));
|
||||
|
||||
// special case when there is a dim value of 0 in the shape.
|
||||
if (input_count == 0) {
|
||||
assert(Y->Shape().Size() == 0);
|
||||
|
|
@ -805,6 +800,10 @@ Status ReduceKernel<true>::ComputeImpl<uint8_t, CUDNN_REDUCE_TENSOR_NO_INDICES>(
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
// This reduction keeps adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000.
|
||||
// Therefore zeroing out the memory is required
|
||||
CUDA_RETURN_IF_ERROR(cudaMemset(Y->MutableDataRaw(), 0, Y->SizeInBytes()));
|
||||
|
||||
size_t indices_bytes = 0;
|
||||
size_t workspace_bytes = 0;
|
||||
CudnnTensor input_tensor;
|
||||
|
|
|
|||
Loading…
Reference in a new issue