diff --git a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc index e10638b5b5..1b0c03624e 100644 --- a/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/cuda/reduction/reduction_ops.cc @@ -376,7 +376,6 @@ static Status PrepareForReduce(const Tensor* X, return Status::OK(); } -// `input_shape_override` is the input shape for compute purposes (if provided) // `input_shape_override` is the input shape for compute purposes (if provided) template static Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& input, PrepareReduceMetadata& prepare_reduce_metadata, @@ -395,16 +394,16 @@ static Status ReduceComputeCore(CUDAExecutionProvider& cuda_ep, const Tensor& in int64_t rank = prepare_reduce_metadata.rank; int64_t stride = prepare_reduce_metadata.stride; - // This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000. - // Therefore zeroing out the memory is required - CUDA_RETURN_IF_ERROR(cudaMemset(output.MutableDataRaw(), 0, output.SizeInBytes())); - // special case when there is a dim value of 0 in the shape. if (input_count == 0) { assert(output.Shape().Size() == 0); return Status::OK(); } + // This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000. + // Therefore zeroing out the memory is required + CUDA_RETURN_IF_ERROR(cudaMemset(output.MutableDataRaw(), 0, output.SizeInBytes())); + IAllocatorUniquePtr temp_X; cudnnDataType_t cudnn_type_X = CudnnTensor::GetDataType(); @@ -629,10 +628,6 @@ Status ReduceKernel::ComputeImpl( std::vector& input_dims_cudnn = prepare_reduce_metadata.input_dims_cudnn; std::vector& output_dims_cudnn = prepare_reduce_metadata.output_dims_cudnn; - // This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000. - // Therefore zeroing out the memory is required - CUDA_RETURN_IF_ERROR(cudaMemset(Y->MutableDataRaw(), 0, Y->SizeInBytes())); - // special case when there is a dim value of 0 in the shape. if (input_count == 0) { assert(Y->Shape().Size() == 0); @@ -647,6 +642,10 @@ Status ReduceKernel::ComputeImpl( return Status::OK(); } + // This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000. + // Therefore zeroing out the memory is required + CUDA_RETURN_IF_ERROR(cudaMemset(Y->MutableDataRaw(), 0, Y->SizeInBytes())); + size_t indices_bytes = 0; size_t workspace_bytes = 0; CudnnTensor input_tensor; @@ -706,10 +705,6 @@ Status ReduceKernel::ComputeImpl(O std::vector& input_dims_cudnn = prepare_reduce_metadata.input_dims_cudnn; std::vector& output_dims_cudnn = prepare_reduce_metadata.output_dims_cudnn; - // This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000. - // Therefore zeroing out the memory is required - CUDA_RETURN_IF_ERROR(cudaMemset(Y->MutableDataRaw(), 0, Y->SizeInBytes())); - // special case when there is a dim value of 0 in the shape. if (input_count == 0) { assert(Y->Shape().Size() == 0); @@ -726,6 +721,10 @@ Status ReduceKernel::ComputeImpl(O return Status::OK(); } + // This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000. + // Therefore zeroing out the memory is required + CUDA_RETURN_IF_ERROR(cudaMemset(Y->MutableDataRaw(), 0, Y->SizeInBytes())); + size_t indices_bytes = 0; size_t workspace_bytes = 0; CudnnTensor input_tensor; @@ -785,10 +784,6 @@ Status ReduceKernel::ComputeImpl( std::vector& input_dims_cudnn = prepare_reduce_metadata.input_dims_cudnn; std::vector& output_dims_cudnn = prepare_reduce_metadata.output_dims_cudnn; - // This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000. - // Therefore zeroing out the memory is required - CUDA_RETURN_IF_ERROR(cudaMemset(Y->MutableDataRaw(), 0, Y->SizeInBytes())); - // special case when there is a dim value of 0 in the shape. if (input_count == 0) { assert(Y->Shape().Size() == 0); @@ -805,6 +800,10 @@ Status ReduceKernel::ComputeImpl( return Status::OK(); } + // This reduction keep adding values to this buffer. If a non-zero value, say 1000, is here, the sum will start with 1000. + // Therefore zeroing out the memory is required + CUDA_RETURN_IF_ERROR(cudaMemset(Y->MutableDataRaw(), 0, Y->SizeInBytes())); + size_t indices_bytes = 0; size_t workspace_bytes = 0; CudnnTensor input_tensor;