Use the built-in reduce_sum() for simple reduction cases, specifically when reducing all elements to a scalar.

This commit is contained in:
Jesse Benson 2021-04-13 15:33:16 -07:00 committed by Jesse Benson
parent 3eb2d349a6
commit be79575c6a
3 changed files with 24 additions and 26 deletions

View file

@ -49,7 +49,6 @@ Status SoftmaxCrossEntropyLoss<T, Tin>::ComputeInternal(OpKernelContext* ctx) co
int64_t C;
onnxruntime::contrib::GetNDCFromLogitAndLabelShape(logit_shape, label_shape, N_D, C);
const TensorShape logit_reshape({N_D, C});
const TensorShape label_reshape({N_D});
Tensor* total_loss = ctx->Output(0, reduction_ == ReductionType::NONE ? TensorShape(label.Shape()) : TensorShape({}));
T* total_loss_data = total_loss->template MutableData<T>();
T* tmp_loss_sample_buffer = nullptr;
@ -111,14 +110,15 @@ Status SoftmaxCrossEntropyLoss<T, Tin>::ComputeInternal(OpKernelContext* ctx) co
CUDA_RETURN_IF_ERROR(cudaMemsetAsync(weight_data_nd_data, 0, N_D * sizeof(T), Stream()));
ComputeWeightsSoftmaxCrossEntropyImpl(Stream(), label_data, weight_data, N_D, C, ignore_index_, weight_data_nd_data);
// Compute buffer size in byte for reduction APIs.
const auto buffer_size =
compute_reduction_buffer_size<T>(static_cast<int>(N_D));
// Allocate reduction buffer whose size is buffer_size bytes, or nullptr if no reduction.
IAllocatorUniquePtr<void> reduction_buffer = GetScratchBuffer<void>(
reduction_ != ReductionType::NONE ? buffer_size : 0);
auto normalize_factor_data = GetScratchBuffer<T>(1);
if (reduction_ == ReductionType::MEAN) {
// Compute buffer size in byte for reduction APIs.
const auto buffer_size =
compute_reduction_buffer_size<T>(static_cast<int>(N_D));
// Allocate reduction buffer whose size is buffer_size bytes.
IAllocatorUniquePtr<void> reduction_buffer = GetScratchBuffer<void>(
buffer_size);
ORT_RETURN_IF_ERROR(reduce_sum(
Stream(),
weight_data_nd_data,
@ -157,14 +157,13 @@ Status SoftmaxCrossEntropyLoss<T, Tin>::ComputeInternal(OpKernelContext* ctx) co
if (reduction_ != ReductionType::NONE) {
// ReduceSum on loss_per_sample
std::vector<int64_t> output_dims(1, 1);
ReduceKernelShared<T, T, CUDNN_REDUCE_TENSOR_NO_INDICES>(
ORT_RETURN_IF_ERROR(reduce_sum(
Stream(),
tmp_loss_sample_buffer,
label_reshape,
total_loss_data,
TensorShape({}),
CUDNN_REDUCE_TENSOR_ADD,
output_dims);
static_cast<int>(N_D),
reduction_buffer.get(),
buffer_size));
}
return Status::OK();

View file

@ -167,6 +167,13 @@ Status SparseSoftmaxCrossEntropy<T, Tin>::ComputeInternal(OpKernelContext* ctx)
weight_data = weight.template Data<T>();
}
// Compute buffer size in byte for reduction APIs.
const auto buffer_size =
compute_reduction_buffer_size<T>(static_cast<int>(N));
// Allocate reduction buffer whose size is buffer_size bytes.
IAllocatorUniquePtr<void> reduction_buffer = GetScratchBuffer<void>(
buffer_size);
auto normalize_factor_data = GetScratchBuffer<T>(1);
if (reduction_ == ReductionType::SUM) {
const T normalize_factor = static_cast<T>(1);
@ -176,12 +183,6 @@ Status SparseSoftmaxCrossEntropy<T, Tin>::ComputeInternal(OpKernelContext* ctx)
const T normalize_factor = static_cast<T>(N);
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice, Stream()));
} else {
// Compute buffer size in byte for reduction APIs.
const auto buffer_size =
compute_reduction_buffer_size<T>(static_cast<int>(N));
// Allocate reduction buffer whose size is buffer_size bytes.
IAllocatorUniquePtr<void> reduction_buffer = GetScratchBuffer<void>(
buffer_size);
ORT_RETURN_IF_ERROR(reduce_sum(
Stream(),
weight_data,
@ -202,14 +203,13 @@ Status SparseSoftmaxCrossEntropy<T, Tin>::ComputeInternal(OpKernelContext* ctx)
D);
// ReduceSum on loss_per_sample
std::vector<int64_t> output_dims(1, 1);
return ReduceKernelShared<T, T, CUDNN_REDUCE_TENSOR_NO_INDICES>(
return reduce_sum(
Stream(),
tmp_loss_sample.get(),
label_reshape,
total_loss_data,
TensorShape({}),
CUDNN_REDUCE_TENSOR_ADD,
output_dims);
static_cast<int>(N),
reduction_buffer.get(),
buffer_size);
}
template <typename T, typename Tin>

View file

@ -1,4 +1,3 @@
CudaKernelTest.SparseSoftmaxCrossEntropy_LargeSizeTensor
CudaKernelTest.NegativeLogLikelihoodLoss_TinySizeTensor
CudaKernelTest.NegativeLogLikelihoodLoss_SmallSizeTensor
CudaKernelTest.NegativeLogLikelihoodLoss_MediumSizeTensor