mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-23 22:13:38 +00:00
Use built-in reduce_sum() for simple reduction cases, specifically reduce all to a scalar.
This commit is contained in:
parent
3eb2d349a6
commit
be79575c6a
3 changed files with 24 additions and 26 deletions
|
|
@ -49,7 +49,6 @@ Status SoftmaxCrossEntropyLoss<T, Tin>::ComputeInternal(OpKernelContext* ctx) co
|
|||
int64_t C;
|
||||
onnxruntime::contrib::GetNDCFromLogitAndLabelShape(logit_shape, label_shape, N_D, C);
|
||||
const TensorShape logit_reshape({N_D, C});
|
||||
const TensorShape label_reshape({N_D});
|
||||
Tensor* total_loss = ctx->Output(0, reduction_ == ReductionType::NONE ? TensorShape(label.Shape()) : TensorShape({}));
|
||||
T* total_loss_data = total_loss->template MutableData<T>();
|
||||
T* tmp_loss_sample_buffer = nullptr;
|
||||
|
|
@ -111,14 +110,15 @@ Status SoftmaxCrossEntropyLoss<T, Tin>::ComputeInternal(OpKernelContext* ctx) co
|
|||
CUDA_RETURN_IF_ERROR(cudaMemsetAsync(weight_data_nd_data, 0, N_D * sizeof(T), Stream()));
|
||||
ComputeWeightsSoftmaxCrossEntropyImpl(Stream(), label_data, weight_data, N_D, C, ignore_index_, weight_data_nd_data);
|
||||
|
||||
// Compute buffer size in byte for reduction APIs.
|
||||
const auto buffer_size =
|
||||
compute_reduction_buffer_size<T>(static_cast<int>(N_D));
|
||||
// Allocate reduction buffer whose size is buffer_size bytes, or nullptr if no reduction.
|
||||
IAllocatorUniquePtr<void> reduction_buffer = GetScratchBuffer<void>(
|
||||
reduction_ != ReductionType::NONE ? buffer_size : 0);
|
||||
|
||||
auto normalize_factor_data = GetScratchBuffer<T>(1);
|
||||
if (reduction_ == ReductionType::MEAN) {
|
||||
// Compute buffer size in byte for reduction APIs.
|
||||
const auto buffer_size =
|
||||
compute_reduction_buffer_size<T>(static_cast<int>(N_D));
|
||||
// Allocate reduction buffer whose size is buffer_size bytes.
|
||||
IAllocatorUniquePtr<void> reduction_buffer = GetScratchBuffer<void>(
|
||||
buffer_size);
|
||||
ORT_RETURN_IF_ERROR(reduce_sum(
|
||||
Stream(),
|
||||
weight_data_nd_data,
|
||||
|
|
@ -157,14 +157,13 @@ Status SoftmaxCrossEntropyLoss<T, Tin>::ComputeInternal(OpKernelContext* ctx) co
|
|||
|
||||
if (reduction_ != ReductionType::NONE) {
|
||||
// ReduceSum on loss_per_sample
|
||||
std::vector<int64_t> output_dims(1, 1);
|
||||
ReduceKernelShared<T, T, CUDNN_REDUCE_TENSOR_NO_INDICES>(
|
||||
ORT_RETURN_IF_ERROR(reduce_sum(
|
||||
Stream(),
|
||||
tmp_loss_sample_buffer,
|
||||
label_reshape,
|
||||
total_loss_data,
|
||||
TensorShape({}),
|
||||
CUDNN_REDUCE_TENSOR_ADD,
|
||||
output_dims);
|
||||
static_cast<int>(N_D),
|
||||
reduction_buffer.get(),
|
||||
buffer_size));
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
|
|
|
|||
|
|
@ -167,6 +167,13 @@ Status SparseSoftmaxCrossEntropy<T, Tin>::ComputeInternal(OpKernelContext* ctx)
|
|||
weight_data = weight.template Data<T>();
|
||||
}
|
||||
|
||||
// Compute buffer size in byte for reduction APIs.
|
||||
const auto buffer_size =
|
||||
compute_reduction_buffer_size<T>(static_cast<int>(N));
|
||||
// Allocate reduction buffer whose size is buffer_size bytes.
|
||||
IAllocatorUniquePtr<void> reduction_buffer = GetScratchBuffer<void>(
|
||||
buffer_size);
|
||||
|
||||
auto normalize_factor_data = GetScratchBuffer<T>(1);
|
||||
if (reduction_ == ReductionType::SUM) {
|
||||
const T normalize_factor = static_cast<T>(1);
|
||||
|
|
@ -176,12 +183,6 @@ Status SparseSoftmaxCrossEntropy<T, Tin>::ComputeInternal(OpKernelContext* ctx)
|
|||
const T normalize_factor = static_cast<T>(N);
|
||||
CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice, Stream()));
|
||||
} else {
|
||||
// Compute buffer size in byte for reduction APIs.
|
||||
const auto buffer_size =
|
||||
compute_reduction_buffer_size<T>(static_cast<int>(N));
|
||||
// Allocate reduction buffer whose size is buffer_size bytes.
|
||||
IAllocatorUniquePtr<void> reduction_buffer = GetScratchBuffer<void>(
|
||||
buffer_size);
|
||||
ORT_RETURN_IF_ERROR(reduce_sum(
|
||||
Stream(),
|
||||
weight_data,
|
||||
|
|
@ -202,14 +203,13 @@ Status SparseSoftmaxCrossEntropy<T, Tin>::ComputeInternal(OpKernelContext* ctx)
|
|||
D);
|
||||
|
||||
// ReduceSum on loss_per_sample
|
||||
std::vector<int64_t> output_dims(1, 1);
|
||||
return ReduceKernelShared<T, T, CUDNN_REDUCE_TENSOR_NO_INDICES>(
|
||||
return reduce_sum(
|
||||
Stream(),
|
||||
tmp_loss_sample.get(),
|
||||
label_reshape,
|
||||
total_loss_data,
|
||||
TensorShape({}),
|
||||
CUDNN_REDUCE_TENSOR_ADD,
|
||||
output_dims);
|
||||
static_cast<int>(N),
|
||||
reduction_buffer.get(),
|
||||
buffer_size);
|
||||
}
|
||||
|
||||
template <typename T, typename Tin>
|
||||
|
|
|
|||
|
|
@ -1,4 +1,3 @@
|
|||
CudaKernelTest.SparseSoftmaxCrossEntropy_LargeSizeTensor
|
||||
CudaKernelTest.NegativeLogLikelihoodLoss_TinySizeTensor
|
||||
CudaKernelTest.NegativeLogLikelihoodLoss_SmallSizeTensor
|
||||
CudaKernelTest.NegativeLogLikelihoodLoss_MediumSizeTensor
|
||||
|
|
|
|||
Loading…
Reference in a new issue