Use built-in reduce_sum() for simple reduction cases, specifically reducing all elements to a scalar.

2026-07-08 17:17:15 +00:00 · 2021-04-13 15:33:16 -07:00 · 2021-04-13 15:33:16 -07:00 · be79575c6a
commit be79575c6a
parent 3eb2d349a6
3 changed files with 24 additions and 26 deletions
--- a/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.cc
+++ b/orttraining/orttraining/training_ops/cuda/loss/softmax_cross_entropy_loss_impl.cc
@ -49,7 +49,6 @@ Status SoftmaxCrossEntropyLoss<T, Tin>::ComputeInternal(OpKernelContext* ctx) co
  int64_t C;
  onnxruntime::contrib::GetNDCFromLogitAndLabelShape(logit_shape, label_shape, N_D, C);
  const TensorShape logit_reshape({N_D, C});
-  const TensorShape label_reshape({N_D});
  Tensor* total_loss = ctx->Output(0, reduction_ == ReductionType::NONE ? TensorShape(label.Shape()) : TensorShape({}));
  T* total_loss_data = total_loss->template MutableData<T>();
  T* tmp_loss_sample_buffer = nullptr;
@ -111,14 +110,15 @@ Status SoftmaxCrossEntropyLoss<T, Tin>::ComputeInternal(OpKernelContext* ctx) co
  CUDA_RETURN_IF_ERROR(cudaMemsetAsync(weight_data_nd_data, 0, N_D * sizeof(T), Stream()));
  ComputeWeightsSoftmaxCrossEntropyImpl(Stream(), label_data, weight_data, N_D, C, ignore_index_, weight_data_nd_data);

+  // Compute buffer size in bytes for reduction APIs.
+  const auto buffer_size =
+      compute_reduction_buffer_size<T>(static_cast<int>(N_D));
+  // Allocate reduction buffer whose size is buffer_size bytes, or nullptr if no reduction.
+  IAllocatorUniquePtr<void> reduction_buffer = GetScratchBuffer<void>(
+      reduction_ != ReductionType::NONE ? buffer_size : 0);
+
  auto normalize_factor_data = GetScratchBuffer<T>(1);
  if (reduction_ == ReductionType::MEAN) {
-    // Compute buffer size in byte for reduction APIs.
-    const auto buffer_size =
-        compute_reduction_buffer_size<T>(static_cast<int>(N_D));
-    // Allocate reduction buffer whose size is buffer_size bytes.
-    IAllocatorUniquePtr<void> reduction_buffer = GetScratchBuffer<void>(
-        buffer_size);
    ORT_RETURN_IF_ERROR(reduce_sum(
        Stream(),
        weight_data_nd_data,
@ -157,14 +157,13 @@ Status SoftmaxCrossEntropyLoss<T, Tin>::ComputeInternal(OpKernelContext* ctx) co

  if (reduction_ != ReductionType::NONE) {
    // ReduceSum on loss_per_sample
-    std::vector<int64_t> output_dims(1, 1);
-    ReduceKernelShared<T, T, CUDNN_REDUCE_TENSOR_NO_INDICES>(
+    ORT_RETURN_IF_ERROR(reduce_sum(
+        Stream(),
        tmp_loss_sample_buffer,
-        label_reshape,
        total_loss_data,
-        TensorShape({}),
-        CUDNN_REDUCE_TENSOR_ADD,
-        output_dims);
+        static_cast<int>(N_D),
+        reduction_buffer.get(),
+        buffer_size));
  }

  return Status::OK();
--- a/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.cc
+++ b/orttraining/orttraining/training_ops/cuda/loss/softmaxcrossentropy_impl.cc
@ -167,6 +167,13 @@ Status SparseSoftmaxCrossEntropy<T, Tin>::ComputeInternal(OpKernelContext* ctx)
    weight_data = weight.template Data<T>();
  }

+  // Compute buffer size in bytes for reduction APIs.
+  const auto buffer_size =
+      compute_reduction_buffer_size<T>(static_cast<int>(N));
+  // Allocate reduction buffer whose size is buffer_size bytes.
+  IAllocatorUniquePtr<void> reduction_buffer = GetScratchBuffer<void>(
+      buffer_size);
+
  auto normalize_factor_data = GetScratchBuffer<T>(1);
  if (reduction_ == ReductionType::SUM) {
    const T normalize_factor = static_cast<T>(1);
@ -176,12 +183,6 @@ Status SparseSoftmaxCrossEntropy<T, Tin>::ComputeInternal(OpKernelContext* ctx)
      const T normalize_factor = static_cast<T>(N);
      CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(normalize_factor_data.get(), &normalize_factor, sizeof(T), cudaMemcpyHostToDevice, Stream()));
    } else {
-      // Compute buffer size in byte for reduction APIs.
-      const auto buffer_size =
-          compute_reduction_buffer_size<T>(static_cast<int>(N));
-      // Allocate reduction buffer whose size is buffer_size bytes.
-      IAllocatorUniquePtr<void> reduction_buffer = GetScratchBuffer<void>(
-          buffer_size);
      ORT_RETURN_IF_ERROR(reduce_sum(
          Stream(),
          weight_data,
@ -202,14 +203,13 @@ Status SparseSoftmaxCrossEntropy<T, Tin>::ComputeInternal(OpKernelContext* ctx)
                                D);

  // ReduceSum on loss_per_sample
-  std::vector<int64_t> output_dims(1, 1);
-  return ReduceKernelShared<T, T, CUDNN_REDUCE_TENSOR_NO_INDICES>(
+  return reduce_sum(
+      Stream(),
      tmp_loss_sample.get(),
-      label_reshape,
      total_loss_data,
-      TensorShape({}),
-      CUDNN_REDUCE_TENSOR_ADD,
-      output_dims);
+      static_cast<int>(N),
+      reduction_buffer.get(),
+      buffer_size);
 }

 template <typename T, typename Tin>
--- a/tools/ci_build/github/pai/pai-excluded-tests.txt
+++ b/tools/ci_build/github/pai/pai-excluded-tests.txt
@ -1,4 +1,3 @@
-CudaKernelTest.SparseSoftmaxCrossEntropy_LargeSizeTensor
 CudaKernelTest.NegativeLogLikelihoodLoss_TinySizeTensor
 CudaKernelTest.NegativeLogLikelihoodLoss_SmallSizeTensor
 CudaKernelTest.NegativeLogLikelihoodLoss_MediumSizeTensor