Workaround for miopenReduceTensor() behavior difference in ROCm 4.2

2026-07-13 18:08:13 +00:00 · 2021-05-23 12:23:08 -07:00 · 2021-05-23 12:23:08 -07:00 · 3d12e957a7
commit 3d12e957a7
parent f49a4b6329
4 changed files with 110 additions and 4 deletions
--- a/onnxruntime/core/providers/rocm/miopen_common.cc
+++ b/onnxruntime/core/providers/rocm/miopen_common.cc
@ -88,5 +88,29 @@ const float Consts<half>::Zero = 0;

 const float Consts<half>::One = 1;

+// As of ROCm 4.2, miopenReduceTensor() requires alpha/beta to be the same data
+// type as the input type. This differs from cudnnReduceTensor() and other
+// MIOpen/cuDNN APIs where alpha/beta are float when input type is half (float16).
+//
+// NOTE: this workaround can be removed in ROCm 4.3:
+//       https://github.com/ROCmSoftwarePlatform/MIOpen/pull/914
+template <>
+const half ReduceConsts<half>::One = 1.f;
+
+template <>
+const float ReduceConsts<float>::One = 1;
+
+template <>
+const double ReduceConsts<double>::One = 1;
+
+template <>
+const half ReduceConsts<half>::Zero = 0.f;
+
+template <>
+const float ReduceConsts<float>::Zero = 0;
+
+template <>
+const double ReduceConsts<double>::Zero = 0;
+
 }  // namespace rocm
 }  // namespace onnxruntime
--- a/onnxruntime/core/providers/rocm/miopen_common.h
+++ b/onnxruntime/core/providers/rocm/miopen_common.h
@ -44,5 +44,17 @@ struct Consts<half> {
  static const float One;
 };

+// As of ROCm 4.2, miopenReduceTensor() requires alpha/beta to be the same data
+// type as the input type. This differs from cudnnReduceTensor() and other
+// MIOpen/cuDNN APIs where alpha/beta are float when input type is half (float16).
+//
+// NOTE: this workaround can be removed in ROCm 4.3:
+//       https://github.com/ROCmSoftwarePlatform/MIOpen/pull/914
+template <typename ElemType>
+struct ReduceConsts {
+  static const ElemType Zero;
+  static const ElemType One;
+};
+
 }  // namespace rocm
 }  // namespace onnxruntime
--- a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc
+++ b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc
@ -185,8 +185,15 @@ Status ReduceKernel<allow_multi_axes>::ReduceKernelShared(
    ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, MiopenTensor::GetDataType<float>(), ReduceTensorIndices));
  else
    ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, ReduceTensorIndices));
-  const auto one = Consts<HipT>::One;
-  const auto zero = Consts<HipT>::Zero;
+
+  // As of ROCm 4.2, miopenReduceTensor() requires alpha/beta to be the same data
+  // type as the input type. This differs from cudnnReduceTensor() and other
+  // MIOpen/cuDNN APIs where alpha/beta are float when input type is half (float16).
+  //
+  // NOTE: this workaround can be removed in ROCm 4.3:
+  //       https://github.com/ROCmSoftwarePlatform/MIOpen/pull/914
+  const auto one = ReduceConsts<HipT>::One;
+  const auto zero = ReduceConsts<HipT>::Zero;
  MiopenTensor input_tensor;
  MiopenTensor output_tensor;
  ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_miopen, miopen_type_X));
@ -483,8 +490,14 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr
    ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, ReduceTensorIndices));
  }

-  const auto one = Consts<HipT>::One;
-  const auto zero = Consts<HipT>::Zero;
+  // As of ROCm 4.2, miopenReduceTensor() requires alpha/beta to be the same data
+  // type as the input type. This differs from cudnnReduceTensor() and other
+  // MIOpen/cuDNN APIs where alpha/beta are float when input type is half (float16).
+  //
+  // NOTE: this workaround can be removed in ROCm 4.3:
+  //       https://github.com/ROCmSoftwarePlatform/MIOpen/pull/914
+  const auto one = ReduceConsts<HipT>::One;
+  const auto zero = ReduceConsts<HipT>::Zero;
  MiopenTensor input_tensor;
  MiopenTensor output_tensor;
  ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_miopen, miopen_type_X));
--- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
@ -557,6 +557,25 @@ TEST(ReductionOpTest, ReduceLogSumExp_double) {
  test.Run();
 }

+#if defined(USE_CUDA) || defined(USE_ROCM)
+TEST(ReductionOpTest, ReduceLogSumExp_half) {
+  OpTester test("ReduceLogSumExp");
+  test.AddAttribute("axes", std::vector<int64_t>{0, 2});
+  test.AddAttribute("keepdims", (int64_t)1);
+  test.AddInput<MLFloat16>("data", {3, 2, 2},
+                           FloatsToMLFloat16s({1.0f, 2.0f,
+                                               3.0f, 4.0f,
+
+                                               5.0f, 6.0f,
+                                               7.0f, 8.0f,
+
+                                               9.0f, 10.0f,
+                                               11.0f, 12.0f}));
+  test.AddOutput<MLFloat16>("reduced", {1, 2, 1}, FloatsToMLFloat16s({10.33174133f, 12.33174133f}));
+  test.Run();
+}
+#endif  // defined(USE_CUDA) || defined(USE_ROCM)
+
 TEST(ReductionOpTest, ReduceLogSumExp_int32) {
  OpTester test("ReduceLogSumExp");
  test.AddAttribute("axes", std::vector<int64_t>{0, 2});
@ -700,6 +719,25 @@ TEST(ReductionOpTest, ReduceMax_double) {
  test.Run();
 }

+#if defined(USE_CUDA) || defined(USE_ROCM)
+TEST(ReductionOpTest, ReduceMax_half) {
+  OpTester test("ReduceMax");
+  test.AddAttribute("axes", std::vector<int64_t>{1, 2});
+  test.AddAttribute("keepdims", (int64_t)1);
+  test.AddInput<MLFloat16>("data", {3, 2, 2},
+                           FloatsToMLFloat16s({1.0f, 2.0f,
+                                               3.0f, 4.0f,
+
+                                               5.0f, 6.0f,
+                                               7.0f, 8.0f,
+
+                                               9.0f, 10.0f,
+                                               11.0f, 12.0f}));
+  test.AddOutput<MLFloat16>("reduced", {3, 1, 1}, FloatsToMLFloat16s({4.0f, 8.0f, 12.0f}));
+  test.Run();
+}
+#endif  // defined(USE_CUDA) || defined(USE_ROCM)
+
 TEST(ReductionOpTest, ReduceMax_int32) {
  OpTester test("ReduceMax");
  test.AddAttribute("axes", std::vector<int64_t>{1, 2});
@ -1167,6 +1205,25 @@ TEST(ReductionOpTest, ReduceMin_double) {
  test.Run();
 }

+#if defined(USE_CUDA) || defined(USE_ROCM)
+TEST(ReductionOpTest, ReduceMin_half) {
+  OpTester test("ReduceMin");
+  test.AddAttribute("axes", std::vector<int64_t>{0, 2});
+  test.AddAttribute("keepdims", (int64_t)1);
+  test.AddInput<MLFloat16>("data", {3, 2, 2},
+                           FloatsToMLFloat16s({1.0f, 2.0f,
+                                               3.0f, 4.0f,
+
+                                               5.0f, 6.0f,
+                                               7.0f, 8.0f,
+
+                                               9.0f, 10.0f,
+                                               11.0f, 12.0f}));
+  test.AddOutput<MLFloat16>("reduced", {1, 2, 1}, FloatsToMLFloat16s({1.0f, 3.0f}));
+  test.Run();
+}
+#endif  // defined(USE_CUDA) || defined(USE_ROCM)
+
 TEST(ReductionOpTest, ReduceMin_int32) {
  OpTester test("ReduceMin");
  test.AddAttribute("axes", std::vector<int64_t>{0, 2});