From 3d12e957a7b99bbe41c2344f0dc6d859530e1ba1 Mon Sep 17 00:00:00 2001
From: Jesse Benson <jesseb@microsoft.com>
Date: Sun, 23 May 2021 12:23:08 -0700
Subject: [PATCH] Workaround for miopenReduceTensor() behavior difference in
 ROCm 4.2

---
 .../core/providers/rocm/miopen_common.cc      | 24 ++++++++
 .../core/providers/rocm/miopen_common.h       | 12 ++++
 .../providers/rocm/reduction/reduction_ops.cc | 21 +++++--
 .../cpu/reduction/reduction_ops_test.cc       | 57 +++++++++++++++++++
 4 files changed, 110 insertions(+), 4 deletions(-)
diff --git a/onnxruntime/core/providers/rocm/miopen_common.cc b/onnxruntime/core/providers/rocm/miopen_common.cc
index 6c18b202a7..c473894cfb 100644
--- a/onnxruntime/core/providers/rocm/miopen_common.cc
+++ b/onnxruntime/core/providers/rocm/miopen_common.cc
@@ -88,5 +88,29 @@ const float Consts<half>::Zero = 0;
 
 const float Consts<half>::One = 1;
 
+// As of ROCm 4.2, miopenReduceTensor() requires alpha/beta to be the same data
+// type as the input type. This differs from cudnnReduceTensor() and other
+// MIOpen/cuDNN APIs where alpha/beta are float when input type is half (float16).
+//
+// NOTE: this workaround can be removed in ROCm 4.3:
+//       https://github.com/ROCmSoftwarePlatform/MIOpen/pull/914
+template <>
+const half ReduceConsts<half>::One = 1.f;
+
+template <>
+const float ReduceConsts<float>::One = 1;
+
+template <>
+const double ReduceConsts<double>::One = 1;
+
+template <>
+const half ReduceConsts<half>::Zero = 0.f;
+
+template <>
+const float ReduceConsts<float>::Zero = 0;
+
+template <>
+const double ReduceConsts<double>::Zero = 0;
+
 }  // namespace rocm
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/rocm/miopen_common.h b/onnxruntime/core/providers/rocm/miopen_common.h
index 73d865dcfd..32491b0cb8 100644
--- a/onnxruntime/core/providers/rocm/miopen_common.h
+++ b/onnxruntime/core/providers/rocm/miopen_common.h
@@ -44,5 +44,17 @@ struct Consts<half> {
   static const float One;
 };
 
+// As of ROCm 4.2, miopenReduceTensor() requires alpha/beta to be the same data
+// type as the input type. This differs from cudnnReduceTensor() and other
+// MIOpen/cuDNN APIs where alpha/beta are float when input type is half (float16).
+//
+// NOTE: this workaround can be removed in ROCm 4.3:
+//       https://github.com/ROCmSoftwarePlatform/MIOpen/pull/914
+template <typename ElemType>
+struct ReduceConsts {
+  static const ElemType Zero;
+  static const ElemType One;
+};
+
 }  // namespace rocm
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc
index 886a8ab2c6..907e9404d8 100644
--- a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc
+++ b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc
@@ -185,8 +185,15 @@ Status ReduceKernel<allow_multi_axes>::ReduceKernelShared(
     ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, MiopenTensor::GetDataType<float>(), ReduceTensorIndices));
   else
     ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, ReduceTensorIndices));
-  const auto one = Consts<HipT>::One;
-  const auto zero = Consts<HipT>::Zero;
+
+  // As of ROCm 4.2, miopenReduceTensor() requires alpha/beta to be the same data
+  // type as the input type. This differs from cudnnReduceTensor() and other
+  // MIOpen/cuDNN APIs where alpha/beta are float when input type is half (float16).
+  //
+  // NOTE: this workaround can be removed in ROCm 4.3:
+  //       https://github.com/ROCmSoftwarePlatform/MIOpen/pull/914
+  const auto one = ReduceConsts<HipT>::One;
+  const auto zero = ReduceConsts<HipT>::Zero;
   MiopenTensor input_tensor;
   MiopenTensor output_tensor;
   ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_miopen, miopen_type_X));
@@ -483,8 +490,14 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr
     ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, ReduceTensorIndices));
   }
 
-  const auto one = Consts<HipT>::One;
-  const auto zero = Consts<HipT>::Zero;
+  // As of ROCm 4.2, miopenReduceTensor() requires alpha/beta to be the same data
+  // type as the input type. This differs from cudnnReduceTensor() and other
+  // MIOpen/cuDNN APIs where alpha/beta are float when input type is half (float16).
+  //
+  // NOTE: this workaround can be removed in ROCm 4.3:
+  //       https://github.com/ROCmSoftwarePlatform/MIOpen/pull/914
+  const auto one = ReduceConsts<HipT>::One;
+  const auto zero = ReduceConsts<HipT>::Zero;
   MiopenTensor input_tensor;
   MiopenTensor output_tensor;
   ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_miopen, miopen_type_X));
diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
index 799412b42a..01b0671261 100644
--- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
+++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc
@@ -557,6 +557,25 @@ TEST(ReductionOpTest, ReduceLogSumExp_double) {
   test.Run();
 }
 
+#if defined(USE_CUDA) || defined(USE_ROCM)
+TEST(ReductionOpTest, ReduceLogSumExp_half) {
+  OpTester test("ReduceLogSumExp");
+  test.AddAttribute("axes", std::vector<int64_t>{0, 2});
+  test.AddAttribute("keepdims", (int64_t)1);
+  test.AddInput<MLFloat16>("data", {3, 2, 2},
+                           FloatsToMLFloat16s({1.0f, 2.0f,
+                                               3.0f, 4.0f,
+
+                                               5.0f, 6.0f,
+                                               7.0f, 8.0f,
+
+                                               9.0f, 10.0f,
+                                               11.0f, 12.0f}));
+  test.AddOutput<MLFloat16>("reduced", {1, 2, 1}, FloatsToMLFloat16s({10.33174133f, 12.33174133f}));
+  test.Run();
+}
+#endif  // defined(USE_CUDA) || defined(USE_ROCM)
+
 TEST(ReductionOpTest, ReduceLogSumExp_int32) {
   OpTester test("ReduceLogSumExp");
   test.AddAttribute("axes", std::vector<int64_t>{0, 2});
@@ -700,6 +719,25 @@ TEST(ReductionOpTest, ReduceMax_double) {
   test.Run();
 }
 
+#if defined(USE_CUDA) || defined(USE_ROCM)
+TEST(ReductionOpTest, ReduceMax_half) {
+  OpTester test("ReduceMax");
+  test.AddAttribute("axes", std::vector<int64_t>{1, 2});
+  test.AddAttribute("keepdims", (int64_t)1);
+  test.AddInput<MLFloat16>("data", {3, 2, 2},
+                           FloatsToMLFloat16s({1.0f, 2.0f,
+                                               3.0f, 4.0f,
+
+                                               5.0f, 6.0f,
+                                               7.0f, 8.0f,
+
+                                               9.0f, 10.0f,
+                                               11.0f, 12.0f}));
+  test.AddOutput<MLFloat16>("reduced", {3, 1, 1}, FloatsToMLFloat16s({4.0f, 8.0f, 12.0f}));
+  test.Run();
+}
+#endif  // defined(USE_CUDA) || defined(USE_ROCM)
+
 TEST(ReductionOpTest, ReduceMax_int32) {
   OpTester test("ReduceMax");
   test.AddAttribute("axes", std::vector<int64_t>{1, 2});
@@ -1167,6 +1205,25 @@ TEST(ReductionOpTest, ReduceMin_double) {
   test.Run();
 }
 
+#if defined(USE_CUDA) || defined(USE_ROCM)
+TEST(ReductionOpTest, ReduceMin_half) {
+  OpTester test("ReduceMin");
+  test.AddAttribute("axes", std::vector<int64_t>{0, 2});
+  test.AddAttribute("keepdims", (int64_t)1);
+  test.AddInput<MLFloat16>("data", {3, 2, 2},
+                           FloatsToMLFloat16s({1.0f, 2.0f,
+                                               3.0f, 4.0f,
+
+                                               5.0f, 6.0f,
+                                               7.0f, 8.0f,
+
+                                               9.0f, 10.0f,
+                                               11.0f, 12.0f}));
+  test.AddOutput<MLFloat16>("reduced", {1, 2, 1}, FloatsToMLFloat16s({1.0f, 3.0f}));
+  test.Run();
+}
+#endif  // defined(USE_CUDA) || defined(USE_ROCM)
+
 TEST(ReductionOpTest, ReduceMin_int32) {
   OpTester test("ReduceMin");
   test.AddAttribute("axes", std::vector<int64_t>{0, 2});