From 3d12e957a7b99bbe41c2344f0dc6d859530e1ba1 Mon Sep 17 00:00:00 2001 From: Jesse Benson Date: Sun, 23 May 2021 12:23:08 -0700 Subject: [PATCH] Workaround for miopenReduceTensor() behavior difference in ROCm 4.2 --- .../core/providers/rocm/miopen_common.cc | 24 ++++++++ .../core/providers/rocm/miopen_common.h | 12 ++++ .../providers/rocm/reduction/reduction_ops.cc | 21 +++++-- .../cpu/reduction/reduction_ops_test.cc | 57 +++++++++++++++++++ 4 files changed, 110 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/rocm/miopen_common.cc b/onnxruntime/core/providers/rocm/miopen_common.cc index 6c18b202a7..c473894cfb 100644 --- a/onnxruntime/core/providers/rocm/miopen_common.cc +++ b/onnxruntime/core/providers/rocm/miopen_common.cc @@ -88,5 +88,29 @@ const float Consts::Zero = 0; const float Consts::One = 1; +// As of ROCm 4.2, miopenReduceTensor() requires alpha/beta to be the same data +// type as the input type. This differs from cudnnReduceTensor() and other +// MIOpen/cuDNN APIs where alpha/beta are float when input type is half (float16). +// +// NOTE: this workaround can be removed in ROCm 4.3: +// https://github.com/ROCmSoftwarePlatform/MIOpen/pull/914 +template <> +const half ReduceConsts::One = 1.f; + +template <> +const float ReduceConsts::One = 1; + +template <> +const double ReduceConsts::One = 1; + +template <> +const half ReduceConsts::Zero = 0.f; + +template <> +const float ReduceConsts::Zero = 0; + +template <> +const double ReduceConsts::Zero = 0; + } // namespace rocm } // namespace onnxruntime diff --git a/onnxruntime/core/providers/rocm/miopen_common.h b/onnxruntime/core/providers/rocm/miopen_common.h index 73d865dcfd..32491b0cb8 100644 --- a/onnxruntime/core/providers/rocm/miopen_common.h +++ b/onnxruntime/core/providers/rocm/miopen_common.h @@ -44,5 +44,17 @@ struct Consts { static const float One; }; +// As of ROCm 4.2, miopenReduceTensor() requires alpha/beta to be the same data +// type as the input type. This differs from cudnnReduceTensor() and other +// MIOpen/cuDNN APIs where alpha/beta are float when input type is half (float16). +// +// NOTE: this workaround can be removed in ROCm 4.3: +// https://github.com/ROCmSoftwarePlatform/MIOpen/pull/914 +template +struct ReduceConsts { + static const ElemType Zero; + static const ElemType One; +}; + } // namespace rocm } // namespace onnxruntime diff --git a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc index 886a8ab2c6..907e9404d8 100644 --- a/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc +++ b/onnxruntime/core/providers/rocm/reduction/reduction_ops.cc @@ -185,8 +185,15 @@ Status ReduceKernel::ReduceKernelShared( ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, MiopenTensor::GetDataType(), ReduceTensorIndices)); else ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, ReduceTensorIndices)); - const auto one = Consts::One; - const auto zero = Consts::Zero; + + // As of ROCm 4.2, miopenReduceTensor() requires alpha/beta to be the same data + // type as the input type. This differs from cudnnReduceTensor() and other + // MIOpen/cuDNN APIs where alpha/beta are float when input type is half (float16). + // + // NOTE: this workaround can be removed in ROCm 4.3: + // https://github.com/ROCmSoftwarePlatform/MIOpen/pull/914 + const auto one = ReduceConsts::One; + const auto zero = ReduceConsts::Zero; MiopenTensor input_tensor; MiopenTensor output_tensor; ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_miopen, miopen_type_X)); @@ -483,8 +490,14 @@ Status ReduceComputeCore(ROCMExecutionProvider& rocm_ep, const Tensor& input, Pr ORT_RETURN_IF_ERROR(reduce_desc.Set(miopen_reduce_op, miopen_type_X, ReduceTensorIndices)); } - const auto one = Consts::One; - const auto zero = Consts::Zero; + // As of ROCm 4.2, miopenReduceTensor() requires alpha/beta to be the same data + // type as the input type. This differs from cudnnReduceTensor() and other + // MIOpen/cuDNN APIs where alpha/beta are float when input type is half (float16). + // + // NOTE: this workaround can be removed in ROCm 4.3: + // https://github.com/ROCmSoftwarePlatform/MIOpen/pull/914 + const auto one = ReduceConsts::One; + const auto zero = ReduceConsts::Zero; MiopenTensor input_tensor; MiopenTensor output_tensor; ORT_RETURN_IF_ERROR(input_tensor.Set(input_dims_miopen, miopen_type_X)); diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc index 799412b42a..01b0671261 100644 --- a/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc +++ b/onnxruntime/test/providers/cpu/reduction/reduction_ops_test.cc @@ -557,6 +557,25 @@ TEST(ReductionOpTest, ReduceLogSumExp_double) { test.Run(); } +#if defined(USE_CUDA) || defined(USE_ROCM) +TEST(ReductionOpTest, ReduceLogSumExp_half) { + OpTester test("ReduceLogSumExp"); + test.AddAttribute("axes", std::vector{0, 2}); + test.AddAttribute("keepdims", (int64_t)1); + test.AddInput("data", {3, 2, 2}, + FloatsToMLFloat16s({1.0f, 2.0f, + 3.0f, 4.0f, + + 5.0f, 6.0f, + 7.0f, 8.0f, + + 9.0f, 10.0f, + 11.0f, 12.0f})); + test.AddOutput("reduced", {1, 2, 1}, FloatsToMLFloat16s({10.33174133f, 12.33174133f})); + test.Run(); +} +#endif // defined(USE_CUDA) || defined(USE_ROCM) + TEST(ReductionOpTest, ReduceLogSumExp_int32) { OpTester test("ReduceLogSumExp"); test.AddAttribute("axes", std::vector{0, 2}); @@ -700,6 +719,25 @@ TEST(ReductionOpTest, ReduceMax_double) { test.Run(); } +#if defined(USE_CUDA) || defined(USE_ROCM) +TEST(ReductionOpTest, ReduceMax_half) { + OpTester test("ReduceMax"); + test.AddAttribute("axes", std::vector{1, 2}); + test.AddAttribute("keepdims", (int64_t)1); + test.AddInput("data", {3, 2, 2}, + FloatsToMLFloat16s({1.0f, 2.0f, + 3.0f, 4.0f, + + 5.0f, 6.0f, + 7.0f, 8.0f, + + 9.0f, 10.0f, + 11.0f, 12.0f})); + test.AddOutput("reduced", {3, 1, 1}, FloatsToMLFloat16s({4.0f, 8.0f, 12.0f})); + test.Run(); +} +#endif // defined(USE_CUDA) || defined(USE_ROCM) + TEST(ReductionOpTest, ReduceMax_int32) { OpTester test("ReduceMax"); test.AddAttribute("axes", std::vector{1, 2}); @@ -1167,6 +1205,25 @@ TEST(ReductionOpTest, ReduceMin_double) { test.Run(); } +#if defined(USE_CUDA) || defined(USE_ROCM) +TEST(ReductionOpTest, ReduceMin_half) { + OpTester test("ReduceMin"); + test.AddAttribute("axes", std::vector{0, 2}); + test.AddAttribute("keepdims", (int64_t)1); + test.AddInput("data", {3, 2, 2}, + FloatsToMLFloat16s({1.0f, 2.0f, + 3.0f, 4.0f, + + 5.0f, 6.0f, + 7.0f, 8.0f, + + 9.0f, 10.0f, + 11.0f, 12.0f})); + test.AddOutput("reduced", {1, 2, 1}, FloatsToMLFloat16s({1.0f, 3.0f})); + test.Run(); +} +#endif // defined(USE_CUDA) || defined(USE_ROCM) + TEST(ReductionOpTest, ReduceMin_int32) { OpTester test("ReduceMin"); test.AddAttribute("axes", std::vector{0, 2});