From d49a8de9b1db3486246e65700380e1eb8e7d581d Mon Sep 17 00:00:00 2001 From: PeixuanZuo <94887879+PeixuanZuo@users.noreply.github.com> Date: Wed, 12 Apr 2023 12:19:14 +0800 Subject: [PATCH] [ROCm] add FP16 support for FusedConv Op (#15443) Add FP16 support for FusedConv Op and update UT --- onnxruntime/contrib_ops/rocm/fused_conv.cc | 19 +++-- .../contrib_ops/rocm/rocm_contrib_kernels.cc | 3 + .../test/contrib_ops/fused_conv_test.cc | 73 +++++++++++++++---- 3 files changed, 72 insertions(+), 23 deletions(-) diff --git a/onnxruntime/contrib_ops/rocm/fused_conv.cc b/onnxruntime/contrib_ops/rocm/fused_conv.cc index c9e92f39c6..13b9574780 100644 --- a/onnxruntime/contrib_ops/rocm/fused_conv.cc +++ b/onnxruntime/contrib_ops/rocm/fused_conv.cc @@ -441,15 +441,18 @@ class FusedConv : public onnxruntime::rocm::Conv { template typename FusedConv::FusionPlanCache FusedConv::plan_cache_; -ONNX_OPERATOR_TYPED_KERNEL_EX( - FusedConv, - kMSDomain, - 1, - float, - kRocmExecutionProvider, - (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), - FusedConv); +#define REGISTER_KERNEL_TYPED(T) \ + ONNX_OPERATOR_TYPED_KERNEL_EX( \ + FusedConv, \ + kMSDomain, \ + 1, \ + T, \ + kRocmExecutionProvider, \ + (*KernelDefBuilder::Create()).TypeConstraint("T", DataTypeImpl::GetTensorType()), \ + FusedConv); +REGISTER_KERNEL_TYPED(float); +REGISTER_KERNEL_TYPED(MLFloat16); } // namespace rocm } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc b/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc index 9dd0ad8849..15f39e7dcd 100644 --- a/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc @@ -107,6 +107,7 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float_int8_t, QAttention); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16_int8_t, QAttention); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, float, FusedConv); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, MLFloat16, FusedConv); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BFloat16, FastGelu); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BFloat16, TransposeMatMul); // backward compatibility class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, BFloat16, FusedMatMul); @@ -251,6 +252,8 @@ Status RegisterRocmContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/test/contrib_ops/fused_conv_test.cc b/onnxruntime/test/contrib_ops/fused_conv_test.cc index 6eedc1b95c..d0b6ad5218 100644 --- a/onnxruntime/test/contrib_ops/fused_conv_test.cc +++ b/onnxruntime/test/contrib_ops/fused_conv_test.cc @@ -3,6 +3,8 @@ #include "gtest/gtest.h" #include "test/providers/provider_test_utils.h" +#include "test/common/tensor_op_test_utils.h" +#include "test/util/include/default_providers.h" namespace onnxruntime { namespace test { @@ -10,6 +12,12 @@ namespace test { #if !defined(DISABLE_CONTRIB_OPS) using namespace std; +#if defined(USE_ROCM) +#define ROCM_GTEST_SKIP(message) GTEST_SKIP_(message) +#else +#define ROCM_GTEST_SKIP(message) +#endif + struct ConvOpAndTestAttributes { string auto_pad; vector dilations; @@ -49,7 +57,6 @@ static std::unordered_set providers_except_cpu_gpu = { kAclExecutionProvider, kArmNNExecutionProvider}; - void TestConvOp(const ConvOpAndTestAttributes& attributes, const vector>& inputs, const vector>& input_shapes, @@ -58,7 +65,10 @@ void TestConvOp(const ConvOpAndTestAttributes& attributes, const std::unordered_set& excluded_provider_types = providers_except_cpu_gpu, bool weight_is_initializer = false, OpTester::ExpectResult expect_result = OpTester::ExpectResult::kExpectSuccess, - const std::string& err_str = "") { + const std::string& err_str = "", + bool use_float16 = false) { + bool enable_rocm = (nullptr != DefaultRocmExecutionProvider().get()); + OpTester test("FusedConv", 1, onnxruntime::kMSDomain); test.AddAttribute("group", attributes.group); test.AddAttribute("kernel_shape", attributes.kernel_shape); @@ -86,17 +96,49 @@ void TestConvOp(const ConvOpAndTestAttributes& attributes, } const char* szNames[] = {"X", "W", "B", "Z"}; - test.AddInput(szNames[0], input_shapes[0], inputs[0]); - test.AddInput(szNames[1], input_shapes[1], inputs[1], weight_is_initializer); - if (inputs.size() >= 3) - test.AddInput(szNames[2], input_shapes[2], inputs[2]); - if (inputs.size() >= 4) - test.AddInput(szNames[3], input_shapes[3], inputs[3]); - test.AddOutput("Y", expected_output_shape, expected_output); - test.Run(expect_result, err_str, excluded_provider_types); + + if (use_float16 && enable_rocm) { + // Only ROCm EP supports float16. + test.AddInput(szNames[0], input_shapes[0], ToFloat16(inputs[0])); + test.AddInput(szNames[1], input_shapes[1], ToFloat16(inputs[1]), weight_is_initializer); + if (inputs.size() >= 3) + test.AddInput(szNames[2], input_shapes[2], ToFloat16(inputs[2])); + if (inputs.size() >= 4) + test.AddInput(szNames[3], input_shapes[3], ToFloat16(inputs[3])); + test.AddOutput("Y", expected_output_shape, ToFloat16(expected_output)); + test.Run(expect_result, err_str, excluded_provider_types); + } else { + test.AddInput(szNames[0], input_shapes[0], inputs[0]); + test.AddInput(szNames[1], input_shapes[1], inputs[1], weight_is_initializer); + if (inputs.size() >= 3) + test.AddInput(szNames[2], input_shapes[2], inputs[2]); + if (inputs.size() >= 4) + test.AddInput(szNames[3], input_shapes[3], inputs[3]); + test.AddOutput("Y", expected_output_shape, expected_output); + test.Run(expect_result, err_str, excluded_provider_types); + } +} + +void RunConvOp(const ConvOpAndTestAttributes& attributes, + const vector>& inputs, + const vector>& input_shapes, + const std::initializer_list& expected_output, + const vector& expected_output_shape, + const std::unordered_set& excluded_provider_types = providers_except_cpu_gpu, + bool weight_is_initializer = false, + OpTester::ExpectResult expect_result = OpTester::ExpectResult::kExpectSuccess, + const std::string& err_str = "") { + bool use_float16 = true; + TestConvOp(attributes, inputs, input_shapes, expected_output, expected_output_shape, excluded_provider_types, + weight_is_initializer, expect_result, err_str, use_float16); + + use_float16 = false; + TestConvOp(attributes, inputs, input_shapes, expected_output, expected_output_shape, excluded_provider_types, + weight_is_initializer, expect_result, err_str, use_float16); } TEST(FusedConvTest, Conv2D_HardSigmoid) { + ROCM_GTEST_SKIP("ROCm does not support Conv2D_HardSigmoid"); ConvOpAndTestAttributes attrs = { "", // auto_pad vector{1, 1}, // dilations @@ -114,7 +156,7 @@ TEST(FusedConvTest, Conv2D_HardSigmoid) { vector W_shape = {2, 1, 2, 2}; vector Y_shape = {1, 2, 2, 2}; auto expected_vals = {0.8f, 0.9f, 1.0f, 1.0f, 0.2f, 0.1f, 0.0f, 0.0f}; - TestConvOp(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape, providers_except_cpu); + RunConvOp(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape, providers_except_cpu); } TEST(FusedConvTest, Conv2D_Relu) { @@ -134,7 +176,7 @@ TEST(FusedConvTest, Conv2D_Relu) { vector W_shape = {2, 1, 2, 2}; vector Y_shape = {1, 2, 2, 2}; auto expected_vals = {12.0f, 16.0f, 24.0f, 28.0f, 0.0f, 0.0f, 0.0f, 0.0f}; - TestConvOp(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape); + RunConvOp(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape); } TEST(FusedConvTest, Conv2D_Bias_Relu) { @@ -156,7 +198,7 @@ TEST(FusedConvTest, Conv2D_Bias_Relu) { vector B = {1.0f, -1.0f}; vector B_shape = {2}; auto expected_vals = {13.0f, 17.0f, 25.0f, 29.0f, 11.0f, 15.0f, 23.0f, 27.0f}; - TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + RunConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); } #if defined(USE_CUDA) || defined(USE_ROCM) @@ -196,12 +238,13 @@ TEST(FusedConvTest, Conv2D_Bias_Z_Relu) { vector Z = {-1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f}; vector Z_shape = {1, 2, 2, 2}; auto expected_vals = {12.0f, 17.0f, 25.0f, 29.0f, 11.0f, 15.0f, 23.0f, 28.0f}; - TestConvOp(attrs, {X, W, B, Z}, {X_shape, W_shape, B_shape, Z_shape}, expected_vals, Y_shape, providers_except_gpu); + RunConvOp(attrs, {X, W, B, Z}, {X_shape, W_shape, B_shape, Z_shape}, expected_vals, Y_shape, providers_except_gpu); } #endif TEST(FusedConvTest, Cpu_Conv2D_Bias_Z_Relu) { + ROCM_GTEST_SKIP("ROCm skip Cpu_Conv2D_Bias_Z_Relu"); ConvOpAndTestAttributes attrs = { "", // auto_pad vector{1, 1}, // dilations @@ -222,7 +265,7 @@ TEST(FusedConvTest, Cpu_Conv2D_Bias_Z_Relu) { vector Z = {-1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f}; vector Z_shape = {1, 2, 2, 2}; auto expected_vals = {12.0f, 17.0f, 25.0f, 29.0f, 11.0f, 15.0f, 23.0f, 28.0f}; - TestConvOp(attrs, {X, W, B, Z}, {X_shape, W_shape, B_shape, Z_shape}, expected_vals, Y_shape, providers_except_cpu); + RunConvOp(attrs, {X, W, B, Z}, {X_shape, W_shape, B_shape, Z_shape}, expected_vals, Y_shape, providers_except_cpu); } #endif