diff --git a/onnxruntime/contrib_ops/cpu/fused_activation.cc b/onnxruntime/contrib_ops/cpu/fused_activation.cc index d63e19991e..2cee97e08e 100644 --- a/onnxruntime/contrib_ops/cpu/fused_activation.cc +++ b/onnxruntime/contrib_ops/cpu/fused_activation.cc @@ -26,6 +26,9 @@ common::Status GetFusedActivationAttr(const OpKernelInfo& info, MLAS_ACTIVATION& } else if (activation_type == "Clip") { activation.ActivationKind = MlasClipActivation; activation_params_count = 2; + } else if (activation_type == "HardSigmoid") { + activation.ActivationKind = MlasHardSigmoidActivation; + activation_params_count = 2; } else { return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "unimplemented activation: " + activation_type); } diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h index ebe6899098..c270fd457f 100644 --- a/onnxruntime/core/mlas/inc/mlas.h +++ b/onnxruntime/core/mlas/inc/mlas.h @@ -121,6 +121,7 @@ enum MLAS_ACTIVATION_KIND { MlasTanhActivation, MlasLogisticActivation, MlasClipActivation, + MlasHardSigmoidActivation, }; struct MLAS_ACTIVATION { @@ -133,6 +134,10 @@ struct MLAS_ACTIVATION { float minimum; float maximum; } Clip; + struct { + float alpha; + float beta; + } HardSigmoid; float Values[2]; } Parameters; }; diff --git a/onnxruntime/core/mlas/lib/activate.cpp b/onnxruntime/core/mlas/lib/activate.cpp index 8317ec1033..ff93abf5cc 100644 --- a/onnxruntime/core/mlas/lib/activate.cpp +++ b/onnxruntime/core/mlas/lib/activate.cpp @@ -193,6 +193,45 @@ struct MLAS_ACTIVATION_FUNCTION } }; +template<> +struct MLAS_ACTIVATION_FUNCTION +{ + MLAS_FLOAT32X4 AlphaBroadcast; + MLAS_FLOAT32X4 BetaBroadcast; + MLAS_FLOAT32X4 MinimumBroadcast; + MLAS_FLOAT32X4 MaximumBroadcast; + + MLAS_ACTIVATION_FUNCTION(const MLAS_ACTIVATION* Activation) + { + AlphaBroadcast = MlasBroadcastFloat32x4(&Activation->Parameters.HardSigmoid.alpha); + BetaBroadcast = MlasBroadcastFloat32x4(&Activation->Parameters.HardSigmoid.beta); + MinimumBroadcast = MlasZeroFloat32x4(); + MaximumBroadcast = MlasBroadcastFloat32x4(1.0f); + } + + MLAS_FLOAT32X4 Activate(MLAS_FLOAT32X4 Value) + { + Value = MlasMultiplyAddFloat32x4(Value, AlphaBroadcast, BetaBroadcast); + Value = MlasMinimumFloat32x4(MaximumBroadcast, Value); + Value = MlasMaximumFloat32x4(MinimumBroadcast, Value); + + return Value; + } + + float Activate(float Value) + { +#if defined(MLAS_SSE2_INTRINSICS) + return _mm_cvtss_f32(Activate(_mm_set_ss(Value))); +#else + Value = MlasExtractLaneFloat32x4<0>(AlphaBroadcast) * Value + MlasExtractLaneFloat32x4<0>(BetaBroadcast); + Value = std::min(Value, MlasExtractLaneFloat32x4<0>(MaximumBroadcast)); + Value = std::max(Value, MlasExtractLaneFloat32x4<0>(MinimumBroadcast)); + + return Value; +#endif + } +}; + template void MlasActivationKernel( @@ -464,5 +503,11 @@ Return Value: MlasActivationKernel(Activation, Buffer, Bias, M, N, ldc); break; } + + case MlasHardSigmoidActivation: + { + MlasActivationKernel(Activation, Buffer, Bias, M, N, ldc); + break; + } } } diff --git a/onnxruntime/core/optimizer/conv_activation_fusion.cc b/onnxruntime/core/optimizer/conv_activation_fusion.cc index c514a8dcdb..da5a0f0c1e 100644 --- a/onnxruntime/core/optimizer/conv_activation_fusion.cc +++ b/onnxruntime/core/optimizer/conv_activation_fusion.cc @@ -136,7 +136,7 @@ Status ConvActivationFusion::ApplyImpl(Graph& graph, bool& modified, int graph_l auto conv_outputs = conv_node.MutableOutputDefs(); auto add_inputs = add_node.MutableInputDefs(); int32_t dependent = 0, independent = 0; - for (auto add_input: add_inputs) { + for (auto add_input : add_inputs) { if (add_input->Name() == conv_outputs[0]->Name()) { dependent++; } else { @@ -179,6 +179,14 @@ Status ConvActivationFusion::ApplyImpl(Graph& graph, bool& modified, int graph_l } else { continue; } + } else if ((node->GetExecutionProviderType().empty() || node->GetExecutionProviderType() == onnxruntime::kCpuExecutionProvider) && + graph_utils::IsSupportedOptypeVersionAndDomain(next_node, "HardSigmoid", {6})) { + auto* alpha_attr = graph_utils::GetNodeAttribute(next_node, "alpha"); + auto* beta_attr = graph_utils::GetNodeAttribute(next_node, "beta"); + float alpha = (alpha_attr == nullptr ? 0.2f : alpha_attr->f()); + float beta = (beta_attr == nullptr ? 0.5f : beta_attr->f()); + activation_params.push_back(alpha); + activation_params.push_back(beta); } else { continue; } diff --git a/onnxruntime/test/contrib_ops/fused_conv_test.cc b/onnxruntime/test/contrib_ops/fused_conv_test.cc index bce624a723..8cbec7ee51 100644 --- a/onnxruntime/test/contrib_ops/fused_conv_test.cc +++ b/onnxruntime/test/contrib_ops/fused_conv_test.cc @@ -7,7 +7,7 @@ namespace onnxruntime { namespace test { -#if defined(USE_CUDA) && !defined(DISABLE_CONTRIB_OPS) +#if !defined(DISABLE_CONTRIB_OPS) using namespace std; struct ConvOpAndTestAttributes { @@ -18,24 +18,48 @@ struct ConvOpAndTestAttributes { vector pads; vector strides; string activation; + vector activation_parameters = {}; }; -static std::unordered_set excluded_providers = { - kCpuExecutionProvider, - kDnnlExecutionProvider, - kOpenVINOExecutionProvider, - kNupharExecutionProvider, - kVitisAIExecutionProvider, - kTensorrtExecutionProvider, - kNnapiExecutionProvider, - kRknpuExecutionProvider, - kDmlExecutionProvider, - kMIGraphXExecutionProvider, - kAclExecutionProvider, - kArmNNExecutionProvider, - kRocmExecutionProvider}; +static std::unordered_set providers_except_cpu = { + kCudaExecutionProvider, + kDnnlExecutionProvider, + kOpenVINOExecutionProvider, + kNupharExecutionProvider, + kVitisAIExecutionProvider, + kTensorrtExecutionProvider, + kNnapiExecutionProvider, + kRknpuExecutionProvider, + kDmlExecutionProvider, + kMIGraphXExecutionProvider, + kAclExecutionProvider, + kArmNNExecutionProvider, + kRocmExecutionProvider}; -void TestConvOp(const ConvOpAndTestAttributes& attributes, const vector>& inputs, const vector>& input_shapes, const std::initializer_list& expected_output, const vector& expected_output_shape, bool weight_is_initializer = false, OpTester::ExpectResult expect_result = OpTester::ExpectResult::kExpectSuccess, const std::string& err_str = "") { +static std::unordered_set providers_except_cpu_cuda = { + kDnnlExecutionProvider, + kOpenVINOExecutionProvider, + kNupharExecutionProvider, + kVitisAIExecutionProvider, + kTensorrtExecutionProvider, + kNnapiExecutionProvider, + kRknpuExecutionProvider, + kDmlExecutionProvider, + kMIGraphXExecutionProvider, + kAclExecutionProvider, + kArmNNExecutionProvider, + kRocmExecutionProvider}; + + +void TestConvOp(const ConvOpAndTestAttributes& attributes, + const vector>& inputs, + const vector>& input_shapes, + const std::initializer_list& expected_output, + const vector& expected_output_shape, + const std::unordered_set& excluded_provider_types = providers_except_cpu_cuda, + bool weight_is_initializer = false, + OpTester::ExpectResult expect_result = OpTester::ExpectResult::kExpectSuccess, + const std::string& err_str = "") { OpTester test("FusedConv", 1, onnxruntime::kMSDomain); test.AddAttribute("group", attributes.group); test.AddAttribute("kernel_shape", attributes.kernel_shape); @@ -58,6 +82,10 @@ void TestConvOp(const ConvOpAndTestAttributes& attributes, const vector(szNames[0], input_shapes[0], inputs[0]); test.AddInput(szNames[1], input_shapes[1], inputs[1], weight_is_initializer); @@ -66,7 +94,28 @@ void TestConvOp(const ConvOpAndTestAttributes& attributes, const vector= 4) test.AddInput(szNames[3], input_shapes[3], inputs[3]); test.AddOutput("Y", expected_output_shape, expected_output); - test.Run(expect_result, err_str, excluded_providers); + test.Run(expect_result, err_str, excluded_provider_types); +} + +TEST(FusedConvTest, Conv2D_HardSigmoid) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 1, // group + vector{2, 2}, // kernel_shape + vector{0, 0, 0, 0}, // pads + vector{1, 1}, // strides + "HardSigmoid", // activation + vector{0.2f, 0.5f} // activation_parameters + }; + + vector X = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f}; + vector X_shape = {1, 1, 3, 3}; + vector W = {0.125f, 0.125f, 0.125f, 0.125f, -0.125f, -0.125f, -0.125f, -0.125f}; + vector W_shape = {2, 1, 2, 2}; + vector Y_shape = {1, 2, 2, 2}; + auto expected_vals = {0.8f, 0.9f, 1.0f, 1.0f, 0.2f, 0.1f, 0.0f, 0.0f}; + TestConvOp(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape, providers_except_cpu); } TEST(FusedConvTest, Conv2D_Relu) { @@ -111,6 +160,23 @@ TEST(FusedConvTest, Conv2D_Bias_Relu) { TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); } +#if defined(USE_CUDA) + +static std::unordered_set providers_except_cuda = { + kCpuExecutionProvider, + kDnnlExecutionProvider, + kOpenVINOExecutionProvider, + kNupharExecutionProvider, + kVitisAIExecutionProvider, + kTensorrtExecutionProvider, + kNnapiExecutionProvider, + kRknpuExecutionProvider, + kDmlExecutionProvider, + kMIGraphXExecutionProvider, + kAclExecutionProvider, + kArmNNExecutionProvider, + kRocmExecutionProvider}; + TEST(FusedConvTest, Conv2D_Bias_Z_Relu) { ConvOpAndTestAttributes attrs = { "", // auto_pad @@ -132,8 +198,10 @@ TEST(FusedConvTest, Conv2D_Bias_Z_Relu) { vector Z = {-1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f}; vector Z_shape = {1, 2, 2, 2}; auto expected_vals = {12.0f, 17.0f, 25.0f, 29.0f, 11.0f, 15.0f, 23.0f, 28.0f}; - TestConvOp(attrs, {X, W, B, Z}, {X_shape, W_shape, B_shape, Z_shape}, expected_vals, Y_shape); + TestConvOp(attrs, {X, W, B, Z}, {X_shape, W_shape, B_shape, Z_shape}, expected_vals, Y_shape, providers_except_cuda); } + +#endif #endif } // namespace test diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index ec36c2988e..a62b754b4d 100644 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -732,7 +732,8 @@ TEST_F(GraphTransformationTests, FuseConvActivation) { {ORT_TSTR("fusion/conv_clip.onnx"), "Clip"}, {ORT_TSTR("fusion/conv_sigmoid.onnx"), "Sigmoid"}, {ORT_TSTR("fusion/conv_tanh.onnx"), "Tanh"}, - {ORT_TSTR("fusion/conv_leakyrelu.onnx"), "LeakyRelu"}}; + {ORT_TSTR("fusion/conv_leakyrelu.onnx"), "LeakyRelu"}, + {ORT_TSTR("fusion/conv_hardsigmoid.onnx"), "HardSigmoid"}}; #endif for (const auto& model : model_to_op_name) { auto model_uri = MODEL_FOLDER + model.first; diff --git a/onnxruntime/test/testdata/transform/fusion/conv_hardsigmoid.onnx b/onnxruntime/test/testdata/transform/fusion/conv_hardsigmoid.onnx new file mode 100644 index 0000000000..e4a01323ea --- /dev/null +++ b/onnxruntime/test/testdata/transform/fusion/conv_hardsigmoid.onnx @@ -0,0 +1,22 @@ +:¾ + +X +W conv0_outConv0"Conv +8 + conv0_outhardsigmoid0_out HardSigmoid0" HardSigmoidConvClipFusionZ +X + + + + + + +Z +W + + + + +b +hardsigmoid0_out +B \ No newline at end of file diff --git a/onnxruntime/test/testdata/transform/fusion/create_conv_hardsigmoid.py b/onnxruntime/test/testdata/transform/fusion/create_conv_hardsigmoid.py new file mode 100644 index 0000000000..5e7c05b50f --- /dev/null +++ b/onnxruntime/test/testdata/transform/fusion/create_conv_hardsigmoid.py @@ -0,0 +1,22 @@ +import onnx +from onnx import helper +from onnx import TensorProto + +graph = helper.make_graph( + [ # nodes + # fusable, const_min_negative should be replaced + helper.make_node("Conv", ["X", "W"], ["conv0_out"], "Conv0"), + helper.make_node("HardSigmoid", ["conv0_out"], ["hardsigmoid0_out"], "HardSigmoid0"), + ], + "ConvClipFusion", #name + [ # inputs + helper.make_tensor_value_info('X', TensorProto.FLOAT, [1, 1, 10, 10]), + helper.make_tensor_value_info('W', TensorProto.FLOAT, [1, 1, 3, 3]), + ], + [ # outputs + helper.make_tensor_value_info('hardsigmoid0_out', TensorProto.FLOAT, None), + ], +) + +model = helper.make_model(graph) +onnx.save(model, r'conv_hardsigmoid.onnx')