Fuse HardSigmoid with conv. (#8674)

* Fuse HardSigmoid with conv.
Add a transform test case and a FusedConv test case.

* Limit Conv/HardSigmoid fusion to CpuExecutionProvider.

* Fix typo for arm build.

* Change formatting in one place.
This commit is contained in:
Zhang Lei 2021-08-31 12:19:34 -07:00 committed by GitHub
parent 206537936f
commit 2e37fe3f68
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 194 additions and 20 deletions

View file

@ -26,6 +26,9 @@ common::Status GetFusedActivationAttr(const OpKernelInfo& info, MLAS_ACTIVATION&
} else if (activation_type == "Clip") {
activation.ActivationKind = MlasClipActivation;
activation_params_count = 2;
} else if (activation_type == "HardSigmoid") {
activation.ActivationKind = MlasHardSigmoidActivation;
activation_params_count = 2;
} else {
return Status(common::ONNXRUNTIME, common::INVALID_ARGUMENT, "unimplemented activation: " + activation_type);
}

View file

@ -121,6 +121,7 @@ enum MLAS_ACTIVATION_KIND {
MlasTanhActivation,
MlasLogisticActivation,
MlasClipActivation,
MlasHardSigmoidActivation,
};
struct MLAS_ACTIVATION {
@ -133,6 +134,10 @@ struct MLAS_ACTIVATION {
float minimum;
float maximum;
} Clip;
struct {
float alpha;
float beta;
} HardSigmoid;
float Values[2];
} Parameters;
};

View file

@ -193,6 +193,45 @@ struct MLAS_ACTIVATION_FUNCTION<MlasClipActivation>
}
};
//
// HardSigmoid activation: scales the input by alpha, offsets it by beta and
// clamps the result between 0 and 1.
//
template<>
struct MLAS_ACTIVATION_FUNCTION<MlasHardSigmoidActivation>
{
    MLAS_FLOAT32X4 AlphaBroadcast;
    MLAS_FLOAT32X4 BetaBroadcast;
    MLAS_FLOAT32X4 MinimumBroadcast;
    MLAS_FLOAT32X4 MaximumBroadcast;

    // Broadcasts alpha/beta from the activation parameters and sets up the
    // fixed clamp bounds (0 and 1).
    MLAS_ACTIVATION_FUNCTION(const MLAS_ACTIVATION* Activation)
    {
        const auto& HardSigmoidParams = Activation->Parameters.HardSigmoid;

        AlphaBroadcast = MlasBroadcastFloat32x4(&HardSigmoidParams.alpha);
        BetaBroadcast = MlasBroadcastFloat32x4(&HardSigmoidParams.beta);
        MinimumBroadcast = MlasZeroFloat32x4();
        MaximumBroadcast = MlasBroadcastFloat32x4(1.0f);
    }

    // Vector path: multiply-add, then apply the upper bound followed by the
    // lower bound. The operand order of the min/max calls is kept as is.
    MLAS_FLOAT32X4 Activate(MLAS_FLOAT32X4 Value)
    {
        const MLAS_FLOAT32X4 Scaled = MlasMultiplyAddFloat32x4(Value, AlphaBroadcast, BetaBroadcast);
        const MLAS_FLOAT32X4 UpperBounded = MlasMinimumFloat32x4(MaximumBroadcast, Scaled);

        return MlasMaximumFloat32x4(MinimumBroadcast, UpperBounded);
    }

    // Scalar path: reuses the vector routine on SSE2 builds, otherwise does
    // the same arithmetic on lane 0 of each broadcast constant.
    float Activate(float Value)
    {
#if defined(MLAS_SSE2_INTRINSICS)
        const MLAS_FLOAT32X4 Activated = Activate(_mm_set_ss(Value));
        return _mm_cvtss_f32(Activated);
#else
        const float Alpha = MlasExtractLaneFloat32x4<0>(AlphaBroadcast);
        const float Beta = MlasExtractLaneFloat32x4<0>(BetaBroadcast);
        const float UpperBound = MlasExtractLaneFloat32x4<0>(MaximumBroadcast);
        const float LowerBound = MlasExtractLaneFloat32x4<0>(MinimumBroadcast);

        Value = Alpha * Value + Beta;
        Value = std::min(Value, UpperBound);
        Value = std::max(Value, LowerBound);
        return Value;
#endif
    }
};
template<MLAS_ACTIVATION_KIND ActivationKind, bool AddBias>
void
MlasActivationKernel(
@ -464,5 +503,11 @@ Return Value:
MlasActivationKernel<MlasClipActivation>(Activation, Buffer, Bias, M, N, ldc);
break;
}
case MlasHardSigmoidActivation:
{
MlasActivationKernel<MlasHardSigmoidActivation>(Activation, Buffer, Bias, M, N, ldc);
break;
}
}
}

View file

@ -136,7 +136,7 @@ Status ConvActivationFusion::ApplyImpl(Graph& graph, bool& modified, int graph_l
auto conv_outputs = conv_node.MutableOutputDefs();
auto add_inputs = add_node.MutableInputDefs();
int32_t dependent = 0, independent = 0;
for (auto add_input: add_inputs) {
for (auto add_input : add_inputs) {
if (add_input->Name() == conv_outputs[0]->Name()) {
dependent++;
} else {
@ -179,6 +179,14 @@ Status ConvActivationFusion::ApplyImpl(Graph& graph, bool& modified, int graph_l
} else {
continue;
}
} else if ((node->GetExecutionProviderType().empty() || node->GetExecutionProviderType() == onnxruntime::kCpuExecutionProvider) &&
graph_utils::IsSupportedOptypeVersionAndDomain(next_node, "HardSigmoid", {6})) {
auto* alpha_attr = graph_utils::GetNodeAttribute(next_node, "alpha");
auto* beta_attr = graph_utils::GetNodeAttribute(next_node, "beta");
float alpha = (alpha_attr == nullptr ? 0.2f : alpha_attr->f());
float beta = (beta_attr == nullptr ? 0.5f : beta_attr->f());
activation_params.push_back(alpha);
activation_params.push_back(beta);
} else {
continue;
}

View file

@ -7,7 +7,7 @@
namespace onnxruntime {
namespace test {
#if defined(USE_CUDA) && !defined(DISABLE_CONTRIB_OPS)
#if !defined(DISABLE_CONTRIB_OPS)
using namespace std;
struct ConvOpAndTestAttributes {
@ -18,24 +18,48 @@ struct ConvOpAndTestAttributes {
vector<int64_t> pads;
vector<int64_t> strides;
string activation;
vector<float> activation_parameters = {};
};
// Execution providers handed to test.Run() as the set to exclude. The list
// contains the CPU provider and does not contain the CUDA provider.
static std::unordered_set<std::string> excluded_providers = {
kCpuExecutionProvider,
kDnnlExecutionProvider,
kOpenVINOExecutionProvider,
kNupharExecutionProvider,
kVitisAIExecutionProvider,
kTensorrtExecutionProvider,
kNnapiExecutionProvider,
kRknpuExecutionProvider,
kDmlExecutionProvider,
kMIGraphXExecutionProvider,
kAclExecutionProvider,
kArmNNExecutionProvider,
kRocmExecutionProvider};
// Exclusion list that names the CUDA provider but not the CPU provider.
// Passed as the excluded-provider set by tests that should not run on CUDA
// (e.g. the Conv2D_HardSigmoid test).
static std::unordered_set<std::string> providers_except_cpu = {
kCudaExecutionProvider,
kDnnlExecutionProvider,
kOpenVINOExecutionProvider,
kNupharExecutionProvider,
kVitisAIExecutionProvider,
kTensorrtExecutionProvider,
kNnapiExecutionProvider,
kRknpuExecutionProvider,
kDmlExecutionProvider,
kMIGraphXExecutionProvider,
kAclExecutionProvider,
kArmNNExecutionProvider,
kRocmExecutionProvider};
void TestConvOp(const ConvOpAndTestAttributes& attributes, const vector<vector<float>>& inputs, const vector<vector<int64_t>>& input_shapes, const std::initializer_list<float>& expected_output, const vector<int64_t>& expected_output_shape, bool weight_is_initializer = false, OpTester::ExpectResult expect_result = OpTester::ExpectResult::kExpectSuccess, const std::string& err_str = "") {
// Exclusion list that names neither the CPU nor the CUDA provider. Used as the
// default value of TestConvOp's excluded_provider_types argument.
static std::unordered_set<std::string> providers_except_cpu_cuda = {
kDnnlExecutionProvider,
kOpenVINOExecutionProvider,
kNupharExecutionProvider,
kVitisAIExecutionProvider,
kTensorrtExecutionProvider,
kNnapiExecutionProvider,
kRknpuExecutionProvider,
kDmlExecutionProvider,
kMIGraphXExecutionProvider,
kAclExecutionProvider,
kArmNNExecutionProvider,
kRocmExecutionProvider};
void TestConvOp(const ConvOpAndTestAttributes& attributes,
const vector<vector<float>>& inputs,
const vector<vector<int64_t>>& input_shapes,
const std::initializer_list<float>& expected_output,
const vector<int64_t>& expected_output_shape,
const std::unordered_set<std::string>& excluded_provider_types = providers_except_cpu_cuda,
bool weight_is_initializer = false,
OpTester::ExpectResult expect_result = OpTester::ExpectResult::kExpectSuccess,
const std::string& err_str = "") {
OpTester test("FusedConv", 1, onnxruntime::kMSDomain);
test.AddAttribute("group", attributes.group);
test.AddAttribute("kernel_shape", attributes.kernel_shape);
@ -58,6 +82,10 @@ void TestConvOp(const ConvOpAndTestAttributes& attributes, const vector<vector<f
ORT_ENFORCE(!attributes.activation.empty(), "activation must be set");
test.AddAttribute("activation", attributes.activation);
if (!attributes.activation_parameters.empty()) {
test.AddAttribute("activation_params", attributes.activation_parameters);
}
const char* szNames[] = {"X", "W", "B", "Z"};
test.AddInput<float>(szNames[0], input_shapes[0], inputs[0]);
test.AddInput<float>(szNames[1], input_shapes[1], inputs[1], weight_is_initializer);
@ -66,7 +94,28 @@ void TestConvOp(const ConvOpAndTestAttributes& attributes, const vector<vector<f
if (inputs.size() >= 4)
test.AddInput<float>(szNames[3], input_shapes[3], inputs[3]);
test.AddOutput<float>("Y", expected_output_shape, expected_output);
test.Run(expect_result, err_str, excluded_providers);
test.Run(expect_result, err_str, excluded_provider_types);
}
// FusedConv with a HardSigmoid activation (alpha = 0.2, beta = 0.5) on a
// 1x1x3x3 input and two 2x2 filters. providers_except_cpu is passed as the
// excluded-provider set.
TEST(FusedConvTest, Conv2D_HardSigmoid) {
  ConvOpAndTestAttributes conv_attrs = {
      "",                           // auto_pad
      vector<int64_t>{1, 1},        // dilations
      1,                            // group
      vector<int64_t>{2, 2},        // kernel_shape
      vector<int64_t>{0, 0, 0, 0},  // pads
      vector<int64_t>{1, 1},        // strides
      "HardSigmoid",                // activation
      vector<float>{0.2f, 0.5f}     // activation_parameters
  };

  // Single-channel 3x3 input holding the values 1..9.
  vector<float> input_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f};
  vector<int64_t> input_dims = {1, 1, 3, 3};

  // Two filters: the first is all +0.125, the second all -0.125.
  vector<float> weight_data = {0.125f, 0.125f, 0.125f, 0.125f, -0.125f, -0.125f, -0.125f, -0.125f};
  vector<int64_t> weight_dims = {2, 1, 2, 2};

  vector<int64_t> output_dims = {1, 2, 2, 2};

  // The 2x2 window sums are 12, 16, 24, 28, so the convolution yields
  // +/-{1.5, 2.0, 3.0, 3.5}; 0.2 * v + 0.5 limited to [0, 1] gives the values below.
  auto expected_output = {0.8f, 0.9f, 1.0f, 1.0f, 0.2f, 0.1f, 0.0f, 0.0f};

  TestConvOp(conv_attrs, {input_data, weight_data}, {input_dims, weight_dims}, expected_output, output_dims, providers_except_cpu);
}
TEST(FusedConvTest, Conv2D_Relu) {
@ -111,6 +160,23 @@ TEST(FusedConvTest, Conv2D_Bias_Relu) {
TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape);
}
#if defined(USE_CUDA)
// Exclusion list that names the CPU provider but not the CUDA provider.
// Passed as the excluded-provider set by the Conv2D_Bias_Z_Relu test.
static std::unordered_set<std::string> providers_except_cuda = {
kCpuExecutionProvider,
kDnnlExecutionProvider,
kOpenVINOExecutionProvider,
kNupharExecutionProvider,
kVitisAIExecutionProvider,
kTensorrtExecutionProvider,
kNnapiExecutionProvider,
kRknpuExecutionProvider,
kDmlExecutionProvider,
kMIGraphXExecutionProvider,
kAclExecutionProvider,
kArmNNExecutionProvider,
kRocmExecutionProvider};
TEST(FusedConvTest, Conv2D_Bias_Z_Relu) {
ConvOpAndTestAttributes attrs = {
"", // auto_pad
@ -132,8 +198,10 @@ TEST(FusedConvTest, Conv2D_Bias_Z_Relu) {
vector<float> Z = {-1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f};
vector<int64_t> Z_shape = {1, 2, 2, 2};
auto expected_vals = {12.0f, 17.0f, 25.0f, 29.0f, 11.0f, 15.0f, 23.0f, 28.0f};
TestConvOp(attrs, {X, W, B, Z}, {X_shape, W_shape, B_shape, Z_shape}, expected_vals, Y_shape);
TestConvOp(attrs, {X, W, B, Z}, {X_shape, W_shape, B_shape, Z_shape}, expected_vals, Y_shape, providers_except_cuda);
}
#endif
#endif
} // namespace test

View file

@ -732,7 +732,8 @@ TEST_F(GraphTransformationTests, FuseConvActivation) {
{ORT_TSTR("fusion/conv_clip.onnx"), "Clip"},
{ORT_TSTR("fusion/conv_sigmoid.onnx"), "Sigmoid"},
{ORT_TSTR("fusion/conv_tanh.onnx"), "Tanh"},
{ORT_TSTR("fusion/conv_leakyrelu.onnx"), "LeakyRelu"}};
{ORT_TSTR("fusion/conv_leakyrelu.onnx"), "LeakyRelu"},
{ORT_TSTR("fusion/conv_hardsigmoid.onnx"), "HardSigmoid"}};
#endif
for (const auto& model : model_to_op_name) {
auto model_uri = MODEL_FOLDER + model.first;

View file

@ -0,0 +1,22 @@


X
W conv0_outConv0"Conv
8
conv0_outhardsigmoid0_out HardSigmoid0" HardSigmoidConvClipFusionZ
X





Z
W




b
hardsigmoid0_out
B

View file

@ -0,0 +1,22 @@
# Builds a two-node model (Conv followed by HardSigmoid) and saves it as
# conv_hardsigmoid.onnx in the current working directory.
import onnx
from onnx import TensorProto, helper

# Conv output feeds HardSigmoid directly; HardSigmoid is created without
# explicit attributes.
conv_node = helper.make_node("Conv", ["X", "W"], ["conv0_out"], "Conv0")
hardsigmoid_node = helper.make_node("HardSigmoid", ["conv0_out"], ["hardsigmoid0_out"], "HardSigmoid0")

graph_inputs = [
    helper.make_tensor_value_info('X', TensorProto.FLOAT, [1, 1, 10, 10]),
    helper.make_tensor_value_info('W', TensorProto.FLOAT, [1, 1, 3, 3]),
]

# The output shape is left unspecified (None).
graph_outputs = [
    helper.make_tensor_value_info('hardsigmoid0_out', TensorProto.FLOAT, None),
]

# NOTE(review): the graph name mentions "Clip" although the graph holds a
# HardSigmoid node; it is kept unchanged so the generated model is the same.
graph = helper.make_graph(
    [conv_node, hardsigmoid_node],
    "ConvClipFusion",
    graph_inputs,
    graph_outputs,
)

model = helper.make_model(graph)
onnx.save(model, r'conv_hardsigmoid.onnx')