Disable MatMulIntegerToFloat transformation for FP16 on CPU EP (#18239)

### Description
MatMulIntegerToFloat was updated to support FP16. The nodes matched by the
FP16 transformation use an FP16 "Mul", which is not directly supported by
the CPU EP.

For now, the FP16 transformation is only supported for the DML EP. All
FP16 tests are disabled on CPU.

Test results without the `-use_dml` build flag:
```
onnxruntime_test_all.exe --gtest_filter="*MatMulIntegerToFloat*"
Note: Google Test filter = *MatMulIntegerToFloat*
[==========] Running 8 tests from 4 test suites.
[----------] Global test environment set-up.
[----------] 1 test from CPU_U8S8_Precision_Tests
[ RUN      ] CPU_U8S8_Precision_Tests.MatMulIntegerToFloat
[       OK ] CPU_U8S8_Precision_Tests.MatMulIntegerToFloat (181 ms)
[----------] 1 test from CPU_U8S8_Precision_Tests (181 ms total)

[----------] 1 test from GraphTransformationTests
[ RUN      ] GraphTransformationTests.MatMulIntegerToFloatTest
[       OK ] GraphTransformationTests.MatMulIntegerToFloatTest (17 ms)
[----------] 1 test from GraphTransformationTests (17 ms total)

[----------] 1 test from QDQTransformerTests
[ RUN      ] QDQTransformerTests.MatMulIntegerToFloat
[       OK ] QDQTransformerTests.MatMulIntegerToFloat (656 ms)
[----------] 1 test from QDQTransformerTests (656 ms total)

[----------] 5 tests from MatMulIntegerToFloat
[ RUN      ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_U8X8
[       OK ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_U8X8 (195 ms)
[ RUN      ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_U8X8
[       OK ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_U8X8 (206 ms)
[ RUN      ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_S8S8
[       OK ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_S8S8 (107 ms)
[ RUN      ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_S8S8
[       OK ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_S8S8 (114 ms)
[ RUN      ] MatMulIntegerToFloat.MatMulInteger_With_ZeroPoint
[       OK ] MatMulIntegerToFloat.MatMulInteger_With_ZeroPoint (227 ms)
[----------] 5 tests from MatMulIntegerToFloat (854 ms total)

[----------] Global test environment tear-down
[==========] 8 tests from 4 test suites ran. (1713 ms total)
[  PASSED  ] 8 tests.
memleakdbg:
----- No memory leaks detected -----
```

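Test results with the `-use_dml` build flag (`MatMulIntegerToFloat16Test` is only compiled when `USE_DML` is defined):
```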
onnxruntime_test_all.exe --gtest_filter="GraphTransformationTests.MatMulIntegerToFloat*"
Note: Google Test filter = GraphTransformationTests.MatMulIntegerToFloat*
[==========] Running 2 tests from 1 test suite.
[----------] Global test environment set-up.
[----------] 2 tests from GraphTransformationTests
[ RUN      ] GraphTransformationTests.MatMulIntegerToFloatTest
[       OK ] GraphTransformationTests.MatMulIntegerToFloatTest (13 ms)
[ RUN      ] GraphTransformationTests.MatMulIntegerToFloat16Test
[       OK ] GraphTransformationTests.MatMulIntegerToFloat16Test (4 ms)
[----------] 2 tests from GraphTransformationTests (20 ms total)

[----------] Global test environment tear-down
[==========] 2 tests from 1 test suite ran. (22 ms total)
[  PASSED  ] 2 tests.
memleakdbg:
----- No memory leaks detected -----
```
### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
This commit is contained in:
raoanag 2023-11-03 10:05:09 -07:00 committed by GitHub
parent 2abfea5372
commit 7d4dba7e16
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 115 additions and 172 deletions

View file

@ -274,7 +274,8 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
onnxruntime::kRocmExecutionProvider,
onnxruntime::kAclExecutionProvider,
onnxruntime::kArmNNExecutionProvider};
const InlinedHashSet<std::string_view> cpu_dml_eps = {onnxruntime::kCpuExecutionProvider,
onnxruntime::kDmlExecutionProvider};
#ifdef MLAS_TARGET_AMD64_IX86
const bool avx2_precision_mode =
session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsAvx2PrecisionMode, "0") == "1" && MlasPlatformU8S8Overflow();
@ -292,7 +293,7 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
}
transformers.emplace_back(std::make_unique<GemmActivationFusion>(cpu_ep));
transformers.emplace_back(std::make_unique<MatMulIntegerToFloatFusion>(cpu_ep));
transformers.emplace_back(std::make_unique<MatMulIntegerToFloatFusion>(cpu_dml_eps));
transformers.emplace_back(std::make_unique<DynamicQuantizeMatMulFusion>(cpu_ep));
transformers.emplace_back(std::make_unique<ConvActivationFusion>(cpu_cuda_rocm_acl_armnn_eps));

View file

@ -31,6 +31,24 @@ static bool CheckBiasShape(const TensorShapeProto* bias_shape) {
return bias_last_dim > 1;
}
// Reports whether `node_arg` has exactly the element type `data_type`.
// Returns false when the arg does not exist, has no type proto, or when
// utils::TryGetElementDataType cannot extract an element type from it.
bool HasElementDataType(const NodeArg& node_arg, int32_t data_type) {
  // A missing arg is treated the same as an arg without type information.
  const auto* proto = node_arg.Exists() ? node_arg.TypeAsProto() : nullptr;
  if (proto == nullptr) {
    return false;
  }

  int32_t element_type;
  const bool resolved = utils::TryGetElementDataType(*proto, element_type);
  // `element_type` is only read when the lookup succeeded.
  return resolved && element_type == data_type;
}
/**
MatMulIntegerToFloatFusion will fuse subgraph like below into MatMulIntegerToFloat:
@ -63,8 +81,10 @@ Status MatMulIntegerToFloatFusion::ApplyImpl(Graph& graph, bool& modified, int g
auto& mul_node = *node_ptr;
ORT_RETURN_IF_ERROR(Recurse(mul_node, modified, graph_level, logger));
if (!graph_utils::IsSupportedOptypeVersionAndDomain(mul_node, "Mul", {7, 13, 14})) {
const bool is_dml_ep = node_ptr->GetExecutionProviderType() == kDmlExecutionProvider;
if (!graph_utils::IsSupportedOptypeVersionAndDomain(mul_node, "Mul", {7, 13, 14}) ||
!graph_utils::IsSupportedProvider(mul_node, GetCompatibleExecutionProviders()) ||
(!is_dml_ep && HasElementDataType(*mul_node.InputDefs()[0], ONNX_NAMESPACE::TensorProto_DataType_FLOAT16))) {
continue;
}

View file

@ -98,7 +98,12 @@ void TestMatMulIntegerToFloat(const std::vector<int64_t>& A_dims,
test.SetOutputRelErr("Y", 1e-4f);
#endif
test.Run();
if constexpr (std::is_same_v<OType, float>) {
test.Run();
} else {
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCpuExecutionProvider});
}
}
template <typename IType, typename WType, typename OType, bool HasZeroPoint, bool HasBias>
@ -148,6 +153,7 @@ void RunMatMulIntegerToFloatTest(const string& model_path) {
);
}
#if USE_DML
TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8X8_FP16) {
RunMatMulIntegerToFloatTest<uint8_t, int8_t, MLFloat16, true, false>("testdata/matmul_integer_to_float16_int8.onnx");
RunMatMulIntegerToFloatTest<uint8_t, uint8_t, MLFloat16, true, false>("testdata/matmul_integer_to_float16_uint8.onnx");
@ -165,6 +171,7 @@ TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8S8_FP16) {
// FP16-output case with bias and without zero point. The template arguments
// presumably follow the <IType, WType, OType, HasZeroPoint, HasBias> order of the
// helper template: int8_t inputs, int8_t weights, MLFloat16 output,
// HasZeroPoint = false, HasBias = true.
TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8S8_FP16) {
RunMatMulIntegerToFloatTest<int8_t, int8_t, MLFloat16, false, true>("testdata/matmul_integer_to_float16_int8_int8_bias.onnx");
}
#endif // USE_DML
TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8X8) {
RunMatMulIntegerToFloatTest<uint8_t, int8_t, float, true, false>("testdata/matmul_integer_to_float_int8.onnx");
@ -247,68 +254,5 @@ TEST(MatMulIntegerToFloat, MatMulInteger_With_ZeroPoint) {
test_case({15, 14, 13}, {15, 13, 27}, {15, 1, 27});
}
// Builds the pattern
//   MatMulInteger(A, B, A_zp, B_zp) -> Cast(to FLOAT16) -> Mul(Mul(A_scale, B_scale), cast)
// for several scale/zero-point shapes and expects the transformed graph to contain
// exactly one com.microsoft.MatMulIntegerToFloat node.
TEST(MatMulIntegerToFloat, MatMulInteger_With_ZeroPoint_FP16) {
// Runs one scenario; `b_scale_zp_shape` is used for both B's zero point and B's scale.
auto test_case = [&](const std::vector<int64_t>& input_shape,
const std::vector<int64_t>& weights_shape,
const std::vector<int64_t>& b_scale_zp_shape) {
// Graph construction callback handed to TransformerTester.
auto build_test_case = [&](ModelTestBuilder& builder) {
// uint8 activation as a graph input, int8 weight as an initializer.
auto* input_arg = builder.MakeInput<uint8_t>(input_shape,
std::numeric_limits<uint8_t>::min(),
std::numeric_limits<uint8_t>::max());
auto* output_arg = builder.MakeOutput();
// NOTE(review): the weight bounds are half of the int8 limits; the reason is not
// visible here — confirm the intent before changing them.
auto* weight = builder.MakeInitializer<int8_t>(weights_shape,
std::numeric_limits<int8_t>::min() / 2,
std::numeric_limits<int8_t>::max() / 2);
// add MatMulInteger
auto* matmul_integer_output = builder.MakeIntermediate();
// A's zero point has shape {1}; B's zero point uses the shape under test.
auto* A_zp_arg = builder.MakeInput<uint8_t>({1},
std::numeric_limits<uint8_t>::min(),
std::numeric_limits<uint8_t>::max());
auto* B_zp_arg = builder.MakeInput<int8_t>(b_scale_zp_shape,
std::numeric_limits<int8_t>::min() / 2,
std::numeric_limits<int8_t>::max() / 2);
builder.AddNode("MatMulInteger", {input_arg, weight, A_zp_arg, B_zp_arg}, {matmul_integer_output});
// add Cast
auto* cast_output = builder.MakeIntermediate();
Node& cast_node = builder.AddNode("Cast", {matmul_integer_output}, {cast_output});
// The cast target is FLOAT16, which is what makes this the FP16 variant of the pattern.
cast_node.AddAttribute("to", static_cast<int64_t>(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16));
// add Mul1
// Both scales are FP16 graph inputs created with the bounds -0.1f and 0.0f.
auto* A_scale_arg = builder.MakeInput<MLFloat16>({1}, MLFloat16(-0.1f), MLFloat16(0.0f));
auto* B_scale_arg = builder.MakeInput<MLFloat16>(b_scale_zp_shape, MLFloat16(-0.1f), MLFloat16(0.0f));
auto* mul1_output = builder.MakeIntermediate();
builder.AddNode("Mul", {A_scale_arg, B_scale_arg}, {mul1_output});
// add Mul2
builder.AddNode("Mul", {mul1_output, cast_output}, {output_arg});
};
// Post-transformation check: the pattern must have been fused into a single
// MatMulIntegerToFloat node. (Despite the name, nothing here checks a reshape.)
auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
auto op_to_count = CountOpsInGraph(session.GetGraph());
EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 1);
};
// NOTE(review): the two TransformerLevel arguments are presumably the baseline and
// target optimization levels — confirm against TransformerTester's declaration.
TransformerTester(build_test_case,
check_mp_reshape_graph,
TransformerLevel::Level1,
TransformerLevel::Level2,
12 /*opset_version*/,
1e-5 /*per_sample_tolerance*/,
1e-5 /*relative_per_sample_tolerance*/);
};
// Scale Scalar
test_case({5, 4, 3}, {3, 4}, {1});
// 2D B per-column
test_case({5, 4, 3}, {3, 4}, {4});
test_case({5, 4, 3}, {3, 4}, {1, 4});
// ND B per-column
test_case({15, 14, 13}, {15, 13, 27}, {15, 1, 27});
}
} // namespace test
} // namespace onnxruntime

View file

@ -5189,6 +5189,24 @@ TEST_F(GraphTransformationTests, MatMulIntegerToFloatTest) {
EXPECT_EQ(op_to_count["Add"], 1);
}
#ifdef USE_DML
// Loads the FP16 MatMulInteger model, assigns every node to the DML execution
// provider, applies MatMulIntegerToFloatFusion at Level2, and expects exactly one
// fused com.microsoft.MatMulIntegerToFloat node in the resulting graph.
TEST_F(GraphTransformationTests, MatMulIntegerToFloat16Test) {
  constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/matmul_integer_to_float16_int8.onnx";

  std::shared_ptr<Model> loaded_model;
  ASSERT_STATUS_OK(Model::Load(model_uri, loaded_model, nullptr, *logger_));
  Graph& main_graph = loaded_model->MainGraph();

  // Mark every node as DML-assigned before running the fusion.
  for (auto& graph_node : main_graph.Nodes()) {
    graph_node.SetExecutionProviderType(kDmlExecutionProvider);
  }

  onnxruntime::GraphTransformerManager transformer_manager{5};
  const TransformerLevel fusion_level = TransformerLevel::Level2;
  ASSERT_STATUS_OK(transformer_manager.Register(std::make_unique<MatMulIntegerToFloatFusion>(), fusion_level));
  ASSERT_STATUS_OK(transformer_manager.ApplyTransformers(main_graph, fusion_level, *logger_));

  // operator[] yields 0 for a missing op type, so an unfused graph fails the check below.
  std::map<std::string, int> op_counts = CountOpsInGraph(main_graph);
  EXPECT_EQ(op_counts["com.microsoft.MatMulIntegerToFloat"], 1);
}
#endif // USE_DML
#endif
#ifndef DISABLE_CONTRIB_OPS

View file

@ -49,15 +49,15 @@ def MakeSubGraph(suffix, has_bias): # noqa: N802
return nodes
def MakeInitializer(suffix, output_type_fp16=False): # noqa: N802
def MakeInitializer(suffix): # noqa: N802
return [
helper.make_tensor("b_quantized" + suffix, TensorProto.UINT8, [2, 3], [2, 4, 5, 6, 7, 8]),
helper.make_tensor("b_zp" + suffix, TensorProto.UINT8, [], [128]),
helper.make_tensor("b_scale" + suffix, TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [], [1.8]),
helper.make_tensor("b_scale" + suffix, TensorProto.FLOAT, [], [1.8]),
]
def GenerateModel(model_name, output_type_fp16=False): # noqa: N802
def GenerateModel(model_name): # noqa: N802
nodes = [
helper.make_node(
"DynamicQuantizeLinear",
@ -71,13 +71,13 @@ def GenerateModel(model_name, output_type_fp16=False): # noqa: N802
nodes.extend(MakeSubGraph("_3", False))
initializers = []
initializers.extend(MakeInitializer("_1", output_type_fp16))
initializers.extend(MakeInitializer("_3", output_type_fp16))
initializers.extend(MakeInitializer("_1"))
initializers.extend(MakeInitializer("_3"))
initializers.extend(
[
helper.make_tensor("bias_1", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3], [2, 4, 5]),
helper.make_tensor("bias_2", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 3], [1, 2, 3, 4, 5, 6, 7, 8, 9]),
helper.make_tensor("bias_1", TensorProto.FLOAT, [3], [2, 4, 5]),
helper.make_tensor("bias_2", TensorProto.FLOAT, [3, 3], [1, 2, 3, 4, 5, 6, 7, 8, 9]),
]
)
@ -85,16 +85,16 @@ def GenerateModel(model_name, output_type_fp16=False): # noqa: N802
nodes,
"MatMulIntegerToFloat_fusion", # name
[ # inputs
helper.make_tensor_value_info("input", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 2]),
helper.make_tensor_value_info("input", TensorProto.FLOAT, [3, 2]),
# matrix b corresponding inputs for subgraph 2
helper.make_tensor_value_info("b_quantized_2", TensorProto.UINT8, [2, 3]),
helper.make_tensor_value_info("b_zp_2", TensorProto.UINT8, [1]),
helper.make_tensor_value_info("b_scale_2", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [1]),
helper.make_tensor_value_info("b_scale_2", TensorProto.FLOAT, [1]),
],
[ # outputs
helper.make_tensor_value_info("output_1", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 3]),
helper.make_tensor_value_info("output_2", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 3]),
helper.make_tensor_value_info("output_3", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 3]),
helper.make_tensor_value_info("output_1", TensorProto.FLOAT, [3, 3]),
helper.make_tensor_value_info("output_2", TensorProto.FLOAT, [3, 3]),
helper.make_tensor_value_info("output_3", TensorProto.FLOAT, [3, 3]),
],
initializers,
)
@ -104,5 +104,4 @@ def GenerateModel(model_name, output_type_fp16=False): # noqa: N802
if __name__ == "__main__":
GenerateModel("matmul_integer_to_float.onnx")
GenerateModel("matmul_integer_to_float16.onnx", output_type_fp16=True)
GenerateModel("matmul_integer_to_float.onnx")

View file

@ -1,90 +0,0 @@

Q
input a_quantizeda_scalea_zpDynamicQuantizeLinear"DynamicQuantizeLinear
a
a_quantized
b_quantized_1
a_zp
b_zp_1matmul_output_int32_1MatMulInteger_1" MatMulInteger
4
a_scale
b_scale_1 multiplier_1 mul_right_1"Mul
G
matmul_output_int32_1matmul_output_float_1cast_1"Cast*
to 
F
matmul_output_float_1
multiplier_1 mul_output_1 mul_bottom_1"Mul
1
mul_output_1
bias_1output_1
bias_add_1"Add
a
a_quantized
b_quantized_2
a_zp
b_zp_2matmul_output_int32_2MatMulInteger_2" MatMulInteger
4
a_scale
b_scale_2 multiplier_2 mul_right_2"Mul
G
matmul_output_int32_2matmul_output_float_2cast_2"Cast*
to 
F
matmul_output_float_2
multiplier_2 mul_output_2 mul_bottom_2"Mul
1
mul_output_2
bias_2output_2
bias_add_2"Add
a
a_quantized
b_quantized_3
a_zp
b_zp_3matmul_output_int32_3MatMulInteger_3" MatMulInteger
4
a_scale
b_scale_3 multiplier_3 mul_right_3"Mul
G
matmul_output_int32_3matmul_output_float_3cast_3"Cast*
to 
B
matmul_output_float_3
multiplier_3output_3 mul_bottom_3"MulMatMulIntegerToFloat_fusion**B b_quantized_1**Bb_zp_1*
*³~B b_scale_1**B b_quantized_3**Bb_zp_3*
*³~B b_scale_3*
* €€€ˆ€ŠBbias_1**
*€x€€€„€ˆ€Š€Œ€Ž<>€‘Bbias_2Z
input



Z
b_quantized_2


Z
b_zp_2

Z
b_scale_2


b
output_1



b
output_2



b
output_3



B

View file

@ -0,0 +1,51 @@

U
A
B
a_zero_point
b_zero_pointmatmul_output_int32 MatMulInteger" MatMulInteger
.
a_scale
b_scale
multiplier mul_right"Mul
A
matmul_output_int32matmul_output_floatcast"Cast*
to
 
5
matmul_output_float
multiplierY
mul_bottom"MulDynamicQuantizeMatMul_fusionZ
A

M
KZ
B

K
NZ
a_scale


Z
b_scale


CZ
a_zero_point

Z
b_zero_point

Cb
Y


M
NB