mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-26 03:00:54 +00:00
Disable MatMulIntegerToFloat transformation for FP16 on CPU EP (#18239)
### Description

MatMulIntegerToFloat is updated to support FP16. The nodes for the FP16 transformation use an FP16 "Mul", which is not directly supported by the CPU. For now, the FP16 transformation is only supported for the DML EP. All FP16 tests are disabled on CPU.

Test results without the `-use_dml` build flag:

```
onnxruntime_test_all.exe --gtest_filter="*MatMulIntegerToFloat*"
Note: Google Test filter = *MatMulIntegerToFloat*
[==========] Running 8 tests from 4 test suites.
[----------] Global test environment set-up.
[----------] 1 test from CPU_U8S8_Precision_Tests
[ RUN ] CPU_U8S8_Precision_Tests.MatMulIntegerToFloat
[ OK ] CPU_U8S8_Precision_Tests.MatMulIntegerToFloat (181 ms)
[----------] 1 test from CPU_U8S8_Precision_Tests (181 ms total)
[----------] 1 test from GraphTransformationTests
[ RUN ] GraphTransformationTests.MatMulIntegerToFloatTest
[ OK ] GraphTransformationTests.MatMulIntegerToFloatTest (17 ms)
[----------] 1 test from GraphTransformationTests (17 ms total)
[----------] 1 test from QDQTransformerTests
[ RUN ] QDQTransformerTests.MatMulIntegerToFloat
[ OK ] QDQTransformerTests.MatMulIntegerToFloat (656 ms)
[----------] 1 test from QDQTransformerTests (656 ms total)
[----------] 5 tests from MatMulIntegerToFloat
[ RUN ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_U8X8
[ OK ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_U8X8 (195 ms)
[ RUN ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_U8X8
[ OK ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_U8X8 (206 ms)
[ RUN ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_S8S8
[ OK ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_S8S8 (107 ms)
[ RUN ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_S8S8
[ OK ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_S8S8 (114 ms)
[ RUN ] MatMulIntegerToFloat.MatMulInteger_With_ZeroPoint
[ OK ] MatMulIntegerToFloat.MatMulInteger_With_ZeroPoint (227 ms)
[----------] 5 tests from MatMulIntegerToFloat (854 ms total)
[----------] Global test environment tear-down
[==========] 8 tests from 4 test suites ran. (1713 ms total)
[ PASSED ] 8 tests.
memleakdbg: ----- No memory leaks detected -----
```

```
onnxruntime_test_all.exe --gtest_filter="GraphTransformationTests.MatMulIntegerToFloat*"
Note: Google Test filter = GraphTransformationTests.MatMulIntegerToFloat*
[==========] Running 2 tests from 1 test suite.
[----------] Global test environment set-up.
[----------] 2 tests from GraphTransformationTests
[ RUN ] GraphTransformationTests.MatMulIntegerToFloatTest
[ OK ] GraphTransformationTests.MatMulIntegerToFloatTest (13 ms)
[ RUN ] GraphTransformationTests.MatMulIntegerToFloat16Test
[ OK ] GraphTransformationTests.MatMulIntegerToFloat16Test (4 ms)
[----------] 2 tests from GraphTransformationTests (20 ms total)
[----------] Global test environment tear-down
[==========] 2 tests from 1 test suite ran. (22 ms total)
[ PASSED ] 2 tests.
memleakdbg: ----- No memory leaks detected -----
```

### Motivation and Context

<!-- - Why is this change required? What problem does it solve? - If it fixes an open issue, please link to the issue here. -->
This commit is contained in:
parent
2abfea5372
commit
7d4dba7e16
7 changed files with 115 additions and 172 deletions
|
|
@ -274,7 +274,8 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
|
|||
onnxruntime::kRocmExecutionProvider,
|
||||
onnxruntime::kAclExecutionProvider,
|
||||
onnxruntime::kArmNNExecutionProvider};
|
||||
|
||||
const InlinedHashSet<std::string_view> cpu_dml_eps = {onnxruntime::kCpuExecutionProvider,
|
||||
onnxruntime::kDmlExecutionProvider};
|
||||
#ifdef MLAS_TARGET_AMD64_IX86
|
||||
const bool avx2_precision_mode =
|
||||
session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsAvx2PrecisionMode, "0") == "1" && MlasPlatformU8S8Overflow();
|
||||
|
|
@ -292,7 +293,7 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
|
|||
}
|
||||
|
||||
transformers.emplace_back(std::make_unique<GemmActivationFusion>(cpu_ep));
|
||||
transformers.emplace_back(std::make_unique<MatMulIntegerToFloatFusion>(cpu_ep));
|
||||
transformers.emplace_back(std::make_unique<MatMulIntegerToFloatFusion>(cpu_dml_eps));
|
||||
transformers.emplace_back(std::make_unique<DynamicQuantizeMatMulFusion>(cpu_ep));
|
||||
|
||||
transformers.emplace_back(std::make_unique<ConvActivationFusion>(cpu_cuda_rocm_acl_armnn_eps));
|
||||
|
|
|
|||
|
|
@ -31,6 +31,24 @@ static bool CheckBiasShape(const TensorShapeProto* bias_shape) {
|
|||
return bias_last_dim > 1;
|
||||
}
|
||||
|
||||
bool HasElementDataType(const NodeArg& node_arg, int32_t data_type) {
|
||||
if (!node_arg.Exists()) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto* type_proto = node_arg.TypeAsProto();
|
||||
if (!type_proto) {
|
||||
return false;
|
||||
}
|
||||
|
||||
int32_t actual_data_type;
|
||||
if (!utils::TryGetElementDataType(*type_proto, actual_data_type)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return data_type == actual_data_type;
|
||||
}
|
||||
|
||||
/**
|
||||
MatMulIntegerToFloatFusion will fuse subgraph like below into MatMulIntegerToFloat:
|
||||
|
||||
|
|
@ -63,8 +81,10 @@ Status MatMulIntegerToFloatFusion::ApplyImpl(Graph& graph, bool& modified, int g
|
|||
auto& mul_node = *node_ptr;
|
||||
|
||||
ORT_RETURN_IF_ERROR(Recurse(mul_node, modified, graph_level, logger));
|
||||
|
||||
if (!graph_utils::IsSupportedOptypeVersionAndDomain(mul_node, "Mul", {7, 13, 14})) {
|
||||
const bool is_dml_ep = node_ptr->GetExecutionProviderType() == kDmlExecutionProvider;
|
||||
if (!graph_utils::IsSupportedOptypeVersionAndDomain(mul_node, "Mul", {7, 13, 14}) ||
|
||||
!graph_utils::IsSupportedProvider(mul_node, GetCompatibleExecutionProviders()) ||
|
||||
(!is_dml_ep && HasElementDataType(*mul_node.InputDefs()[0], ONNX_NAMESPACE::TensorProto_DataType_FLOAT16))) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -98,7 +98,12 @@ void TestMatMulIntegerToFloat(const std::vector<int64_t>& A_dims,
|
|||
test.SetOutputRelErr("Y", 1e-4f);
|
||||
#endif
|
||||
|
||||
test.Run();
|
||||
if constexpr (std::is_same_v<OType, float>) {
|
||||
test.Run();
|
||||
} else {
|
||||
test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCpuExecutionProvider});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
template <typename IType, typename WType, typename OType, bool HasZeroPoint, bool HasBias>
|
||||
|
|
@ -148,6 +153,7 @@ void RunMatMulIntegerToFloatTest(const string& model_path) {
|
|||
);
|
||||
}
|
||||
|
||||
#if USE_DML
|
||||
TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8X8_FP16) {
|
||||
RunMatMulIntegerToFloatTest<uint8_t, int8_t, MLFloat16, true, false>("testdata/matmul_integer_to_float16_int8.onnx");
|
||||
RunMatMulIntegerToFloatTest<uint8_t, uint8_t, MLFloat16, true, false>("testdata/matmul_integer_to_float16_uint8.onnx");
|
||||
|
|
@ -165,6 +171,7 @@ TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8S8_FP16) {
|
|||
// Runs the shared MatMulIntegerToFloat test driver for the int8 x int8 model with MLFloat16 output.
// NOTE(review): the two trailing bool template arguments presumably mean
// HasZeroPoint=false / HasBias=true, as the test name suggests - confirm against the driver.
TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8S8_FP16) {
  const std::string model_path = "testdata/matmul_integer_to_float16_int8_int8_bias.onnx";
  RunMatMulIntegerToFloatTest<int8_t, int8_t, MLFloat16, false, true>(model_path);
}
|
||||
#endif // USE_DML
|
||||
|
||||
TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8X8) {
|
||||
RunMatMulIntegerToFloatTest<uint8_t, int8_t, float, true, false>("testdata/matmul_integer_to_float_int8.onnx");
|
||||
|
|
@ -247,68 +254,5 @@ TEST(MatMulIntegerToFloat, MatMulInteger_With_ZeroPoint) {
|
|||
test_case({15, 14, 13}, {15, 13, 27}, {15, 1, 27});
|
||||
}
|
||||
|
||||
// Builds MatMulInteger -> Cast(to FLOAT16) -> Mul graphs whose scales are MLFloat16 and
// expects the transformed graph to contain exactly one com.microsoft.MatMulIntegerToFloat
// node. The same graph is exercised with several B scale / zero-point shapes.
TEST(MatMulIntegerToFloat, MatMulInteger_With_ZeroPoint_FP16) {
  // Shapes for one scenario; B's scale and zero point always share a shape.
  struct ShapeSet {
    std::vector<int64_t> input;
    std::vector<int64_t> weights;
    std::vector<int64_t> b_scale_zp;
  };

  const auto run_case = [](const ShapeSet& shapes) {
    // The builder calls below are kept in a fixed order: inputs are created in call order.
    const auto build_graph = [&shapes](ModelTestBuilder& builder) {
      auto* activation = builder.MakeInput<uint8_t>(shapes.input,
                                                    std::numeric_limits<uint8_t>::min(),
                                                    std::numeric_limits<uint8_t>::max());
      auto* graph_output = builder.MakeOutput();
      auto* weights = builder.MakeInitializer<int8_t>(shapes.weights,
                                                      std::numeric_limits<int8_t>::min() / 2,
                                                      std::numeric_limits<int8_t>::max() / 2);

      // MatMulInteger: quantized product with zero points for both operands.
      auto* int32_product = builder.MakeIntermediate();
      auto* a_zero_point = builder.MakeInput<uint8_t>({1},
                                                      std::numeric_limits<uint8_t>::min(),
                                                      std::numeric_limits<uint8_t>::max());
      auto* b_zero_point = builder.MakeInput<int8_t>(shapes.b_scale_zp,
                                                     std::numeric_limits<int8_t>::min() / 2,
                                                     std::numeric_limits<int8_t>::max() / 2);
      builder.AddNode("MatMulInteger",
                      {activation, weights, a_zero_point, b_zero_point},
                      {int32_product});

      // Cast: convert the integer product to FLOAT16.
      auto* fp16_product = builder.MakeIntermediate();
      Node& cast = builder.AddNode("Cast", {int32_product}, {fp16_product});
      cast.AddAttribute("to", static_cast<int64_t>(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16));

      // First Mul: combine the A and B scales.
      auto* a_scale = builder.MakeInput<MLFloat16>({1}, MLFloat16(-0.1f), MLFloat16(0.0f));
      auto* b_scale = builder.MakeInput<MLFloat16>(shapes.b_scale_zp, MLFloat16(-0.1f), MLFloat16(0.0f));
      auto* combined_scale = builder.MakeIntermediate();
      builder.AddNode("Mul", {a_scale, b_scale}, {combined_scale});

      // Second Mul: apply the combined scale to the cast product.
      builder.AddNode("Mul", {combined_scale, fp16_product}, {graph_output});
    };

    const auto expect_single_fused_node = [](InferenceSessionWrapper& session) {
      auto op_counts = CountOpsInGraph(session.GetGraph());
      EXPECT_EQ(op_counts["com.microsoft.MatMulIntegerToFloat"], 1);
    };

    TransformerTester(build_graph,
                      expect_single_fused_node,
                      TransformerLevel::Level1,
                      TransformerLevel::Level2,
                      12 /*opset_version*/,
                      1e-5 /*per_sample_tolerance*/,
                      1e-5 /*relative_per_sample_tolerance*/);
  };

  const ShapeSet cases[] = {
      // Scale Scalar
      {{5, 4, 3}, {3, 4}, {1}},
      // 2D B per-column
      {{5, 4, 3}, {3, 4}, {4}},
      {{5, 4, 3}, {3, 4}, {1, 4}},
      // ND B per-column
      {{15, 14, 13}, {15, 13, 27}, {15, 1, 27}},
  };

  for (const ShapeSet& shapes : cases) {
    run_case(shapes);
  }
}
|
||||
|
||||
} // namespace test
|
||||
} // namespace onnxruntime
|
||||
|
|
|
|||
|
|
@ -5189,6 +5189,24 @@ TEST_F(GraphTransformationTests, MatMulIntegerToFloatTest) {
|
|||
EXPECT_EQ(op_to_count["Add"], 1);
|
||||
}
|
||||
|
||||
#ifdef USE_DML
|
||||
// Loads the FP16 MatMulInteger fusion model, assigns every node to the DML execution
// provider, applies MatMulIntegerToFloatFusion at Level2 and expects exactly one
// com.microsoft.MatMulIntegerToFloat node in the resulting graph.
TEST_F(GraphTransformationTests, MatMulIntegerToFloat16Test) {
  constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/matmul_integer_to_float16_int8.onnx";

  std::shared_ptr<Model> model;
  ASSERT_STATUS_OK(Model::Load(model_uri, model, nullptr, *logger_));
  Graph& graph = model->MainGraph();

  // Every node is explicitly placed on DML before the transformer runs.
  for (auto& node : graph.Nodes()) {
    node.SetExecutionProviderType(kDmlExecutionProvider);
  }

  onnxruntime::GraphTransformerManager transformer_mgr{5};
  auto fusion = std::make_unique<MatMulIntegerToFloatFusion>();
  ASSERT_STATUS_OK(transformer_mgr.Register(std::move(fusion), TransformerLevel::Level2));
  ASSERT_STATUS_OK(transformer_mgr.ApplyTransformers(graph, TransformerLevel::Level2, *logger_));

  std::map<std::string, int> op_counts = CountOpsInGraph(graph);
  EXPECT_EQ(op_counts["com.microsoft.MatMulIntegerToFloat"], 1);
}
|
||||
#endif // USE_DML
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef DISABLE_CONTRIB_OPS
|
||||
|
|
|
|||
|
|
@ -49,15 +49,15 @@ def MakeSubGraph(suffix, has_bias): # noqa: N802
|
|||
return nodes
|
||||
|
||||
|
||||
def MakeInitializer(suffix, output_type_fp16=False): # noqa: N802
|
||||
def MakeInitializer(suffix): # noqa: N802
|
||||
return [
|
||||
helper.make_tensor("b_quantized" + suffix, TensorProto.UINT8, [2, 3], [2, 4, 5, 6, 7, 8]),
|
||||
helper.make_tensor("b_zp" + suffix, TensorProto.UINT8, [], [128]),
|
||||
helper.make_tensor("b_scale" + suffix, TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [], [1.8]),
|
||||
helper.make_tensor("b_scale" + suffix, TensorProto.FLOAT, [], [1.8]),
|
||||
]
|
||||
|
||||
|
||||
def GenerateModel(model_name, output_type_fp16=False): # noqa: N802
|
||||
def GenerateModel(model_name): # noqa: N802
|
||||
nodes = [
|
||||
helper.make_node(
|
||||
"DynamicQuantizeLinear",
|
||||
|
|
@ -71,13 +71,13 @@ def GenerateModel(model_name, output_type_fp16=False): # noqa: N802
|
|||
nodes.extend(MakeSubGraph("_3", False))
|
||||
|
||||
initializers = []
|
||||
initializers.extend(MakeInitializer("_1", output_type_fp16))
|
||||
initializers.extend(MakeInitializer("_3", output_type_fp16))
|
||||
initializers.extend(MakeInitializer("_1"))
|
||||
initializers.extend(MakeInitializer("_3"))
|
||||
|
||||
initializers.extend(
|
||||
[
|
||||
helper.make_tensor("bias_1", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3], [2, 4, 5]),
|
||||
helper.make_tensor("bias_2", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 3], [1, 2, 3, 4, 5, 6, 7, 8, 9]),
|
||||
helper.make_tensor("bias_1", TensorProto.FLOAT, [3], [2, 4, 5]),
|
||||
helper.make_tensor("bias_2", TensorProto.FLOAT, [3, 3], [1, 2, 3, 4, 5, 6, 7, 8, 9]),
|
||||
]
|
||||
)
|
||||
|
||||
|
|
@ -85,16 +85,16 @@ def GenerateModel(model_name, output_type_fp16=False): # noqa: N802
|
|||
nodes,
|
||||
"MatMulIntegerToFloat_fusion", # name
|
||||
[ # inputs
|
||||
helper.make_tensor_value_info("input", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 2]),
|
||||
helper.make_tensor_value_info("input", TensorProto.FLOAT, [3, 2]),
|
||||
# matrix b corresponding inputs for subgraph 2
|
||||
helper.make_tensor_value_info("b_quantized_2", TensorProto.UINT8, [2, 3]),
|
||||
helper.make_tensor_value_info("b_zp_2", TensorProto.UINT8, [1]),
|
||||
helper.make_tensor_value_info("b_scale_2", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [1]),
|
||||
helper.make_tensor_value_info("b_scale_2", TensorProto.FLOAT, [1]),
|
||||
],
|
||||
[ # outputs
|
||||
helper.make_tensor_value_info("output_1", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 3]),
|
||||
helper.make_tensor_value_info("output_2", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 3]),
|
||||
helper.make_tensor_value_info("output_3", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 3]),
|
||||
helper.make_tensor_value_info("output_1", TensorProto.FLOAT, [3, 3]),
|
||||
helper.make_tensor_value_info("output_2", TensorProto.FLOAT, [3, 3]),
|
||||
helper.make_tensor_value_info("output_3", TensorProto.FLOAT, [3, 3]),
|
||||
],
|
||||
initializers,
|
||||
)
|
||||
|
|
@ -104,5 +104,4 @@ def GenerateModel(model_name, output_type_fp16=False): # noqa: N802
|
|||
|
||||
|
||||
if __name__ == "__main__":
|
||||
GenerateModel("matmul_integer_to_float.onnx")
|
||||
GenerateModel("matmul_integer_to_float16.onnx", output_type_fp16=True)
|
||||
GenerateModel("matmul_integer_to_float.onnx")
|
||||
|
|
@ -1,90 +0,0 @@
|
|||
:Ö
|
||||
Q
|
||||
inputa_quantizeda_scalea_zpDynamicQuantizeLinear"DynamicQuantizeLinear
|
||||
a
|
||||
a_quantized
|
||||
b_quantized_1
|
||||
a_zp
|
||||
b_zp_1matmul_output_int32_1MatMulInteger_1"
MatMulInteger
|
||||
4
|
||||
a_scale
|
||||
b_scale_1multiplier_1mul_right_1"Mul
|
||||
G
|
||||
matmul_output_int32_1matmul_output_float_1cast_1"Cast*
|
||||
to
|
||||
F
|
||||
matmul_output_float_1
|
||||
multiplier_1mul_output_1mul_bottom_1"Mul
|
||||
1
|
||||
mul_output_1
|
||||
bias_1output_1
|
||||
bias_add_1"Add
|
||||
a
|
||||
a_quantized
|
||||
b_quantized_2
|
||||
a_zp
|
||||
b_zp_2matmul_output_int32_2MatMulInteger_2"
MatMulInteger
|
||||
4
|
||||
a_scale
|
||||
b_scale_2multiplier_2mul_right_2"Mul
|
||||
G
|
||||
matmul_output_int32_2matmul_output_float_2cast_2"Cast*
|
||||
to
|
||||
F
|
||||
matmul_output_float_2
|
||||
multiplier_2mul_output_2mul_bottom_2"Mul
|
||||
1
|
||||
mul_output_2
|
||||
bias_2output_2
|
||||
bias_add_2"Add
|
||||
a
|
||||
a_quantized
|
||||
b_quantized_3
|
||||
a_zp
|
||||
b_zp_3matmul_output_int32_3MatMulInteger_3"
MatMulInteger
|
||||
4
|
||||
a_scale
|
||||
b_scale_3multiplier_3mul_right_3"Mul
|
||||
G
|
||||
matmul_output_int32_3matmul_output_float_3cast_3"Cast*
|
||||
to
|
||||
B
|
||||
matmul_output_float_3
|
||||
multiplier_3output_3mul_bottom_3"MulMatMulIntegerToFloat_fusion**B
b_quantized_1**€Bb_zp_1*
|
||||
*³~B b_scale_1**B
b_quantized_3**€Bb_zp_3*
|
||||
*³~B b_scale_3*
|
||||
* €€€ˆ€ŠBbias_1**
|
||||
*€x€€€„€ˆ€Š€Œ€Ž€<>€‘Bbias_2Z
|
||||
input
|
||||
|
||||
|
||||
|
||||
Z
|
||||
b_quantized_2
|
||||
|
||||
|
||||
Z
|
||||
b_zp_2
|
||||
|
||||
|
||||
Z
|
||||
b_scale_2
|
||||
|
||||
|
||||
|
||||
b
|
||||
output_1
|
||||
|
||||
|
||||
|
||||
b
|
||||
output_2
|
||||
|
||||
|
||||
|
||||
b
|
||||
output_3
|
||||
|
||||
|
||||
|
||||
B
|
||||
51
onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx
vendored
Normal file
51
onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx
vendored
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
:Ě
|
||||
U
|
||||
A
|
||||
B
|
||||
a_zero_point
|
||||
b_zero_pointmatmul_output_int32
MatMulInteger"
MatMulInteger
|
||||
.
|
||||
a_scale
|
||||
b_scale
|
||||
multiplier mul_right"Mul
|
||||
A
|
||||
matmul_output_int32matmul_output_floatcast"Cast*
|
||||
to
|
||||
|
||||
5
|
||||
matmul_output_float
|
||||
|
||||
multiplierY
|
||||
mul_bottom"MulDynamicQuantizeMatMul_fusionZ
|
||||
A
|
||||
|
||||
|
||||
M
|
||||
KZ
|
||||
B
|
||||
|
||||
|
||||
K
|
||||
NZ
|
||||
a_scale
|
||||
|
||||
|
||||
|
||||
Z
|
||||
b_scale
|
||||
|
||||
|
||||
CZ
|
||||
a_zero_point
|
||||
|
||||
|
||||
Z
|
||||
b_zero_point
|
||||
|
||||
Cb
|
||||
Y
|
||||
|
||||
|
||||
|
||||
M
|
||||
NB
|
||||
Loading…
Reference in a new issue