From 7d4dba7e1676785bfddea5cf6d866585b432ebe7 Mon Sep 17 00:00:00 2001 From: raoanag <127366241+raoanag@users.noreply.github.com> Date: Fri, 3 Nov 2023 10:05:09 -0700 Subject: [PATCH] Disable MatMulIntegerToFloat transformation for FP16 on CPU EP (#18239) ### Description MatMulIntegerToFloat is updated to support FP16. The nodes for the FP16 transformation use an FP16 "Mul", which is not directly supported by the CPU. For now, the FP16 transformation is only supported for the DML EP, and all FP16 tests are disabled on CPU. Test results without the `-use_dml` build flag: ``` onnxruntime_test_all.exe --gtest_filter="*MatMulIntegerToFloat*" Note: Google Test filter = *MatMulIntegerToFloat* [==========] Running 8 tests from 4 test suites. [----------] Global test environment set-up. [----------] 1 test from CPU_U8S8_Precision_Tests [ RUN ] CPU_U8S8_Precision_Tests.MatMulIntegerToFloat [ OK ] CPU_U8S8_Precision_Tests.MatMulIntegerToFloat (181 ms) [----------] 1 test from CPU_U8S8_Precision_Tests (181 ms total) [----------] 1 test from GraphTransformationTests [ RUN ] GraphTransformationTests.MatMulIntegerToFloatTest [ OK ] GraphTransformationTests.MatMulIntegerToFloatTest (17 ms) [----------] 1 test from GraphTransformationTests (17 ms total) [----------] 1 test from QDQTransformerTests [ RUN ] QDQTransformerTests.MatMulIntegerToFloat [ OK ] QDQTransformerTests.MatMulIntegerToFloat (656 ms) [----------] 1 test from QDQTransformerTests (656 ms total) [----------] 5 tests from MatMulIntegerToFloat [ RUN ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_U8X8 [ OK ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_U8X8 (195 ms) [ RUN ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_U8X8 [ OK ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_U8X8 (206 ms) [ RUN ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_S8S8 [ OK ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_S8S8 (107 ms) [ RUN ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_S8S8 [ OK ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_S8S8 (114 ms) [ RUN ] 
MatMulIntegerToFloat.MatMulInteger_With_ZeroPoint [ OK ] MatMulIntegerToFloat.MatMulInteger_With_ZeroPoint (227 ms) [----------] 5 tests from MatMulIntegerToFloat (854 ms total) [----------] Global test environment tear-down [==========] 8 tests from 4 test suites ran. (1713 ms total) [ PASSED ] 8 tests. memleakdbg: ----- No memory leaks detected ----- ``` ``` onnxruntime_test_all.exe --gtest_filter="GraphTransformationTests.MatMulIntegerToFloat*" Note: Google Test filter = GraphTransformationTests.MatMulIntegerToFloat* [==========] Running 2 tests from 1 test suite. [----------] Global test environment set-up. [----------] 2 tests from GraphTransformationTests [ RUN ] GraphTransformationTests.MatMulIntegerToFloatTest [ OK ] GraphTransformationTests.MatMulIntegerToFloatTest (13 ms) [ RUN ] GraphTransformationTests.MatMulIntegerToFloat16Test [ OK ] GraphTransformationTests.MatMulIntegerToFloat16Test (4 ms) [----------] 2 tests from GraphTransformationTests (20 ms total) [----------] Global test environment tear-down [==========] 2 tests from 1 test suite ran. (22 ms total) [ PASSED ] 2 tests. 
memleakdbg: ----- No memory leaks detected ----- ``` ### Motivation and Context --- .../core/optimizer/graph_transformer_utils.cc | 5 +- .../core/optimizer/matmul_integer_to_float.cc | 24 ++++- .../matmul_integer_to_float_test.cc | 72 ++------------- .../test/optimizer/graph_transform_test.cc | 18 ++++ .../fusion/matmul_integer_to_float.py | 27 +++--- .../fusion/matmul_integer_to_float16.onnx | 90 ------------------- .../matmul_integer_to_float16_int8.onnx | 51 +++++++++++ 7 files changed, 115 insertions(+), 172 deletions(-) delete mode 100644 onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16.onnx create mode 100644 onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc index 163751c38e..fd3f9c2ce6 100644 --- a/onnxruntime/core/optimizer/graph_transformer_utils.cc +++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc @@ -274,7 +274,8 @@ InlinedVector> GenerateTransformers( onnxruntime::kRocmExecutionProvider, onnxruntime::kAclExecutionProvider, onnxruntime::kArmNNExecutionProvider}; - + const InlinedHashSet cpu_dml_eps = {onnxruntime::kCpuExecutionProvider, + onnxruntime::kDmlExecutionProvider}; #ifdef MLAS_TARGET_AMD64_IX86 const bool avx2_precision_mode = session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsAvx2PrecisionMode, "0") == "1" && MlasPlatformU8S8Overflow(); @@ -292,7 +293,7 @@ InlinedVector> GenerateTransformers( } transformers.emplace_back(std::make_unique(cpu_ep)); - transformers.emplace_back(std::make_unique(cpu_ep)); + transformers.emplace_back(std::make_unique(cpu_dml_eps)); transformers.emplace_back(std::make_unique(cpu_ep)); transformers.emplace_back(std::make_unique(cpu_cuda_rocm_acl_armnn_eps)); diff --git a/onnxruntime/core/optimizer/matmul_integer_to_float.cc b/onnxruntime/core/optimizer/matmul_integer_to_float.cc index b9fe5dcad0..4fee1a6ce2 100644 --- 
a/onnxruntime/core/optimizer/matmul_integer_to_float.cc +++ b/onnxruntime/core/optimizer/matmul_integer_to_float.cc @@ -31,6 +31,24 @@ static bool CheckBiasShape(const TensorShapeProto* bias_shape) { return bias_last_dim > 1; } +bool HasElementDataType(const NodeArg& node_arg, int32_t data_type) { + if (!node_arg.Exists()) { + return false; + } + + const auto* type_proto = node_arg.TypeAsProto(); + if (!type_proto) { + return false; + } + + int32_t actual_data_type; + if (!utils::TryGetElementDataType(*type_proto, actual_data_type)) { + return false; + } + + return data_type == actual_data_type; +} + /** MatMulIntegerToFloatFusion will fuse subgraph like below into MatMulIntegerToFloat: @@ -63,8 +81,10 @@ Status MatMulIntegerToFloatFusion::ApplyImpl(Graph& graph, bool& modified, int g auto& mul_node = *node_ptr; ORT_RETURN_IF_ERROR(Recurse(mul_node, modified, graph_level, logger)); - - if (!graph_utils::IsSupportedOptypeVersionAndDomain(mul_node, "Mul", {7, 13, 14})) { + const bool is_dml_ep = node_ptr->GetExecutionProviderType() == kDmlExecutionProvider; + if (!graph_utils::IsSupportedOptypeVersionAndDomain(mul_node, "Mul", {7, 13, 14}) || + !graph_utils::IsSupportedProvider(mul_node, GetCompatibleExecutionProviders()) || + (!is_dml_ep && HasElementDataType(*mul_node.InputDefs()[0], ONNX_NAMESPACE::TensorProto_DataType_FLOAT16))) { continue; } diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc index 7eadd3fffc..51d9a57b5e 100644 --- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc @@ -98,7 +98,12 @@ void TestMatMulIntegerToFloat(const std::vector& A_dims, test.SetOutputRelErr("Y", 1e-4f); #endif - test.Run(); + if constexpr (std::is_same_v) { + test.Run(); + } else { + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCpuExecutionProvider}); + } + } template @@ -148,6 +153,7 @@ void 
RunMatMulIntegerToFloatTest(const string& model_path) { ); } +#if USE_DML TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8X8_FP16) { RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float16_int8.onnx"); RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float16_uint8.onnx"); @@ -165,6 +171,7 @@ TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8S8_FP16) { TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8S8_FP16) { RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float16_int8_int8_bias.onnx"); } +#endif // USE_DML TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8X8) { RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_int8.onnx"); @@ -247,68 +254,5 @@ TEST(MatMulIntegerToFloat, MatMulInteger_With_ZeroPoint) { test_case({15, 14, 13}, {15, 13, 27}, {15, 1, 27}); } -TEST(MatMulIntegerToFloat, MatMulInteger_With_ZeroPoint_FP16) { - auto test_case = [&](const std::vector& input_shape, - const std::vector& weights_shape, - const std::vector& b_scale_zp_shape) { - auto build_test_case = [&](ModelTestBuilder& builder) { - auto* input_arg = builder.MakeInput(input_shape, - std::numeric_limits::min(), - std::numeric_limits::max()); - auto* output_arg = builder.MakeOutput(); - auto* weight = builder.MakeInitializer(weights_shape, - std::numeric_limits::min() / 2, - std::numeric_limits::max() / 2); - - // add MatMulInteger - auto* matmul_integer_output = builder.MakeIntermediate(); - auto* A_zp_arg = builder.MakeInput({1}, - std::numeric_limits::min(), - std::numeric_limits::max()); - auto* B_zp_arg = builder.MakeInput(b_scale_zp_shape, - std::numeric_limits::min() / 2, - std::numeric_limits::max() / 2); - builder.AddNode("MatMulInteger", {input_arg, weight, A_zp_arg, B_zp_arg}, {matmul_integer_output}); - - // add Cast - auto* cast_output = builder.MakeIntermediate(); - Node& cast_node = builder.AddNode("Cast", {matmul_integer_output}, {cast_output}); - cast_node.AddAttribute("to", 
static_cast(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16)); - - // add Mul1 - auto* A_scale_arg = builder.MakeInput({1}, MLFloat16(-0.1f), MLFloat16(0.0f)); - auto* B_scale_arg = builder.MakeInput(b_scale_zp_shape, MLFloat16(-0.1f), MLFloat16(0.0f)); - auto* mul1_output = builder.MakeIntermediate(); - builder.AddNode("Mul", {A_scale_arg, B_scale_arg}, {mul1_output}); - - // add Mul2 - builder.AddNode("Mul", {mul1_output, cast_output}, {output_arg}); - }; - - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { - auto op_to_count = CountOpsInGraph(session.GetGraph()); - EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 1); - }; - - TransformerTester(build_test_case, - check_mp_reshape_graph, - TransformerLevel::Level1, - TransformerLevel::Level2, - 12 /*opset_version*/, - 1e-5 /*per_sample_tolerance*/, - 1e-5 /*relative_per_sample_tolerance*/); - }; - - // Scale Scalar - test_case({5, 4, 3}, {3, 4}, {1}); - - // 2D B per-column - test_case({5, 4, 3}, {3, 4}, {4}); - test_case({5, 4, 3}, {3, 4}, {1, 4}); - - // ND B per-column - test_case({15, 14, 13}, {15, 13, 27}, {15, 1, 27}); -} - } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index c3dc2734d8..4f45f48da7 100755 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -5189,6 +5189,24 @@ TEST_F(GraphTransformationTests, MatMulIntegerToFloatTest) { EXPECT_EQ(op_to_count["Add"], 1); } +#ifdef USE_DML + TEST_F(GraphTransformationTests, MatMulIntegerToFloat16Test) { + constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/matmul_integer_to_float16_int8.onnx"; + std::shared_ptr p_model; + ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_)); + Graph& graph = p_model->MainGraph(); + + for (auto& node : graph.Nodes()) { + node.SetExecutionProviderType(kDmlExecutionProvider); + } + 
onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::make_unique(), TransformerLevel::Level2)); + ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, *logger_)); + std::map op_to_count = CountOpsInGraph(graph); + EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 1); +} +#endif // USE_DML + #endif #ifndef DISABLE_CONTRIB_OPS diff --git a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py index ff35f81171..60bdd92dc9 100644 --- a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py +++ b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py @@ -49,15 +49,15 @@ def MakeSubGraph(suffix, has_bias): # noqa: N802 return nodes -def MakeInitializer(suffix, output_type_fp16=False): # noqa: N802 +def MakeInitializer(suffix): # noqa: N802 return [ helper.make_tensor("b_quantized" + suffix, TensorProto.UINT8, [2, 3], [2, 4, 5, 6, 7, 8]), helper.make_tensor("b_zp" + suffix, TensorProto.UINT8, [], [128]), - helper.make_tensor("b_scale" + suffix, TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [], [1.8]), + helper.make_tensor("b_scale" + suffix, TensorProto.FLOAT, [], [1.8]), ] -def GenerateModel(model_name, output_type_fp16=False): # noqa: N802 +def GenerateModel(model_name): # noqa: N802 nodes = [ helper.make_node( "DynamicQuantizeLinear", @@ -71,13 +71,13 @@ def GenerateModel(model_name, output_type_fp16=False): # noqa: N802 nodes.extend(MakeSubGraph("_3", False)) initializers = [] - initializers.extend(MakeInitializer("_1", output_type_fp16)) - initializers.extend(MakeInitializer("_3", output_type_fp16)) + initializers.extend(MakeInitializer("_1")) + initializers.extend(MakeInitializer("_3")) initializers.extend( [ - helper.make_tensor("bias_1", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3], [2, 
4, 5]), - helper.make_tensor("bias_2", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 3], [1, 2, 3, 4, 5, 6, 7, 8, 9]), + helper.make_tensor("bias_1", TensorProto.FLOAT, [3], [2, 4, 5]), + helper.make_tensor("bias_2", TensorProto.FLOAT, [3, 3], [1, 2, 3, 4, 5, 6, 7, 8, 9]), ] ) @@ -85,16 +85,16 @@ def GenerateModel(model_name, output_type_fp16=False): # noqa: N802 nodes, "MatMulIntegerToFloat_fusion", # name [ # inputs - helper.make_tensor_value_info("input", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 2]), + helper.make_tensor_value_info("input", TensorProto.FLOAT, [3, 2]), # matrix b corresponding inputs for subgraph 2 helper.make_tensor_value_info("b_quantized_2", TensorProto.UINT8, [2, 3]), helper.make_tensor_value_info("b_zp_2", TensorProto.UINT8, [1]), - helper.make_tensor_value_info("b_scale_2", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [1]), + helper.make_tensor_value_info("b_scale_2", TensorProto.FLOAT, [1]), ], [ # outputs - helper.make_tensor_value_info("output_1", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 3]), - helper.make_tensor_value_info("output_2", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 3]), - helper.make_tensor_value_info("output_3", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 3]), + helper.make_tensor_value_info("output_1", TensorProto.FLOAT, [3, 3]), + helper.make_tensor_value_info("output_2", TensorProto.FLOAT, [3, 3]), + helper.make_tensor_value_info("output_3", TensorProto.FLOAT, [3, 3]), ], initializers, ) @@ -104,5 +104,4 @@ def GenerateModel(model_name, output_type_fp16=False): # noqa: N802 if __name__ == "__main__": - GenerateModel("matmul_integer_to_float.onnx") - GenerateModel("matmul_integer_to_float16.onnx", output_type_fp16=True) + GenerateModel("matmul_integer_to_float.onnx") \ No newline at end of file diff --git a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16.onnx 
b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16.onnx deleted file mode 100644 index 67d50eac6f..0000000000 --- a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16.onnx +++ /dev/null @@ -1,90 +0,0 @@ - : -Q -input a_quantizeda_scalea_zpDynamicQuantizeLinear"DynamicQuantizeLinear -a - a_quantized - b_quantized_1 -a_zp -b_zp_1matmul_output_int32_1MatMulInteger_1" MatMulInteger -4 -a_scale - b_scale_1 multiplier_1 mul_right_1"Mul -G -matmul_output_int32_1matmul_output_float_1cast_1"Cast* -to -F -matmul_output_float_1 - multiplier_1 mul_output_1 mul_bottom_1"Mul -1 - mul_output_1 -bias_1output_1 -bias_add_1"Add -a - a_quantized - b_quantized_2 -a_zp -b_zp_2matmul_output_int32_2MatMulInteger_2" MatMulInteger -4 -a_scale - b_scale_2 multiplier_2 mul_right_2"Mul -G -matmul_output_int32_2matmul_output_float_2cast_2"Cast* -to -F -matmul_output_float_2 - multiplier_2 mul_output_2 mul_bottom_2"Mul -1 - mul_output_2 -bias_2output_2 -bias_add_2"Add -a - a_quantized - b_quantized_3 -a_zp -b_zp_3matmul_output_int32_3MatMulInteger_3" MatMulInteger -4 -a_scale - b_scale_3 multiplier_3 mul_right_3"Mul -G -matmul_output_int32_3matmul_output_float_3cast_3"Cast* -to -B -matmul_output_float_3 - multiplier_3output_3 mul_bottom_3"MulMatMulIntegerToFloat_fusion**B b_quantized_1**Bb_zp_1* -*~B b_scale_1**B b_quantized_3**Bb_zp_3* -*~B b_scale_3* -* Bbias_1** -*xBbias_2Z -input -  - - -Z - b_quantized_2 -  - -Z -b_zp_2 - - -Z - b_scale_2 - - - -b -output_1 -  - - -b -output_2 -  - - -b -output_3 -  - - -B \ No newline at end of file diff --git a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx new file mode 100644 index 0000000000..22293b0d10 --- /dev/null +++ b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx @@ -0,0 +1,51 @@ + : +U +A +B + a_zero_point + b_zero_pointmatmul_output_int32 MatMulInteger" 
MatMulInteger +. +a_scale +b_scale +multiplier mul_right"Mul +A +matmul_output_int32matmul_output_floatcast"Cast* +to + +5 +matmul_output_float + +multiplierY +mul_bottom"MulDynamicQuantizeMatMul_fusionZ +A + + +M +KZ +B + + +K +NZ +a_scale + + + +Z +b_scale +  + +CZ + a_zero_point + + +Z + b_zero_point +  +Cb +Y + + + +M +NB \ No newline at end of file