diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc index 163751c38e..fd3f9c2ce6 100644 --- a/onnxruntime/core/optimizer/graph_transformer_utils.cc +++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc @@ -274,7 +274,8 @@ InlinedVector> GenerateTransformers( onnxruntime::kRocmExecutionProvider, onnxruntime::kAclExecutionProvider, onnxruntime::kArmNNExecutionProvider}; - + const InlinedHashSet cpu_dml_eps = {onnxruntime::kCpuExecutionProvider, + onnxruntime::kDmlExecutionProvider}; #ifdef MLAS_TARGET_AMD64_IX86 const bool avx2_precision_mode = session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsAvx2PrecisionMode, "0") == "1" && MlasPlatformU8S8Overflow(); @@ -292,7 +293,7 @@ InlinedVector> GenerateTransformers( } transformers.emplace_back(std::make_unique(cpu_ep)); - transformers.emplace_back(std::make_unique(cpu_ep)); + transformers.emplace_back(std::make_unique(cpu_dml_eps)); transformers.emplace_back(std::make_unique(cpu_ep)); transformers.emplace_back(std::make_unique(cpu_cuda_rocm_acl_armnn_eps)); diff --git a/onnxruntime/core/optimizer/matmul_integer_to_float.cc b/onnxruntime/core/optimizer/matmul_integer_to_float.cc index b9fe5dcad0..4fee1a6ce2 100644 --- a/onnxruntime/core/optimizer/matmul_integer_to_float.cc +++ b/onnxruntime/core/optimizer/matmul_integer_to_float.cc @@ -31,6 +31,24 @@ static bool CheckBiasShape(const TensorShapeProto* bias_shape) { return bias_last_dim > 1; } +bool HasElementDataType(const NodeArg& node_arg, int32_t data_type) { + if (!node_arg.Exists()) { + return false; + } + + const auto* type_proto = node_arg.TypeAsProto(); + if (!type_proto) { + return false; + } + + int32_t actual_data_type; + if (!utils::TryGetElementDataType(*type_proto, actual_data_type)) { + return false; + } + + return data_type == actual_data_type; +} + /** MatMulIntegerToFloatFusion will fuse subgraph like below into MatMulIntegerToFloat: @@ -63,8 +81,10 @@ Status MatMulIntegerToFloatFusion::ApplyImpl(Graph& graph, bool& modified, int g auto& mul_node = *node_ptr; ORT_RETURN_IF_ERROR(Recurse(mul_node, modified, graph_level, logger)); - - if (!graph_utils::IsSupportedOptypeVersionAndDomain(mul_node, "Mul", {7, 13, 14})) { + const bool is_dml_ep = node_ptr->GetExecutionProviderType() == kDmlExecutionProvider; + if (!graph_utils::IsSupportedOptypeVersionAndDomain(mul_node, "Mul", {7, 13, 14}) || + !graph_utils::IsSupportedProvider(mul_node, GetCompatibleExecutionProviders()) || + (!is_dml_ep && HasElementDataType(*mul_node.InputDefs()[0], ONNX_NAMESPACE::TensorProto_DataType_FLOAT16))) { continue; } diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc index 7eadd3fffc..51d9a57b5e 100644 --- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc @@ -98,7 +98,12 @@ void TestMatMulIntegerToFloat(const std::vector& A_dims, test.SetOutputRelErr("Y", 1e-4f); #endif - test.Run(); + if constexpr (std::is_same_v) { + test.Run(); + } else { + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCpuExecutionProvider}); + } + } template @@ -148,6 +153,7 @@ void RunMatMulIntegerToFloatTest(const string& model_path) { ); } +#if USE_DML TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8X8_FP16) { RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float16_int8.onnx"); RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float16_uint8.onnx"); @@ -165,6 +171,7 @@ TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8S8_FP16) { TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8S8_FP16) { RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float16_int8_int8_bias.onnx"); } +#endif // USE_DML TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8X8) { RunMatMulIntegerToFloatTest("testdata/matmul_integer_to_float_int8.onnx"); @@ -247,68 +254,5 @@ TEST(MatMulIntegerToFloat, MatMulInteger_With_ZeroPoint) { test_case({15, 14, 13}, {15, 13, 27}, {15, 1, 27}); } -TEST(MatMulIntegerToFloat, MatMulInteger_With_ZeroPoint_FP16) { - auto test_case = [&](const std::vector& input_shape, - const std::vector& weights_shape, - const std::vector& b_scale_zp_shape) { - auto build_test_case = [&](ModelTestBuilder& builder) { - auto* input_arg = builder.MakeInput(input_shape, - std::numeric_limits::min(), - std::numeric_limits::max()); - auto* output_arg = builder.MakeOutput(); - auto* weight = builder.MakeInitializer(weights_shape, - std::numeric_limits::min() / 2, - std::numeric_limits::max() / 2); - - // add MatMulInteger - auto* matmul_integer_output = builder.MakeIntermediate(); - auto* A_zp_arg = builder.MakeInput({1}, - std::numeric_limits::min(), - std::numeric_limits::max()); - auto* B_zp_arg = builder.MakeInput(b_scale_zp_shape, - std::numeric_limits::min() / 2, - std::numeric_limits::max() / 2); - builder.AddNode("MatMulInteger", {input_arg, weight, A_zp_arg, B_zp_arg}, {matmul_integer_output}); - - // add Cast - auto* cast_output = builder.MakeIntermediate(); - Node& cast_node = builder.AddNode("Cast", {matmul_integer_output}, {cast_output}); - cast_node.AddAttribute("to", static_cast(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16)); - - // add Mul1 - auto* A_scale_arg = builder.MakeInput({1}, MLFloat16(-0.1f), MLFloat16(0.0f)); - auto* B_scale_arg = builder.MakeInput(b_scale_zp_shape, MLFloat16(-0.1f), MLFloat16(0.0f)); - auto* mul1_output = builder.MakeIntermediate(); - builder.AddNode("Mul", {A_scale_arg, B_scale_arg}, {mul1_output}); - - // add Mul2 - builder.AddNode("Mul", {mul1_output, cast_output}, {output_arg}); - }; - - auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) { - auto op_to_count = CountOpsInGraph(session.GetGraph()); - EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 1); - }; - - TransformerTester(build_test_case, - check_mp_reshape_graph, - TransformerLevel::Level1, - TransformerLevel::Level2, - 12 /*opset_version*/, - 1e-5 /*per_sample_tolerance*/, - 1e-5 /*relative_per_sample_tolerance*/); - }; - - // Scale Scalar - test_case({5, 4, 3}, {3, 4}, {1}); - - // 2D B per-column - test_case({5, 4, 3}, {3, 4}, {4}); - test_case({5, 4, 3}, {3, 4}, {1, 4}); - - // ND B per-column - test_case({15, 14, 13}, {15, 13, 27}, {15, 1, 27}); -} - } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index c3dc2734d8..4f45f48da7 100755 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -5189,6 +5189,24 @@ TEST_F(GraphTransformationTests, MatMulIntegerToFloatTest) { EXPECT_EQ(op_to_count["Add"], 1); } +#ifdef USE_DML + TEST_F(GraphTransformationTests, MatMulIntegerToFloat16Test) { + constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/matmul_integer_to_float16_int8.onnx"; + std::shared_ptr p_model; + ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_)); + Graph& graph = p_model->MainGraph(); + + for (auto& node : graph.Nodes()) { + node.SetExecutionProviderType(kDmlExecutionProvider); + } + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::make_unique(), TransformerLevel::Level2)); + ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, *logger_)); + std::map op_to_count = CountOpsInGraph(graph); + EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 1); +} +#endif // USE_DML + #endif #ifndef DISABLE_CONTRIB_OPS diff --git a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py index ff35f81171..60bdd92dc9 100644 --- a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py +++ b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py @@ -49,15 +49,15 @@ def MakeSubGraph(suffix, has_bias): # noqa: N802 return nodes -def MakeInitializer(suffix, output_type_fp16=False): # noqa: N802 +def MakeInitializer(suffix): # noqa: N802 return [ helper.make_tensor("b_quantized" + suffix, TensorProto.UINT8, [2, 3], [2, 4, 5, 6, 7, 8]), helper.make_tensor("b_zp" + suffix, TensorProto.UINT8, [], [128]), - helper.make_tensor("b_scale" + suffix, TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [], [1.8]), + helper.make_tensor("b_scale" + suffix, TensorProto.FLOAT, [], [1.8]), ] -def GenerateModel(model_name, output_type_fp16=False): # noqa: N802 +def GenerateModel(model_name): # noqa: N802 nodes = [ helper.make_node( "DynamicQuantizeLinear", @@ -71,13 +71,13 @@ def GenerateModel(model_name, output_type_fp16=False): # noqa: N802 nodes.extend(MakeSubGraph("_3", False)) initializers = [] - initializers.extend(MakeInitializer("_1", output_type_fp16)) - initializers.extend(MakeInitializer("_3", output_type_fp16)) + initializers.extend(MakeInitializer("_1")) + initializers.extend(MakeInitializer("_3")) initializers.extend( [ - helper.make_tensor("bias_1", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3], [2, 4, 5]), - helper.make_tensor("bias_2", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 3], [1, 2, 3, 4, 5, 6, 7, 8, 9]), + helper.make_tensor("bias_1", TensorProto.FLOAT, [3], [2, 4, 5]), + helper.make_tensor("bias_2", TensorProto.FLOAT, [3, 3], [1, 2, 3, 4, 5, 6, 7, 8, 9]), ] ) @@ -85,16 +85,16 @@ def GenerateModel(model_name, output_type_fp16=False): # noqa: N802 nodes, "MatMulIntegerToFloat_fusion", # name [ # inputs - helper.make_tensor_value_info("input", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 2]), + helper.make_tensor_value_info("input", TensorProto.FLOAT, [3, 2]), # matrix b corresponding inputs for subgraph 2 helper.make_tensor_value_info("b_quantized_2", TensorProto.UINT8, [2, 3]), helper.make_tensor_value_info("b_zp_2", TensorProto.UINT8, [1]), - helper.make_tensor_value_info("b_scale_2", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [1]), + helper.make_tensor_value_info("b_scale_2", TensorProto.FLOAT, [1]), ], [ # outputs - helper.make_tensor_value_info("output_1", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 3]), - helper.make_tensor_value_info("output_2", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 3]), - helper.make_tensor_value_info("output_3", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 3]), + helper.make_tensor_value_info("output_1", TensorProto.FLOAT, [3, 3]), + helper.make_tensor_value_info("output_2", TensorProto.FLOAT, [3, 3]), + helper.make_tensor_value_info("output_3", TensorProto.FLOAT, [3, 3]), ], initializers, ) @@ -104,5 +104,4 @@ def GenerateModel(model_name, output_type_fp16=False): # noqa: N802 if __name__ == "__main__": - GenerateModel("matmul_integer_to_float.onnx") - GenerateModel("matmul_integer_to_float16.onnx", output_type_fp16=True) + GenerateModel("matmul_integer_to_float.onnx") \ No newline at end of file diff --git a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16.onnx b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16.onnx deleted file mode 100644 index 67d50eac6f..0000000000 --- a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16.onnx +++ /dev/null @@ -1,90 +0,0 @@ - : -Q -input a_quantizeda_scalea_zpDynamicQuantizeLinear"DynamicQuantizeLinear -a - a_quantized - b_quantized_1 -a_zp -b_zp_1matmul_output_int32_1MatMulInteger_1" MatMulInteger -4 -a_scale - b_scale_1 multiplier_1 mul_right_1"Mul -G -matmul_output_int32_1matmul_output_float_1cast_1"Cast* -to -F -matmul_output_float_1 - multiplier_1 mul_output_1 mul_bottom_1"Mul -1 - mul_output_1 -bias_1output_1 -bias_add_1"Add -a - a_quantized - b_quantized_2 -a_zp -b_zp_2matmul_output_int32_2MatMulInteger_2" MatMulInteger -4 -a_scale - b_scale_2 multiplier_2 mul_right_2"Mul -G -matmul_output_int32_2matmul_output_float_2cast_2"Cast* -to -F -matmul_output_float_2 - multiplier_2 mul_output_2 mul_bottom_2"Mul -1 - mul_output_2 -bias_2output_2 -bias_add_2"Add -a - a_quantized - b_quantized_3 -a_zp -b_zp_3matmul_output_int32_3MatMulInteger_3" MatMulInteger -4 -a_scale - b_scale_3 multiplier_3 mul_right_3"Mul -G -matmul_output_int32_3matmul_output_float_3cast_3"Cast* -to -B -matmul_output_float_3 - multiplier_3output_3 mul_bottom_3"MulMatMulIntegerToFloat_fusion**B b_quantized_1**Bb_zp_1* -*~B b_scale_1**B b_quantized_3**Bb_zp_3* -*~B b_scale_3* -* Bbias_1** -*xBbias_2Z -input -  - - -Z - b_quantized_2 -  - -Z -b_zp_2 - - -Z - b_scale_2 - - - -b -output_1 -  - - -b -output_2 -  - - -b -output_3 -  - - -B \ No newline at end of file diff --git a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx new file mode 100644 index 0000000000..22293b0d10 --- /dev/null +++ b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx @@ -0,0 +1,51 @@ + : +U +A +B + a_zero_point + b_zero_pointmatmul_output_int32 MatMulInteger" MatMulInteger +. +a_scale +b_scale +multiplier mul_right"Mul +A +matmul_output_int32matmul_output_floatcast"Cast* +to + +5 +matmul_output_float + +multiplierY +mul_bottom"MulDynamicQuantizeMatMul_fusionZ +A + + +M +KZ +B + + +K +NZ +a_scale + + + +Z +b_scale +  + +CZ + a_zero_point + + +Z + b_zero_point +  +Cb +Y + + + +M +NB \ No newline at end of file