Disable MatMulIntegerToFloat transformation for FP16 on CPU EP (#18239)

### Description

MatMulIntegerToFloat is updated to support FP16. The nodes involved in the FP16 transformation use an FP16 "Mul", which is not directly supported by the CPU EP. For now, the FP16 transformation is only supported for the DML EP, and all FP16 tests are disabled on CPU.

Test results without the `-use_dml` build flag:

```
onnxruntime_test_all.exe --gtest_filter="*MatMulIntegerToFloat*"
Note: Google Test filter = *MatMulIntegerToFloat*
[==========] Running 8 tests from 4 test suites.
[----------] Global test environment set-up.
[----------] 1 test from CPU_U8S8_Precision_Tests
[ RUN ] CPU_U8S8_Precision_Tests.MatMulIntegerToFloat
[ OK ] CPU_U8S8_Precision_Tests.MatMulIntegerToFloat (181 ms)
[----------] 1 test from CPU_U8S8_Precision_Tests (181 ms total)
[----------] 1 test from GraphTransformationTests
[ RUN ] GraphTransformationTests.MatMulIntegerToFloatTest
[ OK ] GraphTransformationTests.MatMulIntegerToFloatTest (17 ms)
[----------] 1 test from GraphTransformationTests (17 ms total)
[----------] 1 test from QDQTransformerTests
[ RUN ] QDQTransformerTests.MatMulIntegerToFloat
[ OK ] QDQTransformerTests.MatMulIntegerToFloat (656 ms)
[----------] 1 test from QDQTransformerTests (656 ms total)
[----------] 5 tests from MatMulIntegerToFloat
[ RUN ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_U8X8
[ OK ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_U8X8 (195 ms)
[ RUN ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_U8X8
[ OK ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_U8X8 (206 ms)
[ RUN ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_S8S8
[ OK ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_S8S8 (107 ms)
[ RUN ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_S8S8
[ OK ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_S8S8 (114 ms)
[ RUN ] MatMulIntegerToFloat.MatMulInteger_With_ZeroPoint
[ OK ] MatMulIntegerToFloat.MatMulInteger_With_ZeroPoint (227 ms)
[----------] 5 tests from MatMulIntegerToFloat (854 ms total)
[----------] Global test environment tear-down
[==========] 8 tests from 4 test suites ran. (1713 ms total)
[ PASSED ] 8 tests.
memleakdbg: ----- No memory leaks detected -----
```

```
onnxruntime_test_all.exe --gtest_filter="GraphTransformationTests.MatMulIntegerToFloat*"
Note: Google Test filter = GraphTransformationTests.MatMulIntegerToFloat*
[==========] Running 2 tests from 1 test suite.
[----------] Global test environment set-up.
[----------] 2 tests from GraphTransformationTests
[ RUN ] GraphTransformationTests.MatMulIntegerToFloatTest
[ OK ] GraphTransformationTests.MatMulIntegerToFloatTest (13 ms)
[ RUN ] GraphTransformationTests.MatMulIntegerToFloat16Test
[ OK ] GraphTransformationTests.MatMulIntegerToFloat16Test (4 ms)
[----------] 2 tests from GraphTransformationTests (20 ms total)
[----------] Global test environment tear-down
[==========] 2 tests from 1 test suite ran. (22 ms total)
[ PASSED ] 2 tests.
memleakdbg: ----- No memory leaks detected -----
```

### Motivation and Context
2026-06-26 03:00:54 +00:00 · 2023-11-03 10:05:09 -07:00 · 2023-11-03 10:05:09 -07:00 · 7d4dba7e16
commit 7d4dba7e16
parent 2abfea5372
7 changed files with 115 additions and 172 deletions
--- a/onnxruntime/core/optimizer/graph_transformer_utils.cc
+++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc
@ -274,7 +274,8 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
                                                                            onnxruntime::kRocmExecutionProvider,
                                                                            onnxruntime::kAclExecutionProvider,
                                                                            onnxruntime::kArmNNExecutionProvider};
-
+      const InlinedHashSet<std::string_view> cpu_dml_eps = {onnxruntime::kCpuExecutionProvider,
+                                                            onnxruntime::kDmlExecutionProvider};
 #ifdef MLAS_TARGET_AMD64_IX86
      const bool avx2_precision_mode =
          session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsAvx2PrecisionMode, "0") == "1" && MlasPlatformU8S8Overflow();
@ -292,7 +293,7 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
      }

      transformers.emplace_back(std::make_unique<GemmActivationFusion>(cpu_ep));
-      transformers.emplace_back(std::make_unique<MatMulIntegerToFloatFusion>(cpu_ep));
+      transformers.emplace_back(std::make_unique<MatMulIntegerToFloatFusion>(cpu_dml_eps));
      transformers.emplace_back(std::make_unique<DynamicQuantizeMatMulFusion>(cpu_ep));

      transformers.emplace_back(std::make_unique<ConvActivationFusion>(cpu_cuda_rocm_acl_armnn_eps));
--- a/onnxruntime/core/optimizer/matmul_integer_to_float.cc
+++ b/onnxruntime/core/optimizer/matmul_integer_to_float.cc
@ -31,6 +31,24 @@ static bool CheckBiasShape(const TensorShapeProto* bias_shape) {
  return bias_last_dim > 1;
 }

+bool HasElementDataType(const NodeArg& node_arg, int32_t data_type) {
+  if (!node_arg.Exists()) {
+    return false;
+  }
+
+  const auto* type_proto = node_arg.TypeAsProto();
+  if (!type_proto) {
+    return false;
+  }
+
+  int32_t actual_data_type;
+  if (!utils::TryGetElementDataType(*type_proto, actual_data_type)) {
+    return false;
+  }
+
+  return data_type == actual_data_type;
+}
+
 /**
 MatMulIntegerToFloatFusion will fuse subgraph like below into MatMulIntegerToFloat:

@ -63,8 +81,10 @@ Status MatMulIntegerToFloatFusion::ApplyImpl(Graph& graph, bool& modified, int g
    auto& mul_node = *node_ptr;

    ORT_RETURN_IF_ERROR(Recurse(mul_node, modified, graph_level, logger));
-
-    if (!graph_utils::IsSupportedOptypeVersionAndDomain(mul_node, "Mul", {7, 13, 14})) {
+    const bool is_dml_ep = node_ptr->GetExecutionProviderType() == kDmlExecutionProvider;
+    if (!graph_utils::IsSupportedOptypeVersionAndDomain(mul_node, "Mul", {7, 13, 14}) ||
+        !graph_utils::IsSupportedProvider(mul_node, GetCompatibleExecutionProviders()) ||
+        (!is_dml_ep && HasElementDataType(*mul_node.InputDefs()[0], ONNX_NAMESPACE::TensorProto_DataType_FLOAT16))) {
      continue;
    }

--- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
+++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
@ -98,7 +98,12 @@ void TestMatMulIntegerToFloat(const std::vector<int64_t>& A_dims,
  test.SetOutputRelErr("Y", 1e-4f);
 #endif

-  test.Run();
+  if constexpr (std::is_same_v<OType, float>) {
+    test.Run();
+  } else {
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCpuExecutionProvider});
+  }
+
 }

 template <typename IType, typename WType, typename OType, bool HasZeroPoint, bool HasBias>
@ -148,6 +153,7 @@ void RunMatMulIntegerToFloatTest(const string& model_path) {
  );
 }

+#if USE_DML
 TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8X8_FP16) {
  RunMatMulIntegerToFloatTest<uint8_t, int8_t, MLFloat16, true, false>("testdata/matmul_integer_to_float16_int8.onnx");
  RunMatMulIntegerToFloatTest<uint8_t, uint8_t, MLFloat16, true, false>("testdata/matmul_integer_to_float16_uint8.onnx");
@ -165,6 +171,7 @@ TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8S8_FP16) {
 TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8S8_FP16) {
  RunMatMulIntegerToFloatTest<int8_t, int8_t, MLFloat16, false, true>("testdata/matmul_integer_to_float16_int8_int8_bias.onnx");
 }
+#endif // USE_DML

 TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8X8) {
  RunMatMulIntegerToFloatTest<uint8_t, int8_t, float, true, false>("testdata/matmul_integer_to_float_int8.onnx");
@ -247,68 +254,5 @@ TEST(MatMulIntegerToFloat, MatMulInteger_With_ZeroPoint) {
  test_case({15, 14, 13}, {15, 13, 27}, {15, 1, 27});
 }

-TEST(MatMulIntegerToFloat, MatMulInteger_With_ZeroPoint_FP16) {
-  auto test_case = [&](const std::vector<int64_t>& input_shape,
-                       const std::vector<int64_t>& weights_shape,
-                       const std::vector<int64_t>& b_scale_zp_shape) {
-    auto build_test_case = [&](ModelTestBuilder& builder) {
-      auto* input_arg = builder.MakeInput<uint8_t>(input_shape,
-                                                   std::numeric_limits<uint8_t>::min(),
-                                                   std::numeric_limits<uint8_t>::max());
-      auto* output_arg = builder.MakeOutput();
-      auto* weight = builder.MakeInitializer<int8_t>(weights_shape,
-                                                     std::numeric_limits<int8_t>::min() / 2,
-                                                     std::numeric_limits<int8_t>::max() / 2);
-
-      // add MatMulInteger
-      auto* matmul_integer_output = builder.MakeIntermediate();
-      auto* A_zp_arg = builder.MakeInput<uint8_t>({1},
-                                                  std::numeric_limits<uint8_t>::min(),
-                                                  std::numeric_limits<uint8_t>::max());
-      auto* B_zp_arg = builder.MakeInput<int8_t>(b_scale_zp_shape,
-                                                 std::numeric_limits<int8_t>::min() / 2,
-                                                 std::numeric_limits<int8_t>::max() / 2);
-      builder.AddNode("MatMulInteger", {input_arg, weight, A_zp_arg, B_zp_arg}, {matmul_integer_output});
-
-      // add Cast
-      auto* cast_output = builder.MakeIntermediate();
-      Node& cast_node = builder.AddNode("Cast", {matmul_integer_output}, {cast_output});
-      cast_node.AddAttribute("to", static_cast<int64_t>(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16));
-
-      // add Mul1
-      auto* A_scale_arg = builder.MakeInput<MLFloat16>({1}, MLFloat16(-0.1f), MLFloat16(0.0f));
-      auto* B_scale_arg = builder.MakeInput<MLFloat16>(b_scale_zp_shape, MLFloat16(-0.1f), MLFloat16(0.0f));
-      auto* mul1_output = builder.MakeIntermediate();
-      builder.AddNode("Mul", {A_scale_arg, B_scale_arg}, {mul1_output});
-
-      // add Mul2
-      builder.AddNode("Mul", {mul1_output, cast_output}, {output_arg});
-    };
-
-    auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
-      auto op_to_count = CountOpsInGraph(session.GetGraph());
-      EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 1);
-    };
-
-    TransformerTester(build_test_case,
-                      check_mp_reshape_graph,
-                      TransformerLevel::Level1,
-                      TransformerLevel::Level2,
-                      12 /*opset_version*/,
-                      1e-5 /*per_sample_tolerance*/,
-                      1e-5 /*relative_per_sample_tolerance*/);
-  };
-
-  // Scale Scalar
-  test_case({5, 4, 3}, {3, 4}, {1});
-
-  // 2D B per-column
-  test_case({5, 4, 3}, {3, 4}, {4});
-  test_case({5, 4, 3}, {3, 4}, {1, 4});
-
-  // ND B per-column
-  test_case({15, 14, 13}, {15, 13, 27}, {15, 1, 27});
-}
-
 }  // namespace test
 }  // namespace onnxruntime
--- a/onnxruntime/test/optimizer/graph_transform_test.cc
+++ b/onnxruntime/test/optimizer/graph_transform_test.cc
@ -5189,6 +5189,24 @@ TEST_F(GraphTransformationTests, MatMulIntegerToFloatTest) {
  EXPECT_EQ(op_to_count["Add"], 1);
 }

+#ifdef USE_DML
+ TEST_F(GraphTransformationTests, MatMulIntegerToFloat16Test) {
+  constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/matmul_integer_to_float16_int8.onnx";
+  std::shared_ptr<Model> p_model;
+  ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_));
+  Graph& graph = p_model->MainGraph();
+
+  for (auto& node : graph.Nodes()) {
+    node.SetExecutionProviderType(kDmlExecutionProvider);
+  } 
+  onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
+  ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::make_unique<MatMulIntegerToFloatFusion>(), TransformerLevel::Level2));
+  ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, *logger_));
+  std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
+  EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 1);
+}
+#endif  // USE_DML
+
 #endif

 #ifndef DISABLE_CONTRIB_OPS
--- a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py
+++ b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py
@ -49,15 +49,15 @@ def MakeSubGraph(suffix, has_bias):  # noqa: N802
    return nodes


-def MakeInitializer(suffix, output_type_fp16=False):  # noqa: N802
+def MakeInitializer(suffix):  # noqa: N802
    return [
        helper.make_tensor("b_quantized" + suffix, TensorProto.UINT8, [2, 3], [2, 4, 5, 6, 7, 8]),
        helper.make_tensor("b_zp" + suffix, TensorProto.UINT8, [], [128]),
-        helper.make_tensor("b_scale" + suffix, TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [], [1.8]),
+        helper.make_tensor("b_scale" + suffix, TensorProto.FLOAT, [], [1.8]),
    ]


-def GenerateModel(model_name, output_type_fp16=False):  # noqa: N802
+def GenerateModel(model_name):  # noqa: N802
    nodes = [
        helper.make_node(
            "DynamicQuantizeLinear",
@ -71,13 +71,13 @@ def GenerateModel(model_name, output_type_fp16=False):  # noqa: N802
    nodes.extend(MakeSubGraph("_3", False))

    initializers = []
-    initializers.extend(MakeInitializer("_1", output_type_fp16))
-    initializers.extend(MakeInitializer("_3", output_type_fp16))
+    initializers.extend(MakeInitializer("_1"))
+    initializers.extend(MakeInitializer("_3"))

    initializers.extend(
        [
-            helper.make_tensor("bias_1", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3], [2, 4, 5]),
-            helper.make_tensor("bias_2", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 3], [1, 2, 3, 4, 5, 6, 7, 8, 9]),
+            helper.make_tensor("bias_1", TensorProto.FLOAT, [3], [2, 4, 5]),
+            helper.make_tensor("bias_2", TensorProto.FLOAT, [3, 3], [1, 2, 3, 4, 5, 6, 7, 8, 9]),
        ]
    )

@ -85,16 +85,16 @@ def GenerateModel(model_name, output_type_fp16=False):  # noqa: N802
        nodes,
        "MatMulIntegerToFloat_fusion",  # name
        [  # inputs
-            helper.make_tensor_value_info("input", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 2]),
+            helper.make_tensor_value_info("input", TensorProto.FLOAT, [3, 2]),
            # matrix b corresponding inputs for subgraph 2
            helper.make_tensor_value_info("b_quantized_2", TensorProto.UINT8, [2, 3]),
            helper.make_tensor_value_info("b_zp_2", TensorProto.UINT8, [1]),
-            helper.make_tensor_value_info("b_scale_2", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [1]),
+            helper.make_tensor_value_info("b_scale_2", TensorProto.FLOAT, [1]),
        ],
        [  # outputs
-            helper.make_tensor_value_info("output_1", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 3]),
-            helper.make_tensor_value_info("output_2", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 3]),
-            helper.make_tensor_value_info("output_3", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 3]),
+            helper.make_tensor_value_info("output_1", TensorProto.FLOAT, [3, 3]),
+            helper.make_tensor_value_info("output_2", TensorProto.FLOAT, [3, 3]),
+            helper.make_tensor_value_info("output_3", TensorProto.FLOAT, [3, 3]),
        ],
        initializers,
    )
@ -104,5 +104,4 @@ def GenerateModel(model_name, output_type_fp16=False):  # noqa: N802


 if __name__ == "__main__":
-    GenerateModel("matmul_integer_to_float.onnx")
-    GenerateModel("matmul_integer_to_float16.onnx", output_type_fp16=True)
+    GenerateModel("matmul_integer_to_float.onnx")
--- a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16.onnx
+++ b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16.onnx
@ -1,90 +0,0 @@
-	:Ö
-Q
-inputa_quantizeda_scalea_zpDynamicQuantizeLinear"DynamicQuantizeLinear
-a
-a_quantized
-
b_quantized_1
-a_zp
-b_zp_1matmul_output_int32_1MatMulInteger_1"
MatMulInteger
-4
-a_scale
-	b_scale_1multiplier_1mul_right_1"Mul
-G
-matmul_output_int32_1matmul_output_float_1cast_1"Cast*	
-to 
-F
-matmul_output_float_1
-multiplier_1mul_output_1mul_bottom_1"Mul
-1
-mul_output_1
-bias_1output_1
-bias_add_1"Add
-a
-a_quantized
-
b_quantized_2
-a_zp
-b_zp_2matmul_output_int32_2MatMulInteger_2"
MatMulInteger
-4
-a_scale
-	b_scale_2multiplier_2mul_right_2"Mul
-G
-matmul_output_int32_2matmul_output_float_2cast_2"Cast*	
-to 
-F
-matmul_output_float_2
-multiplier_2mul_output_2mul_bottom_2"Mul
-1
-mul_output_2
-bias_2output_2
-bias_add_2"Add
-a
-a_quantized
-
b_quantized_3
-a_zp
-b_zp_3matmul_output_int32_3MatMulInteger_3"
MatMulInteger
-4
-a_scale
-	b_scale_3multiplier_3mul_right_3"Mul
-G
-matmul_output_int32_3matmul_output_float_3cast_3"Cast*	
-to 
-B
-matmul_output_float_3
-multiplier_3output_3mul_bottom_3"MulMatMulIntegerToFloat_fusion**B
b_quantized_1**€Bb_zp_1*
-*³~B	b_scale_1**B
b_quantized_3**€Bb_zp_3*
-*³~B	b_scale_3*
-*	€€€ˆ€ŠBbias_1**
-*€x€€€„€ˆ€Š€Œ€Ž€<>€‘Bbias_2Z
-input
-
-
-
-Z
-
b_quantized_2
-
-
-Z
-b_zp_2
-
-
-Z
-	b_scale_2
-
-
-
-b
-output_1
-
-
-
-b
-output_2
-
-
-
-b
-output_3
-
-
-
-B
--- a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx
+++ b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx
@ -0,0 +1,51 @@
+	:Ě
+U
+A
+B
+a_zero_point
+b_zero_pointmatmul_output_int32
MatMulInteger"
MatMulInteger
+.
+a_scale
+b_scale
+multiplier	mul_right"Mul
+A
+matmul_output_int32matmul_output_floatcast"Cast*	
+to
+ 
+5
+matmul_output_float
+
+multiplierY
+mul_bottom"MulDynamicQuantizeMatMul_fusionZ
+A
+
+
+M
+KZ
+B
+
+
+K
+NZ
+a_scale
+
+
+
+Z
+b_scale
+	
+
+CZ
+a_zero_point
+
+
+Z
+b_zero_point
+	
+Cb
+Y
+
+
+
+M
+NB