From 7d4dba7e1676785bfddea5cf6d866585b432ebe7 Mon Sep 17 00:00:00 2001
From: raoanag <127366241+raoanag@users.noreply.github.com>
Date: Fri, 3 Nov 2023 10:05:09 -0700
Subject: [PATCH] Disable MatMulIntegerToFloat transformation for FP16 on CPU
 EP (#18239)

### Description
MatMulIntegerToFloat was updated to support FP16. The nodes involved in
the FP16 transformation use an FP16 "Mul", which is not directly
supported by the CPU EP.

For now, the FP16 transformation is supported only for the DML EP. All
FP16 tests are disabled on CPU.

Test results without the `--use_dml` build flag:
```
onnxruntime_test_all.exe --gtest_filter="*MatMulIntegerToFloat*"
Note: Google Test filter = *MatMulIntegerToFloat*
[==========] Running 8 tests from 4 test suites.
[----------] Global test environment set-up.
[----------] 1 test from CPU_U8S8_Precision_Tests
[ RUN      ] CPU_U8S8_Precision_Tests.MatMulIntegerToFloat
[       OK ] CPU_U8S8_Precision_Tests.MatMulIntegerToFloat (181 ms)
[----------] 1 test from CPU_U8S8_Precision_Tests (181 ms total)

[----------] 1 test from GraphTransformationTests
[ RUN      ] GraphTransformationTests.MatMulIntegerToFloatTest
[       OK ] GraphTransformationTests.MatMulIntegerToFloatTest (17 ms)
[----------] 1 test from GraphTransformationTests (17 ms total)

[----------] 1 test from QDQTransformerTests
[ RUN      ] QDQTransformerTests.MatMulIntegerToFloat
[       OK ] QDQTransformerTests.MatMulIntegerToFloat (656 ms)
[----------] 1 test from QDQTransformerTests (656 ms total)

[----------] 5 tests from MatMulIntegerToFloat
[ RUN      ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_U8X8
[       OK ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_U8X8 (195 ms)
[ RUN      ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_U8X8
[       OK ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_U8X8 (206 ms)
[ RUN      ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_S8S8
[       OK ] MatMulIntegerToFloat.HasZeroPoint_NoBias_test_S8S8 (107 ms)
[ RUN      ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_S8S8
[       OK ] MatMulIntegerToFloat.NoZeroPoint_HasBias_test_S8S8 (114 ms)
[ RUN      ] MatMulIntegerToFloat.MatMulInteger_With_ZeroPoint
[       OK ] MatMulIntegerToFloat.MatMulInteger_With_ZeroPoint (227 ms)
[----------] 5 tests from MatMulIntegerToFloat (854 ms total)

[----------] Global test environment tear-down
[==========] 8 tests from 4 test suites ran. (1713 ms total)
[  PASSED  ] 8 tests.
memleakdbg:
----- No memory leaks detected -----
```

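Test results with the `--use_dml` build flag (the FP16 graph-transformer
test is compiled only when `USE_DML` is defined):
```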
onnxruntime_test_all.exe --gtest_filter="GraphTransformationTests.MatMulIntegerToFloat*"
Note: Google Test filter = GraphTransformationTests.MatMulIntegerToFloat*
[==========] Running 2 tests from 1 test suite.
[----------] Global test environment set-up.
[----------] 2 tests from GraphTransformationTests
[ RUN      ] GraphTransformationTests.MatMulIntegerToFloatTest
[       OK ] GraphTransformationTests.MatMulIntegerToFloatTest (13 ms)
[ RUN      ] GraphTransformationTests.MatMulIntegerToFloat16Test
[       OK ] GraphTransformationTests.MatMulIntegerToFloat16Test (4 ms)
[----------] 2 tests from GraphTransformationTests (20 ms total)

[----------] Global test environment tear-down
[==========] 2 tests from 1 test suite ran. (22 ms total)
[  PASSED  ] 2 tests.
memleakdbg:
----- No memory leaks detected -----
```
### Motivation and Context
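<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
The FP16 fusion pattern depends on an FP16 "Mul", which the CPU EP does
not support directly, so the fusion is now skipped for FP16 nodes that
are not assigned to the DML EP.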
---
 .../core/optimizer/graph_transformer_utils.cc |  5 +-
 .../core/optimizer/matmul_integer_to_float.cc | 24 ++++-
 .../matmul_integer_to_float_test.cc           | 72 ++-------------
 .../test/optimizer/graph_transform_test.cc    | 18 ++++
 .../fusion/matmul_integer_to_float.py         | 27 +++---
 .../fusion/matmul_integer_to_float16.onnx     | 90 -------------------
 .../matmul_integer_to_float16_int8.onnx       | 51 +++++++++++
 7 files changed, 115 insertions(+), 172 deletions(-)
 delete mode 100644 onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16.onnx
 create mode 100644 onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx

diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc
index 163751c38e..fd3f9c2ce6 100644
--- a/onnxruntime/core/optimizer/graph_transformer_utils.cc
+++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc
@@ -274,7 +274,8 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
                                                                             onnxruntime::kRocmExecutionProvider,
                                                                             onnxruntime::kAclExecutionProvider,
                                                                             onnxruntime::kArmNNExecutionProvider};
-
+      const InlinedHashSet<std::string_view> cpu_dml_eps = {onnxruntime::kCpuExecutionProvider,
+                                                            onnxruntime::kDmlExecutionProvider};
 #ifdef MLAS_TARGET_AMD64_IX86
       const bool avx2_precision_mode =
           session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsAvx2PrecisionMode, "0") == "1" && MlasPlatformU8S8Overflow();
@@ -292,7 +293,7 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
       }
 
       transformers.emplace_back(std::make_unique<GemmActivationFusion>(cpu_ep));
-      transformers.emplace_back(std::make_unique<MatMulIntegerToFloatFusion>(cpu_ep));
+      transformers.emplace_back(std::make_unique<MatMulIntegerToFloatFusion>(cpu_dml_eps));
       transformers.emplace_back(std::make_unique<DynamicQuantizeMatMulFusion>(cpu_ep));
 
       transformers.emplace_back(std::make_unique<ConvActivationFusion>(cpu_cuda_rocm_acl_armnn_eps));
diff --git a/onnxruntime/core/optimizer/matmul_integer_to_float.cc b/onnxruntime/core/optimizer/matmul_integer_to_float.cc
index b9fe5dcad0..4fee1a6ce2 100644
--- a/onnxruntime/core/optimizer/matmul_integer_to_float.cc
+++ b/onnxruntime/core/optimizer/matmul_integer_to_float.cc
@@ -31,6 +31,24 @@ static bool CheckBiasShape(const TensorShapeProto* bias_shape) {
   return bias_last_dim > 1;
 }
 
+static bool HasElementDataType(const NodeArg& node_arg, int32_t data_type) {  // file-local: true iff node_arg's element type is known and equals data_type
+  if (!node_arg.Exists()) {  // a non-existent arg never matches
+    return false;
+  }
+
+  const auto* type_proto = node_arg.TypeAsProto();
+  if (!type_proto) {  // no type information available for this arg
+    return false;
+  }
+
+  int32_t actual_data_type;  // output of TryGetElementDataType; read only when that call returns true
+  if (!utils::TryGetElementDataType(*type_proto, actual_data_type)) {
+    return false;
+  }
+
+  return data_type == actual_data_type;
+}
+
 /**
 MatMulIntegerToFloatFusion will fuse subgraph like below into MatMulIntegerToFloat:
 
@@ -63,8 +81,10 @@ Status MatMulIntegerToFloatFusion::ApplyImpl(Graph& graph, bool& modified, int g
     auto& mul_node = *node_ptr;
 
     ORT_RETURN_IF_ERROR(Recurse(mul_node, modified, graph_level, logger));
-
-    if (!graph_utils::IsSupportedOptypeVersionAndDomain(mul_node, "Mul", {7, 13, 14})) {
+    const bool is_dml_ep = node_ptr->GetExecutionProviderType() == kDmlExecutionProvider;
+    if (!graph_utils::IsSupportedOptypeVersionAndDomain(mul_node, "Mul", {7, 13, 14}) ||
+        !graph_utils::IsSupportedProvider(mul_node, GetCompatibleExecutionProviders()) ||
+        (!is_dml_ep && HasElementDataType(*mul_node.InputDefs()[0], ONNX_NAMESPACE::TensorProto_DataType_FLOAT16))) {
       continue;
     }
 
diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
index 7eadd3fffc..51d9a57b5e 100644
--- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
+++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
@@ -98,7 +98,12 @@ void TestMatMulIntegerToFloat(const std::vector<int64_t>& A_dims,
   test.SetOutputRelErr("Y", 1e-4f);
 #endif
 
-  test.Run();
+  if constexpr (std::is_same_v<OType, float>) {
+    test.Run();
+  } else {
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCpuExecutionProvider});
+  }
+
 }
 
 template <typename IType, typename WType, typename OType, bool HasZeroPoint, bool HasBias>
@@ -148,6 +153,7 @@ void RunMatMulIntegerToFloatTest(const string& model_path) {
   );
 }
 
+#if USE_DML
 TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8X8_FP16) {
   RunMatMulIntegerToFloatTest<uint8_t, int8_t, MLFloat16, true, false>("testdata/matmul_integer_to_float16_int8.onnx");
   RunMatMulIntegerToFloatTest<uint8_t, uint8_t, MLFloat16, true, false>("testdata/matmul_integer_to_float16_uint8.onnx");
@@ -165,6 +171,7 @@ TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8S8_FP16) {
 TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8S8_FP16) {
   RunMatMulIntegerToFloatTest<int8_t, int8_t, MLFloat16, false, true>("testdata/matmul_integer_to_float16_int8_int8_bias.onnx");
 }
+#endif // USE_DML
 
 TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8X8) {
   RunMatMulIntegerToFloatTest<uint8_t, int8_t, float, true, false>("testdata/matmul_integer_to_float_int8.onnx");
@@ -247,68 +254,5 @@ TEST(MatMulIntegerToFloat, MatMulInteger_With_ZeroPoint) {
   test_case({15, 14, 13}, {15, 13, 27}, {15, 1, 27});
 }
 
-TEST(MatMulIntegerToFloat, MatMulInteger_With_ZeroPoint_FP16) {
-  auto test_case = [&](const std::vector<int64_t>& input_shape,
-                       const std::vector<int64_t>& weights_shape,
-                       const std::vector<int64_t>& b_scale_zp_shape) {
-    auto build_test_case = [&](ModelTestBuilder& builder) {
-      auto* input_arg = builder.MakeInput<uint8_t>(input_shape,
-                                                   std::numeric_limits<uint8_t>::min(),
-                                                   std::numeric_limits<uint8_t>::max());
-      auto* output_arg = builder.MakeOutput();
-      auto* weight = builder.MakeInitializer<int8_t>(weights_shape,
-                                                     std::numeric_limits<int8_t>::min() / 2,
-                                                     std::numeric_limits<int8_t>::max() / 2);
-
-      // add MatMulInteger
-      auto* matmul_integer_output = builder.MakeIntermediate();
-      auto* A_zp_arg = builder.MakeInput<uint8_t>({1},
-                                                  std::numeric_limits<uint8_t>::min(),
-                                                  std::numeric_limits<uint8_t>::max());
-      auto* B_zp_arg = builder.MakeInput<int8_t>(b_scale_zp_shape,
-                                                 std::numeric_limits<int8_t>::min() / 2,
-                                                 std::numeric_limits<int8_t>::max() / 2);
-      builder.AddNode("MatMulInteger", {input_arg, weight, A_zp_arg, B_zp_arg}, {matmul_integer_output});
-
-      // add Cast
-      auto* cast_output = builder.MakeIntermediate();
-      Node& cast_node = builder.AddNode("Cast", {matmul_integer_output}, {cast_output});
-      cast_node.AddAttribute("to", static_cast<int64_t>(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16));
-
-      // add Mul1
-      auto* A_scale_arg = builder.MakeInput<MLFloat16>({1}, MLFloat16(-0.1f), MLFloat16(0.0f));
-      auto* B_scale_arg = builder.MakeInput<MLFloat16>(b_scale_zp_shape, MLFloat16(-0.1f), MLFloat16(0.0f));
-      auto* mul1_output = builder.MakeIntermediate();
-      builder.AddNode("Mul", {A_scale_arg, B_scale_arg}, {mul1_output});
-
-      // add Mul2
-      builder.AddNode("Mul", {mul1_output, cast_output}, {output_arg});
-    };
-
-    auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
-      auto op_to_count = CountOpsInGraph(session.GetGraph());
-      EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 1);
-    };
-
-    TransformerTester(build_test_case,
-                      check_mp_reshape_graph,
-                      TransformerLevel::Level1,
-                      TransformerLevel::Level2,
-                      12 /*opset_version*/,
-                      1e-5 /*per_sample_tolerance*/,
-                      1e-5 /*relative_per_sample_tolerance*/);
-  };
-
-  // Scale Scalar
-  test_case({5, 4, 3}, {3, 4}, {1});
-
-  // 2D B per-column
-  test_case({5, 4, 3}, {3, 4}, {4});
-  test_case({5, 4, 3}, {3, 4}, {1, 4});
-
-  // ND B per-column
-  test_case({15, 14, 13}, {15, 13, 27}, {15, 1, 27});
-}
-
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc
index c3dc2734d8..4f45f48da7 100755
--- a/onnxruntime/test/optimizer/graph_transform_test.cc
+++ b/onnxruntime/test/optimizer/graph_transform_test.cc
@@ -5189,6 +5189,24 @@ TEST_F(GraphTransformationTests, MatMulIntegerToFloatTest) {
   EXPECT_EQ(op_to_count["Add"], 1);
 }
 
+#ifdef USE_DML
+ TEST_F(GraphTransformationTests, MatMulIntegerToFloat16Test) {  // FP16 fusion check; built only when USE_DML is defined
+  constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/matmul_integer_to_float16_int8.onnx";
+  std::shared_ptr<Model> p_model;
+  ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_));
+  Graph& graph = p_model->MainGraph();
+
+  for (auto& node : graph.Nodes()) {  // the fusion skips FP16 Mul nodes unless they are assigned to the DML EP
+    node.SetExecutionProviderType(kDmlExecutionProvider);
+  } 
+  onnxruntime::GraphTransformerManager graph_transformation_mgr{5};
+  ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::make_unique<MatMulIntegerToFloatFusion>(), TransformerLevel::Level2));
+  ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, *logger_));
+  std::map<std::string, int> op_to_count = CountOpsInGraph(graph);
+  EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 1);  // exactly one fused node is expected
+}
+#endif  // USE_DML
+
 #endif
 
 #ifndef DISABLE_CONTRIB_OPS
diff --git a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py
index ff35f81171..60bdd92dc9 100644
--- a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py
+++ b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py
@@ -49,15 +49,15 @@ def MakeSubGraph(suffix, has_bias):  # noqa: N802
     return nodes
 
 
-def MakeInitializer(suffix, output_type_fp16=False):  # noqa: N802
+def MakeInitializer(suffix):  # noqa: N802
     return [
         helper.make_tensor("b_quantized" + suffix, TensorProto.UINT8, [2, 3], [2, 4, 5, 6, 7, 8]),
         helper.make_tensor("b_zp" + suffix, TensorProto.UINT8, [], [128]),
-        helper.make_tensor("b_scale" + suffix, TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [], [1.8]),
+        helper.make_tensor("b_scale" + suffix, TensorProto.FLOAT, [], [1.8]),
     ]
 
 
-def GenerateModel(model_name, output_type_fp16=False):  # noqa: N802
+def GenerateModel(model_name):  # noqa: N802
     nodes = [
         helper.make_node(
             "DynamicQuantizeLinear",
@@ -71,13 +71,13 @@ def GenerateModel(model_name, output_type_fp16=False):  # noqa: N802
     nodes.extend(MakeSubGraph("_3", False))
 
     initializers = []
-    initializers.extend(MakeInitializer("_1", output_type_fp16))
-    initializers.extend(MakeInitializer("_3", output_type_fp16))
+    initializers.extend(MakeInitializer("_1"))
+    initializers.extend(MakeInitializer("_3"))
 
     initializers.extend(
         [
-            helper.make_tensor("bias_1", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3], [2, 4, 5]),
-            helper.make_tensor("bias_2", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 3], [1, 2, 3, 4, 5, 6, 7, 8, 9]),
+            helper.make_tensor("bias_1", TensorProto.FLOAT, [3], [2, 4, 5]),
+            helper.make_tensor("bias_2", TensorProto.FLOAT, [3, 3], [1, 2, 3, 4, 5, 6, 7, 8, 9]),
         ]
     )
 
@@ -85,16 +85,16 @@ def GenerateModel(model_name, output_type_fp16=False):  # noqa: N802
         nodes,
         "MatMulIntegerToFloat_fusion",  # name
         [  # inputs
-            helper.make_tensor_value_info("input", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 2]),
+            helper.make_tensor_value_info("input", TensorProto.FLOAT, [3, 2]),
             # matrix b corresponding inputs for subgraph 2
             helper.make_tensor_value_info("b_quantized_2", TensorProto.UINT8, [2, 3]),
             helper.make_tensor_value_info("b_zp_2", TensorProto.UINT8, [1]),
-            helper.make_tensor_value_info("b_scale_2", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [1]),
+            helper.make_tensor_value_info("b_scale_2", TensorProto.FLOAT, [1]),
         ],
         [  # outputs
-            helper.make_tensor_value_info("output_1", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 3]),
-            helper.make_tensor_value_info("output_2", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 3]),
-            helper.make_tensor_value_info("output_3", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 3]),
+            helper.make_tensor_value_info("output_1", TensorProto.FLOAT, [3, 3]),
+            helper.make_tensor_value_info("output_2", TensorProto.FLOAT, [3, 3]),
+            helper.make_tensor_value_info("output_3", TensorProto.FLOAT, [3, 3]),
         ],
         initializers,
     )
@@ -104,5 +104,4 @@ def GenerateModel(model_name, output_type_fp16=False):  # noqa: N802
 
 
 if __name__ == "__main__":
-    GenerateModel("matmul_integer_to_float.onnx")
-    GenerateModel("matmul_integer_to_float16.onnx", output_type_fp16=True)
+    GenerateModel("matmul_integer_to_float.onnx")
\ No newline at end of file
diff --git a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16.onnx b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16.onnx
deleted file mode 100644
index 67d50eac6f..0000000000
--- a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16.onnx
+++ /dev/null
@@ -1,90 +0,0 @@
-	:�
-Q
-inputa_quantizeda_scalea_zpDynamicQuantizeLinear"DynamicQuantizeLinear
-a
-a_quantized
-b_quantized_1
-a_zp
-b_zp_1matmul_output_int32_1MatMulInteger_1"MatMulInteger
-4
-a_scale
-	b_scale_1multiplier_1mul_right_1"Mul
-G
-matmul_output_int32_1matmul_output_float_1cast_1"Cast*	
-to�
-F
-matmul_output_float_1
-multiplier_1mul_output_1mul_bottom_1"Mul
-1
-mul_output_1
-bias_1output_1
-bias_add_1"Add
-a
-a_quantized
-b_quantized_2
-a_zp
-b_zp_2matmul_output_int32_2MatMulInteger_2"MatMulInteger
-4
-a_scale
-	b_scale_2multiplier_2mul_right_2"Mul
-G
-matmul_output_int32_2matmul_output_float_2cast_2"Cast*	
-to�
-F
-matmul_output_float_2
-multiplier_2mul_output_2mul_bottom_2"Mul
-1
-mul_output_2
-bias_2output_2
-bias_add_2"Add
-a
-a_quantized
-b_quantized_3
-a_zp
-b_zp_3matmul_output_int32_3MatMulInteger_3"MatMulInteger
-4
-a_scale
-	b_scale_3multiplier_3mul_right_3"Mul
-G
-matmul_output_int32_3matmul_output_float_3cast_3"Cast*	
-to�
-B
-matmul_output_float_3
-multiplier_3output_3mul_bottom_3"MulMatMulIntegerToFloat_fusion**Bb_quantized_1**�Bb_zp_1*
-*�~B	b_scale_1**Bb_quantized_3**�Bb_zp_3*
-*�~B	b_scale_3*
-*	������Bbias_1**
-*�x����������������Bbias_2Z
-input
-
-
-
-Z
-b_quantized_2
-
-
-Z
-b_zp_2
-
-
-Z
-	b_scale_2
-
-
-
-b
-output_1
-
-
-
-b
-output_2
-
-
-
-b
-output_3
-
-
-
-B
\ No newline at end of file
diff --git a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx
new file mode 100644
index 0000000000..22293b0d10
--- /dev/null
+++ b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx
@@ -0,0 +1,51 @@
+	:�
+U
+A
+B
+a_zero_point
+b_zero_pointmatmul_output_int32MatMulInteger"MatMulInteger
+.
+a_scale
+b_scale
+multiplier	mul_right"Mul
+A
+matmul_output_int32matmul_output_floatcast"Cast*	
+to
+�
+5
+matmul_output_float
+
+multiplierY
+mul_bottom"MulDynamicQuantizeMatMul_fusionZ
+A
+
+
+M
+KZ
+B
+
+
+K
+NZ
+a_scale
+
+
+
+Z
+b_scale
+	
+
+CZ
+a_zero_point
+
+
+Z
+b_zero_point
+	
+Cb
+Y
+
+
+
+M
+NB
\ No newline at end of file