diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc
index 163751c38e..fd3f9c2ce6 100644
--- a/onnxruntime/core/optimizer/graph_transformer_utils.cc
+++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc
@@ -274,7 +274,8 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
                                                                             onnxruntime::kRocmExecutionProvider,
                                                                             onnxruntime::kAclExecutionProvider,
                                                                             onnxruntime::kArmNNExecutionProvider};
-
+      const InlinedHashSet<std::string_view> cpu_dml_eps = {onnxruntime::kCpuExecutionProvider,
+                                                            onnxruntime::kDmlExecutionProvider};
 #ifdef MLAS_TARGET_AMD64_IX86
       const bool avx2_precision_mode =
           session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsAvx2PrecisionMode, "0") == "1" && MlasPlatformU8S8Overflow();
@@ -292,7 +293,7 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
       }
 
       transformers.emplace_back(std::make_unique<GemmActivationFusion>(cpu_ep));
-      transformers.emplace_back(std::make_unique<MatMulIntegerToFloatFusion>(cpu_ep));
+      transformers.emplace_back(std::make_unique<MatMulIntegerToFloatFusion>(cpu_dml_eps));
       transformers.emplace_back(std::make_unique<DynamicQuantizeMatMulFusion>(cpu_ep));
 
       transformers.emplace_back(std::make_unique<ConvActivationFusion>(cpu_cuda_rocm_acl_armnn_eps));
diff --git a/onnxruntime/core/optimizer/matmul_integer_to_float.cc b/onnxruntime/core/optimizer/matmul_integer_to_float.cc
index b9fe5dcad0..4fee1a6ce2 100644
--- a/onnxruntime/core/optimizer/matmul_integer_to_float.cc
+++ b/onnxruntime/core/optimizer/matmul_integer_to_float.cc
@@ -31,6 +31,24 @@ static bool CheckBiasShape(const TensorShapeProto* bias_shape) {
   return bias_last_dim > 1;
 }
 
+// Returns true only if `node_arg` exists, has type info, and its element type equals `data_type`.
+static bool HasElementDataType(const NodeArg& node_arg, int32_t data_type) {
+  if (!node_arg.Exists()) {
+    return false;
+  }
+  // No type proto means the element type cannot be determined; report "no match".
+  const auto* type_proto = node_arg.TypeAsProto();
+  if (!type_proto) {
+    return false;
+  }
+  // Zero-initialized so the value is never indeterminate if the lookup below fails.
+  int32_t actual_data_type = 0;
+  if (!utils::TryGetElementDataType(*type_proto, actual_data_type)) {
+    return false;
+  }
+  return data_type == actual_data_type;
+}
+
 /**
 MatMulIntegerToFloatFusion will fuse subgraph like below into MatMulIntegerToFloat:
 
@@ -63,8 +81,10 @@ Status MatMulIntegerToFloatFusion::ApplyImpl(Graph& graph, bool& modified, int g
     auto& mul_node = *node_ptr;
 
     ORT_RETURN_IF_ERROR(Recurse(mul_node, modified, graph_level, logger));
-
-    if (!graph_utils::IsSupportedOptypeVersionAndDomain(mul_node, "Mul", {7, 13, 14})) {
+    const bool is_dml_ep = node_ptr->GetExecutionProviderType() == kDmlExecutionProvider;
+    if (!graph_utils::IsSupportedOptypeVersionAndDomain(mul_node, "Mul", {7, 13, 14}) ||
+        !graph_utils::IsSupportedProvider(mul_node, GetCompatibleExecutionProviders()) ||
+        (!is_dml_ep && HasElementDataType(*mul_node.InputDefs()[0], ONNX_NAMESPACE::TensorProto_DataType_FLOAT16))) {
       continue;
     }
 
diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
index 7eadd3fffc..51d9a57b5e 100644
--- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
+++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
@@ -98,7 +98,12 @@ void TestMatMulIntegerToFloat(const std::vector<int64_t>& A_dims,
   test.SetOutputRelErr("Y", 1e-4f);
 #endif
 
-  test.Run();
+  if constexpr (std::is_same_v<OType, float>) {
+    test.Run();
+  } else {
+    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCpuExecutionProvider});
+  }
+
 }
 
 template <typename IType, typename WType, typename OType, bool HasZeroPoint, bool HasBias>
@@ -148,6 +153,7 @@ void RunMatMulIntegerToFloatTest(const string& model_path) {
   );
 }
 
+#if USE_DML
 TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8X8_FP16) {
   RunMatMulIntegerToFloatTest<uint8_t, int8_t, MLFloat16, true, false>("testdata/matmul_integer_to_float16_int8.onnx");
   RunMatMulIntegerToFloatTest<uint8_t, uint8_t, MLFloat16, true, false>("testdata/matmul_integer_to_float16_uint8.onnx");
@@ -165,6 +171,7 @@ TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8S8_FP16) {
 TEST(MatMulIntegerToFloat, NoZeroPoint_HasBias_test_S8S8_FP16) {
   RunMatMulIntegerToFloatTest<int8_t, int8_t, MLFloat16, false, true>("testdata/matmul_integer_to_float16_int8_int8_bias.onnx");
 }
+#endif // USE_DML
 
 TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_U8X8) {
   RunMatMulIntegerToFloatTest<uint8_t, int8_t, float, true, false>("testdata/matmul_integer_to_float_int8.onnx");
@@ -247,68 +254,5 @@ TEST(MatMulIntegerToFloat, MatMulInteger_With_ZeroPoint) {
   test_case({15, 14, 13}, {15, 13, 27}, {15, 1, 27});
 }
 
-TEST(MatMulIntegerToFloat, MatMulInteger_With_ZeroPoint_FP16) {
-  auto test_case = [&](const std::vector<int64_t>& input_shape,
-                       const std::vector<int64_t>& weights_shape,
-                       const std::vector<int64_t>& b_scale_zp_shape) {
-    auto build_test_case = [&](ModelTestBuilder& builder) {
-      auto* input_arg = builder.MakeInput<uint8_t>(input_shape,
-                                                   std::numeric_limits<uint8_t>::min(),
-                                                   std::numeric_limits<uint8_t>::max());
-      auto* output_arg = builder.MakeOutput();
-      auto* weight = builder.MakeInitializer<int8_t>(weights_shape,
-                                                     std::numeric_limits<int8_t>::min() / 2,
-                                                     std::numeric_limits<int8_t>::max() / 2);
-
-      // add MatMulInteger
-      auto* matmul_integer_output = builder.MakeIntermediate();
-      auto* A_zp_arg = builder.MakeInput<uint8_t>({1},
-                                                  std::numeric_limits<uint8_t>::min(),
-                                                  std::numeric_limits<uint8_t>::max());
-      auto* B_zp_arg = builder.MakeInput<int8_t>(b_scale_zp_shape,
-                                                 std::numeric_limits<int8_t>::min() / 2,
-                                                 std::numeric_limits<int8_t>::max() / 2);
-      builder.AddNode("MatMulInteger", {input_arg, weight, A_zp_arg, B_zp_arg}, {matmul_integer_output});
-
-      // add Cast
-      auto* cast_output = builder.MakeIntermediate();
-      Node& cast_node = builder.AddNode("Cast", {matmul_integer_output}, {cast_output});
-      cast_node.AddAttribute("to", static_cast<int64_t>(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16));
-
-      // add Mul1
-      auto* A_scale_arg = builder.MakeInput<MLFloat16>({1}, MLFloat16(-0.1f), MLFloat16(0.0f));
-      auto* B_scale_arg = builder.MakeInput<MLFloat16>(b_scale_zp_shape, MLFloat16(-0.1f), MLFloat16(0.0f));
-      auto* mul1_output = builder.MakeIntermediate();
-      builder.AddNode("Mul", {A_scale_arg, B_scale_arg}, {mul1_output});
-
-      // add Mul2
-      builder.AddNode("Mul", {mul1_output, cast_output}, {output_arg});
-    };
-
-    auto check_mp_reshape_graph = [&](InferenceSessionWrapper& session) {
-      auto op_to_count = CountOpsInGraph(session.GetGraph());
-      EXPECT_EQ(op_to_count["com.microsoft.MatMulIntegerToFloat"], 1);
-    };
-
-    TransformerTester(build_test_case,
-                      check_mp_reshape_graph,
-                      TransformerLevel::Level1,
-                      TransformerLevel::Level2,
-                      12 /*opset_version*/,
-                      1e-5 /*per_sample_tolerance*/,
-                      1e-5 /*relative_per_sample_tolerance*/);
-  };
-
-  // Scale Scalar
-  test_case({5, 4, 3}, {3, 4}, {1});
-
-  // 2D B per-column
-  test_case({5, 4, 3}, {3, 4}, {4});
-  test_case({5, 4, 3}, {3, 4}, {1, 4});
-
-  // ND B per-column
-  test_case({15, 14, 13}, {15, 13, 27}, {15, 1, 27});
-}
-
 }  // namespace test
 }  // namespace onnxruntime
diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc
index c3dc2734d8..4f45f48da7 100755
--- a/onnxruntime/test/optimizer/graph_transform_test.cc
+++ b/onnxruntime/test/optimizer/graph_transform_test.cc
@@ -5189,6 +5189,24 @@ TEST_F(GraphTransformationTests, MatMulIntegerToFloatTest) {
   EXPECT_EQ(op_to_count["Add"], 1);
 }
 
+#ifdef USE_DML
+TEST_F(GraphTransformationTests, MatMulIntegerToFloat16Test) {
+  constexpr const ORTCHAR_T* model_path = MODEL_FOLDER "fusion/matmul_integer_to_float16_int8.onnx";
+  std::shared_ptr<Model> loaded_model;
+  ASSERT_STATUS_OK(Model::Load(model_path, loaded_model, nullptr, *logger_));
+  Graph& graph = loaded_model->MainGraph();
+  // Place every node on the DML EP before running the fusion; exactly one fused node is expected.
+  for (auto& node : graph.Nodes()) {
+    node.SetExecutionProviderType(kDmlExecutionProvider);
+  }
+  onnxruntime::GraphTransformerManager transformer_mgr{5};
+  ASSERT_STATUS_OK(transformer_mgr.Register(std::make_unique<MatMulIntegerToFloatFusion>(), TransformerLevel::Level2));
+  ASSERT_STATUS_OK(transformer_mgr.ApplyTransformers(graph, TransformerLevel::Level2, *logger_));
+  std::map<std::string, int> fused_op_counts = CountOpsInGraph(graph);
+  EXPECT_EQ(fused_op_counts["com.microsoft.MatMulIntegerToFloat"], 1);
+}
+#endif  // USE_DML
+
 #endif
 
 #ifndef DISABLE_CONTRIB_OPS
diff --git a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py
index ff35f81171..60bdd92dc9 100644
--- a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py
+++ b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py
@@ -49,15 +49,15 @@ def MakeSubGraph(suffix, has_bias):  # noqa: N802
     return nodes
 
 
-def MakeInitializer(suffix, output_type_fp16=False):  # noqa: N802
+def MakeInitializer(suffix):  # noqa: N802
     return [
         helper.make_tensor("b_quantized" + suffix, TensorProto.UINT8, [2, 3], [2, 4, 5, 6, 7, 8]),
         helper.make_tensor("b_zp" + suffix, TensorProto.UINT8, [], [128]),
-        helper.make_tensor("b_scale" + suffix, TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [], [1.8]),
+        helper.make_tensor("b_scale" + suffix, TensorProto.FLOAT, [], [1.8]),
     ]
 
 
-def GenerateModel(model_name, output_type_fp16=False):  # noqa: N802
+def GenerateModel(model_name):  # noqa: N802
     nodes = [
         helper.make_node(
             "DynamicQuantizeLinear",
@@ -71,13 +71,13 @@ def GenerateModel(model_name, output_type_fp16=False):  # noqa: N802
     nodes.extend(MakeSubGraph("_3", False))
 
     initializers = []
-    initializers.extend(MakeInitializer("_1", output_type_fp16))
-    initializers.extend(MakeInitializer("_3", output_type_fp16))
+    initializers.extend(MakeInitializer("_1"))
+    initializers.extend(MakeInitializer("_3"))
 
     initializers.extend(
         [
-            helper.make_tensor("bias_1", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3], [2, 4, 5]),
-            helper.make_tensor("bias_2", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 3], [1, 2, 3, 4, 5, 6, 7, 8, 9]),
+            helper.make_tensor("bias_1", TensorProto.FLOAT, [3], [2, 4, 5]),
+            helper.make_tensor("bias_2", TensorProto.FLOAT, [3, 3], [1, 2, 3, 4, 5, 6, 7, 8, 9]),
         ]
     )
 
@@ -85,16 +85,16 @@ def GenerateModel(model_name, output_type_fp16=False):  # noqa: N802
         nodes,
         "MatMulIntegerToFloat_fusion",  # name
         [  # inputs
-            helper.make_tensor_value_info("input", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 2]),
+            helper.make_tensor_value_info("input", TensorProto.FLOAT, [3, 2]),
             # matrix b corresponding inputs for subgraph 2
             helper.make_tensor_value_info("b_quantized_2", TensorProto.UINT8, [2, 3]),
             helper.make_tensor_value_info("b_zp_2", TensorProto.UINT8, [1]),
-            helper.make_tensor_value_info("b_scale_2", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [1]),
+            helper.make_tensor_value_info("b_scale_2", TensorProto.FLOAT, [1]),
         ],
         [  # outputs
-            helper.make_tensor_value_info("output_1", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 3]),
-            helper.make_tensor_value_info("output_2", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 3]),
-            helper.make_tensor_value_info("output_3", TensorProto.FLOAT16 if output_type_fp16 else TensorProto.FLOAT, [3, 3]),
+            helper.make_tensor_value_info("output_1", TensorProto.FLOAT, [3, 3]),
+            helper.make_tensor_value_info("output_2", TensorProto.FLOAT, [3, 3]),
+            helper.make_tensor_value_info("output_3", TensorProto.FLOAT, [3, 3]),
         ],
         initializers,
     )
@@ -104,5 +104,4 @@ def GenerateModel(model_name, output_type_fp16=False):  # noqa: N802
 
 
 if __name__ == "__main__":
     GenerateModel("matmul_integer_to_float.onnx")
-    GenerateModel("matmul_integer_to_float16.onnx", output_type_fp16=True)
diff --git a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16.onnx b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16.onnx
deleted file mode 100644
index 67d50eac6f..0000000000
--- a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16.onnx
+++ /dev/null
@@ -1,90 +0,0 @@
-	:╓
-Q
-inputa_quantizeda_scalea_zpDynamicQuantizeLinear"DynamicQuantizeLinear
-a
-a_quantized
-b_quantized_1
-a_zp
-b_zp_1matmul_output_int32_1MatMulInteger_1"MatMulInteger
-4
-a_scale
-	b_scale_1multiplier_1mul_right_1"Mul
-G
-matmul_output_int32_1matmul_output_float_1cast_1"Cast*	
-toа
-F
-matmul_output_float_1
-multiplier_1mul_output_1mul_bottom_1"Mul
-1
-mul_output_1
-bias_1output_1
-bias_add_1"Add
-a
-a_quantized
-b_quantized_2
-a_zp
-b_zp_2matmul_output_int32_2MatMulInteger_2"MatMulInteger
-4
-a_scale
-	b_scale_2multiplier_2mul_right_2"Mul
-G
-matmul_output_int32_2matmul_output_float_2cast_2"Cast*	
-toа
-F
-matmul_output_float_2
-multiplier_2mul_output_2mul_bottom_2"Mul
-1
-mul_output_2
-bias_2output_2
-bias_add_2"Add
-a
-a_quantized
-b_quantized_3
-a_zp
-b_zp_3matmul_output_int32_3MatMulInteger_3"MatMulInteger
-4
-a_scale
-	b_scale_3multiplier_3mul_right_3"Mul
-G
-matmul_output_int32_3matmul_output_float_3cast_3"Cast*	
-toа
-B
-matmul_output_float_3
-multiplier_3output_3mul_bottom_3"MulMatMulIntegerToFloat_fusion**Bb_quantized_1**АBb_zp_1*
-*│~B	b_scale_1**Bb_quantized_3**АBb_zp_3*
-*│~B	b_scale_3*
-*	АААИАКBbias_1**
-*АxАААДАИАКАМАОАРАСBbias_2Z
-input
-
-
-
-Z
-b_quantized_2
-
-
-Z
-b_zp_2
-
-
-Z
-	b_scale_2
-
-
-
-b
-output_1
-
-
-
-b
-output_2
-
-
-
-b
-output_3
-
-
-
-B
\ No newline at end of file
diff --git a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx
new file mode 100644
index 0000000000..22293b0d10
--- /dev/null
+++ b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float16_int8.onnx
@@ -0,0 +1,51 @@
+	:╠
+U
+A
+B
+a_zero_point
+b_zero_pointmatmul_output_int32MatMulInteger"MatMulInteger
+.
+a_scale
+b_scale
+multiplier	mul_right"Mul
+A
+matmul_output_int32matmul_output_floatcast"Cast*	
+to
+а
+5
+matmul_output_float
+
+multiplierY
+mul_bottom"MulDynamicQuantizeMatMul_fusionZ
+A
+
+
+M
+KZ
+B
+
+
+K
+NZ
+a_scale
+
+
+
+Z
+b_scale
+	
+
+CZ
+a_zero_point
+
+
+Z
+b_zero_point
+	
+Cb
+Y
+
+
+
+M
+NB
\ No newline at end of file