[DML EP] Add FusedMatMul (#14196)

### Description

Add FusedMatMul

### Motivation and Context

- Add the FusedMatMul fusion for DML
- Fix the FusedMatMul logic and tests when transposed batches are involved
2026-07-20 19:12:24 +00:00 · 2023-01-12 02:17:04 -08:00 · 2023-01-12 02:17:04 -08:00 · 99a4036c80
commit 99a4036c80
parent 712f781702
5 changed files with 24 additions and 77 deletions
--- a/cmake/onnxruntime_providers.cmake
+++ b/cmake/onnxruntime_providers.cmake
@ -1131,6 +1131,7 @@ if (onnxruntime_USE_DML)

  target_add_dml(onnxruntime_providers_dml)
  target_link_libraries(onnxruntime_providers_dml PRIVATE onnxruntime_common)
+  target_link_libraries(onnxruntime_providers_dml PRIVATE onnxruntime_framework)
  onnxruntime_add_include_to_target(onnxruntime_providers_dml onnxruntime_common)
  if (GDK_PLATFORM STREQUAL Scarlett)
    target_link_libraries(onnxruntime_providers_dml PRIVATE ${gdk_dx_libs})
--- a/onnxruntime/core/optimizer/graph_transformer_utils.cc
+++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc
@ -274,7 +274,7 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
      transformers.emplace_back(std::make_unique<GatherToSplitFusion>(cpu_cuda_rocm_eps));
      transformers.emplace_back(std::make_unique<GatherToSliceFusion>(cpu_cuda_rocm_eps));

-      transformers.emplace_back(std::make_unique<MatmulTransposeFusion>(cpu_cuda_rocm_eps));
+      transformers.emplace_back(std::make_unique<MatmulTransposeFusion>(cpu_cuda_dml_rocm_eps));
      transformers.emplace_back(std::make_unique<BiasGeluFusion>(cpu_cuda_dml_rocm_eps));
      transformers.emplace_back(std::make_unique<BiasSoftmaxFusion>(cpu_cuda_rocm_eps));
      transformers.emplace_back(std::make_unique<BiasDropoutFusion>(cuda_rocm_eps));
--- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp
+++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp
@ -4,6 +4,7 @@
 #include "precomp.h"
 #include "OperatorHelper.h"
 #include "core/providers/common.h"
+#include "core/providers/cpu/math/matmul_helper.h"

 namespace OperatorHelper
 {
@ -614,11 +615,11 @@ namespace OperatorHelper
            ML_CHECK_VALID_ARGUMENT(dimensionCount > 2,
                "FusedMatMul operator: Tensor size should be more than 2, if attribute transBatch is true");

-            std::rotate(newSizes.begin(), newSizes.end() - 2, newSizes.end() - 1);
-            std::rotate(newStrides.begin(), newStrides.end() - 2, newStrides.end() - 1);
+            std::rotate(newSizes.begin(), newSizes.begin() + 1, newSizes.end() - 1);
+            std::rotate(newStrides.begin(), newStrides.begin() + 1, newStrides.end() - 1);
        }

-        if (transpose)
+        if (transpose && dimensionCount > 1)
        {
            std::swap(newStrides[dimensionCount - 2], newStrides[dimensionCount - 1]);
            std::swap(newSizes[dimensionCount - 2], newSizes[dimensionCount - 1]);
@ -1668,65 +1669,31 @@ namespace OperatorHelper
    {
        ML_CHECK_VALID_ARGUMENT(shapeInfo.GetInputCount() == 2);

-        // Following numpy.matmul for shape inference:
-        // https://docs.scipy.org/doc/numpy/reference/generated/numpy.matmul.html
-        // The behavior depends on the arguments in the following way.
-        // * If both arguments are 2 - D they are multiplied like conventional matrices.
-        // * If either argument is N - D, N > 2, it is treated as a stack of matrices residing in the last two indexes and broadcast accordingly.
-        // * If the first argument is 1 - D, it is promoted to a matrix by prepending a 1 to its dimensions. After matrix multiplication the prepended 1 is removed.
-        // * If the second argument is 1 - D, it is promoted to a matrix by appending a 1 to its dimensions. After matrix multiplication the appended 1 is removed.
-
        auto inputShape0 = shapeInfo.GetInputTensorShape(0);
        auto inputShape1 = shapeInfo.GetInputTensorShape(1);
        ML_CHECK_VALID_ARGUMENT(inputShape0.size() >= 1);
        ML_CHECK_VALID_ARGUMENT(inputShape1.size() >= 1);

-        auto [sizesA, stridesA] = GetFusedMatMulSizesAndStrides(
-            inputShape0,
-            shapeInfo.GetOptionalAttribute(AttrName::TransBatchA, -1),
-            shapeInfo.GetOptionalAttribute(AttrName::TransA, -1)
-        );
-        inputShape0 = sizesA;
+        std::vector<int64_t> aSizes(inputShape0.begin(), inputShape0.end());
+        std::vector<int64_t> bSizes(inputShape1.begin(), inputShape1.end());
+        auto transAAttr = shapeInfo.GetOptionalAttribute<int64_t>(AttrName::TransA, 0);
+        auto transBAttr = shapeInfo.GetOptionalAttribute<int64_t>(AttrName::TransB, 0);

-        auto [sizesB, stridesB] = GetFusedMatMulSizesAndStrides(
-            inputShape1,
-            shapeInfo.GetOptionalAttribute(AttrName::TransBatchB, -1),
-            shapeInfo.GetOptionalAttribute(AttrName::TransB, -1)
-        );
-        inputShape1 = sizesB;
+        const bool transA = transAAttr && aSizes.size() != 1;
+        const bool transB = transBAttr && bSizes.size() != 1;
+        auto transBatchA = shapeInfo.GetOptionalAttribute<int64_t>(AttrName::TransBatchA, 0);
+        auto transBatchB = shapeInfo.GetOptionalAttribute<int64_t>(AttrName::TransBatchB, 0);

-        std::vector<uint32_t> outputMatrixDims;
+        onnxruntime::MatMulComputeHelper helper;
+        ML_CHECK_VALID_ARGUMENT(helper.Compute(onnxruntime::TensorShape(aSizes), onnxruntime::TensorShape(bSizes), transA, transB, transBatchA, transBatchB, false).IsOK());

-        // Modify the input and truncated output shapes per the above comments.
-        // The extra dimensions of the output beyond the two matrix dimensions
-        // will be computed afterward by broadcasting.
-        if (inputShape0.size() == 1)
-        {
-            inputShape0.insert(inputShape0.begin(), 1);
-        }
-        else
-        {
-            outputMatrixDims.push_back(inputShape0[inputShape0.size() - 2]);
-        }
+        auto outputDims = helper.OutputShape().AsShapeVector();

-        if (inputShape1.size() == 1)
-        {
-            inputShape1.push_back(1);
-        }
-        else
-        {
-            outputMatrixDims.push_back(inputShape1[inputShape1.size() - 1]);
-        }
+        std::vector<uint32_t> outputShape;
+        outputShape.reserve(outputDims.size());
+        std::transform(outputDims.begin(), outputDims.end(), std::back_inserter(outputShape), [](int64_t dimSize){ return static_cast<uint32_t>(dimSize); });

-        // Remove the matrix dimensions from each input, resulting in broadcastable shapes.
-        std::vector<uint32_t> batchDims0(inputShape0.begin(), inputShape0.end() - 2);
-        std::vector<uint32_t> batchDims1(inputShape1.begin(), inputShape1.end() - 2);
-
-        // Broadcast the extra dimensions of each input, then add the truncated matrix dimensions.
-        std::vector<uint32_t> outputDims = BroadcastTensorShape(batchDims0, batchDims1);
-        outputDims.insert(outputDims.end(), outputMatrixDims.begin(), outputMatrixDims.end());
-
-        return {std::move(outputDims)};
+        return {std::move(outputShape)};
    }

    void TopKHelper::Initialize(
--- a/onnxruntime/test/contrib_ops/fused_matmul_op_test.cc
+++ b/onnxruntime/test/contrib_ops/fused_matmul_op_test.cc
@ -229,28 +229,16 @@ TEST(FusedMatMulOpTest, DoubleTypeNoTranspose) {
 #endif

 TEST(FusedMatMulOpTest, FloatTypeTransposeA) {
-  // TODO: Unskip when fixed #41968513
-  if (DefaultDmlExecutionProvider().get() != nullptr) {
-    GTEST_SKIP() << "Skipping because of the following error: Assertion failed: vector subscript out of range";
-  }
  RunFusedMatMulTest<float>("FusedMatMul", 1, true, false);
 }

 TEST(FusedMatMulOpTest, FloatTypeTransposeB) {
-  // TODO: Unskip when fixed #41968513
-  if (DefaultDmlExecutionProvider().get() != nullptr) {
-    GTEST_SKIP() << "Skipping because of the following error: Assertion failed: vector subscript out of range";
-  }
  RunFusedMatMulTest<float>("FusedMatMul", 1, false, true);
  // b is constant. This tests weight packing logic
  RunFusedMatMulTest<float>("FusedMatMul", 1, false, true, false, false, 1.0f, true);
 }

 TEST(FusedMatMulOpTest, FloatTypeTransposeAB) {
-  // TODO: Unskip when fixed #41968513
-  if (DefaultDmlExecutionProvider().get() != nullptr) {
-    GTEST_SKIP() << "Skipping because of the following error: Assertion failed: vector subscript out of range";
-  }
  RunFusedMatMulTest<float>("FusedMatMul", 1, true, true);

  // b is constant. This tests weight packing logic
@ -258,10 +246,6 @@ TEST(FusedMatMulOpTest, FloatTypeTransposeAB) {
 }

 TEST(FusedMatMulOpTest, FloatTypeScale) {
-  // TODO: Unskip when fixed #41968513
-  if (DefaultDmlExecutionProvider().get() != nullptr) {
-    GTEST_SKIP() << "Skipping because of the following error: Assertion failed: vector subscript out of range";
-  }
  RunFusedMatMulTest<float>("FusedMatMul", 1, false, false, false, false, 0.5f);
  RunFusedMatMulTest<float>("FusedMatMul", 1, true, false, false, false, 2.0f);
  RunFusedMatMulTest<float>("FusedMatMul", 1, true, true, false, false, 4.0f);
@ -273,11 +257,6 @@ TEST(FusedMatMulOpTest, FloatTypeScale) {
 }

 TEST(FusedMatMulOpTest, FloatTypeTransposeBatch) {
-  // TODO: Unskip when fixed #41968513
-  if (DefaultDmlExecutionProvider().get() != nullptr) {
-    GTEST_SKIP() << "Skipping because of the following error: DmlCommandRecorder.cpp(338): The parameter is incorrect";
-  }
-
  RunFusedMatMulTest<float>("FusedMatMul", 1, false, false, true, false);
  RunFusedMatMulTest<float>("FusedMatMul", 1, false, false, false, true);
  RunFusedMatMulTest<float>("FusedMatMul", 1, false, false, true, true, 0.5f);
@ -292,7 +271,7 @@ TEST(FusedMatMulOpTest, FloatTypeTransposeBatch) {
  RunFusedMatMulTest<float>("FusedMatMul", 1, true, true, true, true);
 }

-#if defined(USE_CUDA) || defined(USE_ROCM)
+#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML)
 TEST(FusedMatMulOpTest, Float16_NoTranspose) {
 #ifdef USE_CUDA
  int min_cuda_architecture = 530;
--- a/onnxruntime/test/providers/cpu/model_tests.cc
+++ b/onnxruntime/test/providers/cpu/model_tests.cc
@ -94,9 +94,9 @@ TEST_P(ModelTest, Run) {
  std::basic_string<ORTCHAR_T> model_path = param.substr(pos + 1);
  double per_sample_tolerance = 1e-3;
  // when cuda is enabled, set it to a larger value for resolving random MNIST test failure
-  // when openvino is enabled, set it to a larger value for resolving MNIST accuracy mismatch
+  // when openvino or dml is enabled, set it to a larger value for resolving MNIST accuracy mismatch
  double relative_per_sample_tolerance = 1e-3;
-  if (provider_name == "openvino") {
+  if (provider_name == "openvino" || provider_name == "dml") {
    relative_per_sample_tolerance = 0.009;
  }