diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake index e850a3703b..78b738cd29 100644 --- a/cmake/onnxruntime_providers.cmake +++ b/cmake/onnxruntime_providers.cmake @@ -1131,6 +1131,7 @@ if (onnxruntime_USE_DML) target_add_dml(onnxruntime_providers_dml) target_link_libraries(onnxruntime_providers_dml PRIVATE onnxruntime_common) + target_link_libraries(onnxruntime_providers_dml PRIVATE onnxruntime_framework) onnxruntime_add_include_to_target(onnxruntime_providers_dml onnxruntime_common) if (GDK_PLATFORM STREQUAL Scarlett) target_link_libraries(onnxruntime_providers_dml PRIVATE ${gdk_dx_libs}) diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc index cb072f4832..fe211b0681 100644 --- a/onnxruntime/core/optimizer/graph_transformer_utils.cc +++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc @@ -274,7 +274,7 @@ InlinedVector> GenerateTransformers( transformers.emplace_back(std::make_unique(cpu_cuda_rocm_eps)); transformers.emplace_back(std::make_unique(cpu_cuda_rocm_eps)); - transformers.emplace_back(std::make_unique(cpu_cuda_rocm_eps)); + transformers.emplace_back(std::make_unique(cpu_cuda_dml_rocm_eps)); transformers.emplace_back(std::make_unique(cpu_cuda_dml_rocm_eps)); transformers.emplace_back(std::make_unique(cpu_cuda_rocm_eps)); transformers.emplace_back(std::make_unique(cuda_rocm_eps)); diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp index 2a51d122cf..f8887f8b85 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorHelper.cpp @@ -4,6 +4,7 @@ #include "precomp.h" #include "OperatorHelper.h" #include "core/providers/common.h" +#include "core/providers/cpu/math/matmul_helper.h" namespace OperatorHelper { @@ -614,11 +615,11 @@ namespace OperatorHelper ML_CHECK_VALID_ARGUMENT(dimensionCount > 2, "FusedMatMul operator: Tensor size should be more than 2, if attribute transBatch is true"); - std::rotate(newSizes.begin(), newSizes.end() - 2, newSizes.end() - 1); - std::rotate(newStrides.begin(), newStrides.end() - 2, newStrides.end() - 1); + std::rotate(newSizes.begin(), newSizes.begin() + 1, newSizes.end() - 1); + std::rotate(newStrides.begin(), newStrides.begin() + 1, newStrides.end() - 1); } - if (transpose) + if (transpose && dimensionCount > 1) { std::swap(newStrides[dimensionCount - 2], newStrides[dimensionCount - 1]); std::swap(newSizes[dimensionCount - 2], newSizes[dimensionCount - 1]); @@ -1668,65 +1669,31 @@ namespace OperatorHelper { ML_CHECK_VALID_ARGUMENT(shapeInfo.GetInputCount() == 2); - // Following numpy.matmul for shape inference: - // https://docs.scipy.org/doc/numpy/reference/generated/numpy.matmul.html - // The behavior depends on the arguments in the following way. - // * If both arguments are 2 - D they are multiplied like conventional matrices. - // * If either argument is N - D, N > 2, it is treated as a stack of matrices residing in the last two indexes and broadcast accordingly. - // * If the first argument is 1 - D, it is promoted to a matrix by prepending a 1 to its dimensions. After matrix multiplication the prepended 1 is removed. - // * If the second argument is 1 - D, it is promoted to a matrix by appending a 1 to its dimensions. After matrix multiplication the appended 1 is removed. - auto inputShape0 = shapeInfo.GetInputTensorShape(0); auto inputShape1 = shapeInfo.GetInputTensorShape(1); ML_CHECK_VALID_ARGUMENT(inputShape0.size() >= 1); ML_CHECK_VALID_ARGUMENT(inputShape1.size() >= 1); - auto [sizesA, stridesA] = GetFusedMatMulSizesAndStrides( - inputShape0, - shapeInfo.GetOptionalAttribute(AttrName::TransBatchA, -1), - shapeInfo.GetOptionalAttribute(AttrName::TransA, -1) - ); - inputShape0 = sizesA; + std::vector aSizes(inputShape0.begin(), inputShape0.end()); + std::vector bSizes(inputShape1.begin(), inputShape1.end()); + auto transAAttr = shapeInfo.GetOptionalAttribute(AttrName::TransA, 0); + auto transBAttr = shapeInfo.GetOptionalAttribute(AttrName::TransB, 0); - auto [sizesB, stridesB] = GetFusedMatMulSizesAndStrides( - inputShape1, - shapeInfo.GetOptionalAttribute(AttrName::TransBatchB, -1), - shapeInfo.GetOptionalAttribute(AttrName::TransB, -1) - ); - inputShape1 = sizesB; + const bool transA = transAAttr && aSizes.size() != 1; + const bool transB = transBAttr && bSizes.size() != 1; + auto transBatchA = shapeInfo.GetOptionalAttribute(AttrName::TransBatchA, 0); + auto transBatchB = shapeInfo.GetOptionalAttribute(AttrName::TransBatchB, 0); - std::vector outputMatrixDims; + onnxruntime::MatMulComputeHelper helper; + ML_CHECK_VALID_ARGUMENT(helper.Compute(onnxruntime::TensorShape(aSizes), onnxruntime::TensorShape(bSizes), transA, transB, transBatchA, transBatchB, false).IsOK()); - // Modify the input and truncated output shapes per the above comments. - // The extra dimensions of the output beyond the two matrix dimensions - // will be computed afterward by broadcasting. - if (inputShape0.size() == 1) - { - inputShape0.insert(inputShape0.begin(), 1); - } - else - { - outputMatrixDims.push_back(inputShape0[inputShape0.size() - 2]); - } + auto outputDims = helper.OutputShape().AsShapeVector(); - if (inputShape1.size() == 1) - { - inputShape1.push_back(1); - } - else - { - outputMatrixDims.push_back(inputShape1[inputShape1.size() - 1]); - } + std::vector outputShape; + outputShape.reserve(outputDims.size()); + std::transform(outputDims.begin(), outputDims.end(), std::back_inserter(outputShape), [](int64_t dimSize){ return static_cast(dimSize); }); - // Remove the matrix dimensions from each input, resulting in broadcastable shapes. - std::vector batchDims0(inputShape0.begin(), inputShape0.end() - 2); - std::vector batchDims1(inputShape1.begin(), inputShape1.end() - 2); - - // Broadcast the extra dimensions of each input, then add the truncated matrix dimensions. - std::vector outputDims = BroadcastTensorShape(batchDims0, batchDims1); - outputDims.insert(outputDims.end(), outputMatrixDims.begin(), outputMatrixDims.end()); - - return {std::move(outputDims)}; + return {std::move(outputShape)}; } void TopKHelper::Initialize( diff --git a/onnxruntime/test/contrib_ops/fused_matmul_op_test.cc b/onnxruntime/test/contrib_ops/fused_matmul_op_test.cc index 74e0334d09..bde044bfb9 100644 --- a/onnxruntime/test/contrib_ops/fused_matmul_op_test.cc +++ b/onnxruntime/test/contrib_ops/fused_matmul_op_test.cc @@ -229,28 +229,16 @@ TEST(FusedMatMulOpTest, DoubleTypeNoTranspose) { #endif TEST(FusedMatMulOpTest, FloatTypeTransposeA) { - // TODO: Unskip when fixed #41968513 - if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: Assertion failed: vector subscript out of range"; - } RunFusedMatMulTest("FusedMatMul", 1, true, false); } TEST(FusedMatMulOpTest, FloatTypeTransposeB) { - // TODO: Unskip when fixed #41968513 - if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: Assertion failed: vector subscript out of range"; - } RunFusedMatMulTest("FusedMatMul", 1, false, true); // b is constant. This tests weight packing logic RunFusedMatMulTest("FusedMatMul", 1, false, true, false, false, 1.0f, true); } TEST(FusedMatMulOpTest, FloatTypeTransposeAB) { - // TODO: Unskip when fixed #41968513 - if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: Assertion failed: vector subscript out of range"; - } RunFusedMatMulTest("FusedMatMul", 1, true, true); // b is constant. This tests weight packing logic @@ -258,10 +246,6 @@ TEST(FusedMatMulOpTest, FloatTypeTransposeAB) { } TEST(FusedMatMulOpTest, FloatTypeScale) { - // TODO: Unskip when fixed #41968513 - if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: Assertion failed: vector subscript out of range"; - } RunFusedMatMulTest("FusedMatMul", 1, false, false, false, false, 0.5f); RunFusedMatMulTest("FusedMatMul", 1, true, false, false, false, 2.0f); RunFusedMatMulTest("FusedMatMul", 1, true, true, false, false, 4.0f); @@ -273,11 +257,6 @@ TEST(FusedMatMulOpTest, FloatTypeScale) { } TEST(FusedMatMulOpTest, FloatTypeTransposeBatch) { - // TODO: Unskip when fixed #41968513 - if (DefaultDmlExecutionProvider().get() != nullptr) { - GTEST_SKIP() << "Skipping because of the following error: DmlCommandRecorder.cpp(338): The parameter is incorrect"; - } - RunFusedMatMulTest("FusedMatMul", 1, false, false, true, false); RunFusedMatMulTest("FusedMatMul", 1, false, false, false, true); RunFusedMatMulTest("FusedMatMul", 1, false, false, true, true, 0.5f); @@ -292,7 +271,7 @@ TEST(FusedMatMulOpTest, FloatTypeTransposeBatch) { RunFusedMatMulTest("FusedMatMul", 1, true, true, true, true); } -#if defined(USE_CUDA) || defined(USE_ROCM) +#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML) TEST(FusedMatMulOpTest, Float16_NoTranspose) { #ifdef USE_CUDA int min_cuda_architecture = 530; diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc index 206f934a34..0507aa789d 100644 --- a/onnxruntime/test/providers/cpu/model_tests.cc +++ b/onnxruntime/test/providers/cpu/model_tests.cc @@ -94,9 +94,9 @@ TEST_P(ModelTest, Run) { std::basic_string model_path = param.substr(pos + 1); double per_sample_tolerance = 1e-3; // when cuda is enabled, set it to a larger value for resolving random MNIST test failure - // when openvino is enabled, set it to a larger value for resolving MNIST accuracy mismatch + // when openvino or dml are enabled, set it to a larger value for resolving MNIST accuracy mismatch double relative_per_sample_tolerance = 1e-3; - if (provider_name == "openvino") { + if (provider_name == "openvino" || provider_name == "dml") { relative_per_sample_tolerance = 0.009; }