[DML EP] Add FusedMatMul (#14196)

### Description
Add FusedMatMul fusion support to the DML execution provider (DML EP).



### Motivation and Context
- Add the FusedMatMul fusion for DML
- Fix the FusedMatMul logic and tests for cases that involve transposed batches
This commit is contained in:
Patrice Vignola 2023-01-12 02:17:04 -08:00 committed by GitHub
parent 712f781702
commit 99a4036c80
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 24 additions and 77 deletions

View file

@ -1131,6 +1131,7 @@ if (onnxruntime_USE_DML)
target_add_dml(onnxruntime_providers_dml)
target_link_libraries(onnxruntime_providers_dml PRIVATE onnxruntime_common)
target_link_libraries(onnxruntime_providers_dml PRIVATE onnxruntime_framework)
onnxruntime_add_include_to_target(onnxruntime_providers_dml onnxruntime_common)
if (GDK_PLATFORM STREQUAL Scarlett)
target_link_libraries(onnxruntime_providers_dml PRIVATE ${gdk_dx_libs})

View file

@ -274,7 +274,7 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
transformers.emplace_back(std::make_unique<GatherToSplitFusion>(cpu_cuda_rocm_eps));
transformers.emplace_back(std::make_unique<GatherToSliceFusion>(cpu_cuda_rocm_eps));
transformers.emplace_back(std::make_unique<MatmulTransposeFusion>(cpu_cuda_rocm_eps));
transformers.emplace_back(std::make_unique<MatmulTransposeFusion>(cpu_cuda_dml_rocm_eps));
transformers.emplace_back(std::make_unique<BiasGeluFusion>(cpu_cuda_dml_rocm_eps));
transformers.emplace_back(std::make_unique<BiasSoftmaxFusion>(cpu_cuda_rocm_eps));
transformers.emplace_back(std::make_unique<BiasDropoutFusion>(cuda_rocm_eps));

View file

@ -4,6 +4,7 @@
#include "precomp.h"
#include "OperatorHelper.h"
#include "core/providers/common.h"
#include "core/providers/cpu/math/matmul_helper.h"
namespace OperatorHelper
{
@ -614,11 +615,11 @@ namespace OperatorHelper
ML_CHECK_VALID_ARGUMENT(dimensionCount > 2,
"FusedMatMul operator: Tensor size should be more than 2, if attribute transBatch is true");
std::rotate(newSizes.begin(), newSizes.end() - 2, newSizes.end() - 1);
std::rotate(newStrides.begin(), newStrides.end() - 2, newStrides.end() - 1);
std::rotate(newSizes.begin(), newSizes.begin() + 1, newSizes.end() - 1);
std::rotate(newStrides.begin(), newStrides.begin() + 1, newStrides.end() - 1);
}
if (transpose)
if (transpose && dimensionCount > 1)
{
std::swap(newStrides[dimensionCount - 2], newStrides[dimensionCount - 1]);
std::swap(newSizes[dimensionCount - 2], newSizes[dimensionCount - 1]);
@ -1668,65 +1669,31 @@ namespace OperatorHelper
{
ML_CHECK_VALID_ARGUMENT(shapeInfo.GetInputCount() == 2);
// Following numpy.matmul for shape inference:
// https://docs.scipy.org/doc/numpy/reference/generated/numpy.matmul.html
// The behavior depends on the arguments in the following way.
// * If both arguments are 2 - D they are multiplied like conventional matrices.
// * If either argument is N - D, N > 2, it is treated as a stack of matrices residing in the last two indexes and broadcast accordingly.
// * If the first argument is 1 - D, it is promoted to a matrix by prepending a 1 to its dimensions. After matrix multiplication the prepended 1 is removed.
// * If the second argument is 1 - D, it is promoted to a matrix by appending a 1 to its dimensions. After matrix multiplication the appended 1 is removed.
auto inputShape0 = shapeInfo.GetInputTensorShape(0);
auto inputShape1 = shapeInfo.GetInputTensorShape(1);
ML_CHECK_VALID_ARGUMENT(inputShape0.size() >= 1);
ML_CHECK_VALID_ARGUMENT(inputShape1.size() >= 1);
auto [sizesA, stridesA] = GetFusedMatMulSizesAndStrides(
inputShape0,
shapeInfo.GetOptionalAttribute(AttrName::TransBatchA, -1),
shapeInfo.GetOptionalAttribute(AttrName::TransA, -1)
);
inputShape0 = sizesA;
std::vector<int64_t> aSizes(inputShape0.begin(), inputShape0.end());
std::vector<int64_t> bSizes(inputShape1.begin(), inputShape1.end());
auto transAAttr = shapeInfo.GetOptionalAttribute<int64_t>(AttrName::TransA, 0);
auto transBAttr = shapeInfo.GetOptionalAttribute<int64_t>(AttrName::TransB, 0);
auto [sizesB, stridesB] = GetFusedMatMulSizesAndStrides(
inputShape1,
shapeInfo.GetOptionalAttribute(AttrName::TransBatchB, -1),
shapeInfo.GetOptionalAttribute(AttrName::TransB, -1)
);
inputShape1 = sizesB;
const bool transA = transAAttr && aSizes.size() != 1;
const bool transB = transBAttr && bSizes.size() != 1;
auto transBatchA = shapeInfo.GetOptionalAttribute<int64_t>(AttrName::TransBatchA, 0);
auto transBatchB = shapeInfo.GetOptionalAttribute<int64_t>(AttrName::TransBatchB, 0);
std::vector<uint32_t> outputMatrixDims;
onnxruntime::MatMulComputeHelper helper;
ML_CHECK_VALID_ARGUMENT(helper.Compute(onnxruntime::TensorShape(aSizes), onnxruntime::TensorShape(bSizes), transA, transB, transBatchA, transBatchB, false).IsOK());
// Modify the input and truncated output shapes per the above comments.
// The extra dimensions of the output beyond the two matrix dimensions
// will be computed afterward by broadcasting.
if (inputShape0.size() == 1)
{
inputShape0.insert(inputShape0.begin(), 1);
}
else
{
outputMatrixDims.push_back(inputShape0[inputShape0.size() - 2]);
}
auto outputDims = helper.OutputShape().AsShapeVector();
if (inputShape1.size() == 1)
{
inputShape1.push_back(1);
}
else
{
outputMatrixDims.push_back(inputShape1[inputShape1.size() - 1]);
}
std::vector<uint32_t> outputShape;
outputShape.reserve(outputDims.size());
std::transform(outputDims.begin(), outputDims.end(), std::back_inserter(outputShape), [](int64_t dimSize){ return static_cast<uint32_t>(dimSize); });
// Remove the matrix dimensions from each input, resulting in broadcastable shapes.
std::vector<uint32_t> batchDims0(inputShape0.begin(), inputShape0.end() - 2);
std::vector<uint32_t> batchDims1(inputShape1.begin(), inputShape1.end() - 2);
// Broadcast the extra dimensions of each input, then add the truncated matrix dimensions.
std::vector<uint32_t> outputDims = BroadcastTensorShape(batchDims0, batchDims1);
outputDims.insert(outputDims.end(), outputMatrixDims.begin(), outputMatrixDims.end());
return {std::move(outputDims)};
return {std::move(outputShape)};
}
void TopKHelper::Initialize(

View file

@ -229,28 +229,16 @@ TEST(FusedMatMulOpTest, DoubleTypeNoTranspose) {
#endif
TEST(FusedMatMulOpTest, FloatTypeTransposeA) {
// TODO: Unskip when fixed #41968513
if (DefaultDmlExecutionProvider().get() != nullptr) {
GTEST_SKIP() << "Skipping because of the following error: Assertion failed: vector subscript out of range";
}
RunFusedMatMulTest<float>("FusedMatMul", 1, true, false);
}
TEST(FusedMatMulOpTest, FloatTypeTransposeB) {
// TODO: Unskip when fixed #41968513
if (DefaultDmlExecutionProvider().get() != nullptr) {
GTEST_SKIP() << "Skipping because of the following error: Assertion failed: vector subscript out of range";
}
RunFusedMatMulTest<float>("FusedMatMul", 1, false, true);
// b is constant. This tests weight packing logic
RunFusedMatMulTest<float>("FusedMatMul", 1, false, true, false, false, 1.0f, true);
}
TEST(FusedMatMulOpTest, FloatTypeTransposeAB) {
// TODO: Unskip when fixed #41968513
if (DefaultDmlExecutionProvider().get() != nullptr) {
GTEST_SKIP() << "Skipping because of the following error: Assertion failed: vector subscript out of range";
}
RunFusedMatMulTest<float>("FusedMatMul", 1, true, true);
// b is constant. This tests weight packing logic
@ -258,10 +246,6 @@ TEST(FusedMatMulOpTest, FloatTypeTransposeAB) {
}
TEST(FusedMatMulOpTest, FloatTypeScale) {
// TODO: Unskip when fixed #41968513
if (DefaultDmlExecutionProvider().get() != nullptr) {
GTEST_SKIP() << "Skipping because of the following error: Assertion failed: vector subscript out of range";
}
RunFusedMatMulTest<float>("FusedMatMul", 1, false, false, false, false, 0.5f);
RunFusedMatMulTest<float>("FusedMatMul", 1, true, false, false, false, 2.0f);
RunFusedMatMulTest<float>("FusedMatMul", 1, true, true, false, false, 4.0f);
@ -273,11 +257,6 @@ TEST(FusedMatMulOpTest, FloatTypeScale) {
}
TEST(FusedMatMulOpTest, FloatTypeTransposeBatch) {
// TODO: Unskip when fixed #41968513
if (DefaultDmlExecutionProvider().get() != nullptr) {
GTEST_SKIP() << "Skipping because of the following error: DmlCommandRecorder.cpp(338): The parameter is incorrect";
}
RunFusedMatMulTest<float>("FusedMatMul", 1, false, false, true, false);
RunFusedMatMulTest<float>("FusedMatMul", 1, false, false, false, true);
RunFusedMatMulTest<float>("FusedMatMul", 1, false, false, true, true, 0.5f);
@ -292,7 +271,7 @@ TEST(FusedMatMulOpTest, FloatTypeTransposeBatch) {
RunFusedMatMulTest<float>("FusedMatMul", 1, true, true, true, true);
}
#if defined(USE_CUDA) || defined(USE_ROCM)
#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML)
TEST(FusedMatMulOpTest, Float16_NoTranspose) {
#ifdef USE_CUDA
int min_cuda_architecture = 530;

View file

@ -94,9 +94,9 @@ TEST_P(ModelTest, Run) {
std::basic_string<ORTCHAR_T> model_path = param.substr(pos + 1);
double per_sample_tolerance = 1e-3;
// when cuda is enabled, set it to a larger value for resolving random MNIST test failure
// when openvino is enabled, set it to a larger value for resolving MNIST accuracy mismatch
// when openvino or dml are enabled, set it to a larger value for resolving MNIST accuracy mismatch
double relative_per_sample_tolerance = 1e-3;
if (provider_name == "openvino") {
if (provider_name == "openvino" || provider_name == "dml") {
relative_per_sample_tolerance = 0.009;
}