mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-07-03 03:58:54 +00:00
[DML EP] Add FusedMatMul (#14196)
### Description

Add FusedMatMul

### Motivation and Context

- Add the FusedMatMul fusion for DML
- Fix the FusedMatMul logic and tests when transposed batches are involved
This commit is contained in:
parent
712f781702
commit
99a4036c80
5 changed files with 24 additions and 77 deletions
|
|
@ -1131,6 +1131,7 @@ if (onnxruntime_USE_DML)
|
|||
|
||||
target_add_dml(onnxruntime_providers_dml)
|
||||
target_link_libraries(onnxruntime_providers_dml PRIVATE onnxruntime_common)
|
||||
target_link_libraries(onnxruntime_providers_dml PRIVATE onnxruntime_framework)
|
||||
onnxruntime_add_include_to_target(onnxruntime_providers_dml onnxruntime_common)
|
||||
if (GDK_PLATFORM STREQUAL Scarlett)
|
||||
target_link_libraries(onnxruntime_providers_dml PRIVATE ${gdk_dx_libs})
|
||||
|
|
|
|||
|
|
@ -274,7 +274,7 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
|
|||
transformers.emplace_back(std::make_unique<GatherToSplitFusion>(cpu_cuda_rocm_eps));
|
||||
transformers.emplace_back(std::make_unique<GatherToSliceFusion>(cpu_cuda_rocm_eps));
|
||||
|
||||
transformers.emplace_back(std::make_unique<MatmulTransposeFusion>(cpu_cuda_rocm_eps));
|
||||
transformers.emplace_back(std::make_unique<MatmulTransposeFusion>(cpu_cuda_dml_rocm_eps));
|
||||
transformers.emplace_back(std::make_unique<BiasGeluFusion>(cpu_cuda_dml_rocm_eps));
|
||||
transformers.emplace_back(std::make_unique<BiasSoftmaxFusion>(cpu_cuda_rocm_eps));
|
||||
transformers.emplace_back(std::make_unique<BiasDropoutFusion>(cuda_rocm_eps));
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@
|
|||
#include "precomp.h"
|
||||
#include "OperatorHelper.h"
|
||||
#include "core/providers/common.h"
|
||||
#include "core/providers/cpu/math/matmul_helper.h"
|
||||
|
||||
namespace OperatorHelper
|
||||
{
|
||||
|
|
@ -614,11 +615,11 @@ namespace OperatorHelper
|
|||
ML_CHECK_VALID_ARGUMENT(dimensionCount > 2,
|
||||
"FusedMatMul operator: Tensor size should be more than 2, if attribute transBatch is true");
|
||||
|
||||
std::rotate(newSizes.begin(), newSizes.end() - 2, newSizes.end() - 1);
|
||||
std::rotate(newStrides.begin(), newStrides.end() - 2, newStrides.end() - 1);
|
||||
std::rotate(newSizes.begin(), newSizes.begin() + 1, newSizes.end() - 1);
|
||||
std::rotate(newStrides.begin(), newStrides.begin() + 1, newStrides.end() - 1);
|
||||
}
|
||||
|
||||
if (transpose)
|
||||
if (transpose && dimensionCount > 1)
|
||||
{
|
||||
std::swap(newStrides[dimensionCount - 2], newStrides[dimensionCount - 1]);
|
||||
std::swap(newSizes[dimensionCount - 2], newSizes[dimensionCount - 1]);
|
||||
|
|
@ -1668,65 +1669,31 @@ namespace OperatorHelper
|
|||
{
|
||||
ML_CHECK_VALID_ARGUMENT(shapeInfo.GetInputCount() == 2);
|
||||
|
||||
// Following numpy.matmul for shape inference:
|
||||
// https://docs.scipy.org/doc/numpy/reference/generated/numpy.matmul.html
|
||||
// The behavior depends on the arguments in the following way.
|
||||
// * If both arguments are 2 - D they are multiplied like conventional matrices.
|
||||
// * If either argument is N - D, N > 2, it is treated as a stack of matrices residing in the last two indexes and broadcast accordingly.
|
||||
// * If the first argument is 1 - D, it is promoted to a matrix by prepending a 1 to its dimensions. After matrix multiplication the prepended 1 is removed.
|
||||
// * If the second argument is 1 - D, it is promoted to a matrix by appending a 1 to its dimensions. After matrix multiplication the appended 1 is removed.
|
||||
|
||||
auto inputShape0 = shapeInfo.GetInputTensorShape(0);
|
||||
auto inputShape1 = shapeInfo.GetInputTensorShape(1);
|
||||
ML_CHECK_VALID_ARGUMENT(inputShape0.size() >= 1);
|
||||
ML_CHECK_VALID_ARGUMENT(inputShape1.size() >= 1);
|
||||
|
||||
auto [sizesA, stridesA] = GetFusedMatMulSizesAndStrides(
|
||||
inputShape0,
|
||||
shapeInfo.GetOptionalAttribute(AttrName::TransBatchA, -1),
|
||||
shapeInfo.GetOptionalAttribute(AttrName::TransA, -1)
|
||||
);
|
||||
inputShape0 = sizesA;
|
||||
std::vector<int64_t> aSizes(inputShape0.begin(), inputShape0.end());
|
||||
std::vector<int64_t> bSizes(inputShape1.begin(), inputShape1.end());
|
||||
auto transAAttr = shapeInfo.GetOptionalAttribute<int64_t>(AttrName::TransA, 0);
|
||||
auto transBAttr = shapeInfo.GetOptionalAttribute<int64_t>(AttrName::TransB, 0);
|
||||
|
||||
auto [sizesB, stridesB] = GetFusedMatMulSizesAndStrides(
|
||||
inputShape1,
|
||||
shapeInfo.GetOptionalAttribute(AttrName::TransBatchB, -1),
|
||||
shapeInfo.GetOptionalAttribute(AttrName::TransB, -1)
|
||||
);
|
||||
inputShape1 = sizesB;
|
||||
const bool transA = transAAttr && aSizes.size() != 1;
|
||||
const bool transB = transBAttr && bSizes.size() != 1;
|
||||
auto transBatchA = shapeInfo.GetOptionalAttribute<int64_t>(AttrName::TransBatchA, 0);
|
||||
auto transBatchB = shapeInfo.GetOptionalAttribute<int64_t>(AttrName::TransBatchB, 0);
|
||||
|
||||
std::vector<uint32_t> outputMatrixDims;
|
||||
onnxruntime::MatMulComputeHelper helper;
|
||||
ML_CHECK_VALID_ARGUMENT(helper.Compute(onnxruntime::TensorShape(aSizes), onnxruntime::TensorShape(bSizes), transA, transB, transBatchA, transBatchB, false).IsOK());
|
||||
|
||||
// Modify the input and truncated output shapes per the above comments.
|
||||
// The extra dimensions of the output beyond the two matrix dimensions
|
||||
// will be computed afterward by broadcasting.
|
||||
if (inputShape0.size() == 1)
|
||||
{
|
||||
inputShape0.insert(inputShape0.begin(), 1);
|
||||
}
|
||||
else
|
||||
{
|
||||
outputMatrixDims.push_back(inputShape0[inputShape0.size() - 2]);
|
||||
}
|
||||
auto outputDims = helper.OutputShape().AsShapeVector();
|
||||
|
||||
if (inputShape1.size() == 1)
|
||||
{
|
||||
inputShape1.push_back(1);
|
||||
}
|
||||
else
|
||||
{
|
||||
outputMatrixDims.push_back(inputShape1[inputShape1.size() - 1]);
|
||||
}
|
||||
std::vector<uint32_t> outputShape;
|
||||
outputShape.reserve(outputDims.size());
|
||||
std::transform(outputDims.begin(), outputDims.end(), std::back_inserter(outputShape), [](int64_t dimSize){ return static_cast<uint32_t>(dimSize); });
|
||||
|
||||
// Remove the matrix dimensions from each input, resulting in broadcastable shapes.
|
||||
std::vector<uint32_t> batchDims0(inputShape0.begin(), inputShape0.end() - 2);
|
||||
std::vector<uint32_t> batchDims1(inputShape1.begin(), inputShape1.end() - 2);
|
||||
|
||||
// Broadcast the extra dimensions of each input, then add the truncated matrix dimensions.
|
||||
std::vector<uint32_t> outputDims = BroadcastTensorShape(batchDims0, batchDims1);
|
||||
outputDims.insert(outputDims.end(), outputMatrixDims.begin(), outputMatrixDims.end());
|
||||
|
||||
return {std::move(outputDims)};
|
||||
return {std::move(outputShape)};
|
||||
}
|
||||
|
||||
void TopKHelper::Initialize(
|
||||
|
|
|
|||
|
|
@ -229,28 +229,16 @@ TEST(FusedMatMulOpTest, DoubleTypeNoTranspose) {
|
|||
#endif
|
||||
|
||||
TEST(FusedMatMulOpTest, FloatTypeTransposeA) {
|
||||
// TODO: Unskip when fixed #41968513
|
||||
if (DefaultDmlExecutionProvider().get() != nullptr) {
|
||||
GTEST_SKIP() << "Skipping because of the following error: Assertion failed: vector subscript out of range";
|
||||
}
|
||||
RunFusedMatMulTest<float>("FusedMatMul", 1, true, false);
|
||||
}
|
||||
|
||||
TEST(FusedMatMulOpTest, FloatTypeTransposeB) {
|
||||
// TODO: Unskip when fixed #41968513
|
||||
if (DefaultDmlExecutionProvider().get() != nullptr) {
|
||||
GTEST_SKIP() << "Skipping because of the following error: Assertion failed: vector subscript out of range";
|
||||
}
|
||||
RunFusedMatMulTest<float>("FusedMatMul", 1, false, true);
|
||||
// b is constant. This tests weight packing logic
|
||||
RunFusedMatMulTest<float>("FusedMatMul", 1, false, true, false, false, 1.0f, true);
|
||||
}
|
||||
|
||||
TEST(FusedMatMulOpTest, FloatTypeTransposeAB) {
|
||||
// TODO: Unskip when fixed #41968513
|
||||
if (DefaultDmlExecutionProvider().get() != nullptr) {
|
||||
GTEST_SKIP() << "Skipping because of the following error: Assertion failed: vector subscript out of range";
|
||||
}
|
||||
RunFusedMatMulTest<float>("FusedMatMul", 1, true, true);
|
||||
|
||||
// b is constant. This tests weight packing logic
|
||||
|
|
@ -258,10 +246,6 @@ TEST(FusedMatMulOpTest, FloatTypeTransposeAB) {
|
|||
}
|
||||
|
||||
TEST(FusedMatMulOpTest, FloatTypeScale) {
|
||||
// TODO: Unskip when fixed #41968513
|
||||
if (DefaultDmlExecutionProvider().get() != nullptr) {
|
||||
GTEST_SKIP() << "Skipping because of the following error: Assertion failed: vector subscript out of range";
|
||||
}
|
||||
RunFusedMatMulTest<float>("FusedMatMul", 1, false, false, false, false, 0.5f);
|
||||
RunFusedMatMulTest<float>("FusedMatMul", 1, true, false, false, false, 2.0f);
|
||||
RunFusedMatMulTest<float>("FusedMatMul", 1, true, true, false, false, 4.0f);
|
||||
|
|
@ -273,11 +257,6 @@ TEST(FusedMatMulOpTest, FloatTypeScale) {
|
|||
}
|
||||
|
||||
TEST(FusedMatMulOpTest, FloatTypeTransposeBatch) {
|
||||
// TODO: Unskip when fixed #41968513
|
||||
if (DefaultDmlExecutionProvider().get() != nullptr) {
|
||||
GTEST_SKIP() << "Skipping because of the following error: DmlCommandRecorder.cpp(338): The parameter is incorrect";
|
||||
}
|
||||
|
||||
RunFusedMatMulTest<float>("FusedMatMul", 1, false, false, true, false);
|
||||
RunFusedMatMulTest<float>("FusedMatMul", 1, false, false, false, true);
|
||||
RunFusedMatMulTest<float>("FusedMatMul", 1, false, false, true, true, 0.5f);
|
||||
|
|
@ -292,7 +271,7 @@ TEST(FusedMatMulOpTest, FloatTypeTransposeBatch) {
|
|||
RunFusedMatMulTest<float>("FusedMatMul", 1, true, true, true, true);
|
||||
}
|
||||
|
||||
#if defined(USE_CUDA) || defined(USE_ROCM)
|
||||
#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML)
|
||||
TEST(FusedMatMulOpTest, Float16_NoTranspose) {
|
||||
#ifdef USE_CUDA
|
||||
int min_cuda_architecture = 530;
|
||||
|
|
|
|||
|
|
@ -94,9 +94,9 @@ TEST_P(ModelTest, Run) {
|
|||
std::basic_string<ORTCHAR_T> model_path = param.substr(pos + 1);
|
||||
double per_sample_tolerance = 1e-3;
|
||||
// when cuda is enabled, set it to a larger value for resolving random MNIST test failure
|
||||
// when openvino is enabled, set it to a larger value for resolving MNIST accuracy mismatch
|
||||
// when openvino or dml are enabled, set it to a larger value for resolving MNIST accuracy mismatch
|
||||
double relative_per_sample_tolerance = 1e-3;
|
||||
if (provider_name == "openvino") {
|
||||
if (provider_name == "openvino" || provider_name == "dml") {
|
||||
relative_per_sample_tolerance = 0.009;
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue