Fuse MatMulInteger and the scale multiplication that follows it (#4350)

* Fuse MatMulInteger and the scale multiplication that follows it

* Add bias
This commit is contained in:
Yufeng Li 2020-07-02 13:08:21 -07:00 committed by GitHub
parent 10c25416bb
commit 67a7d93b49
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
28 changed files with 998 additions and 55 deletions

View file

@ -47,6 +47,8 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1,
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float_uint8_t_int8_t, QAttention);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, DynamicQuantizeMatMul);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, DynamicQuantizeMatMul);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, MatMulIntegerToFloat);
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, MatMulIntegerToFloat);
// ******** End: Quantization ******************* //
// This section includes all op kernel declarations for former experimental ops which have now been removed from onnx.
@ -108,6 +110,8 @@ Status RegisterQuantizationKernels(KernelRegistry& kernel_registry) {
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float_uint8_t_int8_t, QAttention)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, DynamicQuantizeMatMul)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, DynamicQuantizeMatMul)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, MatMulIntegerToFloat)>,
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, MatMulIntegerToFloat)>,
};
for (auto& function_table_entry : function_table) {

View file

@ -52,7 +52,6 @@ template <typename T>
Status DynamicQuantizeMatMul<T>::Compute(OpKernelContext* ctx) const {
auto* a = ctx->Input<Tensor>(0);
auto* b = ctx->Input<Tensor>(1);
ORT_ENFORCE(a != nullptr && b != nullptr);
auto* b_scale_tensor = ctx->Input<Tensor>(2);
ORT_ENFORCE(IsScalarOr1ElementVector(b_scale_tensor),
@ -88,6 +87,8 @@ Status DynamicQuantizeMatMul<T>::Compute(OpKernelContext* ctx) const {
const auto* b_data = b->template Data<T>();
const Tensor* bias_tensor = ctx->Input<Tensor>(4);
Tensor* y = ctx->Output(0, helper.OutputShape());
auto* y_data = y->template MutableData<float>();
@ -107,7 +108,7 @@ Status DynamicQuantizeMatMul<T>::Compute(OpKernelContext* ctx) const {
y_data + helper.OutputOffsets()[i],
static_cast<int>(helper.N()),
&multiplier,
nullptr,
nullptr != bias_tensor ? bias_tensor->Data<float>() : nullptr,
thread_pool);
}

View file

@ -0,0 +1,95 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "matmul_integer_to_float.h"
#include "core/mlas/inc/mlas.h"
#include "core/providers/common.h"
#include "core/providers/cpu/math/matmul_helper.h"
#include "core/util/math_cpuonly.h"
#include "core/util/qmath.h"
namespace onnxruntime {
namespace contrib {
// Registers a CPU kernel for the contrib op MatMulIntegerToFloat (kMSDomain, version 1).
// The macro argument T is the element type bound to type constraint "T2" and is also the
// second template argument of the kernel class. "T1" is always registered as uint8_t and
// "T3" as float, so only the uint8_t x T combinations get a kernel here.
#define REGISTER_MATMUL_INTEGER_TO_FLOAT(T) \
ONNX_OPERATOR_TYPED_KERNEL_EX( \
MatMulIntegerToFloat, \
kMSDomain, \
1, \
T, \
kCpuExecutionProvider, \
KernelDefBuilder() \
.TypeConstraint("T1", DataTypeImpl::GetTensorType<uint8_t>()) \
.TypeConstraint("T2", DataTypeImpl::GetTensorType<T>()) \
.TypeConstraint("T3", DataTypeImpl::GetTensorType<float>()), \
MatMulIntegerToFloat<uint8_t, T>);
// Instantiate the registration for T = int8_t and T = uint8_t.
REGISTER_MATMUL_INTEGER_TO_FLOAT(int8_t)
REGISTER_MATMUL_INTEGER_TO_FLOAT(uint8_t)
// Multiplies the quantized matrices A (input 0) and B (input 1) with QGemm and writes
// float results to output 0.
//
// Inputs by index:
//   0: A (T1)                      1: B (T2)
//   2: a_scale (float)             3: b_scale (float)
//   4: a_zero_point (T1, optional) 5: b_zero_point (T2, optional)
//   6: bias (float, optional)
//
// Scales and zero points must each be a scalar or a 1-element vector (per-channel is not
// supported yet). Missing zero points default to 0. The product a_scale * b_scale is handed
// to QGemm as the single output multiplier, and the bias pointer (or nullptr) is forwarded
// unchanged.
//
// Returns the error from MatMulComputeHelper::Compute if the A/B shapes are rejected;
// other violations are reported through ORT_ENFORCE.
template <typename T1, typename T2>
Status MatMulIntegerToFloat<T1, T2>::Compute(OpKernelContext* ctx) const {
  const Tensor* a = ctx->Input<Tensor>(0);
  const Tensor* b = ctx->Input<Tensor>(1);
  // A and B are dereferenced unconditionally below; fail with a message instead of crashing.
  ORT_ENFORCE(a != nullptr && b != nullptr,
              "MatMulIntegerToFloat : inputs A and B are required.");

  const Tensor* a_scale_tensor = ctx->Input<Tensor>(2);
  ORT_ENFORCE(a_scale_tensor != nullptr && IsScalarOr1ElementVector(a_scale_tensor),
              "MatMulIntegerToFloat : input A scale must be a scalar or 1D tensor of size 1. Per-Channel is not supported yet.");
  const float a_scale = *a_scale_tensor->template Data<float>();

  const Tensor* b_scale_tensor = ctx->Input<Tensor>(3);
  ORT_ENFORCE(b_scale_tensor != nullptr && IsScalarOr1ElementVector(b_scale_tensor),
              "MatMulIntegerToFloat : input B scale must be a scalar or 1D tensor of size 1. Per-Channel is not supported yet.");
  const float b_scale = *b_scale_tensor->template Data<float>();

  // Single combined scale applied to the integer GEMM result.
  const float multiplier = a_scale * b_scale;

  // validate zero points
  T1 a_zp = 0;
  const Tensor* a_zp_tensor = ctx->Input<Tensor>(4);
  if (a_zp_tensor != nullptr) {
    ORT_ENFORCE(IsScalarOr1ElementVector(a_zp_tensor),
                "MatMulIntegerToFloat : input A zero point must be a scalar or 1D tensor of size 1. Per-Channel is not supported yet.");
    a_zp = *a_zp_tensor->template Data<T1>();
  }

  T2 b_zp = 0;
  const Tensor* b_zp_tensor = ctx->Input<Tensor>(5);
  if (b_zp_tensor != nullptr) {
    ORT_ENFORCE(IsScalarOr1ElementVector(b_zp_tensor),
                "MatMulIntegerToFloat : input B zero point must be a scalar or 1D tensor of size 1. Per-Channel is not supported yet.");
    b_zp = *b_zp_tensor->template Data<T2>();
  }

  MatMulComputeHelper helper;
  ORT_RETURN_IF_ERROR(helper.Compute(a->Shape(), b->Shape()));
  Tensor* Y = ctx->Output(0, helper.OutputShape());

  // The bias buffer is passed to QGemm as a raw pointer, so make sure it really holds one
  // value per output column before it is read. Only the element count is checked, so a bias
  // with leading dimensions of 1 is still accepted.
  const float* bias_data = nullptr;
  const Tensor* bias_tensor = ctx->Input<Tensor>(6);
  if (bias_tensor != nullptr) {
    ORT_ENFORCE(bias_tensor->Shape().Size() == static_cast<int64_t>(helper.N()),
                "MatMulIntegerToFloat : bias must have as many elements as B's last dimension.");
    bias_data = bias_tensor->template Data<float>();
  }

  // Loop-invariant values: base pointers and GEMM dimensions.
  const T1* a_data = a->template Data<T1>();
  const T2* b_data = b->template Data<T2>();
  float* y_data = Y->template MutableData<float>();
  const int M = static_cast<int>(helper.M());
  const int N = static_cast<int>(helper.N());
  const int K = static_cast<int>(helper.K());

  concurrency::ThreadPool* thread_pool = ctx->GetOperatorThreadPool();

  // One QGemm call per entry in the helper's offset tables.
  const size_t num_gemms = helper.OutputOffsets().size();
  for (size_t i = 0; i < num_gemms; i++) {
    QGemm(M,
          N,
          K,
          a_data + helper.LeftOffsets()[i],
          K,  // lda
          a_zp,
          b_data + helper.RightOffsets()[i],
          N,  // ldb
          b_zp,
          y_data + helper.OutputOffsets()[i],
          N,  // ldc
          &multiplier,
          bias_data,
          thread_pool);
  }

  return Status::OK();
}
} // namespace contrib
} // namespace onnxruntime

View file

@ -0,0 +1,22 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#pragma once
#include "core/common/common.h"
#include "core/framework/op_kernel.h"
namespace onnxruntime {
namespace contrib {
// CPU kernel class for the contrib op MatMulIntegerToFloat.
// T1 and T2 are the element types used to read input A and input B respectively
// (see Compute). The class holds no state of its own beyond the OpKernel base.
template <typename T1, typename T2>
class MatMulIntegerToFloat final : public OpKernel {
 public:
  // explicit: an OpKernelInfo must never be implicitly converted into a kernel object.
  explicit MatMulIntegerToFloat(const OpKernelInfo& info) : OpKernel(info) {
  }

  // Runs the kernel on the tensors supplied by `context`; defined in the .cc file.
  Status Compute(OpKernelContext* context) const override;
};
} // namespace contrib
} // namespace onnxruntime

View file

@ -1696,6 +1696,11 @@ Matrix product that behaves like numpy.matmul: https://docs.scipy.org/doc/numpy-
"of elements should be equal to the number of columns of input 'B'.",
"T2",
OpSchema::Optional)
.Input(4,
"bias",
"1D input tensor, whose dimension is same as B's last dimension",
"T1",
OpSchema::Optional)
.Output(0, "Y", "Matrix multiply results from A * B", "T1")
.TypeConstraint(
"T1",
@ -1710,6 +1715,65 @@ Matrix product that behaves like numpy.matmul: https://docs.scipy.org/doc/numpy-
ONNX_NAMESPACE::matmulShapeInference(ctx, 0, 1);
});
// Schema for the contrib op MatMulIntegerToFloat (domain kMSDomain, since version 1).
// Inputs 0-3 (A, B, a_scale, b_scale) are declared without OpSchema::Optional;
// inputs 4-6 (a_zero_point, b_zero_point, bias) are declared optional.
// T1/T2 accept int8 or uint8 tensors and T3 accepts float tensors only.
// Inference: the output element type is propagated from input 2 (a_scale) and the
// output shape is produced by matmulShapeInference over inputs 0 and 1.
ONNX_CONTRIB_OPERATOR_SCHEMA(MatMulIntegerToFloat)
.SetDomain(kMSDomain)
.SinceVersion(1)
.Input(0, "A", "N-dimensional matrix A", "T1")
.Input(1, "B", "N-dimensional matrix B", "T2")
.Input(
2,
"a_scale",
"Scale of quantized input 'A'. It could be a scalar or a 1-D tensor, "
"which means a per-tensor or per-column quantization. If it's a 1-D tensor, its number "
"of elements should be equal to the number of columns of input 'A'.",
"T3")
.Input(
3,
"b_scale",
"Scale of quantized input 'B'. It could be a scalar or a 1-D tensor, "
"which means a per-tensor or per-column quantization. If it's a 1-D tensor, its number "
"of elements should be equal to the number of columns of input 'B'.",
"T3")
.Input(
4,
"a_zero_point",
"Zero point tensor for input 'A'. It's optional and default value is 0. It could be a scalar or a 1-D tensor, "
"which means a per-tensor or per-column quantization. If it's a 1-D tensor, its number "
"of elements should be equal to the number of columns of input 'A'.",
"T1",
OpSchema::Optional)
.Input(
5,
"b_zero_point",
"Zero point tensor for input 'B'. It's optional and default value is 0. It could be a scalar or a 1-D tensor, "
"which means a per-tensor or per-column quantization. If it's a 1-D tensor, its number "
"of elements should be equal to the number of columns of input 'B'.",
"T2",
OpSchema::Optional)
.Input(
6,
"bias",
"1D input tensor, whose dimension is same as B's last dimension",
"T3",
OpSchema::Optional)
.Output(0, "Y", "Matrix multiply results from A * B", "T3")
.TypeConstraint(
"T1",
{"tensor(int8)", "tensor(uint8)"},
"Constrain input A data type to 8-bit integer tensor.")
.TypeConstraint(
"T2",
{"tensor(int8)", "tensor(uint8)"},
"Constrain input B data type to 8-bit integer tensor.")
.TypeConstraint(
"T3",
{"tensor(float)"},
"Constrain input a_scale, b_scale and output Y data type as float tensor.")
.TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
propagateElemTypeFromInputToOutput(ctx, 2, 0);
ONNX_NAMESPACE::matmulShapeInference(ctx, 0, 1);
});
static const char* TransposeMatMul_doc = R"DOC(
Matrix product that behaves like numpy.matmul: https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html
)DOC";

View file

@ -12,6 +12,26 @@ using namespace ONNX_NAMESPACE;
using namespace ::onnxruntime::common;
namespace onnxruntime {
// Decides whether a bias shape is acceptable for fusion.
// Returns true only when all of the following hold:
//   - both shapes are known, `matmul_shape` has at least 2 dimensions and `bias_shape` at least 1;
//   - every bias dimension except the last has the concrete value 1
//     (so the bias is 1-D, or N-D with N-1 leading ones);
//   - the last bias dimension is a positive value equal to the last dimension of `matmul_shape`.
static bool CheckBiasShape(const TensorShapeProto* bias_shape, const TensorShapeProto* matmul_shape) {
  if (matmul_shape == nullptr || bias_shape == nullptr) {
    return false;
  }

  const int matmul_rank = matmul_shape->dim_size();
  const int bias_rank = bias_shape->dim_size();
  if (matmul_rank <= 1 || bias_rank < 1) {
    return false;
  }

  // All leading bias dimensions have to be exactly 1.
  const int last_bias_axis = bias_rank - 1;
  for (int axis = 0; axis < last_bias_axis; ++axis) {
    if (bias_shape->dim(axis).dim_value() != 1) {
      return false;
    }
  }

  const int64_t bias_width = bias_shape->dim(last_bias_axis).dim_value();
  if (bias_width <= 0) {
    return false;
  }
  return bias_width == matmul_shape->dim(matmul_rank - 1).dim_value();
}
/**
DynamicQuantizeMatMulFusion will fuse subgraph like below into DynamicQuantizeMatMul:
(input)
@ -20,14 +40,38 @@ DynamicQuantizeMatMulFusion will fuse subgraph like below into DynamicQuantizeMa
DynamicQuantizeLinear --------+
| |
v v
MatMulInteger (B const) Mul (B const)
| |
v v
Cast ------------------>Mul
MatMulInteger (B const) Mul (B const) (input)
| | |
v v v
Cast ------------------>Mul ----> DynamicQuantizeMatMul
| |
v v
Add (B const, Optional) (output)
|
v
(output)
*/
(output)
It also fuses subgraph like below into MatMulIntegerToFloat:
input input
| |
v v
+----------------------------DynamicQuantizeLinear------------------------+ DynamicQuantizeLinear
| | | |
| +----------------+--------------+ | +---------+----------+
| | | | | |
V v v v V v
MatMulInteger(B const) Mul(B const) MatMulInteger (B const) Mul (B const) ---> MatMulIntegerToFloat MatMulIntegerToFloat
| | | | | |
v v v v v v
Cast ---------------->Mul Cast ---------------->Mul (output1) ----------(output2)
| |
v v
Add (B const, Optional) Add (B const, Optional)
| |
v v
(output1) (output2)
*/
Status DynamicQuantizeMatMulFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const {
GraphViewer graph_viewer(graph);
const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder();
@ -79,8 +123,7 @@ Status DynamicQuantizeMatMulFusion::ApplyImpl(Graph& graph, bool& modified, int
// Check Nodes' Edges count and Nodes' outputs are not in Graph output
if (!optimizer_utils::CheckOutputEdges(graph, cast_node, 1) ||
!optimizer_utils::CheckOutputEdges(graph, matmulinteger_node, 1) ||
!optimizer_utils::CheckOutputEdges(graph, mul_node_right, 1) ||
!optimizer_utils::CheckOutputEdges(graph, dql_node_left, 3)) {
!optimizer_utils::CheckOutputEdges(graph, mul_node_right, 1)) {
continue;
}
@ -94,34 +137,87 @@ Status DynamicQuantizeMatMulFusion::ApplyImpl(Graph& graph, bool& modified, int
continue;
}
std::vector<NodeArg*> input_defs{dql_node_left.MutableInputDefs()[0],
matmulinteger_node.MutableInputDefs()[1],
mul_node_right.MutableInputDefs()[1]};
if (matmulinteger_node.InputDefs().size() == 4) {
const NodeArg& matmulinteger_B_zp = *(matmulinteger_node.InputDefs()[3]);
if (!graph_utils::IsConstantInitializer(graph, matmulinteger_B_zp.Name(), true)) {
continue;
// Find bias node
Node* add_node = nullptr;
// const Node* add_node = FindBiasNode(graph, mul_node, ;
if (optimizer_utils::CheckOutputEdges(graph, mul_node, 1)) {
const Node* tmp_add_node = graph_utils::FirstChildByType(mul_node, "Add");
if (nullptr != tmp_add_node) {
const NodeArg& tmp_add_node_B = *(tmp_add_node->InputDefs()[1]);
if (graph_utils::IsConstantInitializer(graph, tmp_add_node_B.Name(), true) &&
CheckBiasShape(tmp_add_node_B.Shape(), matmulinteger_B.Shape())) {
add_node = graph.GetNode(tmp_add_node->Index());
}
}
input_defs.push_back(matmulinteger_node.MutableInputDefs()[3]);
}
Node& fused_node = graph.AddNode(graph.GenerateNodeName("DynamicQuantizeMatMul"),
"DynamicQuantizeMatMul",
"fused DynamicQuantizeMatMul",
input_defs,
mul_node.MutableOutputDefs(),
nullptr,
kMSDomain);
// DynamicQuantizeLinear outputs are only used by one MatMulInteger,
// thus it can be fused into DynamicQuantizeMatMul
NodeArg optional_node_arg("", nullptr);
std::vector<NodeArg*> input_defs;
std::string op_type_to_fuse = "DynamicQuantizeMatMul";
if (optimizer_utils::CheckOutputEdges(graph, dql_node_left, 3)) {
input_defs.push_back(dql_node_left.MutableInputDefs()[0]);
input_defs.push_back(matmulinteger_node.MutableInputDefs()[1]);
input_defs.push_back(mul_node_right.MutableInputDefs()[1]);
input_defs.push_back(&optional_node_arg);
if (matmulinteger_node.InputDefs().size() == 4) {
const NodeArg& matmulinteger_B_zp = *(matmulinteger_node.InputDefs()[3]);
if (!graph_utils::IsConstantInitializer(graph, matmulinteger_B_zp.Name(), true)) {
continue;
}
input_defs[3] = matmulinteger_node.MutableInputDefs()[3];
}
nodes_to_remove.push_back(dql_node_left);
} else {
op_type_to_fuse = "MatMulIntegerToFloat";
input_defs.push_back(matmulinteger_node.MutableInputDefs()[0]);
input_defs.push_back(matmulinteger_node.MutableInputDefs()[1]);
input_defs.push_back(mul_node_right.MutableInputDefs()[0]);
input_defs.push_back(mul_node_right.MutableInputDefs()[1]);
input_defs.push_back(&optional_node_arg);
input_defs.push_back(&optional_node_arg);
if (matmulinteger_node.InputDefs().size() >= 3) {
// Add zero point of A
input_defs[4] = matmulinteger_node.MutableInputDefs()[2];
// Add zero point of B
if (matmulinteger_node.InputDefs().size() == 4) {
const NodeArg& matmulinteger_B_zp = *(matmulinteger_node.InputDefs()[3]);
if (!graph_utils::IsConstantInitializer(graph, matmulinteger_B_zp.Name(), true)) {
continue;
}
input_defs[5] = matmulinteger_node.MutableInputDefs()[3];
}
}
}
if (add_node != nullptr) {
input_defs.push_back(add_node->MutableInputDefs()[1]);
}
Node* fused_node = &graph.AddNode(graph.GenerateNodeName(op_type_to_fuse),
op_type_to_fuse,
"",
input_defs,
add_node != nullptr ? add_node->MutableOutputDefs() : mul_node.MutableOutputDefs(),
nullptr,
kMSDomain);
// Assign provider to this new node. Provider should be same as the provider for old node.
fused_node.SetExecutionProviderType(mul_node.GetExecutionProviderType());
ORT_ENFORCE(nullptr != fused_node);
fused_node->SetExecutionProviderType(mul_node.GetExecutionProviderType());
nodes_to_remove.push_back(dql_node_left);
nodes_to_remove.push_back(matmulinteger_node);
nodes_to_remove.push_back(cast_node);
nodes_to_remove.push_back(mul_node_right);
nodes_to_remove.push_back(mul_node);
if (add_node != nullptr) {
nodes_to_remove.push_back(*add_node);
}
}
for (const auto& node : nodes_to_remove) {

View file

@ -23,7 +23,9 @@ namespace test {
template <typename T>
void TestDynamicQuantizeMatMul(const std::vector<int64_t>& A_dims,
std::vector<int64_t> B_dims,
const std::string& reference_model) {
const std::string& reference_model,
bool has_zp = true,
bool has_bias = false) {
// create rand inputs
RandomValueGenerator random{};
@ -38,11 +40,24 @@ void TestDynamicQuantizeMatMul(const std::vector<int64_t>& A_dims,
std::vector<float> B_scale = random.Uniform<float>({1}, -0.1f, 0.1f);
std::vector<T> B_zero_point = {static_cast<T>(random.Uniform<int32_t>({1}, std::numeric_limits<T>::min(), std::numeric_limits<T>::max())[0])};
std::vector<float> Bias = random.Uniform<float>({B_dims.back()}, -0.1f, 0.1f);
OpTester test("DynamicQuantizeMatMul", 1, onnxruntime::kMSDomain);
test.AddInput<float>("A", A_dims, A_data);
test.AddInput<T>("B", B_dims, B_data);
test.AddInput<float>("b_scale", {1}, B_scale);
test.AddInput<T>("b_zero_point", {1}, B_zero_point);
if (has_zp) {
test.AddInput<T>("b_zero_point", {1}, B_zero_point);
} else {
test.AddMissingOptionalInput<T>();
}
if (has_bias) {
test.AddInput<float>("bias", {B_dims.back()}, Bias);
} else {
test.AddMissingOptionalInput<float>();
}
test.AddReferenceOutputs(reference_model);
test.Run();
@ -56,6 +71,16 @@ TEST(DynamicQuantizeMatMul, Int8_test) {
TestDynamicQuantizeMatMul<int8_t>(A_dims, B_dims, "testdata/dynamic_quantize_matmul_int8.onnx");
}
// DynamicQuantizeMatMul with int8 B, no b_zero_point and a bias, compared against the
// reference model. The body is compiled only when MLAS_SUPPORTS_GEMM_U8X8 is defined.
TEST(DynamicQuantizeMatMul, Int8_test_bias) {
#ifdef MLAS_SUPPORTS_GEMM_U8X8
  const std::vector<int64_t> A_dims{4, 128};
  const std::vector<int64_t> B_dims{128, 128};

  // has_zp = false, has_bias = true
  TestDynamicQuantizeMatMul<int8_t>(A_dims, B_dims, "testdata/dynamic_quantize_matmul_int8_bias.onnx", false, true);
#endif
}
TEST(DynamicQuantizeMatMul, UInt8_test) {
std::vector<int64_t> A_dims{4, 128};
std::vector<int64_t> B_dims{128, 128};
@ -64,5 +89,13 @@ TEST(DynamicQuantizeMatMul, UInt8_test) {
TestDynamicQuantizeMatMul<uint8_t>(A_dims, B_dims, "testdata/dynamic_quantize_matmul_uint8.onnx");
}
// DynamicQuantizeMatMul with uint8 B, no b_zero_point and a bias, compared against the
// reference model.
TEST(DynamicQuantizeMatMul, UInt8_test_bias) {
  const std::vector<int64_t> A_dims{4, 128};
  const std::vector<int64_t> B_dims{128, 128};

  // has_zp = false, has_bias = true
  TestDynamicQuantizeMatMul<uint8_t>(A_dims, B_dims, "testdata/dynamic_quantize_matmul_uint8_bias.onnx", false, true);
}
} // namespace test
} // namespace onnxruntime

View file

@ -0,0 +1,115 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
#include "core/framework/tensor.h"
#include "core/session/inference_session.h"
#include "test/common/tensor_op_test_utils.h"
#include "test/framework/test_utils.h"
#include "test/providers/provider_test_utils.h"
#include "test/util/include/default_providers.h"
#include "core/util/qmath.h"
#include <chrono>
#include <random>
#include "gtest/gtest.h"
#include "gmock/gmock.h"
using namespace std;
namespace onnxruntime {
namespace test {
template <typename T>
void TestMatMulIntegerToFloat(const std::vector<int64_t>& A_dims,
std::vector<int64_t> B_dims,
const std::string& reference_model,
bool has_zp = true,
bool has_bias = false) {
// create rand inputs
RandomValueGenerator random{};
std::vector<uint8_t> A_data;
std::vector<int> tmp_A_data = random.Uniform<int32_t>(A_dims, 0, 255);
std::transform(tmp_A_data.begin(), tmp_A_data.end(), std::back_inserter(A_data), [](int32_t v) -> T {
return static_cast<uint8_t>(v);
});
std::vector<T> B_data;
std::vector<int> tmp_B_data = random.Uniform<int32_t>(B_dims, std::numeric_limits<T>::min(), std::numeric_limits<T>::max());
std::transform(tmp_B_data.begin(), tmp_B_data.end(), std::back_inserter(B_data), [](int32_t v) -> T {
return static_cast<T>(v);
});
std::vector<float> A_scale = random.Uniform<float>({1}, -0.1f, 0.1f);
std::vector<float> B_scale = random.Uniform<float>({1}, -0.1f, 0.1f);
std::vector<uint8_t> A_zero_point{127};
std::vector<T> B_zero_point = {static_cast<T>(random.Uniform<int32_t>({1}, std::numeric_limits<T>::min(), std::numeric_limits<T>::max())[0])};
std::vector<float> Bias = random.Uniform<float>({B_dims.back()}, -0.1f, 0.1f);
OpTester test("MatMulIntegerToFloat", 1, onnxruntime::kMSDomain);
test.AddInput<uint8_t>("A", A_dims, A_data);
test.AddInput<T>("B", B_dims, B_data);
test.AddInput<float>("a_scale", {1}, A_scale);
test.AddInput<float>("b_scale", {1}, B_scale);
if (has_zp) {
test.AddInput<uint8_t>("a_zero_point", {1}, A_zero_point);
test.AddInput<T>("b_zero_point", {1}, B_zero_point);
} else {
test.AddMissingOptionalInput<T>();
test.AddMissingOptionalInput<T>();
}
if (has_bias) {
test.AddInput<float>("bias", {B_dims.back()}, Bias);
} else {
test.AddMissingOptionalInput<float>();
}
test.AddReferenceOutputs(reference_model);
test.Run();
}
// MatMulIntegerToFloat with int8 B and both zero points (helper defaults), compared
// against the reference model. The body is compiled only when MLAS_SUPPORTS_GEMM_U8X8
// is defined.
TEST(MatMulIntegerToFloat, Int8_test) {
#ifdef MLAS_SUPPORTS_GEMM_U8X8
  const std::vector<int64_t> A_dims{4, 128};
  const std::vector<int64_t> B_dims{128, 128};

  TestMatMulIntegerToFloat<int8_t>(A_dims, B_dims, "testdata/matmul_integer_to_float_int8.onnx");
#endif
}
// MatMulIntegerToFloat with int8 B, no zero points and a bias, compared against the
// reference model. The body is compiled only when MLAS_SUPPORTS_GEMM_U8X8 is defined.
TEST(MatMulIntegerToFloat, Int8_bias_test) {
#ifdef MLAS_SUPPORTS_GEMM_U8X8
  const std::vector<int64_t> A_dims{4, 128};
  const std::vector<int64_t> B_dims{128, 128};

  // has_zp = false, has_bias = true
  TestMatMulIntegerToFloat<int8_t>(A_dims, B_dims, "testdata/matmul_integer_to_float_int8_bias.onnx", false, true);
#endif
}
// MatMulIntegerToFloat with uint8 B and both zero points (helper defaults), compared
// against the reference model.
TEST(MatMulIntegerToFloat, UInt8_test) {
  const std::vector<int64_t> A_dims{4, 128};
  const std::vector<int64_t> B_dims{128, 128};

  TestMatMulIntegerToFloat<uint8_t>(A_dims, B_dims, "testdata/matmul_integer_to_float_uint8.onnx");
}
// MatMulIntegerToFloat with uint8 B, no zero points and a bias, compared against the
// reference model.
TEST(MatMulIntegerToFloat, UInt8_bias_test) {
  const std::vector<int64_t> A_dims{4, 128};
  const std::vector<int64_t> B_dims{128, 128};

  // has_zp = false, has_bias = true
  TestMatMulIntegerToFloat<uint8_t>(A_dims, B_dims, "testdata/matmul_integer_to_float_uint8_bias.onnx", false, true);
}
} // namespace test
} // namespace onnxruntime

View file

@ -2472,6 +2472,84 @@ TEST_F(GraphTransformationTests, DynamicQuantizeMatMulTest) {
EXPECT_EQ(op_to_count["DynamicQuantizeMatMul"], 1);
}
// Applies DynamicQuantizeMatMulFusion (Level2) to fusion/dynamic_quantize_matmul_bias.onnx
// and expects a single DynamicQuantizeMatMul node with no DynamicQuantizeLinear,
// MatMulInteger, Cast or Mul nodes left.
TEST_F(GraphTransformationTests, DynamicQuantizeMatMulTest_With_Bias) {
  auto fusion_model_path = MODEL_FOLDER "fusion/dynamic_quantize_matmul_bias.onnx";
  std::shared_ptr<Model> model;
  ASSERT_STATUS_OK(Model::Load(fusion_model_path, model, nullptr, *logger_));
  Graph& main_graph = model->MainGraph();

  onnxruntime::GraphTransformerManager transformer_mgr{5};
  transformer_mgr.Register(onnxruntime::make_unique<DynamicQuantizeMatMulFusion>(), TransformerLevel::Level2);
  auto status = transformer_mgr.ApplyTransformers(main_graph, TransformerLevel::Level2, *logger_);
  ASSERT_TRUE(status.IsOK());

  // Everything in the matched subgraph must have collapsed into the fused node.
  std::map<std::string, int> counts = CountOpsInGraph(main_graph);
  EXPECT_EQ(counts["DynamicQuantizeLinear"], 0);
  EXPECT_EQ(counts["MatMulInteger"], 0);
  EXPECT_EQ(counts["Cast"], 0);
  EXPECT_EQ(counts["Mul"], 0);
  EXPECT_EQ(counts["DynamicQuantizeMatMul"], 1);
}
// Applies DynamicQuantizeMatMulFusion (Level2) to fusion/dynamic_quantize_matmul_bias_ND.onnx.
// Expects one DynamicQuantizeMatMul node and exactly one Add node still present, with no
// DynamicQuantizeLinear, MatMulInteger, Cast or Mul nodes left.
TEST_F(GraphTransformationTests, DynamicQuantizeMatMulTest_With_ND_bias) {
  auto fusion_model_path = MODEL_FOLDER "fusion/dynamic_quantize_matmul_bias_ND.onnx";
  std::shared_ptr<Model> model;
  ASSERT_STATUS_OK(Model::Load(fusion_model_path, model, nullptr, *logger_));
  Graph& main_graph = model->MainGraph();

  onnxruntime::GraphTransformerManager transformer_mgr{5};
  transformer_mgr.Register(onnxruntime::make_unique<DynamicQuantizeMatMulFusion>(), TransformerLevel::Level2);
  auto status = transformer_mgr.ApplyTransformers(main_graph, TransformerLevel::Level2, *logger_);
  ASSERT_TRUE(status.IsOK());

  std::map<std::string, int> counts = CountOpsInGraph(main_graph);
  EXPECT_EQ(counts["DynamicQuantizeLinear"], 0);
  EXPECT_EQ(counts["MatMulInteger"], 0);
  EXPECT_EQ(counts["Cast"], 0);
  EXPECT_EQ(counts["Mul"], 0);
  EXPECT_EQ(counts["DynamicQuantizeMatMul"], 1);
  // The Add is expected to survive the transformation in this model.
  EXPECT_EQ(counts["Add"], 1);
}
// Applies DynamicQuantizeMatMulFusion (Level2) to fusion/dynamic_quantize_matmul_bias_b_no_zp.onnx
// and expects a single DynamicQuantizeMatMul node with no DynamicQuantizeLinear,
// MatMulInteger, Cast or Mul nodes left.
TEST_F(GraphTransformationTests, DynamicQuantizeMatMulTest_With_Bias_No_B_ZP) {
  auto fusion_model_path = MODEL_FOLDER "fusion/dynamic_quantize_matmul_bias_b_no_zp.onnx";
  std::shared_ptr<Model> model;
  ASSERT_STATUS_OK(Model::Load(fusion_model_path, model, nullptr, *logger_));
  Graph& main_graph = model->MainGraph();

  onnxruntime::GraphTransformerManager transformer_mgr{5};
  transformer_mgr.Register(onnxruntime::make_unique<DynamicQuantizeMatMulFusion>(), TransformerLevel::Level2);
  auto status = transformer_mgr.ApplyTransformers(main_graph, TransformerLevel::Level2, *logger_);
  ASSERT_TRUE(status.IsOK());

  std::map<std::string, int> counts = CountOpsInGraph(main_graph);
  EXPECT_EQ(counts["DynamicQuantizeLinear"], 0);
  EXPECT_EQ(counts["MatMulInteger"], 0);
  EXPECT_EQ(counts["Cast"], 0);
  EXPECT_EQ(counts["Mul"], 0);
  EXPECT_EQ(counts["DynamicQuantizeMatMul"], 1);
}
// Applies DynamicQuantizeMatMulFusion (Level2) to fusion/matmul_integer_to_float.onnx.
// Expects three MatMulIntegerToFloat nodes, one remaining DynamicQuantizeLinear and one
// remaining Add, with no MatMulInteger, Cast or Mul nodes left.
TEST_F(GraphTransformationTests, MatMulIntegerToFloatTest) {
  auto fusion_model_path = MODEL_FOLDER "fusion/matmul_integer_to_float.onnx";
  std::shared_ptr<Model> model;
  ASSERT_STATUS_OK(Model::Load(fusion_model_path, model, nullptr, *logger_));
  Graph& main_graph = model->MainGraph();

  onnxruntime::GraphTransformerManager transformer_mgr{5};
  transformer_mgr.Register(onnxruntime::make_unique<DynamicQuantizeMatMulFusion>(), TransformerLevel::Level2);
  auto status = transformer_mgr.ApplyTransformers(main_graph, TransformerLevel::Level2, *logger_);
  ASSERT_TRUE(status.IsOK());

  std::map<std::string, int> counts = CountOpsInGraph(main_graph);
  EXPECT_EQ(counts["DynamicQuantizeLinear"], 1);
  EXPECT_EQ(counts["MatMulInteger"], 0);
  EXPECT_EQ(counts["Cast"], 0);
  EXPECT_EQ(counts["Mul"], 0);
  EXPECT_EQ(counts["MatMulIntegerToFloat"], 3);
  EXPECT_EQ(counts["Add"], 1);
}
#endif
} // namespace test

View file

@ -1,4 +1,4 @@


M
A a_quantizeda_scalea_zpDynamicQuantizeLinear"DynamicQuantizeLinear
W
@ -17,7 +17,7 @@ A
matmul_output_float
multiplierY
mul_bottom"MulDynamicQuantizeLinear_fusionZ
mul_bottom"MulDynamicQuantizeMatMul_fusionZ
A

@ -40,4 +40,4 @@ mul_bottom"MulDynamicQuantizeLinear_fusionZ

M
NB
NB

View file

@ -0,0 +1,44 @@
:
M
A a_quantizeda_scalea_zpDynamicQuantizeLinear"DynamicQuantizeLinear
I
a_quantized
B
a_zpmatmul_output_int32 MatMulInteger" MatMulInteger
.
a_scale
b_scale
multiplier mul_right"Mul
A
matmul_output_int32matmul_output_floatcast"Cast*
to 
E
matmul_output_float
multipliermul_bottom_output
mul_bottom"Mul
&
mul_bottom_output
biasYadd"AddDynamicQuantizeMatMul_fusionZ
A

M
KZ
B

K
NZ
b_scale

Z
bias

Nb
Y

M
NB

View file

@ -3,24 +3,41 @@ from onnx import helper
from onnx import TensorProto
from enum import Enum
def GenerateModel(model_name, sign):
def GenerateModel(model_name, sign, b_zp = True, bias = False):
nodes = [ # DynamicQuantizeMatMul subgraph
helper.make_node("DynamicQuantizeLinear", ["A"], ["a_quantized", "a_scale", "a_zp"], "DynamicQuantizeLinear"),
helper.make_node("MatMulInteger", ["a_quantized", "B", "a_zp", "b_zero_point"], ["matmul_output_int32"], "MatMulInteger"),
helper.make_node(
"MatMulInteger",
["a_quantized", "B", "a_zp", "b_zero_point"] if b_zp else ["a_quantized", "B", "a_zp"],
["matmul_output_int32"],
"MatMulInteger"),
helper.make_node("Mul", ["a_scale", "b_scale"], ["multiplier"], "mul_right"),
helper.make_node("Cast", ["matmul_output_int32"], ["matmul_output_float"], "cast", to=1),
helper.make_node("Mul", ["matmul_output_float", "multiplier"], ["Y"], "mul_bottom"),
helper.make_node("Mul", ["matmul_output_float", "multiplier"], ["mul_bottom_output" if bias else "Y"], "mul_bottom"),
]
inputs = [
helper.make_tensor_value_info('A', TensorProto.FLOAT, ['M', 'K']),
helper.make_tensor_value_info('B', TensorProto.INT8 if sign else TensorProto.UINT8, ['K', 'N']),
helper.make_tensor_value_info('b_scale', TensorProto.FLOAT, [1]),
]
if b_zp:
inputs.extend([helper.make_tensor_value_info('b_zero_point', TensorProto.INT8 if sign else TensorProto.UINT8, [1])])
if bias:
nodes.extend([helper.make_node("Add", ["mul_bottom_output", "bias"], ["Y"], "add")])
inputs.extend([helper.make_tensor_value_info('bias', TensorProto.FLOAT, ['N'])])
graph = helper.make_graph(
nodes,
"DynamicQuantizeMatMul_fusion", #name
[ # inputs
helper.make_tensor_value_info('A', TensorProto.FLOAT, ['M', 'K']),
helper.make_tensor_value_info('B', TensorProto.INT8 if sign else TensorProto.UINT8, ['K', 'N']),
helper.make_tensor_value_info('b_scale', TensorProto.FLOAT, [1]),
helper.make_tensor_value_info('b_zero_point', TensorProto.INT8 if sign else TensorProto.UINT8, [1]),
],
inputs,
[ # outputs
helper.make_tensor_value_info('Y', TensorProto.FLOAT, ['M', 'N']),
])
@ -30,4 +47,6 @@ def GenerateModel(model_name, sign):
if __name__ == "__main__":
GenerateModel('dynamic_quantize_matmul_int8.onnx', True)
GenerateModel('dynamic_quantize_matmul_int8.onnx', False)
GenerateModel('dynamic_quantize_matmul_uint8.onnx', False)
GenerateModel('dynamic_quantize_matmul_int8_bias.onnx', True, False, True)
GenerateModel('dynamic_quantize_matmul_uint8_bias.onnx', False, False, True)

View file

@ -1,4 +1,4 @@


M
A a_quantizeda_scalea_zpDynamicQuantizeLinear"DynamicQuantizeLinear
W
@ -17,7 +17,7 @@ A
matmul_output_float
multiplierY
mul_bottom"MulDynamicQuantizeLinear_fusionZ
mul_bottom"MulDynamicQuantizeMatMul_fusionZ
A

@ -40,4 +40,4 @@ mul_bottom"MulDynamicQuantizeLinear_fusionZ

M
NB
NB

View file

@ -0,0 +1,44 @@
:
M
A a_quantizeda_scalea_zpDynamicQuantizeLinear"DynamicQuantizeLinear
I
a_quantized
B
a_zpmatmul_output_int32 MatMulInteger" MatMulInteger
.
a_scale
b_scale
multiplier mul_right"Mul
A
matmul_output_int32matmul_output_floatcast"Cast*
to 
E
matmul_output_float
multipliermul_bottom_output
mul_bottom"Mul
&
mul_bottom_output
biasYadd"AddDynamicQuantizeMatMul_fusionZ
A

M
KZ
B

K
NZ
b_scale

Z
bias

Nb
Y

M
NB

View file

@ -0,0 +1,55 @@
import onnx
from onnx import helper
from onnx import TensorProto
from enum import Enum
def GenerateModel(model_name, sign, has_zp = True, bias = False):
    """Build a MatMulInteger -> Cast -> Mul (-> Add) test graph and save it to `model_name`.

    sign:   B (and b_zero_point) are INT8 when true, UINT8 otherwise; A is always UINT8.
    has_zp: when true, MatMulInteger also takes a_zero_point and b_zero_point, and both
            become graph inputs.
    bias:   when true, an Add with a float 'bias' graph input of shape ['N'] follows the
            final Mul and produces 'Y'; otherwise the final Mul produces 'Y' directly.
    """
    b_elem_type = TensorProto.INT8 if sign else TensorProto.UINT8

    # MatMulInteger only receives the zero points when they are requested.
    matmul_inputs = ["A", "B"]
    if has_zp:
        matmul_inputs = ["A", "B", "a_zero_point", "b_zero_point"]

    # With a bias the scaled product is an intermediate value feeding the Add.
    scaled_output = "Y"
    if bias:
        scaled_output = "mul_bottom_output"

    nodes = [
        helper.make_node("MatMulInteger", matmul_inputs, ["matmul_output_int32"], "MatMulInteger"),
        helper.make_node("Mul", ["a_scale", "b_scale"], ["multiplier"], "mul_right"),
        helper.make_node("Cast", ["matmul_output_int32"], ["matmul_output_float"], "cast", to=1),
        helper.make_node("Mul", ["matmul_output_float", "multiplier"], [scaled_output], "mul_bottom"),
    ]

    inputs = [
        helper.make_tensor_value_info('A', TensorProto.UINT8, ['M', 'K']),
        helper.make_tensor_value_info('B', b_elem_type, ['K', 'N']),
        helper.make_tensor_value_info('a_scale', TensorProto.FLOAT, [1]),
        helper.make_tensor_value_info('b_scale', TensorProto.FLOAT, [1]),
    ]

    if has_zp:
        inputs.append(helper.make_tensor_value_info('a_zero_point', TensorProto.UINT8, [1]))
        inputs.append(helper.make_tensor_value_info('b_zero_point', b_elem_type, [1]))

    if bias:
        nodes.append(helper.make_node("Add", ["mul_bottom_output", "bias"], ["Y"], "add"))
        inputs.append(helper.make_tensor_value_info('bias', TensorProto.FLOAT, ['N']))

    outputs = [helper.make_tensor_value_info('Y', TensorProto.FLOAT, ['M', 'N'])]
    graph = helper.make_graph(nodes, "DynamicQuantizeMatMul_fusion", inputs, outputs)

    onnx.save(helper.make_model(graph), model_name)
if __name__ == "__main__":
    # Positional arguments for GenerateModel: (model_name, sign[, has_zp, bias]).
    # The first two models rely on the defaults (zero points present, no bias).
    model_specs = (
        ('matmul_integer_to_float_int8.onnx', True),
        ('matmul_integer_to_float_uint8.onnx', False),
        ('matmul_integer_to_float_int8_bias.onnx', True, False, True),
        ('matmul_integer_to_float_uint8_bias.onnx', False, False, True),
    )
    for spec in model_specs:
        GenerateModel(*spec)

View file

@ -0,0 +1,49 @@

U
A
B
a_zero_point
b_zero_pointmatmul_output_int32 MatMulInteger" MatMulInteger
.
a_scale
b_scale
multiplier mul_right"Mul
A
matmul_output_int32matmul_output_floatcast"Cast*
to 
5
matmul_output_float
multiplierY
mul_bottom"MulDynamicQuantizeMatMul_fusionZ
A

M
KZ
B

K
NZ
a_scale

Z
b_scale

Z
a_zero_point

Z
b_zero_point

b
Y

M
NB

View file

@ -0,0 +1,45 @@

9
A
Bmatmul_output_int32 MatMulInteger" MatMulInteger
.
a_scale
b_scale
multiplier mul_right"Mul
A
matmul_output_int32matmul_output_floatcast"Cast*
to 
E
matmul_output_float
multipliermul_bottom_output
mul_bottom"Mul
&
mul_bottom_output
biasYadd"AddDynamicQuantizeMatMul_fusionZ
A

M
KZ
B

K
NZ
a_scale

Z
b_scale

Z
bias

Nb
Y

M
NB

View file

@ -0,0 +1,49 @@

U
A
B
a_zero_point
b_zero_pointmatmul_output_int32 MatMulInteger" MatMulInteger
.
a_scale
b_scale
multiplier mul_right"Mul
A
matmul_output_int32matmul_output_floatcast"Cast*
to 
5
matmul_output_float
multiplierY
mul_bottom"MulDynamicQuantizeMatMul_fusionZ
A

M
KZ
B

K
NZ
a_scale

Z
b_scale

Z
a_zero_point

Z
b_zero_point

b
Y

M
NB

View file

@ -0,0 +1,45 @@

9
A
Bmatmul_output_int32 MatMulInteger" MatMulInteger
.
a_scale
b_scale
multiplier mul_right"Mul
A
matmul_output_int32matmul_output_floatcast"Cast*
to 
E
matmul_output_float
multipliermul_bottom_output
mul_bottom"Mul
&
mul_bottom_output
biasYadd"AddDynamicQuantizeMatMul_fusionZ
A

M
KZ
B

K
NZ
a_scale

Z
b_scale

Z
bias

Nb
Y

M
NB

View file

@ -1,4 +1,4 @@


Q
input a_quantizeda_scalea_zpDynamicQuantizeLinear"DynamicQuantizeLinear
Y
@ -17,7 +17,7 @@ A
matmul_output_float
multiplieroutput
mul_bottom"MulDynamicQuantizeLinear_fusion**B b_quantized* *Bb_zp*"ffæ?Bb_scaleZ
mul_bottom"MulDynamicQuantizeLinear_fusion**B b_quantized*"ffæ?Bb_scale* *Bb_zpZ
input


@ -25,4 +25,4 @@ mul_bottom"MulDynamicQuantizeLinear_fusion**B b_quantized* *
output


B
B

View file

@ -3,21 +3,43 @@ from onnx import helper
from onnx import TensorProto
from enum import Enum
def GenerateModel(model_name):
nodes = [ # LayerNorm subgraph
def GenerateModel(model_name, b_has_zp = True, has_bias = False, bias_ND = False):
mul_output = "Mul_output" if has_bias else "output"
nodes = [ # construct graph
helper.make_node("DynamicQuantizeLinear", ["input"], ["a_quantized", "a_scale", "a_zp"], "DynamicQuantizeLinear"),
helper.make_node("MatMulInteger", ["a_quantized", "b_quantized", "a_zp", "b_zp"], ["matmul_output_int32"], "MatMulInteger"),
helper.make_node(
"MatMulInteger",
["a_quantized", "b_quantized", "a_zp", "b_zp"] if b_has_zp else ["a_quantized", "b_quantized", "a_zp"],
["matmul_output_int32"],
"MatMulInteger"),
helper.make_node("Mul", ["a_scale", "b_scale"], ["multiplier"], "mul_right"),
helper.make_node("Cast", ["matmul_output_int32"], ["matmul_output_float"], "cast", to=1),
helper.make_node("Mul", ["matmul_output_float", "multiplier"], ["output"], "mul_bottom"),
helper.make_node("Mul", ["matmul_output_float", "multiplier"], [mul_output], "mul_bottom"),
]
if has_bias:
nodes.extend([helper.make_node("Add", [mul_output, "bias"], ["output"], "bias_add")])
initializers = [ # initializers
helper.make_tensor('b_quantized', TensorProto.UINT8, [2,3], [2, 4, 5, 6, 7, 8]),
helper.make_tensor('b_zp', TensorProto.UINT8, [], [128]),
helper.make_tensor('b_scale', TensorProto.FLOAT, [], [1.8]),
]
if b_has_zp:
initializers.extend([ # initializers
helper.make_tensor('b_zp', TensorProto.UINT8, [], [128]),
])
if has_bias:
if bias_ND:
initializers.extend([ # initializers
helper.make_tensor('bias', TensorProto.FLOAT, [3, 3], [3.0, 4.0, 6.0, 3.0, 4.0, 6.0, 3.0, 4.0, 5.0]),
])
else:
initializers.extend([ # initializers
helper.make_tensor('bias', TensorProto.FLOAT, [3], [3.0, 4.0, 5.0]),
])
graph = helper.make_graph(
nodes,
"DynamicQuantizeLinear_fusion", #name
@ -33,4 +55,7 @@ def GenerateModel(model_name):
onnx.save(model, model_name)
if __name__ == "__main__":
GenerateModel('dynamic_quantize_matmul.onnx')
GenerateModel('dynamic_quantize_matmul.onnx')
GenerateModel('dynamic_quantize_matmul_bias.onnx', True, True)
GenerateModel('dynamic_quantize_matmul_bias_b_no_zp.onnx', False, True)
GenerateModel('dynamic_quantize_matmul_bias_ND.onnx', False, True, True)

View file

@ -0,0 +1,60 @@
import onnx
from onnx import helper
from onnx import TensorProto
from enum import Enum
def MakeSubGraph(suffix, has_bias):
    """Build the nodes of one MatMulInteger -> Cast -> Mul (-> Add) branch.

    Names private to the branch get `suffix` appended. The tensors
    "a_quantized", "a_zp" and "a_scale" are referenced without a suffix.

    Args:
        suffix: string appended to the branch-local tensor and node names.
        has_bias: when truthy, a trailing Add of "bias" + suffix is emitted
            and the scaled product is written to "mul_output" + suffix
            instead of directly to "output" + suffix.

    Returns:
        A list of nodes created with helper.make_node, ordered
        MatMulInteger, Mul (scales), Cast, Mul (apply scale) and, when
        has_bias is truthy, Add.
    """
    def tag(base):
        # Attach the branch suffix to a tensor or node name.
        return base + suffix

    final_output = tag("output")
    # With a bias the scaled product is only an intermediate value.
    scaled_output = tag("mul_output") if has_bias else final_output

    int32_product = tag("matmul_output_int32")
    float_product = tag("matmul_output_float")
    combined_scale = tag("multiplier")

    subgraph = []
    subgraph.append(helper.make_node(
        "MatMulInteger",
        ["a_quantized", tag("b_quantized"), "a_zp", tag("b_zp")],
        [int32_product],
        tag("MatMulInteger")))
    subgraph.append(helper.make_node(
        "Mul",
        ["a_scale", tag("b_scale")],
        [combined_scale],
        tag("mul_right")))
    # to=1 is the ONNX element type code for float32 (TensorProto.FLOAT).
    subgraph.append(helper.make_node(
        "Cast",
        [int32_product],
        [float_product],
        tag("cast"),
        to=1))
    subgraph.append(helper.make_node(
        "Mul",
        [float_product, combined_scale],
        [scaled_output],
        tag("mul_bottom")))

    if has_bias:
        subgraph.append(helper.make_node(
            "Add",
            [scaled_output, tag("bias")],
            [final_output],
            tag("bias_add")))
    return subgraph
def MakeInitializer(suffix):
    """Create the constant tensors consumed by one branch.

    Args:
        suffix: string appended to each tensor name.

    Returns:
        A three-element list, in this order:
        'b_quantized' + suffix (UINT8, shape [2, 3]),
        'b_zp' + suffix (UINT8 scalar, value 128) and
        'b_scale' + suffix (FLOAT scalar, value 1.8).
    """
    weight = helper.make_tensor(
        'b_quantized' + suffix, TensorProto.UINT8, [2, 3], [2, 4, 5, 6, 7, 8])
    zero_point = helper.make_tensor(
        'b_zp' + suffix, TensorProto.UINT8, [], [128])
    scale = helper.make_tensor(
        'b_scale' + suffix, TensorProto.FLOAT, [], [1.8])
    return [weight, zero_point, scale]
def GenerateModel(model_name):
    """Assemble the three-branch test model and save it to `model_name`.

    A single DynamicQuantizeLinear node quantizes 'input'; its outputs feed
    three branches built by MakeSubGraph:
      * "_1": with a bias of shape [3],
      * "_2": with a bias of shape [3, 3],
      * "_3": without a bias.

    Args:
        model_name: path handed to onnx.save for the serialized model.
    """
    # (suffix, has_bias) for every branch, in emission order.
    branches = (("_1", True), ("_2", True), ("_3", False))

    nodes = [
        helper.make_node(
            "DynamicQuantizeLinear",
            ["input"],
            ["a_quantized", "a_scale", "a_zp"],
            "DynamicQuantizeLinear"),
    ]
    for suffix, with_bias in branches:
        nodes += MakeSubGraph(suffix, with_bias)

    initializers = []
    for suffix, _ in branches:
        initializers += MakeInitializer(suffix)
    # Only the first two branches have an Add node, so only they need a bias.
    initializers.append(
        helper.make_tensor('bias_1', TensorProto.FLOAT, [3], [2, 4, 5]))
    initializers.append(
        helper.make_tensor('bias_2', TensorProto.FLOAT, [3, 3], [1, 2, 3, 4, 5, 6, 7, 8, 9]))

    graph_inputs = [
        helper.make_tensor_value_info('input', TensorProto.FLOAT, [3, 2]),
    ]
    # One [3, 3] float output per branch: output_1, output_2, output_3.
    graph_outputs = [
        helper.make_tensor_value_info('output' + suffix, TensorProto.FLOAT, [3, 3])
        for suffix, _ in branches
    ]

    graph = helper.make_graph(
        nodes,
        "MatMulIntegerToFloat_fusion",  # name
        graph_inputs,
        graph_outputs,
        initializers)

    onnx.save(helper.make_model(graph), model_name)
if __name__ == "__main__":
    # Script entry point. The file name is relative, so the model is written
    # to the current working directory.
    GenerateModel('matmul_integer_to_float.onnx')