mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-07-03 03:58:54 +00:00
Fuse MatMulInteger and scale followed (#4350)
* Fuse MatMulInteger and scale followed * Add bias
This commit is contained in:
parent
10c25416bb
commit
67a7d93b49
28 changed files with 998 additions and 55 deletions
|
|
@ -47,6 +47,8 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1,
|
|||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float_uint8_t_int8_t, QAttention);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, DynamicQuantizeMatMul);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, DynamicQuantizeMatMul);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, MatMulIntegerToFloat);
|
||||
class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, MatMulIntegerToFloat);
|
||||
// ******** End: Quantization ******************* //
|
||||
|
||||
// This section includes all op kernel declarations for former experimental ops which have now been removed from onnx.
|
||||
|
|
@ -108,6 +110,8 @@ Status RegisterQuantizationKernels(KernelRegistry& kernel_registry) {
|
|||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float_uint8_t_int8_t, QAttention)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, DynamicQuantizeMatMul)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, DynamicQuantizeMatMul)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, MatMulIntegerToFloat)>,
|
||||
BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, MatMulIntegerToFloat)>,
|
||||
};
|
||||
|
||||
for (auto& function_table_entry : function_table) {
|
||||
|
|
|
|||
|
|
@ -52,7 +52,6 @@ template <typename T>
|
|||
Status DynamicQuantizeMatMul<T>::Compute(OpKernelContext* ctx) const {
|
||||
auto* a = ctx->Input<Tensor>(0);
|
||||
auto* b = ctx->Input<Tensor>(1);
|
||||
ORT_ENFORCE(a != nullptr && b != nullptr);
|
||||
|
||||
auto* b_scale_tensor = ctx->Input<Tensor>(2);
|
||||
ORT_ENFORCE(IsScalarOr1ElementVector(b_scale_tensor),
|
||||
|
|
@ -88,6 +87,8 @@ Status DynamicQuantizeMatMul<T>::Compute(OpKernelContext* ctx) const {
|
|||
|
||||
const auto* b_data = b->template Data<T>();
|
||||
|
||||
const Tensor* bias_tensor = ctx->Input<Tensor>(4);
|
||||
|
||||
Tensor* y = ctx->Output(0, helper.OutputShape());
|
||||
auto* y_data = y->template MutableData<float>();
|
||||
|
||||
|
|
@ -107,7 +108,7 @@ Status DynamicQuantizeMatMul<T>::Compute(OpKernelContext* ctx) const {
|
|||
y_data + helper.OutputOffsets()[i],
|
||||
static_cast<int>(helper.N()),
|
||||
&multiplier,
|
||||
nullptr,
|
||||
nullptr != bias_tensor ? bias_tensor->Data<float>() : nullptr,
|
||||
thread_pool);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,95 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "matmul_integer_to_float.h"
|
||||
|
||||
#include "core/mlas/inc/mlas.h"
|
||||
#include "core/providers/common.h"
|
||||
#include "core/providers/cpu/math/matmul_helper.h"
|
||||
#include "core/util/math_cpuonly.h"
|
||||
#include "core/util/qmath.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace contrib {
|
||||
|
||||
// Registers one typed CPU kernel for the MatMulIntegerToFloat contrib op
// (kMSDomain, version 1, kCpuExecutionProvider).
// The macro argument T selects the element type bound to type constraint "T2";
// "T1" is always bound to uint8_t and "T3" to float, and the kernel class used
// is MatMulIntegerToFloat<uint8_t, T>.
#define REGISTER_MATMUL_INTEGER_TO_FLOAT(T) \
|
||||
ONNX_OPERATOR_TYPED_KERNEL_EX( \
|
||||
MatMulIntegerToFloat, \
|
||||
kMSDomain, \
|
||||
1, \
|
||||
T, \
|
||||
kCpuExecutionProvider, \
|
||||
KernelDefBuilder() \
|
||||
.TypeConstraint("T1", DataTypeImpl::GetTensorType<uint8_t>()) \
|
||||
.TypeConstraint("T2", DataTypeImpl::GetTensorType<T>()) \
|
||||
.TypeConstraint("T3", DataTypeImpl::GetTensorType<float>()), \
|
||||
MatMulIntegerToFloat<uint8_t, T>);
|
||||
|
||||
// Kernel registrations for the two supported "T2" element types.
REGISTER_MATMUL_INTEGER_TO_FLOAT(int8_t)
|
||||
REGISTER_MATMUL_INTEGER_TO_FLOAT(uint8_t)
|
||||
|
||||
template <typename T1, typename T2>
|
||||
Status MatMulIntegerToFloat<T1, T2>::Compute(OpKernelContext* ctx) const {
|
||||
const Tensor* a = ctx->Input<Tensor>(0);
|
||||
const Tensor* b = ctx->Input<Tensor>(1);
|
||||
|
||||
const Tensor* a_scale_tensor = ctx->Input<Tensor>(2);
|
||||
ORT_ENFORCE(IsScalarOr1ElementVector(a_scale_tensor),
|
||||
"MatMulIntegerToFloat : input A scale must be a scalar or 1D tensor of size 1. Per-Channel is not supported yet.");
|
||||
|
||||
float a_scale = *a_scale_tensor->template Data<float>();
|
||||
|
||||
const Tensor* b_scale_tensor = ctx->Input<Tensor>(3);
|
||||
ORT_ENFORCE(IsScalarOr1ElementVector(b_scale_tensor),
|
||||
"MatMulIntegerToFloat : input B scale must be a scalar or 1D tensor of size 1. Per-Channel is not supported yet.");
|
||||
|
||||
float b_scale = *b_scale_tensor->template Data<float>();
|
||||
|
||||
float multiplier = a_scale * b_scale;
|
||||
|
||||
// validate zero points
|
||||
T1 a_zp = 0;
|
||||
const Tensor* a_zp_tensor = ctx->Input<Tensor>(4);
|
||||
if (a_zp_tensor != nullptr) {
|
||||
ORT_ENFORCE(IsScalarOr1ElementVector(a_zp_tensor),
|
||||
"MatMulIntegerToFloat : input A zero point must be a scalar or 1D tensor of size 1. Per-Channel is not supported yet.");
|
||||
a_zp = *a_zp_tensor->template Data<T1>();
|
||||
}
|
||||
|
||||
T2 b_zp = 0;
|
||||
const Tensor* b_zp_tensor = ctx->Input<Tensor>(5);
|
||||
if (b_zp_tensor != nullptr) {
|
||||
ORT_ENFORCE(IsScalarOr1ElementVector(b_zp_tensor),
|
||||
"MatMulIntegerToFloat : input B zero point must be a scalar or 1D tensor of size 1. Per-Channel is not supported yet.");
|
||||
b_zp = *b_zp_tensor->template Data<T2>();
|
||||
}
|
||||
|
||||
MatMulComputeHelper helper;
|
||||
ORT_RETURN_IF_ERROR(helper.Compute(a->Shape(), b->Shape()));
|
||||
Tensor* Y = ctx->Output(0, helper.OutputShape());
|
||||
|
||||
const Tensor* bias_tensor = ctx->Input<Tensor>(6);
|
||||
|
||||
concurrency::ThreadPool* thread_pool = ctx->GetOperatorThreadPool();
|
||||
for (size_t i = 0; i < helper.OutputOffsets().size(); i++) {
|
||||
QGemm(static_cast<int>(helper.M()),
|
||||
static_cast<int>(helper.N()),
|
||||
static_cast<int>(helper.K()),
|
||||
a->template Data<T1>() + helper.LeftOffsets()[i],
|
||||
static_cast<int>(helper.K()),
|
||||
a_zp,
|
||||
b->template Data<T2>() + helper.RightOffsets()[i],
|
||||
static_cast<int>(helper.N()),
|
||||
b_zp,
|
||||
Y->template MutableData<float>() + helper.OutputOffsets()[i],
|
||||
static_cast<int>(helper.N()),
|
||||
&multiplier,
|
||||
nullptr != bias_tensor ? bias_tensor->Data<float>() : nullptr,
|
||||
thread_pool);
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
} // namespace contrib
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -0,0 +1,22 @@
|
|||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "core/common/common.h"
|
||||
#include "core/framework/op_kernel.h"
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace contrib {
|
||||
|
||||
template <typename T1, typename T2>
|
||||
class MatMulIntegerToFloat final : public OpKernel {
|
||||
public:
|
||||
MatMulIntegerToFloat(const OpKernelInfo& info) : OpKernel(info) {
|
||||
}
|
||||
|
||||
Status Compute(OpKernelContext* context) const override;
|
||||
};
|
||||
|
||||
} // namespace contrib
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -1696,6 +1696,11 @@ Matrix product that behaves like numpy.matmul: https://docs.scipy.org/doc/numpy-
|
|||
"of elements should be equal to the number of columns of input 'B'.",
|
||||
"T2",
|
||||
OpSchema::Optional)
|
||||
.Input(4,
|
||||
"bias",
|
||||
"1D input tensor, whose dimension is same as B's last dimension",
|
||||
"T1",
|
||||
OpSchema::Optional)
|
||||
.Output(0, "Y", "Matrix multiply results from A * B", "T1")
|
||||
.TypeConstraint(
|
||||
"T1",
|
||||
|
|
@ -1710,6 +1715,65 @@ Matrix product that behaves like numpy.matmul: https://docs.scipy.org/doc/numpy-
|
|||
ONNX_NAMESPACE::matmulShapeInference(ctx, 0, 1);
|
||||
});
|
||||
|
||||
ONNX_CONTRIB_OPERATOR_SCHEMA(MatMulIntegerToFloat)
|
||||
.SetDomain(kMSDomain)
|
||||
.SinceVersion(1)
|
||||
.Input(0, "A", "N-dimensional matrix A", "T1")
|
||||
.Input(1, "B", "N-dimensional matrix B", "T2")
|
||||
.Input(
|
||||
2,
|
||||
"a_scale",
|
||||
"Scale of quantized input 'A'. It could be a scalar or a 1-D tensor, "
|
||||
"which means a per-tensor or per-column quantization. If it's a 1-D tensor, its number "
|
||||
"of elements should be equal to the number of columns of input 'A'.",
|
||||
"T3")
|
||||
.Input(
|
||||
3,
|
||||
"b_scale",
|
||||
"Scale of quantized input 'B'. It could be a scalar or a 1-D tensor, "
|
||||
"which means a per-tensor or per-column quantization. If it's a 1-D tensor, its number "
|
||||
"of elements should be equal to the number of columns of input 'B'.",
|
||||
"T3")
|
||||
.Input(
|
||||
4,
|
||||
"a_zero_point",
|
||||
"Zero point tensor for input 'A'. It's optional and default value is 0. It could be a scalar or a 1-D tensor, "
|
||||
"which means a per-tensor or per-column quantization. If it's a 1-D tensor, its number "
|
||||
"of elements should be equal to the number of columns of input 'A'.",
|
||||
"T1",
|
||||
OpSchema::Optional)
|
||||
.Input(
|
||||
5,
|
||||
"b_zero_point",
|
||||
"Zero point tensor for input 'B'. It's optional and default value is 0. It could be a scalar or a 1-D tensor, "
|
||||
"which means a per-tensor or per-column quantization. If it's a 1-D tensor, its number "
|
||||
"of elements should be equal to the number of columns of input 'B'.",
|
||||
"T2",
|
||||
OpSchema::Optional)
|
||||
.Input(
|
||||
6,
|
||||
"bias",
|
||||
"1D input tensor, whose dimension is same as B's last dimension",
|
||||
"T3",
|
||||
OpSchema::Optional)
|
||||
.Output(0, "Y", "Matrix multiply results from A * B", "T3")
|
||||
.TypeConstraint(
|
||||
"T1",
|
||||
{"tensor(int8)", "tensor(uint8)"},
|
||||
"Constrain input A data type to 8-bit integer tensor.")
|
||||
.TypeConstraint(
|
||||
"T2",
|
||||
{"tensor(int8)", "tensor(uint8)"},
|
||||
"Constrain input B data type to 8-bit integer tensor.")
|
||||
.TypeConstraint(
|
||||
"T3",
|
||||
{"tensor(float)"},
|
||||
"Constrain input a_scale, b_scale and output Y data type as float tensor.")
|
||||
.TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) {
|
||||
propagateElemTypeFromInputToOutput(ctx, 2, 0);
|
||||
ONNX_NAMESPACE::matmulShapeInference(ctx, 0, 1);
|
||||
});
|
||||
|
||||
static const char* TransposeMatMul_doc = R"DOC(
|
||||
Matrix product that behaves like numpy.matmul: https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html
|
||||
)DOC";
|
||||
|
|
|
|||
|
|
@ -12,6 +12,26 @@ using namespace ONNX_NAMESPACE;
|
|||
using namespace ::onnxruntime::common;
|
||||
namespace onnxruntime {
|
||||
|
||||
// Check if bias is a 1-D tensor, or N-D tensor with the prior N-1 dimension equal to 1.
|
||||
// And its last dimension equal to MatMul's last dimension
|
||||
static bool CheckBiasShape(const TensorShapeProto* bias_shape, const TensorShapeProto* matmul_shape) {
|
||||
if (nullptr == matmul_shape || matmul_shape->dim_size() <= 1 ||
|
||||
nullptr == bias_shape || bias_shape->dim_size() < 1) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// First N-1 dimension must equal to 1
|
||||
for (int i = 0; i < bias_shape->dim_size() - 1; i++) {
|
||||
if (bias_shape->dim(i).dim_value() != 1) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
int64_t bias_last_dim = bias_shape->dim(bias_shape->dim_size() - 1).dim_value();
|
||||
int64_t matmul_last_dim = matmul_shape->dim(matmul_shape->dim_size() - 1).dim_value();
|
||||
return bias_last_dim == matmul_last_dim && bias_last_dim > 0;
|
||||
}
|
||||
|
||||
/**
|
||||
DynamicQuantizeMatMulFusion will fuse subgraph like below into DynamicQuantizeMatMul:
|
||||
(input)
|
||||
|
|
@ -20,14 +40,38 @@ DynamicQuantizeMatMulFusion will fuse subgraph like below into DynamicQuantizeMa
|
|||
DynamicQuantizeLinear --------+
|
||||
| |
|
||||
v v
|
||||
MatMulInteger (B const) Mul (B const)
|
||||
| |
|
||||
v v
|
||||
Cast ------------------>Mul
|
||||
MatMulInteger (B const) Mul (B const) (input)
|
||||
| | |
|
||||
v v v
|
||||
Cast ------------------>Mul ----> DynamicQuantizeMatMul
|
||||
| |
|
||||
v v
|
||||
Add (B const, Optional) (output)
|
||||
|
|
||||
v
|
||||
(output)
|
||||
*/
|
||||
(output)
|
||||
|
||||
It also fuses subgraph like below into MatMulIntegerToFloat:
|
||||
input input
|
||||
| |
|
||||
v v
|
||||
+----------------------------DynamicQuantizeLinear------------------------+ DynamicQuantizeLinear
|
||||
| | | |
|
||||
| +----------------+--------------+ | +---------+----------+
|
||||
| | | | | |
|
||||
V v v v V v
|
||||
MatMulInteger(B const) Mul(B const) MatMulInteger (B const) Mul (B const) ---> MatMulIntegerToFloat MatMulIntegerToFloat
|
||||
| | | | | |
|
||||
v v v v v v
|
||||
Cast ---------------->Mul Cast ---------------->Mul (output1) ----------(output2)
|
||||
| |
|
||||
v v
|
||||
Add (B const, Optional) Add (B const, Optional)
|
||||
| |
|
||||
v v
|
||||
(output1) (output2)
|
||||
|
||||
*/
|
||||
Status DynamicQuantizeMatMulFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const {
|
||||
GraphViewer graph_viewer(graph);
|
||||
const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder();
|
||||
|
|
@ -79,8 +123,7 @@ Status DynamicQuantizeMatMulFusion::ApplyImpl(Graph& graph, bool& modified, int
|
|||
// Check Nodes' Edges count and Nodes' outputs are not in Graph output
|
||||
if (!optimizer_utils::CheckOutputEdges(graph, cast_node, 1) ||
|
||||
!optimizer_utils::CheckOutputEdges(graph, matmulinteger_node, 1) ||
|
||||
!optimizer_utils::CheckOutputEdges(graph, mul_node_right, 1) ||
|
||||
!optimizer_utils::CheckOutputEdges(graph, dql_node_left, 3)) {
|
||||
!optimizer_utils::CheckOutputEdges(graph, mul_node_right, 1)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
|
|
@ -94,34 +137,87 @@ Status DynamicQuantizeMatMulFusion::ApplyImpl(Graph& graph, bool& modified, int
|
|||
continue;
|
||||
}
|
||||
|
||||
std::vector<NodeArg*> input_defs{dql_node_left.MutableInputDefs()[0],
|
||||
matmulinteger_node.MutableInputDefs()[1],
|
||||
mul_node_right.MutableInputDefs()[1]};
|
||||
|
||||
if (matmulinteger_node.InputDefs().size() == 4) {
|
||||
const NodeArg& matmulinteger_B_zp = *(matmulinteger_node.InputDefs()[3]);
|
||||
if (!graph_utils::IsConstantInitializer(graph, matmulinteger_B_zp.Name(), true)) {
|
||||
continue;
|
||||
// Find bias node
|
||||
Node* add_node = nullptr;
|
||||
// const Node* add_node = FindBiasNode(graph, mul_node, ;
|
||||
if (optimizer_utils::CheckOutputEdges(graph, mul_node, 1)) {
|
||||
const Node* tmp_add_node = graph_utils::FirstChildByType(mul_node, "Add");
|
||||
if (nullptr != tmp_add_node) {
|
||||
const NodeArg& tmp_add_node_B = *(tmp_add_node->InputDefs()[1]);
|
||||
if (graph_utils::IsConstantInitializer(graph, tmp_add_node_B.Name(), true) &&
|
||||
CheckBiasShape(tmp_add_node_B.Shape(), matmulinteger_B.Shape())) {
|
||||
add_node = graph.GetNode(tmp_add_node->Index());
|
||||
}
|
||||
}
|
||||
input_defs.push_back(matmulinteger_node.MutableInputDefs()[3]);
|
||||
}
|
||||
|
||||
Node& fused_node = graph.AddNode(graph.GenerateNodeName("DynamicQuantizeMatMul"),
|
||||
"DynamicQuantizeMatMul",
|
||||
"fused DynamicQuantizeMatMul",
|
||||
input_defs,
|
||||
mul_node.MutableOutputDefs(),
|
||||
nullptr,
|
||||
kMSDomain);
|
||||
// DynamicQuantizeLinear outputs are only used by one MatMulInteger,
|
||||
// thus it can fused into DynamicQuantizeMatMul
|
||||
NodeArg optional_node_arg("", nullptr);
|
||||
std::vector<NodeArg*> input_defs;
|
||||
std::string op_type_to_fuse = "DynamicQuantizeMatMul";
|
||||
if (optimizer_utils::CheckOutputEdges(graph, dql_node_left, 3)) {
|
||||
input_defs.push_back(dql_node_left.MutableInputDefs()[0]);
|
||||
input_defs.push_back(matmulinteger_node.MutableInputDefs()[1]);
|
||||
input_defs.push_back(mul_node_right.MutableInputDefs()[1]);
|
||||
input_defs.push_back(&optional_node_arg);
|
||||
|
||||
if (matmulinteger_node.InputDefs().size() == 4) {
|
||||
const NodeArg& matmulinteger_B_zp = *(matmulinteger_node.InputDefs()[3]);
|
||||
if (!graph_utils::IsConstantInitializer(graph, matmulinteger_B_zp.Name(), true)) {
|
||||
continue;
|
||||
}
|
||||
input_defs[3] = matmulinteger_node.MutableInputDefs()[3];
|
||||
}
|
||||
|
||||
nodes_to_remove.push_back(dql_node_left);
|
||||
} else {
|
||||
op_type_to_fuse = "MatMulIntegerToFloat";
|
||||
|
||||
input_defs.push_back(matmulinteger_node.MutableInputDefs()[0]);
|
||||
input_defs.push_back(matmulinteger_node.MutableInputDefs()[1]);
|
||||
input_defs.push_back(mul_node_right.MutableInputDefs()[0]);
|
||||
input_defs.push_back(mul_node_right.MutableInputDefs()[1]);
|
||||
input_defs.push_back(&optional_node_arg);
|
||||
input_defs.push_back(&optional_node_arg);
|
||||
|
||||
if (matmulinteger_node.InputDefs().size() >= 3) {
|
||||
// Add zero point of A
|
||||
input_defs[4] = matmulinteger_node.MutableInputDefs()[2];
|
||||
|
||||
// Add zero point of B
|
||||
if (matmulinteger_node.InputDefs().size() == 4) {
|
||||
const NodeArg& matmulinteger_B_zp = *(matmulinteger_node.InputDefs()[3]);
|
||||
if (!graph_utils::IsConstantInitializer(graph, matmulinteger_B_zp.Name(), true)) {
|
||||
continue;
|
||||
}
|
||||
input_defs[5] = matmulinteger_node.MutableInputDefs()[3];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (add_node != nullptr) {
|
||||
input_defs.push_back(add_node->MutableInputDefs()[1]);
|
||||
}
|
||||
|
||||
Node* fused_node = &graph.AddNode(graph.GenerateNodeName(op_type_to_fuse),
|
||||
op_type_to_fuse,
|
||||
"",
|
||||
input_defs,
|
||||
add_node != nullptr ? add_node->MutableOutputDefs() : mul_node.MutableOutputDefs(),
|
||||
nullptr,
|
||||
kMSDomain);
|
||||
// Assign provider to this new node. Provider should be same as the provider for old node.
|
||||
fused_node.SetExecutionProviderType(mul_node.GetExecutionProviderType());
|
||||
ORT_ENFORCE(nullptr != fused_node);
|
||||
fused_node->SetExecutionProviderType(mul_node.GetExecutionProviderType());
|
||||
|
||||
nodes_to_remove.push_back(dql_node_left);
|
||||
nodes_to_remove.push_back(matmulinteger_node);
|
||||
nodes_to_remove.push_back(cast_node);
|
||||
nodes_to_remove.push_back(mul_node_right);
|
||||
nodes_to_remove.push_back(mul_node);
|
||||
if (add_node != nullptr) {
|
||||
nodes_to_remove.push_back(*add_node);
|
||||
}
|
||||
}
|
||||
|
||||
for (const auto& node : nodes_to_remove) {
|
||||
|
|
|
|||
|
|
@ -23,7 +23,9 @@ namespace test {
|
|||
template <typename T>
|
||||
void TestDynamicQuantizeMatMul(const std::vector<int64_t>& A_dims,
|
||||
std::vector<int64_t> B_dims,
|
||||
const std::string& reference_model) {
|
||||
const std::string& reference_model,
|
||||
bool has_zp = true,
|
||||
bool has_bias = false) {
|
||||
// create rand inputs
|
||||
RandomValueGenerator random{};
|
||||
|
||||
|
|
@ -38,11 +40,24 @@ void TestDynamicQuantizeMatMul(const std::vector<int64_t>& A_dims,
|
|||
std::vector<float> B_scale = random.Uniform<float>({1}, -0.1f, 0.1f);
|
||||
std::vector<T> B_zero_point = {static_cast<T>(random.Uniform<int32_t>({1}, std::numeric_limits<T>::min(), std::numeric_limits<T>::max())[0])};
|
||||
|
||||
std::vector<float> Bias = random.Uniform<float>({B_dims.back()}, -0.1f, 0.1f);
|
||||
|
||||
OpTester test("DynamicQuantizeMatMul", 1, onnxruntime::kMSDomain);
|
||||
test.AddInput<float>("A", A_dims, A_data);
|
||||
test.AddInput<T>("B", B_dims, B_data);
|
||||
test.AddInput<float>("b_scale", {1}, B_scale);
|
||||
test.AddInput<T>("b_zero_point", {1}, B_zero_point);
|
||||
|
||||
if (has_zp) {
|
||||
test.AddInput<T>("b_zero_point", {1}, B_zero_point);
|
||||
} else {
|
||||
test.AddMissingOptionalInput<T>();
|
||||
}
|
||||
|
||||
if (has_bias) {
|
||||
test.AddInput<float>("bias", {B_dims.back()}, Bias);
|
||||
} else {
|
||||
test.AddMissingOptionalInput<float>();
|
||||
}
|
||||
|
||||
test.AddReferenceOutputs(reference_model);
|
||||
test.Run();
|
||||
|
|
@ -56,6 +71,16 @@ TEST(DynamicQuantizeMatMul, Int8_test) {
|
|||
TestDynamicQuantizeMatMul<int8_t>(A_dims, B_dims, "testdata/dynamic_quantize_matmul_int8.onnx");
|
||||
}
|
||||
|
||||
// DynamicQuantizeMatMul with an int8 B, no explicit B zero point and a bias.
// Only compiled in when MLAS_SUPPORTS_GEMM_U8X8 is defined.
TEST(DynamicQuantizeMatMul, Int8_test_bias) {
#ifdef MLAS_SUPPORTS_GEMM_U8X8
  std::vector<int64_t> A_dims{4, 128};
  std::vector<int64_t> B_dims{128, 128};

  // Trailing arguments: has_zp = false, has_bias = true.
  TestDynamicQuantizeMatMul<int8_t>(A_dims, B_dims, "testdata/dynamic_quantize_matmul_int8_bias.onnx", false, true);
#endif
}
|
||||
|
||||
TEST(DynamicQuantizeMatMul, UInt8_test) {
|
||||
std::vector<int64_t> A_dims{4, 128};
|
||||
std::vector<int64_t> B_dims{128, 128};
|
||||
|
|
@ -64,5 +89,13 @@ TEST(DynamicQuantizeMatMul, UInt8_test) {
|
|||
TestDynamicQuantizeMatMul<uint8_t>(A_dims, B_dims, "testdata/dynamic_quantize_matmul_uint8.onnx");
|
||||
}
|
||||
|
||||
// DynamicQuantizeMatMul with a uint8 B, no explicit B zero point and a bias.
TEST(DynamicQuantizeMatMul, UInt8_test_bias) {
  std::vector<int64_t> A_dims{4, 128};
  std::vector<int64_t> B_dims{128, 128};

  // Trailing arguments: has_zp = false, has_bias = true.
  TestDynamicQuantizeMatMul<uint8_t>(A_dims, B_dims, "testdata/dynamic_quantize_matmul_uint8_bias.onnx", false, true);
}
|
||||
|
||||
} // namespace test
|
||||
} // namespace onnxruntime
|
||||
|
|
|
|||
115
onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
Normal file
115
onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc
Normal file
|
|
@ -0,0 +1,115 @@
|
|||
|
||||
|
||||
// Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
#include "core/framework/tensor.h"
|
||||
#include "core/session/inference_session.h"
|
||||
#include "test/common/tensor_op_test_utils.h"
|
||||
#include "test/framework/test_utils.h"
|
||||
#include "test/providers/provider_test_utils.h"
|
||||
#include "test/util/include/default_providers.h"
|
||||
#include "core/util/qmath.h"
|
||||
|
||||
#include <chrono>
|
||||
#include <random>
|
||||
|
||||
#include "gtest/gtest.h"
|
||||
#include "gmock/gmock.h"
|
||||
|
||||
using namespace std;
|
||||
|
||||
namespace onnxruntime {
|
||||
namespace test {
|
||||
|
||||
template <typename T>
|
||||
void TestMatMulIntegerToFloat(const std::vector<int64_t>& A_dims,
|
||||
std::vector<int64_t> B_dims,
|
||||
const std::string& reference_model,
|
||||
bool has_zp = true,
|
||||
bool has_bias = false) {
|
||||
// create rand inputs
|
||||
RandomValueGenerator random{};
|
||||
|
||||
std::vector<uint8_t> A_data;
|
||||
std::vector<int> tmp_A_data = random.Uniform<int32_t>(A_dims, 0, 255);
|
||||
std::transform(tmp_A_data.begin(), tmp_A_data.end(), std::back_inserter(A_data), [](int32_t v) -> T {
|
||||
return static_cast<uint8_t>(v);
|
||||
});
|
||||
|
||||
std::vector<T> B_data;
|
||||
std::vector<int> tmp_B_data = random.Uniform<int32_t>(B_dims, std::numeric_limits<T>::min(), std::numeric_limits<T>::max());
|
||||
std::transform(tmp_B_data.begin(), tmp_B_data.end(), std::back_inserter(B_data), [](int32_t v) -> T {
|
||||
return static_cast<T>(v);
|
||||
});
|
||||
|
||||
std::vector<float> A_scale = random.Uniform<float>({1}, -0.1f, 0.1f);
|
||||
std::vector<float> B_scale = random.Uniform<float>({1}, -0.1f, 0.1f);
|
||||
|
||||
std::vector<uint8_t> A_zero_point{127};
|
||||
std::vector<T> B_zero_point = {static_cast<T>(random.Uniform<int32_t>({1}, std::numeric_limits<T>::min(), std::numeric_limits<T>::max())[0])};
|
||||
|
||||
std::vector<float> Bias = random.Uniform<float>({B_dims.back()}, -0.1f, 0.1f);
|
||||
|
||||
OpTester test("MatMulIntegerToFloat", 1, onnxruntime::kMSDomain);
|
||||
test.AddInput<uint8_t>("A", A_dims, A_data);
|
||||
test.AddInput<T>("B", B_dims, B_data);
|
||||
test.AddInput<float>("a_scale", {1}, A_scale);
|
||||
test.AddInput<float>("b_scale", {1}, B_scale);
|
||||
|
||||
if (has_zp) {
|
||||
test.AddInput<uint8_t>("a_zero_point", {1}, A_zero_point);
|
||||
test.AddInput<T>("b_zero_point", {1}, B_zero_point);
|
||||
} else {
|
||||
test.AddMissingOptionalInput<T>();
|
||||
test.AddMissingOptionalInput<T>();
|
||||
}
|
||||
|
||||
if (has_bias) {
|
||||
test.AddInput<float>("bias", {B_dims.back()}, Bias);
|
||||
} else {
|
||||
test.AddMissingOptionalInput<float>();
|
||||
}
|
||||
|
||||
test.AddReferenceOutputs(reference_model);
|
||||
test.Run();
|
||||
}
|
||||
|
||||
// MatMulIntegerToFloat with an int8 B and the helper's default flags
// (zero points present, no bias).
// Only compiled in when MLAS_SUPPORTS_GEMM_U8X8 is defined.
TEST(MatMulIntegerToFloat, Int8_test) {
#ifdef MLAS_SUPPORTS_GEMM_U8X8
  std::vector<int64_t> A_dims{4, 128};
  std::vector<int64_t> B_dims{128, 128};

  TestMatMulIntegerToFloat<int8_t>(A_dims, B_dims, "testdata/matmul_integer_to_float_int8.onnx");
#endif
}
|
||||
|
||||
// MatMulIntegerToFloat with an int8 B, no zero points and a bias.
// Only compiled in when MLAS_SUPPORTS_GEMM_U8X8 is defined.
TEST(MatMulIntegerToFloat, Int8_bias_test) {
#ifdef MLAS_SUPPORTS_GEMM_U8X8
  std::vector<int64_t> A_dims{4, 128};
  std::vector<int64_t> B_dims{128, 128};

  // Trailing arguments: has_zp = false, has_bias = true.
  TestMatMulIntegerToFloat<int8_t>(A_dims, B_dims, "testdata/matmul_integer_to_float_int8_bias.onnx", false, true);
#endif
}
|
||||
|
||||
// MatMulIntegerToFloat with a uint8 B and the helper's default flags
// (zero points present, no bias).
TEST(MatMulIntegerToFloat, UInt8_test) {
  std::vector<int64_t> A_dims{4, 128};
  std::vector<int64_t> B_dims{128, 128};

  TestMatMulIntegerToFloat<uint8_t>(A_dims, B_dims, "testdata/matmul_integer_to_float_uint8.onnx");
}
|
||||
|
||||
// MatMulIntegerToFloat with a uint8 B, no zero points and a bias.
TEST(MatMulIntegerToFloat, UInt8_bias_test) {
  std::vector<int64_t> A_dims{4, 128};
  std::vector<int64_t> B_dims{128, 128};

  // Trailing arguments: has_zp = false, has_bias = true.
  TestMatMulIntegerToFloat<uint8_t>(A_dims, B_dims, "testdata/matmul_integer_to_float_uint8_bias.onnx", false, true);
}
|
||||
|
||||
} // namespace test
|
||||
} // namespace onnxruntime
|
||||
|
|
@ -2472,6 +2472,84 @@ TEST_F(GraphTransformationTests, DynamicQuantizeMatMulTest) {
|
|||
EXPECT_EQ(op_to_count["DynamicQuantizeMatMul"], 1);
|
||||
}
|
||||
|
||||
// Applies DynamicQuantizeMatMulFusion to the bias model and expects a single
// DynamicQuantizeMatMul node with no DynamicQuantizeLinear, MatMulInteger,
// Cast or Mul nodes left.
TEST_F(GraphTransformationTests, DynamicQuantizeMatMulTest_With_Bias) {
  auto model_path = MODEL_FOLDER "fusion/dynamic_quantize_matmul_bias.onnx";
  std::shared_ptr<Model> model;
  ASSERT_STATUS_OK(Model::Load(model_path, model, nullptr, *logger_));
  Graph& main_graph = model->MainGraph();

  // Run only the fusion under test, at Level2.
  onnxruntime::GraphTransformerManager transformer_mgr{5};
  transformer_mgr.Register(onnxruntime::make_unique<DynamicQuantizeMatMulFusion>(), TransformerLevel::Level2);
  ASSERT_TRUE(transformer_mgr.ApplyTransformers(main_graph, TransformerLevel::Level2, *logger_).IsOK());

  std::map<std::string, int> op_counts = CountOpsInGraph(main_graph);
  EXPECT_EQ(op_counts["DynamicQuantizeLinear"], 0);
  EXPECT_EQ(op_counts["MatMulInteger"], 0);
  EXPECT_EQ(op_counts["Cast"], 0);
  EXPECT_EQ(op_counts["Mul"], 0);
  EXPECT_EQ(op_counts["DynamicQuantizeMatMul"], 1);
}
|
||||
|
||||
// Applies DynamicQuantizeMatMulFusion to the N-D bias model. The test expects
// one DynamicQuantizeMatMul node and one remaining Add node, with no
// DynamicQuantizeLinear, MatMulInteger, Cast or Mul nodes left.
TEST_F(GraphTransformationTests, DynamicQuantizeMatMulTest_With_ND_bias) {
  auto model_path = MODEL_FOLDER "fusion/dynamic_quantize_matmul_bias_ND.onnx";
  std::shared_ptr<Model> model;
  ASSERT_STATUS_OK(Model::Load(model_path, model, nullptr, *logger_));
  Graph& main_graph = model->MainGraph();

  // Run only the fusion under test, at Level2.
  onnxruntime::GraphTransformerManager transformer_mgr{5};
  transformer_mgr.Register(onnxruntime::make_unique<DynamicQuantizeMatMulFusion>(), TransformerLevel::Level2);
  ASSERT_TRUE(transformer_mgr.ApplyTransformers(main_graph, TransformerLevel::Level2, *logger_).IsOK());

  std::map<std::string, int> op_counts = CountOpsInGraph(main_graph);
  EXPECT_EQ(op_counts["DynamicQuantizeLinear"], 0);
  EXPECT_EQ(op_counts["MatMulInteger"], 0);
  EXPECT_EQ(op_counts["Cast"], 0);
  EXPECT_EQ(op_counts["Mul"], 0);
  EXPECT_EQ(op_counts["DynamicQuantizeMatMul"], 1);
  EXPECT_EQ(op_counts["Add"], 1);
}
|
||||
|
||||
// Applies DynamicQuantizeMatMulFusion to the bias model without a B zero point
// and expects a single DynamicQuantizeMatMul node with no
// DynamicQuantizeLinear, MatMulInteger, Cast or Mul nodes left.
TEST_F(GraphTransformationTests, DynamicQuantizeMatMulTest_With_Bias_No_B_ZP) {
  auto model_path = MODEL_FOLDER "fusion/dynamic_quantize_matmul_bias_b_no_zp.onnx";
  std::shared_ptr<Model> model;
  ASSERT_STATUS_OK(Model::Load(model_path, model, nullptr, *logger_));
  Graph& main_graph = model->MainGraph();

  // Run only the fusion under test, at Level2.
  onnxruntime::GraphTransformerManager transformer_mgr{5};
  transformer_mgr.Register(onnxruntime::make_unique<DynamicQuantizeMatMulFusion>(), TransformerLevel::Level2);
  ASSERT_TRUE(transformer_mgr.ApplyTransformers(main_graph, TransformerLevel::Level2, *logger_).IsOK());

  std::map<std::string, int> op_counts = CountOpsInGraph(main_graph);
  EXPECT_EQ(op_counts["DynamicQuantizeLinear"], 0);
  EXPECT_EQ(op_counts["MatMulInteger"], 0);
  EXPECT_EQ(op_counts["Cast"], 0);
  EXPECT_EQ(op_counts["Mul"], 0);
  EXPECT_EQ(op_counts["DynamicQuantizeMatMul"], 1);
}
|
||||
|
||||
// Applies DynamicQuantizeMatMulFusion to the matmul_integer_to_float model.
// The test expects three MatMulIntegerToFloat nodes, one remaining
// DynamicQuantizeLinear node and one remaining Add node, with no
// MatMulInteger, Cast or Mul nodes left.
TEST_F(GraphTransformationTests, MatMulIntegerToFloatTest) {
  auto model_path = MODEL_FOLDER "fusion/matmul_integer_to_float.onnx";
  std::shared_ptr<Model> model;
  ASSERT_STATUS_OK(Model::Load(model_path, model, nullptr, *logger_));
  Graph& main_graph = model->MainGraph();

  // Run only the fusion under test, at Level2.
  onnxruntime::GraphTransformerManager transformer_mgr{5};
  transformer_mgr.Register(onnxruntime::make_unique<DynamicQuantizeMatMulFusion>(), TransformerLevel::Level2);
  ASSERT_TRUE(transformer_mgr.ApplyTransformers(main_graph, TransformerLevel::Level2, *logger_).IsOK());

  std::map<std::string, int> op_counts = CountOpsInGraph(main_graph);
  EXPECT_EQ(op_counts["DynamicQuantizeLinear"], 1);
  EXPECT_EQ(op_counts["MatMulInteger"], 0);
  EXPECT_EQ(op_counts["Cast"], 0);
  EXPECT_EQ(op_counts["Mul"], 0);
  EXPECT_EQ(op_counts["MatMulIntegerToFloat"], 3);
  EXPECT_EQ(op_counts["Add"], 1);
}
|
||||
|
||||
#endif
|
||||
|
||||
} // namespace test
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
:è
|
||||
:č
|
||||
M
|
||||
Aa_quantizeda_scalea_zpDynamicQuantizeLinear"DynamicQuantizeLinear
|
||||
W
|
||||
|
|
@ -17,7 +17,7 @@ A
|
|||
matmul_output_float
|
||||
|
||||
multiplierY
|
||||
mul_bottom"MulDynamicQuantizeLinear_fusionZ
|
||||
mul_bottom"MulDynamicQuantizeMatMul_fusionZ
|
||||
A
|
||||
|
||||
|
||||
|
|
@ -40,4 +40,4 @@ mul_bottom"MulDynamicQuantizeLinear_fusionZ
|
|||
|
||||
|
||||
M
|
||||
NB
|
||||
NB
|
||||
44
onnxruntime/test/testdata/dynamic_quantize_matmul_int8_bias.onnx
vendored
Normal file
44
onnxruntime/test/testdata/dynamic_quantize_matmul_int8_bias.onnx
vendored
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
:‹
|
||||
M
|
||||
Aa_quantizeda_scalea_zpDynamicQuantizeLinear"DynamicQuantizeLinear
|
||||
I
|
||||
a_quantized
|
||||
B
|
||||
a_zpmatmul_output_int32
MatMulInteger"
MatMulInteger
|
||||
.
|
||||
a_scale
|
||||
b_scale
|
||||
multiplier mul_right"Mul
|
||||
A
|
||||
matmul_output_int32matmul_output_floatcast"Cast*
|
||||
to
|
||||
E
|
||||
matmul_output_float
|
||||
|
||||
multipliermul_bottom_output
|
||||
mul_bottom"Mul
|
||||
&
|
||||
mul_bottom_output
|
||||
biasYadd"AddDynamicQuantizeMatMul_fusionZ
|
||||
A
|
||||
|
||||
|
||||
M
|
||||
KZ
|
||||
B
|
||||
|
||||
|
||||
K
|
||||
NZ
|
||||
b_scale
|
||||
|
||||
|
||||
Z
|
||||
bias
|
||||
|
||||
Nb
|
||||
Y
|
||||
|
||||
|
||||
M
|
||||
NB
|
||||
|
|
@ -3,24 +3,41 @@ from onnx import helper
|
|||
from onnx import TensorProto
|
||||
from enum import Enum
|
||||
|
||||
def GenerateModel(model_name, sign):
|
||||
def GenerateModel(model_name, sign, b_zp = True, bias = False):
|
||||
nodes = [ # DynamicQuantizeMatMul subgraph
|
||||
helper.make_node("DynamicQuantizeLinear", ["A"], ["a_quantized", "a_scale", "a_zp"], "DynamicQuantizeLinear"),
|
||||
helper.make_node("MatMulInteger", ["a_quantized", "B", "a_zp", "b_zero_point"], ["matmul_output_int32"], "MatMulInteger"),
|
||||
|
||||
helper.make_node(
|
||||
"MatMulInteger",
|
||||
["a_quantized", "B", "a_zp", "b_zero_point"] if b_zp else ["a_quantized", "B", "a_zp"],
|
||||
["matmul_output_int32"],
|
||||
"MatMulInteger"),
|
||||
|
||||
helper.make_node("Mul", ["a_scale", "b_scale"], ["multiplier"], "mul_right"),
|
||||
|
||||
helper.make_node("Cast", ["matmul_output_int32"], ["matmul_output_float"], "cast", to=1),
|
||||
helper.make_node("Mul", ["matmul_output_float", "multiplier"], ["Y"], "mul_bottom"),
|
||||
|
||||
helper.make_node("Mul", ["matmul_output_float", "multiplier"], ["mul_bottom_output" if bias else "Y"], "mul_bottom"),
|
||||
]
|
||||
|
||||
inputs = [
|
||||
helper.make_tensor_value_info('A', TensorProto.FLOAT, ['M', 'K']),
|
||||
helper.make_tensor_value_info('B', TensorProto.INT8 if sign else TensorProto.UINT8, ['K', 'N']),
|
||||
helper.make_tensor_value_info('b_scale', TensorProto.FLOAT, [1]),
|
||||
]
|
||||
|
||||
if b_zp:
|
||||
inputs.extend([helper.make_tensor_value_info('b_zero_point', TensorProto.INT8 if sign else TensorProto.UINT8, [1])])
|
||||
|
||||
if bias:
|
||||
nodes.extend([helper.make_node("Add", ["mul_bottom_output", "bias"], ["Y"], "add")])
|
||||
|
||||
inputs.extend([helper.make_tensor_value_info('bias', TensorProto.FLOAT, ['N'])])
|
||||
|
||||
graph = helper.make_graph(
|
||||
nodes,
|
||||
"DynamicQuantizeMatMul_fusion", #name
|
||||
[ # inputs
|
||||
helper.make_tensor_value_info('A', TensorProto.FLOAT, ['M', 'K']),
|
||||
helper.make_tensor_value_info('B', TensorProto.INT8 if sign else TensorProto.UINT8, ['K', 'N']),
|
||||
helper.make_tensor_value_info('b_scale', TensorProto.FLOAT, [1]),
|
||||
helper.make_tensor_value_info('b_zero_point', TensorProto.INT8 if sign else TensorProto.UINT8, [1]),
|
||||
],
|
||||
inputs,
|
||||
[ # outputs
|
||||
helper.make_tensor_value_info('Y', TensorProto.FLOAT, ['M', 'N']),
|
||||
])
|
||||
|
|
@ -30,4 +47,6 @@ def GenerateModel(model_name, sign):
|
|||
|
||||
if __name__ == "__main__":
|
||||
GenerateModel('dynamic_quantize_matmul_int8.onnx', True)
|
||||
GenerateModel('dynamic_quantize_matmul_int8.onnx', False)
|
||||
GenerateModel('dynamic_quantize_matmul_uint8.onnx', False)
|
||||
GenerateModel('dynamic_quantize_matmul_int8_bias.onnx', True, False, True)
|
||||
GenerateModel('dynamic_quantize_matmul_uint8_bias.onnx', False, False, True)
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
:è
|
||||
:č
|
||||
M
|
||||
Aa_quantizeda_scalea_zpDynamicQuantizeLinear"DynamicQuantizeLinear
|
||||
W
|
||||
|
|
@ -17,7 +17,7 @@ A
|
|||
matmul_output_float
|
||||
|
||||
multiplierY
|
||||
mul_bottom"MulDynamicQuantizeLinear_fusionZ
|
||||
mul_bottom"MulDynamicQuantizeMatMul_fusionZ
|
||||
A
|
||||
|
||||
|
||||
|
|
@ -40,4 +40,4 @@ mul_bottom"MulDynamicQuantizeLinear_fusionZ
|
|||
|
||||
|
||||
M
|
||||
NB
|
||||
NB
|
||||
44
onnxruntime/test/testdata/dynamic_quantize_matmul_uint8_bias.onnx
vendored
Normal file
44
onnxruntime/test/testdata/dynamic_quantize_matmul_uint8_bias.onnx
vendored
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
:‹
|
||||
M
|
||||
Aa_quantizeda_scalea_zpDynamicQuantizeLinear"DynamicQuantizeLinear
|
||||
I
|
||||
a_quantized
|
||||
B
|
||||
a_zpmatmul_output_int32
MatMulInteger"
MatMulInteger
|
||||
.
|
||||
a_scale
|
||||
b_scale
|
||||
multiplier mul_right"Mul
|
||||
A
|
||||
matmul_output_int32matmul_output_floatcast"Cast*
|
||||
to
|
||||
E
|
||||
matmul_output_float
|
||||
|
||||
multipliermul_bottom_output
|
||||
mul_bottom"Mul
|
||||
&
|
||||
mul_bottom_output
|
||||
biasYadd"AddDynamicQuantizeMatMul_fusionZ
|
||||
A
|
||||
|
||||
|
||||
M
|
||||
KZ
|
||||
B
|
||||
|
||||
|
||||
K
|
||||
NZ
|
||||
b_scale
|
||||
|
||||
|
||||
Z
|
||||
bias
|
||||
|
||||
Nb
|
||||
Y
|
||||
|
||||
|
||||
M
|
||||
NB
|
||||
55
onnxruntime/test/testdata/matmul_integer_to_float.py
vendored
Normal file
55
onnxruntime/test/testdata/matmul_integer_to_float.py
vendored
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
import onnx
|
||||
from onnx import helper
|
||||
from onnx import TensorProto
|
||||
from enum import Enum
|
||||
|
||||
def GenerateModel(model_name, sign, has_zp = True, bias = False):
|
||||
nodes = [ # subgraph
|
||||
helper.make_node(
|
||||
"MatMulInteger",
|
||||
["A", "B", "a_zero_point", "b_zero_point"] if has_zp else ["A", "B"],
|
||||
["matmul_output_int32"],
|
||||
"MatMulInteger"),
|
||||
|
||||
helper.make_node("Mul", ["a_scale", "b_scale"], ["multiplier"], "mul_right"),
|
||||
|
||||
helper.make_node("Cast", ["matmul_output_int32"], ["matmul_output_float"], "cast", to=1),
|
||||
|
||||
helper.make_node("Mul", ["matmul_output_float", "multiplier"], ["mul_bottom_output" if bias else "Y"], "mul_bottom"),
|
||||
]
|
||||
|
||||
inputs = [ # inputs
|
||||
helper.make_tensor_value_info('A', TensorProto.UINT8, ['M', 'K']),
|
||||
helper.make_tensor_value_info('B', TensorProto.INT8 if sign else TensorProto.UINT8, ['K', 'N']),
|
||||
helper.make_tensor_value_info('a_scale', TensorProto.FLOAT, [1]),
|
||||
helper.make_tensor_value_info('b_scale', TensorProto.FLOAT, [1]),
|
||||
|
||||
]
|
||||
|
||||
if has_zp:
|
||||
inputs.extend([
|
||||
helper.make_tensor_value_info('a_zero_point', TensorProto.UINT8, [1]),
|
||||
helper.make_tensor_value_info('b_zero_point', TensorProto.INT8 if sign else TensorProto.UINT8, [1]),
|
||||
])
|
||||
|
||||
if bias:
|
||||
nodes.extend([helper.make_node("Add", ["mul_bottom_output", "bias"], ["Y"], "add")])
|
||||
|
||||
inputs.extend([helper.make_tensor_value_info('bias', TensorProto.FLOAT, ['N'])])
|
||||
|
||||
graph = helper.make_graph(
|
||||
nodes,
|
||||
"DynamicQuantizeMatMul_fusion", #name
|
||||
inputs,
|
||||
[ # outputs
|
||||
helper.make_tensor_value_info('Y', TensorProto.FLOAT, ['M', 'N']),
|
||||
])
|
||||
|
||||
model = helper.make_model(graph)
|
||||
onnx.save(model, model_name)
|
||||
|
||||
if __name__ == "__main__":
|
||||
GenerateModel('matmul_integer_to_float_int8.onnx', True)
|
||||
GenerateModel('matmul_integer_to_float_uint8.onnx', False)
|
||||
GenerateModel('matmul_integer_to_float_int8_bias.onnx', True, False, True)
|
||||
GenerateModel('matmul_integer_to_float_uint8_bias.onnx', False, False, True)
|
||||
49
onnxruntime/test/testdata/matmul_integer_to_float_int8.onnx
vendored
Normal file
49
onnxruntime/test/testdata/matmul_integer_to_float_int8.onnx
vendored
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
:Ę
|
||||
U
|
||||
A
|
||||
B
|
||||
a_zero_point
|
||||
b_zero_pointmatmul_output_int32
MatMulInteger"
MatMulInteger
|
||||
.
|
||||
a_scale
|
||||
b_scale
|
||||
multiplier mul_right"Mul
|
||||
A
|
||||
matmul_output_int32matmul_output_floatcast"Cast*
|
||||
to
|
||||
5
|
||||
matmul_output_float
|
||||
|
||||
multiplierY
|
||||
mul_bottom"MulDynamicQuantizeMatMul_fusionZ
|
||||
A
|
||||
|
||||
|
||||
M
|
||||
KZ
|
||||
B
|
||||
|
||||
|
||||
K
|
||||
NZ
|
||||
a_scale
|
||||
|
||||
|
||||
Z
|
||||
b_scale
|
||||
|
||||
|
||||
Z
|
||||
a_zero_point
|
||||
|
||||
|
||||
Z
|
||||
b_zero_point
|
||||
|
||||
|
||||
b
|
||||
Y
|
||||
|
||||
|
||||
M
|
||||
NB
|
||||
45
onnxruntime/test/testdata/matmul_integer_to_float_int8_bias.onnx
vendored
Normal file
45
onnxruntime/test/testdata/matmul_integer_to_float_int8_bias.onnx
vendored
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
:Ã
|
||||
9
|
||||
A
|
||||
Bmatmul_output_int32
MatMulInteger"
MatMulInteger
|
||||
.
|
||||
a_scale
|
||||
b_scale
|
||||
multiplier mul_right"Mul
|
||||
A
|
||||
matmul_output_int32matmul_output_floatcast"Cast*
|
||||
to
|
||||
E
|
||||
matmul_output_float
|
||||
|
||||
multipliermul_bottom_output
|
||||
mul_bottom"Mul
|
||||
&
|
||||
mul_bottom_output
|
||||
biasYadd"AddDynamicQuantizeMatMul_fusionZ
|
||||
A
|
||||
|
||||
|
||||
M
|
||||
KZ
|
||||
B
|
||||
|
||||
|
||||
K
|
||||
NZ
|
||||
a_scale
|
||||
|
||||
|
||||
Z
|
||||
b_scale
|
||||
|
||||
|
||||
Z
|
||||
bias
|
||||
|
||||
Nb
|
||||
Y
|
||||
|
||||
|
||||
M
|
||||
NB
|
||||
49
onnxruntime/test/testdata/matmul_integer_to_float_uint8.onnx
vendored
Normal file
49
onnxruntime/test/testdata/matmul_integer_to_float_uint8.onnx
vendored
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
:Ę
|
||||
U
|
||||
A
|
||||
B
|
||||
a_zero_point
|
||||
b_zero_pointmatmul_output_int32
MatMulInteger"
MatMulInteger
|
||||
.
|
||||
a_scale
|
||||
b_scale
|
||||
multiplier mul_right"Mul
|
||||
A
|
||||
matmul_output_int32matmul_output_floatcast"Cast*
|
||||
to
|
||||
5
|
||||
matmul_output_float
|
||||
|
||||
multiplierY
|
||||
mul_bottom"MulDynamicQuantizeMatMul_fusionZ
|
||||
A
|
||||
|
||||
|
||||
M
|
||||
KZ
|
||||
B
|
||||
|
||||
|
||||
K
|
||||
NZ
|
||||
a_scale
|
||||
|
||||
|
||||
Z
|
||||
b_scale
|
||||
|
||||
|
||||
Z
|
||||
a_zero_point
|
||||
|
||||
|
||||
Z
|
||||
b_zero_point
|
||||
|
||||
|
||||
b
|
||||
Y
|
||||
|
||||
|
||||
M
|
||||
NB
|
||||
45
onnxruntime/test/testdata/matmul_integer_to_float_uint8_bias.onnx
vendored
Normal file
45
onnxruntime/test/testdata/matmul_integer_to_float_uint8_bias.onnx
vendored
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
:Ã
|
||||
9
|
||||
A
|
||||
Bmatmul_output_int32
MatMulInteger"
MatMulInteger
|
||||
.
|
||||
a_scale
|
||||
b_scale
|
||||
multiplier mul_right"Mul
|
||||
A
|
||||
matmul_output_int32matmul_output_floatcast"Cast*
|
||||
to
|
||||
E
|
||||
matmul_output_float
|
||||
|
||||
multipliermul_bottom_output
|
||||
mul_bottom"Mul
|
||||
&
|
||||
mul_bottom_output
|
||||
biasYadd"AddDynamicQuantizeMatMul_fusionZ
|
||||
A
|
||||
|
||||
|
||||
M
|
||||
KZ
|
||||
B
|
||||
|
||||
|
||||
K
|
||||
NZ
|
||||
a_scale
|
||||
|
||||
|
||||
Z
|
||||
b_scale
|
||||
|
||||
|
||||
Z
|
||||
bias
|
||||
|
||||
Nb
|
||||
Y
|
||||
|
||||
|
||||
M
|
||||
NB
|
||||
BIN
onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul.fused.onnx
vendored
Normal file
BIN
onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul.fused.onnx
vendored
Normal file
Binary file not shown.
|
|
@ -1,4 +1,4 @@
|
|||
:ì
|
||||
:ì
|
||||
Q
|
||||
inputa_quantizeda_scalea_zpDynamicQuantizeLinear"DynamicQuantizeLinear
|
||||
Y
|
||||
|
|
@ -17,7 +17,7 @@ A
|
|||
matmul_output_float
|
||||
|
||||
multiplieroutput
|
||||
mul_bottom"MulDynamicQuantizeLinear_fusion**Bb_quantized**€Bb_zp*"ffæ?Bb_scaleZ
|
||||
mul_bottom"MulDynamicQuantizeLinear_fusion**Bb_quantized*"ffæ?Bb_scale**€Bb_zpZ
|
||||
input
|
||||
|
||||
|
||||
|
|
@ -25,4 +25,4 @@ mul_bottom"MulDynamicQuantizeLinear_fusion**Bb_quantized**
|
|||
output
|
||||
|
||||
|
||||
B
|
||||
B
|
||||
|
|
@ -3,21 +3,43 @@ from onnx import helper
|
|||
from onnx import TensorProto
|
||||
from enum import Enum
|
||||
|
||||
def GenerateModel(model_name):
|
||||
nodes = [ # LayerNorm subgraph
|
||||
def GenerateModel(model_name, b_has_zp = True, has_bias = False, bias_ND = False):
|
||||
mul_output = "Mul_output" if has_bias else "output"
|
||||
nodes = [ # construct graph
|
||||
helper.make_node("DynamicQuantizeLinear", ["input"], ["a_quantized", "a_scale", "a_zp"], "DynamicQuantizeLinear"),
|
||||
helper.make_node("MatMulInteger", ["a_quantized", "b_quantized", "a_zp", "b_zp"], ["matmul_output_int32"], "MatMulInteger"),
|
||||
helper.make_node(
|
||||
"MatMulInteger",
|
||||
["a_quantized", "b_quantized", "a_zp", "b_zp"] if b_has_zp else ["a_quantized", "b_quantized", "a_zp"],
|
||||
["matmul_output_int32"],
|
||||
"MatMulInteger"),
|
||||
helper.make_node("Mul", ["a_scale", "b_scale"], ["multiplier"], "mul_right"),
|
||||
helper.make_node("Cast", ["matmul_output_int32"], ["matmul_output_float"], "cast", to=1),
|
||||
helper.make_node("Mul", ["matmul_output_float", "multiplier"], ["output"], "mul_bottom"),
|
||||
helper.make_node("Mul", ["matmul_output_float", "multiplier"], [mul_output], "mul_bottom"),
|
||||
]
|
||||
|
||||
if has_bias:
|
||||
nodes.extend([helper.make_node("Add", [mul_output, "bias"], ["output"], "bias_add")])
|
||||
|
||||
initializers = [ # initializers
|
||||
helper.make_tensor('b_quantized', TensorProto.UINT8, [2,3], [2, 4, 5, 6, 7, 8]),
|
||||
helper.make_tensor('b_zp', TensorProto.UINT8, [], [128]),
|
||||
helper.make_tensor('b_scale', TensorProto.FLOAT, [], [1.8]),
|
||||
]
|
||||
|
||||
if b_has_zp:
|
||||
initializers.extend([ # initializers
|
||||
helper.make_tensor('b_zp', TensorProto.UINT8, [], [128]),
|
||||
])
|
||||
|
||||
if has_bias:
|
||||
if bias_ND:
|
||||
initializers.extend([ # initializers
|
||||
helper.make_tensor('bias', TensorProto.FLOAT, [3, 3], [3.0, 4.0, 6.0, 3.0, 4.0, 6.0, 3.0, 4.0, 5.0]),
|
||||
])
|
||||
else:
|
||||
initializers.extend([ # initializers
|
||||
helper.make_tensor('bias', TensorProto.FLOAT, [3], [3.0, 4.0, 5.0]),
|
||||
])
|
||||
|
||||
graph = helper.make_graph(
|
||||
nodes,
|
||||
"DynamicQuantizeLinear_fusion", #name
|
||||
|
|
@ -33,4 +55,7 @@ def GenerateModel(model_name):
|
|||
onnx.save(model, model_name)
|
||||
|
||||
if __name__ == "__main__":
|
||||
GenerateModel('dynamic_quantize_matmul.onnx')
|
||||
GenerateModel('dynamic_quantize_matmul.onnx')
|
||||
GenerateModel('dynamic_quantize_matmul_bias.onnx', True, True)
|
||||
GenerateModel('dynamic_quantize_matmul_bias_b_no_zp.onnx', False, True)
|
||||
GenerateModel('dynamic_quantize_matmul_bias_ND.onnx', False, True, True)
|
||||
BIN
onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul_b_no_zp_bias.onnx
vendored
Normal file
BIN
onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul_b_no_zp_bias.onnx
vendored
Normal file
Binary file not shown.
BIN
onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul_bias.onnx
vendored
Normal file
BIN
onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul_bias.onnx
vendored
Normal file
Binary file not shown.
BIN
onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul_bias_ND.onnx
vendored
Normal file
BIN
onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul_bias_ND.onnx
vendored
Normal file
Binary file not shown.
BIN
onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul_bias_b_no_zp.onnx
vendored
Normal file
BIN
onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul_bias_b_no_zp.onnx
vendored
Normal file
Binary file not shown.
BIN
onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.onnx
vendored
Normal file
BIN
onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.onnx
vendored
Normal file
Binary file not shown.
60
onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py
vendored
Normal file
60
onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py
vendored
Normal file
|
|
@ -0,0 +1,60 @@
|
|||
import onnx
|
||||
from onnx import helper
|
||||
from onnx import TensorProto
|
||||
from enum import Enum
|
||||
|
||||
def MakeSubGraph(suffix, has_bias):
|
||||
mul_bottom_output = "mul_output" + suffix if has_bias else "output" + suffix
|
||||
nodes = [
|
||||
helper.make_node("MatMulInteger", ["a_quantized", "b_quantized" + suffix, "a_zp", "b_zp" + suffix], ["matmul_output_int32" + suffix], "MatMulInteger" + suffix),
|
||||
helper.make_node("Mul", ["a_scale", "b_scale" + suffix], ["multiplier" + suffix], "mul_right" + suffix),
|
||||
helper.make_node("Cast", ["matmul_output_int32" + suffix], ["matmul_output_float" + suffix], "cast" + suffix, to=1),
|
||||
helper.make_node("Mul", ["matmul_output_float" + suffix, "multiplier" + suffix], [mul_bottom_output], "mul_bottom" + suffix),
|
||||
]
|
||||
|
||||
if has_bias:
|
||||
nodes.extend([helper.make_node("Add", [mul_bottom_output, "bias" + suffix], ["output" + suffix], "bias_add" + suffix),])
|
||||
|
||||
return nodes
|
||||
|
||||
def MakeInitializer(suffix):
|
||||
return [
|
||||
helper.make_tensor('b_quantized' + suffix, TensorProto.UINT8, [2,3], [2, 4, 5, 6, 7, 8]),
|
||||
helper.make_tensor('b_zp' + suffix, TensorProto.UINT8, [], [128]),
|
||||
helper.make_tensor('b_scale' + suffix, TensorProto.FLOAT, [], [1.8]),
|
||||
]
|
||||
|
||||
def GenerateModel(model_name):
|
||||
nodes = [helper.make_node("DynamicQuantizeLinear", ["input"], ["a_quantized", "a_scale", "a_zp"], "DynamicQuantizeLinear"),]
|
||||
nodes.extend(MakeSubGraph("_1", True))
|
||||
nodes.extend(MakeSubGraph("_2", True))
|
||||
nodes.extend(MakeSubGraph("_3", False))
|
||||
|
||||
initializers = []
|
||||
initializers.extend(MakeInitializer("_1"))
|
||||
initializers.extend(MakeInitializer("_2"))
|
||||
initializers.extend(MakeInitializer("_3"))
|
||||
|
||||
initializers.extend([
|
||||
helper.make_tensor('bias_1', TensorProto.FLOAT, [3], [2, 4, 5]),
|
||||
helper.make_tensor('bias_2', TensorProto.FLOAT, [3,3], [1, 2, 3, 4, 5, 6, 7, 8, 9]),
|
||||
])
|
||||
|
||||
graph = helper.make_graph(
|
||||
nodes,
|
||||
"MatMulIntegerToFloat_fusion", #name
|
||||
[ # inputs
|
||||
helper.make_tensor_value_info('input', TensorProto.FLOAT, [3, 2]),
|
||||
],
|
||||
[ # outputs
|
||||
helper.make_tensor_value_info('output_1', TensorProto.FLOAT, [3, 3]),
|
||||
helper.make_tensor_value_info('output_2', TensorProto.FLOAT, [3, 3]),
|
||||
helper.make_tensor_value_info('output_3', TensorProto.FLOAT, [3, 3]),
|
||||
],
|
||||
initializers)
|
||||
|
||||
model = helper.make_model(graph)
|
||||
onnx.save(model, model_name)
|
||||
|
||||
if __name__ == "__main__":
|
||||
GenerateModel('matmul_integer_to_float.onnx')
|
||||
Loading…
Reference in a new issue