From 67a7d93b4901dc45873090a52f490990efa2cbb4 Mon Sep 17 00:00:00 2001 From: Yufeng Li Date: Thu, 2 Jul 2020 13:08:21 -0700 Subject: [PATCH] Fuse MatMulInteger and scale followed (#4350) * Fuse MatMulInteger and scale followed * Add bias --- .../contrib_ops/cpu/cpu_contrib_kernels.cc | 4 + .../quantization/dynamic_quantize_matmul.cc | 5 +- .../quantization/matmul_integer_to_float.cc | 95 +++++++++++ .../quantization/matmul_integer_to_float.h | 22 +++ .../core/graph/contrib_ops/contrib_defs.cc | 64 ++++++++ .../dynamic_quantize_matmul_fusion.cc | 148 +++++++++++++++--- .../dynamic_quantize_matmul_test.cc | 37 ++++- .../matmul_integer_to_float_test.cc | 115 ++++++++++++++ .../test/optimizer/graph_transform_test.cc | 78 +++++++++ .../dynamic_quantize_matmul_int8.onnx | 6 +- .../dynamic_quantize_matmul_int8_bias.onnx | 44 ++++++ .../testdata/dynamic_quantize_matmul_test.py | 39 +++-- .../dynamic_quantize_matmul_uint8.onnx | 6 +- .../dynamic_quantize_matmul_uint8_bias.onnx | 44 ++++++ .../test/testdata/matmul_integer_to_float.py | 55 +++++++ .../matmul_integer_to_float_int8.onnx | 49 ++++++ .../matmul_integer_to_float_int8_bias.onnx | 45 ++++++ .../matmul_integer_to_float_uint8.onnx | 49 ++++++ .../matmul_integer_to_float_uint8_bias.onnx | 45 ++++++ .../fusion/dynamic_quantize_matmul.fused.onnx | Bin 0 -> 435 bytes .../fusion/dynamic_quantize_matmul.onnx | 6 +- .../fusion/dynamic_quantize_matmul.py | 37 ++++- .../dynamic_quantize_matmul_b_no_zp_bias.onnx | Bin 0 -> 554 bytes .../fusion/dynamic_quantize_matmul_bias.onnx | Bin 0 -> 574 bytes .../dynamic_quantize_matmul_bias_ND.onnx | Bin 0 -> 580 bytes .../dynamic_quantize_matmul_bias_b_no_zp.onnx | Bin 0 -> 554 bytes .../fusion/matmul_integer_to_float.onnx | Bin 0 -> 1510 bytes .../fusion/matmul_integer_to_float.py | 60 +++++++ 28 files changed, 998 insertions(+), 55 deletions(-) create mode 100644 onnxruntime/contrib_ops/cpu/quantization/matmul_integer_to_float.cc create mode 100644 
onnxruntime/contrib_ops/cpu/quantization/matmul_integer_to_float.h create mode 100644 onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc create mode 100644 onnxruntime/test/testdata/dynamic_quantize_matmul_int8_bias.onnx create mode 100644 onnxruntime/test/testdata/dynamic_quantize_matmul_uint8_bias.onnx create mode 100644 onnxruntime/test/testdata/matmul_integer_to_float.py create mode 100644 onnxruntime/test/testdata/matmul_integer_to_float_int8.onnx create mode 100644 onnxruntime/test/testdata/matmul_integer_to_float_int8_bias.onnx create mode 100644 onnxruntime/test/testdata/matmul_integer_to_float_uint8.onnx create mode 100644 onnxruntime/test/testdata/matmul_integer_to_float_uint8_bias.onnx create mode 100644 onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul.fused.onnx create mode 100644 onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul_b_no_zp_bias.onnx create mode 100644 onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul_bias.onnx create mode 100644 onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul_bias_ND.onnx create mode 100644 onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul_bias_b_no_zp.onnx create mode 100644 onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.onnx create mode 100644 onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.py diff --git a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc index 4d6edd070d..14cf2a648e 100644 --- a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc @@ -47,6 +47,8 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, float_uint8_t_int8_t, QAttention); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, DynamicQuantizeMatMul); class 
ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, DynamicQuantizeMatMul); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, MatMulIntegerToFloat); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, MatMulIntegerToFloat); // ******** End: Quantization ******************* // // This section includes all op kernel declarations for former experimental ops which have now been removed from onnx. @@ -108,6 +110,8 @@ Status RegisterQuantizationKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, }; for (auto& function_table_entry : function_table) { diff --git a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc index 7c71778a4a..882ba63034 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc @@ -52,7 +52,6 @@ template Status DynamicQuantizeMatMul::Compute(OpKernelContext* ctx) const { auto* a = ctx->Input(0); auto* b = ctx->Input(1); - ORT_ENFORCE(a != nullptr && b != nullptr); auto* b_scale_tensor = ctx->Input(2); ORT_ENFORCE(IsScalarOr1ElementVector(b_scale_tensor), @@ -88,6 +87,8 @@ Status DynamicQuantizeMatMul::Compute(OpKernelContext* ctx) const { const auto* b_data = b->template Data(); + const Tensor* bias_tensor = ctx->Input(4); + Tensor* y = ctx->Output(0, helper.OutputShape()); auto* y_data = y->template MutableData(); @@ -107,7 +108,7 @@ Status DynamicQuantizeMatMul::Compute(OpKernelContext* ctx) const { y_data + helper.OutputOffsets()[i], static_cast(helper.N()), &multiplier, - nullptr, + nullptr != bias_tensor ? 
bias_tensor->Data<float>() : nullptr,
           thread_pool);
   }
 
diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_integer_to_float.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_integer_to_float.cc
new file mode 100644
index 0000000000..0613535969
--- /dev/null
+++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_integer_to_float.cc
@@ -0,0 +1,111 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "matmul_integer_to_float.h"
+
+#include "core/mlas/inc/mlas.h"
+#include "core/providers/common.h"
+#include "core/providers/cpu/math/matmul_helper.h"
+#include "core/util/math_cpuonly.h"
+#include "core/util/qmath.h"
+
+namespace onnxruntime {
+namespace contrib {
+
+#define REGISTER_MATMUL_INTEGER_TO_FLOAT(T)                             \
+  ONNX_OPERATOR_TYPED_KERNEL_EX(                                        \
+      MatMulIntegerToFloat,                                             \
+      kMSDomain,                                                        \
+      1,                                                                \
+      T,                                                                \
+      kCpuExecutionProvider,                                            \
+      KernelDefBuilder()                                                \
+          .TypeConstraint("T1", DataTypeImpl::GetTensorType<uint8_t>()) \
+          .TypeConstraint("T2", DataTypeImpl::GetTensorType<T>())       \
+          .TypeConstraint("T3", DataTypeImpl::GetTensorType<float>()),  \
+      MatMulIntegerToFloat<uint8_t, T>);
+
+REGISTER_MATMUL_INTEGER_TO_FLOAT(int8_t)
+REGISTER_MATMUL_INTEGER_TO_FLOAT(uint8_t)
+
+// Kernel behind the fused MatMulInteger -> Cast -> Mul(a_scale * b_scale) [-> Add(bias)] pattern.
+// Inputs: 0 = A, 1 = B, 2 = a_scale, 3 = b_scale, 4 = a_zero_point (optional),
+// 5 = b_zero_point (optional), 6 = bias (optional). Scales and zero points must be
+// per-tensor (scalar or single-element 1-D); a missing zero point is treated as 0.
+template <typename T1, typename T2>
+Status MatMulIntegerToFloat<T1, T2>::Compute(OpKernelContext* ctx) const {
+  const Tensor* a = ctx->Input<Tensor>(0);
+  const Tensor* b = ctx->Input<Tensor>(1);
+
+  const Tensor* a_scale_tensor = ctx->Input<Tensor>(2);
+  ORT_ENFORCE(IsScalarOr1ElementVector(a_scale_tensor),
+              "MatMulIntegerToFloat : input A scale must be a scalar or 1D tensor of size 1. Per-Channel is not supported yet.");
+
+  float a_scale = *a_scale_tensor->template Data<float>();
+
+  const Tensor* b_scale_tensor = ctx->Input<Tensor>(3);
+  ORT_ENFORCE(IsScalarOr1ElementVector(b_scale_tensor),
+              "MatMulIntegerToFloat : input B scale must be a scalar or 1D tensor of size 1. Per-Channel is not supported yet.");
+
+  float b_scale = *b_scale_tensor->template Data<float>();
+
+  float multiplier = a_scale * b_scale;
+
+  // validate zero points
+  T1 a_zp = 0;
+  const Tensor* a_zp_tensor = ctx->Input<Tensor>(4);
+  if (a_zp_tensor != nullptr) {
+    ORT_ENFORCE(IsScalarOr1ElementVector(a_zp_tensor),
+                "MatMulIntegerToFloat : input A zero point must be a scalar or 1D tensor of size 1. Per-Channel is not supported yet.");
+    a_zp = *a_zp_tensor->template Data<T1>();
+  }
+
+  T2 b_zp = 0;
+  const Tensor* b_zp_tensor = ctx->Input<Tensor>(5);
+  if (b_zp_tensor != nullptr) {
+    ORT_ENFORCE(IsScalarOr1ElementVector(b_zp_tensor),
+                "MatMulIntegerToFloat : input B zero point must be a scalar or 1D tensor of size 1. Per-Channel is not supported yet.");
+    b_zp = *b_zp_tensor->template Data<T2>();
+  }
+
+  MatMulComputeHelper helper;
+  ORT_RETURN_IF_ERROR(helper.Compute(a->Shape(), b->Shape()));
+  Tensor* Y = ctx->Output(0, helper.OutputShape());
+
+  // The schema describes bias as holding one value per column of B. QGemm only receives a
+  // raw pointer, so check the shape here rather than risk reading past the end of the buffer.
+  // Leading dimensions of size 1 are accepted because the fusion pass forwards such constants.
+  const float* bias_data = nullptr;
+  const Tensor* bias_tensor = ctx->Input<Tensor>(6);
+  if (bias_tensor != nullptr) {
+    const TensorShape& bias_shape = bias_tensor->Shape();
+    const int64_t n = static_cast<int64_t>(helper.N());
+    ORT_ENFORCE(bias_shape.NumDimensions() >= 1 && bias_shape.Size() == n &&
+                    bias_shape[bias_shape.NumDimensions() - 1] == n,
+                "MatMulIntegerToFloat : bias must hold exactly N elements, where N is the last dimension of input B.");
+    bias_data = bias_tensor->template Data<float>();
+  }
+
+  concurrency::ThreadPool* thread_pool = ctx->GetOperatorThreadPool();
+  for (size_t i = 0; i < helper.OutputOffsets().size(); i++) {
+    QGemm(static_cast<int>(helper.M()),
+          static_cast<int>(helper.N()),
+          static_cast<int>(helper.K()),
+          a->template Data<T1>() + helper.LeftOffsets()[i],
+          static_cast<int>(helper.K()),
+          a_zp,
+          b->template Data<T2>() + helper.RightOffsets()[i],
+          static_cast<int>(helper.N()),
+          b_zp,
+          Y->template MutableData<float>() + helper.OutputOffsets()[i],
+          static_cast<int>(helper.N()),
+          &multiplier,
+          bias_data,
+          thread_pool);
+  }
+
+  return Status::OK();
+}
+
+}  // namespace contrib
+}  // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_integer_to_float.h b/onnxruntime/contrib_ops/cpu/quantization/matmul_integer_to_float.h
new file mode 100644
index 0000000000..5984dc7ef0
--- /dev/null
+++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_integer_to_float.h
@@ -0,0 +1,22 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/common/common.h"
+#include "core/framework/op_kernel.h"
+
+namespace onnxruntime {
+namespace contrib {
+
+// CPU kernel for the MatMulIntegerToFloat contrib op; T1 / T2 are the element types of inputs A / B.
+template <typename T1, typename T2>
+class MatMulIntegerToFloat final : public OpKernel {
+ public:
+  explicit MatMulIntegerToFloat(const OpKernelInfo& info) : OpKernel(info) {}
+
+  Status Compute(OpKernelContext* context) const override;
+};
+
+}  // namespace contrib
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
index 93b69f59e9..21e5b3c4e7 100644
--- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
+++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc
@@ -1696,6 +1696,11 @@ Matrix product that behaves like numpy.matmul: https://docs.scipy.org/doc/numpy-
           "of elements should be equal to the number of columns of input 'B'.",
           "T2",
           OpSchema::Optional)
+      .Input(4,
+             "bias",
+             "1D input tensor, whose dimension is same as B's last dimension",
+             "T1",
+             OpSchema::Optional)
       .Output(0, "Y", "Matrix multiply results from A * B", "T1")
       .TypeConstraint(
           "T1",
@@ -1710,6 +1715,65 @@ Matrix product that behaves like numpy.matmul: https://docs.scipy.org/doc/numpy-
         ONNX_NAMESPACE::matmulShapeInference(ctx, 0, 1);
       });
 
+  ONNX_CONTRIB_OPERATOR_SCHEMA(MatMulIntegerToFloat)
+      .SetDomain(kMSDomain)
+      .SinceVersion(1)
+      .Input(0, "A", "N-dimensional matrix A", "T1")
+      .Input(1, "B", "N-dimensional matrix B", "T2")
+      .Input(
+          2,
+          "a_scale",
+          "Scale of quantized input 'A'. It could be a scalar or a 1-D tensor, "
+          "which means a per-tensor or per-column quantization. If it's a 1-D tensor, its number "
+          "of elements should be equal to the number of columns of input 'A'.",
+          "T3")
+      .Input(
+          3,
+          "b_scale",
+          "Scale of quantized input 'B'. It could be a scalar or a 1-D tensor, "
+          "which means a per-tensor or per-column quantization. 
If it's a 1-D tensor, its number " + "of elements should be equal to the number of columns of input 'B'.", + "T3") + .Input( + 4, + "a_zero_point", + "Zero point tensor for input 'A'. It's optional and default value is 0. It could be a scalar or a 1-D tensor, " + "which means a per-tensor or per-column quantization. If it's a 1-D tensor, its number " + "of elements should be equal to the number of columns of input 'A'.", + "T1", + OpSchema::Optional) + .Input( + 5, + "b_zero_point", + "Zero point tensor for input 'B'. It's optional and default value is 0. It could be a scalar or a 1-D tensor, " + "which means a per-tensor or per-column quantization. If it's a 1-D tensor, its number " + "of elements should be equal to the number of columns of input 'B'.", + "T2", + OpSchema::Optional) + .Input( + 6, + "bias", + "1D input tensor, whose dimension is same as B's last dimension", + "T3", + OpSchema::Optional) + .Output(0, "Y", "Matrix multiply results from A * B", "T3") + .TypeConstraint( + "T1", + {"tensor(int8)", "tensor(uint8)"}, + "Constrain input A data type to 8-bit integer tensor.") + .TypeConstraint( + "T2", + {"tensor(int8)", "tensor(uint8)"}, + "Constrain input B data type to 8-bit integer tensor.") + .TypeConstraint( + "T3", + {"tensor(float)"}, + "Constrain input a_scale, b_scale and output Y data type as float tensor.") + .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { + propagateElemTypeFromInputToOutput(ctx, 2, 0); + ONNX_NAMESPACE::matmulShapeInference(ctx, 0, 1); + }); + static const char* TransposeMatMul_doc = R"DOC( Matrix product that behaves like numpy.matmul: https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html )DOC"; diff --git a/onnxruntime/core/optimizer/dynamic_quantize_matmul_fusion.cc b/onnxruntime/core/optimizer/dynamic_quantize_matmul_fusion.cc index dce6dd57d7..e817d099cc 100644 --- a/onnxruntime/core/optimizer/dynamic_quantize_matmul_fusion.cc +++ 
b/onnxruntime/core/optimizer/dynamic_quantize_matmul_fusion.cc
@@ -12,6 +12,26 @@ using namespace ONNX_NAMESPACE;
 using namespace ::onnxruntime::common;
 namespace onnxruntime {
 
+// Accepts a bias whose leading dimensions (if any) are all 1 and whose last dimension is
+// positive and equal to the last dimension of matmul_shape, which must have rank >= 2.
+// A null pointer for either shape is rejected.
+static bool CheckBiasShape(const TensorShapeProto* bias_shape, const TensorShapeProto* matmul_shape) {
+  if (nullptr == matmul_shape || matmul_shape->dim_size() <= 1 ||
+      nullptr == bias_shape || bias_shape->dim_size() < 1) {
+    return false;
+  }
+  const int bias_last_axis = bias_shape->dim_size() - 1;
+  for (int axis = 0; axis < bias_last_axis; ++axis) {
+    if (bias_shape->dim(axis).dim_value() != 1) {
+      return false;
+    }
+  }
+
+  const int64_t bias_width = bias_shape->dim(bias_last_axis).dim_value();
+  const int64_t matmul_width = matmul_shape->dim(matmul_shape->dim_size() - 1).dim_value();
+  return bias_width > 0 && bias_width == matmul_width;
+}
+
 /**
 DynamicQuantizeMatMulFusion will fuse subgraph like below into DynamicQuantizeMatMul:
  (input)
@@ -20,14 +40,38 @@ DynamicQuantizeMatMulFusion will fuse subgraph like below into DynamicQuantizeMa
 DynamicQuantizeLinear --------+
  | |
  v v
-MatMulInteger (B const) Mul (B const)
- | |
- v v
- Cast ------------------>Mul
+MatMulInteger (B const) Mul (B const) (input)
+ | | |
+ v v v
+ Cast ------------------>Mul ----> DynamicQuantizeMatMul
+ | |
+ v v
+ Add (B const, Optional) (output)
  |
  v
- (output)
-*/
+ (output)
+
+It also fuses subgraph like below into MatMulIntegerToFloat:
+ input input
+ | |
+ v v
+ +----------------------------DynamicQuantizeLinear------------------------+ DynamicQuantizeLinear
+ | | | |
+ | +----------------+--------------+ | +---------+----------+
+ | | | | | |
+ V v v v V v
+ MatMulInteger(B const) Mul(B const) MatMulInteger (B const) Mul (B const) ---> MatMulIntegerToFloat MatMulIntegerToFloat
+ | | | | | |
+ v v v v v v
+ Cast ---------------->Mul Cast ---------------->Mul 
(output1) ----------(output2) + | | + v v + Add (B const, Optional) Add (B const, Optional) + | | + v v + (output1) (output2) + + */ Status DynamicQuantizeMatMulFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const { GraphViewer graph_viewer(graph); const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder(); @@ -79,8 +123,7 @@ Status DynamicQuantizeMatMulFusion::ApplyImpl(Graph& graph, bool& modified, int // Check Nodes' Edges count and Nodes' outputs are not in Graph output if (!optimizer_utils::CheckOutputEdges(graph, cast_node, 1) || !optimizer_utils::CheckOutputEdges(graph, matmulinteger_node, 1) || - !optimizer_utils::CheckOutputEdges(graph, mul_node_right, 1) || - !optimizer_utils::CheckOutputEdges(graph, dql_node_left, 3)) { + !optimizer_utils::CheckOutputEdges(graph, mul_node_right, 1)) { continue; } @@ -94,34 +137,87 @@ Status DynamicQuantizeMatMulFusion::ApplyImpl(Graph& graph, bool& modified, int continue; } - std::vector input_defs{dql_node_left.MutableInputDefs()[0], - matmulinteger_node.MutableInputDefs()[1], - mul_node_right.MutableInputDefs()[1]}; - - if (matmulinteger_node.InputDefs().size() == 4) { - const NodeArg& matmulinteger_B_zp = *(matmulinteger_node.InputDefs()[3]); - if (!graph_utils::IsConstantInitializer(graph, matmulinteger_B_zp.Name(), true)) { - continue; + // Find bias node + Node* add_node = nullptr; + // const Node* add_node = FindBiasNode(graph, mul_node, ; + if (optimizer_utils::CheckOutputEdges(graph, mul_node, 1)) { + const Node* tmp_add_node = graph_utils::FirstChildByType(mul_node, "Add"); + if (nullptr != tmp_add_node) { + const NodeArg& tmp_add_node_B = *(tmp_add_node->InputDefs()[1]); + if (graph_utils::IsConstantInitializer(graph, tmp_add_node_B.Name(), true) && + CheckBiasShape(tmp_add_node_B.Shape(), matmulinteger_B.Shape())) { + add_node = graph.GetNode(tmp_add_node->Index()); + } } - input_defs.push_back(matmulinteger_node.MutableInputDefs()[3]); } - 
Node& fused_node = graph.AddNode(graph.GenerateNodeName("DynamicQuantizeMatMul"), - "DynamicQuantizeMatMul", - "fused DynamicQuantizeMatMul", - input_defs, - mul_node.MutableOutputDefs(), - nullptr, - kMSDomain); + // DynamicQuantizeLinear outputs are only used by one MatMulInteger, + // thus it can fused into DynamicQuantizeMatMul + NodeArg optional_node_arg("", nullptr); + std::vector input_defs; + std::string op_type_to_fuse = "DynamicQuantizeMatMul"; + if (optimizer_utils::CheckOutputEdges(graph, dql_node_left, 3)) { + input_defs.push_back(dql_node_left.MutableInputDefs()[0]); + input_defs.push_back(matmulinteger_node.MutableInputDefs()[1]); + input_defs.push_back(mul_node_right.MutableInputDefs()[1]); + input_defs.push_back(&optional_node_arg); + if (matmulinteger_node.InputDefs().size() == 4) { + const NodeArg& matmulinteger_B_zp = *(matmulinteger_node.InputDefs()[3]); + if (!graph_utils::IsConstantInitializer(graph, matmulinteger_B_zp.Name(), true)) { + continue; + } + input_defs[3] = matmulinteger_node.MutableInputDefs()[3]; + } + + nodes_to_remove.push_back(dql_node_left); + } else { + op_type_to_fuse = "MatMulIntegerToFloat"; + + input_defs.push_back(matmulinteger_node.MutableInputDefs()[0]); + input_defs.push_back(matmulinteger_node.MutableInputDefs()[1]); + input_defs.push_back(mul_node_right.MutableInputDefs()[0]); + input_defs.push_back(mul_node_right.MutableInputDefs()[1]); + input_defs.push_back(&optional_node_arg); + input_defs.push_back(&optional_node_arg); + + if (matmulinteger_node.InputDefs().size() >= 3) { + // Add zero point of A + input_defs[4] = matmulinteger_node.MutableInputDefs()[2]; + + // Add zero point of B + if (matmulinteger_node.InputDefs().size() == 4) { + const NodeArg& matmulinteger_B_zp = *(matmulinteger_node.InputDefs()[3]); + if (!graph_utils::IsConstantInitializer(graph, matmulinteger_B_zp.Name(), true)) { + continue; + } + input_defs[5] = matmulinteger_node.MutableInputDefs()[3]; + } + } + } + + if (add_node != nullptr) { + 
input_defs.push_back(add_node->MutableInputDefs()[1]); + } + + Node* fused_node = &graph.AddNode(graph.GenerateNodeName(op_type_to_fuse), + op_type_to_fuse, + "", + input_defs, + add_node != nullptr ? add_node->MutableOutputDefs() : mul_node.MutableOutputDefs(), + nullptr, + kMSDomain); // Assign provider to this new node. Provider should be same as the provider for old node. - fused_node.SetExecutionProviderType(mul_node.GetExecutionProviderType()); + ORT_ENFORCE(nullptr != fused_node); + fused_node->SetExecutionProviderType(mul_node.GetExecutionProviderType()); - nodes_to_remove.push_back(dql_node_left); nodes_to_remove.push_back(matmulinteger_node); nodes_to_remove.push_back(cast_node); nodes_to_remove.push_back(mul_node_right); nodes_to_remove.push_back(mul_node); + if (add_node != nullptr) { + nodes_to_remove.push_back(*add_node); + } } for (const auto& node : nodes_to_remove) { diff --git a/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc b/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc index b66fd66ae2..dcc8fe7697 100644 --- a/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc +++ b/onnxruntime/test/contrib_ops/dynamic_quantize_matmul_test.cc @@ -23,7 +23,9 @@ namespace test { template void TestDynamicQuantizeMatMul(const std::vector& A_dims, std::vector B_dims, - const std::string& reference_model) { + const std::string& reference_model, + bool has_zp = true, + bool has_bias = false) { // create rand inputs RandomValueGenerator random{}; @@ -38,11 +40,24 @@ void TestDynamicQuantizeMatMul(const std::vector& A_dims, std::vector B_scale = random.Uniform({1}, -0.1f, 0.1f); std::vector B_zero_point = {static_cast(random.Uniform({1}, std::numeric_limits::min(), std::numeric_limits::max())[0])}; + std::vector Bias = random.Uniform({B_dims.back()}, -0.1f, 0.1f); + OpTester test("DynamicQuantizeMatMul", 1, onnxruntime::kMSDomain); test.AddInput("A", A_dims, A_data); test.AddInput("B", B_dims, B_data); test.AddInput("b_scale", {1}, 
B_scale); - test.AddInput("b_zero_point", {1}, B_zero_point); + + if (has_zp) { + test.AddInput("b_zero_point", {1}, B_zero_point); + } else { + test.AddMissingOptionalInput(); + } + + if (has_bias) { + test.AddInput("bias", {B_dims.back()}, Bias); + } else { + test.AddMissingOptionalInput(); + } test.AddReferenceOutputs(reference_model); test.Run(); @@ -56,6 +71,16 @@ TEST(DynamicQuantizeMatMul, Int8_test) { TestDynamicQuantizeMatMul(A_dims, B_dims, "testdata/dynamic_quantize_matmul_int8.onnx"); } +TEST(DynamicQuantizeMatMul, Int8_test_bias) { +#ifdef MLAS_SUPPORTS_GEMM_U8X8 + std::vector A_dims{4, 128}; + std::vector B_dims{128, 128}; + std::vector Y_dims{4, 128}; + + TestDynamicQuantizeMatMul(A_dims, B_dims, "testdata/dynamic_quantize_matmul_int8_bias.onnx", false, true); +#endif +} + TEST(DynamicQuantizeMatMul, UInt8_test) { std::vector A_dims{4, 128}; std::vector B_dims{128, 128}; @@ -64,5 +89,13 @@ TEST(DynamicQuantizeMatMul, UInt8_test) { TestDynamicQuantizeMatMul(A_dims, B_dims, "testdata/dynamic_quantize_matmul_uint8.onnx"); } +TEST(DynamicQuantizeMatMul, UInt8_test_bias) { + std::vector A_dims{4, 128}; + std::vector B_dims{128, 128}; + std::vector Y_dims{4, 128}; + + TestDynamicQuantizeMatMul(A_dims, B_dims, "testdata/dynamic_quantize_matmul_uint8_bias.onnx", false, true); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc new file mode 100644 index 0000000000..28407f0f7d --- /dev/null +++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc @@ -0,0 +1,115 @@ + + +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/framework/tensor.h" +#include "core/session/inference_session.h" +#include "test/common/tensor_op_test_utils.h" +#include "test/framework/test_utils.h" +#include "test/providers/provider_test_utils.h" +#include "test/util/include/default_providers.h" +#include "core/util/qmath.h" + +#include +#include + +#include "gtest/gtest.h" +#include "gmock/gmock.h" + +using namespace std; + +namespace onnxruntime { +namespace test { + +template +void TestMatMulIntegerToFloat(const std::vector& A_dims, + std::vector B_dims, + const std::string& reference_model, + bool has_zp = true, + bool has_bias = false) { + // create rand inputs + RandomValueGenerator random{}; + + std::vector A_data; + std::vector tmp_A_data = random.Uniform(A_dims, 0, 255); + std::transform(tmp_A_data.begin(), tmp_A_data.end(), std::back_inserter(A_data), [](int32_t v) -> T { + return static_cast(v); + }); + + std::vector B_data; + std::vector tmp_B_data = random.Uniform(B_dims, std::numeric_limits::min(), std::numeric_limits::max()); + std::transform(tmp_B_data.begin(), tmp_B_data.end(), std::back_inserter(B_data), [](int32_t v) -> T { + return static_cast(v); + }); + + std::vector A_scale = random.Uniform({1}, -0.1f, 0.1f); + std::vector B_scale = random.Uniform({1}, -0.1f, 0.1f); + + std::vector A_zero_point{127}; + std::vector B_zero_point = {static_cast(random.Uniform({1}, std::numeric_limits::min(), std::numeric_limits::max())[0])}; + + std::vector Bias = random.Uniform({B_dims.back()}, -0.1f, 0.1f); + + OpTester test("MatMulIntegerToFloat", 1, onnxruntime::kMSDomain); + test.AddInput("A", A_dims, A_data); + test.AddInput("B", B_dims, B_data); + test.AddInput("a_scale", {1}, A_scale); + test.AddInput("b_scale", {1}, B_scale); + + if (has_zp) { + test.AddInput("a_zero_point", {1}, A_zero_point); + test.AddInput("b_zero_point", {1}, B_zero_point); + } else { + test.AddMissingOptionalInput(); + test.AddMissingOptionalInput(); + } + + if (has_bias) { + test.AddInput("bias", 
{B_dims.back()}, Bias); + } else { + test.AddMissingOptionalInput(); + } + + test.AddReferenceOutputs(reference_model); + test.Run(); +} + +TEST(MatMulIntegerToFloat, Int8_test) { +#ifdef MLAS_SUPPORTS_GEMM_U8X8 + std::vector A_dims{4, 128}; + std::vector B_dims{128, 128}; + std::vector Y_dims{4, 128}; + + TestMatMulIntegerToFloat(A_dims, B_dims, "testdata/matmul_integer_to_float_int8.onnx"); +#endif +} + +TEST(MatMulIntegerToFloat, Int8_bias_test) { +#ifdef MLAS_SUPPORTS_GEMM_U8X8 + std::vector A_dims{4, 128}; + std::vector B_dims{128, 128}; + std::vector Y_dims{4, 128}; + + TestMatMulIntegerToFloat(A_dims, B_dims, "testdata/matmul_integer_to_float_int8_bias.onnx", false, true); +#endif +} + +TEST(MatMulIntegerToFloat, UInt8_test) { + std::vector A_dims{4, 128}; + std::vector B_dims{128, 128}; + std::vector Y_dims{4, 128}; + + TestMatMulIntegerToFloat(A_dims, B_dims, "testdata/matmul_integer_to_float_uint8.onnx"); +} + +TEST(MatMulIntegerToFloat, UInt8_bias_test) { + std::vector A_dims{4, 128}; + std::vector B_dims{128, 128}; + std::vector Y_dims{4, 128}; + + TestMatMulIntegerToFloat(A_dims, B_dims, "testdata/matmul_integer_to_float_uint8_bias.onnx", false, true); +} + +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index c59c6f34f6..f0e66b978a 100644 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -2472,6 +2472,84 @@ TEST_F(GraphTransformationTests, DynamicQuantizeMatMulTest) { EXPECT_EQ(op_to_count["DynamicQuantizeMatMul"], 1); } +TEST_F(GraphTransformationTests, DynamicQuantizeMatMulTest_With_Bias) { + auto model_uri = MODEL_FOLDER "fusion/dynamic_quantize_matmul_bias.onnx"; + std::shared_ptr p_model; + ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_)); + Graph& graph = p_model->MainGraph(); + + onnxruntime::GraphTransformerManager 
graph_transformation_mgr{5}; + graph_transformation_mgr.Register(onnxruntime::make_unique(), TransformerLevel::Level2); + auto ret = graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, *logger_); + ASSERT_TRUE(ret.IsOK()); + + std::map op_to_count = CountOpsInGraph(graph); + EXPECT_EQ(op_to_count["DynamicQuantizeLinear"], 0); + EXPECT_EQ(op_to_count["MatMulInteger"], 0); + EXPECT_EQ(op_to_count["Cast"], 0); + EXPECT_EQ(op_to_count["Mul"], 0); + EXPECT_EQ(op_to_count["DynamicQuantizeMatMul"], 1); +} + +TEST_F(GraphTransformationTests, DynamicQuantizeMatMulTest_With_ND_bias) { + auto model_uri = MODEL_FOLDER "fusion/dynamic_quantize_matmul_bias_ND.onnx"; + std::shared_ptr p_model; + ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_)); + Graph& graph = p_model->MainGraph(); + + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + graph_transformation_mgr.Register(onnxruntime::make_unique(), TransformerLevel::Level2); + auto ret = graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, *logger_); + ASSERT_TRUE(ret.IsOK()); + + std::map op_to_count = CountOpsInGraph(graph); + EXPECT_EQ(op_to_count["DynamicQuantizeLinear"], 0); + EXPECT_EQ(op_to_count["MatMulInteger"], 0); + EXPECT_EQ(op_to_count["Cast"], 0); + EXPECT_EQ(op_to_count["Mul"], 0); + EXPECT_EQ(op_to_count["DynamicQuantizeMatMul"], 1); + EXPECT_EQ(op_to_count["Add"], 1); +} + +TEST_F(GraphTransformationTests, DynamicQuantizeMatMulTest_With_Bias_No_B_ZP) { + auto model_uri = MODEL_FOLDER "fusion/dynamic_quantize_matmul_bias_b_no_zp.onnx"; + std::shared_ptr p_model; + ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_)); + Graph& graph = p_model->MainGraph(); + + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + graph_transformation_mgr.Register(onnxruntime::make_unique(), TransformerLevel::Level2); + auto ret = graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, *logger_); + 
ASSERT_TRUE(ret.IsOK()); + + std::map op_to_count = CountOpsInGraph(graph); + EXPECT_EQ(op_to_count["DynamicQuantizeLinear"], 0); + EXPECT_EQ(op_to_count["MatMulInteger"], 0); + EXPECT_EQ(op_to_count["Cast"], 0); + EXPECT_EQ(op_to_count["Mul"], 0); + EXPECT_EQ(op_to_count["DynamicQuantizeMatMul"], 1); +} + +TEST_F(GraphTransformationTests, MatMulIntegerToFloatTest) { + auto model_uri = MODEL_FOLDER "fusion/matmul_integer_to_float.onnx"; + std::shared_ptr p_model; + ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_)); + Graph& graph = p_model->MainGraph(); + + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + graph_transformation_mgr.Register(onnxruntime::make_unique(), TransformerLevel::Level2); + auto ret = graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level2, *logger_); + ASSERT_TRUE(ret.IsOK()); + + std::map op_to_count = CountOpsInGraph(graph); + EXPECT_EQ(op_to_count["DynamicQuantizeLinear"], 1); + EXPECT_EQ(op_to_count["MatMulInteger"], 0); + EXPECT_EQ(op_to_count["Cast"], 0); + EXPECT_EQ(op_to_count["Mul"], 0); + EXPECT_EQ(op_to_count["MatMulIntegerToFloat"], 3); + EXPECT_EQ(op_to_count["Add"], 1); +} + #endif } // namespace test diff --git a/onnxruntime/test/testdata/dynamic_quantize_matmul_int8.onnx b/onnxruntime/test/testdata/dynamic_quantize_matmul_int8.onnx index 9127e436a0..62bbededa1 100644 --- a/onnxruntime/test/testdata/dynamic_quantize_matmul_int8.onnx +++ b/onnxruntime/test/testdata/dynamic_quantize_matmul_int8.onnx @@ -1,4 +1,4 @@ -:è +:è M A a_quantizeda_scalea_zpDynamicQuantizeLinear"DynamicQuantizeLinear W @@ -17,7 +17,7 @@ A matmul_output_float multiplierY -mul_bottom"MulDynamicQuantizeLinear_fusionZ +mul_bottom"MulDynamicQuantizeMatMul_fusionZ A  @@ -40,4 +40,4 @@ mul_bottom"MulDynamicQuantizeLinear_fusionZ  M -NB \ No newline at end of file +NB \ No newline at end of file diff --git a/onnxruntime/test/testdata/dynamic_quantize_matmul_int8_bias.onnx 
b/onnxruntime/test/testdata/dynamic_quantize_matmul_int8_bias.onnx new file mode 100644 index 0000000000..efce674aea --- /dev/null +++ b/onnxruntime/test/testdata/dynamic_quantize_matmul_int8_bias.onnx @@ -0,0 +1,44 @@ +:‹ +M +A a_quantizeda_scalea_zpDynamicQuantizeLinear"DynamicQuantizeLinear +I + a_quantized +B +a_zpmatmul_output_int32 MatMulInteger" MatMulInteger +. +a_scale +b_scale +multiplier mul_right"Mul +A +matmul_output_int32matmul_output_floatcast"Cast* +to  +E +matmul_output_float + +multipliermul_bottom_output +mul_bottom"Mul +& +mul_bottom_output +biasYadd"AddDynamicQuantizeMatMul_fusionZ +A + + +M +KZ +B + + +K +NZ +b_scale + + +Z +bias +  +Nb +Y + + +M +NB \ No newline at end of file diff --git a/onnxruntime/test/testdata/dynamic_quantize_matmul_test.py b/onnxruntime/test/testdata/dynamic_quantize_matmul_test.py index 635d968765..d01f696e84 100644 --- a/onnxruntime/test/testdata/dynamic_quantize_matmul_test.py +++ b/onnxruntime/test/testdata/dynamic_quantize_matmul_test.py @@ -3,24 +3,41 @@ from onnx import helper from onnx import TensorProto from enum import Enum -def GenerateModel(model_name, sign): +def GenerateModel(model_name, sign, b_zp = True, bias = False): nodes = [ # DynamicQuantizeMatMul subgraph helper.make_node("DynamicQuantizeLinear", ["A"], ["a_quantized", "a_scale", "a_zp"], "DynamicQuantizeLinear"), - helper.make_node("MatMulInteger", ["a_quantized", "B", "a_zp", "b_zero_point"], ["matmul_output_int32"], "MatMulInteger"), + + helper.make_node( + "MatMulInteger", + ["a_quantized", "B", "a_zp", "b_zero_point"] if b_zp else ["a_quantized", "B", "a_zp"], + ["matmul_output_int32"], + "MatMulInteger"), + helper.make_node("Mul", ["a_scale", "b_scale"], ["multiplier"], "mul_right"), + helper.make_node("Cast", ["matmul_output_int32"], ["matmul_output_float"], "cast", to=1), - helper.make_node("Mul", ["matmul_output_float", "multiplier"], ["Y"], "mul_bottom"), + + helper.make_node("Mul", ["matmul_output_float", "multiplier"], 
["mul_bottom_output" if bias else "Y"], "mul_bottom"), ] + inputs = [ + helper.make_tensor_value_info('A', TensorProto.FLOAT, ['M', 'K']), + helper.make_tensor_value_info('B', TensorProto.INT8 if sign else TensorProto.UINT8, ['K', 'N']), + helper.make_tensor_value_info('b_scale', TensorProto.FLOAT, [1]), + ] + + if b_zp: + inputs.extend([helper.make_tensor_value_info('b_zero_point', TensorProto.INT8 if sign else TensorProto.UINT8, [1])]) + + if bias: + nodes.extend([helper.make_node("Add", ["mul_bottom_output", "bias"], ["Y"], "add")]) + + inputs.extend([helper.make_tensor_value_info('bias', TensorProto.FLOAT, ['N'])]) + graph = helper.make_graph( nodes, "DynamicQuantizeMatMul_fusion", #name - [ # inputs - helper.make_tensor_value_info('A', TensorProto.FLOAT, ['M', 'K']), - helper.make_tensor_value_info('B', TensorProto.INT8 if sign else TensorProto.UINT8, ['K', 'N']), - helper.make_tensor_value_info('b_scale', TensorProto.FLOAT, [1]), - helper.make_tensor_value_info('b_zero_point', TensorProto.INT8 if sign else TensorProto.UINT8, [1]), - ], + inputs, [ # outputs helper.make_tensor_value_info('Y', TensorProto.FLOAT, ['M', 'N']), ]) @@ -30,4 +47,6 @@ def GenerateModel(model_name, sign): if __name__ == "__main__": GenerateModel('dynamic_quantize_matmul_int8.onnx', True) - GenerateModel('dynamic_quantize_matmul_int8.onnx', False) \ No newline at end of file + GenerateModel('dynamic_quantize_matmul_uint8.onnx', False) + GenerateModel('dynamic_quantize_matmul_int8_bias.onnx', True, False, True) + GenerateModel('dynamic_quantize_matmul_uint8_bias.onnx', False, False, True) \ No newline at end of file diff --git a/onnxruntime/test/testdata/dynamic_quantize_matmul_uint8.onnx b/onnxruntime/test/testdata/dynamic_quantize_matmul_uint8.onnx index a4305de244..0f50b6f2cd 100644 --- a/onnxruntime/test/testdata/dynamic_quantize_matmul_uint8.onnx +++ b/onnxruntime/test/testdata/dynamic_quantize_matmul_uint8.onnx @@ -1,4 +1,4 @@ -:è +:è M A 
a_quantizeda_scalea_zpDynamicQuantizeLinear"DynamicQuantizeLinear W @@ -17,7 +17,7 @@ A matmul_output_float multiplierY -mul_bottom"MulDynamicQuantizeLinear_fusionZ +mul_bottom"MulDynamicQuantizeMatMul_fusionZ A  @@ -40,4 +40,4 @@ mul_bottom"MulDynamicQuantizeLinear_fusionZ  M -NB \ No newline at end of file +NB \ No newline at end of file diff --git a/onnxruntime/test/testdata/dynamic_quantize_matmul_uint8_bias.onnx b/onnxruntime/test/testdata/dynamic_quantize_matmul_uint8_bias.onnx new file mode 100644 index 0000000000..ac0d833a00 --- /dev/null +++ b/onnxruntime/test/testdata/dynamic_quantize_matmul_uint8_bias.onnx @@ -0,0 +1,44 @@ +:‹ +M +A a_quantizeda_scalea_zpDynamicQuantizeLinear"DynamicQuantizeLinear +I + a_quantized +B +a_zpmatmul_output_int32 MatMulInteger" MatMulInteger +. +a_scale +b_scale +multiplier mul_right"Mul +A +matmul_output_int32matmul_output_floatcast"Cast* +to  +E +matmul_output_float + +multipliermul_bottom_output +mul_bottom"Mul +& +mul_bottom_output +biasYadd"AddDynamicQuantizeMatMul_fusionZ +A + + +M +KZ +B + + +K +NZ +b_scale + + +Z +bias +  +Nb +Y + + +M +NB \ No newline at end of file diff --git a/onnxruntime/test/testdata/matmul_integer_to_float.py b/onnxruntime/test/testdata/matmul_integer_to_float.py new file mode 100644 index 0000000000..a3351d8cbf --- /dev/null +++ b/onnxruntime/test/testdata/matmul_integer_to_float.py @@ -0,0 +1,55 @@ +import onnx +from onnx import helper +from onnx import TensorProto +from enum import Enum + +def GenerateModel(model_name, sign, has_zp = True, bias = False): + nodes = [ # subgraph + helper.make_node( + "MatMulInteger", + ["A", "B", "a_zero_point", "b_zero_point"] if has_zp else ["A", "B"], + ["matmul_output_int32"], + "MatMulInteger"), + + helper.make_node("Mul", ["a_scale", "b_scale"], ["multiplier"], "mul_right"), + + helper.make_node("Cast", ["matmul_output_int32"], ["matmul_output_float"], "cast", to=1), + + helper.make_node("Mul", ["matmul_output_float", "multiplier"], ["mul_bottom_output" 
if bias else "Y"], "mul_bottom"), + ] + + inputs = [ # inputs + helper.make_tensor_value_info('A', TensorProto.UINT8, ['M', 'K']), + helper.make_tensor_value_info('B', TensorProto.INT8 if sign else TensorProto.UINT8, ['K', 'N']), + helper.make_tensor_value_info('a_scale', TensorProto.FLOAT, [1]), + helper.make_tensor_value_info('b_scale', TensorProto.FLOAT, [1]), + + ] + + if has_zp: + inputs.extend([ + helper.make_tensor_value_info('a_zero_point', TensorProto.UINT8, [1]), + helper.make_tensor_value_info('b_zero_point', TensorProto.INT8 if sign else TensorProto.UINT8, [1]), + ]) + + if bias: + nodes.extend([helper.make_node("Add", ["mul_bottom_output", "bias"], ["Y"], "add")]) + + inputs.extend([helper.make_tensor_value_info('bias', TensorProto.FLOAT, ['N'])]) + + graph = helper.make_graph( + nodes, + "DynamicQuantizeMatMul_fusion", #name + inputs, + [ # outputs + helper.make_tensor_value_info('Y', TensorProto.FLOAT, ['M', 'N']), + ]) + + model = helper.make_model(graph) + onnx.save(model, model_name) + +if __name__ == "__main__": + GenerateModel('matmul_integer_to_float_int8.onnx', True) + GenerateModel('matmul_integer_to_float_uint8.onnx', False) + GenerateModel('matmul_integer_to_float_int8_bias.onnx', True, False, True) + GenerateModel('matmul_integer_to_float_uint8_bias.onnx', False, False, True) \ No newline at end of file diff --git a/onnxruntime/test/testdata/matmul_integer_to_float_int8.onnx b/onnxruntime/test/testdata/matmul_integer_to_float_int8.onnx new file mode 100644 index 0000000000..7c52814c55 --- /dev/null +++ b/onnxruntime/test/testdata/matmul_integer_to_float_int8.onnx @@ -0,0 +1,49 @@ +:Ê +U +A +B + a_zero_point + b_zero_pointmatmul_output_int32 MatMulInteger" MatMulInteger +. 
+a_scale +b_scale +multiplier mul_right"Mul +A +matmul_output_int32matmul_output_floatcast"Cast* +to  +5 +matmul_output_float + +multiplierY +mul_bottom"MulDynamicQuantizeMatMul_fusionZ +A + + +M +KZ +B + + +K +NZ +a_scale + + +Z +b_scale + + +Z + a_zero_point + + +Z + b_zero_point + + +b +Y + + +M +NB \ No newline at end of file diff --git a/onnxruntime/test/testdata/matmul_integer_to_float_int8_bias.onnx b/onnxruntime/test/testdata/matmul_integer_to_float_int8_bias.onnx new file mode 100644 index 0000000000..8e168450c0 --- /dev/null +++ b/onnxruntime/test/testdata/matmul_integer_to_float_int8_bias.onnx @@ -0,0 +1,45 @@ +:Ã +9 +A +Bmatmul_output_int32 MatMulInteger" MatMulInteger +. +a_scale +b_scale +multiplier mul_right"Mul +A +matmul_output_int32matmul_output_floatcast"Cast* +to  +E +matmul_output_float + +multipliermul_bottom_output +mul_bottom"Mul +& +mul_bottom_output +biasYadd"AddDynamicQuantizeMatMul_fusionZ +A + + +M +KZ +B + + +K +NZ +a_scale + + +Z +b_scale + + +Z +bias +  +Nb +Y + + +M +NB \ No newline at end of file diff --git a/onnxruntime/test/testdata/matmul_integer_to_float_uint8.onnx b/onnxruntime/test/testdata/matmul_integer_to_float_uint8.onnx new file mode 100644 index 0000000000..d4621fbfad --- /dev/null +++ b/onnxruntime/test/testdata/matmul_integer_to_float_uint8.onnx @@ -0,0 +1,49 @@ +:Ê +U +A +B + a_zero_point + b_zero_pointmatmul_output_int32 MatMulInteger" MatMulInteger +. 
+a_scale +b_scale +multiplier mul_right"Mul +A +matmul_output_int32matmul_output_floatcast"Cast* +to  +5 +matmul_output_float + +multiplierY +mul_bottom"MulDynamicQuantizeMatMul_fusionZ +A + + +M +KZ +B + + +K +NZ +a_scale + + +Z +b_scale + + +Z + a_zero_point + + +Z + b_zero_point + + +b +Y + + +M +NB \ No newline at end of file diff --git a/onnxruntime/test/testdata/matmul_integer_to_float_uint8_bias.onnx b/onnxruntime/test/testdata/matmul_integer_to_float_uint8_bias.onnx new file mode 100644 index 0000000000..df83afb3ba --- /dev/null +++ b/onnxruntime/test/testdata/matmul_integer_to_float_uint8_bias.onnx @@ -0,0 +1,45 @@ +:Ã +9 +A +Bmatmul_output_int32 MatMulInteger" MatMulInteger +. +a_scale +b_scale +multiplier mul_right"Mul +A +matmul_output_int32matmul_output_floatcast"Cast* +to  +E +matmul_output_float + +multipliermul_bottom_output +mul_bottom"Mul +& +mul_bottom_output +biasYadd"AddDynamicQuantizeMatMul_fusionZ +A + + +M +KZ +B + + +K +NZ +a_scale + + +Z +b_scale + + +Z +bias +  +Nb +Y + + +M +NB \ No newline at end of file diff --git a/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul.fused.onnx b/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul.fused.onnx new file mode 100644 index 0000000000000000000000000000000000000000..077168c0dbe77074820c9e09b0b9d377d54022c5 GIT binary patch literal 435 zcmZvYu};G<6h*O{rq*kz+yH7tDuFs?$iUbI$zr4fj4Y9pI$$Yrf@4$KiG`2g2l);x zluAw2!NYs^`0CzA{N)dZZv=g*YHMJaChwLjBX?pAgEVO}UI_5gWLF1%WsPI5uO7CF zmoj^5!4)?vTP#km@50gEd z-yaZmm^B(X0e>O(A2pg#wD9u$_nz%LJfFQx+mgW;Mg#=}oSl0(ok3geNa-<5hinQL zos_Z4-ZmLU>;}TFP!_r1#%gyw-8f(jz-3%1^%<81#q1L3em0s*CDp5=<}llDYb`#c N*u)3w1iCk(=npX?fcF3Z literal 0 HcmV?d00001 diff --git a/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul.onnx b/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul.onnx index c2be298a4e..7f0dfcedf0 100644 --- a/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul.onnx +++ 
b/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul.onnx @@ -1,4 +1,4 @@ -:ì +:ì Q input a_quantizeda_scalea_zpDynamicQuantizeLinear"DynamicQuantizeLinear Y @@ -17,7 +17,7 @@ A matmul_output_float multiplieroutput -mul_bottom"MulDynamicQuantizeLinear_fusion**B b_quantized* *€Bb_zp*"ffæ?Bb_scaleZ +mul_bottom"MulDynamicQuantizeLinear_fusion**B b_quantized*"ffæ?Bb_scale* *€Bb_zpZ input   @@ -25,4 +25,4 @@ mul_bottom"MulDynamicQuantizeLinear_fusion**B b_quantized* * output   -B \ No newline at end of file +B \ No newline at end of file diff --git a/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul.py b/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul.py index b22cdf8e1b..a1d68664a7 100644 --- a/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul.py +++ b/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul.py @@ -3,21 +3,43 @@ from onnx import helper from onnx import TensorProto from enum import Enum -def GenerateModel(model_name): - nodes = [ # LayerNorm subgraph +def GenerateModel(model_name, b_has_zp = True, has_bias = False, bias_ND = False): + mul_output = "Mul_output" if has_bias else "output" + nodes = [ # construct graph helper.make_node("DynamicQuantizeLinear", ["input"], ["a_quantized", "a_scale", "a_zp"], "DynamicQuantizeLinear"), - helper.make_node("MatMulInteger", ["a_quantized", "b_quantized", "a_zp", "b_zp"], ["matmul_output_int32"], "MatMulInteger"), + helper.make_node( + "MatMulInteger", + ["a_quantized", "b_quantized", "a_zp", "b_zp"] if b_has_zp else ["a_quantized", "b_quantized", "a_zp"], + ["matmul_output_int32"], + "MatMulInteger"), helper.make_node("Mul", ["a_scale", "b_scale"], ["multiplier"], "mul_right"), helper.make_node("Cast", ["matmul_output_int32"], ["matmul_output_float"], "cast", to=1), - helper.make_node("Mul", ["matmul_output_float", "multiplier"], ["output"], "mul_bottom"), + helper.make_node("Mul", ["matmul_output_float", "multiplier"], [mul_output], 
"mul_bottom"), ] + if has_bias: + nodes.extend([helper.make_node("Add", [mul_output, "bias"], ["output"], "bias_add")]) + initializers = [ # initializers helper.make_tensor('b_quantized', TensorProto.UINT8, [2,3], [2, 4, 5, 6, 7, 8]), - helper.make_tensor('b_zp', TensorProto.UINT8, [], [128]), helper.make_tensor('b_scale', TensorProto.FLOAT, [], [1.8]), ] + if b_has_zp: + initializers.extend([ # initializers + helper.make_tensor('b_zp', TensorProto.UINT8, [], [128]), + ]) + + if has_bias: + if bias_ND: + initializers.extend([ # initializers + helper.make_tensor('bias', TensorProto.FLOAT, [3, 3], [3.0, 4.0, 6.0, 3.0, 4.0, 6.0, 3.0, 4.0, 5.0]), + ]) + else: + initializers.extend([ # initializers + helper.make_tensor('bias', TensorProto.FLOAT, [3], [3.0, 4.0, 5.0]), + ]) + graph = helper.make_graph( nodes, "DynamicQuantizeLinear_fusion", #name @@ -33,4 +55,7 @@ def GenerateModel(model_name): onnx.save(model, model_name) if __name__ == "__main__": - GenerateModel('dynamic_quantize_matmul.onnx') \ No newline at end of file + GenerateModel('dynamic_quantize_matmul.onnx') + GenerateModel('dynamic_quantize_matmul_bias.onnx', True, True) + GenerateModel('dynamic_quantize_matmul_bias_b_no_zp.onnx', False, True) + GenerateModel('dynamic_quantize_matmul_bias_ND.onnx', False, True, True) \ No newline at end of file diff --git a/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul_b_no_zp_bias.onnx b/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul_b_no_zp_bias.onnx new file mode 100644 index 0000000000000000000000000000000000000000..9724c06a949252161f2ed4cc70a1c53ff22b4154 GIT binary patch literal 554 zcmZvZzfQw25XKY7ZQN@?#RyRaCU0b5fUN_xfdz>U6&s7?G)b|PI3;lgI`Jsw<#-O< zHWg{r;Vj)B{l4$c#2x-%m_kdcMWtz1@XtypEthdbUBSzlNMnixUoO0h=PxBPIh)o4 zuTsULV8_ew22B=p!+$Vrc=S9II;&EiSK3H(sr3DwcQO%rQl&3S$Md*g`zhSQ&J(y{ zZ4L}W%S9^V!t1Pgg`9tA<`@c$;h4ywa+2mkdw3>F&G3_X{SF|V5A2nV;NcLlnTCcc z1tV2&;jQaD%(c$5t%VzCv;l`wl(fBF*(2+Wizs5wIEv`?(F60ODrK(xD}spAM}8aO 
zR@)^(^P>J)-)1;TejkI~UHpNWwpq`zMkCAmHnAE78~OLk-EW^lkJyxe0h>VK0BUD+ KzvBd`-}?h}nWz;2 literal 0 HcmV?d00001 diff --git a/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul_bias.onnx b/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul_bias.onnx new file mode 100644 index 0000000000000000000000000000000000000000..aa20374fb4ffa4dc73e00fbced268fc200081a23 GIT binary patch literal 574 zcmZvau};G<5QY;cZQN@>#RyRaCU0b5fQ50G-TC|fI~h9IEBjln|q|ez`#ibIu{m2Vyo;A_$ulvV zHVtn=#=M}%)9?;$7WBh^Fl=~Whnh;xGp@2KWqGBv0Vbrnzw=HeTurLIin#E4HLno!Pemg#MRup_;Gn%%|7>7VJ4t?@g56$xSNEco8ElsA5A|Dv%3{{Szttj+)c literal 0 HcmV?d00001 diff --git a/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul_bias_ND.onnx b/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul_bias_ND.onnx new file mode 100644 index 0000000000000000000000000000000000000000..9fc9a0b0db83b71284ed7c58aaff59c5f0d02d10 GIT binary patch literal 580 zcmZvazfQw25XSSzZQN@?#RyRaCX0m(3}s`0Hn1Skp<-jPoF*xj5~n22Kqnrh@4>>; z@Eo{pD$**$S-$h>_kDLVcJ96rOra&@qEw{I*+*V>%zIH4$4SPOi)Oq~6g}$G>wu-Qfw?j*xIaW} zx}l~@Kuc9qxa&F#GNrO~YvCGd&wv8K3)0?>?BaFCcox`bNEzzDZFW|33J;y?8@x>wc478OCU27+*SmbTr*pBY$J;?R@tSj-iK5 Uf;M1& literal 0 HcmV?d00001 diff --git a/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul_bias_b_no_zp.onnx b/onnxruntime/test/testdata/transform/fusion/dynamic_quantize_matmul_bias_b_no_zp.onnx new file mode 100644 index 0000000000000000000000000000000000000000..9724c06a949252161f2ed4cc70a1c53ff22b4154 GIT binary patch literal 554 zcmZvZzfQw25XKY7ZQN@?#RyRaCU0b5fUN_xfdz>U6&s7?G)b|PI3;lgI`Jsw<#-O< zHWg{r;Vj)B{l4$c#2x-%m_kdcMWtz1@XtypEthdbUBSzlNMnixUoO0h=PxBPIh)o4 zuTsULV8_ew22B=p!+$Vrc=S9II;&EiSK3H(sr3DwcQO%rQl&3S$Md*g`zhSQ&J(y{ zZ4L}W%S9^V!t1Pgg`9tA<`@c$;h4ywa+2mkdw3>F&G3_X{SF|V5A2nV;NcLlnTCcc z1tV2&;jQaD%(c$5t%VzCv;l`wl(fBF*(2+Wizs5wIEv`?(F60ODrK(xD}spAM}8aO zR@)^(^P>J)-)1;TejkI~UHpNWwpq`zMkCAmHnAE78~OLk-EW^lkJyxe0h>VK0BUD+ KzvBd`-}?h}nWz;2 literal 0 HcmV?d00001 diff --git 
a/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.onnx b/onnxruntime/test/testdata/transform/fusion/matmul_integer_to_float.onnx new file mode 100644 index 0000000000000000000000000000000000000000..fd8e6bbea7a09c0d11bb10bbfdc6ec4ffde5df37 GIT binary patch literal 1510 zcma)+&2F1O6ommhHW@pqMHRI|%Es!79e1+oCN}+3rDl1JebR-O}JxFah z?zi^E$KNsEh}CU*;hTs{CSpl} zx%R6YE^pH4Yb?XNaL3&FnQ}m`NR^PB`bAVUo1&8x+el;*dy@I?#N7{x`T@F;FQof* z?G)=M;V804E56Uf!wMKY^ z@I5wSmN;CS6Z-Sy+~rVPP5IqG!1hwGn|nzJQ