From fedb68429cb219d8651e1c20f53d7f314154b033 Mon Sep 17 00:00:00 2001 From: Guoyu Wang <62914304+gwang-msft@users.noreply.github.com> Date: Wed, 3 Mar 2021 15:44:49 -0800 Subject: [PATCH] [NNAPI EP] Add per-tensor u8s8 support for Qlinear[Conv/MatMul] (#6818) * NNAPI Add per-tensor u8s8 support * Update some comments * Address CR comments * Address CR comments --- .../nnapi/nnapi_builtin/builders/helper.cc | 209 ++++++++++-------- .../nnapi_builtin/builders/op_builder.cc | 105 ++++++--- .../cpu/math/quantize_linear_matmul_test.cc | 86 ++++++- 3 files changed, 273 insertions(+), 127 deletions(-) diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc index ce1e8c359e..c947b96569 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/helper.cc @@ -137,7 +137,9 @@ bool HasValidBinaryOpQuantizedInputs(const Node& node) { bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const Node& node, const std::vector& indices, const OpSupportCheckParams& params) { const auto& op_type = node.OpType(); - bool is_qlinear_conv = (op_type == "QLinearConv"); + auto qlinear_op_type = GetQLinearOpType(node); + bool is_qlinear_conv = (qlinear_op_type == QLinearOpType::QLinearConv); + bool is_qlinear_matmul = (qlinear_op_type == QLinearOpType::QLinearMatMul); const auto input_defs(node.InputDefs()); for (const auto idx : indices) { if (idx >= input_defs.size()) { @@ -145,46 +147,53 @@ bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const << " >= input number, " << input_defs.size(); return false; } + const auto scale_name = input_defs[idx]->Name(); - if (Contains(initializers, scale_name)) { - const auto& scale_tensor = *initializers.at(scale_name); - int64_t scales_dim = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0]; - bool is_conv_weight = is_qlinear_conv && idx == 4; - bool is_conv_u8s8_weight = false; - - if (is_conv_weight) { - const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name()); - is_conv_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8; - } - - // We need to check the per-channel quantization scales dimensions for u8s8 QlinearConv - // We only support per-channel quantization for u8s8 - // For all other cases, the scales should be a scalar - if (is_conv_u8s8_weight) { - if (params.android_sdk_ver < 29) { - LOGS_DEFAULT(VERBOSE) << op_type << " only supports per-channel quantization on Android API 29+, " - << "system API level: " << params.android_sdk_ver; - return false; - } - - const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name()); - if (weight_tensor.dims()[0] != scales_dim) { - LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight," - << " weight dimension[0] " << weight_tensor.dims()[0] - << " scale dimension " << scales_dim; - return false; - } - } else { - if (scales_dim != 1) { - LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, " - << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+"; - return false; - } - } - } else { - LOGS_DEFAULT(VERBOSE) << "The scale of " << op_type << " must be known"; + if (!Contains(initializers, scale_name)) { + LOGS_DEFAULT(VERBOSE) << "The scale of " << op_type << " must be an initializer tensor"; return false; } + + // If this op is Qlinear[Conv/MatMul], we want to check u8s8 support for weight tensor (or B tensor for QlinearMatMul) + bool is_conv_matmul_weight = (is_qlinear_conv || is_qlinear_matmul) && idx == 4; + bool is_conv_matmul_u8s8_weight = false; + + if (is_conv_matmul_weight) { + const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name()); + is_conv_matmul_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8; + } + + const auto& scale_tensor = *initializers.at(scale_name); + int64_t scales_dim = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0]; + if (!is_conv_matmul_u8s8_weight) { + if (scales_dim != 1) { + LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, " + << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+"; + return false; + } + } else if (scales_dim != 1) { + // For u8s8 Qlinear[Conv/MatMul], we support + // 1. Per-tensor, the weight will be transformed to uint8 later + // 2. Per-channel, only from Android API level 29 + if (is_qlinear_matmul) { + LOGS_DEFAULT(VERBOSE) << "QLinearMatMul does not support per-channel quantization"; + return false; + } + + if (params.android_sdk_ver < 29) { + LOGS_DEFAULT(VERBOSE) << op_type << " only supports per-channel quantization on Android API 29+, " + << "system API level: " << params.android_sdk_ver; + return false; + } + + const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name()); + if (weight_tensor.dims()[0] != scales_dim) { + LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight," + << " weight dimension[0] " << weight_tensor.dims()[0] + << " scale dimension " << scales_dim; + return false; + } + } } return true; @@ -193,7 +202,9 @@ bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const Node& node, const std::vector& indices) { const auto& op_type = node.OpType(); - bool is_qlinear_conv = (op_type == "QLinearConv"); + auto qlinear_op_type = GetQLinearOpType(node); + bool is_qlinear_conv = (qlinear_op_type == QLinearOpType::QLinearConv); + bool is_qlinear_matmul = (qlinear_op_type == QLinearOpType::QLinearMatMul); const auto input_defs(node.InputDefs()); for (const auto idx : indices) { if (idx >= input_defs.size()) { @@ -203,65 +214,77 @@ bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, co } const auto zero_point_name = input_defs[idx]->Name(); - if (Contains(initializers, zero_point_name)) { - bool is_conv_weight = is_qlinear_conv && idx == 5; - bool is_conv_u8s8_weight = false; - if (is_conv_weight) { - const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name()); - is_conv_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8; - } + if (!Contains(initializers, zero_point_name)) { + LOGS_DEFAULT(VERBOSE) << "The zero point of " << op_type << " must be an initializer tensor"; + return false; + } - const auto& zero_tensor = *initializers.at(zero_point_name); - int64_t zero_dim = zero_tensor.dims().empty() ? 1 : zero_tensor.dims()[0]; - if (is_conv_u8s8_weight) { - if (zero_tensor.data_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8) { - LOGS_DEFAULT(VERBOSE) << "u8s8 QlinearConv only supports int8 zero point for weight, " - << "actual zero point type: [" << zero_tensor.data_type() << "]"; - return false; - } + bool is_conv_matmul_weight = is_qlinear_conv && idx == 5; + bool is_conv_matmul_u8s8_weight = false; + if (is_conv_matmul_weight) { + const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name()); + is_conv_matmul_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8; + } - // For onnx, u8s8 QlinearConv, the weight zero point can be a scalar, - // or a tensor with same channel as weight, for NNAPI we only support it be - // 0 (scalar) or all 0 (tensor), NNAPI will assume the zero point for per-channel - // quantization is 0 there is no input for it - const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name()); - if (weight_tensor.dims()[0] != zero_dim && zero_dim != 1) { - LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight," - << " weight dimension[0] " << weight_tensor.dims()[0] - << " zero point dimension " << zero_dim; - return false; - } + const auto& zero_tensor = *initializers.at(zero_point_name); + int64_t zero_dim = zero_tensor.dims().empty() ? 1 : zero_tensor.dims()[0]; - std::unique_ptr unpacked_tensor; - size_t tensor_byte_size; - auto status = onnxruntime::utils::UnpackInitializerData( - zero_tensor, - node.ModelPath(), - unpacked_tensor, tensor_byte_size); - if (!status.IsOK()) { - LOGS_DEFAULT(ERROR) << "QLinearConv erro when unpack zero tensor:" << status.ErrorMessage(); - return false; - } - - // Verify all onnx weight zero point(s) are 0(s) - const int8_t* zero_points = reinterpret_cast(unpacked_tensor.get()); - for (size_t i = 0; i < tensor_byte_size; i++) { - if (zero_points[i] != 0) { - LOGS_DEFAULT(VERBOSE) << "QLinearConv only support 0 as zero point, " - << "zero_points[" << i << "] has value: " << zero_points[i]; - return false; - } - } - } else { - if (zero_dim != 1) { - LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, " - << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+"; - return false; - } + if (!is_conv_matmul_u8s8_weight) { + if (zero_dim != 1) { + LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, " + << " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+"; + return false; } } else { - LOGS_DEFAULT(VERBOSE) << "The zero point of " << op_type << " must be known"; - return false; + // For u8s8 Qlinear[Conv/MatMul], we support + // 1. Per-tensor, the weight will be transformed to uint8 later + // 2. Per-channel, only from Android API level 29 + if (zero_tensor.data_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8) { + LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul] only supports int8 zero point for weight, " + << "actual zero point type: [" << zero_tensor.data_type() << "]"; + return false; + } + + if (zero_dim != 1) { + if (is_qlinear_matmul) { + LOGS_DEFAULT(VERBOSE) << "QLinearMatMul does not support per-channel quantization"; + return false; + } + } + + // For onnx, u8s8 QlinearConv, the weight zero point can be a scalar, + // or a tensor with same channel as weight, for NNAPI we only support it be + // 0 (scalar) or all 0 (tensor), NNAPI will assume the zero point for per-channel + // quantization is 0 there is no input for it + const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name()); + if (weight_tensor.dims()[0] != zero_dim && zero_dim != 1) { + LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight," + << " weight dimension[0] " << weight_tensor.dims()[0] + << " zero point dimension " << zero_dim; + return false; + } + + std::unique_ptr unpacked_tensor; + size_t tensor_byte_size; + auto status = onnxruntime::utils::UnpackInitializerData( + zero_tensor, + node.ModelPath(), + unpacked_tensor, tensor_byte_size); + if (!status.IsOK()) { + LOGS_DEFAULT(ERROR) << "Qlinear[Conv/MatMul] error when unpack zero tensor: " << zero_point_name + << ", error msg: " << status.ErrorMessage(); + return false; + } + + // Verify all onnx weight zero point(s) are 0(s) + const int8_t* zero_points = reinterpret_cast(unpacked_tensor.get()); + for (size_t i = 0; i < tensor_byte_size; i++) { + if (zero_points[i] != 0) { + LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul] only support 0 as zero point, " + << "zero_points[" << i << "] has value: " << zero_points[i]; + return false; + } + } } } diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc index e1a438262c..a26a8f1c4a 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/op_builder.cc @@ -272,15 +272,23 @@ enum DataLayout { L_1230 = 1, }; -// TODO, replace this with more efficient code in optimizers +// This is primarily used for adding the weight (an initializer) of Conv/QlinearConv +// And perform layout change from ONNX -> NNAPI +// If is_per_tensor_u8s8 is true, the QlinearConv is per-tensor u8s8 (input X is unsigned int8 +// and weight W is signed int8 and it is per-tensor (NOT per-channel) quantized), in this case, +// since NNAPI requires X and W to be same type for per-tensor quantization, +// the initializer tensor W will be converted from int8 to uint8 by flip each byte by XOR 0x80 +// byte ^ 0x80 == byte + 128 static Status AddInitializerInNewLayout(ModelBuilder& model_builder, const std::string& name, const OperandType& source_operand_type, - DataLayout new_layout) ORT_MUST_USE_RESULT; + DataLayout new_layout, + bool is_per_tensor_u8s8) ORT_MUST_USE_RESULT; static Status AddInitializerInNewLayout(ModelBuilder& model_builder, const std::string& name, const OperandType& source_operand_type, - DataLayout new_layout) { + DataLayout new_layout, + bool is_per_tensor_u8s8) { const auto& tensor = *model_builder.GetInitializerTensors().at(name); const Shape& shape = source_operand_type.dimensions; ORT_RETURN_IF_NOT(shape.size() == 4, @@ -322,6 +330,8 @@ static Status AddInitializerInNewLayout(ModelBuilder& model_builder, std::unique_ptr buffer_holder(new uint8_t[operand_type.GetOperandBlobByteSize()]); uint8_t* buffer = buffer_holder.get(); size_t element_size = operand_type.GetElementByteSize(); + + uint8_t bit_flip_val = is_per_tensor_u8s8 ? 0x80 : 0; for (uint32_t out = 0; out < out_t; out++) { for (uint32_t in = 0; in < in_t; in++) { for (uint32_t h = 0; h < h_t; h++) { @@ -345,7 +355,7 @@ static Status AddInitializerInNewLayout(ModelBuilder& model_builder, } for (size_t i = 0; i < element_size; i++) { - buffer[element_size * nnapi_idx + i] = src[element_size * onnx_idx + i]; + buffer[element_size * nnapi_idx + i] = src[element_size * onnx_idx + i] ^ bit_flip_val; } } } @@ -355,13 +365,21 @@ static Status AddInitializerInNewLayout(ModelBuilder& model_builder, return model_builder.AddOperandFromPersistMemoryBuffer(name, &buffer[0], operand_type); } -// TODO, replace this with more efficient code in optimizers +// This is primarily used for adding the input B (an initializer) of MatMul/QlinearMatMul/Gemm (not transposed) +// and transpose it, since for NNAPI only supports A*B' +// +// If is_per_tensor_u8s8 is true, the QlinearMatMul is per-tensor u8s8 (input A is unsigned int8 +// and input B is signed int8), in this case, since NNAPI requires A and B to be same type, +// the initializer tensor B will be converted from int8 to uint8 by flip each byte by XOR 0x80 +// byte ^ 0x80 == byte + 128 static Status AddInitializerTransposed(ModelBuilder& model_builder, const OperandType& source_operand_type, - const std::string& name) ORT_MUST_USE_RESULT; + const std::string& name, + bool is_per_tensor_u8s8) ORT_MUST_USE_RESULT; static Status AddInitializerTransposed(ModelBuilder& model_builder, const OperandType& source_operand_type, - const std::string& name) { + const std::string& name, + bool is_per_tensor_u8s8) { const auto& tensor = *model_builder.GetInitializerTensors().at(name); const Shape& shape = source_operand_type.dimensions; @@ -397,10 +415,11 @@ static Status AddInitializerTransposed(ModelBuilder& model_builder, std::unique_ptr buffer_holder(new uint8_t[operand_type.GetOperandBlobByteSize()]); uint8_t* buffer = buffer_holder.get(); size_t element_size = operand_type.GetElementByteSize(); + uint8_t bit_flip_val = is_per_tensor_u8s8 ? 0x80 : 0; for (uint32_t x = 0; x < x_t; x++) { for (uint32_t y = 0; y < y_t; y++) { for (size_t i = 0; i < element_size; i++) { - buffer[element_size * (y * x_t + x) + i] = src[element_size * (x * y_t + y) + i]; + buffer[element_size * (y * x_t + x) + i] = src[element_size * (x * y_t + y) + i] ^ bit_flip_val; } } } @@ -518,16 +537,26 @@ static Status GetBinaryOpQuantizationScaleAndZeroPoint( return Status::OK(); } -static Status GetConvOpQuantizationScaleAndZeroPoint( +// Get scale and zero point for +// [QlinearConv] input, weight, output +// [QlinearMatMul] A, B, Y +// +// In case of u8s8 (input/A is uint8 and weight/B is int8) +// If the QlinearConv is using per-channel u8s8, return the scales vector +// If the Qlinear[Conv/MatMul] is using per-tensor u8s8, the weight/B tensor +// will be convert to uint8 later, will return the same scale and 128 as zero point +// Also will set is_per_tensor_u8s8 to true to be used later +static Status GetConvMatMulOpQuantizationScaleAndZeroPoint( const ModelBuilder& model_builder, const Node& node, float& a_scale, float& w_scale, float& y_scale, int32_t& a_zero_point, int32_t& w_zero_point, int32_t& y_zero_point, - optional>& w_scales) ORT_MUST_USE_RESULT; -static Status GetConvOpQuantizationScaleAndZeroPoint( + optional>& w_scales, bool& is_per_tensor_u8s8) ORT_MUST_USE_RESULT; +static Status GetConvMatMulOpQuantizationScaleAndZeroPoint( const ModelBuilder& model_builder, const Node& node, float& a_scale, float& w_scale, float& y_scale, int32_t& a_zero_point, int32_t& w_zero_point, int32_t& y_zero_point, - optional>& w_scales) { + optional>& w_scales, bool& is_per_tensor_u8s8) { + is_per_tensor_u8s8 = false; // Get scale and zero points // We will handle per-channel weight scale and zero point later ORT_RETURN_IF_ERROR( @@ -543,14 +572,26 @@ static Status GetConvOpQuantizationScaleAndZeroPoint( if (weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_UINT8) return Status::OK(); - // Now we have u8s8 QlinearConv + // This is per-tensor u8s8 + // NNAPI does not support per-tensor u8s8 + // For this case we will need to convert the int8 weight tensor to uint8 + // And have same scale and 128 as zero point + // The conversion of the weight tensor itself will be done in the OpBuilder + const auto& scale_tensor = *initializers.at(input_defs[4]->Name()); + int64_t scale_dim = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0]; + if (scale_dim == 1) { + w_zero_point = 128; + is_per_tensor_u8s8 = true; + return Status::OK(); + } + + // Now we have u8s8 per-channel QlinearConv // u8s8 QlinearConv always have 0 as zero point so we are not getting it here // and we do not use w_scale here, so we reset them back to 0 w_scale = 0.0f; w_zero_point = 0; // We need to copy the 1d scales array for per-channel quantization - const auto& scale_tensor = *initializers.at(input_defs[4]->Name()); const auto* scales = GetTensorFloatData(scale_tensor); size_t scales_size = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0]; vector scales_vec(scales_size, 0.0f); @@ -1345,12 +1386,12 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N // this is for per-channel quantization weights optional> w_scales; - + bool is_per_tensor_u8s8 = false; if (is_qlinear_conv) { - ORT_RETURN_IF_ERROR(GetConvOpQuantizationScaleAndZeroPoint(model_builder, node, - x_scale, w_scale, y_scale, - x_zero_point, w_zero_point, y_zero_point, - w_scales)); + ORT_RETURN_IF_ERROR(GetConvMatMulOpQuantizationScaleAndZeroPoint(model_builder, node, + x_scale, w_scale, y_scale, + x_zero_point, w_zero_point, y_zero_point, + w_scales, is_per_tensor_u8s8)); } Shape onnx_weight_shape; @@ -1366,7 +1407,15 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N onnx_weight_type = Type::TENSOR_QUANT8_ASYMM; break; case ONNX_NAMESPACE::TensorProto_DataType_INT8: - onnx_weight_type = Type::TENSOR_QUANT8_SYMM_PER_CHANNEL; + // We support both per-tensor and per-channel u8s8 + // For per-tensor u8s8 we will convert the int8 weight to uint8 + if (is_per_tensor_u8s8) { + // Per-Tensor u8s8 + onnx_weight_type = Type::TENSOR_QUANT8_ASYMM; + } else { + // Per-Channel u8s8 + onnx_weight_type = Type::TENSOR_QUANT8_SYMM_PER_CHANNEL; + } break; default: return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, @@ -1384,9 +1433,9 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N // Pre-process weights if (conv_2d || grouped_conv_2d) { - ORT_RETURN_IF_ERROR(AddInitializerInNewLayout(model_builder, weight, onnx_weight_operand_type, L_0231)); + ORT_RETURN_IF_ERROR(AddInitializerInNewLayout(model_builder, weight, onnx_weight_operand_type, L_0231, is_per_tensor_u8s8)); } else { // depthwise_conv_2d - ORT_RETURN_IF_ERROR(AddInitializerInNewLayout(model_builder, weight, onnx_weight_operand_type, L_1230)); + ORT_RETURN_IF_ERROR(AddInitializerInNewLayout(model_builder, weight, onnx_weight_operand_type, L_1230, is_per_tensor_u8s8)); } if (is_qlinear_conv) { @@ -1697,10 +1746,14 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N b_zero_point = 0, y_zero_point = 0; + bool is_per_tensor_u8s8 = false; if (is_qlinear_matmul) { - ORT_RETURN_IF_ERROR(GetBinaryOpQuantizationScaleAndZeroPoint(model_builder, node, - a_scale, b_scale, y_scale, - a_zero_point, b_zero_point, y_zero_point)); + optional> w_scales; + ORT_RETURN_IF_ERROR( + GetConvMatMulOpQuantizationScaleAndZeroPoint(model_builder, node, + a_scale, b_scale, y_scale, + a_zero_point, b_zero_point, y_zero_point, + w_scales, is_per_tensor_u8s8)); } uint32_t input_2_idx; @@ -1717,7 +1770,7 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N onnx_mat_b_shape.push_back(SafeInt(dim)); const OperandType onnx_mat_b_operand_type(onnx_mat_b_type, onnx_mat_b_shape, b_scale, b_zero_point); - ORT_RETURN_IF_ERROR(AddInitializerTransposed(model_builder, onnx_mat_b_operand_type, input2)); + ORT_RETURN_IF_ERROR(AddInitializerTransposed(model_builder, onnx_mat_b_operand_type, input2, is_per_tensor_u8s8)); } input_2_idx = operand_indices.at(input2); diff --git a/onnxruntime/test/providers/cpu/math/quantize_linear_matmul_test.cc b/onnxruntime/test/providers/cpu/math/quantize_linear_matmul_test.cc index 0c95ff70e1..bda483061b 100644 --- a/onnxruntime/test/providers/cpu/math/quantize_linear_matmul_test.cc +++ b/onnxruntime/test/providers/cpu/math/quantize_linear_matmul_test.cc @@ -58,15 +58,15 @@ TEST(QuantizeLinearMatmulOpTest, QLinearMatMul3D_U8S8) { test.AddInput("a_zero_point", {}, {113}); test.AddInput("T2", {2, 4, 3}, - {-43, 51, -34, - 60, 26, -17, - 0, 63, -55, - 47, -29, -31, + {-43, 51, -34, + 60, 26, -17, + 0, 63, -55, + 47, -29, -31, - -62, 51, -42, - 60, 26, -22, - 0, -8, -19, - 37, -2, -47}); + -62, 51, -42, + 60, 26, -22, + 0, -8, -19, + 37, -2, -47}); test.AddInput("b_scale", {}, {0.00802f}); test.AddInput("b_zero_point", {}, {-2}); @@ -83,6 +83,76 @@ TEST(QuantizeLinearMatmulOpTest, QLinearMatMul3D_U8S8) { test.Run(); } +TEST(QuantizeLinearMatmulOpTest, QLinearMatMul2D_U8U8) { + auto run_test = [](bool only_t1_not_initializer) { + OpTester test("QLinearMatMul", 10); + test.AddInput("T1", {2, 4}, + {208, 236, 0, 238, + 3, 214, 255, 29}); + + test.AddInput("a_scale", {}, {0.0066f}, only_t1_not_initializer); + test.AddInput("a_zero_point", {}, {113}, only_t1_not_initializer); + + test.AddInput("T2", {4, 3}, + {152, 51, 244, + 60, 26, 255, + 0, 127, 246, + 127, 254, 247}, + only_t1_not_initializer); + + test.AddInput("b_scale", {}, {0.00705f}, only_t1_not_initializer); + test.AddInput("b_zero_point", {}, {114}, only_t1_not_initializer); + + test.AddInput("y_scale", {}, {0.0107f}, only_t1_not_initializer); + test.AddInput("y_zero_point", {}, {118}, only_t1_not_initializer); + test.AddOutput("T3", {2, 3}, + {168, 115, 255, + 1, 66, 151}); + + test.Run(); + }; + + run_test(false); + + // NNAPI will require all inputs except T1 to be initializers + run_test(true); +} + +TEST(QuantizeLinearMatmulOpTest, QLinearMatMul2D_U8S8) { + auto run_test = [](bool only_t1_not_initializer) { + OpTester test("QLinearMatMul", 10); + test.AddInput("T1", {2, 4}, + {208, 126, 0, 238, + 3, 214, 255, 29}); + + test.AddInput("a_scale", {}, {0.0066f}, only_t1_not_initializer); + test.AddInput("a_zero_point", {}, {113}, only_t1_not_initializer); + + test.AddInput("T2", {4, 3}, + {-43, 51, -34, + 60, 26, -17, + 0, 63, -55, + 47, -29, -31}, + only_t1_not_initializer); + + test.AddInput("b_scale", {}, {0.00802f}, only_t1_not_initializer); + test.AddInput("b_zero_point", {}, {0}, only_t1_not_initializer); + + test.AddInput("y_scale", {}, {0.0123f}, only_t1_not_initializer); + test.AddInput("y_zero_point", {}, {118}, only_t1_not_initializer); + test.AddOutput("T3", {2, 3}, + {129, 94, 113, + 147, 154, 104}); + + test.Run(); + }; + + run_test(false); + + // NNAPI will require all inputs except T1 to be initializers + run_test(true); +} + static void QLinearMatMul2DTest(bool only_t1_not_initializer) { // Test non-empty inputs OpTester test_non_empty("QLinearMatMul", 10);