mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-06-08 00:23:03 +00:00
[NNAPI EP] Add per-tensor u8s8 support for Qlinear[Conv/MatMul] (#6818)
* NNAPI Add per-tensor u8s8 support * Update some comments * Address CR comments * Address CR comments
This commit is contained in:
parent
3c5d811e77
commit
fedb68429c
3 changed files with 273 additions and 127 deletions
|
|
@ -137,7 +137,9 @@ bool HasValidBinaryOpQuantizedInputs(const Node& node) {
|
|||
bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const Node& node,
|
||||
const std::vector<size_t>& indices, const OpSupportCheckParams& params) {
|
||||
const auto& op_type = node.OpType();
|
||||
bool is_qlinear_conv = (op_type == "QLinearConv");
|
||||
auto qlinear_op_type = GetQLinearOpType(node);
|
||||
bool is_qlinear_conv = (qlinear_op_type == QLinearOpType::QLinearConv);
|
||||
bool is_qlinear_matmul = (qlinear_op_type == QLinearOpType::QLinearMatMul);
|
||||
const auto input_defs(node.InputDefs());
|
||||
for (const auto idx : indices) {
|
||||
if (idx >= input_defs.size()) {
|
||||
|
|
@ -145,46 +147,53 @@ bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const
|
|||
<< " >= input number, " << input_defs.size();
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto scale_name = input_defs[idx]->Name();
|
||||
if (Contains(initializers, scale_name)) {
|
||||
const auto& scale_tensor = *initializers.at(scale_name);
|
||||
int64_t scales_dim = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0];
|
||||
bool is_conv_weight = is_qlinear_conv && idx == 4;
|
||||
bool is_conv_u8s8_weight = false;
|
||||
|
||||
if (is_conv_weight) {
|
||||
const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
|
||||
is_conv_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8;
|
||||
}
|
||||
|
||||
// We need to check the per-channel quantization scales dimensions for u8s8 QlinearConv
|
||||
// We only support per-channel quantization for u8s8
|
||||
// For all other cases, the scales should be a scalar
|
||||
if (is_conv_u8s8_weight) {
|
||||
if (params.android_sdk_ver < 29) {
|
||||
LOGS_DEFAULT(VERBOSE) << op_type << " only supports per-channel quantization on Android API 29+, "
|
||||
<< "system API level: " << params.android_sdk_ver;
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
|
||||
if (weight_tensor.dims()[0] != scales_dim) {
|
||||
LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
|
||||
<< " weight dimension[0] " << weight_tensor.dims()[0]
|
||||
<< " scale dimension " << scales_dim;
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
if (scales_dim != 1) {
|
||||
LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
|
||||
<< " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
|
||||
return false;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
LOGS_DEFAULT(VERBOSE) << "The scale of " << op_type << " must be known";
|
||||
if (!Contains(initializers, scale_name)) {
|
||||
LOGS_DEFAULT(VERBOSE) << "The scale of " << op_type << " must be an initializer tensor";
|
||||
return false;
|
||||
}
|
||||
|
||||
// If this op is Qlinear[Conv/MatMul], we want to check u8s8 support for weight tensor (or B tensor for QlinearMatMul)
|
||||
bool is_conv_matmul_weight = (is_qlinear_conv || is_qlinear_matmul) && idx == 4;
|
||||
bool is_conv_matmul_u8s8_weight = false;
|
||||
|
||||
if (is_conv_matmul_weight) {
|
||||
const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
|
||||
is_conv_matmul_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8;
|
||||
}
|
||||
|
||||
const auto& scale_tensor = *initializers.at(scale_name);
|
||||
int64_t scales_dim = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0];
|
||||
if (!is_conv_matmul_u8s8_weight) {
|
||||
if (scales_dim != 1) {
|
||||
LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
|
||||
<< " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
|
||||
return false;
|
||||
}
|
||||
} else if (scales_dim != 1) {
|
||||
// For u8s8 Qlinear[Conv/MatMul], we support
|
||||
// 1. Per-tensor, the weight will be transformed to uint8 later
|
||||
// 2. Per-channel, only from Android API level 29
|
||||
if (is_qlinear_matmul) {
|
||||
LOGS_DEFAULT(VERBOSE) << "QLinearMatMul does not support per-channel quantization";
|
||||
return false;
|
||||
}
|
||||
|
||||
if (params.android_sdk_ver < 29) {
|
||||
LOGS_DEFAULT(VERBOSE) << op_type << " only supports per-channel quantization on Android API 29+, "
|
||||
<< "system API level: " << params.android_sdk_ver;
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
|
||||
if (weight_tensor.dims()[0] != scales_dim) {
|
||||
LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
|
||||
<< " weight dimension[0] " << weight_tensor.dims()[0]
|
||||
<< " scale dimension " << scales_dim;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
|
|
@ -193,7 +202,9 @@ bool HasValidQuantizationScales(const InitializedTensorSet& initializers, const
|
|||
bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, const Node& node,
|
||||
const std::vector<size_t>& indices) {
|
||||
const auto& op_type = node.OpType();
|
||||
bool is_qlinear_conv = (op_type == "QLinearConv");
|
||||
auto qlinear_op_type = GetQLinearOpType(node);
|
||||
bool is_qlinear_conv = (qlinear_op_type == QLinearOpType::QLinearConv);
|
||||
bool is_qlinear_matmul = (qlinear_op_type == QLinearOpType::QLinearMatMul);
|
||||
const auto input_defs(node.InputDefs());
|
||||
for (const auto idx : indices) {
|
||||
if (idx >= input_defs.size()) {
|
||||
|
|
@ -203,65 +214,77 @@ bool HasValidQuantizationZeroPoints(const InitializedTensorSet& initializers, co
|
|||
}
|
||||
|
||||
const auto zero_point_name = input_defs[idx]->Name();
|
||||
if (Contains(initializers, zero_point_name)) {
|
||||
bool is_conv_weight = is_qlinear_conv && idx == 5;
|
||||
bool is_conv_u8s8_weight = false;
|
||||
if (is_conv_weight) {
|
||||
const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
|
||||
is_conv_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8;
|
||||
}
|
||||
if (!Contains(initializers, zero_point_name)) {
|
||||
LOGS_DEFAULT(VERBOSE) << "The zero point of " << op_type << " must be an initializer tensor";
|
||||
return false;
|
||||
}
|
||||
|
||||
const auto& zero_tensor = *initializers.at(zero_point_name);
|
||||
int64_t zero_dim = zero_tensor.dims().empty() ? 1 : zero_tensor.dims()[0];
|
||||
if (is_conv_u8s8_weight) {
|
||||
if (zero_tensor.data_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8) {
|
||||
LOGS_DEFAULT(VERBOSE) << "u8s8 QlinearConv only supports int8 zero point for weight, "
|
||||
<< "actual zero point type: [" << zero_tensor.data_type() << "]";
|
||||
return false;
|
||||
}
|
||||
bool is_conv_matmul_weight = is_qlinear_conv && idx == 5;
|
||||
bool is_conv_matmul_u8s8_weight = false;
|
||||
if (is_conv_matmul_weight) {
|
||||
const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
|
||||
is_conv_matmul_u8s8_weight = weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8;
|
||||
}
|
||||
|
||||
// For onnx, u8s8 QlinearConv, the weight zero point can be a scalar,
|
||||
// or a tensor with same channel as weight, for NNAPI we only support it being
|
||||
// 0 (scalar) or all 0 (tensor), NNAPI will assume the zero point for per-channel
|
||||
// quantization is 0, so there is no input for it
|
||||
const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
|
||||
if (weight_tensor.dims()[0] != zero_dim && zero_dim != 1) {
|
||||
LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
|
||||
<< " weight dimension[0] " << weight_tensor.dims()[0]
|
||||
<< " zero point dimension " << zero_dim;
|
||||
return false;
|
||||
}
|
||||
const auto& zero_tensor = *initializers.at(zero_point_name);
|
||||
int64_t zero_dim = zero_tensor.dims().empty() ? 1 : zero_tensor.dims()[0];
|
||||
|
||||
std::unique_ptr<uint8_t[]> unpacked_tensor;
|
||||
size_t tensor_byte_size;
|
||||
auto status = onnxruntime::utils::UnpackInitializerData(
|
||||
zero_tensor,
|
||||
node.ModelPath(),
|
||||
unpacked_tensor, tensor_byte_size);
|
||||
if (!status.IsOK()) {
|
||||
LOGS_DEFAULT(ERROR) << "QLinearConv erro when unpack zero tensor:" << status.ErrorMessage();
|
||||
return false;
|
||||
}
|
||||
|
||||
// Verify all onnx weight zero point(s) are 0(s)
|
||||
const int8_t* zero_points = reinterpret_cast<const int8_t*>(unpacked_tensor.get());
|
||||
for (size_t i = 0; i < tensor_byte_size; i++) {
|
||||
if (zero_points[i] != 0) {
|
||||
LOGS_DEFAULT(VERBOSE) << "QLinearConv only support 0 as zero point, "
|
||||
<< "zero_points[" << i << "] has value: " << zero_points[i];
|
||||
return false;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (zero_dim != 1) {
|
||||
LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
|
||||
<< " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
|
||||
return false;
|
||||
}
|
||||
if (!is_conv_matmul_u8s8_weight) {
|
||||
if (zero_dim != 1) {
|
||||
LOGS_DEFAULT(VERBOSE) << op_type << " does not support per-channel quantization, "
|
||||
<< " for now, only u8s8 QlinearConv supports per-channel quantization on API 29+";
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
LOGS_DEFAULT(VERBOSE) << "The zero point of " << op_type << " must be known";
|
||||
return false;
|
||||
// For u8s8 Qlinear[Conv/MatMul], we support
|
||||
// 1. Per-tensor, the weight will be transformed to uint8 later
|
||||
// 2. Per-channel, only from Android API level 29
|
||||
if (zero_tensor.data_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8) {
|
||||
LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul] only supports int8 zero point for weight, "
|
||||
<< "actual zero point type: [" << zero_tensor.data_type() << "]";
|
||||
return false;
|
||||
}
|
||||
|
||||
if (zero_dim != 1) {
|
||||
if (is_qlinear_matmul) {
|
||||
LOGS_DEFAULT(VERBOSE) << "QLinearMatMul does not support per-channel quantization";
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// For onnx, u8s8 QlinearConv, the weight zero point can be a scalar,
|
||||
// or a tensor with same channel as weight, for NNAPI we only support it being
|
||||
// 0 (scalar) or all 0 (tensor), NNAPI will assume the zero point for per-channel
|
||||
// quantization is 0, so there is no input for it
|
||||
const auto& weight_tensor = *initializers.at(node.InputDefs()[3]->Name());
|
||||
if (weight_tensor.dims()[0] != zero_dim && zero_dim != 1) {
|
||||
LOGS_DEFAULT(VERBOSE) << op_type << " mismatch int8 per-channel quantization weight,"
|
||||
<< " weight dimension[0] " << weight_tensor.dims()[0]
|
||||
<< " zero point dimension " << zero_dim;
|
||||
return false;
|
||||
}
|
||||
|
||||
std::unique_ptr<uint8_t[]> unpacked_tensor;
|
||||
size_t tensor_byte_size;
|
||||
auto status = onnxruntime::utils::UnpackInitializerData(
|
||||
zero_tensor,
|
||||
node.ModelPath(),
|
||||
unpacked_tensor, tensor_byte_size);
|
||||
if (!status.IsOK()) {
|
||||
LOGS_DEFAULT(ERROR) << "Qlinear[Conv/MatMul] error when unpack zero tensor: " << zero_point_name
|
||||
<< ", error msg: " << status.ErrorMessage();
|
||||
return false;
|
||||
}
|
||||
|
||||
// Verify all onnx weight zero point(s) are 0(s)
|
||||
const int8_t* zero_points = reinterpret_cast<const int8_t*>(unpacked_tensor.get());
|
||||
for (size_t i = 0; i < tensor_byte_size; i++) {
|
||||
if (zero_points[i] != 0) {
|
||||
LOGS_DEFAULT(VERBOSE) << "u8s8 Qlinear[Conv/MatMul] only support 0 as zero point, "
|
||||
<< "zero_points[" << i << "] has value: " << zero_points[i];
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -272,15 +272,23 @@ enum DataLayout {
|
|||
L_1230 = 1,
|
||||
};
|
||||
|
||||
// TODO, replace this with more efficient code in optimizers
|
||||
// This is primarily used for adding the weight (an initializer) of Conv/QlinearConv
|
||||
// And perform layout change from ONNX -> NNAPI
|
||||
// If is_per_tensor_u8s8 is true, the QlinearConv is per-tensor u8s8 (input X is unsigned int8
|
||||
// and weight W is signed int8 and it is per-tensor (NOT per-channel) quantized), in this case,
|
||||
// since NNAPI requires X and W to be same type for per-tensor quantization,
|
||||
// the initializer tensor W will be converted from int8 to uint8 by flipping each byte with XOR 0x80
|
||||
// byte ^ 0x80 == byte + 128
|
||||
static Status AddInitializerInNewLayout(ModelBuilder& model_builder,
|
||||
const std::string& name,
|
||||
const OperandType& source_operand_type,
|
||||
DataLayout new_layout) ORT_MUST_USE_RESULT;
|
||||
DataLayout new_layout,
|
||||
bool is_per_tensor_u8s8) ORT_MUST_USE_RESULT;
|
||||
static Status AddInitializerInNewLayout(ModelBuilder& model_builder,
|
||||
const std::string& name,
|
||||
const OperandType& source_operand_type,
|
||||
DataLayout new_layout) {
|
||||
DataLayout new_layout,
|
||||
bool is_per_tensor_u8s8) {
|
||||
const auto& tensor = *model_builder.GetInitializerTensors().at(name);
|
||||
const Shape& shape = source_operand_type.dimensions;
|
||||
ORT_RETURN_IF_NOT(shape.size() == 4,
|
||||
|
|
@ -322,6 +330,8 @@ static Status AddInitializerInNewLayout(ModelBuilder& model_builder,
|
|||
std::unique_ptr<uint8_t[]> buffer_holder(new uint8_t[operand_type.GetOperandBlobByteSize()]);
|
||||
uint8_t* buffer = buffer_holder.get();
|
||||
size_t element_size = operand_type.GetElementByteSize();
|
||||
|
||||
uint8_t bit_flip_val = is_per_tensor_u8s8 ? 0x80 : 0;
|
||||
for (uint32_t out = 0; out < out_t; out++) {
|
||||
for (uint32_t in = 0; in < in_t; in++) {
|
||||
for (uint32_t h = 0; h < h_t; h++) {
|
||||
|
|
@ -345,7 +355,7 @@ static Status AddInitializerInNewLayout(ModelBuilder& model_builder,
|
|||
}
|
||||
|
||||
for (size_t i = 0; i < element_size; i++) {
|
||||
buffer[element_size * nnapi_idx + i] = src[element_size * onnx_idx + i];
|
||||
buffer[element_size * nnapi_idx + i] = src[element_size * onnx_idx + i] ^ bit_flip_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -355,13 +365,21 @@ static Status AddInitializerInNewLayout(ModelBuilder& model_builder,
|
|||
return model_builder.AddOperandFromPersistMemoryBuffer(name, &buffer[0], operand_type);
|
||||
}
|
||||
|
||||
// TODO, replace this with more efficient code in optimizers
|
||||
// This is primarily used for adding the input B (an initializer) of MatMul/QlinearMatMul/Gemm (not transposed)
|
||||
// and transposing it, since NNAPI only supports A*B'
|
||||
//
|
||||
// If is_per_tensor_u8s8 is true, the QlinearMatMul is per-tensor u8s8 (input A is unsigned int8
|
||||
// and input B is signed int8), in this case, since NNAPI requires A and B to be same type,
|
||||
// the initializer tensor B will be converted from int8 to uint8 by flipping each byte with XOR 0x80
|
||||
// byte ^ 0x80 == byte + 128
|
||||
static Status AddInitializerTransposed(ModelBuilder& model_builder,
|
||||
const OperandType& source_operand_type,
|
||||
const std::string& name) ORT_MUST_USE_RESULT;
|
||||
const std::string& name,
|
||||
bool is_per_tensor_u8s8) ORT_MUST_USE_RESULT;
|
||||
static Status AddInitializerTransposed(ModelBuilder& model_builder,
|
||||
const OperandType& source_operand_type,
|
||||
const std::string& name) {
|
||||
const std::string& name,
|
||||
bool is_per_tensor_u8s8) {
|
||||
const auto& tensor = *model_builder.GetInitializerTensors().at(name);
|
||||
const Shape& shape = source_operand_type.dimensions;
|
||||
|
||||
|
|
@ -397,10 +415,11 @@ static Status AddInitializerTransposed(ModelBuilder& model_builder,
|
|||
std::unique_ptr<uint8_t[]> buffer_holder(new uint8_t[operand_type.GetOperandBlobByteSize()]);
|
||||
uint8_t* buffer = buffer_holder.get();
|
||||
size_t element_size = operand_type.GetElementByteSize();
|
||||
uint8_t bit_flip_val = is_per_tensor_u8s8 ? 0x80 : 0;
|
||||
for (uint32_t x = 0; x < x_t; x++) {
|
||||
for (uint32_t y = 0; y < y_t; y++) {
|
||||
for (size_t i = 0; i < element_size; i++) {
|
||||
buffer[element_size * (y * x_t + x) + i] = src[element_size * (x * y_t + y) + i];
|
||||
buffer[element_size * (y * x_t + x) + i] = src[element_size * (x * y_t + y) + i] ^ bit_flip_val;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -518,16 +537,26 @@ static Status GetBinaryOpQuantizationScaleAndZeroPoint(
|
|||
return Status::OK();
|
||||
}
|
||||
|
||||
static Status GetConvOpQuantizationScaleAndZeroPoint(
|
||||
// Get scale and zero point for
|
||||
// [QlinearConv] input, weight, output
|
||||
// [QlinearMatMul] A, B, Y
|
||||
//
|
||||
// In case of u8s8 (input/A is uint8 and weight/B is int8)
|
||||
// If the QlinearConv is using per-channel u8s8, return the scales vector
|
||||
// If the Qlinear[Conv/MatMul] is using per-tensor u8s8, the weight/B tensor
|
||||
// will be converted to uint8 later, and this will return the same scale and 128 as zero point
|
||||
// Also will set is_per_tensor_u8s8 to true to be used later
|
||||
static Status GetConvMatMulOpQuantizationScaleAndZeroPoint(
|
||||
const ModelBuilder& model_builder, const Node& node,
|
||||
float& a_scale, float& w_scale, float& y_scale,
|
||||
int32_t& a_zero_point, int32_t& w_zero_point, int32_t& y_zero_point,
|
||||
optional<vector<float>>& w_scales) ORT_MUST_USE_RESULT;
|
||||
static Status GetConvOpQuantizationScaleAndZeroPoint(
|
||||
optional<vector<float>>& w_scales, bool& is_per_tensor_u8s8) ORT_MUST_USE_RESULT;
|
||||
static Status GetConvMatMulOpQuantizationScaleAndZeroPoint(
|
||||
const ModelBuilder& model_builder, const Node& node,
|
||||
float& a_scale, float& w_scale, float& y_scale,
|
||||
int32_t& a_zero_point, int32_t& w_zero_point, int32_t& y_zero_point,
|
||||
optional<vector<float>>& w_scales) {
|
||||
optional<vector<float>>& w_scales, bool& is_per_tensor_u8s8) {
|
||||
is_per_tensor_u8s8 = false;
|
||||
// Get scale and zero points
|
||||
// We will handle per-channel weight scale and zero point later
|
||||
ORT_RETURN_IF_ERROR(
|
||||
|
|
@ -543,14 +572,26 @@ static Status GetConvOpQuantizationScaleAndZeroPoint(
|
|||
if (weight_tensor.data_type() == ONNX_NAMESPACE::TensorProto_DataType_UINT8)
|
||||
return Status::OK();
|
||||
|
||||
// Now we have u8s8 QlinearConv
|
||||
// This is per-tensor u8s8
|
||||
// NNAPI does not support per-tensor u8s8
|
||||
// For this case we will need to convert the int8 weight tensor to uint8
|
||||
// And have same scale and 128 as zero point
|
||||
// The conversion of the weight tensor itself will be done in the OpBuilder
|
||||
const auto& scale_tensor = *initializers.at(input_defs[4]->Name());
|
||||
int64_t scale_dim = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0];
|
||||
if (scale_dim == 1) {
|
||||
w_zero_point = 128;
|
||||
is_per_tensor_u8s8 = true;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
// Now we have u8s8 per-channel QlinearConv
|
||||
// u8s8 QlinearConv always has 0 as zero point so we are not getting it here
|
||||
// and we do not use w_scale here, so we reset them back to 0
|
||||
w_scale = 0.0f;
|
||||
w_zero_point = 0;
|
||||
|
||||
// We need to copy the 1d scales array for per-channel quantization
|
||||
const auto& scale_tensor = *initializers.at(input_defs[4]->Name());
|
||||
const auto* scales = GetTensorFloatData(scale_tensor);
|
||||
size_t scales_size = scale_tensor.dims().empty() ? 1 : scale_tensor.dims()[0];
|
||||
vector<float> scales_vec(scales_size, 0.0f);
|
||||
|
|
@ -1345,12 +1386,12 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
|
|||
|
||||
// this is for per-channel quantization weights
|
||||
optional<vector<float>> w_scales;
|
||||
|
||||
bool is_per_tensor_u8s8 = false;
|
||||
if (is_qlinear_conv) {
|
||||
ORT_RETURN_IF_ERROR(GetConvOpQuantizationScaleAndZeroPoint(model_builder, node,
|
||||
x_scale, w_scale, y_scale,
|
||||
x_zero_point, w_zero_point, y_zero_point,
|
||||
w_scales));
|
||||
ORT_RETURN_IF_ERROR(GetConvMatMulOpQuantizationScaleAndZeroPoint(model_builder, node,
|
||||
x_scale, w_scale, y_scale,
|
||||
x_zero_point, w_zero_point, y_zero_point,
|
||||
w_scales, is_per_tensor_u8s8));
|
||||
}
|
||||
|
||||
Shape onnx_weight_shape;
|
||||
|
|
@ -1366,7 +1407,15 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
|
|||
onnx_weight_type = Type::TENSOR_QUANT8_ASYMM;
|
||||
break;
|
||||
case ONNX_NAMESPACE::TensorProto_DataType_INT8:
|
||||
onnx_weight_type = Type::TENSOR_QUANT8_SYMM_PER_CHANNEL;
|
||||
// We support both per-tensor and per-channel u8s8
|
||||
// For per-tensor u8s8 we will convert the int8 weight to uint8
|
||||
if (is_per_tensor_u8s8) {
|
||||
// Per-Tensor u8s8
|
||||
onnx_weight_type = Type::TENSOR_QUANT8_ASYMM;
|
||||
} else {
|
||||
// Per-Channel u8s8
|
||||
onnx_weight_type = Type::TENSOR_QUANT8_SYMM_PER_CHANNEL;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
|
||||
|
|
@ -1384,9 +1433,9 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
|
|||
|
||||
// Pre-process weights
|
||||
if (conv_2d || grouped_conv_2d) {
|
||||
ORT_RETURN_IF_ERROR(AddInitializerInNewLayout(model_builder, weight, onnx_weight_operand_type, L_0231));
|
||||
ORT_RETURN_IF_ERROR(AddInitializerInNewLayout(model_builder, weight, onnx_weight_operand_type, L_0231, is_per_tensor_u8s8));
|
||||
} else { // depthwise_conv_2d
|
||||
ORT_RETURN_IF_ERROR(AddInitializerInNewLayout(model_builder, weight, onnx_weight_operand_type, L_1230));
|
||||
ORT_RETURN_IF_ERROR(AddInitializerInNewLayout(model_builder, weight, onnx_weight_operand_type, L_1230, is_per_tensor_u8s8));
|
||||
}
|
||||
|
||||
if (is_qlinear_conv) {
|
||||
|
|
@ -1697,10 +1746,14 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
|
|||
b_zero_point = 0,
|
||||
y_zero_point = 0;
|
||||
|
||||
bool is_per_tensor_u8s8 = false;
|
||||
if (is_qlinear_matmul) {
|
||||
ORT_RETURN_IF_ERROR(GetBinaryOpQuantizationScaleAndZeroPoint(model_builder, node,
|
||||
a_scale, b_scale, y_scale,
|
||||
a_zero_point, b_zero_point, y_zero_point));
|
||||
optional<vector<float>> w_scales;
|
||||
ORT_RETURN_IF_ERROR(
|
||||
GetConvMatMulOpQuantizationScaleAndZeroPoint(model_builder, node,
|
||||
a_scale, b_scale, y_scale,
|
||||
a_zero_point, b_zero_point, y_zero_point,
|
||||
w_scales, is_per_tensor_u8s8));
|
||||
}
|
||||
|
||||
uint32_t input_2_idx;
|
||||
|
|
@ -1717,7 +1770,7 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N
|
|||
onnx_mat_b_shape.push_back(SafeInt<uint32_t>(dim));
|
||||
|
||||
const OperandType onnx_mat_b_operand_type(onnx_mat_b_type, onnx_mat_b_shape, b_scale, b_zero_point);
|
||||
ORT_RETURN_IF_ERROR(AddInitializerTransposed(model_builder, onnx_mat_b_operand_type, input2));
|
||||
ORT_RETURN_IF_ERROR(AddInitializerTransposed(model_builder, onnx_mat_b_operand_type, input2, is_per_tensor_u8s8));
|
||||
}
|
||||
|
||||
input_2_idx = operand_indices.at(input2);
|
||||
|
|
|
|||
|
|
@ -58,15 +58,15 @@ TEST(QuantizeLinearMatmulOpTest, QLinearMatMul3D_U8S8) {
|
|||
test.AddInput<uint8_t>("a_zero_point", {}, {113});
|
||||
|
||||
test.AddInput<int8_t>("T2", {2, 4, 3},
|
||||
{-43, 51, -34,
|
||||
60, 26, -17,
|
||||
0, 63, -55,
|
||||
47, -29, -31,
|
||||
{-43, 51, -34,
|
||||
60, 26, -17,
|
||||
0, 63, -55,
|
||||
47, -29, -31,
|
||||
|
||||
-62, 51, -42,
|
||||
60, 26, -22,
|
||||
0, -8, -19,
|
||||
37, -2, -47});
|
||||
-62, 51, -42,
|
||||
60, 26, -22,
|
||||
0, -8, -19,
|
||||
37, -2, -47});
|
||||
|
||||
test.AddInput<float>("b_scale", {}, {0.00802f});
|
||||
test.AddInput<int8_t>("b_zero_point", {}, {-2});
|
||||
|
|
@ -83,6 +83,76 @@ TEST(QuantizeLinearMatmulOpTest, QLinearMatMul3D_U8S8) {
|
|||
test.Run();
|
||||
}
|
||||
|
||||
TEST(QuantizeLinearMatmulOpTest, QLinearMatMul2D_U8U8) {
|
||||
auto run_test = [](bool only_t1_not_initializer) {
|
||||
OpTester test("QLinearMatMul", 10);
|
||||
test.AddInput<uint8_t>("T1", {2, 4},
|
||||
{208, 236, 0, 238,
|
||||
3, 214, 255, 29});
|
||||
|
||||
test.AddInput<float>("a_scale", {}, {0.0066f}, only_t1_not_initializer);
|
||||
test.AddInput<uint8_t>("a_zero_point", {}, {113}, only_t1_not_initializer);
|
||||
|
||||
test.AddInput<uint8_t>("T2", {4, 3},
|
||||
{152, 51, 244,
|
||||
60, 26, 255,
|
||||
0, 127, 246,
|
||||
127, 254, 247},
|
||||
only_t1_not_initializer);
|
||||
|
||||
test.AddInput<float>("b_scale", {}, {0.00705f}, only_t1_not_initializer);
|
||||
test.AddInput<uint8_t>("b_zero_point", {}, {114}, only_t1_not_initializer);
|
||||
|
||||
test.AddInput<float>("y_scale", {}, {0.0107f}, only_t1_not_initializer);
|
||||
test.AddInput<uint8_t>("y_zero_point", {}, {118}, only_t1_not_initializer);
|
||||
test.AddOutput<uint8_t>("T3", {2, 3},
|
||||
{168, 115, 255,
|
||||
1, 66, 151});
|
||||
|
||||
test.Run();
|
||||
};
|
||||
|
||||
run_test(false);
|
||||
|
||||
// NNAPI will require all inputs except T1 to be initializers
|
||||
run_test(true);
|
||||
}
|
||||
|
||||
TEST(QuantizeLinearMatmulOpTest, QLinearMatMul2D_U8S8) {
|
||||
auto run_test = [](bool only_t1_not_initializer) {
|
||||
OpTester test("QLinearMatMul", 10);
|
||||
test.AddInput<uint8_t>("T1", {2, 4},
|
||||
{208, 126, 0, 238,
|
||||
3, 214, 255, 29});
|
||||
|
||||
test.AddInput<float>("a_scale", {}, {0.0066f}, only_t1_not_initializer);
|
||||
test.AddInput<uint8_t>("a_zero_point", {}, {113}, only_t1_not_initializer);
|
||||
|
||||
test.AddInput<int8_t>("T2", {4, 3},
|
||||
{-43, 51, -34,
|
||||
60, 26, -17,
|
||||
0, 63, -55,
|
||||
47, -29, -31},
|
||||
only_t1_not_initializer);
|
||||
|
||||
test.AddInput<float>("b_scale", {}, {0.00802f}, only_t1_not_initializer);
|
||||
test.AddInput<int8_t>("b_zero_point", {}, {0}, only_t1_not_initializer);
|
||||
|
||||
test.AddInput<float>("y_scale", {}, {0.0123f}, only_t1_not_initializer);
|
||||
test.AddInput<uint8_t>("y_zero_point", {}, {118}, only_t1_not_initializer);
|
||||
test.AddOutput<uint8_t>("T3", {2, 3},
|
||||
{129, 94, 113,
|
||||
147, 154, 104});
|
||||
|
||||
test.Run();
|
||||
};
|
||||
|
||||
run_test(false);
|
||||
|
||||
// NNAPI will require all inputs except T1 to be initializers
|
||||
run_test(true);
|
||||
}
|
||||
|
||||
static void QLinearMatMul2DTest(bool only_t1_not_initializer) {
|
||||
// Test non-empty inputs
|
||||
OpTester test_non_empty("QLinearMatMul", 10);
|
||||
|
|
|
|||
Loading…
Reference in a new issue