From b000ec91cc3141a43ba7a136a8ce1b2bc82e6341 Mon Sep 17 00:00:00 2001 From: Zhang Lei Date: Fri, 10 Dec 2021 16:40:01 -0800 Subject: [PATCH] Add quantization tool and its unittest with s8s8 support (#10007) * Add quantization tool with s8s8 support * Add unittest for existing s8s8 support operators * Comment ready unittest for upcoming s8s8 operator (ConvInteger, and Resize) * Minor change on quantization tools * Use different s8 min value upon weight or activation. * use same qmin for reduce ranged s8. --- .../tools/quantization/operators/concat.py | 3 +- .../python/tools/quantization/quant_utils.py | 13 ++- .../test/python/quantization/op_test_utils.py | 26 ++++- .../python/quantization/test_conv_dynamic.py | 45 ++++++--- .../python/quantization/test_op_concat.py | 47 ++++++--- .../python/quantization/test_op_gavgpool.py | 48 +++++---- .../test/python/quantization/test_op_gemm.py | 98 ++++++++++++------- .../python/quantization/test_op_maxpool.py | 53 ++++++---- .../test/python/quantization/test_op_pad.py | 76 +++++++++++--- .../python/quantization/test_op_pooling.py | 46 ++++++--- .../python/quantization/test_op_reshape.py | 30 ++++-- .../python/quantization/test_op_resize.py | 33 +++++-- .../quantization/test_op_squeeze_unsqueeze.py | 43 +++++--- .../python/quantization/test_op_transpose.py | 31 ++++-- .../test/python/quantization/test_qdq.py | 2 +- 15 files changed, 418 insertions(+), 176 deletions(-) diff --git a/onnxruntime/python/tools/quantization/operators/concat.py b/onnxruntime/python/tools/quantization/operators/concat.py index ada842c7b6..76c05828e5 100644 --- a/onnxruntime/python/tools/quantization/operators/concat.py +++ b/onnxruntime/python/tools/quantization/operators/concat.py @@ -12,7 +12,8 @@ class QLinearConcat(QuantOperatorBase): data_found, output_scale_name, output_zp_name, _, _ = \ self.quantizer._get_quantization_params(node.output[0]) - (q_input_names, zero_point_names, scale_names, nodes) = self.quantizer.quantize_inputs(node, 
[*range(0, len(node.input))]) + (q_input_names, zero_point_names, scale_names, nodes) = \ + self.quantizer.quantize_inputs(node, [*range(0, len(node.input))], initializer_use_weight_qType=False) if not data_found or q_input_names is None: return super().quantize() diff --git a/onnxruntime/python/tools/quantization/quant_utils.py b/onnxruntime/python/tools/quantization/quant_utils.py index e57160275f..73dd7fdbf7 100644 --- a/onnxruntime/python/tools/quantization/quant_utils.py +++ b/onnxruntime/python/tools/quantization/quant_utils.py @@ -176,7 +176,7 @@ def quantize_data(data, qType, symmetric, reduce_range=False): if len(data): rmin = min(data) rmax = max(data) - qmin, qmax = get_qmin_qmax_for_qType(qType, reduce_range) + qmin, qmax = get_qmin_qmax_for_qType(qType, reduce_range, for_weight=True) zero_point, scale = compute_scale_zp(rmin, rmax, qmin, qmax, symmetric) @@ -184,7 +184,7 @@ def quantize_data(data, qType, symmetric, reduce_range=False): return rmin, rmax, zero_point, scale, quantized_data -def get_qmin_qmax_for_qType(qType, reduce_range=False): +def get_qmin_qmax_for_qType(qType, reduce_range=False, for_weight=False): ''' Return qmin and qmax, the minimum and maximum value representable by the given qType :parameter qType: onnx.onnx_pb.TensorProto.UINT8 or onnx.onnx_pb.TensorProto.UINT8 @@ -193,18 +193,21 @@ def get_qmin_qmax_for_qType(qType, reduce_range=False): if qType == onnx_proto.TensorProto.UINT8: (qmin, qmax) = (0,127) if reduce_range else (0,255) elif qType == onnx_proto.TensorProto.INT8: - (qmin, qmax) = (-64,64) if reduce_range else (-127,127) + if for_weight: + (qmin, qmax) = (-64,64) if reduce_range else (-127,127) + else: + (qmin, qmax) = (-64,64) if reduce_range else (-128,127) else: raise ValueError("Unexpected data type {} requested. 
Only INT8 and UINT8 are supported.".format(qType)) return qmin, qmax -def get_qrange_for_qType(qType, reduce_range=False): +def get_qrange_for_qType(qType, reduce_range=False, for_weight=False): ''' Helper function to get the quantization range for a type. parameter qType: quantization type. return: quantization range. ''' - qmin, qmax = get_qmin_qmax_for_qType(qType, reduce_range) + qmin, qmax = get_qmin_qmax_for_qType(qType, reduce_range, for_weight=for_weight) return qmax - qmin class QuantizedInitializer: diff --git a/onnxruntime/test/python/quantization/op_test_utils.py b/onnxruntime/test/python/quantization/op_test_utils.py index a5aaae027c..a3f0192289 100644 --- a/onnxruntime/test/python/quantization/op_test_utils.py +++ b/onnxruntime/test/python/quantization/op_test_utils.py @@ -34,7 +34,6 @@ def InputFeedsNegOneZeroOne(n, name2shape): dr = TestDataFeeds(input_data_list) return dr - def check_op_type_order(testcase, model_to_check, ops): if isinstance(model_to_check, string_types): model = onnx.load(model_to_check) @@ -78,3 +77,28 @@ def check_op_nodes(testcase, model_path, node_checker): model = onnx.load(Path(model_path)) for node in model.graph.node: testcase.assertTrue(node_checker(node)) + +def check_qtype_by_node_type(testcase, model_to_check, check_list): + if isinstance(model_to_check, string_types): + model = onnx.load(model_to_check) + elif isinstance(model_to_check, onnx.ModelProto): + model = model_to_check + model = onnx.shape_inference.infer_shapes(model) + value_infos = {vi.name: vi for vi in model.graph.value_info} + value_infos.update({ot.name: ot for ot in model.graph.output}) + value_infos.update({it.name: it for it in model.graph.input}) + initializers = {init.name : init for init in model.graph.initializer} + + for node in model.graph.node: + if node.op_type in check_list: + input_output_check_list = check_list[node.op_type] + for check_item in input_output_check_list: + tensor_name = node.input[check_item[1]] if check_item[0] == 'i' 
else node.output[check_item[1]] + testcase.assertTrue((tensor_name in value_infos) or (tensor_name in initializers)) + if tensor_name in value_infos: + vi = value_infos[tensor_name] + testcase.assertTrue(vi.type.HasField('tensor_type')) + testcase.assertTrue(vi.type.tensor_type.elem_type == check_item[2]) + else: #if (tensor_name in initializers): + init = initializers[tensor_name] + testcase.assertTrue(init.data_type == check_item[2]) diff --git a/onnxruntime/test/python/quantization/test_conv_dynamic.py b/onnxruntime/test/python/quantization/test_conv_dynamic.py index a95ac9b50f..6a54081979 100644 --- a/onnxruntime/test/python/quantization/test_conv_dynamic.py +++ b/onnxruntime/test/python/quantization/test_conv_dynamic.py @@ -11,17 +11,18 @@ import onnx import onnxruntime import numpy as np from onnx import helper, TensorProto, numpy_helper -from onnxruntime.quantization import quantize_dynamic -from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_op_type_order +from onnxruntime.quantization import quantize_dynamic, QuantType +from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_op_type_order, check_qtype_by_node_type def generate_input_initializer(tensor_shape, tensor_dtype, input_name): - ''' - Helper function to generate initializers for test inputs - ''' - tensor = np.random.normal(0, 0.3, tensor_shape).astype(tensor_dtype) - init = numpy_helper.from_array(tensor, input_name) - return init + ''' + Helper function to generate initializers for test inputs + ''' + tensor = np.random.normal(0, 0.3, tensor_shape).astype(tensor_dtype) + init = numpy_helper.from_array(tensor, input_name) + return init + class TestONNXModel(unittest.TestCase): def construct_model(self, model_path): @@ -52,19 +53,31 @@ class TestONNXModel(unittest.TestCase): model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) onnx.save(model, model_path) - def dynamic_quant_conv(self, 
model_fp32_path, model_int8_path): - quantize_dynamic(model_fp32_path, model_int8_path) - quant_nodes = {'ConvInteger' : 2} + def dynamic_quant_conv_test(self, activation_type, weight_type, extra_options={}): + np.random.seed(1) + model_fp32_path = 'conv_bias.fp32.onnx' + self.construct_model(model_fp32_path) + + activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8 + activation_type_str = 'u8' if (activation_type == QuantType.QUInt8) else 's8' + weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8' + model_int8_path = 'conv_bias.quant.{}{}.onnx'.format(activation_type_str, weight_type_str) + + quantize_dynamic(model_fp32_path, model_int8_path, + activation_type=activation_type, weight_type=weight_type, extra_options=extra_options) + quant_nodes = {'ConvInteger': 2} check_op_type_count(self, model_int8_path, **quant_nodes) + qnode_io_qtypes = {'ConvInteger': [['i', 2, activation_proto_qtype]]} + check_qtype_by_node_type(self, model_int8_path, qnode_io_qtypes) check_model_correctness(self, model_fp32_path, model_int8_path, {'input': np.random.rand(4, 2, 8, 8).astype(np.float32)}) def test_quant_conv(self): - np.random.seed(1) - model_fp32_path = 'conv_bias.fp32.onnx' - model_int8_path = 'conv_bias.quant.onnx' - self.construct_model(model_fp32_path) + self.dynamic_quant_conv_test(QuantType.QUInt8, QuantType.QUInt8, extra_options={}) + + # TODO: uncomment following after ConvInteger s8 supported + # def test_quant_conv_s8s8(self): + # self.dynamic_quant_conv_test(QuantType.QInt8, QuantType.QInt8, extra_options={'ActivationSymmetric': True}) - self.dynamic_quant_conv(model_fp32_path, model_int8_path) if __name__ == '__main__': unittest.main() diff --git a/onnxruntime/test/python/quantization/test_op_concat.py b/onnxruntime/test/python/quantization/test_op_concat.py index f7952d4c50..01291df157 100644 --- a/onnxruntime/test/python/quantization/test_op_concat.py +++ 
b/onnxruntime/test/python/quantization/test_op_concat.py @@ -7,8 +7,8 @@ import unittest import numpy as np from onnx import helper, TensorProto, numpy_helper, save -from onnxruntime.quantization import quantize_static, QuantFormat -from op_test_utils import InputFeedsNegOneZeroOne, check_model_correctness, check_op_type_count +from onnxruntime.quantization import quantize_static, QuantFormat, QuantType +from op_test_utils import InputFeedsNegOneZeroOne, check_model_correctness, check_op_type_count, check_qtype_by_node_type class TestONNXModel(unittest.TestCase): @@ -47,7 +47,7 @@ class TestONNXModel(unittest.TestCase): conv3_node = helper.make_node('Conv', ['input', 'conv3_weight'], ['conv3_output'], name='conv3_node') concat_node = helper.make_node('Concat', ['conv1_output', 'conv2_output', 'conv3_output'], [ - 'concat_output'], name='concat_node', axis=1) + 'concat_output'], name='concat_node', axis=1) identity_node = helper.make_node('Identity', ['concat_output'], ['output'], name='identity_node') @@ -57,31 +57,48 @@ class TestONNXModel(unittest.TestCase): model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) save(model, model_path) - def test_quantize_concat(self): + def quantize_concat_test(self, activation_type, weight_type, extra_options={}): np.random.seed(1) - model_fp32_path = 'concat_fp32.onnx' - model_uint8_path = 'concat_uint8.onnx' - model_uint8_qdq_path = 'concat_uint8_qdq.onnx' - self.construct_model(model_fp32_path) + data_reader = InputFeedsNegOneZeroOne(1, {'input': [1, 3, 15, 15]}) + + activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8 + activation_type_str = 'u8' if (activation_type == QuantType.QUInt8) else 's8' + weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8' + model_q8_path = 'concat_{}{}.onnx'.format(activation_type_str, weight_type_str) + model_q8_qdq_path = 'concat_{}{}_qdq.onnx'.format(activation_type_str, weight_type_str) # Verify 
QOperator mode - data_reader = InputFeedsNegOneZeroOne(1, {'input': [1, 3, 15, 15]}) - quantize_static(model_fp32_path, model_uint8_path, data_reader) + data_reader.rewind() + quantize_static(model_fp32_path, model_q8_path, data_reader, + activation_type=activation_type, weight_type=weight_type, extra_options=extra_options) qnode_counts = {'QLinearConv': 3, 'QuantizeLinear': 1, 'DequantizeLinear': 1, 'QLinearConcat': 1} - check_op_type_count(self, model_uint8_path, **qnode_counts) + check_op_type_count(self, model_q8_path, **qnode_counts) + qnode_io_qtypes = {'QuantizeLinear': [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]} + qnode_io_qtypes.update({'QLinearConcat': [['i', 1, activation_proto_qtype], [ + 'i', 4, activation_proto_qtype], ['i', 7, activation_proto_qtype]]}) + check_qtype_by_node_type(self, model_q8_path, qnode_io_qtypes) data_reader.rewind() - check_model_correctness(self, model_fp32_path, model_uint8_path, data_reader.get_next()) + check_model_correctness(self, model_fp32_path, model_q8_path, data_reader.get_next()) # Verify QDQ mode data_reader.rewind() - quantize_static(model_fp32_path, model_uint8_qdq_path, data_reader, quant_format=QuantFormat.QDQ) + quantize_static(model_fp32_path, model_q8_qdq_path, data_reader, quant_format=QuantFormat.QDQ, + activation_type=activation_type, weight_type=weight_type, extra_options=extra_options) qdqnode_counts = {'Conv': 3, 'QuantizeLinear': 5, 'DequantizeLinear': 8, 'Concat': 1} - check_op_type_count(self, model_uint8_qdq_path, **qdqnode_counts) + check_op_type_count(self, model_q8_qdq_path, **qdqnode_counts) + qnode_io_qtypes = {'QuantizeLinear': [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]} + check_qtype_by_node_type(self, model_q8_qdq_path, qnode_io_qtypes) data_reader.rewind() - check_model_correctness(self, model_fp32_path, model_uint8_qdq_path, data_reader.get_next()) + check_model_correctness(self, model_fp32_path, model_q8_qdq_path, data_reader.get_next()) 
+ + def test_quantize_concat(self): + self.quantize_concat_test(QuantType.QUInt8, QuantType.QUInt8, extra_options={}) + + def test_quantize_concat_s8s8(self): + self.quantize_concat_test(QuantType.QInt8, QuantType.QInt8, extra_options={'ActivationSymmetric': True}) if __name__ == '__main__': diff --git a/onnxruntime/test/python/quantization/test_op_gavgpool.py b/onnxruntime/test/python/quantization/test_op_gavgpool.py index 7e89758c99..abf739aa33 100644 --- a/onnxruntime/test/python/quantization/test_op_gavgpool.py +++ b/onnxruntime/test/python/quantization/test_op_gavgpool.py @@ -10,8 +10,8 @@ import unittest import onnx import numpy as np from onnx import helper, TensorProto -from onnxruntime.quantization import quantize_static, quantize_dynamic -from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count +from onnxruntime.quantization import quantize_static, quantize_dynamic, QuantType +from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_qtype_by_node_type class TestOpGlobalAveragePool(unittest.TestCase): @@ -44,10 +44,10 @@ class TestOpGlobalAveragePool(unittest.TestCase): output_name = 'output' initializers = [] - #make 1st GlobalAveragePool node + # make 1st GlobalAveragePool node gavgpool_node_1 = onnx.helper.make_node('GlobalAveragePool', [input_name], [expand_input]) - #make Expand node + # make Expand node expand_shape_name = 'expand_shape' initializers.append(onnx.numpy_helper.from_array(np.array(input_shape, dtype=np.int64), name=expand_shape_name)) expand_node = onnx.helper.make_node('Expand', [expand_input, expand_shape_name], [conv_input]) @@ -59,7 +59,7 @@ class TestOpGlobalAveragePool(unittest.TestCase): initializers.append(onnx.numpy_helper.from_array(conv_weight_data, name=weight_name)) conv_node = onnx.helper.make_node('Conv', [conv_input, weight_name], [gavgpool_input_2nd], name=conv_name) - #make 1st GlobalAveragePool node + # make 1st GlobalAveragePool node gavgpool_node_2 = 
onnx.helper.make_node('GlobalAveragePool', [gavgpool_input_2nd], [output_name]) # make graph @@ -69,30 +69,42 @@ class TestOpGlobalAveragePool(unittest.TestCase): graph = helper.make_graph([gavgpool_node_1, expand_node, conv_node, gavgpool_node_2], graph_name, [input_tensor], [output_tensor], initializer=initializers) model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) - model.ir_version = 7 # use stable onnx ir version + model.ir_version = 7 # use stable onnx ir version onnx.save(model, output_model_path) - def test_quantize_reshape(self): + def quantize_gavgpool_test(self, activation_type, weight_type, extra_options={}): np.random.seed(1) model_fp32_path = 'gavg_pool_fp32.onnx' - model_int8_path = 'gavg_pool_fp32.quant.onnx' data_reader = self.input_feeds(1, {'input': [1, 8, 33, 33]}) self.construct_model_gavgpool(model_fp32_path, [1, 8, 33, 33], [16, 8, 3, 3], [1, 16, 1, 1]) - quantize_static(model_fp32_path, - model_int8_path, - data_reader) + + activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8 + activation_type_str = 'u8' if (activation_type == QuantType.QUInt8) else 's8' + weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8' + model_q8_path = 'gavg_pool_{}{}.onnx'.format(activation_type_str, weight_type_str) + data_reader.rewind() - quant_nodes = {'QLinearConv' : 1, - 'GlobalAveragePool' : 1, - 'QLinearGlobalAveragePool' : 1, - 'QuantizeLinear' : 1, - 'DequantizeLinear' : 1} - check_op_type_count(self, model_int8_path, **quant_nodes) - check_model_correctness(self, model_fp32_path, model_int8_path, data_reader.get_next()) + quantize_static(model_fp32_path, model_q8_path, data_reader, + activation_type=activation_type, weight_type=weight_type, extra_options=extra_options) + + quant_nodes = {'QLinearConv': 1, 'GlobalAveragePool': 1, 'QLinearGlobalAveragePool': 1, + 'QuantizeLinear': 1, 'DequantizeLinear': 1} + check_op_type_count(self, model_q8_path, **quant_nodes) 
+ qnode_io_qtypes = {'QuantizeLinear': [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]} + qnode_io_qtypes.update({'QLinearGlobalAveragePool': [['i', 2, activation_proto_qtype], ['i', 4, activation_proto_qtype]]}) + check_qtype_by_node_type(self, model_q8_path, qnode_io_qtypes) + data_reader.rewind() + check_model_correctness(self, model_fp32_path, model_q8_path, data_reader.get_next()) + + def test_quantize_gavgpool(self): + self.quantize_gavgpool_test(QuantType.QUInt8, QuantType.QUInt8, extra_options={}) + + def test_quantize_gavgpool_s8s8(self): + self.quantize_gavgpool_test(QuantType.QInt8, QuantType.QInt8, extra_options={'ActivationSymmetric': True}) if __name__ == '__main__': diff --git a/onnxruntime/test/python/quantization/test_op_gemm.py b/onnxruntime/test/python/quantization/test_op_gemm.py index e41ee633e2..cf61402fa5 100644 --- a/onnxruntime/test/python/quantization/test_op_gemm.py +++ b/onnxruntime/test/python/quantization/test_op_gemm.py @@ -10,8 +10,8 @@ import unittest import onnx import numpy as np from onnx import helper, TensorProto -from onnxruntime.quantization import quantize_static, quantize_dynamic, QuantFormat -from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count +from onnxruntime.quantization import quantize_static, quantize_dynamic, QuantFormat, QuantType +from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_qtype_by_node_type class TestOpGEMM(unittest.TestCase): @@ -46,12 +46,12 @@ class TestOpGEMM(unittest.TestCase): bias_data = np.random.normal(0, 0.1, bias_shape).astype(np.float32) initializers.append(onnx.numpy_helper.from_array(bias_data, name=bias_name)) - return onnx.helper.make_node('Gemm', [input_name, weight_name, bias_name], [output_name], alpha=1.0, beta=1.0, transB = 1) + return onnx.helper.make_node('Gemm', [input_name, weight_name, bias_name], [output_name], alpha=1.0, beta=1.0, transB=1) # make gemm1 node gemm1_output_name = 
"gemm1_output" gemm1_node = make_gemm(input_name, [100, 10], 'linear1.weight', [100], 'linear1.bias', gemm1_output_name) - #make Clip + # make Clip clip_min_name = 'clip_min' clip_max_name = 'clip_max' clip_output_name = 'clip_output' @@ -71,7 +71,7 @@ class TestOpGEMM(unittest.TestCase): graph = helper.make_graph([gemm1_node, clip_node, gemm2_node], graph_name, [input_tensor], [output_tensor], initializer=initializers) model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) - model.ir_version = 7 # use stable onnx ir version + model.ir_version = 7 # use stable onnx ir version onnx.save(model, output_model_path) @@ -121,55 +121,82 @@ class TestOpGEMM(unittest.TestCase): onnx.save(model, output_model_path) - def static_quant_test(self, model_fp32_path, model_int8_path): - data_reader = self.input_feeds(1, {'input': [5, 10]}) - quantize_static(model_fp32_path, - model_int8_path, - data_reader) + def static_quant_test(self, model_fp32_path, data_reader, activation_type, weight_type, extra_options={}): + activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8 + activation_type_str = 'u8' if (activation_type == QuantType.QUInt8) else 's8' + weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8' + model_int8_path = 'gemm_fp32.quant_{}{}.onnx'.format(activation_type_str, weight_type_str) + data_reader.rewind() - quant_nodes = {'QLinearMatMul' : 2, - 'QLinearAdd' : 2, - 'QuantizeLinear' : 1, - 'DequantizeLinear' : 1} + quantize_static(model_fp32_path, model_int8_path, data_reader, + activation_type=activation_type, weight_type=weight_type, extra_options=extra_options) + quant_nodes = {'QLinearMatMul': 2, 'QLinearAdd': 2, 'QuantizeLinear': 1, 'DequantizeLinear': 1} check_op_type_count(self, model_int8_path, **quant_nodes) + qnode_io_qtypes = {'QuantizeLinear': [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]} + qnode_io_qtypes.update({'DequantizeLinear': [['i', 2, 
activation_proto_qtype]]}) + check_qtype_by_node_type(self, model_int8_path, qnode_io_qtypes) + data_reader.rewind() check_model_correctness(self, model_fp32_path, model_int8_path, data_reader.get_next()) - def static_quant_test_qdq(self, model_fp32_path, model_int8_path): - data_reader = self.input_feeds(1, {'input': [5, 10]}) - quantize_static(model_fp32_path, - model_int8_path, - data_reader, - quant_format=QuantFormat.QDQ) + def static_quant_test_qdq(self, model_fp32_path, data_reader, activation_type, weight_type, extra_options={}): + activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8 + activation_type_str = 'u8' if (activation_type == QuantType.QUInt8) else 's8' + weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8' + model_int8_path = 'gemm_fp32.quant_dqd_{}{}.onnx'.format(activation_type_str, weight_type_str) + data_reader.rewind() - quant_nodes = {'MatMul' : 2, - 'Add' : 2, - 'QuantizeLinear' : 5, - 'DequantizeLinear' : 9} + quantize_static(model_fp32_path, model_int8_path, data_reader, quant_format=QuantFormat.QDQ, + activation_type=activation_type, weight_type=weight_type, extra_options=extra_options) + quant_nodes = {'MatMul': 2, 'Add': 2, 'QuantizeLinear': 5, 'DequantizeLinear': 9} check_op_type_count(self, model_int8_path, **quant_nodes) + qnode_io_qtypes = {'QuantizeLinear': [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]} + check_qtype_by_node_type(self, model_int8_path, qnode_io_qtypes) + data_reader.rewind() check_model_correctness(self, model_fp32_path, model_int8_path, data_reader.get_next()) + def dynamic_quant_test(self, model_fp32_path, data_reader, activation_type, weight_type, extra_options={}): + activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8 + activation_type_str = 'u8' if (activation_type == QuantType.QUInt8) else 's8' + weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8' + 
model_int8_path = 'gemm_fp32.quant_dynamic_{}{}.onnx'.format(activation_type_str, weight_type_str) - def dynamic_quant_test(self, model_fp32_path, model_int8_path): - quantize_dynamic(model_fp32_path, model_int8_path) - quant_nodes = {'MatMulInteger' : 2} + quantize_dynamic(model_fp32_path, model_int8_path, + activation_type=activation_type, weight_type=weight_type, extra_options=extra_options) + quant_nodes = {'MatMulInteger': 2} check_op_type_count(self, model_int8_path, **quant_nodes) - check_model_correctness(self, model_fp32_path, model_int8_path, {'input': np.random.rand(5,10).astype(np.float32)}) + qnode_io_qtypes = {'MatMulInteger': [['i', 2, activation_proto_qtype]]} + check_qtype_by_node_type(self, model_int8_path, qnode_io_qtypes) + data_reader.rewind() + check_model_correctness(self, model_fp32_path, model_int8_path, {'input': np.random.rand(5, 10).astype(np.float32)}) def dynamic_attention_quant_test(self, model_fp32_path, model_int8_path, per_channel, reduce_range): quantize_dynamic(model_fp32_path, model_int8_path, per_channel=per_channel, reduce_range=reduce_range) - quant_nodes = {'QAttention' : 1, 'MatMulInteger' : 1} + quant_nodes = {'QAttention': 1, 'MatMulInteger': 1} check_op_type_count(self, model_int8_path, **quant_nodes) - check_model_correctness(self, model_fp32_path, model_int8_path, {'input': np.random.rand(1,5,10).astype(np.float32)}) + check_model_correctness(self, model_fp32_path, model_int8_path, {'input': np.random.rand(1, 5, 10).astype(np.float32)}) - def test_quantize_reshape(self): + def test_quantize_gemm(self): np.random.seed(1) model_fp32_path = 'gemm_fp32.onnx' - model_int8_path = 'gemm_fp32.quant.onnx' self.construct_model_gemm(model_fp32_path) + data_reader = self.input_feeds(1, {'input': [5, 10]}) - self.static_quant_test(model_fp32_path, model_int8_path) - self.static_quant_test_qdq(model_fp32_path, model_int8_path) - self.dynamic_quant_test(model_fp32_path, model_int8_path) + self.static_quant_test(model_fp32_path, 
data_reader, activation_type=QuantType.QUInt8, weight_type=QuantType.QUInt8) + self.static_quant_test_qdq(model_fp32_path, data_reader, activation_type=QuantType.QUInt8, weight_type=QuantType.QUInt8) + self.dynamic_quant_test(model_fp32_path, data_reader, activation_type=QuantType.QUInt8, weight_type=QuantType.QUInt8) + + def test_quantize_gemm_s8s8(self): + np.random.seed(1) + model_fp32_path = 'gemm_fp32.onnx' + self.construct_model_gemm(model_fp32_path) + data_reader = self.input_feeds(1, {'input': [5, 10]}) + + self.static_quant_test(model_fp32_path, data_reader, activation_type=QuantType.QInt8, weight_type=QuantType.QInt8, + extra_options={'ActivationSymmetric': True}) + self.static_quant_test_qdq(model_fp32_path, data_reader, activation_type=QuantType.QInt8, weight_type=QuantType.QInt8, + extra_options={'ActivationSymmetric': True}) + self.dynamic_quant_test(model_fp32_path, data_reader, activation_type=QuantType.QInt8, weight_type=QuantType.QInt8, + extra_options={'ActivationSymmetric': True}) def test_quantize_attention(self): np.random.seed(1) @@ -182,5 +209,6 @@ class TestOpGEMM(unittest.TestCase): self.dynamic_attention_quant_test(model_fp32_path, model_int8_path, False, True) self.dynamic_attention_quant_test(model_fp32_path, model_int8_path, False, False) + if __name__ == '__main__': unittest.main() diff --git a/onnxruntime/test/python/quantization/test_op_maxpool.py b/onnxruntime/test/python/quantization/test_op_maxpool.py index 9a891f10b9..f474c9a547 100644 --- a/onnxruntime/test/python/quantization/test_op_maxpool.py +++ b/onnxruntime/test/python/quantization/test_op_maxpool.py @@ -10,8 +10,8 @@ import unittest import onnx import numpy as np from onnx import helper, TensorProto -from onnxruntime.quantization import quantize_static, QuantFormat -from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_op_nodes +from onnxruntime.quantization import quantize_static, QuantFormat, QuantType +from op_test_utils import 
TestDataFeeds, check_model_correctness, check_op_type_count, check_op_nodes, check_qtype_by_node_type class TestOpMaxPool(unittest.TestCase): @@ -54,40 +54,55 @@ class TestOpMaxPool(unittest.TestCase): graph = helper.make_graph([conv_node, identity_node, maxpool_node], 'TestOpQuantizerMaxPool_test_model', [input_tensor], [identity_out, output_tensor], initializer=initializers) model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 14)]) - model.ir_version = 7 # use stable onnx ir version + model.ir_version = 7 # use stable onnx ir version onnx.save(model, output_model_path) - def test_quantize_maxpool(self): + def quantize_maxpool_test(self, activation_type, weight_type, extra_options={}): np.random.seed(1) - model_fp32_path = 'maxpool_fp32.onnx' - model_uint8_path = 'maxpool_uint8.onnx' - model_uint8_qdq_path = 'maxpool_uint8_qdq.onnx' - self.construct_model_conv_maxpool(model_fp32_path, [1, 2, 26, 42], [3, 2, 3, 3], [1, 3, 24, 40], {'kernel_shape': [3, 3]}, [1, 3, 22, 38]) + data_reader = self.input_feeds(1, {'input': [1, 2, 26, 42]}) + + activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8 + activation_type_str = 'u8' if (activation_type == QuantType.QUInt8) else 's8' + weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8' + model_q8_path = 'maxpool_{}{}.onnx'.format(activation_type_str, weight_type_str) + model_q8_qdq_path = 'maxpool_dqd_{}{}.onnx'.format(activation_type_str, weight_type_str) # Verify QOperator mode - data_reader = self.input_feeds(1, {'input': [1, 2, 26, 42]}) - quantize_static(model_fp32_path, model_uint8_path, data_reader) - - # make sure maxpool become xint8 operator, its input name could tell that - check_op_nodes(self, model_uint8_path, lambda node: (node.name != "maxpool_node" or node.input[0] != 'conv_output')) - qnode_counts = {'QLinearConv': 1, 'QuantizeLinear': 1, 'DequantizeLinear': 2, 'MaxPool': 1} - check_op_type_count(self, model_uint8_path, 
**qnode_counts) data_reader.rewind() - check_model_correctness(self, model_fp32_path, model_uint8_path, data_reader.get_next()) + quantize_static(model_fp32_path, model_q8_path, data_reader, + activation_type=activation_type, weight_type=weight_type, extra_options=extra_options) + # make sure maxpool become xint8 operator, its input name could tell that + check_op_nodes(self, model_q8_path, lambda node: (node.name != "maxpool_node" or node.input[0] != 'conv_output')) + qnode_counts = {'QLinearConv': 1, 'QuantizeLinear': 1, 'DequantizeLinear': 2, 'MaxPool': 1} + check_op_type_count(self, model_q8_path, **qnode_counts) + qnode_io_qtypes = {'QuantizeLinear': [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]} + qnode_io_qtypes.update({'DequantizeLinear': [['i', 2, activation_proto_qtype]]}) + check_qtype_by_node_type(self, model_q8_path, qnode_io_qtypes) + data_reader.rewind() + check_model_correctness(self, model_fp32_path, model_q8_path, data_reader.get_next()) # Verify QDQ mode data_reader.rewind() - quantize_static(model_fp32_path, model_uint8_qdq_path, data_reader, quant_format=QuantFormat.QDQ) + quantize_static(model_fp32_path, model_q8_qdq_path, data_reader, quant_format=QuantFormat.QDQ, + activation_type=activation_type, weight_type=weight_type, extra_options=extra_options) qdqnode_counts = {'Conv': 1, 'QuantizeLinear': 3, 'DequantizeLinear': 4, 'MaxPool': 1} - check_op_type_count(self, model_uint8_qdq_path, **qdqnode_counts) + check_op_type_count(self, model_q8_qdq_path, **qdqnode_counts) + qnode_io_qtypes = {'QuantizeLinear': [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]} + qnode_io_qtypes.update({'DequantizeLinear': [['i', 2, activation_proto_qtype]]}) + check_qtype_by_node_type(self, model_q8_qdq_path, qnode_io_qtypes) data_reader.rewind() - check_model_correctness(self, model_fp32_path, model_uint8_qdq_path, data_reader.get_next()) + check_model_correctness(self, model_fp32_path, model_q8_qdq_path, 
data_reader.get_next()) + def test_quantize_maxpool(self): + self.quantize_maxpool_test(QuantType.QUInt8, QuantType.QUInt8, extra_options={ }) + + def test_quantize_maxpool_s8s8(self): + self.quantize_maxpool_test(QuantType.QInt8, QuantType.QInt8, extra_options={'ActivationSymmetric': True}) if __name__ == '__main__': unittest.main() diff --git a/onnxruntime/test/python/quantization/test_op_pad.py b/onnxruntime/test/python/quantization/test_op_pad.py index f83d97757e..b34996a9b8 100644 --- a/onnxruntime/test/python/quantization/test_op_pad.py +++ b/onnxruntime/test/python/quantization/test_op_pad.py @@ -10,8 +10,8 @@ import unittest import onnx import numpy as np from onnx import helper, TensorProto -from onnxruntime.quantization import quantize_static, quantize_dynamic -from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count +from onnxruntime.quantization import quantize_static, quantize_dynamic, QuantType +from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_qtype_by_node_type class TestOpQuatizerPad(unittest.TestCase): @@ -51,7 +51,7 @@ class TestOpQuatizerPad(unittest.TestCase): graph = helper.make_graph([pad_node], 'TestOpQuantizerPad_test_model', [input_tensor], [output_tensor], initializer=initializers) model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) - model.ir_version = 7 # use stable onnx ir version + model.ir_version = 7 # use stable onnx ir version onnx.save(model, output_model_path) @@ -91,14 +91,17 @@ class TestOpQuatizerPad(unittest.TestCase): graph = helper.make_graph([conv_node, identity_node, pad_node], 'TestOpQuantizerPad_test_model', [input_tensor], [identity_out, output_tensor], initializer=initializers) model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) - model.ir_version = 7 # use stable onnx ir version + model.ir_version = 7 # use stable onnx ir version onnx.save(model, output_model_path) - def quantize_model(self, 
model_fp32_path, model_i8_path, data_reader=None): + def quantize_model(self, model_fp32_path, model_i8_path, data_reader=None, + activation_type=QuantType.QUInt8, weight_type=QuantType.QUInt8, extra_options={}): if data_reader is not None: - quantize_static(model_fp32_path, model_i8_path, data_reader, reduce_range=True) + quantize_static(model_fp32_path, model_i8_path, data_reader, reduce_range=True, + activation_type=activation_type, weight_type=weight_type, extra_options=extra_options) else: - quantize_dynamic(model_fp32_path, model_i8_path, reduce_range=True) + quantize_dynamic(model_fp32_path, model_i8_path, reduce_range=True, + activation_type=activation_type, weight_type=weight_type, extra_options=extra_options) def verify_should_not_trigger(self, quantize_mode='static'): np.random.seed(108) @@ -118,23 +121,39 @@ class TestOpQuatizerPad(unittest.TestCase): def test_dynamic_quantize_no_trigger(self): self.verify_should_not_trigger(quantize_mode='dynamic') - def verify_quantize_with_pad_mode(self, pad_mode, constant_value=None, quantize_mode='static'): + def verify_quantize_with_pad_mode(self, pad_mode, constant_value=None, quantize_mode='static', rtol=0.01, atol=0.05, + activation_type=QuantType.QUInt8, weight_type=QuantType.QUInt8, extra_options={}): np.random.seed(108) tag_pad_mode = pad_mode if pad_mode is not None else 'none' tag_constant_value = '' if constant_value is None else '_value' model_fp32_path = 'qop_pad_{}_fp32_{}{}.onnx'.format(quantize_mode, tag_pad_mode, tag_constant_value) - model_i8_path = 'qop_pad_{}_i8_{}{}.onnx'.format(quantize_mode, tag_pad_mode, tag_constant_value) data_reader = self.input_feeds(1, {'input': [1, 8, 33, 33]}) self.construct_model_conv_pad(model_fp32_path, [1, 8, 33, 33], [16, 8, 3, 3], [1, 16, 31, 31], pad_mode, [0, 0, 1, 2, 0, 0, 3, 4], constant_value=constant_value) - self.quantize_model(model_fp32_path, model_i8_path, None if quantize_mode != 'static' else data_reader) + + activation_proto_qtype = TensorProto.UINT8 
if activation_type == QuantType.QUInt8 else TensorProto.INT8 + activation_type_str = 'u8' if (activation_type == QuantType.QUInt8) else 's8' + weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8' + model_i8_path = 'qop_pad_{}_i8_{}{}_{}{}.onnx'.format( + quantize_mode, tag_pad_mode, tag_constant_value, activation_type_str, weight_type_str) data_reader.rewind() + self.quantize_model(model_fp32_path, model_i8_path, None if quantize_mode != 'static' else data_reader, + activation_type=activation_type, weight_type=weight_type, extra_options=extra_options) # DequantizeLinear=2 means there are one DequantizeLinear Node aftr both conv and pad, # which means pad node is running in quantized semantic. # In dynamic quantize mode, pad operator in fact not quantized as input is fp32. - kwargs = {'DynamicQuantizeLinear': 1} if quantize_mode != 'static' else {'DequantizeLinear': 2, 'QuantizeLinear': 1} + if quantize_mode != 'static': + kwargs = {'DynamicQuantizeLinear': 1} if activation_type == QuantType.QUInt8 else {'QuantizeLinear': 1} + else: + kwargs = {'DequantizeLinear': 2, 'QuantizeLinear': 1} check_op_type_count(self, model_i8_path, **kwargs) - check_model_correctness(self, model_fp32_path, model_i8_path, data_reader.get_next()) + # check node input/output type if such node exists in the graph + qnode_io_qtypes = {'QuantizeLinear': [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]} + qnode_io_qtypes.update({'DequantizeLinear': [['i', 2, activation_proto_qtype]]}) + qnode_io_qtypes.update({'ConvInteger': [['i', 2, activation_proto_qtype]]}) + check_qtype_by_node_type(self, model_i8_path, qnode_io_qtypes) + data_reader.rewind() + check_model_correctness(self, model_fp32_path, model_i8_path, data_reader.get_next(), rtol=rtol, atol=atol) def test_static_mode_edge(self): self.verify_quantize_with_pad_mode('edge', constant_value=None) @@ -148,6 +167,22 @@ class TestOpQuatizerPad(unittest.TestCase): def test_static_mode_constant_value(self): 
self.verify_quantize_with_pad_mode('constant', constant_value=3.75) + def test_static_mode_edge_s8s8(self): + self.verify_quantize_with_pad_mode('edge', constant_value=None, rtol=0.1, atol=0.1, activation_type=QuantType.QInt8, + weight_type=QuantType.QInt8, extra_options={'ActivationSymmetric': True}) + + def test_static_mode_reflect_s8s8(self): + self.verify_quantize_with_pad_mode('reflect', constant_value=None, rtol=0.1, atol=0.1, activation_type=QuantType.QInt8, + weight_type=QuantType.QInt8, extra_options={'ActivationSymmetric': True}) + + def test_static_mode_constant_default_s8s8(self): + self.verify_quantize_with_pad_mode('constant', constant_value=None, rtol=0.1, atol=0.1, activation_type=QuantType.QInt8, + weight_type=QuantType.QInt8, extra_options={'ActivationSymmetric': True}) + + def test_static_mode_constant_value_s8s8(self): + self.verify_quantize_with_pad_mode('constant', constant_value=3.75, rtol=0.1, atol=0.1, activation_type=QuantType.QInt8, + weight_type=QuantType.QInt8, extra_options={'ActivationSymmetric': True}) + def test_dynamic_mode_edge(self): self.verify_quantize_with_pad_mode('edge', constant_value=None, quantize_mode='dynamic') @@ -160,6 +195,23 @@ class TestOpQuatizerPad(unittest.TestCase): def test_dynamic_mode_constant_value(self): self.verify_quantize_with_pad_mode('constant', constant_value=3.75, quantize_mode='dynamic') + # TODO: uncomment following after ConvInteger s8 supported + # def test_dynamic_mode_edge_s8s8(self): + # self.verify_quantize_with_pad_mode('edge', constant_value=None, quantize_mode='dynamic', activation_type=QuantType.QInt8, + # weight_type=QuantType.QInt8, extra_options={'ActivationSymmetric': True}) + + # def test_dynamic_mode_reflect_s8s8(self): + # self.verify_quantize_with_pad_mode('reflect', constant_value=None, quantize_mode='dynamic', activation_type=QuantType.QInt8, + # weight_type=QuantType.QInt8, extra_options={'ActivationSymmetric': True}) + + # def test_dynamic_mode_constant_default_s8s8(self): + 
# self.verify_quantize_with_pad_mode('constant', constant_value=None, quantize_mode='dynamic', activation_type=QuantType.QInt8, + # weight_type=QuantType.QInt8, extra_options={'ActivationSymmetric': True}) + + # def test_dynamic_mode_constant_value_s8s8(self): + # self.verify_quantize_with_pad_mode('constant', constant_value=3.75, quantize_mode='dynamic', activation_type=QuantType.QInt8, + # weight_type=QuantType.QInt8, extra_options={'ActivationSymmetric': True}) + if __name__ == '__main__': unittest.main() diff --git a/onnxruntime/test/python/quantization/test_op_pooling.py b/onnxruntime/test/python/quantization/test_op_pooling.py index bb40ff9a8f..ee2bff4e02 100644 --- a/onnxruntime/test/python/quantization/test_op_pooling.py +++ b/onnxruntime/test/python/quantization/test_op_pooling.py @@ -10,8 +10,8 @@ import unittest import onnx import numpy as np from onnx import helper, TensorProto -from onnxruntime.quantization import quantize_static, QuantFormat -from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_op_nodes +from onnxruntime.quantization import quantize_static, QuantFormat, QuantType +from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_op_nodes, check_qtype_by_node_type class TestOpAveragePool(unittest.TestCase): @@ -57,34 +57,50 @@ class TestOpAveragePool(unittest.TestCase): model.ir_version = 7 # use stable onnx ir version onnx.save(model, output_model_path) - def test_quantize_avgpool(self): + def quantize_avgpool_test(self, activation_type, weight_type, extra_options = {}): np.random.seed(1) - model_fp32_path = 'avgpool_fp32.onnx' - model_uint8_path = 'avgpool_uint8.onnx' - model_uint8_qdq_path = 'avgpool_uint8_qdq.onnx' - self.construct_model_conv_avgpool(model_fp32_path, [1, 2, 26, 42], [3, 2, 3, 3], [1, 3, 24, 40], {'kernel_shape': [3, 3]}, [1, 3, 22, 38]) + data_reader = self.input_feeds(1, {'input': [1, 2, 26, 42]}) + + activation_proto_qtype = TensorProto.UINT8 if 
activation_type == QuantType.QUInt8 else TensorProto.INT8 + activation_type_str = 'u8' if (activation_type == QuantType.QUInt8) else 's8' + weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8' + model_q8_path = 'avgpool_{}{}.onnx'.format(activation_type_str, weight_type_str) + model_q8_qdq_path = 'avgpool_qdq_{}{}.onnx'.format(activation_type_str, weight_type_str) # Verify QOperator mode - data_reader = self.input_feeds(1, {'input': [1, 2, 26, 42]}) - quantize_static(model_fp32_path, model_uint8_path, data_reader) - qnode_counts = {'QLinearConv': 1, 'QuantizeLinear': 1, 'DequantizeLinear': 2, 'QLinearAveragePool': 1} - check_op_type_count(self, model_uint8_path, **qnode_counts) data_reader.rewind() - check_model_correctness(self, model_fp32_path, model_uint8_path, data_reader.get_next()) + quantize_static(model_fp32_path, model_q8_path, data_reader, + activation_type = activation_type, weight_type = weight_type, extra_options = extra_options) + qnode_counts = {'QLinearConv': 1, 'QuantizeLinear': 1, 'DequantizeLinear': 2, 'QLinearAveragePool': 1} + check_op_type_count(self, model_q8_path, **qnode_counts) + qnode_io_qtypes = {'QuantizeLinear' : [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]} + qnode_io_qtypes.update({'QLinearConv' : [['i', 2, activation_proto_qtype], ['i', 7, activation_proto_qtype], ['o', 0, activation_proto_qtype]]}) + qnode_io_qtypes.update({'QLinearAveragePool' : [['i', 4, activation_proto_qtype]]}) # shape info note workig on custome ops + check_qtype_by_node_type(self, model_q8_path, qnode_io_qtypes) + data_reader.rewind() + check_model_correctness(self, model_fp32_path, model_q8_path, data_reader.get_next()) # Verify QDQ mode data_reader.rewind() - quantize_static(model_fp32_path, model_uint8_qdq_path, data_reader, quant_format=QuantFormat.QDQ) + quantize_static(model_fp32_path, model_q8_qdq_path, data_reader, quant_format=QuantFormat.QDQ, + activation_type = activation_type, weight_type = weight_type, 
extra_options = extra_options) qdqnode_counts = {'Conv': 1, 'QuantizeLinear': 3, 'DequantizeLinear': 4, 'AveragePool': 1} - check_op_type_count(self, model_uint8_qdq_path, **qdqnode_counts) + check_op_type_count(self, model_q8_qdq_path, **qdqnode_counts) + qnode_io_qtypes = {'QuantizeLinear' : [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]} + check_qtype_by_node_type(self, model_q8_qdq_path, qnode_io_qtypes) data_reader.rewind() - check_model_correctness(self, model_fp32_path, model_uint8_qdq_path, data_reader.get_next()) + check_model_correctness(self, model_fp32_path, model_q8_qdq_path, data_reader.get_next()) + def test_quantize_avgpool(self): + self.quantize_avgpool_test(QuantType.QUInt8, QuantType.QUInt8) + + def test_quantize_avgpool_s8s8(self): + self.quantize_avgpool_test(QuantType.QInt8, QuantType.QInt8, extra_options = {'ActivationSymmetric' : True}) if __name__ == '__main__': unittest.main() diff --git a/onnxruntime/test/python/quantization/test_op_reshape.py b/onnxruntime/test/python/quantization/test_op_reshape.py index a457a21570..0397d1a9c2 100644 --- a/onnxruntime/test/python/quantization/test_op_reshape.py +++ b/onnxruntime/test/python/quantization/test_op_reshape.py @@ -10,8 +10,8 @@ import unittest import onnx import numpy as np from onnx import helper, TensorProto -from onnxruntime.quantization import quantize_static, QuantFormat -from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_op_nodes +from onnxruntime.quantization import quantize_static, QuantFormat, QuantType +from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_op_nodes, check_qtype_by_node_type class TestOpReshape(unittest.TestCase): @@ -67,35 +67,51 @@ class TestOpReshape(unittest.TestCase): onnx.save(model, output_model_path) - def test_quantize_reshape(self): + def quantize_reshape_test(self, activation_type, weight_type, extra_options = {}): np.random.seed(1) model_fp32_path = 
'reshape_fp32.onnx' - model_uint8_path = 'reshape_uint8.onnx' - model_uint8_qdq_path = 'reshape_uint8_qdq.onnx' self.construct_model_matmul_reshape(model_fp32_path, [3, 7], [7, 3], [1, 9]) + activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8 + activation_type_str = 'u8' if (activation_type == QuantType.QUInt8) else 's8' + weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8' + model_uint8_path = 'reshape_{}{}.onnx'.format(activation_type_str, weight_type_str) + model_uint8_qdq_path = 'reshape_{}{}_qdq.onnx'.format(activation_type_str, weight_type_str) + # Verify QOperator mode data_reader = self.input_feeds(1, {'input': [3, 7]}) - quantize_static(model_fp32_path, model_uint8_path, data_reader) + quantize_static(model_fp32_path, model_uint8_path, data_reader, + activation_type = activation_type, weight_type = weight_type, extra_options = extra_options) # make sure transpose become xint8 operator, its input name could tell that check_op_nodes(self, model_uint8_path, lambda node: (node.name != "reshape_node" or node.input[0] != 'matmul_output')) qnode_counts = {'QLinearMatMul': 1, 'QuantizeLinear': 1, 'DequantizeLinear': 1, 'Reshape': 1} check_op_type_count(self, model_uint8_path, **qnode_counts) + qnode_io_qtypes = {'QuantizeLinear' : [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]} + qnode_io_qtypes.update({'DequantizeLinear' : [['i', 2, activation_proto_qtype]]}) + check_qtype_by_node_type(self, model_uint8_path, qnode_io_qtypes) data_reader.rewind() check_model_correctness(self, model_fp32_path, model_uint8_path, data_reader.get_next()) # Verify QDQ mode data_reader.rewind() - quantize_static(model_fp32_path, model_uint8_qdq_path, data_reader, quant_format=QuantFormat.QDQ) + quantize_static(model_fp32_path, model_uint8_qdq_path, data_reader, quant_format=QuantFormat.QDQ, + activation_type = activation_type, weight_type = weight_type, extra_options = extra_options) 
qdqnode_counts = {'MatMul': 1, 'QuantizeLinear': 3, 'DequantizeLinear': 4, 'Reshape': 1} check_op_type_count(self, model_uint8_qdq_path, **qdqnode_counts) + qnode_io_qtypes = {'QuantizeLinear' : [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]} + check_qtype_by_node_type(self, model_uint8_qdq_path, qnode_io_qtypes) data_reader.rewind() check_model_correctness(self, model_fp32_path, model_uint8_qdq_path, data_reader.get_next()) + def test_quantize_reshape(self): + self.quantize_reshape_test(QuantType.QUInt8, QuantType.QUInt8) + + def test_quantize_reshape_s8s8(self): + self.quantize_reshape_test(QuantType.QInt8, QuantType.QInt8, extra_options = {'ActivationSymmetric' : True}) if __name__ == '__main__': unittest.main() diff --git a/onnxruntime/test/python/quantization/test_op_resize.py b/onnxruntime/test/python/quantization/test_op_resize.py index 66724c38a1..df9371c2c0 100644 --- a/onnxruntime/test/python/quantization/test_op_resize.py +++ b/onnxruntime/test/python/quantization/test_op_resize.py @@ -10,8 +10,8 @@ import unittest import onnx import numpy as np from onnx import helper, TensorProto -from onnxruntime.quantization import quantize_static, QuantFormat -from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_op_nodes +from onnxruntime.quantization import quantize_static, QuantFormat, QuantType +from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_op_nodes, check_qtype_by_node_type class TestOpResize(unittest.TestCase): @@ -81,12 +81,9 @@ class TestOpResize(unittest.TestCase): model.ir_version = 7 # use stable onnx ir version onnx.save(model, output_model_path) - def test_quantize_resize(self): + def quantize_resize_test(self, activation_type, weight_type, extra_options = {}): np.random.seed(1) - model_fp32_path = 'resize_fp32.onnx' - model_uint8_path = 'resize_uint8.onnx' - model_uint8_qdq_path = 'resize_uint8_qdq.onnx' kwargs = 
{'coordinate_transformation_mode': 'asymmetric', 'mode': 'nearest', 'nearest_mode': 'floor'} self.construct_model_conv_resize(model_fp32_path, @@ -95,25 +92,43 @@ class TestOpResize(unittest.TestCase): kwargs, [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0], [1.0, 1.0, 2.0, 2.0], None) + activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8 + activation_type_str = 'u8' if (activation_type == QuantType.QUInt8) else 's8' + weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8' + model_uint8_path = 'resize_{}{}.onnx'.format(activation_type_str, weight_type_str) + model_uint8_qdq_path = 'resize_{}{}_qdq.onnx'.format(activation_type_str, weight_type_str) + # Verify QOperator mode data_reader = self.input_feeds(1, {'input': [1, 2, 26, 42]}) - quantize_static(model_fp32_path, model_uint8_path, data_reader) - + quantize_static(model_fp32_path, model_uint8_path, data_reader, + activation_type = activation_type, weight_type = weight_type, extra_options = extra_options) # make sure resize become xint8 operator, its input name could tell that check_op_nodes(self, model_uint8_path, lambda node: (node.name != "resize_node" or node.input[0] != 'conv_output')) qnode_counts = {'QLinearConv': 1, 'QuantizeLinear': 1, 'DequantizeLinear': 2, 'Resize': 1} check_op_type_count(self, model_uint8_path, **qnode_counts) + qnode_io_qtypes = {'QuantizeLinear' : [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]} + qnode_io_qtypes.update({'DequantizeLinear' : [['i', 2, activation_proto_qtype]]}) + check_qtype_by_node_type(self, model_uint8_path, qnode_io_qtypes) data_reader.rewind() check_model_correctness(self, model_fp32_path, model_uint8_path, data_reader.get_next()) # Verify QDQ mode data_reader.rewind() - quantize_static(model_fp32_path, model_uint8_qdq_path, data_reader, quant_format=QuantFormat.QDQ) + quantize_static(model_fp32_path, model_uint8_qdq_path, data_reader, quant_format=QuantFormat.QDQ, + 
activation_type = activation_type, weight_type = weight_type, extra_options = extra_options) qdqnode_counts = {'Conv': 1, 'QuantizeLinear': 3, 'DequantizeLinear': 4, 'Resize': 1} check_op_type_count(self, model_uint8_qdq_path, **qdqnode_counts) + qnode_io_qtypes = {'QuantizeLinear' : [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]} + check_qtype_by_node_type(self, model_uint8_qdq_path, qnode_io_qtypes) data_reader.rewind() check_model_correctness(self, model_fp32_path, model_uint8_qdq_path, data_reader.get_next()) + def test_quantize_resize(self): + self.quantize_resize_test(QuantType.QUInt8, QuantType.QUInt8) + + # TODO: Uncomment following after resize s8 support is enabled + # def test_quantize_resize_s8s8(self): + # self.quantize_resize_test(QuantType.QInt8, QuantType.QInt8, extra_options = {'ActivationSymmetric' : True}) if __name__ == '__main__': unittest.main() diff --git a/onnxruntime/test/python/quantization/test_op_squeeze_unsqueeze.py b/onnxruntime/test/python/quantization/test_op_squeeze_unsqueeze.py index 579dba9c9c..18252ec7a1 100644 --- a/onnxruntime/test/python/quantization/test_op_squeeze_unsqueeze.py +++ b/onnxruntime/test/python/quantization/test_op_squeeze_unsqueeze.py @@ -10,8 +10,8 @@ import unittest import onnx import numpy as np from onnx import helper, TensorProto -from onnxruntime.quantization import quantize_static, QuantFormat -from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count +from onnxruntime.quantization import quantize_static, QuantFormat, QuantType +from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_qtype_by_node_type class TestOpSqueezeUnsqueeze(unittest.TestCase): @@ -26,8 +26,8 @@ class TestOpSqueezeUnsqueeze(unittest.TestCase): return dr def construct_model_conv_squeezes(self, output_model_path, - conv_input_shape, conv_weight_shape, conv_output_shape, - opset = 13): + conv_input_shape, conv_weight_shape, conv_output_shape, + 
opset=13): # (input) # / | \ # Conv1 conv2 conv3 @@ -55,7 +55,6 @@ class TestOpSqueezeUnsqueeze(unittest.TestCase): conv3_weight_initializer = onnx.numpy_helper.from_array(conv3_weight_arr, name='conv3_weight') conv3_node = onnx.helper.make_node('Conv', ['input', 'conv3_weight'], ['conv3_output'], name='conv3_node') - if (opset >= 13): squeeze_axes_initializer = onnx.numpy_helper.from_array(np.array([0], dtype=np.int64), name='squeeze_axes') squeeze1_node = helper.make_node('Squeeze', ['conv1_output', 'squeeze_axes'], ['squeeze1_output'], name='suqeeze1_node') @@ -66,9 +65,10 @@ class TestOpSqueezeUnsqueeze(unittest.TestCase): add1_node = helper.make_node('Add', ['squeeze1_output', 'squeeze2_output'], ['add1_output'], name='add1_node') if (opset >= 13): - unsqueeze_node = helper.make_node('Unsqueeze', ['add1_output', 'squeeze_axes'], ['unsqueeze_output'], name = 'unsqueeze_node') + unsqueeze_node = helper.make_node('Unsqueeze', ['add1_output', 'squeeze_axes'], [ + 'unsqueeze_output'], name='unsqueeze_node') else: - unsqueeze_node = helper.make_node('Unsqueeze', ['add1_output'], ['unsqueeze_output'], name = 'unsqueeze_node', axes=[0]) + unsqueeze_node = helper.make_node('Unsqueeze', ['add1_output'], ['unsqueeze_output'], name='unsqueeze_node', axes=[0]) output_tensor = helper.make_tensor_value_info('output', TensorProto.FLOAT, conv_output_shape) add2_node = helper.make_node('Add', ['unsqueeze_output', 'conv3_output'], ['output'], name='add2_node') @@ -79,33 +79,43 @@ class TestOpSqueezeUnsqueeze(unittest.TestCase): graph = helper.make_graph([conv1_node, conv2_node, conv3_node, squeeze1_node, squeeze2_node, add1_node, unsqueeze_node, add2_node], 'TestOpSuqeezes_test_model', [input_tensor], [output_tensor], initializer=initializers) model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", opset)]) - model.ir_version = 7 # use stable onnx ir version + model.ir_version = 7 # use stable onnx ir version onnx.save(model, output_model_path) - def 
run_quantize_squeezes_of_opset(self, opset = 13): + def run_quantize_squeezes_of_opset(self, opset=13, activation_type=QuantType.QUInt8, weight_type=QuantType.QUInt8, extra_options={}): np.random.seed(1) model_fp32_path = 'squeezes_opset{}_fp32.onnx'.format(opset) - model_uint8_path = 'squeezes_opset{}_uint8.onnx'.format(opset) - model_uint8_qdq_path = 'squeezes_opset{}_uint8_qdq.onnx'.format(opset) - self.construct_model_conv_squeezes(model_fp32_path, [1, 2, 26, 42], [3, 2, 3, 3], [1, 3, 24, 40], opset=opset) + activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8 + activation_type_str = 'u8' if (activation_type == QuantType.QUInt8) else 's8' + weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8' + model_uint8_path = 'squeezes_opset{}_{}{}.onnx'.format(opset, activation_type_str, weight_type_str) + model_uint8_qdq_path = 'squeezes_opset{}_{}{}_qdq.onnx'.format(opset, activation_type_str, weight_type_str) + # Verify QOperator mode data_reader = self.input_feeds(1, {'input': [1, 2, 26, 42]}) - quantize_static(model_fp32_path, model_uint8_path, data_reader) + quantize_static(model_fp32_path, model_uint8_path, data_reader, + activation_type=activation_type, weight_type=weight_type, extra_options=extra_options) # make sure squeezes become xint8 operator, its input name could tell that qnode_counts = {'QuantizeLinear': 1, 'DequantizeLinear': 1} check_op_type_count(self, model_uint8_path, **qnode_counts) + qnode_io_qtypes = {'QuantizeLinear': [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]} + qnode_io_qtypes.update({'DequantizeLinear': [['i', 2, activation_proto_qtype]]}) + check_qtype_by_node_type(self, model_uint8_path, qnode_io_qtypes) data_reader.rewind() check_model_correctness(self, model_fp32_path, model_uint8_path, data_reader.get_next(), rtol=0.01, atol=0.5) # Verify QDQ mode data_reader.rewind() - quantize_static(model_fp32_path, model_uint8_qdq_path, data_reader, 
quant_format=QuantFormat.QDQ) + quantize_static(model_fp32_path, model_uint8_qdq_path, data_reader, quant_format=QuantFormat.QDQ, + activation_type=activation_type, weight_type=weight_type, extra_options=extra_options) qdqnode_counts = {'Conv': 3, 'QuantizeLinear': 9, 'DequantizeLinear': 12} check_op_type_count(self, model_uint8_qdq_path, **qdqnode_counts) + qnode_io_qtypes = {'QuantizeLinear': [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]} + check_qtype_by_node_type(self, model_uint8_qdq_path, qnode_io_qtypes) data_reader.rewind() check_model_correctness(self, model_fp32_path, model_uint8_qdq_path, data_reader.get_next(), rtol=0.01, atol=0.5) @@ -113,5 +123,10 @@ class TestOpSqueezeUnsqueeze(unittest.TestCase): self.run_quantize_squeezes_of_opset(11) self.run_quantize_squeezes_of_opset(13) + def test_quantize_squeeze_unsqueeze_s8s8(self): + self.run_quantize_squeezes_of_opset(11, QuantType.QInt8, QuantType.QInt8, extra_options={'ActivationSymmetric': True}) + self.run_quantize_squeezes_of_opset(13, QuantType.QInt8, QuantType.QInt8, extra_options={'ActivationSymmetric': True}) + + if __name__ == '__main__': unittest.main() diff --git a/onnxruntime/test/python/quantization/test_op_transpose.py b/onnxruntime/test/python/quantization/test_op_transpose.py index f1dd8a780e..a83ec6022d 100644 --- a/onnxruntime/test/python/quantization/test_op_transpose.py +++ b/onnxruntime/test/python/quantization/test_op_transpose.py @@ -10,8 +10,8 @@ import unittest import onnx import numpy as np from onnx import helper, TensorProto -from onnxruntime.quantization import quantize_static, QuantFormat -from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_op_nodes +from onnxruntime.quantization import quantize_static, QuantFormat, QuantType +from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_op_nodes, check_qtype_by_node_type class TestOpTranspose(unittest.TestCase): @@ -62,32 +62,47 @@ 
class TestOpTranspose(unittest.TestCase): onnx.save(model, output_model_path) - def test_quantize_transpose(self): + def quantize_transpose_test(self, activation_type, weight_type, extra_options = {}): np.random.seed(1) model_fp32_path = 'transpose_fp32.onnx' - model_uint8_path = 'transpose_uint8.onnx' - model_uint8_qdq_path = 'transpose_uint8_qdq.onnx' - self.construct_model_matmul_transpose(model_fp32_path, [3, 7], [7, 5], [5, 3]) + activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8 + activation_type_str = 'u8' if (activation_type == QuantType.QUInt8) else 's8' + weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8' + model_uint8_path = 'transpose_{}{}.onnx'.format(activation_type_str, weight_type_str) + model_uint8_qdq_path = 'transpose_{}{}_qdq.onnx'.format(activation_type_str, weight_type_str) + # Verify QOperator model data_reader = self.input_feeds(1, {'input': [3, 7]}) - quantize_static(model_fp32_path, model_uint8_path, data_reader) + quantize_static(model_fp32_path, model_uint8_path, data_reader, + activation_type = activation_type, weight_type = weight_type, extra_options = extra_options) # make sure transpose become xint8 operator, its input name could tell that check_op_nodes(self, model_uint8_path, lambda node: (node.name != "transpose_node" or node.input[0] != 'matmul_output')) qnode_counts = {'QLinearMatMul': 1, 'QuantizeLinear': 1, 'DequantizeLinear': 1, 'Transpose': 1} check_op_type_count(self, model_uint8_path, **qnode_counts) + qnode_io_qtypes = {'QuantizeLinear' : [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]} + qnode_io_qtypes.update({'DequantizeLinear' : [['i', 2, activation_proto_qtype]]}) + check_qtype_by_node_type(self, model_uint8_path, qnode_io_qtypes) data_reader.rewind() check_model_correctness(self, model_fp32_path, model_uint8_path, data_reader.get_next()) # Verify QDQ model data_reader.rewind() - quantize_static(model_fp32_path, 
model_uint8_qdq_path, data_reader, quant_format=QuantFormat.QDQ) + quantize_static(model_fp32_path, model_uint8_qdq_path, data_reader, quant_format=QuantFormat.QDQ, + activation_type = activation_type, weight_type = weight_type, extra_options = extra_options) qdqnode_counts = {'MatMul': 1, 'QuantizeLinear': 3, 'DequantizeLinear': 4, 'Transpose': 1} check_op_type_count(self, model_uint8_qdq_path, **qdqnode_counts) + qnode_io_qtypes = {'QuantizeLinear' : [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]} + check_qtype_by_node_type(self, model_uint8_qdq_path, qnode_io_qtypes) data_reader.rewind() check_model_correctness(self, model_fp32_path, model_uint8_qdq_path, data_reader.get_next()) + def test_quantize_transpose(self): + self.quantize_transpose_test(QuantType.QUInt8, QuantType.QUInt8) + + def test_quantize_transpose_s8s8(self): + self.quantize_transpose_test(QuantType.QInt8, QuantType.QInt8, extra_options = {'ActivationSymmetric' : True}) if __name__ == '__main__': unittest.main() diff --git a/onnxruntime/test/python/quantization/test_qdq.py b/onnxruntime/test/python/quantization/test_qdq.py index d8d4280e37..2aca1eacdb 100644 --- a/onnxruntime/test/python/quantization/test_qdq.py +++ b/onnxruntime/test/python/quantization/test_qdq.py @@ -104,7 +104,7 @@ class TestQDQExtraOptions(unittest.TestCase): # This QuantizeLinear node should be followed by Add1 if node.name == 'P_QuantizeLinear': qdq_added_to_node_output_flag = True - self.assertTrue(node.input[0] is 'P') + self.assertTrue(node.input[0] == 'P') self.assertTrue(qdq_added_to_node_output_flag)