Add quantization tool and its unittest with s8s8 support (#10007)

* Add quantization tool with s8s8 support * Add unittest for existing s8s8 support operators * Comment out the ready unittest for upcoming s8s8 operators (ConvInteger and Resize) * Minor change on quantization tools * Use a different s8 min value depending on weight or activation. * Use the same qmin for reduced-range s8.
2026-07-23 19:32:23 +00:00 · 2021-12-10 16:40:01 -08:00 · 2021-12-10 16:40:01 -08:00 · b000ec91cc
commit b000ec91cc
parent 7a70d22150
15 changed files with 418 additions and 176 deletions
--- a/onnxruntime/python/tools/quantization/operators/concat.py
+++ b/onnxruntime/python/tools/quantization/operators/concat.py
@ -12,7 +12,8 @@ class QLinearConcat(QuantOperatorBase):

        data_found, output_scale_name, output_zp_name, _, _ = \
            self.quantizer._get_quantization_params(node.output[0])
-        (q_input_names, zero_point_names, scale_names, nodes) = self.quantizer.quantize_inputs(node, [*range(0, len(node.input))])
+        (q_input_names, zero_point_names, scale_names, nodes) = \
+            self.quantizer.quantize_inputs(node, [*range(0, len(node.input))], initializer_use_weight_qType=False)
        if not data_found or q_input_names is None:
            return super().quantize()

--- a/onnxruntime/python/tools/quantization/quant_utils.py
+++ b/onnxruntime/python/tools/quantization/quant_utils.py
@ -176,7 +176,7 @@ def quantize_data(data, qType, symmetric, reduce_range=False):
    if len(data):
        rmin = min(data)
        rmax = max(data)
-        qmin, qmax = get_qmin_qmax_for_qType(qType, reduce_range)
+        qmin, qmax = get_qmin_qmax_for_qType(qType, reduce_range, for_weight=True)

        zero_point, scale = compute_scale_zp(rmin, rmax, qmin, qmax, symmetric)

@ -184,7 +184,7 @@ def quantize_data(data, qType, symmetric, reduce_range=False):

    return rmin, rmax, zero_point, scale, quantized_data

-def get_qmin_qmax_for_qType(qType, reduce_range=False):
+def get_qmin_qmax_for_qType(qType, reduce_range=False, for_weight=False):
    '''
    Return qmin and qmax, the minimum and maximum value representable by the given qType
    :parameter qType: onnx.onnx_pb.TensorProto.UINT8 or onnx.onnx_pb.TensorProto.UINT8
@ -193,18 +193,21 @@ def get_qmin_qmax_for_qType(qType, reduce_range=False):
    if qType == onnx_proto.TensorProto.UINT8:
        (qmin, qmax) = (0,127) if reduce_range else (0,255)
    elif qType == onnx_proto.TensorProto.INT8:
-        (qmin, qmax) = (-64,64) if reduce_range else (-127,127)
+        if for_weight:
+            (qmin, qmax) = (-64,64) if reduce_range else (-127,127)
+        else:
+            (qmin, qmax) = (-64,64) if reduce_range else (-128,127)
    else:
        raise ValueError("Unexpected data type {} requested. Only INT8 and UINT8 are supported.".format(qType))
    return qmin, qmax

-def get_qrange_for_qType(qType, reduce_range=False):
+def get_qrange_for_qType(qType, reduce_range=False, for_weight=False):
    '''
    Helper function to get the quantization range for a type.
        parameter qType: quantization type.
        return: quantization range.
    '''
-    qmin, qmax = get_qmin_qmax_for_qType(qType, reduce_range)
+    qmin, qmax = get_qmin_qmax_for_qType(qType, reduce_range, for_weight=for_weight)
    return  qmax - qmin

 class QuantizedInitializer:
--- a/onnxruntime/test/python/quantization/op_test_utils.py
+++ b/onnxruntime/test/python/quantization/op_test_utils.py
@ -34,7 +34,6 @@ def InputFeedsNegOneZeroOne(n, name2shape):
    dr = TestDataFeeds(input_data_list)
    return dr

-
 def check_op_type_order(testcase, model_to_check, ops):
    if isinstance(model_to_check, string_types):
        model = onnx.load(model_to_check)
@ -78,3 +77,28 @@ def check_op_nodes(testcase, model_path, node_checker):
    model = onnx.load(Path(model_path))
    for node in model.graph.node:
        testcase.assertTrue(node_checker(node))
+
+def check_qtype_by_node_type(testcase, model_to_check, check_list):
+    if isinstance(model_to_check, string_types):
+        model = onnx.load(model_to_check)
+    elif isinstance(model_to_check, onnx.ModelProto):
+        model = model_to_check
+    model = onnx.shape_inference.infer_shapes(model)
+    value_infos = {vi.name: vi for vi in model.graph.value_info}
+    value_infos.update({ot.name: ot for ot in model.graph.output})
+    value_infos.update({it.name: it for it in model.graph.input})
+    initializers = {init.name : init for init in model.graph.initializer}
+
+    for node in model.graph.node:
+        if node.op_type in check_list:
+            input_output_check_list = check_list[node.op_type]
+            for check_item in input_output_check_list:
+                tensor_name = node.input[check_item[1]] if check_item[0] == 'i' else node.output[check_item[1]]
+                testcase.assertTrue((tensor_name in value_infos) or (tensor_name in initializers))
+                if tensor_name in value_infos:
+                    vi = value_infos[tensor_name]
+                    testcase.assertTrue(vi.type.HasField('tensor_type'))
+                    testcase.assertTrue(vi.type.tensor_type.elem_type == check_item[2])
+                else: #if (tensor_name in initializers):
+                    init = initializers[tensor_name]
+                    testcase.assertTrue(init.data_type == check_item[2])
--- a/onnxruntime/test/python/quantization/test_conv_dynamic.py
+++ b/onnxruntime/test/python/quantization/test_conv_dynamic.py
@ -11,17 +11,18 @@ import onnx
 import onnxruntime
 import numpy as np
 from onnx import helper, TensorProto, numpy_helper
-from onnxruntime.quantization import quantize_dynamic
-from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_op_type_order
+from onnxruntime.quantization import quantize_dynamic, QuantType
+from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_op_type_order, check_qtype_by_node_type


 def generate_input_initializer(tensor_shape, tensor_dtype, input_name):
-  '''
-  Helper function to generate initializers for test inputs
-  '''
-  tensor = np.random.normal(0, 0.3, tensor_shape).astype(tensor_dtype)
-  init = numpy_helper.from_array(tensor, input_name)
-  return init
+    '''
+    Helper function to generate initializers for test inputs
+    '''
+    tensor = np.random.normal(0, 0.3, tensor_shape).astype(tensor_dtype)
+    init = numpy_helper.from_array(tensor, input_name)
+    return init
+

 class TestONNXModel(unittest.TestCase):
    def construct_model(self, model_path):
@ -52,19 +53,31 @@ class TestONNXModel(unittest.TestCase):
        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
        onnx.save(model, model_path)

-    def dynamic_quant_conv(self, model_fp32_path, model_int8_path):
-        quantize_dynamic(model_fp32_path, model_int8_path)
-        quant_nodes = {'ConvInteger' : 2}
+    def dynamic_quant_conv_test(self, activation_type, weight_type, extra_options={}):
+        np.random.seed(1)
+        model_fp32_path = 'conv_bias.fp32.onnx'
+        self.construct_model(model_fp32_path)
+
+        activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8
+        activation_type_str = 'u8' if (activation_type == QuantType.QUInt8) else 's8'
+        weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8'
+        model_int8_path = 'conv_bias.quant.{}{}.onnx'.format(activation_type_str, weight_type_str)
+
+        quantize_dynamic(model_fp32_path, model_int8_path,
+                         activation_type=activation_type, weight_type=weight_type, extra_options=extra_options)
+        quant_nodes = {'ConvInteger': 2}
        check_op_type_count(self, model_int8_path, **quant_nodes)
+        qnode_io_qtypes = {'ConvInteger': [['i', 2, activation_proto_qtype]]}
+        check_qtype_by_node_type(self, model_int8_path, qnode_io_qtypes)
        check_model_correctness(self, model_fp32_path, model_int8_path, {'input': np.random.rand(4, 2, 8, 8).astype(np.float32)})

    def test_quant_conv(self):
-        np.random.seed(1)
-        model_fp32_path = 'conv_bias.fp32.onnx'
-        model_int8_path = 'conv_bias.quant.onnx'
-        self.construct_model(model_fp32_path)
+        self.dynamic_quant_conv_test(QuantType.QUInt8, QuantType.QUInt8, extra_options={})
+
+    # TODO: uncomment following after ConvInteger s8 is supported
+    # def test_quant_conv_s8s8(self):
+    #    self.dynamic_quant_conv_test(QuantType.QInt8, QuantType.QInt8, extra_options={'ActivationSymmetric': True})

-        self.dynamic_quant_conv(model_fp32_path, model_int8_path)

 if __name__ == '__main__':
    unittest.main()
--- a/onnxruntime/test/python/quantization/test_op_concat.py
+++ b/onnxruntime/test/python/quantization/test_op_concat.py
@ -7,8 +7,8 @@
 import unittest
 import numpy as np
 from onnx import helper, TensorProto, numpy_helper, save
-from onnxruntime.quantization import quantize_static, QuantFormat
-from op_test_utils import InputFeedsNegOneZeroOne, check_model_correctness, check_op_type_count
+from onnxruntime.quantization import quantize_static, QuantFormat, QuantType
+from op_test_utils import InputFeedsNegOneZeroOne, check_model_correctness, check_op_type_count, check_qtype_by_node_type


 class TestONNXModel(unittest.TestCase):
@ -47,7 +47,7 @@ class TestONNXModel(unittest.TestCase):
        conv3_node = helper.make_node('Conv', ['input', 'conv3_weight'], ['conv3_output'], name='conv3_node')

        concat_node = helper.make_node('Concat', ['conv1_output', 'conv2_output', 'conv3_output'], [
-                                            'concat_output'], name='concat_node', axis=1)
+            'concat_output'], name='concat_node', axis=1)

        identity_node = helper.make_node('Identity', ['concat_output'], ['output'], name='identity_node')

@ -57,31 +57,48 @@ class TestONNXModel(unittest.TestCase):
        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
        save(model, model_path)

-    def test_quantize_concat(self):
+    def quantize_concat_test(self, activation_type, weight_type, extra_options={}):
        np.random.seed(1)
-
        model_fp32_path = 'concat_fp32.onnx'
-        model_uint8_path = 'concat_uint8.onnx'
-        model_uint8_qdq_path = 'concat_uint8_qdq.onnx'
-
        self.construct_model(model_fp32_path)
+        data_reader = InputFeedsNegOneZeroOne(1, {'input': [1, 3, 15, 15]})
+
+        activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8
+        activation_type_str = 'u8' if (activation_type == QuantType.QUInt8) else 's8'
+        weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8'
+        model_q8_path = 'concat_{}{}.onnx'.format(activation_type_str, weight_type_str)
+        model_q8_qdq_path = 'concat_{}{}_qdq.onnx'.format(activation_type_str, weight_type_str)

        # Verify QOperator mode
-        data_reader = InputFeedsNegOneZeroOne(1, {'input': [1, 3, 15, 15]})
-        quantize_static(model_fp32_path, model_uint8_path, data_reader)
+        data_reader.rewind()
+        quantize_static(model_fp32_path, model_q8_path, data_reader,
+                        activation_type=activation_type, weight_type=weight_type, extra_options=extra_options)

        qnode_counts = {'QLinearConv': 3, 'QuantizeLinear': 1, 'DequantizeLinear': 1, 'QLinearConcat': 1}
-        check_op_type_count(self, model_uint8_path, **qnode_counts)
+        check_op_type_count(self, model_q8_path, **qnode_counts)
+        qnode_io_qtypes = {'QuantizeLinear': [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]}
+        qnode_io_qtypes.update({'QLinearConcat': [['i', 1, activation_proto_qtype], [
+                               'i', 4, activation_proto_qtype], ['i', 7, activation_proto_qtype]]})
+        check_qtype_by_node_type(self, model_q8_path, qnode_io_qtypes)
        data_reader.rewind()
-        check_model_correctness(self, model_fp32_path, model_uint8_path, data_reader.get_next())
+        check_model_correctness(self, model_fp32_path, model_q8_path, data_reader.get_next())

        # Verify QDQ mode
        data_reader.rewind()
-        quantize_static(model_fp32_path, model_uint8_qdq_path, data_reader, quant_format=QuantFormat.QDQ)
+        quantize_static(model_fp32_path, model_q8_qdq_path, data_reader, quant_format=QuantFormat.QDQ,
+                        activation_type=activation_type, weight_type=weight_type, extra_options=extra_options)
        qdqnode_counts = {'Conv': 3, 'QuantizeLinear': 5, 'DequantizeLinear': 8, 'Concat': 1}
-        check_op_type_count(self, model_uint8_qdq_path, **qdqnode_counts)
+        check_op_type_count(self, model_q8_qdq_path, **qdqnode_counts)
+        qnode_io_qtypes = {'QuantizeLinear': [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]}
+        check_qtype_by_node_type(self, model_q8_qdq_path, qnode_io_qtypes)
        data_reader.rewind()
-        check_model_correctness(self, model_fp32_path, model_uint8_qdq_path, data_reader.get_next())
+        check_model_correctness(self, model_fp32_path, model_q8_qdq_path, data_reader.get_next())
+
+    def test_quantize_concat(self):
+        self.quantize_concat_test(QuantType.QUInt8, QuantType.QUInt8, extra_options={})
+
+    def test_quantize_concat_s8s8(self):
+        self.quantize_concat_test(QuantType.QInt8, QuantType.QInt8, extra_options={'ActivationSymmetric': True})


 if __name__ == '__main__':
--- a/onnxruntime/test/python/quantization/test_op_gavgpool.py
+++ b/onnxruntime/test/python/quantization/test_op_gavgpool.py
@ -10,8 +10,8 @@ import unittest
 import onnx
 import numpy as np
 from onnx import helper, TensorProto
-from onnxruntime.quantization import quantize_static, quantize_dynamic
-from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count
+from onnxruntime.quantization import quantize_static, quantize_dynamic, QuantType
+from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_qtype_by_node_type


 class TestOpGlobalAveragePool(unittest.TestCase):
@ -44,10 +44,10 @@ class TestOpGlobalAveragePool(unittest.TestCase):
        output_name = 'output'
        initializers = []

-        #make 1st GlobalAveragePool node
+        # make 1st GlobalAveragePool node
        gavgpool_node_1 = onnx.helper.make_node('GlobalAveragePool', [input_name], [expand_input])

-        #make Expand node
+        # make Expand node
        expand_shape_name = 'expand_shape'
        initializers.append(onnx.numpy_helper.from_array(np.array(input_shape, dtype=np.int64), name=expand_shape_name))
        expand_node = onnx.helper.make_node('Expand', [expand_input, expand_shape_name], [conv_input])
@ -59,7 +59,7 @@ class TestOpGlobalAveragePool(unittest.TestCase):
        initializers.append(onnx.numpy_helper.from_array(conv_weight_data, name=weight_name))
        conv_node = onnx.helper.make_node('Conv', [conv_input, weight_name], [gavgpool_input_2nd], name=conv_name)

-        #make 1st GlobalAveragePool node
+        # make 2nd GlobalAveragePool node
        gavgpool_node_2 = onnx.helper.make_node('GlobalAveragePool', [gavgpool_input_2nd], [output_name])

        # make graph
@ -69,30 +69,42 @@ class TestOpGlobalAveragePool(unittest.TestCase):
        graph = helper.make_graph([gavgpool_node_1, expand_node, conv_node, gavgpool_node_2], graph_name,
                                  [input_tensor], [output_tensor], initializer=initializers)
        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
-        model.ir_version = 7 # use stable onnx ir version
+        model.ir_version = 7  # use stable onnx ir version

        onnx.save(model, output_model_path)

-    def test_quantize_reshape(self):
+    def quantize_gavgpool_test(self, activation_type, weight_type, extra_options={}):
        np.random.seed(1)
        model_fp32_path = 'gavg_pool_fp32.onnx'
-        model_int8_path = 'gavg_pool_fp32.quant.onnx'
        data_reader = self.input_feeds(1, {'input': [1, 8, 33, 33]})
        self.construct_model_gavgpool(model_fp32_path,
                                      [1, 8, 33, 33],
                                      [16, 8, 3, 3],
                                      [1, 16, 1, 1])
-        quantize_static(model_fp32_path,
-                        model_int8_path,
-                        data_reader)
+
+        activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8
+        activation_type_str = 'u8' if (activation_type == QuantType.QUInt8) else 's8'
+        weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8'
+        model_q8_path = 'gavg_pool_{}{}.onnx'.format(activation_type_str, weight_type_str)
+
        data_reader.rewind()
-        quant_nodes = {'QLinearConv' : 1,
-                       'GlobalAveragePool' : 1,
-                       'QLinearGlobalAveragePool' : 1,
-                       'QuantizeLinear' : 1,
-                       'DequantizeLinear' : 1}
-        check_op_type_count(self, model_int8_path, **quant_nodes)
-        check_model_correctness(self, model_fp32_path, model_int8_path, data_reader.get_next())
+        quantize_static(model_fp32_path, model_q8_path, data_reader,
+                        activation_type=activation_type, weight_type=weight_type, extra_options=extra_options)
+
+        quant_nodes = {'QLinearConv': 1, 'GlobalAveragePool': 1, 'QLinearGlobalAveragePool': 1,
+                       'QuantizeLinear': 1, 'DequantizeLinear': 1}
+        check_op_type_count(self, model_q8_path, **quant_nodes)
+        qnode_io_qtypes = {'QuantizeLinear': [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]}
+        qnode_io_qtypes.update({'QLinearGlobalAveragePool': [['i', 2, activation_proto_qtype], ['i', 4, activation_proto_qtype]]})
+        check_qtype_by_node_type(self, model_q8_path, qnode_io_qtypes)
+        data_reader.rewind()
+        check_model_correctness(self, model_fp32_path, model_q8_path, data_reader.get_next())
+
+    def test_quantize_gavgpool(self):
+        self.quantize_gavgpool_test(QuantType.QUInt8, QuantType.QUInt8, extra_options={})
+
+    def test_quantize_gavgpool_s8s8(self):
+        self.quantize_gavgpool_test(QuantType.QInt8, QuantType.QInt8, extra_options={'ActivationSymmetric': True})


 if __name__ == '__main__':
--- a/onnxruntime/test/python/quantization/test_op_gemm.py
+++ b/onnxruntime/test/python/quantization/test_op_gemm.py
@ -10,8 +10,8 @@ import unittest
 import onnx
 import numpy as np
 from onnx import helper, TensorProto
-from onnxruntime.quantization import quantize_static, quantize_dynamic, QuantFormat
-from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count
+from onnxruntime.quantization import quantize_static, quantize_dynamic, QuantFormat, QuantType
+from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_qtype_by_node_type


 class TestOpGEMM(unittest.TestCase):
@ -46,12 +46,12 @@ class TestOpGEMM(unittest.TestCase):
            bias_data = np.random.normal(0, 0.1, bias_shape).astype(np.float32)
            initializers.append(onnx.numpy_helper.from_array(bias_data, name=bias_name))

-            return onnx.helper.make_node('Gemm', [input_name, weight_name, bias_name], [output_name], alpha=1.0, beta=1.0, transB = 1)
+            return onnx.helper.make_node('Gemm', [input_name, weight_name, bias_name], [output_name], alpha=1.0, beta=1.0, transB=1)
        # make gemm1 node
        gemm1_output_name = "gemm1_output"
        gemm1_node = make_gemm(input_name, [100, 10], 'linear1.weight', [100], 'linear1.bias', gemm1_output_name)

-        #make Clip
+        # make Clip
        clip_min_name = 'clip_min'
        clip_max_name = 'clip_max'
        clip_output_name = 'clip_output'
@ -71,7 +71,7 @@ class TestOpGEMM(unittest.TestCase):
        graph = helper.make_graph([gemm1_node, clip_node, gemm2_node], graph_name,
                                  [input_tensor], [output_tensor], initializer=initializers)
        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
-        model.ir_version = 7 # use stable onnx ir version
+        model.ir_version = 7  # use stable onnx ir version

        onnx.save(model, output_model_path)

@ -121,55 +121,82 @@ class TestOpGEMM(unittest.TestCase):

        onnx.save(model, output_model_path)

-    def static_quant_test(self, model_fp32_path, model_int8_path):
-        data_reader = self.input_feeds(1, {'input': [5, 10]})
-        quantize_static(model_fp32_path,
-                        model_int8_path,
-                        data_reader)
+    def static_quant_test(self, model_fp32_path, data_reader, activation_type, weight_type, extra_options={}):
+        activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8
+        activation_type_str = 'u8' if (activation_type == QuantType.QUInt8) else 's8'
+        weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8'
+        model_int8_path = 'gemm_fp32.quant_{}{}.onnx'.format(activation_type_str, weight_type_str)
+
        data_reader.rewind()
-        quant_nodes = {'QLinearMatMul' : 2,
-                       'QLinearAdd' : 2,
-                       'QuantizeLinear' : 1,
-                       'DequantizeLinear' : 1}
+        quantize_static(model_fp32_path, model_int8_path, data_reader,
+                        activation_type=activation_type, weight_type=weight_type, extra_options=extra_options)
+        quant_nodes = {'QLinearMatMul': 2, 'QLinearAdd': 2, 'QuantizeLinear': 1, 'DequantizeLinear': 1}
        check_op_type_count(self, model_int8_path, **quant_nodes)
+        qnode_io_qtypes = {'QuantizeLinear': [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]}
+        qnode_io_qtypes.update({'DequantizeLinear': [['i', 2, activation_proto_qtype]]})
+        check_qtype_by_node_type(self, model_int8_path, qnode_io_qtypes)
+        data_reader.rewind()
        check_model_correctness(self, model_fp32_path, model_int8_path, data_reader.get_next())

-    def static_quant_test_qdq(self, model_fp32_path, model_int8_path):
-        data_reader = self.input_feeds(1, {'input': [5, 10]})
-        quantize_static(model_fp32_path,
-                        model_int8_path,
-                        data_reader,
-                        quant_format=QuantFormat.QDQ)
+    def static_quant_test_qdq(self, model_fp32_path, data_reader, activation_type, weight_type, extra_options={}):
+        activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8
+        activation_type_str = 'u8' if (activation_type == QuantType.QUInt8) else 's8'
+        weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8'
+        model_int8_path = 'gemm_fp32.quant_dqd_{}{}.onnx'.format(activation_type_str, weight_type_str)
+
        data_reader.rewind()
-        quant_nodes = {'MatMul' : 2,
-                       'Add' : 2,
-                       'QuantizeLinear' : 5,
-                       'DequantizeLinear' : 9}
+        quantize_static(model_fp32_path, model_int8_path, data_reader, quant_format=QuantFormat.QDQ,
+                        activation_type=activation_type, weight_type=weight_type, extra_options=extra_options)
+        quant_nodes = {'MatMul': 2, 'Add': 2, 'QuantizeLinear': 5, 'DequantizeLinear': 9}
        check_op_type_count(self, model_int8_path, **quant_nodes)
+        qnode_io_qtypes = {'QuantizeLinear': [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]}
+        check_qtype_by_node_type(self, model_int8_path, qnode_io_qtypes)
+        data_reader.rewind()
        check_model_correctness(self, model_fp32_path, model_int8_path, data_reader.get_next())

+    def dynamic_quant_test(self, model_fp32_path, data_reader, activation_type, weight_type, extra_options={}):
+        activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8
+        activation_type_str = 'u8' if (activation_type == QuantType.QUInt8) else 's8'
+        weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8'
+        model_int8_path = 'gemm_fp32.quant_dynamic_{}{}.onnx'.format(activation_type_str, weight_type_str)

-    def dynamic_quant_test(self, model_fp32_path, model_int8_path):
-        quantize_dynamic(model_fp32_path, model_int8_path)
-        quant_nodes = {'MatMulInteger' : 2}
+        quantize_dynamic(model_fp32_path, model_int8_path,
+                         activation_type=activation_type, weight_type=weight_type, extra_options=extra_options)
+        quant_nodes = {'MatMulInteger': 2}
        check_op_type_count(self, model_int8_path, **quant_nodes)
-        check_model_correctness(self, model_fp32_path, model_int8_path, {'input': np.random.rand(5,10).astype(np.float32)})
+        qnode_io_qtypes = {'MatMulInteger': [['i', 2, activation_proto_qtype]]}
+        check_qtype_by_node_type(self, model_int8_path, qnode_io_qtypes)
+        data_reader.rewind()
+        check_model_correctness(self, model_fp32_path, model_int8_path, {'input': np.random.rand(5, 10).astype(np.float32)})

    def dynamic_attention_quant_test(self, model_fp32_path, model_int8_path, per_channel, reduce_range):
        quantize_dynamic(model_fp32_path, model_int8_path, per_channel=per_channel, reduce_range=reduce_range)
-        quant_nodes = {'QAttention' : 1, 'MatMulInteger' : 1}
+        quant_nodes = {'QAttention': 1, 'MatMulInteger': 1}
        check_op_type_count(self, model_int8_path, **quant_nodes)
-        check_model_correctness(self, model_fp32_path, model_int8_path, {'input': np.random.rand(1,5,10).astype(np.float32)})
+        check_model_correctness(self, model_fp32_path, model_int8_path, {'input': np.random.rand(1, 5, 10).astype(np.float32)})

-    def test_quantize_reshape(self):
+    def test_quantize_gemm(self):
        np.random.seed(1)
        model_fp32_path = 'gemm_fp32.onnx'
-        model_int8_path = 'gemm_fp32.quant.onnx'
        self.construct_model_gemm(model_fp32_path)
+        data_reader = self.input_feeds(1, {'input': [5, 10]})

-        self.static_quant_test(model_fp32_path, model_int8_path)
-        self.static_quant_test_qdq(model_fp32_path, model_int8_path)
-        self.dynamic_quant_test(model_fp32_path, model_int8_path)
+        self.static_quant_test(model_fp32_path, data_reader, activation_type=QuantType.QUInt8, weight_type=QuantType.QUInt8)
+        self.static_quant_test_qdq(model_fp32_path, data_reader, activation_type=QuantType.QUInt8, weight_type=QuantType.QUInt8)
+        self.dynamic_quant_test(model_fp32_path, data_reader, activation_type=QuantType.QUInt8, weight_type=QuantType.QUInt8)
+
+    def test_quantize_gemm_s8s8(self):
+        np.random.seed(1)
+        model_fp32_path = 'gemm_fp32.onnx'
+        self.construct_model_gemm(model_fp32_path)
+        data_reader = self.input_feeds(1, {'input': [5, 10]})
+
+        self.static_quant_test(model_fp32_path, data_reader, activation_type=QuantType.QInt8, weight_type=QuantType.QInt8,
+                               extra_options={'ActivationSymmetric': True})
+        self.static_quant_test_qdq(model_fp32_path, data_reader, activation_type=QuantType.QInt8, weight_type=QuantType.QInt8,
+                                   extra_options={'ActivationSymmetric': True})
+        self.dynamic_quant_test(model_fp32_path, data_reader, activation_type=QuantType.QInt8, weight_type=QuantType.QInt8,
+                                extra_options={'ActivationSymmetric': True})

    def test_quantize_attention(self):
        np.random.seed(1)
@ -182,5 +209,6 @@ class TestOpGEMM(unittest.TestCase):
        self.dynamic_attention_quant_test(model_fp32_path, model_int8_path, False, True)
        self.dynamic_attention_quant_test(model_fp32_path, model_int8_path, False, False)

+
 if __name__ == '__main__':
    unittest.main()
--- a/onnxruntime/test/python/quantization/test_op_maxpool.py
+++ b/onnxruntime/test/python/quantization/test_op_maxpool.py
@ -10,8 +10,8 @@ import unittest
 import onnx
 import numpy as np
 from onnx import helper, TensorProto
-from onnxruntime.quantization import quantize_static, QuantFormat
-from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_op_nodes
+from onnxruntime.quantization import quantize_static, QuantFormat, QuantType
+from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_op_nodes, check_qtype_by_node_type


 class TestOpMaxPool(unittest.TestCase):
@ -54,40 +54,55 @@ class TestOpMaxPool(unittest.TestCase):
        graph = helper.make_graph([conv_node, identity_node, maxpool_node], 'TestOpQuantizerMaxPool_test_model',
                                  [input_tensor], [identity_out, output_tensor], initializer=initializers)
        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 14)])
-        model.ir_version = 7 # use stable onnx ir version
+        model.ir_version = 7  # use stable onnx ir version
        onnx.save(model, output_model_path)

-    def test_quantize_maxpool(self):
+    def quantize_maxpool_test(self, activation_type, weight_type, extra_options={}):
        np.random.seed(1)
-
        model_fp32_path = 'maxpool_fp32.onnx'
-        model_uint8_path = 'maxpool_uint8.onnx'
-        model_uint8_qdq_path = 'maxpool_uint8_qdq.onnx'
-
        self.construct_model_conv_maxpool(model_fp32_path,
                                          [1, 2, 26, 42], [3, 2, 3, 3],
                                          [1, 3, 24, 40], {'kernel_shape': [3, 3]},
                                          [1, 3, 22, 38])
+        data_reader = self.input_feeds(1, {'input': [1, 2, 26, 42]})
+
+        activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8
+        activation_type_str = 'u8' if (activation_type == QuantType.QUInt8) else 's8'
+        weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8'
+        model_q8_path = 'maxpool_{}{}.onnx'.format(activation_type_str, weight_type_str)
+        model_q8_qdq_path = 'maxpool_dqd_{}{}.onnx'.format(activation_type_str, weight_type_str)

        # Verify QOperator mode
-        data_reader = self.input_feeds(1, {'input': [1, 2, 26, 42]})
-        quantize_static(model_fp32_path, model_uint8_path, data_reader)
-
-        # make sure maxpool become xint8 operator, its input name could tell that
-        check_op_nodes(self, model_uint8_path, lambda node: (node.name != "maxpool_node" or node.input[0] != 'conv_output'))
-        qnode_counts = {'QLinearConv': 1, 'QuantizeLinear': 1, 'DequantizeLinear': 2, 'MaxPool': 1}
-        check_op_type_count(self, model_uint8_path, **qnode_counts)
        data_reader.rewind()
-        check_model_correctness(self, model_fp32_path, model_uint8_path, data_reader.get_next())
+        quantize_static(model_fp32_path, model_q8_path, data_reader,
+                        activation_type=activation_type, weight_type=weight_type, extra_options=extra_options)
+        # make sure maxpool becomes an xint8 operator; its input name tells us that
+        check_op_nodes(self, model_q8_path, lambda node: (node.name != "maxpool_node" or node.input[0] != 'conv_output'))
+        qnode_counts = {'QLinearConv': 1, 'QuantizeLinear': 1, 'DequantizeLinear': 2, 'MaxPool': 1}
+        check_op_type_count(self, model_q8_path, **qnode_counts)
+        qnode_io_qtypes = {'QuantizeLinear': [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]}
+        qnode_io_qtypes.update({'DequantizeLinear': [['i', 2, activation_proto_qtype]]})
+        check_qtype_by_node_type(self, model_q8_path, qnode_io_qtypes)
+        data_reader.rewind()
+        check_model_correctness(self, model_fp32_path, model_q8_path, data_reader.get_next())

        # Verify QDQ mode
        data_reader.rewind()
-        quantize_static(model_fp32_path, model_uint8_qdq_path, data_reader, quant_format=QuantFormat.QDQ)
+        quantize_static(model_fp32_path, model_q8_qdq_path, data_reader, quant_format=QuantFormat.QDQ,
+                        activation_type=activation_type, weight_type=weight_type, extra_options=extra_options)
        qdqnode_counts = {'Conv': 1, 'QuantizeLinear': 3, 'DequantizeLinear': 4, 'MaxPool': 1}
-        check_op_type_count(self, model_uint8_qdq_path, **qdqnode_counts)
+        check_op_type_count(self, model_q8_qdq_path, **qdqnode_counts)
+        qnode_io_qtypes = {'QuantizeLinear': [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]}
+        qnode_io_qtypes.update({'DequantizeLinear': [['i', 2, activation_proto_qtype]]})
+        check_qtype_by_node_type(self, model_q8_qdq_path, qnode_io_qtypes)
        data_reader.rewind()
-        check_model_correctness(self, model_fp32_path, model_uint8_qdq_path, data_reader.get_next())
+        check_model_correctness(self, model_fp32_path, model_q8_qdq_path, data_reader.get_next())

+    def test_quantize_maxpool(self):
+        self.quantize_maxpool_test(QuantType.QUInt8, QuantType.QUInt8, extra_options={ })
+
+    def test_quantize_maxpool_s8s8(self):
+        self.quantize_maxpool_test(QuantType.QInt8, QuantType.QInt8, extra_options={'ActivationSymmetric': True})

 if __name__ == '__main__':
    unittest.main()
--- a/onnxruntime/test/python/quantization/test_op_pad.py
+++ b/onnxruntime/test/python/quantization/test_op_pad.py
@ -10,8 +10,8 @@ import unittest
 import onnx
 import numpy as np
 from onnx import helper, TensorProto
-from onnxruntime.quantization import quantize_static, quantize_dynamic
-from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count
+from onnxruntime.quantization import quantize_static, quantize_dynamic, QuantType
+from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_qtype_by_node_type


 class TestOpQuatizerPad(unittest.TestCase):
@ -51,7 +51,7 @@ class TestOpQuatizerPad(unittest.TestCase):
        graph = helper.make_graph([pad_node], 'TestOpQuantizerPad_test_model',
                                  [input_tensor], [output_tensor], initializer=initializers)
        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
-        model.ir_version = 7 # use stable onnx ir version
+        model.ir_version = 7  # use stable onnx ir version

        onnx.save(model, output_model_path)

@ -91,14 +91,17 @@ class TestOpQuatizerPad(unittest.TestCase):
        graph = helper.make_graph([conv_node, identity_node, pad_node], 'TestOpQuantizerPad_test_model',
                                  [input_tensor], [identity_out, output_tensor], initializer=initializers)
        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)])
-        model.ir_version = 7 # use stable onnx ir version
+        model.ir_version = 7  # use stable onnx ir version
        onnx.save(model, output_model_path)

-    def quantize_model(self, model_fp32_path, model_i8_path, data_reader=None):
+    def quantize_model(self, model_fp32_path, model_i8_path, data_reader=None,
+                       activation_type=QuantType.QUInt8, weight_type=QuantType.QUInt8, extra_options=None):  # None: no extra options
         if data_reader is not None:
-            quantize_static(model_fp32_path, model_i8_path, data_reader, reduce_range=True)
+            quantize_static(model_fp32_path, model_i8_path, data_reader, reduce_range=True,  # static path: a data reader was supplied
+                            activation_type=activation_type, weight_type=weight_type, extra_options=extra_options or {})
         else:
-            quantize_dynamic(model_fp32_path, model_i8_path, reduce_range=True)
+            quantize_dynamic(model_fp32_path, model_i8_path, reduce_range=True,  # dynamic path: no data reader
+                             activation_type=activation_type, weight_type=weight_type, extra_options=extra_options or {})

    def verify_should_not_trigger(self, quantize_mode='static'):
        np.random.seed(108)
@ -118,23 +121,39 @@ class TestOpQuatizerPad(unittest.TestCase):
    def test_dynamic_quantize_no_trigger(self):
        self.verify_should_not_trigger(quantize_mode='dynamic')

-    def verify_quantize_with_pad_mode(self, pad_mode, constant_value=None, quantize_mode='static'):
+    def verify_quantize_with_pad_mode(self, pad_mode, constant_value=None, quantize_mode='static', rtol=0.01, atol=0.05,
+                                      activation_type=QuantType.QUInt8, weight_type=QuantType.QUInt8, extra_options=None):  # None: no extra options
         np.random.seed(108)
         tag_pad_mode = pad_mode if pad_mode is not None else 'none'
         tag_constant_value = '' if constant_value is None else '_value'
         model_fp32_path = 'qop_pad_{}_fp32_{}{}.onnx'.format(quantize_mode, tag_pad_mode, tag_constant_value)
-        model_i8_path = 'qop_pad_{}_i8_{}{}.onnx'.format(quantize_mode, tag_pad_mode, tag_constant_value)
         data_reader = self.input_feeds(1, {'input': [1, 8, 33, 33]})
         self.construct_model_conv_pad(model_fp32_path, [1, 8, 33, 33], [16, 8, 3, 3], [1, 16, 31, 31],
                                       pad_mode, [0, 0, 1, 2, 0, 0, 3, 4], constant_value=constant_value)
-        self.quantize_model(model_fp32_path, model_i8_path, None if quantize_mode != 'static' else data_reader)
+
+        activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8
+        activation_type_str = 'u8' if (activation_type == QuantType.QUInt8) else 's8'
+        weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8'
+        model_i8_path = 'qop_pad_{}_i8_{}{}_{}{}.onnx'.format(
+            quantize_mode, tag_pad_mode, tag_constant_value, activation_type_str, weight_type_str)
         data_reader.rewind()
+        self.quantize_model(model_fp32_path, model_i8_path, None if quantize_mode != 'static' else data_reader,
+                            activation_type=activation_type, weight_type=weight_type, extra_options=extra_options or {})
         # DequantizeLinear=2 means there are one DequantizeLinear Node aftr both conv and pad,
         # which means pad node is running in quantized semantic.
         # In dynamic quantize mode, pad operator in fact not quantized as input is fp32.
-        kwargs = {'DynamicQuantizeLinear': 1} if quantize_mode != 'static' else {'DequantizeLinear': 2, 'QuantizeLinear': 1}
+        if quantize_mode != 'static':
+            kwargs = {'DynamicQuantizeLinear': 1} if activation_type == QuantType.QUInt8 else {'QuantizeLinear': 1}
+        else:
+            kwargs = {'DequantizeLinear': 2, 'QuantizeLinear': 1}
         check_op_type_count(self, model_i8_path, **kwargs)
-        check_model_correctness(self, model_fp32_path, model_i8_path, data_reader.get_next())
+        # check node input/output type if such node exists in the graph
+        qnode_io_qtypes = {'QuantizeLinear': [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]}
+        qnode_io_qtypes.update({'DequantizeLinear': [['i', 2, activation_proto_qtype]]})
+        qnode_io_qtypes.update({'ConvInteger': [['i', 2, activation_proto_qtype]]})
+        check_qtype_by_node_type(self, model_i8_path, qnode_io_qtypes)
+        data_reader.rewind()
+        check_model_correctness(self, model_fp32_path, model_i8_path, data_reader.get_next(), rtol=rtol, atol=atol)

    def test_static_mode_edge(self):
        self.verify_quantize_with_pad_mode('edge', constant_value=None)
@ -148,6 +167,22 @@ class TestOpQuatizerPad(unittest.TestCase):
    def test_static_mode_constant_value(self):
        self.verify_quantize_with_pad_mode('constant', constant_value=3.75)

+    def test_static_mode_edge_s8s8(self):
+        self.verify_quantize_with_pad_mode('edge', constant_value=None, rtol=0.1, atol=0.1, activation_type=QuantType.QInt8,
+                                           weight_type=QuantType.QInt8, extra_options={'ActivationSymmetric': True})
+
+    def test_static_mode_reflect_s8s8(self):
+        self.verify_quantize_with_pad_mode('reflect', constant_value=None, rtol=0.1, atol=0.1, activation_type=QuantType.QInt8,
+                                           weight_type=QuantType.QInt8, extra_options={'ActivationSymmetric': True})
+
+    def test_static_mode_constant_default_s8s8(self):
+        self.verify_quantize_with_pad_mode('constant', constant_value=None, rtol=0.1, atol=0.1, activation_type=QuantType.QInt8,
+                                           weight_type=QuantType.QInt8, extra_options={'ActivationSymmetric': True})
+
+    def test_static_mode_constant_value_s8s8(self):
+        self.verify_quantize_with_pad_mode('constant', constant_value=3.75, rtol=0.1, atol=0.1, activation_type=QuantType.QInt8,
+                                           weight_type=QuantType.QInt8, extra_options={'ActivationSymmetric': True})
+
    def test_dynamic_mode_edge(self):
        self.verify_quantize_with_pad_mode('edge', constant_value=None, quantize_mode='dynamic')

@ -160,6 +195,23 @@ class TestOpQuatizerPad(unittest.TestCase):
    def test_dynamic_mode_constant_value(self):
        self.verify_quantize_with_pad_mode('constant', constant_value=3.75, quantize_mode='dynamic')

+    # TODO: uncomment the following tests once ConvInteger supports s8
+    # def test_dynamic_mode_edge_s8s8(self):
+    #     self.verify_quantize_with_pad_mode('edge', constant_value=None, quantize_mode='dynamic', activation_type=QuantType.QInt8,
+    #                                        weight_type=QuantType.QInt8, extra_options={'ActivationSymmetric': True})
+
+    # def test_dynamic_mode_reflect_s8s8(self):
+    #     self.verify_quantize_with_pad_mode('reflect', constant_value=None, quantize_mode='dynamic', activation_type=QuantType.QInt8,
+    #                                        weight_type=QuantType.QInt8, extra_options={'ActivationSymmetric': True})
+
+    # def test_dynamic_mode_constant_default_s8s8(self):
+    #     self.verify_quantize_with_pad_mode('constant', constant_value=None, quantize_mode='dynamic', activation_type=QuantType.QInt8,
+    #                                        weight_type=QuantType.QInt8, extra_options={'ActivationSymmetric': True})
+
+    # def test_dynamic_mode_constant_value_s8s8(self):
+    #     self.verify_quantize_with_pad_mode('constant', constant_value=3.75, quantize_mode='dynamic', activation_type=QuantType.QInt8,
+    #                                        weight_type=QuantType.QInt8, extra_options={'ActivationSymmetric': True})
+

 if __name__ == '__main__':
    unittest.main()
--- a/onnxruntime/test/python/quantization/test_op_pooling.py
+++ b/onnxruntime/test/python/quantization/test_op_pooling.py
@ -10,8 +10,8 @@ import unittest
 import onnx
 import numpy as np
 from onnx import helper, TensorProto
-from onnxruntime.quantization import quantize_static, QuantFormat
-from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_op_nodes
+from onnxruntime.quantization import quantize_static, QuantFormat, QuantType
+from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_op_nodes, check_qtype_by_node_type


 class TestOpAveragePool(unittest.TestCase):
@ -57,34 +57,50 @@ class TestOpAveragePool(unittest.TestCase):
        model.ir_version = 7 # use stable onnx ir version
        onnx.save(model, output_model_path)

-    def test_quantize_avgpool(self):
+    def quantize_avgpool_test(self, activation_type, weight_type, extra_options=None):  # None: no extra options
         np.random.seed(1)
-
         model_fp32_path = 'avgpool_fp32.onnx'
-        model_uint8_path = 'avgpool_uint8.onnx'
-        model_uint8_qdq_path = 'avgpool_uint8_qdq.onnx'
-
         self.construct_model_conv_avgpool(model_fp32_path,
                                           [1, 2, 26, 42], [3, 2, 3, 3],
                                           [1, 3, 24, 40], {'kernel_shape': [3, 3]},
                                           [1, 3, 22, 38])
+        data_reader = self.input_feeds(1, {'input': [1, 2, 26, 42]})
+
+        activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8
+        activation_type_str = 'u8' if (activation_type == QuantType.QUInt8) else 's8'
+        weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8'
+        model_q8_path = 'avgpool_{}{}.onnx'.format(activation_type_str, weight_type_str)
+        model_q8_qdq_path = 'avgpool_qdq_{}{}.onnx'.format(activation_type_str, weight_type_str)

         # Verify QOperator mode
-        data_reader = self.input_feeds(1, {'input': [1, 2, 26, 42]})
-        quantize_static(model_fp32_path, model_uint8_path, data_reader)
-        qnode_counts = {'QLinearConv': 1, 'QuantizeLinear': 1, 'DequantizeLinear': 2, 'QLinearAveragePool': 1}
-        check_op_type_count(self, model_uint8_path, **qnode_counts)
         data_reader.rewind()
-        check_model_correctness(self, model_fp32_path, model_uint8_path, data_reader.get_next())
+        quantize_static(model_fp32_path, model_q8_path, data_reader,
+                        activation_type=activation_type, weight_type=weight_type, extra_options=extra_options or {})
+        qnode_counts = {'QLinearConv': 1, 'QuantizeLinear': 1, 'DequantizeLinear': 2, 'QLinearAveragePool': 1}
+        check_op_type_count(self, model_q8_path, **qnode_counts)
+        qnode_io_qtypes = {'QuantizeLinear' : [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]}
+        qnode_io_qtypes.update({'QLinearConv' : [['i', 2, activation_proto_qtype], ['i', 7, activation_proto_qtype], ['o', 0, activation_proto_qtype]]})
+        qnode_io_qtypes.update({'QLinearAveragePool' : [['i', 4, activation_proto_qtype]]}) # shape info not working on custom ops
+        check_qtype_by_node_type(self, model_q8_path, qnode_io_qtypes)
+        data_reader.rewind()
+        check_model_correctness(self, model_fp32_path, model_q8_path, data_reader.get_next())

         # Verify QDQ mode
         data_reader.rewind()
-        quantize_static(model_fp32_path, model_uint8_qdq_path, data_reader, quant_format=QuantFormat.QDQ)
+        quantize_static(model_fp32_path, model_q8_qdq_path, data_reader, quant_format=QuantFormat.QDQ,
+                        activation_type=activation_type, weight_type=weight_type, extra_options=extra_options or {})
         qdqnode_counts = {'Conv': 1, 'QuantizeLinear': 3, 'DequantizeLinear': 4, 'AveragePool': 1}
-        check_op_type_count(self, model_uint8_qdq_path, **qdqnode_counts)
+        check_op_type_count(self, model_q8_qdq_path, **qdqnode_counts)
+        qnode_io_qtypes = {'QuantizeLinear' : [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]}
+        check_qtype_by_node_type(self, model_q8_qdq_path, qnode_io_qtypes)
         data_reader.rewind()
-        check_model_correctness(self, model_fp32_path, model_uint8_qdq_path, data_reader.get_next())
+        check_model_correctness(self, model_fp32_path, model_q8_qdq_path, data_reader.get_next())

+    def test_quantize_avgpool(self):
+        self.quantize_avgpool_test(QuantType.QUInt8, QuantType.QUInt8)
+
+    def test_quantize_avgpool_s8s8(self):
+        self.quantize_avgpool_test(QuantType.QInt8, QuantType.QInt8, extra_options = {'ActivationSymmetric' : True})

 if __name__ == '__main__':
    unittest.main()
--- a/onnxruntime/test/python/quantization/test_op_reshape.py
+++ b/onnxruntime/test/python/quantization/test_op_reshape.py
@ -10,8 +10,8 @@ import unittest
 import onnx
 import numpy as np
 from onnx import helper, TensorProto
-from onnxruntime.quantization import quantize_static, QuantFormat
-from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_op_nodes
+from onnxruntime.quantization import quantize_static, QuantFormat, QuantType
+from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_op_nodes, check_qtype_by_node_type


 class TestOpReshape(unittest.TestCase):
@ -67,35 +67,51 @@ class TestOpReshape(unittest.TestCase):

        onnx.save(model, output_model_path)

-    def test_quantize_reshape(self):
+    def quantize_reshape_test(self, activation_type, weight_type, extra_options=None):  # None: no extra options
         np.random.seed(1)
         model_fp32_path = 'reshape_fp32.onnx'
-        model_uint8_path = 'reshape_uint8.onnx'
-        model_uint8_qdq_path = 'reshape_uint8_qdq.onnx'

         self.construct_model_matmul_reshape(model_fp32_path,
                                             [3, 7],
                                             [7, 3],
                                             [1, 9])

+        activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8
+        activation_type_str = 'u8' if (activation_type == QuantType.QUInt8) else 's8'
+        weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8'
+        model_uint8_path = 'reshape_{}{}.onnx'.format(activation_type_str, weight_type_str)
+        model_uint8_qdq_path = 'reshape_{}{}_qdq.onnx'.format(activation_type_str, weight_type_str)
+
         # Verify QOperator mode
         data_reader = self.input_feeds(1, {'input': [3, 7]})
-        quantize_static(model_fp32_path, model_uint8_path, data_reader)
+        quantize_static(model_fp32_path, model_uint8_path, data_reader,
+                        activation_type=activation_type, weight_type=weight_type, extra_options=extra_options or {})
         # make sure transpose become xint8 operator, its input name could tell that
         check_op_nodes(self, model_uint8_path, lambda node: (node.name != "reshape_node" or node.input[0] != 'matmul_output'))
         qnode_counts = {'QLinearMatMul': 1, 'QuantizeLinear': 1, 'DequantizeLinear': 1, 'Reshape': 1}
         check_op_type_count(self, model_uint8_path, **qnode_counts)
+        qnode_io_qtypes = {'QuantizeLinear' : [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]}
+        qnode_io_qtypes.update({'DequantizeLinear' : [['i', 2, activation_proto_qtype]]})
+        check_qtype_by_node_type(self, model_uint8_path, qnode_io_qtypes)
         data_reader.rewind()
         check_model_correctness(self, model_fp32_path, model_uint8_path, data_reader.get_next())

         # Verify QDQ mode
         data_reader.rewind()
-        quantize_static(model_fp32_path, model_uint8_qdq_path, data_reader, quant_format=QuantFormat.QDQ)
+        quantize_static(model_fp32_path, model_uint8_qdq_path, data_reader, quant_format=QuantFormat.QDQ,
+                        activation_type=activation_type, weight_type=weight_type, extra_options=extra_options or {})
         qdqnode_counts = {'MatMul': 1, 'QuantizeLinear': 3, 'DequantizeLinear': 4, 'Reshape': 1}
         check_op_type_count(self, model_uint8_qdq_path, **qdqnode_counts)
+        qnode_io_qtypes = {'QuantizeLinear' : [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]}
+        check_qtype_by_node_type(self, model_uint8_qdq_path, qnode_io_qtypes)
         data_reader.rewind()
         check_model_correctness(self, model_fp32_path, model_uint8_qdq_path, data_reader.get_next())

+    def test_quantize_reshape(self):
+        self.quantize_reshape_test(QuantType.QUInt8, QuantType.QUInt8)
+
+    def test_quantize_reshape_s8s8(self):
+        self.quantize_reshape_test(QuantType.QInt8, QuantType.QInt8, extra_options = {'ActivationSymmetric' : True})

 if __name__ == '__main__':
    unittest.main()
--- a/onnxruntime/test/python/quantization/test_op_resize.py
+++ b/onnxruntime/test/python/quantization/test_op_resize.py
@ -10,8 +10,8 @@ import unittest
 import onnx
 import numpy as np
 from onnx import helper, TensorProto
-from onnxruntime.quantization import quantize_static, QuantFormat
-from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_op_nodes
+from onnxruntime.quantization import quantize_static, QuantFormat, QuantType
+from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_op_nodes, check_qtype_by_node_type


 class TestOpResize(unittest.TestCase):
@ -81,12 +81,9 @@ class TestOpResize(unittest.TestCase):
        model.ir_version = 7 # use stable onnx ir version
        onnx.save(model, output_model_path)

-    def test_quantize_resize(self):
+    def quantize_resize_test(self, activation_type, weight_type, extra_options = {}):
        np.random.seed(1)
-
        model_fp32_path = 'resize_fp32.onnx'
-        model_uint8_path = 'resize_uint8.onnx'
-        model_uint8_qdq_path = 'resize_uint8_qdq.onnx'

        kwargs = {'coordinate_transformation_mode': 'asymmetric', 'mode': 'nearest', 'nearest_mode': 'floor'}
        self.construct_model_conv_resize(model_fp32_path,
@ -95,25 +92,43 @@ class TestOpResize(unittest.TestCase):
                                         kwargs,
                                         [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0], [1.0, 1.0, 2.0, 2.0], None)

+        activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8
+        activation_type_str = 'u8' if (activation_type == QuantType.QUInt8) else 's8'
+        weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8'
+        model_uint8_path = 'resize_{}{}.onnx'.format(activation_type_str, weight_type_str)
+        model_uint8_qdq_path = 'resize_{}{}_qdq.onnx'.format(activation_type_str, weight_type_str)
+
        # Verify QOperator mode
        data_reader = self.input_feeds(1, {'input': [1, 2, 26, 42]})
-        quantize_static(model_fp32_path, model_uint8_path, data_reader)
-
+        quantize_static(model_fp32_path, model_uint8_path, data_reader,
+                        activation_type = activation_type, weight_type = weight_type, extra_options = extra_options)
        # make sure resize become xint8 operator, its input name could tell that
        check_op_nodes(self, model_uint8_path, lambda node: (node.name != "resize_node" or node.input[0] != 'conv_output'))
        qnode_counts = {'QLinearConv': 1, 'QuantizeLinear': 1, 'DequantizeLinear': 2, 'Resize': 1}
        check_op_type_count(self, model_uint8_path, **qnode_counts)
+        qnode_io_qtypes = {'QuantizeLinear' : [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]}
+        qnode_io_qtypes.update({'DequantizeLinear' : [['i', 2, activation_proto_qtype]]})
+        check_qtype_by_node_type(self, model_uint8_path, qnode_io_qtypes)
        data_reader.rewind()
        check_model_correctness(self, model_fp32_path, model_uint8_path, data_reader.get_next())

        # Verify QDQ mode
        data_reader.rewind()
-        quantize_static(model_fp32_path, model_uint8_qdq_path, data_reader, quant_format=QuantFormat.QDQ)
+        quantize_static(model_fp32_path, model_uint8_qdq_path, data_reader, quant_format=QuantFormat.QDQ,
+                        activation_type = activation_type, weight_type = weight_type, extra_options = extra_options)
        qdqnode_counts = {'Conv': 1, 'QuantizeLinear': 3, 'DequantizeLinear': 4, 'Resize': 1}
        check_op_type_count(self, model_uint8_qdq_path, **qdqnode_counts)
+        qnode_io_qtypes = {'QuantizeLinear' : [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]}
+        check_qtype_by_node_type(self, model_uint8_qdq_path, qnode_io_qtypes)
        data_reader.rewind()
        check_model_correctness(self, model_fp32_path, model_uint8_qdq_path, data_reader.get_next())

+    def test_quantize_resize(self):
+        self.quantize_resize_test(QuantType.QUInt8, QuantType.QUInt8)
+
+    # TODO: Uncomment the following test once Resize s8 support is enabled
+    # def test_quantize_resize_s8s8(self):
+    #     self.quantize_resize_test(QuantType.QInt8, QuantType.QInt8, extra_options = {'ActivationSymmetric' : True})

 if __name__ == '__main__':
    unittest.main()
--- a/onnxruntime/test/python/quantization/test_op_squeeze_unsqueeze.py
+++ b/onnxruntime/test/python/quantization/test_op_squeeze_unsqueeze.py
@ -10,8 +10,8 @@ import unittest
 import onnx
 import numpy as np
 from onnx import helper, TensorProto
-from onnxruntime.quantization import quantize_static, QuantFormat
-from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count
+from onnxruntime.quantization import quantize_static, QuantFormat, QuantType
+from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_qtype_by_node_type


 class TestOpSqueezeUnsqueeze(unittest.TestCase):
@ -26,8 +26,8 @@ class TestOpSqueezeUnsqueeze(unittest.TestCase):
        return dr

    def construct_model_conv_squeezes(self, output_model_path,
-                                     conv_input_shape, conv_weight_shape, conv_output_shape,
-                                     opset = 13):
+                                      conv_input_shape, conv_weight_shape, conv_output_shape,
+                                      opset=13):
        #             (input)
        #            /   |     \
        #         Conv1 conv2    conv3
@ -55,7 +55,6 @@ class TestOpSqueezeUnsqueeze(unittest.TestCase):
        conv3_weight_initializer = onnx.numpy_helper.from_array(conv3_weight_arr, name='conv3_weight')
        conv3_node = onnx.helper.make_node('Conv', ['input', 'conv3_weight'], ['conv3_output'], name='conv3_node')

-
        if (opset >= 13):
            squeeze_axes_initializer = onnx.numpy_helper.from_array(np.array([0], dtype=np.int64), name='squeeze_axes')
            squeeze1_node = helper.make_node('Squeeze', ['conv1_output', 'squeeze_axes'], ['squeeze1_output'], name='suqeeze1_node')
@ -66,9 +65,10 @@ class TestOpSqueezeUnsqueeze(unittest.TestCase):

        add1_node = helper.make_node('Add', ['squeeze1_output', 'squeeze2_output'], ['add1_output'], name='add1_node')
        if (opset >= 13):
-            unsqueeze_node = helper.make_node('Unsqueeze', ['add1_output', 'squeeze_axes'], ['unsqueeze_output'], name = 'unsqueeze_node')
+            unsqueeze_node = helper.make_node('Unsqueeze', ['add1_output', 'squeeze_axes'], [
+                                              'unsqueeze_output'], name='unsqueeze_node')
        else:
-            unsqueeze_node = helper.make_node('Unsqueeze', ['add1_output'], ['unsqueeze_output'], name = 'unsqueeze_node', axes=[0])
+            unsqueeze_node = helper.make_node('Unsqueeze', ['add1_output'], ['unsqueeze_output'], name='unsqueeze_node', axes=[0])

        output_tensor = helper.make_tensor_value_info('output', TensorProto.FLOAT, conv_output_shape)
        add2_node = helper.make_node('Add', ['unsqueeze_output', 'conv3_output'], ['output'], name='add2_node')
@ -79,33 +79,43 @@ class TestOpSqueezeUnsqueeze(unittest.TestCase):
        graph = helper.make_graph([conv1_node, conv2_node, conv3_node, squeeze1_node, squeeze2_node, add1_node, unsqueeze_node, add2_node],
                                  'TestOpSuqeezes_test_model', [input_tensor], [output_tensor], initializer=initializers)
        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", opset)])
-        model.ir_version = 7 # use stable onnx ir version
+        model.ir_version = 7  # use stable onnx ir version
        onnx.save(model, output_model_path)

-    def run_quantize_squeezes_of_opset(self, opset = 13):
+    def run_quantize_squeezes_of_opset(self, opset=13, activation_type=QuantType.QUInt8, weight_type=QuantType.QUInt8, extra_options=None):  # None: no extra options
         np.random.seed(1)

         model_fp32_path = 'squeezes_opset{}_fp32.onnx'.format(opset)
-        model_uint8_path = 'squeezes_opset{}_uint8.onnx'.format(opset)
-        model_uint8_qdq_path = 'squeezes_opset{}_uint8_qdq.onnx'.format(opset)
-
         self.construct_model_conv_squeezes(model_fp32_path, [1, 2, 26, 42], [3, 2, 3, 3], [1, 3, 24, 40], opset=opset)

+        activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8
+        activation_type_str = 'u8' if (activation_type == QuantType.QUInt8) else 's8'
+        weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8'
+        model_uint8_path = 'squeezes_opset{}_{}{}.onnx'.format(opset, activation_type_str, weight_type_str)
+        model_uint8_qdq_path = 'squeezes_opset{}_{}{}_qdq.onnx'.format(opset, activation_type_str, weight_type_str)
+
         # Verify QOperator mode
         data_reader = self.input_feeds(1, {'input': [1, 2, 26, 42]})
-        quantize_static(model_fp32_path, model_uint8_path, data_reader)
+        quantize_static(model_fp32_path, model_uint8_path, data_reader,
+                        activation_type=activation_type, weight_type=weight_type, extra_options=extra_options or {})

         # make sure squeezes become xint8 operator, its input name could tell that
         qnode_counts = {'QuantizeLinear': 1, 'DequantizeLinear': 1}
         check_op_type_count(self, model_uint8_path, **qnode_counts)
+        qnode_io_qtypes = {'QuantizeLinear': [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]}
+        qnode_io_qtypes.update({'DequantizeLinear': [['i', 2, activation_proto_qtype]]})
+        check_qtype_by_node_type(self, model_uint8_path, qnode_io_qtypes)
         data_reader.rewind()
         check_model_correctness(self, model_fp32_path, model_uint8_path, data_reader.get_next(), rtol=0.01, atol=0.5)

         # Verify QDQ mode
         data_reader.rewind()
-        quantize_static(model_fp32_path, model_uint8_qdq_path, data_reader, quant_format=QuantFormat.QDQ)
+        quantize_static(model_fp32_path, model_uint8_qdq_path, data_reader, quant_format=QuantFormat.QDQ,
+                        activation_type=activation_type, weight_type=weight_type, extra_options=extra_options or {})
         qdqnode_counts = {'Conv': 3, 'QuantizeLinear': 9, 'DequantizeLinear': 12}
         check_op_type_count(self, model_uint8_qdq_path, **qdqnode_counts)
+        qnode_io_qtypes = {'QuantizeLinear': [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]}
+        check_qtype_by_node_type(self, model_uint8_qdq_path, qnode_io_qtypes)
         data_reader.rewind()
         check_model_correctness(self, model_fp32_path, model_uint8_qdq_path, data_reader.get_next(), rtol=0.01, atol=0.5)

@ -113,5 +123,10 @@ class TestOpSqueezeUnsqueeze(unittest.TestCase):
        self.run_quantize_squeezes_of_opset(11)
        self.run_quantize_squeezes_of_opset(13)

+    def test_quantize_squeeze_unsqueeze_s8s8(self):
+        self.run_quantize_squeezes_of_opset(11, QuantType.QInt8, QuantType.QInt8, extra_options={'ActivationSymmetric': True})
+        self.run_quantize_squeezes_of_opset(13, QuantType.QInt8, QuantType.QInt8, extra_options={'ActivationSymmetric': True})
+
+
 if __name__ == '__main__':
    unittest.main()
--- a/onnxruntime/test/python/quantization/test_op_transpose.py
+++ b/onnxruntime/test/python/quantization/test_op_transpose.py
@ -10,8 +10,8 @@ import unittest
 import onnx
 import numpy as np
 from onnx import helper, TensorProto
-from onnxruntime.quantization import quantize_static, QuantFormat
-from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_op_nodes
+from onnxruntime.quantization import quantize_static, QuantFormat, QuantType
+from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_op_nodes, check_qtype_by_node_type


 class TestOpTranspose(unittest.TestCase):
@ -62,32 +62,47 @@ class TestOpTranspose(unittest.TestCase):

        onnx.save(model, output_model_path)

-    def test_quantize_transpose(self):
+    def quantize_transpose_test(self, activation_type, weight_type, extra_options=None):  # None: no extra options
         np.random.seed(1)
         model_fp32_path = 'transpose_fp32.onnx'
-        model_uint8_path = 'transpose_uint8.onnx'
-        model_uint8_qdq_path = 'transpose_uint8_qdq.onnx'
-
         self.construct_model_matmul_transpose(model_fp32_path, [3, 7], [7, 5], [5, 3])

+        activation_proto_qtype = TensorProto.UINT8 if activation_type == QuantType.QUInt8 else TensorProto.INT8
+        activation_type_str = 'u8' if (activation_type == QuantType.QUInt8) else 's8'
+        weight_type_str = 'u8' if (weight_type == QuantType.QUInt8) else 's8'
+        model_uint8_path = 'transpose_{}{}.onnx'.format(activation_type_str, weight_type_str)
+        model_uint8_qdq_path = 'transpose_{}{}_qdq.onnx'.format(activation_type_str, weight_type_str)
+
         # Verify QOperator model
         data_reader = self.input_feeds(1, {'input': [3, 7]})
-        quantize_static(model_fp32_path, model_uint8_path, data_reader)
+        quantize_static(model_fp32_path, model_uint8_path, data_reader,
+                        activation_type=activation_type, weight_type=weight_type, extra_options=extra_options or {})
         # make sure transpose become xint8 operator, its input name could tell that
         check_op_nodes(self, model_uint8_path, lambda node: (node.name != "transpose_node" or node.input[0] != 'matmul_output'))
         qnode_counts = {'QLinearMatMul': 1, 'QuantizeLinear': 1, 'DequantizeLinear': 1, 'Transpose': 1}
         check_op_type_count(self, model_uint8_path, **qnode_counts)
+        qnode_io_qtypes = {'QuantizeLinear' : [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]}
+        qnode_io_qtypes.update({'DequantizeLinear' : [['i', 2, activation_proto_qtype]]})
+        check_qtype_by_node_type(self, model_uint8_path, qnode_io_qtypes)
         data_reader.rewind()
         check_model_correctness(self, model_fp32_path, model_uint8_path, data_reader.get_next())

         # Verify QDQ model
         data_reader.rewind()
-        quantize_static(model_fp32_path, model_uint8_qdq_path, data_reader, quant_format=QuantFormat.QDQ)
+        quantize_static(model_fp32_path, model_uint8_qdq_path, data_reader, quant_format=QuantFormat.QDQ,
+                        activation_type=activation_type, weight_type=weight_type, extra_options=extra_options or {})
         qdqnode_counts = {'MatMul': 1, 'QuantizeLinear': 3, 'DequantizeLinear': 4, 'Transpose': 1}
         check_op_type_count(self, model_uint8_qdq_path, **qdqnode_counts)
+        qnode_io_qtypes = {'QuantizeLinear' : [['i', 2, activation_proto_qtype], ['o', 0, activation_proto_qtype]]}
+        check_qtype_by_node_type(self, model_uint8_qdq_path, qnode_io_qtypes)
         data_reader.rewind()
         check_model_correctness(self, model_fp32_path, model_uint8_qdq_path, data_reader.get_next())

+    def test_quantize_transpose(self):
+        self.quantize_transpose_test(QuantType.QUInt8, QuantType.QUInt8)
+
+    def test_quantize_transpose_s8s8(self):
+        self.quantize_transpose_test(QuantType.QInt8, QuantType.QInt8, extra_options = {'ActivationSymmetric' : True})

 if __name__ == '__main__':
    unittest.main()
--- a/onnxruntime/test/python/quantization/test_qdq.py
+++ b/onnxruntime/test/python/quantization/test_qdq.py
@ -104,7 +104,7 @@ class TestQDQExtraOptions(unittest.TestCase):
            # This QuantizeLinear node should be followed by Add1
            if node.name == 'P_QuantizeLinear':
                qdq_added_to_node_output_flag = True
-                self.assertTrue(node.input[0] is 'P')
+                self.assertTrue(node.input[0] == 'P')

        self.assertTrue(qdq_added_to_node_output_flag)