From e26e11b9f7f7b1d153d9ce2ac160cffb241e4ded Mon Sep 17 00:00:00 2001
From: PhaniShekhar <30535191+PhaniShekhar@users.noreply.github.com>
Date: Tue, 18 Jun 2019 20:44:45 -0700
Subject: [PATCH] Quantization tool to support quantization of Conv and MatMul
 nodes. (#1057)

* Move quantization tool from onnx to onnxruntime

* Fix some issues

* Use u8_s8 for asymmetric mode and u8_u8 for symmetric mode irrespective of whether inputs are initializers or outputs of previous nodes

* Address PR comments

* Fix error message formatting

* Separate static/dynamic and quantization mode
---
 .../python/tools/quantization/README.md       |  104 ++
 .../python/tools/quantization/quantize.py     | 1055 +++++++++++++++++
 2 files changed, 1159 insertions(+)
 create mode 100644 onnxruntime/python/tools/quantization/README.md
 create mode 100644 onnxruntime/python/tools/quantization/quantize.py

diff --git a/onnxruntime/python/tools/quantization/README.md b/onnxruntime/python/tools/quantization/README.md
new file mode 100644
index 0000000000..19c8e2cd55
--- /dev/null
+++ b/onnxruntime/python/tools/quantization/README.md
@@ -0,0 +1,104 @@
+# Quantization tool Overview
+This tool supports quantization of an onnx model. quantize() takes a model in ModelProto format and returns the quantized model in ModelProto format.
+
+## Quantize an onnx model
+```python
+import onnx
+from quantize import quantize, QuantizationMode
+
+# Load the onnx model
+model = onnx.load('path/to/the/model.onnx')
+# Quantize
+quantized_model = quantize(model, quantization_mode=QuantizationMode.IntegerOps)
+# Save the quantized model
+onnx.save(quantized_model, 'path/to/the/quantized_model.onnx')
+```
+
+## Examples of various quantization modes
+
+- **QuantizationMode.IntegerOps with static input quantization**:
+    Quantize using integer ops. Inputs/activations are quantized using static scale and zero point values which are specified through "input_quantization_params" option.
+    ```python
+    quantized_model = quantize(model, quantization_mode=QuantizationMode.IntegerOps,
+                               static=True,
+                               input_quantization_params={
+                                    'input_1': [np.uint8(113), np.float32(0.05)]
+                               })
+    ```
+
+- **QuantizationMode.IntegerOps with dynamic input quantization**:
+    Quantize using integer ops. Inputs/activations are quantized using dynamic scale and zero point values which are computed while running the model.
+    ```python
+    quantized_model = quantize(model, quantization_mode=QuantizationMode.IntegerOps, static=False)
+    ```
+
+- **QuantizationMode.QLinearOps with static input quantization**:
+    Quantize using QLinear ops. Inputs/activations are quantized using static scale and zero point values which are specified through "input_quantization_params" option.
+    Output scale and zero point values have to be specified using "output_quantization_params" option.
+    ```python
+    quantized_model = quantize(model, quantization_mode=QuantizationMode.QLinearOps,
+                               static=True,
+                               input_quantization_params={
+                                    'input_1': [np.uint8(113), np.float32(0.05)]
+                               },
+                               output_quantization_params={
+                                    'output_1': [np.uint8(113), np.float32(0.05)]
+                               })
+    ```
+
+- **QuantizationMode.QLinearOps with dynamic input quantization**:
+    Quantize using QLinear ops. Inputs/activations are quantized using dynamic scale and zero point values which are computed while running the model.
+    Output scale and zero point values have to be specified using "output_quantization_params" option.
+    ```python
+    quantized_model = quantize(model, quantization_mode=QuantizationMode.QLinearOps,
+                               static=False,
+                               output_quantization_params={
+                                    'output_1': [np.uint8(113), np.float32(0.05)]
+                               })
+    ```
+
+## Options
+
+See below for a description of all the options to quantize():
+
+- **model**: ModelProto to quantize
+- **per_channel**: *default: True*
+    If True, weights of Conv nodes are quantized per output channel.
+    If False, they are quantized per tensor. Refer to [QLinearConv](https://github.com/onnx/onnx/blob/master/docs/Operators.md#qlinearconv) for more information.
+- **nbits**: *default: 8*
+    Number of bits to represent quantized data. Currently only nbits=8 is supported.
+- **quantization_mode**: *default: QuantizationMode.IntegerOps*
+    *QuantizationMode.IntegerOps*: Quantize using integer ops. Only [ConvInteger](https://github.com/onnx/onnx/blob/master/docs/Operators.md#ConvInteger) and [MatMulInteger](https://github.com/onnx/onnx/blob/master/docs/Operators.md#MatMulInteger) ops are supported now.
+    *QuantizationMode.QLinearOps*: Quantize using QLinear ops. Only [QLinearConv](https://github.com/onnx/onnx/blob/master/docs/Operators.md#qlinearconv) and [QLinearMatMul](https://github.com/onnx/onnx/blob/master/docs/Operators.md#QLinearMatMul) ops are supported now.
+- **static**: *default: False*
+    If True, the inputs/activations are quantized using static scale and zero point values specified through input_quantization_params.
+    If False, the inputs/activations are quantized using dynamic scale and zero point values computed while running the model.
+- **asymmetric_input_types**: *default: False*
+    If True, weights are quantized into signed integers and inputs/activations into unsigned integers.
+    If False, weights and inputs/activations are quantized into unsigned integers.
+- **input_quantization_params**: *default: None*
+    Dictionary to specify the zero point and scale values for inputs to conv and matmul nodes.
+        Should be specified when static is set to True.
+        The input_quantization_params should be specified in the following format:
+            {
+                "input_name": [zero_point, scale]
+            }.
+        zero_point should be of type np.uint8 and scale should be of type np.float32.
+        example:
+            {
+                'resnet_model/Relu_1:0': [np.uint8(0), np.float32(0.019539741799235344)],
+                'resnet_model/Relu_2:0': [np.uint8(0), np.float32(0.011359662748873234)]
+            }
+- **output_quantization_params**: *default: None*
+    Dictionary to specify the zero point and scale values for outputs of conv and matmul nodes.
+    Should be specified in QuantizationMode.QLinearOps mode.
+        The output_quantization_params should be specified in the following format:
+            {
+                "output_name": [zero_point, scale]
+            }
+        zero_point can be of type np.uint8/np.int8 and scale should be of type np.float32.
+        example:
+            {
+                'resnet_model/Relu_3:0': [np.int8(0), np.float32(0.011359662748873234)],
+                'resnet_model/Relu_4:0': [np.uint8(0), np.float32(0.011359662748873234)]
+            }
\ No newline at end of file
diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py
new file mode 100644
index 0000000000..3bf180e84c
--- /dev/null
+++ b/onnxruntime/python/tools/quantization/quantize.py
@@ -0,0 +1,1055 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See License.txt in the project root for
+# license information.
+# --------------------------------------------------------------------------
+import os
+import onnx
+import onnx.numpy_helper
+import struct
+
+import numpy as np
+from onnx import onnx_pb as onnx_proto
+
+__producer__ = "onnx.quantize"
+__version__ = "0.1.0"
+onnx_domain = "ai.onnx"
+onnx_op_set_version = 10
+
+type_to_name = {
+    1: "FLOAT",
+    2: "UINT8",
+    3: "INT8",
+    4: "UINT16",
+    5: "INT16",
+    6: "INT32",
+    7: "INT64",
+    8: "STRING",
+    9: "BOOL",
+    10: "FLOAT16",
+    11: "DOUBLE",
+    12: "UINT32",
+    13: "UINT64",
+    14: "COMPLEX64",
+    15: "COMPLEX128",
+}
+
+# Quantization mode
+# IntegerOps: Use IntegerOps in quantized model. Only ConvInteger and MatMulInteger ops are supported now.
+# QLinearOps: Use QLinearOps in quantized model. Only QLinearConv and QLinearMatMul ops are supported now.
+class QuantizationMode():
+    IntegerOps = 0
+    QLinearOps = 1
+
+# Data Quantization mode
+# Linear_NonScaled: Quantize data using linear, non scaled transformation.
+# Linear_Scaled: Quantize data using linear, scaled transformation.
+class DataQuantizationMode():
+    Linear_NonScaled = 0
+    Linear_Scaled = 1
+
+    @staticmethod
+    def mode_for_data_type(data_type):
+        return DataQuantizationMode.Linear_Scaled if data_type == onnx_proto.TensorProto.INT8\
+            else DataQuantizationMode.Linear_NonScaled
+
+
+quantization_modes = [getattr(QuantizationMode, attr) for attr in dir(QuantizationMode)\
+    if not callable(getattr(QuantizationMode, attr)) and not attr.startswith("__")]
+data_quantization_modes = [getattr(DataQuantizationMode, attr) for attr in dir(DataQuantizationMode)\
+    if not callable(getattr(DataQuantizationMode, attr)) and not attr.startswith("__")]
+
+
+class Weight:
+    '''
+        Represents a linearly quantized weight input from ONNX operators
+    '''
+    def __init__(self, name, initializer, rmins, rmaxs, zero_points, scales, data=None, quantized_data=None, axis=None,
+                 qType=onnx_proto.TensorProto.UINT8):
+        self.name = name
+        self.initializer = initializer  # TensorProto initializer in ONNX graph
+        self.rmins = rmins  # List of minimum range for each axis
+        self.rmaxs = rmaxs  # List of maximum range for each axis
+        self.zero_points = zero_points  # 1D tensor of zero points computed for each axis. scalar if axis is empty
+        self.scales = scales  # 1D tensor of scales computed for each axis. scalar if axis is empty
+        self.data = [] if data is None else data  # original data from initializer TensorProto (new list when omitted)
+        self.quantized_data = [] if quantized_data is None else quantized_data  # weight-packed data (new list when omitted)
+        self.axis = axis  # Scalar to specify which dimension in the initializer to weight pack.
+                          # If empty, single zero point and scales computed from a single rmin and rmax
+        self.qType = qType # type of quantized data.
+
+
+def quantize_data(data, quantize_range, mode=DataQuantizationMode.Linear_NonScaled):
+    '''
+        :parameter data: flat list of real values to weight pack.
+        :parameter quantize_range: width of the quantized range (255 for UINT8, 254 for INT8).
+        :parameter mode: mode to quantize data of type DataQuantizationMode
+        :return: minimum, maximum, zero point, scale, and quantized weights
+
+        To pack weights, we compute a linear transformation
+            - in non-scaled mode, from [rmin, rmax] -> [0, 2^b-1] and
+            - in scaled mode, from [-m , m] -> [-(2^{b-1}-1), 2^{b-1}-1] where
+                m = max(abs(rmin), abs(rmax))
+        Quantized values relate to the original values through the equation
+        r = S(q-z), where
+            r: real original value
+            q: quantized value
+            S: scale
+            z: zero point
+    '''
+    rmin = min(min(data), 0)  # the represented range always includes 0
+    rmax = max(max(data), 0)
+
+    if mode == DataQuantizationMode.Linear_Scaled:
+        max_range = max(abs(rmin), abs(rmax))
+        scale = (float(max_range)*2) / quantize_range if max_range != 0 else 1  # all-zero data: avoid a zero scale
+        zero_point = 0
+        quantized_data = (np.asarray(data) / scale).round().astype('b') #signed byte type
+    else:
+        scale = (float(rmax) - rmin) / quantize_range if rmin != rmax else 1
+        zero_point = round((0 - rmin) / scale) # round to nearest integer
+        quantized_data = ((np.asarray(data) / scale).round() + zero_point).astype('B') # unsigned byte type
+    return rmin, rmax, zero_point, scale, quantized_data
+
+
+def _attribute_to_kwarg(attribute):
+    '''
+    Convert attribute to kwarg format for use with onnx.helper.make_node.
+        :parameter attribute: attribute in AttributeProto format.
+        :return: attribute in {key: value} format.
+    '''
+    if (attribute.type == 0):
+        raise ValueError('attribute {} does not have type specified.'.format(attribute.name))
+
+    # Based on attribute type definitions from AttributeProto
+    # definition in https://github.com/onnx/onnx/blob/master/onnx/onnx.proto
+    if (attribute.type == 1):
+        value = attribute.f
+    elif (attribute.type == 2):
+        value = attribute.i
+    elif (attribute.type == 3):
+        value = attribute.s
+    elif (attribute.type == 4):
+        value = attribute.t
+    elif (attribute.type == 5):
+        value = attribute.g
+    elif (attribute.type == 6):
+        value = attribute.floats
+    elif (attribute.type == 7):
+        value = attribute.ints
+    elif (attribute.type == 8):
+        value = attribute.strings
+    elif (attribute.type == 9):
+        value = attribute.tensors
+    elif (attribute.type == 10):
+        value = attribute.graphs
+    else:
+        raise ValueError('attribute {} has unsupported type {}.'.format(attribute.name, attribute.type))
+
+    return {attribute.name: value}
+
+def _find_by_name(item_name, item_list):
+    '''
+    Look up an item in a list by its name attribute.
+        parameter item_name: name to search for.
+        parameter item_list: list of items, each exposing a name attribute.
+    '''
+    matches = [candidate for candidate in item_list if candidate.name == item_name]
+    if not matches:
+        return None  # nothing in the list carries this name
+    return matches[0]  # first match wins when several items share the name
+
+def _get_mul_node(inputs, output, name):
+    '''
+    Helper function to create a Mul node.
+        parameter inputs: list of input names.
+        parameter output: output name.
+        parameter name: name of the node.
+        return: Mul node in NodeProto format.
+    '''
+    return onnx.helper.make_node("Mul", inputs, [output], name)
+
+def _find_node_by_name(node_name, graph, new_nodes_list):
+    '''
+    Helper function to check if a node exists in a graph or
+    new set of nodes created during quantization.
+        parameter node_name: name of the node.
+        parameter graph: GraphProto.
+        parameter new_nodes_list: list of nodes added during quantization.
+        return: NodeProto if found. None otherwise.
+    '''
+    graph_nodes_list = list(graph.node) # shallow copy, so extend() below does not modify graph.node
+    graph_nodes_list.extend(new_nodes_list)
+    node = _find_by_name(node_name, graph_nodes_list)
+    return node
+
+def _add_initializer_if_not_present(graph, name, value, shape, type):
+    '''
+    Helper function to add an initializer if it is not present in the graph.
+        parameter graph: GraphProto.
+        parameter name: Initializer's name.
+        parameter value: Initializer's value.
+        parameter shape: Initializer's shape.
+        parameter type: Initializer's type.
+    '''
+    if _find_by_name(name, graph.initializer) is None:
+        initializer = onnx.helper.make_tensor(name, type, shape, value)
+        value_info = onnx.helper.make_tensor_value_info(name, type, shape)
+        graph.initializer.extend([initializer])
+        graph.input.extend([value_info])
+
+def _get_qrange_for_qType(qType):
+    '''
+    Helper function to get the quantization range for a type.
+        parameter qType: quantization type.
+        return: quantization range.
+    '''
+    if qType == onnx_proto.TensorProto.UINT8:
+        return 255  # 2^b - 1
+    elif qType == onnx_proto.TensorProto.INT8:
+        return 254  # [-(2^{b-1}-1), 2^{b-1}-1]: [-127, 127] for 8 bits.
+    else:
+        raise ValueError('unsupported quantization data type')
+
+def _find_nodes_using_initializer(graph, initializer):
+    '''
+    Collect the nodes of a graph that consume a given initializer.
+    A node appears once for every one of its inputs that names the initializer.
+        parameter graph: GraphProto.
+        parameter initializer: Initializer in TensorProto format.
+        return: List of nodes.
+    '''
+    consumers = []
+    for candidate in graph.node:
+        hits = sum(1 for input_name in candidate.input if input_name == initializer.name)
+        consumers.extend([candidate] * hits)
+    return consumers
+
+class ONNXQuantizer:
+    def __init__(self, model, per_channel, mode, static, weight_qType, input_qType,
+            input_quantization_params, output_quantization_params):
+        self.model = model
+        self.per_channel = per_channel # weight-pack per channel
+        self.weight_qType = weight_qType  # quantize data type
+        self.mode = mode # QuantizationMode.Value
+        self.static = static # use static quantization for inputs.
+        self.input_qType = input_qType # quantize input type
+        self.input_quantization_params = input_quantization_params # zero point and scale values for node inputs.
+        self.output_quantization_params = output_quantization_params # zero point and scale values for node outputs.
+
+        if not self.mode in quantization_modes:
+            raise ValueError('unsupported quantization mode {}'.format(self.mode))
+
+        # QuantizeRange tensor name and zero tensor name for scale and zero point calculation.
+        # Used when static is False
+        self.fixed_qrange_non_scaled_name = "fixed_quantization_range_non_scaled"
+        self.fixed_qrange_scaled_name = "fixed_quantization_range_scaled"
+        # In non scaled mode, to compute zero point, we subtract rmin from 0 (represented by fixed_zero_name tensor)
+        self.fixed_zero_name = "fixed_zero"
+        # In scaled mode, zero point is always zero (represented by fixed_zero_zp_name tensor)
+        self.fixed_zero_zp_name = "fixed_zero_zp"
+
+        # List of weights quantized.
+        self._quantized_weights = []
+
+    def quantize_model(self):
+        # Create a new topologically sorted list for quantizing a model
+        new_list = []
+        for node in self.model.graph.node:
+            if node.op_type == 'Conv':
+                new_list += self._quantize_convolution(node, new_list)
+            elif node.op_type == 'MatMul':
+                new_list += self._quantize_matmul(node, new_list)
+            else:
+                new_list.append(node)
+
+        # extend is used to append to the list for a protobuf fields
+        # https://developers.google.com/protocol-buffers/docs/reference/python-generated?csw=1#fields
+        self.model.graph.ClearField('node')
+        self.model.graph.node.extend(new_list)
+
+        # Remove weights which are already quantized from graph.
+        self._remove_quantized_weights()
+
+        # update opset.
+        opset_info = next((opset for opset in self.model.opset_import if opset.domain == '' or opset.domain == onnx_domain), None)
+        if opset_info is not None:
+            self.model.opset_import.remove(opset_info)
+        self.model.opset_import.extend([onnx.helper.make_opsetid(onnx_domain, onnx_op_set_version)])
+
+        return self.model
+
+    def find_weight_data(self, initializer):
+        '''
+            :param initializer: TensorProto initializer object from a graph
+            :return: numpy array with the data of a FLOAT initializer. ValueError is raised for any other data type
+        '''
+        if initializer.data_type == onnx_proto.TensorProto.FLOAT:
+            weights = onnx.numpy_helper.to_array(initializer)
+        else:
+            raise ValueError('Model contains conv operator weights in {}. Only float type quantization is supported.'.format(
+                type_to_name.get(initializer.data_type, initializer.data_type)))  # ids missing from the table must not raise KeyError
+        return weights
+
+    def _remove_quantized_weights(self):
+        ''' Remove the weights which are already quantized from graph initializer list.
+            This function assumes that after quantization, all nodes that previously use a weight:
+                - use output from DequantizeLinear as input if they do not support quantization.
+                - use quantized weight if they support quantization.
+        '''
+        for weight in self._quantized_weights:
+            # Remove existing weight initializer
+            self.model.graph.initializer.remove(weight.initializer)
+
+            # Removing input weight to a convolution
+            try:
+                weight_input = next(val for val in self.model.graph.input if val.name == weight.name)
+            except StopIteration:
+                raise ValueError('invalid weight name {} found in the graph '.format(weight.name))
+            self.model.graph.input.remove(weight_input)
+
+
+    def _update_graph(self, weight):
+        '''
+            Given a weight object, update the graph by doing the following:
+             - remove old initializer, update new initializers for quantized weight, zero point, and scale
+             - remove old weight input, update with new inputs for quantized weight, zero point, and scale
+            This function does NOT update the nodes in the graph, just initializers and inputs
+        '''
+        packed_weight_name = weight.name + '_quantized'
+        scale_name = weight.name + '_scale'
+        zero_point_name = weight.name + '_zero_point'
+
+        # Update packed weight, zero point, and scale initializers
+        packed_weight_np_data = np.asarray(weight.quantized_data,
+            dtype=onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[weight.qType]).reshape(weight.initializer.dims)
+        packed_weight_initializer = onnx.numpy_helper.from_array(packed_weight_np_data, packed_weight_name)
+
+        if weight.axis is not None:
+            zero_scale_shape = [weight.initializer.dims[weight.axis]]
+        else: # scale and zero point must be scalar
+            zero_scale_shape = []
+        zero_point_type = weight.qType
+        scale_initializer = onnx.helper.make_tensor(scale_name, onnx_proto.TensorProto.FLOAT, zero_scale_shape, weight.scales)
+        zero_initializer = onnx.helper.make_tensor(zero_point_name, zero_point_type, zero_scale_shape, weight.zero_points)
+
+        self.model.graph.initializer.extend([packed_weight_initializer, scale_initializer, zero_initializer])
+
+        # Create input for initialized scale and zeros
+        packed_weight_value_info = onnx.helper.make_tensor_value_info(packed_weight_name, weight.qType,
+                                        weight.initializer.dims)
+        scale_value_info = onnx.helper.make_tensor_value_info(scale_name, onnx_proto.TensorProto.FLOAT, zero_scale_shape)
+        zero_point_value_info = onnx.helper.make_tensor_value_info(zero_point_name,
+            zero_point_type, zero_scale_shape) # zero_point is int for dequantize operator
+
+        self.model.graph.input.extend([packed_weight_value_info, scale_value_info, zero_point_value_info])
+
+        self._quantized_weights.append(weight)
+
+    def _get_quantized_weight(self, initializer, qType):
+        '''
+            :param initializer: TensorProto initializer
+            :param qType: type to quantize to
+            :return: Weight class with quantization information
+        '''
+        weights_data = self.find_weight_data(initializer)
+        rmin, rmax, zero_point, scale, quantized_weights_data = quantize_data(weights_data.flatten().tolist(),
+            _get_qrange_for_qType(qType), mode=DataQuantizationMode.mode_for_data_type(qType))
+        weight = Weight(initializer.name, initializer, [rmin], [rmax], [zero_point], [scale],
+                        weights_data, quantized_weights_data, axis=None, qType=qType)
+        return weight
+
+    def _get_quantized_weight_convolution(self, initializer, qType):
+        '''
+            :param initializer: initializer TensorProto to quantize
+            :param qType: type to quantize to
+            :return: Weight class object with quantization information for a given initializer
+        '''
+        if not self.per_channel:
+            return self._get_quantized_weight(initializer, qType)
+
+        weights = self.find_weight_data(initializer)
+        # Quantize per output channel
+        # Assuming (M x C/group x k1 x ... x kn) format where M is number of output channels.
+        channel_count = initializer.dims[0]
+        np_data = np.reshape(weights, initializer.dims)
+        rmin_list = []
+        rmax_list = []
+        zero_point_list = []
+        scale_list = []
+        quantized_per_channel_data_list = []
+        for i in range(channel_count):
+            # for each channel, compute quantization data. Index only axis 0 so kernels of any rank are accepted.
+            per_channel_data = np_data[i].flatten()
+            rmin, rmax, zero_point, scale, quantized_per_channel_data = quantize_data(per_channel_data.tolist(),
+                _get_qrange_for_qType(qType), mode=DataQuantizationMode.mode_for_data_type(qType))
+            rmin_list.append(rmin)
+            rmax_list.append(rmax)
+            zero_point_list.append(zero_point)
+            scale_list.append(scale)
+            quantized_per_channel_data_list.append(quantized_per_channel_data)
+        channel_index = 0 # output channel axis (M)
+        # combine per_channel_data into one
+        reshape_dims = list(initializer.dims)  # copy, so initializer.dims itself is not modified
+        reshape_dims[channel_index] = 1  # only one per channel for reshape
+        quantized_weights = np.asarray(quantized_per_channel_data_list[0]).reshape(reshape_dims)
+        for i in range(1, len(quantized_per_channel_data_list)):
+            channel_weights = np.asarray(quantized_per_channel_data_list[i]).reshape(reshape_dims)
+            quantized_weights = np.concatenate((quantized_weights, channel_weights), axis=0)
+
+        weight = Weight(initializer.name, initializer, rmin_list, rmax_list, zero_point_list,
+                        scale_list, weights, quantized_weights.flatten().tolist(), channel_index, qType)
+        return weight
+
+    def _get_dynamic_input_quantization_params(self, input_name, nodes_list, qType):
+        '''
+        Create nodes for dynamic quantization of input and add them to nodes_list.
+            parameter input_name: Name of the input.
+            parameter nodes_list: new nodes are appended to this list.
+            parameter qType: type to quantize to.
+            return: scale_name, zero_point_name, scale_shape, zero_point_shape.
+        '''
+        mode = DataQuantizationMode.mode_for_data_type(qType)
+        if mode == DataQuantizationMode.Linear_Scaled:
+            return self._get_dynamic_input_quantization_params_scaled(input_name, nodes_list)
+
+        return self._get_dynamic_input_quantization_params_non_scaled(input_name, nodes_list)
+
+    def _get_dynamic_input_quantization_params_scaled(self, input_name, nodes_list):
+        '''
+        Create nodes for dynamic quantization of input and add them to nodes_list
+        in DataQuantizationMode.Linear_Scaled
+            parameter input_name: Name of the input.
+            parameter nodes_list: new nodes are appended to this list.
+            return: scale_name, zero_point_name, scale_shape, zero_point_shape.
+        '''
+        qType = onnx_proto.TensorProto.INT8
+
+        # Reduce min and Reduce max
+        input_scale_name = input_name + "_scale"
+
+        reduce_min_name = input_name + "_ReduceMin"
+        reduce_min_node = onnx.helper.make_node("ReduceMin", [input_name],
+            [reduce_min_name + ":0"], reduce_min_name, keepdims=0)
+        nodes_list.append(reduce_min_node)
+
+        reduce_max_name = input_name + "_ReduceMax"
+        reduce_max_node = onnx.helper.make_node("ReduceMax", [input_name],
+            [reduce_max_name + ":0"], reduce_max_name, keepdims=0)
+        nodes_list.append(reduce_max_node)
+
+        # Compute scale
+        #   Find abs(rmin)
+        reduce_min_abs_name = reduce_min_name + "_Abs"
+        reduce_min_abs_node = onnx.helper.make_node("Abs", [reduce_min_node.output[0]],
+            [reduce_min_abs_name + ":0"], reduce_min_abs_name)
+        nodes_list.append(reduce_min_abs_node)
+        #   Find abs(rmax)
+        reduce_max_abs_name = reduce_max_name + "_Abs"
+        reduce_max_abs_node = onnx.helper.make_node("Abs", [reduce_max_node.output[0]],
+            [reduce_max_abs_name + ":0"], reduce_max_abs_name)
+        nodes_list.append(reduce_max_abs_node)
+        #   Compute max of abs(rmin) and abs(rmax)
+        abs_max_name = input_name + "_Abs_Max"
+        abs_max_node = onnx.helper.make_node("Max", [reduce_min_abs_node.output[0], reduce_max_abs_node.output[0]],
+            [abs_max_name + ":0"], abs_max_name)
+        nodes_list.append(abs_max_node)
+        #   and divide by (quantize_range/2.0) which will be equal to max(...)*2.0/quantize_range
+        _add_initializer_if_not_present(self.model.graph, self.fixed_qrange_scaled_name,
+            [_get_qrange_for_qType(qType)/2.0], [], onnx_proto.TensorProto.FLOAT)
+        scale_div_name = input_name + "_scale_Div"
+        scale_div_node = onnx.helper.make_node("Div", [abs_max_node.output[0], self.fixed_qrange_scaled_name],
+            [input_scale_name], scale_div_name)
+        nodes_list.append(scale_div_node)
+
+        # Zero point
+        _add_initializer_if_not_present(self.model.graph, self.fixed_zero_zp_name,
+            [0], [], qType)
+
+        return input_scale_name, self.fixed_zero_zp_name, [], []
+
+    def _get_dynamic_input_quantization_params_non_scaled(self, input_name, nodes_list):
+        '''
+        Create nodes for dynamic quantization of input and add them to nodes_list
+        in DataQuantizationMode.Linear_NonScaled
+            parameter input_name: Name of the input.
+            parameter nodes_list: new nodes are appended to this list.
+            return: scale_name, zero_point_name, scale_shape, zero_point_shape.
+        '''
+        qType = onnx_proto.TensorProto.UINT8
+        # Reduce min and Reduce max
+        input_scale_name = input_name + "_scale"
+        input_zp_name = input_name + "_zero_point"
+
+        reduce_min_name = input_name + "_ReduceMin"
+        reduce_min_node = onnx.helper.make_node("ReduceMin", [input_name],
+            [reduce_min_name + ":0"], reduce_min_name, keepdims=0)
+        nodes_list.append(reduce_min_node)
+
+        reduce_max_name = input_name + "_ReduceMax"
+        reduce_max_node = onnx.helper.make_node("ReduceMax", [input_name],
+            [reduce_max_name + ":0"], reduce_max_name, keepdims=0)
+        nodes_list.append(reduce_max_node)
+
+        # Add tensors for quantize range and zero value.
+        _add_initializer_if_not_present(self.model.graph, self.fixed_qrange_non_scaled_name,
+            [_get_qrange_for_qType(qType)], [], onnx_proto.TensorProto.FLOAT)
+        _add_initializer_if_not_present(self.model.graph, self.fixed_zero_name,
+            [0.0], [], onnx_proto.TensorProto.FLOAT)
+
+        # Compute Scale
+        #   Subtract rmax and rmin
+        scale_sub_name = input_name + "_scale_Sub"
+        scale_sub_node = onnx.helper.make_node("Sub", [reduce_max_node.output[0], reduce_min_node.output[0]],
+            [scale_sub_name + ":0"], scale_sub_name)
+        nodes_list.append(scale_sub_node)
+        #   and divide by quantize range
+        scale_div_name = input_name + "_scale_Div"
+        scale_div_node = onnx.helper.make_node("Div", [scale_sub_node.output[0], self.fixed_qrange_non_scaled_name],
+            [input_scale_name], scale_div_name)
+        nodes_list.append(scale_div_node)
+
+        # Compute zero point
+        #   Subtract zero and rmin
+        zp_sub_name = input_name + "_zero_point_Sub"
+        zp_sub_node = onnx.helper.make_node("Sub", [self.fixed_zero_name, reduce_min_node.output[0]],
+            [zp_sub_name + ":0"], zp_sub_name)
+        nodes_list.append(zp_sub_node)
+        #   Divide by scale
+        zp_div_name = input_name + "_zero_point_Div"
+        zp_div_node = onnx.helper.make_node("Div", [zp_sub_node.output[0], input_scale_name],
+            [zp_div_name + ":0"], zp_div_name)
+        nodes_list.append(zp_div_node)
+        #   Compute floor
+        zp_floor_name = input_name + "_zero_point_Floor"
+        zp_floor_node = onnx.helper.make_node("Floor", zp_div_node.output,
+            [zp_floor_name + ":0"], zp_floor_name)
+        nodes_list.append(zp_floor_node)
+        #   Cast to integer
+        zp_cast_name = input_name + "_zero_point_Cast"
+        zp_cast_node = onnx.helper.make_node("Cast", zp_floor_node.output,
+            [input_zp_name], zp_cast_name, to=qType)
+        nodes_list.append(zp_cast_node)
+
+        return input_scale_name, input_zp_name, [], []
+
+    def _get_static_input_quantization_params(self, input_name, qType):
+        '''
+        Create initializers and inputs in the graph for static quantization of input.
+
+        Zero point and scale values are obtained from self.input_quantization_params if specified.
+        ValueError is thrown otherwise.
+
+            parameter input_name: Name of the input.
+            parameter qType: type to quantize to.
+            return: scale_name, zero_point_name, scale_shape, zero_point_shape.
+        '''
+        if self.input_quantization_params is None or input_name not in self.input_quantization_params:
+            raise ValueError("Quantization parameters are not specified for input {}.".format(input_name))
+        params = self.input_quantization_params[input_name]
+        if params is None or len(params) != 2:
+            raise ValueError("Quantization parameters should contain zero point and scale. "
+                "Specified values for input {}: {}".format(input_name, params))
+
+        if not np.isscalar(params[0]):
+            raise ValueError("Zero point for input {} should be a scalar value. Value specified: {}".format(
+                input_name, params[0]))
+        if not np.isscalar(params[1]):
+            raise ValueError("Scale for input {} should be a scalar value. Value specified: {}".format(
+                input_name, params[1]))
+
+        zero_point_values = [params[0].item()]
+        zero_point_shape = []
+        zero_point_name = input_name + "_zero_point"
+
+        zero_point_type = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[params[0].dtype]
+        if zero_point_type != qType:
+            raise ValueError("Zero point and input data types should be the same. "
+                "Zero point for input {} is specified as {}, but input is being quantized to {}."
+                .format(input_name, params[0].dtype, onnx.mapping.TENSOR_TYPE_TO_NP_TYPE[qType]))
+
+        scale_values = [params[1].item()]
+        scale_shape = []
+        scale_name = input_name + "_scale"
+
+        # Add initializers
+        _add_initializer_if_not_present(self.model.graph, zero_point_name, zero_point_values,
+            zero_point_shape, qType)
+        _add_initializer_if_not_present(self.model.graph, scale_name, scale_values,
+            scale_shape, onnx_proto.TensorProto.FLOAT)
+
+        return scale_name, zero_point_name, scale_shape, zero_point_shape
+
+    def _get_output_quantization_params(self, output_name):
+        '''
+        Create initializers and inputs in the graph for zero point and scale of output.
+        Used when mode is QuantizationMode.QLinearOps.
+
+        Zero point and scale values are obtained from self.output_quantization_params if specified.
+        ValueError is thrown otherwise.
+
+            parameter output_name: Name of the output.
+            return: scale_name, zero_point_name, scale_shape, zero_point_shape.
+        '''
+        if self.output_quantization_params is None or output_name not in self.output_quantization_params:
+            raise ValueError("Quantization parameters are not specified for output {}.".format(output_name))
+        params = self.output_quantization_params[output_name]
+        if params is None or len(params) != 2:
+            raise ValueError("Quantization parameters should contain zero point and scale. "
+                "Specified values for output {}: {}".format(output_name, params))
+
+        if not np.isscalar(params[0]):
+            raise ValueError("Zero point for output {} should be a scalar value. Value specified: {}".format(
+                output_name, params[0]))
+        if not np.isscalar(params[1]):
+            raise ValueError("Scale for output {} should be a scalar value. Value specified: {}".format(
+                output_name, params[1]))
+
+        zero_point_values = [params[0].item()]
+        zero_point_shape = []
+        zero_point_name = output_name + "_zero_point"
+        zero_point_type = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[params[0].dtype]
+
+        scale_values = [params[1].item()]
+        scale_shape = []
+        scale_name = output_name + "_scale"
+
+        # Add initializers
+        _add_initializer_if_not_present(self.model.graph, zero_point_name, zero_point_values, zero_point_shape,
+            zero_point_type)
+        _add_initializer_if_not_present(self.model.graph, scale_name, scale_values, scale_shape,
+            onnx_proto.TensorProto.FLOAT)
+
+        return scale_name, zero_point_name, scale_shape, zero_point_shape
+
+    def _get_quantize_input_nodes(self, node, input_index, qType):
+        '''
+        Given a input for a node (which is not a initializer), this function
+            - add elements to graph to compute zero point and scale for this input.
+            - add new QuantizeLinear nodes to quantize the input.
+
+            parameter node: node being quantized in NodeProto format.
+            parameter input_index: index of input in node.input.
+            parameter qType: type to quantize to.
+            return: List of newly created nodes in NodeProto format.
+        '''
+        input_name = node.input[input_index]
+
+        nodes = []
+        if self.static:
+            scale_name, zp_name, scale_shape, zp_shape = \
+                self._get_static_input_quantization_params(input_name, qType)
+        else:
+            scale_name, zp_name, scale_shape, zp_shape = \
+                self._get_dynamic_input_quantization_params(input_name, nodes, qType)
+
+        # Add QuantizeLinear Node
+        output_name = input_name + "_quantized"
+        qlinear_node = onnx.helper.make_node("QuantizeLinear", [input_name, scale_name, zp_name],
+            [output_name], input_name + "_QuantizeLinear")
+        return nodes + [qlinear_node]
+
+    def _update_unsupported_nodes_using_weight(self, weight, new_nodes_list):
+        '''Find all nodes using a weight that do not support quantization and
+        add a DequantizeLinear node before those nodes. This includes all nodes except Conv, MatMul.
+
+            parameter weight: Weight object
+            parameter new_nodes_list: List of new nodes created before processing current node.
+            return: List of new nodes created.
+        '''
+        nodes_using_weight = _find_nodes_using_initializer(self.model.graph, weight.initializer)
+        unsupported_nodes = [node for node in nodes_using_weight if node.op_type not in ["Conv", "MatMul"]]
+
+        nodes_list = []
+        dequantize_linear_name = weight.name + "_DequantizeLinear"
+        output_name = weight.name + "_dequantized"
+
+        # Check if DequantizeLinear node needs to be added to graph.
+        if len(unsupported_nodes) != 0 and \
+            _find_node_by_name(dequantize_linear_name, self.model.graph, new_nodes_list) is None:
+            inputs = [weight.name + "_quantized", weight.name + "_scale", weight.name + "_zero_point"]
+            node = onnx.helper.make_node("DequantizeLinear", inputs, [output_name],
+                                         dequantize_linear_name)
+            nodes_list.append(node)
+
+        # Update unsupported nodes to take dequantized weight as input.
+        for node in unsupported_nodes:
+            for i, node_input in enumerate(node.input):
+                if node_input == weight.name:
+                    node.input[i] = output_name
+
+        return nodes_list
+
+    def _is_quantized(self, weight):
+        '''
+        Check if this weight is already quantized to the expected type and quantization axis.
+        If it is already quantized to (type, axis) different from expected values,
+        this function will throw an exception and stop the quantization.
+
+            parameter weight: Weight object.
+            return: Boolean indicating if quantized weight is already added to graph.
+        '''
+        quantized_initializer_name = weight.name + "_quantized"
+        quantized_initializer = _find_by_name(quantized_initializer_name, self.model.graph.initializer)
+        zero_point = _find_by_name(weight.name + "_zero_point", self.model.graph.initializer)
+        if quantized_initializer is None:
+            return False
+
+        # Compare type
+        if quantized_initializer.data_type != weight.qType:
+            raise ValueError("{} is being used by multiple nodes which are being quantized to different types. "
+                "Please use different initializers for these nodes.", weight.name)
+
+        expected_dims = [] if weight.axis is None else [len(weight.zero_points)]
+        # Compare quantization axis
+        if zero_point.dims != expected_dims:
+            raise ValueError("{} is being used by multiple nodes which are being quantized to different shapes. "
+                "Please use different initializers for these nodes.", weight.name)
+
+        return True
+
+    def _quantize_inputs(self, node, indices, weight_index, new_nodes_list):
+        '''
+        Given a node, this function quantizes the inputs as follows:
+            - If input is a initializer, quantize the initializer data, replace old initializer
+              with new initializer
+            - Else, add QuantizeLinear nodes to perform quantization
+
+            parameter node: node being quantized in NodeProto format.
+            parameter indices: input indices to quantize.
+            parameter weight_index: index of weight input.
+                                    In Asymmetric mode, this input is quantized into signed integer.
+            parameter new_nodes_list: List of new nodes created before processing this node. This is used to
+                                      check that two QuantizeLinear nodes are not being added for same input.
+            return: (List of quantized input names,
+                     List of zero point names used for input quantization,
+                     List of scale names used for input quantization,
+                     List of new QuantizeLinear nodes created)
+        '''
+        assert (node.op_type == "Conv" or node.op_type == "MatMul")
+
+        quantized_input_names = []
+        zero_point_names = []
+        scale_names = []
+        nodes = []
+
+        for input_index in indices:
+            qType = self.weight_qType if input_index == weight_index else self.input_qType
+            node_input = node.input[input_index]
+            initializer = _find_by_name(node_input, self.model.graph.initializer)
+            if initializer is not None:
+                # Quantize the data
+                if node.op_type == "Conv" and input_index == weight_index:
+                    weight = self._get_quantized_weight_convolution(initializer, qType)
+                else:
+                    weight = self._get_quantized_weight(initializer, qType)
+
+                if not self._is_quantized(weight):
+                    nodes.extend(self._update_unsupported_nodes_using_weight(weight, new_nodes_list))
+                    self._update_graph(weight)
+
+                quantized_input_names.append(weight.name + "_quantized")
+                zero_point_names.append(weight.name + "_zero_point")
+                scale_names.append(weight.name + "_scale")
+            else:
+                # Not an initializer input. Add QuantizeLinear node.
+                # Find if there is already a quantizeLinear node for this input
+                qlinear_node = _find_node_by_name(node_input + "_QuantizeLinear", self.model.graph, new_nodes_list)
+                if qlinear_node is None:
+                    quantize_input_nodes = self._get_quantize_input_nodes(node, input_index, qType)
+                    nodes.extend(quantize_input_nodes)
+                    qlinear_node = quantize_input_nodes[-1]
+
+                quantized_input_names.extend(qlinear_node.output)
+                scale_names.append(qlinear_node.input[1])
+                zero_point_names.append(qlinear_node.input[2])
+
+        return (quantized_input_names, zero_point_names, scale_names, nodes)
+
+    def _quantize_convolution_integer_ops(self, node, new_nodes_list):
+        '''
+        Used when self.mode is QuantizationMode.IntegerOps.
+            parameter node: Conv node.
+            parameter new_nodes_list: List of new nodes created before processing this node.
+            return: a list of nodes in topological order that represents quantized Conv node.
+        '''
+        assert (node.op_type == "Conv")
+
+        (quantized_input_names, zero_point_names, scale_names, nodes) = \
+            self._quantize_inputs(node, [0, 1], 1, new_nodes_list)
+
+        conv_integer_output = node.output[0] + "_quantized"
+        conv_integer_name = ""
+        if node.name != "":
+            conv_integer_name = node.name + "_quant"
+        kwargs = {}
+        for attribute in node.attribute:
+            kwargs.update(_attribute_to_kwarg(attribute))
+        conv_integer_node = onnx.helper.make_node("ConvInteger", quantized_input_names + zero_point_names,
+            [conv_integer_output], conv_integer_name, **kwargs)
+        nodes.append(conv_integer_node)
+
+        # Add cast operation to cast convInteger output to float.
+        cast_op_output = conv_integer_output + "_cast_output"
+        cast_node = onnx.helper.make_node("Cast", [conv_integer_output], [cast_op_output],
+            conv_integer_output + "_cast", to=onnx_proto.TensorProto.FLOAT)
+        nodes.append(cast_node)
+
+        # Add mul operation to multiply scales of two inputs.
+        assert (len(scale_names) == 2)
+        if conv_integer_name != "":
+            scales_mul_op = conv_integer_name + "_scales_mul"
+        else:
+            scales_mul_op = scale_names[0] + "_" + scale_names[1] + "_mul"
+
+        scales_mul_node = _find_node_by_name(scales_mul_op, self.model.graph, new_nodes_list)
+        if scales_mul_node is None:
+            scales_mul_node = _get_mul_node(scale_names, scales_mul_op + ":0", scales_mul_op)
+            nodes.append(scales_mul_node)
+
+        scales_mul_op_output = scales_mul_node.output[0]
+
+        # Add mul operation to multiply mul_scales_op result with output of ConvInteger
+        # and make the output of this node the same as output of original conv node.
+        output_scale_mul_op = ""
+        if conv_integer_name != "":
+            output_scale_mul_op = conv_integer_name + "_output_scale_mul"
+        nodes.append(_get_mul_node([cast_op_output, scales_mul_op_output], node.output[0], output_scale_mul_op))
+        return nodes
+
+    def _quantize_matmul_integer_ops(self, node, new_nodes_list):
+        '''
+        Used when self.mode is QuantizationMode.IntegerOps.
+            parameter node: MatMul node.
+            parameter new_nodes_list: List of new nodes created before processing this node.
+            return: a list of nodes in topological order that represents quantized MatMul node.
+        '''
+        assert (node.op_type == "MatMul")
+
+        (quantized_input_names, zero_point_names, scale_names, nodes) = \
+            self._quantize_inputs(node, [0, 1], 1, new_nodes_list)
+
+        matmul_integer_output = node.output[0] + "_quantized"
+        matmul_integer_name = ""
+        if node.name != "":
+            matmul_integer_name = node.name + "_quant"
+        matmul_integer_node = onnx.helper.make_node("MatMulInteger", quantized_input_names + zero_point_names,
+            [matmul_integer_output], matmul_integer_name)
+        nodes.append(matmul_integer_node)
+
+        # Add cast operation to cast matmulInteger output to float.
+        cast_op_output = matmul_integer_output + "_cast_output"
+        cast_node = onnx.helper.make_node("Cast", [matmul_integer_output], [cast_op_output],
+            matmul_integer_output + "_cast", to=onnx_proto.TensorProto.FLOAT)
+        nodes.append(cast_node)
+
+        # Add mul operation to multiply scales of two inputs.
+        assert (len(scale_names) == 2)
+        if matmul_integer_name != "":
+            scales_mul_op = matmul_integer_name + "_scales_mul"
+        else:
+            scales_mul_op = scale_names[0] + "_" + scale_names[1] + "_mul"
+
+        scales_mul_node = _find_node_by_name(scales_mul_op, self.model.graph, new_nodes_list)
+        if scales_mul_node is None:
+            scales_mul_node = _get_mul_node(scale_names, scales_mul_op + ":0", scales_mul_op)
+            nodes.append(scales_mul_node)
+
+        scales_mul_op_output = scales_mul_node.output[0]
+
+        # Add mul operation to multiply mul_scales_op result with output of MatMulInteger
+        # and make the output of this node the same as output of original matmul node.
+        output_scale_mul_op = ""
+        if matmul_integer_name != "":
+            output_scale_mul_op = matmul_integer_name + "_output_scale_mul"
+        nodes.append(_get_mul_node([cast_op_output, scales_mul_op_output], node.output[0],
+            output_scale_mul_op))
+        return nodes
+
+    def _quantize_convolution_qlinear_ops(self, node, new_nodes_list):
+        '''
+        Used when self.mode is QuantizationMode.QLinearOps.
+            parameter node: Conv node.
+            parameter new_nodes_list: List of new nodes created before processing this node.
+            return: a list of nodes in topological order that represents quantized Conv node.
+        '''
+        assert (node.op_type == "Conv")
+
+        (quantized_input_names, zero_point_names, scale_names, nodes) = \
+            self._quantize_inputs(node, [0, 1], 1, new_nodes_list)
+
+        output_scale_name, output_zp_name, output_scale_shape, output_zp_shape = \
+            self._get_output_quantization_params(node.output[0])
+
+        qlinear_conv_output = node.output[0] + "_quantized"
+        qlinear_conv_name = ""
+        if node.name != "":
+            qlinear_conv_name = node.name + "_quant"
+        kwargs = {}
+        for attribute in node.attribute:
+            kwargs.update(_attribute_to_kwarg(attribute))
+        qlinear_conv_inputs = []
+        # Input 0
+        qlinear_conv_inputs.append(quantized_input_names[0])
+        qlinear_conv_inputs.append(scale_names[0])
+        qlinear_conv_inputs.append(zero_point_names[0])
+        # Input 1
+        qlinear_conv_inputs.append(quantized_input_names[1])
+        qlinear_conv_inputs.append(scale_names[1])
+        qlinear_conv_inputs.append(zero_point_names[1])
+        # Output
+        qlinear_conv_inputs.append(output_scale_name)
+        qlinear_conv_inputs.append(output_zp_name)
+
+        qlinear_conv_node = onnx.helper.make_node("QLinearConv", qlinear_conv_inputs,
+            [qlinear_conv_output], qlinear_conv_name, **kwargs)
+        nodes.append(qlinear_conv_node)
+
+        # Add DequantizeLinear node.
+        dqlinear_name = node.output[0] + "_DequantizeLinear"
+        dqlinear_inputs = [qlinear_conv_output, output_scale_name, output_zp_name]
+        dqlinear_node = onnx.helper.make_node("DequantizeLinear", dqlinear_inputs, [node.output[0]], dqlinear_name)
+        nodes.append(dqlinear_node)
+        return nodes
+
+    def _quantize_matmul_qlinear_ops(self, node, new_nodes_list):
+        '''
+        Used when self.mode is QuantizationMode.QLinearOps.
+            parameter node: MatMul node.
+            parameter new_nodes_list: List of new nodes created before processing this node.
+            return: a list of nodes in topological order that represents quantized Conv node.
+        '''
+        assert (node.op_type == "MatMul")
+
+        (quantized_input_names, zero_point_names, scale_names, nodes) = \
+            self._quantize_inputs(node, [0, 1], 1, new_nodes_list)
+
+        output_scale_name, output_zp_name, output_scale_shape, output_zp_shape = \
+            self._get_output_quantization_params(node.output[0])
+
+        qlinear_matmul_output = node.output[0] + "_quantized"
+        qlinear_matmul_name = ""
+        if node.name != "":
+            qlinear_matmul_name = node.name + "_quant"
+
+        qlinear_matmul_inputs = []
+        # Input 0
+        qlinear_matmul_inputs.append(quantized_input_names[0])
+        qlinear_matmul_inputs.append(scale_names[0])
+        qlinear_matmul_inputs.append(zero_point_names[0])
+        # Input 1
+        qlinear_matmul_inputs.append(quantized_input_names[1])
+        qlinear_matmul_inputs.append(scale_names[1])
+        qlinear_matmul_inputs.append(zero_point_names[1])
+        # Output
+        qlinear_matmul_inputs.append(output_scale_name)
+        qlinear_matmul_inputs.append(output_zp_name)
+
+        qlinear_matmul_node = onnx.helper.make_node("QLinearMatMul", qlinear_matmul_inputs,
+            [qlinear_matmul_output], qlinear_matmul_name)
+        nodes.append(qlinear_matmul_node)
+
+        # Add DequantizeLinear node.
+        dqlinear_name = node.output[0] + "_DequantizeLinear"
+        dqlinear_inputs = [qlinear_matmul_output, output_scale_name, output_zp_name]
+        dqlinear_node = onnx.helper.make_node("DequantizeLinear", dqlinear_inputs, [node.output[0]], dqlinear_name)
+        nodes.append(dqlinear_node)
+        return nodes
+
+    def _quantize_convolution(self, node, new_nodes_list):
+        '''
+            https://github.com/onnx/onnx/blob/master/docs/Operators.md#Conv
+            :param node: Conv node
+            :param new_nodes_list: List of new nodes created before processing this node.
+            :return: a list of nodes in topological order that represents quantized Conv node
+        '''
+        assert (node.op_type == "Conv")
+
+        if self.mode == QuantizationMode.IntegerOps:
+            return self._quantize_convolution_integer_ops(node, new_nodes_list)
+
+        if self.mode == QuantizationMode.QLinearOps:
+            return self._quantize_convolution_qlinear_ops(node, new_nodes_list)
+
+        return [node]
+
+    def _quantize_matmul(self, node, new_nodes_list):
+        '''
+            https://github.com/onnx/onnx/blob/master/docs/Operators.md#MatMul
+            :param node: MatMul node
+            :param new_nodes_list: List of new nodes created before processing this node.
+            :return: a list of nodes in topological order that represents quantized MatMul node
+        '''
+        assert(node.op_type == 'MatMul')
+
+        if self.mode == QuantizationMode.IntegerOps:
+            return self._quantize_matmul_integer_ops(node, new_nodes_list)
+
+        if self.mode == QuantizationMode.QLinearOps:
+            return self._quantize_matmul_qlinear_ops(node, new_nodes_list)
+
+        return [node]
+
+
+def quantize(model, per_channel=True, nbits=8, quantization_mode=QuantizationMode.IntegerOps,
+    static=False, asymmetric_input_types=False, input_quantization_params=None, output_quantization_params=None):
+    '''
+        Given an onnx model, create a quantized onnx model and save it into a file
+
+    :param model: ModelProto to quantize
+    :param per_channel: quantize weights per channel
+    :param nbits: number of bits to represent quantized data. Currently only supporting 8-bit types
+    :param quantization_mode: Can be one of the QuantizationMode types.
+        IntegerOps:
+            the function will use integer ops. Only ConvInteger and MatMulInteger ops are supported now.
+        QLinearOps:
+            the function will use QLinear ops. Only QLinearConv and QLinearMatMul ops are supported now.
+    :param static:
+        True: The inputs/activations are quantized using static scale and zero point values
+              specified through input_quantization_params.
+        False: The inputs/activations are quantized using dynamic scale and zero point values
+               computed while running the model.
+    :param asymmetric_input_types:
+        True: Weights are quantized into signed integers and inputs/activations into unsigned integers.
+        False: Weights and inputs/activations are quantized into unsigned integers.
+    :param input_quantization_params:
+        Dictionary to specify the zero point and scale values for inputs to conv and matmul nodes.
+        Should be specified when static is set to True.
+        The input_quantization_params should be specified in the following format:
+            {
+                "input_name": [zero_point, scale]
+            }.
+        zero_point should be of type np.uint8 and scale should be of type np.float32.
+        example:
+            {
+                'resnet_model/Relu_1:0': [np.uint8(0), np.float32(0.019539741799235344)],
+                'resnet_model/Relu_2:0': [np.uint8(0), np.float32(0.011359662748873234)]
+            }
+    :param output_quantization_params:
+        Dictionary to specify the zero point and scale values for outputs of conv and matmul nodes.
+        Should be specified in QuantizationMode.QLinearOps mode.
+        The output_quantization_params should be specified in the following format:
+            {
+                "output_name": [zero_point, scale]
+            }
+        zero_point can be of type np.uint8/np.int8 and scale should be of type np.float32.
+        example:
+            {
+                'resnet_model/Relu_3:0': [np.int8(0), np.float32(0.011359662748873234)],
+                'resnet_model/Relu_4:0': [np.uint8(0), np.float32(0.011359662748873234)]
+            }
+    :return: ModelProto with quantization
+    '''
+    if nbits == 8:
+        input_qType = onnx_proto.TensorProto.UINT8
+        weight_qType = onnx_proto.TensorProto.INT8 if asymmetric_input_types else onnx_proto.TensorProto.UINT8
+        mode = quantization_mode
+        copy_model = onnx_proto.ModelProto()
+        copy_model.CopyFrom(model)
+        quantizer = ONNXQuantizer(copy_model, per_channel, mode, static, weight_qType, input_qType,
+                        input_quantization_params, output_quantization_params)
+        quantizer.quantize_model()
+        quantizer.model.producer_name = __producer__
+        quantizer.model.producer_version = __version__
+        return quantizer.model
+    else:
+        raise ValueError('Unknown value for nbits. only 8 bit quantization is currently supported')
\ No newline at end of file