onnxruntime/onnxruntime/python/tools/quantization/quantize.py
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
import onnx
from pathlib import Path
from onnx import onnx_pb as onnx_proto
from onnxruntime import SessionOptions, InferenceSession, GraphOptimizationLevel

from .quant_utils import QuantizationMode, generate_identified_filename
from .quant_utils import QuantType
from .registry import QLinearOpsRegistry, IntegerOpsRegistry
from .onnx_quantizer import ONNXQuantizer
from .calibrate import CalibrationDataReader, calibrate

def optimize_model(model_path: Path):
    '''
    Generate a model that applies graph optimizations (constant folding, etc.)
    :param model_path: path to the original onnx model
    :return: optimized onnx model
    '''
    opt_model_path = generate_identified_filename(model_path, "-opt")
    sess_option = SessionOptions()
    sess_option.optimized_model_filepath = opt_model_path.as_posix()
    sess_option.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_BASIC
    _ = InferenceSession(model_path.as_posix(), sess_option)
    optimized_model = onnx.load(opt_model_path.as_posix())
    return optimized_model
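
# Illustrative sketch (not part of the original module): creating the session
# above makes ORT write the basic-level optimized graph to "<name>-opt.onnx"
# next to the original, which is then loaded back. The path below is an
# assumption for the example.
#
#   optimized = optimize_model(Path("model.onnx"))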


def quantize(model,
             per_channel=False,
             nbits=8,
             quantization_mode=QuantizationMode.IntegerOps,
             static=False,
             force_fusions=False,
             symmetric_activation=False,
             symmetric_weight=False,
             quantization_params=None,
             nodes_to_quantize=None,
             nodes_to_exclude=None,
             op_types_to_quantize=[]):
    '''
    Given an onnx model, create a quantized onnx model and save it into a file
    :param model: ModelProto to quantize
    :param per_channel: quantize weights per channel
    :param nbits: number of bits to represent quantized data. Only 8 and 7 bits are currently supported.
    :param quantization_mode: Can be one of the QuantizationMode types.
        IntegerOps:
            the function will use integer ops. Only ConvInteger and MatMulInteger ops are supported now.
        QLinearOps:
            the function will use QLinear ops. Only QLinearConv and QLinearMatMul ops are supported now.
    :param static:
        True: The inputs/activations are quantized using static scale and zero point values
              specified through quantization_params.
        False: The inputs/activations are quantized using dynamic scale and zero point values
               computed while running the model.
    :param force_fusions: unused in this implementation; kept for backward compatibility.
    :param symmetric_activation:
        True: activations are quantized into signed integers.
        False: activations are quantized into unsigned integers.
    :param symmetric_weight:
        True: weights are quantized into signed integers.
        False: weights are quantized into unsigned integers.
    :param quantization_params:
        Dictionary to specify the zero point and scale values for inputs to conv and matmul nodes.
        Should be specified when static is set to True.
        The quantization_params should be specified in the following format:
            {
                "input_name": [zero_point, scale]
            }
        zero_point should be of type np.uint8 and scale should be of type np.float32.
        example:
            {
                'resnet_model/Relu_1:0': [np.uint8(0), np.float32(0.019539741799235344)],
                'resnet_model/Relu_2:0': [np.uint8(0), np.float32(0.011359662748873234)]
            }
    :param nodes_to_quantize:
        List of node names to quantize. When this list is not None, only the nodes in it
        are quantized.
        example:
            [
                'Conv__224',
                'Conv__252'
            ]
    :param nodes_to_exclude:
        List of node names to exclude. When this list is not None, the nodes in it will be
        excluded from quantization.
    :param op_types_to_quantize:
        specify the types of operators to quantize, like ['Conv'] to quantize Conv only.
        It quantizes all supported operators by default.
    :return: ModelProto with quantization
    '''
print("Warning: onnxruntime.quantization.quantize is deprecated.\n\
Please use quantize_static for static quantization, quantize_dynamic for dynamic quantization.")
if nbits == 8 or nbits == 7:
input_qType = onnx_proto.TensorProto.INT8 if symmetric_activation else onnx_proto.TensorProto.UINT8
weight_qType = onnx_proto.TensorProto.INT8 if symmetric_weight else onnx_proto.TensorProto.UINT8
mode = quantization_mode
copy_model = onnx_proto.ModelProto()
copy_model.CopyFrom(model)
if not op_types_to_quantize or len(op_types_to_quantize) == 0:
op_types_to_quantize = list(QLinearOpsRegistry.keys()) if static else list(IntegerOpsRegistry.keys())
quantizer = ONNXQuantizer(copy_model, per_channel, nbits == 7, mode, static, weight_qType, input_qType, quantization_params,
nodes_to_quantize, nodes_to_exclude, op_types_to_quantize)
quantizer.quantize_model()
return quantizer.model.model
else:
raise ValueError('Only 8 and 7 bit quantization is currently supported')


def quantize_static(model_input,
                    model_output,
                    calibration_data_reader: CalibrationDataReader,
                    op_types_to_quantize=[],
                    per_channel=False,
                    reduce_range=False,
                    activation_type=QuantType.QUInt8,
                    weight_type=QuantType.QUInt8,
                    nodes_to_quantize=[],
                    nodes_to_exclude=[],
                    use_external_data_format=False):
    '''
    Given an onnx model and a calibration data reader, create a quantized onnx model and save it into a file
    :param model_input: file path of the model to quantize
    :param model_output: file path of the quantized model
    :param calibration_data_reader: a calibration data reader. It enumerates calibration data and generates
        inputs for the original model.
    :param op_types_to_quantize:
        specify the types of operators to quantize, like ['Conv'] to quantize Conv only.
        It quantizes all supported operators by default.
    :param per_channel: quantize weights per channel
    :param reduce_range:
        quantize weights with 7 bits. It may improve the accuracy for some models running on non-VNNI machines,
        especially for per-channel mode
    :param activation_type: quantization data type of activation
    :param weight_type: quantization data type of weight
    :param nodes_to_quantize:
        List of node names to quantize. When this list is not None, only the nodes in it
        are quantized.
        example:
            [
                'Conv__224',
                'Conv__252'
            ]
    :param nodes_to_exclude:
        List of node names to exclude. When this list is not None, the nodes in it will be
        excluded from quantization.
    :param use_external_data_format: option used for large (>2GB) models. Set to False by default.
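
    Usage sketch (illustrative only; the model paths, input name, and shapes below
    are assumptions for the example, not part of this function's contract):

        import numpy as np

        class RandomDataReader(CalibrationDataReader):
            # feeds a handful of random batches as calibration data
            def __init__(self):
                self.count = 0

            def get_next(self):
                if self.count >= 8:
                    return None
                self.count += 1
                return {"input": np.random.rand(1, 3, 224, 224).astype(np.float32)}

        quantize_static("model.onnx", "model-quant.onnx", RandomDataReader())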
    '''
    if activation_type != QuantType.QUInt8:
        raise ValueError("Static quantization only supports uint8 for activations for now.")

    input_qType = onnx_proto.TensorProto.INT8 if activation_type == QuantType.QInt8 else onnx_proto.TensorProto.UINT8
    weight_qType = onnx_proto.TensorProto.INT8 if weight_type == QuantType.QInt8 else onnx_proto.TensorProto.UINT8
    mode = QuantizationMode.QLinearOps

    if not op_types_to_quantize or len(op_types_to_quantize) == 0:
        op_types_to_quantize = list(QLinearOpsRegistry.keys())

    quantization_params_dict = calibrate(model_input, calibration_data_reader, op_types_to_quantize,
                                         nodes_to_quantize, nodes_to_exclude)

    quantizer = ONNXQuantizer(
        onnx.load(model_input),
        per_channel,
        reduce_range,
        mode,
        True,  # static
        weight_qType,
        input_qType,
        quantization_params_dict,
        nodes_to_quantize,
        nodes_to_exclude,
        op_types_to_quantize)

    quantizer.quantize_model()
    quantizer.model.save_model_to_file(model_output, use_external_data_format)


def quantize_dynamic(model_input: Path,
                     model_output: Path,
                     op_types_to_quantize=[],
                     per_channel=False,
                     reduce_range=False,
                     activation_type=QuantType.QUInt8,
                     weight_type=QuantType.QUInt8,
                     nodes_to_quantize=[],
                     nodes_to_exclude=[],
                     use_external_data_format=False):
    '''
    Given an onnx model, create a quantized onnx model and save it into a file
    :param model_input: file path of the model to quantize
    :param model_output: file path of the quantized model
    :param op_types_to_quantize:
        specify the types of operators to quantize, like ['Conv'] to quantize Conv only.
        It quantizes all supported operators by default.
    :param per_channel: quantize weights per channel
    :param reduce_range:
        quantize weights with 7 bits. It may improve the accuracy for some models running on non-VNNI machines,
        especially for per-channel mode
    :param activation_type: quantization data type of activation
    :param weight_type: quantization data type of weight
    :param nodes_to_quantize:
        List of node names to quantize. When this list is not None, only the nodes in it
        are quantized.
        example:
            [
                'Conv__224',
                'Conv__252'
            ]
    :param nodes_to_exclude:
        List of node names to exclude. When this list is not None, the nodes in it will be
        excluded from quantization.
    :param use_external_data_format: option used for large (>2GB) models. Set to False by default.
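
    Usage sketch (illustrative only; the paths below are assumptions for the example):

        quantize_dynamic(Path("model.onnx"), Path("model-quant.onnx"),
                         weight_type=QuantType.QUInt8)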
    '''
    input_qType = onnx_proto.TensorProto.INT8 if activation_type == QuantType.QInt8 else onnx_proto.TensorProto.UINT8
    weight_qType = onnx_proto.TensorProto.INT8 if weight_type == QuantType.QInt8 else onnx_proto.TensorProto.UINT8
    mode = QuantizationMode.IntegerOps

    if not op_types_to_quantize or len(op_types_to_quantize) == 0:
        op_types_to_quantize = list(IntegerOpsRegistry.keys())

    quantizer = ONNXQuantizer(
        onnx.load(model_input),
        per_channel,
        reduce_range,
        mode,
        False,  # static
        weight_qType,
        input_qType,
        None,
        nodes_to_quantize,
        nodes_to_exclude,
        op_types_to_quantize)

    quantizer.quantize_model()
    quantizer.model.save_model_to_file(model_output, use_external_data_format)


def quantize_qat(model_input: Path,
                 model_output: Path,
                 op_types_to_quantize=[],
                 per_channel=False,
                 reduce_range=False,
                 activation_type=QuantType.QUInt8,
                 weight_type=QuantType.QUInt8,
                 nodes_to_quantize=[],
                 nodes_to_exclude=[],
                 use_external_data_format=False):
    '''
    Given a quantization-aware training onnx model, create a quantized onnx model and save it into a file
    :param model_input: file path of the model to quantize
    :param model_output: file path of the quantized model
    :param op_types_to_quantize:
        specify the types of operators to quantize, like ['Conv'] to quantize Conv only.
        It quantizes all supported operators by default.
    :param per_channel: quantize weights per channel
    :param reduce_range:
        quantize weights with 7 bits. It may improve the accuracy for some models running on non-VNNI machines,
        especially for per-channel mode
    :param activation_type: quantization data type of activation
    :param weight_type: quantization data type of weight
    :param nodes_to_quantize:
        List of node names to quantize. When this list is not None, only the nodes in it
        are quantized.
        example:
            [
                'Conv__224',
                'Conv__252'
            ]
    :param nodes_to_exclude:
        List of node names to exclude. When this list is not None, the nodes in it will be
        excluded from quantization.
    :param use_external_data_format: option used for large (>2GB) models. Set to False by default.
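
    Usage sketch (illustrative only; the paths below are assumptions for the example,
    and the input model is expected to come from quantization-aware training):

        quantize_qat(Path("qat_model.onnx"), Path("qat_model-quant.onnx"))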
    '''
    input_qType = onnx_proto.TensorProto.INT8 if activation_type == QuantType.QInt8 else onnx_proto.TensorProto.UINT8
    weight_qType = onnx_proto.TensorProto.INT8 if weight_type == QuantType.QInt8 else onnx_proto.TensorProto.UINT8
    mode = QuantizationMode.IntegerOps

    # optimize the original model with basic graph optimizations first
    optimized_model = optimize_model(Path(model_input))

    if not op_types_to_quantize or len(op_types_to_quantize) == 0:
        op_types_to_quantize = list(IntegerOpsRegistry.keys())

    quantizer = ONNXQuantizer(
        optimized_model,
        per_channel,
        reduce_range,
        mode,
        False,  # static
        weight_qType,
        input_qType,
        None,
        nodes_to_quantize,
        nodes_to_exclude,
        op_types_to_quantize)

    quantizer.quantize_model()
    quantizer.model.save_model_to_file(model_output, use_external_data_format)