diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py index 0762506e62..dc95cbabbb 100644 --- a/onnxruntime/python/tools/quantization/calibrate.py +++ b/onnxruntime/python/tools/quantization/calibrate.py @@ -21,21 +21,24 @@ import re import subprocess import json - -def augment_graph(model): +def augment_graph(model, quantization_candidates=['Conv', 'MatMul'], black_nodes=[], white_nodes=[]): ''' - Adds ReduceMin and ReduceMax nodes to all Conv and MatMul nodes in + Adds ReduceMin and ReduceMax nodes to all quantization_candidates op type nodes in model and ensures their outputs are stored as part of the graph output parameter model: loaded FP32 ONNX model to quantize + parameter quantization_candidates: node op types for nodes to be quantized. + Calibration will be done for them. + parameter black_nodes: nodes with these names will be forcibly ignored by this + calibration augmentation, no matter what their op type is. + parameter white_nodes: nodes with these names will be forced to be calibration augmented. return: augmented ONNX model ''' - # Candidate nodes for quantization. 
Calibration will be done for these nodes only - # When more nodes are extended to support quantization, add them to this list - quantization_candidates = ['Conv', 'MatMul'] + added_nodes = [] added_outputs = [] for node in model.graph.node: - if node.op_type in quantization_candidates: + should_be_calibrate = ((node.op_type in quantization_candidates) and (node.name not in black_nodes)) or (node.name in white_nodes) + if should_be_calibrate: input_name = node.output[0] # Adding ReduceMin nodes reduce_min_name = '' @@ -217,6 +220,15 @@ def main(): parser = argparse.ArgumentParser(description='parsing model and test data set paths') parser.add_argument('--model_path', required=True) parser.add_argument('--dataset_path', required=True) + parser.add_argument('--force_fusions', default=False, action='store_true') + parser.add_argument('--op_types', type=str, default='Conv,MatMul', + help='comma delimited operator types to be calibrated and quantized') + parser.add_argument('--black_nodes', type=str, default='', + help='comma delimited operator names that should not be quantized') + parser.add_argument('--white_nodes', type=str, default='', + help='comma delimited operator names force to be quantized') + parser.add_argument('--augmented_model_path', type=str, default = 'augmented_model.onnx', + help='save augmented model to this file for verification purpose') parser.add_argument('--output_model_path', type=str, default='calibrated_quantized_model.onnx') parser.add_argument('--dataset_size', type=int, @@ -228,6 +240,9 @@ def main(): choices=['preprocess_method1', 'preprocess_method2', 'None'], help="Refer to Readme.md for guidance on choosing this option.") args = parser.parse_args() + calibrate_op_types = args.op_types.split(',') + black_nodes = args.black_nodes.split(',') + white_nodes = args.white_nodes.split(',') model_path = args.model_path output_model_path = args.output_model_path images_folder = args.dataset_path @@ -235,25 +250,25 @@ def main(): size_limit = 
args.dataset_size # Generating augmented ONNX model - augmented_model_path = 'augmented_model.onnx' model = onnx.load(model_path) - augmented_model = augment_graph(model) - onnx.save(augmented_model, augmented_model_path) + augmented_model = augment_graph(model, calibrate_op_types, black_nodes, white_nodes) + onnx.save(augmented_model, args.augmented_model_path) # Conducting inference - session = onnxruntime.InferenceSession(augmented_model_path, None) + session = onnxruntime.InferenceSession(args.augmented_model_path, None) (samples, channels, height, width) = session.get_inputs()[0].shape # Generating inputs for quantization if args.data_preprocess == "None": inputs = load_pb_file(images_folder, args.dataset_size, samples, channels, height, width) else: - inputs = load_batch(images_folder, height, width, size_limit, args.data_preprocess) + inputs = load_batch(images_folder, height, width, args.data_preprocess, size_limit) print(inputs.shape) dict_for_quantization = get_intermediate_outputs(model_path, session, inputs, calib_mode) quantization_params_dict = calculate_quantization_params(model, quantization_thresholds=dict_for_quantization) calibrated_quantized_model = quantize(onnx.load(model_path), quantization_mode=QuantizationMode.QLinearOps, + force_fusions=args.force_fusions, quantization_params=quantization_params_dict) onnx.save(calibrated_quantized_model, output_model_path) diff --git a/onnxruntime/python/tools/quantization/data_preprocess.py b/onnxruntime/python/tools/quantization/data_preprocess.py index 1e73010360..8b1551e8a0 100644 --- a/onnxruntime/python/tools/quantization/data_preprocess.py +++ b/onnxruntime/python/tools/quantization/data_preprocess.py @@ -30,7 +30,8 @@ def preprocess_method1(image_filepath, height, width): parameter width: image width in pixels return: matrix characterizing image ''' - pillow_img = Image.open(image_filepath).resize((width, height)) + pillow_img = Image.new("RGB", (width, height)) + 
pillow_img.paste(Image.open(image_filepath).resize((width, height))) input_data = np.float32(pillow_img) / 127.5 - 1.0 # normalization input_data -= np.mean(input_data) # normalization nhwc_data = np.expand_dims(input_data, axis=0) @@ -47,7 +48,8 @@ def preprocess_method2(image_filepath, height, width): parameter width: image width in pixels return: matrix characterizing image ''' - pillow_img = Image.open(image_filepath).resize((width, height)) + pillow_img = Image.new("RGB", (width, height)) + pillow_img.paste(Image.open(image_filepath).resize((width, height))) input_data = np.float32(pillow_img) - \ np.array([123.68, 116.78, 103.94], dtype=np.float32) nhwc_data = np.expand_dims(input_data, axis=0) diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py index 5983de3ac7..83f1ae711b 100644 --- a/onnxruntime/python/tools/quantization/quantize.py +++ b/onnxruntime/python/tools/quantization/quantize.py @@ -15,6 +15,7 @@ from onnx import shape_inference __producer__ = "onnx.quantize" __version__ = "0.1.0" onnx_domain = "ai.onnx" +ms_domain = "com.microsoft" onnx_op_set_version = 11 type_to_name = { @@ -314,6 +315,8 @@ class ONNXQuantizer: new_list += self._quantize_matmul(node, new_list) elif node.op_type == 'Gather' and self._is_valid_quantize_value(node.input[0]): new_list += self._quantize_gather_ops(node, new_list) + elif node.op_type == 'Add' or node.op_type == 'Mul': + new_list += self._quantize_binary_math_ops(node, new_list) elif node.op_type == 'Relu' or node.op_type == 'Clip': new_list += self._handle_activation_ops(node, new_list) else: @@ -875,7 +878,6 @@ class ONNXQuantizer: List of scale names used for input quantization, List of new QuantizeLinear nodes created) ''' - assert (node.op_type == "Conv" or node.op_type == "MatMul" or node.op_type == "Gather") quantized_input_names = [] zero_point_names = [] @@ -1018,6 +1020,61 @@ class ONNXQuantizer: return [] + def _quantize_binary_math_ops(self, 
node, new_nodes_list): + ''' + Used when self.mode is QuantizationMode.QLinearOps. + Quantize the given binary math op, like Add, Mul, etc, to QLinearAdd, QLinearMul... In any other mode, or when no quantization parameters are found for node.output[0], the node is handed to self._handle_other_ops instead. + + parameter node: Current binary math node + parameter new_nodes_list: List of new nodes created before processing current node + return: List of nodes in topological order that represents quantized binary math node + ''' + if self.mode is not QuantizationMode.QLinearOps: + return self._handle_other_ops(node, new_nodes_list) + + data_found, output_scale_name, output_zp_name, _, _ = \ + self._get_quantization_params(node.output[0]) + if (not data_found): # only try to quantize when given quantization parameters for it + return self._handle_other_ops(node, new_nodes_list) + + (quantized_input_names, zero_point_names, scale_names, nodes) = \ + self._quantize_inputs(node, [0, 1], new_nodes_list) + + qlinear_binary_math_output = node.output[0] + "_quantized" + qlinear_binary_math_name = "" + if node.name != "": + qlinear_binary_math_name = node.name + "_quant" + kwargs = {} + for attribute in node.attribute: + kwargs.update(_attribute_to_kwarg(attribute)) + kwargs["domain"]=ms_domain # ms_domain is "com.microsoft"; the QLinear<op> node is created in that domain + + qlinear_binary_math_inputs = [] + # Input 0: quantized tensor name, then its scale and zero point + qlinear_binary_math_inputs.append(quantized_input_names[0]) + qlinear_binary_math_inputs.append(scale_names[0]) + qlinear_binary_math_inputs.append(zero_point_names[0]) + # Input 1: quantized tensor name, then its scale and zero point + qlinear_binary_math_inputs.append(quantized_input_names[1]) + qlinear_binary_math_inputs.append(scale_names[1]) + qlinear_binary_math_inputs.append(zero_point_names[1]) + + # Output scale and zero point (looked up for node.output[0] above) + qlinear_binary_math_inputs.append(output_scale_name) + qlinear_binary_math_inputs.append(output_zp_name) + + qlinear_binary_math_node = onnx.helper.make_node( + "QLinear" + node.op_type, qlinear_binary_math_inputs, + [qlinear_binary_math_output], qlinear_binary_math_name, **kwargs) + nodes.append(qlinear_binary_math_node) + + # Create an entry for this quantized value + q_output = QuantizedValue(node.output[0], 
qlinear_binary_math_output, output_scale_name, + output_zp_name, QuantizedValueType.Input) + self.quantized_value_map[node.output[0]] = q_output + + return nodes + def _quantize_gather_ops(self, node, new_nodes_list): assert (node.op_type == "Gather") (quantized_input_names, zero_point_names, scale_names, nodes) = \