diff --git a/onnxruntime/python/tools/transformers/convert_to_onnx.py b/onnxruntime/python/tools/transformers/convert_to_onnx.py index 89be710762..5df6c53723 100644 --- a/onnxruntime/python/tools/transformers/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/convert_to_onnx.py @@ -34,7 +34,7 @@ from benchmark_helper import create_onnxruntime_session, setup_logger, prepare_e logger = logging.getLogger('') -def parse_arguments(): +def parse_arguments(argv=None): parser = argparse.ArgumentParser() parser.add_argument('-m', @@ -94,6 +94,13 @@ def parse_arguments(): choices=list(Precision), help="Precision of model to run. fp32 for full precision, fp16 for half precision, and int8 for quantization") + parser.add_argument("-t", + "--test_cases", + required=False, + type=int, + default=1000, + help="Number of test cases for parity") + parser.add_argument('--verbose', required=False, action='store_true') parser.set_defaults(verbose=False) @@ -135,20 +142,48 @@ def parse_arguments(): help='Nuclear/top-p sampling accumulation probability.') sampling_option_group.add_argument('--do_sample_top_k', type=int, default=0, help='Use top-k if non-zero.') - args = parser.parse_args() + fp16_option_group = parser.add_argument_group( + "float to float16 conversion parameters that works when \"--precision fp16\" is specified") + fp16_option_group.add_argument('--keep_io_types', + required=False, + action='store_true', + help='Use float32 for past inputs, present and logits outputs.') + fp16_option_group.set_defaults(keep_io_types=False) + fp16_option_group.add_argument('--io_block_list', + nargs='+', + default=[], + help='List of inputs or outputs in float32 instead of float16') + fp16_option_group.add_argument( + '--op_block_list', + nargs='+', + default=[], + help= + 'List of operators (like Attention Gather Add LayerNormalization FastGelu MatMul) to compute in float32 instead of float16.' 
+ ) + fp16_option_group.add_argument('--node_block_list', + nargs='+', + default=[], + help='List of node names to compute in float32 instead of float16.') + + args = parser.parse_args(argv) return args -def main(): +def main(argv=None, experiment_name="", run_id=0, csv_filename="gpt2_parity_results.csv"): + result = {} from transformers import __version__ as transformers_version if version.parse(transformers_version) < version.parse( "3.1.0"): # past_key_values name does not exist in 3.0.2 or older raise RuntimeError("This tool requires transformers 3.1.0 or later.") - args = parse_arguments() + args = parse_arguments(argv) setup_logger(args.verbose) + if not experiment_name: + import sys + experiment_name = " ".join(argv if argv else sys.argv[1:]) + if args.tolerance == 0: args.tolerance = DEFAULT_TOLERANCE[args.precision] @@ -219,6 +254,7 @@ def main(): logger.info(f"Exporting ONNX model to {raw_onnx_model}") use_padding = MODEL_CLASSES[args.model_class][2] + gpt2helper.export_onnx(model, device, raw_onnx_model, @@ -227,13 +263,23 @@ def main(): has_position_ids=use_padding, has_attention_mask=use_padding) + fp16_params = {"keep_io_types": args.keep_io_types} + if args.io_block_list: + fp16_params["keep_io_types"] = args.io_block_list + if args.node_block_list: + fp16_params["node_block_list"] = args.node_block_list + if args.op_block_list: + fp16_params["op_block_list"] = args.op_block_list + + is_io_float16 = (args.precision == Precision.FLOAT16 and not args.keep_io_types) + if args.optimize_onnx or args.precision != Precision.FLOAT32: output_path = onnx_model_paths[str(args.precision) if args.precision != Precision.INT8 else 'fp32'] logger.info(f"Optimizing model to {output_path}") gpt2helper.optimize_onnx(raw_onnx_model, output_path, args.precision == Precision.FLOAT16, model.config.num_attention_heads, model.config.hidden_size, - args.use_external_data_format) + args.use_external_data_format, **fp16_params) else: output_path = raw_onnx_model @@ -252,16 
+298,80 @@ def main(): logger.info(f"Output path: {output_path}") session = create_onnxruntime_session(output_path, args.use_gpu, enable_all_optimization=True, verbose=args.verbose) - if session is not None: - gpt2helper.test_parity(session, - model, - device, - args.precision == Precision.FLOAT16, - rtol=args.tolerance, - atol=args.tolerance, - model_class=args.model_class, - has_position_ids=use_padding, - has_attention_mask=use_padding) + if args.model_class == "GPT2LMHeadModel" and session is not None: + parity_result = gpt2helper.test_parity(session, + model, + device, + is_io_float16, + rtol=args.tolerance, + atol=args.tolerance, + model_class=args.model_class, + has_position_ids=use_padding, + has_attention_mask=use_padding, + total_test_cases=args.test_cases, + verbose=args.verbose) + + latency = gpt2helper.test_performance(session, + model, + device, + is_io_float16, + total_runs=100, + use_io_binding=True, + model_class=args.model_class, + has_position_ids=use_padding, + has_attention_mask=use_padding, + batch_size=8, + sequence_length=1, + past_sequence_length=32) + + if args.precision == Precision.FLOAT16: + logger.info(f"fp16 conversion parameters:{fp16_params}") + + # Write results to file + import csv + from onnxruntime import __version__ as ort_version + latency_name = "average_latency(batch_size=8,sequence_length=1,past_sequence_length=32)" + csv_file_existed = os.path.exists(csv_filename) + with open(csv_filename, mode="a", newline='') as csv_file: + column_names = [ + "experiment", "run_id", "model_name", "model_class", "gpu", "precision", "optimizer", "test_cases", + "keep_io_types", "io_block_list", "op_block_list", "node_block_list", "ORT_TRANSFORMER_OPTIONS", + "ORT_CUDA_GEMM_OPTIONS", "onnxruntime", latency_name, "diff_50_percentile", "diff_90_percentile", + "diff_95_percentile", "diff_99_percentile", "diff_pass_rate", "nan_rate", "top1_match_rate", + "onnx_size_in_MB" + ] + csv_writer = csv.DictWriter(csv_file, fieldnames=column_names) + if 
not csv_file_existed: + csv_writer.writeheader() + row = { + "experiment": experiment_name, + "run_id": run_id, + "model_name": args.model_name_or_path, + "model_class": args.model_class, + "gpu": args.use_gpu, + "precision": args.precision, + "optimizer": args.optimize_onnx, + "test_cases": args.test_cases, + "keep_io_types": args.keep_io_types, + "io_block_list": args.io_block_list, + "op_block_list": args.op_block_list, + "node_block_list": args.node_block_list, + "ORT_TRANSFORMER_OPTIONS": os.getenv('ORT_TRANSFORMER_OPTIONS'), + "ORT_CUDA_GEMM_OPTIONS": os.getenv('ORT_CUDA_GEMM_OPTIONS'), + "onnxruntime": ort_version, + latency_name: f"{latency:.2f}", + "diff_50_percentile": parity_result["max_diff_percentile_50"], + "diff_90_percentile": parity_result["max_diff_percentile_90"], + "diff_95_percentile": parity_result["max_diff_percentile_95"], + "diff_99_percentile": parity_result["max_diff_percentile_99"], + "diff_pass_rate": parity_result["diff_pass_rate"], + "nan_rate": parity_result["nan_rate"], + "top1_match_rate": parity_result["top1_match_rate"], + "onnx_size_in_MB": "{}".format(int(os.path.getsize(output_path) / 1024 / 1024)) + } + logger.info(f"result: {row}") + result.update(row) + csv_writer.writerow(row) if args.input_test_file: test_inputs = [] @@ -275,14 +385,12 @@ def main(): if use_padding: if "attention_mask" in data: - numpy_float = numpy.float16 if args.precision == Precision.FLOAT16 else numpy.float32 + numpy_float = numpy.float16 if is_io_float16 else numpy.float32 attention_mask = torch.from_numpy(numpy.asarray(data["attention_mask"], dtype=numpy_float)).to(device) else: padding = -1 - attention_mask = ( - input_ids != - padding).type(torch.float16 if args.precision == Precision.FLOAT16 else torch.float32) + attention_mask = (input_ids != padding).type(torch.float16 if is_io_float16 else torch.float32) input_ids.masked_fill_(input_ids == padding, 0) if "position_ids" in data: @@ -324,6 +432,7 @@ def main(): 
save_test_data_dir=Path(output_path).parent) logger.info(f"Done. Output model: {output_path}") + return result if __name__ == '__main__': diff --git a/onnxruntime/python/tools/transformers/float16.py b/onnxruntime/python/tools/transformers/float16.py new file mode 100644 index 0000000000..b459c74b04 --- /dev/null +++ b/onnxruntime/python/tools/transformers/float16.py @@ -0,0 +1,344 @@ +#------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +#-------------------------------------------------------------------------- + +# This file is modified from https://github.com/microsoft/onnxconverter-common/blob/master/onnxconverter_common/float16.py + +import itertools +import numpy as np +import onnx +from onnx import helper, numpy_helper +from onnx import onnx_pb as onnx_proto + + +def _npfloat16_to_int(np_list): + ''' + Convert numpy float16 to python int. + + :param np_list: numpy float16 list + :return int_list: python int list + ''' + return [int(bin(_.view('H'))[2:].zfill(16), 2) for _ in np_list] + + +def convert_np_to_float16(np_array, min_positive_val=1e-7, max_finite_val=1e4): + ''' + Convert float32 numpy array to float16 without changing sign or finiteness. + Positive values less than min_positive_val are mapped to min_positive_val. + Positive finite values greater than max_finite_val are mapped to max_finite_val. + Similar for negative values. NaN, 0, inf, and -inf are unchanged. 
+ ''' + def between(a, b, c): + return np.logical_and(a < b, b < c) + + np_array = np.where(between(0, np_array, min_positive_val), min_positive_val, np_array) + np_array = np.where(between(-min_positive_val, np_array, 0), -min_positive_val, np_array) + np_array = np.where(between(max_finite_val, np_array, float('inf')), max_finite_val, np_array) + np_array = np.where(between(float('-inf'), np_array, -max_finite_val), -max_finite_val, np_array) + return np.float16(np_array) + + +def convert_tensor_float_to_float16(tensor, min_positive_val=1e-7, max_finite_val=1e4): + ''' + Convert tensor float to float16. + + :param tensor: TensorProto object + :return tensor_float16: converted TensorProto object + + Example: + + :: + + from onnxmltools.utils.float16_converter import convert_tensor_float_to_float16 + new_tensor = convert_tensor_float_to_float16(tensor) + + ''' + if not isinstance(tensor, onnx_proto.TensorProto): + raise ValueError('Expected input type is an ONNX TensorProto but got %s' % type(tensor)) + + if tensor.data_type == onnx_proto.TensorProto.FLOAT: + tensor.data_type = onnx_proto.TensorProto.FLOAT16 + # convert float_data (float type) to float16 and write to int32_data + if tensor.float_data: + float16_data = convert_np_to_float16(np.array(tensor.float_data), min_positive_val, max_finite_val) + int_list = _npfloat16_to_int(float16_data) + tensor.int32_data[:] = int_list + tensor.float_data[:] = [] + # convert raw_data (bytes type) + if tensor.raw_data: + # convert n.raw_data to float + float32_list = np.fromstring(tensor.raw_data, dtype='float32') + # convert float to float16 + float16_list = convert_np_to_float16(float32_list, min_positive_val, max_finite_val) + # convert float16 to bytes and write back to raw_data + tensor.raw_data = float16_list.tostring() + return tensor + + +def make_value_info_from_tensor(tensor): + shape = numpy_helper.to_array(tensor).shape + return helper.make_tensor_value_info(tensor.name, tensor.data_type, shape) + + 
+DEFAULT_OP_BLOCK_LIST = [ + 'ArrayFeatureExtractor', 'Binarizer', 'CastMap', 'CategoryMapper', 'DictVectorizer', 'FeatureVectorizer', 'Imputer', + 'LabelEncoder', 'LinearClassifier', 'LinearRegressor', 'Normalizer', 'OneHotEncoder', 'SVMClassifier', + 'SVMRegressor', 'Scaler', 'TreeEnsembleClassifier', 'TreeEnsembleRegressor', 'ZipMap', 'NonMaxSuppression', 'TopK', + 'RoiAlign', 'Resize', 'Range', 'CumSum', 'Min', 'Max', 'Upsample' +] + + +def convert_float_to_float16(model, + min_positive_val=1e-7, + max_finite_val=1e4, + keep_io_types=False, + disable_shape_infer=False, + op_block_list=None, + node_block_list=None): + ''' + Convert tensor float type in the ONNX ModelProto input to tensor float16. + + :param model: ONNX ModelProto object + :param disable_shape_infer: Type/shape information is needed for conversion to work. + Set to True only if the model already has type/shape information for all tensors. + :return: converted ONNX ModelProto object + + Examples: + + :: + + Example 1: Convert ONNX ModelProto object: + from onnxmltools.utils.float16_converter import convert_float_to_float16 + new_onnx_model = convert_float_to_float16(onnx_model) + + Example 2: Convert ONNX model binary file: + from onnxmltools.utils.float16_converter import convert_float_to_float16 + from onnxmltools.utils import load_model, save_model + onnx_model = load_model('model.onnx') + new_onnx_model = convert_float_to_float16(onnx_model) + save_model(new_onnx_model, 'new_model.onnx') + + ''' + func_infer_shape = None + if not disable_shape_infer and onnx.__version__ >= '1.2': + try: + from onnx.shape_inference import infer_shapes + func_infer_shape = infer_shapes + finally: + pass + + if not isinstance(model, onnx_proto.ModelProto): + raise ValueError('Expected model type is an ONNX ModelProto but got %s' % type(model)) + + # create blocklists + if op_block_list is None: + op_block_list = DEFAULT_OP_BLOCK_LIST + if node_block_list is None: + node_block_list = [] + op_block_list = 
set(op_block_list) + node_block_list = set(node_block_list) + + # create a queue for BFS + queue = [] + value_info_list = [] + node_list = [] + # type inference on input model + if func_infer_shape is not None: + model = func_infer_shape(model) + queue.append(model) + name_mapping = {} + graph_io_to_skip = set() + io_casts = set() + + fp32_inputs = [n.name for n in model.graph.input if n.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT] + fp32_outputs = [n.name for n in model.graph.output if n.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT] + if isinstance(keep_io_types, list): + fp32_inputs = [n for n in fp32_inputs if n in keep_io_types] + fp32_outputs = [n for n in fp32_outputs if n in keep_io_types] + print("keep_io_types", keep_io_types, "fp32_inputs", fp32_inputs, "fp32_outputs", fp32_outputs) + elif not keep_io_types: + fp32_inputs = [] + fp32_outputs = [] + + for i, n in enumerate(model.graph.input): + if n.name in fp32_inputs: + output_name = 'graph_input_cast_' + str(i) + name_mapping[n.name] = output_name + graph_io_to_skip.add(n.name) + + node_name = 'graph_input_cast' + str(i) + new_value_info = model.graph.value_info.add() + new_value_info.CopyFrom(n) + new_value_info.name = output_name + new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16 + # add Cast node (from tensor(float) to tensor(float16) after graph input + new_node = [helper.make_node('Cast', [n.name], [output_name], to=10, name=node_name)] + model.graph.node.extend(new_node) + value_info_list.append(new_value_info) + io_casts.add(node_name) + + for i, n in enumerate(model.graph.output): + if n.name in fp32_outputs: + input_name = 'graph_output_cast_' + str(i) + name_mapping[n.name] = input_name + graph_io_to_skip.add(n.name) + + node_name = 'graph_output_cast' + str(i) + # add Cast node (from tensor(float16) to tensor(float) before graph output + new_value_info = model.graph.value_info.add() + new_value_info.CopyFrom(n) + new_value_info.name = 
input_name + new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16 + new_node = [helper.make_node('Cast', [input_name], [n.name], to=1, name=node_name)] + model.graph.node.extend(new_node) + value_info_list.append(new_value_info) + io_casts.add(node_name) + + fp32_initializer_counters = {} + while queue: + next_level = [] + for q in queue: + # if q is model, push q.graph (GraphProto) + if isinstance(q, onnx_proto.ModelProto): + next_level.append(q.graph) + # if q is model.graph, push q.node.attribute (AttributeProto) + if isinstance(q, onnx_proto.GraphProto): + for n in q.initializer: # TensorProto type + if n.data_type == onnx_proto.TensorProto.FLOAT: + fp32_initializer_counters[n.name] = [0, + 0] # two counters: used by fp16 nodes, used by fp32 nodes + + for n in q.node: + # if n is in the block list (doesn't support float16), no conversion for the node, + # and save the node for further processing + if n.name in io_casts: + continue + for i in range(len(n.input)): + if n.input[i] in name_mapping: + n.input[i] = name_mapping[n.input[i]] + for i in range(len(n.output)): + if n.output[i] in name_mapping: + n.output[i] = name_mapping[n.output[i]] + + is_node_blocked = n.op_type in op_block_list or n.name in node_block_list + for input in n.input: + if input in fp32_initializer_counters: + fp32_initializer_counters[input][int(is_node_blocked)] += 1 + + if is_node_blocked: + node_list.append(n) + else: + if n.op_type == 'Cast': + for attr in n.attribute: + if attr.name == 'to' and attr.i == 1: + attr.i = 10 + break + for attr in n.attribute: + next_level.append(attr) + # if q is model.graph.node.attribute, push q.g and q.graphs (GraphProto) + # and process node.attribute.t and node.attribute.tensors (TensorProto) + if isinstance(q, onnx_proto.AttributeProto): + next_level.append(q.g) + for n in q.graphs: + next_level.append(n) + q.t.CopyFrom(convert_tensor_float_to_float16(q.t, min_positive_val, max_finite_val)) + for n in q.tensors: + n = 
convert_tensor_float_to_float16(n, min_positive_val, max_finite_val) + # if q is graph, process graph.initializer(TensorProto), input, output and value_info (ValueInfoProto) + if isinstance(q, onnx_proto.GraphProto): + for n in q.initializer: # TensorProto type + if n.data_type == onnx_proto.TensorProto.FLOAT: + # TODO: handle initializer that used by subgraph + if fp32_initializer_counters[n.name][1] == 0: # not used by fp32 node + n = convert_tensor_float_to_float16(n, min_positive_val, max_finite_val) + value_info_list.append(make_value_info_from_tensor(n)) + else: + # TODO: add a cast node to handle the case that an intiailizer is used by both fp32 and fp16 nodes + assert fp32_initializer_counters[n.name][0] == 0 + # for all ValueInfoProto with tensor(float) type in input, output and value_info, convert them to + # tensor(float16) except map and seq(map). And save them in value_info_list for further processing + for n in itertools.chain(q.input, q.output, q.value_info): + if n.type.tensor_type.elem_type == onnx_proto.TensorProto.FLOAT: + if n.name not in graph_io_to_skip: + n.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT16 + value_info_list.append(n) + queue = next_level + + # process the nodes in block list that doesn't support tensor(float16) + for node in node_list: + # if input's name is in the value_info_list meaning input is tensor(float16) type, + # insert a float16 to float Cast node before the node, + # change current node's input name and create new value_info for the new name + for i in range(len(node.input)): + input = node.input[i] + for value_info in value_info_list: + if input == value_info.name: + # create new value_info for current node's new input name + new_value_info = model.graph.value_info.add() + new_value_info.CopyFrom(value_info) + output_name = node.name + '_input_cast_' + str(i) + new_value_info.name = output_name + new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT + # add Cast node (from 
tensor(float16) to tensor(float) before current node + node_name = node.name + '_input_cast' + str(i) + new_node = [helper.make_node('Cast', [input], [output_name], to=1, name=node_name)] + model.graph.node.extend(new_node) + # change current node's input name + node.input[i] = output_name + break + # if output's name is in the value_info_list meaning output is tensor(float16) type, insert a float to + # float16 Cast node after the node, change current node's output name and create new value_info for the new name + for i in range(len(node.output)): + output = node.output[i] + for value_info in value_info_list: + if output == value_info.name: + # create new value_info for current node's new output + new_value_info = model.graph.value_info.add() + new_value_info.CopyFrom(value_info) + input_name = node.name + '_output_cast_' + str(i) + new_value_info.name = input_name + new_value_info.type.tensor_type.elem_type = onnx_proto.TensorProto.FLOAT + # add Cast node (from tensor(float) to tensor(float16) after current node + node_name = node.name + '_output_cast' + str(i) + new_node = [helper.make_node('Cast', [input_name], [output], to=10, name=node_name)] + model.graph.node.extend(new_node) + # change current node's input name + node.output[i] = input_name + break + return model + + +def convert_float_to_float16_model_path(model_path, min_positive_val=1e-7, max_finite_val=1e4, keep_io_types=False): + ''' + Convert tensor float type in the ONNX Model to tensor float16. + *It is to fix an issue that infer_shapes func cannot be used to infer >2GB models. + *But this function can be applied to all model sizes. 
+ :param model_path: ONNX Model path + :return: converted ONNX ModelProto object + Examples + :: + #Convert to ONNX ModelProto object and save model binary file: + from onnxmltools.utils.float16_converter import convert_float_to_float16_model_path + new_onnx_model = convert_float_to_float16_model_path('model.onnx') + onnx.save(new_onnx_model, 'new_model.onnx') + ''' + + disable_shape_infer = False + if onnx.__version__ >= '1.8': + try: + # infer_shapes_path can be applied to all model sizes + from onnx.shape_inference import infer_shapes_path + import tempfile + import os + # shape_infer_model_path should be in the same folder of model_path + with tempfile.NamedTemporaryFile(dir=os.path.dirname(model_path)) as tmpfile: + shape_infer_model_path = tmpfile.name + infer_shapes_path(model_path, shape_infer_model_path) + model = onnx.load(shape_infer_model_path) + disable_shape_infer = True + finally: + pass + if not disable_shape_infer: + model = onnx.load(model_path) + return convert_float_to_float16(model, min_positive_val, max_finite_val, keep_io_types, disable_shape_infer) diff --git a/onnxruntime/python/tools/transformers/gpt2_helper.py b/onnxruntime/python/tools/transformers/gpt2_helper.py index 73f37cc415..016543eaf1 100644 --- a/onnxruntime/python/tools/transformers/gpt2_helper.py +++ b/onnxruntime/python/tools/transformers/gpt2_helper.py @@ -12,6 +12,7 @@ import random import numpy import time import re +import pickle from pathlib import Path from typing import List, Dict, Tuple, Union from transformers import GPT2Model, GPT2LMHeadModel, GPT2Config, TFGPT2Model @@ -99,7 +100,9 @@ class MyGPT2LMHeadModel_NoPadding(GPT2LMHeadModel): super().__init__(config) def forward(self, input_ids, *past): - return super().forward(input_ids, past_key_values=past) + result = super().forward(input_ids, past_key_values=past, return_dict=False) + + return MyGPT2Model.post_process(result, self.config.n_layer) # Maps model class name to a tuple of model class, name of first output 
@staticmethod
def compare_outputs_v2(torch_outputs, ort_outputs, atol=1e-06):
    """Compare outputs from PyTorch and OnnxRuntime

    Args:
        torch_outputs (Tuple[Torch.Tensor]): PyTorch model output; element 0 is compared with the first
            ORT output and element 1 holds the tensors compared with the remaining ORT outputs
        ort_outputs (List[numpy.ndarray]): OnnxRuntime output
        atol (float, optional): Absolute tolerance. Defaults to 1e-06.

    Returns:
        is_all_close(bool): whether all elements are close.
        max_abs_diff(float): maximum absolute difference over all outputs (NaN if any output diff is NaN).
        max_diff_output_index(int): index of the output that has the maximum absolute difference.
        messages(List[str]): a debug message for each output.
        is_top1_matched(bool): whether the arg-max position of the first output is the same for both.
    """
    is_all_close = True
    is_top1_matched = False
    max_diffs = []
    messages = []
    for i, ort_output in enumerate(ort_outputs):
        torch_output = (torch_outputs[0] if i == 0 else torch_outputs[1][i - 1]).cpu().numpy()
        is_close = numpy.allclose(ort_output, torch_output, atol=atol, rtol=0)
        is_all_close = is_all_close and is_close

        if numpy.isnan(torch_output).any():
            logger.debug(f'PyTorch output {i} has nan')
        if numpy.isinf(torch_output).any():
            logger.debug(f'PyTorch output {i} has inf')
        if numpy.isnan(ort_output).any():
            logger.debug(f'ORT output {i} has nan')
        if numpy.isinf(ort_output).any():
            logger.debug(f'ORT output {i} has inf')

        # The element-wise difference is computed once and reused for the maximum and the message.
        diff = numpy.fabs(ort_output - torch_output)
        max_diffs.append(numpy.amax(diff))
        idx = numpy.unravel_index(diff.argmax(), diff.shape)
        messages.append(
            f'diff={diff[idx]:.9f} index={idx} ort={ort_output[idx]:.9f} torch={float(torch_output[idx]):.9f}')

        if i == 0:  # logits
            ort_max_index = numpy.unravel_index(numpy.argmax(ort_output, axis=None), ort_output.shape)
            torch_max_index = numpy.unravel_index(numpy.argmax(torch_output, axis=None), torch_output.shape)
            is_top1_matched = numpy.array_equal(ort_max_index, torch_max_index)

    # numpy.argmax treats NaN as the maximum, whereas the builtin max() only returns NaN
    # when it happens to be the first element, which would hide NaN in later outputs.
    max_diff_output_index = int(numpy.argmax(max_diffs))
    return is_all_close, max_diffs[max_diff_output_index], max_diff_output_index, messages, is_top1_matched


@staticmethod
def save_outputs(i, ort_outputs, torch_outputs):
    """Pickle the ORT and PyTorch outputs of test case i into the current directory for debugging."""
    with open(f'ort_outputs_{i}.pickle', 'wb') as f:
        pickle.dump(ort_outputs, f)
        logger.info(f"ORT output are saved to ort_outputs_{i}.pickle")

    with open(f'torch_outputs_{i}.pickle', 'wb') as f:
        pickle.dump(torch_outputs, f)
        logger.info(f"Torch output are saved to torch_outputs_{i}.pickle")


@staticmethod
def save_inputs(i, dummy_inputs, ort_outputs=None, torch_outputs=None):
    """Pickle the inputs of test case i into the current directory for debugging.

    ort_outputs and torch_outputs are not used; they are optional so that both the
    two-argument call made by the parity test and the old four-argument form work.
    """
    with open(f'dummy_inputs_{i}.pickle', 'wb') as f:
        pickle.dump(dummy_inputs, f)
        logger.info(f"inputs are saved to dummy_inputs_{i}.pickle")
Onnx Runtime. """ config: GPT2Config = model.config logger.info( - f"Running parity test (rtol={rtol}, atol={atol}, test_cases={total_test_cases}, use_io_binding={use_io_binding} model_class={model_class} is_float16={is_float16}) ..." + f"Running parity test (atol={atol}, test_cases={total_test_cases}, use_io_binding={use_io_binding}, model_class={model_class}, is_float16={is_float16}) ..." ) max_batch_size = 8 @@ -558,7 +638,10 @@ class Gpt2Helper: output_buffers = Gpt2Helper.get_output_buffers(max_output_shapes, device, is_float16) passed_test_cases = 0 - for _ in range(total_test_cases): + top1_matched_cases = 0 + + max_abs_diff_list = [] + for i in range(total_test_cases): sequence_length = random.randint(1, max_seq_len) past_sequence_length = random.randint(0, max_past_seq_len) batch_size = random.randint(1, max_batch_size) @@ -569,7 +652,6 @@ class Gpt2Helper: config.num_attention_heads, config.hidden_size, config.n_layer, config.vocab_size, device, is_float16, has_position_ids, has_attention_mask) - outputs = Gpt2Helper.pytorch_inference(model, dummy_inputs) if use_io_binding: ort_outputs = Gpt2Helper.onnxruntime_inference(ort_session, dummy_inputs) @@ -579,13 +661,84 @@ class Gpt2Helper: ort_outputs = Gpt2Helper.onnxruntime_inference_with_binded_io(ort_session, dummy_inputs, output_buffers, output_shapes) - is_all_close = Gpt2Helper.compare_outputs(outputs, ort_outputs, rtol=rtol, atol=atol) + is_all_close, max_abs_diff, max_diff_output_index, messages, is_top1_matched = Gpt2Helper.compare_outputs_v2( + outputs, ort_outputs, atol=atol) + if not numpy.isnan(max_abs_diff): + max_abs_diff_list.append(max_abs_diff) if is_all_close: passed_test_cases += 1 - logger.info(f"Parity Test Cases={total_test_cases}; Passed={passed_test_cases}") + if is_top1_matched: + top1_matched_cases += 1 + + if verbose and not is_all_close: + logger.info( + f"test_case={i} batch_size={batch_size} past_sequence_length={past_sequence_length} sequence_length={sequence_length} 
@staticmethod
def test_performance(ort_session,
                     model,
                     device,
                     is_float16=False,
                     total_runs=100,
                     use_io_binding=True,
                     model_class="GPT2LMHeadModel",
                     has_position_ids=True,
                     has_attention_mask=True,
                     batch_size=8,
                     sequence_length=1,
                     past_sequence_length=32):
    """ Generate random inputs and measure average latency of Onnx Runtime.

    Args:
        ort_session: OnnxRuntime inference session to measure.
        model: PyTorch model; only its config is read here.
        device: device used for the dummy inputs and the bound output buffers.
        is_float16 (bool): forwarded to the dummy input and output buffer helpers.
        total_runs (int): number of runs forwarded to the inference helper.
        use_io_binding (bool): run with pre-allocated bound output buffers when True.
        model_class (str): forwarded to Gpt2Helper.get_output_shapes.
        has_position_ids (bool): forwarded to Gpt2Helper.get_dummy_inputs.
        has_attention_mask (bool): forwarded to Gpt2Helper.get_dummy_inputs.
        batch_size (int): batch size of the dummy inputs.
        sequence_length (int): sequence length of the dummy inputs.
        past_sequence_length (int): past sequence length of the dummy inputs.

    Returns:
        The latency value returned by the selected inference helper.
    """

    config: GPT2Config = model.config

    output_buffers = None
    if use_io_binding:
        output_shapes = Gpt2Helper.get_output_shapes(batch_size, past_sequence_length, sequence_length, config,
                                                     model_class)
        output_buffers = Gpt2Helper.get_output_buffers(output_shapes, device, is_float16)

    dummy_inputs = Gpt2Helper.get_dummy_inputs(batch_size, past_sequence_length, sequence_length,
                                               config.num_attention_heads, config.hidden_size, config.n_layer,
                                               config.vocab_size, device, is_float16, has_position_ids,
                                               has_attention_mask)

    # The bound-buffer path is only valid when the buffers were allocated above. The branches
    # used to be swapped, which ignored the buffers and failed on the undefined output_shapes
    # when use_io_binding was False.
    if use_io_binding:
        _, latency = Gpt2Helper.onnxruntime_inference_with_binded_io(ort_session, dummy_inputs, output_buffers,
                                                                     output_shapes, total_runs)
    else:
        _, latency = Gpt2Helper.onnxruntime_inference(ort_session, dummy_inputs, total_runs)

    return latency
# --------------------------------------------------------------------------

import argparse
import csv
import logging
import os
import statistics

from convert_to_onnx import main
from gpt2_helper import PRETRAINED_GPT2_MODELS
from benchmark_helper import setup_logger

logger = logging.getLogger('')


def parse_arguments(argv=None):
    """Parse command line arguments of the parity tool.

    Args:
        argv: list of argument strings; when None, argparse reads ``sys.argv``.

    Returns:
        The ``argparse.Namespace`` holding the parsed options.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('-m',
                        '--model_name_or_path',
                        required=True,
                        type=str,
                        help='Model path, or pretrained model name in the list: ' + ', '.join(PRETRAINED_GPT2_MODELS))

    parser.add_argument('--csv',
                        required=False,
                        type=str,
                        default='gpt2_parity_results.csv',
                        help='path of csv file to save the result')

    parser.add_argument('--runs',
                        required=False,
                        type=int,
                        default=5,
                        help="number of repeated runs to get median value of each metric")

    parser.add_argument('--use_gpu', required=False, action='store_true', help="use GPU for inference")
    parser.set_defaults(use_gpu=False)

    parser.add_argument('--all', required=False, action='store_true', help="run all combinations of mixed precision")
    parser.set_defaults(all=False)

    parser.add_argument('-e', '--use_external_data_format', required=False, action='store_true')
    parser.set_defaults(use_external_data_format=False)

    parser.add_argument('--verbose', required=False, action='store_true')
    parser.set_defaults(verbose=False)

    return parser.parse_args(argv)


def _median(values):
    """Return the median of values, or the low median when they cannot be averaged."""
    try:
        return statistics.median(values)
    except TypeError:
        # An even number of values that do not support arithmetic (for example strings)
        # cannot be averaged; pick an actual data point instead of failing.
        return statistics.median_low(values)


class ParityTask:
    """Repeat one experiment several times and append the per-metric median to a CSV file."""

    def __init__(self, total_runs, csv_path):
        """Store the number of repetitions and the path of the CSV file.

        Args:
            total_runs: how many times each experiment is executed.
            csv_path: CSV file that receives the results.
        """
        self.total_runs = total_runs
        self.csv_path = csv_path
        self.latency_name = "average_latency(batch_size=8,sequence_length=1,past_sequence_length=32)"
        self.metric_names = [
            self.latency_name, "diff_50_percentile", "diff_90_percentile", "diff_95_percentile", "diff_99_percentile",
            "diff_pass_rate", "nan_rate", "top1_match_rate", "onnx_size_in_MB"
        ]

    def run(self, argv, name):
        """Run ``main`` ``total_runs`` times and save the median of every metric.

        A failed run is logged and skipped. Nothing is saved when no run produced a result.

        Args:
            argv: argument list forwarded to ``main``.
            name: experiment name forwarded to ``main``.
        """
        results = []
        for run_id in range(self.total_runs):
            try:
                result = main(argv, experiment_name=name, run_id=run_id, csv_filename=self.csv_path)
            except Exception:
                # Only ordinary errors are skipped; KeyboardInterrupt and SystemExit still
                # stop the whole sweep instead of being silently swallowed.
                logger.exception(f"Failed to run experiment {name}")
                continue
            if result:
                results.append(result)

        if not results:
            return

        # Start from a copy of the first result so the non-metric columns are kept
        # without modifying the object returned by main.
        median_result = dict(results[0])
        for metric in self.metric_names:
            values = [result[metric] for result in results if metric in result]
            if values:  # a metric that no run reported has no median
                median_result[metric] = _median(values)

        self.save_result(median_result)

    def save_result(self, result):
        """Append one row with run_id "median" to the CSV file.

        The header is written only when the file did not exist before. Columns that are
        absent from ``result`` are reported in a warning and left blank.

        Args:
            result: mapping from column name to value.
        """
        csv_filename = self.csv_path
        column_names = [
            "experiment", "run_id", "model_name", "model_class", "gpu", "precision", "optimizer", "test_cases",
            "keep_io_types", "io_block_list", "op_block_list", "node_block_list", "ORT_TRANSFORMER_OPTIONS",
            "ORT_CUDA_GEMM_OPTIONS", "onnxruntime"
        ] + self.metric_names

        row = {column: result[column] for column in column_names if column in result}
        row["run_id"] = "median"

        missing_columns = [column for column in column_names if column not in row]
        if missing_columns:
            logger.warning(f"columns missing from result are left blank: {missing_columns}")

        csv_file_existed = os.path.exists(csv_filename)
        with open(csv_filename, mode="a", newline='') as csv_file:
            # DictWriter fills fields missing from the row with its default restval ('').
            csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
            if not csv_file_existed:
                csv_writer.writeheader()
            csv_writer.writerow(row)

        logger.info(f"result saved to {csv_filename}: {row}")


def run_parity(args):
    """Run the fp32 baseline and, with --use_gpu, the fp16 mixed precision experiments.

    Args:
        args: namespace returned by ``parse_arguments``.
    """
    task = ParityTask(args.runs, args.csv)

    model = args.model_name_or_path

    # Arguments are built as lists rather than by splitting a formatted string, so a
    # model path that contains whitespace stays a single argument.
    fp32_baseline = ["-m", model, "-o", "-p", "fp32"]
    if args.use_gpu:
        fp32_baseline.append("--use_gpu")

    if args.use_external_data_format:
        fp32_baseline.append("--use_external_data_format")

    task.run(fp32_baseline, "fp32 baseline")

    # The following tests for fp16 requires GPU
    if not args.use_gpu:
        logger.info("skip mixed precision since --use_gpu is not specified")
        return

    baseline = ["-m", model, "-o", "--use_gpu", "-p", "fp16"]
    if args.use_external_data_format:
        baseline.append("--use_external_data_format")
    task.run(baseline, "fp16 baseline")

    if not args.all:
        logger.info("skip remaining combinations since --all is not specified")
        return

    fp32_logits = ["--io_block_list", "logits"]
    task.run(baseline + fp32_logits, "fp16 except logits")

    fp32_io = ["--keep_io_types"]
    task.run(baseline + fp32_io, "Graph I/O FP32, Other FP16")

    op_list = "Attention Gather Add LayerNormalization FastGelu MatMul".split()
    task.run(baseline + fp32_io + ["--op_block_list"] + op_list, "Everthing in FP32")

    # Keep every operator but one in float32.
    for op in op_list:
        op_block_list = ["--op_block_list"] + [o for o in op_list if o != op]
        task.run(baseline + fp32_io + op_block_list, f"FP32 except {op} in fp16")

    # Keep a single operator in float32.
    for op in op_list:
        op_block_list = ["--op_block_list", op]
        task.run(baseline + op_block_list, f"FP16 except {op} in fp32")

    op_block_list = ["--op_block_list", "LayerNormalization", "FastGelu"]
    task.run(baseline + op_block_list, "FP16 except LayerNormalization and FastGelu in fp32")

    task.run(baseline + op_block_list + fp32_logits, "FP16 except logits, LayerNormalization and FastGelu in fp32")


if __name__ == '__main__':
    args = parse_arguments()
    setup_logger(args.verbose)

    run_parity(args)
helper.make_model(graph_def, producer_name='onnxruntime-tools') + self.model = helper.make_model(graph_def, producer_name='onnxruntime') # restore opset version self.model.opset_import[0].version = original_opset_version - def convert_model_float32_to_float16(self, cast_input_output=True, use_symbolic_shape_infer=True): - """Convert a graph to FLOAT16. By default, we will keep data types of inputs and outputs. - For decoder model with past_key_values, it is recommended to set cast_input_output=False for better performance. - Args: - cast_input_output (bool, optional): keep data type of inputs and outputs, and add Cast nodes to convert float32 inputs to float16, and float16 to float32 for outputs. Defaults to True. - use_symbolic_shape_infer (bool, optional): use symbolic shape inference instead of onnx shape inference. - """ - from packaging.version import Version - import onnxconverter_common as oc - if Version(oc.__version__) > Version("1.7.0"): - model = self.model - if use_symbolic_shape_infer: - # Use symbolic shape inference since custom operators (like Gelu, SkipLayerNormalization etc) are not recognized by onnx shape inference. - shape_infer_helper = SymbolicShapeInferenceHelper(model) - model = shape_infer_helper.infer_shapes(model, auto_merge=True, guess_output_rank=False) - self.model = oc.float16.convert_float_to_float16(model, - keep_io_types=cast_input_output, - disable_shape_infer=use_symbolic_shape_infer) - return + def _naive_float_to_float16(self, keep_io_types=True): + """Convert model from single precision to half precision naively. + It might generate invalid model or cause precision loss. + Args: + cast_input_output (bool, optional): [description]. Defaults to True. 
def get_dtype(self, input_or_output: str):
    """Try to get the element type of a tensor given its name.

    The name is looked up in the graph's value_info first, then among graph inputs,
    then among graph outputs.

    Args:
        input_or_output: tensor name to look up.

    Returns:
        The ``elem_type`` of the first match, or None when the name is not found.
    """
    tensor_type_map = {obj.name: obj.type for obj in self.model.graph.value_info}

    if input_or_output in tensor_type_map:
        return tensor_type_map[input_or_output].tensor_type.elem_type

    graph_input = self.find_graph_input(input_or_output)
    if graph_input:
        return graph_input.type.tensor_type.elem_type

    graph_output = self.find_graph_output(input_or_output)
    if graph_output:
        return graph_output.type.tensor_type.elem_type

    return None


def convert_model_float32_to_float16(self, cast_input_output=True, **kwargs):
    """Deprecated: convert the model to float16 with the naive converter.

    Args:
        cast_input_output: forwarded as ``keep_io_types`` to ``_naive_float_to_float16``.
        kwargs: accepted for backward compatibility and ignored.
    """
    logger.warning(
        'The function convert_model_float32_to_float16 is deprecated. Use convert_float_to_float16 instead!')
    self._naive_float_to_float16(keep_io_types=cast_input_output)


def convert_float_to_float16(self, use_symbolic_shape_infer=True, **kwargs):
    """Convert the graph to float16, then remove redundant Cast nodes.

    By default the data types of graph inputs and outputs are kept.

    Args:
        use_symbolic_shape_infer (bool, optional): use symbolic shape inference instead of onnx shape inference.
        kwargs: parameters for float16 conversion. Only keep_io_types (defaults to True when absent),
            min_positive_val, max_finite_val, op_block_list and node_block_list are forwarded.
    """
    kwargs.setdefault("keep_io_types", True)

    # TODO: import from onnxconverter_common when it is stable
    from float16 import convert_float_to_float16

    model = self.model
    if use_symbolic_shape_infer:
        # Use symbolic shape inference since custom operators (like Gelu, SkipLayerNormalization etc)
        # are not recognized by onnx shape inference.
        shape_infer_helper = SymbolicShapeInferenceHelper(model)
        model = shape_infer_helper.infer_shapes(model, auto_merge=True, guess_output_rank=False)

    parameters = {'disable_shape_infer': use_symbolic_shape_infer}
    parameters.update({
        key: kwargs[key]
        for key in ['keep_io_types', 'min_positive_val', 'max_finite_val', 'op_block_list', 'node_block_list']
        if key in kwargs
    })

    fp16_model = convert_float_to_float16(model, **parameters)
    self.initialize(fp16_model)

    def get_node_attribute(node, attribute_name: str):
        """Return the value of the named attribute of node, or None when it is absent."""
        for attr in node.attribute:
            if attr.name == attribute_name:
                return helper.get_attribute_value(attr)
        return None

    float_type = int(TensorProto.FLOAT)
    float16_type = int(TensorProto.FLOAT16)

    # convert_float_to_float16 might add Cast(to=10) --> Cast(to=1) when two consecutive nodes are
    # computed in FP32. The post-processing below removes those Cast nodes.
    # First pass: remove the first Cast node in a path like --> Cast(to=10) --> Cast(to=1) -->
    nodes_to_remove = []
    for node in self.nodes():
        if node.op_type != "Cast" or get_node_attribute(node, "to") != float_type:
            continue
        parent = self.get_parent(node, 0)
        if not parent or parent.op_type != "Cast" or get_node_attribute(parent, "to") != float16_type:
            continue
        # The parent cannot be removed if its output is used by multiple nodes or is a graph output.
        # NOTE(review): this used to compare the get_children result itself with 1, which is never
        # true if get_children returns a collection of nodes (assumed here) - confirm.
        if len(self.get_children(parent)) == 1 and not self.find_graph_output(parent.output[0]):
            self.replace_input_of_all_nodes(parent.output[0], parent.input[0])
            nodes_to_remove.append(parent)

    # Second pass: remove Cast(to=1) nodes whose input is already float32.
    for node in self.nodes():
        if node.op_type == "Cast" and get_node_attribute(node, "to") == float_type and \
            self.get_dtype(node.input[0]) == float_type:

            if self.find_graph_output(node.output[0]):
                # Keep the graph output name by renaming the output of the producer.
                self.replace_output_of_all_nodes(node.input[0], node.output[0])
            else:
                self.replace_input_of_all_nodes(node.output[0], node.input[0])
            nodes_to_remove.append(node)

    self.remove_nodes(nodes_to_remove)

    if nodes_to_remove:
        self.prune_graph()
        logger.info(f"removed {len(nodes_to_remove)} Cast nodes from float16 model")
diff --git a/onnxruntime/python/tools/transformers/requirements.txt b/onnxruntime/python/tools/transformers/requirements.txt new file mode 100644 index 0000000000..6f58a38d4a --- /dev/null +++ b/onnxruntime/python/tools/transformers/requirements.txt @@ -0,0 +1,11 @@ +onnx >= 1.8 +numpy +coloredlogs +psutil +py-cpuinfo +py3nvml +packaging +transformers >= 4.0 + +# please follow https://pytorch.org/ to install PyTorch for your OS +torch >= 1.8 \ No newline at end of file diff --git a/onnxruntime/python/tools/transformers/requirements_cpu.txt b/onnxruntime/python/tools/transformers/requirements_cpu.txt deleted file mode 100644 index ff4e066ed9..0000000000 --- a/onnxruntime/python/tools/transformers/requirements_cpu.txt +++ /dev/null @@ -1,14 +0,0 @@ -onnx -numpy -coloredlogs -psutil -py-cpuinfo -py3nvml -packaging -transformers -onnxruntime -onnxconverter_common ---find-links https://download.pytorch.org/whl/torch_stable.html -torch==1.7.1+cpu -torchvision==0.8.2+cpu -torchaudio===0.7.2 diff --git a/onnxruntime/python/tools/transformers/requirements_gpu.txt b/onnxruntime/python/tools/transformers/requirements_gpu.txt deleted file mode 100644 index d3dc4e8aee..0000000000 --- a/onnxruntime/python/tools/transformers/requirements_gpu.txt +++ /dev/null @@ -1,14 +0,0 @@ -onnx -numpy -coloredlogs -psutil -py-cpuinfo -py3nvml -packaging -transformers -onnxruntime-gpu -onnxconverter_common ---find-links https://download.pytorch.org/whl/torch_stable.html -torch===1.7.1 -torchvision===0.8.2 -torchaudio===0.7.2 \ No newline at end of file