Add gpt2 mixed precision conversion and parity tools (#8845)

This commit is contained in:
Tianlei Wu 2021-08-26 15:34:45 -07:00 committed by GitHub
parent e8564d6597
commit cb59f46e04
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 937 additions and 80 deletions

View file

@ -34,7 +34,7 @@ from benchmark_helper import create_onnxruntime_session, setup_logger, prepare_e
logger = logging.getLogger('')
def parse_arguments():
def parse_arguments(argv=None):
parser = argparse.ArgumentParser()
parser.add_argument('-m',
@ -94,6 +94,13 @@ def parse_arguments():
choices=list(Precision),
help="Precision of model to run. fp32 for full precision, fp16 for half precision, and int8 for quantization")
parser.add_argument("-t",
"--test_cases",
required=False,
type=int,
default=1000,
help="Number of test cases for parity")
parser.add_argument('--verbose', required=False, action='store_true')
parser.set_defaults(verbose=False)
@ -135,20 +142,48 @@ def parse_arguments():
help='Nuclear/top-p sampling accumulation probability.')
sampling_option_group.add_argument('--do_sample_top_k', type=int, default=0, help='Use top-k if non-zero.')
args = parser.parse_args()
fp16_option_group = parser.add_argument_group(
"float to float16 conversion parameters that works when \"--precision fp16\" is specified")
fp16_option_group.add_argument('--keep_io_types',
required=False,
action='store_true',
help='Use float32 for past inputs, present and logits outputs.')
fp16_option_group.set_defaults(keep_io_types=False)
fp16_option_group.add_argument('--io_block_list',
nargs='+',
default=[],
help='List of inputs or outputs in float32 instead of float16')
fp16_option_group.add_argument(
'--op_block_list',
nargs='+',
default=[],
help=
'List of operators (like Attention Gather Add LayerNormalization FastGelu MatMul) to compute in float32 instead of float16.'
)
fp16_option_group.add_argument('--node_block_list',
nargs='+',
default=[],
help='List of node names to compute in float32 instead of float16.')
args = parser.parse_args(argv)
return args
def main():
def main(argv=None, experiment_name="", run_id=0, csv_filename="gpt2_parity_results.csv"):
result = {}
from transformers import __version__ as transformers_version
if version.parse(transformers_version) < version.parse(
"3.1.0"): # past_key_values name does not exist in 3.0.2 or older
raise RuntimeError("This tool requires transformers 3.1.0 or later.")
args = parse_arguments()
args = parse_arguments(argv)
setup_logger(args.verbose)
if not experiment_name:
import sys
experiment_name = " ".join(argv if argv else sys.argv[1:])
if args.tolerance == 0:
args.tolerance = DEFAULT_TOLERANCE[args.precision]
@ -219,6 +254,7 @@ def main():
logger.info(f"Exporting ONNX model to {raw_onnx_model}")
use_padding = MODEL_CLASSES[args.model_class][2]
gpt2helper.export_onnx(model,
device,
raw_onnx_model,
@ -227,13 +263,23 @@ def main():
has_position_ids=use_padding,
has_attention_mask=use_padding)
fp16_params = {"keep_io_types": args.keep_io_types}
if args.io_block_list:
fp16_params["keep_io_types"] = args.io_block_list
if args.node_block_list:
fp16_params["node_block_list"] = args.node_block_list
if args.op_block_list:
fp16_params["op_block_list"] = args.op_block_list
is_io_float16 = (args.precision == Precision.FLOAT16 and not args.keep_io_types)
if args.optimize_onnx or args.precision != Precision.FLOAT32:
output_path = onnx_model_paths[str(args.precision) if args.precision != Precision.INT8 else 'fp32']
logger.info(f"Optimizing model to {output_path}")
gpt2helper.optimize_onnx(raw_onnx_model, output_path, args.precision == Precision.FLOAT16,
model.config.num_attention_heads, model.config.hidden_size,
args.use_external_data_format)
args.use_external_data_format, **fp16_params)
else:
output_path = raw_onnx_model
@ -252,16 +298,80 @@ def main():
logger.info(f"Output path: {output_path}")
session = create_onnxruntime_session(output_path, args.use_gpu, enable_all_optimization=True, verbose=args.verbose)
if session is not None:
gpt2helper.test_parity(session,
model,
device,
args.precision == Precision.FLOAT16,
rtol=args.tolerance,
atol=args.tolerance,
model_class=args.model_class,
has_position_ids=use_padding,
has_attention_mask=use_padding)
if args.model_class == "GPT2LMHeadModel" and session is not None:
parity_result = gpt2helper.test_parity(session,
model,
device,
is_io_float16,
rtol=args.tolerance,
atol=args.tolerance,
model_class=args.model_class,
has_position_ids=use_padding,
has_attention_mask=use_padding,
total_test_cases=args.test_cases,
verbose=args.verbose)
latency = gpt2helper.test_performance(session,
model,
device,
is_io_float16,
total_runs=100,
use_io_binding=True,
model_class=args.model_class,
has_position_ids=use_padding,
has_attention_mask=use_padding,
batch_size=8,
sequence_length=1,
past_sequence_length=32)
if args.precision == Precision.FLOAT16:
logger.info(f"fp16 conversion parameters:{fp16_params}")
# Write results to file
import csv
from onnxruntime import __version__ as ort_version
latency_name = "average_latency(batch_size=8,sequence_length=1,past_sequence_length=32)"
csv_file_existed = os.path.exists(csv_filename)
with open(csv_filename, mode="a", newline='') as csv_file:
column_names = [
"experiment", "run_id", "model_name", "model_class", "gpu", "precision", "optimizer", "test_cases",
"keep_io_types", "io_block_list", "op_block_list", "node_block_list", "ORT_TRANSFORMER_OPTIONS",
"ORT_CUDA_GEMM_OPTIONS", "onnxruntime", latency_name, "diff_50_percentile", "diff_90_percentile",
"diff_95_percentile", "diff_99_percentile", "diff_pass_rate", "nan_rate", "top1_match_rate",
"onnx_size_in_MB"
]
csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
if not csv_file_existed:
csv_writer.writeheader()
row = {
"experiment": experiment_name,
"run_id": run_id,
"model_name": args.model_name_or_path,
"model_class": args.model_class,
"gpu": args.use_gpu,
"precision": args.precision,
"optimizer": args.optimize_onnx,
"test_cases": args.test_cases,
"keep_io_types": args.keep_io_types,
"io_block_list": args.io_block_list,
"op_block_list": args.op_block_list,
"node_block_list": args.node_block_list,
"ORT_TRANSFORMER_OPTIONS": os.getenv('ORT_TRANSFORMER_OPTIONS'),
"ORT_CUDA_GEMM_OPTIONS": os.getenv('ORT_CUDA_GEMM_OPTIONS'),
"onnxruntime": ort_version,
latency_name: f"{latency:.2f}",
"diff_50_percentile": parity_result["max_diff_percentile_50"],
"diff_90_percentile": parity_result["max_diff_percentile_90"],
"diff_95_percentile": parity_result["max_diff_percentile_95"],
"diff_99_percentile": parity_result["max_diff_percentile_99"],
"diff_pass_rate": parity_result["diff_pass_rate"],
"nan_rate": parity_result["nan_rate"],
"top1_match_rate": parity_result["top1_match_rate"],
"onnx_size_in_MB": "{}".format(int(os.path.getsize(output_path) / 1024 / 1024))
}
logger.info(f"result: {row}")
result.update(row)
csv_writer.writerow(row)
if args.input_test_file:
test_inputs = []
@ -275,14 +385,12 @@ def main():
if use_padding:
if "attention_mask" in data:
numpy_float = numpy.float16 if args.precision == Precision.FLOAT16 else numpy.float32
numpy_float = numpy.float16 if is_io_float16 else numpy.float32
attention_mask = torch.from_numpy(numpy.asarray(data["attention_mask"],
dtype=numpy_float)).to(device)
else:
padding = -1
attention_mask = (
input_ids !=
padding).type(torch.float16 if args.precision == Precision.FLOAT16 else torch.float32)
attention_mask = (input_ids != padding).type(torch.float16 if is_io_float16 else torch.float32)
input_ids.masked_fill_(input_ids == padding, 0)
if "position_ids" in data:
@ -324,6 +432,7 @@ def main():
save_test_data_dir=Path(output_path).parent)
logger.info(f"Done. Output model: {output_path}")
return result
if __name__ == '__main__':

View file

@ -0,0 +1,344 @@
#-------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
#--------------------------------------------------------------------------
# This file is modified from https://github.com/microsoft/onnxconverter-common/blob/master/onnxconverter_common/float16.py
import itertools
import numpy as np
import onnx
from onnx import helper, numpy_helper
from onnx import onnx_pb as onnx_proto
def _npfloat16_to_int(np_list):
'''
Convert numpy float16 to python int.
:param np_list: numpy float16 list
:return int_list: python int list
'''
return [int(bin(_.view('H'))[2:].zfill(16), 2) for _ in np_list]
def convert_np_to_float16(np_array, min_positive_val=1e-7, max_finite_val=1e4):
    '''
    Cast a numpy array to float16 while preserving sign and finiteness.

    Non-zero magnitudes below min_positive_val are raised to min_positive_val and
    finite magnitudes above max_finite_val are lowered to max_finite_val, keeping
    the original sign. NaN, 0, inf, and -inf pass through untouched.
    '''
    def strictly_inside(lower, values, upper):
        return np.logical_and(lower < values, values < upper)

    # (open interval lower bound, open interval upper bound, replacement value),
    # applied in this order.
    clamp_rules = (
        (0, min_positive_val, min_positive_val),
        (-min_positive_val, 0, -min_positive_val),
        (max_finite_val, float('inf'), max_finite_val),
        (float('-inf'), -max_finite_val, -max_finite_val),
    )
    for lower, upper, replacement in clamp_rules:
        np_array = np.where(strictly_inside(lower, np_array, upper), replacement, np_array)
    return np.float16(np_array)
def convert_tensor_float_to_float16(tensor, min_positive_val=1e-7, max_finite_val=1e4):
    '''
    Convert a float TensorProto to float16 in place.

    Tensors whose data_type is not FLOAT are returned unchanged.

    :param tensor: TensorProto object
    :param min_positive_val: smallest non-zero magnitude kept after conversion
    :param max_finite_val: largest finite magnitude kept after conversion
    :return tensor_float16: the same TensorProto object, converted
    :raises ValueError: if tensor is not a TensorProto

    Example:

    ::

        new_tensor = convert_tensor_float_to_float16(tensor)
    '''
    if not isinstance(tensor, onnx_proto.TensorProto):
        raise ValueError('Expected input type is an ONNX TensorProto but got %s' % type(tensor))

    if tensor.data_type == onnx_proto.TensorProto.FLOAT:
        tensor.data_type = onnx_proto.TensorProto.FLOAT16
        # Values stored in float_data are moved to int32_data as raw 16-bit patterns.
        if tensor.float_data:
            float16_data = convert_np_to_float16(np.array(tensor.float_data), min_positive_val, max_finite_val)
            tensor.int32_data[:] = _npfloat16_to_int(float16_data)
            tensor.float_data[:] = []
        # Values stored in raw_data (bytes) are re-encoded as float16 bytes.
        if tensor.raw_data:
            # np.frombuffer / tobytes replace the deprecated np.fromstring / tostring,
            # which are no longer available in current NumPy releases.
            float32_values = np.frombuffer(tensor.raw_data, dtype='float32')
            float16_values = convert_np_to_float16(float32_values, min_positive_val, max_finite_val)
            tensor.raw_data = float16_values.tobytes()
    return tensor
def make_value_info_from_tensor(tensor):
    '''
    Build a tensor value info that carries the name, data type and shape of ``tensor``.

    The shape is read from the numpy array decoded from the tensor.
    '''
    decoded = numpy_helper.to_array(tensor)
    return helper.make_tensor_value_info(tensor.name, tensor.data_type, decoded.shape)
# Operator types that convert_float_to_float16 keeps in float32 when the caller
# does not pass its own op_block_list. Nodes with one of these op types are not
# converted; Cast nodes are inserted around them instead.
DEFAULT_OP_BLOCK_LIST = [
'ArrayFeatureExtractor', 'Binarizer', 'CastMap', 'CategoryMapper', 'DictVectorizer', 'FeatureVectorizer', 'Imputer',
'LabelEncoder', 'LinearClassifier', 'LinearRegressor', 'Normalizer', 'OneHotEncoder', 'SVMClassifier',
'SVMRegressor', 'Scaler', 'TreeEnsembleClassifier', 'TreeEnsembleRegressor', 'ZipMap', 'NonMaxSuppression', 'TopK',
'RoiAlign', 'Resize', 'Range', 'CumSum', 'Min', 'Max', 'Upsample'
]
def convert_float_to_float16(model,
                             min_positive_val=1e-7,
                             max_finite_val=1e4,
                             keep_io_types=False,
                             disable_shape_infer=False,
                             op_block_list=None,
                             node_block_list=None):
    '''
    Convert tensor float type in the ONNX ModelProto input to tensor float16.

    :param model: ONNX ModelProto object
    :param min_positive_val: smallest non-zero magnitude kept when converting tensor data
    :param max_finite_val: largest finite magnitude kept when converting tensor data
    :param keep_io_types: False converts all float graph inputs/outputs to float16;
                          True keeps all of them in float32; a list of names keeps only
                          the listed float inputs/outputs in float32. Kept inputs/outputs
                          are connected to the float16 graph through Cast nodes.
    :param disable_shape_infer: Type/shape information is needed for conversion to work.
                                Set to True only if the model already has type/shape information for all tensors.
    :param op_block_list: operator types to keep in float32 (DEFAULT_OP_BLOCK_LIST when None)
    :param node_block_list: node names to keep in float32
    :return: converted ONNX ModelProto object
    :raises ValueError: if model is not a ModelProto

    Examples:

    ::

        Example 1: Convert ONNX ModelProto object:
        new_onnx_model = convert_float_to_float16(onnx_model)

        Example 2: Convert ONNX model binary file:
        onnx_model = onnx.load('model.onnx')
        new_onnx_model = convert_float_to_float16(onnx_model)
        onnx.save(new_onnx_model, 'new_model.onnx')
    '''
    func_infer_shape = None
    if not disable_shape_infer:
        # Compare the version numerically. As strings, '1.10' sorts before '1.2',
        # which would silently disable shape inference on onnx 1.10 and later.
        import re
        version_match = re.match(r'(\d+)\.(\d+)', onnx.__version__)
        onnx_version = tuple(int(part) for part in version_match.groups()) if version_match else (0, 0)
        if onnx_version >= (1, 2):
            from onnx.shape_inference import infer_shapes
            func_infer_shape = infer_shapes

    if not isinstance(model, onnx_proto.ModelProto):
        raise ValueError('Expected model type is an ONNX ModelProto but got %s' % type(model))

    float_type = onnx_proto.TensorProto.FLOAT
    float16_type = onnx_proto.TensorProto.FLOAT16

    # create blocklists
    if op_block_list is None:
        op_block_list = DEFAULT_OP_BLOCK_LIST
    if node_block_list is None:
        node_block_list = []
    op_block_list = set(op_block_list)
    node_block_list = set(node_block_list)

    # create a queue for BFS
    queue = []
    value_info_list = []
    node_list = []
    # type inference on input model
    if func_infer_shape is not None:
        model = func_infer_shape(model)
    queue.append(model)
    name_mapping = {}
    graph_io_to_skip = set()
    io_casts = set()

    # Decide which float graph inputs/outputs stay in float32.
    fp32_inputs = [n.name for n in model.graph.input if n.type.tensor_type.elem_type == float_type]
    fp32_outputs = [n.name for n in model.graph.output if n.type.tensor_type.elem_type == float_type]
    if isinstance(keep_io_types, list):
        fp32_inputs = [n for n in fp32_inputs if n in keep_io_types]
        fp32_outputs = [n for n in fp32_outputs if n in keep_io_types]
        print("keep_io_types", keep_io_types, "fp32_inputs", fp32_inputs, "fp32_outputs", fp32_outputs)
    elif not keep_io_types:
        fp32_inputs = []
        fp32_outputs = []

    for i, n in enumerate(model.graph.input):
        if n.name in fp32_inputs:
            output_name = 'graph_input_cast_' + str(i)
            name_mapping[n.name] = output_name
            graph_io_to_skip.add(n.name)

            node_name = 'graph_input_cast' + str(i)
            new_value_info = model.graph.value_info.add()
            new_value_info.CopyFrom(n)
            new_value_info.name = output_name
            new_value_info.type.tensor_type.elem_type = float16_type
            # add Cast node (from tensor(float) to tensor(float16)) after graph input
            new_node = [helper.make_node('Cast', [n.name], [output_name], to=float16_type, name=node_name)]
            model.graph.node.extend(new_node)
            value_info_list.append(new_value_info)
            io_casts.add(node_name)

    for i, n in enumerate(model.graph.output):
        if n.name in fp32_outputs:
            input_name = 'graph_output_cast_' + str(i)
            name_mapping[n.name] = input_name
            graph_io_to_skip.add(n.name)

            node_name = 'graph_output_cast' + str(i)
            # add Cast node (from tensor(float16) to tensor(float)) before graph output
            new_value_info = model.graph.value_info.add()
            new_value_info.CopyFrom(n)
            new_value_info.name = input_name
            new_value_info.type.tensor_type.elem_type = float16_type
            new_node = [helper.make_node('Cast', [input_name], [n.name], to=float_type, name=node_name)]
            model.graph.node.extend(new_node)
            value_info_list.append(new_value_info)
            io_casts.add(node_name)

    # initializer name -> [uses by float16 nodes, uses by float32 (blocked) nodes]
    fp32_initializer_counters = {}

    while queue:
        next_level = []
        for q in queue:
            # if q is model, push q.graph (GraphProto)
            if isinstance(q, onnx_proto.ModelProto):
                next_level.append(q.graph)
            # if q is model.graph, push q.node.attribute (AttributeProto)
            if isinstance(q, onnx_proto.GraphProto):
                for n in q.initializer:  # TensorProto type
                    if n.data_type == float_type:
                        fp32_initializer_counters[n.name] = [0, 0]

                for n in q.node:
                    # the Cast nodes added above for graph inputs/outputs are already final
                    if n.name in io_casts:
                        continue
                    # rewire nodes to the float16 side of the graph input/output casts
                    for i in range(len(n.input)):
                        if n.input[i] in name_mapping:
                            n.input[i] = name_mapping[n.input[i]]
                    for i in range(len(n.output)):
                        if n.output[i] in name_mapping:
                            n.output[i] = name_mapping[n.output[i]]

                    is_node_blocked = n.op_type in op_block_list or n.name in node_block_list
                    for input_name in n.input:
                        if input_name in fp32_initializer_counters:
                            fp32_initializer_counters[input_name][int(is_node_blocked)] += 1

                    if is_node_blocked:
                        # the node stays in float32; save it so Cast nodes can be added around it later
                        node_list.append(n)
                    else:
                        # a Cast to float inside the converted region becomes a Cast to float16
                        if n.op_type == 'Cast':
                            for attr in n.attribute:
                                if attr.name == 'to' and attr.i == float_type:
                                    attr.i = float16_type
                                    break
                        for attr in n.attribute:
                            next_level.append(attr)
            # if q is model.graph.node.attribute, push q.g and q.graphs (GraphProto)
            # and process node.attribute.t and node.attribute.tensors (TensorProto)
            if isinstance(q, onnx_proto.AttributeProto):
                next_level.append(q.g)
                for n in q.graphs:
                    next_level.append(n)
                q.t.CopyFrom(convert_tensor_float_to_float16(q.t, min_positive_val, max_finite_val))
                for tensor in q.tensors:
                    # converted in place; the return value is the same object
                    convert_tensor_float_to_float16(tensor, min_positive_val, max_finite_val)
            # if q is graph, process graph.initializer(TensorProto), input, output and value_info (ValueInfoProto)
            if isinstance(q, onnx_proto.GraphProto):
                for n in q.initializer:  # TensorProto type
                    if n.data_type == float_type:
                        # TODO: handle initializer that used by subgraph
                        if fp32_initializer_counters[n.name][1] == 0:  # not used by fp32 node
                            n = convert_tensor_float_to_float16(n, min_positive_val, max_finite_val)
                            value_info_list.append(make_value_info_from_tensor(n))
                        else:
                            # TODO: add a cast node to handle the case that an initializer is used by both fp32 and fp16 nodes
                            assert fp32_initializer_counters[n.name][0] == 0
                # for all ValueInfoProto with tensor(float) type in input, output and value_info, convert them to
                # tensor(float16) except map and seq(map). And save them in value_info_list for further processing
                for n in itertools.chain(q.input, q.output, q.value_info):
                    if n.type.tensor_type.elem_type == float_type:
                        if n.name not in graph_io_to_skip:
                            n.type.tensor_type.elem_type = float16_type
                            value_info_list.append(n)
        queue = next_level

    # Index the float16 value infos by name once (first occurrence wins, matching a
    # front-to-back scan) so each lookup below is O(1) instead of a full list scan.
    float16_value_infos = {}
    for value_info in value_info_list:
        float16_value_infos.setdefault(value_info.name, value_info)

    # process the nodes in block list that doesn't support tensor(float16)
    for node in node_list:
        # if an input is tensor(float16), insert a float16 to float Cast node before the node,
        # change current node's input name and create new value_info for the new name
        for i in range(len(node.input)):
            input_name = node.input[i]
            value_info = float16_value_infos.get(input_name)
            if value_info is None:
                continue
            # create new value_info for current node's new input name
            new_value_info = model.graph.value_info.add()
            new_value_info.CopyFrom(value_info)
            output_name = node.name + '_input_cast_' + str(i)
            new_value_info.name = output_name
            new_value_info.type.tensor_type.elem_type = float_type
            # add Cast node (from tensor(float16) to tensor(float)) before current node
            node_name = node.name + '_input_cast' + str(i)
            new_node = [helper.make_node('Cast', [input_name], [output_name], to=float_type, name=node_name)]
            model.graph.node.extend(new_node)
            # change current node's input name
            node.input[i] = output_name
        # if an output is tensor(float16), insert a float to float16 Cast node after the node,
        # change current node's output name and create new value_info for the new name
        for i in range(len(node.output)):
            output_name = node.output[i]
            value_info = float16_value_infos.get(output_name)
            if value_info is None:
                continue
            # create new value_info for current node's new output
            new_value_info = model.graph.value_info.add()
            new_value_info.CopyFrom(value_info)
            cast_input_name = node.name + '_output_cast_' + str(i)
            new_value_info.name = cast_input_name
            new_value_info.type.tensor_type.elem_type = float_type
            # add Cast node (from tensor(float) to tensor(float16)) after current node
            node_name = node.name + '_output_cast' + str(i)
            new_node = [helper.make_node('Cast', [cast_input_name], [output_name], to=float16_type, name=node_name)]
            model.graph.node.extend(new_node)
            # change current node's output name
            node.output[i] = cast_input_name
    return model
def convert_float_to_float16_model_path(model_path, min_positive_val=1e-7, max_finite_val=1e4, keep_io_types=False):
    '''
    Convert tensor float type in the ONNX Model to tensor float16.
    *It is to fix an issue that infer_shapes func cannot be used to infer >2GB models.
    *But this function can be applied to all model sizes.

    :param model_path: ONNX Model path
    :param min_positive_val: smallest non-zero magnitude kept when converting tensor data
    :param max_finite_val: largest finite magnitude kept when converting tensor data
    :param keep_io_types: forwarded to convert_float_to_float16
    :return: converted ONNX ModelProto object

    Examples

    ::

        #Convert to ONNX ModelProto object and save model binary file:
        new_onnx_model = convert_float_to_float16_model_path('model.onnx')
        onnx.save(new_onnx_model, 'new_model.onnx')
    '''
    import re

    # Compare the version numerically. As strings, '1.10' sorts before '1.8',
    # which would skip path-based shape inference on onnx 1.10 and later.
    version_match = re.match(r'(\d+)\.(\d+)', onnx.__version__)
    onnx_version = tuple(int(part) for part in version_match.groups()) if version_match else (0, 0)

    if onnx_version >= (1, 8):
        # infer_shapes_path can be applied to all model sizes
        from onnx.shape_inference import infer_shapes_path
        import tempfile
        import os
        # shape_infer_model_path should be in the same folder of model_path
        with tempfile.NamedTemporaryFile(dir=os.path.dirname(model_path)) as tmpfile:
            shape_infer_model_path = tmpfile.name
            infer_shapes_path(model_path, shape_infer_model_path)
            model = onnx.load(shape_infer_model_path)
        # shapes were already inferred on disk, so the in-memory pass is skipped
        disable_shape_infer = True
    else:
        model = onnx.load(model_path)
        disable_shape_infer = False
    return convert_float_to_float16(model, min_positive_val, max_finite_val, keep_io_types, disable_shape_infer)

View file

@ -12,6 +12,7 @@ import random
import numpy
import time
import re
import pickle
from pathlib import Path
from typing import List, Dict, Tuple, Union
from transformers import GPT2Model, GPT2LMHeadModel, GPT2Config, TFGPT2Model
@ -99,7 +100,9 @@ class MyGPT2LMHeadModel_NoPadding(GPT2LMHeadModel):
super().__init__(config)
def forward(self, input_ids, *past):
return super().forward(input_ids, past_key_values=past)
result = super().forward(input_ids, past_key_values=past, return_dict=False)
return MyGPT2Model.post_process(result, self.config.n_layer)
# Maps model class name to a tuple of model class, name of first output and use padding or not
@ -154,7 +157,7 @@ class Gpt2Helper:
float_type = torch.float16 if float16 else torch.float32
past_shape = [2, batch_size, num_attention_heads, past_sequence_length, int(hidden_size / num_attention_heads)]
past = [torch.rand(past_shape, dtype=float_type, device=device) for _ in range(num_layer)]
past = [(torch.rand(past_shape, dtype=float_type, device=device) * 2.0 - 1.0) for _ in range(num_layer)]
input_ids = torch.randint(low=0,
high=vocab_size - 1,
size=(batch_size, sequence_length),
@ -261,6 +264,53 @@ class Gpt2Helper:
return is_all_close
@staticmethod
def compare_outputs_v2(torch_outputs, ort_outputs, atol=1e-06):
    """Compare outputs from PyTorch and OnnxRuntime

    Output 0 of ``ort_outputs`` is compared with ``torch_outputs[0]``; output
    ``i`` (i > 0) is compared with ``torch_outputs[1][i - 1]``.

    Args:
        torch_outputs (Tuple[Torch.Tensor]): PyTorch model output
        ort_outputs (List[numpy.ndarray]): OnnxRuntime output
        atol (float, optional): Absolute tolerance. Defaults to 1e-06.

    Returns:
        is_all_close(bool): whether all elements of all outputs are within atol.
        max_abs_diff(float): maximum absolute difference over all outputs.
        max_diff_output_index(int): index of the output that has the maximum difference.
        messages(List[str]): one debug message per output describing its largest difference.
        is_top1_matched(bool): whether the argmax position of output 0 is the same in both results.

    Raises:
        ValueError: if ort_outputs is empty.
    """
    is_all_close = True
    is_top1_matched = False
    max_diffs = []
    messages = []
    for i, ort_output in enumerate(ort_outputs):
        torch_output = (torch_outputs[0] if i == 0 else torch_outputs[1][i - 1]).cpu().numpy()
        is_close = numpy.allclose(ort_output, torch_output, atol=atol, rtol=0)
        is_all_close = is_all_close and is_close

        # The absolute difference is computed once and reused for the maximum and the message.
        diff = numpy.fabs(ort_output - torch_output)
        max_diffs.append(numpy.amax(diff))

        if numpy.isnan(torch_output).any():
            logger.debug(f'PyTorch output {i} has nan')
        if numpy.isinf(torch_output).any():
            logger.debug(f'PyTorch output {i} has inf')
        if numpy.isnan(ort_output).any():
            logger.debug(f'ORT output {i} has nan')
        if numpy.isinf(ort_output).any():
            logger.debug(f'ORT output {i} has inf')

        idx = numpy.unravel_index(diff.argmax(), diff.shape)
        messages.append(
            f'diff={diff[idx]:.9f} index={idx} ort={ort_output[idx]:.9f} torch={float(torch_output[idx]):.9f}')

        if i == 0:  # logits
            ort_max_index = numpy.unravel_index(numpy.argmax(ort_output, axis=None), ort_output.shape)
            torch_max_index = numpy.unravel_index(numpy.argmax(torch_output, axis=None), torch_output.shape)
            is_top1_matched = numpy.array_equal(ort_max_index, torch_max_index)

    if not max_diffs:
        raise ValueError("ort_outputs is empty: there is nothing to compare")

    max_diff_output_index = max_diffs.index(max(max_diffs))
    return is_all_close, max(max_diffs), max_diff_output_index, messages, is_top1_matched
@staticmethod
def export_onnx(model,
device,
@ -345,19 +395,31 @@ class Gpt2Helper:
is_float16,
num_attention_heads,
hidden_size,
use_external_data_format=False):
use_external_data_format=False,
**kwargs):
""" Optimize ONNX model with an option to convert it to use mixed precision.
"""
from optimizer import optimize_model
from fusion_options import FusionOptions
optimization_options = FusionOptions('gpt2')
#optimization_options.enable_gelu = False
#optimization_options.enable_layer_norm = False
#optimization_options.enable_attention = False
m = optimize_model(onnx_model_path,
model_type='gpt2',
num_heads=num_attention_heads,
hidden_size=hidden_size,
opt_level=0,
optimization_options=None,
optimization_options=optimization_options,
use_gpu=False)
if is_float16:
m.convert_model_float32_to_float16(cast_input_output=False)
op_full_list = set([node.op_type for node in m.nodes()])
op_block_list = set(kwargs["op_block_list"]) if "op_block_list" in kwargs else set()
op_remain_list = op_full_list.difference(op_block_list)
logger.info(f"op_block_list={op_block_list} op_remain_list={op_remain_list}")
m.convert_float_to_float16(use_symbolic_shape_infer=True, **kwargs)
m.save_model_to_file(optimized_model_path, use_external_data_format)
@ -526,6 +588,22 @@ class Gpt2Helper:
return ort_outputs, average_latency
@staticmethod
def save_outputs(i, ort_outputs, torch_outputs):
    """Pickle the ORT and PyTorch outputs of test case ``i`` into the current directory."""
    ort_path = f'ort_outputs_{i}.pickle'
    with open(ort_path, 'wb') as ort_file:
        pickle.dump(ort_outputs, ort_file)
    logger.info(f"ORT output are saved to {ort_path}")

    torch_path = f'torch_outputs_{i}.pickle'
    with open(torch_path, 'wb') as torch_file:
        pickle.dump(torch_outputs, torch_file)
    logger.info(f"Torch output are saved to {torch_path}")
@staticmethod
def save_inputs(i, dummy_inputs, ort_outputs=None, torch_outputs=None):
    """Pickle the inputs of test case ``i`` to ``dummy_inputs_{i}.pickle`` in the current directory.

    ``ort_outputs`` and ``torch_outputs`` are not used. They are optional so that
    the two-argument call made by the parity test works, while callers that pass
    all four arguments keep working.
    """
    with open(f'dummy_inputs_{i}.pickle', 'wb') as f:
        pickle.dump(dummy_inputs, f)
    logger.info(f"inputs are saved to dummy_inputs_{i}.pickle")
@staticmethod
def test_parity(ort_session,
model,
@ -537,14 +615,16 @@ class Gpt2Helper:
use_io_binding=True,
model_class="GPT2LMHeadModel",
has_position_ids=True,
has_attention_mask=True):
has_attention_mask=True,
verbose=False,
enable_pickle_output=False):
""" Generate random inputs and compare the results of PyTorch and Onnx Runtime.
"""
config: GPT2Config = model.config
logger.info(
f"Running parity test (rtol={rtol}, atol={atol}, test_cases={total_test_cases}, use_io_binding={use_io_binding} model_class={model_class} is_float16={is_float16}) ..."
f"Running parity test (atol={atol}, test_cases={total_test_cases}, use_io_binding={use_io_binding}, model_class={model_class}, is_float16={is_float16}) ..."
)
max_batch_size = 8
@ -558,7 +638,10 @@ class Gpt2Helper:
output_buffers = Gpt2Helper.get_output_buffers(max_output_shapes, device, is_float16)
passed_test_cases = 0
for _ in range(total_test_cases):
top1_matched_cases = 0
max_abs_diff_list = []
for i in range(total_test_cases):
sequence_length = random.randint(1, max_seq_len)
past_sequence_length = random.randint(0, max_past_seq_len)
batch_size = random.randint(1, max_batch_size)
@ -569,7 +652,6 @@ class Gpt2Helper:
config.num_attention_heads, config.hidden_size, config.n_layer,
config.vocab_size, device, is_float16, has_position_ids,
has_attention_mask)
outputs = Gpt2Helper.pytorch_inference(model, dummy_inputs)
if use_io_binding:
ort_outputs = Gpt2Helper.onnxruntime_inference(ort_session, dummy_inputs)
@ -579,13 +661,84 @@ class Gpt2Helper:
ort_outputs = Gpt2Helper.onnxruntime_inference_with_binded_io(ort_session, dummy_inputs, output_buffers,
output_shapes)
is_all_close = Gpt2Helper.compare_outputs(outputs, ort_outputs, rtol=rtol, atol=atol)
is_all_close, max_abs_diff, max_diff_output_index, messages, is_top1_matched = Gpt2Helper.compare_outputs_v2(
outputs, ort_outputs, atol=atol)
if not numpy.isnan(max_abs_diff):
max_abs_diff_list.append(max_abs_diff)
if is_all_close:
passed_test_cases += 1
logger.info(f"Parity Test Cases={total_test_cases}; Passed={passed_test_cases}")
if is_top1_matched:
top1_matched_cases += 1
if verbose and not is_all_close:
logger.info(
f"test_case={i} batch_size={batch_size} past_sequence_length={past_sequence_length} sequence_length={sequence_length} MaxDiff={max_abs_diff}"
)
for i, message in enumerate(messages):
logger.info(f"\t{i}: Name={ort_session.get_outputs()[i].name}, {message}")
# Collect data for debugging
if enable_pickle_output and (numpy.isnan(max_abs_diff) or max_abs_diff > 100 * atol):
Gpt2Helper.save_inputs(i, dummy_inputs)
Gpt2Helper.save_outputs(i, ort_outputs, outputs)
if max_abs_diff_list:
result = {
f"max_diff_percentile_{p}": "{:.5f}".format(numpy.percentile(max_abs_diff_list, p))
for p in [50, 90, 95, 99]
}
else:
result = {f"max_diff_percentile_{p}": "nan" for p in [50, 90, 95, 99]}
result["top1_match_rate"] = top1_matched_cases * 1.0 / total_test_cases
result["diff_pass_rate"] = passed_test_cases * 1.0 / total_test_cases
result["nan_rate"] = (total_test_cases - len(max_abs_diff_list)) * 1.0 / total_test_cases
logger.info(
f"Parity Test Cases={total_test_cases}; Passed={passed_test_cases}; Nan={total_test_cases-len(max_abs_diff_list)}; Top1_Matched={top1_matched_cases}"
)
if passed_test_cases > 0.95 * total_test_cases:
logger.info(f"Parity is good: passed rate={int(passed_test_cases*100/total_test_cases):.0f}%")
return passed_test_cases == total_test_cases
return result
@staticmethod
def test_performance(ort_session,
                     model,
                     device,
                     is_float16=False,
                     total_runs=100,
                     use_io_binding=True,
                     model_class="GPT2LMHeadModel",
                     has_position_ids=True,
                     has_attention_mask=True,
                     batch_size=8,
                     sequence_length=1,
                     past_sequence_length=32):
    """ Generate random inputs and measure average latency of Onnx Runtime.

    When use_io_binding is True, output buffers are allocated up front and the
    session runs through onnxruntime_inference_with_binded_io; otherwise it runs
    through onnxruntime_inference. Returns the latency value reported by the
    chosen inference helper over total_runs runs.
    """

    config: GPT2Config = model.config

    output_shapes = None
    output_buffers = None
    if use_io_binding:
        output_shapes = Gpt2Helper.get_output_shapes(batch_size, past_sequence_length, sequence_length, config,
                                                     model_class)
        output_buffers = Gpt2Helper.get_output_buffers(output_shapes, device, is_float16)

    dummy_inputs = Gpt2Helper.get_dummy_inputs(batch_size, past_sequence_length, sequence_length,
                                               config.num_attention_heads, config.hidden_size, config.n_layer,
                                               config.vocab_size, device, is_float16, has_position_ids,
                                               has_attention_mask)

    # The branches used to be swapped: IO binding was skipped when requested, and
    # the non-binding case reached the binding path with no shapes or buffers.
    if use_io_binding:
        _, latency = Gpt2Helper.onnxruntime_inference_with_binded_io(ort_session, dummy_inputs, output_buffers,
                                                                     output_shapes, total_runs)
    else:
        _, latency = Gpt2Helper.onnxruntime_inference(ort_session, dummy_inputs, total_runs)

    return latency
@staticmethod
def torchscript(model, config, device, has_position_ids=True, has_attention_mask=True):

View file

@ -0,0 +1,177 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
from convert_to_onnx import main
import os
import argparse
import logging
from gpt2_helper import PRETRAINED_GPT2_MODELS
from benchmark_helper import setup_logger
logger = logging.getLogger('')
def parse_arguments(argv=None):
    """Parse the command line options of the GPT-2 parity tool.

    ``argv`` is the list of argument strings; None lets argparse read sys.argv.
    Returns the argparse namespace.
    """
    parser = argparse.ArgumentParser()

    model_help = 'Model path, or pretrained model name in the list: ' + ', '.join(PRETRAINED_GPT2_MODELS)
    parser.add_argument('-m', '--model_name_or_path', required=True, type=str, help=model_help)

    parser.add_argument('--csv',
                        type=str,
                        default='gpt2_parity_results.csv',
                        help='path of csv file to save the result')

    parser.add_argument('--runs',
                        type=int,
                        default=5,
                        help="number of repeated runs to get median value of each metric")

    # Optional arguments are not required by default and store_true flags default
    # to False, so neither needs to be spelled out.
    parser.add_argument('--use_gpu', action='store_true', help="use GPU for inference")
    parser.add_argument('--all', action='store_true', help="run all combinations of mixed precision")
    parser.add_argument('-e', '--use_external_data_format', action='store_true')
    parser.add_argument('--verbose', action='store_true')

    return parser.parse_args(argv)
class ParityTask:
    """Repeat one conversion/parity experiment and append the per-metric median to a CSV file."""

    def __init__(self, total_runs, csv_path):
        """
        Args:
            total_runs (int): how many times each experiment is repeated.
            csv_path (str): CSV file that receives the results. The same path is
                also handed to ``main`` for every single run.
        """
        self.total_runs = total_runs
        self.csv_path = csv_path
        self.latency_name = "average_latency(batch_size=8,sequence_length=1,past_sequence_length=32)"
        self.metric_names = [
            self.latency_name, "diff_50_percentile", "diff_90_percentile", "diff_95_percentile", "diff_99_percentile",
            "diff_pass_rate", "nan_rate", "top1_match_rate", "onnx_size_in_MB"
        ]

    def run(self, argv, name):
        """Run ``main(argv, ...)`` ``total_runs`` times and save the median of each metric.

        A run that raises is logged (with traceback) and skipped. KeyboardInterrupt and
        SystemExit are deliberately not caught, so the tool can still be interrupted.
        Nothing is saved when no run produced a result.

        Args:
            argv (list of str): command line arguments passed to ``main``.
            name (str): experiment name recorded with the results.
        """
        results = []
        for run_id in range(self.total_runs):
            try:
                result = main(argv, experiment_name=name, run_id=run_id, csv_filename=self.csv_path)
            except Exception:
                logger.exception(f"Failed to run experiment {name}")
                continue

            if result:
                results.append(result)

        if not results:
            return

        self.save_result(self._median_result(results))

    def _median_result(self, results):
        """Return a copy of the first result where every metric holds its median over all results.

        Only results that contain a metric contribute to that metric's median. A metric that
        no result contains is left out rather than raising ``statistics.StatisticsError``.
        The input results are not modified.

        Args:
            results (list): non-empty list of per-run results (presumably dicts returned by ``main``).
        """
        import statistics

        median_result = dict(results[0])
        for metric in self.metric_names:
            values = [result[metric] for result in results if metric in result]
            if values:
                median_result[metric] = statistics.median(values)
        return median_result

    def save_result(self, result):
        """Append ``result`` as one row (with run_id "median") to the CSV file.

        The header is written only when the file did not exist beforehand. Columns
        that are missing from ``result`` are written as empty cells.

        Args:
            result: mapping from column name to value.
        """
        import csv

        csv_filename = self.csv_path
        column_names = [
            "experiment", "run_id", "model_name", "model_class", "gpu", "precision", "optimizer", "test_cases",
            "keep_io_types", "io_block_list", "op_block_list", "node_block_list", "ORT_TRANSFORMER_OPTIONS",
            "ORT_CUDA_GEMM_OPTIONS", "onnxruntime"
        ] + self.metric_names

        # Must be checked before open(): mode "a" creates the file.
        csv_file_existed = os.path.exists(csv_filename)
        with open(csv_filename, mode="a", newline='') as csv_file:
            csv_writer = csv.DictWriter(csv_file, fieldnames=column_names, restval="")
            if not csv_file_existed:
                csv_writer.writeheader()

            row = {column: result[column] for column in column_names if column in result}
            row["run_id"] = "median"
            csv_writer.writerow(row)

        logger.info(f"result saved to {csv_filename}: {row}")
def run_parity(args):
    """Run the parity experiments selected by the command line options.

    The fp32 baseline always runs. The fp16 baseline runs only with --use_gpu, and
    the remaining mixed precision combinations only when --all is given as well.

    Args:
        args (argparse.Namespace): options as returned by parse_arguments.
    """
    task = ParityTask(args.runs, args.csv)

    model = args.model_name_or_path

    # The argument lists are built directly instead of splitting a formatted string,
    # so that a model path containing whitespace stays one argument.
    fp32_baseline = ["-m", model, "-o", "-p", "fp32"]
    if args.use_gpu:
        fp32_baseline.append("--use_gpu")
    if args.use_external_data_format:
        fp32_baseline.append("--use_external_data_format")
    task.run(fp32_baseline, "fp32 baseline")

    # The following tests for fp16 require GPU
    if not args.use_gpu:
        logger.info("skip mixed precision since --use_gpu is not specified")
        return

    baseline = ["-m", model, "-o", "--use_gpu", "-p", "fp16"]
    if args.use_external_data_format:
        baseline.append("--use_external_data_format")
    task.run(baseline, "fp16 baseline")

    if not args.all:
        logger.info("skip remaining combinations since --all is not specified")
        return

    fp32_logits = ["--io_block_list", "logits"]
    task.run(baseline + fp32_logits, "fp16 except logits")

    fp32_io = ["--keep_io_types"]
    task.run(baseline + fp32_io, "Graph I/O FP32, Other FP16")

    op_list = ["Attention", "Gather", "Add", "LayerNormalization", "FastGelu", "MatMul"]
    # Experiment names (including their spelling) are kept unchanged so that new rows
    # stay comparable with rows already written by earlier runs.
    task.run(baseline + fp32_io + ["--op_block_list"] + op_list, "Everthing in FP32")

    # Everything blocked (fp32) except a single operator type.
    for op in op_list:
        op_block_list = ["--op_block_list"] + [o for o in op_list if o != op]
        task.run(baseline + fp32_io + op_block_list, f"FP32 except {op} in fp16")

    # Only a single operator type blocked (fp32).
    for op in op_list:
        op_block_list = ["--op_block_list", op]
        task.run(baseline + op_block_list, f"FP16 except {op} in fp32")

    op_block_list = ["--op_block_list", "LayerNormalization", "FastGelu"]
    task.run(baseline + op_block_list, "FP16 except LayerNormalization and FastGelu in fp32")
    task.run(baseline + op_block_list + fp32_logits, "FP16 except logits, LayerNormalization and FastGelu in fp32")
# Script entry point: parse the options, configure logging, then run the experiments.
if __name__ == "__main__":
    args = parse_arguments()
    setup_logger(verbose=args.verbose) if False else setup_logger(args.verbose)
    run_parity(args)

View file

@ -18,6 +18,9 @@ logger = logging.getLogger(__name__)
class OnnxModel:
def __init__(self, model):
self.initialize(model)
def initialize(self, model):
self.model = model
self._node_name_suffix: Dict[str, int] = {} # key is node name prefix, value is the last suffix generated
self.shape_infer_helper = None
@ -495,31 +498,18 @@ class OnnxModel:
initializer=graph.initializer,
value_info=graph.value_info)
self.model = helper.make_model(graph_def, producer_name='onnxruntime-tools')
self.model = helper.make_model(graph_def, producer_name='onnxruntime')
# restore opset version
self.model.opset_import[0].version = original_opset_version
def convert_model_float32_to_float16(self, cast_input_output=True, use_symbolic_shape_infer=True):
"""Convert a graph to FLOAT16. By default, we will keep data types of inputs and outputs.
For decoder model with past_key_values, it is recommended to set cast_input_output=False for better performance.
Args:
cast_input_output (bool, optional): keep data type of inputs and outputs, and add Cast nodes to convert float32 inputs to float16, and float16 to float32 for outputs. Defaults to True.
use_symbolic_shape_infer (bool, optional): use symbolic shape inference instead of onnx shape inference.
"""
from packaging.version import Version
import onnxconverter_common as oc
if Version(oc.__version__) > Version("1.7.0"):
model = self.model
if use_symbolic_shape_infer:
# Use symbolic shape inference since custom operators (like Gelu, SkipLayerNormalization etc) are not recognized by onnx shape inference.
shape_infer_helper = SymbolicShapeInferenceHelper(model)
model = shape_infer_helper.infer_shapes(model, auto_merge=True, guess_output_rank=False)
self.model = oc.float16.convert_float_to_float16(model,
keep_io_types=cast_input_output,
disable_shape_infer=use_symbolic_shape_infer)
return
def _naive_float_to_float16(self, keep_io_types=True):
"""Convert model from single precision to half precision naively.
It might generate invalid model or cause precision loss.
Args:
cast_input_output (bool, optional): [description]. Defaults to True.
"""
graph = self.model.graph
initializers = graph.initializer
@ -540,7 +530,7 @@ class OnnxModel:
if att.name == 'to' and att.i == 1:
att.CopyFrom(helper.make_attribute("to", int(TensorProto.FLOAT16)))
if not cast_input_output:
if not keep_io_types:
self.change_input_output_float32_to_float16()
return
@ -570,6 +560,107 @@ class OnnxModel:
cast_node.attribute.extend([helper.make_attribute("to", int(TensorProto.FLOAT))])
self.add_node(cast_node)
def get_dtype(self, input_or_output: str):
    """Look up the element type recorded for a tensor name.

    The name is searched in the graph's value_info first, then in graph inputs,
    then in graph outputs.

    Returns:
        The tensor_type.elem_type of the first source that knows the name,
        or None when none of them does.
    """
    # When value_info lists a name more than once, the last entry wins.
    matched_type = None
    for value_info in self.model.graph.value_info:
        if value_info.name == input_or_output:
            matched_type = value_info.type
    if matched_type is not None:
        return matched_type.tensor_type.elem_type

    for find in (self.find_graph_input, self.find_graph_output):
        found = find(input_or_output)
        if found:
            return found.type.tensor_type.elem_type

    return None
def convert_model_float32_to_float16(self, cast_input_output=True, **kwargs):
    """Deprecated: use convert_float_to_float16 instead.

    Logs a deprecation warning, then performs the naive float16 conversion.

    Args:
        cast_input_output (bool, optional): passed on as keep_io_types of the
            naive conversion. Defaults to True.
        **kwargs: accepted for backward compatibility and ignored.
    """
    # Logger.warn is a deprecated alias that newer Python versions no longer provide.
    logger.warning(
        'The function convert_model_float32_to_float16 is deprecated. Use convert_float_to_float16 instead!')
    self._naive_float_to_float16(keep_io_types=cast_input_output)
def convert_float_to_float16(self, use_symbolic_shape_infer=True, **kwargs):
    """Convert the graph to FLOAT16 and re-initialize this object with the converted model.

    By default, data types of graph inputs and outputs are kept. For a decoder model with
    past_key_values, it is recommended to set keep_io_types=False for better performance.

    After conversion, redundant Cast nodes left behind by the converter are removed.

    Args:
        use_symbolic_shape_infer (bool, optional): run symbolic shape inference before the
            conversion and disable shape inference inside the converter. Defaults to True.
        **kwargs: parameters for float16 conversion. Only keep_io_types (True when not given),
            min_positive_val, max_finite_val, op_block_list and node_block_list are forwarded
            to the converter; other keys are ignored.
    """
    if "keep_io_types" not in kwargs:
        kwargs["keep_io_types"] = True

    # TODO: import from onnxconverter_common (version > 1.9.0) when it is stable.
    from float16 import convert_float_to_float16

    model = self.model
    if use_symbolic_shape_infer:
        # Use symbolic shape inference since custom operators (like Gelu, SkipLayerNormalization etc)
        # are not recognized by onnx shape inference.
        shape_infer_helper = SymbolicShapeInferenceHelper(model)
        model = shape_infer_helper.infer_shapes(model, auto_merge=True, guess_output_rank=False)

    forwarded_keys = ('keep_io_types', 'min_positive_val', 'max_finite_val', 'op_block_list', 'node_block_list')
    parameters = {'disable_shape_infer': use_symbolic_shape_infer}
    parameters.update({key: kwargs[key] for key in forwarded_keys if key in kwargs})

    fp16_model = convert_float_to_float16(model, **parameters)
    self.initialize(fp16_model)

    def get_node_attribute(node, attribute_name: str):
        """Return the value of the named attribute of a node, or None when it has no such attribute."""
        for attr in node.attribute:
            if attr.name == attribute_name:
                return helper.get_attribute_value(attr)
        return None

    # convert_float_to_float16 might add Cast(to=10) --> Cast(to=1) when two consecutive nodes are
    # computed in FP32. Below is post-processing that removes those Cast nodes.

    # Pass 1: remove the first Cast node in a path like --> Cast --> Cast -->
    cascaded_casts = []
    for node in self.nodes():
        if node.op_type != "Cast":
            continue
        parent = self.get_parent(node, 0)
        if not parent or parent.op_type != "Cast":
            continue
        # The parent cannot be removed if its output is used by multiple nodes
        # or is exposed as a graph output.
        if len(self.get_children(parent)) != 1 or self.find_graph_output(parent.output[0]):
            continue
        self.replace_input_of_all_nodes(parent.output[0], parent.input[0])
        cascaded_casts.append(parent)

    # Removed before the second pass so that no node can be scheduled for removal twice.
    self.remove_nodes(cascaded_casts)

    # Pass 2: remove Cast nodes that cast a float tensor to float.
    useless_casts = []
    float_type = int(TensorProto.FLOAT)
    for node in self.nodes():
        if node.op_type != "Cast" or get_node_attribute(node, "to") != float_type:
            continue
        if self.get_dtype(node.input[0]) != float_type:
            continue
        if self.find_graph_output(node.output[0]):
            # Keep the graph output name: let the producer write to it directly.
            self.replace_output_of_all_nodes(node.input[0], node.output[0])
        else:
            self.replace_input_of_all_nodes(node.output[0], node.input[0])
        useless_casts.append(node)

    self.remove_nodes(useless_casts)

    removed_count = len(cascaded_casts) + len(useless_casts)
    if removed_count:
        self.prune_graph()
        logger.info(f"removed {removed_count} Cast nodes from float16 model")
def create_node_name(self, op_type, name_prefix=None):
"""Create a unique node name that starts with a prefix (default is operator type).
The name will not be duplicated with any name that generated or existed in current graphs.

View file

@ -0,0 +1,11 @@
onnx >= 1.8
numpy
coloredlogs
psutil
py-cpuinfo
py3nvml
packaging
transformers >= 4.0
# please follow https://pytorch.org/ to install PyTorch for your OS
torch >= 1.8

View file

@ -1,14 +0,0 @@
onnx
numpy
coloredlogs
psutil
py-cpuinfo
py3nvml
packaging
transformers
onnxruntime
onnxconverter_common
--find-links https://download.pytorch.org/whl/torch_stable.html
torch==1.7.1+cpu
torchvision==0.8.2+cpu
torchaudio===0.7.2

View file

@ -1,14 +0,0 @@
onnx
numpy
coloredlogs
psutil
py-cpuinfo
py3nvml
packaging
transformers
onnxruntime-gpu
onnxconverter_common
--find-links https://download.pytorch.org/whl/torch_stable.html
torch===1.7.1
torchvision===0.8.2
torchaudio===0.7.2