mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-05-27 22:45:57 +00:00
Add gpt2 mixed precision conversion and parity tools (#8845)
This commit is contained in:
parent
e8564d6597
commit
cb59f46e04
8 changed files with 937 additions and 80 deletions
|
|
@ -34,7 +34,7 @@ from benchmark_helper import create_onnxruntime_session, setup_logger, prepare_e
|
|||
logger = logging.getLogger('')
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
def parse_arguments(argv=None):
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument('-m',
|
||||
|
|
@ -94,6 +94,13 @@ def parse_arguments():
|
|||
choices=list(Precision),
|
||||
help="Precision of model to run. fp32 for full precision, fp16 for half precision, and int8 for quantization")
|
||||
|
||||
parser.add_argument("-t",
|
||||
"--test_cases",
|
||||
required=False,
|
||||
type=int,
|
||||
default=1000,
|
||||
help="Number of test cases for parity")
|
||||
|
||||
parser.add_argument('--verbose', required=False, action='store_true')
|
||||
parser.set_defaults(verbose=False)
|
||||
|
||||
|
|
@ -135,20 +142,48 @@ def parse_arguments():
|
|||
help='Nuclear/top-p sampling accumulation probability.')
|
||||
sampling_option_group.add_argument('--do_sample_top_k', type=int, default=0, help='Use top-k if non-zero.')
|
||||
|
||||
args = parser.parse_args()
|
||||
fp16_option_group = parser.add_argument_group(
|
||||
"float to float16 conversion parameters that works when \"--precision fp16\" is specified")
|
||||
fp16_option_group.add_argument('--keep_io_types',
|
||||
required=False,
|
||||
action='store_true',
|
||||
help='Use float32 for past inputs, present and logits outputs.')
|
||||
fp16_option_group.set_defaults(keep_io_types=False)
|
||||
fp16_option_group.add_argument('--io_block_list',
|
||||
nargs='+',
|
||||
default=[],
|
||||
help='List of inputs or outputs in float32 instead of float16')
|
||||
fp16_option_group.add_argument(
|
||||
'--op_block_list',
|
||||
nargs='+',
|
||||
default=[],
|
||||
help=
|
||||
'List of operators (like Attention Gather Add LayerNormalization FastGelu MatMul) to compute in float32 instead of float16.'
|
||||
)
|
||||
fp16_option_group.add_argument('--node_block_list',
|
||||
nargs='+',
|
||||
default=[],
|
||||
help='List of node names to compute in float32 instead of float16.')
|
||||
|
||||
args = parser.parse_args(argv)
|
||||
|
||||
return args
|
||||
|
||||
|
||||
def main():
|
||||
def main(argv=None, experiment_name="", run_id=0, csv_filename="gpt2_parity_results.csv"):
|
||||
result = {}
|
||||
from transformers import __version__ as transformers_version
|
||||
if version.parse(transformers_version) < version.parse(
|
||||
"3.1.0"): # past_key_values name does not exist in 3.0.2 or older
|
||||
raise RuntimeError("This tool requires transformers 3.1.0 or later.")
|
||||
|
||||
args = parse_arguments()
|
||||
args = parse_arguments(argv)
|
||||
setup_logger(args.verbose)
|
||||
|
||||
if not experiment_name:
|
||||
import sys
|
||||
experiment_name = " ".join(argv if argv else sys.argv[1:])
|
||||
|
||||
if args.tolerance == 0:
|
||||
args.tolerance = DEFAULT_TOLERANCE[args.precision]
|
||||
|
||||
|
|
@ -219,6 +254,7 @@ def main():
|
|||
|
||||
logger.info(f"Exporting ONNX model to {raw_onnx_model}")
|
||||
use_padding = MODEL_CLASSES[args.model_class][2]
|
||||
|
||||
gpt2helper.export_onnx(model,
|
||||
device,
|
||||
raw_onnx_model,
|
||||
|
|
@ -227,13 +263,23 @@ def main():
|
|||
has_position_ids=use_padding,
|
||||
has_attention_mask=use_padding)
|
||||
|
||||
fp16_params = {"keep_io_types": args.keep_io_types}
|
||||
if args.io_block_list:
|
||||
fp16_params["keep_io_types"] = args.io_block_list
|
||||
if args.node_block_list:
|
||||
fp16_params["node_block_list"] = args.node_block_list
|
||||
if args.op_block_list:
|
||||
fp16_params["op_block_list"] = args.op_block_list
|
||||
|
||||
is_io_float16 = (args.precision == Precision.FLOAT16 and not args.keep_io_types)
|
||||
|
||||
if args.optimize_onnx or args.precision != Precision.FLOAT32:
|
||||
output_path = onnx_model_paths[str(args.precision) if args.precision != Precision.INT8 else 'fp32']
|
||||
|
||||
logger.info(f"Optimizing model to {output_path}")
|
||||
gpt2helper.optimize_onnx(raw_onnx_model, output_path, args.precision == Precision.FLOAT16,
|
||||
model.config.num_attention_heads, model.config.hidden_size,
|
||||
args.use_external_data_format)
|
||||
args.use_external_data_format, **fp16_params)
|
||||
else:
|
||||
output_path = raw_onnx_model
|
||||
|
||||
|
|
@ -252,16 +298,80 @@ def main():
|
|||
logger.info(f"Output path: {output_path}")
|
||||
|
||||
session = create_onnxruntime_session(output_path, args.use_gpu, enable_all_optimization=True, verbose=args.verbose)
|
||||
if session is not None:
|
||||
gpt2helper.test_parity(session,
|
||||
model,
|
||||
device,
|
||||
args.precision == Precision.FLOAT16,
|
||||
rtol=args.tolerance,
|
||||
atol=args.tolerance,
|
||||
model_class=args.model_class,
|
||||
has_position_ids=use_padding,
|
||||
has_attention_mask=use_padding)
|
||||
if args.model_class == "GPT2LMHeadModel" and session is not None:
|
||||
parity_result = gpt2helper.test_parity(session,
|
||||
model,
|
||||
device,
|
||||
is_io_float16,
|
||||
rtol=args.tolerance,
|
||||
atol=args.tolerance,
|
||||
model_class=args.model_class,
|
||||
has_position_ids=use_padding,
|
||||
has_attention_mask=use_padding,
|
||||
total_test_cases=args.test_cases,
|
||||
verbose=args.verbose)
|
||||
|
||||
latency = gpt2helper.test_performance(session,
|
||||
model,
|
||||
device,
|
||||
is_io_float16,
|
||||
total_runs=100,
|
||||
use_io_binding=True,
|
||||
model_class=args.model_class,
|
||||
has_position_ids=use_padding,
|
||||
has_attention_mask=use_padding,
|
||||
batch_size=8,
|
||||
sequence_length=1,
|
||||
past_sequence_length=32)
|
||||
|
||||
if args.precision == Precision.FLOAT16:
|
||||
logger.info(f"fp16 conversion parameters:{fp16_params}")
|
||||
|
||||
# Write results to file
|
||||
import csv
|
||||
from onnxruntime import __version__ as ort_version
|
||||
latency_name = "average_latency(batch_size=8,sequence_length=1,past_sequence_length=32)"
|
||||
csv_file_existed = os.path.exists(csv_filename)
|
||||
with open(csv_filename, mode="a", newline='') as csv_file:
|
||||
column_names = [
|
||||
"experiment", "run_id", "model_name", "model_class", "gpu", "precision", "optimizer", "test_cases",
|
||||
"keep_io_types", "io_block_list", "op_block_list", "node_block_list", "ORT_TRANSFORMER_OPTIONS",
|
||||
"ORT_CUDA_GEMM_OPTIONS", "onnxruntime", latency_name, "diff_50_percentile", "diff_90_percentile",
|
||||
"diff_95_percentile", "diff_99_percentile", "diff_pass_rate", "nan_rate", "top1_match_rate",
|
||||
"onnx_size_in_MB"
|
||||
]
|
||||
csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
|
||||
if not csv_file_existed:
|
||||
csv_writer.writeheader()
|
||||
row = {
|
||||
"experiment": experiment_name,
|
||||
"run_id": run_id,
|
||||
"model_name": args.model_name_or_path,
|
||||
"model_class": args.model_class,
|
||||
"gpu": args.use_gpu,
|
||||
"precision": args.precision,
|
||||
"optimizer": args.optimize_onnx,
|
||||
"test_cases": args.test_cases,
|
||||
"keep_io_types": args.keep_io_types,
|
||||
"io_block_list": args.io_block_list,
|
||||
"op_block_list": args.op_block_list,
|
||||
"node_block_list": args.node_block_list,
|
||||
"ORT_TRANSFORMER_OPTIONS": os.getenv('ORT_TRANSFORMER_OPTIONS'),
|
||||
"ORT_CUDA_GEMM_OPTIONS": os.getenv('ORT_CUDA_GEMM_OPTIONS'),
|
||||
"onnxruntime": ort_version,
|
||||
latency_name: f"{latency:.2f}",
|
||||
"diff_50_percentile": parity_result["max_diff_percentile_50"],
|
||||
"diff_90_percentile": parity_result["max_diff_percentile_90"],
|
||||
"diff_95_percentile": parity_result["max_diff_percentile_95"],
|
||||
"diff_99_percentile": parity_result["max_diff_percentile_99"],
|
||||
"diff_pass_rate": parity_result["diff_pass_rate"],
|
||||
"nan_rate": parity_result["nan_rate"],
|
||||
"top1_match_rate": parity_result["top1_match_rate"],
|
||||
"onnx_size_in_MB": "{}".format(int(os.path.getsize(output_path) / 1024 / 1024))
|
||||
}
|
||||
logger.info(f"result: {row}")
|
||||
result.update(row)
|
||||
csv_writer.writerow(row)
|
||||
|
||||
if args.input_test_file:
|
||||
test_inputs = []
|
||||
|
|
@ -275,14 +385,12 @@ def main():
|
|||
|
||||
if use_padding:
|
||||
if "attention_mask" in data:
|
||||
numpy_float = numpy.float16 if args.precision == Precision.FLOAT16 else numpy.float32
|
||||
numpy_float = numpy.float16 if is_io_float16 else numpy.float32
|
||||
attention_mask = torch.from_numpy(numpy.asarray(data["attention_mask"],
|
||||
dtype=numpy_float)).to(device)
|
||||
else:
|
||||
padding = -1
|
||||
attention_mask = (
|
||||
input_ids !=
|
||||
padding).type(torch.float16 if args.precision == Precision.FLOAT16 else torch.float32)
|
||||
attention_mask = (input_ids != padding).type(torch.float16 if is_io_float16 else torch.float32)
|
||||
input_ids.masked_fill_(input_ids == padding, 0)
|
||||
|
||||
if "position_ids" in data:
|
||||
|
|
@ -324,6 +432,7 @@ def main():
|
|||
save_test_data_dir=Path(output_path).parent)
|
||||
|
||||
logger.info(f"Done. Output model: {output_path}")
|
||||
return result
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
|||
344
onnxruntime/python/tools/transformers/float16.py
Normal file
344
onnxruntime/python/tools/transformers/float16.py
Normal file
|
|
@ -0,0 +1,344 @@
|
|||
#-------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License.
|
||||
#--------------------------------------------------------------------------
|
||||
|
||||
# This file is modified from https://github.com/microsoft/onnxconverter-common/blob/master/onnxconverter_common/float16.py
|
||||
|
||||
import itertools
|
||||
import numpy as np
|
||||
import onnx
|
||||
from onnx import helper, numpy_helper
|
||||
from onnx import onnx_pb as onnx_proto
|
||||
|
||||
|
||||
def _npfloat16_to_int(np_list):
    '''
    Convert numpy float16 values to python ints holding their raw 16-bit patterns.

    :param np_list: 1-D numpy float16 array (or a sequence of numpy float16 values)
    :return int_list: python int list, one unsigned 16-bit value per input element
    '''
    # Reinterpret the float16 storage as uint16 in a single vectorised step instead of
    # formatting every element as a binary string and parsing it back.
    return np.asarray(np_list, dtype=np.float16).view(np.uint16).tolist()
|
||||
|
||||
|
||||
def convert_np_to_float16(np_array, min_positive_val=1e-7, max_finite_val=1e4):
    '''
    Convert float32 numpy array to float16 without changing sign or finiteness.
    Positive values less than min_positive_val are mapped to min_positive_val.
    Positive finite values greater than max_finite_val are mapped to max_finite_val.
    Similar for negative values. NaN, 0, inf, and -inf are unchanged.

    :param np_array: numpy array (or array-like such as a list) of float values
    :param min_positive_val: magnitude that non-zero values closer to zero are clamped to
    :param max_finite_val: magnitude that larger finite values are clamped to
    :return: numpy float16 array with the same shape as the input
    '''
    def between(a, b, c):
        # Strict on both sides, so 0, +/-inf and NaN (all comparisons False) never match.
        return np.logical_and(a < b, b < c)

    # Accept plain python sequences as well: comparing a scalar with a list would raise TypeError.
    np_array = np.asarray(np_array)
    np_array = np.where(between(0, np_array, min_positive_val), min_positive_val, np_array)
    np_array = np.where(between(-min_positive_val, np_array, 0), -min_positive_val, np_array)
    np_array = np.where(between(max_finite_val, np_array, float('inf')), max_finite_val, np_array)
    np_array = np.where(between(float('-inf'), np_array, -max_finite_val), -max_finite_val, np_array)
    return np.float16(np_array)
|
||||
|
||||
|
||||
def convert_tensor_float_to_float16(tensor, min_positive_val=1e-7, max_finite_val=1e4):
    '''
    Convert tensor float to float16. The tensor is modified in place and also returned.
    Tensors whose data_type is not FLOAT are returned untouched.

    :param tensor: TensorProto object
    :param min_positive_val: forwarded to convert_np_to_float16 (clamp for tiny magnitudes)
    :param max_finite_val: forwarded to convert_np_to_float16 (clamp for large magnitudes)
    :return tensor_float16: converted TensorProto object
    :raises ValueError: if tensor is not an ONNX TensorProto

    Example:

    ::

        from float16 import convert_tensor_float_to_float16
        new_tensor = convert_tensor_float_to_float16(tensor)

    '''
    if not isinstance(tensor, onnx_proto.TensorProto):
        raise ValueError('Expected input type is an ONNX TensorProto but got %s' % type(tensor))

    if tensor.data_type == onnx_proto.TensorProto.FLOAT:
        tensor.data_type = onnx_proto.TensorProto.FLOAT16
        # convert float_data (float type) to float16 and write to int32_data
        if tensor.float_data:
            float16_data = convert_np_to_float16(np.array(tensor.float_data), min_positive_val, max_finite_val)
            int_list = _npfloat16_to_int(float16_data)
            tensor.int32_data[:] = int_list
            tensor.float_data[:] = []
        # convert raw_data (bytes type)
        if tensor.raw_data:
            # np.frombuffer replaces the deprecated binary mode of np.fromstring. The resulting
            # read-only view is fine because convert_np_to_float16 builds new arrays.
            float32_list = np.frombuffer(tensor.raw_data, dtype='float32')
            # convert float to float16
            float16_list = convert_np_to_float16(float32_list, min_positive_val, max_finite_val)
            # convert float16 to bytes and write back to raw_data
            # (ndarray.tostring() is a deprecated alias of tobytes() and is gone in newer NumPy)
            tensor.raw_data = float16_list.tobytes()
    return tensor
|
||||
|
||||
|
||||
def make_value_info_from_tensor(tensor):
    '''
    Build a ValueInfoProto that describes an initializer tensor.

    :param tensor: TensorProto object
    :return: ValueInfoProto carrying the tensor's name, data_type and shape
    '''
    # The shape is already stored in tensor.dims, so there is no need to decode the whole
    # tensor into a numpy array just to read it.
    shape = tuple(tensor.dims)
    return helper.make_tensor_value_info(tensor.name, tensor.data_type, shape)
|
||||
|
||||
|
||||
# Operator types that convert_float_to_float16 keeps in float32 when the caller does not
# pass its own op_block_list.
DEFAULT_OP_BLOCK_LIST = [
    'ArrayFeatureExtractor', 'Binarizer', 'CastMap', 'CategoryMapper', 'DictVectorizer', 'FeatureVectorizer', 'Imputer',
    'LabelEncoder', 'LinearClassifier', 'LinearRegressor', 'Normalizer', 'OneHotEncoder', 'SVMClassifier',
    'SVMRegressor', 'Scaler', 'TreeEnsembleClassifier', 'TreeEnsembleRegressor', 'ZipMap', 'NonMaxSuppression', 'TopK',
    'RoiAlign', 'Resize', 'Range', 'CumSum', 'Min', 'Max', 'Upsample'
]
|
||||
|
||||
|
||||
def convert_float_to_float16(model,
                             min_positive_val=1e-7,
                             max_finite_val=1e4,
                             keep_io_types=False,
                             disable_shape_infer=False,
                             op_block_list=None,
                             node_block_list=None):
    '''
    Convert tensor float type in the ONNX ModelProto input to tensor float16.

    :param model: ONNX ModelProto object
    :param min_positive_val: clamp applied to tiny non-zero magnitudes of converted tensors
    :param max_finite_val: clamp applied to large finite magnitudes of converted tensors
    :param keep_io_types: False converts float graph inputs/outputs to float16. True keeps every
                          float graph input/output in float32 and inserts Cast nodes at the graph
                          boundary. A list of names keeps only the listed inputs/outputs in float32.
    :param disable_shape_infer: Type/shape information is needed for conversion to work.
                                Set to True only if the model already has type/shape information for all tensors.
    :param op_block_list: operator types to keep in float32. Defaults to DEFAULT_OP_BLOCK_LIST.
    :param node_block_list: names of nodes to keep in float32. Defaults to no node.
    :return: converted ONNX ModelProto object. When shape inference runs, this is the object
             returned by infer_shapes; otherwise it is the model that was passed in.
    :raises ValueError: if model is not an ONNX ModelProto

    Examples:

    ::

        Example 1: Convert ONNX ModelProto object:
        from float16 import convert_float_to_float16
        new_onnx_model = convert_float_to_float16(onnx_model)

        Example 2: Convert ONNX model binary file:
        import onnx
        from float16 import convert_float_to_float16
        onnx_model = onnx.load('model.onnx')
        new_onnx_model = convert_float_to_float16(onnx_model)
        onnx.save(new_onnx_model, 'new_model.onnx')

    '''
    import logging  # local import: only needed for the debug message below

    float_type = onnx_proto.TensorProto.FLOAT  # enum value 1
    float16_type = onnx_proto.TensorProto.FLOAT16  # enum value 10

    func_infer_shape = None
    if not disable_shape_infer:
        # Probe for the feature instead of comparing version strings: the former check
        # onnx.__version__ >= '1.2' is a lexicographic comparison, so '1.10.0' and later
        # compared as older than '1.2' and shape inference was silently skipped.
        try:
            from onnx.shape_inference import infer_shapes
            func_infer_shape = infer_shapes
        except ImportError:
            func_infer_shape = None

    if not isinstance(model, onnx_proto.ModelProto):
        raise ValueError('Expected model type is an ONNX ModelProto but got %s' % type(model))

    # create blocklists
    if op_block_list is None:
        op_block_list = DEFAULT_OP_BLOCK_LIST
    if node_block_list is None:
        node_block_list = []
    op_block_list = set(op_block_list)
    node_block_list = set(node_block_list)

    # create a queue for BFS
    queue = []
    value_info_list = []  # value infos that were switched to float16
    node_list = []  # blocked nodes, handled after the traversal
    # type inference on input model
    if func_infer_shape is not None:
        model = func_infer_shape(model)
    queue.append(model)
    name_mapping = {}  # graph input/output name -> name of the float16 side of its Cast
    graph_io_to_skip = set()  # graph inputs/outputs that must stay float32
    io_casts = set()  # names of the Cast nodes inserted at the graph boundary

    fp32_inputs = [n.name for n in model.graph.input if n.type.tensor_type.elem_type == float_type]
    fp32_outputs = [n.name for n in model.graph.output if n.type.tensor_type.elem_type == float_type]
    if isinstance(keep_io_types, list):
        fp32_inputs = [n for n in fp32_inputs if n in keep_io_types]
        fp32_outputs = [n for n in fp32_outputs if n in keep_io_types]
        logging.getLogger(__name__).debug("keep_io_types %s fp32_inputs %s fp32_outputs %s", keep_io_types,
                                          fp32_inputs, fp32_outputs)
    elif not keep_io_types:
        fp32_inputs = []
        fp32_outputs = []

    for i, n in enumerate(model.graph.input):
        if n.name in fp32_inputs:
            output_name = 'graph_input_cast_' + str(i)
            name_mapping[n.name] = output_name
            graph_io_to_skip.add(n.name)

            node_name = 'graph_input_cast' + str(i)
            new_value_info = model.graph.value_info.add()
            new_value_info.CopyFrom(n)
            new_value_info.name = output_name
            new_value_info.type.tensor_type.elem_type = float16_type
            # add Cast node (from tensor(float) to tensor(float16)) after graph input
            new_node = [helper.make_node('Cast', [n.name], [output_name], to=float16_type, name=node_name)]
            model.graph.node.extend(new_node)
            value_info_list.append(new_value_info)
            io_casts.add(node_name)

    for i, n in enumerate(model.graph.output):
        if n.name in fp32_outputs:
            input_name = 'graph_output_cast_' + str(i)
            name_mapping[n.name] = input_name
            graph_io_to_skip.add(n.name)

            node_name = 'graph_output_cast' + str(i)
            # add Cast node (from tensor(float16) to tensor(float)) before graph output
            new_value_info = model.graph.value_info.add()
            new_value_info.CopyFrom(n)
            new_value_info.name = input_name
            new_value_info.type.tensor_type.elem_type = float16_type
            new_node = [helper.make_node('Cast', [input_name], [n.name], to=float_type, name=node_name)]
            model.graph.node.extend(new_node)
            value_info_list.append(new_value_info)
            io_casts.add(node_name)

    fp32_initializer_counters = {}
    while queue:
        next_level = []
        for q in queue:
            # if q is model, push q.graph (GraphProto)
            if isinstance(q, onnx_proto.ModelProto):
                next_level.append(q.graph)
            # if q is model.graph, push q.node.attribute (AttributeProto)
            if isinstance(q, onnx_proto.GraphProto):
                for n in q.initializer:  # TensorProto type
                    if n.data_type == float_type:
                        # two counters: used by fp16 nodes, used by fp32 nodes
                        fp32_initializer_counters[n.name] = [0, 0]

                for n in q.node:
                    # the boundary Cast nodes inserted above must keep their original names
                    if n.name in io_casts:
                        continue
                    for i in range(len(n.input)):
                        if n.input[i] in name_mapping:
                            n.input[i] = name_mapping[n.input[i]]
                    for i in range(len(n.output)):
                        if n.output[i] in name_mapping:
                            n.output[i] = name_mapping[n.output[i]]

                    is_node_blocked = n.op_type in op_block_list or n.name in node_block_list
                    for input_name in n.input:
                        if input_name in fp32_initializer_counters:
                            # index 0 counts float16 consumers, index 1 counts blocked (float32) consumers
                            fp32_initializer_counters[input_name][int(is_node_blocked)] += 1

                    # if n is in the block list (doesn't support float16), no conversion for the node,
                    # and save the node for further processing
                    if is_node_blocked:
                        node_list.append(n)
                    else:
                        if n.op_type == 'Cast':
                            # an existing Cast to float becomes a Cast to float16
                            for attr in n.attribute:
                                if attr.name == 'to' and attr.i == float_type:
                                    attr.i = float16_type
                                    break
                        for attr in n.attribute:
                            next_level.append(attr)
            # if q is model.graph.node.attribute, push q.g and q.graphs (GraphProto)
            # and process node.attribute.t and node.attribute.tensors (TensorProto)
            if isinstance(q, onnx_proto.AttributeProto):
                next_level.append(q.g)
                for n in q.graphs:
                    next_level.append(n)
                q.t.CopyFrom(convert_tensor_float_to_float16(q.t, min_positive_val, max_finite_val))
                for n in q.tensors:
                    n = convert_tensor_float_to_float16(n, min_positive_val, max_finite_val)
            # if q is graph, process graph.initializer(TensorProto), input, output and value_info (ValueInfoProto)
            if isinstance(q, onnx_proto.GraphProto):
                for n in q.initializer:  # TensorProto type
                    if n.data_type == float_type:
                        # TODO: handle initializer that used by subgraph
                        if fp32_initializer_counters[n.name][1] == 0:  # not used by fp32 node
                            n = convert_tensor_float_to_float16(n, min_positive_val, max_finite_val)
                            value_info_list.append(make_value_info_from_tensor(n))
                        else:
                            # TODO: add a cast node to handle the case that an initializer is used by both fp32 and fp16 nodes
                            assert fp32_initializer_counters[n.name][0] == 0, \
                                'initializer %s is used by both float32 and float16 nodes' % n.name
                # for all ValueInfoProto with tensor(float) type in input, output and value_info, convert them to
                # tensor(float16) except map and seq(map). And save them in value_info_list for further processing
                for n in itertools.chain(q.input, q.output, q.value_info):
                    if n.type.tensor_type.elem_type == float_type:
                        if n.name not in graph_io_to_skip:
                            n.type.tensor_type.elem_type = float16_type
                            value_info_list.append(n)
        queue = next_level

    # Index the converted value infos by name once. setdefault keeps the first entry per name,
    # which is the entry the former linear scan over value_info_list would have matched.
    value_info_by_name = {}
    for value_info in value_info_list:
        value_info_by_name.setdefault(value_info.name, value_info)

    # process the nodes in block list that doesn't support tensor(float16)
    for node in node_list:
        # if input's name is in the value_info_list meaning input is tensor(float16) type,
        # insert a float16 to float Cast node before the node,
        # change current node's input name and create new value_info for the new name
        for i in range(len(node.input)):
            input_name = node.input[i]
            value_info = value_info_by_name.get(input_name)
            if value_info is not None:
                # create new value_info for current node's new input name
                new_value_info = model.graph.value_info.add()
                new_value_info.CopyFrom(value_info)
                output_name = node.name + '_input_cast_' + str(i)
                new_value_info.name = output_name
                new_value_info.type.tensor_type.elem_type = float_type
                # add Cast node (from tensor(float16) to tensor(float)) before current node
                node_name = node.name + '_input_cast' + str(i)
                new_node = [helper.make_node('Cast', [input_name], [output_name], to=float_type, name=node_name)]
                model.graph.node.extend(new_node)
                # change current node's input name
                node.input[i] = output_name
        # if output's name is in the value_info_list meaning output is tensor(float16) type, insert a float to
        # float16 Cast node after the node, change current node's output name and create new value_info for the new name
        for i in range(len(node.output)):
            output = node.output[i]
            value_info = value_info_by_name.get(output)
            if value_info is not None:
                # create new value_info for current node's new output
                new_value_info = model.graph.value_info.add()
                new_value_info.CopyFrom(value_info)
                input_name = node.name + '_output_cast_' + str(i)
                new_value_info.name = input_name
                new_value_info.type.tensor_type.elem_type = float_type
                # add Cast node (from tensor(float) to tensor(float16)) after current node
                node_name = node.name + '_output_cast' + str(i)
                new_node = [helper.make_node('Cast', [input_name], [output], to=float16_type, name=node_name)]
                model.graph.node.extend(new_node)
                # change current node's output name
                node.output[i] = input_name
    return model
|
||||
|
||||
|
||||
def convert_float_to_float16_model_path(model_path, min_positive_val=1e-7, max_finite_val=1e4, keep_io_types=False):
    '''
    Convert tensor float type in the ONNX Model to tensor float16.
    *It is to fix an issue that infer_shapes func cannot be used to infer >2GB models.
    *But this function can be applied to all model sizes.

    :param model_path: ONNX Model path
    :param min_positive_val: forwarded to convert_float_to_float16
    :param max_finite_val: forwarded to convert_float_to_float16
    :param keep_io_types: forwarded to convert_float_to_float16
    :return: converted ONNX ModelProto object

    Examples

    ::

        #Convert to ONNX ModelProto object and save model binary file:
        import onnx
        from float16 import convert_float_to_float16_model_path
        new_onnx_model = convert_float_to_float16_model_path('model.onnx')
        onnx.save(new_onnx_model, 'new_model.onnx')

    '''
    import os
    import tempfile

    # Probe for infer_shapes_path instead of comparing version strings: the former check
    # onnx.__version__ >= '1.8' is lexicographic, so '1.10.0' and later compared as older
    # than '1.8' and the path-based shape inference was never used.
    try:
        # infer_shapes_path can be applied to all model sizes
        from onnx.shape_inference import infer_shapes_path
    except ImportError:
        infer_shapes_path = None

    if infer_shapes_path is not None:
        # shape_infer_model_path should be in the same folder of model_path
        with tempfile.NamedTemporaryFile(dir=os.path.dirname(model_path)) as tmpfile:
            shape_infer_model_path = tmpfile.name
            infer_shapes_path(model_path, shape_infer_model_path)
            # load before leaving the with-block: the temporary file is deleted on exit
            model = onnx.load(shape_infer_model_path)
        # shapes are already inferred, so the in-memory inference must not run again
        disable_shape_infer = True
    else:
        model = onnx.load(model_path)
        disable_shape_infer = False
    return convert_float_to_float16(model, min_positive_val, max_finite_val, keep_io_types, disable_shape_infer)
|
||||
|
|
@ -12,6 +12,7 @@ import random
|
|||
import numpy
|
||||
import time
|
||||
import re
|
||||
import pickle
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Tuple, Union
|
||||
from transformers import GPT2Model, GPT2LMHeadModel, GPT2Config, TFGPT2Model
|
||||
|
|
@ -99,7 +100,9 @@ class MyGPT2LMHeadModel_NoPadding(GPT2LMHeadModel):
|
|||
super().__init__(config)
|
||||
|
||||
def forward(self, input_ids, *past):
|
||||
return super().forward(input_ids, past_key_values=past)
|
||||
result = super().forward(input_ids, past_key_values=past, return_dict=False)
|
||||
|
||||
return MyGPT2Model.post_process(result, self.config.n_layer)
|
||||
|
||||
|
||||
# Maps model class name to a tuple of model class, name of first output and use padding or not
|
||||
|
|
@ -154,7 +157,7 @@ class Gpt2Helper:
|
|||
float_type = torch.float16 if float16 else torch.float32
|
||||
past_shape = [2, batch_size, num_attention_heads, past_sequence_length, int(hidden_size / num_attention_heads)]
|
||||
|
||||
past = [torch.rand(past_shape, dtype=float_type, device=device) for _ in range(num_layer)]
|
||||
past = [(torch.rand(past_shape, dtype=float_type, device=device) * 2.0 - 1.0) for _ in range(num_layer)]
|
||||
input_ids = torch.randint(low=0,
|
||||
high=vocab_size - 1,
|
||||
size=(batch_size, sequence_length),
|
||||
|
|
@ -261,6 +264,53 @@ class Gpt2Helper:
|
|||
|
||||
return is_all_close
|
||||
|
||||
@staticmethod
|
||||
def compare_outputs_v2(torch_outputs, ort_outputs, atol=1e-06):
|
||||
"""Compare outputs from PyTorch and OnnxRuntime
|
||||
|
||||
Args:
|
||||
torch_outputs (Tuple[Torch.Tensor]): PyTorch model output
|
||||
ort_outputs (List[numpy.ndarray]): OnnxRuntime output
|
||||
atol (float, optional): Absolute tollerance. Defaults to 1e-06.
|
||||
|
||||
Returns:
|
||||
is_all_close(bool): whether all elements are close.
|
||||
max_abs_diff(float): maximum absolute difference.
|
||||
messages(str): a list of debug message for each output
|
||||
"""
|
||||
is_all_close = True
|
||||
is_top1_matched = False
|
||||
max_diffs = []
|
||||
messages = []
|
||||
for i in range(len(ort_outputs)):
|
||||
ort_output = ort_outputs[i]
|
||||
torch_output = (torch_outputs[0] if i == 0 else torch_outputs[1][i - 1]).cpu().numpy()
|
||||
is_close = numpy.allclose(ort_output, torch_output, atol=atol, rtol=0)
|
||||
max_diffs.append(numpy.amax(numpy.abs(torch_output - ort_output)))
|
||||
is_all_close = is_all_close and is_close
|
||||
|
||||
if numpy.isnan(torch_output).any():
|
||||
logger.debug(f'PyTorch output {i} has nan')
|
||||
if numpy.isinf(torch_output).any():
|
||||
logger.debug(f'PyTorch output {i} has inf')
|
||||
if numpy.isnan(ort_output).any():
|
||||
logger.debug(f'ORT output {i} has nan')
|
||||
if numpy.isinf(ort_output).any():
|
||||
logger.debug(f'ORT output {i} has inf')
|
||||
|
||||
diff = numpy.fabs(ort_output - torch_output)
|
||||
idx = numpy.unravel_index(diff.argmax(), diff.shape)
|
||||
messages.append(
|
||||
f'diff={diff[idx]:.9f} index={idx} ort={ort_output[idx]:.9f} torch={float(torch_output[idx]):.9f}')
|
||||
|
||||
if i == 0: # logits
|
||||
ort_max_index = numpy.unravel_index(numpy.argmax(ort_output, axis=None), ort_output.shape)
|
||||
torch_max_index = numpy.unravel_index(numpy.argmax(torch_output, axis=None), torch_output.shape)
|
||||
is_top1_matched = numpy.array_equal(ort_max_index, torch_max_index)
|
||||
|
||||
max_diff_output_index = max_diffs.index(max(max_diffs))
|
||||
return is_all_close, max(max_diffs), max_diff_output_index, messages, is_top1_matched
|
||||
|
||||
@staticmethod
|
||||
def export_onnx(model,
|
||||
device,
|
||||
|
|
@ -345,19 +395,31 @@ class Gpt2Helper:
|
|||
is_float16,
|
||||
num_attention_heads,
|
||||
hidden_size,
|
||||
use_external_data_format=False):
|
||||
use_external_data_format=False,
|
||||
**kwargs):
|
||||
""" Optimize ONNX model with an option to convert it to use mixed precision.
|
||||
"""
|
||||
from optimizer import optimize_model
|
||||
|
||||
from fusion_options import FusionOptions
|
||||
optimization_options = FusionOptions('gpt2')
|
||||
#optimization_options.enable_gelu = False
|
||||
#optimization_options.enable_layer_norm = False
|
||||
#optimization_options.enable_attention = False
|
||||
m = optimize_model(onnx_model_path,
|
||||
model_type='gpt2',
|
||||
num_heads=num_attention_heads,
|
||||
hidden_size=hidden_size,
|
||||
opt_level=0,
|
||||
optimization_options=None,
|
||||
optimization_options=optimization_options,
|
||||
use_gpu=False)
|
||||
|
||||
if is_float16:
|
||||
m.convert_model_float32_to_float16(cast_input_output=False)
|
||||
op_full_list = set([node.op_type for node in m.nodes()])
|
||||
op_block_list = set(kwargs["op_block_list"]) if "op_block_list" in kwargs else set()
|
||||
op_remain_list = op_full_list.difference(op_block_list)
|
||||
logger.info(f"op_block_list={op_block_list} op_remain_list={op_remain_list}")
|
||||
m.convert_float_to_float16(use_symbolic_shape_infer=True, **kwargs)
|
||||
|
||||
m.save_model_to_file(optimized_model_path, use_external_data_format)
|
||||
|
||||
|
|
@ -526,6 +588,22 @@ class Gpt2Helper:
|
|||
|
||||
return ort_outputs, average_latency
|
||||
|
||||
@staticmethod
def save_outputs(i, ort_outputs, torch_outputs):
    """Pickle the OnnxRuntime and PyTorch outputs of one test case for later inspection.

    The files are named ort_outputs_{i}.pickle and torch_outputs_{i}.pickle and are opened
    with relative paths.

    Args:
        i: identifier of the test case; used only to build the file names.
        ort_outputs: OnnxRuntime outputs to pickle.
        torch_outputs: PyTorch outputs to pickle.
    """
    ort_path = f'ort_outputs_{i}.pickle'
    with open(ort_path, 'wb') as f:
        pickle.dump(ort_outputs, f)
    logger.info(f"ORT output are saved to {ort_path}")

    torch_path = f'torch_outputs_{i}.pickle'
    with open(torch_path, 'wb') as f:
        pickle.dump(torch_outputs, f)
    logger.info(f"Torch output are saved to {torch_path}")
|
||||
|
||||
@staticmethod
def save_inputs(i, dummy_inputs, ort_outputs, torch_outputs):
    """Pickle the inputs of one test case to dummy_inputs_{i}.pickle (relative path).

    Args:
        i: identifier of the test case; used only to build the file name.
        dummy_inputs: inputs to pickle.
        ort_outputs: not used by this method; kept so the signature stays unchanged.
        torch_outputs: not used by this method; kept so the signature stays unchanged.
    """
    inputs_path = f'dummy_inputs_{i}.pickle'
    with open(inputs_path, 'wb') as f:
        pickle.dump(dummy_inputs, f)
    logger.info(f"inputs are saved to {inputs_path}")
|
||||
|
||||
@staticmethod
|
||||
def test_parity(ort_session,
|
||||
model,
|
||||
|
|
@ -537,14 +615,16 @@ class Gpt2Helper:
|
|||
use_io_binding=True,
|
||||
model_class="GPT2LMHeadModel",
|
||||
has_position_ids=True,
|
||||
has_attention_mask=True):
|
||||
has_attention_mask=True,
|
||||
verbose=False,
|
||||
enable_pickle_output=False):
|
||||
""" Generate random inputs and compare the results of PyTorch and Onnx Runtime.
|
||||
"""
|
||||
|
||||
config: GPT2Config = model.config
|
||||
|
||||
logger.info(
|
||||
f"Running parity test (rtol={rtol}, atol={atol}, test_cases={total_test_cases}, use_io_binding={use_io_binding} model_class={model_class} is_float16={is_float16}) ..."
|
||||
f"Running parity test (atol={atol}, test_cases={total_test_cases}, use_io_binding={use_io_binding}, model_class={model_class}, is_float16={is_float16}) ..."
|
||||
)
|
||||
|
||||
max_batch_size = 8
|
||||
|
|
@ -558,7 +638,10 @@ class Gpt2Helper:
|
|||
output_buffers = Gpt2Helper.get_output_buffers(max_output_shapes, device, is_float16)
|
||||
|
||||
passed_test_cases = 0
|
||||
for _ in range(total_test_cases):
|
||||
top1_matched_cases = 0
|
||||
|
||||
max_abs_diff_list = []
|
||||
for i in range(total_test_cases):
|
||||
sequence_length = random.randint(1, max_seq_len)
|
||||
past_sequence_length = random.randint(0, max_past_seq_len)
|
||||
batch_size = random.randint(1, max_batch_size)
|
||||
|
|
@ -569,7 +652,6 @@ class Gpt2Helper:
|
|||
config.num_attention_heads, config.hidden_size, config.n_layer,
|
||||
config.vocab_size, device, is_float16, has_position_ids,
|
||||
has_attention_mask)
|
||||
|
||||
outputs = Gpt2Helper.pytorch_inference(model, dummy_inputs)
|
||||
if use_io_binding:
|
||||
ort_outputs = Gpt2Helper.onnxruntime_inference(ort_session, dummy_inputs)
|
||||
|
|
@ -579,13 +661,84 @@ class Gpt2Helper:
|
|||
ort_outputs = Gpt2Helper.onnxruntime_inference_with_binded_io(ort_session, dummy_inputs, output_buffers,
|
||||
output_shapes)
|
||||
|
||||
is_all_close = Gpt2Helper.compare_outputs(outputs, ort_outputs, rtol=rtol, atol=atol)
|
||||
is_all_close, max_abs_diff, max_diff_output_index, messages, is_top1_matched = Gpt2Helper.compare_outputs_v2(
|
||||
outputs, ort_outputs, atol=atol)
|
||||
if not numpy.isnan(max_abs_diff):
|
||||
max_abs_diff_list.append(max_abs_diff)
|
||||
if is_all_close:
|
||||
passed_test_cases += 1
|
||||
logger.info(f"Parity Test Cases={total_test_cases}; Passed={passed_test_cases}")
|
||||
if is_top1_matched:
|
||||
top1_matched_cases += 1
|
||||
|
||||
if verbose and not is_all_close:
|
||||
logger.info(
|
||||
f"test_case={i} batch_size={batch_size} past_sequence_length={past_sequence_length} sequence_length={sequence_length} MaxDiff={max_abs_diff}"
|
||||
)
|
||||
for i, message in enumerate(messages):
|
||||
logger.info(f"\t{i}: Name={ort_session.get_outputs()[i].name}, {message}")
|
||||
|
||||
# Collect data for debugging
|
||||
if enable_pickle_output and (numpy.isnan(max_abs_diff) or max_abs_diff > 100 * atol):
|
||||
Gpt2Helper.save_inputs(i, dummy_inputs)
|
||||
Gpt2Helper.save_outputs(i, ort_outputs, outputs)
|
||||
|
||||
if max_abs_diff_list:
|
||||
result = {
|
||||
f"max_diff_percentile_{p}": "{:.5f}".format(numpy.percentile(max_abs_diff_list, p))
|
||||
for p in [50, 90, 95, 99]
|
||||
}
|
||||
else:
|
||||
result = {f"max_diff_percentile_{p}": "nan" for p in [50, 90, 95, 99]}
|
||||
|
||||
result["top1_match_rate"] = top1_matched_cases * 1.0 / total_test_cases
|
||||
result["diff_pass_rate"] = passed_test_cases * 1.0 / total_test_cases
|
||||
result["nan_rate"] = (total_test_cases - len(max_abs_diff_list)) * 1.0 / total_test_cases
|
||||
|
||||
logger.info(
|
||||
f"Parity Test Cases={total_test_cases}; Passed={passed_test_cases}; Nan={total_test_cases-len(max_abs_diff_list)}; Top1_Matched={top1_matched_cases}"
|
||||
)
|
||||
|
||||
if passed_test_cases > 0.95 * total_test_cases:
|
||||
logger.info(f"Parity is good: passed rate={int(passed_test_cases*100/total_test_cases):.0f}%")
|
||||
return passed_test_cases == total_test_cases
|
||||
|
||||
return result
|
||||
|
||||
@staticmethod
def test_performance(ort_session,
                     model,
                     device,
                     is_float16=False,
                     total_runs=100,
                     use_io_binding=True,
                     model_class="GPT2LMHeadModel",
                     has_position_ids=True,
                     has_attention_mask=True,
                     batch_size=8,
                     sequence_length=1,
                     past_sequence_length=32):
    """ Generate random inputs and measure average latency of Onnx Runtime.

    Args:
        ort_session: session handed to the Gpt2Helper inference helpers.
        model: object whose ``config`` supplies the model dimensions.
        device: passed through to the buffer and dummy-input helpers.
        is_float16: passed through to the buffer and dummy-input helpers.
        total_runs: run count forwarded to the inference helper.
        use_io_binding: when True, output buffers are allocated up front and
            inference goes through ``onnxruntime_inference_with_binded_io``;
            otherwise ``onnxruntime_inference`` is used.
        model_class: forwarded to ``get_output_shapes``.
        has_position_ids, has_attention_mask: forwarded to ``get_dummy_inputs``.
        batch_size, sequence_length, past_sequence_length: shape of the single
            dummy input that is measured.

    Returns:
        The second element of the tuple returned by the inference helper
        (named ``latency`` here).
    """
    config: GPT2Config = model.config

    output_shapes = None
    output_buffers = None
    if use_io_binding:
        output_shapes = Gpt2Helper.get_output_shapes(batch_size, past_sequence_length, sequence_length, config,
                                                     model_class)
        output_buffers = Gpt2Helper.get_output_buffers(output_shapes, device, is_float16)

    dummy_inputs = Gpt2Helper.get_dummy_inputs(batch_size, past_sequence_length, sequence_length,
                                               config.num_attention_heads, config.hidden_size, config.n_layer,
                                               config.vocab_size, device, is_float16, has_position_ids,
                                               has_attention_mask)

    # The branches were previously swapped: with use_io_binding=True the buffers
    # were built but never used, and with use_io_binding=False the bound path was
    # entered with `output_shapes` unassigned (UnboundLocalError).
    if use_io_binding:
        _, latency = Gpt2Helper.onnxruntime_inference_with_binded_io(ort_session, dummy_inputs, output_buffers,
                                                                     output_shapes, total_runs)
    else:
        _, latency = Gpt2Helper.onnxruntime_inference(ort_session, dummy_inputs, total_runs)

    return latency
|
||||
|
||||
@staticmethod
|
||||
def torchscript(model, config, device, has_position_ids=True, has_attention_mask=True):
|
||||
|
|
|
|||
177
onnxruntime/python/tools/transformers/gpt2_parity.py
Normal file
177
onnxruntime/python/tools/transformers/gpt2_parity.py
Normal file
|
|
@ -0,0 +1,177 @@
|
|||
# -------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License. See License.txt in the project root for
|
||||
# license information.
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
from convert_to_onnx import main
|
||||
import os
|
||||
import argparse
|
||||
import logging
|
||||
from gpt2_helper import PRETRAINED_GPT2_MODELS
|
||||
from benchmark_helper import setup_logger
|
||||
|
||||
logger = logging.getLogger('')
|
||||
|
||||
|
||||
def parse_arguments(argv=None):
    """Parse the command line of the GPT-2 parity tool.

    Args:
        argv: list of argument strings; None lets argparse read ``sys.argv``.

    Returns:
        The ``argparse.Namespace`` with model_name_or_path, csv, runs, use_gpu,
        all, use_external_data_format and verbose.
    """
    parser = argparse.ArgumentParser()

    supported_models = ', '.join(PRETRAINED_GPT2_MODELS)
    parser.add_argument('-m',
                        '--model_name_or_path',
                        type=str,
                        required=True,
                        help='Model path, or pretrained model name in the list: ' + supported_models)

    parser.add_argument('--csv',
                        type=str,
                        required=False,
                        default='gpt2_parity_results.csv',
                        help='path of csv file to save the result')

    parser.add_argument('--runs',
                        type=int,
                        required=False,
                        default=5,
                        help="number of repeated runs to get median value of each metric")

    # Boolean switches: all are off unless given on the command line.
    parser.add_argument('--use_gpu', action='store_true', default=False, help="use GPU for inference")
    parser.add_argument('--all',
                        action='store_true',
                        default=False,
                        help="run all combinations of mixed precision")
    parser.add_argument('-e', '--use_external_data_format', action='store_true', default=False)
    parser.add_argument('--verbose', action='store_true', default=False)

    return parser.parse_args(argv)
|
||||
|
||||
|
||||
class ParityTask:
    """Run one experiment several times and append the per-metric median to a CSV file."""

    def __init__(self, total_runs, csv_path):
        """Store the number of repetitions and the CSV path, and define the metric columns."""
        self.total_runs = total_runs
        self.csv_path = csv_path
        self.latency_name = "average_latency(batch_size=8,sequence_length=1,past_sequence_length=32)"
        self.metric_names = [
            self.latency_name, "diff_50_percentile", "diff_90_percentile", "diff_95_percentile", "diff_99_percentile",
            "diff_pass_rate", "nan_rate", "top1_match_rate", "onnx_size_in_MB"
        ]

    def run(self, argv, name):
        """Call ``main`` ``total_runs`` times with ``argv`` and save the median of each metric.

        Args:
            argv: argument list forwarded to ``main``.
            name: experiment name forwarded to ``main`` as ``experiment_name``.

        A run that raises is logged with its traceback and skipped. If no run
        produces a truthy result, nothing is written.
        """
        results = []
        for run_id in range(self.total_runs):
            try:
                result = main(argv, experiment_name=name, run_id=run_id, csv_filename=self.csv_path)
            except Exception:
                # Keep going with the remaining runs, but do not swallow
                # KeyboardInterrupt and do keep the traceback in the log.
                logger.exception(f"Failed to run experiment {name}")
                continue
            if result:
                results.append(result)

        if not results:
            return

        import statistics

        # The first run supplies every non-metric column; its metric values are
        # overwritten in place with the medians across runs.
        median_result = results[0]
        for metric in self.metric_names:
            samples = [result[metric] for result in results if metric in result]
            # statistics.median raises on empty data; a metric that no run
            # reported is stored as None, which csv writes as an empty cell.
            median_result[metric] = statistics.median(samples) if samples else None

        self.save_result(median_result)

    def save_result(self, result):
        """Append ``result`` as one row to the CSV file, writing the header if the file is new.

        Raises:
            KeyError: if ``result`` lacks one of the expected columns.
        """
        import csv
        csv_filename = self.csv_path

        csv_file_existed = os.path.exists(csv_filename)
        with open(csv_filename, mode="a", newline='') as csv_file:
            column_names = [
                "experiment", "run_id", "model_name", "model_class", "gpu", "precision", "optimizer", "test_cases",
                "keep_io_types", "io_block_list", "op_block_list", "node_block_list", "ORT_TRANSFORMER_OPTIONS",
                "ORT_CUDA_GEMM_OPTIONS", "onnxruntime"
            ] + self.metric_names

            csv_writer = csv.DictWriter(csv_file, fieldnames=column_names)
            if not csv_file_existed:
                csv_writer.writeheader()

            row = {column: result[column] for column in column_names}
            # The row aggregates several runs, so it carries no single run id.
            row["run_id"] = "median"

            csv_writer.writerow(row)
            logger.info(f"result saved to {csv_filename}: {row}")
|
||||
|
||||
|
||||
def run_parity(args):
    """Run the fp32 baseline and, on GPU, the fp16 and mixed precision experiments.

    Each experiment is an argument list handed to ``ParityTask.run`` together
    with a display name. Without ``args.use_gpu`` only the fp32 baseline runs;
    without ``args.all`` only the fp32 and fp16 baselines run.
    """
    task = ParityTask(args.runs, args.csv)
    model = args.model_name_or_path
    external_data_flag = ["--use_external_data_format"] if args.use_external_data_format else []

    fp32_baseline = f"-m {model} -o -p fp32".split()
    if args.use_gpu:
        fp32_baseline.append("--use_gpu")
    fp32_baseline += external_data_flag
    task.run(fp32_baseline, "fp32 baseline")

    # The following tests for fp16 requires GPU
    if not args.use_gpu:
        logger.info("skip mixed precision since --use_gpu is not specified")
        return

    baseline = f"-m {model} -o --use_gpu -p fp16".split() + external_data_flag
    task.run(baseline, "fp16 baseline")

    if not args.all:
        logger.info("skip remaining combinations since --all is not specified")
        return

    fp32_logits = ["--io_block_list", "logits"]
    fp32_io = ["--keep_io_types"]
    op_list = ["Attention", "Gather", "Add", "LayerNormalization", "FastGelu", "MatMul"]
    norm_and_gelu = ["--op_block_list", "LayerNormalization", "FastGelu"]

    # Build the remaining (argv, name) pairs in execution order.
    experiments = [
        (baseline + fp32_logits, "fp16 except logits"),
        (baseline + fp32_io, "Graph I/O FP32, Other FP16"),
        (baseline + fp32_io + ["--op_block_list"] + op_list, "Everthing in FP32"),
    ]
    # Block every operator but one: only that operator runs in fp16.
    for op in op_list:
        others = [candidate for candidate in op_list if candidate != op]
        experiments.append((baseline + fp32_io + ["--op_block_list"] + others, f"FP32 except {op} in fp16"))
    # Block a single operator: only that operator stays in fp32.
    for op in op_list:
        experiments.append((baseline + ["--op_block_list", op], f"FP16 except {op} in fp32"))
    experiments.append((baseline + norm_and_gelu, "FP16 except LayerNormalization and FastGelu in fp32"))
    experiments.append(
        (baseline + norm_and_gelu + fp32_logits, "FP16 except logits, LayerNormalization and FastGelu in fp32"))

    for experiment_argv, experiment_name in experiments:
        task.run(experiment_argv, experiment_name)
|
||||
|
||||
|
||||
# Script entry point: parse the command line, configure logging, run the experiments.
if __name__ == '__main__':
    args = parse_arguments()
    setup_logger(args.verbose)
    run_parity(args)
|
||||
|
|
@ -18,6 +18,9 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
class OnnxModel:
|
||||
def __init__(self, model):
    """Wrap ``model``; all state setup is delegated to ``initialize`` so it can be re-run later."""
    self.initialize(model)
|
||||
|
||||
def initialize(self, model):
|
||||
self.model = model
|
||||
self._node_name_suffix: Dict[str, int] = {} # key is node name prefix, value is the last suffix generated
|
||||
self.shape_infer_helper = None
|
||||
|
|
@ -495,31 +498,18 @@ class OnnxModel:
|
|||
initializer=graph.initializer,
|
||||
value_info=graph.value_info)
|
||||
|
||||
self.model = helper.make_model(graph_def, producer_name='onnxruntime-tools')
|
||||
self.model = helper.make_model(graph_def, producer_name='onnxruntime')
|
||||
|
||||
# restore opset version
|
||||
self.model.opset_import[0].version = original_opset_version
|
||||
|
||||
def convert_model_float32_to_float16(self, cast_input_output=True, use_symbolic_shape_infer=True):
|
||||
"""Convert a graph to FLOAT16. By default, we will keep data types of inputs and outputs.
|
||||
For decoder model with past_key_values, it is recommended to set cast_input_output=False for better performance.
|
||||
Args:
|
||||
cast_input_output (bool, optional): keep data type of inputs and outputs, and add Cast nodes to convert float32 inputs to float16, and float16 to float32 for outputs. Defaults to True.
|
||||
use_symbolic_shape_infer (bool, optional): use symbolic shape inference instead of onnx shape inference.
|
||||
"""
|
||||
from packaging.version import Version
|
||||
import onnxconverter_common as oc
|
||||
if Version(oc.__version__) > Version("1.7.0"):
|
||||
model = self.model
|
||||
if use_symbolic_shape_infer:
|
||||
# Use symbolic shape inference since custom operators (like Gelu, SkipLayerNormalization etc) are not recognized by onnx shape inference.
|
||||
shape_infer_helper = SymbolicShapeInferenceHelper(model)
|
||||
model = shape_infer_helper.infer_shapes(model, auto_merge=True, guess_output_rank=False)
|
||||
self.model = oc.float16.convert_float_to_float16(model,
|
||||
keep_io_types=cast_input_output,
|
||||
disable_shape_infer=use_symbolic_shape_infer)
|
||||
return
|
||||
def _naive_float_to_float16(self, keep_io_types=True):
|
||||
"""Convert model from single precision to half precision naively.
|
||||
It might generate invalid model or cause precision loss.
|
||||
|
||||
Args:
|
||||
cast_input_output (bool, optional): [description]. Defaults to True.
|
||||
"""
|
||||
graph = self.model.graph
|
||||
initializers = graph.initializer
|
||||
|
||||
|
|
@ -540,7 +530,7 @@ class OnnxModel:
|
|||
if att.name == 'to' and att.i == 1:
|
||||
att.CopyFrom(helper.make_attribute("to", int(TensorProto.FLOAT16)))
|
||||
|
||||
if not cast_input_output:
|
||||
if not keep_io_types:
|
||||
self.change_input_output_float32_to_float16()
|
||||
return
|
||||
|
||||
|
|
@ -570,6 +560,107 @@ class OnnxModel:
|
|||
cast_node.attribute.extend([helper.make_attribute("to", int(TensorProto.FLOAT))])
|
||||
self.add_node(cast_node)
|
||||
|
||||
def get_dtype(self, input_or_output: str):
    """Look up the element type recorded for a tensor name.

    The name is searched in the graph's ``value_info`` first, then among the
    graph inputs, then among the graph outputs.

    Returns:
        The ``elem_type`` from the first source that knows the name, or None
        when none of them does.
    """
    matched_type = None
    seen = False
    for info in self.model.graph.value_info:
        if info.name == input_or_output:
            # Keep scanning: when a name is listed more than once the last entry wins.
            matched_type, seen = info.type, True
    if seen:
        return matched_type.tensor_type.elem_type

    for finder in (self.find_graph_input, self.find_graph_output):
        found = finder(input_or_output)
        if found:
            return found.type.tensor_type.elem_type

    return None
|
||||
|
||||
def convert_model_float32_to_float16(self, cast_input_output=True, **kwargs):
    """Deprecated entry point kept for existing callers; use convert_float_to_float16 instead.

    Logs a deprecation warning and runs the naive conversion.

    Args:
        cast_input_output (bool, optional): forwarded as ``keep_io_types``. Defaults to True.
        kwargs: accepted for call compatibility and ignored.
    """
    # Logger.warn is a deprecated alias of Logger.warning.
    logger.warning(
        'The function convert_model_float32_to_float16 is deprecated. Use convert_float_to_float16 instead!')
    self._naive_float_to_float16(keep_io_types=cast_input_output)
|
||||
|
||||
def convert_float_to_float16(self, use_symbolic_shape_infer=True, **kwargs):
    """Convert a graph to FLOAT16. By default, we will keep data types of inputs and outputs.

    For decoder model with past_key_values, it is recommended to set keep_io_types=False for better performance.

    After conversion, redundant Cast nodes left behind by the converter are removed and,
    if any were removed, the graph is pruned.

    Args:
        use_symbolic_shape_infer (bool, optional): use symbolic shape inference instead of onnx shape inference.
        kwargs: parameters for float16 conversion. Only keep_io_types, min_positive_val, max_finite_val,
            op_block_list and node_block_list are forwarded; keep_io_types defaults to True when absent.
    """
    if "keep_io_types" not in kwargs:
        kwargs["keep_io_types"] = True

    # TODO: import from onnxconverter_common once its implementation is stable
    # (the plan is to prefer it for versions newer than 1.9.0).
    from float16 import convert_float_to_float16

    model = self.model
    if use_symbolic_shape_infer:
        # Use symbolic shape inference since custom operators (like Gelu, SkipLayerNormalization etc) are not recognized by onnx shape inference.
        shape_infer_helper = SymbolicShapeInferenceHelper(model)
        model = shape_infer_helper.infer_shapes(model, auto_merge=True, guess_output_rank=False)

    # Shapes were already inferred symbolically above, so the converter's own inference is disabled.
    parameters = {'disable_shape_infer': use_symbolic_shape_infer}
    parameters.update({
        key: kwargs[key]
        for key in ['keep_io_types', 'min_positive_val', 'max_finite_val', 'op_block_list', 'node_block_list']
        if key in kwargs
    })

    fp16_model = convert_float_to_float16(model, **parameters)
    # Re-initialize so that state derived from the old model is reset.
    self.initialize(fp16_model)

    def get_node_attribute(node, attribute_name: str):
        """Return the value of the named attribute of ``node``, or None if it has none."""
        for attr in node.attribute:
            if attr.name == attribute_name:
                return helper.get_attribute_value(attr)
        return None

    float_type = int(TensorProto.FLOAT)

    # Convert_float_to_float16 might add Cast(to=10) --> Cast(to=1) when two consequent nodes are computed in FP32.
    # Below are post-processing that removes those Cast nodes.
    # Remove first Cast nodes in path like  --> Cast --> Cast -->
    nodes_to_remove = []
    for node in self.nodes():
        if node.op_type != "Cast":
            continue
        parent = self.get_parent(node, 0)
        if not parent or parent.op_type != "Cast":
            continue
        # Cannot be removed if its output is used by multiple nodes. The original
        # compared the children collection itself with 1, which is never true,
        # so this step silently did nothing.
        if len(self.get_children(parent)) == 1:
            self.replace_input_of_all_nodes(parent.output[0], parent.input[0])
            nodes_to_remove.append(parent)

    # Remove the second cast node: a Cast to float whose input is already float is a no-op.
    for node in self.nodes():
        if node.op_type != "Cast" or get_node_attribute(node, "to") != float_type:
            continue
        if self.get_dtype(node.input[0]) != float_type:
            continue
        if node in nodes_to_remove:
            # Already bypassed by the first pass; do not schedule it for removal twice.
            continue

        if self.find_graph_output(node.output[0]):
            # Keep the graph output name: let the producer write to it directly.
            self.replace_output_of_all_nodes(node.input[0], node.output[0])
        else:
            self.replace_input_of_all_nodes(node.output[0], node.input[0])
        nodes_to_remove.append(node)

    self.remove_nodes(nodes_to_remove)

    if nodes_to_remove:
        self.prune_graph()
        logger.info(f"removed {len(nodes_to_remove)} Cast nodes from float16 model")
|
||||
|
||||
def create_node_name(self, op_type, name_prefix=None):
|
||||
"""Create a unique node name that starts with a prefix (default is operator type).
|
||||
The name will not be duplicated with any name that generated or existed in current graphs.
|
||||
|
|
|
|||
11
onnxruntime/python/tools/transformers/requirements.txt
Normal file
11
onnxruntime/python/tools/transformers/requirements.txt
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
onnx >= 1.8
|
||||
numpy
|
||||
coloredlogs
|
||||
psutil
|
||||
py-cpuinfo
|
||||
py3nvml
|
||||
packaging
|
||||
transformers >= 4.0
|
||||
|
||||
# please follow https://pytorch.org/ to install PyTorch for your OS
|
||||
torch >= 1.8
|
||||
|
|
@ -1,14 +0,0 @@
|
|||
onnx
|
||||
numpy
|
||||
coloredlogs
|
||||
psutil
|
||||
py-cpuinfo
|
||||
py3nvml
|
||||
packaging
|
||||
transformers
|
||||
onnxruntime
|
||||
onnxconverter_common
|
||||
--find-links https://download.pytorch.org/whl/torch_stable.html
|
||||
torch==1.7.1+cpu
|
||||
torchvision==0.8.2+cpu
|
||||
torchaudio===0.7.2
|
||||
|
|
@ -1,14 +0,0 @@
|
|||
onnx
|
||||
numpy
|
||||
coloredlogs
|
||||
psutil
|
||||
py-cpuinfo
|
||||
py3nvml
|
||||
packaging
|
||||
transformers
|
||||
onnxruntime-gpu
|
||||
onnxconverter_common
|
||||
--find-links https://download.pytorch.org/whl/torch_stable.html
|
||||
torch===1.7.1
|
||||
torchvision===0.8.2
|
||||
torchaudio===0.7.2
|
||||
Loading…
Reference in a new issue