mirror of
https://github.com/saymrwulf/onnxruntime.git
synced 2026-07-03 03:58:54 +00:00
Update benchmark of gpt2 model with past state (#4043)
* Update benchmark_gpt2 to use past state only * Update dynamic axes of input/output tensors * Remove the --use_openmp option, since OpenMP is the default for the onnxruntime 1.3 CPU package * Use the same option names as benchmark.py
This commit is contained in:
parent
ed0a8e5b5c
commit
60fa4b1f90
2 changed files with 170 additions and 124 deletions
|
|
@ -224,7 +224,7 @@ class BertOnnxModel(OnnxModel):
|
|||
self.clean_graph()
|
||||
self.prune_graph()
|
||||
|
||||
def optimize(self, options: BertOptimizationOptions = None):
|
||||
def optimize(self, options: BertOptimizationOptions = None, add_dynamic_axes=False):
|
||||
if (options is None) or options.enable_layer_norm:
|
||||
self.fuse_layer_norm()
|
||||
|
||||
|
|
@ -263,7 +263,8 @@ class BertOnnxModel(OnnxModel):
|
|||
self.remove_unused_constant()
|
||||
|
||||
# Use symbolic batch dimension in input and output.
|
||||
self.use_dynamic_axes()
|
||||
if add_dynamic_axes:
|
||||
self.use_dynamic_axes()
|
||||
|
||||
logger.info(f"opset verion: {self.model.opset_import[0].version}")
|
||||
|
||||
|
|
|
|||
|
|
@ -3,6 +3,9 @@
|
|||
# Licensed under the MIT License. See License.txt in the project root for
|
||||
# license information.
|
||||
# --------------------------------------------------------------------------
|
||||
# This script benchmarks gpt2 model with past state.
|
||||
# For gpt2 model without past state, use benchmark.py to measure performance.
|
||||
|
||||
import os
|
||||
import sys
|
||||
import numpy
|
||||
|
|
@ -11,14 +14,14 @@ import psutil
|
|||
import argparse
|
||||
import logging
|
||||
import torch
|
||||
from transformers import GPT2Model, GPT2LMHeadModel, GPT2Tokenizer
|
||||
from transformers import GPT2Model, GPT2LMHeadModel, GPT2Tokenizer, AutoConfig
|
||||
|
||||
logger = logging.getLogger('')
|
||||
|
||||
# Map alias to a tuple of Model Class and pretrained model name
|
||||
MODEL_CLASSES = {
|
||||
"gpt2": (GPT2Model, GPT2Tokenizer, "gpt2"),
|
||||
"distilgpt2": (GPT2LMHeadModel, GPT2Tokenizer, "distilgpt2")
|
||||
"distilgpt2": (GPT2LMHeadModel, GPT2Tokenizer, "distilgpt2"),
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -34,13 +37,9 @@ def dump_environment():
|
|||
logger.info("no environment variable of OMP_WAIT_POLICY")
|
||||
|
||||
|
||||
def setup_environment(use_openmp=False):
|
||||
def setup_environment():
|
||||
# ATTENTION: these environment variables must be set before importing onnxruntime.
|
||||
if use_openmp:
|
||||
os.environ["OMP_NUM_THREADS"] = str(psutil.cpu_count(logical=True))
|
||||
else:
|
||||
os.environ["OMP_NUM_THREADS"] = '1'
|
||||
|
||||
os.environ["OMP_NUM_THREADS"] = str(psutil.cpu_count(logical=True))
|
||||
os.environ["OMP_WAIT_POLICY"] = 'ACTIVE'
|
||||
dump_environment()
|
||||
|
||||
|
|
@ -53,15 +52,15 @@ def pytorch_inference(model, input_ids, past=None, total_runs=100):
|
|||
outputs = model(input_ids=input_ids, past=past)
|
||||
latency.append(time.time() - start)
|
||||
|
||||
logger.info("PyTorch Inference time = {} ms".format(format(sum(latency) * 1000 / len(latency), '.2f')))
|
||||
return outputs
|
||||
average_latency = sum(latency) * 1000 / len(latency)
|
||||
logger.debug("PyTorch Inference time = {} ms".format(format(average_latency, '.2f')))
|
||||
return outputs, average_latency
|
||||
|
||||
|
||||
def onnxruntime_inference(ort_session, input_ids, past=None, total_runs=100):
|
||||
# Use contiguous array as input might improve performance.
|
||||
# You can check the results from performance test tool to see whether you need it.
|
||||
ort_inputs = {'input_ids': numpy.ascontiguousarray(input_ids.cpu().numpy())}
|
||||
|
||||
# TODO: pass input tensor stored in GPU
|
||||
if past is not None:
|
||||
for i, past_i in enumerate(past):
|
||||
ort_inputs[f'past_{i}'] = numpy.ascontiguousarray(past[i].cpu().numpy())
|
||||
|
|
@ -72,49 +71,77 @@ def onnxruntime_inference(ort_session, input_ids, past=None, total_runs=100):
|
|||
ort_outputs = ort_session.run(None, ort_inputs)
|
||||
latency.append(time.time() - start)
|
||||
|
||||
logger.info("OnnxRuntime Inference time = {} ms".format(format(sum(latency) * 1000 / len(latency), '.2f')))
|
||||
average_latency = sum(latency) * 1000 / len(latency)
|
||||
logger.debug("OnnxRuntime Inference time = {} ms".format(format(average_latency, '.2f')))
|
||||
|
||||
return ort_outputs
|
||||
return ort_outputs, average_latency
|
||||
|
||||
|
||||
def inference(model, ort_session, input_ids, past=None, total_runs=100, verify_outputs=True):
|
||||
outputs = pytorch_inference(model, input_ids, past, total_runs)
|
||||
ort_outputs = onnxruntime_inference(ort_session, input_ids, past, total_runs)
|
||||
outputs, torch_latency = pytorch_inference(model, input_ids, past, total_runs)
|
||||
ort_outputs, ort_latency = onnxruntime_inference(ort_session, input_ids, past, total_runs)
|
||||
if verify_outputs:
|
||||
logger.info('PyTorch and OnnxRuntime output 0 (last_state) are close:'.format(0),
|
||||
numpy.allclose(ort_outputs[0], outputs[0].cpu(), rtol=1e-05, atol=1e-04))
|
||||
|
||||
is_close = numpy.allclose(ort_outputs[0], outputs[0].cpu(), rtol=1e-05, atol=1e-04)
|
||||
logger.debug(f'PyTorch and OnnxRuntime output 0 (last_state) are close: {is_close}')
|
||||
|
||||
is_all_close = is_close
|
||||
for layer in range(model.config.n_layer):
|
||||
logger.info('PyTorch and OnnxRuntime layer {} state (present_{}) are close:'.format(layer, layer),
|
||||
numpy.allclose(ort_outputs[1 + layer], outputs[1][layer].cpu(), rtol=1e-05, atol=1e-04))
|
||||
is_close = numpy.allclose(ort_outputs[1 + layer], outputs[1][layer].cpu(), rtol=1e-05, atol=1e-04)
|
||||
logger.debug(f'PyTorch and OnnxRuntime layer {layer} state (present_{layer}) are close:{is_close}')
|
||||
is_all_close = is_all_close and is_close
|
||||
|
||||
if not is_all_close:
|
||||
logger.warning(f'PyTorch and OnnxRuntime results are not all close.')
|
||||
|
||||
return torch_latency, ort_latency
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
parser.add_argument('--model_type',
|
||||
parser.add_argument('-m',
|
||||
'--model_type',
|
||||
required=True,
|
||||
type=str,
|
||||
choices=list(MODEL_CLASSES.keys()),
|
||||
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
|
||||
help='Model type selected in the list: ' + ', '.join(MODEL_CLASSES.keys()))
|
||||
|
||||
parser.add_argument('--cache_dir', required=True, type=str, help="cache directory")
|
||||
parser.add_argument('-c',
|
||||
'--cache_dir',
|
||||
required=False,
|
||||
type=str,
|
||||
default='./cache_models',
|
||||
help='Directory to cache pre-trained models')
|
||||
|
||||
parser.add_argument('--output_dir', required=True, type=str, help="output onnx model directory")
|
||||
parser.add_argument('--onnx_dir',
|
||||
required=False,
|
||||
type=str,
|
||||
default='./onnx_models',
|
||||
help='Directory to store onnx models')
|
||||
|
||||
parser.add_argument('--total_runs', required=False, type=int, help="total runs", default=100)
|
||||
parser.add_argument('-t',
|
||||
'--test_times',
|
||||
required=False,
|
||||
default=100,
|
||||
type=int,
|
||||
help='Number of repeat times to get average inference latency.')
|
||||
|
||||
parser.add_argument('--enable_past_input', required=False, action='store_true')
|
||||
parser.set_defaults(enable_past_input=False)
|
||||
parser.add_argument('-v', '--validate_onnx', required=False, action='store_true', help='Validate ONNX model')
|
||||
|
||||
parser.add_argument('--enable_optimization', required=False, action='store_true')
|
||||
parser.set_defaults(enable_optimization=False)
|
||||
parser.add_argument('-o',
|
||||
'--optimize_onnx',
|
||||
required=False,
|
||||
action='store_true',
|
||||
help='Use optimizer.py to optimize onnx model')
|
||||
parser.set_defaults(optimize_onnx=False)
|
||||
|
||||
parser.add_argument('--verify_outputs', required=False, action='store_true')
|
||||
parser.set_defaults(verify_outputs=False)
|
||||
parser.add_argument('--use_gpu', required=False, action='store_true')
|
||||
parser.set_defaults(use_gpu=False)
|
||||
|
||||
parser.add_argument('--use_openmp', required=False, action='store_true')
|
||||
parser.set_defaults(use_openmp=False)
|
||||
parser.add_argument('-b', '--batch_sizes', nargs='+', type=int, default=[1])
|
||||
|
||||
parser.add_argument('-s', '--sequence_lengths', nargs='+', type=int, default=[8, 16, 32, 64, 128, 256])
|
||||
|
||||
parser.add_argument('--verbose', required=False, action='store_true')
|
||||
parser.set_defaults(verbose=False)
|
||||
|
|
@ -133,6 +160,7 @@ def setup_logger(verbose=True):
|
|||
else:
|
||||
log_handler.setFormatter(logging.Formatter('%(filename)20s: %(message)s'))
|
||||
logging_level = logging.INFO
|
||||
logging.getLogger("transformers").setLevel(logging.ERROR)
|
||||
log_handler.setLevel(logging_level)
|
||||
|
||||
# Avoid duplicated handlers when running this script in multiple cells of Jupyter Notebook.
|
||||
|
|
@ -142,21 +170,71 @@ def setup_logger(verbose=True):
|
|||
logger.setLevel(logging_level)
|
||||
|
||||
|
||||
def remove_past_outputs(export_model_path, output_model_path):
|
||||
from onnx import ModelProto
|
||||
from OnnxModel import OnnxModel
|
||||
def export_onnx(model, config, tokenizer, device, output_dir):
|
||||
model.to(device)
|
||||
|
||||
model = ModelProto()
|
||||
with open(export_model_path, "rb") as f:
|
||||
model.ParseFromString(f.read())
|
||||
bert_model = OnnxModel(model)
|
||||
inputs = tokenizer.encode_plus("Here is an example input for GPT2 model",
|
||||
add_special_tokens=True,
|
||||
return_tensors='pt')
|
||||
input_ids = inputs['input_ids'].to(device)
|
||||
logger.debug(f"input_ids={input_ids}")
|
||||
outputs = model(input_ids=input_ids, past=None)
|
||||
assert len(outputs) == 2
|
||||
logger.debug(f"output 0 shape={outputs[0].shape}")
|
||||
logger.debug(f"outputs[1][0] shape={outputs[1][0].shape}")
|
||||
|
||||
# remove past state outputs and only keep the first output.
|
||||
keep_output_names = [bert_model.model.graph.output[0].name]
|
||||
logger.info(f"Prune graph to keep the first output and drop past state outputs:{keep_output_names}")
|
||||
bert_model.prune_graph(keep_output_names)
|
||||
num_layer = model.config.n_layer
|
||||
present_names = [f'present_{i}' for i in range(num_layer)]
|
||||
output_names = ["last_state"] + present_names
|
||||
|
||||
bert_model.save_model_to_file(output_model_path)
|
||||
input_names = ['input_ids']
|
||||
|
||||
# input_ids has only one word for model with past state.
|
||||
# Shape of input tensors:
|
||||
# input_ids: (batch_size, 1)
|
||||
# past_{i}: (2, batch_size, num_heads, seq_len, hidden_size/num_heads)
|
||||
# Shape of output tensors:
|
||||
# last_state: (batch_size, seq_len + 1, hidden_size)
|
||||
# present_{i}: (2, batch_size, num_heads, seq_len + 1, hidden_size/num_heads)
|
||||
dynamic_axes = {'input_ids': {0: 'batch_size'}, 'last_state': {0: 'batch_size', 1: 'seq_len_plus_1'}}
|
||||
|
||||
for name in present_names:
|
||||
dynamic_axes[name] = {1: 'batch_size', 3: 'seq_len_plus_1'}
|
||||
|
||||
past_names = [f'past_{i}' for i in range(num_layer)]
|
||||
input_names = ['input_ids'] + past_names
|
||||
dummy_past = [torch.zeros(list(outputs[1][0].shape), dtype=torch.float32, device=device) for _ in range(num_layer)]
|
||||
for name in past_names:
|
||||
dynamic_axes[name] = {1: 'batch_size', 3: 'seq_len'}
|
||||
logger.debug(f"vocab_size:{model.config.vocab_size}")
|
||||
|
||||
dummy_input_ids = torch.randint(low=0,
|
||||
high=model.config.vocab_size - 1,
|
||||
size=(1, 1),
|
||||
dtype=torch.int64,
|
||||
device=device)
|
||||
logger.debug(f"dummy_input_ids={dummy_input_ids}")
|
||||
export_inputs = (dummy_input_ids, tuple(dummy_past))
|
||||
|
||||
export_model_path = os.path.join(output_dir, 'gpt2_past.onnx')
|
||||
|
||||
# Run the PyTorch model once on the dummy inputs to get example outputs for the ONNX export.
|
||||
with torch.no_grad():
|
||||
outputs = model(input_ids=dummy_input_ids, past=dummy_past)
|
||||
|
||||
logger.debug(f"present_0 shape={outputs[1][0].shape}")
|
||||
|
||||
torch.onnx.export(model,
|
||||
args=export_inputs,
|
||||
f=export_model_path,
|
||||
input_names=input_names,
|
||||
output_names=output_names,
|
||||
example_outputs=outputs,
|
||||
dynamic_axes=dynamic_axes,
|
||||
opset_version=11,
|
||||
do_constant_folding=True,
|
||||
verbose=False)
|
||||
return export_model_path
|
||||
|
||||
|
||||
def main():
|
||||
|
|
@ -164,107 +242,74 @@ def main():
|
|||
setup_logger(args.verbose)
|
||||
dump_environment()
|
||||
|
||||
enable_past_input = args.enable_past_input
|
||||
|
||||
cache_dir = args.cache_dir
|
||||
if not os.path.exists(cache_dir):
|
||||
os.makedirs(cache_dir)
|
||||
|
||||
output_dir = args.output_dir
|
||||
output_dir = args.onnx_dir
|
||||
if not os.path.exists(output_dir):
|
||||
os.makedirs(output_dir)
|
||||
|
||||
(model_class, tokenizer_class, model_name_or_path) = MODEL_CLASSES[args.model_type]
|
||||
use_torchscript = False
|
||||
(model_class, tokenizer_class, model_name) = MODEL_CLASSES[args.model_type]
|
||||
config = AutoConfig.from_pretrained(model_name, torchscript=use_torchscript, cache_dir=cache_dir)
|
||||
model = model_class.from_pretrained(model_name, config=config, cache_dir=cache_dir)
|
||||
tokenizer = tokenizer_class.from_pretrained(model_name, cache_dir=cache_dir)
|
||||
#if use_torchscript:
|
||||
# model = torch.jit.trace(model, (input_ids, past))
|
||||
|
||||
tokenizer = tokenizer_class.from_pretrained(model_name_or_path, cache_dir=cache_dir)
|
||||
model = model_class.from_pretrained(model_name_or_path, cache_dir=cache_dir)
|
||||
model.eval().cpu()
|
||||
|
||||
inputs = tokenizer.encode_plus("Here is an example input for GPT2 model",
|
||||
add_special_tokens=True,
|
||||
return_tensors='pt')
|
||||
input_ids = inputs['input_ids']
|
||||
outputs = model(input_ids=input_ids, past=None)
|
||||
|
||||
num_layer = model.config.n_layer
|
||||
present_names = [f'present_{i}' for i in range(num_layer)]
|
||||
output_names = ["last_state"] + present_names
|
||||
|
||||
input_names = ['input_ids']
|
||||
dynamic_axes = {'input_ids': {0: 'batch_size', 1: 'seq_len'}, 'last_state': {0: 'batch_size', 1: 'seq_len'}}
|
||||
for name in present_names:
|
||||
dynamic_axes[name] = {1: 'batch_size', 3: 'seq_len'}
|
||||
|
||||
if enable_past_input:
|
||||
past_names = [f'past_{i}' for i in range(num_layer)]
|
||||
input_names = ['input_ids'] + past_names
|
||||
dummy_past = [torch.zeros(list(outputs[1][0].shape)) for _ in range(num_layer)]
|
||||
for name in past_names:
|
||||
dynamic_axes[name] = {1: 'batch_size', 3: 'seq_len'}
|
||||
export_inputs = (inputs['input_ids'], tuple(dummy_past))
|
||||
else:
|
||||
export_inputs = (inputs['input_ids'])
|
||||
|
||||
export_model_path = os.path.join(output_dir, 'gpt2_past{}.onnx'.format(int(enable_past_input)))
|
||||
|
||||
torch.onnx.export(model,
|
||||
args=export_inputs,
|
||||
f=export_model_path,
|
||||
input_names=input_names,
|
||||
output_names=output_names,
|
||||
dynamic_axes=dynamic_axes,
|
||||
opset_version=11,
|
||||
do_constant_folding=True,
|
||||
verbose=False)
|
||||
|
||||
# Let's run performance test on PyTorch before updating environment variable.
|
||||
past = dummy_past if enable_past_input else None
|
||||
outputs = pytorch_inference(model, input_ids, past, total_runs=args.total_runs)
|
||||
device = torch.device("cuda:0" if args.use_gpu else "cpu")
|
||||
export_model_path = export_onnx(model, config, tokenizer, device, output_dir)
|
||||
|
||||
# setup environment variables before importing onnxruntime.
|
||||
setup_environment(args.use_openmp)
|
||||
setup_environment()
|
||||
import onnxruntime
|
||||
|
||||
if enable_past_input:
|
||||
if not args.optimize_onnx:
|
||||
onnx_model_path = export_model_path
|
||||
else:
|
||||
onnx_model_path = os.path.join(output_dir, 'gpt2_past{}_out1.onnx'.format(int(enable_past_input)))
|
||||
remove_past_outputs(export_model_path, onnx_model_path)
|
||||
|
||||
if args.enable_optimization:
|
||||
from optimizer import optimize_model
|
||||
m = optimize_model(onnx_model_path,
|
||||
m = optimize_model(export_model_path,
|
||||
model_type='gpt2',
|
||||
num_heads=12,
|
||||
hidden_size=768,
|
||||
num_heads=config.num_attention_heads,
|
||||
hidden_size=config.hidden_size,
|
||||
opt_level=0,
|
||||
optimization_options=None)
|
||||
onnx_model_path = os.path.join(output_dir, 'gpt2_past{}_optimized.onnx'.format(int(enable_past_input)))
|
||||
optimization_options=None,
|
||||
use_gpu=args.use_gpu)
|
||||
onnx_model_path = os.path.join(output_dir, 'gpt2_past_optimized.onnx')
|
||||
m.save_model_to_file(onnx_model_path)
|
||||
|
||||
if 'CUDAExecutionProvider' in onnxruntime.get_available_providers():
|
||||
logger.warning(
|
||||
"onnxruntime-gpu is not built with OpenMP. You might try onnxruntime package to test CPU inference.")
|
||||
if args.use_gpu and 'CUDAExecutionProvider' not in onnxruntime.get_available_providers():
|
||||
logger.warning("Please install onnxruntime-gpu package to test GPU inference.")
|
||||
|
||||
sess_options = onnxruntime.SessionOptions()
|
||||
|
||||
if args.use_openmp:
|
||||
sess_options.intra_op_num_threads = 1
|
||||
else:
|
||||
sess_options.intra_op_num_threads = psutil.cpu_count(logical=True)
|
||||
logger.info(f"session option: intra_op_num_threads={sess_options.intra_op_num_threads}")
|
||||
sess_options.intra_op_num_threads = psutil.cpu_count(logical=True)
|
||||
logger.info(f"Session option: intra_op_num_threads={sess_options.intra_op_num_threads}")
|
||||
|
||||
logger.info(f"Start inferencing onnx model: {onnx_model_path}")
|
||||
session = onnxruntime.InferenceSession(onnx_model_path, sess_options, providers=['CPUExecutionProvider'])
|
||||
session = onnxruntime.InferenceSession(onnx_model_path, sess_options)
|
||||
|
||||
ort_outputs = onnxruntime_inference(session, input_ids, past, args.total_runs)
|
||||
if args.verify_outputs:
|
||||
logger.info('PyTorch and OnnxRuntime output 0 (last_state) are close:'.format(0),
|
||||
numpy.allclose(ort_outputs[0], outputs[0].cpu(), rtol=1e-05, atol=1e-04))
|
||||
|
||||
for layer in range(model.config.n_layer):
|
||||
logger.info('PyTorch and OnnxRuntime layer {} state (present_{}) are close:'.format(layer, layer),
|
||||
numpy.allclose(ort_outputs[1 + layer], outputs[1][layer].cpu(), rtol=1e-05, atol=1e-04))
|
||||
for batch_size in args.batch_sizes:
|
||||
for sequence_length in args.sequence_lengths:
|
||||
past_shape = [
|
||||
2, batch_size, config.num_attention_heads, sequence_length,
|
||||
int(config.hidden_size / config.num_attention_heads)
|
||||
]
|
||||
dummy_past = [torch.rand(past_shape, dtype=torch.float32, device=device) for _ in range(config.n_layer)]
|
||||
dummy_input_ids = torch.randint(low=0,
|
||||
high=model.config.vocab_size - 1,
|
||||
size=(batch_size, 1),
|
||||
dtype=torch.int64,
|
||||
device=device)
|
||||
torch_latency, ort_latency = inference(model,
|
||||
session,
|
||||
dummy_input_ids,
|
||||
dummy_past,
|
||||
args.test_times,
|
||||
verify_outputs=args.validate_onnx)
|
||||
logger.info(
|
||||
f"batch_size={batch_size}, sequence_length={sequence_length}, torch_latency={torch_latency}, ort_latency={ort_latency}"
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
|
|
|||
Loading…
Reference in a new issue