Update benchmark of gpt2 model with past state (#4043)

* update benchmark_gpt2 to use past state only
* update dynamic axes of input/output tensors
* Remove the --use_openmp option, since OpenMP is enabled by default in the onnxruntime 1.3 CPU package.
* Use same option names as benchmark.py
This commit is contained in:
Tianlei Wu 2020-05-28 13:55:43 -07:00 committed by GitHub
parent ed0a8e5b5c
commit 60fa4b1f90
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 170 additions and 124 deletions

View file

@ -224,7 +224,7 @@ class BertOnnxModel(OnnxModel):
self.clean_graph()
self.prune_graph()
def optimize(self, options: BertOptimizationOptions = None):
def optimize(self, options: BertOptimizationOptions = None, add_dynamic_axes=False):
if (options is None) or options.enable_layer_norm:
self.fuse_layer_norm()
@ -263,7 +263,8 @@ class BertOnnxModel(OnnxModel):
self.remove_unused_constant()
# Use symbolic batch dimension in input and output.
self.use_dynamic_axes()
if add_dynamic_axes:
self.use_dynamic_axes()
logger.info(f"opset verion: {self.model.opset_import[0].version}")

View file

@ -3,6 +3,9 @@
# Licensed under the MIT License. See License.txt in the project root for
# license information.
# --------------------------------------------------------------------------
# This script benchmarks gpt2 model with past state.
# For gpt2 model without past state, use benchmark.py to measure performance.
import os
import sys
import numpy
@ -11,14 +14,14 @@ import psutil
import argparse
import logging
import torch
from transformers import GPT2Model, GPT2LMHeadModel, GPT2Tokenizer
from transformers import GPT2Model, GPT2LMHeadModel, GPT2Tokenizer, AutoConfig
logger = logging.getLogger('')
# Map alias to a tuple of Model Class and pretrained model name
MODEL_CLASSES = {
"gpt2": (GPT2Model, GPT2Tokenizer, "gpt2"),
"distilgpt2": (GPT2LMHeadModel, GPT2Tokenizer, "distilgpt2")
"distilgpt2": (GPT2LMHeadModel, GPT2Tokenizer, "distilgpt2"),
}
@ -34,13 +37,9 @@ def dump_environment():
logger.info("no environment variable of OMP_WAIT_POLICY")
def setup_environment(use_openmp=False):
def setup_environment():
# ATTENTION: these environment variables must be set before importing onnxruntime.
if use_openmp:
os.environ["OMP_NUM_THREADS"] = str(psutil.cpu_count(logical=True))
else:
os.environ["OMP_NUM_THREADS"] = '1'
os.environ["OMP_NUM_THREADS"] = str(psutil.cpu_count(logical=True))
os.environ["OMP_WAIT_POLICY"] = 'ACTIVE'
dump_environment()
@ -53,15 +52,15 @@ def pytorch_inference(model, input_ids, past=None, total_runs=100):
outputs = model(input_ids=input_ids, past=past)
latency.append(time.time() - start)
logger.info("PyTorch Inference time = {} ms".format(format(sum(latency) * 1000 / len(latency), '.2f')))
return outputs
average_latency = sum(latency) * 1000 / len(latency)
logger.debug("PyTorch Inference time = {} ms".format(format(average_latency, '.2f')))
return outputs, average_latency
def onnxruntime_inference(ort_session, input_ids, past=None, total_runs=100):
# Use contiguous array as input might improve performance.
# You can check the results from performance test tool to see whether you need it.
ort_inputs = {'input_ids': numpy.ascontiguousarray(input_ids.cpu().numpy())}
# TODO: pass input tensor stored in GPU
if past is not None:
for i, past_i in enumerate(past):
ort_inputs[f'past_{i}'] = numpy.ascontiguousarray(past[i].cpu().numpy())
@ -72,49 +71,77 @@ def onnxruntime_inference(ort_session, input_ids, past=None, total_runs=100):
ort_outputs = ort_session.run(None, ort_inputs)
latency.append(time.time() - start)
logger.info("OnnxRuntime Inference time = {} ms".format(format(sum(latency) * 1000 / len(latency), '.2f')))
average_latency = sum(latency) * 1000 / len(latency)
logger.debug("OnnxRuntime Inference time = {} ms".format(format(average_latency, '.2f')))
return ort_outputs
return ort_outputs, average_latency
def inference(model, ort_session, input_ids, past=None, total_runs=100, verify_outputs=True):
outputs = pytorch_inference(model, input_ids, past, total_runs)
ort_outputs = onnxruntime_inference(ort_session, input_ids, past, total_runs)
outputs, torch_latency = pytorch_inference(model, input_ids, past, total_runs)
ort_outputs, ort_latency = onnxruntime_inference(ort_session, input_ids, past, total_runs)
if verify_outputs:
logger.info('PyTorch and OnnxRuntime output 0 (last_state) are close:'.format(0),
numpy.allclose(ort_outputs[0], outputs[0].cpu(), rtol=1e-05, atol=1e-04))
is_close = numpy.allclose(ort_outputs[0], outputs[0].cpu(), rtol=1e-05, atol=1e-04)
logger.debug(f'PyTorch and OnnxRuntime output 0 (last_state) are close: {is_close}')
is_all_close = is_close
for layer in range(model.config.n_layer):
logger.info('PyTorch and OnnxRuntime layer {} state (present_{}) are close:'.format(layer, layer),
numpy.allclose(ort_outputs[1 + layer], outputs[1][layer].cpu(), rtol=1e-05, atol=1e-04))
is_close = numpy.allclose(ort_outputs[1 + layer], outputs[1][layer].cpu(), rtol=1e-05, atol=1e-04)
logger.debug(f'PyTorch and OnnxRuntime layer {layer} state (present_{layer}) are close:{is_close}')
is_all_close = is_all_close and is_close
if not is_all_close:
logger.warning(f'PyTorch and OnnxRuntime results are not all close.')
return torch_latency, ort_latency
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('--model_type',
parser.add_argument('-m',
'--model_type',
required=True,
type=str,
choices=list(MODEL_CLASSES.keys()),
help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
help='Model type selected in the list: ' + ', '.join(MODEL_CLASSES.keys()))
parser.add_argument('--cache_dir', required=True, type=str, help="cache directory")
parser.add_argument('-c',
'--cache_dir',
required=False,
type=str,
default='./cache_models',
help='Directory to cache pre-trained models')
parser.add_argument('--output_dir', required=True, type=str, help="output onnx model directory")
parser.add_argument('--onnx_dir',
required=False,
type=str,
default='./onnx_models',
help='Directory to store onnx models')
parser.add_argument('--total_runs', required=False, type=int, help="total runs", default=100)
parser.add_argument('-t',
'--test_times',
required=False,
default=100,
type=int,
help='Number of repeat times to get average inference latency.')
parser.add_argument('--enable_past_input', required=False, action='store_true')
parser.set_defaults(enable_past_input=False)
parser.add_argument('-v', '--validate_onnx', required=False, action='store_true', help='Validate ONNX model')
parser.add_argument('--enable_optimization', required=False, action='store_true')
parser.set_defaults(enable_optimization=False)
parser.add_argument('-o',
'--optimize_onnx',
required=False,
action='store_true',
help='Use optimizer.py to optimize onnx model')
parser.set_defaults(optimize_onnx=False)
parser.add_argument('--verify_outputs', required=False, action='store_true')
parser.set_defaults(verify_outputs=False)
parser.add_argument('--use_gpu', required=False, action='store_true')
parser.set_defaults(use_gpu=False)
parser.add_argument('--use_openmp', required=False, action='store_true')
parser.set_defaults(use_openmp=False)
parser.add_argument('-b', '--batch_sizes', nargs='+', type=int, default=[1])
parser.add_argument('-s', '--sequence_lengths', nargs='+', type=int, default=[8, 16, 32, 64, 128, 256])
parser.add_argument('--verbose', required=False, action='store_true')
parser.set_defaults(verbose=False)
@ -133,6 +160,7 @@ def setup_logger(verbose=True):
else:
log_handler.setFormatter(logging.Formatter('%(filename)20s: %(message)s'))
logging_level = logging.INFO
logging.getLogger("transformers").setLevel(logging.ERROR)
log_handler.setLevel(logging_level)
# Avoid duplicated handlers when running this script in multiple cells of Jupyter Notebook.
@ -142,21 +170,71 @@ def setup_logger(verbose=True):
logger.setLevel(logging_level)
def remove_past_outputs(export_model_path, output_model_path):
from onnx import ModelProto
from OnnxModel import OnnxModel
def export_onnx(model, config, tokenizer, device, output_dir):
model.to(device)
model = ModelProto()
with open(export_model_path, "rb") as f:
model.ParseFromString(f.read())
bert_model = OnnxModel(model)
inputs = tokenizer.encode_plus("Here is an example input for GPT2 model",
add_special_tokens=True,
return_tensors='pt')
input_ids = inputs['input_ids'].to(device)
logger.debug(f"input_ids={input_ids}")
outputs = model(input_ids=input_ids, past=None)
assert len(outputs) == 2
logger.debug(f"output 0 shape={outputs[0].shape}")
logger.debug(f"outputs[1][0] shape={outputs[1][0].shape}")
# remove past state outputs and only keep the first output.
keep_output_names = [bert_model.model.graph.output[0].name]
logger.info(f"Prune graph to keep the first output and drop past state outputs:{keep_output_names}")
bert_model.prune_graph(keep_output_names)
num_layer = model.config.n_layer
present_names = [f'present_{i}' for i in range(num_layer)]
output_names = ["last_state"] + present_names
bert_model.save_model_to_file(output_model_path)
input_names = ['input_ids']
# input_ids has only one word for model with past state.
# Shape of input tensors:
# input_ids: (batch_size, 1)
# past_{i}: (2, batch_size, num_heads, seq_len, hidden_size/num_heads)
# Shape of output tensors:
# last_state: (batch_size, seq_len + 1, hidden_size)
# present_{i}: (2, batch_size, num_heads, seq_len + 1, hidden_size/num_heads)
dynamic_axes = {'input_ids': {0: 'batch_size'}, 'last_state': {0: 'batch_size', 1: 'seq_len_plus_1'}}
for name in present_names:
dynamic_axes[name] = {1: 'batch_size', 3: 'seq_len_plus_1'}
past_names = [f'past_{i}' for i in range(num_layer)]
input_names = ['input_ids'] + past_names
dummy_past = [torch.zeros(list(outputs[1][0].shape), dtype=torch.float32, device=device) for _ in range(num_layer)]
for name in past_names:
dynamic_axes[name] = {1: 'batch_size', 3: 'seq_len'}
logger.debug(f"vocab_size:{model.config.vocab_size}")
dummy_input_ids = torch.randint(low=0,
high=model.config.vocab_size - 1,
size=(1, 1),
dtype=torch.int64,
device=device)
logger.debug(f"dummy_input_ids={dummy_input_ids}")
export_inputs = (dummy_input_ids, tuple(dummy_past))
export_model_path = os.path.join(output_dir, 'gpt2_past.onnx')
# Let's run performance test on PyTorch before updating environment variable.
with torch.no_grad():
outputs = model(input_ids=dummy_input_ids, past=dummy_past)
logger.debug(f"present_0 shape={outputs[1][0].shape}")
torch.onnx.export(model,
args=export_inputs,
f=export_model_path,
input_names=input_names,
output_names=output_names,
example_outputs=outputs,
dynamic_axes=dynamic_axes,
opset_version=11,
do_constant_folding=True,
verbose=False)
return export_model_path
def main():
@ -164,107 +242,74 @@ def main():
setup_logger(args.verbose)
dump_environment()
enable_past_input = args.enable_past_input
cache_dir = args.cache_dir
if not os.path.exists(cache_dir):
os.makedirs(cache_dir)
output_dir = args.output_dir
output_dir = args.onnx_dir
if not os.path.exists(output_dir):
os.makedirs(output_dir)
(model_class, tokenizer_class, model_name_or_path) = MODEL_CLASSES[args.model_type]
use_torchscript = False
(model_class, tokenizer_class, model_name) = MODEL_CLASSES[args.model_type]
config = AutoConfig.from_pretrained(model_name, torchscript=use_torchscript, cache_dir=cache_dir)
model = model_class.from_pretrained(model_name, config=config, cache_dir=cache_dir)
tokenizer = tokenizer_class.from_pretrained(model_name, cache_dir=cache_dir)
#if use_torchscript:
# model = torch.jit.trace(model, (input_ids, past))
tokenizer = tokenizer_class.from_pretrained(model_name_or_path, cache_dir=cache_dir)
model = model_class.from_pretrained(model_name_or_path, cache_dir=cache_dir)
model.eval().cpu()
inputs = tokenizer.encode_plus("Here is an example input for GPT2 model",
add_special_tokens=True,
return_tensors='pt')
input_ids = inputs['input_ids']
outputs = model(input_ids=input_ids, past=None)
num_layer = model.config.n_layer
present_names = [f'present_{i}' for i in range(num_layer)]
output_names = ["last_state"] + present_names
input_names = ['input_ids']
dynamic_axes = {'input_ids': {0: 'batch_size', 1: 'seq_len'}, 'last_state': {0: 'batch_size', 1: 'seq_len'}}
for name in present_names:
dynamic_axes[name] = {1: 'batch_size', 3: 'seq_len'}
if enable_past_input:
past_names = [f'past_{i}' for i in range(num_layer)]
input_names = ['input_ids'] + past_names
dummy_past = [torch.zeros(list(outputs[1][0].shape)) for _ in range(num_layer)]
for name in past_names:
dynamic_axes[name] = {1: 'batch_size', 3: 'seq_len'}
export_inputs = (inputs['input_ids'], tuple(dummy_past))
else:
export_inputs = (inputs['input_ids'])
export_model_path = os.path.join(output_dir, 'gpt2_past{}.onnx'.format(int(enable_past_input)))
torch.onnx.export(model,
args=export_inputs,
f=export_model_path,
input_names=input_names,
output_names=output_names,
dynamic_axes=dynamic_axes,
opset_version=11,
do_constant_folding=True,
verbose=False)
# Let's run performance test on PyTorch before updating environment variable.
past = dummy_past if enable_past_input else None
outputs = pytorch_inference(model, input_ids, past, total_runs=args.total_runs)
device = torch.device("cuda:0" if args.use_gpu else "cpu")
export_model_path = export_onnx(model, config, tokenizer, device, output_dir)
# setup environment variables before importing onnxruntime.
setup_environment(args.use_openmp)
setup_environment()
import onnxruntime
if enable_past_input:
if not args.optimize_onnx:
onnx_model_path = export_model_path
else:
onnx_model_path = os.path.join(output_dir, 'gpt2_past{}_out1.onnx'.format(int(enable_past_input)))
remove_past_outputs(export_model_path, onnx_model_path)
if args.enable_optimization:
from optimizer import optimize_model
m = optimize_model(onnx_model_path,
m = optimize_model(export_model_path,
model_type='gpt2',
num_heads=12,
hidden_size=768,
num_heads=config.num_attention_heads,
hidden_size=config.hidden_size,
opt_level=0,
optimization_options=None)
onnx_model_path = os.path.join(output_dir, 'gpt2_past{}_optimized.onnx'.format(int(enable_past_input)))
optimization_options=None,
use_gpu=args.use_gpu)
onnx_model_path = os.path.join(output_dir, 'gpt2_past_optimized.onnx')
m.save_model_to_file(onnx_model_path)
if 'CUDAExecutionProvider' in onnxruntime.get_available_providers():
logger.warning(
"onnxruntime-gpu is not built with OpenMP. You might try onnxruntime package to test CPU inference.")
if args.use_gpu and 'CUDAExecutionProvider' not in onnxruntime.get_available_providers():
logger.warning("Please install onnxruntime-gpu package to test GPU inference.")
sess_options = onnxruntime.SessionOptions()
if args.use_openmp:
sess_options.intra_op_num_threads = 1
else:
sess_options.intra_op_num_threads = psutil.cpu_count(logical=True)
logger.info(f"session option: intra_op_num_threads={sess_options.intra_op_num_threads}")
sess_options.intra_op_num_threads = psutil.cpu_count(logical=True)
logger.info(f"Session option: intra_op_num_threads={sess_options.intra_op_num_threads}")
logger.info(f"Start inferencing onnx model: {onnx_model_path}")
session = onnxruntime.InferenceSession(onnx_model_path, sess_options, providers=['CPUExecutionProvider'])
session = onnxruntime.InferenceSession(onnx_model_path, sess_options)
ort_outputs = onnxruntime_inference(session, input_ids, past, args.total_runs)
if args.verify_outputs:
logger.info('PyTorch and OnnxRuntime output 0 (last_state) are close:'.format(0),
numpy.allclose(ort_outputs[0], outputs[0].cpu(), rtol=1e-05, atol=1e-04))
for layer in range(model.config.n_layer):
logger.info('PyTorch and OnnxRuntime layer {} state (present_{}) are close:'.format(layer, layer),
numpy.allclose(ort_outputs[1 + layer], outputs[1][layer].cpu(), rtol=1e-05, atol=1e-04))
for batch_size in args.batch_sizes:
for sequence_length in args.sequence_lengths:
past_shape = [
2, batch_size, config.num_attention_heads, sequence_length,
int(config.hidden_size / config.num_attention_heads)
]
dummy_past = [torch.rand(past_shape, dtype=torch.float32, device=device) for _ in range(config.n_layer)]
dummy_input_ids = torch.randint(low=0,
high=model.config.vocab_size - 1,
size=(batch_size, 1),
dtype=torch.int64,
device=device)
torch_latency, ort_latency = inference(model,
session,
dummy_input_ids,
dummy_past,
args.test_times,
verify_outputs=args.validate_onnx)
logger.info(
f"batch_size={batch_size}, sequence_length={sequence_length}, torch_latency={torch_latency}, ort_latency={ort_latency}"
)
if __name__ == '__main__':