From 60fa4b1f905f55c9734bcd9bf3572f7bea738102 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Thu, 28 May 2020 13:55:43 -0700 Subject: [PATCH] Update benchmark of gpt2 model with past state (#4043) * update benchmark_gpt2 to use past state only * update dynamic axes of input/output tensors * Remove --use_openmp option since it is default for onnxruntime 1.3 cpu. * Use same option names as benchmark.py --- .../tools/transformers/BertOnnxModel.py | 5 +- .../tools/transformers/benchmark_gpt2.py | 289 ++++++++++-------- 2 files changed, 170 insertions(+), 124 deletions(-) diff --git a/onnxruntime/python/tools/transformers/BertOnnxModel.py b/onnxruntime/python/tools/transformers/BertOnnxModel.py index 971566c5eb..50ddd29263 100644 --- a/onnxruntime/python/tools/transformers/BertOnnxModel.py +++ b/onnxruntime/python/tools/transformers/BertOnnxModel.py @@ -224,7 +224,7 @@ class BertOnnxModel(OnnxModel): self.clean_graph() self.prune_graph() - def optimize(self, options: BertOptimizationOptions = None): + def optimize(self, options: BertOptimizationOptions = None, add_dynamic_axes=False): if (options is None) or options.enable_layer_norm: self.fuse_layer_norm() @@ -263,7 +263,8 @@ class BertOnnxModel(OnnxModel): self.remove_unused_constant() # Use symbolic batch dimension in input and output. - self.use_dynamic_axes() + if add_dynamic_axes: + self.use_dynamic_axes() logger.info(f"opset verion: {self.model.opset_import[0].version}") diff --git a/onnxruntime/python/tools/transformers/benchmark_gpt2.py b/onnxruntime/python/tools/transformers/benchmark_gpt2.py index 43eb2fe048..b24f35bae4 100644 --- a/onnxruntime/python/tools/transformers/benchmark_gpt2.py +++ b/onnxruntime/python/tools/transformers/benchmark_gpt2.py @@ -3,6 +3,9 @@ # Licensed under the MIT License. See License.txt in the project root for # license information. # -------------------------------------------------------------------------- +# This script benchmarks gpt2 model with past state. +# For gpt2 model without past state, use benchmark.py to measure performance. + import os import sys import numpy @@ -11,14 +14,14 @@ import psutil import argparse import logging import torch -from transformers import GPT2Model, GPT2LMHeadModel, GPT2Tokenizer +from transformers import GPT2Model, GPT2LMHeadModel, GPT2Tokenizer, AutoConfig logger = logging.getLogger('') # Map alias to a tuple of Model Class and pretrained model name MODEL_CLASSES = { "gpt2": (GPT2Model, GPT2Tokenizer, "gpt2"), - "distilgpt2": (GPT2LMHeadModel, GPT2Tokenizer, "distilgpt2") + "distilgpt2": (GPT2LMHeadModel, GPT2Tokenizer, "distilgpt2"), } @@ -34,13 +37,9 @@ def dump_environment(): logger.info("no environment variable of OMP_WAIT_POLICY") -def setup_environment(use_openmp=False): +def setup_environment(): # ATTENTION: these environment variables must be set before importing onnxruntime. - if use_openmp: - os.environ["OMP_NUM_THREADS"] = str(psutil.cpu_count(logical=True)) - else: - os.environ["OMP_NUM_THREADS"] = '1' - + os.environ["OMP_NUM_THREADS"] = str(psutil.cpu_count(logical=True)) os.environ["OMP_WAIT_POLICY"] = 'ACTIVE' dump_environment() @@ -53,15 +52,15 @@ def pytorch_inference(model, input_ids, past=None, total_runs=100): outputs = model(input_ids=input_ids, past=past) latency.append(time.time() - start) - logger.info("PyTorch Inference time = {} ms".format(format(sum(latency) * 1000 / len(latency), '.2f'))) - return outputs + average_latency = sum(latency) * 1000 / len(latency) + logger.debug("PyTorch Inference time = {} ms".format(format(average_latency, '.2f'))) + return outputs, average_latency def onnxruntime_inference(ort_session, input_ids, past=None, total_runs=100): - # Use contiguous array as input might improve performance. - # You can check the results from performance test tool to see whether you need it. ort_inputs = {'input_ids': numpy.ascontiguousarray(input_ids.cpu().numpy())} + # TODO: pass input tensor stored in GPU if past is not None: for i, past_i in enumerate(past): ort_inputs[f'past_{i}'] = numpy.ascontiguousarray(past[i].cpu().numpy()) @@ -72,49 +71,77 @@ def onnxruntime_inference(ort_session, input_ids, past=None, total_runs=100): ort_outputs = ort_session.run(None, ort_inputs) latency.append(time.time() - start) - logger.info("OnnxRuntime Inference time = {} ms".format(format(sum(latency) * 1000 / len(latency), '.2f'))) + average_latency = sum(latency) * 1000 / len(latency) + logger.debug("OnnxRuntime Inference time = {} ms".format(format(average_latency, '.2f'))) - return ort_outputs + return ort_outputs, average_latency def inference(model, ort_session, input_ids, past=None, total_runs=100, verify_outputs=True): - outputs = pytorch_inference(model, input_ids, past, total_runs) - ort_outputs = onnxruntime_inference(ort_session, input_ids, past, total_runs) + outputs, torch_latency = pytorch_inference(model, input_ids, past, total_runs) + ort_outputs, ort_latency = onnxruntime_inference(ort_session, input_ids, past, total_runs) if verify_outputs: - logger.info('PyTorch and OnnxRuntime output 0 (last_state) are close:'.format(0), - numpy.allclose(ort_outputs[0], outputs[0].cpu(), rtol=1e-05, atol=1e-04)) + is_close = numpy.allclose(ort_outputs[0], outputs[0].cpu(), rtol=1e-05, atol=1e-04) + logger.debug(f'PyTorch and OnnxRuntime output 0 (last_state) are close: {is_close}') + + is_all_close = is_close for layer in range(model.config.n_layer): - logger.info('PyTorch and OnnxRuntime layer {} state (present_{}) are close:'.format(layer, layer), - numpy.allclose(ort_outputs[1 + layer], outputs[1][layer].cpu(), rtol=1e-05, atol=1e-04)) + is_close = numpy.allclose(ort_outputs[1 + layer], outputs[1][layer].cpu(), rtol=1e-05, atol=1e-04) + logger.debug(f'PyTorch and OnnxRuntime layer {layer} state (present_{layer}) are close:{is_close}') + is_all_close = is_all_close and is_close + + if not is_all_close: + logger.warning(f'PyTorch and OnnxRuntime results are not all close.') + + return torch_latency, ort_latency def parse_arguments(): parser = argparse.ArgumentParser() - parser.add_argument('--model_type', + parser.add_argument('-m', + '--model_type', required=True, type=str, choices=list(MODEL_CLASSES.keys()), - help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) + help='Model type selected in the list: ' + ', '.join(MODEL_CLASSES.keys())) - parser.add_argument('--cache_dir', required=True, type=str, help="cache directory") + parser.add_argument('-c', + '--cache_dir', + required=False, + type=str, + default='./cache_models', + help='Directory to cache pre-trained models') - parser.add_argument('--output_dir', required=True, type=str, help="output onnx model directory") + parser.add_argument('--onnx_dir', + required=False, + type=str, + default='./onnx_models', + help='Directory to store onnx models') - parser.add_argument('--total_runs', required=False, type=int, help="total runs", default=100) + parser.add_argument('-t', + '--test_times', + required=False, + default=100, + type=int, + help='Number of repeat times to get average inference latency.') - parser.add_argument('--enable_past_input', required=False, action='store_true') - parser.set_defaults(enable_past_input=False) + parser.add_argument('-v', '--validate_onnx', required=False, action='store_true', help='Validate ONNX model') - parser.add_argument('--enable_optimization', required=False, action='store_true') - parser.set_defaults(enable_optimization=False) + parser.add_argument('-o', + '--optimize_onnx', + required=False, + action='store_true', + help='Use optimizer.py to optimize onnx model') + parser.set_defaults(optimize_onnx=False) - parser.add_argument('--verify_outputs', required=False, action='store_true') - parser.set_defaults(verify_outputs=False) + parser.add_argument('--use_gpu', required=False, action='store_true') + parser.set_defaults(use_gpu=False) - parser.add_argument('--use_openmp', required=False, action='store_true') - parser.set_defaults(use_openmp=False) + parser.add_argument('-b', '--batch_sizes', nargs='+', type=int, default=[1]) + + parser.add_argument('-s', '--sequence_lengths', nargs='+', type=int, default=[8, 16, 32, 64, 128, 256]) parser.add_argument('--verbose', required=False, action='store_true') parser.set_defaults(verbose=False) @@ -133,6 +160,7 @@ def setup_logger(verbose=True): else: log_handler.setFormatter(logging.Formatter('%(filename)20s: %(message)s')) logging_level = logging.INFO + logging.getLogger("transformers").setLevel(logging.ERROR) log_handler.setLevel(logging_level) # Avoid duplicated handlers when runing this script in multiple cells of Jupyter Notebook. @@ -142,21 +170,71 @@ def setup_logger(verbose=True): logger.setLevel(logging_level) -def remove_past_outputs(export_model_path, output_model_path): - from onnx import ModelProto - from OnnxModel import OnnxModel +def export_onnx(model, config, tokenizer, device, output_dir): + model.to(device) - model = ModelProto() - with open(export_model_path, "rb") as f: - model.ParseFromString(f.read()) - bert_model = OnnxModel(model) + inputs = tokenizer.encode_plus("Here is an example input for GPT2 model", + add_special_tokens=True, + return_tensors='pt') + input_ids = inputs['input_ids'].to(device) + logger.debug(f"input_ids={input_ids}") + outputs = model(input_ids=input_ids, past=None) + assert len(outputs) == 2 + logger.debug(f"output 0 shape={outputs[0].shape}") + logger.debug(f"outputs[1][0] shape={outputs[1][0].shape}") - # remove past state outputs and only keep the first output. - keep_output_names = [bert_model.model.graph.output[0].name] - logger.info(f"Prune graph to keep the first output and drop past state outputs:{keep_output_names}") - bert_model.prune_graph(keep_output_names) + num_layer = model.config.n_layer + present_names = [f'present_{i}' for i in range(num_layer)] + output_names = ["last_state"] + present_names - bert_model.save_model_to_file(output_model_path) + input_names = ['input_ids'] + + # input_ids has only one word for model with past state. + # Shape of input tensors: + # input_ids: (batch_size, 1) + # past_{i}: (2, batch_size, num_heads, seq_len, hidden_size/num_heads) + # Shape of output tensors: + # last_state: (batch_size, seq_len + 1, hidden_size) + # present_{i}: (2, batch_size, num_heads, seq_len + 1, hidden_size/num_heads) + dynamic_axes = {'input_ids': {0: 'batch_size'}, 'last_state': {0: 'batch_size', 1: 'seq_len_plus_1'}} + + for name in present_names: + dynamic_axes[name] = {1: 'batch_size', 3: 'seq_len_plus_1'} + + past_names = [f'past_{i}' for i in range(num_layer)] + input_names = ['input_ids'] + past_names + dummy_past = [torch.zeros(list(outputs[1][0].shape), dtype=torch.float32, device=device) for _ in range(num_layer)] + for name in past_names: + dynamic_axes[name] = {1: 'batch_size', 3: 'seq_len'} + logger.debug(f"vocab_size:{model.config.vocab_size}") + + dummy_input_ids = torch.randint(low=0, + high=model.config.vocab_size - 1, + size=(1, 1), + dtype=torch.int64, + device=device) + logger.debug(f"dummy_input_ids={dummy_input_ids}") + export_inputs = (dummy_input_ids, tuple(dummy_past)) + + export_model_path = os.path.join(output_dir, 'gpt2_past.onnx') + + # Let's run performance test on PyTorch before updating environment variable. + with torch.no_grad(): + outputs = model(input_ids=dummy_input_ids, past=dummy_past) + + logger.debug(f"present_0 shape={outputs[1][0].shape}") + + torch.onnx.export(model, + args=export_inputs, + f=export_model_path, + input_names=input_names, + output_names=output_names, + example_outputs=outputs, + dynamic_axes=dynamic_axes, + opset_version=11, + do_constant_folding=True, + verbose=False) + return export_model_path def main(): @@ -164,107 +242,74 @@ def main(): setup_logger(args.verbose) dump_environment() - enable_past_input = args.enable_past_input - cache_dir = args.cache_dir if not os.path.exists(cache_dir): os.makedirs(cache_dir) - output_dir = args.output_dir + output_dir = args.onnx_dir if not os.path.exists(output_dir): os.makedirs(output_dir) - (model_class, tokenizer_class, model_name_or_path) = MODEL_CLASSES[args.model_type] + use_torchscript = False + (model_class, tokenizer_class, model_name) = MODEL_CLASSES[args.model_type] + config = AutoConfig.from_pretrained(model_name, torchscript=use_torchscript, cache_dir=cache_dir) + model = model_class.from_pretrained(model_name, config=config, cache_dir=cache_dir) + tokenizer = tokenizer_class.from_pretrained(model_name, cache_dir=cache_dir) + #if use_torchscript: + # model = torch.jit.trace(model, (input_ids, past)) - tokenizer = tokenizer_class.from_pretrained(model_name_or_path, cache_dir=cache_dir) - model = model_class.from_pretrained(model_name_or_path, cache_dir=cache_dir) - model.eval().cpu() - - inputs = tokenizer.encode_plus("Here is an example input for GPT2 model", - add_special_tokens=True, - return_tensors='pt') - input_ids = inputs['input_ids'] - outputs = model(input_ids=input_ids, past=None) - - num_layer = model.config.n_layer - present_names = [f'present_{i}' for i in range(num_layer)] - output_names = ["last_state"] + present_names - - input_names = ['input_ids'] - dynamic_axes = {'input_ids': {0: 'batch_size', 1: 'seq_len'}, 'last_state': {0: 'batch_size', 1: 'seq_len'}} - for name in present_names: - dynamic_axes[name] = {1: 'batch_size', 3: 'seq_len'} - - if enable_past_input: - past_names = [f'past_{i}' for i in range(num_layer)] - input_names = ['input_ids'] + past_names - dummy_past = [torch.zeros(list(outputs[1][0].shape)) for _ in range(num_layer)] - for name in past_names: - dynamic_axes[name] = {1: 'batch_size', 3: 'seq_len'} - export_inputs = (inputs['input_ids'], tuple(dummy_past)) - else: - export_inputs = (inputs['input_ids']) - - export_model_path = os.path.join(output_dir, 'gpt2_past{}.onnx'.format(int(enable_past_input))) - - torch.onnx.export(model, - args=export_inputs, - f=export_model_path, - input_names=input_names, - output_names=output_names, - dynamic_axes=dynamic_axes, - opset_version=11, - do_constant_folding=True, - verbose=False) - - # Let's run performance test on PyTorch before updating environment variable. - past = dummy_past if enable_past_input else None - outputs = pytorch_inference(model, input_ids, past, total_runs=args.total_runs) + device = torch.device("cuda:0" if args.use_gpu else "cpu") + export_model_path = export_onnx(model, config, tokenizer, device, output_dir) # setup environment variables before importing onnxruntime. - setup_environment(args.use_openmp) + setup_environment() import onnxruntime - if enable_past_input: + if not args.optimize_onnx: onnx_model_path = export_model_path else: - onnx_model_path = os.path.join(output_dir, 'gpt2_past{}_out1.onnx'.format(int(enable_past_input))) - remove_past_outputs(export_model_path, onnx_model_path) - - if args.enable_optimization: from optimizer import optimize_model - m = optimize_model(onnx_model_path, + m = optimize_model(export_model_path, model_type='gpt2', - num_heads=12, - hidden_size=768, + num_heads=config.num_attention_heads, + hidden_size=config.hidden_size, opt_level=0, - optimization_options=None) - onnx_model_path = os.path.join(output_dir, 'gpt2_past{}_optimized.onnx'.format(int(enable_past_input))) + optimization_options=None, + use_gpu=args.use_gpu) + onnx_model_path = os.path.join(output_dir, 'gpt2_past_optimized.onnx') m.save_model_to_file(onnx_model_path) - if 'CUDAExecutionProvider' in onnxruntime.get_available_providers(): - logger.warning( - "onnxruntime-gpu is not built with OpenMP. You might try onnxruntime package to test CPU inference.") + if args.use_gpu and 'CUDAExecutionProvider' not in onnxruntime.get_available_providers(): + logger.warning("Please install onnxruntime-gpu package to test GPU inference.") sess_options = onnxruntime.SessionOptions() - - if args.use_openmp: - sess_options.intra_op_num_threads = 1 - else: - sess_options.intra_op_num_threads = psutil.cpu_count(logical=True) - logger.info(f"session option: intra_op_num_threads={sess_options.intra_op_num_threads}") + sess_options.intra_op_num_threads = psutil.cpu_count(logical=True) + logger.info(f"Session option: intra_op_num_threads={sess_options.intra_op_num_threads}") logger.info(f"Start inferencing onnx model: {onnx_model_path}") - session = onnxruntime.InferenceSession(onnx_model_path, sess_options, providers=['CPUExecutionProvider']) + session = onnxruntime.InferenceSession(onnx_model_path, sess_options) - ort_outputs = onnxruntime_inference(session, input_ids, past, args.total_runs) - if args.verify_outputs: - logger.info('PyTorch and OnnxRuntime output 0 (last_state) are close:'.format(0), - numpy.allclose(ort_outputs[0], outputs[0].cpu(), rtol=1e-05, atol=1e-04)) - - for layer in range(model.config.n_layer): - logger.info('PyTorch and OnnxRuntime layer {} state (present_{}) are close:'.format(layer, layer), - numpy.allclose(ort_outputs[1 + layer], outputs[1][layer].cpu(), rtol=1e-05, atol=1e-04)) + for batch_size in args.batch_sizes: + for sequence_length in args.sequence_lengths: + past_shape = [ + 2, batch_size, config.num_attention_heads, sequence_length, + int(config.hidden_size / config.num_attention_heads) + ] + dummy_past = [torch.rand(past_shape, dtype=torch.float32, device=device) for _ in range(config.n_layer)] + dummy_input_ids = torch.randint(low=0, + high=model.config.vocab_size - 1, + size=(batch_size, 1), + dtype=torch.int64, + device=device) + torch_latency, ort_latency = inference(model, + session, + dummy_input_ids, + dummy_past, + args.test_times, + verify_outputs=args.validate_onnx) + logger.info( + f"batch_size={batch_size}, sequence_length={sequence_length}, torch_latency={torch_latency}, ort_latency={ort_latency}" + ) if __name__ == '__main__':