Update transformers benchmark for transformers 4.3.* and ORT 1.7 (#6796)

* update benchmark for transformers 4.* and ORT 1.7

* Fix gpt2 onnx conversion for transformers 4.3.*. Add a check that the transformers version is >= 3.1.

* remove code related to openmp

* update pretrained model list: keep representative models only
This commit is contained in:
Tianlei Wu 2021-02-24 12:52:35 -08:00 committed by GitHub
parent 71a70ecf6e
commit f4acdb2ecd
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
12 changed files with 137 additions and 316 deletions

View file

@ -210,7 +210,7 @@ For GPU, please append --use_gpu to the command.
bert_perf_test.py can be used to check the BERT model inference performance. Below are examples:
```console
python -m onnxruntime.transformers.bert_perf_test --model optimized_model_cpu.onnx --batch_size 1 --sequence_length 128 --samples 100 --test_times 10 --inclusive
python -m onnxruntime.transformers.bert_perf_test --model optimized_model_cpu.onnx --batch_size 1 --sequence_length 128
```
For GPU, please append --use_gpu to the command.
@ -219,7 +219,7 @@ After test is finished, a file like perf_results_CPU_B1_S128_<date_time>.txt or
## Profiling
profiler.py can be used to run profiling on a transformer model. It can help figure out the bottleneck of a model, and time spent on a node or subgraph.
profiler.py can be used to run profiling on a transformer model. It can help figure out the bottleneck of a model, and CPU time spent on a node or subgraph.
Example commands:

View file

@ -80,9 +80,6 @@ def run_onnxruntime(use_gpu, model_names, model_class, precision, num_threads, b
)
return results
if (not use_gpu) and ('CUDAExecutionProvider' in onnxruntime.get_available_providers()):
logger.warning("Please install onnxruntime package instead of onnxruntime-gpu to get best cpu performance.")
for model_name in model_names:
all_input_names = MODELS[model_name][0]
for num_inputs in input_counts:

View file

@ -16,6 +16,7 @@ import argparse
import logging
import torch
import onnx
from packaging import version
from transformers import AutoConfig
from gpt2_helper import Gpt2Helper, MODEL_CLASSES, DEFAULT_TOLERANCE, PRETRAINED_GPT2_MODELS
from quantize_helper import QuantizeHelper
@ -113,6 +114,10 @@ def parse_arguments(argv=None):
def main(args):
from transformers import __version__ as transformers_version
if version.parse(transformers_version) < version.parse("3.1.0"): # past_key_values name does not exist in 3.0.2 or older
raise RuntimeError("This tool requires transformers 3.1.0 or later.")
logger.info(f"Arguments:{args}")
if args.precision == Precision.FLOAT16:
assert args.optimize_onnx and args.use_gpu, "fp16 requires --optimize_onnx --use_gpu"
@ -279,7 +284,7 @@ def main(args):
return csv_filename
if __name__ == '__main__':
if __name__ == '__main__':
args = parse_arguments()
setup_logger(args.verbose)
main(args)

View file

@ -35,44 +35,10 @@ class TestSetting:
sequence_length: int
test_cases: int
test_times: int
contiguous: bool
use_gpu: bool
warmup: bool
omp_num_threads: int
omp_wait_policy: str
intra_op_num_threads: int
seed: int
verbose: bool
contiguous: bool
inclusive: bool
extra_latency: float = 0
def get_setting(self) -> str:
return f"batch_size={self.batch_size},sequence_length={self.sequence_length},test_cases={self.test_cases},test_times={self.test_times},contiguous={self.contiguous},use_gpu={self.use_gpu},warmup={self.warmup}"
def check(self, intra_op_threads, omp_threads, omp_policy) -> bool:
if intra_op_threads is None:
if self.intra_op_num_threads is not None and self.intra_op_num_threads > 0:
return False
else:
assert intra_op_threads > 0
if not (self.intra_op_num_threads is None or self.intra_op_num_threads == intra_op_threads):
return False
if omp_threads is None:
if self.omp_num_threads is not None and self.omp_num_threads > 0:
return False
else:
assert omp_threads > 0
if not (self.omp_num_threads is None or self.omp_num_threads == omp_threads):
return False
if self.omp_wait_policy is not None:
if omp_policy != self.omp_wait_policy:
return False
return True
@dataclass
class ModelSetting:
@ -84,22 +50,17 @@ class ModelSetting:
def create_session(model_path, use_gpu, intra_op_num_threads, graph_optimization_level=None):
# Import onnxruntime shall be after OpenMP environment variable setting.
# So we put the import in function to delay importing instead of top of this script.
import onnxruntime
if use_gpu and ('CUDAExecutionProvider' not in onnxruntime.get_available_providers()):
print(
"Warning: Please install onnxruntime-gpu package instead of onnxruntime, and use a machine with GPU for testing gpu performance."
)
elif (not use_gpu) and ('CUDAExecutionProvider' in onnxruntime.get_available_providers()):
print("Warning: Please install onnxruntime package instead of onnxruntime-gpu to get best cpu performance.")
if intra_op_num_threads is None and graph_optimization_level is None:
session = onnxruntime.InferenceSession(model_path)
else:
execution_providers = ['CPUExecutionProvider'
] if not use_gpu else ['CUDAExecutionProvider', 'CPUExecutionProvider']
execution_providers = ['CPUExecutionProvider'] if not use_gpu else ['CUDAExecutionProvider', 'CPUExecutionProvider']
sess_options = onnxruntime.SessionOptions()
sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
@ -127,8 +88,8 @@ def create_session(model_path, use_gpu, intra_op_num_threads, graph_optimization
return session
def onnxruntime_inference(session, all_inputs, output_names, warmup=True):
if warmup and len(all_inputs) > 0:
def onnxruntime_inference(session, all_inputs, output_names):
if len(all_inputs) > 0:
# Use a random input as warm up.
session.run(output_names, random.choice(all_inputs))
@ -142,57 +103,16 @@ def onnxruntime_inference(session, all_inputs, output_names, warmup=True):
latency_list.append(latency)
return results, latency_list
def get_contiguous_inputs(all_inputs):
"""
Convert input to be contiguous.
"""
contiguous_inputs = []
start_time = timeit.default_timer()
for test_case_id, inputs in enumerate(all_inputs):
real_inputs = {}
for key, value in inputs.items():
real_inputs[key] = np.ascontiguousarray(value)
contiguous_inputs.append(real_inputs)
latency = timeit.default_timer() - start_time
average_latency_ms = latency / len(contiguous_inputs) * 1000
return contiguous_inputs, average_latency_ms
def to_string(model_path, session, test_setting):
sess_options = session.get_session_options()
option = "model={}".format(os.path.basename(model_path))
option += ",graph_optimization_level={},intra_op_num_threads={}".format(sess_options.graph_optimization_level,
option = "model={},".format(os.path.basename(model_path))
option += "graph_optimization_level={},intra_op_num_threads={},".format(sess_options.graph_optimization_level,
sess_options.intra_op_num_threads).replace(
'GraphOptimizationLevel.ORT_', '')
option += ",OMP_NUM_THREADS={}".format(os.environ["OMP_NUM_THREADS"] if "OMP_NUM_THREADS" in os.environ else "")
option += ",OMP_WAIT_POLICY={}".format(os.environ["OMP_WAIT_POLICY"] if "OMP_WAIT_POLICY" in os.environ else "")
option += ",{}".format(test_setting.get_setting())
option += f"batch_size={test_setting.batch_size},sequence_length={test_setting.sequence_length},test_cases={test_setting.test_cases},test_times={test_setting.test_times},use_gpu={test_setting.use_gpu}"
return option
def setup_openmp_environ(omp_num_threads, omp_wait_policy):
if omp_num_threads is None:
if "OMP_NUM_THREADS" in os.environ:
del os.environ["OMP_NUM_THREADS"]
else:
os.environ["OMP_NUM_THREADS"] = str(omp_num_threads)
if omp_wait_policy is None:
if "OMP_WAIT_POLICY" in os.environ:
del os.environ["OMP_WAIT_POLICY"]
else:
assert omp_wait_policy in ["ACTIVE", "PASSIVE"], f"{omp_wait_policy} is not a valid policy"
os.environ["OMP_WAIT_POLICY"] = omp_wait_policy
def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads, omp_num_threads,
omp_wait_policy):
# Environment variable shall be set before import onnxruntime.
setup_openmp_environ(omp_num_threads, omp_wait_policy)
def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads):
session = create_session(model_setting.model_path, test_setting.use_gpu, intra_op_num_threads,
model_setting.opt_level)
output_names = [output.name for output in session.get_outputs()]
@ -206,11 +126,11 @@ def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op
all_latency_list = []
for i in range(test_setting.test_times):
results, latency_list = onnxruntime_inference(session, all_inputs, output_names, test_setting.warmup)
results, latency_list = onnxruntime_inference(session, all_inputs, output_names)
all_latency_list.extend(latency_list)
# latency in milliseconds
latency_ms = np.array(all_latency_list) * 1000 + test_setting.extra_latency
latency_ms = np.array(all_latency_list) * 1000
average_latency = statistics.mean(latency_ms)
latency_50 = np.percentile(latency_ms, 50)
@ -226,91 +146,31 @@ def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op
format(throughput, '.2f')))
def launch_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads, omp_num_threads,
omp_wait_policy):
if not test_setting.check(intra_op_num_threads, omp_num_threads, omp_wait_policy):
return
def launch_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads):
process = multiprocessing.Process(target=run_one_test,
args=(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads,
omp_num_threads, omp_wait_policy))
args=(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads))
process.start()
process.join()
def run_perf_tests(model_setting, test_setting, perf_results, test_all, all_inputs):
def run_perf_tests(model_setting, test_setting, perf_results, all_inputs):
if (test_setting.intra_op_num_threads is not None):
launch_test(model_setting, test_setting, perf_results, all_inputs, test_setting.intra_op_num_threads)
return
cpu_count = psutil.cpu_count(logical=False)
logical_cores = psutil.cpu_count(logical=True)
candidate_threads = list(set([1, logical_cores, cpu_count]))
if (test_setting.intra_op_num_threads is not None) or (test_setting.omp_num_threads is not None):
if test_setting.intra_op_num_threads is not None:
intra_op_threads = [test_setting.intra_op_num_threads]
else:
intra_op_threads = [None] + candidate_threads
if test_setting.omp_num_threads is not None:
omp_threads = [test_setting.omp_num_threads]
else:
omp_threads = [None] + candidate_threads
if test_setting.omp_wait_policy is not None:
omp_policies = [test_setting.omp_wait_policy]
else:
omp_policies = [None, 'PASSIVE', 'ACTIVE']
for it in intra_op_threads:
for ot in omp_threads:
for op in omp_policies:
launch_test(model_setting, test_setting, perf_results, all_inputs, it, ot, op)
return
# Test a setting without any setting as baseline 1.
launch_test(model_setting, test_setting, perf_results, all_inputs, None, None, None)
if not test_setting.use_gpu:
# For CPU: intra_op_num_threads = 1, omp_num_threads=None, omp_wait_policy=None
# Another setting without environment variable as baseline 2.
launch_test(model_setting, test_setting, perf_results, all_inputs, 1, None, None)
else:
# For GPU, we test two more settings by default:
# (1) intra_op_num_threads = 1, omp_num_threads=cpu_count, omp_wait_policy=PASSIVE
# (2) intra_op_num_threads = logical_cores, omp_num_threads=1, omp_wait_policy=ACTIVE
launch_test(model_setting, test_setting, perf_results, all_inputs, 1, cpu_count, 'PASSIVE')
launch_test(model_setting, test_setting, perf_results, all_inputs, logical_cores, 1, 'ACTIVE')
# GPU latency is not sensitive to these settings. No need to test many combinations.
# Skip remaining settings for GPU without --all flag.
if test_setting.use_gpu and not test_all:
return
candidate_threads = list(set([logical_cores, cpu_count]))
for i in range(1, min(16, logical_cores)):
if i not in candidate_threads:
candidate_threads.append(i)
candidate_threads.sort(reverse=True)
for intra_op_num_threads in candidate_threads:
for omp_num_threads in candidate_threads:
# skip settings that are very slow
if intra_op_num_threads == 1 and omp_num_threads == 1 and logical_cores != 1:
continue
# When logical and physical cores are not the same, there are many combinations.
# Remove some settings that are normally not good.
if logical_cores > cpu_count:
if omp_num_threads == logical_cores and intra_op_num_threads != 1:
continue
if intra_op_num_threads == logical_cores and omp_num_threads != 1:
continue
if not test_all:
if intra_op_num_threads != 1 and omp_num_threads != 1:
continue
for omp_wait_policy in ['ACTIVE', 'PASSIVE']:
launch_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads,
omp_num_threads, omp_wait_policy)
def run_performance(model_setting, test_setting, perf_results, test_all):
launch_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads)
def run_performance(model_setting, test_setting, perf_results):
input_ids, segment_ids, input_mask = get_bert_inputs(model_setting.model_path, model_setting.input_ids_name,
model_setting.segment_ids_name, model_setting.input_mask_name)
@ -327,29 +187,25 @@ def run_performance(model_setting, test_setting, perf_results, test_all):
segment_ids,
input_mask,
random_mask_length=False)
if test_setting.contiguous:
all_inputs, contiguous_latency = get_contiguous_inputs(all_inputs)
print("Extra latency for converting inputs to contiguous: {} ms".format(format(contiguous_latency, '.2f')))
test_setting.extra_latency = contiguous_latency if test_setting.inclusive else 0
run_perf_tests(model_setting, test_setting, perf_results, test_all, all_inputs)
run_perf_tests(model_setting, test_setting, perf_results, all_inputs)
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('--model', required=True, type=str, help="bert onnx model path")
parser.add_argument('--batch_size',
parser.add_argument('-b', '--batch_size',
required=True,
type=int,
nargs="+",
help="batch size of input. Allow one or multiple values in the range of [1, 128].")
parser.add_argument('--sequence_length', required=True, type=int, help="maximum sequence length of input")
parser.add_argument('-s', '--sequence_length', required=True, type=int, help="maximum sequence length of input")
parser.add_argument('--samples', required=False, type=int, default=10, help="number of samples to be generated")
parser.add_argument('--test_times',
parser.add_argument('-t', '--test_times',
required=False,
type=int,
default=0,
@ -375,40 +231,12 @@ def parse_arguments():
parser.add_argument('--use_gpu', required=False, action='store_true', help="use GPU")
parser.set_defaults(use_gpu=False)
parser.add_argument('--inclusive',
required=False,
action='store_true',
help="include the latency of converting array to contiguous")
parser.set_defaults(inclusive=False)
parser.add_argument('--all', required=False, action='store_true', help="test all candidate settings")
parser.set_defaults(all=False)
parser.add_argument('--omp_num_threads',
required=False,
type=int,
default=None,
help=">0, set OMP_NUM_THREADS value. 0, do not set")
parser.add_argument('--intra_op_num_threads',
parser.add_argument('-n', '--intra_op_num_threads',
required=False,
type=int,
default=None,
help=">=0, set intra_op_num_threads")
parser.add_argument('--omp_wait_policy',
required=False,
type=str,
default=None,
choices=['ACTIVE', 'PASSIVE'],
help="OMP_WAIT_POLICY")
parser.add_argument('--contiguous', required=False, action='store_true', help="contiguous input")
parser.set_defaults(contiguous=False)
parser.add_argument('--no_warmup', required=False, action='store_true', help="do not use one sample for warm-up.")
parser.set_defaults(no_warmup=False)
parser.add_argument('--input_ids_name', required=False, type=str, default=None, help="input name for input ids")
parser.add_argument('--segment_ids_name', required=False, type=str, default=None, help="input name for segment ids")
parser.add_argument('--input_mask_name',
@ -443,18 +271,13 @@ def main():
args.sequence_length,
args.samples,
args.test_times,
None, #contiguous
args.use_gpu,
not args.no_warmup,
args.omp_num_threads,
args.omp_wait_policy,
args.intra_op_num_threads,
args.seed,
args.verbose,
args.contiguous,
args.inclusive)
args.verbose)
print("test setting", test_setting)
run_performance(model_setting, test_setting, perf_results, args.all)
run_performance(model_setting, test_setting, perf_results)
# Sort the results so that the first one has smallest latency.
sorted_results = sorted(perf_results.items(), reverse=False, key=lambda x: x[1])

View file

@ -140,7 +140,8 @@ def generate_test_data(batch_size, sequence_length, test_cases, seed, verbose, i
def get_graph_input_from_embed_node(onnx_model, embed_node, input_index):
assert input_index < len(embed_node.input)
if input_index >= len(embed_node.input):
return None
input = embed_node.input[input_index]
graph_input = onnx_model.find_graph_input(input)
@ -195,6 +196,15 @@ def find_bert_inputs(onnx_model, input_ids_name=None, segment_ids_name=None, inp
input_ids = get_graph_input_from_embed_node(onnx_model, embed_node, 0)
segment_ids = get_graph_input_from_embed_node(onnx_model, embed_node, 1)
input_mask = get_graph_input_from_embed_node(onnx_model, embed_node, 7)
if input_mask is None:
for input in graph_inputs:
input_name_lower = input.name.lower()
if "mask" in input_name_lower:
input_mask = input
if input_mask is None:
raise ValueError(f"Failed to find attention mask input")
return input_ids, segment_ids, input_mask
# Try to guess the inputs based on naming.
@ -231,7 +241,7 @@ def get_bert_inputs(onnx_file, input_ids_name=None, segment_ids_name=None, input
model.ParseFromString(f.read())
onnx_model = OnnxModel(model)
find_bert_inputs(onnx_model, input_ids_name, segment_ids_name, input_mask_name)
return find_bert_inputs(onnx_model, input_ids_name, segment_ids_name, input_mask_name)
def parse_arguments():

View file

@ -21,19 +21,17 @@ from datetime import datetime
from onnx import ModelProto, TensorProto, numpy_helper
from onnx_model import OnnxModel
from bert_test_data import get_bert_inputs, generate_test_data, output_test_data
from bert_perf_test import create_session, onnxruntime_inference, setup_openmp_environ
from bert_perf_test import create_session, onnxruntime_inference
def run_model(model_path, all_inputs, use_gpu, use_openmp, disable_optimization):
# Import onnxruntime shall be after OpenMP environment variable setting.
# So we put import here to delay importing.
def run_model(model_path, all_inputs, use_gpu, disable_optimization):
import onnxruntime
graph_optimization_level = None
if disable_optimization:
graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
intra_op_num_threads = 1 if use_openmp else psutil.cpu_count(logical=False)
intra_op_num_threads = psutil.cpu_count(logical=False)
session = create_session(model_path, use_gpu, intra_op_num_threads, graph_optimization_level)
@ -78,7 +76,7 @@ def compare(baseline_results, treatment_results, verbose, rtol=1e-3, atol=1e-4):
def run_test(baseline_model, optimized_model, output_dir, batch_size, sequence_length, use_gpu, test_cases, seed,
use_openmp, verbose, rtol, atol, input_ids_name, segment_ids_name, input_mask_name):
verbose, rtol, atol, input_ids_name, segment_ids_name, input_mask_name):
# Try deduce input names from optimized model.
input_ids, segment_ids, input_mask = get_bert_inputs(optimized_model, input_ids_name, segment_ids_name,
@ -95,16 +93,9 @@ def run_test(baseline_model, optimized_model, output_dir, batch_size, sequence_l
input_mask,
random_mask_length=True)
# OpenMP environment variables must be set before the very first "import onnxruntime"
if use_openmp:
setup_openmp_environ(omp_num_threads=psutil.cpu_count(logical=False), omp_wait_policy='ACTIVE')
else:
setup_openmp_environ(omp_num_threads=1, omp_wait_policy='ACTIVE')
baseline_results, baseline_latency, output_names = run_model(baseline_model,
all_inputs,
use_gpu,
use_openmp,
disable_optimization=True)
if verbose:
print("baseline average latency (all optimizations disabled): {} ms".format(
@ -117,7 +108,6 @@ def run_test(baseline_model, optimized_model, output_dir, batch_size, sequence_l
treatment_results, treatment_latency, treatment_output_names = run_model(optimized_model,
all_inputs,
use_gpu,
use_openmp,
disable_optimization=False)
if verbose:
print("treatment average latency: {} ms".format(statistics.mean(treatment_latency) * 1000))
@ -157,9 +147,6 @@ def parse_arguments():
parser.add_argument('--use_gpu', required=False, action='store_true', help="use GPU")
parser.set_defaults(use_gpu=False)
parser.add_argument('--openmp', required=False, action='store_true', help="use openmp")
parser.set_defaults(openmp=False)
parser.add_argument('--verbose', required=False, action='store_true', help="print verbose information")
parser.set_defaults(verbose=False)
@ -180,7 +167,7 @@ def main():
path.mkdir(parents=True, exist_ok=True)
run_test(args.baseline_model, args.optimized_model, args.output_dir, args.batch_size, args.sequence_length,
args.use_gpu, args.samples, args.seed, args.openmp, args.verbose, args.rtol, args.atol, args.input_ids,
args.use_gpu, args.samples, args.seed, args.verbose, args.rtol, args.atol, args.input_ids,
args.segment_ids, args.input_mask)

View file

@ -23,6 +23,7 @@ import torch
import numpy
import json
from pathlib import Path
from packaging import version
from transformers import AutoConfig
from gpt2_helper import Gpt2Helper, MODEL_CLASSES, DEFAULT_TOLERANCE, PRETRAINED_GPT2_MODELS
from gpt2_tester import Gpt2Tester
@ -104,6 +105,10 @@ def parse_arguments():
def main():
from transformers import __version__ as transformers_version
if version.parse(transformers_version) < version.parse("3.1.0"): # past_key_values name does not exist in 3.0.2 or older
raise RuntimeError("This tool requires transformers 3.1.0 or later.")
args = parse_arguments()
setup_logger(args.verbose)

View file

@ -7,10 +7,9 @@ REM Please install PyTorch (see https://pytorch.org/) before running this benchm
REM GPU: conda install pytorch torchvision cudatoolkit=10.1 -c pytorch
REM CPU: conda install pytorch torchvision cpuonly -c pytorch
REM When run_cli=true, this script is self-contained and you need not copy other files to run benchmarks
REM it will use onnxruntime-tools package.
REM If run_cli=false, it depends on other python script (*.py) files in this directory.
set run_cli=false
REM When use_package=true, you need not copy any other files to run benchmarks except this batch file.
REM Otherwise, it will use python script (*.py) files in this directory.
set use_package=false
REM only need once
set run_install=false
@ -72,13 +71,12 @@ if %run_install% == true (
)
pip install --upgrade onnxconverter_common
pip install --upgrade onnxruntime-tools
pip install --upgrade git+https://github.com/huggingface/transformers
pip install --upgrade transformers
)
if %run_cli% == true (
echo Use onnxruntime_tools.transformers.benchmark
set optimizer_script=-m onnxruntime_tools.transformers.benchmark
if %use_package% == true (
echo Use onnxruntime.transformers.benchmark
set optimizer_script=-m onnxruntime.transformers.benchmark
) else (
set optimizer_script=benchmark.py
)

View file

@ -31,7 +31,7 @@ class GPT2ModelNoPastState(GPT2Model):
super().__init__(config)
def forward(self, input_ids):
return super().forward(input_ids, use_cache=False)
return super().forward(input_ids, use_cache=False, return_dict=False)
class MyGPT2Model(GPT2Model):
@ -40,11 +40,26 @@ class MyGPT2Model(GPT2Model):
def __init__(self, config):
super().__init__(config)
@staticmethod
def post_process(result, num_layer):
if isinstance(result[1][0], tuple) or isinstance(result[1][0], list):
assert len(result[1]) == num_layer and len(result[1][0]) == 2 #and len(result[1][0][0].shape) == 4 and result[1][0][0].shape == result[1][0][1].shape
present = []
for i in range(num_layer):
# Since transformers v4.*, past keys and values are separate outputs.
# Here we concatenate them into one tensor to be compatible with the Attention operator.
present.append(torch.cat((result[1][i][0].unsqueeze(0), result[1][i][1].unsqueeze(0)), dim=0))
return (result[0], tuple(present))
return result
def forward(self, input_ids, position_ids, attention_mask, *past):
return super().forward(input_ids,
position_ids=position_ids,
attention_mask=attention_mask,
past_key_values=past)
result = super().forward(input_ids,
position_ids=position_ids,
attention_mask=attention_mask,
past_key_values=past,
return_dict=False)
return MyGPT2Model.post_process(result, self.config.n_layer)
class MyGPT2LMHeadModel(GPT2LMHeadModel):
@ -54,10 +69,13 @@ class MyGPT2LMHeadModel(GPT2LMHeadModel):
super().__init__(config)
def forward(self, input_ids, position_ids, attention_mask, *past):
return super().forward(input_ids,
position_ids=position_ids,
attention_mask=attention_mask,
past_key_values=past)
result = super().forward(input_ids,
position_ids=position_ids,
attention_mask=attention_mask,
past_key_values=past,
return_dict=False)
return MyGPT2Model.post_process(result, self.config.n_layer)
class MyGPT2LMHeadModel_NoPadding(GPT2LMHeadModel):
@ -216,6 +234,7 @@ class Gpt2Helper:
is_all_close = is_close
num_layers = len(ort_outputs) - 1
for layer in range(num_layers):
is_close = numpy.allclose(ort_outputs[1 + layer],
torch_outputs[1][layer].cpu().numpy(),
@ -288,10 +307,12 @@ class Gpt2Helper:
input_names.append('attention_mask')
input_names.extend(past_names)
assert len(outputs) == 2 and len(outputs[1]) == num_layer
logger.info(
f"Shapes: input_ids={dummy_inputs.input_ids.shape} past={dummy_inputs.past[0].shape} output={outputs[0].shape} present={outputs[1][0].shape}"
)
Path(onnx_model_path).parent.mkdir(parents=True, exist_ok=True)
torch.onnx.export(model,

View file

@ -26,67 +26,44 @@ MODELS = {
"bert-base-uncased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
"bert-large-uncased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
"bert-base-cased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
"bert-large-cased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
"bert-base-multilingual-uncased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
"bert-base-multilingual-cased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
"bert-base-chinese": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
"bert-base-german-cased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
"bert-large-uncased-whole-word-masking": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
"bert-large-cased-whole-word-masking": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
"bert-large-uncased-whole-word-masking-finetuned-squad": (["input_ids", "attention_mask",
"token_type_ids"], 11, False, "bert"),
"bert-large-cased-whole-word-masking-finetuned-squad": (["input_ids", "attention_mask",
"token_type_ids"], 11, False, "bert"),
"bert-base-cased-finetuned-mrpc": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
"bert-base-german-dbmdz-cased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
"bert-base-german-dbmdz-uncased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
# todo: more models to add
# GPT
"openai-gpt": (["input_ids"], 11, False, "gpt2"), # no past state inputs
# GPT-2
"gpt2": (["input_ids"], 11, False, "gpt2"), # no past state inputs & outputs
# GPT (no past state)
"openai-gpt": (["input_ids"], 11, False, "gpt2"),
# GPT-2 (no past state, use benchmark_gpt2.py for past_key_values)
"gpt2": (["input_ids"], 11, False, "gpt2"),
"gpt2-medium": (["input_ids"], 11, False, "gpt2"),
"gpt2-large":
(["input_ids"], 11, True,
"gpt2"), # Model>2GB. Need use_external_data_format=True to export it. No past state inputs for GPT models.
"gpt2-large": (["input_ids"], 11, True, "gpt2"),
"gpt2-xl": (["input_ids"], 11, True, "gpt2"),
"distilgpt2": (["input_ids"], 11, False, "gpt2"), # no past state inputs & outputs
"distilgpt2": (["input_ids"], 11, False, "gpt2"),
# Transformer-XL
#"transfo-xl-wt103": (["input_ids"], 11, False, "bert"),
# XLNet
#"xlnet-base-cased": (["input_ids"], 12, False, "bert"), # Models uses Einsum, which need opset version 12 and PyTorch 1.5.0 or above.
#"xlnet-large-cased": (["input_ids"], 12, False, "bert"), # Models uses Einsum, which need opset version 12 and PyTorch 1.5.0 or above.
"xlnet-base-cased": (["input_ids"], 12, False, "bert"),
"xlnet-large-cased": (["input_ids"], 12, False, "bert"),
# XLM
"xlm-mlm-en-2048": (["input_ids"], 11, True, "bert"),
"xlm-mlm-ende-1024": (["input_ids"], 11, False, "bert"),
"xlm-mlm-enfr-1024": (["input_ids"], 11, False, "bert"),
"xlm-mlm-enro-1024": (["input_ids"], 11, False, "bert"),
"xlm-mlm-xnli15-1024": (["input_ids"], 11, False, "bert"),
"xlm-mlm-tlm-xnli15-1024": (["input_ids"], 11, False, "bert"),
"xlm-clm-enfr-1024": (["input_ids"], 11, False, "bert"),
"xlm-clm-ende-1024": (["input_ids"], 11, False, "bert"),
"xlm-mlm-17-1280": (["input_ids"], 11, True, "bert"),
"xlm-mlm-100-1280": (["input_ids"], 11, True, "bert"),
# XLM-RoBERTa
"xlm-roberta-base": (["input_ids"], 12, False, "bert"),
# RoBERTa
"roberta-base": (["input_ids", "attention_mask"], 11, False, "bert"),
"roberta-large": (["input_ids", "attention_mask"], 11, False, "bert"),
"roberta-large-mnli": (["input_ids", "attention_mask"], 11, False, "bert"),
"deepset/roberta-base-squad2": (["input_ids", "attention_mask"], 11, False, "bert"),
"distilroberta-base": (["input_ids", "attention_mask"], 11, False, "bert"),
"roberta-base-openai-detector": (["input_ids", "attention_mask"], 11, False, "bert"),
"roberta-large-openai-detector": (["input_ids", "attention_mask"], 11, False, "bert"),
# DistilBERT
"distilbert-base-uncased": (["input_ids", "attention_mask"], 11, False, "bert"),
"distilbert-base-uncased-distilled-squad": (["input_ids", "attention_mask"], 11, False, "bert"),
"distilbert-base-cased": (["input_ids", "attention_mask"], 11, False, "bert"),
"distilbert-base-cased-distilled-squad": (["input_ids", "attention_mask"], 11, False, "bert"),
"distilbert-base-german-cased": (["input_ids", "attention_mask"], 11, False, "bert"),
"distilbert-base-multilingual-cased": (["input_ids", "attention_mask"], 11, False, "bert"),
# CTRL
"ctrl": (["input_ids"], 11, True, "bert"),
# CamemBERT
"camembert-base": (["input_ids"], 11, False, "bert"),
# ALBERT
# These models use Einsum, which needs opset version 12 and PyTorch 1.5.0 or above.
"albert-base-v1": (["input_ids"], 12, False, "bert"),
"albert-large-v1": (["input_ids"], 12, False, "bert"),
"albert-xlarge-v1": (["input_ids"], 12, True, "bert"),
@ -95,36 +72,37 @@ MODELS = {
"albert-large-v2": (["input_ids"], 12, False, "bert"),
"albert-xlarge-v2": (["input_ids"], 12, True, "bert"),
#"albert-xxlarge-v2": (["input_ids"], 12, True, "bert"),
# T5
"t5-small": (["input_ids"], 12, False, "bert"),
"t5-base": (["input_ids"], 12, False, "bert"),
"t5-large": (["input_ids"], 12, True, "bert"),
"t5-3b": (["input_ids"], 12, True, "bert"),
"t5-11b": (["input_ids"], 12, True, "bert"),
# T5 (use benchmark_t5.py instead)
#"t5-small": (["input_ids"], 12, False, "bert"),
#"t5-base": (["input_ids"], 12, False, "bert"),
#"t5-large": (["input_ids"], 12, True, "bert"),
#"t5-3b": (["input_ids"], 12, True, "bert"),
#"t5-11b": (["input_ids"], 12, True, "bert"),
#"valhalla/t5-small-qa-qg-hl": (["input_ids"], 12, True, "bert"),
# XLM-RoBERTa
"xlm-roberta-base": (["input_ids"], 11, False, "bert"),
"xlm-roberta-large": (["input_ids"], 11, True, "bert"),
# FlauBERT
"flaubert/flaubert_small_cased": (["input_ids"], 11, False, "bert"),
"flaubert/flaubert_base_uncased": (["input_ids"], 11, False, "bert"),
#"flaubert/flaubert_base_uncased": (["input_ids"], 11, False, "bert"),
"flaubert/flaubert_base_cased": (["input_ids"], 11, False, "bert"),
"flaubert/flaubert_large_cased": (["input_ids"], 11, False, "bert"),
#"flaubert/flaubert_large_cased": (["input_ids"], 11, False, "bert"),
# Bart
"facebook/bart-large": (["input_ids"], 11, False, "bert"),
"facebook/bart-base": (["input_ids"], 11, False, "bert"),
"facebook/bart-large-mnli": (["input_ids"], 11, False, "bert"),
"facebook/bart-large-cnn": (["input_ids"], 11, False, "bert"),
#"facebook/mbart-large-en-ro": (["input_ids"], 11, True, "bert"),
# DialoGPT
"microsoft/DialoGPT-small": (["input_ids"], 11, False, "gpt2"),
"microsoft/DialoGPT-medium": (["input_ids"], 11, False, "gpt2"),
"microsoft/DialoGPT-large": (["input_ids"], 11, True, "gpt2"),
#"microsoft/DialoGPT-large": (["input_ids"], 11, True, "gpt2"),
# Reformer
#"google/reformer-enwik8": (["input_ids"], 11, False, "bert"),
#"google/reformer-crime-and-punishment": (["input_ids"], 11, False, "bert"),
# MarianMT
#"Helsinki-NLP/opus-mt-ROMANCE-en": (["input_ids"], 12, False, "bert"),
# Longformer
# Longformer (use benchmark_longformer.py instead)
#"allenai/longformer-base-4096": (["input_ids"], 12, False, "bert"),
#"allenai/longformer-large-4096": (["input_ids"], 12, False, "bert"),
}

View file

@ -5,13 +5,12 @@
# --------------------------------------------------------------------------
# This measures the performance of OnnxRuntime, PyTorch and TorchScript on transformer models.
# Please install PyTorch (see https://pytorch.org/) before running this benchmark. Like the following:
# GPU: conda install pytorch torchvision cudatoolkit=10.1 -c pytorch
# GPU: conda install pytorch torchvision cudatoolkit=11.0 -c pytorch
# CPU: conda install pytorch torchvision cpuonly -c pytorch
# When run_cli=true, this script is self-contained and you need not copy other files to run benchmarks
# it will use onnxruntime-tools package.
# If run_cli=false, it depends on other python script (*.py) files in this directory.
run_cli=true
# When use_package=true, you need not copy other files to run benchmarks except this sh file.
# Otherwise, it will use python script (*.py) files in this directory.
use_package=true
# only need once
run_install=true
@ -50,7 +49,7 @@ sequence_lengths="8 16 32 64 128 256 512 1024"
input_counts=1
# Pretrained transformers models can be a subset of: bert-base-cased roberta-base gpt2 distilgpt2 distilbert-base-uncased
models_to_test="bert-base-cased roberta-base gpt2"
models_to_test="bert-base-cased roberta-base distilbert-base-uncased"
# If you have multiple GPUs, you can choose one GPU for the test. Here is an example that uses the second GPU:
# export CUDA_VISIBLE_DEVICES=1
@ -81,7 +80,7 @@ fi
if [ "$run_install" = true ] ; then
pip uninstall --yes ort_nightly
pip uninstall --yes ort-nightly ort-gpu-nightly
pip uninstall --yes onnxruntime
pip uninstall --yes onnxruntime-gpu
if [ "$run_cpu_fp32" = true ] || [ "$run_cpu_int8" = true ]; then
@ -89,14 +88,12 @@ if [ "$run_install" = true ] ; then
else
pip install onnxruntime-gpu
fi
pip install --upgrade onnxconverter_common
pip install --upgrade onnxruntime-tools
pip install --upgrade transformers
pip install --upgrade onnx coloredlogs packaging psutil py3nvml onnxconverter_common numpy transformers
fi
if [ "$run_cli" = true ] ; then
echo "Use onnxruntime_tools.transformers.benchmark"
benchmark_script="-m onnxruntime_tools.transformers.benchmark"
if [ "$use_package" = true ] ; then
echo "Use onnxruntime.transformers.benchmark"
benchmark_script="-m onnxruntime.transformers.benchmark"
else
benchmark_script="benchmark.py"
fi
@ -187,4 +184,4 @@ fi
# Remove duplicated lines
awk '!x[$0]++' ./result.csv > summary_result.csv
awk '!x[$0]++' ./fusion.csv > summary_fusion.csv
awk '!x[$0]++' ./detail.csv > summary_detail.csv
awk '!x[$0]++' ./detail.csv > summary_detail.csv

View file

@ -26,7 +26,7 @@ class TestGpt2(unittest.TestCase):
def test_gpt2_fp16(self):
if 'CUDAExecutionProvider' in onnxruntime.get_available_providers():
self.run_benchmark_gpt2('-m gpt2 --precision fp16 -o -b 1 -s 128')
self.run_benchmark_gpt2('-m gpt2 --precision fp16 -o -b 1 -s 128 --use_gpu')
def test_gpt2_int8(self):
self.run_benchmark_gpt2('-m gpt2 --precision int8 -o -b 1 -s 128')