diff --git a/onnxruntime/python/tools/transformers/README.md b/onnxruntime/python/tools/transformers/README.md index a9805fe5a9..99d015751d 100644 --- a/onnxruntime/python/tools/transformers/README.md +++ b/onnxruntime/python/tools/transformers/README.md @@ -210,7 +210,7 @@ For GPU, please append --use_gpu to the command. bert_perf_test.py can be used to check the BERT model inference performance. Below are examples: ```console -python -m onnxruntime.transformers.bert_perf_test --model optimized_model_cpu.onnx --batch_size 1 --sequence_length 128 --samples 100 --test_times 10 --inclusive +python -m onnxruntime.transformers.bert_perf_test --model optimized_model_cpu.onnx --batch_size 1 --sequence_length 128 ``` For GPU, please append --use_gpu to the command. @@ -219,7 +219,7 @@ After test is finished, a file like perf_results_CPU_B1_S128_.txt or ## Profiling -profiler.py can be used to run profiling on a transformer model. It can help figure out the bottleneck of a model, and time spent on a node or subgraph. +profiler.py can be used to run profiling on a transformer model. It can help figure out the bottleneck of a model, and CPU time spent on a node or subgraph. Examples commands: diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py index 3fed9c88fa..20f7a52113 100644 --- a/onnxruntime/python/tools/transformers/benchmark.py +++ b/onnxruntime/python/tools/transformers/benchmark.py @@ -80,9 +80,6 @@ def run_onnxruntime(use_gpu, model_names, model_class, precision, num_threads, b ) return results - if (not use_gpu) and ('CUDAExecutionProvider' in onnxruntime.get_available_providers()): - logger.warning("Please install onnxruntime package instead of onnxruntime-gpu to get best cpu performance.") - for model_name in model_names: all_input_names = MODELS[model_name][0] for num_inputs in input_counts: diff --git a/onnxruntime/python/tools/transformers/benchmark_gpt2.py b/onnxruntime/python/tools/transformers/benchmark_gpt2.py index 5354db9ed7..ed5d3de6c8 100644 --- a/onnxruntime/python/tools/transformers/benchmark_gpt2.py +++ b/onnxruntime/python/tools/transformers/benchmark_gpt2.py @@ -16,6 +16,7 @@ import argparse import logging import torch import onnx +from packaging import version from transformers import AutoConfig from gpt2_helper import Gpt2Helper, MODEL_CLASSES, DEFAULT_TOLERANCE, PRETRAINED_GPT2_MODELS from quantize_helper import QuantizeHelper @@ -113,6 +114,10 @@ def parse_arguments(argv=None): def main(args): + from transformers import __version__ as transformers_version + if version.parse(transformers_version) < version.parse("3.1.0"): # past_key_values name does not exist in 3.0.2 or older + raise RuntimeError("This tool requires transformers 3.1.0 or later.") + logger.info(f"Arguments:{args}") if args.precision == Precision.FLOAT16: assert args.optimize_onnx and args.use_gpu, "fp16 requires --optimize_onnx --use_gpu" @@ -279,7 +284,7 @@ def main(args): return csv_filename -if __name__ == '__main__': +if __name__ == '__main__': args = parse_arguments() setup_logger(args.verbose) main(args) diff --git a/onnxruntime/python/tools/transformers/bert_perf_test.py b/onnxruntime/python/tools/transformers/bert_perf_test.py index 16d8a04939..ff7738d4a8 100644 --- a/onnxruntime/python/tools/transformers/bert_perf_test.py +++ b/onnxruntime/python/tools/transformers/bert_perf_test.py @@ -35,44 +35,10 @@ class TestSetting: sequence_length: int test_cases: int test_times: int - contiguous: bool use_gpu: bool - warmup: bool - omp_num_threads: int - omp_wait_policy: str intra_op_num_threads: int seed: int verbose: bool - contiguous: bool - inclusive: bool - extra_latency: float = 0 - - def get_setting(self) -> str: - return f"batch_size={self.batch_size},sequence_length={self.sequence_length},test_cases={self.test_cases},test_times={self.test_times},contiguous={self.contiguous},use_gpu={self.use_gpu},warmup={self.warmup}" - - def check(self, intra_op_threads, omp_threads, omp_policy) -> bool: - if intra_op_threads is None: - if self.intra_op_num_threads is not None and self.intra_op_num_threads > 0: - return False - else: - assert intra_op_threads > 0 - if not (self.intra_op_num_threads is None or self.intra_op_num_threads == intra_op_threads): - return False - - if omp_threads is None: - if self.omp_num_threads is not None and self.omp_num_threads > 0: - return False - else: - assert omp_threads > 0 - if not (self.omp_num_threads is None or self.omp_num_threads == omp_threads): - return False - - if self.omp_wait_policy is not None: - if omp_policy != self.omp_wait_policy: - return False - - return True - @dataclass class ModelSetting: @@ -84,22 +50,17 @@ class ModelSetting: def create_session(model_path, use_gpu, intra_op_num_threads, graph_optimization_level=None): - # Import onnxruntime shall be after OpenMP environment variable setting. - # So we put the import in function to delay importing instead of top of this script. import onnxruntime if use_gpu and ('CUDAExecutionProvider' not in onnxruntime.get_available_providers()): print( "Warning: Please install onnxruntime-gpu package instead of onnxruntime, and use a machine with GPU for testing gpu performance." ) - elif (not use_gpu) and ('CUDAExecutionProvider' in onnxruntime.get_available_providers()): - print("Warning: Please install onnxruntime package instead of onnxruntime-gpu to get best cpu performance.") if intra_op_num_threads is None and graph_optimization_level is None: session = onnxruntime.InferenceSession(model_path) else: - execution_providers = ['CPUExecutionProvider' - ] if not use_gpu else ['CUDAExecutionProvider', 'CPUExecutionProvider'] + execution_providers = ['CPUExecutionProvider'] if not use_gpu else ['CUDAExecutionProvider', 'CPUExecutionProvider'] sess_options = onnxruntime.SessionOptions() sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL @@ -127,8 +88,8 @@ def create_session(model_path, use_gpu, intra_op_num_threads, graph_optimization return session -def onnxruntime_inference(session, all_inputs, output_names, warmup=True): - if warmup and len(all_inputs) > 0: +def onnxruntime_inference(session, all_inputs, output_names): + if len(all_inputs) > 0: # Use a random input as warm up. session.run(output_names, random.choice(all_inputs)) @@ -142,57 +103,16 @@ def onnxruntime_inference(session, all_inputs, output_names, warmup=True): latency_list.append(latency) return results, latency_list - -def get_contiguous_inputs(all_inputs): - """ - Convert input to be contiguous. - """ - contiguous_inputs = [] - - start_time = timeit.default_timer() - for test_case_id, inputs in enumerate(all_inputs): - real_inputs = {} - for key, value in inputs.items(): - real_inputs[key] = np.ascontiguousarray(value) - contiguous_inputs.append(real_inputs) - latency = timeit.default_timer() - start_time - - average_latency_ms = latency / len(contiguous_inputs) * 1000 - return contiguous_inputs, average_latency_ms - - def to_string(model_path, session, test_setting): sess_options = session.get_session_options() - option = "model={}".format(os.path.basename(model_path)) - option += ",graph_optimization_level={},intra_op_num_threads={}".format(sess_options.graph_optimization_level, + option = "model={},".format(os.path.basename(model_path)) + option += "graph_optimization_level={},intra_op_num_threads={},".format(sess_options.graph_optimization_level, sess_options.intra_op_num_threads).replace( 'GraphOptimizationLevel.ORT_', '') - option += ",OMP_NUM_THREADS={}".format(os.environ["OMP_NUM_THREADS"] if "OMP_NUM_THREADS" in os.environ else "") - option += ",OMP_WAIT_POLICY={}".format(os.environ["OMP_WAIT_POLICY"] if "OMP_WAIT_POLICY" in os.environ else "") - option += ",{}".format(test_setting.get_setting()) + option += f"batch_size={test_setting.batch_size},sequence_length={test_setting.sequence_length},test_cases={test_setting.test_cases},test_times={test_setting.test_times},use_gpu={test_setting.use_gpu}" return option - -def setup_openmp_environ(omp_num_threads, omp_wait_policy): - if omp_num_threads is None: - if "OMP_NUM_THREADS" in os.environ: - del os.environ["OMP_NUM_THREADS"] - else: - os.environ["OMP_NUM_THREADS"] = str(omp_num_threads) - - if omp_wait_policy is None: - if "OMP_WAIT_POLICY" in os.environ: - del os.environ["OMP_WAIT_POLICY"] - else: - assert omp_wait_policy in ["ACTIVE", "PASSIVE"], f"{omp_wait_policy} is not a valid policy" - os.environ["OMP_WAIT_POLICY"] = omp_wait_policy - - -def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads, omp_num_threads, - omp_wait_policy): - # Environment variable shall be set before import onnxruntime. - setup_openmp_environ(omp_num_threads, omp_wait_policy) - +def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads): session = create_session(model_setting.model_path, test_setting.use_gpu, intra_op_num_threads, model_setting.opt_level) output_names = [output.name for output in session.get_outputs()] @@ -206,11 +126,11 @@ def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op all_latency_list = [] for i in range(test_setting.test_times): - results, latency_list = onnxruntime_inference(session, all_inputs, output_names, test_setting.warmup) + results, latency_list = onnxruntime_inference(session, all_inputs, output_names) all_latency_list.extend(latency_list) # latency in miliseconds - latency_ms = np.array(all_latency_list) * 1000 + test_setting.extra_latency + latency_ms = np.array(all_latency_list) * 1000 average_latency = statistics.mean(latency_ms) latency_50 = np.percentile(latency_ms, 50) @@ -226,91 +146,31 @@ def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op format(throughput, '.2f'))) -def launch_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads, omp_num_threads, - omp_wait_policy): - if not test_setting.check(intra_op_num_threads, omp_num_threads, omp_wait_policy): - return - +def launch_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads): process = multiprocessing.Process(target=run_one_test, - args=(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads, - omp_num_threads, omp_wait_policy)) + args=(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads)) process.start() process.join() -def run_perf_tests(model_setting, test_setting, perf_results, test_all, all_inputs): +def run_perf_tests(model_setting, test_setting, perf_results, all_inputs): + if (test_setting.intra_op_num_threads is not None): + launch_test(model_setting, test_setting, perf_results, all_inputs, test_setting.intra_op_num_threads) + return + cpu_count = psutil.cpu_count(logical=False) logical_cores = psutil.cpu_count(logical=True) - candidate_threads = list(set([1, logical_cores, cpu_count])) - - if (test_setting.intra_op_num_threads is not None) or (test_setting.omp_num_threads is not None): - - if test_setting.intra_op_num_threads is not None: - intra_op_threads = [test_setting.intra_op_num_threads] - else: - intra_op_threads = [None] + candidate_threads - - if test_setting.omp_num_threads is not None: - omp_threads = [test_setting.omp_num_threads] - else: - omp_threads = [None] + candidate_threads - - if test_setting.omp_wait_policy is not None: - omp_policies = [test_setting.omp_wait_policy] - else: - omp_policies = [None, 'PASSIVE', 'ACTIVE'] - - for it in intra_op_threads: - for ot in omp_threads: - for op in omp_policies: - launch_test(model_setting, test_setting, perf_results, all_inputs, it, ot, op) - return - - # Test a setting without any setting as baseline 1. - launch_test(model_setting, test_setting, perf_results, all_inputs, None, None, None) - - if not test_setting.use_gpu: - # For CPU: intra_op_num_threads = 1, omp_num_threads=None, omp_wait_policy=None - # Another setting without environment variable as baseline 2. - launch_test(model_setting, test_setting, perf_results, all_inputs, 1, None, None) - else: - # For GPU, we test two more settings by default: - # (1) intra_op_num_threads = 1, omp_num_threads=cpu_count, omp_wait_policy=PASSIVE - # (2) intra_op_num_threads = logical_cores, omp_num_threads=1, omp_wait_policy=ACTIVE - launch_test(model_setting, test_setting, perf_results, all_inputs, 1, cpu_count, 'PASSIVE') - - launch_test(model_setting, test_setting, perf_results, all_inputs, logical_cores, 1, 'ACTIVE') - - # GPU latency is not sensitive to these settings. No need to test many combinations. - # Skip remaining settings for GPU without --all flag. - if test_setting.use_gpu and not test_all: - return + candidate_threads = list(set([logical_cores, cpu_count])) + for i in range(1, min(16, logical_cores)): + if i not in candidate_threads: + candidate_threads.append(i) + candidate_threads.sort(reverse=True) for intra_op_num_threads in candidate_threads: - for omp_num_threads in candidate_threads: - # skip settings that are very slow - if intra_op_num_threads == 1 and omp_num_threads == 1 and logical_cores != 1: - continue - - # When logical and physical cores are not the same, there are many combinations. - # Remove some settings are not good normally. - if logical_cores > cpu_count: - if omp_num_threads == logical_cores and intra_op_num_threads != 1: - continue - if intra_op_num_threads == logical_cores and omp_num_threads != 1: - continue - - if not test_all: - if intra_op_num_threads != 1 and omp_num_threads != 1: - continue - - for omp_wait_policy in ['ACTIVE', 'PASSIVE']: - launch_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads, - omp_num_threads, omp_wait_policy) - - -def run_performance(model_setting, test_setting, perf_results, test_all): + launch_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads) + +def run_performance(model_setting, test_setting, perf_results): input_ids, segment_ids, input_mask = get_bert_inputs(model_setting.model_path, model_setting.input_ids_name, model_setting.segment_ids_name, model_setting.input_mask_name) @@ -327,29 +187,25 @@ def run_performance(model_setting, test_setting, perf_results, test_all): segment_ids, input_mask, random_mask_length=False) - if test_setting.contiguous: - all_inputs, contiguous_latency = get_contiguous_inputs(all_inputs) - print("Extra latency for converting inputs to contiguous: {} ms".format(format(contiguous_latency, '.2f'))) - test_setting.extra_latency = contiguous_latency if test_setting.inclusive else 0 - run_perf_tests(model_setting, test_setting, perf_results, test_all, all_inputs) + run_perf_tests(model_setting, test_setting, perf_results, all_inputs) def parse_arguments(): parser = argparse.ArgumentParser() parser.add_argument('--model', required=True, type=str, help="bert onnx model path") - parser.add_argument('--batch_size', + parser.add_argument('-b', '--batch_size', required=True, type=int, nargs="+", help="batch size of input. Allow one or multiple values in the range of [1, 128].") - parser.add_argument('--sequence_length', required=True, type=int, help="maximum sequence length of input") + parser.add_argument('-s', '--sequence_length', required=True, type=int, help="maximum sequence length of input") parser.add_argument('--samples', required=False, type=int, default=10, help="number of samples to be generated") - parser.add_argument('--test_times', + parser.add_argument('-t', '--test_times', required=False, type=int, default=0, @@ -375,40 +231,12 @@ def parse_arguments(): parser.add_argument('--use_gpu', required=False, action='store_true', help="use GPU") parser.set_defaults(use_gpu=False) - parser.add_argument('--inclusive', - required=False, - action='store_true', - help="include the latency of converting array to contiguous") - parser.set_defaults(inclusive=False) - - parser.add_argument('--all', required=False, action='store_true', help="test all candidate settings") - parser.set_defaults(all=False) - - parser.add_argument('--omp_num_threads', - required=False, - type=int, - default=None, - help=">0, set OMP_NUM_THREADS value. 0, do not set") - - parser.add_argument('--intra_op_num_threads', + parser.add_argument('-n', '--intra_op_num_threads', required=False, type=int, default=None, help=">=0, set intra_op_num_threads") - parser.add_argument('--omp_wait_policy', - required=False, - type=str, - default=None, - choices=['ACTIVE', 'PASSIVE'], - help="OMP_WAIT_POLICY") - - parser.add_argument('--contiguous', required=False, action='store_true', help="contiguous input") - parser.set_defaults(contiguous=False) - - parser.add_argument('--no_warmup', required=False, action='store_true', help="do not use one sample for warm-up.") - parser.set_defaults(no_warmup=False) - parser.add_argument('--input_ids_name', required=False, type=str, default=None, help="input name for input ids") parser.add_argument('--segment_ids_name', required=False, type=str, default=None, help="input name for segment ids") parser.add_argument('--input_mask_name', @@ -443,18 +271,13 @@ def main(): args.sequence_length, args.samples, args.test_times, - None, #contiguous args.use_gpu, - not args.no_warmup, - args.omp_num_threads, - args.omp_wait_policy, args.intra_op_num_threads, args.seed, - args.verbose, - args.contiguous, - args.inclusive) + args.verbose) + print("test setting", test_setting) - run_performance(model_setting, test_setting, perf_results, args.all) + run_performance(model_setting, test_setting, perf_results) # Sort the results so that the first one has smallest latency. sorted_results = sorted(perf_results.items(), reverse=False, key=lambda x: x[1]) diff --git a/onnxruntime/python/tools/transformers/bert_test_data.py b/onnxruntime/python/tools/transformers/bert_test_data.py index c2e4435977..5e008db16c 100644 --- a/onnxruntime/python/tools/transformers/bert_test_data.py +++ b/onnxruntime/python/tools/transformers/bert_test_data.py @@ -140,7 +140,8 @@ def generate_test_data(batch_size, sequence_length, test_cases, seed, verbose, i def get_graph_input_from_embed_node(onnx_model, embed_node, input_index): - assert input_index < len(embed_node.input) + if input_index >= len(embed_node.input): + return None input = embed_node.input[input_index] graph_input = onnx_model.find_graph_input(input) @@ -195,6 +196,15 @@ def find_bert_inputs(onnx_model, input_ids_name=None, segment_ids_name=None, inp input_ids = get_graph_input_from_embed_node(onnx_model, embed_node, 0) segment_ids = get_graph_input_from_embed_node(onnx_model, embed_node, 1) input_mask = get_graph_input_from_embed_node(onnx_model, embed_node, 7) + + if input_mask is None: + for input in graph_inputs: + input_name_lower = input.name.lower() + if "mask" in input_name_lower: + input_mask = input + if input_mask is None: + raise ValueError(f"Failed to find attention mask input") + return input_ids, segment_ids, input_mask # Try guess the inputs based on naming. @@ -231,7 +241,7 @@ def get_bert_inputs(onnx_file, input_ids_name=None, segment_ids_name=None, input model.ParseFromString(f.read()) onnx_model = OnnxModel(model) - find_bert_inputs(onnx_model, input_ids_name, segment_ids_name, input_mask_name) + return find_bert_inputs(onnx_model, input_ids_name, segment_ids_name, input_mask_name) def parse_arguments(): diff --git a/onnxruntime/python/tools/transformers/compare_bert_results.py b/onnxruntime/python/tools/transformers/compare_bert_results.py index c1dec79092..5837581893 100644 --- a/onnxruntime/python/tools/transformers/compare_bert_results.py +++ b/onnxruntime/python/tools/transformers/compare_bert_results.py @@ -21,19 +21,17 @@ from datetime import datetime from onnx import ModelProto, TensorProto, numpy_helper from onnx_model import OnnxModel from bert_test_data import get_bert_inputs, generate_test_data, output_test_data -from bert_perf_test import create_session, onnxruntime_inference, setup_openmp_environ +from bert_perf_test import create_session, onnxruntime_inference -def run_model(model_path, all_inputs, use_gpu, use_openmp, disable_optimization): - # Import onnxruntime shall be after OpenMP environment variable setting. - # So we put import here to delay importing. +def run_model(model_path, all_inputs, use_gpu, disable_optimization): import onnxruntime graph_optimization_level = None if disable_optimization: graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL - intra_op_num_threads = 1 if use_openmp else psutil.cpu_count(logical=False) + intra_op_num_threads = psutil.cpu_count(logical=False) session = create_session(model_path, use_gpu, intra_op_num_threads, graph_optimization_level) @@ -78,7 +76,7 @@ def compare(baseline_results, treatment_results, verbose, rtol=1e-3, atol=1e-4): def run_test(baseline_model, optimized_model, output_dir, batch_size, sequence_length, use_gpu, test_cases, seed, - use_openmp, verbose, rtol, atol, input_ids_name, segment_ids_name, input_mask_name): + verbose, rtol, atol, input_ids_name, segment_ids_name, input_mask_name): # Try deduce input names from optimized model. input_ids, segment_ids, input_mask = get_bert_inputs(optimized_model, input_ids_name, segment_ids_name, @@ -95,16 +93,9 @@ def run_test(baseline_model, optimized_model, output_dir, batch_size, sequence_l input_mask, random_mask_length=True) - # OpenMP environment variables must be set before the very first "import onnxruntime" - if use_openmp: - setup_openmp_environ(omp_num_threads=psutil.cpu_count(logical=False), omp_wait_policy='ACTIVE') - else: - setup_openmp_environ(omp_num_threads=1, omp_wait_policy='ACTIVE') - baseline_results, baseline_latency, output_names = run_model(baseline_model, all_inputs, use_gpu, - use_openmp, disable_optimization=True) if verbose: print("baseline average latency (all optimizations disabled): {} ms".format( @@ -117,7 +108,6 @@ def run_test(baseline_model, optimized_model, output_dir, batch_size, sequence_l treatment_results, treatment_latency, treatment_output_names = run_model(optimized_model, all_inputs, use_gpu, - use_openmp, disable_optimization=False) if verbose: print("treatment average latency: {} ms".format(statistics.mean(treatment_latency) * 1000)) @@ -157,9 +147,6 @@ def parse_arguments(): parser.add_argument('--use_gpu', required=False, action='store_true', help="use GPU") parser.set_defaults(use_gpu=False) - parser.add_argument('--openmp', required=False, action='store_true', help="use openmp") - parser.set_defaults(openmp=False) - parser.add_argument('--verbose', required=False, action='store_true', help="print verbose information") parser.set_defaults(verbose=False) @@ -180,7 +167,7 @@ def main(): path.mkdir(parents=True, exist_ok=True) run_test(args.baseline_model, args.optimized_model, args.output_dir, args.batch_size, args.sequence_length, - args.use_gpu, args.samples, args.seed, args.openmp, args.verbose, args.rtol, args.atol, args.input_ids, + args.use_gpu, args.samples, args.seed, args.verbose, args.rtol, args.atol, args.input_ids, args.segment_ids, args.input_mask) diff --git a/onnxruntime/python/tools/transformers/convert_to_onnx.py b/onnxruntime/python/tools/transformers/convert_to_onnx.py index 7b2267905d..e88e4c5dff 100644 --- a/onnxruntime/python/tools/transformers/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/convert_to_onnx.py @@ -23,6 +23,7 @@ import torch import numpy import json from pathlib import Path +from packaging import version from transformers import AutoConfig from gpt2_helper import Gpt2Helper, MODEL_CLASSES, DEFAULT_TOLERANCE, PRETRAINED_GPT2_MODELS from gpt2_tester import Gpt2Tester @@ -104,6 +105,10 @@ def parse_arguments(): def main(): + from transformers import __version__ as transformers_version + if version.parse(transformers_version) < version.parse("3.1.0"): # past_key_values name does not exist in 3.0.2 or older + raise RuntimeError("This tool requires transformers 3.1.0 or later.") + args = parse_arguments() setup_logger(args.verbose) diff --git a/onnxruntime/python/tools/transformers/dev_benchmark.cmd b/onnxruntime/python/tools/transformers/dev_benchmark.cmd index 61553d8f2d..3f0b397a14 100644 --- a/onnxruntime/python/tools/transformers/dev_benchmark.cmd +++ b/onnxruntime/python/tools/transformers/dev_benchmark.cmd @@ -7,10 +7,9 @@ REM Please install PyTorch (see https://pytorch.org/) before running this benchm REM GPU: conda install pytorch torchvision cudatoolkit=10.1 -c pytorch REM CPU: conda install pytorch torchvision cpuonly -c pytorch -REM When run_cli=true, this script is self-contained and you need not copy other files to run benchmarks -REM it will use onnxruntime-tools package. -REM If run_cli=false, it depends on other python script (*.py) files in this directory. -set run_cli=false +REM When use_package=true, you need not copy other files to run benchmarks except this sh file. +REM Otherwise, it will use python script (*.py) files in this directory. +set use_package=false REM only need once set run_install=false @@ -72,13 +71,12 @@ if %run_install% == true ( ) pip install --upgrade onnxconverter_common - pip install --upgrade onnxruntime-tools - pip install --upgrade git+https://github.com/huggingface/transformers + pip install --upgrade transformers ) -if %run_cli% == true ( - echo Use onnxruntime_tools.transformers.benchmark - set optimizer_script=-m onnxruntime_tools.transformers.benchmark +if %use_package% == true ( + echo Use onnxruntime.transformers.benchmark + set optimizer_script=-m onnxruntime.transformers.benchmark ) else ( set optimizer_script=benchmark.py ) diff --git a/onnxruntime/python/tools/transformers/gpt2_helper.py b/onnxruntime/python/tools/transformers/gpt2_helper.py index 5490c7267a..8079d6277b 100644 --- a/onnxruntime/python/tools/transformers/gpt2_helper.py +++ b/onnxruntime/python/tools/transformers/gpt2_helper.py @@ -31,7 +31,7 @@ class GPT2ModelNoPastState(GPT2Model): super().__init__(config) def forward(self, input_ids): - return super().forward(input_ids, use_cache=False) + return super().forward(input_ids, use_cache=False, return_dict=False) class MyGPT2Model(GPT2Model): @@ -40,11 +40,26 @@ class MyGPT2Model(GPT2Model): def __init__(self, config): super().__init__(config) + @staticmethod + def post_process(result, num_layer): + if isinstance(result[1][0], tuple) or isinstance(result[1][0], list): + assert len(result[1]) == num_layer and len(result[1][0]) == 2 #and len(result[1][0][0].shape) == 4 and result[1][0][0].shape == result[1][0][1].shape + present = [] + for i in range(num_layer): + # Since transformers v4.*, past key and values are separated outputs. + # Here we concate them into one tensor to be compatible with Attention operator. + present.append(torch.cat((result[1][i][0].unsqueeze(0), result[1][i][1].unsqueeze(0)), dim=0)) + return (result[0], tuple(present)) + + return result + def forward(self, input_ids, position_ids, attention_mask, *past): - return super().forward(input_ids, - position_ids=position_ids, - attention_mask=attention_mask, - past_key_values=past) + result = super().forward(input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_values=past, + return_dict=False) + return MyGPT2Model.post_process(result, self.config.n_layer) class MyGPT2LMHeadModel(GPT2LMHeadModel): @@ -54,10 +69,13 @@ class MyGPT2LMHeadModel(GPT2LMHeadModel): super().__init__(config) def forward(self, input_ids, position_ids, attention_mask, *past): - return super().forward(input_ids, - position_ids=position_ids, - attention_mask=attention_mask, - past_key_values=past) + result = super().forward(input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_values=past, + return_dict=False) + + return MyGPT2Model.post_process(result, self.config.n_layer) class MyGPT2LMHeadModel_NoPadding(GPT2LMHeadModel): @@ -216,6 +234,7 @@ class Gpt2Helper: is_all_close = is_close num_layers = len(ort_outputs) - 1 + for layer in range(num_layers): is_close = numpy.allclose(ort_outputs[1 + layer], torch_outputs[1][layer].cpu().numpy(), @@ -288,10 +307,12 @@ class Gpt2Helper: input_names.append('attention_mask') input_names.extend(past_names) + assert len(outputs) == 2 and len(outputs[1]) == num_layer + logger.info( f"Shapes: input_ids={dummy_inputs.input_ids.shape} past={dummy_inputs.past[0].shape} output={outputs[0].shape} present={outputs[1][0].shape}" ) - + Path(onnx_model_path).parent.mkdir(parents=True, exist_ok=True) torch.onnx.export(model, diff --git a/onnxruntime/python/tools/transformers/huggingface_models.py b/onnxruntime/python/tools/transformers/huggingface_models.py index 3668fe0515..1f4a1c9fa2 100644 --- a/onnxruntime/python/tools/transformers/huggingface_models.py +++ b/onnxruntime/python/tools/transformers/huggingface_models.py @@ -26,67 +26,44 @@ MODELS = { "bert-base-uncased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"), "bert-large-uncased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"), "bert-base-cased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"), - "bert-large-cased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"), - "bert-base-multilingual-uncased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"), - "bert-base-multilingual-cased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"), - "bert-base-chinese": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"), - "bert-base-german-cased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"), - "bert-large-uncased-whole-word-masking": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"), - "bert-large-cased-whole-word-masking": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"), "bert-large-uncased-whole-word-masking-finetuned-squad": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"), - "bert-large-cased-whole-word-masking-finetuned-squad": (["input_ids", "attention_mask", - "token_type_ids"], 11, False, "bert"), "bert-base-cased-finetuned-mrpc": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"), - "bert-base-german-dbmdz-cased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"), - "bert-base-german-dbmdz-uncased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"), - # todo: more models to add - # GPT - "openai-gpt": (["input_ids"], 11, False, "gpt2"), # no past state inputs - # GPT-2 - "gpt2": (["input_ids"], 11, False, "gpt2"), # no past state inputs & outputs + + # GPT (no past state) + "openai-gpt": (["input_ids"], 11, False, "gpt2"), + # GPT-2 (no past state, use benchmark_gpt2.py for past_key_values) + "gpt2": (["input_ids"], 11, False, "gpt2"), "gpt2-medium": (["input_ids"], 11, False, "gpt2"), - "gpt2-large": - (["input_ids"], 11, True, - "gpt2"), # Model>2GB. Need use_external_data_format=True to export it. No past state inputs for GPT models. + "gpt2-large": (["input_ids"], 11, True, "gpt2"), "gpt2-xl": (["input_ids"], 11, True, "gpt2"), - "distilgpt2": (["input_ids"], 11, False, "gpt2"), # no past state inputs & outputs + "distilgpt2": (["input_ids"], 11, False, "gpt2"), # Transformer-XL #"transfo-xl-wt103": (["input_ids"], 11, False, "bert"), # XLNet - #"xlnet-base-cased": (["input_ids"], 12, False, "bert"), # Models uses Einsum, which need opset version 12 and PyTorch 1.5.0 or above. - #"xlnet-large-cased": (["input_ids"], 12, False, "bert"), # Models uses Einsum, which need opset version 12 and PyTorch 1.5.0 or above. + "xlnet-base-cased": (["input_ids"], 12, False, "bert"), + "xlnet-large-cased": (["input_ids"], 12, False, "bert"), # XLM "xlm-mlm-en-2048": (["input_ids"], 11, True, "bert"), "xlm-mlm-ende-1024": (["input_ids"], 11, False, "bert"), "xlm-mlm-enfr-1024": (["input_ids"], 11, False, "bert"), - "xlm-mlm-enro-1024": (["input_ids"], 11, False, "bert"), - "xlm-mlm-xnli15-1024": (["input_ids"], 11, False, "bert"), - "xlm-mlm-tlm-xnli15-1024": (["input_ids"], 11, False, "bert"), - "xlm-clm-enfr-1024": (["input_ids"], 11, False, "bert"), - "xlm-clm-ende-1024": (["input_ids"], 11, False, "bert"), - "xlm-mlm-17-1280": (["input_ids"], 11, True, "bert"), - "xlm-mlm-100-1280": (["input_ids"], 11, True, "bert"), + # XML Roberta + "xlm-roberta-base": (["input_ids"], 12, False, "bert"), # RoBERTa "roberta-base": (["input_ids", "attention_mask"], 11, False, "bert"), "roberta-large": (["input_ids", "attention_mask"], 11, False, "bert"), "roberta-large-mnli": (["input_ids", "attention_mask"], 11, False, "bert"), + "deepset/roberta-base-squad2": (["input_ids", "attention_mask"], 11, False, "bert"), "distilroberta-base": (["input_ids", "attention_mask"], 11, False, "bert"), - "roberta-base-openai-detector": (["input_ids", "attention_mask"], 11, False, "bert"), - "roberta-large-openai-detector": (["input_ids", "attention_mask"], 11, False, "bert"), + # DistilBERT "distilbert-base-uncased": (["input_ids", "attention_mask"], 11, False, "bert"), "distilbert-base-uncased-distilled-squad": (["input_ids", "attention_mask"], 11, False, "bert"), - "distilbert-base-cased": (["input_ids", "attention_mask"], 11, False, "bert"), - "distilbert-base-cased-distilled-squad": (["input_ids", "attention_mask"], 11, False, "bert"), - "distilbert-base-german-cased": (["input_ids", "attention_mask"], 11, False, "bert"), - "distilbert-base-multilingual-cased": (["input_ids", "attention_mask"], 11, False, "bert"), # CTRL "ctrl": (["input_ids"], 11, True, "bert"), # CamemBERT "camembert-base": (["input_ids"], 11, False, "bert"), # ALBERT - # Models uses Einsum, which need opset version 12 and PyTorch 1.5.0 or above. "albert-base-v1": (["input_ids"], 12, False, "bert"), "albert-large-v1": (["input_ids"], 12, False, "bert"), "albert-xlarge-v1": (["input_ids"], 12, True, "bert"), @@ -95,36 +72,37 @@ MODELS = { "albert-large-v2": (["input_ids"], 12, False, "bert"), "albert-xlarge-v2": (["input_ids"], 12, True, "bert"), #"albert-xxlarge-v2": (["input_ids"], 12, True, "bert"), - # T5 - "t5-small": (["input_ids"], 12, False, "bert"), - "t5-base": (["input_ids"], 12, False, "bert"), - "t5-large": (["input_ids"], 12, True, "bert"), - "t5-3b": (["input_ids"], 12, True, "bert"), - "t5-11b": (["input_ids"], 12, True, "bert"), + # T5 (use benchmark_t5.py instead) + #"t5-small": (["input_ids"], 12, False, "bert"), + #"t5-base": (["input_ids"], 12, False, "bert"), + #"t5-large": (["input_ids"], 12, True, "bert"), + #"t5-3b": (["input_ids"], 12, True, "bert"), + #"t5-11b": (["input_ids"], 12, True, "bert"), + #"valhalla/t5-small-qa-qg-hl": (["input_ids"], 12, True, "bert"), # XLM-RoBERTa "xlm-roberta-base": (["input_ids"], 11, False, "bert"), "xlm-roberta-large": (["input_ids"], 11, True, "bert"), # FlauBERT "flaubert/flaubert_small_cased": (["input_ids"], 11, False, "bert"), - "flaubert/flaubert_base_uncased": (["input_ids"], 11, False, "bert"), + #"flaubert/flaubert_base_uncased": (["input_ids"], 11, False, "bert"), "flaubert/flaubert_base_cased": (["input_ids"], 11, False, "bert"), - "flaubert/flaubert_large_cased": (["input_ids"], 11, False, "bert"), + #"flaubert/flaubert_large_cased": (["input_ids"], 11, False, "bert"), # Bart "facebook/bart-large": (["input_ids"], 11, False, "bert"), "facebook/bart-base": (["input_ids"], 11, False, "bert"), "facebook/bart-large-mnli": (["input_ids"], 11, False, "bert"), "facebook/bart-large-cnn": (["input_ids"], 11, False, "bert"), - #"facebook/mbart-large-en-ro": (["input_ids"], 11, True, "bert"), + # DialoGPT "microsoft/DialoGPT-small": (["input_ids"], 11, False, "gpt2"), "microsoft/DialoGPT-medium": (["input_ids"], 11, False, "gpt2"), - "microsoft/DialoGPT-large": (["input_ids"], 11, True, "gpt2"), + #"microsoft/DialoGPT-large": (["input_ids"], 11, True, "gpt2"), # Reformer #"google/reformer-enwik8": (["input_ids"], 11, False, "bert"), #"google/reformer-crime-and-punishment": (["input_ids"], 11, False, "bert"), # MarianMT #"Helsinki-NLP/opus-mt-ROMANCE-en": (["input_ids"], 12, False, "bert"), - # Longformer + # Longformer (use benchmark_longformer.py instead) #"allenai/longformer-base-4096": (["input_ids"], 12, False, "bert"), #"allenai/longformer-large-4096": (["input_ids"], 12, False, "bert"), } diff --git a/onnxruntime/python/tools/transformers/run_benchmark.sh b/onnxruntime/python/tools/transformers/run_benchmark.sh index 2cbf29d07a..05be3c560e 100644 --- a/onnxruntime/python/tools/transformers/run_benchmark.sh +++ b/onnxruntime/python/tools/transformers/run_benchmark.sh @@ -5,13 +5,12 @@ # -------------------------------------------------------------------------- # This measures the performance of OnnxRuntime, PyTorch and TorchScript on transformer models. # Please install PyTorch (see https://pytorch.org/) before running this benchmark. Like the following: -# GPU: conda install pytorch torchvision cudatoolkit=10.1 -c pytorch +# GPU: conda install pytorch torchvision cudatoolkit=11.0 -c pytorch # CPU: conda install pytorch torchvision cpuonly -c pytorch -# When run_cli=true, this script is self-contained and you need not copy other files to run benchmarks -# it will use onnxruntime-tools package. -# If run_cli=false, it depends on other python script (*.py) files in this directory. -run_cli=true +# When use_package=true, you need not copy other files to run benchmarks except this sh file. +# Otherwise, it will use python script (*.py) files in this directory. +use_package=true # only need once run_install=true @@ -50,7 +49,7 @@ sequence_lengths="8 16 32 64 128 256 512 1024" input_counts=1 # Pretrained transformers models can be a subset of: bert-base-cased roberta-base gpt2 distilgpt2 distilbert-base-uncased -models_to_test="bert-base-cased roberta-base gpt2" +models_to_test="bert-base-cased roberta-base distilbert-base-uncased" # If you have mutliple GPUs, you can choose one GPU for test. Here is an example to use the second GPU: # export CUDA_VISIBLE_DEVICES=1 @@ -81,7 +80,7 @@ fi if [ "$run_install" = true ] ; then - pip uninstall --yes ort_nightly + pip uninstall --yes ort-nightly ort-gpu-nightly pip uninstall --yes onnxruntime pip uninstall --yes onnxruntime-gpu if [ "$run_cpu_fp32" = true ] || [ "$run_cpu_int8" = true ]; then @@ -89,14 +88,12 @@ if [ "$run_install" = true ] ; then else pip install onnxruntime-gpu fi - pip install --upgrade onnxconverter_common - pip install --upgrade onnxruntime-tools - pip install --upgrade transformers + pip install --upgrade onnx coloredlogs packaging psutil py3nvml onnxconverter_common numpy transformers fi -if [ "$run_cli" = true ] ; then - echo "Use onnxruntime_tools.transformers.benchmark" - benchmark_script="-m onnxruntime_tools.transformers.benchmark" +if [ "$use_package" = true ] ; then + echo "Use onnxruntime.transformers.benchmark" + benchmark_script="-m onnxruntime.transformers.benchmark" else benchmark_script="benchmark.py" fi @@ -187,4 +184,4 @@ fi # Remove duplicated lines awk '!x[$0]++' ./result.csv > summary_result.csv awk '!x[$0]++' ./fusion.csv > summary_fusion.csv -awk '!x[$0]++' ./detail.csv > summary_detail.csv \ No newline at end of file +awk '!x[$0]++' ./detail.csv > summary_detail.csv diff --git a/onnxruntime/python/tools/transformers/test/test_gpt2.py b/onnxruntime/python/tools/transformers/test/test_gpt2.py index 4f5ca65363..cb6b680af5 100644 --- a/onnxruntime/python/tools/transformers/test/test_gpt2.py +++ b/onnxruntime/python/tools/transformers/test/test_gpt2.py @@ -26,7 +26,7 @@ class TestGpt2(unittest.TestCase): def test_gpt2_fp16(self): if 'CUDAExecutionProvider' in onnxruntime.get_available_providers(): - self.run_benchmark_gpt2('-m gpt2 --precision fp16 -o -b 1 -s 128') + self.run_benchmark_gpt2('-m gpt2 --precision fp16 -o -b 1 -s 128 --use_gpu') def test_gpt2_int8(self): self.run_benchmark_gpt2('-m gpt2 --precision int8 -o -b 1 -s 128')