Update transformers benchmark for transformers 4.3.* and ORT 1.7 (#6796)

* Update benchmark for transformers 4.* and ORT 1.7 * Fix GPT-2 ONNX conversion for transformers 4.3.*. Add a check that the transformers version is >= 3.1.0. * Remove code related to OpenMP * Update pretrained model list: keep representative models only
2026-05-25 22:26:24 +00:00 · 2021-02-24 12:52:35 -08:00 · 2021-02-24 12:52:35 -08:00 · f4acdb2ecd
commit f4acdb2ecd
parent 71a70ecf6e
12 changed files with 137 additions and 316 deletions
--- a/onnxruntime/python/tools/transformers/README.md
+++ b/onnxruntime/python/tools/transformers/README.md
@ -210,7 +210,7 @@ For GPU, please append --use_gpu to the command.
 bert_perf_test.py can be used to check the BERT model inference performance. Below are examples:

 ```console
-python -m onnxruntime.transformers.bert_perf_test --model optimized_model_cpu.onnx --batch_size 1 --sequence_length 128 --samples 100 --test_times 10 --inclusive
+python -m onnxruntime.transformers.bert_perf_test --model optimized_model_cpu.onnx --batch_size 1 --sequence_length 128
 ```

 For GPU, please append --use_gpu to the command.
@ -219,7 +219,7 @@ After test is finished, a file like perf_results_CPU_B1_S128_<date_time>.txt or

 ## Profiling

-profiler.py can be used to run profiling on a transformer model. It can help figure out the bottleneck of a model, and time spent on a node or subgraph.
+profiler.py can be used to run profiling on a transformer model. It can help figure out the bottleneck of a model, and CPU time spent on a node or subgraph.

 Examples commands:

--- a/onnxruntime/python/tools/transformers/benchmark.py
+++ b/onnxruntime/python/tools/transformers/benchmark.py
@ -80,9 +80,6 @@ def run_onnxruntime(use_gpu, model_names, model_class, precision, num_threads, b
        )
        return results

-    if (not use_gpu) and ('CUDAExecutionProvider' in onnxruntime.get_available_providers()):
-        logger.warning("Please install onnxruntime package instead of onnxruntime-gpu to get best cpu performance.")
-
    for model_name in model_names:
        all_input_names = MODELS[model_name][0]
        for num_inputs in input_counts:
--- a/onnxruntime/python/tools/transformers/benchmark_gpt2.py
+++ b/onnxruntime/python/tools/transformers/benchmark_gpt2.py
@ -16,6 +16,7 @@ import argparse
 import logging
 import torch
 import onnx
+from packaging import version
 from transformers import AutoConfig
 from gpt2_helper import Gpt2Helper, MODEL_CLASSES, DEFAULT_TOLERANCE, PRETRAINED_GPT2_MODELS
 from quantize_helper import QuantizeHelper
@ -113,6 +114,10 @@ def parse_arguments(argv=None):


 def main(args):
+    from transformers import __version__ as transformers_version
+    if version.parse(transformers_version) < version.parse("3.1.0"): # past_key_values name does not exist in 3.0.2 or older
+        raise RuntimeError("This tool requires transformers 3.1.0 or later.")
+
    logger.info(f"Arguments:{args}")
    if args.precision == Precision.FLOAT16:
        assert args.optimize_onnx and args.use_gpu, "fp16 requires --optimize_onnx --use_gpu"
@ -279,7 +284,7 @@ def main(args):
    return csv_filename


-if __name__ == '__main__':
+if __name__ == '__main__':       
    args = parse_arguments()
    setup_logger(args.verbose)
    main(args)
--- a/onnxruntime/python/tools/transformers/bert_perf_test.py
+++ b/onnxruntime/python/tools/transformers/bert_perf_test.py
@ -35,44 +35,10 @@ class TestSetting:
    sequence_length: int
    test_cases: int
    test_times: int
-    contiguous: bool
    use_gpu: bool
-    warmup: bool
-    omp_num_threads: int
-    omp_wait_policy: str
    intra_op_num_threads: int
    seed: int
    verbose: bool
-    contiguous: bool
-    inclusive: bool
-    extra_latency: float = 0
-
-    def get_setting(self) -> str:
-        return f"batch_size={self.batch_size},sequence_length={self.sequence_length},test_cases={self.test_cases},test_times={self.test_times},contiguous={self.contiguous},use_gpu={self.use_gpu},warmup={self.warmup}"
-
-    def check(self, intra_op_threads, omp_threads, omp_policy) -> bool:
-        if intra_op_threads is None:
-            if self.intra_op_num_threads is not None and self.intra_op_num_threads > 0:
-                return False
-        else:
-            assert intra_op_threads > 0
-            if not (self.intra_op_num_threads is None or self.intra_op_num_threads == intra_op_threads):
-                return False
-
-        if omp_threads is None:
-            if self.omp_num_threads is not None and self.omp_num_threads > 0:
-                return False
-        else:
-            assert omp_threads > 0
-            if not (self.omp_num_threads is None or self.omp_num_threads == omp_threads):
-                return False
-
-        if self.omp_wait_policy is not None:
-            if omp_policy != self.omp_wait_policy:
-                return False
-
-        return True
-

@dataclass
 class ModelSetting:
@ -84,22 +50,17 @@ class ModelSetting:


 def create_session(model_path, use_gpu, intra_op_num_threads, graph_optimization_level=None):
-    # Import onnxruntime shall be after OpenMP environment variable setting.
-    # So we put the import in function to delay importing instead of top of this script.
    import onnxruntime

    if use_gpu and ('CUDAExecutionProvider' not in onnxruntime.get_available_providers()):
        print(
            "Warning: Please install onnxruntime-gpu package instead of onnxruntime, and use a machine with GPU for testing gpu performance."
        )
-    elif (not use_gpu) and ('CUDAExecutionProvider' in onnxruntime.get_available_providers()):
-        print("Warning: Please install onnxruntime package instead of onnxruntime-gpu to get best cpu performance.")

    if intra_op_num_threads is None and graph_optimization_level is None:
        session = onnxruntime.InferenceSession(model_path)
    else:
-        execution_providers = ['CPUExecutionProvider'
-                               ] if not use_gpu else ['CUDAExecutionProvider', 'CPUExecutionProvider']
+        execution_providers = ['CPUExecutionProvider'] if not use_gpu else ['CUDAExecutionProvider', 'CPUExecutionProvider']

        sess_options = onnxruntime.SessionOptions()
        sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
@ -127,8 +88,8 @@ def create_session(model_path, use_gpu, intra_op_num_threads, graph_optimization
    return session


-def onnxruntime_inference(session, all_inputs, output_names, warmup=True):
-    if warmup and len(all_inputs) > 0:
+def onnxruntime_inference(session, all_inputs, output_names):
+    if len(all_inputs) > 0:
        # Use a random input as warm up.
        session.run(output_names, random.choice(all_inputs))

@ -142,57 +103,16 @@ def onnxruntime_inference(session, all_inputs, output_names, warmup=True):
        latency_list.append(latency)
    return results, latency_list

-
-def get_contiguous_inputs(all_inputs):
-    """
-    Convert input to be contiguous.
-    """
-    contiguous_inputs = []
-
-    start_time = timeit.default_timer()
-    for test_case_id, inputs in enumerate(all_inputs):
-        real_inputs = {}
-        for key, value in inputs.items():
-            real_inputs[key] = np.ascontiguousarray(value)
-        contiguous_inputs.append(real_inputs)
-    latency = timeit.default_timer() - start_time
-
-    average_latency_ms = latency / len(contiguous_inputs) * 1000
-    return contiguous_inputs, average_latency_ms
-
-
 def to_string(model_path, session, test_setting):
    sess_options = session.get_session_options()
-    option = "model={}".format(os.path.basename(model_path))
-    option += ",graph_optimization_level={},intra_op_num_threads={}".format(sess_options.graph_optimization_level,
+    option = "model={},".format(os.path.basename(model_path))
+    option += "graph_optimization_level={},intra_op_num_threads={},".format(sess_options.graph_optimization_level,
                                                                            sess_options.intra_op_num_threads).replace(
                                                                                'GraphOptimizationLevel.ORT_', '')
-    option += ",OMP_NUM_THREADS={}".format(os.environ["OMP_NUM_THREADS"] if "OMP_NUM_THREADS" in os.environ else "")
-    option += ",OMP_WAIT_POLICY={}".format(os.environ["OMP_WAIT_POLICY"] if "OMP_WAIT_POLICY" in os.environ else "")
-    option += ",{}".format(test_setting.get_setting())
+    option += f"batch_size={test_setting.batch_size},sequence_length={test_setting.sequence_length},test_cases={test_setting.test_cases},test_times={test_setting.test_times},use_gpu={test_setting.use_gpu}"
    return option

-
-def setup_openmp_environ(omp_num_threads, omp_wait_policy):
-    if omp_num_threads is None:
-        if "OMP_NUM_THREADS" in os.environ:
-            del os.environ["OMP_NUM_THREADS"]
-    else:
-        os.environ["OMP_NUM_THREADS"] = str(omp_num_threads)
-
-    if omp_wait_policy is None:
-        if "OMP_WAIT_POLICY" in os.environ:
-            del os.environ["OMP_WAIT_POLICY"]
-    else:
-        assert omp_wait_policy in ["ACTIVE", "PASSIVE"], f"{omp_wait_policy} is not a valid policy"
-        os.environ["OMP_WAIT_POLICY"] = omp_wait_policy
-
-
-def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads, omp_num_threads,
-                 omp_wait_policy):
-    # Environment variable shall be set before import onnxruntime.
-    setup_openmp_environ(omp_num_threads, omp_wait_policy)
-
+def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads):
    session = create_session(model_setting.model_path, test_setting.use_gpu, intra_op_num_threads,
                             model_setting.opt_level)
    output_names = [output.name for output in session.get_outputs()]
@ -206,11 +126,11 @@ def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op

    all_latency_list = []
    for i in range(test_setting.test_times):
-        results, latency_list = onnxruntime_inference(session, all_inputs, output_names, test_setting.warmup)
+        results, latency_list = onnxruntime_inference(session, all_inputs, output_names)
        all_latency_list.extend(latency_list)

    # latency in miliseconds
-    latency_ms = np.array(all_latency_list) * 1000 + test_setting.extra_latency
+    latency_ms = np.array(all_latency_list) * 1000

    average_latency = statistics.mean(latency_ms)
    latency_50 = np.percentile(latency_ms, 50)
@ -226,91 +146,31 @@ def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op
                                                                format(throughput, '.2f')))


-def launch_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads, omp_num_threads,
-                omp_wait_policy):
-    if not test_setting.check(intra_op_num_threads, omp_num_threads, omp_wait_policy):
-        return
-
+def launch_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads):
    process = multiprocessing.Process(target=run_one_test,
-                                      args=(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads,
-                                            omp_num_threads, omp_wait_policy))
+                                      args=(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads))
    process.start()
    process.join()


-def run_perf_tests(model_setting, test_setting, perf_results, test_all, all_inputs):
+def run_perf_tests(model_setting, test_setting, perf_results, all_inputs):
+    if (test_setting.intra_op_num_threads is not None):
+        launch_test(model_setting, test_setting, perf_results, all_inputs, test_setting.intra_op_num_threads)
+        return
+
    cpu_count = psutil.cpu_count(logical=False)
    logical_cores = psutil.cpu_count(logical=True)

-    candidate_threads = list(set([1, logical_cores, cpu_count]))
-
-    if (test_setting.intra_op_num_threads is not None) or (test_setting.omp_num_threads is not None):
-
-        if test_setting.intra_op_num_threads is not None:
-            intra_op_threads = [test_setting.intra_op_num_threads]
-        else:
-            intra_op_threads = [None] + candidate_threads
-
-        if test_setting.omp_num_threads is not None:
-            omp_threads = [test_setting.omp_num_threads]
-        else:
-            omp_threads = [None] + candidate_threads
-
-        if test_setting.omp_wait_policy is not None:
-            omp_policies = [test_setting.omp_wait_policy]
-        else:
-            omp_policies = [None, 'PASSIVE', 'ACTIVE']
-
-        for it in intra_op_threads:
-            for ot in omp_threads:
-                for op in omp_policies:
-                    launch_test(model_setting, test_setting, perf_results, all_inputs, it, ot, op)
-        return
-
-    # Test a setting without any setting as baseline 1.
-    launch_test(model_setting, test_setting, perf_results, all_inputs, None, None, None)
-
-    if not test_setting.use_gpu:
-        # For CPU: intra_op_num_threads = 1, omp_num_threads=None, omp_wait_policy=None
-        # Another setting without environment variable as baseline 2.
-        launch_test(model_setting, test_setting, perf_results, all_inputs, 1, None, None)
-    else:
-        # For GPU, we test two more settings by default:
-        # (1) intra_op_num_threads = 1, omp_num_threads=cpu_count, omp_wait_policy=PASSIVE
-        # (2) intra_op_num_threads = logical_cores, omp_num_threads=1, omp_wait_policy=ACTIVE
-        launch_test(model_setting, test_setting, perf_results, all_inputs, 1, cpu_count, 'PASSIVE')
-
-        launch_test(model_setting, test_setting, perf_results, all_inputs, logical_cores, 1, 'ACTIVE')
-
-    # GPU latency is not sensitive to these settings. No need to test many combinations.
-    # Skip remaining settings for GPU without --all flag.
-    if test_setting.use_gpu and not test_all:
-        return
+    candidate_threads = list(set([logical_cores, cpu_count]))
+    for i in range(1, min(16, logical_cores)):
+        if i not in candidate_threads:
+            candidate_threads.append(i)
+    candidate_threads.sort(reverse=True)

    for intra_op_num_threads in candidate_threads:
-        for omp_num_threads in candidate_threads:
-            # skip settings that are very slow
-            if intra_op_num_threads == 1 and omp_num_threads == 1 and logical_cores != 1:
-                continue
-
-            # When logical and physical cores are not the same, there are many combinations.
-            # Remove some settings are not good normally.
-            if logical_cores > cpu_count:
-                if omp_num_threads == logical_cores and intra_op_num_threads != 1:
-                    continue
-                if intra_op_num_threads == logical_cores and omp_num_threads != 1:
-                    continue
-
-            if not test_all:
-                if intra_op_num_threads != 1 and omp_num_threads != 1:
-                    continue
-
-            for omp_wait_policy in ['ACTIVE', 'PASSIVE']:
-                launch_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads,
-                            omp_num_threads, omp_wait_policy)
-
-
-def run_performance(model_setting, test_setting, perf_results, test_all):
+        launch_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads)
+        
+def run_performance(model_setting, test_setting, perf_results):
    input_ids, segment_ids, input_mask = get_bert_inputs(model_setting.model_path, model_setting.input_ids_name,
                                                         model_setting.segment_ids_name, model_setting.input_mask_name)

@ -327,29 +187,25 @@ def run_performance(model_setting, test_setting, perf_results, test_all):
                                    segment_ids,
                                    input_mask,
                                    random_mask_length=False)
-    if test_setting.contiguous:
-        all_inputs, contiguous_latency = get_contiguous_inputs(all_inputs)
-        print("Extra latency for converting inputs to contiguous: {} ms".format(format(contiguous_latency, '.2f')))
-        test_setting.extra_latency = contiguous_latency if test_setting.inclusive else 0

-    run_perf_tests(model_setting, test_setting, perf_results, test_all, all_inputs)
+    run_perf_tests(model_setting, test_setting, perf_results, all_inputs)


 def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', required=True, type=str, help="bert onnx model path")

-    parser.add_argument('--batch_size',
+    parser.add_argument('-b', '--batch_size',
                        required=True,
                        type=int,
                        nargs="+",
                        help="batch size of input. Allow one or multiple values in the range of [1, 128].")

-    parser.add_argument('--sequence_length', required=True, type=int, help="maximum sequence length of input")
+    parser.add_argument('-s', '--sequence_length', required=True, type=int, help="maximum sequence length of input")

    parser.add_argument('--samples', required=False, type=int, default=10, help="number of samples to be generated")

-    parser.add_argument('--test_times',
+    parser.add_argument('-t', '--test_times',
                        required=False,
                        type=int,
                        default=0,
@ -375,40 +231,12 @@ def parse_arguments():
    parser.add_argument('--use_gpu', required=False, action='store_true', help="use GPU")
    parser.set_defaults(use_gpu=False)

-    parser.add_argument('--inclusive',
-                        required=False,
-                        action='store_true',
-                        help="include the latency of converting array to contiguous")
-    parser.set_defaults(inclusive=False)
-
-    parser.add_argument('--all', required=False, action='store_true', help="test all candidate settings")
-    parser.set_defaults(all=False)
-
-    parser.add_argument('--omp_num_threads',
-                        required=False,
-                        type=int,
-                        default=None,
-                        help=">0, set OMP_NUM_THREADS value. 0, do not set")
-
-    parser.add_argument('--intra_op_num_threads',
+    parser.add_argument('-n', '--intra_op_num_threads',
                        required=False,
                        type=int,
                        default=None,
                        help=">=0, set intra_op_num_threads")

-    parser.add_argument('--omp_wait_policy',
-                        required=False,
-                        type=str,
-                        default=None,
-                        choices=['ACTIVE', 'PASSIVE'],
-                        help="OMP_WAIT_POLICY")
-
-    parser.add_argument('--contiguous', required=False, action='store_true', help="contiguous input")
-    parser.set_defaults(contiguous=False)
-
-    parser.add_argument('--no_warmup', required=False, action='store_true', help="do not use one sample for warm-up.")
-    parser.set_defaults(no_warmup=False)
-
    parser.add_argument('--input_ids_name', required=False, type=str, default=None, help="input name for input ids")
    parser.add_argument('--segment_ids_name', required=False, type=str, default=None, help="input name for segment ids")
    parser.add_argument('--input_mask_name',
@ -443,18 +271,13 @@ def main():
            args.sequence_length,
            args.samples,
            args.test_times,
-            None,  #contiguous
            args.use_gpu,
-            not args.no_warmup,
-            args.omp_num_threads,
-            args.omp_wait_policy,
            args.intra_op_num_threads,
            args.seed,
-            args.verbose,
-            args.contiguous,
-            args.inclusive)
+            args.verbose)
+
        print("test setting", test_setting)
-        run_performance(model_setting, test_setting, perf_results, args.all)
+        run_performance(model_setting, test_setting, perf_results)

    # Sort the results so that the first one has smallest latency.
    sorted_results = sorted(perf_results.items(), reverse=False, key=lambda x: x[1])
--- a/onnxruntime/python/tools/transformers/bert_test_data.py
+++ b/onnxruntime/python/tools/transformers/bert_test_data.py
@ -140,7 +140,8 @@ def generate_test_data(batch_size, sequence_length, test_cases, seed, verbose, i


 def get_graph_input_from_embed_node(onnx_model, embed_node, input_index):
-    assert input_index < len(embed_node.input)
+    if input_index >= len(embed_node.input):
+        return None

    input = embed_node.input[input_index]
    graph_input = onnx_model.find_graph_input(input)
@ -195,6 +196,15 @@ def find_bert_inputs(onnx_model, input_ids_name=None, segment_ids_name=None, inp
        input_ids = get_graph_input_from_embed_node(onnx_model, embed_node, 0)
        segment_ids = get_graph_input_from_embed_node(onnx_model, embed_node, 1)
        input_mask = get_graph_input_from_embed_node(onnx_model, embed_node, 7)
+
+        if input_mask is None:
+            for input in graph_inputs:
+                input_name_lower = input.name.lower()
+                if "mask" in input_name_lower:
+                    input_mask = input
+        if input_mask is None:
+            raise ValueError(f"Failed to find attention mask input")
+            
        return input_ids, segment_ids, input_mask

    # Try guess the inputs based on naming.
@ -231,7 +241,7 @@ def get_bert_inputs(onnx_file, input_ids_name=None, segment_ids_name=None, input
        model.ParseFromString(f.read())

    onnx_model = OnnxModel(model)
-    find_bert_inputs(onnx_model, input_ids_name, segment_ids_name, input_mask_name)
+    return find_bert_inputs(onnx_model, input_ids_name, segment_ids_name, input_mask_name)


 def parse_arguments():
--- a/onnxruntime/python/tools/transformers/compare_bert_results.py
+++ b/onnxruntime/python/tools/transformers/compare_bert_results.py
@ -21,19 +21,17 @@ from datetime import datetime
 from onnx import ModelProto, TensorProto, numpy_helper
 from onnx_model import OnnxModel
 from bert_test_data import get_bert_inputs, generate_test_data, output_test_data
-from bert_perf_test import create_session, onnxruntime_inference, setup_openmp_environ
+from bert_perf_test import create_session, onnxruntime_inference


-def run_model(model_path, all_inputs, use_gpu, use_openmp, disable_optimization):
-    # Import onnxruntime shall be after OpenMP environment variable setting.
-    # So we put import here to delay importing.
+def run_model(model_path, all_inputs, use_gpu, disable_optimization):
    import onnxruntime

    graph_optimization_level = None
    if disable_optimization:
        graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL

-    intra_op_num_threads = 1 if use_openmp else psutil.cpu_count(logical=False)
+    intra_op_num_threads = psutil.cpu_count(logical=False)

    session = create_session(model_path, use_gpu, intra_op_num_threads, graph_optimization_level)

@ -78,7 +76,7 @@ def compare(baseline_results, treatment_results, verbose, rtol=1e-3, atol=1e-4):


 def run_test(baseline_model, optimized_model, output_dir, batch_size, sequence_length, use_gpu, test_cases, seed,
-             use_openmp, verbose, rtol, atol, input_ids_name, segment_ids_name, input_mask_name):
+             verbose, rtol, atol, input_ids_name, segment_ids_name, input_mask_name):

    # Try deduce input names from optimized model.
    input_ids, segment_ids, input_mask = get_bert_inputs(optimized_model, input_ids_name, segment_ids_name,
@ -95,16 +93,9 @@ def run_test(baseline_model, optimized_model, output_dir, batch_size, sequence_l
                                    input_mask,
                                    random_mask_length=True)

-    # OpenMP environment variables must be set before the very first "import onnxruntime"
-    if use_openmp:
-        setup_openmp_environ(omp_num_threads=psutil.cpu_count(logical=False), omp_wait_policy='ACTIVE')
-    else:
-        setup_openmp_environ(omp_num_threads=1, omp_wait_policy='ACTIVE')
-
    baseline_results, baseline_latency, output_names = run_model(baseline_model,
                                                                 all_inputs,
                                                                 use_gpu,
-                                                                 use_openmp,
                                                                 disable_optimization=True)
    if verbose:
        print("baseline average latency (all optimizations disabled): {} ms".format(
@ -117,7 +108,6 @@ def run_test(baseline_model, optimized_model, output_dir, batch_size, sequence_l
    treatment_results, treatment_latency, treatment_output_names = run_model(optimized_model,
                                                                             all_inputs,
                                                                             use_gpu,
-                                                                             use_openmp,
                                                                             disable_optimization=False)
    if verbose:
        print("treatment average latency: {} ms".format(statistics.mean(treatment_latency) * 1000))
@ -157,9 +147,6 @@ def parse_arguments():
    parser.add_argument('--use_gpu', required=False, action='store_true', help="use GPU")
    parser.set_defaults(use_gpu=False)

-    parser.add_argument('--openmp', required=False, action='store_true', help="use openmp")
-    parser.set_defaults(openmp=False)
-
    parser.add_argument('--verbose', required=False, action='store_true', help="print verbose information")
    parser.set_defaults(verbose=False)

@ -180,7 +167,7 @@ def main():
        path.mkdir(parents=True, exist_ok=True)

    run_test(args.baseline_model, args.optimized_model, args.output_dir, args.batch_size, args.sequence_length,
-             args.use_gpu, args.samples, args.seed, args.openmp, args.verbose, args.rtol, args.atol, args.input_ids,
+             args.use_gpu, args.samples, args.seed, args.verbose, args.rtol, args.atol, args.input_ids,
             args.segment_ids, args.input_mask)


--- a/onnxruntime/python/tools/transformers/convert_to_onnx.py
+++ b/onnxruntime/python/tools/transformers/convert_to_onnx.py
@ -23,6 +23,7 @@ import torch
 import numpy
 import json
 from pathlib import Path
+from packaging import version
 from transformers import AutoConfig
 from gpt2_helper import Gpt2Helper, MODEL_CLASSES, DEFAULT_TOLERANCE, PRETRAINED_GPT2_MODELS
 from gpt2_tester import Gpt2Tester
@ -104,6 +105,10 @@ def parse_arguments():


 def main():
+    from transformers import __version__ as transformers_version
+    if version.parse(transformers_version) < version.parse("3.1.0"): # past_key_values name does not exist in 3.0.2 or older
+        raise RuntimeError("This tool requires transformers 3.1.0 or later.")
+
    args = parse_arguments()
    setup_logger(args.verbose)

--- a/onnxruntime/python/tools/transformers/dev_benchmark.cmd
+++ b/onnxruntime/python/tools/transformers/dev_benchmark.cmd
@ -7,10 +7,9 @@ REM Please install PyTorch (see https://pytorch.org/) before running this benchm
 REM   GPU:   conda install pytorch torchvision cudatoolkit=10.1 -c pytorch
 REM   CPU:   conda install pytorch torchvision cpuonly -c pytorch

-REM When run_cli=true, this script is self-contained and you need not copy other files to run benchmarks
-REM                    it will use onnxruntime-tools package.
-REM If run_cli=false, it depends on other python script (*.py) files in this directory.
-set run_cli=false
+REM When use_package=true, this script is the only file you need to copy to run benchmarks.
+REM Otherwise, it will use the python script (*.py) files in this directory.
+set use_package=false

 REM only need once
 set run_install=false
@ -72,13 +71,12 @@ if %run_install% == true (
  )

  pip install --upgrade onnxconverter_common
-  pip install --upgrade onnxruntime-tools
-  pip install --upgrade git+https://github.com/huggingface/transformers
+  pip install --upgrade transformers
 )

-if %run_cli% == true (
-  echo Use onnxruntime_tools.transformers.benchmark
-  set optimizer_script=-m onnxruntime_tools.transformers.benchmark
+if %use_package% == true (
+  echo Use onnxruntime.transformers.benchmark
+  set optimizer_script=-m onnxruntime.transformers.benchmark
 ) else (
  set optimizer_script=benchmark.py
 )
--- a/onnxruntime/python/tools/transformers/gpt2_helper.py
+++ b/onnxruntime/python/tools/transformers/gpt2_helper.py
@ -31,7 +31,7 @@ class GPT2ModelNoPastState(GPT2Model):
        super().__init__(config)

    def forward(self, input_ids):
-        return super().forward(input_ids, use_cache=False)
+        return super().forward(input_ids, use_cache=False, return_dict=False)


 class MyGPT2Model(GPT2Model):
@ -40,11 +40,26 @@ class MyGPT2Model(GPT2Model):
    def __init__(self, config):
        super().__init__(config)

+    @staticmethod
+    def post_process(result, num_layer):
+        if isinstance(result[1][0], tuple) or isinstance(result[1][0], list):
+            assert len(result[1]) == num_layer and len(result[1][0]) == 2 #and len(result[1][0][0].shape) == 4 and result[1][0][0].shape == result[1][0][1].shape
+            present = []
+            for i in range(num_layer):
+                # Since transformers v4.*, past key and value are returned as separate outputs.
+                # Here we concatenate them into one tensor to be compatible with the Attention operator.
+                present.append(torch.cat((result[1][i][0].unsqueeze(0), result[1][i][1].unsqueeze(0)), dim=0))
+            return (result[0], tuple(present))
+
+        return result
+
    def forward(self, input_ids, position_ids, attention_mask, *past):
-        return super().forward(input_ids,
-                               position_ids=position_ids,
-                               attention_mask=attention_mask,
-                               past_key_values=past)
+        result = super().forward(input_ids,
+                                 position_ids=position_ids,
+                                 attention_mask=attention_mask,
+                                 past_key_values=past,
+                                 return_dict=False)
+        return MyGPT2Model.post_process(result, self.config.n_layer)


 class MyGPT2LMHeadModel(GPT2LMHeadModel):
@ -54,10 +69,13 @@ class MyGPT2LMHeadModel(GPT2LMHeadModel):
        super().__init__(config)

    def forward(self, input_ids, position_ids, attention_mask, *past):
-        return super().forward(input_ids,
-                               position_ids=position_ids,
-                               attention_mask=attention_mask,
-                               past_key_values=past)
+        result = super().forward(input_ids,
+                                 position_ids=position_ids,
+                                 attention_mask=attention_mask,
+                                 past_key_values=past,
+                                 return_dict=False)
+
+        return MyGPT2Model.post_process(result, self.config.n_layer)


 class MyGPT2LMHeadModel_NoPadding(GPT2LMHeadModel):
@ -216,6 +234,7 @@ class Gpt2Helper:

        is_all_close = is_close
        num_layers = len(ort_outputs) - 1
+
        for layer in range(num_layers):
            is_close = numpy.allclose(ort_outputs[1 + layer],
                                      torch_outputs[1][layer].cpu().numpy(),
@ -288,10 +307,12 @@ class Gpt2Helper:
            input_names.append('attention_mask')
        input_names.extend(past_names)

+        assert len(outputs) == 2 and len(outputs[1]) == num_layer
+
        logger.info(
            f"Shapes: input_ids={dummy_inputs.input_ids.shape} past={dummy_inputs.past[0].shape} output={outputs[0].shape} present={outputs[1][0].shape}"
        )
-
+    
        Path(onnx_model_path).parent.mkdir(parents=True, exist_ok=True)

        torch.onnx.export(model,
--- a/onnxruntime/python/tools/transformers/huggingface_models.py
+++ b/onnxruntime/python/tools/transformers/huggingface_models.py
@ -26,67 +26,44 @@ MODELS = {
    "bert-base-uncased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
    "bert-large-uncased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
    "bert-base-cased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
-    "bert-large-cased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
-    "bert-base-multilingual-uncased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
-    "bert-base-multilingual-cased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
-    "bert-base-chinese": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
-    "bert-base-german-cased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
-    "bert-large-uncased-whole-word-masking": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
-    "bert-large-cased-whole-word-masking": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
    "bert-large-uncased-whole-word-masking-finetuned-squad": (["input_ids", "attention_mask",
                                                               "token_type_ids"], 11, False, "bert"),
-    "bert-large-cased-whole-word-masking-finetuned-squad": (["input_ids", "attention_mask",
-                                                             "token_type_ids"], 11, False, "bert"),
    "bert-base-cased-finetuned-mrpc": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
-    "bert-base-german-dbmdz-cased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
-    "bert-base-german-dbmdz-uncased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
-    # todo: more models to add
-    # GPT
-    "openai-gpt": (["input_ids"], 11, False, "gpt2"),  # no past state inputs
-    # GPT-2
-    "gpt2": (["input_ids"], 11, False, "gpt2"),  # no past state inputs & outputs
+
+    # GPT (no past state)
+    "openai-gpt": (["input_ids"], 11, False, "gpt2"),
+    # GPT-2 (no past state, use benchmark_gpt2.py for past_key_values)
+    "gpt2": (["input_ids"], 11, False, "gpt2"),
    "gpt2-medium": (["input_ids"], 11, False, "gpt2"),
-    "gpt2-large":
-    (["input_ids"], 11, True,
-     "gpt2"),  # Model>2GB. Need use_external_data_format=True to export it. No past state inputs for GPT models.
+    "gpt2-large": (["input_ids"], 11, True, "gpt2"),
    "gpt2-xl": (["input_ids"], 11, True, "gpt2"),
-    "distilgpt2": (["input_ids"], 11, False, "gpt2"),  # no past state inputs & outputs
+    "distilgpt2": (["input_ids"], 11, False, "gpt2"),
    # Transformer-XL
    #"transfo-xl-wt103": (["input_ids"], 11, False, "bert"),
    # XLNet
-    #"xlnet-base-cased": (["input_ids"], 12, False, "bert"), # Models uses Einsum, which need opset version 12 and PyTorch 1.5.0 or above.
-    #"xlnet-large-cased": (["input_ids"], 12, False, "bert"), # Models uses Einsum, which need opset version 12 and PyTorch 1.5.0 or above.
+    "xlnet-base-cased": (["input_ids"], 12, False, "bert"),
+    "xlnet-large-cased": (["input_ids"], 12, False, "bert"),
    # XLM
    "xlm-mlm-en-2048": (["input_ids"], 11, True, "bert"),
    "xlm-mlm-ende-1024": (["input_ids"], 11, False, "bert"),
    "xlm-mlm-enfr-1024": (["input_ids"], 11, False, "bert"),
-    "xlm-mlm-enro-1024": (["input_ids"], 11, False, "bert"),
-    "xlm-mlm-xnli15-1024": (["input_ids"], 11, False, "bert"),
-    "xlm-mlm-tlm-xnli15-1024": (["input_ids"], 11, False, "bert"),
-    "xlm-clm-enfr-1024": (["input_ids"], 11, False, "bert"),
-    "xlm-clm-ende-1024": (["input_ids"], 11, False, "bert"),
-    "xlm-mlm-17-1280": (["input_ids"], 11, True, "bert"),
-    "xlm-mlm-100-1280": (["input_ids"], 11, True, "bert"),
+    # XLM-RoBERTa
+    "xlm-roberta-base": (["input_ids"], 12, False, "bert"),
    # RoBERTa
    "roberta-base": (["input_ids", "attention_mask"], 11, False, "bert"),
    "roberta-large": (["input_ids", "attention_mask"], 11, False, "bert"),
    "roberta-large-mnli": (["input_ids", "attention_mask"], 11, False, "bert"),
+    "deepset/roberta-base-squad2": (["input_ids", "attention_mask"], 11, False, "bert"),
    "distilroberta-base": (["input_ids", "attention_mask"], 11, False, "bert"),
-    "roberta-base-openai-detector": (["input_ids", "attention_mask"], 11, False, "bert"),
-    "roberta-large-openai-detector": (["input_ids", "attention_mask"], 11, False, "bert"),
+
    # DistilBERT
    "distilbert-base-uncased": (["input_ids", "attention_mask"], 11, False, "bert"),
    "distilbert-base-uncased-distilled-squad": (["input_ids", "attention_mask"], 11, False, "bert"),
-    "distilbert-base-cased": (["input_ids", "attention_mask"], 11, False, "bert"),
-    "distilbert-base-cased-distilled-squad": (["input_ids", "attention_mask"], 11, False, "bert"),
-    "distilbert-base-german-cased": (["input_ids", "attention_mask"], 11, False, "bert"),
-    "distilbert-base-multilingual-cased": (["input_ids", "attention_mask"], 11, False, "bert"),
    # CTRL
    "ctrl": (["input_ids"], 11, True, "bert"),
    # CamemBERT
    "camembert-base": (["input_ids"], 11, False, "bert"),
    # ALBERT
-    # Models uses Einsum, which need opset version 12 and PyTorch 1.5.0 or above.
    "albert-base-v1": (["input_ids"], 12, False, "bert"),
    "albert-large-v1": (["input_ids"], 12, False, "bert"),
    "albert-xlarge-v1": (["input_ids"], 12, True, "bert"),
@ -95,36 +72,37 @@ MODELS = {
    "albert-large-v2": (["input_ids"], 12, False, "bert"),
    "albert-xlarge-v2": (["input_ids"], 12, True, "bert"),
    #"albert-xxlarge-v2": (["input_ids"], 12, True, "bert"),
-    # T5
-    "t5-small": (["input_ids"], 12, False, "bert"),
-    "t5-base": (["input_ids"], 12, False, "bert"),
-    "t5-large": (["input_ids"], 12, True, "bert"),
-    "t5-3b": (["input_ids"], 12, True, "bert"),
-    "t5-11b": (["input_ids"], 12, True, "bert"),
+    # T5 (use benchmark_t5.py instead)
+    #"t5-small": (["input_ids"], 12, False, "bert"),
+    #"t5-base": (["input_ids"], 12, False, "bert"),
+    #"t5-large": (["input_ids"], 12, True, "bert"),
+    #"t5-3b": (["input_ids"], 12, True, "bert"),
+    #"t5-11b": (["input_ids"], 12, True, "bert"),
+    #"valhalla/t5-small-qa-qg-hl": (["input_ids"], 12, True, "bert"),
    # XLM-RoBERTa
    "xlm-roberta-base": (["input_ids"], 11, False, "bert"),
    "xlm-roberta-large": (["input_ids"], 11, True, "bert"),
    # FlauBERT
    "flaubert/flaubert_small_cased": (["input_ids"], 11, False, "bert"),
-    "flaubert/flaubert_base_uncased": (["input_ids"], 11, False, "bert"),
+    #"flaubert/flaubert_base_uncased": (["input_ids"], 11, False, "bert"),
    "flaubert/flaubert_base_cased": (["input_ids"], 11, False, "bert"),
-    "flaubert/flaubert_large_cased": (["input_ids"], 11, False, "bert"),
+    #"flaubert/flaubert_large_cased": (["input_ids"], 11, False, "bert"),
    # Bart
    "facebook/bart-large": (["input_ids"], 11, False, "bert"),
    "facebook/bart-base": (["input_ids"], 11, False, "bert"),
    "facebook/bart-large-mnli": (["input_ids"], 11, False, "bert"),
    "facebook/bart-large-cnn": (["input_ids"], 11, False, "bert"),
-    #"facebook/mbart-large-en-ro": (["input_ids"], 11, True, "bert"),
+
    # DialoGPT
    "microsoft/DialoGPT-small": (["input_ids"], 11, False, "gpt2"),
    "microsoft/DialoGPT-medium": (["input_ids"], 11, False, "gpt2"),
-    "microsoft/DialoGPT-large": (["input_ids"], 11, True, "gpt2"),
+    #"microsoft/DialoGPT-large": (["input_ids"], 11, True, "gpt2"),
    # Reformer
    #"google/reformer-enwik8": (["input_ids"], 11, False, "bert"),
    #"google/reformer-crime-and-punishment": (["input_ids"], 11, False, "bert"),
    # MarianMT
    #"Helsinki-NLP/opus-mt-ROMANCE-en": (["input_ids"], 12, False, "bert"),
-    # Longformer
+    # Longformer (use benchmark_longformer.py instead)
    #"allenai/longformer-base-4096": (["input_ids"], 12, False, "bert"),
    #"allenai/longformer-large-4096": (["input_ids"], 12, False, "bert"),
 }
--- a/onnxruntime/python/tools/transformers/run_benchmark.sh
+++ b/onnxruntime/python/tools/transformers/run_benchmark.sh
@ -5,13 +5,12 @@
 # --------------------------------------------------------------------------
 # This measures the performance of OnnxRuntime, PyTorch and TorchScript on transformer models.
 # Please install PyTorch (see https://pytorch.org/) before running this benchmark. Like the following:
-# GPU:   conda install pytorch torchvision cudatoolkit=10.1 -c pytorch
+# GPU:   conda install pytorch torchvision cudatoolkit=11.0 -c pytorch
 # CPU:   conda install pytorch torchvision cpuonly -c pytorch

-# When run_cli=true, this script is self-contained and you need not copy other files to run benchmarks
-#                    it will use onnxruntime-tools package.
-# If run_cli=false, it depends on other python script (*.py) files in this directory.
-run_cli=true
+# When use_package=true, you need not copy other files to run benchmarks except this sh file.
+# Otherwise, it will use python script (*.py) files in this directory.
+use_package=true

 # only need once
 run_install=true
@ -50,7 +49,7 @@ sequence_lengths="8 16 32 64 128 256 512 1024"
 input_counts=1

 # Pretrained transformers models can be a subset of: bert-base-cased roberta-base gpt2 distilgpt2 distilbert-base-uncased
-models_to_test="bert-base-cased roberta-base gpt2"
+models_to_test="bert-base-cased roberta-base distilbert-base-uncased"

 # If you have mutliple GPUs, you can choose one GPU for test. Here is an example to use the second GPU:
 # export CUDA_VISIBLE_DEVICES=1
@ -81,7 +80,7 @@ fi


 if [ "$run_install" = true ] ; then
-  pip uninstall --yes ort_nightly
+  pip uninstall --yes ort-nightly ort-gpu-nightly
  pip uninstall --yes onnxruntime
  pip uninstall --yes onnxruntime-gpu
  if [ "$run_cpu_fp32" = true ] || [ "$run_cpu_int8" = true ]; then
@ -89,14 +88,12 @@ if [ "$run_install" = true ] ; then
  else
    pip install onnxruntime-gpu
  fi
-  pip install --upgrade onnxconverter_common
-  pip install --upgrade onnxruntime-tools
-  pip install --upgrade transformers
+  pip install --upgrade onnx coloredlogs packaging psutil py3nvml onnxconverter_common numpy transformers
 fi

-if [ "$run_cli" = true ] ; then
-  echo "Use onnxruntime_tools.transformers.benchmark"
-  benchmark_script="-m onnxruntime_tools.transformers.benchmark"
+if [ "$use_package" = true ] ; then
+  echo "Use onnxruntime.transformers.benchmark"
+  benchmark_script="-m onnxruntime.transformers.benchmark"
 else
  benchmark_script="benchmark.py"
 fi
@ -187,4 +184,4 @@ fi
 # Remove duplicated lines
 awk '!x[$0]++' ./result.csv > summary_result.csv
 awk '!x[$0]++' ./fusion.csv > summary_fusion.csv
-awk '!x[$0]++' ./detail.csv > summary_detail.csv
+awk '!x[$0]++' ./detail.csv > summary_detail.csv
--- a/onnxruntime/python/tools/transformers/test/test_gpt2.py
+++ b/onnxruntime/python/tools/transformers/test/test_gpt2.py
@ -26,7 +26,7 @@ class TestGpt2(unittest.TestCase):

    def test_gpt2_fp16(self):
        if 'CUDAExecutionProvider' in onnxruntime.get_available_providers():
-            self.run_benchmark_gpt2('-m gpt2 --precision fp16 -o -b 1 -s 128')
+            self.run_benchmark_gpt2('-m gpt2 --precision fp16 -o -b 1 -s 128 --use_gpu')

    def test_gpt2_int8(self):
        self.run_benchmark_gpt2('-m gpt2 --precision int8 -o -b 1 -s 128')