diff --git a/onnxruntime/python/tools/transformers/README.md b/onnxruntime/python/tools/transformers/README.md
index a9805fe5a9..99d015751d 100644
--- a/onnxruntime/python/tools/transformers/README.md
+++ b/onnxruntime/python/tools/transformers/README.md
@@ -210,7 +210,7 @@ For GPU, please append --use_gpu to the command.
 bert_perf_test.py can be used to check the BERT model inference performance. Below are examples:
 
 ```console
-python -m onnxruntime.transformers.bert_perf_test --model optimized_model_cpu.onnx --batch_size 1 --sequence_length 128 --samples 100 --test_times 10 --inclusive
+python -m onnxruntime.transformers.bert_perf_test --model optimized_model_cpu.onnx --batch_size 1 --sequence_length 128
 ```
 
 For GPU, please append --use_gpu to the command.
@@ -219,7 +219,7 @@ After test is finished, a file like perf_results_CPU_B1_S128_<date_time>.txt or
 
 ## Profiling
 
-profiler.py can be used to run profiling on a transformer model. It can help figure out the bottleneck of a model, and time spent on a node or subgraph.
+profiler.py can be used to run profiling on a transformer model. It can help identify the bottleneck of a model and the CPU time spent on a node or subgraph.
 
 Examples commands:
 
diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py
index 3fed9c88fa..20f7a52113 100644
--- a/onnxruntime/python/tools/transformers/benchmark.py
+++ b/onnxruntime/python/tools/transformers/benchmark.py
@@ -80,9 +80,6 @@ def run_onnxruntime(use_gpu, model_names, model_class, precision, num_threads, b
         )
         return results
 
-    if (not use_gpu) and ('CUDAExecutionProvider' in onnxruntime.get_available_providers()):
-        logger.warning("Please install onnxruntime package instead of onnxruntime-gpu to get best cpu performance.")
-
     for model_name in model_names:
         all_input_names = MODELS[model_name][0]
         for num_inputs in input_counts:
diff --git a/onnxruntime/python/tools/transformers/benchmark_gpt2.py b/onnxruntime/python/tools/transformers/benchmark_gpt2.py
index 5354db9ed7..ed5d3de6c8 100644
--- a/onnxruntime/python/tools/transformers/benchmark_gpt2.py
+++ b/onnxruntime/python/tools/transformers/benchmark_gpt2.py
@@ -16,6 +16,7 @@ import argparse
 import logging
 import torch
 import onnx
+from packaging import version
 from transformers import AutoConfig
 from gpt2_helper import Gpt2Helper, MODEL_CLASSES, DEFAULT_TOLERANCE, PRETRAINED_GPT2_MODELS
 from quantize_helper import QuantizeHelper
@@ -113,6 +114,10 @@ def parse_arguments(argv=None):
 
 
 def main(args):
+    from transformers import __version__ as transformers_version
+    if version.parse(transformers_version) < version.parse("3.1.0"): # past_key_values name does not exist in 3.0.2 or older
+        raise RuntimeError("This tool requires transformers 3.1.0 or later.")
+
     logger.info(f"Arguments:{args}")
     if args.precision == Precision.FLOAT16:
         assert args.optimize_onnx and args.use_gpu, "fp16 requires --optimize_onnx --use_gpu"
@@ -279,7 +284,7 @@ def main(args):
     return csv_filename
 
 
-if __name__ == '__main__':
+if __name__ == '__main__':       
     args = parse_arguments()
     setup_logger(args.verbose)
     main(args)
diff --git a/onnxruntime/python/tools/transformers/bert_perf_test.py b/onnxruntime/python/tools/transformers/bert_perf_test.py
index 16d8a04939..ff7738d4a8 100644
--- a/onnxruntime/python/tools/transformers/bert_perf_test.py
+++ b/onnxruntime/python/tools/transformers/bert_perf_test.py
@@ -35,44 +35,10 @@ class TestSetting:
     sequence_length: int
     test_cases: int
     test_times: int
-    contiguous: bool
     use_gpu: bool
-    warmup: bool
-    omp_num_threads: int
-    omp_wait_policy: str
     intra_op_num_threads: int
     seed: int
     verbose: bool
-    contiguous: bool
-    inclusive: bool
-    extra_latency: float = 0
-
-    def get_setting(self) -> str:
-        return f"batch_size={self.batch_size},sequence_length={self.sequence_length},test_cases={self.test_cases},test_times={self.test_times},contiguous={self.contiguous},use_gpu={self.use_gpu},warmup={self.warmup}"
-
-    def check(self, intra_op_threads, omp_threads, omp_policy) -> bool:
-        if intra_op_threads is None:
-            if self.intra_op_num_threads is not None and self.intra_op_num_threads > 0:
-                return False
-        else:
-            assert intra_op_threads > 0
-            if not (self.intra_op_num_threads is None or self.intra_op_num_threads == intra_op_threads):
-                return False
-
-        if omp_threads is None:
-            if self.omp_num_threads is not None and self.omp_num_threads > 0:
-                return False
-        else:
-            assert omp_threads > 0
-            if not (self.omp_num_threads is None or self.omp_num_threads == omp_threads):
-                return False
-
-        if self.omp_wait_policy is not None:
-            if omp_policy != self.omp_wait_policy:
-                return False
-
-        return True
-
 
 @dataclass
 class ModelSetting:
@@ -84,22 +50,17 @@ class ModelSetting:
 
 
 def create_session(model_path, use_gpu, intra_op_num_threads, graph_optimization_level=None):
-    # Import onnxruntime shall be after OpenMP environment variable setting.
-    # So we put the import in function to delay importing instead of top of this script.
     import onnxruntime
 
     if use_gpu and ('CUDAExecutionProvider' not in onnxruntime.get_available_providers()):
         print(
             "Warning: Please install onnxruntime-gpu package instead of onnxruntime, and use a machine with GPU for testing gpu performance."
         )
-    elif (not use_gpu) and ('CUDAExecutionProvider' in onnxruntime.get_available_providers()):
-        print("Warning: Please install onnxruntime package instead of onnxruntime-gpu to get best cpu performance.")
 
     if intra_op_num_threads is None and graph_optimization_level is None:
         session = onnxruntime.InferenceSession(model_path)
     else:
-        execution_providers = ['CPUExecutionProvider'
-                               ] if not use_gpu else ['CUDAExecutionProvider', 'CPUExecutionProvider']
+        execution_providers = ['CPUExecutionProvider'] if not use_gpu else ['CUDAExecutionProvider', 'CPUExecutionProvider']
 
         sess_options = onnxruntime.SessionOptions()
         sess_options.execution_mode = onnxruntime.ExecutionMode.ORT_SEQUENTIAL
@@ -127,8 +88,8 @@ def create_session(model_path, use_gpu, intra_op_num_threads, graph_optimization
     return session
 
 
-def onnxruntime_inference(session, all_inputs, output_names, warmup=True):
-    if warmup and len(all_inputs) > 0:
+def onnxruntime_inference(session, all_inputs, output_names):
+    if len(all_inputs) > 0:
         # Use a random input as warm up.
         session.run(output_names, random.choice(all_inputs))
 
@@ -142,57 +103,16 @@ def onnxruntime_inference(session, all_inputs, output_names, warmup=True):
         latency_list.append(latency)
     return results, latency_list
 
-
-def get_contiguous_inputs(all_inputs):
-    """
-    Convert input to be contiguous.
-    """
-    contiguous_inputs = []
-
-    start_time = timeit.default_timer()
-    for test_case_id, inputs in enumerate(all_inputs):
-        real_inputs = {}
-        for key, value in inputs.items():
-            real_inputs[key] = np.ascontiguousarray(value)
-        contiguous_inputs.append(real_inputs)
-    latency = timeit.default_timer() - start_time
-
-    average_latency_ms = latency / len(contiguous_inputs) * 1000
-    return contiguous_inputs, average_latency_ms
-
-
 def to_string(model_path, session, test_setting):
     sess_options = session.get_session_options()
-    option = "model={}".format(os.path.basename(model_path))
-    option += ",graph_optimization_level={},intra_op_num_threads={}".format(sess_options.graph_optimization_level,
+    option = "model={},".format(os.path.basename(model_path))
+    option += "graph_optimization_level={},intra_op_num_threads={},".format(sess_options.graph_optimization_level,
                                                                             sess_options.intra_op_num_threads).replace(
                                                                                 'GraphOptimizationLevel.ORT_', '')
-    option += ",OMP_NUM_THREADS={}".format(os.environ["OMP_NUM_THREADS"] if "OMP_NUM_THREADS" in os.environ else "")
-    option += ",OMP_WAIT_POLICY={}".format(os.environ["OMP_WAIT_POLICY"] if "OMP_WAIT_POLICY" in os.environ else "")
-    option += ",{}".format(test_setting.get_setting())
+    option += f"batch_size={test_setting.batch_size},sequence_length={test_setting.sequence_length},test_cases={test_setting.test_cases},test_times={test_setting.test_times},use_gpu={test_setting.use_gpu}"
     return option
 
-
-def setup_openmp_environ(omp_num_threads, omp_wait_policy):
-    if omp_num_threads is None:
-        if "OMP_NUM_THREADS" in os.environ:
-            del os.environ["OMP_NUM_THREADS"]
-    else:
-        os.environ["OMP_NUM_THREADS"] = str(omp_num_threads)
-
-    if omp_wait_policy is None:
-        if "OMP_WAIT_POLICY" in os.environ:
-            del os.environ["OMP_WAIT_POLICY"]
-    else:
-        assert omp_wait_policy in ["ACTIVE", "PASSIVE"], f"{omp_wait_policy} is not a valid policy"
-        os.environ["OMP_WAIT_POLICY"] = omp_wait_policy
-
-
-def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads, omp_num_threads,
-                 omp_wait_policy):
-    # Environment variable shall be set before import onnxruntime.
-    setup_openmp_environ(omp_num_threads, omp_wait_policy)
-
+def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads):
     session = create_session(model_setting.model_path, test_setting.use_gpu, intra_op_num_threads,
                              model_setting.opt_level)
     output_names = [output.name for output in session.get_outputs()]
@@ -206,11 +126,11 @@ def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op
 
     all_latency_list = []
     for i in range(test_setting.test_times):
-        results, latency_list = onnxruntime_inference(session, all_inputs, output_names, test_setting.warmup)
+        results, latency_list = onnxruntime_inference(session, all_inputs, output_names)
         all_latency_list.extend(latency_list)
 
     # latency in miliseconds
-    latency_ms = np.array(all_latency_list) * 1000 + test_setting.extra_latency
+    latency_ms = np.array(all_latency_list) * 1000
 
     average_latency = statistics.mean(latency_ms)
     latency_50 = np.percentile(latency_ms, 50)
@@ -226,91 +146,31 @@ def run_one_test(model_setting, test_setting, perf_results, all_inputs, intra_op
                                                                 format(throughput, '.2f')))
 
 
-def launch_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads, omp_num_threads,
-                omp_wait_policy):
-    if not test_setting.check(intra_op_num_threads, omp_num_threads, omp_wait_policy):
-        return
-
+def launch_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads):
     process = multiprocessing.Process(target=run_one_test,
-                                      args=(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads,
-                                            omp_num_threads, omp_wait_policy))
+                                      args=(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads))
     process.start()
     process.join()
 
 
-def run_perf_tests(model_setting, test_setting, perf_results, test_all, all_inputs):
+def run_perf_tests(model_setting, test_setting, perf_results, all_inputs):
+    if (test_setting.intra_op_num_threads is not None):
+        launch_test(model_setting, test_setting, perf_results, all_inputs, test_setting.intra_op_num_threads)
+        return
+
     cpu_count = psutil.cpu_count(logical=False)
     logical_cores = psutil.cpu_count(logical=True)
 
-    candidate_threads = list(set([1, logical_cores, cpu_count]))
-
-    if (test_setting.intra_op_num_threads is not None) or (test_setting.omp_num_threads is not None):
-
-        if test_setting.intra_op_num_threads is not None:
-            intra_op_threads = [test_setting.intra_op_num_threads]
-        else:
-            intra_op_threads = [None] + candidate_threads
-
-        if test_setting.omp_num_threads is not None:
-            omp_threads = [test_setting.omp_num_threads]
-        else:
-            omp_threads = [None] + candidate_threads
-
-        if test_setting.omp_wait_policy is not None:
-            omp_policies = [test_setting.omp_wait_policy]
-        else:
-            omp_policies = [None, 'PASSIVE', 'ACTIVE']
-
-        for it in intra_op_threads:
-            for ot in omp_threads:
-                for op in omp_policies:
-                    launch_test(model_setting, test_setting, perf_results, all_inputs, it, ot, op)
-        return
-
-    # Test a setting without any setting as baseline 1.
-    launch_test(model_setting, test_setting, perf_results, all_inputs, None, None, None)
-
-    if not test_setting.use_gpu:
-        # For CPU: intra_op_num_threads = 1, omp_num_threads=None, omp_wait_policy=None
-        # Another setting without environment variable as baseline 2.
-        launch_test(model_setting, test_setting, perf_results, all_inputs, 1, None, None)
-    else:
-        # For GPU, we test two more settings by default:
-        # (1) intra_op_num_threads = 1, omp_num_threads=cpu_count, omp_wait_policy=PASSIVE
-        # (2) intra_op_num_threads = logical_cores, omp_num_threads=1, omp_wait_policy=ACTIVE
-        launch_test(model_setting, test_setting, perf_results, all_inputs, 1, cpu_count, 'PASSIVE')
-
-        launch_test(model_setting, test_setting, perf_results, all_inputs, logical_cores, 1, 'ACTIVE')
-
-    # GPU latency is not sensitive to these settings. No need to test many combinations.
-    # Skip remaining settings for GPU without --all flag.
-    if test_setting.use_gpu and not test_all:
-        return
+    candidate_threads = list(set([logical_cores, cpu_count]))
+    for i in range(1, min(16, logical_cores)):
+        if i not in candidate_threads:
+            candidate_threads.append(i)
+    candidate_threads.sort(reverse=True)
 
     for intra_op_num_threads in candidate_threads:
-        for omp_num_threads in candidate_threads:
-            # skip settings that are very slow
-            if intra_op_num_threads == 1 and omp_num_threads == 1 and logical_cores != 1:
-                continue
-
-            # When logical and physical cores are not the same, there are many combinations.
-            # Remove some settings are not good normally.
-            if logical_cores > cpu_count:
-                if omp_num_threads == logical_cores and intra_op_num_threads != 1:
-                    continue
-                if intra_op_num_threads == logical_cores and omp_num_threads != 1:
-                    continue
-
-            if not test_all:
-                if intra_op_num_threads != 1 and omp_num_threads != 1:
-                    continue
-
-            for omp_wait_policy in ['ACTIVE', 'PASSIVE']:
-                launch_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads,
-                            omp_num_threads, omp_wait_policy)
-
-
-def run_performance(model_setting, test_setting, perf_results, test_all):
+        launch_test(model_setting, test_setting, perf_results, all_inputs, intra_op_num_threads)
+        
+def run_performance(model_setting, test_setting, perf_results):
     input_ids, segment_ids, input_mask = get_bert_inputs(model_setting.model_path, model_setting.input_ids_name,
                                                          model_setting.segment_ids_name, model_setting.input_mask_name)
 
@@ -327,29 +187,25 @@ def run_performance(model_setting, test_setting, perf_results, test_all):
                                     segment_ids,
                                     input_mask,
                                     random_mask_length=False)
-    if test_setting.contiguous:
-        all_inputs, contiguous_latency = get_contiguous_inputs(all_inputs)
-        print("Extra latency for converting inputs to contiguous: {} ms".format(format(contiguous_latency, '.2f')))
-        test_setting.extra_latency = contiguous_latency if test_setting.inclusive else 0
 
-    run_perf_tests(model_setting, test_setting, perf_results, test_all, all_inputs)
+    run_perf_tests(model_setting, test_setting, perf_results, all_inputs)
 
 
 def parse_arguments():
     parser = argparse.ArgumentParser()
     parser.add_argument('--model', required=True, type=str, help="bert onnx model path")
 
-    parser.add_argument('--batch_size',
+    parser.add_argument('-b', '--batch_size',
                         required=True,
                         type=int,
                         nargs="+",
                         help="batch size of input. Allow one or multiple values in the range of [1, 128].")
 
-    parser.add_argument('--sequence_length', required=True, type=int, help="maximum sequence length of input")
+    parser.add_argument('-s', '--sequence_length', required=True, type=int, help="maximum sequence length of input")
 
     parser.add_argument('--samples', required=False, type=int, default=10, help="number of samples to be generated")
 
-    parser.add_argument('--test_times',
+    parser.add_argument('-t', '--test_times',
                         required=False,
                         type=int,
                         default=0,
@@ -375,40 +231,12 @@ def parse_arguments():
     parser.add_argument('--use_gpu', required=False, action='store_true', help="use GPU")
     parser.set_defaults(use_gpu=False)
 
-    parser.add_argument('--inclusive',
-                        required=False,
-                        action='store_true',
-                        help="include the latency of converting array to contiguous")
-    parser.set_defaults(inclusive=False)
-
-    parser.add_argument('--all', required=False, action='store_true', help="test all candidate settings")
-    parser.set_defaults(all=False)
-
-    parser.add_argument('--omp_num_threads',
-                        required=False,
-                        type=int,
-                        default=None,
-                        help=">0, set OMP_NUM_THREADS value. 0, do not set")
-
-    parser.add_argument('--intra_op_num_threads',
+    parser.add_argument('-n', '--intra_op_num_threads',
                         required=False,
                         type=int,
                         default=None,
                         help=">=0, set intra_op_num_threads")
 
-    parser.add_argument('--omp_wait_policy',
-                        required=False,
-                        type=str,
-                        default=None,
-                        choices=['ACTIVE', 'PASSIVE'],
-                        help="OMP_WAIT_POLICY")
-
-    parser.add_argument('--contiguous', required=False, action='store_true', help="contiguous input")
-    parser.set_defaults(contiguous=False)
-
-    parser.add_argument('--no_warmup', required=False, action='store_true', help="do not use one sample for warm-up.")
-    parser.set_defaults(no_warmup=False)
-
     parser.add_argument('--input_ids_name', required=False, type=str, default=None, help="input name for input ids")
     parser.add_argument('--segment_ids_name', required=False, type=str, default=None, help="input name for segment ids")
     parser.add_argument('--input_mask_name',
@@ -443,18 +271,13 @@ def main():
             args.sequence_length,
             args.samples,
             args.test_times,
-            None,  #contiguous
             args.use_gpu,
-            not args.no_warmup,
-            args.omp_num_threads,
-            args.omp_wait_policy,
             args.intra_op_num_threads,
             args.seed,
-            args.verbose,
-            args.contiguous,
-            args.inclusive)
+            args.verbose)
+
         print("test setting", test_setting)
-        run_performance(model_setting, test_setting, perf_results, args.all)
+        run_performance(model_setting, test_setting, perf_results)
 
     # Sort the results so that the first one has smallest latency.
     sorted_results = sorted(perf_results.items(), reverse=False, key=lambda x: x[1])
diff --git a/onnxruntime/python/tools/transformers/bert_test_data.py b/onnxruntime/python/tools/transformers/bert_test_data.py
index c2e4435977..5e008db16c 100644
--- a/onnxruntime/python/tools/transformers/bert_test_data.py
+++ b/onnxruntime/python/tools/transformers/bert_test_data.py
@@ -140,7 +140,8 @@ def generate_test_data(batch_size, sequence_length, test_cases, seed, verbose, i
 
 
 def get_graph_input_from_embed_node(onnx_model, embed_node, input_index):
-    assert input_index < len(embed_node.input)
+    if input_index >= len(embed_node.input):
+        return None
 
     input = embed_node.input[input_index]
     graph_input = onnx_model.find_graph_input(input)
@@ -195,6 +196,15 @@ def find_bert_inputs(onnx_model, input_ids_name=None, segment_ids_name=None, inp
         input_ids = get_graph_input_from_embed_node(onnx_model, embed_node, 0)
         segment_ids = get_graph_input_from_embed_node(onnx_model, embed_node, 1)
         input_mask = get_graph_input_from_embed_node(onnx_model, embed_node, 7)
+
+        if input_mask is None:
+            for input in graph_inputs:
+                input_name_lower = input.name.lower()
+                if "mask" in input_name_lower:
+                    input_mask = input
+        if input_mask is None:
+            raise ValueError(f"Failed to find attention mask input")
+            
         return input_ids, segment_ids, input_mask
 
     # Try guess the inputs based on naming.
@@ -231,7 +241,7 @@ def get_bert_inputs(onnx_file, input_ids_name=None, segment_ids_name=None, input
         model.ParseFromString(f.read())
 
     onnx_model = OnnxModel(model)
-    find_bert_inputs(onnx_model, input_ids_name, segment_ids_name, input_mask_name)
+    return find_bert_inputs(onnx_model, input_ids_name, segment_ids_name, input_mask_name)
 
 
 def parse_arguments():
diff --git a/onnxruntime/python/tools/transformers/compare_bert_results.py b/onnxruntime/python/tools/transformers/compare_bert_results.py
index c1dec79092..5837581893 100644
--- a/onnxruntime/python/tools/transformers/compare_bert_results.py
+++ b/onnxruntime/python/tools/transformers/compare_bert_results.py
@@ -21,19 +21,17 @@ from datetime import datetime
 from onnx import ModelProto, TensorProto, numpy_helper
 from onnx_model import OnnxModel
 from bert_test_data import get_bert_inputs, generate_test_data, output_test_data
-from bert_perf_test import create_session, onnxruntime_inference, setup_openmp_environ
+from bert_perf_test import create_session, onnxruntime_inference
 
 
-def run_model(model_path, all_inputs, use_gpu, use_openmp, disable_optimization):
-    # Import onnxruntime shall be after OpenMP environment variable setting.
-    # So we put import here to delay importing.
+def run_model(model_path, all_inputs, use_gpu, disable_optimization):
     import onnxruntime
 
     graph_optimization_level = None
     if disable_optimization:
         graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL
 
-    intra_op_num_threads = 1 if use_openmp else psutil.cpu_count(logical=False)
+    intra_op_num_threads = psutil.cpu_count(logical=False)
 
     session = create_session(model_path, use_gpu, intra_op_num_threads, graph_optimization_level)
 
@@ -78,7 +76,7 @@ def compare(baseline_results, treatment_results, verbose, rtol=1e-3, atol=1e-4):
 
 
 def run_test(baseline_model, optimized_model, output_dir, batch_size, sequence_length, use_gpu, test_cases, seed,
-             use_openmp, verbose, rtol, atol, input_ids_name, segment_ids_name, input_mask_name):
+             verbose, rtol, atol, input_ids_name, segment_ids_name, input_mask_name):
 
     # Try deduce input names from optimized model.
     input_ids, segment_ids, input_mask = get_bert_inputs(optimized_model, input_ids_name, segment_ids_name,
@@ -95,16 +93,9 @@ def run_test(baseline_model, optimized_model, output_dir, batch_size, sequence_l
                                     input_mask,
                                     random_mask_length=True)
 
-    # OpenMP environment variables must be set before the very first "import onnxruntime"
-    if use_openmp:
-        setup_openmp_environ(omp_num_threads=psutil.cpu_count(logical=False), omp_wait_policy='ACTIVE')
-    else:
-        setup_openmp_environ(omp_num_threads=1, omp_wait_policy='ACTIVE')
-
     baseline_results, baseline_latency, output_names = run_model(baseline_model,
                                                                  all_inputs,
                                                                  use_gpu,
-                                                                 use_openmp,
                                                                  disable_optimization=True)
     if verbose:
         print("baseline average latency (all optimizations disabled): {} ms".format(
@@ -117,7 +108,6 @@ def run_test(baseline_model, optimized_model, output_dir, batch_size, sequence_l
     treatment_results, treatment_latency, treatment_output_names = run_model(optimized_model,
                                                                              all_inputs,
                                                                              use_gpu,
-                                                                             use_openmp,
                                                                              disable_optimization=False)
     if verbose:
         print("treatment average latency: {} ms".format(statistics.mean(treatment_latency) * 1000))
@@ -157,9 +147,6 @@ def parse_arguments():
     parser.add_argument('--use_gpu', required=False, action='store_true', help="use GPU")
     parser.set_defaults(use_gpu=False)
 
-    parser.add_argument('--openmp', required=False, action='store_true', help="use openmp")
-    parser.set_defaults(openmp=False)
-
     parser.add_argument('--verbose', required=False, action='store_true', help="print verbose information")
     parser.set_defaults(verbose=False)
 
@@ -180,7 +167,7 @@ def main():
         path.mkdir(parents=True, exist_ok=True)
 
     run_test(args.baseline_model, args.optimized_model, args.output_dir, args.batch_size, args.sequence_length,
-             args.use_gpu, args.samples, args.seed, args.openmp, args.verbose, args.rtol, args.atol, args.input_ids,
+             args.use_gpu, args.samples, args.seed, args.verbose, args.rtol, args.atol, args.input_ids,
              args.segment_ids, args.input_mask)
 
 
diff --git a/onnxruntime/python/tools/transformers/convert_to_onnx.py b/onnxruntime/python/tools/transformers/convert_to_onnx.py
index 7b2267905d..e88e4c5dff 100644
--- a/onnxruntime/python/tools/transformers/convert_to_onnx.py
+++ b/onnxruntime/python/tools/transformers/convert_to_onnx.py
@@ -23,6 +23,7 @@ import torch
 import numpy
 import json
 from pathlib import Path
+from packaging import version
 from transformers import AutoConfig
 from gpt2_helper import Gpt2Helper, MODEL_CLASSES, DEFAULT_TOLERANCE, PRETRAINED_GPT2_MODELS
 from gpt2_tester import Gpt2Tester
@@ -104,6 +105,10 @@ def parse_arguments():
 
 
 def main():
+    from transformers import __version__ as transformers_version
+    if version.parse(transformers_version) < version.parse("3.1.0"): # past_key_values name does not exist in 3.0.2 or older
+        raise RuntimeError("This tool requires transformers 3.1.0 or later.")
+
     args = parse_arguments()
     setup_logger(args.verbose)
 
diff --git a/onnxruntime/python/tools/transformers/dev_benchmark.cmd b/onnxruntime/python/tools/transformers/dev_benchmark.cmd
index 61553d8f2d..3f0b397a14 100644
--- a/onnxruntime/python/tools/transformers/dev_benchmark.cmd
+++ b/onnxruntime/python/tools/transformers/dev_benchmark.cmd
@@ -7,10 +7,9 @@ REM Please install PyTorch (see https://pytorch.org/) before running this benchm
 REM   GPU:   conda install pytorch torchvision cudatoolkit=10.1 -c pytorch
 REM   CPU:   conda install pytorch torchvision cpuonly -c pytorch
 
-REM When run_cli=true, this script is self-contained and you need not copy other files to run benchmarks
-REM                    it will use onnxruntime-tools package.
-REM If run_cli=false, it depends on other python script (*.py) files in this directory.
-set run_cli=false
+REM When use_package=true, you need not copy any file other than this cmd file to run benchmarks.
+REM Otherwise, it will use the Python script (*.py) files in this directory.
+set use_package=false
 
 REM only need once
 set run_install=false
@@ -72,13 +71,12 @@ if %run_install% == true (
   )
 
   pip install --upgrade onnxconverter_common
-  pip install --upgrade onnxruntime-tools
-  pip install --upgrade git+https://github.com/huggingface/transformers
+  pip install --upgrade transformers
 )
 
-if %run_cli% == true (
-  echo Use onnxruntime_tools.transformers.benchmark
-  set optimizer_script=-m onnxruntime_tools.transformers.benchmark
+if %use_package% == true (
+  echo Use onnxruntime.transformers.benchmark
+  set optimizer_script=-m onnxruntime.transformers.benchmark
 ) else (
   set optimizer_script=benchmark.py
 )
diff --git a/onnxruntime/python/tools/transformers/gpt2_helper.py b/onnxruntime/python/tools/transformers/gpt2_helper.py
index 5490c7267a..8079d6277b 100644
--- a/onnxruntime/python/tools/transformers/gpt2_helper.py
+++ b/onnxruntime/python/tools/transformers/gpt2_helper.py
@@ -31,7 +31,7 @@ class GPT2ModelNoPastState(GPT2Model):
         super().__init__(config)
 
     def forward(self, input_ids):
-        return super().forward(input_ids, use_cache=False)
+        return super().forward(input_ids, use_cache=False, return_dict=False)
 
 
 class MyGPT2Model(GPT2Model):
@@ -40,11 +40,26 @@ class MyGPT2Model(GPT2Model):
     def __init__(self, config):
         super().__init__(config)
 
+    @staticmethod
+    def post_process(result, num_layer):
+        if isinstance(result[1][0], tuple) or isinstance(result[1][0], list):
+            assert len(result[1]) == num_layer and len(result[1][0]) == 2 #and len(result[1][0][0].shape) == 4 and result[1][0][0].shape == result[1][0][1].shape
+            present = []
+            for i in range(num_layer):
+                # Since transformers v4.*, past keys and values are separate outputs.
+                # Here we concatenate them into one tensor to be compatible with the Attention operator.
+                present.append(torch.cat((result[1][i][0].unsqueeze(0), result[1][i][1].unsqueeze(0)), dim=0))
+            return (result[0], tuple(present))
+
+        return result
+
     def forward(self, input_ids, position_ids, attention_mask, *past):
-        return super().forward(input_ids,
-                               position_ids=position_ids,
-                               attention_mask=attention_mask,
-                               past_key_values=past)
+        result = super().forward(input_ids,
+                                 position_ids=position_ids,
+                                 attention_mask=attention_mask,
+                                 past_key_values=past,
+                                 return_dict=False)
+        return MyGPT2Model.post_process(result, self.config.n_layer)
 
 
 class MyGPT2LMHeadModel(GPT2LMHeadModel):
@@ -54,10 +69,13 @@ class MyGPT2LMHeadModel(GPT2LMHeadModel):
         super().__init__(config)
 
     def forward(self, input_ids, position_ids, attention_mask, *past):
-        return super().forward(input_ids,
-                               position_ids=position_ids,
-                               attention_mask=attention_mask,
-                               past_key_values=past)
+        result = super().forward(input_ids,
+                                 position_ids=position_ids,
+                                 attention_mask=attention_mask,
+                                 past_key_values=past,
+                                 return_dict=False)
+
+        return MyGPT2Model.post_process(result, self.config.n_layer)
 
 
 class MyGPT2LMHeadModel_NoPadding(GPT2LMHeadModel):
@@ -216,6 +234,7 @@ class Gpt2Helper:
 
         is_all_close = is_close
         num_layers = len(ort_outputs) - 1
+
         for layer in range(num_layers):
             is_close = numpy.allclose(ort_outputs[1 + layer],
                                       torch_outputs[1][layer].cpu().numpy(),
@@ -288,10 +307,12 @@ class Gpt2Helper:
             input_names.append('attention_mask')
         input_names.extend(past_names)
 
+        assert len(outputs) == 2 and len(outputs[1]) == num_layer
+
         logger.info(
             f"Shapes: input_ids={dummy_inputs.input_ids.shape} past={dummy_inputs.past[0].shape} output={outputs[0].shape} present={outputs[1][0].shape}"
         )
-
+    
         Path(onnx_model_path).parent.mkdir(parents=True, exist_ok=True)
 
         torch.onnx.export(model,
diff --git a/onnxruntime/python/tools/transformers/huggingface_models.py b/onnxruntime/python/tools/transformers/huggingface_models.py
index 3668fe0515..1f4a1c9fa2 100644
--- a/onnxruntime/python/tools/transformers/huggingface_models.py
+++ b/onnxruntime/python/tools/transformers/huggingface_models.py
@@ -26,67 +26,44 @@ MODELS = {
     "bert-base-uncased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
     "bert-large-uncased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
     "bert-base-cased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
-    "bert-large-cased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
-    "bert-base-multilingual-uncased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
-    "bert-base-multilingual-cased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
-    "bert-base-chinese": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
-    "bert-base-german-cased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
-    "bert-large-uncased-whole-word-masking": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
-    "bert-large-cased-whole-word-masking": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
     "bert-large-uncased-whole-word-masking-finetuned-squad": (["input_ids", "attention_mask",
                                                                "token_type_ids"], 11, False, "bert"),
-    "bert-large-cased-whole-word-masking-finetuned-squad": (["input_ids", "attention_mask",
-                                                             "token_type_ids"], 11, False, "bert"),
     "bert-base-cased-finetuned-mrpc": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
-    "bert-base-german-dbmdz-cased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
-    "bert-base-german-dbmdz-uncased": (["input_ids", "attention_mask", "token_type_ids"], 11, False, "bert"),
-    # todo: more models to add
-    # GPT
-    "openai-gpt": (["input_ids"], 11, False, "gpt2"),  # no past state inputs
-    # GPT-2
-    "gpt2": (["input_ids"], 11, False, "gpt2"),  # no past state inputs & outputs
+
+    # GPT (no past state)
+    "openai-gpt": (["input_ids"], 11, False, "gpt2"),
+    # GPT-2 (no past state, use benchmark_gpt2.py for past_key_values)
+    "gpt2": (["input_ids"], 11, False, "gpt2"),
     "gpt2-medium": (["input_ids"], 11, False, "gpt2"),
-    "gpt2-large":
-    (["input_ids"], 11, True,
-     "gpt2"),  # Model>2GB. Need use_external_data_format=True to export it. No past state inputs for GPT models.
+    "gpt2-large": (["input_ids"], 11, True, "gpt2"),
     "gpt2-xl": (["input_ids"], 11, True, "gpt2"),
-    "distilgpt2": (["input_ids"], 11, False, "gpt2"),  # no past state inputs & outputs
+    "distilgpt2": (["input_ids"], 11, False, "gpt2"),
     # Transformer-XL
     #"transfo-xl-wt103": (["input_ids"], 11, False, "bert"),
     # XLNet
-    #"xlnet-base-cased": (["input_ids"], 12, False, "bert"), # Models uses Einsum, which need opset version 12 and PyTorch 1.5.0 or above.
-    #"xlnet-large-cased": (["input_ids"], 12, False, "bert"), # Models uses Einsum, which need opset version 12 and PyTorch 1.5.0 or above.
+    "xlnet-base-cased": (["input_ids"], 12, False, "bert"),
+    "xlnet-large-cased": (["input_ids"], 12, False, "bert"),
     # XLM
     "xlm-mlm-en-2048": (["input_ids"], 11, True, "bert"),
     "xlm-mlm-ende-1024": (["input_ids"], 11, False, "bert"),
     "xlm-mlm-enfr-1024": (["input_ids"], 11, False, "bert"),
-    "xlm-mlm-enro-1024": (["input_ids"], 11, False, "bert"),
-    "xlm-mlm-xnli15-1024": (["input_ids"], 11, False, "bert"),
-    "xlm-mlm-tlm-xnli15-1024": (["input_ids"], 11, False, "bert"),
-    "xlm-clm-enfr-1024": (["input_ids"], 11, False, "bert"),
-    "xlm-clm-ende-1024": (["input_ids"], 11, False, "bert"),
-    "xlm-mlm-17-1280": (["input_ids"], 11, True, "bert"),
-    "xlm-mlm-100-1280": (["input_ids"], 11, True, "bert"),
+    # XLM-RoBERTa
+    "xlm-roberta-base": (["input_ids"], 12, False, "bert"),
     # RoBERTa
     "roberta-base": (["input_ids", "attention_mask"], 11, False, "bert"),
     "roberta-large": (["input_ids", "attention_mask"], 11, False, "bert"),
     "roberta-large-mnli": (["input_ids", "attention_mask"], 11, False, "bert"),
+    "deepset/roberta-base-squad2": (["input_ids", "attention_mask"], 11, False, "bert"),
     "distilroberta-base": (["input_ids", "attention_mask"], 11, False, "bert"),
-    "roberta-base-openai-detector": (["input_ids", "attention_mask"], 11, False, "bert"),
-    "roberta-large-openai-detector": (["input_ids", "attention_mask"], 11, False, "bert"),
+
     # DistilBERT
     "distilbert-base-uncased": (["input_ids", "attention_mask"], 11, False, "bert"),
     "distilbert-base-uncased-distilled-squad": (["input_ids", "attention_mask"], 11, False, "bert"),
-    "distilbert-base-cased": (["input_ids", "attention_mask"], 11, False, "bert"),
-    "distilbert-base-cased-distilled-squad": (["input_ids", "attention_mask"], 11, False, "bert"),
-    "distilbert-base-german-cased": (["input_ids", "attention_mask"], 11, False, "bert"),
-    "distilbert-base-multilingual-cased": (["input_ids", "attention_mask"], 11, False, "bert"),
     # CTRL
     "ctrl": (["input_ids"], 11, True, "bert"),
     # CamemBERT
     "camembert-base": (["input_ids"], 11, False, "bert"),
     # ALBERT
-    # Models uses Einsum, which need opset version 12 and PyTorch 1.5.0 or above.
     "albert-base-v1": (["input_ids"], 12, False, "bert"),
     "albert-large-v1": (["input_ids"], 12, False, "bert"),
     "albert-xlarge-v1": (["input_ids"], 12, True, "bert"),
@@ -95,36 +72,37 @@ MODELS = {
     "albert-large-v2": (["input_ids"], 12, False, "bert"),
     "albert-xlarge-v2": (["input_ids"], 12, True, "bert"),
     #"albert-xxlarge-v2": (["input_ids"], 12, True, "bert"),
-    # T5
-    "t5-small": (["input_ids"], 12, False, "bert"),
-    "t5-base": (["input_ids"], 12, False, "bert"),
-    "t5-large": (["input_ids"], 12, True, "bert"),
-    "t5-3b": (["input_ids"], 12, True, "bert"),
-    "t5-11b": (["input_ids"], 12, True, "bert"),
+    # T5 (use benchmark_t5.py instead)
+    #"t5-small": (["input_ids"], 12, False, "bert"),
+    #"t5-base": (["input_ids"], 12, False, "bert"),
+    #"t5-large": (["input_ids"], 12, True, "bert"),
+    #"t5-3b": (["input_ids"], 12, True, "bert"),
+    #"t5-11b": (["input_ids"], 12, True, "bert"),
+    #"valhalla/t5-small-qa-qg-hl": (["input_ids"], 12, True, "bert"),
     # XLM-RoBERTa
     "xlm-roberta-base": (["input_ids"], 11, False, "bert"),
     "xlm-roberta-large": (["input_ids"], 11, True, "bert"),
     # FlauBERT
     "flaubert/flaubert_small_cased": (["input_ids"], 11, False, "bert"),
-    "flaubert/flaubert_base_uncased": (["input_ids"], 11, False, "bert"),
+    #"flaubert/flaubert_base_uncased": (["input_ids"], 11, False, "bert"),
     "flaubert/flaubert_base_cased": (["input_ids"], 11, False, "bert"),
-    "flaubert/flaubert_large_cased": (["input_ids"], 11, False, "bert"),
+    #"flaubert/flaubert_large_cased": (["input_ids"], 11, False, "bert"),
     # Bart
     "facebook/bart-large": (["input_ids"], 11, False, "bert"),
     "facebook/bart-base": (["input_ids"], 11, False, "bert"),
     "facebook/bart-large-mnli": (["input_ids"], 11, False, "bert"),
     "facebook/bart-large-cnn": (["input_ids"], 11, False, "bert"),
-    #"facebook/mbart-large-en-ro": (["input_ids"], 11, True, "bert"),
+
     # DialoGPT
     "microsoft/DialoGPT-small": (["input_ids"], 11, False, "gpt2"),
     "microsoft/DialoGPT-medium": (["input_ids"], 11, False, "gpt2"),
-    "microsoft/DialoGPT-large": (["input_ids"], 11, True, "gpt2"),
+    #"microsoft/DialoGPT-large": (["input_ids"], 11, True, "gpt2"),
     # Reformer
     #"google/reformer-enwik8": (["input_ids"], 11, False, "bert"),
     #"google/reformer-crime-and-punishment": (["input_ids"], 11, False, "bert"),
     # MarianMT
     #"Helsinki-NLP/opus-mt-ROMANCE-en": (["input_ids"], 12, False, "bert"),
-    # Longformer
+    # Longformer (use benchmark_longformer.py instead)
     #"allenai/longformer-base-4096": (["input_ids"], 12, False, "bert"),
     #"allenai/longformer-large-4096": (["input_ids"], 12, False, "bert"),
 }
diff --git a/onnxruntime/python/tools/transformers/run_benchmark.sh b/onnxruntime/python/tools/transformers/run_benchmark.sh
index 2cbf29d07a..05be3c560e 100644
--- a/onnxruntime/python/tools/transformers/run_benchmark.sh
+++ b/onnxruntime/python/tools/transformers/run_benchmark.sh
@@ -5,13 +5,12 @@
 # --------------------------------------------------------------------------
 # This measures the performance of OnnxRuntime, PyTorch and TorchScript on transformer models.
 # Please install PyTorch (see https://pytorch.org/) before running this benchmark. Like the following:
-# GPU:   conda install pytorch torchvision cudatoolkit=10.1 -c pytorch
+# GPU:   conda install pytorch torchvision cudatoolkit=11.0 -c pytorch
 # CPU:   conda install pytorch torchvision cpuonly -c pytorch
 
-# When run_cli=true, this script is self-contained and you need not copy other files to run benchmarks
-#                    it will use onnxruntime-tools package.
-# If run_cli=false, it depends on other python script (*.py) files in this directory.
-run_cli=true
+# When use_package=true, this sh file is the only file you need to copy: benchmarks run from the installed onnxruntime package.
+# Otherwise, the Python script (*.py) files in this directory are used.
+use_package=true
 
 # only need once
 run_install=true
@@ -50,7 +49,7 @@ sequence_lengths="8 16 32 64 128 256 512 1024"
 input_counts=1
 
 # Pretrained transformers models can be a subset of: bert-base-cased roberta-base gpt2 distilgpt2 distilbert-base-uncased
-models_to_test="bert-base-cased roberta-base gpt2"
+models_to_test="bert-base-cased roberta-base distilbert-base-uncased"
 
 # If you have mutliple GPUs, you can choose one GPU for test. Here is an example to use the second GPU:
 # export CUDA_VISIBLE_DEVICES=1
@@ -81,7 +80,7 @@ fi
 
 
 if [ "$run_install" = true ] ; then
-  pip uninstall --yes ort_nightly
+  pip uninstall --yes ort-nightly ort-gpu-nightly
   pip uninstall --yes onnxruntime
   pip uninstall --yes onnxruntime-gpu
   if [ "$run_cpu_fp32" = true ] || [ "$run_cpu_int8" = true ]; then
@@ -89,14 +88,12 @@ if [ "$run_install" = true ] ; then
   else
     pip install onnxruntime-gpu
   fi
-  pip install --upgrade onnxconverter_common
-  pip install --upgrade onnxruntime-tools
-  pip install --upgrade transformers
+  pip install --upgrade onnx coloredlogs packaging psutil py3nvml onnxconverter_common numpy transformers
 fi
 
-if [ "$run_cli" = true ] ; then
-  echo "Use onnxruntime_tools.transformers.benchmark"
-  benchmark_script="-m onnxruntime_tools.transformers.benchmark"
+if [ "$use_package" = true ] ; then
+  echo "Use onnxruntime.transformers.benchmark"
+  benchmark_script="-m onnxruntime.transformers.benchmark"
 else
   benchmark_script="benchmark.py"
 fi
@@ -187,4 +184,4 @@ fi
 # Remove duplicated lines
 awk '!x[$0]++' ./result.csv > summary_result.csv
 awk '!x[$0]++' ./fusion.csv > summary_fusion.csv
-awk '!x[$0]++' ./detail.csv > summary_detail.csv
\ No newline at end of file
+awk '!x[$0]++' ./detail.csv > summary_detail.csv
diff --git a/onnxruntime/python/tools/transformers/test/test_gpt2.py b/onnxruntime/python/tools/transformers/test/test_gpt2.py
index 4f5ca65363..cb6b680af5 100644
--- a/onnxruntime/python/tools/transformers/test/test_gpt2.py
+++ b/onnxruntime/python/tools/transformers/test/test_gpt2.py
@@ -26,7 +26,7 @@ class TestGpt2(unittest.TestCase):
 
     def test_gpt2_fp16(self):
         if 'CUDAExecutionProvider' in onnxruntime.get_available_providers():
-            self.run_benchmark_gpt2('-m gpt2 --precision fp16 -o -b 1 -s 128')
+            self.run_benchmark_gpt2('-m gpt2 --precision fp16 -o -b 1 -s 128 --use_gpu')
 
     def test_gpt2_int8(self):
         self.run_benchmark_gpt2('-m gpt2 --precision int8 -o -b 1 -s 128')