From dfe42e185c6c6de68177db8ecf307645ce831aec Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Mon, 19 Jul 2021 14:16:59 -0700
Subject: [PATCH] update bert notebook to use onnxruntime 1.8.1 (#8379)

---
 .../PyTorch_Bert-Squad_OnnxRuntime_CPU.ipynb  | 565 +++++------
 .../PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb  | 934 ++++++++----------
 2 files changed, 685 insertions(+), 814 deletions(-)

diff --git a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_CPU.ipynb b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_CPU.ipynb
index 435fd8a3a2..1cb36fab0b 100644
--- a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_CPU.ipynb
+++ b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_CPU.ipynb
@@ -47,76 +47,23 @@
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Looking in links: https://download.pytorch.org/whl/torch_stable.html\n",
-      "Requirement already up-to-date: torch==1.6.0+cpu in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (1.6.0+cpu)\n",
-      "Requirement already up-to-date: torchvision==0.7.0+cpu in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (0.7.0+cpu)\n",
-      "Requirement already satisfied, skipping upgrade: numpy in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from torch==1.6.0+cpu) (1.18.1)\n",
-      "Requirement already satisfied, skipping upgrade: future in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from torch==1.6.0+cpu) (0.18.2)\n",
-      "Requirement already satisfied, skipping upgrade: pillow>=4.1.1 in c:\\users\\tianl\\appdata\\roaming\\python\\python36\\site-packages (from torchvision==0.7.0+cpu) (7.0.0)\n",
-      "Requirement already up-to-date: onnxruntime==1.4.0 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (1.4.0)\n",
-      "Requirement already satisfied, skipping upgrade: numpy>=1.16.6 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime==1.4.0) (1.18.1)\n",
-      "Requirement already satisfied, skipping upgrade: protobuf in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime==1.4.0) (3.11.3)\n",
-      "Requirement already satisfied, skipping upgrade: six>=1.9 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from protobuf->onnxruntime==1.4.0) (1.14.0)\n",
-      "Requirement already satisfied, skipping upgrade: setuptools in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from protobuf->onnxruntime==1.4.0) (45.2.0.post20200210)\n",
-      "Requirement already up-to-date: onnxruntime-tools in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (1.4.1)\n",
-      "Requirement already satisfied, skipping upgrade: py-cpuinfo in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (5.0.0)\n",
-      "Requirement already satisfied, skipping upgrade: packaging in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (20.1)\n",
-      "Requirement already satisfied, skipping upgrade: coloredlogs in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (14.0)\n",
-      "Requirement already satisfied, skipping upgrade: py3nvml in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (0.2.5)\n",
-      "Requirement already satisfied, skipping upgrade: numpy in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (1.18.1)\n",
-      "Requirement already satisfied, skipping upgrade: psutil in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (5.7.0)\n",
-      "Requirement already satisfied, skipping upgrade: onnx in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (1.7.0)\n",
-      "Requirement already satisfied, skipping upgrade: pyparsing>=2.0.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from packaging->onnxruntime-tools) (2.4.6)\n",
-      "Requirement already satisfied, skipping upgrade: six in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from packaging->onnxruntime-tools) (1.14.0)\n",
-      "Requirement already satisfied, skipping upgrade: humanfriendly>=7.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from coloredlogs->onnxruntime-tools) (8.1)\n",
-      "Requirement already satisfied, skipping upgrade: xmltodict in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from py3nvml->onnxruntime-tools) (0.12.0)\n",
-      "Requirement already satisfied, skipping upgrade: protobuf in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnx->onnxruntime-tools) (3.11.3)\n",
-      "Requirement already satisfied, skipping upgrade: typing-extensions>=3.6.2.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnx->onnxruntime-tools) (3.7.4.1)\n",
-      "Requirement already satisfied, skipping upgrade: pyreadline; sys_platform == \"win32\" in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from humanfriendly>=7.1->coloredlogs->onnxruntime-tools) (2.1)\n",
-      "Requirement already satisfied, skipping upgrade: setuptools in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from protobuf->onnx->onnxruntime-tools) (45.2.0.post20200210)\n",
-      "Requirement already satisfied: transformers==3.0.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (3.0.2)\n",
-      "Requirement already satisfied: packaging in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (20.1)\n",
-      "Requirement already satisfied: sentencepiece!=0.1.92 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (0.1.85)\n",
-      "Requirement already satisfied: sacremoses in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (0.0.38)\n",
-      "Requirement already satisfied: dataclasses; python_version < \"3.7\" in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (0.7)\n",
-      "Requirement already satisfied: requests in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (2.23.0)\n",
-      "Requirement already satisfied: tokenizers==0.8.1.rc1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (0.8.1rc1)\n",
-      "Requirement already satisfied: tqdm>=4.27 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (4.43.0)\n",
-      "Requirement already satisfied: regex!=2019.12.17 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (2020.2.20)\n",
-      "Requirement already satisfied: filelock in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (3.0.12)\n",
-      "Requirement already satisfied: numpy in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (1.18.1)\n",
-      "Requirement already satisfied: pyparsing>=2.0.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from packaging->transformers==3.0.2) (2.4.6)\n",
-      "Requirement already satisfied: six in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from packaging->transformers==3.0.2) (1.14.0)\n",
-      "Requirement already satisfied: click in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from sacremoses->transformers==3.0.2) (7.0)\n",
-      "Requirement already satisfied: joblib in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from sacremoses->transformers==3.0.2) (0.14.1)\n",
-      "Requirement already satisfied: certifi>=2017.4.17 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==3.0.2) (2020.4.5.1)\n",
-      "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==3.0.2) (1.25.8)\n",
-      "Requirement already satisfied: idna<3,>=2.5 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==3.0.2) (2.9)\n",
-      "Requirement already satisfied: chardet<4,>=3.0.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==3.0.2) (3.0.4)\n",
-      "Requirement already satisfied: wget in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (3.2)\n",
-      "Requirement already satisfied: netron in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (3.9.6)\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "# Install PyTorch 1.6.0 and OnnxRuntime 1.4.0 for CPU-only.\n",
     "import sys\n",
-    "if sys.platform == 'darwin': # Mac\n",
-    "    !{sys.executable} -m pip install --upgrade torch torchvision\n",
-    "else:\n",
-    "    !{sys.executable} -m pip install --upgrade torch==1.6.0+cpu torchvision==0.7.0+cpu -f https://download.pytorch.org/whl/torch_stable.html\n",
-    "!{sys.executable} -m pip install --upgrade onnxruntime==1.4.0\n",
-    "!{sys.executable} -m pip install --upgrade onnxconverter_common\n",
-    "!{sys.executable} -m pip install --upgrade onnxruntime-tools\n",
     "\n",
-    "# Install other packages used in this notebook.\n",
-    "!{sys.executable} -m pip install transformers==3.0.2\n",
-    "!{sys.executable} -m pip install wget netron"
+    "run_install = False # Set to True on the first run; packages only need to be installed once\n",
+    "if run_install:\n",
+    "    if sys.platform in ['linux', 'win32']: # Linux or Windows: pin CPU-only (+cpu) wheels from the PyTorch wheel index\n",
+    "        !{sys.executable} -m pip install torch==1.9.0+cpu torchvision==0.10.0+cpu torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html\n",
+    "    else: # Mac: no +cpu builds are published; the default PyPI wheels are CPU-only\n",
+    "        !{sys.executable} -m pip install --upgrade torch torchvision torchaudio\n",
+    "\n",
+    "    !{sys.executable} -m pip install onnxruntime==1.8.1 onnx==1.9.0 onnxconverter_common==1.8.1\n",
+    "\n",
+    "    # Install other packages used in this notebook.\n",
+    "    !{sys.executable} -m pip install transformers==4.8.2\n",
+    "    !{sys.executable} -m pip install psutil pytz pandas py-cpuinfo py3nvml\n",
+    "    !{sys.executable} -m pip install wget netron"
    ]
   },
   {
@@ -196,14 +143,14 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']\n",
-      "- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).\n",
+      "Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']\n",
+      "- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
       "- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
       "Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']\n",
       "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
-      "100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [00:03<00:00, 12.87it/s]\n",
-      "convert squad examples to features: 100%|███████████████████████████████████████████| 100/100 [00:00<00:00, 131.41it/s]\n",
-      "add example index and unique id: 100%|████████████████████████████████████████████| 100/100 [00:00<00:00, 96776.74it/s]\n"
+      "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:03<00:00, 12.15it/s]\n",
+      "convert squad examples to features: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 135.87it/s]\n",
+      "add example index and unique id: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 100031.10it/s]\n"
      ]
     }
    ],
@@ -252,6 +199,14 @@
    "execution_count": 5,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "d:\\git\\transformers\\src\\transformers\\modeling_utils.py:2074: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
+      "  input_tensor.shape[chunk_dim] == tensor_shape for input_tensor in input_tensors\n"
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
@@ -319,7 +274,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "PyTorch cpu Inference time = 144.73 ms\n"
+      "PyTorch cpu Inference time = 119.80 ms\n"
      ]
     }
    ],
@@ -348,45 +303,26 @@
    "source": [
     "## 4. Inference ONNX Model with ONNX Runtime ##\n",
     "\n",
-    "### OpenMP Environment Variable\n",
-    "\n",
-    "OpenMP environment variables are very important for CPU inference of Bert model. It has large performance impact on Bert model so you might need set it carefully according to [Performance Test Tool](#Performance-Test-Tool) result in later part of this notebook.\n",
-    "\n",
-    "Setting environment variables shall be done before importing onnxruntime. Otherwise, they might not take effect."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import psutil\n",
-    "\n",
-    "# You may change the settings in this cell according to Performance Test Tool result.\n",
-    "os.environ[\"OMP_NUM_THREADS\"] = str(psutil.cpu_count(logical=True))\n",
-    "os.environ[\"OMP_WAIT_POLICY\"] = 'ACTIVE'"
+    "For ONNX Runtime 1.6.0 or older, OpenMP environment variables are very important for CPU inference of the BERT model. Since 1.7.0, the official package is not built with OpenMP."
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Now we are ready to inference the model with ONNX Runtime. Here we can see that OnnxRuntime has better performance than PyTorch. \n",
-    "\n",
-    "It is better to use standalone python script like [Performance Test tool](#Performance-Test-tool) to get accurate performance results."
+    "Now we run inference on the model with ONNX Runtime. Here we can see that ONNX Runtime has better performance than PyTorch."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "OnnxRuntime cpu Inference time = 88.55 ms\n"
+      "OnnxRuntime cpu Inference time = 72.46 ms\n"
      ]
     }
    ],
@@ -394,19 +330,15 @@
     "import onnxruntime\n",
     "import numpy\n",
     "\n",
-    "# Print warning if user uses onnxruntime-gpu instead of onnxruntime package.\n",
-    "if 'CUDAExecutionProvider' in onnxruntime.get_available_providers():\n",
-    "    print(\"warning: onnxruntime-gpu is not built with OpenMP. You might try onnxruntime package to test CPU inference.\")\n",
-    "\n",
     "sess_options = onnxruntime.SessionOptions()\n",
     "\n",
     "# Optional: store the optimized graph and view it using Netron to verify that model is fully optimized.\n",
     "# Note that this will increase session creation time, so it is for debugging only.\n",
     "sess_options.optimized_model_filepath = os.path.join(output_dir, \"optimized_model_cpu.onnx\")\n",
     "\n",
-    "# For OnnxRuntime 1.2.0, you might need set intra_op_num_threads to 1 to enable OpenMP\n",
-    "#    sess_options.intra_op_num_threads=1\n",
-    "# For OnnxRuntime 1.3.0 or later, it is recommended to use the default setting so you need not set it.\n",
+    "# For OnnxRuntime 1.7.0 or later, you can set intra_op_num_threads to control the number of threads, for example:\n",
+    "#    sess_options.intra_op_num_threads=4\n",
+    "# Here we use the default value which is a good choice in most cases.\n",
     "\n",
     "# Specify providers when you use onnxruntime-gpu for CPU inference.\n",
     "session = onnxruntime.InferenceSession(export_model_path, sess_options, providers=['CPUExecutionProvider'])\n",
@@ -427,7 +359,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -474,17 +406,17 @@
     "\n",
     "Example Usage:\n",
     "```\n",
-    "from onnxruntime_tools import optimizer\n",
+    "from onnxruntime.transformers import optimizer\n",
     "optimized_model = optimizer.optimize_model(export_model_path, model_type='bert', num_heads=12, hidden_size=768)\n",
     "optimized_model.save_model_to_file(optimized_model_path)\n",
     "```\n",
     "\n",
-    "You can also use optimizer_cli like the following:"
+    "You can also use the command line, as in the following cell:"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -493,15 +425,17 @@
      "text": [
       "               apply: Fused LayerNormalization count: 25\n",
       "               apply: Fused Gelu count: 12\n",
-      "               apply: Fused SkipLayerNormalization count: 25\n",
+      "adjust_reshape_and_expand: Removed Reshape and Expand count: 0\n",
+      "               apply: Fused SkipLayerNormalization count: 24\n",
       "               apply: Fused Attention count: 12\n",
       "         prune_graph: Graph pruned: 0 inputs, 0 outputs and 5 nodes are removed\n",
       "               apply: Fused EmbedLayerNormalization(with mask) count: 1\n",
-      "         prune_graph: Graph pruned: 0 inputs, 0 outputs and 12 nodes are removed\n",
+      "         prune_graph: Graph pruned: 0 inputs, 0 outputs and 3 nodes are removed\n",
       "         prune_graph: Graph pruned: 0 inputs, 0 outputs and 0 nodes are removed\n",
       "               apply: Fused BiasGelu count: 12\n",
       "               apply: Fused SkipLayerNormalization(add bias) count: 24\n",
       "            optimize: opset verion: 11\n",
+      "  save_model_to_file: Sort graphs in topological order\n",
       "  save_model_to_file: Output model to ..\\onnx_models\\bert-base-cased-squad_opt_cpu.onnx\n",
       "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 12, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 12, 'LayerNormalization': 0, 'SkipLayerNormalization': 24}\n",
       "                main: The model has been fully optimized.\n"
@@ -511,7 +445,7 @@
    "source": [
     "optimized_model_path = os.path.join(output_dir, 'bert-base-cased-squad_opt_cpu.onnx')\n",
     "\n",
-    "!{sys.executable} -m onnxruntime_tools.optimizer_cli --input $export_model_path --output $optimized_model_path --model_type bert --num_heads 12 --hidden_size 768"
+    "!{sys.executable} -m onnxruntime.transformers.optimizer --input $export_model_path --output $optimized_model_path --model_type bert --num_heads 12 --hidden_size 768"
    ]
   },
   {
@@ -527,7 +461,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -561,7 +495,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -569,13 +503,13 @@
      "output_type": "stream",
      "text": [
       "100% passed for 100 random inputs given thresholds (rtol=0.001, atol=0.0001).\n",
-      "maximum absolute difference=5.930662155151367e-06\n",
-      "maximum relative difference=0.021568937227129936\n"
+      "maximum absolute difference=4.604458808898926e-06\n",
+      "maximum relative difference=0.006278202868998051\n"
      ]
     }
    ],
    "source": [
-    "!{sys.executable} -m onnxruntime_tools.transformers.compare_bert_results --baseline_model $export_model_path --optimized_model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100"
+    "!{sys.executable} -m onnxruntime.transformers.compare_bert_results --baseline_model $export_model_path --optimized_model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100"
    ]
   },
   {
@@ -591,45 +525,45 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=0,OMP_NUM_THREADS=,OMP_WAIT_POLICY=,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
-      "Average latency = 80.08 ms, Throughput = 12.49 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=,OMP_WAIT_POLICY=,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
-      "Average latency = 78.56 ms, Throughput = 12.73 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
-      "Average latency = 77.78 ms, Throughput = 12.86 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
-      "Average latency = 73.52 ms, Throughput = 13.60 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=6,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
-      "Average latency = 95.36 ms, Throughput = 10.49 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=6,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
-      "Average latency = 78.49 ms, Throughput = 12.74 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
-      "Average latency = 288.71 ms, Throughput = 3.46 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
-      "Average latency = 288.61 ms, Throughput = 3.46 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=6,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
-      "Average latency = 288.97 ms, Throughput = 3.46 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=6,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
-      "Average latency = 288.37 ms, Throughput = 3.47 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=6,OMP_NUM_THREADS=6,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
-      "Average latency = 95.49 ms, Throughput = 10.47 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=6,OMP_NUM_THREADS=6,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
-      "Average latency = 79.17 ms, Throughput = 12.63 QPS\n",
-      "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=100, test_times=1, contiguous=None, use_gpu=False, warmup=True, omp_num_threads=None, omp_wait_policy=None, intra_op_num_threads=None, seed=3, verbose=False, inclusive=False, extra_latency=True)\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+      "Average latency = 54.26 ms, Throughput = 18.43 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=11,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+      "Average latency = 55.80 ms, Throughput = 17.92 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=10,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+      "Average latency = 65.31 ms, Throughput = 15.31 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=9,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+      "Average latency = 57.66 ms, Throughput = 17.34 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+      "Average latency = 62.84 ms, Throughput = 15.91 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=7,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+      "Average latency = 69.29 ms, Throughput = 14.43 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=6,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+      "Average latency = 56.19 ms, Throughput = 17.80 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=5,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+      "Average latency = 59.90 ms, Throughput = 16.70 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=4,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+      "Average latency = 63.72 ms, Throughput = 15.69 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+      "Average latency = 82.44 ms, Throughput = 12.13 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=2,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+      "Average latency = 119.64 ms, Throughput = 8.36 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+      "Average latency = 223.21 ms, Throughput = 4.48 QPS\n",
+      "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=100, test_times=1, use_gpu=False, intra_op_num_threads=None, seed=3, verbose=False)\n",
       "Generating 100 samples for batch_size=1 sequence_length=128\n",
-      "Test summary is saved to ..\\onnx_models\\perf_results_CPU_B1_S128_20200806-173045.txt\n"
+      "Test summary is saved to ..\\onnx_models\\perf_results_CPU_B1_S128_20210713-144140.txt\n"
      ]
     }
    ],
    "source": [
-    "!{sys.executable} -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100 --test_times 1 --inclusive --all"
+    "!{sys.executable} -m onnxruntime.transformers.bert_perf_test --model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100 --test_times 1"
    ]
   },
   {
@@ -641,14 +575,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "..\\onnx_models\\perf_results_CPU_B1_S128_20200806-173045.txt\n"
+      "..\\onnx_models\\perf_results_CPU_B1_S128_20210713-144140.txt\n"
      ]
     },
     {
@@ -678,155 +612,116 @@
        "      <th>Latency_P99</th>\n",
        "      <th>Throughput(QPS)</th>\n",
        "      <th>intra_op_num_threads</th>\n",
-       "      <th>OMP_NUM_THREADS</th>\n",
-       "      <th>OMP_WAIT_POLICY</th>\n",
-       "      <th>contiguous</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>73.52</td>\n",
-       "      <td>75.78</td>\n",
-       "      <td>78.21</td>\n",
-       "      <td>89.29</td>\n",
-       "      <td>13.60</td>\n",
-       "      <td>1</td>\n",
+       "      <td>54.26</td>\n",
+       "      <td>56.05</td>\n",
+       "      <td>60.32</td>\n",
+       "      <td>109.21</td>\n",
+       "      <td>18.43</td>\n",
        "      <td>12</td>\n",
-       "      <td>PASSIVE</td>\n",
-       "      <td>None</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>77.78</td>\n",
-       "      <td>82.35</td>\n",
-       "      <td>87.02</td>\n",
-       "      <td>104.54</td>\n",
-       "      <td>12.86</td>\n",
-       "      <td>1</td>\n",
-       "      <td>12</td>\n",
-       "      <td>ACTIVE</td>\n",
-       "      <td>None</td>\n",
+       "      <td>55.80</td>\n",
+       "      <td>56.74</td>\n",
+       "      <td>59.67</td>\n",
+       "      <td>73.62</td>\n",
+       "      <td>17.92</td>\n",
+       "      <td>11</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>78.49</td>\n",
-       "      <td>80.92</td>\n",
-       "      <td>85.77</td>\n",
-       "      <td>98.98</td>\n",
-       "      <td>12.74</td>\n",
-       "      <td>1</td>\n",
+       "      <td>56.19</td>\n",
+       "      <td>61.29</td>\n",
+       "      <td>71.69</td>\n",
+       "      <td>80.15</td>\n",
+       "      <td>17.80</td>\n",
        "      <td>6</td>\n",
-       "      <td>PASSIVE</td>\n",
-       "      <td>None</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>78.56</td>\n",
-       "      <td>82.29</td>\n",
-       "      <td>93.46</td>\n",
-       "      <td>108.73</td>\n",
-       "      <td>12.73</td>\n",
-       "      <td>1</td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "      <td>None</td>\n",
+       "      <td>57.66</td>\n",
+       "      <td>58.50</td>\n",
+       "      <td>61.96</td>\n",
+       "      <td>65.12</td>\n",
+       "      <td>17.34</td>\n",
+       "      <td>9</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>79.17</td>\n",
-       "      <td>82.02</td>\n",
-       "      <td>87.60</td>\n",
-       "      <td>99.55</td>\n",
-       "      <td>12.63</td>\n",
-       "      <td>6</td>\n",
-       "      <td>6</td>\n",
-       "      <td>PASSIVE</td>\n",
-       "      <td>None</td>\n",
+       "      <td>59.90</td>\n",
+       "      <td>59.72</td>\n",
+       "      <td>65.16</td>\n",
+       "      <td>116.16</td>\n",
+       "      <td>16.70</td>\n",
+       "      <td>5</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>5</th>\n",
-       "      <td>80.08</td>\n",
-       "      <td>83.18</td>\n",
-       "      <td>95.60</td>\n",
-       "      <td>107.72</td>\n",
-       "      <td>12.49</td>\n",
-       "      <td>0</td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "      <td>None</td>\n",
+       "      <td>62.84</td>\n",
+       "      <td>67.05</td>\n",
+       "      <td>69.07</td>\n",
+       "      <td>75.99</td>\n",
+       "      <td>15.91</td>\n",
+       "      <td>8</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>6</th>\n",
-       "      <td>95.36</td>\n",
-       "      <td>101.25</td>\n",
-       "      <td>103.61</td>\n",
-       "      <td>105.15</td>\n",
-       "      <td>10.49</td>\n",
-       "      <td>1</td>\n",
-       "      <td>6</td>\n",
-       "      <td>ACTIVE</td>\n",
-       "      <td>None</td>\n",
+       "      <td>63.72</td>\n",
+       "      <td>64.17</td>\n",
+       "      <td>69.44</td>\n",
+       "      <td>73.10</td>\n",
+       "      <td>15.69</td>\n",
+       "      <td>4</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7</th>\n",
-       "      <td>95.49</td>\n",
-       "      <td>101.50</td>\n",
-       "      <td>102.66</td>\n",
-       "      <td>104.82</td>\n",
-       "      <td>10.47</td>\n",
-       "      <td>6</td>\n",
-       "      <td>6</td>\n",
-       "      <td>ACTIVE</td>\n",
-       "      <td>None</td>\n",
+       "      <td>65.31</td>\n",
+       "      <td>65.35</td>\n",
+       "      <td>80.70</td>\n",
+       "      <td>177.94</td>\n",
+       "      <td>15.31</td>\n",
+       "      <td>10</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>8</th>\n",
-       "      <td>288.37</td>\n",
-       "      <td>290.48</td>\n",
-       "      <td>295.37</td>\n",
-       "      <td>308.91</td>\n",
-       "      <td>3.47</td>\n",
-       "      <td>6</td>\n",
-       "      <td>1</td>\n",
-       "      <td>PASSIVE</td>\n",
-       "      <td>None</td>\n",
+       "      <td>69.29</td>\n",
+       "      <td>69.04</td>\n",
+       "      <td>70.68</td>\n",
+       "      <td>85.03</td>\n",
+       "      <td>14.43</td>\n",
+       "      <td>7</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>9</th>\n",
-       "      <td>288.61</td>\n",
-       "      <td>291.10</td>\n",
-       "      <td>295.78</td>\n",
-       "      <td>301.52</td>\n",
-       "      <td>3.46</td>\n",
-       "      <td>12</td>\n",
-       "      <td>1</td>\n",
-       "      <td>PASSIVE</td>\n",
-       "      <td>None</td>\n",
+       "      <td>82.44</td>\n",
+       "      <td>83.20</td>\n",
+       "      <td>89.64</td>\n",
+       "      <td>98.80</td>\n",
+       "      <td>12.13</td>\n",
+       "      <td>3</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>10</th>\n",
-       "      <td>288.71</td>\n",
-       "      <td>292.64</td>\n",
-       "      <td>298.28</td>\n",
-       "      <td>305.92</td>\n",
-       "      <td>3.46</td>\n",
-       "      <td>12</td>\n",
-       "      <td>1</td>\n",
-       "      <td>ACTIVE</td>\n",
-       "      <td>None</td>\n",
+       "      <td>119.64</td>\n",
+       "      <td>119.07</td>\n",
+       "      <td>122.62</td>\n",
+       "      <td>135.67</td>\n",
+       "      <td>8.36</td>\n",
+       "      <td>2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>11</th>\n",
-       "      <td>288.97</td>\n",
-       "      <td>291.18</td>\n",
-       "      <td>297.68</td>\n",
-       "      <td>309.30</td>\n",
-       "      <td>3.46</td>\n",
-       "      <td>6</td>\n",
+       "      <td>223.21</td>\n",
+       "      <td>223.22</td>\n",
+       "      <td>226.83</td>\n",
+       "      <td>249.08</td>\n",
+       "      <td>4.48</td>\n",
        "      <td>1</td>\n",
-       "      <td>ACTIVE</td>\n",
-       "      <td>None</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -834,35 +729,35 @@
       ],
       "text/plain": [
        "    Latency(ms)  Latency_P75  Latency_P90  Latency_P99  Throughput(QPS)  \\\n",
-       "0         73.52        75.78        78.21        89.29            13.60   \n",
-       "1         77.78        82.35        87.02       104.54            12.86   \n",
-       "2         78.49        80.92        85.77        98.98            12.74   \n",
-       "3         78.56        82.29        93.46       108.73            12.73   \n",
-       "4         79.17        82.02        87.60        99.55            12.63   \n",
-       "5         80.08        83.18        95.60       107.72            12.49   \n",
-       "6         95.36       101.25       103.61       105.15            10.49   \n",
-       "7         95.49       101.50       102.66       104.82            10.47   \n",
-       "8        288.37       290.48       295.37       308.91             3.47   \n",
-       "9        288.61       291.10       295.78       301.52             3.46   \n",
-       "10       288.71       292.64       298.28       305.92             3.46   \n",
-       "11       288.97       291.18       297.68       309.30             3.46   \n",
+       "0         54.26        56.05        60.32       109.21            18.43   \n",
+       "1         55.80        56.74        59.67        73.62            17.92   \n",
+       "2         56.19        61.29        71.69        80.15            17.80   \n",
+       "3         57.66        58.50        61.96        65.12            17.34   \n",
+       "4         59.90        59.72        65.16       116.16            16.70   \n",
+       "5         62.84        67.05        69.07        75.99            15.91   \n",
+       "6         63.72        64.17        69.44        73.10            15.69   \n",
+       "7         65.31        65.35        80.70       177.94            15.31   \n",
+       "8         69.29        69.04        70.68        85.03            14.43   \n",
+       "9         82.44        83.20        89.64        98.80            12.13   \n",
+       "10       119.64       119.07       122.62       135.67             8.36   \n",
+       "11       223.21       223.22       226.83       249.08             4.48   \n",
        "\n",
-       "    intra_op_num_threads OMP_NUM_THREADS OMP_WAIT_POLICY contiguous  \n",
-       "0                      1              12         PASSIVE       None  \n",
-       "1                      1              12          ACTIVE       None  \n",
-       "2                      1               6         PASSIVE       None  \n",
-       "3                      1                                       None  \n",
-       "4                      6               6         PASSIVE       None  \n",
-       "5                      0                                       None  \n",
-       "6                      1               6          ACTIVE       None  \n",
-       "7                      6               6          ACTIVE       None  \n",
-       "8                      6               1         PASSIVE       None  \n",
-       "9                     12               1         PASSIVE       None  \n",
-       "10                    12               1          ACTIVE       None  \n",
-       "11                     6               1          ACTIVE       None  "
+       "    intra_op_num_threads  \n",
+       "0                     12  \n",
+       "1                     11  \n",
+       "2                      6  \n",
+       "3                      9  \n",
+       "4                      5  \n",
+       "5                      8  \n",
+       "6                      4  \n",
+       "7                     10  \n",
+       "8                      7  \n",
+       "9                      3  \n",
+       "10                     2  \n",
+       "11                     1  "
       ]
      },
-     "execution_count": 14,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -876,7 +771,7 @@
     "print(latest_result_file)\n",
     "\n",
     "# Remove some columns that have same values for all rows.\n",
-    "columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu', 'warmup']\n",
+    "columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu']\n",
     "# Hide some latency percentile columns to fit screen width.\n",
     "columns_to_remove.extend(['Latency_P50', 'Latency_P95'])\n",
     "result_data.drop(columns_to_remove, axis=1, inplace=True)\n",
@@ -901,7 +796,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
@@ -910,12 +805,12 @@
      "text": [
       "{\n",
       "  \"gpu\": {\n",
-      "    \"driver_version\": \"442.23\",\n",
+      "    \"driver_version\": \"470.14\",\n",
       "    \"devices\": [\n",
       "      {\n",
       "        \"memory_total\": 8589934592,\n",
-      "        \"memory_available\": 6997721088,\n",
-      "        \"name\": \"GeForce GTX 1070\"\n",
+      "        \"memory_available\": 6782619648,\n",
+      "        \"name\": \"NVIDIA GeForce GTX 1070\"\n",
       "      }\n",
       "    ]\n",
       "  },\n",
@@ -925,22 +820,98 @@
       "    \"logical_cores\": 12,\n",
       "    \"hz\": \"3.1920 GHz\",\n",
       "    \"l2_cache\": \"1536 KB\",\n",
-      "    \"l3_cache\": \"12288 KB\",\n",
+      "    \"flags\": [\n",
+      "      \"3dnow\",\n",
+      "      \"3dnowprefetch\",\n",
+      "      \"abm\",\n",
+      "      \"acpi\",\n",
+      "      \"adx\",\n",
+      "      \"aes\",\n",
+      "      \"apic\",\n",
+      "      \"avx\",\n",
+      "      \"avx2\",\n",
+      "      \"bmi1\",\n",
+      "      \"bmi2\",\n",
+      "      \"clflush\",\n",
+      "      \"clflushopt\",\n",
+      "      \"cmov\",\n",
+      "      \"cx16\",\n",
+      "      \"cx8\",\n",
+      "      \"de\",\n",
+      "      \"dtes64\",\n",
+      "      \"dts\",\n",
+      "      \"erms\",\n",
+      "      \"est\",\n",
+      "      \"f16c\",\n",
+      "      \"fma\",\n",
+      "      \"fpu\",\n",
+      "      \"fxsr\",\n",
+      "      \"hle\",\n",
+      "      \"ht\",\n",
+      "      \"hypervisor\",\n",
+      "      \"ia64\",\n",
+      "      \"invpcid\",\n",
+      "      \"lahf_lm\",\n",
+      "      \"mca\",\n",
+      "      \"mce\",\n",
+      "      \"mmx\",\n",
+      "      \"movbe\",\n",
+      "      \"mpx\",\n",
+      "      \"msr\",\n",
+      "      \"mtrr\",\n",
+      "      \"osxsave\",\n",
+      "      \"pae\",\n",
+      "      \"pat\",\n",
+      "      \"pbe\",\n",
+      "      \"pcid\",\n",
+      "      \"pclmulqdq\",\n",
+      "      \"pdcm\",\n",
+      "      \"pge\",\n",
+      "      \"pni\",\n",
+      "      \"popcnt\",\n",
+      "      \"pse\",\n",
+      "      \"pse36\",\n",
+      "      \"rdrnd\",\n",
+      "      \"rdseed\",\n",
+      "      \"rtm\",\n",
+      "      \"sep\",\n",
+      "      \"serial\",\n",
+      "      \"sgx\",\n",
+      "      \"sgx_lc\",\n",
+      "      \"smap\",\n",
+      "      \"smep\",\n",
+      "      \"ss\",\n",
+      "      \"sse\",\n",
+      "      \"sse2\",\n",
+      "      \"sse4_1\",\n",
+      "      \"sse4_2\",\n",
+      "      \"ssse3\",\n",
+      "      \"tm\",\n",
+      "      \"tm2\",\n",
+      "      \"tsc\",\n",
+      "      \"tscdeadline\",\n",
+      "      \"vme\",\n",
+      "      \"x2apic\",\n",
+      "      \"xsave\",\n",
+      "      \"xtpr\"\n",
+      "    ],\n",
       "    \"processor\": \"Intel64 Family 6 Model 158 Stepping 10, GenuineIntel\"\n",
       "  },\n",
       "  \"memory\": {\n",
-      "    \"total\": 16971276288,\n",
-      "    \"available\": 4723568640\n",
+      "    \"total\": 16977195008,\n",
+      "    \"available\": 6085459968\n",
       "  },\n",
       "  \"python\": \"3.6.10.final.0 (64 bit)\",\n",
-      "  \"os\": \"Windows-10-10.0.19041-SP0\",\n",
+      "  \"os\": \"Windows-10-10.0.21390-SP0\",\n",
       "  \"onnxruntime\": {\n",
-      "    \"version\": \"1.4.0\",\n",
+      "    \"version\": \"1.8.1\",\n",
       "    \"support_gpu\": false\n",
       "  },\n",
+      "  \"onnxruntime_tools\": null,\n",
       "  \"pytorch\": {\n",
-      "    \"version\": \"1.6.0+cpu\",\n",
-      "    \"support_gpu\": false\n",
+      "    \"version\": \"1.9.0+cpu\",\n",
+      "    \"support_gpu\": false,\n",
+      "    \"cuda\": null\n",
       "  },\n",
       "  \"tensorflow\": {\n",
       "    \"version\": \"2.3.0\",\n",
@@ -954,20 +925,14 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "2020-08-06 17:30:50.400838: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library cudart64_101.dll\n"
+      "2021-07-13 14:41:45.376756: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'cudart64_101.dll'; dlerror: cudart64_101.dll not found\n",
+      "2021-07-13 14:41:45.376780: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n"
      ]
     }
    ],
    "source": [
-    "!{sys.executable} -m onnxruntime_tools.transformers.machine_info --silent"
+    "!{sys.executable} -m onnxruntime.transformers.machine_info --silent"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
diff --git a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb
index 3b090b8232..1016aef1fe 100644
--- a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb
+++ b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb
@@ -36,17 +36,16 @@
     "First, we install [AnaConda](https://www.anaconda.com/distribution/) in a target machine and open an AnaConda prompt window when it is done. Then run the following commands to create a conda environment. This notebook is tested with PyTorch 1.5.0 and OnnxRuntime 1.3.0.\n",
     "\n",
     "```console\n",
-    "conda create -n gpu_env python=3.7\n",
+    "conda create -n gpu_env python=3.6\n",
     "conda activate gpu_env\n",
-    "conda install pytorch torchvision cudatoolkit=10.1 -c pytorch\n",
     "conda install -c anaconda ipykernel\n",
     "conda install -c conda-forge ipywidgets\n",
-    "python -m ipykernel install --user --name=gpu_env_py37\n",
+    "python -m ipykernel install --user --name=gpu_env\n",
     "jupyter notebook\n",
     "```\n",
-    "Finally, launch Jupyter Notebook and you can choose gpu_env_py37 as kernel to run this notebook.\n",
+    "Finally, launch Jupyter Notebook and you can choose gpu_env as kernel to run this notebook.\n",
     "\n",
-    "Onnxruntime-gpu need specified version of CUDA and cuDNN. You can find the corresponding version in [requirements](https://github.com/microsoft/onnxruntime/tree/rel-1.3.0#system-requirements). If the version is different from above cudatoolkit version, you have to install them separately, and add their bin directories to PATH environment variable (See [CUDA and cuDNN Path](#CUDA-and-cuDNN-Path) below)."
+    "Onnxruntime-gpu needs specific versions of CUDA and cuDNN. You can find the requirements [here](http://www.onnxruntime.ai/docs/how-to/install.html). Remember to add their directories to the PATH environment variable (see [CUDA and cuDNN Path](#CUDA-and-cuDNN-Path) below)."
    ]
   },
   {
@@ -56,12 +55,46 @@
    "outputs": [],
    "source": [
     "import sys\n",
-    "!{sys.executable} -m pip uninstall --quiet --yes onnxruntime-gpu\n",
-    "!{sys.executable} -m pip install --quiet onnxruntime-gpu\n",
-    "!{sys.executable} -m pip install --quiet --upgrade transformers\n",
-    "!{sys.executable} -m pip install --quiet --upgrade onnxconverter_common\n",
-    "!{sys.executable} -m pip install --quiet --upgrade onnxruntime-tools\n",
-    "!{sys.executable} -m pip install --quiet wget netron pandas"
+    "\n",
+    "run_install = False # Only need install once\n",
+    "if run_install:\n",
+    "    if sys.platform in ['linux', 'win32']: # Linux or Windows\n",
+    "        !{sys.executable} -m pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio===0.9.0 -f https://download.pytorch.org/whl/torch_stable.html\n",
+    "    else: # Mac\n",
+    "        print(\"PyTorch 1.9 MacOS Binaries do not support CUDA, install from source instead\")\n",
+    "\n",
+    "    !{sys.executable} -m pip install onnxruntime-gpu==1.8.1 onnx==1.9.0 onnxconverter_common==1.8.1\n",
+    "\n",
+    "    # Install other packages used in this notebook.\n",
+    "    !{sys.executable} -m pip install transformers==4.8.2\n",
+    "    !{sys.executable} -m pip install psutil pytz pandas py-cpuinfo py3nvml coloredlogs wget netron sympy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "pytorch: 1.9.0+cu111\n",
+      "onnxruntime: 1.8.1\n",
+      "onnx: 1.9.0\n",
+      "transformers: 4.8.2\n"
+     ]
+    }
+   ],
+   "source": [
+    "import torch\n",
+    "import onnx\n",
+    "import onnxruntime\n",
+    "import transformers\n",
+    "print(\"pytorch:\", torch.__version__)\n",
+    "print(\"onnxruntime:\", onnxruntime.__version__)\n",
+    "print(\"onnx:\", onnx.__version__)\n",
+    "print(\"transformers:\", transformers.__version__)"
    ]
   },
   {
@@ -80,7 +113,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -108,7 +141,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -131,12 +164,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [],
    "source": [
-    "# For fine-tuned large model, the model name is \"bert-large-uncased-whole-word-masking-finetuned-squad\". Here we use bert-base for demo.\n",
-    "model_name_or_path = \"bert-base-cased\"\n",
+    "# fine-tuned model from https://huggingface.co/models?search=squad\n",
+    "model_name_or_path = \"bert-large-uncased-whole-word-masking-finetuned-squad\"\n",
     "max_seq_length = 128\n",
     "doc_stride = 128\n",
     "max_query_length = 64"
@@ -151,16 +184,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████| 48/48 [00:04<00:00, 11.28it/s]\n",
-      "convert squad examples to features: 100%|██████████| 1000/1000 [00:09<00:00, 102.15it/s]\n",
-      "add example index and unique id: 100%|██████████| 1000/1000 [00:00<00:00, 161306.98it/s]\n"
+      "100%|██████████| 48/48 [00:03<00:00, 14.24it/s]\n",
+      "convert squad examples to features: 100%|██████████| 1000/1000 [00:08<00:00, 112.67it/s]\n",
+      "add example index and unique id: 100%|██████████| 1000/1000 [00:00<00:00, 836518.55it/s]\n"
      ]
     }
    ],
@@ -206,9 +239,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/disk/conda3/envs/gpu_env/lib/python3.6/site-packages/transformers/modeling_utils.py:1974: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
+      "  input_tensor.shape[chunk_dim] == tensor_shape for input_tensor in input_tensors\n"
+     ]
+    },
     {
      "name": "stdout",
      "output_type": "stream",
@@ -270,14 +311,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "PyTorch cuda Inference time = 16.57 ms\n"
+      "PyTorch cuda Inference time = 16.56 ms\n"
      ]
     }
    ],
@@ -307,47 +348,7 @@
     "## 4. Inference ONNX Model with ONNX Runtime ##\n",
     "\n",
     "### CUDA and cuDNN Path\n",
-    "onnxruntime-gpu has dependency on [CUDA](https://developer.nvidia.com/cuda-downloads) and [cuDNN](https://developer.nvidia.com/cudnn):\n",
-    "\n",
-    "* [onnxruntime-gpu v1.3.0](https://github.com/microsoft/onnxruntime/tree/rel-1.3.0#system-requirements) requires CUDA Runtime 10.1 and CUDNN 7.6.5.\n",
-    "* [onnxruntime-gpu v1.2.0](https://github.com/microsoft/onnxruntime/releases/tag/v1.2.0) requires CUDA Runtime 10.1 and CUDNN 7.6.5.\n",
-    "\n",
-    "During installing PyTorch 1.5, we installed cudatoolkit 10.1.243 in this conda environment. That shall be good for onnxruntime-gpu 1.3.0 in Jupyter Notebook."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Change to True when onnxruntime (like onnxruntime-gpu 1.0.0 ~ 1.1.2) cannot be imported.\n",
-    "add_cuda_path = False\n",
-    "\n",
-    "if add_cuda_path:\n",
-    "    # Add path of CUDA 10.0 and CUDNN 7.6 for onnxruntime-gpu 1.0.0 ~ 1.1.2\n",
-    "    cuda_dir = 'D:/NVidia/CUDA/v10.1/bin'\n",
-    "    cudnn_dir = 'D:/NVidia/CUDA/v10.1/bin'\n",
-    "    if not (os.path.exists(cuda_dir) and os.path.exists(cudnn_dir)):\n",
-    "        raise ValueError(\"Please specify correct path for CUDA and cuDNN. Otherwise onnxruntime cannot be imported.\")\n",
-    "    else:\n",
-    "        if cuda_dir == cudnn_dir:\n",
-    "            os.environ[\"PATH\"] = cuda_dir + ';' + os.environ[\"PATH\"]\n",
-    "        else:\n",
-    "            os.environ[\"PATH\"] = cuda_dir + ';' + cudnn_dir + ';' + os.environ[\"PATH\"]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### OpenMP Environment Variable\n",
-    "\n",
-    "OpenMP environment variables are optional for GPU inference of standard Bert model. It has little performance impact on Bert model since most nodes are executed in GPU. \n",
-    "\n",
-    "You can find the best setting based on [Performance Test Tool](#Performance-Test-Tool) result in later part of this notebook.\n",
-    "\n",
-    "**Attention: Setting environment variables shall be done before importing onnxruntime**. Otherwise, they might not take effect."
+    "onnxruntime-gpu has a dependency on [CUDA](https://developer.nvidia.com/cuda-downloads) and [cuDNN](https://developer.nvidia.com/cudnn). The required CUDA version can be found [here](http://www.onnxruntime.ai/docs/reference/execution-providers/CUDA-ExecutionProvider.html#requirements).\n"
    ]
   },
   {
@@ -356,9 +357,21 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Optional. You can change them according to Performance Test Tool result.\n",
-    "#os.environ[\"OMP_NUM_THREADS\"] = '1'\n",
-    "#os.environ[\"OMP_WAIT_POLICY\"] = 'PASSIVE'"
+    "# Change to True when onnxruntime (like onnxruntime-gpu 1.0.0 ~ 1.1.2) cannot be imported.\n",
+    "add_cuda_path = False\n",
+    "\n",
+    "# For Linux, see https://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#environment-setup\n",
+    "# Below is example for Windows\n",
+    "if add_cuda_path:\n",
+    "    cuda_dir = 'D:/NVidia/CUDA/v11.0/bin'\n",
+    "    cudnn_dir = 'D:/NVidia/CUDA/v11.0/bin'\n",
+    "    if not (os.path.exists(cuda_dir) and os.path.exists(cudnn_dir)):\n",
+    "        raise ValueError(\"Please specify correct path for CUDA and cuDNN. Otherwise onnxruntime cannot be imported.\")\n",
+    "    else:\n",
+    "        if cuda_dir == cudnn_dir:\n",
+    "            os.environ[\"PATH\"] = cuda_dir + ';' + os.environ[\"PATH\"]\n",
+    "        else:\n",
+    "            os.environ[\"PATH\"] = cuda_dir + ';' + cudnn_dir + ';' + os.environ[\"PATH\"]"
    ]
   },
   {
@@ -377,7 +390,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "OnnxRuntime gpu Inference time = 4.43 ms\n"
+      "OnnxRuntime gpu Inference time = 25.28 ms\n"
      ]
     }
    ],
@@ -403,7 +416,7 @@
     "latency = []\n",
     "for i in range(total_samples):\n",
     "    data = dataset[i]\n",
-    "    # TODO: use IO Binding (see https://github.com/microsoft/onnxruntime/pull/4206) to improve performance.\n",
+    "    # TODO: use IO Binding (see https://www.onnxruntime.ai/python/api_summary.html) to improve performance.\n",
     "    ort_inputs = {\n",
     "        'input_ids':  data[0].cpu().reshape(1, max_seq_length).numpy(),\n",
     "        'input_mask': data[1].cpu().reshape(1, max_seq_length).numpy(),\n",
@@ -436,9 +449,9 @@
      "text": [
       "***** Verifying correctness *****\n",
       "PyTorch and ONNX Runtime output 0 are close: True\n",
-      "maximum_diff=9.499490261077881e-07 average_diff=1.4225952327251434e-07\n",
+      "maximum_diff=5.7220458984375e-06 average_diff=1.3103708624839783e-06\n",
       "PyTorch and ONNX Runtime output 1 are close: True\n",
-      "maximum_diff=6.92903995513916e-07 average_diff=1.2441887520253658e-07\n"
+      "maximum_diff=5.7220458984375e-06 average_diff=1.2257369235157967e-06\n"
      ]
     }
    ],
@@ -472,13 +485,13 @@
     {
      "data": {
       "text/plain": [
-       "{'input_ids': tensor([[  101,  1293,  1242,  2557,  1127,  1226,  1104,  1103,  3613, 16429,\n",
-       "           5235,   136,   102,  3613, 16429,  5988,   170,   107,  1353,  1671,\n",
-       "           1992,  1342,   107,  5235,   117,  1107,  1134,  1473,  3683,  3538,\n",
-       "           1125,   170,  1476,   118,  1248,  2595,  4086,  1714,  1104,  2965,\n",
-       "          15897,  1104,  3613, 16429,   119,  1473,  3683,  3538,  3222,  1149,\n",
-       "           2551,  1168, 23759,  1116,  1121,  1506,  1103, 10280,  2231,  1111,\n",
-       "           1103,  1714, 16355,   119,   102,     0,     0,     0,     0,     0,\n",
+       "{'input_ids': tensor([[  101,  2054,  2329,  2694,  2897,  2097,  4287,  1996,  3565,  4605,\n",
+       "           1029,   102,  1999,  1996,  2142,  2983,  1010,  4035,  2557,  1019,\n",
+       "           2444,  1998,  1019,  2444,  2998,  4469,  2097,  4287,  1996,  5049,\n",
+       "           1012,  1996,  4035,  2097,  4287,  2049,  2219,  2329,  2394,  3743,\n",
+       "           1010,  2007,  6754, 10184,  1010, 12270, 10589,  1998,  6857,  8945,\n",
+       "          18505,  2006,  8570,  1012,   102,     0,     0,     0,     0,     0,\n",
+       "              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
        "              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
        "              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
        "              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,\n",
@@ -488,13 +501,13 @@
        "        device='cuda:0'),\n",
        " 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
        "          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
-       "          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,\n",
+       "          1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
        "          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
        "          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
        "          0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'),\n",
-       " 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
+       " 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
        "          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n",
-       "          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,\n",
+       "          1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
        "          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
        "          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n",
        "          0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0')}"
@@ -526,8 +539,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Average length 101\n",
-      "OnnxRuntime gpu Inference time with actual sequence length = 4.23 ms\n"
+      "Average length 94\n",
+      "OnnxRuntime gpu Inference time with actual sequence length = 21.93 ms\n"
      ]
     }
    ],
@@ -611,12 +624,12 @@
     "\n",
     "Example Usage:\n",
     "```\n",
-    "from onnxruntime_tools import optimizer\n",
+    "from onnxruntime.transformers import optimizer\n",
     "optimized_model = optimizer.optimize_model(export_model_path, model_type='bert', num_heads=12, hidden_size=768)\n",
     "optimized_model.save_model_to_file(optimized_model_path)\n",
     "```\n",
     "\n",
-    "You can also use optimizer_cli like the following:"
+    "You can also use the command line like the following:"
    ]
   },
   {
@@ -638,20 +651,21 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "optimize_by_onnxruntime: Save optimized model by onnxruntime to ./onnx/bert-base-cased-squad_opset11_o1_cpu.onnx\n",
-      "               apply: Fused LayerNormalization count: 25\n",
-      "               apply: Fused Gelu count: 12\n",
-      "               apply: Fused SkipLayerNormalization count: 25\n",
-      "               apply: Fused Attention count: 12\n",
+      "               apply: Fused LayerNormalization count: 49\n",
+      "               apply: Fused Gelu count: 24\n",
+      "adjust_reshape_and_expand: Removed Reshape and Expand count: 0\n",
+      "               apply: Fused SkipLayerNormalization count: 48\n",
+      "               apply: Fused Attention count: 24\n",
       "         prune_graph: Graph pruned: 0 inputs, 0 outputs and 5 nodes are removed\n",
       "               apply: Fused EmbedLayerNormalization(with mask) count: 1\n",
-      "         prune_graph: Graph pruned: 0 inputs, 0 outputs and 12 nodes are removed\n",
+      "         prune_graph: Graph pruned: 0 inputs, 0 outputs and 3 nodes are removed\n",
       "         prune_graph: Graph pruned: 0 inputs, 0 outputs and 0 nodes are removed\n",
-      "               apply: Fused BiasGelu count: 12\n",
-      "               apply: Fused SkipLayerNormalization(add bias) count: 24\n",
+      "               apply: Fused BiasGelu count: 24\n",
+      "               apply: Fused SkipLayerNormalization(add bias) count: 48\n",
       "            optimize: opset verion: 11\n",
+      "  save_model_to_file: Sort graphs in topological order\n",
       "  save_model_to_file: Output model to ./onnx/bert-base-cased-squad_opt_gpu_fp32.onnx\n",
-      "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 12, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 12, 'LayerNormalization': 0, 'SkipLayerNormalization': 24}\n",
+      "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 24, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 24, 'LayerNormalization': 0, 'SkipLayerNormalization': 48}\n",
       "                main: The model has been fully optimized.\n"
      ]
     }
@@ -659,7 +673,7 @@
    "source": [
     "optimized_fp32_model_path = './onnx/bert-base-cased-squad_opt_{}_fp32.onnx'.format('gpu' if use_gpu else 'cpu')\n",
     "\n",
-    "!python -m onnxruntime_tools.optimizer_cli --input $export_model_path --output $optimized_fp32_model_path"
+    "!{sys.executable} -m onnxruntime.transformers.optimizer --input $export_model_path --output $optimized_fp32_model_path"
    ]
   },
   {
@@ -712,32 +726,24 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=None, omp_wait_policy=None, intra_op_num_threads=None, seed=3, verbose=False, inclusive=False, extra_latency=True)\n",
+      "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=None, seed=3, verbose=False)\n",
       "Generating 1000 samples for batch_size=1 sequence_length=128\n",
-      "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=0,OMP_NUM_THREADS=,OMP_WAIT_POLICY=,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
-      "Average latency = 4.92 ms, Throughput = 203.24 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
-      "Average latency = 4.90 ms, Throughput = 203.88 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
-      "Average latency = 5.07 ms, Throughput = 197.16 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
-      "Average latency = 4.82 ms, Throughput = 207.33 QPS\n",
-      "skip duplicated test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
-      "skip duplicated test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
-      "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
-      "Average latency = 4.93 ms, Throughput = 202.92 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
-      "Average latency = 4.91 ms, Throughput = 203.55 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
-      "Average latency = 4.88 ms, Throughput = 204.90 QPS\n",
-      "Test summary is saved to onnx/perf_results_GPU_B1_S128_20200617-232134.txt\n"
+      "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=4,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n",
+      "Average latency = 23.72 ms, Throughput = 42.15 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n",
+      "Average latency = 24.24 ms, Throughput = 41.25 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=2,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n",
+      "Average latency = 24.36 ms, Throughput = 41.05 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_gpu_fp32.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n",
+      "Average latency = 24.39 ms, Throughput = 41.01 QPS\n",
+      "Test summary is saved to onnx/perf_results_GPU_B1_S128_20210714-001817.txt\n"
      ]
     }
    ],
    "source": [
     "GPU_OPTION = '--use_gpu' if use_gpu else ''\n",
     "\n",
-    "!python -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 1000 --test_times 1 --inclusive --all $GPU_OPTION"
+    "!{sys.executable} -m onnxruntime.transformers.bert_perf_test --model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 1000 --test_times 1 $GPU_OPTION"
    ]
   },
   {
@@ -756,7 +762,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Float32 model perf results from ./onnx/perf_results_GPU_B1_S128_20200617-232134.txt\n"
+      "Float32 model perf results from ./onnx/perf_results_GPU_B1_S128_20210714-001817.txt\n"
      ]
     },
     {
@@ -788,117 +794,52 @@
        "      <th>Latency_P99</th>\n",
        "      <th>Throughput(QPS)</th>\n",
        "      <th>intra_op_num_threads</th>\n",
-       "      <th>OMP_NUM_THREADS</th>\n",
-       "      <th>OMP_WAIT_POLICY</th>\n",
-       "      <th>contiguous</th>\n",
-       "      <th>warmup</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>4.82</td>\n",
-       "      <td>4.53</td>\n",
-       "      <td>4.57</td>\n",
-       "      <td>5.15</td>\n",
-       "      <td>7.25</td>\n",
-       "      <td>8.75</td>\n",
-       "      <td>207.33</td>\n",
-       "      <td>1</td>\n",
-       "      <td>12</td>\n",
-       "      <td>ACTIVE</td>\n",
-       "      <td>None</td>\n",
-       "      <td>True</td>\n",
+       "      <td>23.72</td>\n",
+       "      <td>23.72</td>\n",
+       "      <td>23.87</td>\n",
+       "      <td>23.99</td>\n",
+       "      <td>24.11</td>\n",
+       "      <td>24.37</td>\n",
+       "      <td>42.15</td>\n",
+       "      <td>4</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>4.88</td>\n",
-       "      <td>4.54</td>\n",
-       "      <td>4.58</td>\n",
-       "      <td>6.47</td>\n",
-       "      <td>7.13</td>\n",
-       "      <td>8.68</td>\n",
-       "      <td>204.90</td>\n",
-       "      <td>12</td>\n",
-       "      <td>12</td>\n",
-       "      <td>PASSIVE</td>\n",
-       "      <td>None</td>\n",
-       "      <td>True</td>\n",
+       "      <td>24.24</td>\n",
+       "      <td>24.24</td>\n",
+       "      <td>24.42</td>\n",
+       "      <td>24.60</td>\n",
+       "      <td>24.76</td>\n",
+       "      <td>25.23</td>\n",
+       "      <td>41.25</td>\n",
+       "      <td>3</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>4.90</td>\n",
-       "      <td>4.54</td>\n",
-       "      <td>4.57</td>\n",
-       "      <td>6.16</td>\n",
-       "      <td>7.64</td>\n",
-       "      <td>8.82</td>\n",
-       "      <td>203.88</td>\n",
-       "      <td>1</td>\n",
-       "      <td>12</td>\n",
-       "      <td>PASSIVE</td>\n",
-       "      <td>None</td>\n",
-       "      <td>True</td>\n",
+       "      <td>24.36</td>\n",
+       "      <td>24.36</td>\n",
+       "      <td>24.47</td>\n",
+       "      <td>24.69</td>\n",
+       "      <td>25.01</td>\n",
+       "      <td>26.52</td>\n",
+       "      <td>41.05</td>\n",
+       "      <td>2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>4.91</td>\n",
-       "      <td>4.55</td>\n",
-       "      <td>4.59</td>\n",
-       "      <td>6.70</td>\n",
-       "      <td>7.43</td>\n",
-       "      <td>8.78</td>\n",
-       "      <td>203.55</td>\n",
-       "      <td>12</td>\n",
-       "      <td>12</td>\n",
-       "      <td>ACTIVE</td>\n",
-       "      <td>None</td>\n",
-       "      <td>True</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>4.92</td>\n",
-       "      <td>4.57</td>\n",
-       "      <td>4.60</td>\n",
-       "      <td>6.50</td>\n",
-       "      <td>7.82</td>\n",
-       "      <td>8.90</td>\n",
-       "      <td>203.24</td>\n",
-       "      <td>0</td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "      <td>None</td>\n",
-       "      <td>True</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5</th>\n",
-       "      <td>4.93</td>\n",
-       "      <td>4.55</td>\n",
-       "      <td>4.59</td>\n",
-       "      <td>6.66</td>\n",
-       "      <td>7.57</td>\n",
-       "      <td>8.80</td>\n",
-       "      <td>202.92</td>\n",
-       "      <td>12</td>\n",
+       "      <td>24.39</td>\n",
+       "      <td>24.37</td>\n",
+       "      <td>24.47</td>\n",
+       "      <td>24.65</td>\n",
+       "      <td>24.73</td>\n",
+       "      <td>25.12</td>\n",
+       "      <td>41.01</td>\n",
        "      <td>1</td>\n",
-       "      <td>PASSIVE</td>\n",
-       "      <td>None</td>\n",
-       "      <td>True</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>6</th>\n",
-       "      <td>5.07</td>\n",
-       "      <td>4.56</td>\n",
-       "      <td>4.61</td>\n",
-       "      <td>7.19</td>\n",
-       "      <td>8.11</td>\n",
-       "      <td>9.01</td>\n",
-       "      <td>197.16</td>\n",
-       "      <td>12</td>\n",
-       "      <td>1</td>\n",
-       "      <td>ACTIVE</td>\n",
-       "      <td>None</td>\n",
-       "      <td>True</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -906,31 +847,16 @@
       ],
       "text/plain": [
        "   Latency(ms)  Latency_P50  Latency_P75  Latency_P90  Latency_P95  \\\n",
-       "0         4.82         4.53         4.57         5.15         7.25   \n",
-       "1         4.88         4.54         4.58         6.47         7.13   \n",
-       "2         4.90         4.54         4.57         6.16         7.64   \n",
-       "3         4.91         4.55         4.59         6.70         7.43   \n",
-       "4         4.92         4.57         4.60         6.50         7.82   \n",
-       "5         4.93         4.55         4.59         6.66         7.57   \n",
-       "6         5.07         4.56         4.61         7.19         8.11   \n",
+       "0        23.72        23.72        23.87        23.99        24.11   \n",
+       "1        24.24        24.24        24.42        24.60        24.76   \n",
+       "2        24.36        24.36        24.47        24.69        25.01   \n",
+       "3        24.39        24.37        24.47        24.65        24.73   \n",
        "\n",
-       "   Latency_P99  Throughput(QPS)  intra_op_num_threads OMP_NUM_THREADS  \\\n",
-       "0         8.75           207.33                     1              12   \n",
-       "1         8.68           204.90                    12              12   \n",
-       "2         8.82           203.88                     1              12   \n",
-       "3         8.78           203.55                    12              12   \n",
-       "4         8.90           203.24                     0                   \n",
-       "5         8.80           202.92                    12               1   \n",
-       "6         9.01           197.16                    12               1   \n",
-       "\n",
-       "  OMP_WAIT_POLICY contiguous  warmup  \n",
-       "0          ACTIVE       None    True  \n",
-       "1         PASSIVE       None    True  \n",
-       "2         PASSIVE       None    True  \n",
-       "3          ACTIVE       None    True  \n",
-       "4                       None    True  \n",
-       "5         PASSIVE       None    True  \n",
-       "6          ACTIVE       None    True  "
+       "   Latency_P99  Throughput(QPS)  intra_op_num_threads  \n",
+       "0        24.37            42.15                     4  \n",
+       "1        25.23            41.25                     3  \n",
+       "2        26.52            41.05                     2  \n",
+       "3        25.12            41.01                     1  "
       ]
      },
      "execution_count": 18,
@@ -943,7 +869,7 @@
     "import glob     \n",
     "import pandas\n",
     "latest_result_file = max(glob.glob(\"./onnx/perf_results_GPU_B1_S128_*.txt\"), key=os.path.getmtime)\n",
-    "result_data = pandas.read_table(latest_result_file, converters={'OMP_NUM_THREADS': str, 'OMP_WAIT_POLICY':str})\n",
+    "result_data = pandas.read_table(latest_result_file)\n",
     "print(\"Float32 model perf results from\", latest_result_file)\n",
     "# Remove some columns that have same values for all rows.\n",
     "columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu']\n",
@@ -974,13 +900,13 @@
      "output_type": "stream",
      "text": [
       "100% passed for 100 random inputs given thresholds (rtol=0.01, atol=0.01).\r\n",
-      "maximum absolute difference=1.9222497940063477e-06\r\n",
-      "maximum relative difference=0.05027933046221733\r\n"
+      "maximum absolute difference=5.316734313964844e-05\r\n",
+      "maximum relative difference=0.00012461667938623577\r\n"
      ]
     }
    ],
    "source": [
-    "!python -m onnxruntime_tools.transformers.compare_bert_results --baseline_model $export_model_path --optimized_model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 100 --rtol 0.01 --atol 0.01 $GPU_OPTION"
+    "!{sys.executable} -m onnxruntime.transformers.compare_bert_results --baseline_model $export_model_path --optimized_model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 100 --rtol 0.01 --atol 0.01 $GPU_OPTION"
    ]
   },
   {
@@ -1003,27 +929,28 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "optimize_by_onnxruntime: Save optimized model by onnxruntime to ./onnx/bert-base-cased-squad_opset11_o1_cpu.onnx\n",
-      "               apply: Fused LayerNormalization count: 25\n",
-      "               apply: Fused Gelu count: 12\n",
-      "               apply: Fused SkipLayerNormalization count: 25\n",
-      "               apply: Fused Attention count: 12\n",
+      "               apply: Fused LayerNormalization count: 49\n",
+      "               apply: Fused Gelu count: 24\n",
+      "adjust_reshape_and_expand: Removed Reshape and Expand count: 0\n",
+      "               apply: Fused SkipLayerNormalization count: 48\n",
+      "               apply: Fused Attention count: 24\n",
       "         prune_graph: Graph pruned: 0 inputs, 0 outputs and 5 nodes are removed\n",
       "               apply: Fused EmbedLayerNormalization(with mask) count: 1\n",
-      "         prune_graph: Graph pruned: 0 inputs, 0 outputs and 12 nodes are removed\n",
+      "         prune_graph: Graph pruned: 0 inputs, 0 outputs and 3 nodes are removed\n",
       "         prune_graph: Graph pruned: 0 inputs, 0 outputs and 0 nodes are removed\n",
-      "               apply: Fused BiasGelu count: 12\n",
-      "               apply: Fused SkipLayerNormalization(add bias) count: 24\n",
+      "               apply: Fused BiasGelu count: 24\n",
+      "               apply: Fused SkipLayerNormalization(add bias) count: 48\n",
       "            optimize: opset verion: 11\n",
+      "  save_model_to_file: Sort graphs in topological order\n",
       "  save_model_to_file: Output model to ./onnx/bert-base-cased-squad_opt_gpu_fp16.onnx\n",
-      "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 12, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 12, 'LayerNormalization': 0, 'SkipLayerNormalization': 24}\n",
+      "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 24, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 24, 'LayerNormalization': 0, 'SkipLayerNormalization': 48}\n",
       "                main: The model has been fully optimized.\n"
      ]
     }
    ],
    "source": [
     "optimized_fp16_model_path = './onnx/bert-base-cased-squad_opt_{}_fp16.onnx'.format('gpu' if use_gpu else 'cpu')\n",
-    "!python -m onnxruntime_tools.optimizer_cli --input $export_model_path --output $optimized_fp16_model_path --float16"
+    "!{sys.executable} -m onnxruntime.transformers.optimizer --input $export_model_path --output $optimized_fp16_model_path --float16"
    ]
   },
   {
@@ -1035,31 +962,23 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=None, omp_wait_policy=None, intra_op_num_threads=None, seed=3, verbose=False, inclusive=False, extra_latency=True)\n",
+      "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=None, seed=3, verbose=False)\n",
       "Generating 1000 samples for batch_size=1 sequence_length=128\n",
-      "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=0,OMP_NUM_THREADS=,OMP_WAIT_POLICY=,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
-      "Average latency = 3.01 ms, Throughput = 331.90 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
-      "Average latency = 3.12 ms, Throughput = 320.00 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
-      "Average latency = 3.02 ms, Throughput = 331.39 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
-      "Average latency = 3.01 ms, Throughput = 332.53 QPS\n",
-      "skip duplicated test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
-      "skip duplicated test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
-      "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
-      "Average latency = 3.04 ms, Throughput = 328.67 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
-      "Average latency = 3.01 ms, Throughput = 331.72 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
-      "Average latency = 3.04 ms, Throughput = 329.32 QPS\n",
-      "Test summary is saved to onnx/perf_results_GPU_B1_S128_20200617-232234.txt\n"
+      "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=4,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n",
+      "Average latency = 6.78 ms, Throughput = 147.54 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n",
+      "Average latency = 6.76 ms, Throughput = 147.85 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=2,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n",
+      "Average latency = 6.79 ms, Throughput = 147.30 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n",
+      "Average latency = 6.81 ms, Throughput = 146.75 QPS\n",
+      "Test summary is saved to onnx/perf_results_GPU_B1_S128_20210714-002224.txt\n"
      ]
     }
    ],
    "source": [
     "GPU_OPTION = '--use_gpu' if use_gpu else ''\n",
-    "!python -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_fp16_model_path --batch_size 1 --sequence_length 128 --samples 1000 --test_times 1 --inclusive --all $GPU_OPTION"
+    "!{sys.executable} -m onnxruntime.transformers.bert_perf_test --model $optimized_fp16_model_path --batch_size 1 --sequence_length 128 --samples 1000 --test_times 1 $GPU_OPTION"
    ]
   },
   {
@@ -1071,7 +990,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Float32 model perf results from ./onnx/perf_results_GPU_B1_S128_20200617-232234.txt\n"
+      "Float32 model perf results from ./onnx/perf_results_GPU_B1_S128_20210714-002224.txt\n"
      ]
     },
     {
@@ -1103,117 +1022,52 @@
        "      <th>Latency_P99</th>\n",
        "      <th>Throughput(QPS)</th>\n",
        "      <th>intra_op_num_threads</th>\n",
-       "      <th>OMP_NUM_THREADS</th>\n",
-       "      <th>OMP_WAIT_POLICY</th>\n",
-       "      <th>contiguous</th>\n",
-       "      <th>warmup</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>3.01</td>\n",
-       "      <td>2.79</td>\n",
-       "      <td>2.81</td>\n",
-       "      <td>2.86</td>\n",
-       "      <td>5.08</td>\n",
-       "      <td>7.16</td>\n",
-       "      <td>332.53</td>\n",
-       "      <td>1</td>\n",
-       "      <td>12</td>\n",
-       "      <td>ACTIVE</td>\n",
-       "      <td>None</td>\n",
-       "      <td>True</td>\n",
+       "      <td>6.76</td>\n",
+       "      <td>6.79</td>\n",
+       "      <td>6.81</td>\n",
+       "      <td>6.90</td>\n",
+       "      <td>6.91</td>\n",
+       "      <td>7.00</td>\n",
+       "      <td>147.85</td>\n",
+       "      <td>3</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>3.01</td>\n",
-       "      <td>2.80</td>\n",
-       "      <td>2.81</td>\n",
-       "      <td>2.88</td>\n",
-       "      <td>4.52</td>\n",
-       "      <td>7.05</td>\n",
-       "      <td>331.90</td>\n",
-       "      <td>0</td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "      <td>None</td>\n",
-       "      <td>True</td>\n",
+       "      <td>6.78</td>\n",
+       "      <td>6.70</td>\n",
+       "      <td>6.79</td>\n",
+       "      <td>6.87</td>\n",
+       "      <td>6.90</td>\n",
+       "      <td>7.63</td>\n",
+       "      <td>147.54</td>\n",
+       "      <td>4</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>3.01</td>\n",
-       "      <td>2.78</td>\n",
-       "      <td>2.80</td>\n",
-       "      <td>2.92</td>\n",
-       "      <td>5.01</td>\n",
-       "      <td>7.02</td>\n",
-       "      <td>331.72</td>\n",
-       "      <td>12</td>\n",
-       "      <td>12</td>\n",
-       "      <td>ACTIVE</td>\n",
-       "      <td>None</td>\n",
-       "      <td>True</td>\n",
+       "      <td>6.79</td>\n",
+       "      <td>6.79</td>\n",
+       "      <td>6.81</td>\n",
+       "      <td>6.89</td>\n",
+       "      <td>6.91</td>\n",
+       "      <td>7.19</td>\n",
+       "      <td>147.30</td>\n",
+       "      <td>2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>3.02</td>\n",
-       "      <td>2.79</td>\n",
-       "      <td>2.80</td>\n",
-       "      <td>2.85</td>\n",
-       "      <td>6.34</td>\n",
-       "      <td>7.04</td>\n",
-       "      <td>331.39</td>\n",
-       "      <td>12</td>\n",
-       "      <td>1</td>\n",
-       "      <td>ACTIVE</td>\n",
-       "      <td>None</td>\n",
-       "      <td>True</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>3.04</td>\n",
-       "      <td>2.80</td>\n",
-       "      <td>2.82</td>\n",
-       "      <td>2.93</td>\n",
-       "      <td>5.56</td>\n",
-       "      <td>7.08</td>\n",
-       "      <td>329.32</td>\n",
-       "      <td>12</td>\n",
-       "      <td>12</td>\n",
-       "      <td>PASSIVE</td>\n",
-       "      <td>None</td>\n",
-       "      <td>True</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5</th>\n",
-       "      <td>3.04</td>\n",
-       "      <td>2.79</td>\n",
-       "      <td>2.81</td>\n",
-       "      <td>2.92</td>\n",
-       "      <td>6.37</td>\n",
-       "      <td>7.08</td>\n",
-       "      <td>328.67</td>\n",
-       "      <td>12</td>\n",
-       "      <td>1</td>\n",
-       "      <td>PASSIVE</td>\n",
-       "      <td>None</td>\n",
-       "      <td>True</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>6</th>\n",
-       "      <td>3.12</td>\n",
-       "      <td>2.79</td>\n",
-       "      <td>2.82</td>\n",
-       "      <td>2.96</td>\n",
-       "      <td>6.66</td>\n",
+       "      <td>6.81</td>\n",
+       "      <td>6.80</td>\n",
+       "      <td>6.89</td>\n",
+       "      <td>6.91</td>\n",
+       "      <td>6.97</td>\n",
        "      <td>7.20</td>\n",
-       "      <td>320.00</td>\n",
+       "      <td>146.75</td>\n",
        "      <td>1</td>\n",
-       "      <td>12</td>\n",
-       "      <td>PASSIVE</td>\n",
-       "      <td>None</td>\n",
-       "      <td>True</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -1221,31 +1075,16 @@
       ],
       "text/plain": [
        "   Latency(ms)  Latency_P50  Latency_P75  Latency_P90  Latency_P95  \\\n",
-       "0         3.01         2.79         2.81         2.86         5.08   \n",
-       "1         3.01         2.80         2.81         2.88         4.52   \n",
-       "2         3.01         2.78         2.80         2.92         5.01   \n",
-       "3         3.02         2.79         2.80         2.85         6.34   \n",
-       "4         3.04         2.80         2.82         2.93         5.56   \n",
-       "5         3.04         2.79         2.81         2.92         6.37   \n",
-       "6         3.12         2.79         2.82         2.96         6.66   \n",
+       "0         6.76         6.79         6.81         6.90         6.91   \n",
+       "1         6.78         6.70         6.79         6.87         6.90   \n",
+       "2         6.79         6.79         6.81         6.89         6.91   \n",
+       "3         6.81         6.80         6.89         6.91         6.97   \n",
        "\n",
-       "   Latency_P99  Throughput(QPS)  intra_op_num_threads OMP_NUM_THREADS  \\\n",
-       "0         7.16           332.53                     1              12   \n",
-       "1         7.05           331.90                     0                   \n",
-       "2         7.02           331.72                    12              12   \n",
-       "3         7.04           331.39                    12               1   \n",
-       "4         7.08           329.32                    12              12   \n",
-       "5         7.08           328.67                    12               1   \n",
-       "6         7.20           320.00                     1              12   \n",
-       "\n",
-       "  OMP_WAIT_POLICY contiguous  warmup  \n",
-       "0          ACTIVE       None    True  \n",
-       "1                       None    True  \n",
-       "2          ACTIVE       None    True  \n",
-       "3          ACTIVE       None    True  \n",
-       "4         PASSIVE       None    True  \n",
-       "5         PASSIVE       None    True  \n",
-       "6         PASSIVE       None    True  "
+       "   Latency_P99  Throughput(QPS)  intra_op_num_threads  \n",
+       "0         7.00           147.85                     3  \n",
+       "1         7.63           147.54                     4  \n",
+       "2         7.19           147.30                     2  \n",
+       "3         7.20           146.75                     1  "
       ]
      },
      "execution_count": 22,
@@ -1258,7 +1097,7 @@
     "import glob     \n",
     "import pandas\n",
     "latest_result_file = max(glob.glob(\"./onnx/perf_results_GPU_B1_S128_*.txt\"), key=os.path.getmtime)\n",
-    "result_data = pandas.read_table(latest_result_file, converters={'OMP_NUM_THREADS': str, 'OMP_WAIT_POLICY':str})\n",
+    "result_data = pandas.read_table(latest_result_file)\n",
     "print(\"Float32 model perf results from\", latest_result_file)\n",
     "# Remove some columns that have same values for all rows.\n",
     "columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu']\n",
@@ -1286,47 +1125,43 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "test setting TestSetting(batch_size=32, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n",
+      "test setting TestSetting(batch_size=32, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n",
       "Generating 1000 samples for batch_size=32 sequence_length=128\n",
-      "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=32,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
-      "Average latency = 16.17 ms, Throughput = 1979.41 QPS\n",
-      "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n",
+      "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=32,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n",
+      "Average latency = 168.40 ms, Throughput = 190.02 QPS\n",
+      "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n",
       "Generating 1000 samples for batch_size=1 sequence_length=128\n",
-      "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
-      "Average latency = 3.00 ms, Throughput = 333.83 QPS\n",
-      "test setting TestSetting(batch_size=2, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n",
+      "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=1,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n",
+      "Average latency = 7.14 ms, Throughput = 140.00 QPS\n",
+      "test setting TestSetting(batch_size=2, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n",
       "Generating 1000 samples for batch_size=2 sequence_length=128\n",
-      "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=2,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
-      "Average latency = 3.59 ms, Throughput = 557.32 QPS\n",
-      "test setting TestSetting(batch_size=64, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n",
-      "Generating 1000 samples for batch_size=64 sequence_length=128\n",
-      "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=64,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
-      "Average latency = 29.26 ms, Throughput = 2187.15 QPS\n",
-      "test setting TestSetting(batch_size=4, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n",
+      "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=2,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n",
+      "Average latency = 11.27 ms, Throughput = 177.41 QPS\n",
+      "test setting TestSetting(batch_size=4, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n",
       "Generating 1000 samples for batch_size=4 sequence_length=128\n",
-      "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=4,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
-      "Average latency = 4.32 ms, Throughput = 926.92 QPS\n",
-      "test setting TestSetting(batch_size=8, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n",
+      "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=4,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n",
+      "Average latency = 21.15 ms, Throughput = 189.09 QPS\n",
+      "test setting TestSetting(batch_size=8, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n",
       "Generating 1000 samples for batch_size=8 sequence_length=128\n",
-      "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=8,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
-      "Average latency = 6.32 ms, Throughput = 1266.63 QPS\n",
-      "test setting TestSetting(batch_size=16, sequence_length=128, test_cases=1000, test_times=1, contiguous=None, use_gpu=True, warmup=True, omp_num_threads=12, omp_wait_policy='ACTIVE', intra_op_num_threads=1, seed=3, verbose=False, inclusive=False, extra_latency=True)\n",
+      "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=8,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n",
+      "Average latency = 42.27 ms, Throughput = 189.27 QPS\n",
+      "test setting TestSetting(batch_size=16, sequence_length=128, test_cases=1000, test_times=1, use_gpu=True, intra_op_num_threads=3, seed=3, verbose=False)\n",
       "Generating 1000 samples for batch_size=16 sequence_length=128\n",
-      "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=16,sequence_length=128,test_cases=1000,test_times=1,contiguous=None,use_gpu=True,warmup=True\n",
-      "Average latency = 9.60 ms, Throughput = 1666.05 QPS\n",
-      "Test summary is saved to onnx/perf_results_GPU_B1-2-4-8-16-32-64_S128_20200617-232401.txt\n"
+      "Running test: model=bert-base-cased-squad_opt_gpu_fp16.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=16,sequence_length=128,test_cases=1000,test_times=1,use_gpu=True\n",
+      "Average latency = 83.77 ms, Throughput = 191.01 QPS\n",
+      "Test summary is saved to onnx/perf_results_GPU_B1-2-4-8-16-32_S128_20210714-002816.txt\n"
      ]
     }
    ],
    "source": [
     "GPU_OPTION = '--use_gpu' if use_gpu else ''\n",
-    "THREAD_SETTING = '--intra_op_num_threads 1 --omp_num_threads {} --omp_wait_policy ACTIVE'.format(psutil.cpu_count(logical=True))\n",
-    "!python -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_fp16_model_path --batch_size 1 2 4 8 16 32 64 --sequence_length 128 --samples 1000 --test_times 1 --inclusive $THREAD_SETTING $GPU_OPTION"
+    "THREAD_SETTING = '--intra_op_num_threads 3'\n",
+    "!{sys.executable} -m onnxruntime.transformers.bert_perf_test --model $optimized_fp16_model_path --batch_size 1 2 4 8 16 32 --sequence_length 128 --samples 1000 --test_times 1 $THREAD_SETTING $GPU_OPTION"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 24,
    "metadata": {
     "scrolled": false
    },
@@ -1335,7 +1170,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "Float16 model summary from ./onnx/perf_results_GPU_B1-2-4-8-16-32-64_S128_20200617-232401.txt\n"
+      "Float16 model summary from ./onnx/perf_results_GPU_B1-2-4-8-16-32_S128_20210714-002816.txt\n"
      ]
     },
     {
@@ -1372,106 +1207,93 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>3.00</td>\n",
-       "      <td>2.79</td>\n",
-       "      <td>2.81</td>\n",
-       "      <td>2.86</td>\n",
-       "      <td>4.37</td>\n",
-       "      <td>7.08</td>\n",
-       "      <td>333.83</td>\n",
+       "      <td>7.14</td>\n",
+       "      <td>7.10</td>\n",
+       "      <td>7.13</td>\n",
+       "      <td>7.25</td>\n",
+       "      <td>7.35</td>\n",
+       "      <td>10.99</td>\n",
+       "      <td>140.00</td>\n",
        "      <td>1</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>3.59</td>\n",
-       "      <td>3.33</td>\n",
-       "      <td>3.35</td>\n",
-       "      <td>3.42</td>\n",
-       "      <td>6.60</td>\n",
-       "      <td>7.54</td>\n",
-       "      <td>557.32</td>\n",
+       "      <td>11.27</td>\n",
+       "      <td>11.23</td>\n",
+       "      <td>11.28</td>\n",
+       "      <td>11.53</td>\n",
+       "      <td>11.57</td>\n",
+       "      <td>12.05</td>\n",
+       "      <td>177.41</td>\n",
        "      <td>2</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>4.32</td>\n",
-       "      <td>3.98</td>\n",
-       "      <td>4.01</td>\n",
-       "      <td>4.64</td>\n",
-       "      <td>7.23</td>\n",
-       "      <td>8.11</td>\n",
-       "      <td>926.92</td>\n",
+       "      <td>21.15</td>\n",
+       "      <td>21.13</td>\n",
+       "      <td>21.25</td>\n",
+       "      <td>21.44</td>\n",
+       "      <td>21.59</td>\n",
+       "      <td>22.07</td>\n",
+       "      <td>189.09</td>\n",
        "      <td>4</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>6.32</td>\n",
-       "      <td>5.94</td>\n",
-       "      <td>5.97</td>\n",
-       "      <td>7.61</td>\n",
-       "      <td>8.96</td>\n",
-       "      <td>10.12</td>\n",
-       "      <td>1266.63</td>\n",
+       "      <td>42.27</td>\n",
+       "      <td>42.26</td>\n",
+       "      <td>42.68</td>\n",
+       "      <td>42.95</td>\n",
+       "      <td>43.11</td>\n",
+       "      <td>45.11</td>\n",
+       "      <td>189.27</td>\n",
        "      <td>8</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>9.60</td>\n",
-       "      <td>9.22</td>\n",
-       "      <td>9.25</td>\n",
-       "      <td>11.32</td>\n",
-       "      <td>12.33</td>\n",
-       "      <td>13.34</td>\n",
-       "      <td>1666.05</td>\n",
+       "      <td>83.77</td>\n",
+       "      <td>83.84</td>\n",
+       "      <td>84.29</td>\n",
+       "      <td>84.94</td>\n",
+       "      <td>85.35</td>\n",
+       "      <td>86.34</td>\n",
+       "      <td>191.01</td>\n",
        "      <td>16</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>5</th>\n",
-       "      <td>16.17</td>\n",
-       "      <td>15.80</td>\n",
-       "      <td>15.90</td>\n",
-       "      <td>17.38</td>\n",
-       "      <td>18.80</td>\n",
-       "      <td>19.93</td>\n",
-       "      <td>1979.41</td>\n",
+       "      <td>168.40</td>\n",
+       "      <td>169.62</td>\n",
+       "      <td>170.78</td>\n",
+       "      <td>171.94</td>\n",
+       "      <td>172.82</td>\n",
+       "      <td>174.28</td>\n",
+       "      <td>190.02</td>\n",
        "      <td>32</td>\n",
        "    </tr>\n",
-       "    <tr>\n",
-       "      <th>6</th>\n",
-       "      <td>29.26</td>\n",
-       "      <td>28.89</td>\n",
-       "      <td>29.01</td>\n",
-       "      <td>30.63</td>\n",
-       "      <td>32.53</td>\n",
-       "      <td>33.28</td>\n",
-       "      <td>2187.15</td>\n",
-       "      <td>64</td>\n",
-       "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
        "   Latency(ms)  Latency_P50  Latency_P75  Latency_P90  Latency_P95  \\\n",
-       "0         3.00         2.79         2.81         2.86         4.37   \n",
-       "1         3.59         3.33         3.35         3.42         6.60   \n",
-       "2         4.32         3.98         4.01         4.64         7.23   \n",
-       "3         6.32         5.94         5.97         7.61         8.96   \n",
-       "4         9.60         9.22         9.25        11.32        12.33   \n",
-       "5        16.17        15.80        15.90        17.38        18.80   \n",
-       "6        29.26        28.89        29.01        30.63        32.53   \n",
+       "0         7.14         7.10         7.13         7.25         7.35   \n",
+       "1        11.27        11.23        11.28        11.53        11.57   \n",
+       "2        21.15        21.13        21.25        21.44        21.59   \n",
+       "3        42.27        42.26        42.68        42.95        43.11   \n",
+       "4        83.77        83.84        84.29        84.94        85.35   \n",
+       "5       168.40       169.62       170.78       171.94       172.82   \n",
        "\n",
        "   Latency_P99  Throughput(QPS)  batch_size  \n",
-       "0         7.08           333.83           1  \n",
-       "1         7.54           557.32           2  \n",
-       "2         8.11           926.92           4  \n",
-       "3        10.12          1266.63           8  \n",
-       "4        13.34          1666.05          16  \n",
-       "5        19.93          1979.41          32  \n",
-       "6        33.28          2187.15          64  "
+       "0        10.99           140.00           1  \n",
+       "1        12.05           177.41           2  \n",
+       "2        22.07           189.09           4  \n",
+       "3        45.11           189.27           8  \n",
+       "4        86.34           191.01          16  \n",
+       "5       174.28           190.02          32  "
       ]
      },
-     "execution_count": 26,
+     "execution_count": 24,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1481,10 +1303,10 @@
     "import glob     \n",
     "import pandas\n",
     "latest_result_file = max(glob.glob(\"./onnx/perf_results_*.txt\"), key=os.path.getmtime)\n",
-    "result_data = pandas.read_table(latest_result_file, converters={'OMP_NUM_THREADS': str, 'OMP_WAIT_POLICY':str})\n",
+    "result_data = pandas.read_table(latest_result_file)\n",
     "print(\"Float16 model summary from\", latest_result_file)\n",
-    "columns_to_remove = ['model', 'graph_optimization_level', 'test_cases', 'test_times', 'use_gpu', 'warmup', 'sequence_length']\n",
-    "columns_to_remove.extend(['intra_op_num_threads', 'OMP_NUM_THREADS', 'OMP_WAIT_POLICY', 'contiguous'])\n",
+    "columns_to_remove = ['model', 'graph_optimization_level', 'test_cases', 'test_times', 'use_gpu', 'sequence_length']\n",
+    "columns_to_remove.extend(['intra_op_num_threads'])\n",
     "result_data.drop(columns_to_remove, axis=1, inplace=True)\n",
     "result_data"
    ]
@@ -1506,7 +1328,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 25,
    "metadata": {
     "scrolled": true
    },
@@ -1517,42 +1339,126 @@
      "text": [
       "{\r\n",
       "  \"gpu\": {\r\n",
-      "    \"driver_version\": \"440.64.00\",\r\n",
+      "    \"driver_version\": \"450.51.05\",\r\n",
       "    \"devices\": [\r\n",
       "      {\r\n",
-      "        \"memory_total\": 16945512448,\r\n",
-      "        \"memory_available\": 14110883840,\r\n",
-      "        \"name\": \"Tesla V100-PCIE-16GB\"\r\n",
-      "      },\r\n",
-      "      {\r\n",
-      "        \"memory_total\": 16945512448,\r\n",
-      "        \"memory_available\": 16932601856,\r\n",
-      "        \"name\": \"Tesla V100-PCIE-16GB\"\r\n",
+      "        \"memory_total\": 15843721216,\r\n",
+      "        \"memory_available\": 9313189888,\r\n",
+      "        \"name\": \"Tesla T4\"\r\n",
       "      }\r\n",
       "    ]\r\n",
       "  },\r\n",
       "  \"cpu\": {\r\n",
-      "    \"brand\": \"Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz\",\r\n",
-      "    \"cores\": 12,\r\n",
-      "    \"logical_cores\": 12,\r\n",
-      "    \"hz\": \"2.5940 GHz\",\r\n",
-      "    \"l2_cache\": \"256 KB\",\r\n",
-      "    \"l3_cache\": \"35840 KB\",\r\n",
+      "    \"brand\": \"AMD EPYC 7V12 64-Core Processor\",\r\n",
+      "    \"cores\": 4,\r\n",
+      "    \"logical_cores\": 4,\r\n",
+      "    \"hz\": [\r\n",
+      "      2445417000,\r\n",
+      "      0\r\n",
+      "    ],\r\n",
+      "    \"l2_cache\": 524288,\r\n",
+      "    \"flags\": [\r\n",
+      "      \"3dnowext\",\r\n",
+      "      \"3dnowprefetch\",\r\n",
+      "      \"abm\",\r\n",
+      "      \"adx\",\r\n",
+      "      \"aes\",\r\n",
+      "      \"apic\",\r\n",
+      "      \"arat\",\r\n",
+      "      \"avx\",\r\n",
+      "      \"avx2\",\r\n",
+      "      \"bmi1\",\r\n",
+      "      \"bmi2\",\r\n",
+      "      \"clflush\",\r\n",
+      "      \"clflushopt\",\r\n",
+      "      \"clwb\",\r\n",
+      "      \"cmov\",\r\n",
+      "      \"cmp_legacy\",\r\n",
+      "      \"cpuid\",\r\n",
+      "      \"cr8_legacy\",\r\n",
+      "      \"cx16\",\r\n",
+      "      \"cx8\",\r\n",
+      "      \"de\",\r\n",
+      "      \"extd_apicid\",\r\n",
+      "      \"f16c\",\r\n",
+      "      \"fma\",\r\n",
+      "      \"fpu\",\r\n",
+      "      \"fsgsbase\",\r\n",
+      "      \"fxsr\",\r\n",
+      "      \"fxsr_opt\",\r\n",
+      "      \"ht\",\r\n",
+      "      \"hypervisor\",\r\n",
+      "      \"lahf_lm\",\r\n",
+      "      \"lm\",\r\n",
+      "      \"mca\",\r\n",
+      "      \"mce\",\r\n",
+      "      \"misalignsse\",\r\n",
+      "      \"mmx\",\r\n",
+      "      \"mmxext\",\r\n",
+      "      \"movbe\",\r\n",
+      "      \"msr\",\r\n",
+      "      \"mtrr\",\r\n",
+      "      \"nopl\",\r\n",
+      "      \"nx\",\r\n",
+      "      \"osvw\",\r\n",
+      "      \"osxsave\",\r\n",
+      "      \"pae\",\r\n",
+      "      \"pat\",\r\n",
+      "      \"pclmulqdq\",\r\n",
+      "      \"pdpe1gb\",\r\n",
+      "      \"pge\",\r\n",
+      "      \"pni\",\r\n",
+      "      \"popcnt\",\r\n",
+      "      \"pse\",\r\n",
+      "      \"pse36\",\r\n",
+      "      \"rdpid\",\r\n",
+      "      \"rdrand\",\r\n",
+      "      \"rdrnd\",\r\n",
+      "      \"rdseed\",\r\n",
+      "      \"rdtscp\",\r\n",
+      "      \"rep_good\",\r\n",
+      "      \"sep\",\r\n",
+      "      \"sha\",\r\n",
+      "      \"sha_ni\",\r\n",
+      "      \"smap\",\r\n",
+      "      \"smep\",\r\n",
+      "      \"ssbd\",\r\n",
+      "      \"sse\",\r\n",
+      "      \"sse2\",\r\n",
+      "      \"sse4_1\",\r\n",
+      "      \"sse4_2\",\r\n",
+      "      \"sse4a\",\r\n",
+      "      \"ssse3\",\r\n",
+      "      \"syscall\",\r\n",
+      "      \"topoext\",\r\n",
+      "      \"tsc\",\r\n",
+      "      \"umip\",\r\n",
+      "      \"vme\",\r\n",
+      "      \"vmmcall\",\r\n",
+      "      \"xgetbv1\",\r\n",
+      "      \"xsave\",\r\n",
+      "      \"xsavec\",\r\n",
+      "      \"xsaveerptr\",\r\n",
+      "      \"xsaveopt\",\r\n",
+      "      \"xsaves\"\r\n",
+      "    ],\r\n",
       "    \"processor\": \"x86_64\"\r\n",
       "  },\r\n",
       "  \"memory\": {\r\n",
-      "    \"total\": 236645588992,\r\n",
-      "    \"available\": 222567559168\r\n",
+      "    \"total\": 29450223616,\r\n",
+      "    \"available\": 22402334720\r\n",
       "  },\r\n",
-      "  \"python\": \"3.7.7.final.0 (64 bit)\",\r\n",
-      "  \"os\": \"Linux-4.15.0-1089-azure-x86_64-with-debian-stretch-sid\",\r\n",
+      "  \"python\": \"3.6.13.final.0 (64 bit)\",\r\n",
+      "  \"os\": \"Linux-5.4.0-1046-azure-x86_64-with-debian-buster-sid\",\r\n",
       "  \"onnxruntime\": {\r\n",
-      "    \"version\": \"1.3.0\",\r\n",
+      "    \"version\": \"1.8.1\",\r\n",
       "    \"support_gpu\": true\r\n",
       "  },\r\n",
+      "  \"onnxruntime_tools\": null,\r\n",
       "  \"pytorch\": {\r\n",
-      "    \"version\": \"1.5.0\",\r\n",
-      "    \"support_gpu\": true\r\n",
+      "    \"version\": \"1.9.0+cu111\",\r\n",
+      "    \"support_gpu\": true,\r\n",
+      "    \"cuda\": \"11.1\"\r\n",
       "  },\r\n",
       "  \"tensorflow\": null\r\n",
       "}\r\n"
@@ -1560,15 +1466,15 @@
     }
    ],
    "source": [
-    "!{sys.executable} -m onnxruntime_tools.transformers.machine_info --silent"
+    "!{sys.executable} -m onnxruntime.transformers.machine_info --silent"
    ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "gpu_env_py37",
+   "display_name": "gpu_env",
    "language": "python",
-   "name": "gpu_env_py37"
+   "name": "gpu_env"
   },
   "language_info": {
    "codemirror_mode": {
@@ -1580,7 +1486,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.7.7"
+   "version": "3.6.13"
   }
  },
  "nbformat": 4,