update bert notebook to use onnxruntime 1.8.1 (#8379)

2026-07-19 19:00:47 +00:00 · 2021-07-19 14:16:59 -07:00 · 2021-07-19 14:16:59 -07:00 · dfe42e185c
commit dfe42e185c
parent afce0e2543
2 changed files with 685 additions and 814 deletions
--- a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_CPU.ipynb
+++ b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_CPU.ipynb
@ -47,76 +47,23 @@
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Looking in links: https://download.pytorch.org/whl/torch_stable.html\n",
-      "Requirement already up-to-date: torch==1.6.0+cpu in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (1.6.0+cpu)\n",
-      "Requirement already up-to-date: torchvision==0.7.0+cpu in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (0.7.0+cpu)\n",
-      "Requirement already satisfied, skipping upgrade: numpy in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from torch==1.6.0+cpu) (1.18.1)\n",
-      "Requirement already satisfied, skipping upgrade: future in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from torch==1.6.0+cpu) (0.18.2)\n",
-      "Requirement already satisfied, skipping upgrade: pillow>=4.1.1 in c:\\users\\tianl\\appdata\\roaming\\python\\python36\\site-packages (from torchvision==0.7.0+cpu) (7.0.0)\n",
-      "Requirement already up-to-date: onnxruntime==1.4.0 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (1.4.0)\n",
-      "Requirement already satisfied, skipping upgrade: numpy>=1.16.6 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime==1.4.0) (1.18.1)\n",
-      "Requirement already satisfied, skipping upgrade: protobuf in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime==1.4.0) (3.11.3)\n",
-      "Requirement already satisfied, skipping upgrade: six>=1.9 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from protobuf->onnxruntime==1.4.0) (1.14.0)\n",
-      "Requirement already satisfied, skipping upgrade: setuptools in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from protobuf->onnxruntime==1.4.0) (45.2.0.post20200210)\n",
-      "Requirement already up-to-date: onnxruntime-tools in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (1.4.1)\n",
-      "Requirement already satisfied, skipping upgrade: py-cpuinfo in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (5.0.0)\n",
-      "Requirement already satisfied, skipping upgrade: packaging in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (20.1)\n",
-      "Requirement already satisfied, skipping upgrade: coloredlogs in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (14.0)\n",
-      "Requirement already satisfied, skipping upgrade: py3nvml in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (0.2.5)\n",
-      "Requirement already satisfied, skipping upgrade: numpy in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (1.18.1)\n",
-      "Requirement already satisfied, skipping upgrade: psutil in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (5.7.0)\n",
-      "Requirement already satisfied, skipping upgrade: onnx in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (1.7.0)\n",
-      "Requirement already satisfied, skipping upgrade: pyparsing>=2.0.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from packaging->onnxruntime-tools) (2.4.6)\n",
-      "Requirement already satisfied, skipping upgrade: six in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from packaging->onnxruntime-tools) (1.14.0)\n",
-      "Requirement already satisfied, skipping upgrade: humanfriendly>=7.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from coloredlogs->onnxruntime-tools) (8.1)\n",
-      "Requirement already satisfied, skipping upgrade: xmltodict in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from py3nvml->onnxruntime-tools) (0.12.0)\n",
-      "Requirement already satisfied, skipping upgrade: protobuf in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnx->onnxruntime-tools) (3.11.3)\n",
-      "Requirement already satisfied, skipping upgrade: typing-extensions>=3.6.2.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnx->onnxruntime-tools) (3.7.4.1)\n",
-      "Requirement already satisfied, skipping upgrade: pyreadline; sys_platform == \"win32\" in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from humanfriendly>=7.1->coloredlogs->onnxruntime-tools) (2.1)\n",
-      "Requirement already satisfied, skipping upgrade: setuptools in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from protobuf->onnx->onnxruntime-tools) (45.2.0.post20200210)\n",
-      "Requirement already satisfied: transformers==3.0.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (3.0.2)\n",
-      "Requirement already satisfied: packaging in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (20.1)\n",
-      "Requirement already satisfied: sentencepiece!=0.1.92 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (0.1.85)\n",
-      "Requirement already satisfied: sacremoses in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (0.0.38)\n",
-      "Requirement already satisfied: dataclasses; python_version < \"3.7\" in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (0.7)\n",
-      "Requirement already satisfied: requests in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (2.23.0)\n",
-      "Requirement already satisfied: tokenizers==0.8.1.rc1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (0.8.1rc1)\n",
-      "Requirement already satisfied: tqdm>=4.27 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (4.43.0)\n",
-      "Requirement already satisfied: regex!=2019.12.17 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (2020.2.20)\n",
-      "Requirement already satisfied: filelock in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (3.0.12)\n",
-      "Requirement already satisfied: numpy in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (1.18.1)\n",
-      "Requirement already satisfied: pyparsing>=2.0.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from packaging->transformers==3.0.2) (2.4.6)\n",
-      "Requirement already satisfied: six in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from packaging->transformers==3.0.2) (1.14.0)\n",
-      "Requirement already satisfied: click in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from sacremoses->transformers==3.0.2) (7.0)\n",
-      "Requirement already satisfied: joblib in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from sacremoses->transformers==3.0.2) (0.14.1)\n",
-      "Requirement already satisfied: certifi>=2017.4.17 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==3.0.2) (2020.4.5.1)\n",
-      "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==3.0.2) (1.25.8)\n",
-      "Requirement already satisfied: idna<3,>=2.5 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==3.0.2) (2.9)\n",
-      "Requirement already satisfied: chardet<4,>=3.0.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==3.0.2) (3.0.4)\n",
-      "Requirement already satisfied: wget in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (3.2)\n",
-      "Requirement already satisfied: netron in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (3.9.6)\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
-    "# Install PyTorch 1.6.0 and OnnxRuntime 1.4.0 for CPU-only.\n",
    "import sys\n",
-    "if sys.platform == 'darwin': # Mac\n",
-    "    !{sys.executable} -m pip install --upgrade torch torchvision\n",
-    "else:\n",
-    "    !{sys.executable} -m pip install --upgrade torch==1.6.0+cpu torchvision==0.7.0+cpu -f https://download.pytorch.org/whl/torch_stable.html\n",
-    "!{sys.executable} -m pip install --upgrade onnxruntime==1.4.0\n",
-    "!{sys.executable} -m pip install --upgrade onnxconverter_common\n",
-    "!{sys.executable} -m pip install --upgrade onnxruntime-tools\n",
    "\n",
-    "# Install other packages used in this notebook.\n",
-    "!{sys.executable} -m pip install transformers==3.0.2\n",
-    "!{sys.executable} -m pip install wget netron"
+    "run_install = False # Set to True on the first run; packages only need to be installed once\n",
+    "if run_install:\n",
+    "    if sys.platform in ['linux', 'win32']: # Linux or Windows: pinned CPU-only (+cpu) wheels from the PyTorch wheel index\n",
+    "        !{sys.executable} -m pip install torch==1.9.0+cpu torchvision==0.10.0+cpu torchaudio==0.9.0 -f https://download.pytorch.org/whl/torch_stable.html\n",
+    "    else: # Mac: +cpu wheels are not published for macOS, so install the same versions from PyPI\n",
+    "        !{sys.executable} -m pip install torch==1.9.0 torchvision==0.10.0 torchaudio==0.9.0\n",
+    "\n",
+    "    !{sys.executable} -m pip install onnxruntime==1.8.1 onnx==1.9.0 onnxconverter_common==1.8.1\n",
+    "\n",
+    "    # Install other packages used in this notebook.\n",
+    "    !{sys.executable} -m pip install transformers==4.8.2\n",
+    "    !{sys.executable} -m pip install psutil pytz pandas py-cpuinfo py3nvml\n",
+    "    !{sys.executable} -m pip install wget netron"
   ]
  },
  {
@ -196,14 +143,14 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']\n",
-      "- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).\n",
+      "Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForQuestionAnswering: ['cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']\n",
+      "- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
      "Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
-      "100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [00:03<00:00, 12.87it/s]\n",
-      "convert squad examples to features: 100%|███████████████████████████████████████████| 100/100 [00:00<00:00, 131.41it/s]\n",
-      "add example index and unique id: 100%|████████████████████████████████████████████| 100/100 [00:00<00:00, 96776.74it/s]\n"
+      "100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 48/48 [00:03<00:00, 12.15it/s]\n",
+      "convert squad examples to features: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 135.87it/s]\n",
+      "add example index and unique id: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 100031.10it/s]\n"
     ]
    }
   ],
@ -252,6 +199,14 @@
   "execution_count": 5,
   "metadata": {},
   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "d:\\git\\transformers\\src\\transformers\\modeling_utils.py:2074: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
+      "  input_tensor.shape[chunk_dim] == tensor_shape for input_tensor in input_tensors\n"
+     ]
+    },
    {
     "name": "stdout",
     "output_type": "stream",
@ -319,7 +274,7 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "PyTorch cpu Inference time = 144.73 ms\n"
+      "PyTorch cpu Inference time = 119.80 ms\n"
     ]
    }
   ],
@ -348,45 +303,26 @@
   "source": [
    "## 4. Inference ONNX Model with ONNX Runtime ##\n",
    "\n",
-    "### OpenMP Environment Variable\n",
-    "\n",
-    "OpenMP environment variables are very important for CPU inference of Bert model. It has large performance impact on Bert model so you might need set it carefully according to [Performance Test Tool](#Performance-Test-Tool) result in later part of this notebook.\n",
-    "\n",
-    "Setting environment variables shall be done before importing onnxruntime. Otherwise, they might not take effect."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import psutil\n",
-    "\n",
-    "# You may change the settings in this cell according to Performance Test Tool result.\n",
-    "os.environ[\"OMP_NUM_THREADS\"] = str(psutil.cpu_count(logical=True))\n",
-    "os.environ[\"OMP_WAIT_POLICY\"] = 'ACTIVE'"
+    "For ONNX Runtime 1.6.0 or older, OpenMP environment variables are very important for CPU inference of the BERT model. Since 1.7.0, the official package is no longer built with OpenMP."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "Now we are ready to inference the model with ONNX Runtime. Here we can see that OnnxRuntime has better performance than PyTorch. \n",
-    "\n",
-    "It is better to use standalone python script like [Performance Test tool](#Performance-Test-tool) to get accurate performance results."
+    "Now we run inference on the model with ONNX Runtime. Here we can see that ONNX Runtime has better performance than PyTorch."
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "OnnxRuntime cpu Inference time = 88.55 ms\n"
+      "OnnxRuntime cpu Inference time = 72.46 ms\n"
     ]
    }
   ],
@ -394,19 +330,15 @@
    "import onnxruntime\n",
    "import numpy\n",
    "\n",
-    "# Print warning if user uses onnxruntime-gpu instead of onnxruntime package.\n",
-    "if 'CUDAExecutionProvider' in onnxruntime.get_available_providers():\n",
-    "    print(\"warning: onnxruntime-gpu is not built with OpenMP. You might try onnxruntime package to test CPU inference.\")\n",
-    "\n",
    "sess_options = onnxruntime.SessionOptions()\n",
    "\n",
    "# Optional: store the optimized graph and view it using Netron to verify that model is fully optimized.\n",
    "# Note that this will increase session creation time, so it is for debugging only.\n",
    "sess_options.optimized_model_filepath = os.path.join(output_dir, \"optimized_model_cpu.onnx\")\n",
    "\n",
-    "# For OnnxRuntime 1.2.0, you might need set intra_op_num_threads to 1 to enable OpenMP\n",
-    "#    sess_options.intra_op_num_threads=1\n",
-    "# For OnnxRuntime 1.3.0 or later, it is recommended to use the default setting so you need not set it.\n",
+    "# For OnnxRuntime 1.7.0 or later, you can set the number of threads with intra_op_num_threads, for example:\n",
+    "#    sess_options.intra_op_num_threads=4\n",
+    "# Here we use the default value, which is a good choice in most cases.\n",
    "\n",
    "# Specify providers when you use onnxruntime-gpu for CPU inference.\n",
    "session = onnxruntime.InferenceSession(export_model_path, sess_options, providers=['CPUExecutionProvider'])\n",
@ -427,7 +359,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
@ -474,17 +406,17 @@
    "\n",
    "Example Usage:\n",
    "```\n",
-    "from onnxruntime_tools import optimizer\n",
+    "from onnxruntime.transformers import optimizer\n",
    "optimized_model = optimizer.optimize_model(export_model_path, model_type='bert', num_heads=12, hidden_size=768)\n",
    "optimized_model.save_model_to_file(optimized_model_path)\n",
    "```\n",
    "\n",
-    "You can also use optimizer_cli like the following:"
+    "You can also use the command line, as follows:"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
@ -493,15 +425,17 @@
     "text": [
      "               apply: Fused LayerNormalization count: 25\n",
      "               apply: Fused Gelu count: 12\n",
-      "               apply: Fused SkipLayerNormalization count: 25\n",
+      "adjust_reshape_and_expand: Removed Reshape and Expand count: 0\n",
+      "               apply: Fused SkipLayerNormalization count: 24\n",
      "               apply: Fused Attention count: 12\n",
      "         prune_graph: Graph pruned: 0 inputs, 0 outputs and 5 nodes are removed\n",
      "               apply: Fused EmbedLayerNormalization(with mask) count: 1\n",
-      "         prune_graph: Graph pruned: 0 inputs, 0 outputs and 12 nodes are removed\n",
+      "         prune_graph: Graph pruned: 0 inputs, 0 outputs and 3 nodes are removed\n",
      "         prune_graph: Graph pruned: 0 inputs, 0 outputs and 0 nodes are removed\n",
      "               apply: Fused BiasGelu count: 12\n",
      "               apply: Fused SkipLayerNormalization(add bias) count: 24\n",
      "            optimize: opset verion: 11\n",
+      "  save_model_to_file: Sort graphs in topological order\n",
      "  save_model_to_file: Output model to ..\\onnx_models\\bert-base-cased-squad_opt_cpu.onnx\n",
      "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 12, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 12, 'LayerNormalization': 0, 'SkipLayerNormalization': 24}\n",
      "                main: The model has been fully optimized.\n"
@ -511,7 +445,7 @@
   "source": [
    "optimized_model_path = os.path.join(output_dir, 'bert-base-cased-squad_opt_cpu.onnx')\n",
    "\n",
-    "!{sys.executable} -m onnxruntime_tools.optimizer_cli --input $export_model_path --output $optimized_model_path --model_type bert --num_heads 12 --hidden_size 768"
+    "!{sys.executable} -m onnxruntime.transformers.optimizer --input $export_model_path --output $optimized_model_path --model_type bert --num_heads 12 --hidden_size 768"
   ]
  },
  {
@ -527,7 +461,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
@ -561,7 +495,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
@ -569,13 +503,13 @@
     "output_type": "stream",
     "text": [
      "100% passed for 100 random inputs given thresholds (rtol=0.001, atol=0.0001).\n",
-      "maximum absolute difference=5.930662155151367e-06\n",
-      "maximum relative difference=0.021568937227129936\n"
+      "maximum absolute difference=4.604458808898926e-06\n",
+      "maximum relative difference=0.006278202868998051\n"
     ]
    }
   ],
   "source": [
-    "!{sys.executable} -m onnxruntime_tools.transformers.compare_bert_results --baseline_model $export_model_path --optimized_model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100"
+    "!{sys.executable} -m onnxruntime.transformers.compare_bert_results --baseline_model $export_model_path --optimized_model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100"
   ]
  },
  {
@ -591,45 +525,45 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=0,OMP_NUM_THREADS=,OMP_WAIT_POLICY=,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
-      "Average latency = 80.08 ms, Throughput = 12.49 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=,OMP_WAIT_POLICY=,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
-      "Average latency = 78.56 ms, Throughput = 12.73 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
-      "Average latency = 77.78 ms, Throughput = 12.86 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
-      "Average latency = 73.52 ms, Throughput = 13.60 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=6,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
-      "Average latency = 95.36 ms, Throughput = 10.49 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=6,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
-      "Average latency = 78.49 ms, Throughput = 12.74 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
-      "Average latency = 288.71 ms, Throughput = 3.46 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
-      "Average latency = 288.61 ms, Throughput = 3.46 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=6,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
-      "Average latency = 288.97 ms, Throughput = 3.46 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=6,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
-      "Average latency = 288.37 ms, Throughput = 3.47 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=6,OMP_NUM_THREADS=6,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
-      "Average latency = 95.49 ms, Throughput = 10.47 QPS\n",
-      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=6,OMP_NUM_THREADS=6,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=None,use_gpu=False,warmup=True\n",
-      "Average latency = 79.17 ms, Throughput = 12.63 QPS\n",
-      "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=100, test_times=1, contiguous=None, use_gpu=False, warmup=True, omp_num_threads=None, omp_wait_policy=None, intra_op_num_threads=None, seed=3, verbose=False, inclusive=False, extra_latency=True)\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=12,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+      "Average latency = 54.26 ms, Throughput = 18.43 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=11,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+      "Average latency = 55.80 ms, Throughput = 17.92 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=10,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+      "Average latency = 65.31 ms, Throughput = 15.31 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=9,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+      "Average latency = 57.66 ms, Throughput = 17.34 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=8,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+      "Average latency = 62.84 ms, Throughput = 15.91 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=7,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+      "Average latency = 69.29 ms, Throughput = 14.43 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=6,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+      "Average latency = 56.19 ms, Throughput = 17.80 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=5,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+      "Average latency = 59.90 ms, Throughput = 16.70 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=4,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+      "Average latency = 63.72 ms, Throughput = 15.69 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=3,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+      "Average latency = 82.44 ms, Throughput = 12.13 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=2,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+      "Average latency = 119.64 ms, Throughput = 8.36 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,batch_size=1,sequence_length=128,test_cases=100,test_times=1,use_gpu=False\n",
+      "Average latency = 223.21 ms, Throughput = 4.48 QPS\n",
+      "test setting TestSetting(batch_size=1, sequence_length=128, test_cases=100, test_times=1, use_gpu=False, intra_op_num_threads=None, seed=3, verbose=False)\n",
      "Generating 100 samples for batch_size=1 sequence_length=128\n",
-      "Test summary is saved to ..\\onnx_models\\perf_results_CPU_B1_S128_20200806-173045.txt\n"
+      "Test summary is saved to ..\\onnx_models\\perf_results_CPU_B1_S128_20210713-144140.txt\n"
     ]
    }
   ],
   "source": [
-    "!{sys.executable} -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100 --test_times 1 --inclusive --all"
+    "!{sys.executable} -m onnxruntime.transformers.bert_perf_test --model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100 --test_times 1"
   ]
  },
  {
@ -641,14 +575,14 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "..\\onnx_models\\perf_results_CPU_B1_S128_20200806-173045.txt\n"
+      "..\\onnx_models\\perf_results_CPU_B1_S128_20210713-144140.txt\n"
     ]
    },
    {
@ -678,155 +612,116 @@
       "      <th>Latency_P99</th>\n",
       "      <th>Throughput(QPS)</th>\n",
       "      <th>intra_op_num_threads</th>\n",
-       "      <th>OMP_NUM_THREADS</th>\n",
-       "      <th>OMP_WAIT_POLICY</th>\n",
-       "      <th>contiguous</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
-       "      <td>73.52</td>\n",
-       "      <td>75.78</td>\n",
-       "      <td>78.21</td>\n",
-       "      <td>89.29</td>\n",
-       "      <td>13.60</td>\n",
-       "      <td>1</td>\n",
+       "      <td>54.26</td>\n",
+       "      <td>56.05</td>\n",
+       "      <td>60.32</td>\n",
+       "      <td>109.21</td>\n",
+       "      <td>18.43</td>\n",
       "      <td>12</td>\n",
-       "      <td>PASSIVE</td>\n",
-       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
-       "      <td>77.78</td>\n",
-       "      <td>82.35</td>\n",
-       "      <td>87.02</td>\n",
-       "      <td>104.54</td>\n",
-       "      <td>12.86</td>\n",
-       "      <td>1</td>\n",
-       "      <td>12</td>\n",
-       "      <td>ACTIVE</td>\n",
-       "      <td>None</td>\n",
+       "      <td>55.80</td>\n",
+       "      <td>56.74</td>\n",
+       "      <td>59.67</td>\n",
+       "      <td>73.62</td>\n",
+       "      <td>17.92</td>\n",
+       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
-       "      <td>78.49</td>\n",
-       "      <td>80.92</td>\n",
-       "      <td>85.77</td>\n",
-       "      <td>98.98</td>\n",
-       "      <td>12.74</td>\n",
-       "      <td>1</td>\n",
+       "      <td>56.19</td>\n",
+       "      <td>61.29</td>\n",
+       "      <td>71.69</td>\n",
+       "      <td>80.15</td>\n",
+       "      <td>17.80</td>\n",
       "      <td>6</td>\n",
-       "      <td>PASSIVE</td>\n",
-       "      <td>None</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
-       "      <td>78.56</td>\n",
-       "      <td>82.29</td>\n",
-       "      <td>93.46</td>\n",
-       "      <td>108.73</td>\n",
-       "      <td>12.73</td>\n",
-       "      <td>1</td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "      <td>None</td>\n",
+       "      <td>57.66</td>\n",
+       "      <td>58.50</td>\n",
+       "      <td>61.96</td>\n",
+       "      <td>65.12</td>\n",
+       "      <td>17.34</td>\n",
+       "      <td>9</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
-       "      <td>79.17</td>\n",
-       "      <td>82.02</td>\n",
-       "      <td>87.60</td>\n",
-       "      <td>99.55</td>\n",
-       "      <td>12.63</td>\n",
-       "      <td>6</td>\n",
-       "      <td>6</td>\n",
-       "      <td>PASSIVE</td>\n",
-       "      <td>None</td>\n",
+       "      <td>59.90</td>\n",
+       "      <td>59.72</td>\n",
+       "      <td>65.16</td>\n",
+       "      <td>116.16</td>\n",
+       "      <td>16.70</td>\n",
+       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
-       "      <td>80.08</td>\n",
-       "      <td>83.18</td>\n",
-       "      <td>95.60</td>\n",
-       "      <td>107.72</td>\n",
-       "      <td>12.49</td>\n",
-       "      <td>0</td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "      <td>None</td>\n",
+       "      <td>62.84</td>\n",
+       "      <td>67.05</td>\n",
+       "      <td>69.07</td>\n",
+       "      <td>75.99</td>\n",
+       "      <td>15.91</td>\n",
+       "      <td>8</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
-       "      <td>95.36</td>\n",
-       "      <td>101.25</td>\n",
-       "      <td>103.61</td>\n",
-       "      <td>105.15</td>\n",
-       "      <td>10.49</td>\n",
-       "      <td>1</td>\n",
-       "      <td>6</td>\n",
-       "      <td>ACTIVE</td>\n",
-       "      <td>None</td>\n",
+       "      <td>63.72</td>\n",
+       "      <td>64.17</td>\n",
+       "      <td>69.44</td>\n",
+       "      <td>73.10</td>\n",
+       "      <td>15.69</td>\n",
+       "      <td>4</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
-       "      <td>95.49</td>\n",
-       "      <td>101.50</td>\n",
-       "      <td>102.66</td>\n",
-       "      <td>104.82</td>\n",
-       "      <td>10.47</td>\n",
-       "      <td>6</td>\n",
-       "      <td>6</td>\n",
-       "      <td>ACTIVE</td>\n",
-       "      <td>None</td>\n",
+       "      <td>65.31</td>\n",
+       "      <td>65.35</td>\n",
+       "      <td>80.70</td>\n",
+       "      <td>177.94</td>\n",
+       "      <td>15.31</td>\n",
+       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
-       "      <td>288.37</td>\n",
-       "      <td>290.48</td>\n",
-       "      <td>295.37</td>\n",
-       "      <td>308.91</td>\n",
-       "      <td>3.47</td>\n",
-       "      <td>6</td>\n",
-       "      <td>1</td>\n",
-       "      <td>PASSIVE</td>\n",
-       "      <td>None</td>\n",
+       "      <td>69.29</td>\n",
+       "      <td>69.04</td>\n",
+       "      <td>70.68</td>\n",
+       "      <td>85.03</td>\n",
+       "      <td>14.43</td>\n",
+       "      <td>7</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
-       "      <td>288.61</td>\n",
-       "      <td>291.10</td>\n",
-       "      <td>295.78</td>\n",
-       "      <td>301.52</td>\n",
-       "      <td>3.46</td>\n",
-       "      <td>12</td>\n",
-       "      <td>1</td>\n",
-       "      <td>PASSIVE</td>\n",
-       "      <td>None</td>\n",
+       "      <td>82.44</td>\n",
+       "      <td>83.20</td>\n",
+       "      <td>89.64</td>\n",
+       "      <td>98.80</td>\n",
+       "      <td>12.13</td>\n",
+       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
-       "      <td>288.71</td>\n",
-       "      <td>292.64</td>\n",
-       "      <td>298.28</td>\n",
-       "      <td>305.92</td>\n",
-       "      <td>3.46</td>\n",
-       "      <td>12</td>\n",
-       "      <td>1</td>\n",
-       "      <td>ACTIVE</td>\n",
-       "      <td>None</td>\n",
+       "      <td>119.64</td>\n",
+       "      <td>119.07</td>\n",
+       "      <td>122.62</td>\n",
+       "      <td>135.67</td>\n",
+       "      <td>8.36</td>\n",
+       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
-       "      <td>288.97</td>\n",
-       "      <td>291.18</td>\n",
-       "      <td>297.68</td>\n",
-       "      <td>309.30</td>\n",
-       "      <td>3.46</td>\n",
-       "      <td>6</td>\n",
+       "      <td>223.21</td>\n",
+       "      <td>223.22</td>\n",
+       "      <td>226.83</td>\n",
+       "      <td>249.08</td>\n",
+       "      <td>4.48</td>\n",
       "      <td>1</td>\n",
-       "      <td>ACTIVE</td>\n",
-       "      <td>None</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
@ -834,35 +729,35 @@
      ],
      "text/plain": [
       "    Latency(ms)  Latency_P75  Latency_P90  Latency_P99  Throughput(QPS)  \\\n",
-       "0         73.52        75.78        78.21        89.29            13.60   \n",
-       "1         77.78        82.35        87.02       104.54            12.86   \n",
-       "2         78.49        80.92        85.77        98.98            12.74   \n",
-       "3         78.56        82.29        93.46       108.73            12.73   \n",
-       "4         79.17        82.02        87.60        99.55            12.63   \n",
-       "5         80.08        83.18        95.60       107.72            12.49   \n",
-       "6         95.36       101.25       103.61       105.15            10.49   \n",
-       "7         95.49       101.50       102.66       104.82            10.47   \n",
-       "8        288.37       290.48       295.37       308.91             3.47   \n",
-       "9        288.61       291.10       295.78       301.52             3.46   \n",
-       "10       288.71       292.64       298.28       305.92             3.46   \n",
-       "11       288.97       291.18       297.68       309.30             3.46   \n",
+       "0         54.26        56.05        60.32       109.21            18.43   \n",
+       "1         55.80        56.74        59.67        73.62            17.92   \n",
+       "2         56.19        61.29        71.69        80.15            17.80   \n",
+       "3         57.66        58.50        61.96        65.12            17.34   \n",
+       "4         59.90        59.72        65.16       116.16            16.70   \n",
+       "5         62.84        67.05        69.07        75.99            15.91   \n",
+       "6         63.72        64.17        69.44        73.10            15.69   \n",
+       "7         65.31        65.35        80.70       177.94            15.31   \n",
+       "8         69.29        69.04        70.68        85.03            14.43   \n",
+       "9         82.44        83.20        89.64        98.80            12.13   \n",
+       "10       119.64       119.07       122.62       135.67             8.36   \n",
+       "11       223.21       223.22       226.83       249.08             4.48   \n",
       "\n",
-       "    intra_op_num_threads OMP_NUM_THREADS OMP_WAIT_POLICY contiguous  \n",
-       "0                      1              12         PASSIVE       None  \n",
-       "1                      1              12          ACTIVE       None  \n",
-       "2                      1               6         PASSIVE       None  \n",
-       "3                      1                                       None  \n",
-       "4                      6               6         PASSIVE       None  \n",
-       "5                      0                                       None  \n",
-       "6                      1               6          ACTIVE       None  \n",
-       "7                      6               6          ACTIVE       None  \n",
-       "8                      6               1         PASSIVE       None  \n",
-       "9                     12               1         PASSIVE       None  \n",
-       "10                    12               1          ACTIVE       None  \n",
-       "11                     6               1          ACTIVE       None  "
+       "    intra_op_num_threads  \n",
+       "0                     12  \n",
+       "1                     11  \n",
+       "2                      6  \n",
+       "3                      9  \n",
+       "4                      5  \n",
+       "5                      8  \n",
+       "6                      4  \n",
+       "7                     10  \n",
+       "8                      7  \n",
+       "9                      3  \n",
+       "10                     2  \n",
+       "11                     1  "
      ]
     },
-     "execution_count": 14,
+     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
@ -876,7 +771,7 @@
    "print(latest_result_file)\n",
    "\n",
    "# Remove some columns that have same values for all rows.\n",
-    "columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu', 'warmup']\n",
+    "columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu']\n",
    "# Hide some latency percentile columns to fit screen width.\n",
    "columns_to_remove.extend(['Latency_P50', 'Latency_P95'])\n",
    "result_data.drop(columns_to_remove, axis=1, inplace=True)\n",
@ -901,7 +796,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
@ -910,12 +805,12 @@
     "text": [
      "{\n",
      "  \"gpu\": {\n",
-      "    \"driver_version\": \"442.23\",\n",
+      "    \"driver_version\": \"470.14\",\n",
      "    \"devices\": [\n",
      "      {\n",
      "        \"memory_total\": 8589934592,\n",
-      "        \"memory_available\": 6997721088,\n",
-      "        \"name\": \"GeForce GTX 1070\"\n",
+      "        \"memory_available\": 6782619648,\n",
+      "        \"name\": \"NVIDIA GeForce GTX 1070\"\n",
      "      }\n",
      "    ]\n",
      "  },\n",
@ -925,22 +820,98 @@
      "    \"logical_cores\": 12,\n",
      "    \"hz\": \"3.1920 GHz\",\n",
      "    \"l2_cache\": \"1536 KB\",\n",
-      "    \"l3_cache\": \"12288 KB\",\n",
+      "    \"flags\": [\n",
+      "      \"3dnow\",\n",
+      "      \"3dnowprefetch\",\n",
+      "      \"abm\",\n",
+      "      \"acpi\",\n",
+      "      \"adx\",\n",
+      "      \"aes\",\n",
+      "      \"apic\",\n",
+      "      \"avx\",\n",
+      "      \"avx2\",\n",
+      "      \"bmi1\",\n",
+      "      \"bmi2\",\n",
+      "      \"clflush\",\n",
+      "      \"clflushopt\",\n",
+      "      \"cmov\",\n",
+      "      \"cx16\",\n",
+      "      \"cx8\",\n",
+      "      \"de\",\n",
+      "      \"dtes64\",\n",
+      "      \"dts\",\n",
+      "      \"erms\",\n",
+      "      \"est\",\n",
+      "      \"f16c\",\n",
+      "      \"fma\",\n",
+      "      \"fpu\",\n",
+      "      \"fxsr\",\n",
+      "      \"hle\",\n",
+      "      \"ht\",\n",
+      "      \"hypervisor\",\n",
+      "      \"ia64\",\n",
+      "      \"invpcid\",\n",
+      "      \"lahf_lm\",\n",
+      "      \"mca\",\n",
+      "      \"mce\",\n",
+      "      \"mmx\",\n",
+      "      \"movbe\",\n",
+      "      \"mpx\",\n",
+      "      \"msr\",\n",
+      "      \"mtrr\",\n",
+      "      \"osxsave\",\n",
+      "      \"pae\",\n",
+      "      \"pat\",\n",
+      "      \"pbe\",\n",
+      "      \"pcid\",\n",
+      "      \"pclmulqdq\",\n",
+      "      \"pdcm\",\n",
+      "      \"pge\",\n",
+      "      \"pni\",\n",
+      "      \"popcnt\",\n",
+      "      \"pse\",\n",
+      "      \"pse36\",\n",
+      "      \"rdrnd\",\n",
+      "      \"rdseed\",\n",
+      "      \"rtm\",\n",
+      "      \"sep\",\n",
+      "      \"serial\",\n",
+      "      \"sgx\",\n",
+      "      \"sgx_lc\",\n",
+      "      \"smap\",\n",
+      "      \"smep\",\n",
+      "      \"ss\",\n",
+      "      \"sse\",\n",
+      "      \"sse2\",\n",
+      "      \"sse4_1\",\n",
+      "      \"sse4_2\",\n",
+      "      \"ssse3\",\n",
+      "      \"tm\",\n",
+      "      \"tm2\",\n",
+      "      \"tsc\",\n",
+      "      \"tscdeadline\",\n",
+      "      \"vme\",\n",
+      "      \"x2apic\",\n",
+      "      \"xsave\",\n",
+      "      \"xtpr\"\n",
+      "    ],\n",
      "    \"processor\": \"Intel64 Family 6 Model 158 Stepping 10, GenuineIntel\"\n",
      "  },\n",
      "  \"memory\": {\n",
-      "    \"total\": 16971276288,\n",
-      "    \"available\": 4723568640\n",
+      "    \"total\": 16977195008,\n",
+      "    \"available\": 6085459968\n",
      "  },\n",
      "  \"python\": \"3.6.10.final.0 (64 bit)\",\n",
-      "  \"os\": \"Windows-10-10.0.19041-SP0\",\n",
+      "  \"os\": \"Windows-10-10.0.21390-SP0\",\n",
      "  \"onnxruntime\": {\n",
-      "    \"version\": \"1.4.0\",\n",
+      "    \"version\": \"1.8.1\",\n",
      "    \"support_gpu\": false\n",
      "  },\n",
+      "  \"onnxruntime_tools\": null,\n",
      "  \"pytorch\": {\n",
-      "    \"version\": \"1.6.0+cpu\",\n",
-      "    \"support_gpu\": false\n",
+      "    \"version\": \"1.9.0+cpu\",\n",
+      "    \"support_gpu\": false,\n",
+      "    \"cuda\": null\n",
      "  },\n",
      "  \"tensorflow\": {\n",
      "    \"version\": \"2.3.0\",\n",
@ -954,20 +925,14 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "2020-08-06 17:30:50.400838: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library cudart64_101.dll\n"
+      "2021-07-13 14:41:45.376756: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'cudart64_101.dll'; dlerror: cudart64_101.dll not found\n",
+      "2021-07-13 14:41:45.376780: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\n"
     ]
    }
   ],
   "source": [
-    "!{sys.executable} -m onnxruntime_tools.transformers.machine_info --silent"
+    "!{sys.executable} -m onnxruntime.transformers.machine_info --silent"
   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
  }
 ],
 "metadata": {
--- a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb
+++ b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb