Update gpt2 notebook for int8 quantization (#5346)

* Update gpt2 notebook for ORT 1.5 * add sections for int8 quantization including QAT note
2026-07-17 18:40:28 +00:00 · 2020-10-02 09:41:52 -07:00 · 2020-10-02 09:41:52 -07:00 · e33de20861
commit e33de20861
parent ce49cfa67c
1 changed files with 270 additions and 163 deletions
--- a/onnxruntime/python/tools/transformers/notebooks/Inference_GPT2_with_OnnxRuntime_on_CPU.ipynb
+++ b/onnxruntime/python/tools/transformers/notebooks/Inference_GPT2_with_OnnxRuntime_on_CPU.ipynb
@ -51,46 +51,30 @@
      "Requirement already satisfied, skipping upgrade: future in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from torch==1.6.0+cpu) (0.18.2)\n",
      "Requirement already satisfied, skipping upgrade: numpy in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from torch==1.6.0+cpu) (1.18.1)\n",
      "Requirement already satisfied, skipping upgrade: pillow>=4.1.1 in c:\\users\\tianl\\appdata\\roaming\\python\\python36\\site-packages (from torchvision==0.7.0+cpu) (7.0.0)\n",
-      "Requirement already up-to-date: onnxruntime==1.4.0 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (1.4.0)\n",
-      "Requirement already satisfied, skipping upgrade: numpy>=1.16.6 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime==1.4.0) (1.18.1)\n",
-      "Requirement already satisfied, skipping upgrade: protobuf in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime==1.4.0) (3.11.3)\n",
-      "Requirement already satisfied, skipping upgrade: setuptools in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from protobuf->onnxruntime==1.4.0) (45.2.0.post20200210)\n",
-      "Requirement already satisfied, skipping upgrade: six>=1.9 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from protobuf->onnxruntime==1.4.0) (1.14.0)\n",
-      "Requirement already up-to-date: onnxruntime-tools in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (1.4.2)\n",
-      "Requirement already satisfied, skipping upgrade: py-cpuinfo in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (5.0.0)\n",
-      "Requirement already satisfied, skipping upgrade: numpy in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (1.18.1)\n",
-      "Requirement already satisfied, skipping upgrade: packaging in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (20.1)\n",
-      "Requirement already satisfied, skipping upgrade: onnx in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (1.7.0)\n",
-      "Requirement already satisfied, skipping upgrade: coloredlogs in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (14.0)\n",
-      "Requirement already satisfied, skipping upgrade: py3nvml in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (0.2.5)\n",
-      "Requirement already satisfied, skipping upgrade: psutil in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (5.7.0)\n",
-      "Requirement already satisfied, skipping upgrade: pyparsing>=2.0.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from packaging->onnxruntime-tools) (2.4.6)\n",
-      "Requirement already satisfied, skipping upgrade: six in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from packaging->onnxruntime-tools) (1.14.0)\n",
-      "Requirement already satisfied, skipping upgrade: protobuf in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnx->onnxruntime-tools) (3.11.3)\n",
-      "Requirement already satisfied, skipping upgrade: typing-extensions>=3.6.2.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnx->onnxruntime-tools) (3.7.4.1)\n",
-      "Requirement already satisfied, skipping upgrade: humanfriendly>=7.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from coloredlogs->onnxruntime-tools) (8.1)\n",
-      "Requirement already satisfied, skipping upgrade: xmltodict in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from py3nvml->onnxruntime-tools) (0.12.0)\n",
-      "Requirement already satisfied, skipping upgrade: setuptools in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from protobuf->onnx->onnxruntime-tools) (45.2.0.post20200210)\n",
-      "Requirement already satisfied, skipping upgrade: pyreadline; sys_platform == \"win32\" in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from humanfriendly>=7.1->coloredlogs->onnxruntime-tools) (2.1)\n",
-      "Requirement already satisfied: transformers==3.0.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (3.0.2)\n",
-      "Requirement already satisfied: sentencepiece!=0.1.92 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (0.1.85)\n",
-      "Requirement already satisfied: packaging in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (20.1)\n",
-      "Requirement already satisfied: sacremoses in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (0.0.38)\n",
-      "Requirement already satisfied: tokenizers==0.8.1.rc1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (0.8.1rc1)\n",
-      "Requirement already satisfied: tqdm>=4.27 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (4.43.0)\n",
-      "Requirement already satisfied: requests in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (2.23.0)\n",
+      "Requirement already satisfied: onnxruntime==1.5.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (1.5.1)\n",
+      "Requirement already satisfied: numpy>=1.16.6 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime==1.5.1) (1.18.1)\n",
+      "Requirement already satisfied: protobuf in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime==1.5.1) (3.11.3)\n",
+      "Requirement already satisfied: six>=1.9 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from protobuf->onnxruntime==1.5.1) (1.14.0)\n",
+      "Requirement already satisfied: setuptools in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from protobuf->onnxruntime==1.5.1) (45.2.0.post20200210)\n",
+      "Requirement already satisfied: transformers==3.0.2 in d:\\git\\transformers\\src (3.0.2)\n",
      "Requirement already satisfied: numpy in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (1.18.1)\n",
-      "Requirement already satisfied: regex!=2019.12.17 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (2020.2.20)\n",
-      "Requirement already satisfied: dataclasses; python_version < \"3.7\" in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (0.7)\n",
+      "Requirement already satisfied: tokenizers==0.8.1.rc2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (0.8.1rc2)\n",
+      "Requirement already satisfied: packaging in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (20.1)\n",
      "Requirement already satisfied: filelock in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (3.0.12)\n",
-      "Requirement already satisfied: six in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from packaging->transformers==3.0.2) (1.14.0)\n",
+      "Requirement already satisfied: requests in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (2.23.0)\n",
+      "Requirement already satisfied: tqdm>=4.27 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (4.43.0)\n",
+      "Requirement already satisfied: regex!=2019.12.17 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (2020.2.20)\n",
+      "Requirement already satisfied: sentencepiece!=0.1.92 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (0.1.85)\n",
+      "Requirement already satisfied: sacremoses in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (0.0.38)\n",
+      "Requirement already satisfied: dataclasses in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==3.0.2) (0.7)\n",
      "Requirement already satisfied: pyparsing>=2.0.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from packaging->transformers==3.0.2) (2.4.6)\n",
+      "Requirement already satisfied: six in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from packaging->transformers==3.0.2) (1.14.0)\n",
+      "Requirement already satisfied: idna<3,>=2.5 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==3.0.2) (2.9)\n",
+      "Requirement already satisfied: chardet<4,>=3.0.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==3.0.2) (3.0.4)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==3.0.2) (2020.4.5.1)\n",
+      "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==3.0.2) (1.25.8)\n",
      "Requirement already satisfied: joblib in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from sacremoses->transformers==3.0.2) (0.14.1)\n",
      "Requirement already satisfied: click in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from sacremoses->transformers==3.0.2) (7.0)\n",
-      "Requirement already satisfied: certifi>=2017.4.17 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==3.0.2) (2020.4.5.1)\n",
-      "Requirement already satisfied: chardet<4,>=3.0.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==3.0.2) (3.0.4)\n",
-      "Requirement already satisfied: idna<3,>=2.5 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==3.0.2) (2.9)\n",
-      "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==3.0.2) (1.25.8)\n",
      "Requirement already satisfied: onnx in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (1.7.0)\n",
      "Requirement already satisfied: psutil in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (5.7.0)\n",
      "Requirement already satisfied: pytz in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (2019.3)\n",
@ -98,10 +82,10 @@
      "Requirement already satisfied: py-cpuinfo in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (5.0.0)\n",
      "Requirement already satisfied: py3nvml in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (0.2.5)\n",
      "Requirement already satisfied: netron in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (3.9.6)\n",
-      "Requirement already satisfied: protobuf in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnx) (3.11.3)\n",
-      "Requirement already satisfied: numpy in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnx) (1.18.1)\n",
      "Requirement already satisfied: typing-extensions>=3.6.2.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnx) (3.7.4.1)\n",
      "Requirement already satisfied: six in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnx) (1.14.0)\n",
+      "Requirement already satisfied: protobuf in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnx) (3.11.3)\n",
+      "Requirement already satisfied: numpy in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnx) (1.18.1)\n",
      "Requirement already satisfied: python-dateutil>=2.6.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from pandas) (2.8.1)\n",
      "Requirement already satisfied: xmltodict in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from py3nvml) (0.12.0)\n",
      "Requirement already satisfied: setuptools in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from protobuf->onnx) (45.2.0.post20200210)\n"
@ -109,14 +93,13 @@
    }
   ],
   "source": [
-    "# Install PyTorch 1.6.0 and OnnxRuntime 1.4.0 for CPU-only.\n",
+    "# Install PyTorch 1.6.0 and OnnxRuntime 1.5.1 for CPU-only.\n",
    "import sys\n",
    "if sys.platform == 'darwin': # Mac\n",
    "    !{sys.executable} -m pip install --upgrade torch torchvision\n",
    "else:\n",
    "    !{sys.executable} -m pip install --upgrade torch==1.6.0+cpu torchvision==0.7.0+cpu -f https://download.pytorch.org/whl/torch_stable.html\n",
-    "!{sys.executable} -m pip install --upgrade onnxruntime==1.4.0\n",
-    "!{sys.executable} -m pip install --upgrade onnxruntime-tools\n",
+    "!{sys.executable} -m pip install onnxruntime==1.5.1\n",
    "\n",
    "# Install other packages used in this notebook.\n",
    "!{sys.executable} -m pip install transformers==3.0.2\n",
@ -147,7 +130,7 @@
    "\n",
    "The script accepts a pretrained model name or path of a checkpoint directory as input, and converts the model to ONNX. It also verifies that the ONNX model could generate same input as the pytorch model. The usage is like \n",
    "```\n",
-    "python -m onnxruntime_tools.transformers.convert_to_onnx -m model_name_or_path --output gpt2.onnx -o -p fp32|fp16|int8\n",
+    "python -m onnxruntime.transformers.convert_to_onnx -m model_name_or_path --output gpt2.onnx -o -p fp32|fp16|int8\n",
    "```\n",
    "The -p option can be used to choose the precision: fp32 (float32), fp16 (mixed precision) or int8 (quantization). The -o option will generate optimized model, which is required for fp16 or int8.\n",
    "\n",
@ -159,14 +142,6 @@
   "execution_count": 3,
   "metadata": {},
   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Some weights of MyGPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias', 'lm_head.weight']\n",
-      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
-     ]
-    },
    {
     "name": "stdout",
     "output_type": "stream",
@ -186,6 +161,7 @@
      "  \"n_ctx\": 1024,\n",
      "  \"n_embd\": 768,\n",
      "  \"n_head\": 12,\n",
+      "  \"n_inner\": null,\n",
      "  \"n_layer\": 12,\n",
      "  \"n_positions\": 1024,\n",
      "  \"resid_pdrop\": 0.1,\n",
@ -207,7 +183,7 @@
    }
   ],
   "source": [
-    "from onnxruntime_tools.transformers.gpt2_helper import Gpt2Helper, MyGPT2LMHeadModel\n",
+    "from onnxruntime.transformers.gpt2_helper import Gpt2Helper, MyGPT2LMHeadModel\n",
    "from transformers import AutoConfig\n",
    "import torch\n",
    "\n",
@ -233,11 +209,13 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "D:\\Anaconda3\\envs\\cpu_env\\lib\\site-packages\\transformers\\modeling_gpt2.py:445: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
+      "d:\\git\\transformers\\src\\transformers\\modeling_gpt2.py:714: FutureWarning: The `past` argument is deprecated and will be removed in a future version, use `past_key_values` instead.\n",
+      "  FutureWarning,\n",
+      "d:\\git\\transformers\\src\\transformers\\modeling_gpt2.py:560: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
      "  assert batch_size > 0, \"batch_size has to be defined and > 0\"\n",
-      "D:\\Anaconda3\\envs\\cpu_env\\lib\\site-packages\\transformers\\modeling_gpt2.py:149: TracerWarning: Converting a tensor to a Python float might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
+      "d:\\git\\transformers\\src\\transformers\\modeling_gpt2.py:166: TracerWarning: Converting a tensor to a Python float might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
      "  w = w / (float(v.size(-1)) ** 0.5)\n",
-      "D:\\Anaconda3\\envs\\cpu_env\\lib\\site-packages\\transformers\\modeling_gpt2.py:151: TracerWarning: Converting a tensor to a Python index might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
+      "d:\\git\\transformers\\src\\transformers\\modeling_gpt2.py:171: TracerWarning: Converting a tensor to a Python index might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
      "  mask = self.bias[:, :, ns - nd : ns, :ns]\n"
     ]
    }
@ -263,18 +241,22 @@
   "metadata": {},
   "outputs": [
    {
-     "name": "stderr",
+     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias', 'lm_head.weight']\n",
-      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
+      "input_ids tensor([[50256, 50256, 50256, 50256, 13466,  7541,   287, 15489,  1989],\n",
+      "        [ 1456,   318,   281,  1672,   286,   308,   457,    17,  2746]])\n",
+      "attention_mask tensor([[0., 0., 0., 0., 1., 1., 1., 1., 1.],\n",
+      "        [1., 1., 1., 1., 1., 1., 1., 1., 1.]])\n",
+      "position_ids tensor([[0, 0, 0, 0, 0, 1, 2, 3, 4],\n",
+      "        [0, 1, 2, 3, 4, 5, 6, 7, 8]])\n"
     ]
    }
   ],
   "source": [
    "from transformers import AutoTokenizer\n",
    "\n",
-    "EXAMPLE_Text = ['ONNX runtime is', 'here is an example of gpt2 model']\n",
+    "EXAMPLE_Text = ['best hotel in bay area', 'here is an example of gpt2 model']\n",
    "\n",
    "def get_tokenizer(model_name_or_path, cache_dir):\n",
    "    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir)\n",
@ -283,9 +265,9 @@
    "    #okenizer.add_special_tokens({'pad_token': '[PAD]'})\n",
    "    return tokenizer\n",
    "\n",
-    "def get_example_inputs(prompt_text=EXAMPLE_Text, verbose=False):    \n",
+    "def get_example_inputs(prompt_text=EXAMPLE_Text):    \n",
    "    tokenizer = get_tokenizer(model_name_or_path, cache_dir)\n",
-    "    encodings_dict = tokenizer.batch_encode_plus(prompt_text, pad_to_max_length=True)\n",
+    "    encodings_dict = tokenizer.batch_encode_plus(prompt_text, padding=True)\n",
    "\n",
    "    input_ids = torch.tensor(encodings_dict['input_ids'], dtype=torch.int64)\n",
    "    attention_mask = torch.tensor(encodings_dict['attention_mask'], dtype=torch.float32)\n",
@ -299,12 +281,7 @@
    "    past_shape = [2, batch_size, num_attention_heads, 0, hidden_size // num_attention_heads]\n",
    "    for i in range(num_layer):\n",
    "        empty_past.append(torch.empty(past_shape).type(torch.float32).to(device))\n",
-    "    \n",
-    "    if verbose:\n",
-    "        print(\"input_ids\", input_ids)\n",
-    "        print(\"attention_mask\", attention_mask)\n",
-    "        print(\"position_ids\", position_ids)\n",
-    "    \n",
+    "       \n",
    "    return input_ids, attention_mask, position_ids, empty_past\n",
    "\n",
    "\n",
@ -314,6 +291,17 @@
    "torch_model.eval().to(device)\n",
    "\n",
    "input_ids, attention_mask, position_ids, empty_past = get_example_inputs()\n",
+    "print(\"input_ids\", input_ids)\n",
+    "print(\"attention_mask\", attention_mask)\n",
+    "print(\"position_ids\", position_ids)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
    "with torch.no_grad():\n",
    "    torch_output = torch_model(input_ids, past=empty_past, attention_mask=attention_mask, position_ids=position_ids)"
   ]
@ -331,7 +319,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
@ -360,24 +348,21 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "max logits diff (ignored padding) tensor(0.0001)\n"
+      "max logits diff (ignored padding) tensor(6.8665e-05)\n"
     ]
    }
   ],
   "source": [
    "logits_masked_diff = (torch_output[0] - ort_outputs[0]) * attention_mask.unsqueeze(2)\n",
    "max_logits_diff = logits_masked_diff.abs().max()\n",
-    "print(\"max logits diff (ignored padding)\", max_logits_diff)\n",
-    "\n",
-    "#past_diff = [(torch_output[1][i] - ort_outputs[i + 1]).abs().max() for i in range(num_layer)]\n",
-    "#print(\"past state diff for each layer\", past_diff)"
+    "print(\"max logits diff (ignored padding)\", max_logits_diff)"
   ]
  },
  {
@ -391,7 +376,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
@ -420,7 +405,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
@ -450,11 +435,12 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
-    "def test_generation(tokenizer, input_text, use_onnxruntime=True):\n",
+    "def test_generation(tokenizer, input_text, ort_session=None, num_tokens_to_produce = 30):\n",
+    "    use_onnxruntime = (ort_session is not None)\n",
    "    print(\"Text generation using\", \"OnnxRuntime\" if use_onnxruntime else \"PyTorch\", \"...\")\n",
    "    eos_token_id = tokenizer.eos_token_id\n",
    "    \n",
@ -465,10 +451,9 @@
    "\n",
    "    all_token_ids = input_ids.clone()\n",
    "\n",
-    "    num_tokens_to_produce = 30\n",
    "    for step in range(num_tokens_to_produce):\n",
-    "        if use_onnxruntime:\n",
-    "            outputs = inference_with_io_binding(session, config, input_ids, position_ids, attention_mask, past)\n",
+    "        if ort_session is not None:\n",
+    "            outputs = inference_with_io_binding(ort_session, config, input_ids, position_ids, attention_mask, past)\n",
    "        else:\n",
    "            outputs = torch_model(input_ids, attention_mask=attention_mask, position_ids=position_ids, past=past)  \n",
    "\n",
@ -497,49 +482,8 @@
    "            break\n",
    "\n",
    "    for i, output in enumerate(all_token_ids):\n",
-    "        print(f\"Example {i}:\", tokenizer.decode(output, skip_special_tokens=True))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Text generation using OnnxRuntime ...\n",
-      "Example 0: ONNX runtime is not supported.\n",
-      "\n",
-      "The following is a list of the supported languages:\n",
-      "\n",
-      "English\n",
-      "\n",
-      "French\n",
-      "\n",
-      "German\n",
-      "\n",
-      "Italian\n",
-      "\n",
-      "Japanese\n",
-      "Example 1: here is an example of gpt2 model.\n",
-      "\n",
-      "The gpt2 model is a simple, but powerful, way to generate a GPT2-like data structure. It is a\n"
-     ]
-    }
-   ],
-   "source": [
-    "tokenizer = get_tokenizer(model_name_or_path, cache_dir)\n",
-    "input_text = EXAMPLE_Text\n",
-    "test_generation(tokenizer, input_text, use_onnxruntime=True)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Next, we use PyTorch to run again and we can see that the result is exactly same."
+    "        print(\"------------\")\n",
+    "        print(tokenizer.decode(output, skip_special_tokens=True))"
   ]
  },
  {
@ -551,28 +495,114 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "Text generation using PyTorch ...\n",
-      "Example 0: ONNX runtime is not supported.\n",
+      "Text generation using OnnxRuntime ...\n",
+      "------------\n",
+      "best hotel in bay area.\n",
      "\n",
-      "The following is a list of the supported languages:\n",
+      "The hotel is located in the historic Bayview neighborhood of San Francisco.\n",
      "\n",
-      "English\n",
-      "\n",
-      "French\n",
-      "\n",
-      "German\n",
-      "\n",
-      "Italian\n",
-      "\n",
-      "Japanese\n",
-      "Example 1: here is an example of gpt2 model.\n",
+      "The hotel is open daily from 9 a.m.\n",
+      "------------\n",
+      "here is an example of gpt2 model.\n",
      "\n",
      "The gpt2 model is a simple, but powerful, way to generate a GPT2-like data structure. It is a\n"
     ]
    }
   ],
   "source": [
-    "test_generation(tokenizer, input_text, use_onnxruntime=False)"
+    "tokenizer = get_tokenizer(model_name_or_path, cache_dir)\n",
+    "input_text = EXAMPLE_Text\n",
+    "test_generation(tokenizer, input_text, ort_session=session)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next, we use PyTorch to run again and we can see that the result is exactly same."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Text generation using PyTorch ...\n",
+      "------------\n",
+      "best hotel in bay area.\n",
+      "\n",
+      "The hotel is located in the historic Bayview neighborhood of San Francisco.\n",
+      "\n",
+      "The hotel is open daily from 9 a.m.\n",
+      "------------\n",
+      "here is an example of gpt2 model.\n",
+      "\n",
+      "The gpt2 model is a simple, but powerful, way to generate a GPT2-like data structure. It is a\n"
+     ]
+    }
+   ],
+   "source": [
+    "test_generation(tokenizer, input_text)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Int8 Quantization ##\n",
+    "Next, we will apply dynamic quantization to the model. We optimize the model before quantization to get better performance.\n",
+    "\n",
+    "Note that text generation result from fp32 and int8 models could be quite different. User shall evaluate the precision metric for your application for both fp32 and int8 models. If the quality of int8 model result is acceptable, you will be glad to find that it is faster than fp32 model in inference. \n",
+    "\n",
+    "Note that you can leverage [quantization aware training (QAT)](https://pytorch.org/blog/introduction-to-quantization-on-pytorch/) for accuracy improvement if needed."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Warning: onnxruntime.quantization.quantize is deprecated.\n",
+      "         Please use quantize_static for static quantization, quantize_dynamic for dynamic quantization.\n"
+     ]
+    }
+   ],
+   "source": [
+    "from onnxruntime.transformers.quantize_helper import QuantizeHelper\n",
+    "\n",
+    "optimized_fp32_model_path = \"gpt2_fp32.onnx\"\n",
+    "quantized_int8_model_path = \"gpt2_int8.onnx\"\n",
+    "Gpt2Helper.optimize_onnx(\"gpt2.onnx\", optimized_fp32_model_path, False, model.config.num_attention_heads, model.config.hidden_size)\n",
+    "QuantizeHelper.quantize_onnx_model(optimized_fp32_model_path, quantized_int8_model_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Text generation using OnnxRuntime ...\n",
+      "------------\n",
+      "bert model optimization, and the NLP model is a generalizable and robust model.\n"
+     ]
+    }
+   ],
+   "source": [
+    "session_int8 = onnxruntime.InferenceSession(quantized_int8_model_path)\n",
+    "input_text = ['bert model optimization']\n",
+    "test_generation(tokenizer, input_text, ort_session=session_int8, num_tokens_to_produce=14)"
   ]
  },
  {
@ -585,7 +615,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
@ -612,54 +642,131 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "2020-08-14 00:40:03.220131: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library cudart64_101.dll\n",
-      "Arguments:Namespace(batch_sizes=[1], cache_dir='.\\\\cache_models', include_copy_output_latency=False, model_class='GPT2LMHeadModel', model_name='gpt2', onnx_dir='.\\\\onnx_models', optimize_onnx=True, past_sequence_lengths=[8, 16, 32, 64, 128, 256], precision=<Precision.FLOAT32: 'fp32'>, result_csv=None, test_times=100, thread_num=-1, torchscript=False, use_gpu=False, validate_onnx=False, verbose=False)\n",
+      "2020-09-30 18:44:40.720277: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library cudart64_101.dll\n",
+      "Arguments:Namespace(batch_sizes=[1], cache_dir='.\\\\cache_models', include_copy_output_latency=False, model_class='GPT2LMHeadModel', model_name_or_path='gpt2', onnx_dir='.\\\\onnx_models', optimize_onnx=True, past_sequence_lengths=[8, 16, 32, 64, 128, 256], precision=<Precision.FLOAT32: 'fp32'>, result_csv=None, test_times=100, thread_num=-1, torchscript=False, use_gpu=False, validate_onnx=False, verbose=False)\n",
      "PyTorch Version:1.6.0+cpu\n",
      "Transformers Version:3.0.2\n",
-      "Onnxruntime Version:1.4.0\n",
-      "Some weights of MyGPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.0.attn.masked_bias', 'h.1.attn.masked_bias', 'h.2.attn.masked_bias', 'h.3.attn.masked_bias', 'h.4.attn.masked_bias', 'h.5.attn.masked_bias', 'h.6.attn.masked_bias', 'h.7.attn.masked_bias', 'h.8.attn.masked_bias', 'h.9.attn.masked_bias', 'h.10.attn.masked_bias', 'h.11.attn.masked_bias', 'lm_head.weight']\n",
-      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+      "Onnxruntime Version:1.5.1\n",
+      "d:\\git\\transformers\\src\\transformers\\modeling_gpt2.py:714: FutureWarning: The `past` argument is deprecated and will be removed in a future version, use `past_key_values` instead.\n",
+      "  FutureWarning,\n",
      "Shapes: input_ids=torch.Size([1, 1]) past=torch.Size([2, 1, 12, 1, 64]) output=torch.Size([1, 1, 50257]) present=torch.Size([2, 1, 12, 2, 64])\n",
-      "D:\\Anaconda3\\envs\\cpu_env\\lib\\site-packages\\transformers\\modeling_gpt2.py:445: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
+      "d:\\git\\transformers\\src\\transformers\\modeling_gpt2.py:560: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
      "  assert batch_size > 0, \"batch_size has to be defined and > 0\"\n",
-      "D:\\Anaconda3\\envs\\cpu_env\\lib\\site-packages\\transformers\\modeling_gpt2.py:149: TracerWarning: Converting a tensor to a Python float might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
+      "d:\\git\\transformers\\src\\transformers\\modeling_gpt2.py:166: TracerWarning: Converting a tensor to a Python float might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
      "  w = w / (float(v.size(-1)) ** 0.5)\n",
-      "D:\\Anaconda3\\envs\\cpu_env\\lib\\site-packages\\transformers\\modeling_gpt2.py:151: TracerWarning: Converting a tensor to a Python index might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
+      "d:\\git\\transformers\\src\\transformers\\modeling_gpt2.py:171: TracerWarning: Converting a tensor to a Python index might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
      "  mask = self.bias[:, :, ns - nd : ns, :ns]\n",
      "Fused LayerNormalization count: 25\n",
      "Fused FastGelu count: 12\n",
      "Fused Attention(with past) count: 12\n",
-      "Graph pruned: 0 inputs, 0 outputs and 813 nodes are removed\n",
-      "Graph pruned: 0 inputs, 0 outputs and 360 nodes are removed\n",
+      "Graph pruned: 0 inputs, 0 outputs and 741 nodes are removed\n",
+      "Graph pruned: 0 inputs, 0 outputs and 312 nodes are removed\n",
      "postprocess: remove Reshape count:48\n",
      "Fused FastGelu(add bias) count: 12\n",
      "opset verion: 11\n",
      "Output model to .\\onnx_models\\gpt2_past_fp32.onnx\n",
-      "batch_size=1, past_sequence_length=8, torch_latency=41.62, ort_latency=31.69, ort_io_latency=31.39\n",
-      "batch_size=1, past_sequence_length=16, torch_latency=42.37, ort_latency=31.67, ort_io_latency=31.36\n",
-      "batch_size=1, past_sequence_length=32, torch_latency=43.34, ort_latency=32.20, ort_io_latency=32.25\n",
-      "batch_size=1, past_sequence_length=64, torch_latency=44.01, ort_latency=34.90, ort_io_latency=31.53\n",
-      "batch_size=1, past_sequence_length=128, torch_latency=47.43, ort_latency=39.52, ort_io_latency=33.05\n",
-      "batch_size=1, past_sequence_length=256, torch_latency=54.88, ort_latency=45.20, ort_io_latency=38.01\n",
-      "Results are saved to file benchmark_result_20200814-004123.csv\n"
+      "batch_size=1, past_sequence_length=8, torch_latency=40.68, ort_latency=24.07, ort_io_latency=24.03\n",
+      "batch_size=1, past_sequence_length=16, torch_latency=40.87, ort_latency=23.14, ort_io_latency=22.27\n",
+      "batch_size=1, past_sequence_length=32, torch_latency=41.36, ort_latency=23.74, ort_io_latency=23.05\n",
+      "batch_size=1, past_sequence_length=64, torch_latency=42.97, ort_latency=26.25, ort_io_latency=23.64\n",
+      "batch_size=1, past_sequence_length=128, torch_latency=44.30, ort_latency=30.48, ort_io_latency=25.85\n",
+      "batch_size=1, past_sequence_length=256, torch_latency=54.77, ort_latency=40.60, ort_io_latency=28.20\n",
+      "Results are saved to file benchmark_result_20200930-184558.csv\n"
     ]
    }
   ],
   "source": [
-    "!{sys.executable} -m onnxruntime_tools.transformers.benchmark_gpt2 -m gpt2 -o"
+    "!{sys.executable} -m onnxruntime.transformers.benchmark_gpt2 -m gpt2 -o"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "ATen/Parallel:\n",
+      "\tat::get_num_threads() : 12\n",
+      "\tat::get_num_interop_threads() : 6\n",
+      "OpenMP 2019\n",
+      "\tomp_get_max_threads() : 12\n",
+      "Intel(R) Math Kernel Library Version 2020.0.0 Product Build 20191125 for Intel(R) 64 architecture applications\n",
+      "\tmkl_get_max_threads() : 12\n",
+      "Intel(R) MKL-DNN v1.5.0 (Git Hash e2ac1fac44c5078ca927cb9b90e1b3066a0b2ed0)\n",
+      "std::thread::hardware_concurrency() : 12\n",
+      "Environment variables:\n",
+      "\tOMP_NUM_THREADS : [not set]\n",
+      "\tMKL_NUM_THREADS : [not set]\n",
+      "ATen parallel backend: OpenMP\n",
+      "\n",
+      "Warning: onnxruntime.quantization.quantize is deprecated.\n",
+      "         Please use quantize_static for static quantization, quantize_dynamic for dynamic quantization.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2020-09-30 18:47:09.756025: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library cudart64_101.dll\n",
+      "Arguments:Namespace(batch_sizes=[1], cache_dir='.\\\\cache_models', include_copy_output_latency=False, model_class='GPT2LMHeadModel', model_name_or_path='gpt2', onnx_dir='.\\\\onnx_models', optimize_onnx=True, past_sequence_lengths=[8, 16, 32, 64, 128, 256], precision=<Precision.INT8: 'int8'>, result_csv=None, test_times=100, thread_num=-1, torchscript=False, use_gpu=False, validate_onnx=False, verbose=False)\n",
+      "PyTorch Version:1.6.0+cpu\n",
+      "Transformers Version:3.0.2\n",
+      "Onnxruntime Version:1.5.1\n",
+      "d:\\git\\transformers\\src\\transformers\\modeling_gpt2.py:714: FutureWarning: The `past` argument is deprecated and will be removed in a future version, use `past_key_values` instead.\n",
+      "  FutureWarning,\n",
+      "Shapes: input_ids=torch.Size([1, 1]) past=torch.Size([2, 1, 12, 1, 64]) output=torch.Size([1, 1, 50257]) present=torch.Size([2, 1, 12, 2, 64])\n",
+      "d:\\git\\transformers\\src\\transformers\\modeling_gpt2.py:560: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
+      "  assert batch_size > 0, \"batch_size has to be defined and > 0\"\n",
+      "d:\\git\\transformers\\src\\transformers\\modeling_gpt2.py:166: TracerWarning: Converting a tensor to a Python float might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
+      "  w = w / (float(v.size(-1)) ** 0.5)\n",
+      "d:\\git\\transformers\\src\\transformers\\modeling_gpt2.py:171: TracerWarning: Converting a tensor to a Python index might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
+      "  mask = self.bias[:, :, ns - nd : ns, :ns]\n",
+      "Fused LayerNormalization count: 25\n",
+      "Fused FastGelu count: 12\n",
+      "Fused Attention(with past) count: 12\n",
+      "Graph pruned: 0 inputs, 0 outputs and 741 nodes are removed\n",
+      "Graph pruned: 0 inputs, 0 outputs and 312 nodes are removed\n",
+      "postprocess: remove Reshape count:48\n",
+      "Fused FastGelu(add bias) count: 12\n",
+      "opset verion: 11\n",
+      "Output model to .\\onnx_models\\gpt2_past_int8.onnx\n",
+      "quantizing model...\n",
+      "Size of full precision ONNX model(MB):621.9615631103516\n",
+      "quantized model saved to:.\\onnx_models\\gpt2_past_int8.onnx\n",
+      "Size of quantized ONNX model(MB):155.89412593841553\n",
+      "Size of full precision Torch model(MB):486.7606954574585\n",
+      "Size of quantized Torch model(MB):280.60562801361084\n",
+      "finished quantizing model\n",
+      "batch_size=1, past_sequence_length=8, torch_latency=19.50, ort_latency=11.35, ort_io_latency=11.24\n",
+      "batch_size=1, past_sequence_length=16, torch_latency=20.13, ort_latency=11.53, ort_io_latency=10.24\n",
+      "batch_size=1, past_sequence_length=32, torch_latency=20.54, ort_latency=12.05, ort_io_latency=11.97\n",
+      "batch_size=1, past_sequence_length=64, torch_latency=21.29, ort_latency=13.90, ort_io_latency=12.15\n",
+      "batch_size=1, past_sequence_length=128, torch_latency=23.40, ort_latency=19.22, ort_io_latency=13.96\n",
+      "batch_size=1, past_sequence_length=256, torch_latency=30.26, ort_latency=29.05, ort_io_latency=16.77\n",
+      "Results are saved to file benchmark_result_20200930-184855.csv\n"
+     ]
+    }
+   ],
+   "source": [
+    "!{sys.executable} -m onnxruntime.transformers.benchmark_gpt2 -m gpt2 -o --precision int8"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
+    "We can see that quantized model has significant speed up (close to 2x).\n",
+    "\n",
    "### Test Environment ###\n",
    "The following is the hardware of the test machine, and software version:"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
@ -668,11 +775,11 @@
     "text": [
      "{\n",
      "  \"gpu\": {\n",
-      "    \"driver_version\": \"442.23\",\n",
+      "    \"driver_version\": \"451.67\",\n",
      "    \"devices\": [\n",
      "      {\n",
      "        \"memory_total\": 8589934592,\n",
-      "        \"memory_available\": 5534052352,\n",
+      "        \"memory_available\": 8480882688,\n",
      "        \"name\": \"GeForce GTX 1070\"\n",
      "      }\n",
      "    ]\n",
@ -761,16 +868,16 @@
      "  },\n",
      "  \"memory\": {\n",
      "    \"total\": 16971276288,\n",
-      "    \"available\": 3952107520\n",
+      "    \"available\": 6431543296\n",
      "  },\n",
      "  \"python\": \"3.6.10.final.0 (64 bit)\",\n",
      "  \"os\": \"Windows-10-10.0.19041-SP0\",\n",
      "  \"onnxruntime\": {\n",
-      "    \"version\": \"1.4.0\",\n",
+      "    \"version\": \"1.5.1\",\n",
      "    \"support_gpu\": false\n",
      "  },\n",
      "  \"onnxruntime_tools\": {\n",
-      "    \"version\": \"1.4.2\"\n",
+      "    \"version\": \"1.4.4\"\n",
      "  },\n",
      "  \"pytorch\": {\n",
      "    \"version\": \"1.6.0+cpu\",\n",
@ -789,12 +896,12 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "2020-08-14 00:42:57.062514: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library cudart64_101.dll\n"
+      "2020-09-30 18:49:40.600527: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library cudart64_101.dll\n"
     ]
    }
   ],
   "source": [
-    "!{sys.executable} -m onnxruntime_tools.transformers.machine_info --silent"
+    "!{sys.executable} -m onnxruntime.transformers.machine_info --silent"
   ]
  }
 ],