From 61fa5476d55d98f6fb66b5d7b076169073bdb2c8 Mon Sep 17 00:00:00 2001
From: Tianlei Wu <tlwu@microsoft.com>
Date: Tue, 16 Jun 2020 09:36:51 -0700
Subject: [PATCH 1/5] Update PyTorch Bert notebooks (#4239)

Update PyTorch Bert SQuAD notebooks to use onnxruntime-tools and update usage of intra_op_num_threads.
rename python files according to coding style
Fix change_input_to_int32.
update keras notebook to copy script from rel-1.3.0 branch (Will update them later)
---
 .../tools/transformers/bert_test_data.py      |   2 +-
 .../transformers/compare_bert_results.py      |   2 +-
 .../tools/transformers/fusion_attention.py    |   2 +-
 .../python/tools/transformers/fusion_base.py  |   2 +-
 .../tools/transformers/fusion_biasgelu.py     |   2 +-
 .../tools/transformers/fusion_embedlayer.py   |   6 +-
 .../tools/transformers/fusion_fastgelu.py     |   2 +-
 .../python/tools/transformers/fusion_gelu.py  |   2 +-
 .../transformers/fusion_gelu_approximation.py |   2 +-
 .../transformers/fusion_gpt_attention.py      |   2 +-
 .../fusion_gpt_attention_no_past.py           |   2 +-
 .../tools/transformers/fusion_layernorm.py    |   2 +-
 .../tools/transformers/fusion_reshape.py      |   2 +-
 .../transformers/fusion_skiplayernorm.py      |   2 +-
 .../python/tools/transformers/fusion_utils.py |   2 +-
 .../{MachineInfo.py => machine_info.py}       |   0
 .../PyTorch_Bert-Squad_OnnxRuntime_CPU.ipynb  | 754 +++++++-----------
 .../PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb  | 115 +--
 ...low_Keras_Bert-Squad_OnnxRuntime_CPU.ipynb |   2 +-
 .../{OnnxModel.py => onnx_model.py}           |   0
 .../{BertOnnxModel.py => onnx_model_bert.py}  |   6 +-
 ...ModelKeras.py => onnx_model_bert_keras.py} |   2 +-
 ...rtOnnxModelTF.py => onnx_model_bert_tf.py} |   2 +-
 .../{Gpt2OnnxModel.py => onnx_model_gpt2.py}  |   2 +-
 .../python/tools/transformers/optimizer.py    |  10 +-
 .../{ShapeOptimizer.py => shape_optimizer.py} |   3 +-
 .../generate_tiny_keras2onnx_bert_models.py   |   2 +-
 .../generate_tiny_gpt2_model.py               |   2 +-
 .../generate_tiny_gpt2_model.py               |   2 +-
 .../tools/transformers/test_optimizer.py      |   2 +-
 30 files changed, 376 insertions(+), 562 deletions(-)
 rename onnxruntime/python/tools/transformers/{MachineInfo.py => machine_info.py} (100%)
 rename onnxruntime/python/tools/transformers/{OnnxModel.py => onnx_model.py} (100%)
 rename onnxruntime/python/tools/transformers/{BertOnnxModel.py => onnx_model_bert.py} (98%)
 rename onnxruntime/python/tools/transformers/{BertOnnxModelKeras.py => onnx_model_bert_keras.py} (99%)
 rename onnxruntime/python/tools/transformers/{BertOnnxModelTF.py => onnx_model_bert_tf.py} (99%)
 rename onnxruntime/python/tools/transformers/{Gpt2OnnxModel.py => onnx_model_gpt2.py} (98%)
 rename onnxruntime/python/tools/transformers/{ShapeOptimizer.py => shape_optimizer.py} (99%)

diff --git a/onnxruntime/python/tools/transformers/bert_test_data.py b/onnxruntime/python/tools/transformers/bert_test_data.py
index cb2ccb16e9..c1609219cf 100644
--- a/onnxruntime/python/tools/transformers/bert_test_data.py
+++ b/onnxruntime/python/tools/transformers/bert_test_data.py
@@ -12,7 +12,7 @@ import os
 import random
 from pathlib import Path
 from onnx import ModelProto, TensorProto, numpy_helper
-from OnnxModel import OnnxModel
+from onnx_model import OnnxModel
 
 
 def fake_input_ids_data(input_ids, batch_size, sequence_length, dictionary_size):
diff --git a/onnxruntime/python/tools/transformers/compare_bert_results.py b/onnxruntime/python/tools/transformers/compare_bert_results.py
index 13b603a6a2..700051f460 100644
--- a/onnxruntime/python/tools/transformers/compare_bert_results.py
+++ b/onnxruntime/python/tools/transformers/compare_bert_results.py
@@ -19,7 +19,7 @@ import csv
 import timeit
 from datetime import datetime
 from onnx import ModelProto, TensorProto, numpy_helper
-from OnnxModel import OnnxModel
+from onnx_model import OnnxModel
 from bert_test_data import get_bert_inputs, generate_test_data, output_test_data
 from bert_perf_test import create_session, onnxruntime_inference, setup_openmp_environ
 
diff --git a/onnxruntime/python/tools/transformers/fusion_attention.py b/onnxruntime/python/tools/transformers/fusion_attention.py
index 67a73b91df..4118d1348b 100644
--- a/onnxruntime/python/tools/transformers/fusion_attention.py
+++ b/onnxruntime/python/tools/transformers/fusion_attention.py
@@ -5,7 +5,7 @@
 import numpy as np
 from logging import getLogger
 from onnx import helper, numpy_helper, TensorProto
-from OnnxModel import OnnxModel
+from onnx_model import OnnxModel
 from fusion_base import Fusion
 from fusion_utils import FusionUtils
 
diff --git a/onnxruntime/python/tools/transformers/fusion_base.py b/onnxruntime/python/tools/transformers/fusion_base.py
index 94b19eb7f7..863588dc06 100644
--- a/onnxruntime/python/tools/transformers/fusion_base.py
+++ b/onnxruntime/python/tools/transformers/fusion_base.py
@@ -3,7 +3,7 @@
 # Licensed under the MIT License.
 #--------------------------------------------------------------------------
 from logging import getLogger
-from OnnxModel import OnnxModel
+from onnx_model import OnnxModel
 from typing import Union, List
 
 logger = getLogger(__name__)
diff --git a/onnxruntime/python/tools/transformers/fusion_biasgelu.py b/onnxruntime/python/tools/transformers/fusion_biasgelu.py
index 2d7fd3d45a..bf0341ae60 100644
--- a/onnxruntime/python/tools/transformers/fusion_biasgelu.py
+++ b/onnxruntime/python/tools/transformers/fusion_biasgelu.py
@@ -5,7 +5,7 @@
 
 from logging import getLogger
 from onnx import helper, numpy_helper
-from OnnxModel import OnnxModel
+from onnx_model import OnnxModel
 from fusion_base import Fusion
 
 logger = getLogger(__name__)
diff --git a/onnxruntime/python/tools/transformers/fusion_embedlayer.py b/onnxruntime/python/tools/transformers/fusion_embedlayer.py
index 92af1a749d..d42e01bb5a 100644
--- a/onnxruntime/python/tools/transformers/fusion_embedlayer.py
+++ b/onnxruntime/python/tools/transformers/fusion_embedlayer.py
@@ -6,7 +6,7 @@
 from typing import Dict
 from logging import getLogger
 from onnx import helper
-from OnnxModel import OnnxModel
+from onnx_model import OnnxModel
 from fusion_base import Fusion
 from fusion_utils import FusionUtils
 
@@ -36,9 +36,7 @@ class FusionEmbedLayerNoMask(Fusion):
                                   v            v
                               SkipLayerNormalization
     """
-    def __init__(self,
-                 model: OnnxModel,
-                 description='no mask'):
+    def __init__(self, model: OnnxModel, description='no mask'):
         super().__init__(model, "EmbedLayerNormalization", "SkipLayerNormalization", description)
         self.utils = FusionUtils(model)
 
diff --git a/onnxruntime/python/tools/transformers/fusion_fastgelu.py b/onnxruntime/python/tools/transformers/fusion_fastgelu.py
index dd800135a1..a0900d6804 100644
--- a/onnxruntime/python/tools/transformers/fusion_fastgelu.py
+++ b/onnxruntime/python/tools/transformers/fusion_fastgelu.py
@@ -5,7 +5,7 @@
 from typing import Dict, Optional
 from logging import getLogger
 from onnx import helper
-from OnnxModel import OnnxModel
+from onnx_model import OnnxModel
 from fusion_base import Fusion
 
 logger = getLogger(__name__)
diff --git a/onnxruntime/python/tools/transformers/fusion_gelu.py b/onnxruntime/python/tools/transformers/fusion_gelu.py
index 34eab944a8..895ae8238e 100644
--- a/onnxruntime/python/tools/transformers/fusion_gelu.py
+++ b/onnxruntime/python/tools/transformers/fusion_gelu.py
@@ -5,7 +5,7 @@
 from typing import Dict, Optional
 from logging import getLogger
 from onnx import helper
-from OnnxModel import OnnxModel
+from onnx_model import OnnxModel
 from fusion_base import Fusion
 
 logger = getLogger(__name__)
diff --git a/onnxruntime/python/tools/transformers/fusion_gelu_approximation.py b/onnxruntime/python/tools/transformers/fusion_gelu_approximation.py
index 32f2e0af5c..10cbac4d6c 100644
--- a/onnxruntime/python/tools/transformers/fusion_gelu_approximation.py
+++ b/onnxruntime/python/tools/transformers/fusion_gelu_approximation.py
@@ -5,7 +5,7 @@
 
 from logging import getLogger
 from onnx import helper
-from OnnxModel import OnnxModel
+from onnx_model import OnnxModel
 from fusion_base import Fusion
 
 
diff --git a/onnxruntime/python/tools/transformers/fusion_gpt_attention.py b/onnxruntime/python/tools/transformers/fusion_gpt_attention.py
index 2d060fba82..31d7b1ebff 100644
--- a/onnxruntime/python/tools/transformers/fusion_gpt_attention.py
+++ b/onnxruntime/python/tools/transformers/fusion_gpt_attention.py
@@ -5,7 +5,7 @@
 import numpy as np
 from logging import getLogger
 from onnx import helper, numpy_helper, TensorProto
-from OnnxModel import OnnxModel
+from onnx_model import OnnxModel
 from fusion_base import Fusion
 from fusion_utils import FusionUtils
 
diff --git a/onnxruntime/python/tools/transformers/fusion_gpt_attention_no_past.py b/onnxruntime/python/tools/transformers/fusion_gpt_attention_no_past.py
index 9585e3739a..b3ecceae2d 100644
--- a/onnxruntime/python/tools/transformers/fusion_gpt_attention_no_past.py
+++ b/onnxruntime/python/tools/transformers/fusion_gpt_attention_no_past.py
@@ -5,7 +5,7 @@
 import numpy as np
 from logging import getLogger
 from onnx import helper, numpy_helper, TensorProto
-from OnnxModel import OnnxModel
+from onnx_model import OnnxModel
 from fusion_base import Fusion
 from fusion_utils import FusionUtils
 
diff --git a/onnxruntime/python/tools/transformers/fusion_layernorm.py b/onnxruntime/python/tools/transformers/fusion_layernorm.py
index 3be6747bf2..ade39da01c 100644
--- a/onnxruntime/python/tools/transformers/fusion_layernorm.py
+++ b/onnxruntime/python/tools/transformers/fusion_layernorm.py
@@ -5,7 +5,7 @@
 from typing import Dict
 from logging import getLogger
 from onnx import helper
-from OnnxModel import OnnxModel
+from onnx_model import OnnxModel
 from fusion_base import Fusion
 
 logger = getLogger(__name__)
diff --git a/onnxruntime/python/tools/transformers/fusion_reshape.py b/onnxruntime/python/tools/transformers/fusion_reshape.py
index 909a258500..4fa6e68256 100644
--- a/onnxruntime/python/tools/transformers/fusion_reshape.py
+++ b/onnxruntime/python/tools/transformers/fusion_reshape.py
@@ -5,7 +5,7 @@
 
 from logging import getLogger
 from onnx import helper, numpy_helper, TensorProto
-from OnnxModel import OnnxModel
+from onnx_model import OnnxModel
 from fusion_base import Fusion
 import numpy as np
 
diff --git a/onnxruntime/python/tools/transformers/fusion_skiplayernorm.py b/onnxruntime/python/tools/transformers/fusion_skiplayernorm.py
index 93e52d37ae..49d194c20c 100644
--- a/onnxruntime/python/tools/transformers/fusion_skiplayernorm.py
+++ b/onnxruntime/python/tools/transformers/fusion_skiplayernorm.py
@@ -5,7 +5,7 @@
 
 from logging import getLogger
 from onnx import helper, numpy_helper
-from OnnxModel import OnnxModel
+from onnx_model import OnnxModel
 from fusion_base import Fusion
 
 logger = getLogger(__name__)
diff --git a/onnxruntime/python/tools/transformers/fusion_utils.py b/onnxruntime/python/tools/transformers/fusion_utils.py
index de63b18301..9d2ad5b902 100644
--- a/onnxruntime/python/tools/transformers/fusion_utils.py
+++ b/onnxruntime/python/tools/transformers/fusion_utils.py
@@ -3,7 +3,7 @@
 # Licensed under the MIT License.
 #--------------------------------------------------------------------------
 from logging import getLogger
-from OnnxModel import OnnxModel
+from onnx_model import OnnxModel
 from typing import Tuple
 from onnx import helper, TensorProto
 
diff --git a/onnxruntime/python/tools/transformers/MachineInfo.py b/onnxruntime/python/tools/transformers/machine_info.py
similarity index 100%
rename from onnxruntime/python/tools/transformers/MachineInfo.py
rename to onnxruntime/python/tools/transformers/machine_info.py
diff --git a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_CPU.ipynb b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_CPU.ipynb
index 82e5bc6e60..4a827b7c0d 100644
--- a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_CPU.ipynb
+++ b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_CPU.ipynb
@@ -37,8 +37,6 @@
     "```console\n",
     "conda create -n cpu_env python=3.6\n",
     "conda activate cpu_env\n",
-    "conda install pytorch torchvision cpuonly -c pytorch\n",
-    "pip install onnxruntime\n",
     "conda install jupyter\n",
     "jupyter notebook\n",
     "```\n",
@@ -48,75 +46,75 @@
   {
    "cell_type": "code",
    "execution_count": 1,
-   "metadata": {
-    "collapsed": true
-   },
+   "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "Looking in links: https://download.pytorch.org/whl/torch_stable.html\n",
-      "Requirement already up-to-date: torch==1.4.0+cpu in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (1.4.0+cpu)\n",
-      "Requirement already up-to-date: torchvision==0.5.0+cpu in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (0.5.0+cpu)\n",
-      "Requirement already satisfied, skipping upgrade: six in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from torchvision==0.5.0+cpu) (1.14.0)\n",
-      "Requirement already satisfied, skipping upgrade: numpy in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from torchvision==0.5.0+cpu) (1.18.1)\n",
-      "Requirement already satisfied, skipping upgrade: pillow>=4.1.1 in c:\\users\\tianl\\appdata\\roaming\\python\\python36\\site-packages (from torchvision==0.5.0+cpu) (7.0.0)\n",
-      "Requirement already up-to-date: onnxruntime in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (1.2.0)\n",
-      "Requirement already satisfied, skipping upgrade: onnx>=1.2.3 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime) (1.6.0)\n",
-      "Requirement already satisfied, skipping upgrade: numpy>=1.18.0 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime) (1.18.1)\n",
-      "Requirement already satisfied, skipping upgrade: protobuf in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnx>=1.2.3->onnxruntime) (3.11.3)\n",
-      "Requirement already satisfied, skipping upgrade: six in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnx>=1.2.3->onnxruntime) (1.14.0)\n",
-      "Requirement already satisfied, skipping upgrade: typing-extensions>=3.6.2.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnx>=1.2.3->onnxruntime) (3.7.4.1)\n",
-      "Requirement already satisfied, skipping upgrade: setuptools in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from protobuf->onnx>=1.2.3->onnxruntime) (45.2.0.post20200210)\n",
-      "Requirement already satisfied: transformers==2.5.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (2.5.1)\n",
-      "Requirement already satisfied: tqdm>=4.27 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==2.5.1) (4.43.0)\n",
-      "Requirement already satisfied: regex!=2019.12.17 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==2.5.1) (2020.2.20)\n",
-      "Requirement already satisfied: boto3 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==2.5.1) (1.12.11)\n",
-      "Requirement already satisfied: sentencepiece in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==2.5.1) (0.1.85)\n",
-      "Requirement already satisfied: sacremoses in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==2.5.1) (0.0.38)\n",
-      "Requirement already satisfied: tokenizers==0.5.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==2.5.1) (0.5.2)\n",
-      "Requirement already satisfied: requests in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==2.5.1) (2.23.0)\n",
-      "Requirement already satisfied: numpy in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==2.5.1) (1.18.1)\n",
-      "Requirement already satisfied: filelock in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==2.5.1) (3.0.12)\n",
-      "Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from boto3->transformers==2.5.1) (0.3.3)\n",
-      "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from boto3->transformers==2.5.1) (0.9.5)\n",
-      "Requirement already satisfied: botocore<1.16.0,>=1.15.11 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from boto3->transformers==2.5.1) (1.15.11)\n",
-      "Requirement already satisfied: click in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from sacremoses->transformers==2.5.1) (7.0)\n",
-      "Requirement already satisfied: six in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from sacremoses->transformers==2.5.1) (1.14.0)\n",
-      "Requirement already satisfied: joblib in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from sacremoses->transformers==2.5.1) (0.14.1)\n",
-      "Requirement already satisfied: idna<3,>=2.5 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==2.5.1) (2.9)\n",
-      "Requirement already satisfied: chardet<4,>=3.0.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==2.5.1) (3.0.4)\n",
-      "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==2.5.1) (1.25.8)\n",
-      "Requirement already satisfied: certifi>=2017.4.17 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==2.5.1) (2019.11.28)\n",
-      "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from botocore<1.16.0,>=1.15.11->boto3->transformers==2.5.1) (2.8.1)\n",
-      "Requirement already satisfied: docutils<0.16,>=0.10 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from botocore<1.16.0,>=1.15.11->boto3->transformers==2.5.1) (0.15.2)\n",
+      "Requirement already up-to-date: torch==1.5.0+cpu in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (1.5.0+cpu)\n",
+      "Requirement already up-to-date: torchvision==0.6.0+cpu in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (0.6.0+cpu)\n",
+      "Requirement already satisfied, skipping upgrade: numpy in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from torch==1.5.0+cpu) (1.18.1)\n",
+      "Requirement already satisfied, skipping upgrade: future in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from torch==1.5.0+cpu) (0.18.2)\n",
+      "Requirement already satisfied, skipping upgrade: pillow>=4.1.1 in c:\\users\\tianl\\appdata\\roaming\\python\\python36\\site-packages (from torchvision==0.6.0+cpu) (7.0.0)\n",
+      "Requirement already up-to-date: onnxruntime==1.3.0 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (1.3.0)\n",
+      "Requirement already satisfied, skipping upgrade: onnx>=1.2.3 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime==1.3.0) (1.7.0)\n",
+      "Requirement already satisfied, skipping upgrade: numpy>=1.16.6 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime==1.3.0) (1.18.1)\n",
+      "Requirement already satisfied, skipping upgrade: protobuf in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime==1.3.0) (3.11.3)\n",
+      "Requirement already satisfied, skipping upgrade: typing-extensions>=3.6.2.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnx>=1.2.3->onnxruntime==1.3.0) (3.7.4.1)\n",
+      "Requirement already satisfied, skipping upgrade: six in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnx>=1.2.3->onnxruntime==1.3.0) (1.14.0)\n",
+      "Requirement already satisfied, skipping upgrade: setuptools in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from protobuf->onnxruntime==1.3.0) (45.2.0.post20200210)\n",
+      "Requirement already up-to-date: onnxruntime-tools in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (1.3.0.1007)\n",
+      "Requirement already satisfied, skipping upgrade: numpy in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (1.18.1)\n",
+      "Requirement already satisfied, skipping upgrade: py3nvml in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (0.2.5)\n",
+      "Requirement already satisfied, skipping upgrade: packaging in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (20.1)\n",
+      "Requirement already satisfied, skipping upgrade: onnx in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (1.7.0)\n",
+      "Requirement already satisfied, skipping upgrade: psutil in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (5.7.0)\n",
+      "Requirement already satisfied, skipping upgrade: py-cpuinfo in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (5.0.0)\n",
+      "Requirement already satisfied, skipping upgrade: coloredlogs in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnxruntime-tools) (14.0)\n",
+      "Requirement already satisfied, skipping upgrade: xmltodict in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from py3nvml->onnxruntime-tools) (0.12.0)\n",
+      "Requirement already satisfied, skipping upgrade: pyparsing>=2.0.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from packaging->onnxruntime-tools) (2.4.6)\n",
+      "Requirement already satisfied, skipping upgrade: six in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from packaging->onnxruntime-tools) (1.14.0)\n",
+      "Requirement already satisfied, skipping upgrade: protobuf in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnx->onnxruntime-tools) (3.11.3)\n",
+      "Requirement already satisfied, skipping upgrade: typing-extensions>=3.6.2.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnx->onnxruntime-tools) (3.7.4.1)\n",
+      "Requirement already satisfied, skipping upgrade: humanfriendly>=7.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from coloredlogs->onnxruntime-tools) (8.1)\n",
+      "Requirement already satisfied, skipping upgrade: setuptools in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from protobuf->onnx->onnxruntime-tools) (45.2.0.post20200210)\n",
+      "Requirement already satisfied, skipping upgrade: pyreadline; sys_platform == \"win32\" in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from humanfriendly>=7.1->coloredlogs->onnxruntime-tools) (2.1)\n",
+      "Requirement already satisfied: transformers==2.11.0 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (2.11.0)\n",
+      "Requirement already satisfied: regex!=2019.12.17 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==2.11.0) (2020.2.20)\n",
+      "Requirement already satisfied: dataclasses; python_version < \"3.7\" in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==2.11.0) (0.7)\n",
+      "Requirement already satisfied: filelock in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==2.11.0) (3.0.12)\n",
+      "Requirement already satisfied: requests in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==2.11.0) (2.23.0)\n",
+      "Requirement already satisfied: sacremoses in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==2.11.0) (0.0.38)\n",
+      "Requirement already satisfied: packaging in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==2.11.0) (20.1)\n",
+      "Requirement already satisfied: sentencepiece in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==2.11.0) (0.1.85)\n",
+      "Requirement already satisfied: tqdm>=4.27 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==2.11.0) (4.43.0)\n",
+      "Requirement already satisfied: tokenizers==0.7.0 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==2.11.0) (0.7.0)\n",
+      "Requirement already satisfied: numpy in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from transformers==2.11.0) (1.18.1)\n",
+      "Requirement already satisfied: chardet<4,>=3.0.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==2.11.0) (3.0.4)\n",
+      "Requirement already satisfied: idna<3,>=2.5 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==2.11.0) (2.9)\n",
+      "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==2.11.0) (1.25.8)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from requests->transformers==2.11.0) (2020.4.5.1)\n",
+      "Requirement already satisfied: six in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from sacremoses->transformers==2.11.0) (1.14.0)\n",
+      "Requirement already satisfied: joblib in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from sacremoses->transformers==2.11.0) (0.14.1)\n",
+      "Requirement already satisfied: click in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from sacremoses->transformers==2.11.0) (7.0)\n",
+      "Requirement already satisfied: pyparsing>=2.0.2 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from packaging->transformers==2.11.0) (2.4.6)\n",
       "Requirement already satisfied: wget in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (3.2)\n",
-      "Requirement already satisfied: psutil in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (5.7.0)\n",
-      "Requirement already satisfied: onnx in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (1.6.0)\n",
-      "Requirement already satisfied: pytz in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (2019.3)\n",
-      "Requirement already satisfied: pandas in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (1.0.1)\n",
-      "Requirement already satisfied: py-cpuinfo in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (5.0.0)\n",
-      "Requirement already satisfied: py3nvml in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (0.2.5)\n",
-      "Requirement already satisfied: typing-extensions>=3.6.2.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnx) (3.7.4.1)\n",
-      "Requirement already satisfied: protobuf in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnx) (3.11.3)\n",
-      "Requirement already satisfied: numpy in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnx) (1.18.1)\n",
-      "Requirement already satisfied: six in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from onnx) (1.14.0)\n",
-      "Requirement already satisfied: python-dateutil>=2.6.1 in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from pandas) (2.8.1)\n",
-      "Requirement already satisfied: xmltodict in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from py3nvml) (0.12.0)\n",
-      "Requirement already satisfied: setuptools in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (from protobuf->onnx) (45.2.0.post20200210)\n"
+      "Requirement already satisfied: netron in d:\\anaconda3\\envs\\cpu_env\\lib\\site-packages (3.9.6)\n"
      ]
     }
    ],
    "source": [
-    "# Install or upgrade PyTorch 1.4.0 and OnnxRuntime for CPU-only.\n",
+    "# Install or upgrade PyTorch 1.5.0 and OnnxRuntime 1.3.0 for CPU-only.\n",
     "import sys\n",
-    "!{sys.executable} -m pip install --upgrade torch==1.4.0+cpu torchvision==0.5.0+cpu -f https://download.pytorch.org/whl/torch_stable.html\n",
-    "!{sys.executable} -m pip install --upgrade onnxruntime\n",
+    "!{sys.executable} -m pip install --upgrade torch==1.5.0+cpu torchvision==0.6.0+cpu -f https://download.pytorch.org/whl/torch_stable.html\n",
+    "!{sys.executable} -m pip install --upgrade onnxruntime==1.3.0\n",
+    "!{sys.executable} -m pip install --upgrade onnxruntime-tools\n",
     "\n",
     "# Install other packages used in this notebook.\n",
-    "!{sys.executable} -m pip install transformers==2.5.1\n",
-    "!{sys.executable} -m pip install wget psutil onnx pytz pandas py-cpuinfo py3nvml netron"
+    "!{sys.executable} -m pip install transformers==2.11.0\n",
+    "!{sys.executable} -m pip install wget netron"
    ]
   },
   {
@@ -196,9 +194,9 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [00:03<00:00, 12.65it/s]\n",
-      "convert squad examples to features: 100%|███████████████████████████████████████████| 100/100 [00:00<00:00, 145.99it/s]\n",
-      "add example index and unique id: 100%|███████████████████████████████████████████████████████| 100/100 [00:00<?, ?it/s]\n"
+      "100%|██████████████████████████████████████████████████████████████████████████████████| 48/48 [00:04<00:00, 10.47it/s]\n",
+      "convert squad examples to features: 100%|████████████████████████████████████████████| 100/100 [00:01<00:00, 91.16it/s]\n",
+      "add example index and unique id: 100%|███████████████████████████████████████████| 100/100 [00:00<00:00, 100007.25it/s]\n"
      ]
     }
    ],
@@ -314,7 +312,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "PyTorch cpu Inference time = 164.97 ms\n"
+      "PyTorch cpu Inference time = 198.99 ms\n"
      ]
     }
    ],
@@ -359,14 +357,7 @@
     "import psutil\n",
     "\n",
     "# You may change the settings in this cell according to Performance Test Tool result.\n",
-    "use_openmp = False\n",
-    "\n",
-    "# ATTENTION: these environment variables must be set before importing onnxruntime.\n",
-    "if use_openmp:\n",
-    "    os.environ[\"OMP_NUM_THREADS\"] = str(psutil.cpu_count(logical=True))\n",
-    "else:\n",
-    "    os.environ[\"OMP_NUM_THREADS\"] = '1'\n",
-    "\n",
+    "os.environ[\"OMP_NUM_THREADS\"] = str(psutil.cpu_count(logical=True))\n",
     "os.environ[\"OMP_WAIT_POLICY\"] = 'ACTIVE'"
    ]
   },
@@ -388,7 +379,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "OnnxRuntime cpu Inference time = 91.51 ms\n"
+      "OnnxRuntime cpu Inference time = 176.96 ms\n"
      ]
     }
    ],
@@ -405,11 +396,11 @@
     "# Optional: store the optimized graph and view it using Netron to verify that model is fully optimized.\n",
     "# Note that this will increase session creation time, so it is for debugging only.\n",
     "sess_options.optimized_model_filepath = os.path.join(output_dir, \"optimized_model_cpu.onnx\")\n",
-    "   \n",
-    "if use_openmp:\n",
-    "    sess_options.intra_op_num_threads=1\n",
-    "else:\n",
-    "    sess_options.intra_op_num_threads=psutil.cpu_count(logical=True)\n",
+    "\n",
+    "# intra_op_num_threads is needed for OnnxRuntime 1.2.0.\n",
+    "# For OnnxRuntime 1.3.0 or later, this has no effect unless you are using the onnxruntime-gpu package.\n",
+    "sess_options.intra_op_num_threads=1\n",
+    "\n",
     "\n",
     "# Specify providers when you use onnxruntime-gpu for CPU inference.\n",
     "session = onnxruntime.InferenceSession(export_model_path, sess_options, providers=['CPUExecutionProvider'])\n",
@@ -417,12 +408,11 @@
     "latency = []\n",
     "for i in range(total_samples):\n",
     "    data = dataset[i]\n",
-    "    # Use contiguous array as input might improve performance.\n",
-    "    # You can check the results from performance test tool to see whether you need it.\n",
+    "    # TODO: use IO Binding (see https://github.com/microsoft/onnxruntime/pull/4206) to improve performance.\n",
     "    ort_inputs = {\n",
-    "        'input_ids':  numpy.ascontiguousarray(data[0].cpu().reshape(1, max_seq_length).numpy()),\n",
-    "        'input_mask': numpy.ascontiguousarray(data[1].cpu().reshape(1, max_seq_length).numpy()),\n",
-    "        'segment_ids': numpy.ascontiguousarray(data[2].cpu().reshape(1, max_seq_length).numpy())\n",
+    "        'input_ids':  data[0].cpu().reshape(1, max_seq_length).numpy(),\n",
+    "        'input_mask': data[1].cpu().reshape(1, max_seq_length).numpy(),\n",
+    "        'segment_ids': data[2].cpu().reshape(1, max_seq_length).numpy()\n",
     "    }\n",
     "    start = time.time()\n",
     "    ort_outputs = session.run(None, ort_inputs)\n",
@@ -457,10 +447,34 @@
    "source": [
     "## 5. Offline Optimization Script and Test Tools\n",
     "\n",
-    "It is recommended to download the [OnnxRuntime Python Tools for BERT](https://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers), and try them on the exported ONNX models. It could help verify whether the model is fully optimized, and get performance test results.\n",
+    "It is recommended to try the [OnnxRuntime Transformer Model Optimization Tool](https://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers) on the exported ONNX models. It can help verify whether the model can be fully optimized, and produce performance test results."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Transformer Optimizer\n",
     "\n",
-    "### Download OnnxRuntime Python Tools for Bert\n",
-    "You may copy the whole [directory](https://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers) to a sub-directory named bert_scripts for this notebook. The list of script files might need update if import error happens when you run some script."
+    "Although OnnxRuntime can optimize a Bert model exported by PyTorch, sometimes the model cannot be fully optimized, for various reasons:\n",
+    "* A new subgraph pattern is generated by a new version of the export tool, and the pattern is not covered by an older version of OnnxRuntime.\n",
+    "* The exported model uses dynamic axes, which makes shape inference of the graph harder. That blocks some optimizations from being applied.\n",
+    "* Some optimizations are better done offline, such as changing the input tensor type from int64 to int32 to avoid extra Cast nodes, or converting the model to float16 to achieve better performance on V100 or T4 GPUs.\n",
+    "\n",
+    "We provide a Python script, **optimizer.py**, which is more flexible in graph pattern matching and model conversion (like float32 to float16). You can also use it to verify whether a Bert model is fully optimized.\n",
+    "\n",
+    "In this example, we can see that it applies optimizations that are not provided by OnnxRuntime: SkipLayerNormalization and bias fusion, which are not fused in OnnxRuntime because of the shape inference limitation mentioned above.\n",
+    "\n",
+    "It also reports whether the model is fully optimized. If not, you might need to change the script to fuse a new subgraph pattern.\n",
+    "\n",
+    "Example Usage:\n",
+    "```\n",
+    "from onnxruntime_tools import optimizer\n",
+    "optimized_model = optimizer.optimize_model(export_model_path, model_type='bert', num_heads=12, hidden_size=768)\n",
+    "optimized_model.save_model_to_file(optimized_model_path)\n",
+    "```\n",
+    "\n",
+    "You can also use optimizer_cli like the following:"
    ]
   },
   {
@@ -469,86 +483,36 @@
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
+     "name": "stderr",
      "output_type": "stream",
      "text": [
-      "100% [..............................................................................] 15310 / 15310Downloaded bert_perf_test.py\n",
-      "100% [................................................................................] 9571 / 9571Downloaded bert_test_data.py\n",
-      "100% [................................................................................] 7272 / 7272Downloaded compare_bert_results.py\n",
-      "100% [..............................................................................] 44905 / 44905Downloaded BertOnnxModel.py\n",
-      "100% [..............................................................................] 21565 / 21565Downloaded BertOnnxModelKeras.py\n",
-      "100% [..............................................................................] 26114 / 26114Downloaded BertOnnxModelTF.py\n",
-      "100% [..............................................................................] 22773 / 22773Downloaded OnnxModel.py\n",
-      "100% [................................................................................] 7917 / 7917Downloaded optimizer.py\n",
-      "100% [................................................................................] 5478 / 5478Downloaded MachineInfo.py\n"
-     ]
-    }
-   ],
-    "source": [
-      "import os\n",
-      "import wget\n",
-      "\n",
-      "url_prfix = \"https://raw.githubusercontent.com/microsoft/onnxruntime/master/onnxruntime/python/tools/transformers/\"\n",
-      "script_files = ['bert_perf_test.py', 'bert_test_data.py', 'compare_bert_results.py', 'BertOnnxModel.py', 'BertOnnxModelKeras.py', 'BertOnnxModelTF.py', 'Gpt2OnnxModel.py', 'OnnxModel.py', 'optimizer.py', 'MachineInfo.py']\n",
-      "\n",
-      "script_dir = './bert_scripts'\n",
-      "if not os.path.exists(script_dir):\n",
-      "    os.makedirs(script_dir)\n",
-      "\n",
-      "for filename in script_files:\n",
-      "    target_file = os.path.join(script_dir, filename)\n",
-      "    if enable_overwrite and os.path.exists(target_file):\n",
-      "        os.remove(target_file)\n",
-      "    if not os.path.exists(target_file):\n",
-      "        wget.download(url_prfix + filename, target_file)\n",
-      "        print(\"Downloaded\", filename)"
-    ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "#### BERT Optimization Script\n",
-    "\n",
-    "Although OnnxRuntime could optimize Bert model exported by PyTorch. Sometime, model cannot be fully optimized due to different reasons:\n",
-    "* A new subgraph pattern is generated by new version of export tool, and the pattern is not covered by older version of OnnxRuntime. For example, Gelu from PyTorch 1.4 is not fused by OnnxRuntime 1.1.2.\n",
-    "* The exported model uses dynamic axis and this makes it harder for shape inference of the graph. That blocks some optimization to be applied.\n",
-    "* Some optimization is better to be done offline. Like change input tensor type from int64 to int32 to avoid extra Cast nodes, or convert model to float16 to achieve better performance in V100 or T4 GPU.\n",
-    "\n",
-    "We have python script **optimizer.py**, which is more flexible in graph pattern matching and model conversion (like float32 to float16). You can also use it to verify whether a Bert model is fully optimized.\n",
-    "\n",
-    "In this example, we can see that it introduces optimization that is not provided by onnxruntime: SkipLayerNormalization and bias fusion, which is not fused in OnnxRuntime due to shape inference as mentioned.\n",
-    "\n",
-    "It will also tell whether the model is fully optimized or not. If not, that means you might need change the script to fuse some new pattern of subgraph."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "optimizer.py: Save optimized model by onnxruntime to ./onnx\\bert-base-cased-squad_ort_cpu.onnx\n",
-      "optimizer.py: Use OnnxRuntime to optimize and save the optimized model to ./onnx\\bert-base-cased-squad_ort_cpu.onnx\n",
-      "    BertOnnxModel.py: Fused LayerNormalization count: 0\n",
-      "    BertOnnxModel.py: Fused Reshape count:0\n",
-      "    BertOnnxModel.py: Fused SkipLayerNormalization count: 24\n",
-      "    BertOnnxModel.py: Fused Attention count:0\n",
-      "    BertOnnxModel.py: skip embed layer fusion since mask input is not found\n",
-      "    BertOnnxModel.py: Fused SkipLayerNormalization with Bias count:24\n",
-      "    BertOnnxModel.py: opset verion: 11\n",
-      "        OnnxModel.py: Output model to ./onnx/bert-base-cased-squad_opt_cpu.onnx\n",
-      "    BertOnnxModel.py: EmbedLayer=1, Attention=12, Gelu=12, LayerNormalization=24, Succesful=True\n",
-      "optimizer.py: The output model is fully optimized.\n"
+      "optimize_by_onnxruntime: Save optimized model by onnxruntime to ./onnx\\bert-base-cased-squad_o1_cpu.onnx\n",
+      "               apply: Fused LayerNormalization count: 25\n",
+      "               apply: Fused LayerNormalization count: 0\n",
+      "               apply: Fused Gelu count: 12\n",
+      "               apply: Fused FastGelu count: 0\n",
+      "               apply: Fused Reshape count: 0\n",
+      "               apply: Fused SkipLayerNormalization count: 25\n",
+      "               apply: Fused Attention count: 12\n",
+      "         prune_graph: Graph pruned: 0 inputs, 0 outputs and 5 nodes are removed\n",
+      "               apply: Fused EmbedLayerNormalization(with mask) count: 1\n",
+      "         prune_graph: Graph pruned: 0 inputs, 0 outputs and 12 nodes are removed\n",
+      "         prune_graph: Graph pruned: 0 inputs, 0 outputs and 0 nodes are removed\n",
+      "               apply: Fused FastGelu(add bias) count: 0\n",
+      "               apply: Fused BiasGelu count: 12\n",
+      "               apply: Fused SkipLayerNormalization(add bias) count: 24\n",
+      "            optimize: opset verion: 11\n",
+      "  save_model_to_file: Output model to ./onnx/bert-base-cased-squad_opt_cpu.onnx\n",
+      "get_fused_operator_statistics: Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 12, 'Gelu': 0, 'FastGelu': 0, 'BiasGelu': 12, 'LayerNormalization': 0, 'SkipLayerNormalization': 24}\n",
+      "  is_fully_optimized: EmbedLayer=1, Attention=12, Gelu=12, LayerNormalization=24, Successful=True\n",
+      "                main: The output model is fully optimized.\n"
      ]
     }
    ],
    "source": [
     "optimized_model_path = './onnx/bert-base-cased-squad_opt_cpu.onnx'\n",
-    "%run ./bert_scripts/optimizer.py --input $export_model_path --output $optimized_model_path"
+    "\n",
+    "!{sys.executable} -m onnxruntime_tools.optimizer_cli --input $export_model_path --output $optimized_model_path --model_type bert --num_heads 12 --hidden_size 768"
    ]
   },
   {
@@ -564,14 +528,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 11,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Serving './onnx/bert-base-cased-squad_opt_cpu.onnx' at http://localhost:8080\n"
+     ]
+    }
+   ],
    "source": [
     "import netron\n",
     "\n",
-    "# Change it to True if want to view the optimized model in browser.\n",
-    "enable_netron = False\n",
+    "# Change it to False to skip viewing the optimized model in browser.\n",
+    "enable_netron = True\n",
     "if enable_netron:\n",
     "    # If you encounter error \"access a socket in a way forbidden by its access permissions\", install Netron as standalone application instead.\n",
     "    netron.start(optimized_model_path)"
@@ -590,7 +562,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 12,
    "metadata": {},
    "outputs": [
     {
@@ -598,13 +570,13 @@
      "output_type": "stream",
      "text": [
       "100% passed for 100 random inputs given thresholds (rtol=0.001, atol=0.0001).\n",
-      "maximum absolute difference=5.0961971282958984e-06\n",
-      "maximum relative difference=0.003811897709965706\n"
+      "maximum absolute difference=3.46451997756958e-06\n",
+      "maximum relative difference=0.03302651643753052\n"
      ]
     }
    ],
    "source": [
-    "%run ./bert_scripts/compare_bert_results.py --baseline_model $export_model_path --optimized_model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100"
+    "!{sys.executable} -m onnxruntime_tools.transformers.compare_bert_results --baseline_model $export_model_path --optimized_model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100"
    ]
   },
   {
@@ -620,21 +592,45 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 16,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=,OMP_WAIT_POLICY=,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=False,use_gpu=False,warmup=True\n",
+      "Average latency = 102.17 ms, Throughput = 9.79 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=False,use_gpu=False,warmup=True\n",
+      "Average latency = 91.39 ms, Throughput = 10.94 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=False,use_gpu=False,warmup=True\n",
+      "Average latency = 105.09 ms, Throughput = 9.52 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=False,use_gpu=False,warmup=True\n",
+      "Average latency = 291.53 ms, Throughput = 3.43 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=False,use_gpu=False,warmup=True\n",
+      "Average latency = 306.75 ms, Throughput = 3.26 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=1,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=False,use_gpu=False,warmup=True\n",
+      "Average latency = 289.82 ms, Throughput = 3.45 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=False,use_gpu=False,warmup=True\n",
+      "Average latency = 101.56 ms, Throughput = 9.85 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=False,use_gpu=False,warmup=True\n",
+      "Average latency = 84.73 ms, Throughput = 11.80 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=12,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=False,use_gpu=False,warmup=True\n",
+      "Average latency = 101.53 ms, Throughput = 9.85 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=6,OMP_WAIT_POLICY=,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=False,use_gpu=False,warmup=True\n",
+      "Average latency = 100.54 ms, Throughput = 9.95 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=6,OMP_WAIT_POLICY=PASSIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=False,use_gpu=False,warmup=True\n",
+      "Average latency = 111.82 ms, Throughput = 8.94 QPS\n",
+      "Running test: model=bert-base-cased-squad_opt_cpu.onnx,graph_optimization_level=ENABLE_ALL,intra_op_num_threads=1,OMP_NUM_THREADS=6,OMP_WAIT_POLICY=ACTIVE,batch_size=1,sequence_length=128,test_cases=100,test_times=1,contiguous=False,use_gpu=False,warmup=True\n",
+      "Average latency = 101.14 ms, Throughput = 9.89 QPS\n",
+      "test setting TestSetting(use_gpu=False, batch_size=1, sequence_length=128, test_cases=100, test_times=1, omp_num_threads=None, omp_wait_policy=None, intra_op_num_threads=1, seed=3, verbose=False, contiguous=False, inclusive=True, extra_latency=0, warmup=True)\n",
       "Generating 100 samples for batch_size=1 sequence_length=128\n",
-      "Extra latency for converting inputs to contiguous: 0.00 ms\n",
-      "Test summary is saved to onnx\\perf_results_CPU_B1_S128_20200313-001048.txt\n"
+      "Test summary is saved to onnx\\perf_results_CPU_B1_S128_20200612-115010.txt\n"
      ]
     }
    ],
    "source": [
-    "%run ./bert_scripts/bert_perf_test.py --model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100 --test_times 1 --inclusive --all"
+    "!{sys.executable} -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_model_path --batch_size 1 --sequence_length 128 --samples 100 --test_times 1 --intra_op_num_threads 1 --inclusive --all"
    ]
   },
   {
@@ -646,15 +642,15 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 17,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "./onnx\\perf_results_CPU_B1_S128_20200313-001048.txt\n",
-      "The best setting is: NO openmp; use contiguous array\n"
+      "./onnx\\perf_results_CPU_B1_S128_20200612-115010.txt\n",
+      "The best setting is: use openmp; NO contiguous array\n"
      ]
     },
     {
@@ -692,107 +688,107 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>74.68</td>\n",
-       "      <td>76.74</td>\n",
-       "      <td>83.75</td>\n",
-       "      <td>89.26</td>\n",
-       "      <td>13.39</td>\n",
-       "      <td>12</td>\n",
+       "      <td>84.73</td>\n",
+       "      <td>87.19</td>\n",
+       "      <td>91.51</td>\n",
+       "      <td>95.46</td>\n",
+       "      <td>11.80</td>\n",
        "      <td>1</td>\n",
+       "      <td>12</td>\n",
        "      <td>PASSIVE</td>\n",
-       "      <td>True</td>\n",
+       "      <td>False</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>74.70</td>\n",
-       "      <td>76.57</td>\n",
-       "      <td>85.86</td>\n",
-       "      <td>89.30</td>\n",
-       "      <td>13.39</td>\n",
-       "      <td>12</td>\n",
+       "      <td>91.39</td>\n",
+       "      <td>94.92</td>\n",
+       "      <td>101.16</td>\n",
+       "      <td>119.06</td>\n",
+       "      <td>10.94</td>\n",
        "      <td>1</td>\n",
-       "      <td>ACTIVE</td>\n",
-       "      <td>True</td>\n",
+       "      <td></td>\n",
+       "      <td>PASSIVE</td>\n",
+       "      <td>False</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>75.23</td>\n",
-       "      <td>78.13</td>\n",
-       "      <td>86.49</td>\n",
-       "      <td>88.99</td>\n",
-       "      <td>13.29</td>\n",
-       "      <td>12</td>\n",
+       "      <td>100.54</td>\n",
+       "      <td>101.63</td>\n",
+       "      <td>103.12</td>\n",
+       "      <td>109.20</td>\n",
+       "      <td>9.95</td>\n",
        "      <td>1</td>\n",
-       "      <td>ACTIVE</td>\n",
+       "      <td>6</td>\n",
+       "      <td></td>\n",
        "      <td>False</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>75.66</td>\n",
-       "      <td>78.36</td>\n",
-       "      <td>87.60</td>\n",
-       "      <td>95.17</td>\n",
-       "      <td>13.22</td>\n",
-       "      <td>12</td>\n",
+       "      <td>101.14</td>\n",
+       "      <td>102.05</td>\n",
+       "      <td>106.15</td>\n",
+       "      <td>112.17</td>\n",
+       "      <td>9.89</td>\n",
        "      <td>1</td>\n",
-       "      <td>PASSIVE</td>\n",
+       "      <td>6</td>\n",
+       "      <td>ACTIVE</td>\n",
        "      <td>False</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>76.56</td>\n",
-       "      <td>78.73</td>\n",
-       "      <td>91.02</td>\n",
-       "      <td>101.58</td>\n",
-       "      <td>13.06</td>\n",
+       "      <td>101.53</td>\n",
+       "      <td>102.30</td>\n",
+       "      <td>105.58</td>\n",
+       "      <td>113.53</td>\n",
+       "      <td>9.85</td>\n",
        "      <td>1</td>\n",
        "      <td>12</td>\n",
-       "      <td>PASSIVE</td>\n",
+       "      <td>ACTIVE</td>\n",
        "      <td>False</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>5</th>\n",
-       "      <td>76.71</td>\n",
-       "      <td>78.99</td>\n",
-       "      <td>91.71</td>\n",
-       "      <td>98.70</td>\n",
-       "      <td>13.04</td>\n",
+       "      <td>101.56</td>\n",
+       "      <td>102.54</td>\n",
+       "      <td>104.08</td>\n",
+       "      <td>106.72</td>\n",
+       "      <td>9.85</td>\n",
        "      <td>1</td>\n",
        "      <td>12</td>\n",
-       "      <td>PASSIVE</td>\n",
-       "      <td>True</td>\n",
+       "      <td></td>\n",
+       "      <td>False</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>6</th>\n",
-       "      <td>79.08</td>\n",
-       "      <td>80.87</td>\n",
-       "      <td>85.98</td>\n",
-       "      <td>121.36</td>\n",
-       "      <td>12.65</td>\n",
+       "      <td>102.17</td>\n",
+       "      <td>104.58</td>\n",
+       "      <td>106.50</td>\n",
+       "      <td>111.18</td>\n",
+       "      <td>9.79</td>\n",
        "      <td>1</td>\n",
-       "      <td>12</td>\n",
-       "      <td>ACTIVE</td>\n",
-       "      <td>True</td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "      <td>False</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7</th>\n",
-       "      <td>80.30</td>\n",
-       "      <td>82.26</td>\n",
-       "      <td>96.97</td>\n",
-       "      <td>122.55</td>\n",
-       "      <td>12.45</td>\n",
+       "      <td>105.09</td>\n",
+       "      <td>107.26</td>\n",
+       "      <td>112.70</td>\n",
+       "      <td>123.89</td>\n",
+       "      <td>9.52</td>\n",
        "      <td>1</td>\n",
        "      <td></td>\n",
-       "      <td></td>\n",
-       "      <td>True</td>\n",
+       "      <td>ACTIVE</td>\n",
+       "      <td>False</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>8</th>\n",
-       "      <td>88.43</td>\n",
-       "      <td>91.29</td>\n",
-       "      <td>111.84</td>\n",
-       "      <td>119.99</td>\n",
-       "      <td>11.31</td>\n",
+       "      <td>111.82</td>\n",
+       "      <td>113.28</td>\n",
+       "      <td>116.62</td>\n",
+       "      <td>119.12</td>\n",
+       "      <td>8.94</td>\n",
        "      <td>1</td>\n",
        "      <td>6</td>\n",
        "      <td>PASSIVE</td>\n",
@@ -800,243 +796,75 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>9</th>\n",
-       "      <td>89.03</td>\n",
-       "      <td>92.76</td>\n",
-       "      <td>110.57</td>\n",
-       "      <td>121.06</td>\n",
-       "      <td>11.23</td>\n",
+       "      <td>289.82</td>\n",
+       "      <td>294.33</td>\n",
+       "      <td>300.76</td>\n",
+       "      <td>333.44</td>\n",
+       "      <td>3.45</td>\n",
        "      <td>1</td>\n",
-       "      <td>6</td>\n",
-       "      <td>PASSIVE</td>\n",
-       "      <td>True</td>\n",
+       "      <td>1</td>\n",
+       "      <td>ACTIVE</td>\n",
+       "      <td>False</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>10</th>\n",
-       "      <td>94.60</td>\n",
-       "      <td>94.41</td>\n",
-       "      <td>101.17</td>\n",
-       "      <td>110.39</td>\n",
-       "      <td>10.57</td>\n",
-       "      <td>6</td>\n",
-       "      <td>6</td>\n",
-       "      <td>ACTIVE</td>\n",
-       "      <td>True</td>\n",
+       "      <td>291.53</td>\n",
+       "      <td>298.85</td>\n",
+       "      <td>312.26</td>\n",
+       "      <td>339.63</td>\n",
+       "      <td>3.43</td>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td></td>\n",
+       "      <td>False</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>11</th>\n",
-       "      <td>95.32</td>\n",
-       "      <td>94.32</td>\n",
-       "      <td>102.61</td>\n",
-       "      <td>119.79</td>\n",
-       "      <td>10.49</td>\n",
-       "      <td>6</td>\n",
-       "      <td>6</td>\n",
-       "      <td>ACTIVE</td>\n",
-       "      <td>False</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>12</th>\n",
-       "      <td>100.75</td>\n",
-       "      <td>107.69</td>\n",
-       "      <td>112.40</td>\n",
-       "      <td>119.28</td>\n",
-       "      <td>9.93</td>\n",
+       "      <td>306.75</td>\n",
+       "      <td>314.65</td>\n",
+       "      <td>335.34</td>\n",
+       "      <td>422.32</td>\n",
+       "      <td>3.26</td>\n",
        "      <td>1</td>\n",
-       "      <td>6</td>\n",
-       "      <td>ACTIVE</td>\n",
-       "      <td>True</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>13</th>\n",
-       "      <td>101.89</td>\n",
-       "      <td>108.09</td>\n",
-       "      <td>113.35</td>\n",
-       "      <td>118.92</td>\n",
-       "      <td>9.81</td>\n",
-       "      <td>1</td>\n",
-       "      <td>6</td>\n",
-       "      <td>ACTIVE</td>\n",
-       "      <td>False</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>14</th>\n",
-       "      <td>104.49</td>\n",
-       "      <td>105.83</td>\n",
-       "      <td>107.46</td>\n",
-       "      <td>109.16</td>\n",
-       "      <td>9.57</td>\n",
-       "      <td>6</td>\n",
-       "      <td>1</td>\n",
-       "      <td>ACTIVE</td>\n",
-       "      <td>True</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>15</th>\n",
-       "      <td>104.73</td>\n",
-       "      <td>106.29</td>\n",
-       "      <td>108.70</td>\n",
-       "      <td>109.33</td>\n",
-       "      <td>9.55</td>\n",
-       "      <td>6</td>\n",
        "      <td>1</td>\n",
        "      <td>PASSIVE</td>\n",
        "      <td>False</td>\n",
        "    </tr>\n",
-       "    <tr>\n",
-       "      <th>16</th>\n",
-       "      <td>104.84</td>\n",
-       "      <td>106.26</td>\n",
-       "      <td>107.87</td>\n",
-       "      <td>109.73</td>\n",
-       "      <td>9.54</td>\n",
-       "      <td>6</td>\n",
-       "      <td>1</td>\n",
-       "      <td>PASSIVE</td>\n",
-       "      <td>True</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>17</th>\n",
-       "      <td>105.21</td>\n",
-       "      <td>106.93</td>\n",
-       "      <td>109.11</td>\n",
-       "      <td>110.32</td>\n",
-       "      <td>9.51</td>\n",
-       "      <td>6</td>\n",
-       "      <td>1</td>\n",
-       "      <td>ACTIVE</td>\n",
-       "      <td>False</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>18</th>\n",
-       "      <td>107.31</td>\n",
-       "      <td>108.37</td>\n",
-       "      <td>111.24</td>\n",
-       "      <td>115.96</td>\n",
-       "      <td>9.32</td>\n",
-       "      <td>6</td>\n",
-       "      <td>6</td>\n",
-       "      <td>PASSIVE</td>\n",
-       "      <td>False</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>19</th>\n",
-       "      <td>107.98</td>\n",
-       "      <td>111.01</td>\n",
-       "      <td>119.32</td>\n",
-       "      <td>129.66</td>\n",
-       "      <td>9.26</td>\n",
-       "      <td>1</td>\n",
-       "      <td>12</td>\n",
-       "      <td>ACTIVE</td>\n",
-       "      <td>False</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>20</th>\n",
-       "      <td>108.65</td>\n",
-       "      <td>110.69</td>\n",
-       "      <td>112.22</td>\n",
-       "      <td>113.69</td>\n",
-       "      <td>9.20</td>\n",
-       "      <td>6</td>\n",
-       "      <td>6</td>\n",
-       "      <td>PASSIVE</td>\n",
-       "      <td>True</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>21</th>\n",
-       "      <td>109.86</td>\n",
-       "      <td>110.35</td>\n",
-       "      <td>116.37</td>\n",
-       "      <td>127.00</td>\n",
-       "      <td>9.10</td>\n",
-       "      <td>1</td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "      <td>False</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>22</th>\n",
-       "      <td>111.36</td>\n",
-       "      <td>110.36</td>\n",
-       "      <td>125.15</td>\n",
-       "      <td>157.79</td>\n",
-       "      <td>8.98</td>\n",
-       "      <td>0</td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "      <td>True</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>23</th>\n",
-       "      <td>119.46</td>\n",
-       "      <td>119.96</td>\n",
-       "      <td>135.38</td>\n",
-       "      <td>171.81</td>\n",
-       "      <td>8.37</td>\n",
-       "      <td>0</td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "      <td>False</td>\n",
-       "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
        "    Latency(ms)  Latency_P75  Latency_P90  Latency_P99  Throughput(QPS)  \\\n",
-       "0         74.68        76.74        83.75        89.26            13.39   \n",
-       "1         74.70        76.57        85.86        89.30            13.39   \n",
-       "2         75.23        78.13        86.49        88.99            13.29   \n",
-       "3         75.66        78.36        87.60        95.17            13.22   \n",
-       "4         76.56        78.73        91.02       101.58            13.06   \n",
-       "5         76.71        78.99        91.71        98.70            13.04   \n",
-       "6         79.08        80.87        85.98       121.36            12.65   \n",
-       "7         80.30        82.26        96.97       122.55            12.45   \n",
-       "8         88.43        91.29       111.84       119.99            11.31   \n",
-       "9         89.03        92.76       110.57       121.06            11.23   \n",
-       "10        94.60        94.41       101.17       110.39            10.57   \n",
-       "11        95.32        94.32       102.61       119.79            10.49   \n",
-       "12       100.75       107.69       112.40       119.28             9.93   \n",
-       "13       101.89       108.09       113.35       118.92             9.81   \n",
-       "14       104.49       105.83       107.46       109.16             9.57   \n",
-       "15       104.73       106.29       108.70       109.33             9.55   \n",
-       "16       104.84       106.26       107.87       109.73             9.54   \n",
-       "17       105.21       106.93       109.11       110.32             9.51   \n",
-       "18       107.31       108.37       111.24       115.96             9.32   \n",
-       "19       107.98       111.01       119.32       129.66             9.26   \n",
-       "20       108.65       110.69       112.22       113.69             9.20   \n",
-       "21       109.86       110.35       116.37       127.00             9.10   \n",
-       "22       111.36       110.36       125.15       157.79             8.98   \n",
-       "23       119.46       119.96       135.38       171.81             8.37   \n",
+       "0         84.73        87.19        91.51        95.46            11.80   \n",
+       "1         91.39        94.92       101.16       119.06            10.94   \n",
+       "2        100.54       101.63       103.12       109.20             9.95   \n",
+       "3        101.14       102.05       106.15       112.17             9.89   \n",
+       "4        101.53       102.30       105.58       113.53             9.85   \n",
+       "5        101.56       102.54       104.08       106.72             9.85   \n",
+       "6        102.17       104.58       106.50       111.18             9.79   \n",
+       "7        105.09       107.26       112.70       123.89             9.52   \n",
+       "8        111.82       113.28       116.62       119.12             8.94   \n",
+       "9        289.82       294.33       300.76       333.44             3.45   \n",
+       "10       291.53       298.85       312.26       339.63             3.43   \n",
+       "11       306.75       314.65       335.34       422.32             3.26   \n",
        "\n",
        "    intra_op_num_threads OMP_NUM_THREADS OMP_WAIT_POLICY  contiguous  \n",
-       "0                     12               1         PASSIVE        True  \n",
-       "1                     12               1          ACTIVE        True  \n",
-       "2                     12               1          ACTIVE       False  \n",
-       "3                     12               1         PASSIVE       False  \n",
-       "4                      1              12         PASSIVE       False  \n",
-       "5                      1              12         PASSIVE        True  \n",
-       "6                      1              12          ACTIVE        True  \n",
-       "7                      1                                        True  \n",
+       "0                      1              12         PASSIVE       False  \n",
+       "1                      1                         PASSIVE       False  \n",
+       "2                      1               6                       False  \n",
+       "3                      1               6          ACTIVE       False  \n",
+       "4                      1              12          ACTIVE       False  \n",
+       "5                      1              12                       False  \n",
+       "6                      1                                       False  \n",
+       "7                      1                          ACTIVE       False  \n",
        "8                      1               6         PASSIVE       False  \n",
-       "9                      1               6         PASSIVE        True  \n",
-       "10                     6               6          ACTIVE        True  \n",
-       "11                     6               6          ACTIVE       False  \n",
-       "12                     1               6          ACTIVE        True  \n",
-       "13                     1               6          ACTIVE       False  \n",
-       "14                     6               1          ACTIVE        True  \n",
-       "15                     6               1         PASSIVE       False  \n",
-       "16                     6               1         PASSIVE        True  \n",
-       "17                     6               1          ACTIVE       False  \n",
-       "18                     6               6         PASSIVE       False  \n",
-       "19                     1              12          ACTIVE       False  \n",
-       "20                     6               6         PASSIVE        True  \n",
-       "21                     1                                       False  \n",
-       "22                     0                                        True  \n",
-       "23                     0                                       False  "
+       "9                      1               1          ACTIVE       False  \n",
+       "10                     1               1                       False  \n",
+       "11                     1               1         PASSIVE       False  "
       ]
      },
-     "execution_count": 15,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1048,7 +876,7 @@
     "latest_result_file = max(glob.glob(\"./onnx/perf_results_*.txt\"), key=os.path.getmtime)\n",
     "result_data = pandas.read_table(latest_result_file, converters={'OMP_NUM_THREADS': str, 'OMP_WAIT_POLICY':str})\n",
     "print(latest_result_file)\n",
-    "print(\"The best setting is: {} openmp; {} contiguous array\".format('use' if result_data['intra_op_num_threads'].iloc[0] == 1 else 'NO', 'use' if result_data['contiguous'].iloc[0] else 'NO'))\n",
+    "\n",
     "# Remove some columns that have same values for all rows.\n",
     "columns_to_remove = ['model', 'graph_optimization_level', 'batch_size', 'sequence_length', 'test_cases', 'test_times', 'use_gpu', 'warmup']\n",
     "# Hide some latency percentile columns to fit screen width.\n",
@@ -1065,6 +893,8 @@
     "\n",
     "Note that running Jupyter Notebook has slight impact on performance result since Jupyter Notebook is using system resources like CPU and memory etc. It is recommended to close Jupyter Notebook and other applications, then run the performance test tool in a console to get more accurate performance numbers.\n",
     "\n",
+    "We have a [benchmark script](https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/python/tools/transformers/run_benchmark.sh). It is recommended to use it to compare the inference speed of OnnxRuntime with PyTorch.\n",
+    "\n",
     "[OnnxRuntime C API](https://github.com/microsoft/onnxruntime/blob/master/docs/C_API.md) could get slightly better performance than python API. If you use C API in inference, you can use OnnxRuntime_Perf_Test.exe built from source to measure performance instead.\n",
     "\n",
     "Here is the machine configuration that generated the above results. The machine has GPU but not used in CPU inference.\n",
@@ -1073,7 +903,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 18,
    "metadata": {},
    "outputs": [
     {
@@ -1082,11 +912,11 @@
      "text": [
       "{\n",
       "  \"gpu\": {\n",
-      "    \"driver_version\": \"441.22\",\n",
+      "    \"driver_version\": \"442.23\",\n",
       "    \"devices\": [\n",
       "      {\n",
       "        \"memory_total\": 8589934592,\n",
-      "        \"memory_available\": 6741864448,\n",
+      "        \"memory_available\": 8480882688,\n",
       "        \"name\": \"GeForce GTX 1070\"\n",
       "      }\n",
       "    ]\n",
@@ -1102,16 +932,16 @@
       "  },\n",
       "  \"memory\": {\n",
       "    \"total\": 16971259904,\n",
-      "    \"available\": 2581991424\n",
+      "    \"available\": 6604914688\n",
       "  },\n",
       "  \"python\": \"3.6.10.final.0 (64 bit)\",\n",
       "  \"os\": \"Windows-10-10.0.18362-SP0\",\n",
       "  \"onnxruntime\": {\n",
-      "    \"version\": \"1.2.0\",\n",
+      "    \"version\": \"1.3.0\",\n",
       "    \"support_gpu\": false\n",
       "  },\n",
       "  \"pytorch\": {\n",
-      "    \"version\": \"1.4.0+cpu\",\n",
+      "    \"version\": \"1.5.0+cpu\",\n",
       "    \"support_gpu\": false\n",
       "  },\n",
       "  \"tensorflow\": {\n",
@@ -1121,11 +951,25 @@
       "  }\n",
       "}\n"
      ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2020-06-12 11:50:22.559054: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cudart64_101.dll\n"
+     ]
     }
    ],
    "source": [
-    "%run ./bert_scripts/MachineInfo.py --silent"
+    "!{sys.executable} -m onnxruntime_tools.transformers.machine_info --silent"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
diff --git a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb
index 6ef81a2dbb..23138a44d0 100644
--- a/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb
+++ b/onnxruntime/python/tools/transformers/notebooks/PyTorch_Bert-Squad_OnnxRuntime_GPU.ipynb
@@ -21,7 +21,9 @@
    "source": [
     "In this tutorial, you'll be introduced to how to load a Bert model from PyTorch, convert it to ONNX, and inference it for high performance using ONNX Runtime and NVIDIA GPU. In the following sections, we are going to use the Bert model trained with Stanford Question Answering Dataset (SQuAD) dataset as an example. Bert SQuAD model is used in question answering scenarios, where the answer to every question is a segment of text, or span, from the corresponding reading passage, or the question might be unanswerable.\n",
     "\n",
-    "This notebook is for GPU inference. For CPU inference, please look at another notebook [Inference PyTorch Bert Model with ONNX Runtime on CPU](PyTorch_Bert-Squad_OnnxRuntime_CPU.ipynb)."
+    "This notebook is for GPU inference. For CPU inference, please look at another notebook [Inference PyTorch Bert Model with ONNX Runtime on CPU](PyTorch_Bert-Squad_OnnxRuntime_CPU.ipynb).\n",
+    "\n",
+    "Note that you might need to change !{sys.executable} to !python when running the notebook on Linux."
    ]
   },
   {
@@ -33,15 +35,16 @@
     "\n",
     "#### GPU Environment Setup using AnaConda\n",
     "\n",
-    "First, we install [AnaConda](https://www.anaconda.com/distribution/) in a target machine and open an AnaConda prompt window when it is done. Then run the following commands to create a conda environment. This notebook is tested with PyTorch 1.4 and OnnxRuntime 1.2.0.\n",
+    "First, we install [AnaConda](https://www.anaconda.com/distribution/) in a target machine and open an AnaConda prompt window when it is done. Then run the following commands to create a conda environment. This notebook was run with PyTorch 1.4 and OnnxRuntime 1.2.0. (We also verified it with PyTorch 1.5 and OnnxRuntime 1.3.0).\n",
     "\n",
     "```console\n",
     "conda create -n gpu_env python=3.6\n",
     "conda activate gpu_env\n",
     "conda install pytorch torchvision cudatoolkit=10.1 -c pytorch\n",
     "pip install onnxruntime-gpu\n",
-    "pip install transformers==2.5.1\n",
-    "pip install wget psutil onnx pytz pandas py-cpuinfo py3nvml netron\n",
+    "pip install transformers==2.11.0\n",
+    "pip install onnxruntime-tools\n",
+    "pip install wget netron\n",
     "conda install jupyter\n",
     "jupyter notebook\n",
     "```\n",
@@ -390,11 +393,11 @@
     "latency = []\n",
     "for i in range(total_samples):\n",
     "    data = dataset[i]\n",
-    "    # Use contiguous array as input might improve performance\n",
+    "    # TODO: use IO Binding (see https://github.com/microsoft/onnxruntime/pull/4206) to improve performance.\n",
     "    ort_inputs = {\n",
-    "        'input_ids':  numpy.ascontiguousarray(data[0].cpu().reshape(1, max_seq_length).numpy()),\n",
-    "        'input_mask': numpy.ascontiguousarray(data[1].cpu().reshape(1, max_seq_length).numpy()),\n",
-    "        'segment_ids': numpy.ascontiguousarray(data[2].cpu().reshape(1, max_seq_length).numpy())\n",
+    "        'input_ids':  data[0].cpu().reshape(1, max_seq_length).numpy(),\n",
+    "        'input_mask': data[1].cpu().reshape(1, max_seq_length).numpy(),\n",
+    "        'segment_ids': data[2].cpu().reshape(1, max_seq_length).numpy()\n",
     "    }\n",
     "    start = time.time()\n",
     "    ort_outputs = session.run(None, ort_inputs)\n",
@@ -544,9 +547,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Let's compare the output and see whether the results are close.\n",
-    "\n",
-    "**Note**: Need end-to-end evaluation on performance and accuracy if you use this strategy."
+    "Let's compare the output and see whether the results are close."
    ]
   },
   {
@@ -576,69 +577,35 @@
    "source": [
     "## 5. Offline Optimization and Test Tools\n",
     "\n",
-    "It is recommended to download the [OnnxRuntime Python Tools for BERT](https://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers), and try them on the exported ONNX models. It could help verify whether the model is fully optimized, and get performance test results.\n",
-    "\n",
-    "### Download OnnxRuntime Python Tools for Bert\n",
-    "You may copy the whole [directory](https://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers) to a sub-directory named bert_scripts for this notebook. The list of script files might need update if import error happens when you run some script."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "100% [..............................................................................] 15310 / 15310Downloaded bert_perf_test.py\n",
-      "100% [................................................................................] 9571 / 9571Downloaded bert_test_data.py\n",
-      "100% [................................................................................] 7272 / 7272Downloaded compare_bert_results.py\n",
-      "100% [..............................................................................] 44905 / 44905Downloaded BertOnnxModel.py\n",
-      "100% [..............................................................................] 21565 / 21565Downloaded BertOnnxModelKeras.py\n",
-      "100% [..............................................................................] 26114 / 26114Downloaded BertOnnxModelTF.py\n",
-      "100% [..............................................................................] 22773 / 22773Downloaded OnnxModel.py\n",
-      "100% [................................................................................] 7795 / 7795Downloaded optimizer.py\n",
-      "100% [................................................................................] 5885 / 5885Downloaded MachineInfo.py\n"
-     ]
-    }
-   ],
-   "source": [
-    "import os\n",
-    "import wget\n",
-    "\n",
-    "url_prfix = \"https://raw.githubusercontent.com/microsoft/onnxruntime/master/onnxruntime/python/tools/transformers/\"\n",
-    "script_files = ['bert_perf_test.py', 'bert_test_data.py', 'compare_bert_results.py', 'BertOnnxModel.py', 'BertOnnxModelKeras.py', 'BertOnnxModelTF.py', 'Gpt2OnnxModel.py', 'OnnxModel.py', 'optimizer.py', 'MachineInfo.py']\n",
-    "\n",
-    "script_dir = './bert_scripts'\n",
-    "if not os.path.exists(script_dir):\n",
-    "    os.makedirs(script_dir)\n",
-    "\n",
-    "for filename in script_files:\n",
-    "    target_file = os.path.join(script_dir, filename)\n",
-    "    if enable_overwrite and os.path.exists(target_file):\n",
-    "        os.remove(target_file)\n",
-    "    if not os.path.exists(target_file):\n",
-    "        wget.download(url_prfix + filename, target_file)\n",
-    "        print(\"Downloaded\", filename)"
+    "It is recommended to try [OnnxRuntime Transformer Model Optimization Tool](https://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers) on the exported ONNX models. It could help verify whether the model can be fully optimized, and get performance test results."
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "### BERT Optimization Script\n",
+    "#### Transformer Optimizer\n",
     "\n",
-    "Sometime, some optimization of OnnxRuntime cannot be applied to a Bert model due to different reasons:\n",
-    "* A new subgraph pattern is exported, which is not covered by the onnxruntime version users are using. For example, Gelu from PyTorch 1.4 is not fused by OnnxRuntime 1.1.2 (Note: it is covered in OnnxRuntime v1.2.0).\n",
-    "* The exported model uses dynamic axis. That impacts shape inference. Without enough shape information, some optimization cannot be applied due to the constraint on the input shape.\n",
-    "* Some optimization are not supported by OnnxRuntime, but it is feasible in offline script. Like changing input tensor type from int64 to int32 to avoid extra Cast nodes, or converting model to float16 to achieve better performance in V100 or T4 GPU.\n",
+    "Although OnnxRuntime can optimize a Bert model exported by PyTorch, sometimes the model cannot be fully optimized, for several reasons:\n",
+    "* A new subgraph pattern is generated by a new version of the export tool, and the pattern is not covered by an older version of OnnxRuntime.\n",
+    "* The exported model uses dynamic axes, which makes shape inference of the graph harder. That blocks some optimizations from being applied.\n",
+    "* Some optimizations are better done offline, like changing the input tensor type from int64 to int32 to avoid extra Cast nodes, or converting the model to float16 to achieve better performance on V100 or T4 GPUs.\n",
     "\n",
-    "We have python script **optimizer.py**, which is flexible in graph pattern matching and model conversions to tackle these problems.\n",
+    "We have a Python script, **optimizer.py**, which is more flexible in graph pattern matching and model conversion (like float32 to float16). You can also use it to verify whether a Bert model is fully optimized.\n",
     "\n",
-    "In below example, we can see that the tool provide an extra optimization - SkipLayerNormalization and bias (Add) are not fused in OnnxRuntime due to shape inference.\n",
+    "In this example, we can see that it introduces an optimization that is not provided by OnnxRuntime: the fusion of SkipLayerNormalization and bias (Add), which OnnxRuntime does not fuse because of the shape inference limitation mentioned above.\n",
     "\n",
-    "The tool will tell whether a model is fully optimized or not. If not, that means you might need change the script to handle some new subgraph patern."
+    "It will also tell whether the model is fully optimized or not. If not, that means you might need to change the script to fuse a new subgraph pattern.\n",
+    "\n",
+    "Example Usage:\n",
+    "```\n",
+    "from onnxruntime_tools import optimizer\n",
+    "optimized_model = optimizer.optimize_model(export_model_path, model_type='bert', num_heads=12, hidden_size=768, use_gpu=True)\n",
+    "optimized_model.convert_model_float32_to_float16()\n",
+    "optimized_model.save_model_to_file(optimized_model_path)\n",
+    "```\n",
+    "\n",
+    "You can also use optimizer_cli as follows."
    ]
   },
   {
@@ -677,13 +644,15 @@
    ],
    "source": [
     "optimized_fp32_model_path = './onnx/bert-base-cased-squad_opt_{}_fp32.onnx'.format('gpu' if use_gpu else 'cpu')\n",
-    "%run ./bert_scripts/optimizer.py --input $export_model_path --output $optimized_fp32_model_path --input_int32"
+    "\n",
+    "!{sys.executable} -m onnxruntime_tools.optimizer_cli --input $export_model_path --output $optimized_fp32_model_path --input_int32"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
+    "Note: you might need to change !{sys.executable} to !python when running the notebook on Linux.\n",
     "#### Optimized Graph\n",
     "We can open the optimized model using [Netron](https://github.com/lutzroeder/netron) to visualize.\n",
     "\n",
@@ -702,7 +671,7 @@
     "import netron\n",
     "\n",
     "# change it to True if want to view the optimized model in browser\n",
-    "enable_netron = False\n",
+    "enable_netron = True\n",
     "if enable_netron:\n",
     "    # If you encounter error \"access a socket in a way forbidden by its access permissions\", install Netron as standalone application instead.\n",
     "    netron.start(optimized_fp32_model_path)"
@@ -739,7 +708,7 @@
    "source": [
     "GPU_OPTION = '--use_gpu' if use_gpu else ''\n",
     "\n",
-    "%run ./bert_scripts/bert_perf_test.py --model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 1000 --test_times 1 --inclusive --all $GPU_OPTION"
+    "!{sys.executable} -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 1000 --test_times 1 --inclusive --all $GPU_OPTION"
    ]
   },
   {
@@ -1108,7 +1077,7 @@
     }
    ],
    "source": [
-    "%run ./bert_scripts/compare_bert_results.py --baseline_model $export_model_path --optimized_model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 100 --rtol 0.01 --atol 0.01 $GPU_OPTION"
+    "!{sys.executable} -m onnxruntime_tools.transformers.compare_bert_results --baseline_model $export_model_path --optimized_model $optimized_fp32_model_path --batch_size 1 --sequence_length 128 --samples 100 --rtol 0.01 --atol 0.01 $GPU_OPTION"
    ]
   },
   {
@@ -1148,7 +1117,7 @@
    ],
    "source": [
     "optimized_fp16_model_path = './onnx/bert-base-cased-squad_opt_{}_fp16.onnx'.format('gpu' if use_gpu else 'cpu')\n",
-    "%run  ./bert_scripts/optimizer.py --input $export_model_path --output $optimized_fp16_model_path --float16 --input_int32"
+    "!{sys.executable} -m onnxruntime_tools.optimizer_cli --input $export_model_path --output $optimized_fp16_model_path --float16 --input_int32"
    ]
   },
   {
@@ -1168,7 +1137,7 @@
    ],
    "source": [
     "GPU_OPTION = '--use_gpu' if use_gpu else ''\n",
-    "%run ./bert_scripts/bert_perf_test.py --model $optimized_fp16_model_path --batch_size 1 --sequence_length 128 --samples 1000 --test_times 1 --inclusive --all $GPU_OPTION"
+    "!{sys.executable} -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_fp16_model_path --batch_size 1 --sequence_length 128 --samples 1000 --test_times 1 --inclusive --all $GPU_OPTION"
    ]
   },
   {
@@ -1534,7 +1503,7 @@
    ],
    "source": [
     "GPU_OPTION = '--use_gpu' if use_gpu else ''\n",
-    "%run ./bert_scripts/bert_perf_test.py --model $optimized_fp16_model_path --batch_size 1 2 4 8 16 32 64 --sequence_length 128 --samples 1000 --test_times 1 --inclusive $GPU_OPTION"
+    "!{sys.executable} -m onnxruntime_tools.transformers.bert_perf_test --model $optimized_fp16_model_path --batch_size 1 2 4 8 16 32 64 --sequence_length 128 --samples 1000 --test_times 1 --inclusive $GPU_OPTION"
    ]
   },
   {
@@ -1715,6 +1684,8 @@
     "\n",
     "Note that running Jupyter Notebook has slight impact on performance result since Jupyter Notebook is using system resources like CPU etc. You can close Jupyter Notebook and other applications, then run the performance test in a console to get more accurate performance numbers.\n",
     "\n",
+    "We have a [benchmark script](https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/python/tools/transformers/run_benchmark.sh). It is recommended to use it to compare the inference speed of OnnxRuntime with PyTorch.\n",
+    "\n",
     "[OnnxRuntime C API](https://github.com/microsoft/onnxruntime/blob/master/docs/C_API.md) could get slightly better performance than python API. If you use C API in inference, you can use OnnxRuntime_Perf_Test.exe built from source to measure performance instead.\n",
     "\n",
     "Here is the machine configuration that generated the above results. You might get slower or faster result according to your hardware."
@@ -1771,7 +1742,7 @@
     }
    ],
    "source": [
-    "%run ./bert_scripts/MachineInfo.py --silent"
+    "!{sys.executable} -m onnxruntime_tools.transformers.machine_info --silent"
    ]
   }
  ],
diff --git a/onnxruntime/python/tools/transformers/notebooks/Tensorflow_Keras_Bert-Squad_OnnxRuntime_CPU.ipynb b/onnxruntime/python/tools/transformers/notebooks/Tensorflow_Keras_Bert-Squad_OnnxRuntime_CPU.ipynb
index 76b9d45b90..d9d0979744 100644
--- a/onnxruntime/python/tools/transformers/notebooks/Tensorflow_Keras_Bert-Squad_OnnxRuntime_CPU.ipynb
+++ b/onnxruntime/python/tools/transformers/notebooks/Tensorflow_Keras_Bert-Squad_OnnxRuntime_CPU.ipynb
@@ -113,7 +113,7 @@
       "        os.makedirs(directory)\n",
       "\n",
       "# Download scripts for BERT optimization.\n",
-      "url_prfix = \"https://raw.githubusercontent.com/microsoft/onnxruntime/master/onnxruntime/python/tools/transformers/\"\n",
+      "url_prfix = \"https://raw.githubusercontent.com/microsoft/onnxruntime/rel-1.3.0/onnxruntime/python/tools/bert/\"\n",
       "script_files = ['bert_perf_test.py', 'bert_test_data.py', 'compare_bert_results.py', 'BertOnnxModel.py', 'BertOnnxModelKeras.py', 'BertOnnxModelTF.py', 'Gpt2OnnxModel.py', 'OnnxModel.py', 'optimizer.py']\n",
       "\n",
       "for filename in script_files:\n",
diff --git a/onnxruntime/python/tools/transformers/OnnxModel.py b/onnxruntime/python/tools/transformers/onnx_model.py
similarity index 100%
rename from onnxruntime/python/tools/transformers/OnnxModel.py
rename to onnxruntime/python/tools/transformers/onnx_model.py
diff --git a/onnxruntime/python/tools/transformers/BertOnnxModel.py b/onnxruntime/python/tools/transformers/onnx_model_bert.py
similarity index 98%
rename from onnxruntime/python/tools/transformers/BertOnnxModel.py
rename to onnxruntime/python/tools/transformers/onnx_model_bert.py
index 1dcaad65f2..5a002298a1 100644
--- a/onnxruntime/python/tools/transformers/BertOnnxModel.py
+++ b/onnxruntime/python/tools/transformers/onnx_model_bert.py
@@ -5,7 +5,7 @@
 
 from logging import getLogger
 from onnx import TensorProto, helper
-from OnnxModel import OnnxModel
+from onnx_model import OnnxModel
 from fusion_reshape import FusionReshape
 from fusion_layernorm import FusionLayerNormalization, FusionLayerNormalizationTF
 from fusion_skiplayernorm import FusionSkipLayerNormalization, FusionBiasSkipLayerNormalization
@@ -15,6 +15,7 @@ from fusion_gelu import FusionGelu
 from fusion_fastgelu import FusionFastGelu
 from fusion_biasgelu import FusionBiasGelu
 from fusion_gelu_approximation import FusionGeluApproximation
+from fusion_utils import FusionUtils
 
 logger = getLogger(__name__)
 
@@ -126,9 +127,10 @@ class BertOnnxModel(OnnxModel):
         new_graph_inputs = []
 
         bert_inputs = self.get_bert_inputs()
+        utils = FusionUtils(self)
         for input in graph.input:
             if input.name in bert_inputs:
-                self.remove_cast_int32(input.name)
+                utils.remove_cast_int32(input.name)
                 input_shape = [
                     batch_size if isinstance(batch_size, int) else 1,
                     sequence_length if isinstance(sequence_length, int) else 128
diff --git a/onnxruntime/python/tools/transformers/BertOnnxModelKeras.py b/onnxruntime/python/tools/transformers/onnx_model_bert_keras.py
similarity index 99%
rename from onnxruntime/python/tools/transformers/BertOnnxModelKeras.py
rename to onnxruntime/python/tools/transformers/onnx_model_bert_keras.py
index e672e1a3fe..97180ec64c 100644
--- a/onnxruntime/python/tools/transformers/BertOnnxModelKeras.py
+++ b/onnxruntime/python/tools/transformers/onnx_model_bert_keras.py
@@ -10,7 +10,7 @@ import argparse
 import numpy as np
 from collections import deque
 from onnx import ModelProto, TensorProto, numpy_helper
-from BertOnnxModelTF import BertOnnxModelTF
+from onnx_model_bert_tf import BertOnnxModelTF
 logger = logging.getLogger(__name__)
 
 
diff --git a/onnxruntime/python/tools/transformers/BertOnnxModelTF.py b/onnxruntime/python/tools/transformers/onnx_model_bert_tf.py
similarity index 99%
rename from onnxruntime/python/tools/transformers/BertOnnxModelTF.py
rename to onnxruntime/python/tools/transformers/onnx_model_bert_tf.py
index 77e2ec88c6..a13d72485e 100644
--- a/onnxruntime/python/tools/transformers/BertOnnxModelTF.py
+++ b/onnxruntime/python/tools/transformers/onnx_model_bert_tf.py
@@ -10,7 +10,7 @@ import argparse
 import numpy as np
 from collections import deque
 from onnx import ModelProto, TensorProto, numpy_helper
-from BertOnnxModel import BertOnnxModel
+from onnx_model_bert import BertOnnxModel
 
 logger = logging.getLogger(__name__)
 
diff --git a/onnxruntime/python/tools/transformers/Gpt2OnnxModel.py b/onnxruntime/python/tools/transformers/onnx_model_gpt2.py
similarity index 98%
rename from onnxruntime/python/tools/transformers/Gpt2OnnxModel.py
rename to onnxruntime/python/tools/transformers/onnx_model_gpt2.py
index 02824451bf..cc7bd3d6e0 100644
--- a/onnxruntime/python/tools/transformers/Gpt2OnnxModel.py
+++ b/onnxruntime/python/tools/transformers/onnx_model_gpt2.py
@@ -9,7 +9,7 @@ import argparse
 import numpy as np
 from collections import deque
 from onnx import ModelProto, TensorProto, numpy_helper
-from BertOnnxModel import BertOnnxModel
+from onnx_model_bert import BertOnnxModel
 from fusion_gpt_attention_no_past import FusionGptAttentionNoPast
 from fusion_gpt_attention import FusionGptAttention
 
diff --git a/onnxruntime/python/tools/transformers/optimizer.py b/onnxruntime/python/tools/transformers/optimizer.py
index c4f1e0c07c..e211593aba 100644
--- a/onnxruntime/python/tools/transformers/optimizer.py
+++ b/onnxruntime/python/tools/transformers/optimizer.py
@@ -27,12 +27,12 @@ import numpy as np
 from typing import Dict
 from collections import deque
 from onnx import ModelProto, TensorProto, numpy_helper, load_model
-from BertOnnxModel import BertOnnxModel, BertOptimizationOptions
-from BertOnnxModelTF import BertOnnxModelTF
-from BertOnnxModelKeras import BertOnnxModelKeras
-from Gpt2OnnxModel import Gpt2OnnxModel
+from onnx_model_bert import BertOnnxModel, BertOptimizationOptions
+from onnx_model_bert_tf import BertOnnxModelTF
+from onnx_model_bert_keras import BertOnnxModelKeras
+from onnx_model_gpt2 import Gpt2OnnxModel
 
-logger = logging.getLogger('')
+logger = logging.getLogger(__name__)
 
 # Map model type to tuple: optimizer class, export tools (pytorch, tf2onnx, keras2onnx) and whether OnnxRuntime has the optimization.
 MODEL_CLASSES = {
diff --git a/onnxruntime/python/tools/transformers/ShapeOptimizer.py b/onnxruntime/python/tools/transformers/shape_optimizer.py
similarity index 99%
rename from onnxruntime/python/tools/transformers/ShapeOptimizer.py
rename to onnxruntime/python/tools/transformers/shape_optimizer.py
index 8f2f7a1ba4..a58b8d49ba 100644
--- a/onnxruntime/python/tools/transformers/ShapeOptimizer.py
+++ b/onnxruntime/python/tools/transformers/shape_optimizer.py
@@ -21,7 +21,7 @@ from datetime import datetime
 from pathlib import Path
 from onnx import ModelProto, TensorProto, numpy_helper
 import onnxruntime
-from OnnxModel import OnnxModel
+from onnx_model import OnnxModel
 
 logger = logging.getLogger(__name__)
 
@@ -34,7 +34,6 @@ class BertOnnxModelShapeOptimizer(OnnxModel):
     This optimizer will replace Shape output or the shape input of Reshape node by initializer. Currently, it requires
     model inputs to have static shape.
     """
-
     def __init__(self, onnx_model):
         super().__init__(onnx_model.model)
 
diff --git a/onnxruntime/python/tools/transformers/test_data/bert_squad_tensorflow2.1_keras2onnx_opset11/generate_tiny_keras2onnx_bert_models.py b/onnxruntime/python/tools/transformers/test_data/bert_squad_tensorflow2.1_keras2onnx_opset11/generate_tiny_keras2onnx_bert_models.py
index 5ece39b234..9b8b14dfd8 100644
--- a/onnxruntime/python/tools/transformers/test_data/bert_squad_tensorflow2.1_keras2onnx_opset11/generate_tiny_keras2onnx_bert_models.py
+++ b/onnxruntime/python/tools/transformers/test_data/bert_squad_tensorflow2.1_keras2onnx_opset11/generate_tiny_keras2onnx_bert_models.py
@@ -29,7 +29,7 @@ import sys
 import argparse
 import numpy as np
 from onnx import ModelProto, TensorProto, numpy_helper
-from optimizer import OnnxModel
+from onnxruntime_tools.transformers.onnx_model import OnnxModel
 import os
 import onnxruntime
 import random
diff --git a/onnxruntime/python/tools/transformers/test_data/gpt2_pytorch1.4_opset11_no_past/generate_tiny_gpt2_model.py b/onnxruntime/python/tools/transformers/test_data/gpt2_pytorch1.4_opset11_no_past/generate_tiny_gpt2_model.py
index b122136d00..22efde1c2a 100644
--- a/onnxruntime/python/tools/transformers/test_data/gpt2_pytorch1.4_opset11_no_past/generate_tiny_gpt2_model.py
+++ b/onnxruntime/python/tools/transformers/test_data/gpt2_pytorch1.4_opset11_no_past/generate_tiny_gpt2_model.py
@@ -11,7 +11,7 @@ import sys
 import argparse
 import numpy as np
 from onnx import ModelProto, TensorProto, numpy_helper
-from OnnxModel import OnnxModel
+from onnxruntime_tools.transformers.onnx_model import OnnxModel
 import os
 import onnxruntime
 import random
diff --git a/onnxruntime/python/tools/transformers/test_data/gpt2_pytorch1.5_opset11/generate_tiny_gpt2_model.py b/onnxruntime/python/tools/transformers/test_data/gpt2_pytorch1.5_opset11/generate_tiny_gpt2_model.py
index 3ee3147235..03267f614d 100644
--- a/onnxruntime/python/tools/transformers/test_data/gpt2_pytorch1.5_opset11/generate_tiny_gpt2_model.py
+++ b/onnxruntime/python/tools/transformers/test_data/gpt2_pytorch1.5_opset11/generate_tiny_gpt2_model.py
@@ -11,7 +11,7 @@ import sys
 import argparse
 import numpy as np
 from onnx import ModelProto, TensorProto, numpy_helper
-from OnnxModel import OnnxModel
+from onnxruntime_tools.transformers.onnx_model import OnnxModel
 import os
 import onnxruntime
 import random
diff --git a/onnxruntime/python/tools/transformers/test_optimizer.py b/onnxruntime/python/tools/transformers/test_optimizer.py
index 0d1288380f..f886d70ea7 100644
--- a/onnxruntime/python/tools/transformers/test_optimizer.py
+++ b/onnxruntime/python/tools/transformers/test_optimizer.py
@@ -18,7 +18,7 @@ from onnx.helper import make_node, make_tensor_value_info
 import numpy as np
 from onnx import numpy_helper
 from optimizer import optimize_model, optimize_by_onnxruntime
-from OnnxModel import OnnxModel
+from onnx_model import OnnxModel
 
 BERT_TEST_MODELS = {
     "bert_pytorch_0": ('bert_squad_pytorch1.4_opset11', 'BertForQuestionAnswering_0.onnx'),

From 63bf587623f75cd4299f569bf24290a1628a934b Mon Sep 17 00:00:00 2001
From: edgchen1 <18449977+edgchen1@users.noreply.github.com>
Date: Tue, 16 Jun 2020 10:14:34 -0700
Subject: [PATCH 2/5] Use azcopy to download test data (#4221)

Use azcopy from download_e2e_test_data.py, add helper function for downloading azcopy.
Update download_test_data.py to use helper function.
---
 .../tools/ci_test/download_e2e_test_data.py   | 24 +++---
 .../linux-ort-srv-nightly-pipeline.yml        |  8 --
 .../mac-set-variables-and-download.yml        |  8 --
 .../templates/win-download-test-data.yml      |  2 +-
 tools/ci_build/github/download_test_data.py   | 50 ++++--------
 tools/python/get_azcopy.py                    | 78 +++++++++++++++++++
 6 files changed, 111 insertions(+), 59 deletions(-)
 create mode 100644 tools/python/get_azcopy.py

diff --git a/orttraining/tools/ci_test/download_e2e_test_data.py b/orttraining/tools/ci_test/download_e2e_test_data.py
index 39eb2b7057..0978145ebf 100755
--- a/orttraining/tools/ci_test/download_e2e_test_data.py
+++ b/orttraining/tools/ci_test/download_e2e_test_data.py
@@ -5,17 +5,26 @@
 import argparse
 import hashlib
 import os
+import shutil
+import subprocess
 import sys
 import tempfile
 import urllib.request
 import zipfile
 
+SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
+REPO_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "..", "..", ".."))
+
+sys.path.append(os.path.join(REPO_DIR, "tools", "python"))
+
+import get_azcopy  # noqa: E402
+
 # update these if the E2E test data changes
 ARCHIVE_BLOB_URL = "https://onnxruntimetestdata.blob.core.windows.net/training/onnxruntime_training_data.zip?snapshot=2020-06-15T23:17:35.8314853Z"
 ARCHIVE_SHA256_DIGEST = "B01C169B6550D1A0A6F1B4E2F34AE2A8714B52DBB70AC04DA85D371F691BDFF9"
 
-def _download(url, local_path):
-  urllib.request.urlretrieve(url, local_path)
+def _download(azcopy_path, url, local_path):
+  subprocess.run([azcopy_path, "cp", "--log-level", "NONE", url, local_path], check=True)
 
 def _get_sha256_digest(file_path):
   alg = hashlib.sha256()
@@ -36,22 +45,19 @@ def _check_file_sha256_digest(path, expected_digest):
     raise RuntimeError(
         "SHA256 digest mismatch, expected: {}, actual: {}".format(expected_digest.lower(), actual_digest.lower()))
 
-def _extract_archive(archive_path, target_dir):
-  with zipfile.ZipFile(archive_path) as archive:
-    archive.extractall(target_dir)
-
 def main():
   parser = argparse.ArgumentParser(description="Downloads training end-to-end test data.")
   parser.add_argument("target_dir", help="The test data destination directory.")
   args = parser.parse_args()
 
-  with tempfile.TemporaryDirectory() as temp_dir:
+  with tempfile.TemporaryDirectory() as temp_dir, \
+       get_azcopy.get_azcopy() as azcopy_path:
     archive_path = os.path.join(temp_dir, "archive.zip")
     print("Downloading E2E test data from '{}'...".format(ARCHIVE_BLOB_URL))
-    _download(ARCHIVE_BLOB_URL, archive_path)
+    _download(azcopy_path, ARCHIVE_BLOB_URL, archive_path)
     _check_file_sha256_digest(archive_path, ARCHIVE_SHA256_DIGEST)
     print("Extracting to '{}'...".format(args.target_dir))
-    _extract_archive(archive_path, args.target_dir)
+    shutil.unpack_archive(archive_path, args.target_dir)
     print("Done.")
 
 if __name__ == "__main__":
diff --git a/tools/ci_build/github/azure-pipelines/linux-ort-srv-nightly-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-ort-srv-nightly-pipeline.yml
index 7fbd8e56c4..b6de177541 100644
--- a/tools/ci_build/github/azure-pipelines/linux-ort-srv-nightly-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-ort-srv-nightly-pipeline.yml
@@ -14,14 +14,6 @@ jobs:
       continueOnError: true
       condition: always()
 
-    - task: CmdLine@2
-      displayName: 'Download azcopy'
-      inputs:
-        script: |
-          curl -so azcopy.tar.gz -L 'https://aka.ms/downloadazcopy-v10-linux'
-          tar -zxvf azcopy.tar.gz --strip 1
-        workingDirectory: $(Build.BinariesDirectory)
-
     - task: PythonScript@0
       displayName: 'Download test data'
       inputs:
diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/mac-set-variables-and-download.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/mac-set-variables-and-download.yml
index 20127f44d7..d689be215e 100644
--- a/tools/ci_build/github/azure-pipelines/nuget/templates/mac-set-variables-and-download.yml
+++ b/tools/ci_build/github/azure-pipelines/nuget/templates/mac-set-variables-and-download.yml
@@ -1,12 +1,4 @@
 steps:
-- task: CmdLine@2
-  displayName: 'Download azcopy'
-  inputs:
-    script: |
-      curl -so azcopy.tar.gz -L 'https://aka.ms/downloadazcopy-v10-mac'
-      tar -zxvf azcopy.tar.gz --strip 1
-    workingDirectory: $(Build.BinariesDirectory)
-
 - task: PythonScript@0
   displayName: 'Download test data'
   inputs:
diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/win-download-test-data.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/win-download-test-data.yml
index dbaf52ef28..4ba4595458 100644
--- a/tools/ci_build/github/azure-pipelines/nuget/templates/win-download-test-data.yml
+++ b/tools/ci_build/github/azure-pipelines/nuget/templates/win-download-test-data.yml
@@ -1,4 +1,4 @@
-# Assumes AZCopy and Python download is already done
+# Assumes Python download is already done
 steps:
 - task: PythonScript@0
   displayName: 'Download test data'
diff --git a/tools/ci_build/github/download_test_data.py b/tools/ci_build/github/download_test_data.py
index 28e8340a14..0be773b51e 100755
--- a/tools/ci_build/github/download_test_data.py
+++ b/tools/ci_build/github/download_test_data.py
@@ -10,6 +10,13 @@ from urllib.parse import urlparse
 from urllib.parse import urljoin
 from urllib.parse import urlsplit
 
+SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
+REPO_DIR = os.path.normpath(os.path.join(SCRIPT_DIR, "..", "..", ".."))
+sys.path.append(os.path.join(REPO_DIR, "tools", "python"))
+
+from get_azcopy import get_azcopy  # noqa: E402
+
+
 # Hardcoded map of storage account to azure region endpoint
 storage_account_to_endpoint_map = {
     'onnxruntimetestdata.blob.core.windows.net': {
@@ -47,7 +54,7 @@ def get_azure_region():
 
 def parse_arguments():
     parser = argparse.ArgumentParser(description="ONNXRuntime Data Downloader.")
-    parser.add_argument("--test_data_url", help="Test data URL.")
+    parser.add_argument("--test_data_url", required=True, help="Test data URL.")
     parser.add_argument("--azure_region", help="Azure region")
     parser.add_argument("--build_dir", required=True, help="Path to the build directory.")
     parser.add_argument("--edge_device", action="store_true", help="Edge device with limit disk space.")
@@ -80,7 +87,7 @@ def get_region_based_url(url, azure_location):
     return url
 
 
-def download_and_unzip(build_dir, url, dest_folder, use_token=True):
+def download_and_unzip(azcopy_path, build_dir, url, dest_folder, use_token=True):
     dest_folder = os.path.join(build_dir, dest_folder)
     # attach the SAS token to the url. Note DO NOT print the url with the token in any logs.
     token = os.environ.get('Test_Data_Download_Key')
@@ -90,14 +97,13 @@ def download_and_unzip(build_dir, url, dest_folder, use_token=True):
         url_with_token = url
 
     # Download data using AZCopy tool
-    # Our linux CI build machine has azcopy in /usr/bin but the version is too old
-    azcopy_exe = \
-        'azcopy.exe' if sys.platform.startswith("win") and shutil.which('azcopy') else os.path.join(build_dir, 'azcopy')
     try:
-        subprocess.run([azcopy_exe, 'cp', '--log-level', 'ERROR', '--recursive', url_with_token, build_dir], check=True)
+        subprocess.run(
+            [azcopy_path, 'cp', '--log-level', 'ERROR', '--recursive', url_with_token, build_dir],
+            check=True)
     except Exception as e:
         print(e)
-        print(azcopy_exe)
+        print(azcopy_path)
         raise Exception("Downloading data failed. Source: " + url + " Destination: " + build_dir)
 
     os.makedirs(dest_folder, exist_ok=True)
@@ -116,29 +122,6 @@ def download_and_unzip(build_dir, url, dest_folder, use_token=True):
     os.unlink(local_file_name)
 
 
-def download_additional_data(build_dir, azure_region):
-    additional_data_url = 'https://onnxruntimetestdata.blob.core.windows.net/models/'
-    # url = get_region_based_url(args.test_data_url, azure_region)
-    if not shutil.which('cmake'):
-        cmake_url = urljoin(additional_data_url, 'cmake-3.15.1-win64-x64.zip')
-        print("Starting download for cmake : " + cmake_url)
-        download_and_unzip(build_dir, cmake_url, 'cmake_temp', False)
-        dest_dir = os.path.join(build_dir, 'cmake')
-        if os.path.exists(dest_dir):
-            print('deleting %s' % dest_dir)
-            shutil.rmtree(dest_dir)
-        shutil.move(os.path.join(build_dir, 'cmake_temp', 'cmake-3.15.1-win64-x64'), dest_dir)
-
-    # Download OpenCPPCoverageSetup.exe
-    opencpp_url = urljoin(additional_data_url, 'OpenCppCoverageSetup-x64-0.9.7.0.exe')
-    print("Starting download for opencppcoverage " + opencpp_url)
-    dest_folder = os.path.join(build_dir, 'installer', 'opencppcoverage')
-    os.makedirs(dest_folder, exist_ok=True)
-    azcopy_exe = 'azcopy.exe' if shutil.which('azcopy') else os.path.join(build_dir, 'azcopy')
-    subprocess.run([azcopy_exe, 'cp', '--log-level', 'ERROR', opencpp_url, os.path.join(dest_folder, 'installer.exe')],
-                   check=True)
-
-
 args = parse_arguments()
 models_folder = 'models'
 
@@ -157,9 +140,10 @@ else:
         azure_region = get_azure_region()
     try:
         # Download test data
-        url = get_region_based_url(args.test_data_url, azure_region)
-        print("Starting test data download %s" % url)
-        download_and_unzip(args.build_dir, url, models_folder)
+        with get_azcopy(os.path.join(args.build_dir, "azcopy")) as azcopy_path:
+            url = get_region_based_url(args.test_data_url, azure_region)
+            print("Starting test data download %s" % url)
+            download_and_unzip(azcopy_path, args.build_dir, url, models_folder)
 
         all_downloads_done = True
 
diff --git a/tools/python/get_azcopy.py b/tools/python/get_azcopy.py
new file mode 100644
index 0000000000..520d9b17cf
--- /dev/null
+++ b/tools/python/get_azcopy.py
@@ -0,0 +1,78 @@
+import contextlib
+import os
+import platform
+import re
+import shutil
+import stat
+import subprocess
+import tempfile
+import urllib.parse
+import urllib.request
+
+AZCOPY_VERSION = "10.4.3"
+
+# See here for instructions on getting stable download links:
+# https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-v10#obtain-a-static-download-link
+_AZCOPY_DOWNLOAD_URLS = {
+    "Linux": "https://azcopyvnext.azureedge.net/release20200501/azcopy_linux_amd64_10.4.3.tar.gz",
+    "Darwin": "https://azcopyvnext.azureedge.net/release20200501/azcopy_darwin_amd64_10.4.3.zip",
+    "Windows": "https://azcopyvnext.azureedge.net/release20200501/azcopy_windows_amd64_10.4.3.zip",
+}
+
+
+def _check_version(azcopy_path):
+    """Returns whether the version printed by `<azcopy_path> --version` equals AZCOPY_VERSION."""
+    proc = subprocess.run(
+        [azcopy_path, "--version"],
+        stdout=subprocess.PIPE, universal_newlines=True)
+    match = re.search(r"\d+(?:\.\d+)+", proc.stdout)
+    if not match:
+        raise RuntimeError("Failed to determine azcopy version.")
+
+    return match.group(0) == AZCOPY_VERSION
+
+
+def _find_azcopy(start_dir):
+    """Returns the path of the first azcopy executable under start_dir; raises RuntimeError if none."""
+    for root, _, file_names in os.walk(start_dir):
+        for file_name in (f for f in file_names if f in ("azcopy", "azcopy.exe")):
+            return os.path.join(root, file_name)
+    raise RuntimeError("Failed to find azcopy in '{}'.".format(start_dir))
+
+
+@contextlib.contextmanager
+def get_azcopy(local_azcopy_path="azcopy"):
+    """
+    Creates a context manager that returns a path to a particular version of
+    azcopy (specified in AZCOPY_VERSION). Downloads a temporary copy if needed.
+
+    :param local_azcopy_path: Path to a local azcopy to try first.
+
+    Example usage:
+        with get_azcopy() as azcopy_path:
+            subprocess.run([azcopy_path, "--version"])
+    """
+    with contextlib.ExitStack() as context_stack:
+        azcopy_path = shutil.which(local_azcopy_path)
+
+        if azcopy_path is None or not _check_version(azcopy_path):
+            temp_dir = context_stack.enter_context(
+                tempfile.TemporaryDirectory())
+
+            download_url = _AZCOPY_DOWNLOAD_URLS[platform.system()]
+            download_basename = urllib.parse.urlsplit(
+                download_url).path.rsplit("/", 1)[-1]
+            assert len(download_basename) > 0
+            downloaded_path = os.path.join(temp_dir, download_basename)
+
+            print("Downloading azcopy from '{}'...".format(download_url))
+            urllib.request.urlretrieve(download_url, downloaded_path)
+
+            extracted_path = os.path.join(temp_dir, "azcopy")
+            shutil.unpack_archive(downloaded_path, extracted_path)
+
+            azcopy_path = _find_azcopy(extracted_path)
+            # Add the execute bit while keeping the existing permission bits.
+            os.chmod(azcopy_path, os.stat(azcopy_path).st_mode | stat.S_IXUSR)
+
+        yield azcopy_path

From 189fb60ef97ba715cdbd4a35afbc7155992c2c7b Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin <wschin@outlook.com>
Date: Tue, 16 Jun 2020 10:17:27 -0700
Subject: [PATCH 3/5] Fix a bug and add code to profile memory (#4241)

* Fix a bug and add code to profile memory

1. Compile Send/Recv again (currently broken because of
   HOROVOD refactor).
2. Add code to print out initializer allocation size and
   activation memory size.

* Address comments

* Split memory counts per locations

* Fix a metric
---
 cmake/onnxruntime_providers.cmake             |  4 ++
 onnxruntime/core/framework/execution_frame.cc | 15 ++++++-
 onnxruntime/core/framework/execution_frame.h  | 20 +++++++++
 .../core/framework/sequential_executor.cc     | 10 +++++
 .../framework/session_state_initializer.cc    | 16 +++++--
 .../core/framework/simple_tensor_allocator.h  |  7 ++-
 onnxruntime/core/framework/tensor_allocator.h | 10 ++++-
 .../tensor_allocator_with_mem_pattern.h       | 44 ++++++++++++-------
 .../models/runner/training_runner.cc          |  8 +++-
 .../training_ops/cuda/communication/recv.cc   |  2 +-
 .../training_ops/cuda/communication/recv.h    |  2 +-
 .../training_ops/cuda/communication/send.cc   |  2 +-
 .../training_ops/cuda/communication/send.h    |  2 +-
 .../cuda/cuda_training_kernels.cc             | 18 +++++---
 14 files changed, 124 insertions(+), 36 deletions(-)

diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake
index 574f393e69..8a44df44fa 100644
--- a/cmake/onnxruntime_providers.cmake
+++ b/cmake/onnxruntime_providers.cmake
@@ -144,6 +144,7 @@ if(HAS_DEPRECATED_COPY)
 endif()
 
 target_include_directories(onnxruntime_providers PRIVATE ${ONNXRUNTIME_ROOT} ${eigen_INCLUDE_DIRS} ${gemmlowp_src} ${RE2_INCLUDE_DIR})
+
 add_dependencies(onnxruntime_providers onnx ${onnxruntime_EXTERNAL_DEPENDENCIES})
 
 if (onnxruntime_ENABLE_TRAINING)
@@ -154,6 +155,9 @@ if (onnxruntime_ENABLE_TRAINING)
   if (onnxruntime_USE_HOROVOD)
     target_include_directories(onnxruntime_providers PRIVATE ${HOROVOD_INCLUDE_DIRS})
   endif()
+  if (onnxruntime_USE_NCCL OR onnxruntime_USE_HOROVOD) 
+    target_include_directories(onnxruntime_providers PUBLIC ${MPI_INCLUDE_DIRS})
+  endif()
 endif()
 
 install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/cpu  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers)
diff --git a/onnxruntime/core/framework/execution_frame.cc b/onnxruntime/core/framework/execution_frame.cc
index 9f24a51670..b62e3e539c 100644
--- a/onnxruntime/core/framework/execution_frame.cc
+++ b/onnxruntime/core/framework/execution_frame.cc
@@ -254,8 +254,17 @@ ExecutionFrame::ExecutionFrame(const std::vector<int>& feed_mlvalue_idxs, const
             // it's less efficient (the arena will add some overhead to coalesce individual allocations
             // back into blocks on 'free'), but better than failing completely.
             try {
-              buffer = alloc->Alloc(mem_patterns_->patterns[i].PeakSize());
-
+              // static_activation_memory_sizes_in_byte_ records the max virtual memory size the planner computes.
+              auto peak_size = mem_patterns_->patterns[i].PeakSize();
+              // Planning of one memory type should only happen once.
+              ORT_ENFORCE(
+                static_activation_memory_sizes_in_byte_.find(location.name) ==
+                static_activation_memory_sizes_in_byte_.end(),
+                "Memory type ",
+                location.name,
+                " should only appear once.");
+              static_activation_memory_sizes_in_byte_[location.name] = peak_size;
+              buffer = alloc->Alloc(peak_size);
               // handle allocator that doesn't throw
               if (buffer == nullptr) {
                 // INFO level as this may fire on every run and there may not be much a user can do
@@ -375,6 +384,8 @@ Status ExecutionFrame::AllocateMLValueTensorSelfOwnBufferHelper(OrtValue& ort_va
     TraceAllocate(ort_value_index, size);
   }
 
+  dynamic_activation_memory_sizes_in_byte_[location.name] += size;
+
   return Status::OK();
 }
 
diff --git a/onnxruntime/core/framework/execution_frame.h b/onnxruntime/core/framework/execution_frame.h
index f0b211994f..cda11b1133 100644
--- a/onnxruntime/core/framework/execution_frame.h
+++ b/onnxruntime/core/framework/execution_frame.h
@@ -129,6 +129,18 @@ class ExecutionFrame final : public IExecutionFrame {
     return planner_ != nullptr;
   }
 
+  // Return the sizes of virtual memory allocated at run time, keyed by memory location name.
+  // The memory is usually used for activations in forward and backward passes.
+  const std::unordered_map<std::string, size_t>& GetDynamicMemorySizeInfo() const {
+    return dynamic_activation_memory_sizes_in_byte_;
+  }
+
+  // Return the sizes of virtual memory allocated before computation, keyed by memory location name.
+  // The memory is usually used for activations in forward and backward passes.
+  const std::unordered_map<std::string, size_t>& GetStaticMemorySizeInfo() const {
+    return static_activation_memory_sizes_in_byte_;
+  }
+
  private:
   ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(ExecutionFrame);
 
@@ -168,5 +180,13 @@ class ExecutionFrame final : public IExecutionFrame {
 
   // Big chunks on different locations that will be used by mem_pattern.
   std::map<OrtMemoryInfo, BufferUniquePtr> buffers_;
+
+  // Size of virtual memory allocated before any kernel execution.
+  // This field is not physical memory size.
+  std::unordered_map<std::string, size_t> static_activation_memory_sizes_in_byte_;
+  // Size of virtual memory allocated during kernel execution (i.e., inside a kernel,
+  // we may allocate some memory for its outputs, if not planned.).
+  // This field is not physical memory size.
+  std::unordered_map<std::string, size_t> dynamic_activation_memory_sizes_in_byte_;
 };
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/framework/sequential_executor.cc b/onnxruntime/core/framework/sequential_executor.cc
index 66168b171e..7685681d31 100644
--- a/onnxruntime/core/framework/sequential_executor.cc
+++ b/onnxruntime/core/framework/sequential_executor.cc
@@ -446,6 +446,16 @@ Status SequentialExecutor::Execute(const SessionState& session_state, const std:
     session_state.Profiler().EndTimeAndRecordEvent(profiling::SESSION_EVENT, "SequentialExecutor::Execute", tp);
   }
 
+  // Report the statically planned activation memory size of each memory location.
+  for (const auto& i : frame.GetStaticMemorySizeInfo()) {
+    LOGS(logger, INFO) << "[Memory] ExecutionFrame statically allocates " << i.second << " bytes for " << i.first;
+  }
+
+  // Report the memory allocated during kernel execution for each memory location.
+  for (const auto& i : frame.GetDynamicMemorySizeInfo()) {
+    LOGS(logger, INFO) << "[Memory] ExecutionFrame dynamically allocates " << i.second << " bytes for " << i.first;
+  }
+
   return Status::OK();
 }
 
diff --git a/onnxruntime/core/framework/session_state_initializer.cc b/onnxruntime/core/framework/session_state_initializer.cc
index 0bf32ae6f7..8a05cbd161 100644
--- a/onnxruntime/core/framework/session_state_initializer.cc
+++ b/onnxruntime/core/framework/session_state_initializer.cc
@@ -86,13 +86,13 @@ common::Status SessionStateInitializer::CreatePlan(
   const auto* exec_plan_ptr = session_state_.GetExecutionPlan();
   ORT_ENFORCE(exec_plan_ptr, "Execution plan was not found in SessionState. CreatePlan must be called first.");
 
-  std::unique_ptr<ITensorAllocator> tensor_allocator_(ITensorAllocator::Create(
+  std::unique_ptr<ITensorAllocator> tensor_allocator(ITensorAllocator::Create(
       enable_mem_pattern_, *exec_plan_ptr, execution_providers_, session_state_.GetMutableWeightsBuffers()));
 
   // lambda to save initialized tensors into SessionState directly
   const Env& env = Env::Default();
   ORT_RETURN_IF_ERROR(SaveInitializedTensors(
-      env, graph_loc_, graph_, execution_providers_, ort_value_name_idx_map, tensor_allocator_.get(),
+      env, graph_loc_, graph_, execution_providers_, ort_value_name_idx_map, tensor_allocator.get(),
       [this](int idx, const OrtValue& value, const OrtCallback& d, bool constant) -> Status {
         return session_state_.AddInitializedTensor(idx, value, &d, constant);
       },
@@ -191,7 +191,17 @@ common::Status SaveInitializedTensors(const Env& env, const std::basic_string<PA
   }
 
   //2. allocate weight buffer on different locations
-  ORT_RETURN_IF_ERROR(planner->FinalizePlan());
+  // planned_initializers_memory_sizes_in_byte maps each memory location name to the
+  // virtual size computed by the planner; it is not the actual physical size.
+  std::unordered_map<std::string, size_t> planned_initializers_memory_sizes_in_byte;
+  ORT_RETURN_IF_ERROR(planner->FinalizePlan(planned_initializers_memory_sizes_in_byte));
+
+  // Report the planned initializer memory size of each memory location.
+  for (const auto& i : planned_initializers_memory_sizes_in_byte) {
+    LOGS(logger, INFO) << "[Memory] SessionStateInitializer statically allocates "
+                       << i.second << " bytes for " << i.first;
+  }
+
   OrtCallback deleter;
   //3. create weight tensors based on weights buffer
   for (const auto& entry : id_to_initialized_tensor) {
diff --git a/onnxruntime/core/framework/simple_tensor_allocator.h b/onnxruntime/core/framework/simple_tensor_allocator.h
index 2ab7af4e94..5c9b8a9ff8 100644
--- a/onnxruntime/core/framework/simple_tensor_allocator.h
+++ b/onnxruntime/core/framework/simple_tensor_allocator.h
@@ -27,7 +27,12 @@ class SimpleTensorAllocator : public ITensorAllocator {
       : ITensorAllocator(exec_providers),
         weights_buffers_(weights_buffers),
         seq_plan_(execution_plan) {}
-  common::Status FinalizePlan() override { return Status::OK(); }
+  common::Status FinalizePlan(std::unordered_map<std::string, size_t>& planned_memory_sizes_in_byte) override {
+    // This allocator has no memory plan that reserves one big block, so no
+    // location has a planned size to report: hand back an empty map.
+    planned_memory_sizes_in_byte.clear();
+    return Status::OK();
+  }
   common::Status GetPreallocatedBuffer(int ort_value_index, const char* name, std::unique_ptr<MemBuffer>& out) override;
   common::Status Trace(int id, const ONNX_NAMESPACE::TensorProto* value) override;
 };
diff --git a/onnxruntime/core/framework/tensor_allocator.h b/onnxruntime/core/framework/tensor_allocator.h
index fa9e655766..a6fad41ad4 100644
--- a/onnxruntime/core/framework/tensor_allocator.h
+++ b/onnxruntime/core/framework/tensor_allocator.h
@@ -22,7 +22,15 @@ class ITensorAllocator {
  public:
   AllocatorPtr GetAllocator(const OrtMemoryInfo& memory_info);
 
-  virtual common::Status FinalizePlan() = 0;
+  /**
+   *
+   * \param planned_memory_sizes_in_byte Sizes of memory allocated inside FinalizePlan, keyed by location name
+   *
+   * When there is no more tensor to trace, call this function to finalize the
+   * allocation.
+   */
+  virtual common::Status FinalizePlan(std::unordered_map<std::string, size_t>& planned_memory_sizes_in_byte) = 0;
+
   /**
    *
    * \param ort_value_index The index in planner
diff --git a/onnxruntime/core/framework/tensor_allocator_with_mem_pattern.h b/onnxruntime/core/framework/tensor_allocator_with_mem_pattern.h
index 8267508eda..925a96c81b 100644
--- a/onnxruntime/core/framework/tensor_allocator_with_mem_pattern.h
+++ b/onnxruntime/core/framework/tensor_allocator_with_mem_pattern.h
@@ -21,7 +21,8 @@ class TensorAllocatorWithMemPattern : public ITensorAllocator {
   bool is_sealed_ = false;
   const ExecutionPlanBase& seq_plan_;
 
-  common::Status AllocatePlannedBuffers() {
+  common::Status AllocatePlannedBuffersAndReportTotalSize(
+      std::unordered_map<std::string, size_t>& planned_memory_sizes_in_byte) {
     const size_t location_len = mem_patterns_.locations.size();
     for (size_t i = 0; i < location_len; ++i) {
       auto& location = mem_patterns_.locations[i];
@@ -30,21 +31,30 @@ class TensorAllocatorWithMemPattern : public ITensorAllocator {
         return Status(common::ONNXRUNTIME, common::FAIL,
                       "Failed to get allocator for location: " + location.ToString());
 
-      if (mem_patterns_.patterns[i].PeakSize() > 0) {
-        void* buffer;
-        if (alloc->Info().alloc_type == OrtArenaAllocator) {
-          buffer = static_cast<IArenaAllocator*>(alloc.get())->Reserve(mem_patterns_.patterns[i].PeakSize());
-        }
-        else {
-          buffer = alloc->Alloc(mem_patterns_.patterns[i].PeakSize());
-        }
-        weights_buffers_.push_back(BufferUniquePtr(buffer, alloc));
-        auto kvp = buffers_.insert(std::make_pair(location, buffer));
-        if (!kvp.second) {
-          alloc->Free(buffer);
-          return Status(common::ONNXRUNTIME, common::FAIL, "duplicated location");
-        }
+      // Don't allocate memory when there is no memory usage.
+      if (mem_patterns_.patterns[i].PeakSize() <= 0) {
+        continue;
       }
+
+      const auto peak_size = mem_patterns_.patterns[i].PeakSize();
+      void* buffer;
+      if (alloc->Info().alloc_type == OrtArenaAllocator) {
+        // Arena has a specific way to store static memory.
+        // Arena does not reuse static memory allocated by Reserve.
+        buffer = static_cast<IArenaAllocator*>(
+          alloc.get())->Reserve(peak_size);
+      }
+      else {
+        buffer = alloc->Alloc(peak_size);
+      }
+      weights_buffers_.push_back(BufferUniquePtr(buffer, alloc));
+      auto kvp = buffers_.insert(std::make_pair(location, buffer));
+      if (!kvp.second) {
+        alloc->Free(buffer);
+        return Status(common::ONNXRUNTIME, common::FAIL, "duplicated location");
+      }
+
+      planned_memory_sizes_in_byte[location.name] += peak_size;
     }
     return Status::OK();
   }
@@ -57,9 +67,9 @@ class TensorAllocatorWithMemPattern : public ITensorAllocator {
         weights_buffers_(weights_buffers),
         seq_plan_(execution_plan) {}
 
-  common::Status FinalizePlan() override {
+  common::Status FinalizePlan(std::unordered_map<std::string, size_t>& planned_memory_sizes_in_byte) override {
     ORT_RETURN_IF_ERROR(planner_.GeneratePatterns(&mem_patterns_));
-    ORT_RETURN_IF_ERROR(AllocatePlannedBuffers());
+    ORT_RETURN_IF_ERROR(AllocatePlannedBuffersAndReportTotalSize(planned_memory_sizes_in_byte));
     is_sealed_ = true;
     return Status::OK();
   }
diff --git a/orttraining/orttraining/models/runner/training_runner.cc b/orttraining/orttraining/models/runner/training_runner.cc
index 37385ecc8b..95e7f08eaf 100644
--- a/orttraining/orttraining/models/runner/training_runner.cc
+++ b/orttraining/orttraining/models/runner/training_runner.cc
@@ -795,6 +795,7 @@ Status TrainingRunner::TrainingLoop(IDataLoader& training_data_loader, IDataLoad
   auto end_to_end_start = std::chrono::high_resolution_clock::now();
   bool end_to_end_measurement_started = false;
 
+  auto all_steps_time_start = std::chrono::high_resolution_clock::now();
   while (step_ < params_.num_train_steps) {
     for (size_t shard_it = 0; shard_it < num_shards_to_visit; ++shard_it) {
       auto training_data = training_data_loader.CurrentDataSet();
@@ -921,6 +922,8 @@ Status TrainingRunner::TrainingLoop(IDataLoader& training_data_loader, IDataLoad
 
     ++epoch;
   }
+  auto all_steps_time_end = std::chrono::high_resolution_clock::now();
+  std::chrono::duration<double> all_steps_duration_seconds = all_steps_time_end - all_steps_time_start;
 
   const double e2e_throughput = [&]() {
     if (end_to_end_perf_start_step >= params_.num_train_steps) return 0.0;
@@ -959,7 +962,9 @@ Status TrainingRunner::TrainingLoop(IDataLoader& training_data_loader, IDataLoad
             << "Average Running Time Per Batch: " << avg_time_per_batch << " ms\n"
             << "Throughput: " << throughput << " Examples / Second\n"
             << "Stabilized Throughput: " << stabilized_throughput << " Examples / Second\n"
-            << "EndToEnd Throughput: " << e2e_throughput << " Examples / Second\n";
+            << "EndToEnd Throughput: " << e2e_throughput << " Examples / Second\n"
+            << "Average Step Time: " << all_steps_duration_seconds.count() / (step_ - step_start)<< " Second\n"
+            << "Average Step Throughput: " << params_.batch_size * (step_ - step_start) / (all_steps_duration_seconds.count()) << " Examples / Second\n";
 
   return Status::OK();
 }
@@ -1170,7 +1175,6 @@ Status TrainingRunner::Evaluate(InferenceSession& session, IDataLoader& data_loa
         &fetches));
     }
 
-
     // Assume that user-specified fetches are avaliable only on the last pipeline stage.
     // When there is no pipeline, all pipeline_context_.pipeline_stage_id should be 0 and
     // params_.pipeline_parallel_size is 1. Thus, the following condition is always true if there
diff --git a/orttraining/orttraining/training_ops/cuda/communication/recv.cc b/orttraining/orttraining/training_ops/cuda/communication/recv.cc
index 68460c1511..db48bbe4e5 100644
--- a/orttraining/orttraining/training_ops/cuda/communication/recv.cc
+++ b/orttraining/orttraining/training_ops/cuda/communication/recv.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#ifdef USE_HOROVOD
+#if defined(USE_NCCL) || defined(USE_HOROVOD)
 
 #include "orttraining/training_ops/cuda/communication/recv.h"
 #include "orttraining/training_ops/cuda/communication/common.h"
diff --git a/orttraining/orttraining/training_ops/cuda/communication/recv.h b/orttraining/orttraining/training_ops/cuda/communication/recv.h
index 0d1a812038..cf24a8b84c 100644
--- a/orttraining/orttraining/training_ops/cuda/communication/recv.h
+++ b/orttraining/orttraining/training_ops/cuda/communication/recv.h
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#ifdef USE_HOROVOD
+#if defined(USE_NCCL) || defined(USE_HOROVOD)
 
 #pragma once
 #include "core/common/common.h"
diff --git a/orttraining/orttraining/training_ops/cuda/communication/send.cc b/orttraining/orttraining/training_ops/cuda/communication/send.cc
index 72d0dae39e..a3863e5d97 100644
--- a/orttraining/orttraining/training_ops/cuda/communication/send.cc
+++ b/orttraining/orttraining/training_ops/cuda/communication/send.cc
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#ifdef USE_HOROVOD
+#if defined(USE_NCCL) || defined(USE_HOROVOD)
 
 #include "orttraining/training_ops/cuda/communication/send.h"
 #include "orttraining/training_ops/cuda/communication/common.h"
diff --git a/orttraining/orttraining/training_ops/cuda/communication/send.h b/orttraining/orttraining/training_ops/cuda/communication/send.h
index 878fee48d7..170b2aff6e 100644
--- a/orttraining/orttraining/training_ops/cuda/communication/send.h
+++ b/orttraining/orttraining/training_ops/cuda/communication/send.h
@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#ifdef USE_HOROVOD
+#if defined(USE_NCCL) || defined(USE_HOROVOD)
 
 #pragma once
 #include "core/common/common.h"
diff --git a/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc b/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc
index 7ce8c95aa0..2b5214af7c 100644
--- a/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc
+++ b/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc
@@ -114,14 +114,17 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, SliceGrad);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, GatherElementsGrad);
 
-#ifdef USE_HOROVOD
-class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, HorovodAllReduce);
-class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, HorovodBarrier);
+#if defined(USE_NCCL) || defined(USE_HOROVOD)
 // P2P communication operators.
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, Send);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, Recv);
 #endif
 
+#ifdef USE_HOROVOD
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, HorovodAllReduce);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, HorovodBarrier);
+#endif
+
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, RecordEvent);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, WaitEvent);
 
@@ -240,12 +243,15 @@ Status RegisterCudaTrainingKernels(KernelRegistry& kernel_registry) {
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, SliceGrad)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, GatherElementsGrad)>,
 
+      // P2P communication operators.
+#if defined(USE_NCCL) || defined(USE_HOROVOD)
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, Send)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, Recv)>,
+#endif
+
 #ifdef USE_HOROVOD
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, HorovodAllReduce)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kOnnxDomain, 9, HorovodBarrier)>,
-      // P2P communication operators.
-      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, Send)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, Recv)>,
 #endif
 
       BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, RecordEvent)>,

From 12367a6b11e96e70112026cb7975f7035c7ad0ed Mon Sep 17 00:00:00 2001
From: Yulong Wang <yulongw@microsoft.com>
Date: Tue, 16 Jun 2020 11:06:11 -0700
Subject: [PATCH 4/5] [C#] enable string-typed FixedBufferOnnxValue in input
 (#4178)

---
 .../FixedBufferOnnxValue.cs                   |  9 +----
 .../InferenceSession.cs                       | 14 ++++++-
 .../InferenceTest.cs                          | 37 +++++++++++++++++--
 3 files changed, 46 insertions(+), 14 deletions(-)

diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/FixedBufferOnnxValue.cs b/csharp/src/Microsoft.ML.OnnxRuntime/FixedBufferOnnxValue.cs
index 92aab02a2f..206e87d035 100644
--- a/csharp/src/Microsoft.ML.OnnxRuntime/FixedBufferOnnxValue.cs
+++ b/csharp/src/Microsoft.ML.OnnxRuntime/FixedBufferOnnxValue.cs
@@ -31,16 +31,9 @@ namespace Microsoft.ML.OnnxRuntime
         /// <returns></returns>
         public static FixedBufferOnnxValue CreateFromTensor<T>(Tensor<T> value)
         {
-            if (value is Tensor<string>)
-            {
-                throw new ArgumentException("Only numeric tensors can be used to create FixedBufferOnnxValue.", nameof(value));
-            }
-
             NativeOnnxValueHelper.CreateNativeOnnxValue(value, out IntPtr onnxValue, out MemoryHandle pinnedMemoryHandle, out OnnxValueType onnxValueType, out TensorElementType elementType);
 
-            Debug.Assert(
-                onnxValueType == OnnxValueType.ONNX_TYPE_TENSOR && elementType != TensorElementType.String,
-                "the value should always be a numeric tensor");
+            Debug.Assert(onnxValueType == OnnxValueType.ONNX_TYPE_TENSOR, "the value should always be a tensor");
 
             return new FixedBufferOnnxValue(pinnedMemoryHandle, onnxValue, onnxValueType, elementType);
         }
diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/InferenceSession.cs b/csharp/src/Microsoft.ML.OnnxRuntime/InferenceSession.cs
index e03a70e4d9..cd3145a4c3 100644
--- a/csharp/src/Microsoft.ML.OnnxRuntime/InferenceSession.cs
+++ b/csharp/src/Microsoft.ML.OnnxRuntime/InferenceSession.cs
@@ -366,6 +366,11 @@ namespace Microsoft.ML.OnnxRuntime
             int outputIndex = 0;
             foreach (var output in outputValues)
             {
+                if (output.ElementType == TensorElementType.String)
+                {
+                    throw new NotSupportedException("Using string type FixedBufferOnnxValue in outputs is not supported.");
+                }
+
                 outputValuesArray[outputIndex] = output.Value;
 
                 outputIndex++;
@@ -556,6 +561,11 @@ namespace Microsoft.ML.OnnxRuntime
                 int outputIndex = 0;
                 foreach (var output in outputValues)
                 {
+                    if (output.ElementType == TensorElementType.String)
+                    {
+                        throw new NotSupportedException("Using string type FixedBufferOnnxValue in outputs is not supported.");
+                    }
+
                     outputValuesArray[outputIndex] = output.Value;
 
                     outputIndex++;
@@ -695,7 +705,7 @@ namespace Microsoft.ML.OnnxRuntime
             IntPtr nameHandle = IntPtr.Zero;
             string str = null;
 
-            IntPtr status = NativeMethods.OrtSessionEndProfiling(_nativeHandle, 
+            IntPtr status = NativeMethods.OrtSessionEndProfiling(_nativeHandle,
                                                                   NativeMemoryAllocator.DefaultInstance.Handle,
                                                                   out nameHandle);
 
@@ -708,7 +718,7 @@ namespace Microsoft.ML.OnnxRuntime
             {
                 if (nameHandle != IntPtr.Zero)
                 {
-                  NativeMemoryAllocator.DefaultInstance.FreeMemory(nameHandle);
+                    NativeMemoryAllocator.DefaultInstance.FreeMemory(nameHandle);
                 }
             }
 
diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests/InferenceTest.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests/InferenceTest.cs
index 1a5c2e94cb..38a70d923d 100644
--- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests/InferenceTest.cs
+++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests/InferenceTest.cs
@@ -1056,12 +1056,41 @@ namespace Microsoft.ML.OnnxRuntime.Tests
         public void TestCreateFixedBufferOnnxValueFromStringTensor()
         {
             var tensor = new DenseTensor<string>(new string[] { "a", "b" }, new int[] { 1, 2 });
+            using (var value = FixedBufferOnnxValue.CreateFromTensor(tensor)) { }
+        }
 
-            Assert.Throws<ArgumentException>("value", () =>
+        [Fact]
+        public void TestReusingStringFixedBufferOnnxValue()  // string FixedBufferOnnxValue: accepted as input, rejected as output
+        {
+            string modelPath = Path.Combine(Directory.GetCurrentDirectory(), "test_types_STRING.pb");  // model is looked up in the current working directory
+            using (var session = new InferenceSession(modelPath))
             {
-                // cannot create from string tensor
-                FixedBufferOnnxValue.CreateFromTensor(tensor);
-            });
+                var tensorA = new DenseTensor<string>(new string[] { "a", "b", "c", "d", "e" }, new int[] { 1, 5 });
+                var tensorB = new DenseTensor<string>(new string[] { "v", "w", "x", "y", "z" }, new int[] { 1, 5 });
+                var tensorC = new DenseTensor<string>(new string[] { "i", "j", "k", "l", "m" }, new int[] { 1, 5 });
+                var tensorD = new DenseTensor<string>(new string[] { "i", "j", "k", "l", "m" }, new int[] { 1, 5 });
+                using (FixedBufferOnnxValue a = FixedBufferOnnxValue.CreateFromTensor(tensorA),  // one fixed-buffer value per tensor; all four are disposed when this using block exits
+                                            b = FixedBufferOnnxValue.CreateFromTensor(tensorB),
+                                            c = FixedBufferOnnxValue.CreateFromTensor(tensorC),
+                                            d = FixedBufferOnnxValue.CreateFromTensor(tensorD))
+                {
+                    // A string-typed FixedBufferOnnxValue is accepted when it is used only as an input
+                    session.Run(new[] { "input" }, new[] { a });
+
+                    // A string-typed FixedBufferOnnxValue passed as an output must throw NotSupportedException
+                    Assert.Throws<NotSupportedException>(() =>
+                    {
+                        // NamedOnnxValue input, string FixedBufferOnnxValue output
+                        session.Run(new[] { NamedOnnxValue.CreateFromTensor("input", tensorB) }, new[] { "output" }, new[] { b });
+                    });
+                    Assert.Throws<NotSupportedException>(() =>
+                    {
+                        // both FixedBufferOnnxValue for inputs and outputs
+                        session.Run(new[] { "input" }, new[] { c }, new[] { "output" }, new[] { d });
+                    });
+                }
+
+            }
         }
 
         [Fact]

From b41fcf1570b5ad771fe65f85b758f5db3bb0cd11 Mon Sep 17 00:00:00 2001
From: Vincent Wang <wangwchpku@outlook.com>
Date: Wed, 17 Jun 2020 15:11:02 +0800
Subject: [PATCH 5/5] Bugfix for shape inference and GetShape. (#4243)

Co-authored-by: Vincent Wang <weicwang@microsoft.com>
---
 .../core/graph/gradient_builder_base.cc       |  5 ++++-
 .../core/graph/gradient_schema_defs.cc        | 20 ++++++++++++++++++-
 .../core/optimizer/insert_output_rewriter.cc  |  8 ++++++--
 3 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/orttraining/orttraining/core/graph/gradient_builder_base.cc b/orttraining/orttraining/core/graph/gradient_builder_base.cc
index 207f10850b..341742486f 100644
--- a/orttraining/orttraining/core/graph/gradient_builder_base.cc
+++ b/orttraining/orttraining/core/graph/gradient_builder_base.cc
@@ -83,7 +83,10 @@ void ComputeBroadcastBackwardAxes(
 }
 
 std::vector<Dimension> GetShape(const ArgDef& arg_def) {
-  ORT_ENFORCE(arg_def.type_proto, "During GetShape, ", arg_def.name, "'s type_proto is null.");
+  ORT_ENFORCE(arg_def.type_proto
+              && arg_def.type_proto->has_tensor_type()
+              && arg_def.type_proto->tensor_type().has_shape(),
+              "During GetShape, ", arg_def.name, "'s shape is null.");
   std::vector<Dimension> shape;
   const auto& dims = arg_def.type_proto->tensor_type().shape().dim();
   for (auto dim = dims.begin(); dim < dims.end(); dim++) {
diff --git a/orttraining/orttraining/core/graph/gradient_schema_defs.cc b/orttraining/orttraining/core/graph/gradient_schema_defs.cc
index e49c858181..ec81e08261 100644
--- a/orttraining/orttraining/core/graph/gradient_schema_defs.cc
+++ b/orttraining/orttraining/core/graph/gradient_schema_defs.cc
@@ -937,7 +937,25 @@ Example 4:
       .TypeConstraint("Tind",
                       {"tensor(int32)", "tensor(int64)"},
                       "Constrain indices to integer types")
-      .SetDoc(R"DOC(SparseSoftmaxCrossEntropy)DOC");
+      .SetDoc(R"DOC(SparseSoftmaxCrossEntropy)DOC")
+      .TypeAndShapeInferenceFunction([](InferenceContext& ctx) {  // type/shape inference for SparseSoftmaxCrossEntropy
+        propagateElemTypeFromInputToOutput(ctx, 0, 0);  // output 0 takes its element type from input 0
+        std::string reduction = getAttribute(ctx, "reduction", "mean");  // "mean" is the fallback value passed to getAttribute
+        if (reduction.compare("none") == 0) {  // no reduction: output 0 has the shape of input 1, when that shape is known
+          if (hasInputShape(ctx, 1)) {
+            propagateShapeFromInputToOutput(ctx, 1, 0);
+          }
+        } else {  // any other reduction: output 0 gets an empty (zero-dimension) shape
+          updateOutputShape(ctx, 0, TensorShapeProto());
+        }
+
+        if(ctx.getNumOutputs() == 2) {  // second output present (presumably log-probabilities - TODO confirm): mirror input 0
+          propagateElemTypeFromInputToOutput(ctx, 0, 1);
+          if (hasInputShape(ctx, 0)) {  // shape is copied only when input 0's shape is known
+            propagateShapeFromInputToOutput(ctx, 0, 1);
+          }
+        }
+      });
 
   ONNX_CONTRIB_OPERATOR_SCHEMA(SparseSoftmaxCrossEntropyGrad)
       .SetDomain(kOnnxDomain)
diff --git a/orttraining/orttraining/core/optimizer/insert_output_rewriter.cc b/orttraining/orttraining/core/optimizer/insert_output_rewriter.cc
index ceff7c3f77..ac8f1caad2 100644
--- a/orttraining/orttraining/core/optimizer/insert_output_rewriter.cc
+++ b/orttraining/orttraining/core/optimizer/insert_output_rewriter.cc
@@ -14,7 +14,9 @@ Status InsertMaxPoolOutput::Apply(Graph& graph, Node& node, RewriteRuleEffect& r
 
   TypeProto t;
   t.mutable_tensor_type()->set_elem_type(TensorProto_DataType_INT64);
-  t.mutable_tensor_type()->mutable_shape()->CopyFrom(*Y->Shape());
+  if (Y->Shape() != nullptr) {
+    t.mutable_tensor_type()->mutable_shape()->CopyFrom(*Y->Shape());
+  }
 
   NodeArg& node_arg = graph.GetOrCreateNodeArg(Y->Name() + "_mask", &t);
 
@@ -38,7 +40,9 @@ Status InsertSoftmaxCrossEntropyLossOutput::Apply(Graph& graph, Node& node, Rewr
 
   TypeProto t;
   t.mutable_tensor_type()->set_elem_type(X->TypeAsProto()->tensor_type().elem_type());
-  t.mutable_tensor_type()->mutable_shape()->CopyFrom(*X->Shape());  // log probability should have the same shape as logits.
+  if (X->Shape() != nullptr) {
+    t.mutable_tensor_type()->mutable_shape()->CopyFrom(*X->Shape());  // log probability should have the same shape as logits.
+  }
 
   NodeArg& node_arg = graph.GetOrCreateNodeArg(X->Name() + "_log_prob", &t);